//go:build llm_script package main import ( "database/sql" "encoding/json" "flag" "fmt" "html" "io" "net/http" "os" "regexp" "strings" "time" ) const defaultBytedanceArkPricingURL = "https://www.volcengine.com/docs/82379/1544106" type bytedanceArkPricingImportConfig struct { URL string Fixture string DryRun bool Timeout time.Duration } func main() { loadSubscriptionImportEnv() var url string var fixture string var dryRun bool var timeoutSeconds int flag.StringVar(&url, "url", defaultBytedanceArkPricingURL, "火山方舟官方模型价格页") flag.StringVar(&fixture, "fixture", "", "火山方舟价格样例文件") flag.BoolVar(&dryRun, "dry-run", false, "仅解析并打印摘要,不写入数据库") flag.IntVar(&timeoutSeconds, "timeout", 20, "请求超时(秒)") flag.Parse() cfg := bytedanceArkPricingImportConfig{URL: url, Fixture: fixture, DryRun: dryRun, Timeout: time.Duration(timeoutSeconds) * time.Second} var db *sql.DB var err error if !cfg.DryRun { db, err = subscriptionImportDB() if err != nil { fmt.Fprintf(os.Stderr, "open db: %v\n", err) os.Exit(1) } defer db.Close() } if err := runBytedanceArkPricingImport(cfg, db, os.Stdout); err != nil { fmt.Fprintf(os.Stderr, "import_bytedance_pricing: %v\n", err) os.Exit(1) } } func runBytedanceArkPricingImport(cfg bytedanceArkPricingImportConfig, db *sql.DB, out io.Writer) error { client := &http.Client{Timeout: cfg.Timeout} raw, err := fetchRawPricingPage(cfg.URL, cfg.Fixture, client) if err != nil { return err } records, err := parseBytedanceArkPricingCatalog(raw) if err != nil { return err } records = dedupeOfficialPricingRecords(records) if cfg.DryRun { _, err = fmt.Fprintf(out, "source=bytedance-pricing-import models=%d operator=%s dry_run=true\n", len(records), records[0].OperatorName) return err } if db == nil { return fmt.Errorf("db is required when dry-run=false") } if err := upsertOfficialPricingRecords(db, records, "bytedance-pricing-import"); err != nil { return err } var tableRows int if err := db.QueryRow(`SELECT COUNT(*) FROM region_pricing`).Scan(&tableRows); err != nil { return fmt.Errorf("count region_pricing: %w", err) } _, err = fmt.Fprintf(out, "source=bytedance-pricing-import models=%d operator=%s table_rows=%d dry_run=false\n", len(records), records[0].OperatorName, tableRows) return err } func parseBytedanceArkPricingCatalog(raw string) ([]officialPricingRecord, error) { markdown, err := extractBytedanceArkPricingMarkdown(raw) if err != nil { return nil, err } rows, err := extractMarkdownTableRowsForHeading(markdown, "## 在线推理(常规)") if err != nil { return nil, err } if len(rows) < 2 { return nil, fmt.Errorf("unexpected bytedance ark pricing table") } records := make([]officialPricingRecord, 0, len(rows)-1) for _, row := range rows[1:] { if len(row) < 6 { continue } modelName := cleanBytedanceArkCell(row[0]) if modelName == "" || isBytedanceArkConditionRow(modelName) { continue } inputPrice := bytedanceArkPriceValue(row[2]) outputPrice := bytedanceArkPriceValue(row[5]) if inputPrice <= 0 || outputPrice <= 0 { continue } providerName := bytedanceArkProviderName(modelName) providerNameCn, providerCountry, providerWebsite := providerMetadata(providerName) records = append(records, officialPricingRecord{ ModelID: normalizeExternalID("bytedance", modelName), ModelName: modelName, ProviderName: providerName, ProviderNameCn: providerNameCn, ProviderCountry: providerCountry, ProviderWebsite: providerWebsite, OperatorName: "ByteDance Volcano", OperatorNameCn: "火山引擎", OperatorCountry: "CN", OperatorWebsite: "https://www.volcengine.com/product/ark", OperatorType: "official", Region: "CN", Currency: "CNY", InputPrice: inputPrice, OutputPrice: outputPrice, SourceURL: defaultBytedanceArkPricingURL, ModelSourceURL: defaultBytedanceArkPricingURL, DateConfidence: "unknown", DateSourceKind: "official_pricing", Modality: detectModality(modelName), }) } if len(records) == 0 { return nil, fmt.Errorf("no bytedance ark input/output pricing rows found") } return records, nil } func extractBytedanceArkPricingMarkdown(raw string) (string, error) { if !strings.Contains(raw, "window._ROUTER_DATA = ") { return raw, nil } jsonText, err := extractJSONAfterMarker(raw, "window._ROUTER_DATA = ") if err != nil { return "", err } var envelope map[string]any if err := json.Unmarshal([]byte(jsonText), &envelope); err != nil { return "", fmt.Errorf("parse bytedance router json: %w", err) } loaderData, _ := envelope["loaderData"].(map[string]any) page, _ := loaderData["docs/(libid)/(docid$)/page"].(map[string]any) curDoc, _ := page["curDoc"].(map[string]any) markdown, _ := curDoc["MDContent"].(string) if strings.TrimSpace(markdown) == "" { return "", fmt.Errorf("missing bytedance pricing markdown content") } return markdown, nil } func extractJSONAfterMarker(raw string, marker string) (string, error) { start := strings.Index(raw, marker) if start < 0 { return "", fmt.Errorf("marker %q not found", marker) } start += len(marker) braceDepth := 0 inString := false escaped := false end := -1 for i := start; i < len(raw); i++ { ch := raw[i] if inString { if escaped { escaped = false continue } switch ch { case '\\': escaped = true case '"': inString = false } continue } switch ch { case '"': inString = true case '{': braceDepth++ case '}': braceDepth-- if braceDepth == 0 { end = i + 1 i = len(raw) } } } if end <= start { return "", fmt.Errorf("unable to locate router json boundary") } return raw[start:end], nil } func extractMarkdownTableRowsForHeading(markdown string, heading string) ([][]string, error) { lines := strings.Split(markdown, "\n") capturing := false rows := make([][]string, 0) for _, line := range lines { trimmed := strings.TrimSpace(line) switch { case trimmed == heading: capturing = true case capturing && strings.HasPrefix(trimmed, "#") && trimmed != heading: if len(rows) > 0 { return rows, nil } capturing = false } if !capturing || !strings.HasPrefix(trimmed, "|") || strings.Contains(trimmed, "|---") { continue } cells := strings.Split(strings.Trim(trimmed, "|"), "|") for i := range cells { cells[i] = strings.TrimSpace(cells[i]) } rows = append(rows, cells) } if len(rows) == 0 { return nil, fmt.Errorf("missing markdown table for heading %s", heading) } return rows, nil } func cleanBytedanceArkCell(raw string) string { cleaned := html.UnescapeString(strings.TrimSpace(raw)) cleaned = strings.ReplaceAll(cleaned, `\-`, "-") cleaned = strings.ReplaceAll(cleaned, `\`, "") cleaned = strings.ReplaceAll(cleaned, "

", " ") cleaned = strings.ReplaceAll(cleaned, "
", " ") cleaned = strings.ReplaceAll(cleaned, "
", " ") cleaned = strings.ReplaceAll(cleaned, "
", " ") cleaned = regexp.MustCompile(`(?is)<[^>]+>`).ReplaceAllString(cleaned, " ") cleaned = regexp.MustCompile(`\s+`).ReplaceAllString(cleaned, " ") return strings.TrimSpace(cleaned) } func bytedanceArkPriceValue(raw string) float64 { cleaned := cleanBytedanceArkCell(raw) if cleaned == "" || strings.Contains(cleaned, "不支持") { return 0 } match := regexp.MustCompile(`([0-9]+(?:\.[0-9]+)?)`).FindStringSubmatch(cleaned) if len(match) != 2 { return 0 } return mustParseSubscriptionPrice(match[1]) } func isBytedanceArkConditionRow(value string) bool { lower := strings.ToLower(strings.TrimSpace(value)) return lower == "" || strings.HasPrefix(lower, "输入长度") } func bytedanceArkProviderName(modelName string) string { lower := strings.ToLower(strings.TrimSpace(modelName)) switch { case strings.HasPrefix(lower, "deepseek"): return "DeepSeek" case strings.HasPrefix(lower, "glm"): return "Zhipu AI" default: return "ByteDance" } }