- Add import_cucloud_pricing.go for 联通云 payg 公开价抓取
- Add import_bytedance_pricing.go for 火山引擎/ByteDance Ark 定价导入
- Include test files and sample testdata for both importers
- Update plan catalog inventory docs and seeds
- Add cucloud pricing importer implementation plan
- Align pipeline scripts and smoke gate tests
285 lines
7.9 KiB
Go
285 lines
7.9 KiB
Go
//go:build llm_script
|
|
|
|
package main
|
|
|
|
import (
|
|
"database/sql"
|
|
"encoding/json"
|
|
"flag"
|
|
"fmt"
|
|
"html"
|
|
"io"
|
|
"net/http"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// defaultBytedanceArkPricingURL is the default value of the -url flag and is
// also recorded as SourceURL / ModelSourceURL on every parsed pricing record.
const defaultBytedanceArkPricingURL = "https://www.volcengine.com/docs/82379/1544106"
|
|
|
|
// bytedanceArkPricingImportConfig carries the command-line options of the
// ByteDance Ark pricing importer.
type bytedanceArkPricingImportConfig struct {
	// URL is the pricing page address handed to the page fetcher.
	URL string
	// Fixture is the value of the -fixture flag (a sample pricing file); it is
	// handed to the page fetcher together with URL.
	Fixture string
	// DryRun makes the importer print a summary without touching the database.
	DryRun bool
	// Timeout is applied to the HTTP client used for fetching the page.
	Timeout time.Duration
}
|
|
|
|
func main() {
|
|
loadSubscriptionImportEnv()
|
|
|
|
var url string
|
|
var fixture string
|
|
var dryRun bool
|
|
var timeoutSeconds int
|
|
|
|
flag.StringVar(&url, "url", defaultBytedanceArkPricingURL, "火山方舟官方模型价格页")
|
|
flag.StringVar(&fixture, "fixture", "", "火山方舟价格样例文件")
|
|
flag.BoolVar(&dryRun, "dry-run", false, "仅解析并打印摘要,不写入数据库")
|
|
flag.IntVar(&timeoutSeconds, "timeout", 20, "请求超时(秒)")
|
|
flag.Parse()
|
|
|
|
cfg := bytedanceArkPricingImportConfig{URL: url, Fixture: fixture, DryRun: dryRun, Timeout: time.Duration(timeoutSeconds) * time.Second}
|
|
|
|
var db *sql.DB
|
|
var err error
|
|
if !cfg.DryRun {
|
|
db, err = subscriptionImportDB()
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "open db: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
defer db.Close()
|
|
}
|
|
|
|
if err := runBytedanceArkPricingImport(cfg, db, os.Stdout); err != nil {
|
|
fmt.Fprintf(os.Stderr, "import_bytedance_pricing: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
}
|
|
|
|
func runBytedanceArkPricingImport(cfg bytedanceArkPricingImportConfig, db *sql.DB, out io.Writer) error {
|
|
client := &http.Client{Timeout: cfg.Timeout}
|
|
raw, err := fetchRawPricingPage(cfg.URL, cfg.Fixture, client)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
records, err := parseBytedanceArkPricingCatalog(raw)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
records = dedupeOfficialPricingRecords(records)
|
|
if cfg.DryRun {
|
|
_, err = fmt.Fprintf(out, "source=bytedance-pricing-import models=%d operator=%s dry_run=true\n", len(records), records[0].OperatorName)
|
|
return err
|
|
}
|
|
if db == nil {
|
|
return fmt.Errorf("db is required when dry-run=false")
|
|
}
|
|
if err := upsertOfficialPricingRecords(db, records, "bytedance-pricing-import"); err != nil {
|
|
return err
|
|
}
|
|
var tableRows int
|
|
if err := db.QueryRow(`SELECT COUNT(*) FROM region_pricing`).Scan(&tableRows); err != nil {
|
|
return fmt.Errorf("count region_pricing: %w", err)
|
|
}
|
|
_, err = fmt.Fprintf(out, "source=bytedance-pricing-import models=%d operator=%s table_rows=%d dry_run=false\n", len(records), records[0].OperatorName, tableRows)
|
|
return err
|
|
}
|
|
|
|
func parseBytedanceArkPricingCatalog(raw string) ([]officialPricingRecord, error) {
|
|
markdown, err := extractBytedanceArkPricingMarkdown(raw)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
rows, err := extractMarkdownTableRowsForHeading(markdown, "## 在线推理(常规)")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if len(rows) < 2 {
|
|
return nil, fmt.Errorf("unexpected bytedance ark pricing table")
|
|
}
|
|
|
|
records := make([]officialPricingRecord, 0, len(rows)-1)
|
|
for _, row := range rows[1:] {
|
|
if len(row) < 6 {
|
|
continue
|
|
}
|
|
modelName := cleanBytedanceArkCell(row[0])
|
|
if modelName == "" || isBytedanceArkConditionRow(modelName) {
|
|
continue
|
|
}
|
|
inputPrice := bytedanceArkPriceValue(row[2])
|
|
outputPrice := bytedanceArkPriceValue(row[5])
|
|
if inputPrice <= 0 || outputPrice <= 0 {
|
|
continue
|
|
}
|
|
providerName := bytedanceArkProviderName(modelName)
|
|
providerNameCn, providerCountry, providerWebsite := providerMetadata(providerName)
|
|
records = append(records, officialPricingRecord{
|
|
ModelID: normalizeExternalID("bytedance", modelName),
|
|
ModelName: modelName,
|
|
ProviderName: providerName,
|
|
ProviderNameCn: providerNameCn,
|
|
ProviderCountry: providerCountry,
|
|
ProviderWebsite: providerWebsite,
|
|
OperatorName: "ByteDance Volcano",
|
|
OperatorNameCn: "火山引擎",
|
|
OperatorCountry: "CN",
|
|
OperatorWebsite: "https://www.volcengine.com/product/ark",
|
|
OperatorType: "official",
|
|
Region: "CN",
|
|
Currency: "CNY",
|
|
InputPrice: inputPrice,
|
|
OutputPrice: outputPrice,
|
|
SourceURL: defaultBytedanceArkPricingURL,
|
|
ModelSourceURL: defaultBytedanceArkPricingURL,
|
|
DateConfidence: "unknown",
|
|
DateSourceKind: "official_pricing",
|
|
Modality: detectModality(modelName),
|
|
})
|
|
}
|
|
if len(records) == 0 {
|
|
return nil, fmt.Errorf("no bytedance ark input/output pricing rows found")
|
|
}
|
|
return records, nil
|
|
}
|
|
|
|
func extractBytedanceArkPricingMarkdown(raw string) (string, error) {
|
|
if !strings.Contains(raw, "window._ROUTER_DATA = ") {
|
|
return raw, nil
|
|
}
|
|
jsonText, err := extractJSONAfterMarker(raw, "window._ROUTER_DATA = ")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
var envelope map[string]any
|
|
if err := json.Unmarshal([]byte(jsonText), &envelope); err != nil {
|
|
return "", fmt.Errorf("parse bytedance router json: %w", err)
|
|
}
|
|
loaderData, _ := envelope["loaderData"].(map[string]any)
|
|
page, _ := loaderData["docs/(libid)/(docid$)/page"].(map[string]any)
|
|
curDoc, _ := page["curDoc"].(map[string]any)
|
|
markdown, _ := curDoc["MDContent"].(string)
|
|
if strings.TrimSpace(markdown) == "" {
|
|
return "", fmt.Errorf("missing bytedance pricing markdown content")
|
|
}
|
|
return markdown, nil
|
|
}
|
|
|
|
// extractJSONAfterMarker returns the text that starts right after the first
// occurrence of marker in raw and ends at the closing brace that brings the
// brace depth back to zero. Braces inside double-quoted strings are ignored,
// and a backslash inside a string escapes the following byte.
//
// It returns an error when marker is absent or when no balancing closing
// brace is found.
func extractJSONAfterMarker(raw string, marker string) (string, error) {
	markerAt := strings.Index(raw, marker)
	if markerAt < 0 {
		return "", fmt.Errorf("marker %q not found", marker)
	}
	begin := markerAt + len(marker)

	depth := 0
	quoted := false   // currently inside a string literal
	skipNext := false // previous byte was a backslash inside a string
	for pos := begin; pos < len(raw); pos++ {
		c := raw[pos]

		if quoted {
			switch {
			case skipNext:
				skipNext = false
			case c == '\\':
				skipNext = true
			case c == '"':
				quoted = false
			}
			continue
		}

		switch c {
		case '"':
			quoted = true
		case '{':
			depth++
		case '}':
			depth--
			if depth == 0 {
				return raw[begin : pos+1], nil
			}
		}
	}

	return "", fmt.Errorf("unable to locate router json boundary")
}
|
|
|
|
// extractMarkdownTableRowsForHeading collects the rows of the markdown table
// that follows the line equal to heading (after trimming whitespace). Each row
// is returned as its trimmed cells. Collection stops at the next line that
// starts with "#" once at least one row has been gathered; if such a line
// appears before any row, the search for heading resumes.
//
// Delimiter rows are not returned. Besides the compact "|---" form this also
// covers spaced and aligned delimiter rows such as "| --- | :---: | ---: |".
//
// An error is returned when no table row is found for heading.
func extractMarkdownTableRowsForHeading(markdown string, heading string) ([][]string, error) {
	var rows [][]string
	capturing := false

	for _, line := range strings.Split(markdown, "\n") {
		trimmed := strings.TrimSpace(line)

		switch {
		case trimmed == heading:
			capturing = true
		case capturing && strings.HasPrefix(trimmed, "#"):
			if len(rows) > 0 {
				return rows, nil
			}
			capturing = false
		}

		if !capturing || !strings.HasPrefix(trimmed, "|") || strings.Contains(trimmed, "|---") {
			continue
		}

		cells := strings.Split(strings.Trim(trimmed, "|"), "|")
		delimiterRow := true
		for i := range cells {
			cells[i] = strings.TrimSpace(cells[i])
			if !isMarkdownDelimiterCell(cells[i]) {
				delimiterRow = false
			}
		}
		if delimiterRow {
			continue
		}
		rows = append(rows, cells)
	}

	if len(rows) == 0 {
		return nil, fmt.Errorf("missing markdown table for heading %s", heading)
	}
	return rows, nil
}

// isMarkdownDelimiterCell reports whether cell is a table delimiter cell:
// three or more hyphens with an optional leading and/or trailing colon.
// Requiring three hyphens keeps a lone "-" placeholder value a data cell.
func isMarkdownDelimiterCell(cell string) bool {
	core := strings.TrimSuffix(strings.TrimPrefix(cell, ":"), ":")
	return len(core) >= 3 && strings.Trim(core, "-") == ""
}
|
|
|
|
// Patterns used by cleanBytedanceArkCell, compiled once at package scope
// instead of on every call.
var (
	bytedanceArkTagPattern        = regexp.MustCompile(`(?is)<[^>]+>`)
	bytedanceArkWhitespacePattern = regexp.MustCompile(`\s+`)
)

// cleanBytedanceArkCell normalizes a markdown table cell to plain text: HTML
// entities are unescaped, backslashes are removed, HTML tags (including <br>
// variants) are replaced by a space, runs of whitespace collapse to a single
// space, and the result is trimmed.
func cleanBytedanceArkCell(raw string) string {
	cleaned := html.UnescapeString(strings.TrimSpace(raw))
	// Dropping every backslash also turns the markdown escape `\-` into "-".
	cleaned = strings.ReplaceAll(cleaned, `\`, "")
	// Every tag becomes a space; adjacent spaces are merged right after, so
	// consecutive <br> tags still produce a single separator.
	cleaned = bytedanceArkTagPattern.ReplaceAllString(cleaned, " ")
	cleaned = bytedanceArkWhitespacePattern.ReplaceAllString(cleaned, " ")
	return strings.TrimSpace(cleaned)
}
|
|
|
|
func bytedanceArkPriceValue(raw string) float64 {
|
|
cleaned := cleanBytedanceArkCell(raw)
|
|
if cleaned == "" || strings.Contains(cleaned, "不支持") {
|
|
return 0
|
|
}
|
|
match := regexp.MustCompile(`([0-9]+(?:\.[0-9]+)?)`).FindStringSubmatch(cleaned)
|
|
if len(match) != 2 {
|
|
return 0
|
|
}
|
|
return mustParseSubscriptionPrice(match[1])
|
|
}
|
|
|
|
// isBytedanceArkConditionRow reports whether value, after trimming and
// lower-casing, is empty or begins with "输入长度" (input length), i.e. the
// cell is not a model name.
func isBytedanceArkConditionRow(value string) bool {
	normalized := strings.ToLower(strings.TrimSpace(value))
	if normalized == "" {
		return true
	}
	return strings.HasPrefix(normalized, "输入长度")
}
|
|
|
|
// bytedanceArkProviderName maps a model name to its provider by
// case-insensitive prefix: "deepseek" gives "DeepSeek", "glm" gives
// "Zhipu AI", and anything else gives "ByteDance".
func bytedanceArkProviderName(modelName string) string {
	key := strings.ToLower(strings.TrimSpace(modelName))

	prefixes := [...]struct{ prefix, provider string }{
		{"deepseek", "DeepSeek"},
		{"glm", "Zhipu AI"},
	}
	for _, candidate := range prefixes {
		if strings.HasPrefix(key, candidate.prefix) {
			return candidate.provider
		}
	}
	return "ByteDance"
}
|