Files
llm-intelligence/scripts/import_qwen_pricing.go

179 lines
5.3 KiB
Go
Raw Normal View History

//go:build llm_script
package main
import (
"database/sql"
"flag"
"fmt"
"html"
"io"
"net/http"
"os"
"regexp"
"strings"
"time"
)
// defaultQwenPricingURL is the default value of the -url flag and is also the
// source URL recorded on every parsed pricing record.
const defaultQwenPricingURL = "https://help.aliyun.com/zh/model-studio/model-pricing"
// qwenModelLinePattern matches a whole line that is nothing but a model
// identifier: the prefix "qwen", "qwq", or "qvq" followed by at least one
// lowercase letter, digit, dot, or hyphen. Callers lowercase and trim the line
// before matching.
var qwenModelLinePattern = regexp.MustCompile(`^(qwen[0-9a-z.-]+|qwq[0-9a-z.-]+|qvq[0-9a-z.-]+)$`)
// qwenPricingImportConfig carries the command-line options for one run of the
// Qwen pricing import.
type qwenPricingImportConfig struct {
	// URL is the pricing page address handed to the page fetcher (-url flag).
	URL string

	// Fixture is the sample pricing file handed to the page fetcher (-fixture flag).
	// NOTE(review): presumably read instead of URL when non-empty — confirm in fetchRawPricingPage.
	Fixture string

	// DryRun makes the import parse and print a summary only, without touching
	// the database (-dry-run flag).
	DryRun bool

	// Timeout is the overall timeout applied to the HTTP client (-timeout flag, in seconds).
	Timeout time.Duration
}
// main parses the command-line flags, opens the database unless -dry-run is
// set, and runs the Qwen pricing import. Any failure is reported on stderr and
// the process exits with status 1.
func main() {
	loadSubscriptionImportEnv()

	var (
		url            string
		fixture        string
		dryRun         bool
		timeoutSeconds int
	)
	flag.StringVar(&url, "url", defaultQwenPricingURL, "通义千问官方模型价格页")
	flag.StringVar(&fixture, "fixture", "", "通义千问价格样例文件")
	flag.BoolVar(&dryRun, "dry-run", false, "仅解析并打印摘要,不写入数据库")
	flag.IntVar(&timeoutSeconds, "timeout", 20, "请求超时(秒)")
	flag.Parse()

	cfg := qwenPricingImportConfig{
		URL:     url,
		Fixture: fixture,
		DryRun:  dryRun,
		Timeout: time.Duration(timeoutSeconds) * time.Second,
	}

	// The database is only needed when results are actually written.
	var db *sql.DB
	if !cfg.DryRun {
		var err error
		db, err = subscriptionImportDB()
		if err != nil {
			fmt.Fprintf(os.Stderr, "open db: %v\n", err)
			os.Exit(1)
		}
		defer db.Close()
	}

	if err := runQwenPricingImport(cfg, db, os.Stdout); err != nil {
		fmt.Fprintf(os.Stderr, "import_qwen_pricing: %v\n", err)
		// os.Exit does not run deferred calls, so release the database
		// handle explicitly before terminating.
		if db != nil {
			_ = db.Close()
		}
		os.Exit(1)
	}
}
// runQwenPricingImport fetches the pricing page (or fixture), parses it into
// pricing records, de-duplicates them, and then either prints a dry-run
// summary or upserts the records and prints a summary that includes the
// current row count of region_pricing.
//
// cfg supplies the source location, timeout, and dry-run switch; db may be nil
// only when cfg.DryRun is true; the one-line summary is written to out.
// It returns the first error from fetching, parsing, the database, or writing
// the summary.
func runQwenPricingImport(cfg qwenPricingImportConfig, db *sql.DB, out io.Writer) error {
	client := &http.Client{Timeout: cfg.Timeout}
	raw, err := fetchRawPricingPage(cfg.URL, cfg.Fixture, client)
	if err != nil {
		return err
	}
	records, err := parseQwenPricingCatalog(raw)
	if err != nil {
		return err
	}
	records = dedupeOfficialPricingRecords(records)
	// Both summaries below read records[0]; fail cleanly instead of panicking
	// if de-duplication ever leaves nothing behind.
	if len(records) == 0 {
		return fmt.Errorf("no qwen pricing records to import")
	}
	if cfg.DryRun {
		_, err = fmt.Fprintf(out, "source=qwen-pricing-import models=%d operator=%s dry_run=true\n", len(records), records[0].OperatorName)
		return err
	}
	if db == nil {
		return fmt.Errorf("db is required when dry-run=false")
	}
	if err := upsertOfficialPricingRecords(db, records, "qwen-pricing-import"); err != nil {
		return err
	}
	var tableRows int
	if err := db.QueryRow(`SELECT COUNT(*) FROM region_pricing`).Scan(&tableRows); err != nil {
		return fmt.Errorf("count region_pricing: %w", err)
	}
	_, err = fmt.Fprintf(out, "source=qwen-pricing-import models=%d operator=%s table_rows=%d dry_run=false\n", len(records), records[0].OperatorName, tableRows)
	return err
}
// parseQwenPricingCatalog turns the raw pricing page content into pricing
// records.
//
// The content is first flattened into text lines. Every line that is a model
// identifier starts a candidate entry; up to 13 following lines (stopping early
// at the next model identifier) are searched for prices. Entries with fewer
// than two prices are dropped; otherwise the first price becomes the input
// price and the second the output price. An error is returned when no entry
// yields a record.
func parseQwenPricingCatalog(raw string) ([]officialPricingRecord, error) {
	// A model line at index i owns lines i+1 .. i+lookahead-1 at most.
	const lookahead = 14

	lines := qwenPricingLines(raw)
	var records []officialPricingRecord
	for idx, line := range lines {
		name := strings.ToLower(strings.TrimSpace(line))
		if !qwenModelLinePattern.MatchString(name) {
			continue
		}

		// Clamp the look-ahead window to the end of the input.
		limit := idx + lookahead
		if limit > len(lines) {
			limit = len(lines)
		}
		following := make([]string, 0, 12)
		for _, candidate := range lines[idx+1 : limit] {
			// The next model identifier ends this model's block.
			if qwenModelLinePattern.MatchString(strings.ToLower(strings.TrimSpace(candidate))) {
				break
			}
			following = append(following, candidate)
		}

		prices := qwenBlockPrices(following)
		if len(prices) < 2 {
			continue
		}

		nameCn, country, website := providerMetadata("Qwen")
		records = append(records, officialPricingRecord{
			ModelID:         normalizeExternalID("qwen", name),
			ModelName:       name,
			ProviderName:    "Qwen",
			ProviderNameCn:  nameCn,
			ProviderCountry: country,
			ProviderWebsite: website,
			OperatorName:    "DashScope",
			OperatorNameCn:  "通义千问 API",
			OperatorCountry: "CN",
			OperatorWebsite: "https://help.aliyun.com/zh/model-studio/model-pricing",
			OperatorType:    "official",
			Region:          "CN",
			Currency:        "CNY",
			InputPrice:      prices[0],
			OutputPrice:     prices[1],
			SourceURL:       defaultQwenPricingURL,
			ModelSourceURL:  defaultQwenPricingURL,
			DateConfidence:  "unknown",
			DateSourceKind:  "official_pricing",
			Modality:        detectModality(name),
		})
	}

	if len(records) == 0 {
		return nil, fmt.Errorf("unexpected qwen pricing content")
	}
	return records, nil
}
// Patterns used by qwenPricingLines, compiled once at package scope rather
// than on every call (or, for the whitespace pattern, on every line).
var (
	// qwenHTMLTagPattern matches any single HTML tag.
	qwenHTMLTagPattern = regexp.MustCompile(`(?is)<[^>]+>`)
	// qwenWhitespaceRunPattern matches a run of ASCII whitespace.
	qwenWhitespaceRunPattern = regexp.MustCompile(`\s+`)
)

// qwenPricingLines flattens raw page content into trimmed, non-empty text lines.
//
// It decodes the literal escape sequences \u003c, \u003e, \n and \t, unescapes
// HTML entities, turns <br> variants and the closing tags of block-level
// elements into line breaks, replaces every remaining tag with a space, and
// collapses whitespace runs inside each line. The result is an empty slice
// when the input contains no text.
func qwenPricingLines(raw string) []string {
	// The input may carry markup as escaped text (backslash sequences),
	// so decode those sequences before treating it as HTML.
	raw = strings.ReplaceAll(raw, `\u003c`, "<")
	raw = strings.ReplaceAll(raw, `\u003e`, ">")
	raw = strings.ReplaceAll(raw, `\n`, "\n")
	raw = strings.ReplaceAll(raw, `\t`, " ")
	raw = html.UnescapeString(raw)

	// Element boundaries become line breaks so each cell/paragraph is its own line.
	replacer := strings.NewReplacer(
		"<br>", "\n", "<br/>", "\n", "<br />", "\n",
		"</p>", "\n", "</div>", "\n", "</section>", "\n", "</tr>", "\n",
		"</td>", "\n", "</th>", "\n", "</li>", "\n", "</h1>", "\n",
		"</h2>", "\n", "</h3>", "\n", "</h4>", "\n", "</h5>", "\n", "</h6>", "\n",
	)
	withBreaks := replacer.Replace(raw)
	withBreaks = qwenHTMLTagPattern.ReplaceAllString(withBreaks, " ")

	parts := strings.Split(withBreaks, "\n")
	lines := make([]string, 0, len(parts))
	for _, part := range parts {
		line := strings.TrimSpace(qwenWhitespaceRunPattern.ReplaceAllString(part, " "))
		if line != "" {
			lines = append(lines, line)
		}
	}
	return lines
}
// qwenPricePattern matches a whole line of the form "<number> 元": an integer
// or decimal number, one space, then the yuan character. The number is
// captured as group 1. It is compiled once at package scope because
// qwenBlockPrices runs once per model entry.
var qwenPricePattern = regexp.MustCompile(`^([0-9]+(?:\.[0-9]+)?) 元$`)

// qwenBlockPrices returns, in order of appearance, the prices found in lines.
// A line counts as a price only if, after trimming surrounding whitespace, it
// matches qwenPricePattern in full; all other lines are ignored. The result is
// an empty slice when no line matches.
func qwenBlockPrices(lines []string) []float64 {
	prices := make([]float64, 0, 4)
	for _, line := range lines {
		match := qwenPricePattern.FindStringSubmatch(strings.TrimSpace(line))
		if match == nil {
			continue
		}
		prices = append(prices, mustParseSubscriptionPrice(match[1]))
	}
	return prices
}