forked from niuniu/llm-intelligence
chore: prepare repository for publishing
This commit is contained in:
331
scripts/tencent_catalog_lib.go
Normal file
331
scripts/tencent_catalog_lib.go
Normal file
@@ -0,0 +1,331 @@
|
||||
//go:build llm_script
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// defaultTencentCatalogURL is the default address of the Tencent Cloud
// documentation page that holds the plan/model catalog. It is not referenced
// in this part of the file; callers presumably fall back to it when no URL is
// supplied (TODO confirm against the command entry point).
const defaultTencentCatalogURL = "https://cloud.tencent.com/document/product/1823/130060"

// defaultTencentCatalogTimeout is the default time budget (20 seconds) for
// fetching the catalog. It is a variable rather than a constant, so other code
// in the package is able to reassign it.
var defaultTencentCatalogTimeout = 20 * time.Second
|
||||
|
||||
// fetchTencentCatalogConfig carries the settings for one catalog fetch.
type fetchTencentCatalogConfig struct {
	// URL is the page requested over HTTP when no fixture is configured.
	URL string

	// DryRun is not consulted by the fetching/parsing helpers in this part of
	// the file; presumably the caller uses it to skip persisting results
	// (TODO confirm).
	DryRun bool

	// Timeout is the time budget for the fetch.
	// NOTE(review): the code visible here never reads it — presumably the
	// caller applies it to the *http.Client it passes in; confirm.
	Timeout time.Duration

	// Fixture, when not blank, is the path of a local file whose contents are
	// used instead of performing an HTTP request.
	Fixture string
}
|
||||
|
||||
type tencentCatalog struct {
|
||||
UpdatedAt string
|
||||
Plans []tencentPlan
|
||||
Models []tencentModel
|
||||
}
|
||||
|
||||
// tencentPlan describes one subscription plan row taken from a
// "### 套餐详情" section of the catalog.
type tencentPlan struct {
	// Series is the plan series heading the row was found under.
	Series string

	// Tier is the parenthesised label that follows the plan name, stored
	// without the surrounding parentheses.
	Tier string

	// Quota is the raw line that mentions "Tokens".
	Quota string

	// Price is the raw line that mentions "元/月".
	Price string

	// BillingCycle is the raw line that mentions "订阅月".
	BillingCycle string

	// Scene is the optional free-text line after the price; empty when the
	// row has none.
	Scene string
}
|
||||
|
||||
// tencentModel describes one model entry taken from a "### 可用模型"
// section of the catalog.
type tencentModel struct {
	// Series is the plan series heading the model was listed under.
	Series string

	// Name is the display name: the line directly above the model ID.
	Name string

	// ModelID is the lower-case identifier line of the model.
	ModelID string

	// ContextLength is the context size derived from the notes, or 0 when the
	// notes do not state one.
	ContextLength int

	// Notes holds the free-text lines that follow the model ID.
	Notes []string
}
|
||||
|
||||
func fetchTencentCatalogContent(cfg fetchTencentCatalogConfig, client *http.Client) (string, error) {
|
||||
if strings.TrimSpace(cfg.Fixture) != "" {
|
||||
data, err := os.ReadFile(cfg.Fixture)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(data), nil
|
||||
}
|
||||
|
||||
req, err := http.NewRequest(http.MethodGet, cfg.URL, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
req.Header.Set("User-Agent", "llm-intelligence/tencent-catalog-fetcher")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("unexpected status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(body), nil
|
||||
}
|
||||
|
||||
func parseTencentCatalog(raw string) (tencentCatalog, error) {
|
||||
lines := normalizeTencentCatalogLines(raw)
|
||||
|
||||
var catalog tencentCatalog
|
||||
var currentSeries string
|
||||
var currentMode string
|
||||
|
||||
for i := 0; i < len(lines); i++ {
|
||||
line := lines[i]
|
||||
|
||||
if catalog.UpdatedAt == "" {
|
||||
if updatedAt := extractUpdatedAt(line); updatedAt != "" {
|
||||
catalog.UpdatedAt = updatedAt
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if series := extractSeriesHeading(line); series != "" {
|
||||
currentSeries = series
|
||||
currentMode = ""
|
||||
continue
|
||||
}
|
||||
|
||||
switch line {
|
||||
case "### 套餐详情":
|
||||
currentMode = "plans"
|
||||
continue
|
||||
case "### 可用模型":
|
||||
currentMode = "models"
|
||||
continue
|
||||
}
|
||||
|
||||
switch currentMode {
|
||||
case "plans":
|
||||
plan, nextIndex, ok := tryParseTencentPlan(lines, i, currentSeries)
|
||||
if ok {
|
||||
catalog.Plans = append(catalog.Plans, plan)
|
||||
i = nextIndex
|
||||
}
|
||||
case "models":
|
||||
model, nextIndex, ok := tryParseTencentModel(lines, i, currentSeries)
|
||||
if ok {
|
||||
catalog.Models = append(catalog.Models, model)
|
||||
i = nextIndex
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if catalog.UpdatedAt == "" {
|
||||
return tencentCatalog{}, fmt.Errorf("catalog updated_at not found")
|
||||
}
|
||||
if len(catalog.Plans) == 0 {
|
||||
return tencentCatalog{}, fmt.Errorf("catalog plans not found")
|
||||
}
|
||||
if len(catalog.Models) == 0 {
|
||||
return tencentCatalog{}, fmt.Errorf("catalog models not found")
|
||||
}
|
||||
return catalog, nil
|
||||
}
|
||||
|
||||
var (
	// tencentLineBreakTags rewrites, in one pass, every tag that ends a
	// visual line or block into a newline.
	tencentLineBreakTags = strings.NewReplacer(
		"<br>", "\n",
		"<br/>", "\n",
		"<br />", "\n",
		"</p>", "\n",
		"</div>", "\n",
		"</li>", "\n",
		"</tr>", "\n",
		"</td>", "\n",
		"</h1>", "\n",
		"</h2>", "\n",
		"</h3>", "\n",
		"</h4>", "\n",
		"</pre>", "\n",
		"</main>", "\n",
	)

	// tencentHTMLTagPattern matches any remaining tag so it can be dropped.
	// It is compiled once at package scope instead of on every call.
	tencentHTMLTagPattern = regexp.MustCompile(`<[^>]+>`)
)

// normalizeTencentCatalogLines converts raw catalog text (HTML or plain text)
// into a list of trimmed, non-empty lines.
//
// HTML entities are unescaped first, so escaped markup in the input is treated
// like real markup. Line-ending tags become newlines, all other tags are
// removed, and the text is then split on "\n", "\r\n" and "\r". The result is
// never nil.
func normalizeTencentCatalogLines(raw string) []string {
	text := html.UnescapeString(raw)
	text = tencentLineBreakTags.Replace(text)
	text = tencentHTMLTagPattern.ReplaceAllString(text, "")

	// Splitting on either CR or LF covers all three line-ending styles; the
	// empty piece between the two bytes of a CRLF is discarded by FieldsFunc.
	segments := strings.FieldsFunc(text, func(r rune) bool {
		return r == '\n' || r == '\r'
	})

	lines := make([]string, 0, len(segments))
	for _, segment := range segments {
		if line := strings.TrimSpace(segment); line != "" {
			lines = append(lines, line)
		}
	}
	return lines
}
|
||||
|
||||
// extractUpdatedAt returns the text that follows the "最近更新时间" label on
// line, with surrounding whitespace removed. It returns "" when line does not
// start with the label followed by a colon.
//
// Both the ASCII colon and the fullwidth colon (U+FF1A), which Chinese
// documentation commonly uses, are accepted after the label.
func extractUpdatedAt(line string) string {
	const label = "最近更新时间"

	for _, prefix := range [...]string{label + ":", label + "\uff1a"} {
		if strings.HasPrefix(line, prefix) {
			return strings.TrimSpace(line[len(prefix):])
		}
	}
	return ""
}
|
||||
|
||||
// extractSeriesHeading recognises a level-two heading ("## ...") that names a
// plan series and returns the series name, or "" for any other line.
//
// A heading counts as a series only when its title contains "Token Plan" or
// "Coding Plan". A trailing "套餐" is removed from the returned name.
func extractSeriesHeading(line string) string {
	const marker = "## "
	if !strings.HasPrefix(line, marker) {
		return ""
	}

	title := strings.TrimSpace(line[len(marker):])
	for _, keyword := range [...]string{"Token Plan", "Coding Plan"} {
		if strings.Contains(title, keyword) {
			return strings.TrimSpace(strings.TrimSuffix(title, "套餐"))
		}
	}
	return ""
}
|
||||
|
||||
func tryParseTencentPlan(lines []string, start int, series string) (tencentPlan, int, bool) {
|
||||
if start+4 >= len(lines) {
|
||||
return tencentPlan{}, start, false
|
||||
}
|
||||
if !isTencentPlanName(lines[start]) {
|
||||
return tencentPlan{}, start, false
|
||||
}
|
||||
if !isTencentPlanTier(lines[start+1]) {
|
||||
return tencentPlan{}, start, false
|
||||
}
|
||||
if !strings.Contains(lines[start+2], "订阅月") {
|
||||
return tencentPlan{}, start, false
|
||||
}
|
||||
if !strings.Contains(lines[start+3], "Tokens") {
|
||||
return tencentPlan{}, start, false
|
||||
}
|
||||
if !strings.Contains(lines[start+4], "元/月") {
|
||||
return tencentPlan{}, start, false
|
||||
}
|
||||
|
||||
plan := tencentPlan{
|
||||
Series: series,
|
||||
Tier: strings.Trim(lines[start+1], "()() "),
|
||||
BillingCycle: lines[start+2],
|
||||
Quota: lines[start+3],
|
||||
Price: lines[start+4],
|
||||
}
|
||||
|
||||
nextIndex := start + 4
|
||||
if start+5 < len(lines) && !strings.HasPrefix(lines[start+5], "### ") && !isTencentPlanName(lines[start+5]) {
|
||||
plan.Scene = lines[start+5]
|
||||
nextIndex = start + 5
|
||||
}
|
||||
return plan, nextIndex, true
|
||||
}
|
||||
|
||||
func tryParseTencentModel(lines []string, start int, series string) (tencentModel, int, bool) {
|
||||
if start+1 >= len(lines) {
|
||||
return tencentModel{}, start, false
|
||||
}
|
||||
if !isTencentModelID(lines[start+1]) {
|
||||
return tencentModel{}, start, false
|
||||
}
|
||||
if isReservedTencentLine(lines[start]) {
|
||||
return tencentModel{}, start, false
|
||||
}
|
||||
|
||||
model := tencentModel{
|
||||
Series: series,
|
||||
Name: lines[start],
|
||||
ModelID: lines[start+1],
|
||||
}
|
||||
|
||||
notes := make([]string, 0, 4)
|
||||
nextIndex := start + 1
|
||||
for i := start + 2; i < len(lines); i++ {
|
||||
line := lines[i]
|
||||
if strings.HasPrefix(line, "## ") || strings.HasPrefix(line, "### ") {
|
||||
break
|
||||
}
|
||||
if isTencentPlanName(line) && i+1 < len(lines) && isTencentPlanTier(lines[i+1]) {
|
||||
break
|
||||
}
|
||||
if i+1 < len(lines) && isTencentModelID(lines[i+1]) && !isReservedTencentLine(line) {
|
||||
break
|
||||
}
|
||||
notes = append(notes, line)
|
||||
nextIndex = i
|
||||
}
|
||||
|
||||
model.Notes = notes
|
||||
model.ContextLength = extractContextLength(strings.Join(notes, " "))
|
||||
return model, nextIndex, true
|
||||
}
|
||||
|
||||
// isTencentPlanName reports whether line is exactly one of the four known
// plan names: 体验套餐, 基础套餐, 进阶套餐 or 专业套餐.
func isTencentPlanName(line string) bool {
	for _, name := range [...]string{"体验套餐", "基础套餐", "进阶套餐", "专业套餐"} {
		if line == name {
			return true
		}
	}
	return false
}
|
||||
|
||||
// isTencentPlanTier reports whether line looks like a parenthesised tier
// label: it must begin with an opening parenthesis and end with a closing
// one. Both ASCII parentheses and their fullwidth forms (U+FF08 / U+FF09),
// which Chinese text commonly uses, are accepted.
func isTencentPlanTier(line string) bool {
	opens := strings.HasPrefix(line, "(") || strings.HasPrefix(line, "\uff08")
	closes := strings.HasSuffix(line, ")") || strings.HasSuffix(line, "\uff09")
	return opens && closes
}
|
||||
|
||||
// isReservedTencentLine reports whether line is structural text that can
// never be a model name: any line starting with '#', or one of the fixed
// section titles "Token Plan 个人版套餐概览", "套餐详情" and "可用模型".
func isReservedTencentLine(line string) bool {
	if len(line) > 0 && line[0] == '#' {
		return true
	}
	return line == "Token Plan 个人版套餐概览" ||
		line == "套餐详情" ||
		line == "可用模型"
}
|
||||
|
||||
// tencentModelIDPattern describes a model identifier: a lower-case letter or
// digit followed by any number of lower-case letters, digits, '.', '_' or
// '-'. It is compiled once because the check runs for many lines per parse.
var tencentModelIDPattern = regexp.MustCompile(`^[a-z0-9][a-z0-9._-]*$`)

// isTencentModelID reports whether the whole of line is a model identifier.
func isTencentModelID(line string) bool {
	return tencentModelIDPattern.MatchString(line)
}
|
||||
|
||||
// tencentContextLengthPattern finds a size such as "128K 上下文" or
// "1m上下文": digits, a K or M unit in either case, then the word "上下文".
// It is compiled once at package scope instead of on every call.
var tencentContextLengthPattern = regexp.MustCompile(`(?i)(\d+)\s*([KM])\s*上下文`)

// extractContextLength returns the context size mentioned in text, using
// K = 1024 and M = 1024*1024. Only the first mention is used. It returns 0
// when text has no such mention or the number cannot be parsed (for example
// because it overflows).
func extractContextLength(text string) int {
	match := tencentContextLengthPattern.FindStringSubmatch(text)
	if len(match) != 3 {
		return 0
	}

	var value int
	if _, err := fmt.Sscanf(match[1], "%d", &value); err != nil {
		// The digits did not fit into an int; treat the size as unknown.
		return 0
	}

	switch strings.ToUpper(match[2]) {
	case "K":
		return value * 1024
	case "M":
		return value * 1024 * 1024
	}
	// Reached only for characters that case-fold to K or M without being
	// that letter; the size is treated as unknown.
	return 0
}
|
||||
|
||||
func formatSeriesSummary(plans []tencentPlan) string {
|
||||
counts := make(map[string]int)
|
||||
for _, plan := range plans {
|
||||
counts[plan.Series]++
|
||||
}
|
||||
|
||||
series := make([]string, 0, len(counts))
|
||||
for name := range counts {
|
||||
series = append(series, name)
|
||||
}
|
||||
sort.Strings(series)
|
||||
|
||||
parts := make([]string, 0, len(series))
|
||||
for _, name := range series {
|
||||
parts = append(parts, fmt.Sprintf("%s:%d", name, counts[name]))
|
||||
}
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
Reference in New Issue
Block a user