// Source file: llm-intelligence/scripts/verify_intraday_news_candidates.go
// Snapshot taken 2026-05-29 18:48:48 +08:00 (502 lines, 16 KiB, Go).

//go:build llm_script && !scripts_pkg
package main
import (
"context"
"database/sql"
"encoding/json"
"flag"
"fmt"
"io"
"net/http"
"net/url"
"os"
"regexp"
"strings"
"time"
_ "github.com/lib/pq"
)
// verificationCandidateRow holds one row of the intraday_news_candidate table
// as read by loadIntradayVerificationCandidates.
type verificationCandidateRow struct {
// ID is the row's id column; verification results refer back to it.
ID int64
// CandidateDate is the candidate_date column rendered as text by the query.
CandidateDate string
// EventType is the event_type column; verifyCandidateDocument branches on its
// normalized form.
EventType string
// ProviderName is the provider_name column, used to decide whether a URL is an
// official provider host.
ProviderName string
// ModelName is the model_name column ("" when NULL).
ModelName string
// ProviderCountry is the provider_country column ("" when NULL).
ProviderCountry string
// Title is the title column; together with ModelName it is searched for in
// fetched documents.
Title string
// Summary is the summary column ("" when NULL).
Summary string
// CandidateURLs is the candidate_urls column decoded from its JSON text form;
// each URL is fetched during verification.
CandidateURLs []string
// Status is the status column; the loader only selects 'candidate' and
// 'verifying' rows.
Status string
// VerificationConfidence is the verification_confidence column and is carried
// over into results that do not confirm anything.
VerificationConfidence string
}
// intradayVerificationConfig carries the settings for one verification run.
type intradayVerificationConfig struct {
// Date selects the candidate_date to verify; it is set from the -date flag,
// whose help text asks for YYYY-MM-DD.
Date string
// DryRun, set from the -dry-run flag, skips the database writes; the summary
// is still printed.
DryRun bool
// DatabaseURL is the data source name handed to sql.Open("postgres", ...).
DatabaseURL string
// Timeout is used as the http.Client timeout when fetching candidate URLs.
Timeout time.Duration
}
// intradayVerificationResult is the outcome of verifying one candidate. It is
// written to intraday_news_verification and copied back onto the candidate row
// by persistIntradayVerificationResults.
type intradayVerificationResult struct {
// CandidateID is the id of the intraday_news_candidate row this result is for.
CandidateID int64
// CandidateStatus becomes the candidate's new status; this file produces
// "candidate" and "verified" (verificationScore also recognizes "rejected").
CandidateStatus string
// VerificationConfidence becomes the candidate's new verification_confidence,
// e.g. "candidate", "secondary_confirmed" or "official_confirmed".
VerificationConfidence string
// VerifierSource is the source class returned by classifyVerificationSource
// (stored as NULL when empty).
VerifierSource string
// VerifierURL is the URL the evidence was fetched from (stored as NULL when
// empty).
VerifierURL string
// VerifierStatus is the verifier outcome; this file produces "insufficient",
// "matched" and "error" (verificationScore also recognizes "contradicted").
VerifierStatus string
// ExtractedFacts is stored as jsonb in the extracted_facts column.
ExtractedFacts map[string]any
// Notes is a human-readable explanation, stored on both the verification row
// and the candidate row (NULL when empty).
Notes string
}
// intradayVerificationSummary is the per-run tally printed as a single JSON
// object by printIntradayVerificationSummary.
type intradayVerificationSummary struct {
// CandidateTotal is the number of results produced in the run.
CandidateTotal int `json:"candidate_total"`
// VerifiedTotal counts results whose CandidateStatus is "verified".
VerifiedTotal int `json:"verified_total"`
// OfficialConfirmedTotal counts results with confidence "official_confirmed".
OfficialConfirmedTotal int `json:"official_confirmed_total"`
// SecondaryConfirmedTotal counts results with confidence "secondary_confirmed".
SecondaryConfirmedTotal int `json:"secondary_confirmed_total"`
// RejectedTotal counts results whose CandidateStatus is "rejected".
RejectedTotal int `json:"rejected_total"`
// DryRun echoes whether the run skipped database writes.
DryRun bool `json:"dry_run"`
}
// main calls loadIntradayEnv, parses the -date and -dry-run flags, fills in the
// DSN and HTTP timeout from their helper functions and runs one verification
// pass. A failure is reported on stderr and the process exits with status 1.
func main() {
	loadIntradayEnv()

	date := flag.String("date", intradayDateValue(), "验证日期,格式 YYYY-MM-DD")
	dryRun := flag.Bool("dry-run", false, "仅输出摘要,不写数据库")
	flag.Parse()

	// The DSN and timeout helpers are evaluated after flag parsing, in this order.
	cfg := intradayVerificationConfig{
		Date:        *date,
		DryRun:      *dryRun,
		DatabaseURL: intradayDefaultDSN(),
		Timeout:     discoveryTimeoutFromEnv(),
	}

	runErr := runIntradayCandidateVerification(cfg)
	if runErr == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "verify_intraday_news_candidates: %v\n", runErr)
	os.Exit(1)
}
// runIntradayCandidateVerification loads the candidates for cfg.Date, verifies
// each one, persists the results unless cfg.DryRun is set, and prints a JSON
// summary.
//
// A dry run still opens the database and fetches candidate URLs; only the
// writes are skipped. It returns an error when the date is blank or when
// opening the database, loading, persisting or printing fails.
func runIntradayCandidateVerification(cfg intradayVerificationConfig) error {
	if len(strings.TrimSpace(cfg.Date)) == 0 {
		return fmt.Errorf("date 未设置")
	}

	db, openErr := sql.Open("postgres", cfg.DatabaseURL)
	if openErr != nil {
		return fmt.Errorf("open db: %w", openErr)
	}
	defer db.Close()

	ctx := context.Background()
	candidates, loadErr := loadIntradayVerificationCandidates(ctx, db, cfg.Date)
	if loadErr != nil {
		return loadErr
	}

	results := make([]intradayVerificationResult, len(candidates))
	for i := range candidates {
		row := candidates[i]
		outcome, verifyErr := verifyIntradayCandidate(row, cfg.Timeout)
		if verifyErr != nil {
			// A verifier failure does not abort the run: the candidate keeps its
			// status and the failure is recorded in the notes.
			outcome = intradayVerificationResult{
				CandidateID:            row.ID,
				CandidateStatus:        "candidate",
				VerificationConfidence: row.VerificationConfidence,
				VerifierStatus:         "error",
				Notes:                  verifyErr.Error(),
			}
		}
		results[i] = outcome
	}

	if !cfg.DryRun {
		persistErr := persistIntradayVerificationResults(ctx, db, results)
		if persistErr != nil {
			return persistErr
		}
	}

	summary := summarizeIntradayVerification(results, cfg.DryRun)
	return printIntradayVerificationSummary(summary)
}
// loadIntradayVerificationCandidates reads the intraday_news_candidate rows for
// the given date whose status is 'candidate' or 'verifying', newest first
// (discovered_at DESC, id DESC).
//
// NULL model_name, provider_country and summary are read as "", and a NULL
// candidate_urls is read as an empty JSON array. On any query, scan, decode or
// iteration error the returned slice is nil and the error is wrapped with the
// step that failed.
func loadIntradayVerificationCandidates(ctx context.Context, db *sql.DB, date string) ([]verificationCandidateRow, error) {
	rows, err := db.QueryContext(ctx, `
SELECT id, candidate_date::text, event_type, provider_name, COALESCE(model_name, ''), COALESCE(provider_country, ''),
title, COALESCE(summary, ''), COALESCE(candidate_urls::text, '[]'), status, verification_confidence
FROM intraday_news_candidate
WHERE candidate_date = $1::date
AND status IN ('candidate', 'verifying')
ORDER BY discovered_at DESC, id DESC`, date)
	if err != nil {
		return nil, fmt.Errorf("query intraday candidates: %w", err)
	}
	defer rows.Close()

	var candidates []verificationCandidateRow
	for rows.Next() {
		var row verificationCandidateRow
		var rawURLs string
		if err := rows.Scan(
			&row.ID, &row.CandidateDate, &row.EventType, &row.ProviderName,
			&row.ModelName, &row.ProviderCountry, &row.Title, &row.Summary,
			&rawURLs, &row.Status, &row.VerificationConfidence,
		); err != nil {
			return nil, fmt.Errorf("scan intraday candidate: %w", err)
		}
		// candidate_urls arrives as JSON text and is decoded into a string slice.
		if err := json.Unmarshal([]byte(rawURLs), &row.CandidateURLs); err != nil {
			return nil, fmt.Errorf("unmarshal candidate urls: %w", err)
		}
		candidates = append(candidates, row)
	}
	// An error that ends iteration early must not be mistaken for a complete
	// result set, so it is wrapped like the others and the partial slice dropped.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("iterate intraday candidates: %w", err)
	}
	return candidates, nil
}
// verifyIntradayCandidate fetches each of the candidate's URLs in order, checks
// the fetched document against the candidate and returns the highest-ranked
// result according to preferVerificationResult.
//
// It stops early once a result is both "verified" and "official_confirmed".
// With no URLs, or no better evidence, it returns an "insufficient" placeholder
// that keeps the candidate's current confidence. The error return is always
// nil.
//
// NOTE(review): a fetch-error result carries the same status and confidence as
// the placeholder, and the scoring in this file gives "error" and
// "insufficient" the same weight, so fetch failures never replace the
// placeholder. Confirm that dropping them is intended.
func verifyIntradayCandidate(candidate verificationCandidateRow, timeout time.Duration) (intradayVerificationResult, error) {
	httpClient := &http.Client{Timeout: timeout}

	// unresolved builds a result that leaves the candidate unconfirmed.
	unresolved := func(verifierURL, verifierStatus, notes string) intradayVerificationResult {
		return intradayVerificationResult{
			CandidateID:            candidate.ID,
			CandidateStatus:        "candidate",
			VerificationConfidence: candidate.VerificationConfidence,
			VerifierURL:            verifierURL,
			VerifierStatus:         verifierStatus,
			Notes:                  notes,
			ExtractedFacts:         map[string]any{},
		}
	}

	winner := unresolved("", "insufficient", "未找到足够证据")
	for i := range candidate.CandidateURLs {
		link := candidate.CandidateURLs[i]

		document, fetchErr := fetchVerificationDocument(link, httpClient)
		if fetchErr != nil {
			winner = preferVerificationResult(winner, unresolved(link, "error", fetchErr.Error()))
			continue
		}

		outcome := verifyCandidateDocument(candidate, link, document)
		if outcome.CandidateID == 0 {
			outcome.CandidateID = candidate.ID
		}
		winner = preferVerificationResult(winner, outcome)

		// Nothing can outrank an official confirmation, so stop fetching.
		officiallyConfirmed := winner.CandidateStatus == "verified" &&
			winner.VerificationConfidence == "official_confirmed"
		if officiallyConfirmed {
			break
		}
	}
	return winner, nil
}
// fetchVerificationDocument issues a GET for rawURL with the verifier's
// User-Agent and returns the response body as a string.
//
// A non-2xx status is an error whose message includes the status code and a
// short excerpt of the response body. Because the URLs point at arbitrary
// third-party sites, both reads are capped: at most maxDocumentBytes of a
// successful body are returned (longer documents are truncated) and at most
// maxErrorExcerptBytes of an error body are quoted.
func fetchVerificationDocument(rawURL string, client *http.Client) (string, error) {
	const (
		maxDocumentBytes     = 4 << 20 // 4 MiB of page text is ample for keyword matching
		maxErrorExcerptBytes = 1 << 10 // keep error messages short
	)

	req, err := http.NewRequest(http.MethodGet, rawURL, nil)
	if err != nil {
		return "", fmt.Errorf("build verification request: %w", err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; llm-intelligence intraday verifier)")

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("fetch verification document: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// The excerpt is best effort: a read failure only shortens the message,
		// so its error is deliberately ignored.
		excerpt, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorExcerptBytes))
		return "", fmt.Errorf("fetch verification document: unexpected status %d: %s", resp.StatusCode, strings.TrimSpace(string(excerpt)))
	}

	payload, err := io.ReadAll(io.LimitReader(resp.Body, maxDocumentBytes))
	if err != nil {
		return "", fmt.Errorf("read verification document: %w", err)
	}
	return string(payload), nil
}
// verifyCandidateDocument judges one fetched document against a candidate.
//
// The result starts as "insufficient" with the extracted facts attached and is
// upgraded as follows:
//   - non-official source: verified/secondary_confirmed when the body mentions
//     the model name or title;
//   - official source, "official_release": verified/official_confirmed when the
//     model is mentioned;
//   - official source, "promo_campaign": verified/official_confirmed when the
//     model is mentioned and the body contains a promotion marker;
//   - official source, "price_cut"/"price_increase": verified/official_confirmed
//     when deriveVerifiedPriceEvent accepts the extracted prices; otherwise only
//     the note changes when the model is mentioned;
//   - official source, "leak_or_rumor": verified/secondary_confirmed when the
//     model is mentioned.
//
// Any other event type on an official source stays "insufficient".
func verifyCandidateDocument(candidate verificationCandidateRow, candidateURL, body string) intradayVerificationResult {
	source := classifyVerificationSource(candidate.ProviderName, candidateURL)
	facts := extractVerificationFacts(body)
	modelMatched := bodyMentionsModel(body, candidate.ModelName, candidate.Title)

	result := intradayVerificationResult{
		CandidateID:            candidate.ID,
		CandidateStatus:        "candidate",
		VerificationConfidence: "candidate",
		VerifierSource:         source,
		VerifierURL:            candidateURL,
		VerifierStatus:         "insufficient",
		ExtractedFacts:         facts,
		Notes:                  "证据不足",
	}

	// confirm marks the result as a verified match at the given confidence.
	confirm := func(confidence, note string) {
		result.CandidateStatus = "verified"
		result.VerificationConfidence = confidence
		result.VerifierStatus = "matched"
		result.Notes = note
	}

	if !isOfficialVerificationSource(source) {
		// Secondary sources can only ever yield a secondary confirmation.
		if modelMatched {
			confirm("secondary_confirmed", "仅二手来源命中,不能进入正式事实层")
		}
		return result
	}

	switch normalizeIntradayEventType(candidate.EventType) {
	case "official_release":
		if modelMatched {
			confirm("official_confirmed", "官方页面命中模型发布线索")
		}
	case "promo_campaign":
		if modelMatched && bodyMentionsPromo(body) {
			confirm("official_confirmed", "官方页面命中活动窗口或促销语义")
		}
	case "price_cut", "price_increase":
		if priceFacts, ok := deriveVerifiedPriceEvent(candidate.EventType, facts); ok {
			confirm("official_confirmed", "官方价格页命中真实价格变化")
			result.ExtractedFacts = priceFacts
		} else if modelMatched {
			// Still insufficient, but say why: the model is there, the prices are not.
			result.Notes = "命中模型但缺少可计算的价格变化事实"
		}
	case "leak_or_rumor":
		// Even an official page only confirms a rumor at secondary confidence.
		if modelMatched {
			confirm("secondary_confirmed", "保留为待确认情报,不进入正式事实层")
		}
	}
	return result
}
// isOfficialVerificationSource reports whether source is one of the official
// source classes: "official_page", "official_docs", "official_blog" or
// "pricing_page". The comparison is exact and case-sensitive.
func isOfficialVerificationSource(source string) bool {
	return source == "official_page" ||
		source == "official_docs" ||
		source == "official_blog" ||
		source == "pricing_page"
}
// classifyVerificationSource labels rawURL relative to the provider.
//
// URLs that fail to parse, or whose host is not an official host of the
// provider, are "secondary_media". Official hosts are refined by keyword, first
// match wins: "official_docs" (host contains "docs." or path contains "/docs"),
// "pricing_page" (host contains "pricing", or path contains "pricing" or
// "price"), "official_blog" (path contains "blog", "news" or "announcement"),
// otherwise "official_page". Host and path are compared in lower case.
func classifyVerificationSource(providerName, rawURL string) string {
	const fallback = "secondary_media"

	u, parseErr := url.Parse(rawURL)
	if parseErr != nil {
		return fallback
	}
	host, path := strings.ToLower(u.Host), strings.ToLower(u.Path)
	if !isOfficialProviderHost(providerName, host) {
		return fallback
	}

	pathHas := func(fragment string) bool { return strings.Contains(path, fragment) }
	if strings.Contains(host, "docs.") || pathHas("/docs") {
		return "official_docs"
	}
	if strings.Contains(host, "pricing") || pathHas("pricing") || pathHas("price") {
		return "pricing_page"
	}
	if pathHas("blog") || pathHas("news") || pathHas("announcement") {
		return "official_blog"
	}
	return "official_page"
}
// isOfficialProviderHost reports whether host belongs to the provider.
//
// The host is lower-cased and stripped of a trailing ":port" and a trailing dot
// before matching. For providers with known domains the host must equal a
// domain or be a subdomain of it, so look-alikes such as "notopenai.com",
// "openai.com.evil.net" or "max.ai" (for x.ai) are rejected. An empty host or a
// provider without tokens never matches.
func isOfficialProviderHost(providerName, host string) bool {
	host = canonicalVerificationHost(host)
	if host == "" {
		return false
	}
	for _, token := range providerHostTokens(providerName) {
		if hostMatchesProviderToken(host, token) {
			return true
		}
	}
	return false
}

// canonicalVerificationHost lower-cases host and removes a trailing ":port"
// (digits only, possibly empty) and a trailing root dot.
func canonicalVerificationHost(host string) string {
	host = strings.ToLower(strings.TrimSpace(host))
	if i := strings.LastIndexByte(host, ':'); i >= 0 && strings.Trim(host[i+1:], "0123456789") == "" {
		host = host[:i]
	}
	return strings.TrimSuffix(host, ".")
}

// hostMatchesProviderToken matches a canonical host against one token. Tokens
// containing a dot are domains and need an exact or subdomain match.
//
// NOTE(review): bare-name tokens (the fallback for providers without a curated
// domain list) keep the legacy substring heuristic so existing classifications
// do not change; it can still be fooled by hosts that merely contain the name.
// Add explicit domains to providerHostTokens for providers that matter.
func hostMatchesProviderToken(host, token string) bool {
	if token == "" {
		return false
	}
	if !strings.Contains(token, ".") {
		return strings.Contains(host, token)
	}
	return host == token || strings.HasSuffix(host, "."+token)
}

// providerHostTokens returns the host tokens regarded as official for the
// provider. Known providers (matched case-insensitively after trimming) map to
// their domains; any other non-blank name yields the lower-cased name itself as
// the only token, and a blank name yields nil.
func providerHostTokens(providerName string) []string {
	name := strings.ToLower(strings.TrimSpace(providerName))
	switch name {
	case "openai":
		return []string{"openai.com"}
	case "anthropic":
		return []string{"anthropic.com"}
	case "google", "google gemini", "gemini":
		return []string{"google.com", "google.dev", "ai.google.dev"}
	case "deepseek":
		return []string{"deepseek.com", "deepseek.ai"}
	case "qwen", "dashscope":
		return []string{"aliyun.com", "dashscope.com"}
	case "xai":
		return []string{"x.ai"}
	case "智谱":
		return []string{"zhipuai.cn"}
	case "百度", "百度文心":
		return []string{"baidu.com", "cloud.baidu.com"}
	case "腾讯", "腾讯混元":
		return []string{"tencent.com", "cloud.tencent.com"}
	case "minimax":
		return []string{"minimax.io", "minimax.chat"}
	case "":
		return nil
	default:
		return []string{name}
	}
}
// bodyMentionsModel reports whether the normalized body contains the
// normalized model name or, failing that, the normalized title. A name or title
// that normalizes to the empty string never matches.
func bodyMentionsModel(body, modelName, title string) bool {
	haystack := normalizeEvidenceText(body)
	mentioned := func(raw string) bool {
		needle := normalizeEvidenceText(raw)
		return needle != "" && strings.Contains(haystack, needle)
	}
	// The title is only examined when the model name did not match.
	return mentioned(modelName) || mentioned(title)
}
// bodyMentionsPromo reports whether the lower-cased body contains any of the
// English or Chinese promotion markers listed below.
func bodyMentionsPromo(body string) bool {
	markers := [...]string{"campaign", "promo", "promotion", "discount", "活动", "优惠", "限时", "窗口"}
	text := strings.ToLower(body)
	for idx := range markers {
		if !strings.Contains(text, markers[idx]) {
			continue
		}
		return true
	}
	return false
}
// extractVerificationFacts collects price facts from a document body.
//
// When extractPricePair finds an old/new pair the map holds "old_input_price"
// and "new_input_price", plus "price_change_pct" (percentage change relative to
// the old price) unless the old price is zero. Without a pair the map is empty
// but non-nil.
func extractVerificationFacts(body string) map[string]any {
	oldPrice, newPrice, found := extractPricePair(body)
	if !found {
		return map[string]any{}
	}

	facts := map[string]any{
		"old_input_price": oldPrice,
		"new_input_price": newPrice,
	}
	if oldPrice == 0 {
		// No percentage can be computed against a zero baseline.
		return facts
	}
	facts["price_change_pct"] = ((newPrice - oldPrice) / oldPrice) * 100
	return facts
}
// deriveVerifiedPriceEvent decides whether the extracted facts substantiate a
// price event of the given type.
//
// It requires float64 "old_input_price", "new_input_price" and
// "price_change_pct" entries with both prices strictly positive. A "price_cut"
// additionally needs a negative change and a "price_increase" a positive one;
// other event types have no direction requirement. On success it returns the
// same facts map and true, otherwise nil and false.
func deriveVerifiedPriceEvent(eventType string, facts map[string]any) (map[string]any, bool) {
	oldValue, hasOld := facts["old_input_price"].(float64)
	newValue, hasNew := facts["new_input_price"].(float64)
	changePct, hasPct := facts["price_change_pct"].(float64)

	complete := hasOld && hasNew && hasPct
	if !complete || oldValue <= 0 || newValue <= 0 {
		return nil, false
	}

	switch normalizeIntradayEventType(eventType) {
	case "price_cut":
		if changePct >= 0 {
			return nil, false
		}
	case "price_increase":
		if changePct <= 0 {
			return nil, false
		}
	}
	return facts, true
}
// Price patterns, compiled once. Each matches a label ("old"/"from" or
// "new"/"to", case-insensitive) directly followed by "price", skips anything
// that is not a digit or currency sign, accepts an optional "$" or "¥" and
// captures the amount. The leading \b keeps words that merely end in the label
// ("threshold price", "photo price") from being read as a price label.
var (
	oldPricePattern = regexp.MustCompile(`(?i)\b(?:old|from)\s*price[^0-9$¥]*[$¥]?([0-9]+(?:\.[0-9]+)?)`)
	newPricePattern = regexp.MustCompile(`(?i)\b(?:new|to)\s*price[^0-9$¥]*[$¥]?([0-9]+(?:\.[0-9]+)?)`)
)

// extractPricePair looks for the first "old price" and the first "new price"
// amount in body and returns them with ok set to true. It returns 0, 0, false
// when either label is missing or an amount cannot be parsed.
func extractPricePair(body string) (float64, float64, bool) {
	oldMatch := oldPricePattern.FindStringSubmatch(body)
	newMatch := newPricePattern.FindStringSubmatch(body)
	if oldMatch == nil || newMatch == nil {
		return 0, 0, false
	}

	var oldValue, newValue float64
	if _, err := fmt.Sscanf(oldMatch[1], "%f", &oldValue); err != nil {
		return 0, 0, false
	}
	if _, err := fmt.Sscanf(newMatch[1], "%f", &newValue); err != nil {
		return 0, 0, false
	}
	return oldValue, newValue, true
}
// evidenceNoisePattern matches every run of characters that is not a lower-case
// ASCII letter, an ASCII digit or a Han character. It is compiled once because
// normalizeEvidenceText runs on whole document bodies.
var evidenceNoisePattern = regexp.MustCompile(`[^a-z0-9\p{Han}]+`)

// normalizeEvidenceText lower-cases value and drops everything except ASCII
// letters, ASCII digits and Han characters, so that "GPT-4o Mini" and
// "gpt4o-mini" compare equal. Whitespace is removed along with all other
// separators, so no further trimming is needed.
func normalizeEvidenceText(value string) string {
	return evidenceNoisePattern.ReplaceAllString(strings.ToLower(value), "")
}
// preferVerificationResult returns next only when it scores strictly higher
// than current under verificationScore; on a tie current is kept.
func preferVerificationResult(current, next intradayVerificationResult) intradayVerificationResult {
	if nextScore, currentScore := verificationScore(next), verificationScore(current); nextScore <= currentScore {
		return current
	}
	return next
}
// verificationScore ranks a result as the sum of three independent weights:
// candidate status ("verified" 20, "rejected" 5), confidence
// ("official_confirmed" 10, "secondary_confirmed" 5) and verifier status
// ("matched" 3, "contradicted" 1). Any other value contributes 0.
func verificationScore(result intradayVerificationResult) int {
	var statusPoints, confidencePoints, verifierPoints int

	if status := result.CandidateStatus; status == "verified" {
		statusPoints = 20
	} else if status == "rejected" {
		statusPoints = 5
	}

	if confidence := result.VerificationConfidence; confidence == "official_confirmed" {
		confidencePoints = 10
	} else if confidence == "secondary_confirmed" {
		confidencePoints = 5
	}

	if verifier := result.VerifierStatus; verifier == "matched" {
		verifierPoints = 3
	} else if verifier == "contradicted" {
		verifierPoints = 1
	}

	return statusPoints + confidencePoints + verifierPoints
}
// persistIntradayVerificationResults stores the results in order. For each one
// it inserts a row into intraday_news_verification and updates the status,
// confidence and notes of the matching intraday_news_candidate row.
//
// Each result is written in its own transaction, so a candidate never ends up
// with a verification row but a stale status (or the reverse). Processing stops
// at the first error; results persisted before it stay committed.
func persistIntradayVerificationResults(ctx context.Context, db *sql.DB, results []intradayVerificationResult) error {
	for _, result := range results {
		if err := persistIntradayVerificationResult(ctx, db, result); err != nil {
			return err
		}
	}
	return nil
}

// persistIntradayVerificationResult writes one result atomically.
func persistIntradayVerificationResult(ctx context.Context, db *sql.DB, result intradayVerificationResult) error {
	// A nil map would be stored as JSON null; store an empty object instead so
	// extracted_facts always holds a JSON object.
	extracted := result.ExtractedFacts
	if extracted == nil {
		extracted = map[string]any{}
	}
	facts, err := json.Marshal(extracted)
	if err != nil {
		return fmt.Errorf("marshal extracted facts: %w", err)
	}

	tx, err := db.BeginTx(ctx, nil)
	if err != nil {
		return fmt.Errorf("begin intraday verification tx: %w", err)
	}
	// Rollback after a successful Commit only reports sql.ErrTxDone, so its
	// error is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	// Empty source, URL and notes are stored as NULL via NULLIF.
	if _, err := tx.ExecContext(ctx, `
INSERT INTO intraday_news_verification (
candidate_id, verifier_source, verifier_url, verifier_status, extracted_facts, notes
) VALUES ($1, NULLIF($2, ''), NULLIF($3, ''), $4, $5::jsonb, NULLIF($6, ''))`,
		result.CandidateID,
		result.VerifierSource,
		result.VerifierURL,
		result.VerifierStatus,
		string(facts),
		result.Notes,
	); err != nil {
		return fmt.Errorf("insert intraday verification: %w", err)
	}

	if _, err := tx.ExecContext(ctx, `
UPDATE intraday_news_candidate
SET status = $2,
verification_confidence = $3,
verification_notes = NULLIF($4, ''),
updated_at = CURRENT_TIMESTAMP
WHERE id = $1`,
		result.CandidateID,
		result.CandidateStatus,
		result.VerificationConfidence,
		result.Notes,
	); err != nil {
		return fmt.Errorf("update intraday candidate: %w", err)
	}

	if err := tx.Commit(); err != nil {
		return fmt.Errorf("commit intraday verification: %w", err)
	}
	return nil
}
// summarizeIntradayVerification tallies the results: the total, how many are
// "verified" or "rejected" by candidate status, and how many carry
// "official_confirmed" or "secondary_confirmed" confidence. dryRun is copied
// into the summary unchanged.
func summarizeIntradayVerification(results []intradayVerificationResult, dryRun bool) intradayVerificationSummary {
	var verified, rejected, official, secondary int
	for i := range results {
		entry := &results[i]

		switch entry.CandidateStatus {
		case "verified":
			verified++
		case "rejected":
			rejected++
		}

		switch entry.VerificationConfidence {
		case "official_confirmed":
			official++
		case "secondary_confirmed":
			secondary++
		}
	}

	return intradayVerificationSummary{
		CandidateTotal:          len(results),
		VerifiedTotal:           verified,
		OfficialConfirmedTotal:  official,
		SecondaryConfirmedTotal: secondary,
		RejectedTotal:           rejected,
		DryRun:                  dryRun,
	}
}
// printIntradayVerificationSummary writes the summary to standard output as one
// line of JSON. It returns the marshalling error, if any; nothing is printed in
// that case.
func printIntradayVerificationSummary(summary intradayVerificationSummary) error {
	encoded, marshalErr := json.Marshal(summary)
	if marshalErr == nil {
		fmt.Printf("%s\n", encoded)
	}
	return marshalErr
}