502 lines
16 KiB
Go
502 lines
16 KiB
Go
//go:build llm_script && !scripts_pkg
|
|
|
|
package main
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"encoding/json"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
_ "github.com/lib/pq"
|
|
)
|
|
|
|
// verificationCandidateRow is one row read from intraday_news_candidate by
// loadIntradayVerificationCandidates. The query coalesces the nullable
// model_name, provider_country and summary columns to "", and CandidateURLs is
// decoded from the JSON text of the candidate_urls column.
type verificationCandidateRow struct {
	ID                     int64
	CandidateDate          string
	EventType              string
	ProviderName           string
	ModelName              string
	ProviderCountry        string
	Title                  string
	Summary                string
	CandidateURLs          []string
	Status                 string
	VerificationConfidence string
}
|
|
|
|
// intradayVerificationConfig carries the settings for one verification run.
// main fills Date and DryRun from the -date and -dry-run flags, DatabaseURL
// from intradayDefaultDSN and Timeout from discoveryTimeoutFromEnv; Timeout is
// used as the HTTP client timeout when candidate URLs are fetched.
type intradayVerificationConfig struct {
	Date        string
	DryRun      bool
	DatabaseURL string
	Timeout     time.Duration
}
|
|
|
|
// intradayVerificationResult is the outcome of verifying one candidate.
// persistIntradayVerificationResults writes CandidateStatus,
// VerificationConfidence and Notes back to intraday_news_candidate, and inserts
// the Verifier* fields, ExtractedFacts (as JSON) and Notes into
// intraday_news_verification.
type intradayVerificationResult struct {
	CandidateID            int64
	CandidateStatus        string
	VerificationConfidence string
	VerifierSource         string
	VerifierURL            string
	VerifierStatus         string
	ExtractedFacts         map[string]any
	Notes                  string
}
|
|
|
|
// intradayVerificationSummary holds the per-run counters that
// printIntradayVerificationSummary emits as a single JSON object.
type intradayVerificationSummary struct {
	CandidateTotal          int  `json:"candidate_total"`
	VerifiedTotal           int  `json:"verified_total"`
	OfficialConfirmedTotal  int  `json:"official_confirmed_total"`
	SecondaryConfirmedTotal int  `json:"secondary_confirmed_total"`
	RejectedTotal           int  `json:"rejected_total"`
	DryRun                  bool `json:"dry_run"`
}
|
|
|
|
func main() {
|
|
loadIntradayEnv()
|
|
cfg := intradayVerificationConfig{}
|
|
flag.StringVar(&cfg.Date, "date", intradayDateValue(), "验证日期,格式 YYYY-MM-DD")
|
|
flag.BoolVar(&cfg.DryRun, "dry-run", false, "仅输出摘要,不写数据库")
|
|
flag.Parse()
|
|
cfg.DatabaseURL = intradayDefaultDSN()
|
|
cfg.Timeout = discoveryTimeoutFromEnv()
|
|
if err := runIntradayCandidateVerification(cfg); err != nil {
|
|
fmt.Fprintf(os.Stderr, "verify_intraday_news_candidates: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
}
|
|
|
|
func runIntradayCandidateVerification(cfg intradayVerificationConfig) error {
|
|
if strings.TrimSpace(cfg.Date) == "" {
|
|
return fmt.Errorf("date 未设置")
|
|
}
|
|
db, err := sql.Open("postgres", cfg.DatabaseURL)
|
|
if err != nil {
|
|
return fmt.Errorf("open db: %w", err)
|
|
}
|
|
defer db.Close()
|
|
candidates, err := loadIntradayVerificationCandidates(context.Background(), db, cfg.Date)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
results := make([]intradayVerificationResult, 0, len(candidates))
|
|
for _, candidate := range candidates {
|
|
result, err := verifyIntradayCandidate(candidate, cfg.Timeout)
|
|
if err != nil {
|
|
result = intradayVerificationResult{
|
|
CandidateID: candidate.ID,
|
|
CandidateStatus: "candidate",
|
|
VerificationConfidence: candidate.VerificationConfidence,
|
|
VerifierStatus: "error",
|
|
Notes: err.Error(),
|
|
}
|
|
}
|
|
results = append(results, result)
|
|
}
|
|
if !cfg.DryRun {
|
|
if err := persistIntradayVerificationResults(context.Background(), db, results); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return printIntradayVerificationSummary(summarizeIntradayVerification(results, cfg.DryRun))
|
|
}
|
|
|
|
func loadIntradayVerificationCandidates(ctx context.Context, db *sql.DB, date string) ([]verificationCandidateRow, error) {
|
|
rows, err := db.QueryContext(ctx, `
|
|
SELECT id, candidate_date::text, event_type, provider_name, COALESCE(model_name, ''), COALESCE(provider_country, ''),
|
|
title, COALESCE(summary, ''), COALESCE(candidate_urls::text, '[]'), status, verification_confidence
|
|
FROM intraday_news_candidate
|
|
WHERE candidate_date = $1::date
|
|
AND status IN ('candidate', 'verifying')
|
|
ORDER BY discovered_at DESC, id DESC`, date)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("query intraday candidates: %w", err)
|
|
}
|
|
defer rows.Close()
|
|
var candidates []verificationCandidateRow
|
|
for rows.Next() {
|
|
var row verificationCandidateRow
|
|
var rawURLs string
|
|
if err := rows.Scan(&row.ID, &row.CandidateDate, &row.EventType, &row.ProviderName, &row.ModelName, &row.ProviderCountry, &row.Title, &row.Summary, &rawURLs, &row.Status, &row.VerificationConfidence); err != nil {
|
|
return nil, fmt.Errorf("scan intraday candidate: %w", err)
|
|
}
|
|
if err := json.Unmarshal([]byte(rawURLs), &row.CandidateURLs); err != nil {
|
|
return nil, fmt.Errorf("unmarshal candidate urls: %w", err)
|
|
}
|
|
candidates = append(candidates, row)
|
|
}
|
|
return candidates, rows.Err()
|
|
}
|
|
|
|
func verifyIntradayCandidate(candidate verificationCandidateRow, timeout time.Duration) (intradayVerificationResult, error) {
|
|
client := &http.Client{Timeout: timeout}
|
|
best := intradayVerificationResult{
|
|
CandidateID: candidate.ID,
|
|
CandidateStatus: "candidate",
|
|
VerificationConfidence: candidate.VerificationConfidence,
|
|
VerifierStatus: "insufficient",
|
|
Notes: "未找到足够证据",
|
|
ExtractedFacts: map[string]any{},
|
|
}
|
|
for _, candidateURL := range candidate.CandidateURLs {
|
|
body, err := fetchVerificationDocument(candidateURL, client)
|
|
if err != nil {
|
|
best = preferVerificationResult(best, intradayVerificationResult{
|
|
CandidateID: candidate.ID,
|
|
CandidateStatus: "candidate",
|
|
VerificationConfidence: candidate.VerificationConfidence,
|
|
VerifierURL: candidateURL,
|
|
VerifierStatus: "error",
|
|
Notes: err.Error(),
|
|
ExtractedFacts: map[string]any{},
|
|
})
|
|
continue
|
|
}
|
|
result := verifyCandidateDocument(candidate, candidateURL, body)
|
|
if result.CandidateID == 0 {
|
|
result.CandidateID = candidate.ID
|
|
}
|
|
best = preferVerificationResult(best, result)
|
|
if best.CandidateStatus == "verified" && best.VerificationConfidence == "official_confirmed" {
|
|
return best, nil
|
|
}
|
|
}
|
|
return best, nil
|
|
}
|
|
|
|
// fetchVerificationDocument issues a GET for rawURL with the given client and
// returns the response body as a string.
//
// A non-2xx status is reported as an error that embeds a short snippet of the
// response body. Callers store that error text in verification notes, so the
// snippet is capped and stripped of invalid UTF-8 (a cap can cut a multi-byte
// character in half). Successful bodies are read up to a fixed limit so that a
// hostile or broken server cannot exhaust memory; anything past the limit is
// ignored.
func fetchVerificationDocument(rawURL string, client *http.Client) (string, error) {
	const (
		maxDocumentBytes     = 8 << 20 // upper bound on a page kept for matching
		maxErrorSnippetBytes = 1 << 10 // upper bound on body text echoed in errors
	)

	req, err := http.NewRequest(http.MethodGet, rawURL, nil)
	if err != nil {
		return "", fmt.Errorf("build verification request: %w", err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; llm-intelligence intraday verifier)")

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("fetch verification document: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// The snippet only decorates the error message, so a failed read is
		// deliberately ignored and whatever was read is used.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorSnippetBytes))
		detail := strings.TrimSpace(strings.ToValidUTF8(string(snippet), ""))
		return "", fmt.Errorf("fetch verification document: unexpected status %d: %s", resp.StatusCode, detail)
	}

	payload, err := io.ReadAll(io.LimitReader(resp.Body, maxDocumentBytes))
	if err != nil {
		return "", fmt.Errorf("read verification document: %w", err)
	}
	return string(payload), nil
}
|
|
|
|
func verifyCandidateDocument(candidate verificationCandidateRow, candidateURL, body string) intradayVerificationResult {
|
|
source := classifyVerificationSource(candidate.ProviderName, candidateURL)
|
|
facts := extractVerificationFacts(body)
|
|
modelMatched := bodyMentionsModel(body, candidate.ModelName, candidate.Title)
|
|
result := intradayVerificationResult{
|
|
CandidateID: candidate.ID,
|
|
CandidateStatus: "candidate",
|
|
VerificationConfidence: "candidate",
|
|
VerifierSource: source,
|
|
VerifierURL: candidateURL,
|
|
VerifierStatus: "insufficient",
|
|
ExtractedFacts: facts,
|
|
Notes: "证据不足",
|
|
}
|
|
if isOfficialVerificationSource(source) {
|
|
switch normalizeIntradayEventType(candidate.EventType) {
|
|
case "official_release":
|
|
if modelMatched {
|
|
result.CandidateStatus = "verified"
|
|
result.VerificationConfidence = "official_confirmed"
|
|
result.VerifierStatus = "matched"
|
|
result.Notes = "官方页面命中模型发布线索"
|
|
}
|
|
case "promo_campaign":
|
|
if modelMatched && bodyMentionsPromo(body) {
|
|
result.CandidateStatus = "verified"
|
|
result.VerificationConfidence = "official_confirmed"
|
|
result.VerifierStatus = "matched"
|
|
result.Notes = "官方页面命中活动窗口或促销语义"
|
|
}
|
|
case "price_cut", "price_increase":
|
|
if priceResult, ok := deriveVerifiedPriceEvent(candidate.EventType, facts); ok {
|
|
result.CandidateStatus = "verified"
|
|
result.VerificationConfidence = "official_confirmed"
|
|
result.VerifierStatus = "matched"
|
|
result.ExtractedFacts = priceResult
|
|
result.Notes = "官方价格页命中真实价格变化"
|
|
} else if modelMatched {
|
|
result.VerifierStatus = "insufficient"
|
|
result.Notes = "命中模型但缺少可计算的价格变化事实"
|
|
}
|
|
case "leak_or_rumor":
|
|
if modelMatched {
|
|
result.CandidateStatus = "verified"
|
|
result.VerificationConfidence = "secondary_confirmed"
|
|
result.VerifierStatus = "matched"
|
|
result.Notes = "保留为待确认情报,不进入正式事实层"
|
|
}
|
|
}
|
|
} else if modelMatched {
|
|
result.CandidateStatus = "verified"
|
|
result.VerificationConfidence = "secondary_confirmed"
|
|
result.VerifierStatus = "matched"
|
|
result.Notes = "仅二手来源命中,不能进入正式事实层"
|
|
}
|
|
if result.VerifierStatus == "insufficient" && modelMatched && !isOfficialVerificationSource(source) {
|
|
result.VerificationConfidence = "secondary_confirmed"
|
|
}
|
|
return result
|
|
}
|
|
|
|
// isOfficialVerificationSource reports whether source is one of the official
// source labels: "official_page", "official_docs", "official_blog" or
// "pricing_page".
func isOfficialVerificationSource(source string) bool {
	return source == "official_page" ||
		source == "official_docs" ||
		source == "official_blog" ||
		source == "pricing_page"
}
|
|
|
|
func classifyVerificationSource(providerName, rawURL string) string {
|
|
parsed, err := url.Parse(rawURL)
|
|
if err != nil {
|
|
return "secondary_media"
|
|
}
|
|
host := strings.ToLower(parsed.Host)
|
|
path := strings.ToLower(parsed.Path)
|
|
if isOfficialProviderHost(providerName, host) {
|
|
switch {
|
|
case strings.Contains(host, "docs.") || strings.Contains(path, "/docs"):
|
|
return "official_docs"
|
|
case strings.Contains(host, "pricing") || strings.Contains(path, "pricing") || strings.Contains(path, "price"):
|
|
return "pricing_page"
|
|
case strings.Contains(path, "blog") || strings.Contains(path, "news") || strings.Contains(path, "announcement"):
|
|
return "official_blog"
|
|
default:
|
|
return "official_page"
|
|
}
|
|
}
|
|
return "secondary_media"
|
|
}
|
|
|
|
// isOfficialProviderHost reports whether host belongs to the provider.
//
// Tokens that look like domains (contain a dot) must equal the host or be a
// dot-separated suffix of it, so "platform.openai.com" matches "openai.com"
// while "openai.com.evil.net", "notopenai.com" and "linux.ai" (for token
// "x.ai") do not. Tokens without a dot come from the bare-provider-name
// fallback in providerHostTokens and keep the looser substring heuristic.
// Any ":port" suffix, surrounding whitespace, letter case and a trailing dot
// on host are ignored.
func isOfficialProviderHost(providerName, host string) bool {
	hostname := strings.ToLower(strings.TrimSpace(host))
	// url.URL.Hostname strips a port and IPv6 brackets without a new import.
	hostname = (&url.URL{Host: hostname}).Hostname()
	hostname = strings.TrimSuffix(hostname, ".")
	if hostname == "" {
		return false
	}

	for _, token := range providerHostTokens(providerName) {
		if token == "" {
			continue
		}
		if !strings.Contains(token, ".") {
			// Bare provider name: best-effort substring match.
			if strings.Contains(hostname, token) {
				return true
			}
			continue
		}
		if hostname == token || strings.HasSuffix(hostname, "."+token) {
			return true
		}
	}
	return false
}

// providerHostTokens returns the host tokens associated with a provider name
// (compared case-insensitively, ignoring surrounding whitespace). Known
// providers map to their domains; any other non-empty name is returned as a
// single lowercase token, and an empty name yields nil.
func providerHostTokens(providerName string) []string {
	clean := strings.ToLower(strings.TrimSpace(providerName))
	switch clean {
	case "openai":
		return []string{"openai.com"}
	case "anthropic":
		return []string{"anthropic.com"}
	case "google", "google gemini", "gemini":
		return []string{"google.com", "google.dev", "ai.google.dev"}
	case "deepseek":
		return []string{"deepseek.com", "deepseek.ai"}
	case "qwen", "dashscope":
		return []string{"aliyun.com", "dashscope.com"}
	case "xai":
		return []string{"x.ai"}
	case "智谱":
		return []string{"zhipuai.cn"}
	case "百度", "百度文心":
		return []string{"baidu.com", "cloud.baidu.com"}
	case "腾讯", "腾讯混元":
		return []string{"tencent.com", "cloud.tencent.com"}
	case "minimax":
		return []string{"minimax.io", "minimax.chat"}
	case "":
		return nil
	default:
		return []string{clean}
	}
}
|
|
|
|
func bodyMentionsModel(body, modelName, title string) bool {
|
|
normBody := normalizeEvidenceText(body)
|
|
for _, candidate := range []string{modelName, title} {
|
|
normCandidate := normalizeEvidenceText(candidate)
|
|
if normCandidate != "" && strings.Contains(normBody, normCandidate) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// bodyMentionsPromo reports whether the lower-cased body contains any of the
// English or Chinese promotion markers.
func bodyMentionsPromo(body string) bool {
	markers := [...]string{"campaign", "promo", "promotion", "discount", "活动", "优惠", "限时", "窗口"}

	haystack := strings.ToLower(body)
	found := false
	for i := 0; i < len(markers) && !found; i++ {
		found = strings.Contains(haystack, markers[i])
	}
	return found
}
|
|
|
|
func extractVerificationFacts(body string) map[string]any {
|
|
facts := map[string]any{}
|
|
oldPrice, newPrice, ok := extractPricePair(body)
|
|
if ok {
|
|
facts["old_input_price"] = oldPrice
|
|
facts["new_input_price"] = newPrice
|
|
if oldPrice != 0 {
|
|
facts["price_change_pct"] = ((newPrice - oldPrice) / oldPrice) * 100
|
|
}
|
|
}
|
|
return facts
|
|
}
|
|
|
|
func deriveVerifiedPriceEvent(eventType string, facts map[string]any) (map[string]any, bool) {
|
|
oldValue, oldOK := facts["old_input_price"].(float64)
|
|
newValue, newOK := facts["new_input_price"].(float64)
|
|
changePct, pctOK := facts["price_change_pct"].(float64)
|
|
if !oldOK || !newOK || !pctOK || oldValue <= 0 || newValue <= 0 {
|
|
return nil, false
|
|
}
|
|
normalized := normalizeIntradayEventType(eventType)
|
|
if normalized == "price_cut" && changePct >= 0 {
|
|
return nil, false
|
|
}
|
|
if normalized == "price_increase" && changePct <= 0 {
|
|
return nil, false
|
|
}
|
|
return facts, true
|
|
}
|
|
|
|
// Price patterns are compiled once at package initialisation rather than on
// every call. The leading \b keeps words that merely end in the keyword
// ("threshold price", "renew price", "auto price") from being read as an
// old/new price label.
var (
	verificationOldPriceRe = regexp.MustCompile(`(?i)\b(old|from)\s*price[^0-9$¥]*[$¥]?([0-9]+(?:\.[0-9]+)?)`)
	verificationNewPriceRe = regexp.MustCompile(`(?i)\b(new|to)\s*price[^0-9$¥]*[$¥]?([0-9]+(?:\.[0-9]+)?)`)
)

// extractPricePair looks for the first "old price"/"from price" amount and the
// first "new price"/"to price" amount in body (case-insensitive, optional $ or
// ¥ prefix). It returns (old, new, true) only when both are found and parse as
// numbers; otherwise (0, 0, false).
func extractPricePair(body string) (float64, float64, bool) {
	oldMatch := verificationOldPriceRe.FindStringSubmatch(body)
	newMatch := verificationNewPriceRe.FindStringSubmatch(body)
	if len(oldMatch) < 3 || len(newMatch) < 3 {
		return 0, 0, false
	}

	var oldValue, newValue float64
	if _, err := fmt.Sscanf(oldMatch[2], "%f", &oldValue); err != nil {
		return 0, 0, false
	}
	if _, err := fmt.Sscanf(newMatch[2], "%f", &newValue); err != nil {
		return 0, 0, false
	}
	return oldValue, newValue, true
}
|
|
|
|
// verificationEvidenceStripRe matches every run of characters that is not an
// ASCII lowercase letter, an ASCII digit or a Han character. It is compiled
// once because normalizeEvidenceText runs for every document and candidate
// field.
var verificationEvidenceStripRe = regexp.MustCompile(`[^a-z0-9\p{Han}]+`)

// normalizeEvidenceText lower-cases value and removes everything except ASCII
// letters, ASCII digits and Han characters, so that "GPT-4o Mini" and
// "gpt4omini" compare equal. Whitespace is removed by the same pattern, so no
// separate trimming is needed.
func normalizeEvidenceText(value string) string {
	return verificationEvidenceStripRe.ReplaceAllString(strings.ToLower(value), "")
}
|
|
|
|
func preferVerificationResult(current, next intradayVerificationResult) intradayVerificationResult {
|
|
if verificationScore(next) > verificationScore(current) {
|
|
return next
|
|
}
|
|
return current
|
|
}
|
|
|
|
func verificationScore(result intradayVerificationResult) int {
|
|
score := 0
|
|
switch result.CandidateStatus {
|
|
case "verified":
|
|
score += 20
|
|
case "rejected":
|
|
score += 5
|
|
}
|
|
switch result.VerificationConfidence {
|
|
case "official_confirmed":
|
|
score += 10
|
|
case "secondary_confirmed":
|
|
score += 5
|
|
}
|
|
switch result.VerifierStatus {
|
|
case "matched":
|
|
score += 3
|
|
case "contradicted":
|
|
score += 1
|
|
}
|
|
return score
|
|
}
|
|
|
|
func persistIntradayVerificationResults(ctx context.Context, db *sql.DB, results []intradayVerificationResult) error {
|
|
for _, result := range results {
|
|
facts, err := json.Marshal(result.ExtractedFacts)
|
|
if err != nil {
|
|
return fmt.Errorf("marshal extracted facts: %w", err)
|
|
}
|
|
_, err = db.ExecContext(ctx, `
|
|
INSERT INTO intraday_news_verification (
|
|
candidate_id, verifier_source, verifier_url, verifier_status, extracted_facts, notes
|
|
) VALUES ($1, NULLIF($2, ''), NULLIF($3, ''), $4, $5::jsonb, NULLIF($6, ''))`,
|
|
result.CandidateID,
|
|
result.VerifierSource,
|
|
result.VerifierURL,
|
|
result.VerifierStatus,
|
|
string(facts),
|
|
result.Notes,
|
|
)
|
|
if err != nil {
|
|
return fmt.Errorf("insert intraday verification: %w", err)
|
|
}
|
|
_, err = db.ExecContext(ctx, `
|
|
UPDATE intraday_news_candidate
|
|
SET status = $2,
|
|
verification_confidence = $3,
|
|
verification_notes = NULLIF($4, ''),
|
|
updated_at = CURRENT_TIMESTAMP
|
|
WHERE id = $1`,
|
|
result.CandidateID,
|
|
result.CandidateStatus,
|
|
result.VerificationConfidence,
|
|
result.Notes,
|
|
)
|
|
if err != nil {
|
|
return fmt.Errorf("update intraday candidate: %w", err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func summarizeIntradayVerification(results []intradayVerificationResult, dryRun bool) intradayVerificationSummary {
|
|
summary := intradayVerificationSummary{CandidateTotal: len(results), DryRun: dryRun}
|
|
for _, result := range results {
|
|
if result.CandidateStatus == "verified" {
|
|
summary.VerifiedTotal++
|
|
}
|
|
switch result.VerificationConfidence {
|
|
case "official_confirmed":
|
|
summary.OfficialConfirmedTotal++
|
|
case "secondary_confirmed":
|
|
summary.SecondaryConfirmedTotal++
|
|
}
|
|
if result.CandidateStatus == "rejected" {
|
|
summary.RejectedTotal++
|
|
}
|
|
}
|
|
return summary
|
|
}
|
|
|
|
func printIntradayVerificationSummary(summary intradayVerificationSummary) error {
|
|
payload, err := json.Marshal(summary)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
fmt.Println(string(payload))
|
|
return nil
|
|
}
|