feat(intraday): add discovery and verification watch pipeline
Some checks failed
CI / go-test (push) Has been cancelled
CI / scripts-regression (push) Has been cancelled
CI / frontend-build (push) Has been cancelled
CI / docker-build (push) Has been cancelled

This commit is contained in:
phamnazage-jpg
2026-05-27 18:54:32 +08:00
parent 32858bfec4
commit 475401bcbe
21 changed files with 2444 additions and 13 deletions

View File

@@ -51,6 +51,9 @@ export OPENROUTER_API_KEY="your-api-key"
export API_AUTH_TOKEN="replace-with-long-random-token" export API_AUTH_TOKEN="replace-with-long-random-token"
# 或者export API_BASIC_AUTH_USER="review" && export API_BASIC_AUTH_PASS="replace-with-password" # 或者export API_BASIC_AUTH_USER="review" && export API_BASIC_AUTH_PASS="replace-with-password"
export FEISHU_WEBHOOK="your-webhook-url" # 可选 export FEISHU_WEBHOOK="your-webhook-url" # 可选
export INTRADAY_DISCOVERY_SEARCH_PROVIDER="command_json" # 候选发现链路可选
export INTRADAY_DISCOVERY_LLM_PROVIDER="command_json" # 候选归纳链路可选
``` ```
@@ -75,6 +78,10 @@ crontab -e
# 日内价格追踪(推荐每 4 小时一次) # 日内价格追踪(推荐每 4 小时一次)
0 */4 * * * cd /path/to/llm-intelligence && bash scripts/run_intraday_price_watch.sh >> /tmp/llm_hub_intraday.log 2>&1 0 */4 * * * cd /path/to/llm-intelligence && bash scripts/run_intraday_price_watch.sh >> /tmp/llm_hub_intraday.log 2>&1
# 日内新闻发现与验证(推荐每 2 小时一次)
0 */2 * * * cd /path/to/llm-intelligence && bash scripts/run_intraday_discovery_watch.sh >> /tmp/llm_hub_intraday_discovery.log 2>&1
# 真实采集 + 写库 + 报告生成的手动复跑入口 # 真实采集 + 写库 + 报告生成的手动复跑入口
cd /path/to/llm-intelligence && bash scripts/run_real_pipeline.sh cd /path/to/llm-intelligence && bash scripts/run_real_pipeline.sh
``` ```
@@ -106,6 +113,11 @@ docker-compose up -d
| API_RATE_LIMIT_WINDOW_SEC | ❌ | `/api/*` 限流窗口秒数,默认 `60` | | API_RATE_LIMIT_WINDOW_SEC | ❌ | `/api/*` 限流窗口秒数,默认 `60` |
| FEISHU_WEBHOOK | ❌ | 飞书告警 Webhook | | FEISHU_WEBHOOK | ❌ | 飞书告警 Webhook |
| REPORT_DATE | ❌ | 手工指定日内追踪/日报日期 | | REPORT_DATE | ❌ | 手工指定日内追踪/日报日期 |
| INTRADAY_DISCOVERY_SEARCH_PROVIDER / INTRADAY_DISCOVERY_LLM_PROVIDER | 条件必填 | discovery 链路 provider 类型;支持 `fixture` / `command_json` / `http_json` |
| INTRADAY_DISCOVERY_SEARCH_COMMAND / INTRADAY_DISCOVERY_LLM_COMMAND | 条件必填 | 当 provider 为 `command_json` 时执行的命令,stdout 必须输出 JSON |
| INTRADAY_DISCOVERY_SEARCH_URL / INTRADAY_DISCOVERY_LLM_URL | 条件必填 | 当 provider 为 `http_json` 时调用的接口 URL |
| INTRADAY_DISCOVERY_SEARCH_FIXTURE / INTRADAY_DISCOVERY_LLM_FIXTURE | ❌ | dry-run / 本地 fixture 输入 |
| INTRADAY_DISCOVERY_TIMEOUT_SEC | ❌ | discovery 与验证抓取超时秒数,默认 `20` |
| PORT | ❌ | API Server 监听端口,默认 8080 | | PORT | ❌ | API Server 监听端口,默认 8080 |

View File

@@ -24,6 +24,7 @@
- 手工复跑使用 `scripts/run_real_pipeline.sh`,不会把产物标记成正式日报 - 手工复跑使用 `scripts/run_real_pipeline.sh`,不会把产物标记成正式日报
- 历史补跑使用 `scripts/rebuild_historical_report.sh YYYY-MM-DD` - 历史补跑使用 `scripts/rebuild_historical_report.sh YYYY-MM-DD`
- 日内价格追踪使用 `scripts/run_intraday_price_watch.sh`,只刷新价格与信号,不生成正式日报 - 日内价格追踪使用 `scripts/run_intraday_price_watch.sh`,只刷新价格与信号,不生成正式日报
- 日内新闻候选发现与验证使用 `scripts/run_intraday_discovery_watch.sh`,只刷新候选池、验证轨迹与已验证信号,不生成正式日报
- HTTP API 当前未内建认证、授权和限流;公网暴露前必须在网关层补齐 - HTTP API 当前未内建认证、授权和限流;公网暴露前必须在网关层补齐
@@ -104,6 +105,7 @@ bash scripts/run_intel_pipeline.sh
3. 平台目录核验 3. 平台目录核验
4. 每日关键信号物化到 `daily_signal_snapshot` 4. 每日关键信号物化到 `daily_signal_snapshot`
5. 日内价格追踪可由 `scripts/run_intraday_price_watch.sh` 独立执行,不生成正式日报 5. 日内价格追踪可由 `scripts/run_intraday_price_watch.sh` 独立执行,不生成正式日报
6. 日内新闻候选发现与验证可由 `scripts/run_intraday_discovery_watch.sh` 独立执行,不生成正式日报
### 正式日报调度 ### 正式日报调度
@@ -125,13 +127,6 @@ bash scripts/run_daily.sh
9. 失败时降级复制昨日报告并可选飞书告警 9. 失败时降级复制昨日报告并可选飞书告警
### 手工真实复跑 ### 手工真实复跑
### 日内价格追踪
```bash
bash scripts/run_intraday_price_watch.sh
```
适用于捕捉“小米大降价”“活动窗口上线”“泄露情报”等日内价格事件。该入口只刷新价格与信号层,不写正式 `daily_report`,也不会覆盖 `latest_report` 语义。
```bash ```bash
bash scripts/run_real_pipeline.sh bash scripts/run_real_pipeline.sh
@@ -143,6 +138,22 @@ bash scripts/run_real_pipeline.sh
- `trigger_source=pipeline` - `trigger_source=pipeline`
- `is_official_daily=false` - `is_official_daily=false`
### 日内价格追踪
```bash
bash scripts/run_intraday_price_watch.sh
```
适用于捕捉“小米大降价”“活动窗口上线”等已知入口里的结构化价格变化。该入口只刷新价格与信号层,不写正式 `daily_report`,也不会覆盖 `latest_report` 语义。
### 日内新闻发现与验证
```bash
bash scripts/run_intraday_discovery_watch.sh
```
适用于搜索引擎 + LLM 高召回发现“当天可能发生的价格新闻 / 版本发布 / 活动窗口”,再通过官方页面 / 价格页 / docs 做验证。该入口只刷新候选池、验证轨迹与 `daily_signal_snapshot` 中的已验证事实,不写正式 `daily_report`,也不会覆盖 `latest_report` 语义。
### 历史补跑 ### 历史补跑
```bash ```bash

View File

@@ -0,0 +1,106 @@
-- Persistence schema for intraday news candidates and their verification trail.
--
-- intraday_news_candidate: one row per discovered candidate event.
-- IF NOT EXISTS keeps this statement safe to re-run.
CREATE TABLE IF NOT EXISTS intraday_news_candidate (
id BIGSERIAL PRIMARY KEY,
candidate_date DATE NOT NULL,
discovered_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
event_type TEXT NOT NULL,
provider_name TEXT NOT NULL,
model_name TEXT,
provider_country TEXT,
title TEXT NOT NULL,
summary TEXT,
-- JSONB value holding the candidate source URLs; defaults to an empty array.
candidate_urls JSONB NOT NULL DEFAULT '[]'::jsonb,
discovery_source TEXT NOT NULL,
discovery_query TEXT,
-- Raw discovery-stage evidence; defaults to an empty JSON object.
discovery_evidence JSONB NOT NULL DEFAULT '{}'::jsonb,
-- Dedup key; a unique index on this column is created later in this file.
normalized_key TEXT NOT NULL,
-- Both columns start at 'candidate'; their allowed values are restricted by
-- the CHECK constraints added after this statement.
status TEXT NOT NULL DEFAULT 'candidate',
verification_confidence TEXT NOT NULL DEFAULT 'candidate',
verification_notes TEXT,
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
-- Maintained by the intraday_news_candidate_updated_at trigger created at the
-- end of this file.
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Add the candidate CHECK constraints idempotently.
-- PostgreSQL has no "ADD CONSTRAINT IF NOT EXISTS", so pg_constraint is probed
-- first. Constraint names are only unique per table, so each probe is scoped to
-- intraday_news_candidate through conrelid; matching on conname alone would
-- skip the ALTER whenever any other table owned a same-named constraint.
DO $$
BEGIN
    -- Lifecycle states a candidate row may be in.
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'chk_intraday_news_candidate_status'
          AND conrelid = 'intraday_news_candidate'::regclass
    ) THEN
        ALTER TABLE intraday_news_candidate
            ADD CONSTRAINT chk_intraday_news_candidate_status
            CHECK (status IN ('candidate', 'verifying', 'verified', 'rejected', 'stale'));
    END IF;

    -- Confidence levels a candidate row may carry.
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'chk_intraday_news_candidate_confidence'
          AND conrelid = 'intraday_news_candidate'::regclass
    ) THEN
        ALTER TABLE intraday_news_candidate
            ADD CONSTRAINT chk_intraday_news_candidate_confidence
            CHECK (verification_confidence IN ('candidate', 'secondary_confirmed', 'official_confirmed'));
    END IF;
END
$$;
-- Uniqueness on normalized_key is what rejects duplicate candidate rows.
CREATE UNIQUE INDEX IF NOT EXISTS idx_intraday_news_candidate_normalized_key
ON intraday_news_candidate(normalized_key);
-- Newest-first ordering by candidate date, then discovery time.
CREATE INDEX IF NOT EXISTS idx_intraday_news_candidate_date
ON intraday_news_candidate(candidate_date DESC, discovered_at DESC);
-- Lookup by lifecycle status.
CREATE INDEX IF NOT EXISTS idx_intraday_news_candidate_status
ON intraday_news_candidate(status);
-- Lookup by provider and event type, newest candidate date first.
CREATE INDEX IF NOT EXISTS idx_intraday_news_candidate_provider_event
ON intraday_news_candidate(provider_name, event_type, candidate_date DESC);
-- Catalog descriptions for the table and its less obvious columns.
COMMENT ON TABLE intraday_news_candidate IS '搜索引擎与 LLM 发现的日内新闻候选池,尚未直接进入正式日报事实层';
COMMENT ON COLUMN intraday_news_candidate.candidate_urls IS '候选来源 URL 数组,按发现层输出原样保留';
COMMENT ON COLUMN intraday_news_candidate.discovery_evidence IS '发现阶段原始证据 JSONB例如搜索命中、LLM 归纳结果';
COMMENT ON COLUMN intraday_news_candidate.normalized_key IS '同日同事件的去重键,避免重复发现候选';
-- intraday_news_verification: one row per verification attempt against a
-- candidate. IF NOT EXISTS keeps this statement safe to re-run.
CREATE TABLE IF NOT EXISTS intraday_news_verification (
id BIGSERIAL PRIMARY KEY,
-- Owning candidate; verification rows are deleted together with it.
candidate_id BIGINT NOT NULL REFERENCES intraday_news_candidate(id) ON DELETE CASCADE,
verified_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
verifier_source TEXT NOT NULL,
verifier_url TEXT,
-- Allowed values are restricted by the CHECK constraint added after this
-- statement.
verifier_status TEXT NOT NULL,
-- Structured facts extracted during verification; defaults to an empty object.
extracted_facts JSONB NOT NULL DEFAULT '{}'::jsonb,
notes TEXT,
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Add the verifier_status CHECK constraint idempotently.
-- The pg_constraint probe is scoped to intraday_news_verification through
-- conrelid because constraint names are only unique per table; matching on
-- conname alone would skip the ALTER if another table owned the same name.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'chk_intraday_news_verification_status'
          AND conrelid = 'intraday_news_verification'::regclass
    ) THEN
        ALTER TABLE intraday_news_verification
            ADD CONSTRAINT chk_intraday_news_verification_status
            CHECK (verifier_status IN ('matched', 'contradicted', 'insufficient', 'error'));
    END IF;
END
$$;
-- Verification history of one candidate, newest attempt first.
CREATE INDEX IF NOT EXISTS idx_intraday_news_verification_candidate_verified_at
ON intraday_news_verification(candidate_id, verified_at DESC);
-- Lookup by verifier source.
CREATE INDEX IF NOT EXISTS idx_intraday_news_verification_source
ON intraday_news_verification(verifier_source);
-- Lookup by verification outcome.
CREATE INDEX IF NOT EXISTS idx_intraday_news_verification_status
ON intraday_news_verification(verifier_status);
-- Catalog descriptions for the table and its JSONB column.
COMMENT ON TABLE intraday_news_verification IS '日内新闻候选的验证轨迹,记录验证来源、状态和提取事实';
COMMENT ON COLUMN intraday_news_verification.extracted_facts IS '验证阶段提取出的结构化事实 JSONB';
-- Create the BEFORE UPDATE trigger on intraday_news_candidate idempotently.
-- Trigger names are only unique per table, so the pg_trigger probe is scoped
-- with tgrelid (and ignores internal triggers); matching on tgname alone would
-- skip creation if another table already had a same-named trigger.
-- NOTE(review): update_updated_at_column() is not defined in this migration;
-- it is presumably created by an earlier one - confirm before applying to a
-- fresh database.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_trigger
        WHERE tgname = 'intraday_news_candidate_updated_at'
          AND tgrelid = 'intraday_news_candidate'::regclass
          AND NOT tgisinternal
    ) THEN
        CREATE TRIGGER intraday_news_candidate_updated_at
            BEFORE UPDATE ON intraday_news_candidate
            FOR EACH ROW
            EXECUTE FUNCTION update_updated_at_column();
    END IF;
END
$$;

View File

@@ -27,11 +27,20 @@
| `API_RATE_LIMIT_WINDOW_SEC` | 否 | `cmd/server/main.go` | `60` | `/api/*` 限流窗口长度(秒) | | `API_RATE_LIMIT_WINDOW_SEC` | 否 | `cmd/server/main.go` | `60` | `/api/*` 限流窗口长度(秒) |
| `FEISHU_WEBHOOK` | 否 | `run_daily.sh``feishu_alert.sh` | 空 | 正式日报失败时发送飞书告警 | | `FEISHU_WEBHOOK` | 否 | `run_daily.sh``feishu_alert.sh` | 空 | 正式日报失败时发送飞书告警 |
| `REPORT_OUTPUT_DIR` | 否 | `generate_daily_report.go` | `reports/daily` | 日报主产物输出目录 | | `REPORT_OUTPUT_DIR` | 否 | `generate_daily_report.go` | `reports/daily` | 日报主产物输出目录 |
| `REPORT_DATE` | 否 | `generate_daily_report.go``rebuild_historical_report.sh``run_intraday_price_watch.sh` | 当天日期 | 指定日报或日内价格追踪的日期,格式 `YYYY-MM-DD` | | `REPORT_DATE` | 否 | `generate_daily_report.go``rebuild_historical_report.sh``run_intraday_price_watch.sh``run_intraday_discovery_watch.sh` | 当天日期 | 指定日报或日内链路日期,格式 `YYYY-MM-DD` |
| `REPORT_RUN_KIND` | 否 | `generate_daily_report.go` | `manual` | 运行语义,如 `scheduled` / `manual` / `historical_rebuild` | | `REPORT_RUN_KIND` | 否 | `generate_daily_report.go` | `manual` | 运行语义,如 `scheduled` / `manual` / `historical_rebuild` |
| `REPORT_TRIGGER_SOURCE` | 否 | `generate_daily_report.go``materialize_daily_signals.go` | `cli` | 触发来源,如 `cron` / `pipeline` / `intraday` / `rebuild_script` | | `REPORT_TRIGGER_SOURCE` | 否 | `generate_daily_report.go``materialize_daily_signals.go` | `cli` | 触发来源,如 `cron` / `pipeline` / `intraday` / `intraday_discovery` / `rebuild_script` |
| `REPORT_IS_OFFICIAL_DAILY` | 否 | `generate_daily_report.go` | `false` | 是否属于正式日报产出 | | `REPORT_IS_OFFICIAL_DAILY` | 否 | `generate_daily_report.go` | `false` | 是否属于正式日报产出 |
| `REPORT_RUNTIME_AUDIT` | 否 | `generate_daily_report.go` | 空 | 来源级运行审计摘要,通常由流水线脚本注入 | | `REPORT_RUNTIME_AUDIT` | 否 | `generate_daily_report.go` | 空 | 来源级运行审计摘要,通常由流水线脚本注入 |
| `INTRADAY_DISCOVERY_SEARCH_PROVIDER` | 条件必填 | `discover_intraday_news_candidates.go``run_intraday_discovery_watch.sh` | 空 | 候选发现搜索 provider 类型;计划支持 `fixture` / `command_json` / `http_json` |
| `INTRADAY_DISCOVERY_SEARCH_COMMAND` | 条件必填 | `discover_intraday_news_candidates.go` | 空 | 当 `INTRADAY_DISCOVERY_SEARCH_PROVIDER=command_json` 时执行的搜索命令,stdout 必须输出 JSON 数组 |
| `INTRADAY_DISCOVERY_SEARCH_URL` | 条件必填 | `discover_intraday_news_candidates.go` | 空 | 当 `INTRADAY_DISCOVERY_SEARCH_PROVIDER=http_json` 时调用的搜索接口 URL |
| `INTRADAY_DISCOVERY_SEARCH_FIXTURE` | 否 | `discover_intraday_news_candidates.go` | 空 | 搜索 provider 样例文件,用于 dry-run / 本地测试 |
| `INTRADAY_DISCOVERY_LLM_PROVIDER` | 条件必填 | `discover_intraday_news_candidates.go``run_intraday_discovery_watch.sh` | 空 | 候选归纳 LLM provider 类型;计划支持 `fixture` / `command_json` / `http_json` |
| `INTRADAY_DISCOVERY_LLM_COMMAND` | 条件必填 | `discover_intraday_news_candidates.go` | 空 | 当 `INTRADAY_DISCOVERY_LLM_PROVIDER=command_json` 时执行的 LLM 命令,stdout 必须输出 JSON 数组 |
| `INTRADAY_DISCOVERY_LLM_URL` | 条件必填 | `discover_intraday_news_candidates.go` | 空 | 当 `INTRADAY_DISCOVERY_LLM_PROVIDER=http_json` 时调用的 LLM 接口 URL |
| `INTRADAY_DISCOVERY_LLM_FIXTURE` | 否 | `discover_intraday_news_candidates.go` | 空 | LLM provider 样例文件,用于 dry-run / 本地测试 |
| `INTRADAY_DISCOVERY_TIMEOUT_SEC` | 否 | `discover_intraday_news_candidates.go``verify_intraday_news_candidates.go` | `20` | discovery provider 与验证抓取的默认超时秒数 |
| `PHASE6_PORT` | 否 | `verify_phase6.sh` | 自动挑选 `18080-18120` | Phase 6 验收时临时启动 API Server 的端口 | | `PHASE6_PORT` | 否 | `verify_phase6.sh` | 自动挑选 `18080-18120` | Phase 6 验收时临时启动 API Server 的端口 |
| `LIGHTHOUSE_PORT` | 否 | `verify_lighthouse.sh` | `4173` | Lighthouse 预览端口 | | `LIGHTHOUSE_PORT` | 否 | `verify_lighthouse.sh` | `4173` | Lighthouse 预览端口 |
| `LIGHTHOUSE_SCORE_THRESHOLD` | 否 | `verify_lighthouse.sh` | `80` | 前端性能分数门槛 | | `LIGHTHOUSE_SCORE_THRESHOLD` | 否 | `verify_lighthouse.sh` | `80` | 前端性能分数门槛 |
@@ -83,6 +92,23 @@ bash scripts/run_intraday_price_watch.sh
- 不生成正式 HTML / Markdown 日报 - 不生成正式 HTML / Markdown 日报
- 推荐先按每 4 小时一次调度,再根据外部源稳定性决定是否收紧到每 2 小时 - 推荐先按每 4 小时一次调度,再根据外部源稳定性决定是否收紧到每 2 小时
### 日内候选发现与验证
```bash
export DATABASE_URL="postgres://app_user:***@db:5432/llm_intelligence?sslmode=disable"
export INTRADAY_DISCOVERY_SEARCH_PROVIDER="command_json"
export INTRADAY_DISCOVERY_SEARCH_COMMAND="/usr/local/bin/intraday-search --date $REPORT_DATE"
export INTRADAY_DISCOVERY_LLM_PROVIDER="command_json"
export INTRADAY_DISCOVERY_LLM_COMMAND="/usr/local/bin/intraday-llm --date $REPORT_DATE"
bash scripts/run_intraday_discovery_watch.sh
```
说明:
- 该入口只刷新候选池、验证轨迹与 `daily_signal_snapshot` 中的已验证事实
- 它不会直接写 `daily_report`,不会覆盖 `/api/v1/reports/latest` 对应的正式日报
- 搜索 / LLM provider 缺失时应明确报前置条件错误,不能伪装成“今日无新闻”
- `leak_or_rumor` 默认留在候选层,不进入正式日报事实
## 日报运行语义 ## 日报运行语义
项目用以下字段区分正式日报、手工复跑和历史补跑: 项目用以下字段区分正式日报、手工复跑和历史补跑:

View File

@@ -59,9 +59,11 @@
- 手工复跑命令已确定:`bash scripts/run_real_pipeline.sh` - 手工复跑命令已确定:`bash scripts/run_real_pipeline.sh`
- 历史补跑命令已确定:`bash scripts/rebuild_historical_report.sh YYYY-MM-DD` - 历史补跑命令已确定:`bash scripts/rebuild_historical_report.sh YYYY-MM-DD`
- 日内价格追踪命令已确定:`bash scripts/run_intraday_price_watch.sh` - 日内价格追踪命令已确定:`bash scripts/run_intraday_price_watch.sh`
- 日内新闻发现与验证命令已确定:`bash scripts/run_intraday_discovery_watch.sh`
- `OPENROUTER_API_KEY` 已在正式调度环境可用 - `OPENROUTER_API_KEY` 已在正式调度环境可用
- `FEISHU_WEBHOOK` 已配置或明确不上告警 - `FEISHU_WEBHOOK` 已配置或明确不上告警
- 候选发现所需 search / LLM provider 已配置,缺失时会以前置条件错误失败,不会伪装成“无新闻”
### 安全与访问控制 ### 安全与访问控制
@@ -141,6 +143,8 @@ bash scripts/run_real_pipeline.sh
``` ```
# 日内价格追踪(推荐) # 日内价格追踪(推荐)
0 */4 * * * cd /path/to/llm-intelligence && bash scripts/run_intraday_price_watch.sh >> /tmp/llm_hub_intraday.log 2>&1 0 */4 * * * cd /path/to/llm-intelligence && bash scripts/run_intraday_price_watch.sh >> /tmp/llm_hub_intraday.log 2>&1
# 日内新闻发现与验证(推荐)
0 */2 * * * cd /path/to/llm-intelligence && bash scripts/run_intraday_discovery_watch.sh >> /tmp/llm_hub_intraday_discovery.log 2>&1
### 7. 线上冒烟 ### 7. 线上冒烟

View File

@@ -0,0 +1,420 @@
# Intraday Discovery + Verification Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
**Goal:** 在不污染正式日报语义的前提下,为现有日内链路增加“搜索引擎 + 大模型候选发现层”和“官方来源验证层”,让当天的大模型价格新闻、版本发布、活动窗口能更早进入候选池,并只把已验证事实接入现有 `daily_signal_snapshot` / 日报语义链路。
**Architecture:** 保留现有 `scripts/run_intraday_price_watch.sh` 作为结构化价格事实刷新入口,不改它“只刷新价格/信号、不生成正式日报”的边界。新增一条独立的 `run_intraday_discovery_watch.sh` 发现链路:先用搜索引擎与 LLM 生成候选事件,再通过官方页面 / 价格页 / docs / 公告页做二次验证。候选与验证结果分别落入新表;只有 `official_confirmed` 的事件才允许映射进 `materialize_daily_signals.go``signalModelEvent`,并由现有 `generate_daily_report.go` 继续消费,不新造第二套日报事实系统。发现层与验证层必须通过仓库内可运行的 provider adapter 落地,不能依赖当前会话专属工具;实现上采用“命令或 HTTP provider 适配层 + fixture 测试”的方式,确保本地 cron 和 CI 环境可执行。已验证 discovery 事件接入现有事件流时必须去重:若同一 `provider + model + event_type + date` 已由 importer / 原生 loader 给出则以原生事实为准discovery 事件只补缺,不覆盖。
**Tech Stack:** Go 1.22、PostgreSQL、Bash、可配置搜索/LLM provider adapter、JSONB
---
### Task 1: 为候选发现与验证链路定义持久化结构
**Files:**
- Create: `db/migrations/017_intraday_news_candidates.sql`
- Modify: `docs/CONFIGURATION.md`
- Modify: `DEPLOYMENT.md`
**Step 1: 新增候选表与验证表 migration**
创建两张表:
- `intraday_news_candidate`
- `intraday_news_verification`
候选表至少包含:
- `candidate_date`
- `event_type`
- `provider_name`
- `model_name`
- `provider_country`
- `title`
- `summary`
- `candidate_urls JSONB`
- `discovery_source`
- `discovery_query`
- `discovery_evidence JSONB`
- `normalized_key`
- `status`
- `verification_confidence`
- `verification_notes`
验证表至少包含:
- `candidate_id`
- `verifier_source`
- `verifier_url`
- `verifier_status`
- `extracted_facts JSONB`
- `notes`
约束:
- `intraday_news_candidate.normalized_key` 必须唯一,用于防止同日重复发现
- `status` 至少支持:`candidate` / `verifying` / `verified` / `rejected` / `stale`
- `verification_confidence` 至少支持:`candidate` / `secondary_confirmed` / `official_confirmed`
**Step 2: 明确与正式事实层的边界文档**
在 `docs/CONFIGURATION.md` 与 `DEPLOYMENT.md` 中写明:
- 候选发现层不会直接写 `daily_report`
- 候选发现层不会覆盖 `latest_report`
- `daily_signal_snapshot` 只消费已验证事实,不消费 `candidate_only`
- `leak_or_rumor` 默认只保留在候选层,不进入正式日报事实
**Step 3: 运行 migration 验证**
Run:
- `bash scripts/apply_migration.sh`
Expected:
- 新表创建成功
- 重复执行 migration 不报错
**Step 4: Commit**
```bash
git add db/migrations/017_intraday_news_candidates.sql docs/CONFIGURATION.md DEPLOYMENT.md
git commit -m "feat(intraday): add candidate and verification persistence"
```
---
### Task 2: 实现候选发现层最小闭环
**Files:**
- Create: `scripts/discover_intraday_news_candidates.go`
- Create: `scripts/discover_intraday_news_candidates_test.go`
- Create: `scripts/testdata/intraday_discovery_search_sample.json`
- Create: `scripts/testdata/intraday_discovery_llm_sample.json`
- Modify: `docs/CONFIGURATION.md`
- Create: `scripts/intraday_discovery_provider.go`
**Step 1: 先写失败测试**
补 4 组测试:
- 搜索结果解析测试:验证能从样例结果提取 title / summary / url / provider 线索
- LLM 输出解析测试:验证能把 LLM JSON 输出转成候选事件
- 候选归一化测试:验证同一事件经过标题差异改写后仍生成同一 `normalized_key`
- URL 过滤测试:验证没有 URL 的候选被丢弃,避免 LLM 空口造线索
**Step 2: 运行失败测试**
Run:
- `go test -count=1 -tags llm_script ./scripts/discover_intraday_news_candidates.go ./scripts/discover_intraday_news_candidates_test.go`
Expected:
- 新增测试失败
- 失败原因是缺少解析、归一化或去重逻辑
**Step 3: 实现最小候选发现器**
在 `discover_intraday_news_candidates.go` 中实现:
- 固定 provider 查询模板集(中英双语)
- 搜索结果抓取适配层
- LLM 候选摘要适配层
- 去重与归一化逻辑
- 写入 `intraday_news_candidate`
- provider adapter 抽象层(搜索 / LLM 均可通过命令或 HTTP provider 接入,默认实现不可依赖当前会话专属工具)
限制:
- LLM 只允许输出候选,不允许直接标成 `verified`
- 无 URL 候选直接丢弃
- 搜索 / LLM provider 未配置时必须以前置条件错误退出,不能伪装成业务无新闻
- 默认事件类型至少支持:
- `price_cut`
- `price_increase`
- `official_release`
- `promo_campaign`
- `leak_or_rumor`
- `unknown`
**Step 4: 重新运行测试**
Run:
- `go test -count=1 -tags llm_script ./scripts/discover_intraday_news_candidates.go ./scripts/discover_intraday_news_candidates_test.go`
Expected:
- 候选解析与归一化测试通过
**Step 5: 运行一次 dry-run 验证**
Run:
- `go run -tags llm_script ./scripts/discover_intraday_news_candidates.go --date=2026-05-25 --dry-run`
Expected:
- 输出 `candidate_total` / `provider_hit_count` / `event_type_counts`
- dry-run 不写 `daily_report`
- dry-run 不改 `latest_report`
**Step 6: Commit**
```bash
git add scripts/discover_intraday_news_candidates.go scripts/discover_intraday_news_candidates_test.go scripts/intraday_discovery_provider.go scripts/testdata/intraday_discovery_search_sample.json scripts/testdata/intraday_discovery_llm_sample.json docs/CONFIGURATION.md
git commit -m "feat(intraday): add news candidate discovery pipeline"
```
---
### Task 3: 实现候选验证层并固化“只信官方事实”的规则
**Files:**
- Create: `scripts/verify_intraday_news_candidates.go`
- Create: `scripts/verify_intraday_news_candidates_test.go`
- Create: `scripts/testdata/intraday_verification_official_release.html`
- Create: `scripts/testdata/intraday_verification_pricing_page.html`
- Create: `scripts/testdata/intraday_verification_secondary_media.html`
- Modify: `docs/CONFIGURATION.md`
**Step 1: 先写失败测试**
补 5 组测试:
- 官方发布页验证测试:命中模型名与发布时间时,产出 `official_confirmed`
- 官方价格页验证测试:只有拿到真实价格变化时,才允许产出 `price_cut` / `price_increase`
- 活动页验证测试:官方活动页可映射为 `promo_campaign`
- 二手媒体降级测试:二手媒体最多得到 `secondary_confirmed`,不能直接进入正式事实层
- 泄露类隔离测试:`leak_or_rumor` 即使有外部讨论,也不会升级为正式日报事实
**Step 2: 运行失败测试**
Run:
- `go test -count=1 -tags llm_script ./scripts/verify_intraday_news_candidates.go ./scripts/verify_intraday_news_candidates_test.go`
Expected:
- 新增测试失败
- 失败原因是缺少来源分类与验证状态映射逻辑
**Step 3: 实现验证器**
在 `verify_intraday_news_candidates.go` 中实现:
- 读取 `candidate` / `verifying` 状态候选
- 拉取 `candidate_urls`
- 基于域名与页面内容判定:
- `official_page`
- `pricing_page`
- `official_docs`
- `official_blog`
- `secondary_media`
- 把验证轨迹写入 `intraday_news_verification`
- 更新 `intraday_news_candidate.status` 与 `verification_confidence`
- 验证成功后只更新候选层状态,不直接写 `daily_signal_snapshot`;正式事实仍统一由物化器汇总
规则:
- 只有官方页面 / 价格页 / docs / 公告页可以产出 `official_confirmed`
- 价格新闻若无法拿到真实价格事实,只能维持候选或二级确认,不能伪造价格变化事件
- `leak_or_rumor` 默认不升级为正式事实
**Step 4: 重新运行测试**
Run:
- `go test -count=1 -tags llm_script ./scripts/verify_intraday_news_candidates.go ./scripts/verify_intraday_news_candidates_test.go`
Expected:
- 验证规则测试通过
**Step 5: 运行一次 dry-run 验证**
Run:
- `go run -tags llm_script ./scripts/verify_intraday_news_candidates.go --date=2026-05-25 --dry-run`
Expected:
- 输出 `verified_total` / `official_confirmed_total` / `secondary_confirmed_total`
- dry-run 只打印摘要,不写 `daily_report`
**Step 6: Commit**
```bash
git add scripts/verify_intraday_news_candidates.go scripts/verify_intraday_news_candidates_test.go scripts/testdata/intraday_verification_official_release.html scripts/testdata/intraday_verification_pricing_page.html scripts/testdata/intraday_verification_secondary_media.html docs/CONFIGURATION.md
git commit -m "feat(intraday): add candidate verification pipeline"
```
---
### Task 4: 把已验证事件接入现有 `materialize_daily_signals.go`
**Files:**
- Modify: `scripts/materialize_daily_signals.go`
- Create or Modify: `scripts/materialize_daily_signals_test.go`
- Modify: `docs/plans/2026-05-27-intraday-price-watch-plan.md`
- Modify: `README.md`
- Modify: `docs/PRODUCTION_CHECKLIST.md`
**Step 1: 先写失败测试**
补 4 组测试:
- 已验证官方发布事件会进入 `daily_signal_snapshot.top_events`
- 已验证活动事件会进入 `daily_signal_snapshot.top_events`
- `candidate_only` 与 `leak_or_rumor` 不进入正式快照
- 未拿到真实价格变化数据的“价格新闻”不会被错误映射为 `price_cut` / `price_increase`
**Step 2: 运行失败测试**
Run:
- `go test -count=1 -tags llm_script ./scripts/materialize_daily_signals.go ./scripts/materialize_daily_signals_test.go`
Expected:
- 新增测试失败
- 失败原因是当前物化器还不会读取已验证候选事件
**Step 3: 最小实现 verified event loader**
在 `materialize_daily_signals.go` 中新增:
- `loadVerifiedIntradayNewsEvents(db, date string)`
- 把 `official_confirmed` 的:
- `official_release`
- `promo_campaign`
- 已确认真实价格变化的 `price_cut` / `price_increase`
映射为现有 `signalModelEvent`
- 与现有 `loadSignalModelEvents` 结果做去重合并;同日同模型同事件类型若已由 importer / 原生 loader 给出,则 discovery 事件仅补 `SourceURL` / 证据缺口,不抢占优先级
约束:
- 不新造第二套快照表
- 不改变 `daily_signal_snapshot` 的正式事实语义
- `secondary_confirmed` 默认不进入正式快照
**Step 4: 重新运行测试**
Run:
- `go test -count=1 -tags llm_script ./scripts/materialize_daily_signals.go ./scripts/materialize_daily_signals_test.go`
Expected:
- verified event 相关测试通过
**Step 5: 联合验证日内边界**
Run:
- `REPORT_TRIGGER_SOURCE=intraday_discovery go run -tags llm_script ./scripts/materialize_daily_signals.go --date=2026-05-25 --dry-run`
Expected:
- 输出含 `page_mode` / `event_count`
- 不写 `daily_report`
- 不覆盖 `latest_report`
**Step 6: Commit**
```bash
git add scripts/materialize_daily_signals.go scripts/materialize_daily_signals_test.go README.md docs/PRODUCTION_CHECKLIST.md docs/plans/2026-05-27-intraday-price-watch-plan.md
git commit -m "feat(intraday): materialize verified discovery events"
```
---
### Task 5: 组装新的日内发现入口并补部署说明
**Files:**
- Create: `scripts/run_intraday_discovery_watch.sh`
- Modify: `README.md`
- Modify: `docs/CONFIGURATION.md`
- Modify: `DEPLOYMENT.md`
- Modify: `docs/PRODUCTION_CHECKLIST.md`
**Step 1: 实现独立入口脚本**
脚本顺序固定为:
1. `discover_intraday_news_candidates.go`
2. `verify_intraday_news_candidates.go`
3. `materialize_daily_signals.go`(仅消费 verified 事件)
要求:
- 明确要求 `DATABASE_URL`
- 搜索 / LLM 所需 key 缺失时,输出前置条件错误,不伪装成代码失败
- 不执行 `generate_daily_report.go`
- 不写 `daily_report`
- 不覆盖 `latest_report`
**Step 2: 更新调度文档**
文档里明确两条 cron:
- 结构化价格刷新:`run_intraday_price_watch.sh`
- 新闻发现与验证:`run_intraday_discovery_watch.sh`
推荐起步频率:
- `run_intraday_discovery_watch.sh`:每 2 小时一次
- `run_intraday_price_watch.sh`:每 4 小时一次
**Step 3: 运行脚本级 dry-run**
Run:
- `bash scripts/run_intraday_discovery_watch.sh --dry-run`
Expected:
- 输出候选发现摘要 + 验证摘要 + 信号物化摘要
- 不生成正式日报产物
**Step 4: Commit**
```bash
git add scripts/run_intraday_discovery_watch.sh README.md docs/CONFIGURATION.md DEPLOYMENT.md docs/PRODUCTION_CHECKLIST.md
git commit -m "feat(intraday): add discovery watch runner"
```
---
### Task 6: 运行最终联合验收并准备本地提交
**Files:**
- Modify: `README.md`(仅在最终说明缺失时)
- Modify: `docs/CONFIGURATION.md`(仅在最终说明缺失时)
- Modify: `DEPLOYMENT.md`(仅在最终说明缺失时)
**Step 1: 运行 focused Go tests**
Run:
- `go test -count=1 -tags llm_script ./scripts/discover_intraday_news_candidates.go ./scripts/discover_intraday_news_candidates_test.go`
- `go test -count=1 -tags llm_script ./scripts/verify_intraday_news_candidates.go ./scripts/verify_intraday_news_candidates_test.go`
- `go test -count=1 -tags llm_script ./scripts/materialize_daily_signals.go ./scripts/materialize_daily_signals_test.go`
Expected:
- 发现层、验证层、信号物化层 focused tests 全通过
**Step 2: 运行现有日报/前端回归边界**
Run:
- `go test -count=1 -tags llm_script ./scripts/generate_daily_report.go ./scripts/generate_daily_report_test.go ./scripts/official_import_signature_audit_query_lib.go`
- `bash scripts/secret_gate_test.sh`
- `bash scripts/test_importers.sh`
- `cd frontend && npm test -- --run`
- `cd frontend && npm run build`
Expected:
- 原有日报与前端链路不回归
- discovery 新增能力不污染正式日报边界
**Step 3: 运行脚本级联合 dry-run**
Run:
- `bash scripts/run_intraday_discovery_watch.sh --dry-run`
- `REPORT_TRIGGER_SOURCE=intraday go run -tags llm_script ./scripts/materialize_daily_signals.go --date=2026-05-25 --dry-run`
Expected:
- 不写 `daily_report`
- 不覆盖 `latest_report`
- 能稳定输出候选数、验证数、事件数、page_mode、source_audit
**Step 4: 本地提交**
```bash
git add db/migrations/017_intraday_news_candidates.sql scripts/discover_intraday_news_candidates.go scripts/discover_intraday_news_candidates_test.go scripts/verify_intraday_news_candidates.go scripts/verify_intraday_news_candidates_test.go scripts/materialize_daily_signals.go scripts/materialize_daily_signals_test.go scripts/run_intraday_discovery_watch.sh README.md docs/CONFIGURATION.md DEPLOYMENT.md docs/PRODUCTION_CHECKLIST.md docs/plans/2026-05-25-intraday-discovery-verification-implementation-plan.md docs/plans/2026-05-27-intraday-price-watch-plan.md
git commit -m "feat(intraday): add discovery and verification watch pipeline"
```
---
## 验收标准
实现完成后,必须同时满足:
- 搜索 + LLM 只能产生候选事件,不能直接写成正式日报事实
- 只有 `official_confirmed` 的事件才能进入正式 `daily_signal_snapshot` 语义链路
- `leak_or_rumor` 不进入正式日报事实层
- `run_intraday_discovery_watch.sh` 与 `run_intraday_price_watch.sh` 职责分离
- 正式日报仍只由 `run_daily.sh` 负责
- 新增链路不会写 `daily_report`、不会覆盖 `latest_report`
- discovery provider adapter 在无配置时会明确报前置条件错误;有 fixture / dry-run 模式可本地验证
- 新增 focused tests、现有日报测试、前端构建全部通过
## 非目标
本计划刻意不做:
- 不新增第二套正式日报系统
- 不让 LLM 直接替代价格 importer 或官方发布 importer
- 不把二手媒体新闻直接映射为 `price_cut` / `price_increase`
- 不在第一阶段引入新的前端“候选情报面板”复杂交互;若后续需要,单独立计划

View File

@@ -55,6 +55,6 @@
## 下一步建议 ## 下一步建议
1. 把前端查询页增加“最近一次价格追踪时间”提示 1. `run_intraday_discovery_watch.sh` 补充生产级 provider adapter 和调度说明
2. `materialize_daily_signals.go` 增加 `trigger_source=intraday` 的文档说明 2.前端查询页增加“最近一次价格追踪时间 / 最近一次 discovery 验证时间”提示
3. 如果日内事件仍不够敏感,再考虑引入独立 `intraday_signal_snapshot` 3. 如果日内事件仍不够敏感,再考虑引入独立 `intraday_signal_snapshot` 或候选情报面板

View File

@@ -0,0 +1,410 @@
//go:build llm_script
package main
import (
"context"
"database/sql"
"encoding/json"
"flag"
"fmt"
"log/slog"
"os"
"sort"
"strings"
"time"
_ "github.com/lib/pq"
)
// intradayNewsCandidate is the in-memory form of one discovered news candidate.
// Its fields correspond by name to the columns of the intraday_news_candidate
// table.
type intradayNewsCandidate struct {
// CandidateDate is the discovery date passed in by the caller.
CandidateDate string
// EventType is run through normalizeIntradayEventType before the candidate is
// merged.
EventType string
// ProviderName is inferred from Title when the LLM record leaves it empty.
ProviderName string
ModelName string
ProviderCountry string
Title string
Summary string
// CandidateURLs holds the source URLs; candidates with none are dropped.
CandidateURLs []string
DiscoverySource string
DiscoveryQuery string
DiscoveryEvidence map[string]any
// NormalizedKey is the dedup key produced by buildIntradayNormalizedKey.
NormalizedKey string
// Status and VerificationConfidence both start as "candidate" for records
// built from LLM output.
Status string
VerificationConfidence string
VerificationNotes string
}
// intradayDiscoveryConfig carries everything one discovery run needs; it is
// assembled from command-line flags and environment variables.
type intradayDiscoveryConfig struct {
// Date is the --date flag value; an empty value aborts the run.
Date string
// DryRun is the --dry-run flag; when set, only the summary is printed and the
// database is never opened.
DryRun bool
// Search and LLM are the provider settings read from the
// INTRADAY_DISCOVERY_SEARCH_* and INTRADAY_DISCOVERY_LLM_* variables.
Search intradayProviderConfig
LLM intradayProviderConfig
// DatabaseURL is the DSN handed to sql.Open.
DatabaseURL string
// Timeout is copied into both provider configs.
Timeout time.Duration
// ProviderLimit is the --provider-limit flag (default 10); it caps how many
// built-in providers are used when building search queries.
ProviderLimit int
}
// intradayDiscoverySummary is the JSON-tagged run summary that is printed at
// the end of both dry-run and normal runs.
// NOTE(review): the fields are filled by summarizeIntradayCandidates, which is
// not visible here; the exact meaning of each counter should be confirmed there.
type intradayDiscoverySummary struct {
CandidateTotal int `json:"candidate_total"`
ProviderHitCount int `json:"provider_hit_count"`
EventTypeCounts map[string]int `json:"event_type_counts"`
DiscoverySourceSet []string `json:"discovery_source_set"`
DryRun bool `json:"dry_run"`
}
// intradayDiscoveryLogger is the package-wide structured logger; it is
// assigned once in init.
var intradayDiscoveryLogger *slog.Logger

// init wires the logger to emit JSON records at Info level and above on stderr.
func init() {
	handlerOptions := &slog.HandlerOptions{Level: slog.LevelInfo}
	jsonHandler := slog.NewJSONHandler(os.Stderr, handlerOptions)
	intradayDiscoveryLogger = slog.New(jsonHandler)
}
// main loads the environment, builds the run configuration from flags and
// environment variables, and performs one discovery pass. Any error is written
// to stderr with the script name as prefix and the process exits with status 1.
func main() {
	loadIntradayEnv()
	err := runIntradayCandidateDiscovery(loadIntradayDiscoveryConfig())
	if err == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "discover_intraday_news_candidates: %v\n", err)
	os.Exit(1)
}
// loadIntradayDiscoveryConfig parses the command-line flags (--date, --dry-run,
// --provider-limit) and then fills the remaining settings from the environment.
// Every INTRADAY_DISCOVERY_* value is whitespace-trimmed, and both provider
// configs share the same timeout.
func loadIntradayDiscoveryConfig() intradayDiscoveryConfig {
	cfg := intradayDiscoveryConfig{}

	flag.StringVar(&cfg.Date, "date", intradayDateValue(), "候选发现日期,格式 YYYY-MM-DD")
	flag.BoolVar(&cfg.DryRun, "dry-run", false, "仅输出摘要,不写数据库")
	flag.IntVar(&cfg.ProviderLimit, "provider-limit", 10, "最大 provider 数")
	flag.Parse()

	// trimmedEnv reads one environment variable with surrounding whitespace removed.
	trimmedEnv := func(key string) string {
		return strings.TrimSpace(os.Getenv(key))
	}

	cfg.DatabaseURL = intradayDefaultDSN()
	cfg.Timeout = discoveryTimeoutFromEnv()

	cfg.Search = intradayProviderConfig{
		Mode:    trimmedEnv("INTRADAY_DISCOVERY_SEARCH_PROVIDER"),
		Command: trimmedEnv("INTRADAY_DISCOVERY_SEARCH_COMMAND"),
		URL:     trimmedEnv("INTRADAY_DISCOVERY_SEARCH_URL"),
		Fixture: trimmedEnv("INTRADAY_DISCOVERY_SEARCH_FIXTURE"),
		Timeout: cfg.Timeout,
	}
	cfg.LLM = intradayProviderConfig{
		Mode:    trimmedEnv("INTRADAY_DISCOVERY_LLM_PROVIDER"),
		Command: trimmedEnv("INTRADAY_DISCOVERY_LLM_COMMAND"),
		URL:     trimmedEnv("INTRADAY_DISCOVERY_LLM_URL"),
		Fixture: trimmedEnv("INTRADAY_DISCOVERY_LLM_FIXTURE"),
		Timeout: cfg.Timeout,
	}
	return cfg
}
// runIntradayCandidateDiscovery performs one discovery pass for cfg.Date.
//
// It validates the date and both provider configs, builds the search queries,
// loads search records and then LLM records, normalizes them into candidates
// and prints a summary. Unless cfg.DryRun is set, the candidates are upserted
// into the database before the summary is printed.
//
// The first error from validation, a provider, the database or the summary
// printer is returned.
func runIntradayCandidateDiscovery(cfg intradayDiscoveryConfig) error {
	if strings.TrimSpace(cfg.Date) == "" {
		return fmt.Errorf("date 未设置")
	}

	// Fail on missing provider configuration before any provider is called.
	providerChecks := []struct {
		label    string
		settings intradayProviderConfig
	}{
		{"search", cfg.Search},
		{"llm", cfg.LLM},
	}
	for _, check := range providerChecks {
		if err := validateIntradayProviderConfig(check.label, check.settings); err != nil {
			return err
		}
	}

	searchQueries := buildIntradayQueries(cfg.Date, cfg.ProviderLimit)
	searchHits, err := loadIntradaySearchRecords(cfg.Search, cfg.Date, searchQueries)
	if err != nil {
		return err
	}
	llmHits, err := loadIntradayLLMRecords(cfg.LLM, cfg.Date, searchHits)
	if err != nil {
		return err
	}

	found := normalizeIntradayCandidates(cfg.Date, searchHits, llmHits)
	report := summarizeIntradayCandidates(found, cfg.DryRun)

	if !cfg.DryRun {
		conn, openErr := sql.Open("postgres", cfg.DatabaseURL)
		if openErr != nil {
			return fmt.Errorf("open db: %w", openErr)
		}
		// Closed when the function returns, i.e. after the summary is printed.
		defer conn.Close()

		if upsertErr := upsertIntradayCandidates(context.Background(), conn, found); upsertErr != nil {
			return upsertErr
		}
	}
	return printIntradayDiscoverySummary(report)
}
// validateIntradayProviderConfig checks that a provider config is usable.
//
// name is the label used in error messages ("search" or "llm" at the visible
// call sites). The mode must be one of "fixture", "command_json" or
// "http_json", and the setting that mode depends on (Fixture, Command or URL
// respectively) must be non-blank. A nil return means the config passed.
func validateIntradayProviderConfig(name string, cfg intradayProviderConfig) error {
	if strings.TrimSpace(cfg.Mode) == "" {
		return fmt.Errorf("%s provider 未设置", name)
	}

	// Pick the setting the selected mode requires, plus its label for errors.
	var requiredValue, requiredLabel string
	switch cfg.Mode {
	case "fixture":
		requiredValue, requiredLabel = cfg.Fixture, "fixture"
	case "command_json":
		requiredValue, requiredLabel = cfg.Command, "command"
	case "http_json":
		requiredValue, requiredLabel = cfg.URL, "url"
	default:
		return fmt.Errorf("%s provider mode 不支持: %s", name, cfg.Mode)
	}

	if strings.TrimSpace(requiredValue) != "" {
		return nil
	}
	return fmt.Errorf("%s provider %s 未设置", name, requiredLabel)
}
// buildIntradayQueries returns the fixed search query set for date.
//
// Each query has the form "<date> <provider> <keywords>", trimmed of
// surrounding whitespace, with one English and one Chinese keyword set per
// provider. Queries are ordered provider by provider. A providerLimit that is
// positive and smaller than the built-in provider list keeps only the first
// providerLimit providers; any other value uses the whole list.
func buildIntradayQueries(date string, providerLimit int) []string {
	keywordSets := []string{"pricing release announcement", "模型 降价 发布 活动"}
	providerNames := []string{
		"OpenAI", "Anthropic", "Google Gemini", "xAI", "DeepSeek",
		"DashScope", "Qwen", "智谱", "百度文心", "腾讯混元", "火山方舟", "MiniMax",
	}

	providerCount := len(providerNames)
	if providerLimit > 0 && providerLimit < providerCount {
		providerCount = providerLimit
	}

	built := make([]string, 0, providerCount*len(keywordSets))
	for idx := 0; idx < providerCount; idx++ {
		for _, keywordSet := range keywordSets {
			query := date + " " + providerNames[idx] + " " + keywordSet
			built = append(built, strings.TrimSpace(query))
		}
	}
	return built
}
// normalizeIntradayCandidates converts LLM records into de-duplicated news
// candidates for date. Records without any candidate URL are dropped, a missing
// provider is inferred from the title, the event type is normalized, and
// candidates sharing a normalized key are merged. The result is sorted by
// provider name, then event type, then normalized key.
func normalizeIntradayCandidates(date string, searchRecords []intradaySearchRecord, llmRecords []intradayLLMRecord) []intradayNewsCandidate {
	byURL := indexSearchRecordsByURL(searchRecords)
	grouped := map[string]intradayNewsCandidate{}
	for i := range llmRecords {
		item := candidateFromLLMRecord(date, llmRecords[i], byURL)
		if len(item.CandidateURLs) == 0 {
			continue
		}
		if item.ProviderName == "" {
			item.ProviderName = inferProviderFromTitle(item.Title)
		}
		item.EventType = normalizeIntradayEventType(item.EventType)
		item.NormalizedKey = buildIntradayNormalizedKey(item)
		mergeIntradayCandidate(grouped, item)
	}

	ordered := make([]intradayNewsCandidate, 0, len(grouped))
	for key := range grouped {
		ordered = append(ordered, grouped[key])
	}
	sort.Slice(ordered, func(a, b int) bool {
		left, right := ordered[a], ordered[b]
		switch {
		case left.ProviderName != right.ProviderName:
			return left.ProviderName < right.ProviderName
		case left.EventType != right.EventType:
			return left.EventType < right.EventType
		default:
			return left.NormalizedKey < right.NormalizedKey
		}
	})
	return ordered
}
// candidateFromLLMRecord builds a candidate from one LLM record. Every
// candidate URL that also appears in searchIndex upgrades the discovery source
// to "web_search+llm", records the search hit as evidence, and fills in a
// provider name, title or summary the LLM record left blank. When several URLs
// match, the last match decides DiscoveryQuery and the "search_record" evidence.
func candidateFromLLMRecord(date string, record intradayLLMRecord, searchIndex map[string]intradaySearchRecord) intradayNewsCandidate {
	evidence := map[string]any{"llm_record": record}
	result := intradayNewsCandidate{
		CandidateDate:          date,
		EventType:              record.EventType,
		ProviderName:           strings.TrimSpace(record.ProviderName),
		ModelName:              strings.TrimSpace(record.ModelName),
		ProviderCountry:        strings.TrimSpace(record.ProviderCountry),
		Title:                  strings.TrimSpace(record.Title),
		Summary:                strings.TrimSpace(record.Summary),
		CandidateURLs:          dedupeStrings(record.CandidateURLs),
		DiscoverySource:        "llm_answer",
		DiscoveryEvidence:      evidence,
		Status:                 "candidate",
		VerificationConfidence: "candidate",
	}
	for i := range result.CandidateURLs {
		hit, found := searchIndex[result.CandidateURLs[i]]
		if !found {
			continue
		}
		result.DiscoverySource = "web_search+llm"
		result.DiscoveryQuery = hit.Title
		evidence["search_record"] = hit
		if result.ProviderName == "" {
			result.ProviderName = strings.TrimSpace(hit.Provider)
		}
		if result.Title == "" {
			result.Title = strings.TrimSpace(hit.Title)
		}
		if result.Summary == "" {
			result.Summary = strings.TrimSpace(hit.Summary)
		}
	}
	return result
}
// indexSearchRecordsByURL maps each record's trimmed URL to the record itself.
// Records with a blank URL are left out; for duplicate URLs the later record wins.
func indexSearchRecordsByURL(records []intradaySearchRecord) map[string]intradaySearchRecord {
	lookup := make(map[string]intradaySearchRecord, len(records))
	for i := range records {
		if key := strings.TrimSpace(records[i].URL); key != "" {
			lookup[key] = records[i]
		}
	}
	return lookup
}
// mergeIntradayCandidate stores candidate in target under its normalized key.
// A candidate without a key is ignored. When the key is already present, the
// existing entry is kept and enriched: URLs are unioned, a blank summary or
// provider country is filled in, differing discovery sources collapse to
// "web_search+llm", and the incoming evidence entries replace the stored ones.
func mergeIntradayCandidate(target map[string]intradayNewsCandidate, candidate intradayNewsCandidate) {
	key := candidate.NormalizedKey
	if key == "" {
		return
	}
	current, found := target[key]
	if !found {
		target[key] = candidate
		return
	}

	current.CandidateURLs = dedupeStrings(append(current.CandidateURLs, candidate.CandidateURLs...))
	if strings.TrimSpace(current.Summary) == "" {
		current.Summary = candidate.Summary
	}
	if strings.TrimSpace(current.ProviderCountry) == "" {
		current.ProviderCountry = candidate.ProviderCountry
	}
	if candidate.DiscoverySource != "" && current.DiscoverySource != candidate.DiscoverySource {
		current.DiscoverySource = "web_search+llm"
	}
	if current.DiscoveryEvidence == nil {
		current.DiscoveryEvidence = map[string]any{}
	}
	for _, field := range []string{"llm_record", "search_record"} {
		if value, present := candidate.DiscoveryEvidence[field]; present {
			current.DiscoveryEvidence[field] = value
		}
	}
	target[key] = current
}
// buildIntradayNormalizedKey derives the de-duplication key for a candidate:
// "<date>|<event type>|<provider>|<model>", each part normalized by
// normalizeWord.
//
// When the candidate has no usable model name the title is used instead, so
// that distinct model-less events of one provider do not share a key.
// normalizeWord reports an empty input as "unknown" rather than "", therefore
// that sentinel (not only the empty string) has to trigger the fallback.
func buildIntradayNormalizedKey(candidate intradayNewsCandidate) string {
	provider := normalizeWord(candidate.ProviderName)
	model := normalizeWord(candidate.ModelName)
	if model == "" || model == "unknown" {
		model = normalizeWord(candidate.Title)
	}
	return strings.Join([]string{
		candidate.CandidateDate,
		normalizeWord(candidate.EventType),
		provider,
		model,
	}, "|")
}
// summarizeIntradayCandidates aggregates run statistics: total candidates,
// number of distinct non-empty provider names, a count per event type, and the
// sorted list of distinct non-empty discovery sources. dryRun is copied through.
func summarizeIntradayCandidates(candidates []intradayNewsCandidate, dryRun bool) intradayDiscoverySummary {
	perType := map[string]int{}
	providers := map[string]struct{}{}
	origins := map[string]struct{}{}
	for i := range candidates {
		item := &candidates[i]
		perType[item.EventType]++
		if item.ProviderName != "" {
			providers[item.ProviderName] = struct{}{}
		}
		if item.DiscoverySource != "" {
			origins[item.DiscoverySource] = struct{}{}
		}
	}

	sortedOrigins := make([]string, 0, len(origins))
	for name := range origins {
		sortedOrigins = append(sortedOrigins, name)
	}
	sort.Strings(sortedOrigins)

	return intradayDiscoverySummary{
		CandidateTotal:     len(candidates),
		ProviderHitCount:   len(providers),
		EventTypeCounts:    perType,
		DiscoverySourceSet: sortedOrigins,
		DryRun:             dryRun,
	}
}
// printIntradayDiscoverySummary writes the summary to stdout as a single JSON
// line. Nothing is printed when encoding fails; the encoding error is returned.
func printIntradayDiscoverySummary(summary intradayDiscoverySummary) error {
	encoded, err := json.Marshal(summary)
	if err == nil {
		fmt.Println(string(encoded))
	}
	return err
}
// upsertIntradayCandidates writes every candidate to intraday_news_candidate,
// inserting new rows and updating existing ones that share a normalized_key.
//
// On conflict only title, summary, candidate_urls, discovery_source,
// discovery_query, discovery_evidence, provider_country and updated_at are
// refreshed; status and the verification columns of an existing row are left
// untouched. Empty optional strings are stored as NULL, and an empty incoming
// summary / discovery_query / provider_country keeps the stored value.
//
// Each candidate is written with its own statement (no transaction is opened
// here), and the first failure aborts the loop, so earlier rows stay written.
// Returns an error when db is nil, when JSON encoding fails, or when a
// statement fails.
func upsertIntradayCandidates(ctx context.Context, db *sql.DB, candidates []intradayNewsCandidate) error {
if db == nil {
return fmt.Errorf("db is nil")
}
for _, candidate := range candidates {
// URLs and evidence are stored in jsonb columns, so encode them up front.
urls, err := json.Marshal(candidate.CandidateURLs)
if err != nil {
return fmt.Errorf("marshal candidate urls: %w", err)
}
evidence, err := json.Marshal(candidate.DiscoveryEvidence)
if err != nil {
return fmt.Errorf("marshal discovery evidence: %w", err)
}
// The positional arguments below must stay in the same order as $1..$15.
_, err = db.ExecContext(ctx, `
INSERT INTO intraday_news_candidate (
candidate_date, event_type, provider_name, model_name, provider_country,
title, summary, candidate_urls, discovery_source, discovery_query,
discovery_evidence, normalized_key, status, verification_confidence, verification_notes
) VALUES (
$1::date, $2, $3, NULLIF($4, ''), NULLIF($5, ''),
$6, NULLIF($7, ''), $8::jsonb, $9, NULLIF($10, ''),
$11::jsonb, $12, $13, $14, NULLIF($15, '')
)
ON CONFLICT (normalized_key) DO UPDATE SET
title = EXCLUDED.title,
summary = COALESCE(NULLIF(EXCLUDED.summary, ''), intraday_news_candidate.summary),
candidate_urls = EXCLUDED.candidate_urls,
discovery_source = EXCLUDED.discovery_source,
discovery_query = COALESCE(NULLIF(EXCLUDED.discovery_query, ''), intraday_news_candidate.discovery_query),
discovery_evidence = EXCLUDED.discovery_evidence,
provider_country = COALESCE(NULLIF(EXCLUDED.provider_country, ''), intraday_news_candidate.provider_country),
updated_at = CURRENT_TIMESTAMP`,
candidate.CandidateDate,
candidate.EventType,
candidate.ProviderName,
candidate.ModelName,
candidate.ProviderCountry,
candidate.Title,
candidate.Summary,
string(urls),
candidate.DiscoverySource,
candidate.DiscoveryQuery,
string(evidence),
candidate.NormalizedKey,
candidate.Status,
candidate.VerificationConfidence,
candidate.VerificationNotes,
)
if err != nil {
return fmt.Errorf("upsert intraday candidate %s: %w", candidate.NormalizedKey, err)
}
}
return nil
}
// inferProviderFromTitle guesses the provider from a headline by looking for a
// known marker as a case-insensitive substring. Markers are tried in the listed
// order and the first match wins; "" is returned when nothing matches.
func inferProviderFromTitle(title string) string {
	haystack := strings.ToLower(title)
	// Each rule is {lower-case marker, provider label}.
	rules := [][2]string{
		{"openai", "OpenAI"},
		{"anthropic", "Anthropic"},
		{"gemini", "Google"},
		{"deepseek", "DeepSeek"},
		{"qwen", "Qwen"},
		{"dashscope", "DashScope"},
		{"xai", "xAI"},
		{"minimax", "MiniMax"},
		{"智谱", "智谱"},
		{"百度", "百度"},
		{"腾讯", "腾讯"},
	}
	for i := range rules {
		if strings.Contains(haystack, rules[i][0]) {
			return rules[i][1]
		}
	}
	return ""
}

View File

@@ -0,0 +1,127 @@
//go:build llm_script
package main
import (
"context"
"database/sql"
"path/filepath"
"strings"
"testing"
)
func TestLoadIntradaySearchRecordsFromFixture(t *testing.T) {
cfg := intradayProviderConfig{
Mode: "fixture",
Fixture: filepath.Join("testdata", "intraday_discovery_search_sample.json"),
}
records, err := loadIntradaySearchRecords(cfg, "2026-05-25", []string{"OpenAI pricing release"})
if err != nil {
t.Fatalf("loadIntradaySearchRecords 返回错误: %v", err)
}
if len(records) != 2 {
t.Fatalf("搜索样例条数错误: got=%d", len(records))
}
if records[0].URL == "" || records[0].Provider == "" {
t.Fatalf("搜索样例未保留 URL/provider: %+v", records[0])
}
}
// TestLoadIntradayLLMRecordsFromFixture loads the sample LLM fixture and checks
// the record count and the event type of the first record.
func TestLoadIntradayLLMRecordsFromFixture(t *testing.T) {
	provider := intradayProviderConfig{
		Fixture: filepath.Join("testdata", "intraday_discovery_llm_sample.json"),
		Mode:    "fixture",
	}
	got, err := loadIntradayLLMRecords(provider, "2026-05-25", nil)
	switch {
	case err != nil:
		t.Fatalf("loadIntradayLLMRecords 返回错误: %v", err)
	case len(got) != 2:
		t.Fatalf("LLM 样例条数错误: got=%d", len(got))
	case got[0].EventType != "official_release":
		t.Fatalf("LLM 事件类型错误: %+v", got[0])
	}
}
// TestNormalizeIntradayCandidatesDedupesEquivalentEvents feeds two LLM records
// that describe the same release with differently formatted model names and
// expects a single merged candidate backed by both search and LLM evidence.
func TestNormalizeIntradayCandidatesDedupesEquivalentEvents(t *testing.T) {
	const sharedURL = "https://openai.example.com/news/gpt-5-6-pricing"

	fromSearch := []intradaySearchRecord{{
		Title:    "OpenAI announces GPT-5.6 preview pricing update",
		Summary:  "Search summary",
		URL:      sharedURL,
		Provider: "OpenAI",
	}}
	first := intradayLLMRecord{
		EventType:       "official_release",
		ProviderName:    "OpenAI",
		ModelName:       "GPT-5.6",
		ProviderCountry: "US",
		Title:           "GPT-5.6 preview pricing update",
		Summary:         "First summary",
		CandidateURLs:   []string{sharedURL},
	}
	second := intradayLLMRecord{
		EventType:       "official_release",
		ProviderName:    "OpenAI",
		ModelName:       "GPT 5.6",
		ProviderCountry: "US",
		Title:           "OpenAI GPT 5.6 preview pricing update",
		Summary:         "Second summary",
		CandidateURLs:   []string{sharedURL},
	}

	got := normalizeIntradayCandidates("2026-05-25", fromSearch, []intradayLLMRecord{first, second})
	if len(got) != 1 {
		t.Fatalf("期望去重后只剩 1 条候选, got=%d", len(got))
	}
	if source := got[0].DiscoverySource; source != "web_search+llm" {
		t.Fatalf("期望 discovery source 合并, got=%q", source)
	}
}
// TestNormalizeIntradayCandidatesDropsURLlessRecords verifies that an LLM
// record without any candidate URL never becomes a candidate.
func TestNormalizeIntradayCandidatesDropsURLlessRecords(t *testing.T) {
	withoutURL := intradayLLMRecord{
		EventType:    "promo_campaign",
		ProviderName: "DeepSeek",
		ModelName:    "DeepSeek-V4-Flash",
		Title:        "No URL candidate",
		Summary:      "Should be dropped",
	}
	got := normalizeIntradayCandidates("2026-05-25", nil, []intradayLLMRecord{withoutURL})
	if count := len(got); count != 0 {
		t.Fatalf("无 URL 候选应被丢弃, got=%d", count)
	}
}
// TestValidateIntradayProviderConfigRequiresCommandOrURLOrFixture checks that
// each mode insists on its own setting and that a complete fixture config passes.
func TestValidateIntradayProviderConfigRequiresCommandOrURLOrFixture(t *testing.T) {
	missingCommand := validateIntradayProviderConfig("search", intradayProviderConfig{Mode: "command_json"})
	if missingCommand == nil {
		t.Fatal("缺少 command 时应报错")
	}

	missingURL := validateIntradayProviderConfig("llm", intradayProviderConfig{Mode: "http_json"})
	if missingURL == nil {
		t.Fatal("缺少 url 时应报错")
	}

	complete := intradayProviderConfig{Mode: "fixture", Fixture: "fixture.json"}
	if err := validateIntradayProviderConfig("search", complete); err != nil {
		t.Fatalf("fixture provider 不应报错: %v", err)
	}
}
func TestBuildIntradayNormalizedKeyUsesProviderModelAndDate(t *testing.T) {
key := buildIntradayNormalizedKey(intradayNewsCandidate{
CandidateDate: "2026-05-25",
EventType: "official_release",
ProviderName: "OpenAI",
ModelName: "GPT-5.6",
})
if !strings.Contains(key, "2026-05-25") || !strings.Contains(key, "openai") || !strings.Contains(key, "gpt-5-6") {
t.Fatalf("normalized key 不符合预期: %q", key)
}
}
// TestUpsertIntradayCandidatesRequiresDB ensures a nil database handle is
// rejected with an error.
func TestUpsertIntradayCandidatesRequiresDB(t *testing.T) {
	if err := upsertIntradayCandidates(context.Background(), (*sql.DB)(nil), nil); err == nil {
		t.Fatal("nil db 时应报错")
	}
}

View File

@@ -0,0 +1,111 @@
//go:build llm_script
package main
import (
"fmt"
"os"
"regexp"
"strings"
"time"
)
// loadIntradayEnv loads KEY=VALUE pairs from ".env.local" and then ".env" in
// the working directory into the process environment.
//
// A variable that is already set is never overwritten, so the real environment
// wins over both files and ".env.local" wins over ".env". Missing or unreadable
// files are skipped. Blank lines and lines starting with "#" are ignored, and
// surrounding quote characters are stripped from values.
//
// Lines may carry a leading "export " (the form used in the README and needed
// when the same file is sourced by a shell); the keyword is dropped so the
// variable is registered under its real name. Keys that still contain
// whitespace are not valid variable names and are skipped.
func loadIntradayEnv() {
	for _, path := range []string{".env.local", ".env"} {
		data, err := os.ReadFile(path)
		if err != nil {
			// Env files are optional.
			continue
		}
		for _, line := range strings.Split(string(data), "\n") {
			line = strings.TrimSpace(line)
			if line == "" || strings.HasPrefix(line, "#") {
				continue
			}
			// Accept shell-style `export KEY=value` (keyword followed by blank).
			if rest := strings.TrimPrefix(line, "export"); rest != line && rest != "" && (rest[0] == ' ' || rest[0] == '\t') {
				line = strings.TrimSpace(rest)
			}
			key, value, ok := strings.Cut(line, "=")
			if !ok {
				continue
			}
			key = strings.TrimSpace(key)
			value = strings.Trim(strings.TrimSpace(value), `"'`)
			if key == "" || strings.ContainsAny(key, " \t") {
				continue
			}
			if _, exists := os.LookupEnv(key); exists {
				continue
			}
			_ = os.Setenv(key, value)
		}
	}
}
// intradayDefaultDSN returns DATABASE_URL when it is set to a non-empty value,
// otherwise the built-in local PostgreSQL socket DSN.
func intradayDefaultDSN() string {
	const fallbackDSN = "postgres://long@/llm_intelligence?host=/var/run/postgresql"
	configured := os.Getenv("DATABASE_URL")
	if configured == "" {
		return fallbackDSN
	}
	return configured
}
// intradayDateValue returns the trimmed REPORT_DATE override when present,
// otherwise today's local date formatted as YYYY-MM-DD.
func intradayDateValue() string {
	override := strings.TrimSpace(os.Getenv("REPORT_DATE"))
	if override == "" {
		return time.Now().Format("2006-01-02")
	}
	return override
}
// discoveryTimeoutFromEnv reads INTRADAY_DISCOVERY_TIMEOUT_SEC as a positive
// number of seconds. An unset, unparsable or non-positive value yields the
// 20 second default.
func discoveryTimeoutFromEnv() time.Duration {
	const fallback = 20 * time.Second
	text := strings.TrimSpace(os.Getenv("INTRADAY_DISCOVERY_TIMEOUT_SEC"))
	if text != "" {
		var secs int
		if _, err := fmt.Sscanf(text, "%d", &secs); err == nil && secs > 0 {
			return time.Duration(secs) * time.Second
		}
	}
	return fallback
}
// normalizeIntradayEventType lower-cases and trims value and returns it when it
// is one of the recognised event types; anything else becomes "unknown".
func normalizeIntradayEventType(value string) string {
	switch kind := strings.TrimSpace(strings.ToLower(value)); kind {
	case "price_cut", "price_increase", "official_release", "promo_campaign", "leak_or_rumor":
		return kind
	}
	return "unknown"
}
// intradayWordSeparators matches every run of characters that are neither
// letters nor digits in any script. Compiled once instead of on every call.
var intradayWordSeparators = regexp.MustCompile(`[^\p{L}\p{N}]+`)

// normalizeWord turns a free-form name into a stable key fragment: lower-cased,
// with each run of non-letter/non-digit characters collapsed into a single "-"
// and no leading or trailing "-".
//
// Letters and digits of every script are kept, so CJK names such as "智谱" and
// "百度" stay distinct instead of all degrading to "unknown". Spelling variants
// like "GPT-5.6", "GPT 5.6" and "gpt_5_6" normalize to the same "gpt-5-6".
// Returns "unknown" when nothing usable remains (including empty input).
func normalizeWord(value string) string {
	value = strings.ToLower(strings.TrimSpace(value))
	value = intradayWordSeparators.ReplaceAllString(value, "-")
	value = strings.Trim(value, "-")
	if value == "" {
		return "unknown"
	}
	return value
}
// dedupeStrings trims every value, drops blanks and later duplicates, and keeps
// the first-seen order. The result is always a non-nil slice.
func dedupeStrings(values []string) []string {
	unique := make([]string, 0, len(values))
	known := make(map[string]bool, len(values))
	for i := range values {
		item := strings.TrimSpace(values[i])
		if item == "" || known[item] {
			continue
		}
		known[item] = true
		unique = append(unique, item)
	}
	return unique
}

View File

@@ -0,0 +1,188 @@
//go:build llm_script
package main
import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"strings"
	"time"
)
// intradayProviderConfig describes how one discovery provider is reached.
// Mode selects the transport ("fixture", "command_json" or "http_json") and
// decides which of the other fields is required.
type intradayProviderConfig struct {
Mode string
// Command is the shell command template for "command_json"; the
// placeholders {{date}} and {{query}} are substituted before it runs.
Command string
// URL is the endpoint template for "http_json"; it accepts the same placeholders.
URL string
// Fixture is the path of a JSON file returned verbatim in "fixture" mode.
Fixture string
// Timeout is applied to the HTTP client used in "http_json" mode.
Timeout time.Duration
}
// intradaySearchRecord is one search hit as decoded from a search provider's
// JSON array. URL is the field other records are matched against.
type intradaySearchRecord struct {
Title string `json:"title"`
Summary string `json:"summary"`
URL string `json:"url"`
Provider string `json:"provider"`
ProviderURL string `json:"provider_url"`
// PublishedAt is kept as the raw string delivered by the provider.
PublishedAt string `json:"published_at"`
}
// intradayLLMRecord is one event as decoded from the LLM provider's JSON array.
type intradayLLMRecord struct {
EventType string `json:"event_type"`
ProviderName string `json:"provider_name"`
ModelName string `json:"model_name"`
ProviderCountry string `json:"provider_country"`
Title string `json:"title"`
Summary string `json:"summary"`
// CandidateURLs lists the pages offered as sources for the event.
CandidateURLs []string `json:"candidate_urls"`
}
// intradayLLMRequest is the JSON body handed to the LLM provider: the target
// date together with the search results gathered for it.
type intradayLLMRequest struct {
Date string `json:"date"`
SearchResults []intradaySearchRecord `json:"search_results"`
}
// loadIntradaySearchRecords asks the search provider once per query and
// concatenates the decoded records. Blank payloads are skipped. In fixture mode
// the loop stops after the first non-blank payload, because every query would
// read the same file. Any provider or decoding error aborts the whole load.
func loadIntradaySearchRecords(cfg intradayProviderConfig, date string, queries []string) ([]intradaySearchRecord, error) {
	var collected []intradaySearchRecord
	fixtureMode := cfg.Mode == "fixture"
	for i := range queries {
		current := queries[i]
		raw, err := loadIntradayProviderPayload(cfg, intradayProviderPayloadInput{Date: date, Query: current})
		if err != nil {
			return nil, err
		}
		if len(bytes.TrimSpace(raw)) == 0 {
			continue
		}
		var batch []intradaySearchRecord
		if err := json.Unmarshal(raw, &batch); err != nil {
			return nil, fmt.Errorf("unmarshal search records for query %q: %w", current, err)
		}
		collected = append(collected, batch...)
		if fixtureMode {
			break
		}
	}
	return collected, nil
}
// loadIntradayLLMRecords sends the date and search results to the LLM provider
// as a JSON request body and decodes the returned array of records. A blank
// response yields (nil, nil).
func loadIntradayLLMRecords(cfg intradayProviderConfig, date string, searchResults []intradaySearchRecord) ([]intradayLLMRecord, error) {
	encoded, err := json.Marshal(intradayLLMRequest{Date: date, SearchResults: searchResults})
	if err != nil {
		return nil, fmt.Errorf("marshal llm request: %w", err)
	}

	raw, err := loadIntradayProviderPayload(cfg, intradayProviderPayloadInput{
		Date:        date,
		RequestBody: encoded,
	})
	if err != nil {
		return nil, err
	}
	if len(bytes.TrimSpace(raw)) == 0 {
		return nil, nil
	}

	var decoded []intradayLLMRecord
	if err := json.Unmarshal(raw, &decoded); err != nil {
		return nil, fmt.Errorf("unmarshal llm records: %w", err)
	}
	return decoded, nil
}
// intradayProviderPayloadInput carries the per-call values handed to a provider.
type intradayProviderPayloadInput struct {
// Date and Query replace the {{date}} / {{query}} placeholders and are also
// exported to command providers as environment variables.
Date string
Query string
// RequestBody, when non-empty, is piped to a command's stdin or sent as the
// body of an HTTP POST.
RequestBody []byte
}
// loadIntradayProviderPayload returns the raw provider response for the
// configured mode: the fixture file's contents, the command's stdout, or the
// HTTP response body. A missing mode-specific setting or an unsupported mode is
// reported as an error.
func loadIntradayProviderPayload(cfg intradayProviderConfig, input intradayProviderPayloadInput) ([]byte, error) {
	isBlank := func(s string) bool { return strings.TrimSpace(s) == "" }

	mode := strings.TrimSpace(cfg.Mode)
	if mode == "fixture" {
		if isBlank(cfg.Fixture) {
			return nil, fmt.Errorf("provider fixture 未设置")
		}
		return os.ReadFile(cfg.Fixture)
	}
	if mode == "command_json" {
		if isBlank(cfg.Command) {
			return nil, fmt.Errorf("provider command 未设置")
		}
		return runIntradayCommand(cfg, input)
	}
	if mode == "http_json" {
		if isBlank(cfg.URL) {
			return nil, fmt.Errorf("provider url 未设置")
		}
		return fetchIntradayHTTP(cfg, input)
	}
	return nil, fmt.Errorf("unsupported provider mode %q", mode)
}
// runIntradayCommand runs the provider command through `sh -c` and returns its
// stdout.
//
// {{date}} is substituted verbatim and {{query}} is substituted as a
// single-quoted shell word. Both values are additionally exported as
// INTRADAY_DISCOVERY_DATE / INTRADAY_DISCOVERY_QUERY, and a non-empty
// RequestBody is piped to stdin.
//
// When cfg.Timeout is positive the shell process is killed once the timeout
// elapses, so a hung provider cannot block the pipeline forever; a zero or
// negative timeout keeps the previous unbounded behaviour.
// NOTE(review): only the `sh` process itself is killed. A child it spawned that
// keeps stdout/stderr open may still delay the return — confirm whether
// process-group handling is needed for the commands used in production.
//
// A non-zero exit status is reported together with the command's trimmed stderr.
func runIntradayCommand(cfg intradayProviderConfig, input intradayProviderPayloadInput) ([]byte, error) {
	command := strings.TrimSpace(cfg.Command)
	command = strings.ReplaceAll(command, "{{date}}", input.Date)
	command = strings.ReplaceAll(command, "{{query}}", shellEscapeSingleArg(input.Query))

	cmd := exec.Command("sh", "-c", command)
	cmd.Env = append(os.Environ(),
		"INTRADAY_DISCOVERY_DATE="+input.Date,
		"INTRADAY_DISCOVERY_QUERY="+input.Query,
	)
	if len(input.RequestBody) > 0 {
		cmd.Stdin = bytes.NewReader(input.RequestBody)
	}
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("run provider command: %w", err)
	}
	// The watchdog is armed only after Start, when cmd.Process is populated.
	var watchdog *time.Timer
	if cfg.Timeout > 0 {
		watchdog = time.AfterFunc(cfg.Timeout, func() { _ = cmd.Process.Kill() })
	}
	err := cmd.Wait()
	// Stop reports false when the watchdog already fired, i.e. the kill was sent.
	if watchdog != nil && !watchdog.Stop() && err != nil {
		return nil, fmt.Errorf("run provider command: timed out after %s: %w", cfg.Timeout, err)
	}
	if err != nil {
		if _, ok := err.(*exec.ExitError); ok {
			return nil, fmt.Errorf("run provider command: %w: %s", err, strings.TrimSpace(stderr.String()))
		}
		return nil, fmt.Errorf("run provider command: %w", err)
	}
	return stdout.Bytes(), nil
}
// fetchIntradayHTTP calls the provider URL and returns the response body.
//
// {{date}} and {{query}} in the URL template are replaced with their
// query-escaped values, so queries containing spaces, CJK text or characters
// such as "&" and "#" produce a well-formed request instead of a broken or
// truncated URL.
// NOTE(review): escaping uses query-string rules (space becomes "+"), which
// assumes the placeholders sit in the query part of the URL — confirm against
// the configured templates.
//
// The request is a GET, or a JSON POST when RequestBody is non-empty, and is
// bounded by cfg.Timeout. Any status outside 2xx is returned as an error that
// includes at most the first 4 KiB of the response body.
func fetchIntradayHTTP(cfg intradayProviderConfig, input intradayProviderPayloadInput) ([]byte, error) {
	// Upper bound for the response excerpt embedded in error messages.
	const maxErrorBody = 4 << 10

	client := &http.Client{Timeout: cfg.Timeout}
	rawURL := strings.TrimSpace(cfg.URL)
	rawURL = strings.ReplaceAll(rawURL, "{{date}}", url.QueryEscape(input.Date))
	rawURL = strings.ReplaceAll(rawURL, "{{query}}", url.QueryEscape(input.Query))

	method := http.MethodGet
	var body io.Reader
	if len(input.RequestBody) > 0 {
		method = http.MethodPost
		body = bytes.NewReader(input.RequestBody)
	}
	req, err := http.NewRequest(method, rawURL, body)
	if err != nil {
		return nil, fmt.Errorf("build provider request: %w", err)
	}
	if len(input.RequestBody) > 0 {
		req.Header.Set("Content-Type", "application/json")
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("call provider url: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Best effort: the excerpt is diagnostic only, so a read error is ignored.
		excerpt, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
		return nil, fmt.Errorf("call provider url: unexpected status %d: %s", resp.StatusCode, strings.TrimSpace(string(excerpt)))
	}
	payload, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("read provider response: %w", err)
	}
	return payload, nil
}
// shellEscapeSingleArg wraps value in single quotes so a POSIX shell reads it
// as one literal word. Embedded single quotes are emitted as '"'"' (close the
// quote, add a double-quoted apostrophe, reopen). An empty value becomes ''.
func shellEscapeSingleArg(value string) string {
	if len(value) == 0 {
		return "''"
	}
	var quoted strings.Builder
	quoted.WriteByte('\'')
	quoted.WriteString(strings.ReplaceAll(value, "'", `'"'"'`))
	quoted.WriteByte('\'')
	return quoted.String()
}

View File

@@ -46,6 +46,7 @@ type signalModelEvent struct {
TrustLabel string `json:"trust_label"` TrustLabel string `json:"trust_label"`
SourceKindLabel string `json:"source_kind_label"` SourceKindLabel string `json:"source_kind_label"`
PrimarySource string `json:"primary_source"` PrimarySource string `json:"primary_source"`
SourceURL string `json:"source_url"`
UpdatedAt string `json:"updated_at"` UpdatedAt string `json:"updated_at"`
EvidenceDetail string `json:"evidence_detail"` EvidenceDetail string `json:"evidence_detail"`
Baseline string `json:"baseline"` Baseline string `json:"baseline"`
@@ -367,6 +368,12 @@ func loadSignalModelEvents(db *sql.DB, date string) ([]signalModelEvent, error)
} }
events = append(events, priceEvents...) events = append(events, priceEvents...)
discoveryEvents, err := loadVerifiedDiscoverySignalEvents(db, date)
if err != nil {
return nil, err
}
events = mergeVerifiedDiscoveryEvents(events, discoveryEvents)
sort.Slice(events, func(i, j int) bool { sort.Slice(events, func(i, j int) bool {
if events[i].Priority != events[j].Priority { if events[i].Priority != events[j].Priority {
return events[i].Priority > events[j].Priority return events[i].Priority > events[j].Priority
@@ -409,6 +416,7 @@ func loadSignalPromoCampaignEvents(date string) ([]signalModelEvent, error) {
TrustLabel: signalFirstNonEmpty(definition.TrustLabel, "官方来源 / 一级证据"), TrustLabel: signalFirstNonEmpty(definition.TrustLabel, "官方来源 / 一级证据"),
SourceKindLabel: signalFirstNonEmpty(definition.SourceKindLabel, "官方活动页"), SourceKindLabel: signalFirstNonEmpty(definition.SourceKindLabel, "官方活动页"),
PrimarySource: definition.PrimarySource, PrimarySource: definition.PrimarySource,
SourceURL: definition.PrimarySource,
UpdatedAt: signalFormatEventUpdatedAt("", definition.Date), UpdatedAt: signalFormatEventUpdatedAt("", definition.Date),
EvidenceDetail: definition.EvidenceDetail, EvidenceDetail: definition.EvidenceDetail,
Baseline: signalFirstNonEmpty(definition.Baseline, "活动窗口开启"), Baseline: signalFirstNonEmpty(definition.Baseline, "活动窗口开启"),
@@ -520,6 +528,7 @@ func loadSignalOfficialReleaseEvents(db *sql.DB, date string) ([]signalModelEven
TrustLabel: buildSignalReleaseTrustLabel(model, dateConfidence), TrustLabel: buildSignalReleaseTrustLabel(model, dateConfidence),
SourceKindLabel: buildSignalReleaseSourceKindLabel(dateSourceKind, dateConfidence), SourceKindLabel: buildSignalReleaseSourceKindLabel(dateSourceKind, dateConfidence),
PrimarySource: sourceURL, PrimarySource: sourceURL,
SourceURL: sourceURL,
UpdatedAt: releaseDate.Format("2006-01-02 15:04"), UpdatedAt: releaseDate.Format("2006-01-02 15:04"),
EvidenceDetail: buildSignalReleaseEvidenceDetail(dateSourceKind, dateConfidence), EvidenceDetail: buildSignalReleaseEvidenceDetail(dateSourceKind, dateConfidence),
Baseline: "官方首次发布", Baseline: "官方首次发布",
@@ -610,6 +619,7 @@ func loadSignalNewModelEvents(db *sql.DB, date string) ([]signalModelEvent, erro
TrustLabel: buildSignalTrustLabel(model), TrustLabel: buildSignalTrustLabel(model),
SourceKindLabel: "模型快照", SourceKindLabel: "模型快照",
PrimarySource: buildSignalPrimarySource("region_pricing", model.OperatorName), PrimarySource: buildSignalPrimarySource("region_pricing", model.OperatorName),
SourceURL: buildSignalPrimarySource("region_pricing", model.OperatorName),
UpdatedAt: createdAt.Format("2006-01-02 15:04"), UpdatedAt: createdAt.Format("2006-01-02 15:04"),
EvidenceDetail: "models.created_at = 今日,且已存在最新价格快照", EvidenceDetail: "models.created_at = 今日,且已存在最新价格快照",
Baseline: "首次出现", Baseline: "首次出现",
@@ -709,6 +719,7 @@ func loadSignalPriceChangeEvents(db *sql.DB, date string) ([]signalModelEvent, e
TrustLabel: buildSignalTrustLabel(model), TrustLabel: buildSignalTrustLabel(model),
SourceKindLabel: "价格快照", SourceKindLabel: "价格快照",
PrimarySource: "pricing_history", PrimarySource: "pricing_history",
SourceURL: buildSignalPrimarySource("region_pricing", model.OperatorName),
UpdatedAt: changedAt.Format("2006-01-02 15:04"), UpdatedAt: changedAt.Format("2006-01-02 15:04"),
EvidenceDetail: buildSignalPriceEvidenceDetail(changePct, oldInputPrice, newInputPrice, model.Currency), EvidenceDetail: buildSignalPriceEvidenceDetail(changePct, oldInputPrice, newInputPrice, model.Currency),
Baseline: fmt.Sprintf("较昨日 %+.0f%%", changePct), Baseline: fmt.Sprintf("较昨日 %+.0f%%", changePct),
@@ -747,6 +758,241 @@ func dedupeSignalEvents(events []signalModelEvent) []signalModelEvent {
return result return result
} }
// loadVerifiedDiscoverySignalEvents reads the discovery candidates of the given
// date whose status is 'verified' and whose verification_confidence is
// 'official_confirmed', and converts them into signal events.
//
// A missing intraday_news_candidate table is treated as "no events" rather than
// an error. Rows without any non-blank candidate URL are skipped. The first
// non-blank URL becomes both PrimarySource and SourceURL. The resulting events
// are passed through filterVerifiedDiscoverySignalEvents before being returned.
func loadVerifiedDiscoverySignalEvents(db *sql.DB, date string) ([]signalModelEvent, error) {
rows, err := db.Query(`
SELECT
event_type,
provider_name,
COALESCE(model_name, ''),
COALESCE(provider_country, ''),
title,
COALESCE(summary, ''),
COALESCE(candidate_urls::text, '[]'),
COALESCE(verification_notes, ''),
updated_at
FROM intraday_news_candidate
WHERE candidate_date = $1::date
AND status = 'verified'
AND verification_confidence = 'official_confirmed'
ORDER BY updated_at DESC, id DESC
`, date)
if err != nil {
// The table is detected as absent by matching the error text.
if strings.Contains(err.Error(), `relation "intraday_news_candidate" does not exist`) {
return nil, nil
}
return nil, err
}
defer rows.Close()
var events []signalModelEvent
for rows.Next() {
var (
eventType string
providerName string
modelName string
providerCountry string
title string
summary string
rawURLs string
notes string
updatedAt time.Time
)
// Scan targets follow the column order of the SELECT list above.
if err := rows.Scan(&eventType, &providerName, &modelName, &providerCountry, &title, &summary, &rawURLs, &notes, &updatedAt); err != nil {
return nil, err
}
// candidate_urls is selected as JSON text and decoded here.
var urls []string
if err := json.Unmarshal([]byte(rawURLs), &urls); err != nil {
return nil, fmt.Errorf("unmarshal discovery candidate urls: %w", err)
}
primaryURL := firstString(urls)
if strings.TrimSpace(primaryURL) == "" {
continue
}
normalizedType := signalNormalizeIntradayEventType(eventType)
events = append(events, signalModelEvent{
EventType: normalizedType,
// The title stands in for the model name when the row has none.
ModelName: signalFirstNonEmpty(modelName, title),
ProviderName: providerName,
OperatorName: providerName,
Audience: buildDiscoveryAudience(normalizedType),
TrustLabel: "官方来源 / discovery 验证",
SourceKindLabel: buildDiscoverySourceKind(normalizedType),
PrimarySource: primaryURL,
SourceURL: primaryURL,
UpdatedAt: updatedAt.Format("2006-01-02 15:04"),
EvidenceDetail: signalFirstNonEmpty(notes, summary),
Baseline: buildDiscoveryBaseline(normalizedType),
Summary: signalFirstNonEmpty(summary, title),
Priority: buildDiscoveryPriority(normalizedType),
})
}
if err := rows.Err(); err != nil {
return nil, err
}
return filterVerifiedDiscoverySignalEvents(events), nil
}
// filterVerifiedDiscoverySignalEvents keeps only events whose type is one of
// official_release, promo_campaign, price_cut or price_increase, preserving
// their order. The result is always a non-nil slice.
func filterVerifiedDiscoverySignalEvents(events []signalModelEvent) []signalModelEvent {
	allowed := map[string]bool{
		"official_release": true,
		"promo_campaign":   true,
		"price_cut":        true,
		"price_increase":   true,
	}
	kept := make([]signalModelEvent, 0, len(events))
	for i := range events {
		if allowed[events[i].EventType] {
			kept = append(kept, events[i])
		}
	}
	return kept
}
// mergeVerifiedDiscoveryEvents returns the native events followed by the
// accepted discovery events that are new. A discovery event whose merge key
// matches an existing event is not appended; it only fills evidence gaps on
// that event. The input slices are not modified.
func mergeVerifiedDiscoveryEvents(nativeEvents, discoveryEvents []signalModelEvent) []signalModelEvent {
	combined := make([]signalModelEvent, len(nativeEvents))
	copy(combined, nativeEvents)

	position := make(map[string]int, len(combined))
	for i := range combined {
		position[signalEventMergeKey(combined[i])] = i
	}

	accepted := filterVerifiedDiscoverySignalEvents(discoveryEvents)
	for i := range accepted {
		key := signalEventMergeKey(accepted[i])
		at, known := position[key]
		if !known {
			position[key] = len(combined)
			combined = append(combined, accepted[i])
			continue
		}
		combined[at] = mergeSignalEventEvidence(combined[at], accepted[i])
	}
	return combined
}
// mergeSignalEventEvidence returns the native event with its blank evidence
// fields (source kind label, source URL, primary source, evidence detail, trust
// label) filled from the discovery event. Non-blank native values always win.
func mergeSignalEventEvidence(native, discovery signalModelEvent) signalModelEvent {
	fillIfBlank := func(dst *string, fallback string) {
		if strings.TrimSpace(*dst) == "" {
			*dst = fallback
		}
	}
	result := native
	fillIfBlank(&result.SourceKindLabel, discovery.SourceKindLabel)
	fillIfBlank(&result.SourceURL, discovery.SourceURL)
	fillIfBlank(&result.PrimarySource, discovery.PrimarySource)
	fillIfBlank(&result.EvidenceDetail, discovery.EvidenceDetail)
	fillIfBlank(&result.TrustLabel, discovery.TrustLabel)
	return result
}
// signalEventMergeKey builds the identity used to match native and discovery
// events: "<normalized event type>|<normalized provider>|<normalized model>".
func signalEventMergeKey(event signalModelEvent) string {
	kind := signalNormalizeIntradayEventType(event.EventType)
	provider := signalNormalizeWord(event.ProviderName)
	model := signalNormalizeWord(event.ModelName)
	return kind + "|" + provider + "|" + model
}
// buildDiscoveryAudience returns the audience blurb shown for a discovery event
// of the given type, with a generic blurb for unrecognised types.
func buildDiscoveryAudience(eventType string) string {
	audiences := map[string]string{
		"official_release": "适合需要尽快复查默认选型与路线图影响的团队",
		"promo_campaign":   "适合想利用活动窗口压低成本的团队",
		"price_cut":        "适合准备趁降价重排默认模型的团队",
		"price_increase":   "适合提前准备替代模型和预算回退方案的团队",
	}
	if text, ok := audiences[eventType]; ok {
		return text
	}
	return "适合关注日内情报变化的读者"
}
// buildDiscoverySourceKind returns the source-kind label for a discovery event
// type; both price event types share the price-page label.
func buildDiscoverySourceKind(eventType string) string {
	labels := map[string]string{
		"official_release": "discovery 验证 / 官方发布页",
		"promo_campaign":   "discovery 验证 / 官方活动页",
		"price_cut":        "discovery 验证 / 官方价格页",
		"price_increase":   "discovery 验证 / 官方价格页",
	}
	if label, ok := labels[eventType]; ok {
		return label
	}
	return "discovery 验证"
}
// buildDiscoveryBaseline returns the baseline text for a discovery event type.
func buildDiscoveryBaseline(eventType string) string {
	if eventType == "official_release" {
		return "discovery 验证通过"
	}
	if eventType == "promo_campaign" {
		return "活动窗口已验证"
	}
	if eventType == "price_cut" || eventType == "price_increase" {
		return "official_confirmed"
	}
	return "discovery verified"
}
// buildDiscoveryPriority returns the sort priority for a discovery event type;
// unrecognised types get 80.
func buildDiscoveryPriority(eventType string) int {
	ranks := map[string]int{
		"official_release": 118,
		"promo_campaign":   112,
		"price_cut":        96,
		"price_increase":   94,
	}
	if rank, ok := ranks[eventType]; ok {
		return rank
	}
	return 80
}
// firstString returns the first element that is not blank after trimming. The
// element is returned exactly as stored (untrimmed); "" when none qualifies.
func firstString(values []string) string {
	for i := 0; i < len(values); i++ {
		if strings.TrimSpace(values[i]) == "" {
			continue
		}
		return values[i]
	}
	return ""
}
// signalNormalizeIntradayEventType lower-cases and trims value and returns it
// when it is price_cut, price_increase, official_release or promo_campaign;
// every other value maps to "unknown".
func signalNormalizeIntradayEventType(value string) string {
	switch kind := strings.TrimSpace(strings.ToLower(value)); kind {
	case "price_cut", "price_increase", "official_release", "promo_campaign":
		return kind
	}
	return "unknown"
}
// signalNormalizeWord turns a provider or model name into a merge-key fragment:
// lower-cased, with every run of separator characters collapsed into one "-"
// and no leading or trailing "-". Returns "unknown" when nothing remains.
//
// ASCII letters and digits are kept, and so is every non-ASCII rune. Without
// the latter, names written in CJK (e.g. "智谱", "百度") would all reduce to
// "unknown" and unrelated events would share one merge key.
func signalNormalizeWord(value string) string {
	value = strings.ToLower(strings.TrimSpace(value))
	var b strings.Builder
	lastDash := false
	for _, r := range value {
		// r >= 0x80 covers all runes outside ASCII.
		keep := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r >= 0x80
		if keep {
			b.WriteRune(r)
			lastDash = false
			continue
		}
		// Everything else (including "_" and "-") is a separator.
		if !lastDash {
			b.WriteByte('-')
			lastDash = true
		}
	}
	result := strings.Trim(b.String(), "-")
	if result == "" {
		return "unknown"
	}
	return result
}
func classifySignalFreeSource(model signalModelInfo) string { func classifySignalFreeSource(model signalModelInfo) string {
switch model.OperatorType { switch model.OperatorType {
case "official", "cloud": case "official", "cloud":

View File

@@ -31,3 +31,64 @@ func TestBuildSignalPageMode(t *testing.T) {
t.Fatalf("官方发布日 page_mode 错误: %q", got) t.Fatalf("官方发布日 page_mode 错误: %q", got)
} }
} }
func TestBuildSignalPageModeTreatsVerifiedDiscoveryPromoAsHot(t *testing.T) {
got := buildSignalPageMode(signalDailySignals{}, []signalModelEvent{{EventType: "promo_campaign", ModelName: "GPT-5.6"}})
if got != "hot" {
t.Fatalf("已验证活动事件应触发 hot, got=%q", got)
}
}
// TestFilterDiscoveryEventsDropsLeakAndCandidateOnly checks that rumour and
// unknown events are removed and only the official release survives.
func TestFilterDiscoveryEventsDropsLeakAndCandidateOnly(t *testing.T) {
	release := signalModelEvent{EventType: "official_release", ModelName: "GPT-5.6", Priority: 120}
	rumor := signalModelEvent{EventType: "leak_or_rumor", ModelName: "GPT-5.6", Priority: 200}
	mystery := signalModelEvent{EventType: "unknown", ModelName: "Mystery", Priority: 50}

	kept := filterVerifiedDiscoverySignalEvents([]signalModelEvent{release, rumor, mystery})
	if count := len(kept); count != 1 {
		t.Fatalf("期望仅保留 1 条正式事实事件, got=%d", count)
	}
	if kept[0].EventType != "official_release" {
		t.Fatalf("错误保留了非正式事件: %+v", kept)
	}
}
// TestMergeVerifiedDiscoveryEventsPrefersNativeFact merges a native and a
// discovery event describing the same release and expects one event that keeps
// the native source while adopting the discovery source-kind label it lacked.
func TestMergeVerifiedDiscoveryEventsPrefersNativeFact(t *testing.T) {
	nativeEvent := signalModelEvent{
		EventType:      "official_release",
		ModelName:      "GPT-5.6",
		ProviderName:   "OpenAI",
		PrimarySource:  "native_release",
		EvidenceDetail: "native evidence",
		Priority:       120,
	}
	discoveryEvent := signalModelEvent{
		EventType:       "official_release",
		ModelName:       "GPT-5.6",
		ProviderName:    "OpenAI",
		PrimarySource:   "discovery_release",
		EvidenceDetail:  "discovery evidence",
		SourceKindLabel: "官方博客",
		Priority:        110,
	}

	got := mergeVerifiedDiscoveryEvents([]signalModelEvent{nativeEvent}, []signalModelEvent{discoveryEvent})
	if len(got) != 1 {
		t.Fatalf("期望去重后只剩 1 条事件, got=%d", len(got))
	}
	only := got[0]
	if only.PrimarySource != "native_release" {
		t.Fatalf("原生事实不应被 discovery 覆盖: %+v", only)
	}
	if only.SourceKindLabel != "官方博客" {
		t.Fatalf("原生事实应补入 discovery 证据缺口: %+v", only)
	}
}
// TestMergeVerifiedDiscoveryEventsDropsUnverifiedPriceNarrative checks that a
// rumour-type discovery event never reaches the merged result.
func TestMergeVerifiedDiscoveryEventsDropsUnverifiedPriceNarrative(t *testing.T) {
	nativeEvents := []signalModelEvent{
		{EventType: "new_model", ModelName: "DeepSeek-V4-Flash", ProviderName: "DeepSeek", Priority: 80},
	}
	rumors := []signalModelEvent{
		{EventType: "leak_or_rumor", ModelName: "DeepSeek-V4-Flash", ProviderName: "DeepSeek", Priority: 130},
	}

	got := mergeVerifiedDiscoveryEvents(nativeEvents, rumors)
	if len(got) != 1 || got[0].EventType != "new_model" {
		t.Fatalf("非正式 discovery 事件不应进入正式快照: %+v", got)
	}
}

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env bash
#
# Intraday discovery watch: discover news candidates, verify them, then
# materialize the daily signals for REPORT_DATE (defaults to today).
#
# Usage: run_intraday_discovery_watch.sh [--dry-run]
# Required env (may come from .env / .env.local):
#   DATABASE_URL, INTRADAY_DISCOVERY_SEARCH_PROVIDER,
#   INTRADAY_DISCOVERY_LLM_PROVIDER
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"

# Load env files. ".env" is sourced first so that ".env.local" overrides it,
# which is the same precedence the Go loader (loadIntradayEnv) applies.
# `set -a` exports every assignment, so values written without `export` still
# reach the `go run` child processes below.
set -a
for env_file in ".env" ".env.local"; do
  if [[ -f "$env_file" ]]; then
    # shellcheck disable=SC1090
    source "$env_file"
  fi
done
set +a

for required_var in DATABASE_URL INTRADAY_DISCOVERY_SEARCH_PROVIDER INTRADAY_DISCOVERY_LLM_PROVIDER; do
  if [[ -z "${!required_var:-}" ]]; then
    echo "${required_var} 未设置" >&2
    exit 1
  fi
done

REPORT_DATE="${REPORT_DATE:-$(date +%F)}"

# Reject anything other than the single supported flag: a mistyped
# "--dry-run" must not silently turn into a real run that writes to the DB.
DRY_RUN="false"
case "${1:-}" in
  "")
    ;;
  --dry-run)
    DRY_RUN="true"
    ;;
  *)
    printf 'Usage: %s [--dry-run]\n' "${0##*/}" >&2
    exit 2
    ;;
esac

discovery_args=(--date "$REPORT_DATE")
verification_args=(--date "$REPORT_DATE")
materialize_args=(--date "$REPORT_DATE")
if [[ "$DRY_RUN" == "true" ]]; then
  discovery_args+=(--dry-run)
  verification_args+=(--dry-run)
  materialize_args+=(--dry-run)
fi

# The three stages run in order; `set -e` stops the pipeline at the first failure.
go run -tags llm_script ./scripts/discover_intraday_news_candidates.go ./scripts/intraday_discovery_provider.go ./scripts/intraday_discovery_common.go "${discovery_args[@]}"
go run -tags llm_script ./scripts/verify_intraday_news_candidates.go ./scripts/intraday_discovery_common.go "${verification_args[@]}"
REPORT_TRIGGER_SOURCE="intraday_discovery" go run -tags llm_script ./scripts/materialize_daily_signals.go "${materialize_args[@]}"

View File

@@ -0,0 +1,24 @@
[
{
"event_type": "official_release",
"provider_name": "OpenAI",
"model_name": "GPT-5.6",
"provider_country": "US",
"title": "GPT-5.6 preview pricing update",
"summary": "OpenAI preview material indicates GPT-5.6 entered a preview pricing window.",
"candidate_urls": [
"https://openai.example.com/news/gpt-5-6-pricing"
]
},
{
"event_type": "promo_campaign",
"provider_name": "DeepSeek",
"model_name": "DeepSeek-V4-Flash",
"provider_country": "CN",
"title": "DeepSeek V4 Flash campaign",
"summary": "Official campaign page shows a temporary promotional window for DeepSeek-V4-Flash.",
"candidate_urls": [
"https://deepseek.example.com/campaign/v4-flash"
]
}
]

View File

@@ -0,0 +1,18 @@
[
{
"title": "OpenAI announces GPT-5.6 preview pricing update",
"summary": "OpenAI preview announcement mentions GPT-5.6 and updated API pricing references.",
"url": "https://openai.example.com/news/gpt-5-6-pricing",
"provider": "OpenAI",
"provider_url": "https://openai.example.com",
"published_at": "2026-05-25T09:00:00Z"
},
{
"title": "DeepSeek launches V4 Flash campaign",
"summary": "Campaign page suggests temporary promotional pricing for DeepSeek-V4-Flash.",
"url": "https://deepseek.example.com/campaign/v4-flash",
"provider": "DeepSeek",
"provider_url": "https://deepseek.example.com",
"published_at": "2026-05-25T10:00:00Z"
}
]

View File

@@ -0,0 +1,7 @@
<html><body>
<article>
<h1>OpenAI announces GPT-5.6 preview pricing update</h1>
<p>GPT-5.6 preview is now available in official preview channels.</p>
<p>Published 2026-05-25.</p>
</article>
</body></html>

View File

@@ -0,0 +1,8 @@
<html><body>
<section>
<h1>DeepSeek-V4-Flash pricing</h1>
<p>Old price: $10</p>
<p>New price: $6</p>
<p>Campaign window active now.</p>
</section>
</body></html>

View File

@@ -0,0 +1,6 @@
<html><body>
<article>
<h1>Industry blog discusses GPT-5.6 leak</h1>
<p>Writers speculate GPT-5.6 may appear soon based on references.</p>
</article>
</body></html>

View File

@@ -0,0 +1,501 @@
//go:build llm_script
package main
import (
"context"
"database/sql"
"encoding/json"
"flag"
"fmt"
"io"
"net/http"
"net/url"
"os"
"regexp"
"strings"
"time"
_ "github.com/lib/pq"
)
// verificationCandidateRow is one row of intraday_news_candidate as read by
// loadIntradayVerificationCandidates. Nullable text columns are loaded as ""
// via COALESCE in that query.
type verificationCandidateRow struct {
	ID int64 // id column
	CandidateDate string // candidate_date rendered as text
	EventType string // event_type column; normalized before dispatch in verifyCandidateDocument
	ProviderName string // provider_name column; drives official-host detection
	ModelName string // model_name column, "" when NULL
	ProviderCountry string // provider_country column, "" when NULL
	Title string // title column; also used as a fallback needle when matching the body
	Summary string // summary column, "" when NULL
	CandidateURLs []string // candidate_urls decoded from its JSON text form ("[]" when NULL)
	Status string // status column; the loader selects 'candidate' and 'verifying' rows
	VerificationConfidence string // verification_confidence column as currently stored
}
// intradayVerificationConfig carries the settings for one verification run,
// assembled in main.
type intradayVerificationConfig struct {
	Date string // value of the -date flag; must be non-blank
	DryRun bool // value of the -dry-run flag; when true nothing is persisted
	DatabaseURL string // DSN passed to sql.Open("postgres", ...); filled from intradayDefaultDSN()
	Timeout time.Duration // http.Client timeout per document fetch; filled from discoveryTimeoutFromEnv()
}
// intradayVerificationResult is the outcome of verifying one candidate. It is
// inserted into intraday_news_verification and also used to update the
// candidate row (see persistIntradayVerificationResults).
type intradayVerificationResult struct {
	CandidateID int64 // id of the candidate this result belongs to
	CandidateStatus string // new candidate status; this file produces "candidate" and "verified"
	VerificationConfidence string // this file produces "candidate", "official_confirmed" and "secondary_confirmed"
	VerifierSource string // source class from classifyVerificationSource; stored as NULL when ""
	VerifierURL string // URL of the document that produced this result; stored as NULL when ""
	VerifierStatus string // this file produces "insufficient", "matched" and "error"
	ExtractedFacts map[string]any // facts pulled from the document; marshalled to JSON for the jsonb column
	Notes string // human-readable explanation; stored as NULL when ""
}
// intradayVerificationSummary is the JSON object printed to stdout at the end
// of a run (see summarizeIntradayVerification).
type intradayVerificationSummary struct {
	CandidateTotal int `json:"candidate_total"` // number of results processed
	VerifiedTotal int `json:"verified_total"` // results with CandidateStatus "verified"
	OfficialConfirmedTotal int `json:"official_confirmed_total"` // results with confidence "official_confirmed"
	SecondaryConfirmedTotal int `json:"secondary_confirmed_total"` // results with confidence "secondary_confirmed"
	RejectedTotal int `json:"rejected_total"` // results with CandidateStatus "rejected"
	DryRun bool `json:"dry_run"` // echoes whether the run skipped persistence
}
// main loads the environment, reads the -date and -dry-run flags, fills in the
// database DSN and fetch timeout, and runs the verification. On failure the
// error is written to stderr and the process exits with status 1.
func main() {
	loadIntradayEnv()

	var cfg intradayVerificationConfig
	flag.StringVar(&cfg.Date, "date", intradayDateValue(), "验证日期,格式 YYYY-MM-DD")
	flag.BoolVar(&cfg.DryRun, "dry-run", false, "仅输出摘要,不写数据库")
	flag.Parse()

	cfg.DatabaseURL = intradayDefaultDSN()
	cfg.Timeout = discoveryTimeoutFromEnv()

	err := runIntradayCandidateVerification(cfg)
	if err == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "verify_intraday_news_candidates: %v\n", err)
	os.Exit(1)
}
// runIntradayCandidateVerification loads the candidates for cfg.Date, verifies
// each one, persists the results unless cfg.DryRun is set, and prints a JSON
// summary to stdout.
//
// It returns an error when the date is blank, the database handle cannot be
// opened, loading or persisting fails, or the summary cannot be printed. A
// verification error for a single candidate does not abort the run; it is
// recorded as an "error" result for that candidate.
func runIntradayCandidateVerification(cfg intradayVerificationConfig) error {
	if strings.TrimSpace(cfg.Date) == "" {
		return fmt.Errorf("date 未设置")
	}

	db, err := sql.Open("postgres", cfg.DatabaseURL)
	if err != nil {
		return fmt.Errorf("open db: %w", err)
	}
	defer db.Close()

	candidates, err := loadIntradayVerificationCandidates(context.Background(), db, cfg.Date)
	if err != nil {
		return err
	}

	results := make([]intradayVerificationResult, 0, len(candidates))
	for i := range candidates {
		current := candidates[i]
		outcome, verifyErr := verifyIntradayCandidate(current, cfg.Timeout)
		if verifyErr != nil {
			// Keep the candidate's stored confidence and record why it failed.
			outcome = intradayVerificationResult{
				CandidateID:            current.ID,
				CandidateStatus:        "candidate",
				VerificationConfidence: current.VerificationConfidence,
				VerifierStatus:         "error",
				Notes:                  verifyErr.Error(),
			}
		}
		results = append(results, outcome)
	}

	if cfg.DryRun {
		return printIntradayVerificationSummary(summarizeIntradayVerification(results, cfg.DryRun))
	}
	if persistErr := persistIntradayVerificationResults(context.Background(), db, results); persistErr != nil {
		return persistErr
	}
	return printIntradayVerificationSummary(summarizeIntradayVerification(results, cfg.DryRun))
}
// loadIntradayVerificationCandidates returns the intraday_news_candidate rows
// for the given date whose status is 'candidate' or 'verifying', newest
// discovery first. NULL text columns come back as "" and a NULL candidate_urls
// as an empty list. The returned slice is nil when no rows match.
//
// Errors from the query, from scanning a row, or from decoding candidate_urls
// are wrapped and returned; an iteration error is returned as-is together with
// the rows collected so far.
func loadIntradayVerificationCandidates(ctx context.Context, db *sql.DB, date string) ([]verificationCandidateRow, error) {
	cursor, err := db.QueryContext(ctx, `
SELECT id, candidate_date::text, event_type, provider_name, COALESCE(model_name, ''), COALESCE(provider_country, ''),
title, COALESCE(summary, ''), COALESCE(candidate_urls::text, '[]'), status, verification_confidence
FROM intraday_news_candidate
WHERE candidate_date = $1::date
AND status IN ('candidate', 'verifying')
ORDER BY discovered_at DESC, id DESC`, date)
	if err != nil {
		return nil, fmt.Errorf("query intraday candidates: %w", err)
	}
	defer cursor.Close()

	var loaded []verificationCandidateRow
	for cursor.Next() {
		var (
			item     verificationCandidateRow
			urlsJSON string
		)
		scanErr := cursor.Scan(
			&item.ID,
			&item.CandidateDate,
			&item.EventType,
			&item.ProviderName,
			&item.ModelName,
			&item.ProviderCountry,
			&item.Title,
			&item.Summary,
			&urlsJSON,
			&item.Status,
			&item.VerificationConfidence,
		)
		if scanErr != nil {
			return nil, fmt.Errorf("scan intraday candidate: %w", scanErr)
		}
		// candidate_urls arrives as JSON text and is decoded into the slice.
		if decodeErr := json.Unmarshal([]byte(urlsJSON), &item.CandidateURLs); decodeErr != nil {
			return nil, fmt.Errorf("unmarshal candidate urls: %w", decodeErr)
		}
		loaded = append(loaded, item)
	}
	if iterErr := cursor.Err(); iterErr != nil {
		return loaded, iterErr
	}
	return loaded, nil
}
// verifyIntradayCandidate fetches each of the candidate's URLs in order,
// evaluates the fetched document, and returns the highest-scoring result as
// chosen by preferVerificationResult. It stops early once a result is both
// "verified" and "official_confirmed". The returned error is always nil.
//
// The starting result is an "insufficient" placeholder that keeps the
// candidate's stored confidence. preferVerificationResult only replaces the
// current result on a strictly higher score, so attempts that do not outscore
// the placeholder (including fetch errors) leave it in place.
func verifyIntradayCandidate(candidate verificationCandidateRow, timeout time.Duration) (intradayVerificationResult, error) {
	httpClient := &http.Client{Timeout: timeout}

	// unresolved builds a result that leaves the candidate's status and
	// confidence untouched.
	unresolved := func(link, status, notes string) intradayVerificationResult {
		return intradayVerificationResult{
			CandidateID:            candidate.ID,
			CandidateStatus:        "candidate",
			VerificationConfidence: candidate.VerificationConfidence,
			VerifierURL:            link,
			VerifierStatus:         status,
			Notes:                  notes,
			ExtractedFacts:         map[string]any{},
		}
	}

	best := unresolved("", "insufficient", "未找到足够证据")
	for _, link := range candidate.CandidateURLs {
		document, fetchErr := fetchVerificationDocument(link, httpClient)
		if fetchErr != nil {
			best = preferVerificationResult(best, unresolved(link, "error", fetchErr.Error()))
			continue
		}

		attempt := verifyCandidateDocument(candidate, link, document)
		if attempt.CandidateID == 0 {
			attempt.CandidateID = candidate.ID
		}
		best = preferVerificationResult(best, attempt)

		officiallyConfirmed := best.CandidateStatus == "verified" && best.VerificationConfidence == "official_confirmed"
		if officiallyConfirmed {
			break
		}
	}
	return best, nil
}
// Size caps for remote content read by fetchVerificationDocument.
const (
	// intradayMaxVerificationDocumentBytes bounds how much of a successful
	// response body is read; anything beyond it is ignored.
	intradayMaxVerificationDocumentBytes = 4 << 20
	// intradayMaxVerificationErrorBytes bounds how much of a non-2xx response
	// body is quoted in the returned error.
	intradayMaxVerificationErrorBytes = 512
)

// fetchVerificationDocument issues a GET for rawURL with the given client and
// returns the response body as a string.
//
// The body is read through a size cap so that a very large or endless response
// cannot exhaust memory: at most intradayMaxVerificationDocumentBytes bytes are
// returned and the remainder is dropped. For a non-2xx status the error quotes
// only the first intradayMaxVerificationErrorBytes bytes of the body, because
// callers store the error text as verification notes.
//
// Errors are returned when the request cannot be built, the request fails, the
// status is outside 200-299, or the body cannot be read.
//
// NOTE(review): rawURL comes from stored candidate URLs and is fetched as-is;
// this function does not restrict which hosts may be contacted.
func fetchVerificationDocument(rawURL string, client *http.Client) (string, error) {
	req, err := http.NewRequest(http.MethodGet, rawURL, nil)
	if err != nil {
		return "", fmt.Errorf("build verification request: %w", err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; llm-intelligence intraday verifier)")

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("fetch verification document: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Best effort: a read failure here only shortens the quoted snippet.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, intradayMaxVerificationErrorBytes))
		return "", fmt.Errorf("fetch verification document: unexpected status %d: %s", resp.StatusCode, strings.TrimSpace(string(snippet)))
	}

	payload, err := io.ReadAll(io.LimitReader(resp.Body, intradayMaxVerificationDocumentBytes))
	if err != nil {
		return "", fmt.Errorf("read verification document: %w", err)
	}
	return string(payload), nil
}
// verifyCandidateDocument evaluates one fetched document against a candidate
// and returns the resulting verification outcome.
//
// Parameters:
//   - candidate: the candidate being verified.
//   - candidateURL: the URL the document was fetched from; used to classify the
//     source and recorded as VerifierURL.
//   - body: the raw document text.
//
// Outcome rules:
//   - Non-official source: "verified" / "secondary_confirmed" when the body
//     mentions the model or title, otherwise left as "insufficient".
//   - Official source, official_release: confirmed when the model is mentioned.
//   - Official source, promo_campaign: confirmed when the model is mentioned
//     and the body contains a promotion marker.
//   - Official source, price_cut / price_increase: confirmed only when the
//     extracted prices move in the claimed direction AND the page mentions the
//     candidate's model (or the candidate names no model). A price change on a
//     page that never mentions the target model is not treated as evidence for
//     that model.
//   - Official source, leak_or_rumor: capped at "secondary_confirmed".
//
// Any other case is returned as "candidate" / "insufficient".
func verifyCandidateDocument(candidate verificationCandidateRow, candidateURL, body string) intradayVerificationResult {
	source := classifyVerificationSource(candidate.ProviderName, candidateURL)
	facts := extractVerificationFacts(body)
	modelMatched := bodyMentionsModel(body, candidate.ModelName, candidate.Title)

	result := intradayVerificationResult{
		CandidateID:            candidate.ID,
		CandidateStatus:        "candidate",
		VerificationConfidence: "candidate",
		VerifierSource:         source,
		VerifierURL:            candidateURL,
		VerifierStatus:         "insufficient",
		ExtractedFacts:         facts,
		Notes:                  "证据不足",
	}

	// markVerified flips the result to a matched, verified outcome.
	markVerified := func(confidence, notes string) {
		result.CandidateStatus = "verified"
		result.VerificationConfidence = confidence
		result.VerifierStatus = "matched"
		result.Notes = notes
	}

	if !isOfficialVerificationSource(source) {
		if modelMatched {
			markVerified("secondary_confirmed", "仅二手来源命中,不能进入正式事实层")
		}
		return result
	}

	switch normalizeIntradayEventType(candidate.EventType) {
	case "official_release":
		if modelMatched {
			markVerified("official_confirmed", "官方页面命中模型发布线索")
		}
	case "promo_campaign":
		if modelMatched && bodyMentionsPromo(body) {
			markVerified("official_confirmed", "官方页面命中活动窗口或促销语义")
		}
	case "price_cut", "price_increase":
		priceFacts, priceOK := deriveVerifiedPriceEvent(candidate.EventType, facts)
		// A candidate without a model name cannot be matched by model, so the
		// price facts alone decide in that case.
		targetsModel := modelMatched || strings.TrimSpace(candidate.ModelName) == ""
		switch {
		case priceOK && targetsModel:
			markVerified("official_confirmed", "官方价格页命中真实价格变化")
			result.ExtractedFacts = priceFacts
		case priceOK:
			result.Notes = "命中价格变化但页面未提及目标模型"
		case modelMatched:
			result.Notes = "命中模型但缺少可计算的价格变化事实"
		}
	case "leak_or_rumor":
		if modelMatched {
			markVerified("secondary_confirmed", "保留为待确认情报,不进入正式事实层")
		}
	}
	return result
}
// isOfficialVerificationSource reports whether source is one of the official
// source classes: "official_page", "official_docs", "official_blog" or
// "pricing_page".
func isOfficialVerificationSource(source string) bool {
	return source == "official_page" ||
		source == "official_docs" ||
		source == "official_blog" ||
		source == "pricing_page"
}
// classifyVerificationSource maps a URL to a source class for the given
// provider. URLs that fail to parse, or whose host is not recognised as the
// provider's own, are "secondary_media". Official hosts are refined, in this
// order, into "official_docs", "pricing_page", "official_blog", and finally
// "official_page" based on substrings of the lowercased host and path.
func classifyVerificationSource(providerName, rawURL string) string {
	const fallback = "secondary_media"

	parsed, err := url.Parse(rawURL)
	if err != nil {
		return fallback
	}
	host, path := strings.ToLower(parsed.Host), strings.ToLower(parsed.Path)
	if !isOfficialProviderHost(providerName, host) {
		return fallback
	}

	// pathHasAny reports whether the path contains at least one of the needles.
	pathHasAny := func(needles ...string) bool {
		for _, needle := range needles {
			if strings.Contains(path, needle) {
				return true
			}
		}
		return false
	}

	if strings.Contains(host, "docs.") || pathHasAny("/docs") {
		return "official_docs"
	}
	if strings.Contains(host, "pricing") || pathHasAny("pricing", "price") {
		return "pricing_page"
	}
	if pathHasAny("blog", "news", "announcement") {
		return "official_blog"
	}
	return "official_page"
}
// isOfficialProviderHost reports whether host belongs to the named provider.
//
// host may carry a port, a trailing dot, or mixed case; all are normalised
// away before matching. Tokens that look like domains (they contain a dot)
// match only the domain itself or one of its subdomains, so look-alike hosts
// such as "openai.com.evil.example" or "notopenai.com" are not treated as
// official. Tokens without a dot come from the unknown-provider fallback in
// providerHostTokens and keep the looser substring match.
//
// NOTE(review): the substring fallback for unknown providers is a heuristic
// and can be satisfied by unrelated hosts that merely contain the provider
// name; add an explicit domain list entry for providers that matter.
func isOfficialProviderHost(providerName, host string) bool {
	host = intradayCanonicalHost(host)
	if host == "" {
		return false
	}
	for _, token := range providerHostTokens(providerName) {
		if token == "" {
			continue
		}
		if strings.Contains(token, ".") {
			if host == token || strings.HasSuffix(host, "."+token) {
				return true
			}
			continue
		}
		if strings.Contains(host, token) {
			return true
		}
	}
	return false
}

// intradayCanonicalHost lowercases host and removes surrounding whitespace, a
// ":port" suffix, and a trailing dot. A bracketed IPv6 literal keeps its
// brackets and inner colons.
func intradayCanonicalHost(host string) string {
	host = strings.ToLower(strings.TrimSpace(host))
	if i := strings.LastIndex(host, ":"); i >= 0 && !strings.Contains(host[i:], "]") {
		host = host[:i]
	}
	return strings.TrimSuffix(host, ".")
}

// providerHostTokens returns the host tokens considered official for a
// provider name (matched case-insensitively after trimming). Known providers
// map to a fixed list of domains. Any other non-blank name yields the
// lowercased name itself as the only token; a blank name yields nil.
func providerHostTokens(providerName string) []string {
	clean := strings.ToLower(strings.TrimSpace(providerName))
	switch clean {
	case "openai":
		return []string{"openai.com"}
	case "anthropic":
		return []string{"anthropic.com"}
	case "google", "google gemini", "gemini":
		return []string{"google.com", "google.dev", "ai.google.dev"}
	case "deepseek":
		return []string{"deepseek.com", "deepseek.ai"}
	case "qwen", "dashscope":
		return []string{"aliyun.com", "dashscope.com"}
	case "xai":
		return []string{"x.ai"}
	case "智谱":
		return []string{"zhipuai.cn"}
	case "百度", "百度文心":
		return []string{"baidu.com", "cloud.baidu.com"}
	case "腾讯", "腾讯混元":
		return []string{"tencent.com", "cloud.tencent.com"}
	case "minimax":
		return []string{"minimax.io", "minimax.chat"}
	case "":
		return nil
	default:
		return []string{clean}
	}
}
// bodyMentionsModel reports whether the normalized body contains the
// normalized model name or, failing that, the normalized title. A needle that
// normalizes to the empty string never matches.
func bodyMentionsModel(body, modelName, title string) bool {
	haystack := normalizeEvidenceText(body)

	if needle := normalizeEvidenceText(modelName); needle != "" && strings.Contains(haystack, needle) {
		return true
	}
	needle := normalizeEvidenceText(title)
	return needle != "" && strings.Contains(haystack, needle)
}
// bodyMentionsPromo reports whether the lowercased body contains any of the
// promotion markers (English or Chinese).
func bodyMentionsPromo(body string) bool {
	markers := [...]string{"campaign", "promo", "promotion", "discount", "活动", "优惠", "限时", "窗口"}

	text := strings.ToLower(body)
	for i := 0; i < len(markers); i++ {
		if strings.Index(text, markers[i]) >= 0 {
			return true
		}
	}
	return false
}
// extractVerificationFacts returns the price facts found in body. When an
// old/new price pair is found the map holds "old_input_price" and
// "new_input_price", plus "price_change_pct" (percent change relative to the
// old price) when the old price is non-zero. Otherwise the map is empty.
func extractVerificationFacts(body string) map[string]any {
	before, after, found := extractPricePair(body)
	if !found {
		return map[string]any{}
	}

	facts := map[string]any{
		"old_input_price": before,
		"new_input_price": after,
	}
	if before != 0 {
		facts["price_change_pct"] = ((after - before) / before) * 100
	}
	return facts
}
// deriveVerifiedPriceEvent checks that facts describe a usable price change
// for the given event type. It returns (facts, true) when the old price, new
// price and percent change are all present as float64, both prices are
// positive, and the change direction fits the event: negative for "price_cut",
// positive for "price_increase". Other event types impose no direction check.
// In every other case it returns (nil, false).
func deriveVerifiedPriceEvent(eventType string, facts map[string]any) (map[string]any, bool) {
	oldValue, ok := facts["old_input_price"].(float64)
	if !ok {
		return nil, false
	}
	newValue, ok := facts["new_input_price"].(float64)
	if !ok {
		return nil, false
	}
	changePct, ok := facts["price_change_pct"].(float64)
	if !ok {
		return nil, false
	}
	if oldValue <= 0 || newValue <= 0 {
		return nil, false
	}

	switch normalizeIntradayEventType(eventType) {
	case "price_cut":
		if changePct >= 0 {
			return nil, false
		}
	case "price_increase":
		if changePct <= 0 {
			return nil, false
		}
	}
	return facts, true
}
// Patterns used by extractPricePair, compiled once at start-up. The leading
// \b keeps the keyword from matching inside a longer word, so "gold price" is
// not read as "old price" and "renew price" is not read as "new price".
var (
	intradayOldPriceRe = regexp.MustCompile(`(?i)\b(old|from)\s*price[^0-9$¥]*[$¥]?([0-9]+(?:\.[0-9]+)?)`)
	intradayNewPriceRe = regexp.MustCompile(`(?i)\b(new|to)\s*price[^0-9$¥]*[$¥]?([0-9]+(?:\.[0-9]+)?)`)
)

// extractPricePair looks for an "old price"/"from price" figure and a
// "new price"/"to price" figure in body (case-insensitive, optional $ or ¥
// prefix) and returns them as (old, new, true). It returns (0, 0, false) when
// either figure is missing or cannot be parsed as a number. Only the first
// occurrence of each pattern is used.
func extractPricePair(body string) (float64, float64, bool) {
	oldMatch := intradayOldPriceRe.FindStringSubmatch(body)
	newMatch := intradayNewPriceRe.FindStringSubmatch(body)
	if len(oldMatch) < 3 || len(newMatch) < 3 {
		return 0, 0, false
	}

	var oldValue, newValue float64
	if _, err := fmt.Sscanf(oldMatch[2], "%f", &oldValue); err != nil {
		return 0, 0, false
	}
	if _, err := fmt.Sscanf(newMatch[2], "%f", &newValue); err != nil {
		return 0, 0, false
	}
	return oldValue, newValue, true
}
// intradayEvidenceNoiseRe matches every run of characters that is not a
// lowercase ASCII letter, an ASCII digit, or a Han character. It is compiled
// once because normalizeEvidenceText runs several times per document.
var intradayEvidenceNoiseRe = regexp.MustCompile(`[^a-z0-9\p{Han}]+`)

// normalizeEvidenceText lowercases value and strips everything except ASCII
// letters, ASCII digits and Han characters, so that "GPT-5.6" and "gpt 5 6"
// both become "gpt56". No whitespace can survive the stripping, so no further
// trimming is needed.
func normalizeEvidenceText(value string) string {
	return intradayEvidenceNoiseRe.ReplaceAllString(strings.ToLower(value), "")
}
// preferVerificationResult returns next only when it scores strictly higher
// than current according to verificationScore; on a tie current is kept.
func preferVerificationResult(current, next intradayVerificationResult) intradayVerificationResult {
	if verificationScore(next) <= verificationScore(current) {
		return current
	}
	return next
}
// verificationScore ranks a result by summing three independent parts:
// candidate status (verified 20, rejected 5), confidence (official_confirmed
// 10, secondary_confirmed 5) and verifier status (matched 3, contradicted 1).
// Any other value contributes 0.
func verificationScore(result intradayVerificationResult) int {
	statusPoints := map[string]int{"verified": 20, "rejected": 5}
	confidencePoints := map[string]int{"official_confirmed": 10, "secondary_confirmed": 5}
	verifierPoints := map[string]int{"matched": 3, "contradicted": 1}

	// Missing keys read as zero, which is the score for unlisted values.
	return statusPoints[result.CandidateStatus] +
		confidencePoints[result.VerificationConfidence] +
		verifierPoints[result.VerifierStatus]
}
// persistIntradayVerificationResults stores every result in order and stops at
// the first failure, returning that error. Results stored before the failure
// remain committed.
func persistIntradayVerificationResults(ctx context.Context, db *sql.DB, results []intradayVerificationResult) error {
	for _, result := range results {
		if err := persistIntradayVerificationResult(ctx, db, result); err != nil {
			return err
		}
	}
	return nil
}

// persistIntradayVerificationResult writes one result inside its own
// transaction: it inserts a row into intraday_news_verification and updates
// the matching intraday_news_candidate row. Both statements commit together
// or not at all, so a failure between them can no longer leave a verification
// row whose candidate was never updated. Empty source, URL and notes values
// are stored as NULL.
func persistIntradayVerificationResult(ctx context.Context, db *sql.DB, result intradayVerificationResult) error {
	facts, err := json.Marshal(result.ExtractedFacts)
	if err != nil {
		return fmt.Errorf("marshal extracted facts: %w", err)
	}

	tx, err := db.BeginTx(ctx, nil)
	if err != nil {
		return fmt.Errorf("begin intraday verification tx: %w", err)
	}
	// Rollback after a successful Commit is a no-op that returns
	// sql.ErrTxDone, so its result is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	_, err = tx.ExecContext(ctx, `
INSERT INTO intraday_news_verification (
candidate_id, verifier_source, verifier_url, verifier_status, extracted_facts, notes
) VALUES ($1, NULLIF($2, ''), NULLIF($3, ''), $4, $5::jsonb, NULLIF($6, ''))`,
		result.CandidateID,
		result.VerifierSource,
		result.VerifierURL,
		result.VerifierStatus,
		string(facts),
		result.Notes,
	)
	if err != nil {
		return fmt.Errorf("insert intraday verification: %w", err)
	}

	_, err = tx.ExecContext(ctx, `
UPDATE intraday_news_candidate
SET status = $2,
verification_confidence = $3,
verification_notes = NULLIF($4, ''),
updated_at = CURRENT_TIMESTAMP
WHERE id = $1`,
		result.CandidateID,
		result.CandidateStatus,
		result.VerificationConfidence,
		result.Notes,
	)
	if err != nil {
		return fmt.Errorf("update intraday candidate: %w", err)
	}

	if err := tx.Commit(); err != nil {
		return fmt.Errorf("commit intraday verification: %w", err)
	}
	return nil
}
// summarizeIntradayVerification counts the results by candidate status and by
// confidence level and records the total and the dry-run flag.
func summarizeIntradayVerification(results []intradayVerificationResult, dryRun bool) intradayVerificationSummary {
	var totals intradayVerificationSummary
	totals.CandidateTotal = len(results)
	totals.DryRun = dryRun

	for i := range results {
		switch results[i].CandidateStatus {
		case "verified":
			totals.VerifiedTotal++
		case "rejected":
			totals.RejectedTotal++
		}

		if confidence := results[i].VerificationConfidence; confidence == "official_confirmed" {
			totals.OfficialConfirmedTotal++
		} else if confidence == "secondary_confirmed" {
			totals.SecondaryConfirmedTotal++
		}
	}
	return totals
}
// printIntradayVerificationSummary writes the summary to stdout as a single
// line of JSON. It returns the marshalling error, if any.
func printIntradayVerificationSummary(summary intradayVerificationSummary) error {
	encoded, err := json.Marshal(summary)
	if err == nil {
		_, _ = fmt.Fprintln(os.Stdout, string(encoded))
	}
	return err
}

View File

@@ -0,0 +1,99 @@
//go:build llm_script
package main
import (
"os"
"path/filepath"
"testing"
)
func TestVerifyCandidateDocumentOfficialRelease(t *testing.T) {
body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_official_release.html"))
if err != nil {
t.Fatalf("读取 official release fixture 失败: %v", err)
}
candidate := verificationCandidateRow{
ID: 1,
EventType: "official_release",
ProviderName: "OpenAI",
ModelName: "GPT-5.6",
Title: "GPT-5.6 preview pricing update",
}
result := verifyCandidateDocument(candidate, "https://openai.com/news/gpt-5-6-preview", string(body))
if result.CandidateStatus != "verified" || result.VerificationConfidence != "official_confirmed" {
t.Fatalf("官方发布应被确认: %+v", result)
}
}
func TestVerifyCandidateDocumentPriceCutNeedsRealPriceFacts(t *testing.T) {
body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_pricing_page.html"))
if err != nil {
t.Fatalf("读取 pricing fixture 失败: %v", err)
}
candidate := verificationCandidateRow{
ID: 2,
EventType: "price_cut",
ProviderName: "DeepSeek",
ModelName: "DeepSeek-V4-Flash",
Title: "DeepSeek-V4-Flash price cut",
}
result := verifyCandidateDocument(candidate, "https://deepseek.com/pricing/v4-flash", string(body))
if result.CandidateStatus != "verified" || result.VerificationConfidence != "official_confirmed" {
t.Fatalf("价格页命中真实价格变化后应确认: %+v", result)
}
}
func TestVerifyCandidateDocumentPromoCampaignOfficial(t *testing.T) {
body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_pricing_page.html"))
if err != nil {
t.Fatalf("读取 promo fixture 失败: %v", err)
}
candidate := verificationCandidateRow{
ID: 3,
EventType: "promo_campaign",
ProviderName: "DeepSeek",
ModelName: "DeepSeek-V4-Flash",
Title: "DeepSeek V4 Flash campaign",
}
result := verifyCandidateDocument(candidate, "https://deepseek.com/campaign/v4-flash", string(body))
if result.CandidateStatus != "verified" || result.VerificationConfidence != "official_confirmed" {
t.Fatalf("官方活动页应被确认: %+v", result)
}
}
func TestVerifyCandidateDocumentSecondaryMediaDowngrades(t *testing.T) {
body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_secondary_media.html"))
if err != nil {
t.Fatalf("读取 secondary fixture 失败: %v", err)
}
candidate := verificationCandidateRow{
ID: 4,
EventType: "official_release",
ProviderName: "OpenAI",
ModelName: "GPT-5.6",
Title: "GPT-5.6 leak discussion",
}
result := verifyCandidateDocument(candidate, "https://someblog.example.com/gpt-5-6-leak", string(body))
if result.VerificationConfidence != "secondary_confirmed" {
t.Fatalf("二手媒体应降级为 secondary_confirmed: %+v", result)
}
}
func TestVerifyCandidateDocumentLeakStaysOutOfOfficialFacts(t *testing.T) {
body, err := os.ReadFile(filepath.Join("testdata", "intraday_verification_secondary_media.html"))
if err != nil {
t.Fatalf("读取 leak fixture 失败: %v", err)
}
candidate := verificationCandidateRow{
ID: 5,
EventType: "leak_or_rumor",
ProviderName: "OpenAI",
ModelName: "GPT-5.6",
Title: "GPT-5.6 leak discussion",
}
result := verifyCandidateDocument(candidate, "https://someblog.example.com/gpt-5-6-leak", string(body))
if result.VerificationConfidence == "official_confirmed" {
t.Fatalf("泄露类不应升级为正式事实: %+v", result)
}
}