Files
llm-intelligence/scripts/real_intraday_search_provider.py
phamnazage-jpg 88833fac8b
Some checks failed
CI / go-test (push) Has been cancelled
CI / scripts-regression (push) Has been cancelled
CI / frontend-build (push) Has been cancelled
CI / docker-build (push) Has been cancelled
feat(intraday): monitor DeepSeek official page drift
2026-05-27 22:01:20 +08:00

66 lines
2.1 KiB
Python

#!/usr/bin/env python3
import email.utils
import json
import os
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
def normalize_pubdate(value: str) -> str:
    """Reduce an email-style date string to ``YYYY-MM-DD``.

    Args:
        value: Date text in the format understood by
            ``email.utils.parsedate_to_datetime`` (e.g. an RSS ``pubDate``).

    Returns:
        ``''`` when *value* is empty/falsy, the formatted date when parsing
        succeeds, and *value* unchanged when parsing or formatting fails.
    """
    if not value:
        return ''
    try:
        moment = email.utils.parsedate_to_datetime(value)
        # Formatting stays inside the try so any failure falls back to the
        # raw input, exactly like a parse failure does.
        return f"{moment:%Y-%m-%d}"
    except Exception:
        # Best effort: an unparseable date is passed through untouched.
        return value
def infer_provider(title: str, link: str) -> str:
    """Guess which provider a search result is about.

    The title and link are joined, lower-cased, and scanned for known
    keywords as plain substrings. The table is checked in order, so the
    first keyword that occurs decides the result.

    Args:
        title: Result title.
        link: Result URL.

    Returns:
        The provider label for the first matching keyword, or ``''`` when
        no keyword occurs.
    """
    # Order matters: earlier entries take precedence over later ones.
    keyword_table = (
        ('openai', 'OpenAI'),
        ('anthropic', 'Anthropic'),
        ('claude', 'Anthropic'),
        ('gemini', 'Google'),
        ('google', 'Google'),
        ('deepseek', 'DeepSeek'),
        ('qwen', 'Qwen'),
        ('dashscope', 'DashScope'),
        ('zhipu', '智谱'),
        ('baidu', '百度'),
        ('tencent', '腾讯'),
        ('minimax', 'MiniMax'),
        ('x.ai', 'xAI'),
        ('xai', 'xAI'),
    )
    haystack = ' '.join((title, link)).lower()
    matches = (label for keyword, label in keyword_table if keyword in haystack)
    return next(matches, '')
# The search phrase is supplied through the environment.
query = os.environ.get("INTRADAY_DISCOVERY_QUERY", "").strip()
if query == "":
    # Nothing to search for: emit an empty JSON array and exit successfully.
    print("[]")
    raise SystemExit(0)

# Request the Bing results as RSS so they can be parsed as XML.
url = "https://www.bing.com/search?format=rss&q=" + urllib.parse.quote(query)
request_headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}
req = urllib.request.Request(url, headers=request_headers)
with urllib.request.urlopen(req, timeout=20) as resp:
    raw = resp.read()
# Undecodable bytes are dropped rather than raising.
body = raw.decode("utf-8", errors="ignore")

root = ET.fromstring(body)
items = []
for entry in root.findall('./channel/item'):
    # Missing elements are treated as empty strings; all values are trimmed.
    title, link, desc, pub = (
        (entry.findtext(tag) or '').strip()
        for tag in ('title', 'link', 'description', 'pubDate')
    )
    provider = infer_provider(title, link)

    # provider_url is the scheme://host origin of the result link, or ''
    # when the link is absent or lacks either component.
    provider_url = ''
    if link:
        parts = urllib.parse.urlparse(link)
        if parts.scheme and parts.netloc:
            provider_url = f"{parts.scheme}://{parts.netloc}"

    record = {
        "title": title,
        "summary": desc,
        "url": link,
        "provider": provider,
        "provider_url": provider_url,
        "published_at": normalize_pubdate(pub),
    }
    items.append(record)

# ensure_ascii=False keeps non-ASCII text (e.g. Chinese titles) readable.
print(json.dumps(items, ensure_ascii=False))