# llm-intelligence/scripts/real_intraday_search_provider.py

#!/usr/bin/env python3
import email.utils
import json
import os
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET


def normalize_pubdate(value: str) -> str:
    """Reduce an RFC 2822 style date string to ``YYYY-MM-DD``.

    Args:
        value: Raw date text, e.g. the content of an RSS ``pubDate`` element.

    Returns:
        ``''`` when *value* is empty/falsy; the calendar date of the parsed
        timestamp (in the timestamp's own offset, no timezone conversion)
        when parsing succeeds; otherwise *value* unchanged.
    """
    if value:
        try:
            parsed = email.utils.parsedate_to_datetime(value)
            # A non-empty format spec on a datetime delegates to strftime().
            return f"{parsed:%Y-%m-%d}"
        except Exception:
            # Best effort: hand back the raw text rather than failing the item.
            return value
    return ''


def infer_provider(title: str, link: str) -> str:
    """Guess the provider name from keywords in a result's title and link.

    The title and link are joined and lower-cased, then checked against a
    fixed keyword table in order; the first keyword found as a substring wins.

    Args:
        title: Result title text.
        link: Result URL text.

    Returns:
        The display name mapped to the first matching keyword, or ``''`` when
        no keyword occurs.
    """
    haystack = ' '.join((title, link)).lower()
    # Order matters: earlier rows take priority when several keywords occur.
    keyword_table = (
        ('openai', 'OpenAI'),
        ('anthropic', 'Anthropic'),
        ('claude', 'Anthropic'),
        ('gemini', 'Google'),
        ('google', 'Google'),
        ('deepseek', 'DeepSeek'),
        ('qwen', 'Qwen'),
        ('dashscope', 'DashScope'),
        ('zhipu', '智谱'),
        ('baidu', '百度'),
        ('tencent', '腾讯'),
        ('minimax', 'MiniMax'),
        ('x.ai', 'xAI'),
        ('xai', 'xAI'),
    )
    return next(
        (label for keyword, label in keyword_table if keyword in haystack),
        '',
    )


def _child_text(node, tag):
    """Return the stripped text of the first *tag* child of *node*, or ''."""
    return (node.findtext(tag) or '').strip()


def _origin(link):
    """Return ``scheme://netloc`` of *link*; '' if it is empty or lacks either part."""
    if not link:
        return ''
    pieces = urllib.parse.urlparse(link)
    if not (pieces.scheme and pieces.netloc):
        return ''
    return f"{pieces.scheme}://{pieces.netloc}"


def _as_record(node):
    """Convert one RSS ``<item>`` element into the output dictionary."""
    title = _child_text(node, 'title')
    link = _child_text(node, 'link')
    # Key order is preserved by json.dumps, so keep it stable.
    return {
        "title": title,
        "summary": _child_text(node, 'description'),
        "url": link,
        "provider": infer_provider(title, link),
        "provider_url": _origin(link),
        "published_at": normalize_pubdate(_child_text(node, 'pubDate')),
    }


# The search query comes from the environment; without one, print an empty
# JSON array and exit with status 0.
query = os.environ.get("INTRADAY_DISCOVERY_QUERY", "").strip()
if not query:
    print("[]")
    raise SystemExit(0)

# Request the Bing search results in RSS form for the percent-encoded query.
request_headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}
url = "https://www.bing.com/search?format=rss&q=" + urllib.parse.quote(query)
req = urllib.request.Request(url, headers=request_headers)
with urllib.request.urlopen(req, timeout=20) as resp:
    raw = resp.read()
# Undecodable bytes are dropped rather than raising.
body = raw.decode("utf-8", errors="ignore")

# Parse the feed and print every <item> under <channel> as one JSON array,
# keeping non-ASCII characters unescaped.
root = ET.fromstring(body)
items = [_as_record(node) for node in root.findall('./channel/item')]
print(json.dumps(items, ensure_ascii=False))