---
# =============================================================================
# Alertmanager configuration
# =============================================================================
#
# Defines the global notification settings, the routing tree, the receivers
# and the inhibition rules.
#
# NOTE(review): Alertmanager does not expand ${VAR} placeholders on its own.
# Presumably this file is rendered (e.g. with envsubst) before Alertmanager
# starts -- confirm against the deployment scripts. The placeholders used
# below are SLACK_WEBHOOK_URL, PAGERDUTY_WEBHOOK_URL and
# ALERTMANAGER_INTERNAL_TOKEN.

global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@sub2api.org'
  smtp_auth_username: ''
  smtp_auth_password: ''
  slack_api_url: '${SLACK_WEBHOOK_URL}'
  # Time after which an alert is treated as resolved if it is no longer
  # being reported.
  resolve_timeout: 5m

# Notification template files.
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Routing tree.
route:
  # Alert grouping: alerts sharing these labels are batched into one
  # notification.
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  # NOTE(review): the last child route below has no matchers and therefore
  # matches every alert, so this top-level receiver looks unreachable --
  # confirm that this is intended.
  receiver: 'default'
  routes:
    # Critical alerts - notify immediately.
    - match:
        severity: critical
      receiver: 'critical'
      group_wait: 0s
      repeat_interval: 5m
      continue: true

    # High alerts - notify quickly.
    - match:
        severity: high
      receiver: 'high'
      group_wait: 30s
      repeat_interval: 30m
      continue: true

    # SLO-related alerts.
    - match:
        slo: api-availability
      receiver: 'slo-team'
      continue: true

    # Infrastructure alerts.
    - match_re:
        alertname: 'Database.*|Redis.*|HighCPU.*|HighMemory.*'
      receiver: 'infra-team'
      continue: true

    # Every alert is bridged back to the built-in ops_alert_events store.
    # This route must stay last; it does not set `continue`, so routing
    # stops here.
    - receiver: 'ops-bridge'

# Receiver definitions.
receivers:
  - name: 'default'
    email_configs:
      - to: 'oncall@sub2api.org'
        send_resolved: true
        headers:
          Subject: '[Alert] {{ .GroupLabels.alertname }}'

  - name: 'critical'
    email_configs:
      - to: 'sre-lead@sub2api.org'
        send_resolved: true
    slack_configs:
      - channel: '#alerts-critical'
        send_resolved: true
        title: '🔴 CRITICAL: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Runbook:* {{ .Annotations.runbook_url }}
          {{ end }}
    webhook_configs:
      - url: '${PAGERDUTY_WEBHOOK_URL}'
        send_resolved: true

  - name: 'high'
    email_configs:
      - to: 'oncall@sub2api.org'
        send_resolved: true
    slack_configs:
      - channel: '#alerts-high'
        send_resolved: true
        title: '🟠 HIGH: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          {{ end }}

  - name: 'slo-team'
    email_configs:
      - to: 'slo-team@sub2api.org'
        send_resolved: true

  - name: 'infra-team'
    slack_configs:
      - channel: '#infra-alerts'
        send_resolved: true
        title: '🔧 INFRA: {{ .GroupLabels.alertname }}'

  # ops-bridge: writes Prometheus alerts back into the built-in
  # ops_alert_events table so that operators can see every alert in the
  # existing Ops Dashboard.
  # The bearer token has to be injected through an environment variable:
  #   ALERTMANAGER_INTERNAL_TOKEN=<token>
  - name: 'ops-bridge'
    webhook_configs:
      - url: 'http://host.docker.internal:8080/admin/ops/prometheus-alerts'
        send_resolved: true
        max_alerts: 50
        http_config:
          bearer_token: '${ALERTMANAGER_INTERNAL_TOKEN}'

# Inhibition rules.
inhibit_rules:
  # A critical alert suppresses the matching high alert.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'high'
    equal: ['alertname', 'cluster', 'service']

  # A high alert suppresses the matching medium alert.
  - source_match:
      severity: 'high'
    target_match:
      severity: 'medium'
    equal: ['alertname', 'cluster', 'service']

  # While the application is down, suppress all application-level alerts
  # for the same job (avoids an alert storm).
  - source_match:
      alertname: 'Sub2APIDown'
    target_match_re:
      alertname: 'HighErrorRate|HighLatency.*|SLOErrorBudget.*'
    equal: ['job']