Files

146 lines
3.8 KiB
YAML
Raw Permalink Normal View History

# =============================================================================
# Alertmanager 配置文件
# =============================================================================
# Defaults shared by every receiver unless a receiver overrides them.
global:
  # Time after which an alert that is no longer being refreshed is treated
  # as resolved.
  resolve_timeout: 5m

  # Outbound mail relay and sender address used by all email_configs entries.
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@sub2api.org'
  # SMTP credentials are intentionally left empty here.
  smtp_auth_username: ''
  smtp_auth_password: ''

  # Default Slack incoming-webhook URL used by all slack_configs entries.
  # NOTE(review): presumably this placeholder is substituted (e.g. by the
  # container entrypoint) before Alertmanager loads the file - confirm.
  slack_api_url: '${SLACK_WEBHOOK_URL}'
# Notification template files, followed by the routing tree (`route:`)
templates:
  # Every *.tmpl file in this directory is loaded as a notification template.
  - '/etc/alertmanager/templates/*.tmpl'
# Routing tree. The root node supplies the default receiver and grouping
# settings; child routes are evaluated top to bottom, and `continue: true`
# lets an alert keep matching the siblings that follow.
route:
  receiver: 'default'
  group_by:
    - alertname
    - cluster
    - service
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h

  # NOTE(review): the last child route below has no matcher, so every alert
  # matches at least one child. It looks like the root `default` receiver is
  # therefore never reached - confirm this is intended.
  routes:
    # Critical alerts: notify immediately and repeat every 5 minutes.
    - receiver: 'critical'
      match:
        severity: critical
      group_wait: 0s
      repeat_interval: 5m
      continue: true

    # High alerts: notify quickly and repeat every 30 minutes.
    - receiver: 'high'
      match:
        severity: high
      group_wait: 30s
      repeat_interval: 30m
      continue: true

    # SLO-related alerts.
    - receiver: 'slo-team'
      match:
        slo: api-availability
      continue: true

    # Infrastructure alerts, selected by alert-name pattern.
    - receiver: 'infra-team'
      match_re:
        alertname: 'Database.*|Redis.*|HighCPU.*|HighMemory.*'
      continue: true

    # Catch-all: every alert is bridged back to the built-in ops_alert_events
    # store. This entry is last and sets no `continue`, so evaluation ends
    # here.
    - receiver: 'ops-bridge'
# Receivers: the notification targets referenced by name from the routing
# tree.
receivers:
  # Email to the on-call mailbox with a custom subject line.
  - name: 'default'
    email_configs:
      - to: 'oncall@sub2api.org'
        send_resolved: true
        headers:
          Subject: '[Alert] {{ .GroupLabels.alertname }}'

  # Critical alerts fan out to email, Slack and a webhook.
  - name: 'critical'
    email_configs:
      - to: 'sre-lead@sub2api.org'
        send_resolved: true
    slack_configs:
      - channel: '#alerts-critical'
        send_resolved: true
        title: '🔴 CRITICAL: {{ .GroupLabels.alertname }}'
        # One summary / description / runbook stanza per alert in the group.
        text: |
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Runbook:* {{ .Annotations.runbook_url }}
          {{ end }}
    webhook_configs:
      # NOTE(review): presumably this placeholder is substituted before
      # Alertmanager loads the file - confirm.
      - url: '${PAGERDUTY_WEBHOOK_URL}'
        send_resolved: true

  # High alerts go to email and Slack.
  - name: 'high'
    email_configs:
      - to: 'oncall@sub2api.org'
        send_resolved: true
    slack_configs:
      - channel: '#alerts-high'
        send_resolved: true
        title: '🟠 HIGH: {{ .GroupLabels.alertname }}'
        # One summary / description stanza per alert in the group.
        text: |
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          {{ end }}

  # SLO alerts are emailed to the SLO team.
  - name: 'slo-team'
    email_configs:
      - to: 'slo-team@sub2api.org'
        send_resolved: true

  # Infrastructure alerts are posted to the infra Slack channel.
  - name: 'infra-team'
    slack_configs:
      - channel: '#infra-alerts'
        send_resolved: true
        title: '🔧 INFRA: {{ .GroupLabels.alertname }}'

  # ops-bridge: writes Prometheus alerts back into the built-in
  # ops_alert_events table, so operators can see all alerts in the existing
  # Ops Dashboard.
  # The bearer token must be injected through an environment variable:
  #   ALERTMANAGER_INTERNAL_TOKEN=<same value as app INTERNAL_WEBHOOK_TOKEN>
  - name: 'ops-bridge'
    webhook_configs:
      - url: 'http://host.docker.internal:8080/admin/ops/prometheus-alerts'
        send_resolved: true
        # NOTE(review): caps each webhook payload at 50 alerts; verify the
        # bridge tolerates truncated groups.
        max_alerts: 50
        http_config:
          bearer_token: '${ALERTMANAGER_INTERNAL_TOKEN}'
# Inhibition rules: while an alert matching the source side is firing, alerts
# matching the target side are muted, provided both carry the same values for
# the labels listed under `equal`.
inhibit_rules:
  # A critical alert mutes the corresponding high alert.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'high'
    equal:
      - alertname
      - cluster
      - service

  # A high alert mutes the corresponding medium alert.
  - source_match:
      severity: 'high'
    target_match:
      severity: 'medium'
    equal:
      - alertname
      - cluster
      - service

  # While the application is down, mute its application-level alerts for the
  # same job to avoid an alert storm.
  - source_match:
      alertname: 'Sub2APIDown'
    target_match_re:
      alertname: 'HighErrorRate|HighLatency.*|SLOErrorBudget.*'
    equal:
      - job