# =============================================================================
# Alertmanager 配置文件
# =============================================================================

global:
  # Outgoing SMTP settings shared by every email_configs entry in this file.
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@sub2api.org'
  # Credentials are left empty in this file.
  smtp_auth_username: ''
  smtp_auth_password: ''

  # Default Slack webhook URL used by the slack_configs entries below.
  # NOTE(review): Alertmanager does not expand ${VAR} placeholders on its own,
  # so this file presumably has to be rendered (e.g. with envsubst) before it
  # is loaded -- confirm against the deployment tooling.
  slack_api_url: '${SLACK_WEBHOOK_URL}'

  # How long to wait before treating an alert as resolved once it stops
  # being reported.
  resolve_timeout: 5m

# Notification template files (glob pattern).
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Routing tree. Child routes are evaluated top to bottom; a route with
# `continue: true` lets evaluation carry on to the following siblings.
route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  # NOTE(review): the last child route below has no matchers, so it matches
  # every alert; the root receiver is only used when no child route matches,
  # which means 'default' looks unreachable -- confirm this is intended.
  receiver: 'default'

  routes:
    # Critical alerts - notify immediately.
    - match:
        severity: critical
      receiver: 'critical'
      group_wait: 0s
      repeat_interval: 5m
      continue: true

    # High alerts - notify quickly.
    - match:
        severity: high
      receiver: 'high'
      group_wait: 30s
      repeat_interval: 30m
      continue: true

    # SLO-related alerts.
    - match:
        slo: api-availability
      receiver: 'slo-team'
      continue: true

    # Infrastructure alerts, selected by alert name.
    - match_re:
        alertname: 'Database.*|Redis.*|HighCPU.*|HighMemory.*'
      receiver: 'infra-team'
      continue: true

    # Bridge every alert back to the built-in ops_alert_events store.
    # Kept last, and `continue` is left at its default (false).
    - receiver: 'ops-bridge'

# Receiver definitions. Each name is referenced by the routing tree.
receivers:
  - name: 'default'
    email_configs:
      - to: 'oncall@sub2api.org'
        send_resolved: true
        headers:
          Subject: '[Alert] {{ .GroupLabels.alertname }}'

  # Critical alerts fan out to email, Slack and a webhook.
  - name: 'critical'
    email_configs:
      - to: 'sre-lead@sub2api.org'
        send_resolved: true
    slack_configs:
      - channel: '#alerts-critical'
        send_resolved: true
        title: '🔴 CRITICAL: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Runbook:* {{ .Annotations.runbook_url }}
          {{ end }}
    webhook_configs:
      # NOTE(review): ${PAGERDUTY_WEBHOOK_URL} is a placeholder that
      # Alertmanager does not expand itself; presumably substituted before
      # the file is loaded -- confirm.
      - url: '${PAGERDUTY_WEBHOOK_URL}'
        send_resolved: true

  - name: 'high'
    email_configs:
      - to: 'oncall@sub2api.org'
        send_resolved: true
    slack_configs:
      - channel: '#alerts-high'
        send_resolved: true
        title: '🟠 HIGH: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          {{ end }}

  - name: 'slo-team'
    email_configs:
      - to: 'slo-team@sub2api.org'
        send_resolved: true

  - name: 'infra-team'
    slack_configs:
      - channel: '#infra-alerts'
        send_resolved: true
        title: '🔧 INFRA: {{ .GroupLabels.alertname }}'

  # ops-bridge: writes Prometheus alerts back into the built-in
  # ops_alert_events table, so operators can review every alert in the
  # existing Ops Dashboard.
  # The bearer token has to be injected through an environment variable:
  #   ALERTMANAGER_INTERNAL_TOKEN=<same value as app INTERNAL_WEBHOOK_TOKEN>
  - name: 'ops-bridge'
    webhook_configs:
      - url: 'http://host.docker.internal:8080/admin/ops/prometheus-alerts'
        send_resolved: true
        # Upper bound on the number of alerts sent in one webhook message.
        max_alerts: 50
        http_config:
          # NOTE(review): placeholder is not expanded by Alertmanager itself;
          # presumably rendered before load -- confirm.
          bearer_token: '${ALERTMANAGER_INTERNAL_TOKEN}'

# Inhibition rules: a firing "source" alert mutes matching "target" alerts
# when both carry the same values for the labels listed under `equal`.
inhibit_rules:
  # A critical alert mutes the corresponding high alert.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'high'
    equal: ['alertname', 'cluster', 'service']

  # A high alert mutes the corresponding medium alert.
  - source_match:
      severity: 'high'
    target_match:
      severity: 'medium'
    equal: ['alertname', 'cluster', 'service']

  # While the application is down, mute the listed application-level alerts
  # for the same job (avoids an alert storm).
  - source_match:
      alertname: 'Sub2APIDown'
    target_match_re:
      alertname: 'HighErrorRate|HighLatency.*|SLOErrorBudget.*'
    equal: ['job']