- Remove old review reports (keep latest only) - Move docs/ to deploy/docs-backup/ - Move performance-testing/ to deploy/ - Clean up test output files - Organize root directory
146 lines
3.8 KiB
YAML
146 lines
3.8 KiB
YAML
# =============================================================================
# Alertmanager configuration file
# =============================================================================

# Global settings: defaults shared by the receivers defined in this file.
global:
  # Outgoing SMTP relay and sender address used by every email_configs entry.
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@sub2api.org'
  # Empty credentials; presumably the local relay accepts unauthenticated
  # mail - TODO confirm.
  smtp_auth_username: ''
  smtp_auth_password: ''

  # Default Slack incoming-webhook URL for every slack_configs entry.
  # NOTE(review): Alertmanager does not expand ${...} placeholders itself;
  # presumably this file is rendered (e.g. envsubst) before startup - confirm.
  slack_api_url: '${SLACK_WEBHOOK_URL}'

  # Time after which an alert that has not been updated is treated as resolved.
  resolve_timeout: 5m

# Files from which custom notification templates are read; the last path
# component is a glob pattern.
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Routing tree: decides which receiver(s) are notified for each alert.
# Child routes are evaluated top to bottom; `continue: true` lets evaluation
# proceed to the following siblings after a match.
route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  # Receiver for alerts that match none of the child routes below.
  # NOTE(review): the last child route has no matchers and so matches every
  # alert, which appears to make this fallback unreachable - confirm intended.
  receiver: 'default'

  routes:
    # Critical alerts: notify immediately.
    - match:
        severity: critical
      receiver: 'critical'
      group_wait: 0s
      repeat_interval: 5m
      continue: true

    # High alerts: notify quickly.
    - match:
        severity: high
      receiver: 'high'
      group_wait: 30s
      repeat_interval: 30m
      continue: true

    # SLO-related alerts.
    - match:
        slo: api-availability
      receiver: 'slo-team'
      continue: true

    # Infrastructure alerts (selected by alert name).
    - match_re:
        alertname: 'Database.*|Redis.*|HighCPU.*|HighMemory.*'
      receiver: 'infra-team'
      continue: true

    # Catch-all: every alert is bridged back into the built-in
    # ops_alert_events. Keep this route last; `continue` is left at its
    # default (false).
    - receiver: 'ops-bridge'

# Receiver definitions: the notification targets referenced by the routes.
receivers:
  # Email to the on-call address.
  - name: 'default'
    email_configs:
      - to: 'oncall@sub2api.org'
        send_resolved: true
        headers:
          Subject: '[Alert] {{ .GroupLabels.alertname }}'

  # Critical alerts: email, Slack channel and a paging webhook.
  - name: 'critical'
    email_configs:
      - to: 'sre-lead@sub2api.org'
        send_resolved: true
    slack_configs:
      - channel: '#alerts-critical'
        send_resolved: true
        title: '🔴 CRITICAL: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Runbook:* {{ .Annotations.runbook_url }}
          {{ end }}
    webhook_configs:
      # NOTE(review): Alertmanager does not expand ${...} itself; presumably
      # the file is rendered (e.g. envsubst) before startup - confirm.
      - url: '${PAGERDUTY_WEBHOOK_URL}'
        send_resolved: true

  # High alerts: email and Slack channel.
  - name: 'high'
    email_configs:
      - to: 'oncall@sub2api.org'
        send_resolved: true
    slack_configs:
      - channel: '#alerts-high'
        send_resolved: true
        title: '🟠 HIGH: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *Summary:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          {{ end }}

  # SLO alerts: email to the SLO team.
  - name: 'slo-team'
    email_configs:
      - to: 'slo-team@sub2api.org'
        send_resolved: true

  # Infrastructure alerts: Slack channel only.
  - name: 'infra-team'
    slack_configs:
      - channel: '#infra-alerts'
        send_resolved: true
        title: '🔧 INFRA: {{ .GroupLabels.alertname }}'

  # ops-bridge: writes Prometheus alerts back into the built-in
  # ops_alert_events table, so operators can view all alerts together in the
  # existing Ops Dashboard.
  # The bearer token must be injected through an environment variable:
  #   ALERTMANAGER_INTERNAL_TOKEN=<same value as app INTERNAL_WEBHOOK_TOKEN>
  # NOTE(review): Alertmanager does not expand ${...} itself; presumably the
  # file is rendered (e.g. envsubst) before startup - confirm.
  - name: 'ops-bridge'
    webhook_configs:
      - url: 'http://host.docker.internal:8080/admin/ops/prometheus-alerts'
        send_resolved: true
        max_alerts: 50
        http_config:
          bearer_token: '${ALERTMANAGER_INTERNAL_TOKEN}'

# Inhibition rules: mute target alerts while a matching source alert fires
# and the labels listed under `equal` have the same values on both alerts.
inhibit_rules:
  # A critical alert suppresses the high alert with the same
  # alertname/cluster/service.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'high'
    equal: ['alertname', 'cluster', 'service']

  # A high alert suppresses the medium alert with the same
  # alertname/cluster/service.
  - source_match:
      severity: 'high'
    target_match:
      severity: 'medium'
    equal: ['alertname', 'cluster', 'service']

  # While the application is down, suppress application-level alerts for the
  # same job (avoids an alert storm).
  - source_match:
      alertname: 'Sub2APIDown'
    target_match_re:
      alertname: 'HighErrorRate|HighLatency.*|SLOErrorBudget.*'
    equal: ['job']