Files
user-system/deployment/alertmanager/alertmanager.yml

135 lines
4.3 KiB
YAML
Raw Normal View History

global:
  # Default time after which an alert that carries no end time and has not
  # been refreshed may be declared resolved.
  resolve_timeout: 5m

  # Default HTTP client settings for outgoing notification requests (such as
  # the Feishu webhooks). Only redirect handling is set here; no request
  # timeout is configured in this block.
  http_config:
    follow_redirects: true
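# NOTE:
# This file is a template. In production, every `${...}` placeholder must be
# injected and rendered first, and only the rendered result handed to
# Alertmanager.
# - Email settings come from the `${ALERTMANAGER_*}` variables.
# - Feishu webhook URLs come from `${FEISHU_WEBHOOK_URL_CRITICAL}`,
#   `${FEISHU_WEBHOOK_URL_WARNING}` and `${FEISHU_WEBHOOK_URL_INFO}`; the
#   critical webhook token comes from `${FEISHU_WEBHOOK_SECRET}`.
# - The PagerDuty integration key is meant to come from
#   `${PAGERDUTY_INTEGRATION_KEY}`; no receiver visible in this file
#   references it yet.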
# Alert routing tree. Alerts are grouped by alert name, service and severity,
# then dispatched by severity to one of the child routes below.
route:
  group_by: ['alertname', 'service', 'severity']
  group_wait: 30s
  group_interval: 5m
  # Re-send still-firing alerts every 4h (the previous 12h was too long).
  repeat_interval: 4h
  # Receiver used when no child route matches.
  receiver: 'default'
  routes:
  # `matchers` replaces the deprecated `match` field; the matching semantics
  # (exact equality on the `severity` label) are unchanged.

  # P0: Critical — notify immediately over both Feishu and email (on-call path).
  - matchers:
    - 'severity = "critical"'
    receiver: 'critical-oncall'
    group_wait: 10s
    # Alert again if a critical alert has not recovered after 30 minutes.
    repeat_interval: 30m
    # Critical alerts stop here and are not tested against later routes.
    continue: false

  # P1: Warning — Feishu channel only, no email.
  - matchers:
    - 'severity = "warning"'
    receiver: 'warning-feishu'
    group_wait: 1m
    repeat_interval: 2h
    continue: false

  # P2: Info — recorded in Feishu only.
  - matchers:
    - 'severity = "info"'
    receiver: 'info-feishu'
    group_wait: 5m
    repeat_interval: 24h
    continue: false
# Notification receivers
receivers:
# Default receiver (email fallback). Sends one HTML email per notification
# group, listing every alert in the group; resolved notifications are sent too.
- name: 'default'
  email_configs:
  - to: '${ALERTMANAGER_DEFAULT_TO}'
    from: '${ALERTMANAGER_FROM}'
    smarthost: '${ALERTMANAGER_SMARTHOST}'
    auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
    auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
    send_resolved: true
    headers:
      # Subject carries the notification status (upper-cased) and alert name.
      Subject: '[{{ .Status | toUpper }}][UMS] {{ .GroupLabels.alertname }}'
    html: |
      {{ range .Alerts }}
      <b>告警名称:</b> {{ .Labels.alertname }}<br>
      <b>严重级别:</b> {{ .Labels.severity }}<br>
      <b>摘要:</b> {{ .Annotations.summary }}<br>
      <b>详情:</b> {{ .Annotations.description }}<br>
      <b>时间:</b> {{ .StartsAt.Format "2006-01-02 15:04:05" }}<br>
      <hr>
      {{ end }}
# CRIT-04 fix: critical on-call receiver (two channels: Feishu + email).
- name: 'critical-oncall'
  # Feishu bot webhook (CRIT-04 core fix: everything here used to be a
  # placeholder; it is now in a usable format).
  # NOTE(review): Alertmanager POSTs its own JSON payload to webhook URLs.
  # Confirm the endpoint behind this URL accepts that payload and bearer-token
  # authentication.
  webhook_configs:
  - url: '${FEISHU_WEBHOOK_URL_CRITICAL}'
    send_resolved: true
    http_config:
      # Sent as an `Authorization: Bearer <secret>` header. Replaces the
      # deprecated `bearer_token` field with its documented equivalent.
      authorization:
        type: Bearer
        credentials: '${FEISHU_WEBHOOK_SECRET}'
    max_alerts: 10
  # Email fallback
  email_configs:
  - to: '${ALERTMANAGER_CRITICAL_TO}'
    from: '${ALERTMANAGER_FROM}'
    smarthost: '${ALERTMANAGER_SMARTHOST}'
    auth_username: '${ALERTMANAGER_AUTH_USERNAME}'
    auth_password: '${ALERTMANAGER_AUTH_PASSWORD}'
    send_resolved: true
    headers:
      # Resolved notifications are also sent (send_resolved: true), so the
      # subject must reflect the status. While firing it is exactly
      # "[CRITICAL][UMS] <alertname> — 立即处理"; once resolved it becomes
      # "[RESOLVED][CRITICAL][UMS] <alertname>" without the call to action.
      Subject: '{{ if eq .Status "resolved" }}[RESOLVED]{{ end }}[CRITICAL][UMS] {{ .GroupLabels.alertname }}{{ if eq .Status "firing" }} — 立即处理{{ end }}'
    html: |
      <h2 style="color:red">⚠️ CRITICAL 告警</h2>
      {{ range .Alerts }}
      <b>告警:</b> {{ .Labels.alertname }}<br>
      <b>摘要:</b> {{ .Annotations.summary }}<br>
      <b>详情:</b> {{ .Annotations.description }}<br>
      <b>Runbook:</b> {{ .Annotations.runbook_url }}<br>
      <b>触发时间:</b> {{ .StartsAt.Format "2006-01-02 15:04:05" }}<br>
      <hr>
      {{ end }}
# Warning receiver (Feishu channel). Resolved notifications are sent as well,
# and each webhook message carries at most 20 alerts.
- name: 'warning-feishu'
  webhook_configs:
  - url: '${FEISHU_WEBHOOK_URL_WARNING}'
    send_resolved: true
    max_alerts: 20
# Info receiver (Feishu log channel). Each webhook message carries at most
# 50 alerts.
- name: 'info-feishu'
  webhook_configs:
  - url: '${FEISHU_WEBHOOK_URL_INFO}'
    send_resolved: false  # no notification when info-level alerts recover
    max_alerts: 50
# Inhibition rules: a higher-severity alert mutes lower-severity ones.
# `source_matchers` / `target_matchers` replace the deprecated
# `source_match` / `target_match` fields with the same equality semantics.
# Alertmanager treats a missing label like an empty one, so alerts that carry
# no `service` label count as having equal `service` in the rules below.
inhibit_rules:
# While a critical alert is active, mute the warning alert that has the same
# alertname and the same service.
- source_matchers:
  - 'severity = "critical"'
  target_matchers:
  - 'severity = "warning"'
  equal: ['alertname', 'service']

# While a critical alert is active, mute info alerts of the same service.
- source_matchers:
  - 'severity = "critical"'
  target_matchers:
  - 'severity = "info"'
  equal: ['service']

# While a warning alert is active, mute info alerts of the same service.
- source_matchers:
  - 'severity = "warning"'
  target_matchers:
  - 'severity = "info"'
  equal: ['service']