groups: # ========================================================================= # SLO 燃烧率告警(基于错误预算,替代简单阈值告警) # 参考:Google SRE Book - Alerting on SLOs # ========================================================================= - name: ums-slo-burn-rate interval: 30s rules: # ----------------------------------------------------------------------- # SLO-1: API 可用性 (目标: 99.9% / 30天错误预算: 43.8分钟) # ----------------------------------------------------------------------- # 快速燃烧:5m + 1h 双窗口确认,燃烧率 14.4x # 含义:若持续,将在 2小时内 消耗本月 2% 错误预算 - alert: APIAvailability_FastBurn expr: | ( sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) ) > (1 - 0.999) * 14.4 AND ( sum(rate(http_requests_total{status=~"5.."}[1h])) / sum(rate(http_requests_total[1h])) ) > (1 - 0.999) * 14.4 for: 2m labels: severity: critical slo: api-availability page: "true" service: user-management annotations: summary: "🔴 [P0] API 可用性 SLO 快速燃烧 — 立即响应" description: | 错误预算正在以 14.4x 速率消耗(正常速率的14倍) 当前5分钟错误率: {{ $value | humanizePercentage }} 若持续2小时,将消耗本月约 2% 错误预算(约50分钟) SLO 目标: 99.9% (月度允许宕机: 43.8分钟) 运维手册: docs/sre/runbooks/api-availability.md dashboard_url: "http://grafana:3000/d/ums-slo" # 慢速燃烧:30m + 6h 双窗口确认,燃烧率 6x # 含义:若持续,将在 1天内 消耗本月 5% 错误预算 - alert: APIAvailability_SlowBurn expr: | ( sum(rate(http_requests_total{status=~"5.."}[30m])) / sum(rate(http_requests_total[30m])) ) > (1 - 0.999) * 6 AND ( sum(rate(http_requests_total{status=~"5.."}[6h])) / sum(rate(http_requests_total[6h])) ) > (1 - 0.999) * 6 for: 15m labels: severity: warning slo: api-availability page: "false" service: user-management annotations: summary: "🟡 [P2] API 可用性 SLO 缓慢燃烧 — 需在工作时间内关注" description: | 错误预算正在以 6x 速率缓慢消耗 若持续1天,将消耗本月 5% 错误预算 当前30分钟错误率: {{ $value | humanizePercentage }} # ----------------------------------------------------------------------- # SLO-2: API 延迟 (目标: P99 < 500ms 覆盖 99% 请求) # ----------------------------------------------------------------------- - alert: APILatency_FastBurn expr: | histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le) ) > 0.5 AND histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[1h])) by (le) ) > 0.5 for: 5m labels: severity: critical slo: api-latency page: "true" service: user-management annotations: summary: "🔴 [P0] API 延迟 SLO 违规 — P99 超过 500ms" description: | 当前 P99 延迟: {{ $value | humanizeDuration }} SLO 目标: P99 < 500ms 请检查慢查询和数据库连接池 - alert: APILatency_CriticalPath expr: | histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{ path=~".*auth/login.*|.*auth/refresh.*" }[5m])) by (le, path) ) > 0.3 for: 3m labels: severity: critical slo: api-latency-auth service: user-management annotations: summary: "🔴 [P0] 认证关键路径延迟超标" description: | 路径 {{ $labels.path }} 的 P99 延迟: {{ $value | humanizeDuration }} 认证路径 SLO: P99 < 300ms # ----------------------------------------------------------------------- # SLO-3: 登录成功率 (目标: 99% 非攻击流量) # ----------------------------------------------------------------------- - alert: LoginSuccessRate_Degraded expr: | ( sum(rate(user_logins_total{status="success"}[10m])) / sum(rate(user_logins_total[10m])) ) < 0.9 for: 5m labels: severity: warning slo: login-success-rate service: user-management annotations: summary: "🟡 [P2] 登录成功率下降" description: | 当前10分钟登录成功率: {{ $value | humanizePercentage }} SLO 目标: 99% 注意:高失败率可能是暴力破解也可能是系统问题,请结合安全事件判断 # ========================================================================= # 基础设施告警(阈值型,高置信度) # ========================================================================= - name: ums-infrastructure interval: 30s rules: # 服务宕机(最高优先级) - alert: ServiceDown expr: up{job="user-management"} == 0 for: 1m labels: severity: critical page: "true" service: user-management annotations: summary: "🚨 [P0] 用户管理服务实例宕机" description: "实例 {{ $labels.instance }} 已离线超过 1 分钟,健康检查失败" # 数据库不可用(通过高 503 率推断) - alert: DatabaseConnectionFailed expr: | sum(rate(http_requests_total{status="503"}[2m])) > 1 for: 1m labels: severity: critical page: "true" service: user-management annotations: summary: "🚨 [P0] 数据库连接失败,服务不可用" description: | 大量 503 响应,可能是数据库连接池耗尽或数据库宕机 运维手册: docs/sre/runbooks/database-down.md # 数据库连接池使用率 - alert: DatabaseConnectionPoolHigh expr: | (db_connections_active / db_connections_max) > 0.8 for: 3m labels: severity: warning service: user-management annotations: summary: "🟡 数据库连接池使用率超过 80%" description: | 活跃连接: {{ $value | humanizePercentage }} 使用率 若持续增长,可能导致连接拒绝 建议:检查慢查询,或增加连接池大小 # 高内存使用 - alert: HighMemoryUsage expr: | system_memory_usage_bytes > 800000000 # 800MB for: 5m labels: severity: warning service: user-management annotations: summary: "🟡 内存使用超过 800MB" description: "当前内存使用: {{ $value | humanize1024 }}B,请检查内存泄漏" # Goroutine 数量异常 - alert: GoroutineLeakSuspected expr: system_goroutines > 1000 for: 10m labels: severity: warning service: user-management annotations: summary: "🟡 Goroutine 数量异常,疑似泄漏" description: "当前 goroutine 数量: {{ $value }},超过 1000" # 高响应时间(保留,作为绝对阈值兜底) - alert: HighResponseTime_Absolute expr: | histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path) ) > 2 for: 5m labels: severity: warning service: user-management annotations: summary: "🟡 API P95 响应时间超过 2 秒" description: "路径 {{ $labels.path }} 响应时间 P95: {{ $value }}s,超过绝对阈值 2s" # ========================================================================= # 安全事件告警 # ========================================================================= - name: ums-security interval: 30s rules: # 暴力破解检测 - alert: BruteForceAttackDetected expr: | ( sum(rate(user_logins_total{status="failed"}[5m])) / sum(rate(user_logins_total[5m])) ) > 0.5 AND sum(rate(user_logins_total[5m])) > 1 for: 3m labels: severity: critical category: security page: "true" service: user-management annotations: summary: "🔐 [P0-SEC] 疑似暴力破解攻击" description: | 登录失败率: {{ $value | humanizePercentage }},超过 50% 请立即检查来源 IP 并确认封禁是否生效 运维手册: docs/sre/runbooks/brute-force.md # 异常检测激增 - alert: AnomalyDetectionSpike expr: | sum(rate(anomaly_detected_total[5m])) > 5 for: 2m labels: severity: warning category: security service: user-management annotations: summary: "🔐 [P2-SEC] 异常登录检测激增" description: | 每秒检测到 {{ $value | humanize }} 个异常事件 可能存在地理位置异常、未知设备或账号泄露 # Token 刷新失败激增 - alert: TokenRefreshFailureSpike expr: | sum(rate(token_refresh_total{status="failure"}[5m])) > 10 for: 2m labels: severity: warning category: auth service: user-management annotations: summary: "🟡 Token 刷新失败激增" description: | 每分钟 Token 刷新失败: {{ $value | humanize }} 可能原因:JWT Secret 轮换、时钟偏差、Redis 不可用 # 账号锁定激增 - alert: AccountLockoutSpike expr: | rate(account_lock_total[10m]) > 0.5 for: 5m labels: severity: warning category: security service: user-management annotations: summary: "🔐 账号锁定事件激增" description: "每分钟账号锁定: {{ $value | humanize }},可能存在针对性攻击" # ========================================================================= # 缓存健康告警 # ========================================================================= - name: ums-cache interval: 60s rules: # 缓存命中率低 - alert: LowCacheHitRate expr: | ( sum(rate(cache_hits_total[10m])) / sum(rate(cache_operations_total[10m])) ) < 0.6 AND sum(rate(cache_operations_total[10m])) > 1 for: 15m labels: severity: warning service: user-management annotations: summary: "🟡 缓存命中率低于 60%" description: | 当前命中率: {{ $value | humanizePercentage }} 可能导致数据库压力增大 请检查缓存 TTL 配置和热点 Key 分布 # ========================================================================= # 业务异常告警(信息类) # ========================================================================= - name: ums-business interval: 60s rules: # API 请求量异常(使用相对偏差,而非绝对值) - alert: APIRequestVolumeAnomaly expr: | ( sum(rate(http_requests_total[5m])) / avg_over_time(sum(rate(http_requests_total[5m]))[1h:5m]) ) > 3 OR ( sum(rate(http_requests_total[5m])) / avg_over_time(sum(rate(http_requests_total[5m]))[1h:5m]) ) < 0.1 for: 5m labels: severity: info service: user-management annotations: summary: "📊 API 请求量异常偏离基线" description: | 当前请求量是过去1小时均值的 {{ $value | humanize }} 倍 可能是流量突增(>3x)或流量断崖(<0.1x)