# =============================================================================
# Sub2API alerting rules
#
# Prometheus rule file. Each group below bundles alerts for one concern:
# SLO / error budget, latency, error rate, infrastructure and business signals.
#
# Severity values used in this file: critical, high, medium (plus one
# `warning`, see the note on AccountSwitchRateHigh).
# =============================================================================
---
groups:
  # ===========================================================================
  # SLO alerts - based on error-budget burn rate
  #
  # The availability target is 99.95%, i.e. an error budget of 0.0005.
  # A burn rate of N means errors arrive at N times the rate that would
  # consume exactly the whole budget over the SLO period.
  # ===========================================================================
  - name: slo_alerts
    interval: 30s
    rules:
      # Fast error-budget burn (critical): 2% of the budget consumed in 1 hour,
      # which corresponds to a burn rate of 14.4.
      # The 5m window (left operand, so it supplies $value) makes the alert
      # reset quickly once errors stop; the 1h window confirms the burn is
      # sustained rather than a short blip.
      - alert: ErrorBudgetBurnRateCritical
        expr: |
          (
            (
              sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
              /
              sum(rate(sub2api_http_requests_total[5m]))
            ) > 0.0005 * 14.4
          )
          and
          (
            (
              sum(rate(sub2api_http_requests_total{status=~"5.."}[1h]))
              /
              sum(rate(sub2api_http_requests_total[1h]))
            ) > 0.0005 * 14.4
          )
        for: 2m
        labels:
          severity: critical
          slo: api-availability
          team: sre
        annotations:
          summary: "错误预算快速燃烧 (Critical)"
          # $value is the 5m error ratio itself, not a fraction of the budget.
          description: "过去5分钟错误率为 {{ $value | humanizePercentage }}(燃烧率超过 14.4 倍),按此速度1小时内将耗尽2%的30天错误预算"
          runbook_url: "https://wiki.internal/runbooks/error-budget-burn"
          # NOTE(review): this points at localhost, which alert recipients
          # usually cannot reach - confirm the externally reachable URL.
          dashboard: "http://localhost:3000/d/slo-dashboard"

      # Accelerated error-budget burn (warning): 5% of the budget consumed in
      # 6 hours, which corresponds to a burn rate of 6.
      # Same short-window AND long-window structure as the critical alert,
      # using 30m and 6h.
      - alert: ErrorBudgetBurnRateWarning
        expr: |
          (
            (
              sum(rate(sub2api_http_requests_total{status=~"5.."}[30m]))
              /
              sum(rate(sub2api_http_requests_total[30m]))
            ) > 0.0005 * 6
          )
          and
          (
            (
              sum(rate(sub2api_http_requests_total{status=~"5.."}[6h]))
              /
              sum(rate(sub2api_http_requests_total[6h]))
            ) > 0.0005 * 6
          )
        for: 5m
        labels:
          severity: high
          slo: api-availability
          team: sre
        annotations:
          summary: "错误预算燃烧加速 (Warning)"
          # $value is the 30m error ratio itself, not a fraction of the budget.
          description: "过去30分钟错误率为 {{ $value | humanizePercentage }}(燃烧率超过 6 倍),按此速度6小时内将耗尽5%的30天错误预算"
          runbook_url: "https://wiki.internal/runbooks/error-budget-burn"

      # SLO about to be violated: availability over the last 24h
      # (1 - 5xx ratio) is below the 99.95% target.
      - alert: SLOViolationImminent
        expr: |
          (
            1 - (
              sum(rate(sub2api_http_requests_total{status=~"5.."}[24h]))
              /
              sum(rate(sub2api_http_requests_total[24h]))
            )
          ) < 0.9995
        for: 5m
        labels:
          severity: high
          slo: api-availability
        annotations:
          summary: "SLO 即将违约"
          description: "过去24小时可用性为 {{ $value | humanizePercentage }},低于目标 99.95%"

  # ===========================================================================
  # Latency alerts
  #
  # Quantiles are computed over all series at once: buckets are summed by
  # `le` only, so every other label is aggregated away.
  # ===========================================================================
  - name: latency_alerts
    rules:
      # P99 HTTP request duration above 5 seconds.
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(sub2api_http_request_duration_seconds_bucket[5m])) by (le)
          ) > 5
        for: 5m
        labels:
          severity: high
        annotations:
          summary: "P99 延迟过高"
          description: "P99 延迟: {{ $value }}s,超过阈值 5s"

      # P95 HTTP request duration above 2 seconds.
      - alert: HighLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum(rate(sub2api_http_request_duration_seconds_bucket[5m])) by (le)
          ) > 2
        for: 10m
        labels:
          severity: medium
        annotations:
          summary: "P95 延迟过高"
          description: "P95 延迟: {{ $value }}s,超过阈值 2s"

      # P99 time-to-first-token (TTFT) at the gateway above 3 seconds.
      - alert: HighTTFT
        expr: |
          histogram_quantile(0.99,
            sum(rate(sub2api_gateway_ttft_seconds_bucket[5m])) by (le)
          ) > 3
        for: 5m
        labels:
          severity: high
        annotations:
          summary: "首 Token 延迟过高"
          description: "TTFT P99: {{ $value }}s,超过阈值 3s"

  # ===========================================================================
  # Error-rate alerts
  # ===========================================================================
  - name: error_alerts
    rules:
      # More than 5% of HTTP requests answered with a 5xx status.
      - alert: HighErrorRate
        expr: |
          sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(sub2api_http_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "错误率过高 (Critical)"
          description: "错误率: {{ $value | humanizePercentage }},超过 5%"

      # More than 1% of HTTP requests answered with a 5xx status.
      - alert: ElevatedErrorRate
        expr: |
          sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(sub2api_http_requests_total[5m])) > 0.01
        for: 5m
        labels:
          severity: medium
        annotations:
          summary: "错误率升高"
          description: "错误率: {{ $value | humanizePercentage }},超过 1%"

      # Upstream errors exceed 10% of gateway requests.
      - alert: UpstreamErrorRateHigh
        expr: |
          sum(rate(sub2api_gateway_upstream_errors_total[5m]))
          /
          sum(rate(sub2api_gateway_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: high
        annotations:
          summary: "上游错误率过高"
          description: "上游错误率: {{ $value | humanizePercentage }}"

  # ===========================================================================
  # Infrastructure alerts
  # ===========================================================================
  - name: infrastructure_alerts
    rules:
      # Active connections exceed 80% of the pool maximum.
      # `ignoring(state)` is required: both operands come from the same metric
      # and differ only in the `state` label, so default one-to-one matching
      # (which compares all labels) would pair nothing and never fire.
      - alert: DatabaseConnectionsHigh
        expr: |
          (
            sub2api_db_connections{state="active"}
            / ignoring(state)
            sub2api_db_connections{state="max"}
          ) > 0.8
        for: 5m
        labels:
          severity: high
        annotations:
          summary: "数据库连接池使用率过高"
          description: "数据库连接池使用率: {{ $value | humanizePercentage }}"

      # Redis memory usage above 90% of the configured maximum.
      # The `> 0` filter drops instances whose maximum is reported as 0;
      # dividing by 0 would yield +Inf and keep the alert firing permanently.
      - alert: RedisMemoryHigh
        expr: |
          redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.9
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis 内存使用率过高"
          description: "Redis 内存使用率: {{ $value | humanizePercentage }}"

      # Per-instance CPU usage (100 minus the idle percentage) above 80%.
      # Uses rate() rather than irate(): irate() only looks at the last two
      # samples, so a momentary dip can reset the `for` timer.
      - alert: HighCPUUsage
        expr: |
          100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: medium
        annotations:
          summary: "CPU 使用率过高"
          description: "实例 {{ $labels.instance }} CPU 使用率: {{ $value }}%"

      # Memory in use (total minus available) above 85% of total.
      - alert: HighMemoryUsage
        expr: |
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes > 0.85
        for: 5m
        labels:
          severity: high
        annotations:
          summary: "内存使用率过高"
          description: "实例 {{ $labels.instance }} 内存使用率: {{ $value | humanizePercentage }}"

      # Less than 15% of the root filesystem is still available.
      - alert: DiskSpaceLow
        expr: |
          (node_filesystem_avail_bytes{mountpoint="/"}
          / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
        for: 5m
        labels:
          severity: high
        annotations:
          summary: "磁盘空间不足"
          description: "实例 {{ $labels.instance }} 磁盘剩余空间: {{ $value | humanizePercentage }}"

  # ===========================================================================
  # Business alerts
  # ===========================================================================
  - name: business_alerts
    rules:
      # Total request rate below 1 request per second.
      # `or vector(0)` supplies a zero when the metric has no series at all,
      # so a complete loss of traffic or of the metric also triggers the alert
      # instead of producing an empty (silent) result.
      - alert: LowQPS
        expr: |
          (sum(rate(sub2api_http_requests_total[5m])) or vector(0)) < 1
        for: 10m
        labels:
          severity: medium
        annotations:
          summary: "请求量异常低"
          description: "当前 QPS: {{ $value }},可能存在流量异常"

      # More than 10 account switches per second.
      - alert: AccountSwitchRateHigh
        expr: |
          sum(rate(sub2api_account_switches_total[5m])) > 10
        for: 5m
        labels:
          # NOTE(review): every other rule in this file uses critical / high /
          # medium; `warning` appears only here. Confirm that Alertmanager
          # routing handles this value, or align it with the others.
          severity: warning
        annotations:
          summary: "账号切换频率过高"
          description: "账号切换率: {{ $value }}/s,可能存在上游不稳定"

      # The last successful heartbeat of a background job is older than
      # 300 seconds.
      - alert: JobHeartbeatStale
        expr: |
          time() - sub2api_job_heartbeat_last_success_timestamp > 300
        for: 2m
        labels:
          severity: high
        annotations:
          summary: "后台任务心跳超时"
          description: "任务 {{ $labels.job_name }} 超过5分钟未报告成功"