Files

238 lines
8.1 KiB
YAML
Raw Permalink Normal View History

# =============================================================================
# Sub2API 告警规则
# =============================================================================
groups:
# ===========================================================================
# SLO alerts - based on error-budget burn rate.
# Availability target is 99.95%, i.e. an error budget of 0.0005.
# ===========================================================================
- name: slo_alerts
  interval: 30s
  rules:
  # Fast burn (critical): a 14.4x burn rate sustained for 1h consumes 2% of
  # the 30-day error budget (14.4 * 1h / 720h = 2%).
  # Multiwindow form: the 1h window proves the burn is significant, the 5m
  # window proves it is still ongoing, so a short spike alone does not page
  # and the alert resets quickly once errors stop.
  # $value is the left-hand (5m) error ratio.
  - alert: ErrorBudgetBurnRateCritical
    expr: |
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
        /
        sum(rate(sub2api_http_requests_total[5m]))
      ) > 0.0005 * 14.4
      and
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[1h]))
        /
        sum(rate(sub2api_http_requests_total[1h]))
      ) > 0.0005 * 14.4
    for: 2m
    labels:
      severity: critical
      slo: api-availability
      team: sre
    annotations:
      summary: "错误预算快速燃烧 (Critical)"
      description: "过去5分钟错误率为 {{ $value | humanizePercentage }},燃烧率超过 14.4 倍,预计1小时内耗尽2%错误预算"
      runbook_url: "https://wiki.internal/runbooks/error-budget-burn"
      # NOTE(review): a localhost URL only resolves on the Grafana host
      # itself; replace with the externally reachable Grafana address.
      dashboard: "http://localhost:3000/d/slo-dashboard"
  # Slow burn (warning): a 6x burn rate sustained for 6h consumes 5% of the
  # 30-day error budget (6 * 6h / 720h = 5%).
  # Same multiwindow form as above: 6h long window, 30m short window.
  - alert: ErrorBudgetBurnRateWarning
    expr: |
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[30m]))
        /
        sum(rate(sub2api_http_requests_total[30m]))
      ) > 0.0005 * 6
      and
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[6h]))
        /
        sum(rate(sub2api_http_requests_total[6h]))
      ) > 0.0005 * 6
    for: 5m
    labels:
      severity: high
      slo: api-availability
      team: sre
    annotations:
      summary: "错误预算燃烧加速 (Warning)"
      description: "过去30分钟错误率为 {{ $value | humanizePercentage }},燃烧率超过 6 倍,预计6小时内耗尽5%错误预算"
      runbook_url: "https://wiki.internal/runbooks/error-budget-burn"
  # SLO about to be violated: availability over the trailing 24h has dropped
  # below the 99.95% target. $value is the availability ratio.
  - alert: SLOViolationImminent
    expr: |
      (
        1 - (
          sum(rate(sub2api_http_requests_total{status=~"5.."}[24h]))
          /
          sum(rate(sub2api_http_requests_total[24h]))
        )
      ) < 0.9995
    for: 5m
    labels:
      severity: high
      slo: api-availability
      # Added for consistency with the other api-availability alerts.
      team: sre
    annotations:
      summary: "SLO 即将违约"
      description: "过去24小时可用性为 {{ $value | humanizePercentage }},低于目标 99.95%"
# ===========================================================================
# Latency alerts
# ===========================================================================
- name: latency_alerts
  rules:
  # P99 HTTP request latency above 5s, sustained for 5 minutes.
  # Buckets are summed across all series by `le` before taking the quantile.
  # $value is in seconds; humanizeDuration renders it with its unit.
  - alert: HighLatencyP99
    expr: |
      histogram_quantile(0.99,
        sum(rate(sub2api_http_request_duration_seconds_bucket[5m])) by (le)
      ) > 5
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "P99 延迟过高"
      description: "P99 延迟: {{ $value | humanizeDuration }},超过阈值 5s"
  # P95 HTTP request latency above 2s, sustained for 10 minutes.
  - alert: HighLatencyP95
    expr: |
      histogram_quantile(0.95,
        sum(rate(sub2api_http_request_duration_seconds_bucket[5m])) by (le)
      ) > 2
    for: 10m
    labels:
      severity: medium
    annotations:
      summary: "P95 延迟过高"
      description: "P95 延迟: {{ $value | humanizeDuration }},超过阈值 2s"
  # P99 time-to-first-token at the gateway above 3s, sustained for 5 minutes.
  - alert: HighTTFT
    expr: |
      histogram_quantile(0.99,
        sum(rate(sub2api_gateway_ttft_seconds_bucket[5m])) by (le)
      ) > 3
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "首 Token 延迟过高"
      description: "TTFT P99: {{ $value | humanizeDuration }},超过阈值 3s"
# ===========================================================================
# Error-rate alerts
# ===========================================================================
- name: error_alerts
  rules:
  # Share of HTTP 5xx responses over the last 5 minutes exceeds 5%.
  - alert: HighErrorRate
    expr: |
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
        /
        sum(rate(sub2api_http_requests_total[5m]))
      ) > 0.05
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "错误率过高 (Critical)"
      description: "错误率: {{ $value | humanizePercentage }},超过 5%"
  # Share of HTTP 5xx responses over the last 5 minutes exceeds 1%.
  # This also holds whenever HighErrorRate holds (same ratio, lower bound).
  - alert: ElevatedErrorRate
    expr: |
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
        /
        sum(rate(sub2api_http_requests_total[5m]))
      ) > 0.01
    for: 5m
    labels:
      severity: medium
    annotations:
      summary: "错误率升高"
      description: "错误率: {{ $value | humanizePercentage }},超过 1%"
  # Upstream errors per gateway request over the last 5 minutes exceed 10%.
  - alert: UpstreamErrorRateHigh
    expr: |
      (
        sum(rate(sub2api_gateway_upstream_errors_total[5m]))
        /
        sum(rate(sub2api_gateway_requests_total[5m]))
      ) > 0.1
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "上游错误率过高"
      description: "上游错误率: {{ $value | humanizePercentage }}"
# ===========================================================================
# Infrastructure alerts
# ===========================================================================
- name: infrastructure_alerts
  rules:
  # Database connection pool more than 80% in use.
  # The two operands differ in their `state` label, so default one-to-one
  # vector matching would never pair them and the expression would always be
  # empty; `ignoring(state)` matches them on the remaining labels.
  - alert: DatabaseConnectionsHigh
    expr: |
      sub2api_db_connections{state="active"}
      / ignoring(state)
      sub2api_db_connections{state="max"}
      > 0.8
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "数据库连接池使用率过高"
      description: "数据库连接池使用率: {{ $value | humanizePercentage }}"
  # Redis memory usage above 90% of its configured limit.
  # Series whose limit is 0 are filtered out first: dividing by 0 yields
  # +Inf, which would otherwise satisfy the threshold permanently.
  # NOTE(review): presumably 0 means "no maxmemory configured" - confirm
  # against the exporter in use.
  - alert: RedisMemoryHigh
    expr: |
      redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.9
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Redis 内存使用率过高"
      description: "Redis 内存使用率: {{ $value | humanizePercentage }}"
  # Per-instance CPU usage above 80% (100 minus the average idle percentage
  # across that instance's series).
  # Uses rate() rather than irate(): irate() only looks at the last two
  # samples in the window, which makes an alert with a `for` clause flap.
  - alert: HighCPUUsage
    expr: |
      100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 10m
    labels:
      severity: medium
    annotations:
      summary: "CPU 使用率过高"
      description: "实例 {{ $labels.instance }} CPU 使用率: {{ $value }}%"
  # Memory in use (total minus available) above 85% of total.
  - alert: HighMemoryUsage
    expr: |
      (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.85
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "内存使用率过高"
      description: "实例 {{ $labels.instance }} 内存使用率: {{ $value | humanizePercentage }}"
  # Less than 15% free space left on the root filesystem.
  - alert: DiskSpaceLow
    expr: |
      (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "磁盘空间不足"
      description: "实例 {{ $labels.instance }} 磁盘剩余空间: {{ $value | humanizePercentage }}"
# ===========================================================================
# Business alerts
# ===========================================================================
- name: business_alerts
  rules:
  # Total request rate below 1 req/s for 10 minutes.
  # `or vector(0)` substitutes 0 when the metric has no series at all;
  # without it the comparison yields an empty result in a total outage and
  # the alert would stay silent exactly when traffic has dropped to nothing.
  - alert: LowQPS
    expr: |
      (sum(rate(sub2api_http_requests_total[5m])) or vector(0)) < 1
    for: 10m
    labels:
      severity: medium
    annotations:
      summary: "请求量异常低"
      description: "当前 QPS: {{ $value }},可能存在流量异常"
  # More than 10 account switches per second, sustained for 5 minutes.
  - alert: AccountSwitchRateHigh
    expr: |
      sum(rate(sub2api_account_switches_total[5m])) > 10
    for: 5m
    labels:
      # Was `warning`; every other rule in this file uses
      # critical / high / medium, so this follows the same scheme.
      severity: medium
    annotations:
      summary: "账号切换频率过高"
      description: "账号切换率: {{ $value }}/s,可能存在上游不稳定"
  # The recorded last-success timestamp is more than 300s behind the
  # current evaluation time.
  # NOTE(review): the description reads a `job_name` label; confirm the
  # metric actually exposes it.
  - alert: JobHeartbeatStale
    expr: |
      time() - sub2api_job_heartbeat_last_success_timestamp > 300
    for: 2m
    labels:
      severity: high
    annotations:
      summary: "后台任务心跳超时"
      description: "任务 {{ $labels.job_name }} 超过5分钟未报告成功"