# =============================================================================
# Sub2API alerting rules
# =============================================================================

groups:
# ===========================================================================
# SLO alerts - based on error-budget burn rate
#
# The availability target is 99.95%, so the error budget is 0.0005. Each
# burn-rate rule compares the observed 5xx ratio against that budget
# multiplied by a burn-rate factor.
# ===========================================================================
- name: slo_alerts
  # Rules in this group are evaluated every 30 seconds.
  interval: 30s
  rules:
  # Fast error-budget burn (Critical) - 2% of the budget exhausted in 1 hour.
  # Fires when the 5-minute 5xx ratio exceeds 14.4x the budget for 2 minutes.
  # NOTE(review): $value in the description is the raw error ratio, not a
  # fraction of the budget - confirm the wording is what on-call expects.
  # NOTE(review): the dashboard link points at localhost - confirm it is
  # reachable for whoever receives the notification.
  - alert: ErrorBudgetBurnRateCritical
    expr: |
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
        /
        sum(rate(sub2api_http_requests_total[5m]))
      ) > 0.0005 * 14.4
    for: 2m
    labels:
      severity: critical
      slo: api-availability
      team: sre
    annotations:
      summary: "错误预算快速燃烧 (Critical)"
      description: "过去5分钟错误率是30天预算的 {{ $value | humanizePercentage }},预计1小时内耗尽2%错误预算"
      runbook_url: "https://wiki.internal/runbooks/error-budget-burn"
      dashboard: "http://localhost:3000/d/slo-dashboard"

  # Accelerated error-budget burn (Warning) - 5% of the budget exhausted in
  # 6 hours. Fires when the 30-minute 5xx ratio exceeds 6x the budget for
  # 5 minutes.
  - alert: ErrorBudgetBurnRateWarning
    expr: |
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[30m]))
        /
        sum(rate(sub2api_http_requests_total[30m]))
      ) > 0.0005 * 6
    for: 5m
    labels:
      severity: high
      slo: api-availability
      team: sre
    annotations:
      summary: "错误预算燃烧加速 (Warning)"
      description: "过去30分钟错误率是30天预算的 {{ $value | humanizePercentage }}"
      runbook_url: "https://wiki.internal/runbooks/error-budget-burn"

  # SLO violation imminent: availability over the last 24 hours
  # (1 - 5xx ratio) has dropped below the 99.95% target.
  - alert: SLOViolationImminent
    expr: |
      (
        1 - (
          sum(rate(sub2api_http_requests_total{status=~"5.."}[24h]))
          /
          sum(rate(sub2api_http_requests_total[24h]))
        )
      ) < 0.9995
    for: 5m
    labels:
      severity: high
      slo: api-availability
      # Same team label as the other SLO rules in this group, so all three
      # carry an identical label set for routing.
      team: sre
    annotations:
      summary: "SLO 即将违约"
      description: "过去24小时可用性为 {{ $value | humanizePercentage }},低于目标 99.95%"

# ===========================================================================
# Latency alerts
#
# Each rule derives a quantile from histogram buckets aggregated across all
# series (only the `le` label is kept) over a 5-minute rate window.
# ===========================================================================
- name: latency_alerts
  rules:
  # HTTP request duration P99 above 5 seconds, sustained for 5 minutes.
  - alert: HighLatencyP99
    expr: |
      histogram_quantile(
        0.99,
        sum by (le) (rate(sub2api_http_request_duration_seconds_bucket[5m]))
      ) > 5
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "P99 延迟过高"
      description: "P99 延迟: {{ $value }}s,超过阈值 5s"

  # HTTP request duration P95 above 2 seconds, sustained for 10 minutes.
  - alert: HighLatencyP95
    expr: |
      histogram_quantile(
        0.95,
        sum by (le) (rate(sub2api_http_request_duration_seconds_bucket[5m]))
      ) > 2
    for: 10m
    labels:
      severity: medium
    annotations:
      summary: "P95 延迟过高"
      description: "P95 延迟: {{ $value }}s,超过阈值 2s"

  # Gateway time-to-first-token P99 above 3 seconds, sustained for 5 minutes.
  - alert: HighTTFT
    expr: |
      histogram_quantile(
        0.99,
        sum by (le) (rate(sub2api_gateway_ttft_seconds_bucket[5m]))
      ) > 3
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "首 Token 延迟过高"
      description: "TTFT P99: {{ $value }}s,超过阈值 3s"

# ===========================================================================
# Error-rate alerts
#
# Every rule is a ratio of two 5-minute rates summed across all series.
# ===========================================================================
- name: error_alerts
  rules:
  # Share of 5xx responses above 5% for 2 minutes.
  - alert: HighErrorRate
    expr: |
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
        /
        sum(rate(sub2api_http_requests_total[5m]))
      ) > 0.05
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "错误率过高 (Critical)"
      description: "错误率: {{ $value | humanizePercentage }},超过 5%"

  # Share of 5xx responses above 1% for 5 minutes.
  - alert: ElevatedErrorRate
    expr: |
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
        /
        sum(rate(sub2api_http_requests_total[5m]))
      ) > 0.01
    for: 5m
    labels:
      severity: medium
    annotations:
      summary: "错误率升高"
      description: "错误率: {{ $value | humanizePercentage }},超过 1%"

  # Upstream errors relative to gateway requests above 10% for 5 minutes.
  - alert: UpstreamErrorRateHigh
    expr: |
      (
        sum(rate(sub2api_gateway_upstream_errors_total[5m]))
        /
        sum(rate(sub2api_gateway_requests_total[5m]))
      ) > 0.1
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "上游错误率过高"
      description: "上游错误率: {{ $value | humanizePercentage }}"

# ===========================================================================
# Infrastructure alerts
# ===========================================================================
- name: infrastructure_alerts
  rules:
  # Active connections above 80% of the pool maximum for 5 minutes.
  # The two operands differ in their `state` label, so the division must
  # ignore that label; with default one-to-one matching no series pair up
  # and the expression never returns a result.
  - alert: DatabaseConnectionsHigh
    expr: |
      sub2api_db_connections{state="active"}
        / ignoring(state)
      sub2api_db_connections{state="max"} > 0.8
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "数据库连接池使用率过高"
      description: "数据库连接池使用率: {{ $value | humanizePercentage }}"

  # Redis memory above 90% of its configured maximum for 5 minutes.
  # The second operand filters out series whose maximum is 0: dividing by
  # zero yields +Inf, which would keep the alert firing permanently.
  # NOTE(review): presumably a maximum of 0 means "no limit configured" -
  # verify against the exporter in use.
  - alert: RedisMemoryHigh
    expr: |
      (redis_memory_used_bytes / redis_memory_max_bytes > 0.9)
      and
      (redis_memory_max_bytes > 0)
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Redis 内存使用率过高"
      description: "Redis 内存使用率: {{ $value | humanizePercentage }}"

  # Per-instance CPU usage (100 minus the idle percentage) above 80% for
  # 10 minutes. rate() is used rather than irate(): irate() looks only at
  # the last two samples, so a brief dip can reset the `for` timer.
  - alert: HighCPUUsage
    expr: |
      100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 10m
    labels:
      severity: medium
    annotations:
      summary: "CPU 使用率过高"
      description: "实例 {{ $labels.instance }} CPU 使用率: {{ $value }}%"

  # Used memory (total minus available) above 85% of total for 5 minutes.
  - alert: HighMemoryUsage
    expr: |
      (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.85
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "内存使用率过高"
      description: "实例 {{ $labels.instance }} 内存使用率: {{ $value | humanizePercentage }}"

  # Less than 15% of the root filesystem available for 5 minutes.
  - alert: DiskSpaceLow
    expr: |
      (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "磁盘空间不足"
      description: "实例 {{ $labels.instance }} 磁盘剩余空间: {{ $value | humanizePercentage }}"

# ===========================================================================
# Business alerts
# ===========================================================================
- name: business_alerts
  rules:
  # Total request rate below 1 request per second for 10 minutes.
  - alert: LowQPS
    expr: |
      sum(rate(sub2api_http_requests_total[5m])) < 1
    for: 10m
    labels:
      severity: medium
    annotations:
      summary: "请求量异常低"
      description: "当前 QPS: {{ $value }},可能存在流量异常"

  # More than 10 account switches per second for 5 minutes.
  # NOTE(review): `warning` is the only use of that severity value in this
  # file (the other rules use critical / high / medium) - confirm that
  # alert routing handles it.
  - alert: AccountSwitchRateHigh
    expr: |
      sum(rate(sub2api_account_switches_total[5m])) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "账号切换频率过高"
      description: "账号切换率: {{ $value }}/s,可能存在上游不稳定"

  # Last successful heartbeat timestamp is more than 300 seconds old,
  # sustained for 2 minutes.
  - alert: JobHeartbeatStale
    expr: |
      time() - sub2api_job_heartbeat_last_success_timestamp > 300
    for: 2m
    labels:
      severity: high
    annotations:
      summary: "后台任务心跳超时"
      description: "任务 {{ $labels.job_name }} 超过5分钟未报告成功"