Files
tokens-reef/deploy/monitoring/prometheus/rules/sub2api-alerts.yml
Developer 349d783fd1 refactor: clean up project structure
- Remove old review reports (keep latest only)
- Move docs/ to deploy/docs-backup/
- Move performance-testing/ to deploy/
- Clean up test output files
- Organize root directory
2026-04-06 23:36:03 +08:00

238 lines
8.1 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# =============================================================================
# Sub2API alerting rules
# =============================================================================
# Each entry under `groups` is a named rule group holding a list of `alert`
# rules (expression, pending duration, labels, annotations).
groups:
# ===========================================================================
# SLO alerts - based on error-budget burn rate
# ===========================================================================
# Every threshold below derives from an error budget of 0.0005
# (availability target 99.95%, as stated in SLOViolationImminent).
- name: slo_alerts
  interval: 30s
  rules:
  # Fast burn (critical): a burn rate of 14.4 consumes 2% of a 30-day budget
  # in 1 hour. The 5m window is the left operand so $value is the 5m error
  # ratio; the 1h window must also exceed the threshold so that a short spike
  # does not page before a meaningful share of the budget has been spent.
  - alert: ErrorBudgetBurnRateCritical
    expr: |
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
        /
        sum(rate(sub2api_http_requests_total[5m]))
      ) > 0.0005 * 14.4
      and
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[1h]))
        /
        sum(rate(sub2api_http_requests_total[1h]))
      ) > 0.0005 * 14.4
    for: 2m
    labels:
      severity: critical
      slo: api-availability
      team: sre
    annotations:
      summary: "错误预算快速燃烧 (Critical)"
      # $value is the raw 5m error ratio, not a fraction of the budget.
      description: "过去5分钟错误率为 {{ $value | humanizePercentage }},燃烧速度超过预算的 14.4 倍,预计1小时内耗尽2%错误预算"
      runbook_url: "https://wiki.internal/runbooks/error-budget-burn"
      # NOTE(review): localhost link only resolves on the Grafana host itself;
      # confirm the externally reachable dashboard URL.
      dashboard: "http://localhost:3000/d/slo-dashboard"
  # Slow burn (warning): a burn rate of 6 consumes 5% of a 30-day budget in
  # 6 hours. Same two-window structure: 30m short window (left operand, so
  # $value is the 30m error ratio) confirmed by the 6h long window.
  - alert: ErrorBudgetBurnRateWarning
    expr: |
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[30m]))
        /
        sum(rate(sub2api_http_requests_total[30m]))
      ) > 0.0005 * 6
      and
      (
        sum(rate(sub2api_http_requests_total{status=~"5.."}[6h]))
        /
        sum(rate(sub2api_http_requests_total[6h]))
      ) > 0.0005 * 6
    for: 5m
    labels:
      severity: high
      slo: api-availability
      team: sre
    annotations:
      summary: "错误预算燃烧加速 (Warning)"
      # $value is the raw 30m error ratio, not a fraction of the budget.
      description: "过去30分钟错误率为 {{ $value | humanizePercentage }},燃烧速度超过预算的 6 倍,预计6小时内耗尽5%错误预算"
      runbook_url: "https://wiki.internal/runbooks/error-budget-burn"
  # SLO about to be violated: availability (1 - 5xx ratio) over the last 24h
  # has dropped below the 99.95% target.
  - alert: SLOViolationImminent
    expr: |
      (
        1 - (
          sum(rate(sub2api_http_requests_total{status=~"5.."}[24h]))
          /
          sum(rate(sub2api_http_requests_total[24h]))
        )
      ) < 0.9995
    for: 5m
    labels:
      severity: high
      slo: api-availability
    annotations:
      summary: "SLO 即将违约"
      description: "过去24小时可用性为 {{ $value | humanizePercentage }},低于目标 99.95%"
# ===========================================================================
# Latency alerts
# ===========================================================================
# All three rules take a quantile over histogram buckets, summed across every
# series and grouped only by `le`, using a 5m rate window.
- name: latency_alerts
  rules:
  # 99th-percentile HTTP request duration above 5 seconds for 5 minutes.
  - alert: HighLatencyP99
    expr: |
      histogram_quantile(0.99,
        sum(rate(sub2api_http_request_duration_seconds_bucket[5m])) by (le)
      ) > 5
    for: 5m
    labels:
      severity: high
    annotations:
      summary: 'P99 延迟过高'
      description: 'P99 延迟: {{ $value }}s超过阈值 5s'
  # 95th-percentile HTTP request duration above 2 seconds for 10 minutes.
  - alert: HighLatencyP95
    expr: |
      histogram_quantile(0.95,
        sum(rate(sub2api_http_request_duration_seconds_bucket[5m])) by (le)
      ) > 2
    for: 10m
    labels:
      severity: medium
    annotations:
      summary: 'P95 延迟过高'
      description: 'P95 延迟: {{ $value }}s超过阈值 2s'
  # 99th-percentile gateway TTFT (first-token latency, per the summary text)
  # above 3 seconds for 5 minutes.
  - alert: HighTTFT
    expr: |
      histogram_quantile(0.99,
        sum(rate(sub2api_gateway_ttft_seconds_bucket[5m])) by (le)
      ) > 3
    for: 5m
    labels:
      severity: high
    annotations:
      summary: '首 Token 延迟过高'
      description: 'TTFT P99: {{ $value }}s超过阈值 3s'
# ===========================================================================
# Error-rate alerts
# ===========================================================================
- name: error_alerts
  rules:
  # Share of 5xx responses among all HTTP requests (5m rate) above 5%,
  # sustained for 2 minutes.
  - alert: HighErrorRate
    expr: |
      sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
      /
      sum(rate(sub2api_http_requests_total[5m])) > 0.05
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: '错误率过高 (Critical)'
      description: '错误率: {{ $value | humanizePercentage }},超过 5%'
  # Same ratio with a lower 1% threshold and a longer 5-minute pending period.
  # Both rules are active above 5%, since this threshold is a subset of that one.
  - alert: ElevatedErrorRate
    expr: |
      sum(rate(sub2api_http_requests_total{status=~"5.."}[5m]))
      /
      sum(rate(sub2api_http_requests_total[5m])) > 0.01
    for: 5m
    labels:
      severity: medium
    annotations:
      summary: '错误率升高'
      description: '错误率: {{ $value | humanizePercentage }},超过 1%'
  # Gateway upstream errors divided by gateway requests (5m rate) above 10%.
  - alert: UpstreamErrorRateHigh
    expr: |
      sum(rate(sub2api_gateway_upstream_errors_total[5m]))
      /
      sum(rate(sub2api_gateway_requests_total[5m])) > 0.1
    for: 5m
    labels:
      severity: high
    annotations:
      summary: '上游错误率过高'
      description: '上游错误率: {{ $value | humanizePercentage }}'
# ===========================================================================
# Infrastructure alerts
# ===========================================================================
- name: infrastructure_alerts
  rules:
  # Active DB connections as a share of the pool maximum, above 80%.
  # The two operands carry different `state` label values, so the division
  # must ignore `state`; with default one-to-one matching on all labels no
  # pair would ever match and the alert could never fire.
  - alert: DatabaseConnectionsHigh
    expr: |
      sub2api_db_connections{state="active"}
        / ignoring(state)
      sub2api_db_connections{state="max"} > 0.8
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "数据库连接池使用率过高"
      description: "数据库连接池使用率: {{ $value | humanizePercentage }}"
  # Redis used memory above 90% of the configured maximum.
  # The `> 0` guard drops instances whose max is zero: dividing by zero yields
  # +Inf, which would satisfy `> 0.9` and fire permanently.
  # NOTE(review): presumably max is 0 when no memory limit is configured -
  # confirm against the exporter in use.
  - alert: RedisMemoryHigh
    expr: |
      redis_memory_used_bytes / redis_memory_max_bytes > 0.9
      and
      redis_memory_max_bytes > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Redis 内存使用率过高"
      description: "Redis 内存使用率: {{ $value | humanizePercentage }}"
  # Per-instance CPU usage (100 minus average idle percentage) above 80%.
  # Uses rate() rather than irate(): irate() only looks at the last two
  # samples in the window, which makes an alert with `for: 10m` flap and
  # reset on momentary dips.
  - alert: HighCPUUsage
    expr: |
      100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 10m
    labels:
      severity: medium
    annotations:
      summary: "CPU 使用率过高"
      description: "实例 {{ $labels.instance }} CPU 使用率: {{ $value }}%"
  # Used memory (total minus available) above 85% of total.
  - alert: HighMemoryUsage
    expr: |
      (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.85
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "内存使用率过高"
      description: "实例 {{ $labels.instance }} 内存使用率: {{ $value | humanizePercentage }}"
  # Less than 15% of the root filesystem ("/") still available.
  - alert: DiskSpaceLow
    expr: |
      (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
    for: 5m
    labels:
      severity: high
    annotations:
      summary: "磁盘空间不足"
      description: "实例 {{ $labels.instance }} 磁盘剩余空间: {{ $value | humanizePercentage }}"
# ===========================================================================
# Business alerts
# ===========================================================================
- name: business_alerts
  rules:
  # Total request rate below 1 req/s for 10 minutes.
  # `or vector(0)` substitutes 0 when the metric has no series at all;
  # without it sum() returns an empty vector, the comparison yields nothing,
  # and the alert stays silent exactly when no traffic is being recorded.
  - alert: LowQPS
    expr: |
      (
        sum(rate(sub2api_http_requests_total[5m]))
        or
        vector(0)
      ) < 1
    for: 10m
    labels:
      severity: medium
    annotations:
      summary: "请求量异常低"
      description: "当前 QPS: {{ $value }},可能存在流量异常"
  # More than 10 account switches per second (5m rate) for 5 minutes.
  - alert: AccountSwitchRateHigh
    expr: |
      sum(rate(sub2api_account_switches_total[5m])) > 10
    for: 5m
    labels:
      # NOTE(review): every other rule in this file uses critical/high/medium;
      # confirm that alert routing handles `warning`, or align this value.
      severity: warning
    annotations:
      summary: "账号切换频率过高"
      description: "账号切换率: {{ $value }}/s可能存在上游不稳定"
  # Last successful job heartbeat is more than 300 seconds old.
  - alert: JobHeartbeatStale
    expr: |
      time() - sub2api_job_heartbeat_last_success_timestamp > 300
    for: 2m
    labels:
      severity: high
    annotations:
      summary: "后台任务心跳超时"
      # NOTE(review): assumes the heartbeat metric carries a `job_name` label;
      # otherwise this renders empty - confirm against the exporter.
      description: "任务 {{ $labels.job_name }} 超过5分钟未报告成功"