Files
tokens-reef/deploy/monitoring/prometheus/rules/sub2api-alerts-light.yml
Developer 349d783fd1 refactor: clean up project structure
- Remove old review reports (keep latest only)
- Move docs/ to deploy/docs-backup/
- Move performance-testing/ to deploy/
- Clean up test output files
- Organize root directory
2026-04-06 23:36:03 +08:00

463 lines
16 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Sub2API 单机版轻量告警规则
# 针对 2核4G 环境优化,避免告警风暴
groups:
# ==================== System resource alerts ====================
# Host-level CPU, memory and root-filesystem alerts, evaluated every 60s.
- name: system-alerts
  interval: 60s
  rules:
  # CPU busy percentage (100 minus the idle share) above 80% for 5 minutes.
  # rate() is used instead of irate(): irate() only considers the last two
  # samples in the range, so one quiet scrape can reset the `for` timer.
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
      category: infrastructure
    annotations:
      summary: "CPU 使用率过高"
      description: "实例 {{ $labels.instance }} CPU 使用率超过 80%,当前值: {{ $value }}%"
  # Used memory (total minus available) above 85% of total for 5 minutes.
  - alert: HighMemoryUsage
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
    for: 5m
    labels:
      severity: warning
      category: infrastructure
    annotations:
      summary: "内存使用率过高"
      description: "实例 {{ $labels.instance }} 内存使用率超过 85%,当前值: {{ $value }}%"
  # Less than 10% of the root filesystem is still available.
  - alert: DiskSpaceLow
    expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 10
    for: 5m
    labels:
      severity: critical
      category: infrastructure
    annotations:
      summary: "磁盘空间不足"
      description: "实例 {{ $labels.instance }} 磁盘剩余空间不足 10%,当前值: {{ $value }}%"
  # Linear extrapolation of the last 6h of available bytes drops below zero
  # within the next 24h (24*3600 seconds); must hold for 1h before firing.
  - alert: DiskWillFillIn24Hours
    expr: predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 24*3600) < 0
    for: 1h
    labels:
      severity: warning
      category: infrastructure
    annotations:
      summary: "磁盘预计24小时内写满"
      description: "实例 {{ $labels.instance }} 按当前趋势磁盘将在24小时内写满"
# ==================== Application health alerts ====================
# Availability, error-rate and latency alerts for the sub2api-app scrape job,
# evaluated every 30s.
- name: application-alerts
  interval: 30s
  rules:
  # The sub2api-app target has reported up == 0 for at least one minute.
  - alert: Sub2APIDown
    expr: up{job="sub2api-app"} == 0
    for: 1m
    labels:
      severity: critical
      category: availability
    annotations:
      summary: "Sub2API 服务不可用"
      description: "Sub2API 应用实例 {{ $labels.instance }} 已宕机超过 1 分钟"
  # Ratio of 5xx request rate to total request rate (5m window) above 5%.
  - alert: HighErrorRate
    expr: |-
      (
          sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[5m]))
        /
          sum(rate(sub2api_http_requests_total{job="sub2api-app"}[5m]))
      ) > 0.05
    for: 2m
    labels:
      severity: warning
      category: availability
    annotations:
      summary: "错误率过高"
      description: "API 错误率超过 5%,当前值: {{ $value | humanizePercentage }}"
  # 99th-percentile request duration, aggregated over all series by `le`,
  # above 5 seconds.
  - alert: HighLatencyP99
    expr: |-
      histogram_quantile(
        0.99,
        sum by (le) (rate(sub2api_http_request_duration_seconds_bucket{job="sub2api-app"}[5m]))
      ) > 5
    for: 3m
    labels:
      severity: warning
      category: performance
    annotations:
      summary: "P99 延迟过高"
      description: "API P99 延迟超过 5 秒,当前值: {{ $value }}s"
  # 95th-percentile request duration above 2 seconds (informational).
  - alert: HighLatencyP95
    expr: |-
      histogram_quantile(
        0.95,
        sum by (le) (rate(sub2api_http_request_duration_seconds_bucket{job="sub2api-app"}[5m]))
      ) > 2
    for: 5m
    labels:
      severity: info
      category: performance
    annotations:
      summary: "P95 延迟升高"
      description: "API P95 延迟超过 2 秒,当前值: {{ $value }}s"
# ==================== Database alerts ====================
# Connection-pool saturation, evaluated every 60s.
- name: database-alerts
  interval: 60s
  rules:
  # Active connections divided by the pool maximum, above 80%.
  # The two operands carry different `state` label values, so a plain `/`
  # finds no matching series and yields an empty result; `ignoring(state)`
  # makes the one-to-one match succeed on the remaining labels.
  - alert: DatabaseConnectionsHigh
    expr: |
      (
        sub2api_db_connections{state="active"}
        / ignoring(state)
        sub2api_db_connections{state="max"}
      ) > 0.8
    for: 5m
    labels:
      severity: warning
      category: database
    annotations:
      summary: "数据库连接池使用率过高"
      description: "数据库连接池使用率超过 80%,当前值: {{ $value | humanizePercentage }}"
# ==================== Business metric alerts ====================
# Upstream success ratio and rate-limit pressure, evaluated every 60s.
- name: business-alerts
  interval: 60s
  rules:
  # Share of upstream requests with status="success" (10m rate) below 95%.
  - alert: LowUpstreamSuccessRate
    expr: |-
      (
          sum(rate(sub2api_upstream_requests_total{status="success"}[10m]))
        /
          sum(rate(sub2api_upstream_requests_total[10m]))
      ) < 0.95
    for: 5m
    labels:
      severity: warning
      category: business
    annotations:
      summary: "上游服务成功率下降"
      description: "上游服务成功率低于 95%,当前值: {{ $value | humanizePercentage }}"
  # Any single rate-limit counter series increasing faster than 10 hits/s.
  - alert: HighRateLimitHits
    expr: rate(sub2api_rate_limit_hits_total[5m]) > 10
    for: 5m
    labels:
      severity: info
      category: business
    annotations:
      summary: "限流触发频繁"
      description: "限流触发频率过高,当前: {{ $value }}/s"
# ==================== Prometheus self-monitoring alerts ====================
# Health of the Prometheus server itself and of its scrape targets,
# evaluated every 60s.
- name: prometheus-alerts
  interval: 60s
  rules:
  # Any scrape target (all jobs) has been down for 5 minutes.
  # NOTE(review): an alert with the same name and expression is also defined
  # in the prometheus-self-monitoring group with severity critical; both fire
  # for the same target. Confirm whether the overlap is intended.
  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: "Prometheus 抓取目标丢失"
      description: "{{ $labels.job }} 实例 {{ $labels.instance }} 无法访问"
  # Resident memory of the prometheus job, converted from bytes to MiB,
  # above 100.
  - alert: PrometheusHighMemoryUsage
    expr: (process_resident_memory_bytes{job="prometheus"} / 1024 / 1024) > 100
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: "Prometheus 内存使用过高"
      description: "Prometheus 内存使用超过 100MB当前: {{ $value }}MB"
  # The TSDB reload failure counter is increasing.
  - alert: PrometheusTSDBReloadsFailing
    expr: rate(prometheus_tsdb_reloads_failures_total[5m]) > 0
    for: 5m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: "Prometheus TSDB 重载失败"
      description: "Prometheus 时序数据库重载失败,可能需要人工介入"
  # The rule evaluation failure counter is increasing.
  - alert: PrometheusRuleEvaluationFailures
    expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: "Prometheus 规则评估失败"
      description: "告警规则评估失败,请检查规则配置"
  # TSDB block storage, converted from bytes to GiB, above 1.8.
  - alert: PrometheusDiskSpaceLow
    expr: (prometheus_tsdb_storage_blocks_bytes / 1024 / 1024 / 1024) > 1.8
    for: 5m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: "Prometheus 磁盘空间不足"
      description: "Prometheus 数据占用超过 1.8GB,接近 2GB 限制"
# ==================== Certificate expiry alerts ====================
# Based on the application-exported expiry timestamp; evaluated hourly.
# The remaining lifetime is compared in seconds (N * 86400) rather than being
# divided down to days, because the descriptions pipe $value through
# humanizeDuration, which interprets its input as seconds.
- name: certificate-alerts
  interval: 3600s  # check once per hour
  rules:
  # Fewer than 7 days of validity remain.
  - alert: TLSCertificateExpiringSoon7Days
    expr: (sub2api_tls_certificate_expiry_timestamp - time()) < 7 * 86400
    for: 1h
    labels:
      severity: warning
      category: security
    annotations:
      summary: "TLS 证书即将过期 (7天内)"
      description: "{{ $labels.domain }} 证书将在 {{ $value | humanizeDuration }} 后过期"
  # Fewer than 3 days of validity remain.
  - alert: TLSCertificateExpiringSoon3Days
    expr: (sub2api_tls_certificate_expiry_timestamp - time()) < 3 * 86400
    for: 1h
    labels:
      severity: critical
      category: security
    annotations:
      summary: "TLS 证书即将过期 (3天内)"
      description: "{{ $labels.domain }} 证书将在 {{ $value | humanizeDuration }} 后过期,请立即续期"
  # The expiry timestamp is already in the past.
  - alert: TLSCertificateExpired
    expr: sub2api_tls_certificate_expiry_timestamp - time() < 0
    for: 1m
    labels:
      severity: critical
      category: security
    annotations:
      summary: "TLS 证书已过期"
      description: "{{ $labels.domain }} 证书已过期,服务可能无法正常访问"
# ==================== Backup job alerts ====================
# Staleness of backup heartbeats; evaluated hourly.
# NOTE(review): this group selects job_name="database_backup", while the
# backup-and-job-alerts group selects job_name="db-backup". Confirm which
# label value the backup script actually reports.
- name: backup-alerts
  interval: 3600s  # check once per hour
  rules:
  # Last successful database backup is older than 90000s (25 hours).
  - alert: DatabaseBackupFailed
    expr: time() - sub2api_job_heartbeat_last_success_timestamp{job_name="database_backup"} > 90000
    for: 1h
    labels:
      severity: warning
      category: backup
    annotations:
      summary: "数据库备份失败"
      description: "数据库备份已超过 25 小时未成功执行"
  # The database backup heartbeat series does not exist at all.
  - alert: DatabaseBackupMissing
    expr: absent(sub2api_job_heartbeat_last_success_timestamp{job_name="database_backup"})
    for: 1h
    labels:
      severity: info
      category: backup
    annotations:
      summary: "数据库备份监控未配置"
      description: "未检测到数据库备份心跳指标,请检查备份脚本是否上报"
  # Last successful Grafana config backup is older than 172800s (48 hours).
  - alert: GrafanaConfigBackupFailed
    expr: time() - sub2api_job_heartbeat_last_success_timestamp{job_name="grafana_backup"} > 172800
    for: 1h
    labels:
      severity: info
      category: backup
    annotations:
      summary: "Grafana 配置备份失败"
      description: "Grafana 配置备份已超过 48 小时未成功执行"
# ==================== Basic SLO alerts (simplified) ====================
# Availability SLO: 99.9% over a 30-day window, i.e. an error budget of 0.001.
# Each threshold is burn_rate * error_budget. The burn rate must be applied to
# the raw budget (0.001), not to an already-multiplied value such as 0.0144.
- name: slo-alerts
  interval: 60s
  rules:
  # Fast burn: 2% of the 30-day error budget consumed within 1 hour,
  # which corresponds to a burn rate of 14.4 (threshold 0.0144).
  - alert: SLOErrorBudgetBurnRateFast
    expr: |
      (
        sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[1h]))
        /
        sum(rate(sub2api_http_requests_total{job="sub2api-app"}[1h]))
      ) > 14.4 * 0.001
    for: 5m
    labels:
      severity: critical
      category: slo
      slo: availability
    annotations:
      summary: "SLO 错误预算快速燃烧"
      description: "可用性错误预算正在快速消耗可能在1小时内耗尽"
  # Slow burn: 5% of the 30-day error budget consumed within 6 hours,
  # which corresponds to a burn rate of 6 (threshold 0.006).
  - alert: SLOErrorBudgetBurnRateSlow
    expr: |
      (
        sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[6h]))
        /
        sum(rate(sub2api_http_requests_total{job="sub2api-app"}[6h]))
      ) > 6 * 0.001
    for: 30m
    labels:
      severity: warning
      category: slo
      slo: availability
    annotations:
      summary: "SLO 错误预算慢速燃烧"
      description: "可用性错误预算正在持续消耗可能在6小时内耗尽"
# ==================== Prometheus self-monitoring ====================
# Scrape health, TSDB/WAL integrity, rule evaluation and storage usage,
# evaluated every 60s.
- name: prometheus-self-monitoring
  interval: 60s
  rules:
  # A scrape target (any job) has been down for 3 minutes.
  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 3m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: "监控采集目标下线"
      description: "目标 {{ $labels.job }}/{{ $labels.instance }} 已无法采集超过 3 分钟"
  # A scrape takes longer than 10 seconds (may affect data precision).
  - alert: PrometheusScrapeDurationHigh
    expr: scrape_duration_seconds > 10
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: "Prometheus 采集耗时过长"
      description: "目标 {{ $labels.job }} 采集耗时 {{ $value }}s超过 10s"
  # At least one TSDB reload failure in the past hour; fires immediately.
  - alert: PrometheusTSDBReloadFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[1h]) > 0
    for: 0m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: "Prometheus TSDB 重载失败"
      description: "过去 1 小时内 TSDB 重载出现失败,可能影响数据持久化"
  # The WAL corruption counter increased in the last 5 minutes; fires
  # immediately.
  - alert: PrometheusWALCorruption
    expr: increase(prometheus_tsdb_wal_corruptions_total[5m]) > 0
    for: 0m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: "Prometheus WAL 损坏"
      description: "Prometheus WAL 出现损坏,请立即检查数据完整性"
  # Rule evaluation failures keep occurring for 5 minutes.
  - alert: PrometheusRuleEvaluationFailing
    expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: "告警规则评估失败"
      description: "Prometheus 有告警规则评估失败,可能导致漏报"
  # TSDB block storage, converted from bytes to GiB, above 1.8 for 10 minutes.
  - alert: PrometheusStorageFull
    expr: (prometheus_tsdb_storage_blocks_bytes / 1024 / 1024 / 1024) > 1.8
    for: 10m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: "Prometheus 存储接近上限"
      description: "Prometheus 存储已用 {{ $value | humanize }}GB接近 2GB 上限,请及时清理"
# ==================== Certificate expiry alerts (probe-based) ==============
# Based on probe_ssl_earliest_cert_expiry; evaluated every 5 minutes.
# The group is named certificate-probe-alerts because this file already
# contains a group called certificate-alerts, and Prometheus refuses to load a
# rule file in which a group name is repeated.
# The remaining lifetime is compared in seconds (N * 86400) rather than being
# divided down to days, because the descriptions pipe $value through
# humanizeDuration, which interprets its input as seconds.
- name: certificate-probe-alerts
  interval: 300s  # every 5 minutes; high-frequency checks are unnecessary
  rules:
  # Certificate expires within 7 days.
  - alert: TLSCertificateExpiringSoon
    expr: (probe_ssl_earliest_cert_expiry - time()) < 7 * 86400
    for: 1h
    labels:
      severity: warning
      category: security
    annotations:
      summary: "TLS 证书即将过期"
      description: "{{ $labels.instance }} 的证书将在 {{ $value | humanizeDuration }} 后过期,请及时续期"
  # Certificate expires within 3 days: escalated to critical, fires
  # immediately.
  - alert: TLSCertificateExpiringCritical
    expr: (probe_ssl_earliest_cert_expiry - time()) < 3 * 86400
    for: 0m
    labels:
      severity: critical
      category: security
    annotations:
      summary: "TLS 证书紧急过期警告"
      description: "{{ $labels.instance }} 的证书将在 {{ $value | humanizeDuration }} 后过期!"
  # Certificate has already expired.
  - alert: TLSCertificateExpired
    expr: probe_ssl_earliest_cert_expiry - time() <= 0
    for: 0m
    labels:
      severity: critical
      category: security
    annotations:
      summary: "TLS 证书已过期!"
      description: "{{ $labels.instance }} 的 TLS 证书已过期,请立即续期"
# ==================== Backup and scheduled-job monitoring ====================
# Heartbeat staleness for the db-backup and ops-collector jobs, evaluated
# every 5 minutes.
- name: backup-and-job-alerts
  interval: 300s
  rules:
  # No successful database backup for more than 90000s
  # (25 hours: a daily schedule plus 1 hour of tolerance).
  - alert: DatabaseBackupMissing
    expr: |-
      time() - sub2api_job_heartbeat_last_success_timestamp{job_name="db-backup"}
        > 90000
    for: 0m
    labels:
      severity: warning
      category: backup
    annotations:
      summary: "数据库备份超时未完成"
      description: "距上次成功备份已超过 25 小时,请检查备份任务"
  # No successful database backup for more than 176400s (49 hours): critical.
  - alert: DatabaseBackupCriticallyMissing
    expr: |-
      time() - sub2api_job_heartbeat_last_success_timestamp{job_name="db-backup"}
        > 176400
    for: 0m
    labels:
      severity: critical
      category: backup
    annotations:
      summary: "数据库备份连续 2 天未完成!"
      description: "距上次成功备份已超过 49 小时,存在数据丢失风险"
  # The OpsMetricsCollector heartbeat is older than 180s (3 minutes).
  - alert: OpsCollectorJobStale
    expr: |-
      time() - sub2api_job_heartbeat_last_success_timestamp{job_name="ops-collector"}
        > 180
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: "OpsMetricsCollector 心跳超时"
      description: "OpsMetricsCollector 定时任务已超过 3 分钟未上报心跳,可能已停止"