Files
tokens-reef/deploy/monitoring/prometheus/rules/sub2api-alerts-light.yml

463 lines
16 KiB
YAML
Raw Permalink Normal View History

# Sub2API lightweight alerting rules for a single-node deployment.
# Tuned for a 2-core / 4 GB host; written to avoid alert storms.
groups:
# ==================== System resource alerts ====================
- name: system-alerts
  interval: 60s
  rules:
  # CPU busy percentage per instance, derived from the idle-mode counter.
  # rate() is used instead of irate(): irate() only considers the last two
  # samples in the range, which makes an alert flap on short spikes, whereas
  # rate() averages over the whole 5m window.
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
      category: infrastructure
    annotations:
      summary: 'CPU 使用率过高'
      description: '实例 {{ $labels.instance }} CPU 使用率超过 80%,当前值: {{ $value }}%'
  # Used memory (total minus available) as a percentage of total, above 85%.
  - alert: HighMemoryUsage
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
    for: 5m
    labels:
      severity: warning
      category: infrastructure
    annotations:
      summary: '内存使用率过高'
      description: '实例 {{ $labels.instance }} 内存使用率超过 85%,当前值: {{ $value }}%'
  # Less than 10% of the root filesystem is still available.
  - alert: DiskSpaceLow
    expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 10
    for: 5m
    labels:
      severity: critical
      category: infrastructure
    annotations:
      summary: '磁盘空间不足'
      description: '实例 {{ $labels.instance }} 磁盘剩余空间不足 10%,当前值: {{ $value }}%'
  # Linear extrapolation of the last 6h of root-filesystem free space
  # reaches zero within the next 24h (24*3600 seconds).
  - alert: DiskWillFillIn24Hours
    expr: predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 24*3600) < 0
    for: 1h
    labels:
      severity: warning
      category: infrastructure
    annotations:
      summary: '磁盘预计24小时内写满'
      description: '实例 {{ $labels.instance }} 按当前趋势磁盘将在24小时内写满'
# ==================== Application health alerts ====================
- name: application-alerts
  interval: 30s
  rules:
  # The sub2api-app scrape target has reported up == 0 for one minute.
  - alert: Sub2APIDown
    expr: up{job="sub2api-app"} == 0
    for: 1m
    labels:
      severity: critical
      category: availability
    annotations:
      summary: 'Sub2API 服务不可用'
      description: 'Sub2API 应用实例 {{ $labels.instance }} 已宕机超过 1 分钟'
  # Share of 5xx responses among all requests (5m rate) is above 5%.
  - alert: HighErrorRate
    expr: |
      (
        sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[5m]))
        /
        sum(rate(sub2api_http_requests_total{job="sub2api-app"}[5m]))
      ) > 0.05
    for: 2m
    labels:
      severity: warning
      category: availability
    annotations:
      summary: '错误率过高'
      description: 'API 错误率超过 5%,当前值: {{ $value | humanizePercentage }}'
  # 99th-percentile request duration (from histogram buckets) above 5 seconds.
  - alert: HighLatencyP99
    expr: histogram_quantile(0.99, sum(rate(sub2api_http_request_duration_seconds_bucket{job="sub2api-app"}[5m])) by (le)) > 5
    for: 3m
    labels:
      severity: warning
      category: performance
    annotations:
      summary: 'P99 延迟过高'
      description: 'API P99 延迟超过 5 秒,当前值: {{ $value }}s'
  # 95th-percentile request duration above 2 seconds; informational only.
  - alert: HighLatencyP95
    expr: histogram_quantile(0.95, sum(rate(sub2api_http_request_duration_seconds_bucket{job="sub2api-app"}[5m])) by (le)) > 2
    for: 5m
    labels:
      severity: info
      category: performance
    annotations:
      summary: 'P95 延迟升高'
      description: 'API P95 延迟超过 2 秒,当前值: {{ $value }}s'
# ==================== Database alerts ====================
- name: database-alerts
  interval: 60s
  rules:
  # Active connections as a fraction of the pool maximum, above 80%.
  # The two operands differ in their `state` label, so a plain `/` finds no
  # matching series pairs and the alert could never fire. `ignoring(state)`
  # makes the match use the remaining labels only.
  # NOTE(review): assumes the "active" and "max" series share every other
  # label - confirm against the exporter.
  - alert: DatabaseConnectionsHigh
    expr: |
      (
        sub2api_db_connections{state="active"}
        / ignoring(state)
        sub2api_db_connections{state="max"}
      ) > 0.8
    for: 5m
    labels:
      severity: warning
      category: database
    annotations:
      summary: '数据库连接池使用率过高'
      description: '数据库连接池使用率超过 80%,当前值: {{ $value | humanizePercentage }}'
# ==================== Business metric alerts ====================
- name: business-alerts
  interval: 60s
  rules:
  # Fraction of upstream requests with status="success" (10m rate) below 95%.
  - alert: LowUpstreamSuccessRate
    expr: |
      (
        sum(rate(sub2api_upstream_requests_total{status="success"}[10m]))
        /
        sum(rate(sub2api_upstream_requests_total[10m]))
      ) < 0.95
    for: 5m
    labels:
      severity: warning
      category: business
    annotations:
      summary: '上游服务成功率下降'
      description: '上游服务成功率低于 95%,当前值: {{ $value | humanizePercentage }}'
  # Per-series rate of rate-limit hits above 10 per second; informational.
  - alert: HighRateLimitHits
    expr: rate(sub2api_rate_limit_hits_total[5m]) > 10
    for: 5m
    labels:
      severity: info
      category: business
    annotations:
      summary: '限流触发频繁'
      description: '限流触发频率过高,当前: {{ $value }}/s'
# ==================== Prometheus self-monitoring alerts ====================
# NOTE(review): the group `prometheus-self-monitoring` in this file defines
# rules that overlap with several of the ones below (target down, TSDB reload
# failures, rule evaluation failures, storage size); consider consolidating.
- name: prometheus-alerts
  interval: 60s
  rules:
  # Any scrape target has reported up == 0 for 5 minutes.
  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: 'Prometheus 抓取目标丢失'
      description: '{{ $labels.job }} 实例 {{ $labels.instance }} 无法访问'
  # Resident memory of the prometheus job, converted from bytes to MiB,
  # exceeds 100.
  - alert: PrometheusHighMemoryUsage
    expr: |
      (
        process_resident_memory_bytes{job="prometheus"} / 1024 / 1024
      ) > 100
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: 'Prometheus 内存使用过高'
      description: 'Prometheus 内存使用超过 100MB当前: {{ $value }}MB'
  # The TSDB reload failure counter increased during the last 5 minutes.
  - alert: PrometheusTSDBReloadsFailing
    expr: rate(prometheus_tsdb_reloads_failures_total[5m]) > 0
    for: 5m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: 'Prometheus TSDB 重载失败'
      description: 'Prometheus 时序数据库重载失败,可能需要人工介入'
  # The rule evaluation failure counter increased during the last 5 minutes.
  - alert: PrometheusRuleEvaluationFailures
    expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: 'Prometheus 规则评估失败'
      description: '告警规则评估失败,请检查规则配置'
  # Size of persisted TSDB blocks, converted from bytes to GiB, exceeds 1.8.
  - alert: PrometheusDiskSpaceLow
    expr: |
      (
        prometheus_tsdb_storage_blocks_bytes / 1024 / 1024 / 1024
      ) > 1.8
    for: 5m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: 'Prometheus 磁盘空间不足'
      description: 'Prometheus 数据占用超过 1.8GB,接近 2GB 限制'
# ==================== Certificate expiry alerts ====================
# The remaining lifetime is compared in SECONDS (N * 86400) rather than being
# divided down to days: `humanizeDuration` in the descriptions interprets
# $value as seconds, so a value expressed in days was rendered as a few
# seconds instead of a few days.
- name: certificate-alerts
  interval: 3600s  # evaluated once per hour
  rules:
  # Certificate expires in less than 7 days.
  - alert: TLSCertificateExpiringSoon7Days
    expr: |
      (
        sub2api_tls_certificate_expiry_timestamp - time()
      ) < 7 * 86400
    for: 1h
    labels:
      severity: warning
      category: security
    annotations:
      summary: 'TLS 证书即将过期 (7天内)'
      description: '{{ $labels.domain }} 证书将在 {{ $value | humanizeDuration }} 后过期'
  # Certificate expires in less than 3 days; escalated to critical.
  - alert: TLSCertificateExpiringSoon3Days
    expr: |
      (
        sub2api_tls_certificate_expiry_timestamp - time()
      ) < 3 * 86400
    for: 1h
    labels:
      severity: critical
      category: security
    annotations:
      summary: 'TLS 证书即将过期 (3天内)'
      description: '{{ $labels.domain }} 证书将在 {{ $value | humanizeDuration }} 后过期,请立即续期'
  # The expiry timestamp is already in the past.
  - alert: TLSCertificateExpired
    expr: sub2api_tls_certificate_expiry_timestamp - time() < 0
    for: 1m
    labels:
      severity: critical
      category: security
    annotations:
      summary: 'TLS 证书已过期'
      description: '{{ $labels.domain }} 证书已过期,服务可能无法正常访问'
# ==================== Backup job alerts ====================
# NOTE(review): other rules in this file read the database backup heartbeat
# with job_name="db-backup" while this group uses job_name="database_backup";
# confirm which label value the backup script actually reports.
- name: backup-alerts
  interval: 3600s  # evaluated once per hour
  rules:
  # Last successful database backup heartbeat is older than 90000s (25h).
  - alert: DatabaseBackupFailed
    expr: |
      time() - sub2api_job_heartbeat_last_success_timestamp{job_name="database_backup"} > 90000
    for: 1h
    labels:
      severity: warning
      category: backup
    annotations:
      summary: '数据库备份失败'
      description: '数据库备份已超过 25 小时未成功执行'
  # The heartbeat series does not exist at all (absent() returned a value).
  - alert: DatabaseBackupMissing
    expr: absent(sub2api_job_heartbeat_last_success_timestamp{job_name="database_backup"})
    for: 1h
    labels:
      severity: info
      category: backup
    annotations:
      summary: '数据库备份监控未配置'
      description: '未检测到数据库备份心跳指标,请检查备份脚本是否上报'
  # Last successful Grafana config backup heartbeat is older than 172800s (48h).
  - alert: GrafanaConfigBackupFailed
    expr: |
      time() - sub2api_job_heartbeat_last_success_timestamp{job_name="grafana_backup"} > 172800
    for: 1h
    labels:
      severity: info
      category: backup
    annotations:
      summary: 'Grafana 配置备份失败'
      description: 'Grafana 配置备份已超过 48 小时未成功执行'
# ==================== Basic SLO alerts (simplified) ====================
- name: slo-alerts
  interval: 60s
  rules:
  # Availability SLO: 99.9% over a 30-day window, i.e. an error budget of
  # 0.001. A burn-rate alert fires when error_ratio > burn_rate * 0.001.
  # The previous thresholds multiplied the burn rate by 0.0144 (which is
  # already 14.4 * 0.001), giving 20.7% and 8.6% error ratios instead of the
  # intended 1.44% and 0.6%.
  #
  # Fast burn: 2% of the error budget consumed within 1 hour
  # (burn rate 0.02 * 720h / 1h = 14.4).
  - alert: SLOErrorBudgetBurnRateFast
    expr: |
      (
        sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[1h]))
        /
        sum(rate(sub2api_http_requests_total{job="sub2api-app"}[1h]))
      ) > 14.4 * 0.001
    for: 5m
    labels:
      severity: critical
      category: slo
      slo: availability
    annotations:
      summary: 'SLO 错误预算快速燃烧'
      description: '可用性错误预算正在快速消耗可能在1小时内耗尽'
  # Slow burn: 5% of the error budget consumed within 6 hours
  # (burn rate 0.05 * 720h / 6h = 6).
  - alert: SLOErrorBudgetBurnRateSlow
    expr: |
      (
        sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[6h]))
        /
        sum(rate(sub2api_http_requests_total{job="sub2api-app"}[6h]))
      ) > 6 * 0.001
    for: 30m
    labels:
      severity: warning
      category: slo
      slo: availability
    annotations:
      summary: 'SLO 错误预算慢速燃烧'
      description: '可用性错误预算正在持续消耗可能在6小时内耗尽'
# ==================== Prometheus self-monitoring ====================
# NOTE(review): the group `prometheus-alerts` in this file contains rules that
# overlap with several of the ones below, including a second
# PrometheusTargetMissing on the same expression; consider consolidating.
- name: prometheus-self-monitoring
  interval: 60s
  rules:
  # A scrape target has reported up == 0 for 3 minutes.
  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 3m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: '监控采集目标下线'
      description: '目标 {{ $labels.job }}/{{ $labels.instance }} 已无法采集超过 3 分钟'
  # Scrapes take longer than 10 seconds (may reduce data accuracy).
  - alert: PrometheusScrapeDurationHigh
    expr: scrape_duration_seconds > 10
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: 'Prometheus 采集耗时过长'
      description: '目标 {{ $labels.job }} 采集耗时 {{ $value }}s超过 10s'
  # At least one TSDB reload failure within the last hour; fires immediately.
  - alert: PrometheusTSDBReloadFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[1h]) > 0
    for: 0m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: 'Prometheus TSDB 重载失败'
      description: '过去 1 小时内 TSDB 重载出现失败,可能影响数据持久化'
  # The WAL corruption counter increased within the last 5 minutes;
  # fires immediately.
  - alert: PrometheusWALCorruption
    expr: increase(prometheus_tsdb_wal_corruptions_total[5m]) > 0
    for: 0m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: 'Prometheus WAL 损坏'
      description: 'Prometheus WAL 出现损坏,请立即检查数据完整性'
  # Alerting/recording rule evaluations have been failing.
  - alert: PrometheusRuleEvaluationFailing
    expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: '告警规则评估失败'
      description: 'Prometheus 有告警规则评估失败,可能导致漏报'
  # Persisted TSDB blocks, converted from bytes to GiB, exceed 1.8
  # (the annotation text refers to a 2GB cap).
  - alert: PrometheusStorageFull
    expr: |
      (prometheus_tsdb_storage_blocks_bytes / 1024 / 1024 / 1024) > 1.8
    for: 10m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: 'Prometheus 存储接近上限'
      description: 'Prometheus 存储已用 {{ $value | humanize }}GB接近 2GB 上限,请及时清理'
# ==================== Certificate expiry alerts (probe-based) ====================
# Group renamed from `certificate-alerts`: that name is already used by an
# earlier group in this file, and Prometheus refuses to load a rule file in
# which a group name is repeated.
# The remaining lifetime is compared in SECONDS (N * 86400) rather than being
# divided down to days: `humanizeDuration` in the descriptions interprets
# $value as seconds, so a value expressed in days was rendered incorrectly.
- name: certificate-probe-alerts
  interval: 300s  # evaluated every 5 minutes; higher frequency is unnecessary
  rules:
  # Certificate expires in less than 7 days.
  - alert: TLSCertificateExpiringSoon
    expr: (probe_ssl_earliest_cert_expiry - time()) < 7 * 86400
    for: 1h
    labels:
      severity: warning
      category: security
    annotations:
      summary: 'TLS 证书即将过期'
      description: '{{ $labels.instance }} 的证书将在 {{ $value | humanizeDuration }} 后过期,请及时续期'
  # Certificate expires in less than 3 days; escalated to critical and
  # fired without a pending period.
  - alert: TLSCertificateExpiringCritical
    expr: (probe_ssl_earliest_cert_expiry - time()) < 3 * 86400
    for: 0m
    labels:
      severity: critical
      category: security
    annotations:
      summary: 'TLS 证书紧急过期警告'
      description: '{{ $labels.instance }} 的证书将在 {{ $value | humanizeDuration }} 后过期!'
  # Certificate has already expired.
  - alert: TLSCertificateExpired
    expr: probe_ssl_earliest_cert_expiry - time() <= 0
    for: 0m
    labels:
      severity: critical
      category: security
    annotations:
      summary: 'TLS 证书已过期!'
      description: '{{ $labels.instance }} 的 TLS 证书已过期,请立即续期'
# ==================== Backup and scheduled-job monitoring ====================
- name: backup-and-job-alerts
  interval: 300s
  rules:
  # No successful database backup for more than 25 hours / 90000s
  # (allows 1 hour of slack).
  - alert: DatabaseBackupMissing
    expr: time() - sub2api_job_heartbeat_last_success_timestamp{job_name="db-backup"} > 90000
    for: 0m
    labels:
      severity: warning
      category: backup
    annotations:
      summary: '数据库备份超时未完成'
      description: '距上次成功备份已超过 25 小时,请检查备份任务'
  # No successful database backup for more than 49 hours / 176400s - critical.
  - alert: DatabaseBackupCriticallyMissing
    expr: time() - sub2api_job_heartbeat_last_success_timestamp{job_name="db-backup"} > 176400
    for: 0m
    labels:
      severity: critical
      category: backup
    annotations:
      summary: '数据库备份连续 2 天未完成!'
      description: '距上次成功备份已超过 49 小时,存在数据丢失风险'
  # The OpsMetricsCollector scheduled job stopped sending heartbeats:
  # last success is older than 180s, sustained for the 5m `for` window.
  - alert: OpsCollectorJobStale
    expr: time() - sub2api_job_heartbeat_last_success_timestamp{job_name="ops-collector"} > 180
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: 'OpsMetricsCollector 心跳超时'
      description: 'OpsMetricsCollector 定时任务已超过 3 分钟未上报心跳,可能已停止'