---
# Lightweight alerting rules for single-node Sub2API deployments.
# Tuned for a 2-core / 4 GB host and written to avoid alert storms.
#
# NOTE(review): several alert names are defined in more than one group
# (PrometheusTargetMissing, TLSCertificateExpired, DatabaseBackupMissing).
# Prometheus accepts that, but the copies fire independently; confirm that
# Alertmanager routing/inhibition handles them as intended.

groups:
  # ==================== System resource alerts ====================
  - name: system-alerts
    interval: 60s
    rules:
      # CPU busy percentage, averaged over all cores of an instance.
      # rate() is used instead of irate(): irate() only looks at the last two
      # samples, so a momentary dip would reset the `for` timer.
      - alert: HighCPUUsage
        expr: |
          100 - (
            avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100
          ) > 80
        for: 5m
        labels:
          severity: warning
          category: infrastructure
        annotations:
          summary: "CPU 使用率过高"
          description: "实例 {{ $labels.instance }} CPU 使用率超过 80%,当前值: {{ $value }}%"

      # Used memory as a percentage of total (MemAvailable counts as free).
      - alert: HighMemoryUsage
        expr: |
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
            / node_memory_MemTotal_bytes * 100 > 85
        for: 5m
        labels:
          severity: warning
          category: infrastructure
        annotations:
          summary: "内存使用率过高"
          description: "实例 {{ $labels.instance }} 内存使用率超过 85%,当前值: {{ $value }}%"

      # Less than 10% of the root filesystem is still available.
      - alert: DiskSpaceLow
        expr: |
          (
            node_filesystem_avail_bytes{mountpoint="/"}
            / node_filesystem_size_bytes{mountpoint="/"}
          ) * 100 < 10
        for: 5m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "磁盘空间不足"
          description: "实例 {{ $labels.instance }} 磁盘剩余空间不足 10%,当前值: {{ $value }}%"

      # Linear extrapolation of the last 6h predicts zero free space in 24h.
      - alert: DiskWillFillIn24Hours
        expr: |
          predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 24*3600) < 0
        for: 1h
        labels:
          severity: warning
          category: infrastructure
        annotations:
          summary: "磁盘预计24小时内写满"
          description: "实例 {{ $labels.instance }} 按当前趋势,磁盘将在24小时内写满"

  # ==================== Application health alerts ====================
  - name: application-alerts
    interval: 30s
    rules:
      # The sub2api-app scrape target has been failing for a minute.
      - alert: Sub2APIDown
        expr: up{job="sub2api-app"} == 0
        for: 1m
        labels:
          severity: critical
          category: availability
        annotations:
          summary: "Sub2API 服务不可用"
          description: "Sub2API 应用实例 {{ $labels.instance }} 已宕机超过 1 分钟"

      # Share of HTTP 5xx responses over the last 5 minutes exceeds 5%.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[5m]))
            /
            sum(rate(sub2api_http_requests_total{job="sub2api-app"}[5m]))
          ) > 0.05
        for: 2m
        labels:
          severity: warning
          category: availability
        annotations:
          summary: "错误率过高"
          description: "API 错误率超过 5%,当前值: {{ $value | humanizePercentage }}"

      # 99th-percentile request latency above 5 seconds.
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(
            0.99,
            sum(rate(sub2api_http_request_duration_seconds_bucket{job="sub2api-app"}[5m])) by (le)
          ) > 5
        for: 3m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "P99 延迟过高"
          description: "API P99 延迟超过 5 秒,当前值: {{ $value }}s"

      # 95th-percentile request latency above 2 seconds.
      - alert: HighLatencyP95
        expr: |
          histogram_quantile(
            0.95,
            sum(rate(sub2api_http_request_duration_seconds_bucket{job="sub2api-app"}[5m])) by (le)
          ) > 2
        for: 5m
        labels:
          severity: info
          category: performance
        annotations:
          summary: "P95 延迟升高"
          description: "API P95 延迟超过 2 秒,当前值: {{ $value }}s"

  # ==================== Database alerts ====================
  - name: database-alerts
    interval: 60s
    rules:
      # Active connections as a fraction of the pool maximum.
      # ignoring(state) is required: the two operands differ on the `state`
      # label, so without it vector matching pairs nothing and the alert can
      # never fire.
      - alert: DatabaseConnectionsHigh
        expr: |
          (
            sub2api_db_connections{state="active"}
            / ignoring(state)
            sub2api_db_connections{state="max"}
          ) > 0.8
        for: 5m
        labels:
          severity: warning
          category: database
        annotations:
          summary: "数据库连接池使用率过高"
          description: "数据库连接池使用率超过 80%,当前值: {{ $value | humanizePercentage }}"

  # ==================== Business metric alerts ====================
  - name: business-alerts
    interval: 60s
    rules:
      # Upstream success ratio over 10 minutes dropped below 95%.
      - alert: LowUpstreamSuccessRate
        expr: |
          (
            sum(rate(sub2api_upstream_requests_total{status="success"}[10m]))
            /
            sum(rate(sub2api_upstream_requests_total[10m]))
          ) < 0.95
        for: 5m
        labels:
          severity: warning
          category: business
        annotations:
          summary: "上游服务成功率下降"
          description: "上游服务成功率低于 95%,当前值: {{ $value | humanizePercentage }}"

      # More than 10 rate-limit hits per second (evaluated per series).
      - alert: HighRateLimitHits
        expr: rate(sub2api_rate_limit_hits_total[5m]) > 10
        for: 5m
        labels:
          severity: info
          category: business
        annotations:
          summary: "限流触发频繁"
          description: "限流触发频率过高,当前: {{ $value }}/s"

  # ==================== Prometheus self-monitoring alerts ====================
  - name: prometheus-alerts
    interval: 60s
    rules:
      # Any scrape target down for 5 minutes (warning tier).
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 5m
        labels:
          severity: warning
          category: monitoring
        annotations:
          summary: "Prometheus 抓取目标丢失"
          description: "{{ $labels.job }} 实例 {{ $labels.instance }} 无法访问"

      # Resident memory of the Prometheus process, converted to MiB.
      - alert: PrometheusHighMemoryUsage
        expr: |
          (
            process_resident_memory_bytes{job="prometheus"} / 1024 / 1024
          ) > 100
        for: 5m
        labels:
          severity: warning
          category: monitoring
        annotations:
          summary: "Prometheus 内存使用过高"
          description: "Prometheus 内存使用超过 100MB,当前: {{ $value }}MB"

      - alert: PrometheusTSDBReloadsFailing
        expr: rate(prometheus_tsdb_reloads_failures_total[5m]) > 0
        for: 5m
        labels:
          severity: critical
          category: monitoring
        annotations:
          summary: "Prometheus TSDB 重载失败"
          description: "Prometheus 时序数据库重载失败,可能需要人工介入"

      - alert: PrometheusRuleEvaluationFailures
        expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
        for: 5m
        labels:
          severity: warning
          category: monitoring
        annotations:
          summary: "Prometheus 规则评估失败"
          description: "告警规则评估失败,请检查规则配置"

      # TSDB block storage, converted to GiB, above 1.8.
      - alert: PrometheusDiskSpaceLow
        expr: |
          (
            prometheus_tsdb_storage_blocks_bytes / 1024 / 1024 / 1024
          ) > 1.8
        for: 5m
        labels:
          severity: critical
          category: monitoring
        annotations:
          summary: "Prometheus 磁盘空间不足"
          description: "Prometheus 数据占用超过 1.8GB,接近 2GB 限制"

  # ==================== Certificate expiry alerts ====================
  # Based on the application-reported expiry timestamp.
  - name: certificate-alerts
    interval: 3600s  # evaluated once per hour
    rules:
      # The expression yields the remaining lifetime in SECONDS because the
      # annotation pipes $value through humanizeDuration, which takes seconds.
      - alert: TLSCertificateExpiringSoon7Days
        expr: |
          (
            sub2api_tls_certificate_expiry_timestamp - time()
          ) < 7 * 86400
        for: 1h
        labels:
          severity: warning
          category: security
        annotations:
          summary: "TLS 证书即将过期 (7天内)"
          description: "{{ $labels.domain }} 证书将在 {{ $value | humanizeDuration }} 后过期"

      # Same as above with a 3-day threshold; $value is in seconds.
      - alert: TLSCertificateExpiringSoon3Days
        expr: |
          (
            sub2api_tls_certificate_expiry_timestamp - time()
          ) < 3 * 86400
        for: 1h
        labels:
          severity: critical
          category: security
        annotations:
          summary: "TLS 证书即将过期 (3天内)"
          description: "{{ $labels.domain }} 证书将在 {{ $value | humanizeDuration }} 后过期,请立即续期"

      - alert: TLSCertificateExpired
        expr: sub2api_tls_certificate_expiry_timestamp - time() < 0
        for: 1m
        labels:
          severity: critical
          category: security
        annotations:
          summary: "TLS 证书已过期"
          description: "{{ $labels.domain }} 证书已过期,服务可能无法正常访问"

  # ==================== Backup job alerts ====================
  # NOTE(review): these rules select job_name="database_backup", while the
  # backup-and-job-alerts group below selects job_name="db-backup". Confirm
  # which label value the backup script actually reports.
  - name: backup-alerts
    interval: 3600s  # evaluated once per hour
    rules:
      # Last successful backup heartbeat is older than 90000 s (25 h).
      - alert: DatabaseBackupFailed
        expr: |
          time() - sub2api_job_heartbeat_last_success_timestamp{job_name="database_backup"} > 90000
        for: 1h
        labels:
          severity: warning
          category: backup
        annotations:
          summary: "数据库备份失败"
          description: "数据库备份已超过 25 小时未成功执行"

      # The heartbeat series does not exist at all.
      - alert: DatabaseBackupMissing
        expr: absent(sub2api_job_heartbeat_last_success_timestamp{job_name="database_backup"})
        for: 1h
        labels:
          severity: info
          category: backup
        annotations:
          summary: "数据库备份监控未配置"
          description: "未检测到数据库备份心跳指标,请检查备份脚本是否上报"

      # Last successful Grafana backup heartbeat is older than 172800 s (48 h).
      - alert: GrafanaConfigBackupFailed
        expr: |
          time() - sub2api_job_heartbeat_last_success_timestamp{job_name="grafana_backup"} > 172800
        for: 1h
        labels:
          severity: info
          category: backup
        annotations:
          summary: "Grafana 配置备份失败"
          description: "Grafana 配置备份已超过 48 小时未成功执行"

  # ==================== Basic SLO alerts (simplified) ====================
  - name: slo-alerts
    interval: 60s
    rules:
      # Availability SLO: 99.9% over a 30-day window, i.e. an error budget of
      # 0.001. Thresholds are written as <burn rate> * <error budget>.
      #
      # Fast burn: 2% of the budget consumed within 1 hour => burn rate 14.4.
      # NOTE(review): the description text says the budget may run out within
      # 1 hour, which does not match a 14.4x burn rate; consider rewording.
      - alert: SLOErrorBudgetBurnRateFast
        expr: |
          (
            sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[1h]))
            /
            sum(rate(sub2api_http_requests_total{job="sub2api-app"}[1h]))
          ) > 14.4 * 0.001
        for: 5m
        labels:
          severity: critical
          category: slo
          slo: availability
        annotations:
          summary: "SLO 错误预算快速燃烧"
          description: "可用性错误预算正在快速消耗,可能在1小时内耗尽"

      # Slow burn: 5% of the budget consumed within 6 hours => burn rate 6.
      # NOTE(review): the description text says the budget may run out within
      # 6 hours, which does not match a 6x burn rate; consider rewording.
      - alert: SLOErrorBudgetBurnRateSlow
        expr: |
          (
            sum(rate(sub2api_http_requests_total{job="sub2api-app",status=~"5.."}[6h]))
            /
            sum(rate(sub2api_http_requests_total{job="sub2api-app"}[6h]))
          ) > 6 * 0.001
        for: 30m
        labels:
          severity: warning
          category: slo
          slo: availability
        annotations:
          summary: "SLO 错误预算慢速燃烧"
          description: "可用性错误预算正在持续消耗,可能在6小时内耗尽"

  # ==================== Prometheus self-monitoring ====================
  - name: prometheus-self-monitoring
    interval: 60s
    rules:
      # Scrape target down for 3 minutes (critical tier).
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 3m
        labels:
          severity: critical
          category: monitoring
        annotations:
          summary: "监控采集目标下线"
          description: "目标 {{ $labels.job }}/{{ $labels.instance }} 已无法采集超过 3 分钟"

      # Scrapes are taking too long (may affect data precision).
      - alert: PrometheusScrapeDurationHigh
        expr: scrape_duration_seconds > 10
        for: 5m
        labels:
          severity: warning
          category: monitoring
        annotations:
          summary: "Prometheus 采集耗时过长"
          description: "目标 {{ $labels.job }} 采集耗时 {{ $value }}s,超过 10s"

      # TSDB reload failures within the last hour.
      - alert: PrometheusTSDBReloadFailing
        expr: increase(prometheus_tsdb_reloads_failures_total[1h]) > 0
        for: 0m
        labels:
          severity: critical
          category: monitoring
        annotations:
          summary: "Prometheus TSDB 重载失败"
          description: "过去 1 小时内 TSDB 重载出现失败,可能影响数据持久化"

      # WAL corruption detected.
      - alert: PrometheusWALCorruption
        expr: increase(prometheus_tsdb_wal_corruptions_total[5m]) > 0
        for: 0m
        labels:
          severity: critical
          category: monitoring
        annotations:
          summary: "Prometheus WAL 损坏"
          description: "Prometheus WAL 出现损坏,请立即检查数据完整性"

      # Alerting-rule evaluation failures.
      - alert: PrometheusRuleEvaluationFailing
        expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
        for: 5m
        labels:
          severity: warning
          category: monitoring
        annotations:
          summary: "告警规则评估失败"
          description: "Prometheus 有告警规则评估失败,可能导致漏报"

      # Prometheus storage is approaching its capacity limit.
      - alert: PrometheusStorageFull
        expr: |
          (prometheus_tsdb_storage_blocks_bytes / 1024 / 1024 / 1024) > 1.8
        for: 10m
        labels:
          severity: warning
          category: monitoring
        annotations:
          summary: "Prometheus 存储接近上限"
          description: "Prometheus 存储已用 {{ $value | humanize }}GB,接近 2GB 上限,请及时清理"

  # ==================== Certificate expiry alerts (probe based) ====================
  # Group names must be unique within a rule file, so this group cannot
  # reuse the name "certificate-alerts" that is already taken above.
  - name: certificate-probe-alerts
    interval: 300s  # every 5 minutes; no need for a higher frequency
    rules:
      # Certificate expires within 7 days. The expression yields seconds so
      # that humanizeDuration in the annotation renders the right duration.
      - alert: TLSCertificateExpiringSoon
        expr: (probe_ssl_earliest_cert_expiry - time()) < 7 * 86400
        for: 1h
        labels:
          severity: warning
          category: security
        annotations:
          summary: "TLS 证书即将过期"
          description: "{{ $labels.instance }} 的证书将在 {{ $value | humanizeDuration }} 后过期,请及时续期"

      # Certificate expires within 3 days - escalated to critical.
      - alert: TLSCertificateExpiringCritical
        expr: (probe_ssl_earliest_cert_expiry - time()) < 3 * 86400
        for: 0m
        labels:
          severity: critical
          category: security
        annotations:
          summary: "TLS 证书紧急过期警告"
          description: "{{ $labels.instance }} 的证书将在 {{ $value | humanizeDuration }} 后过期!"

      # Certificate has already expired.
      - alert: TLSCertificateExpired
        expr: probe_ssl_earliest_cert_expiry - time() <= 0
        for: 0m
        labels:
          severity: critical
          category: security
        annotations:
          summary: "TLS 证书已过期!"
          description: "{{ $labels.instance }} 的 TLS 证书已过期,请立即续期"

  # ==================== Backup and scheduled-job monitoring ====================
  # NOTE(review): these rules select job_name="db-backup", while the
  # backup-alerts group above selects job_name="database_backup". Confirm
  # which label value the backup script actually reports.
  - name: backup-and-job-alerts
    interval: 300s
    rules:
      # No successful database backup for more than 25 hours
      # (the daily schedule plus 1 hour of tolerance).
      - alert: DatabaseBackupMissing
        expr: |
          time() - sub2api_job_heartbeat_last_success_timestamp{job_name="db-backup"} > 90000
        for: 0m
        labels:
          severity: warning
          category: backup
        annotations:
          summary: "数据库备份超时未完成"
          description: "距上次成功备份已超过 25 小时,请检查备份任务"

      # No successful database backup for more than 49 hours - critical.
      - alert: DatabaseBackupCriticallyMissing
        expr: |
          time() - sub2api_job_heartbeat_last_success_timestamp{job_name="db-backup"} > 176400
        for: 0m
        labels:
          severity: critical
          category: backup
        annotations:
          summary: "数据库备份连续 2 天未完成!"
          description: "距上次成功备份已超过 49 小时,存在数据丢失风险"

      # The OpsMetricsCollector scheduled job stopped sending heartbeats.
      - alert: OpsCollectorJobStale
        expr: |
          time() - sub2api_job_heartbeat_last_success_timestamp{job_name="ops-collector"} > 180
        for: 5m
        labels:
          severity: warning
          category: monitoring
        annotations:
          summary: "OpsMetricsCollector 心跳超时"
          description: "OpsMetricsCollector 定时任务已超过 3 分钟未上报心跳,可能已停止"