# Prometheus alerting rules for sub2api-cn-relay-manager.
# Place this file in your Prometheus rules directory.
#
# All rules live in one group that is evaluated every 30 seconds. Every alert
# carries a `severity` label (critical / warning) and a `team` label
# (ops / security).
---
groups:
  - name: sub2api-relay-manager-alerts
    interval: 30s
    rules:
      # Service availability alert: the scrape target reports as down.
      - alert: ServiceDown
        expr: up{job="sub2api-relay-manager"} == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "sub2api-relay-manager service is down"
          description: "The sub2api-relay-manager service has been down for more than 1 minute."

      # HTTP error rate alert: share of 4xx + 5xx responses among all
      # requests, aggregated over every series of the metric.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5..|4.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 2m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 5% for more than 2 minutes. Current value: {{ $value | humanizePercentage }}"

      # Request latency alert: p95 computed from the histogram buckets,
      # aggregated by `le` only.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 1.0
        for: 3m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "High request latency"
          description: "95th percentile latency is above 1 second for more than 3 minutes."

      # Route failover alert: the 5m failover rate exceeds twice the average
      # of the 1h rate, sampled every 5m over the last hour (subquery).
      - alert: RouteFailoverSpike
        expr: |
          (
            rate(route_failovers_total[5m])
            >
            2 * avg_over_time(rate(route_failovers_total[1h])[1h:5m])
          )
        for: 1m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Route failover spike detected"
          description: "Route failovers have spiked above normal levels. Current rate: {{ $value }}"

      # Active provider count alerts.
      - alert: NoActiveProviders
        expr: active_providers == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "No active providers"
          description: "There are no active providers configured. The system cannot route requests."

      # `< 2` also matches 0, so this warning fires alongside
      # NoActiveProviders when the count drops to zero.
      - alert: LowActiveProviders
        expr: active_providers < 2
        for: 5m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Low number of active providers"
          description: "Only {{ $value }} active provider(s) detected. Consider adding more for redundancy."

      # Active host alert.
      - alert: NoActiveHosts
        expr: active_hosts == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "No active hosts"
          description: "There are no active hosts. The system cannot import providers."

      # Database connection alert.
      - alert: HighDBConnections
        expr: db_connections_active > 50
        for: 5m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "High database connection count"
          description: "Active DB connections: {{ $value }}. Consider connection pool tuning."

      # Database write-rate alert.
      # Despite the alert name, the expression measures the rate of
      # INSERT/UPDATE/DELETE operations, not failed operations. The name is
      # kept unchanged because alert routing may match on it.
      - alert: DBOperationErrors
        expr: |
          rate(db_operations_total{operation=~"INSERT|UPDATE|DELETE"}[5m]) > 100
        for: 2m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "High database write rate"
          description: "DB write operations are above threshold: {{ $value }} ops/sec"

      # Logging system alerts.
      - alert: LogFlushErrors
        expr: rate(log_flush_errors_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Log flush errors detected"
          description: "Log flush errors have been detected. Check log storage/backend."

      - alert: LogDroppedEvents
        expr: |
          rate(log_dropped_events_total[5m]) > 10
        for: 1m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Log events being dropped"
          description: "Log events are being dropped at {{ $value }} events/sec. Check log buffer capacity."

      # Batch import alert: failed share of all decisions.
      # Both sides are wrapped in sum() so the division compares the failed
      # total with the overall total. Without aggregation, one-to-one label
      # matching divides each status="failed" series by itself, which yields
      # 1 (or NaN) instead of a failure ratio.
      # NOTE(review): the metric is route_decisions_total, while the alert is
      # about batch imports - confirm this is the intended metric.
      - alert: BatchImportFailures
        expr: |
          (
            sum(rate(route_decisions_total{status="failed"}[5m]))
            /
            sum(rate(route_decisions_total[5m]))
          ) > 0.1
        for: 5m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "High batch import failure rate"
          description: "Batch import failure rate is above 10%. Check provider configurations."

      # API authentication failure alert: per-series rate of 401 responses.
      - alert: AuthFailures
        expr: |
          rate(http_requests_total{status="401"}[5m]) > 10
        for: 2m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "High authentication failure rate"
          description: "Auth failures detected. Possible credential issues or attacks."

      # Health check alert: non-200 responses on /healthz within the last 5m.
      # increase() is used because http_requests_total is a cumulative
      # counter; comparing the raw counter to 0 would keep the alert firing
      # after the first failure ever recorded, until the counter resets.
      - alert: HealthCheckFailing
        expr: |
          increase(http_requests_total{path="/healthz",status!="200"}[5m]) > 0
        for: 30s
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "Health check failing"
          description: "The /healthz endpoint is returning non-200 status."