# Prometheus Alerting Rules for sub2api-cn-relay-manager
# Aligned with current vNext.3 metrics semantics (2026-06-08)
#
# Layout: a single rule group evaluated every 30s. Each rule carries a
# `severity` label (critical | warning) and a `team` label (ops | security).

groups:
  - name: sub2api-relay-manager-alerts
    interval: 30s
    rules:
      # ------------------------------------------------------------------
      # Service availability and overall HTTP health
      # ------------------------------------------------------------------

      # Fires when the scrape target reports itself as down.
      # NOTE(review): `up == 0` yields no series if the target is missing
      # from service discovery altogether; consider pairing this with an
      # absent() rule if that case must page as well.
      - alert: ServiceDown
        expr: up{job="sub2api-relay-manager"} == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "sub2api-relay-manager service is down"
          description: "The sub2api-relay-manager service has been down for more than 1 minute."

      # Share of 4xx/5xx responses among all HTTP responses over 5 minutes.
      # clamp_min floors the denominator at 0.001 req/s so the division
      # never produces NaN/Inf when there is no traffic.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"4..|5.."}[5m]))
            /
            clamp_min(sum(rate(http_requests_total[5m])), 0.001)
          ) > 0.05
        for: 2m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "High HTTP error rate detected"
          description: "HTTP 4xx/5xx error rate is above 5% for more than 2 minutes."

      # ------------------------------------------------------------------
      # User-key chat path
      # ------------------------------------------------------------------

      # Success ratio (result="ok") of user-key chat requests over 10 minutes.
      # The trailing `and ... > 0` guard keeps the alert silent when there
      # is no chat traffic at all, where the clamped ratio would read as 0.
      - alert: UserKeyChatSuccessRateLow
        expr: |
          (
            sum(rate(user_key_chat_requests_total{result="ok"}[10m]))
            /
            clamp_min(sum(rate(user_key_chat_requests_total[10m])), 0.001)
          ) < 0.95
          and
          sum(rate(user_key_chat_requests_total[10m])) > 0
        for: 10m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "User-key chat success rate below SLO"
          description: "Recent user-key chat success rate is below 95% for 10 minutes."

      # P95 of the request-duration histogram for the chat completions path,
      # aggregated across all series by bucket boundary (`le`).
      - alert: UserKeyChatP95LatencyHigh
        expr: |
          histogram_quantile(
            0.95,
            sum(rate(http_request_duration_seconds_bucket{path="/v1/chat/completions"}[10m])) by (le)
          ) > 5
        for: 10m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "User-key chat P95 latency is high"
          description: "P95 latency for /v1/chat/completions exceeds 5 seconds for 10 minutes."

      # ------------------------------------------------------------------
      # User-key self-service operations
      # ------------------------------------------------------------------

      # Create operations whose result is neither "success" nor
      # "rate_limited"; rate-limited outcomes are deliberately excluded.
      - alert: UserKeyCreateFailures
        expr: |
          sum(rate(user_key_operations_total{operation="create",result!~"success|rate_limited"}[10m])) > 0.02
        for: 10m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "User-key create failures detected"
          description: "Non-rate-limit create failures are occurring on the self-service path."

      # Same selector as UserKeyCreateFailures, for the "reset" operation.
      - alert: UserKeyResetFailures
        expr: |
          sum(rate(user_key_operations_total{operation="reset",result!~"success|rate_limited"}[10m])) > 0.02
        for: 10m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "User-key reset failures detected"
          description: "Non-rate-limit reset failures are occurring on the self-service path."

      # Absolute rate (not a ratio) of quota_exhausted chat outcomes.
      - alert: UserKeyQuotaExhaustedSpike
        expr: |
          sum(rate(user_key_chat_requests_total{result="quota_exhausted"}[10m])) > 0.05
        for: 10m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Quota exhausted events are rising"
          description: "quota_exhausted responses are rising on the public user-key gateway path."

      # Absolute rate of authentication failures; routed to the security team.
      - alert: UserKeyAuthFailuresSpike
        expr: |
          sum(rate(user_key_chat_requests_total{result=~"unauthorized|invalid_api_key"}[10m])) > 0.05
        for: 10m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "User-key auth failures are rising"
          description: "unauthorized/invalid_api_key outcomes are rising on the public gateway path."

      # ------------------------------------------------------------------
      # Routing, providers and hosts
      # ------------------------------------------------------------------

      # Share of route decisions with status="failover" over 10 minutes.
      # Guarded by `and ... > 0` so it only evaluates while decisions are
      # actually being made.
      - alert: RouteFailoverShareHigh
        expr: |
          (
            sum(rate(route_decisions_total{status="failover"}[10m]))
            /
            clamp_min(sum(rate(route_decisions_total[10m])), 0.001)
          ) > 0.20
          and
          sum(rate(route_decisions_total[10m])) > 0
        for: 10m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Route failover share is high"
          description: "More than 20% of recent route decisions are failovers."

      - alert: NoActiveProviders
        expr: active_providers == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "No active providers"
          description: "There are no active providers configured. The system cannot route requests."

      - alert: NoActiveHosts
        expr: active_hosts == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "No active hosts"
          description: "There are no active hosts. The system cannot import providers."

      # ------------------------------------------------------------------
      # Logging pipeline
      # ------------------------------------------------------------------

      # Any flush error within the last 5 minutes.
      - alert: LogFlushErrors
        expr: rate(log_flush_errors_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Log flush errors detected"
          description: "Log flush errors have been detected. Check log storage/backend."

      # More than 10 dropped events per second, averaged over 5 minutes.
      - alert: LogDroppedEvents
        expr: rate(log_dropped_events_total[5m]) > 10
        for: 1m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Log events being dropped"
          description: "Log events are being dropped. Check log buffer capacity."

      # ------------------------------------------------------------------
      # Health endpoint
      # ------------------------------------------------------------------

      # http_requests_total is a counter, so comparing its raw value with 0
      # would stay true forever after the first non-200 /healthz response.
      # increase() over a window restricts the check to recent failures and
      # lets the alert resolve once the endpoint is healthy again.
      - alert: HealthCheckFailing
        expr: increase(http_requests_total{path="/healthz",status!="200"}[5m]) > 0
        for: 30s
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "Health check failing"
          description: "The /healthz endpoint is returning non-200 status."