Files
sub2api-cn-relay-manager/deploy/monitoring/prometheus-rules.yml
phamnazage-jpg dd6f332b53
Some checks are pending
CI / Build & Test (push) Waiting to run
CI / Lint (push) Waiting to run
CI / Security Scan (push) Waiting to run
CI / Docker Build (push) Waiting to run
CI / Release (push) Blocked by required conditions
feat: close v3 slo gates and lifecycle metrics
2026-06-08 14:49:06 +08:00

171 lines
5.6 KiB
YAML

# Prometheus Alerting Rules for sub2api-cn-relay-manager
# Aligned with current vNext.3 metrics semantics (2026-06-08)
groups:
  - name: sub2api-relay-manager-alerts
    # How often the rules in this group are evaluated.
    interval: 30s
    rules:
      # ---------------------------------------------------------------
      # Availability and generic HTTP health
      # ---------------------------------------------------------------

      # The scrape target for the relay-manager job reports up == 0.
      - alert: ServiceDown
        expr: up{job="sub2api-relay-manager"} == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "sub2api-relay-manager service is down"
          description: "The sub2api-relay-manager service has been down for more than 1 minute."

      # Share of 4xx/5xx responses across all HTTP requests.
      # clamp_min keeps the denominator away from zero when traffic is idle.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"4..|5.."}[5m]))
            /
            clamp_min(sum(rate(http_requests_total[5m])), 0.001)
          ) > 0.05
        for: 2m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "High HTTP error rate detected"
          description: "HTTP 4xx/5xx error rate is above 5% for more than 2 minutes."

      # ---------------------------------------------------------------
      # User-key chat path
      # ---------------------------------------------------------------

      # Ratio of result="ok" chat requests to all chat requests.
      # `or vector(0)` keeps the numerator defined when no result="ok"
      # series exists (e.g. every request is failing); without it the
      # whole expression would be empty and a total outage would never
      # fire. The trailing `and ... > 0` suppresses the alert when there
      # is no chat traffic at all.
      - alert: UserKeyChatSuccessRateLow
        expr: |
          (
            (sum(rate(user_key_chat_requests_total{result="ok"}[10m])) or vector(0))
            /
            clamp_min(sum(rate(user_key_chat_requests_total[10m])), 0.001)
          ) < 0.95
          and sum(rate(user_key_chat_requests_total[10m])) > 0
        for: 10m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "User-key chat success rate below SLO"
          description: "Recent user-key chat success rate is below 95% for 10 minutes."

      # 95th percentile of request duration for the chat completions path,
      # computed from the histogram buckets aggregated by `le`.
      - alert: UserKeyChatP95LatencyHigh
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket{path="/v1/chat/completions"}[10m])) by (le)
          ) > 5
        for: 10m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "User-key chat P95 latency is high"
          description: "P95 latency for /v1/chat/completions exceeds 5 seconds for 10 minutes."

      # ---------------------------------------------------------------
      # User-key lifecycle operations
      # ---------------------------------------------------------------

      # Create operations whose result is neither "success" nor "rate_limited".
      - alert: UserKeyCreateFailures
        expr: |
          sum(rate(user_key_operations_total{operation="create",result!~"success|rate_limited"}[10m])) > 0.02
        for: 10m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "User-key create failures detected"
          description: "Non-rate-limit create failures are occurring on the self-service path."

      # Reset operations whose result is neither "success" nor "rate_limited".
      - alert: UserKeyResetFailures
        expr: |
          sum(rate(user_key_operations_total{operation="reset",result!~"success|rate_limited"}[10m])) > 0.02
        for: 10m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "User-key reset failures detected"
          description: "Non-rate-limit reset failures are occurring on the self-service path."

      # Rate of chat requests ending in result="quota_exhausted".
      - alert: UserKeyQuotaExhaustedSpike
        expr: |
          sum(rate(user_key_chat_requests_total{result="quota_exhausted"}[10m])) > 0.05
        for: 10m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Quota exhausted events are rising"
          description: "quota_exhausted responses are rising on the public user-key gateway path."

      # Rate of chat requests rejected as unauthorized / invalid_api_key.
      # Routed to the security team rather than ops.
      - alert: UserKeyAuthFailuresSpike
        expr: |
          sum(rate(user_key_chat_requests_total{result=~"unauthorized|invalid_api_key"}[10m])) > 0.05
        for: 10m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "User-key auth failures are rising"
          description: "unauthorized/invalid_api_key outcomes are rising on the public gateway path."

      # ---------------------------------------------------------------
      # Routing and capacity
      # ---------------------------------------------------------------

      # Share of route decisions with status="failover". A missing failover
      # series makes the expression empty, which correctly means "no alert".
      - alert: RouteFailoverShareHigh
        expr: |
          (
            sum(rate(route_decisions_total{status="failover"}[10m]))
            /
            clamp_min(sum(rate(route_decisions_total[10m])), 0.001)
          ) > 0.20
          and sum(rate(route_decisions_total[10m])) > 0
        for: 10m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Route failover share is high"
          description: "More than 20% of recent route decisions are failovers."

      - alert: NoActiveProviders
        expr: active_providers == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "No active providers"
          description: "There are no active providers configured. The system cannot route requests."

      - alert: NoActiveHosts
        expr: active_hosts == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "No active hosts"
          description: "There are no active hosts. The system cannot import providers."

      # ---------------------------------------------------------------
      # Logging pipeline
      # ---------------------------------------------------------------

      # Any increase in the flush error counter over the last 5 minutes.
      - alert: LogFlushErrors
        expr: rate(log_flush_errors_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Log flush errors detected"
          description: "Log flush errors have been detected. Check log storage/backend."

      # More than 10 dropped log events per second, averaged over 5 minutes.
      - alert: LogDroppedEvents
        expr: rate(log_dropped_events_total[5m]) > 10
        for: 1m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Log events being dropped"
          description: "Log events are being dropped. Check log buffer capacity."

      # ---------------------------------------------------------------
      # Health endpoint
      # ---------------------------------------------------------------

      # http_requests_total is a cumulative counter, so comparing its raw
      # value to 0 stays true after one non-200 response until the counter
      # resets. increase() over a window makes the alert reflect only recent
      # failures and lets it resolve once /healthz is healthy again.
      - alert: HealthCheckFailing
        expr: increase(http_requests_total{path="/healthz",status!="200"}[5m]) > 0
        for: 30s
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "Health check failing"
          description: "The /healthz endpoint is returning non-200 status."