# sub2api-cn-relay-manager/deploy/monitoring/prometheus-rules.yml

# Prometheus Alerting Rules for sub2api-cn-relay-manager
# Aligned with current vNext.3 metrics semantics (2026-06-08)

# Top-level list of Prometheus rule groups defined by this file.
groups:
  # Single group holding every alert for the relay manager.
  - name: sub2api-relay-manager-alerts
    # Evaluation interval for all rules in this group.
    interval: 30s
    # Alerting rules follow; each entry is one `alert` definition.
    rules:
      # Critical: the `up` series for the relay-manager scrape job has been 0
      # for a full minute.
      - alert: ServiceDown
        expr: 'up{job="sub2api-relay-manager"} == 0'
        for: 1m
        annotations:
          summary: 'sub2api-relay-manager service is down'
          description: >-
            The sub2api-relay-manager service has been down for more than 1 minute.
        labels:
          team: ops
          severity: critical

      # Warning: share of HTTP responses with a 4xx or 5xx status, measured
      # over 5-minute rates, stays above 5% for two minutes.
      # clamp_min keeps the denominator at or above 0.001 so the division
      # never has a zero divisor.
      - alert: HighErrorRate
        expr: >-
          sum(rate(http_requests_total{status=~"4..|5.."}[5m]))
          / clamp_min(sum(rate(http_requests_total[5m])), 0.001)
          > 0.05
        for: 2m
        annotations:
          summary: 'High HTTP error rate detected'
          description: 'HTTP 4xx/5xx error rate is above 5% for more than 2 minutes.'
        labels:
          team: ops
          severity: warning

      # Critical: fewer than 95% of user-key chat requests ended with
      # result="ok" over the last 10 minutes, sustained for 10 minutes.
      #
      # The check is written as "failure share > 5%" rather than
      # "success share < 95%". The two are arithmetically the same, but a
      # numerator that selects only result="ok" is an empty vector when no
      # "ok" series exists (e.g. every request has failed since startup),
      # which would make the whole expression return nothing and leave the
      # alert silent during a total outage.
      #
      # With no traffic the ratio is 0/0 = NaN (or empty), which never
      # satisfies the comparison, so no separate traffic guard is needed.
      - alert: UserKeyChatSuccessRateLow
        expr: |
          (
            sum(rate(user_key_chat_requests_total{result!="ok"}[10m]))
            /
            sum(rate(user_key_chat_requests_total[10m]))
          ) > 0.05
        for: 10m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "User-key chat success rate below SLO"
          description: "Recent user-key chat success rate is below 95% for 10 minutes."

      # Warning: 95th-percentile request duration for /v1/chat/completions,
      # estimated from the duration histogram buckets over 10 minutes, stays
      # above 5 seconds for 10 minutes.
      - alert: UserKeyChatP95LatencyHigh
        expr: >-
          histogram_quantile(0.95,
          sum by (le) (rate(http_request_duration_seconds_bucket{path="/v1/chat/completions"}[10m])))
          > 5
        for: 10m
        annotations:
          summary: 'User-key chat P95 latency is high'
          description: 'P95 latency for /v1/chat/completions exceeds 5 seconds for 10 minutes.'
        labels:
          team: ops
          severity: warning

      # Critical: "create" operations whose result is neither success nor
      # rate_limited exceed 0.02 per second (about 12 per 10 minutes) for
      # 10 minutes.
      - alert: UserKeyCreateFailures
        expr: >-
          sum(rate(user_key_operations_total{operation="create",result!~"success|rate_limited"}[10m]))
          > 0.02
        for: 10m
        annotations:
          summary: 'User-key create failures detected'
          description: 'Non-rate-limit create failures are occurring on the self-service path.'
        labels:
          team: ops
          severity: critical

      # Warning: "reset" operations whose result is neither success nor
      # rate_limited exceed 0.02 per second (about 12 per 10 minutes) for
      # 10 minutes.
      - alert: UserKeyResetFailures
        expr: >-
          sum(rate(user_key_operations_total{operation="reset",result!~"success|rate_limited"}[10m]))
          > 0.02
        for: 10m
        annotations:
          summary: 'User-key reset failures detected'
          description: 'Non-rate-limit reset failures are occurring on the self-service path.'
        labels:
          team: ops
          severity: warning

      # Warning: chat requests ending in result="quota_exhausted" exceed
      # 0.05 per second (about 3 per minute) for 10 minutes.
      - alert: UserKeyQuotaExhaustedSpike
        expr: >-
          sum(rate(user_key_chat_requests_total{result="quota_exhausted"}[10m]))
          > 0.05
        for: 10m
        annotations:
          summary: 'Quota exhausted events are rising'
          description: 'quota_exhausted responses are rising on the public user-key gateway path.'
        labels:
          team: ops
          severity: warning

      # Warning (routed to the security team): chat requests ending in
      # unauthorized or invalid_api_key exceed 0.05 per second (about 3 per
      # minute) for 10 minutes.
      - alert: UserKeyAuthFailuresSpike
        expr: >-
          sum(rate(user_key_chat_requests_total{result=~"unauthorized|invalid_api_key"}[10m]))
          > 0.05
        for: 10m
        annotations:
          summary: 'User-key auth failures are rising'
          description: 'unauthorized/invalid_api_key outcomes are rising on the public gateway path.'
        labels:
          team: security
          severity: warning

      # Warning: more than 20% of route decisions over the last 10 minutes
      # had status="failover", sustained for 10 minutes.
      # The trailing `and ... > 0` clause restricts the alert to periods in
      # which route decisions are actually being recorded.
      - alert: RouteFailoverShareHigh
        expr: >-
          sum(rate(route_decisions_total{status="failover"}[10m]))
          / clamp_min(sum(rate(route_decisions_total[10m])), 0.001)
          > 0.20
          and sum(rate(route_decisions_total[10m])) > 0
        for: 10m
        annotations:
          summary: 'Route failover share is high'
          description: 'More than 20% of recent route decisions are failovers.'
        labels:
          team: ops
          severity: warning

      # Critical: the active_providers metric has read 0 for one minute.
      - alert: NoActiveProviders
        expr: 'active_providers == 0'
        for: 1m
        annotations:
          summary: 'No active providers'
          description: >-
            There are no active providers configured. The system cannot route requests.
        labels:
          team: ops
          severity: critical

      # Critical: the active_hosts metric has read 0 for one minute.
      - alert: NoActiveHosts
        expr: 'active_hosts == 0'
        for: 1m
        annotations:
          summary: 'No active hosts'
          description: >-
            There are no active hosts. The system cannot import providers.
        labels:
          team: ops
          severity: critical

      # Warning: the log flush error counter has been increasing (non-zero
      # 5-minute rate) for one minute.
      - alert: LogFlushErrors
        expr: 'rate(log_flush_errors_total[5m]) > 0'
        for: 1m
        annotations:
          summary: 'Log flush errors detected'
          description: >-
            Log flush errors have been detected. Check log storage/backend.
        labels:
          team: ops
          severity: warning

      # Warning: log events are being dropped at more than 10 per second
      # (5-minute rate) for one minute.
      - alert: LogDroppedEvents
        expr: 'rate(log_dropped_events_total[5m]) > 10'
        for: 1m
        annotations:
          summary: 'Log events being dropped'
          description: >-
            Log events are being dropped. Check log buffer capacity.
        labels:
          team: ops
          severity: warning

      # Critical: /healthz has served at least one non-200 response within
      # the last 5 minutes, and that has held for 30 seconds.
      #
      # http_requests_total is a counter (other rules in this file take its
      # rate()), so its raw value never goes back down: comparing the raw
      # value to 0 would keep this alert firing forever after the first
      # failed health check. increase() over a window looks only at recent
      # growth, so the alert resolves once failures stop.
      #
      # NOTE(review): the first non-200 for a brand-new label combination
      # creates the series and is not counted by increase(); repeated
      # failures are still caught.
      - alert: HealthCheckFailing
        expr: increase(http_requests_total{path="/healthz",status!="200"}[5m]) > 0
        for: 30s
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "Health check failing"
          description: "The /healthz endpoint is returning non-200 status."