feat: close v3 slo gates and lifecycle metrics
Some checks failed
CI / Build & Test (push) Has been cancelled
CI / Lint (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / Docker Build (push) Has been cancelled
CI / Release (push) Has been cancelled

This commit is contained in:
phamnazage-jpg
2026-06-08 14:49:06 +08:00
parent dbbb313a36
commit dd6f332b53
14 changed files with 775 additions and 156 deletions

View File

@@ -1,11 +1,10 @@
# Prometheus Alerting Rules for sub2api-cn-relay-manager
# Place this file in your Prometheus rules directory
# Aligned with current vNext.3 metrics semantics (2026-06-08)
groups:
- name: sub2api-relay-manager-alerts
interval: 30s
rules:
# 服务可用性告警
- alert: ServiceDown
expr: up{job="sub2api-relay-manager"} == 0
for: 1m
@@ -16,53 +15,110 @@ groups:
summary: "sub2api-relay-manager service is down"
description: "The sub2api-relay-manager service has been down for more than 1 minute."
# HTTP error-rate alert.
# Fires when the share of requests answered with a 4xx or 5xx status, measured
# over a 5-minute rate window, stays above 5% for 2 minutes.
# clamp_min floors the denominator at 0.001 so a zero request rate yields a
# ratio of 0 instead of a division by zero.
# The block previously carried two back-to-back variants of the ratio inside
# one expr plus duplicate summary/description keys; only the later, coherent
# variant is kept.
- alert: HighErrorRate
  expr: |
    (
      sum(rate(http_requests_total{status=~"4..|5.."}[5m]))
      /
      clamp_min(sum(rate(http_requests_total[5m])), 0.001)
    ) > 0.05
  for: 2m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: "High HTTP error rate detected"
    description: "HTTP 4xx/5xx error rate is above 5% for more than 2 minutes."
# Request latency alert.
# Fires when the 95th-percentile request duration, estimated by
# histogram_quantile from the http_request_duration_seconds buckets over a
# 5-minute rate window and aggregated across every label except le, stays
# above 1.0 for 3 minutes. Labelled severity=warning, team=ops.
- alert: HighLatency
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
) > 1.0
for: 3m
labels:
severity: warning
team: ops
annotations:
summary: "High request latency"
description: "95th percentile latency is above 1 second for more than 3 minutes."
# User-key chat success-rate SLO alert.
# Fires when the share of user_key_chat_requests_total with result="ok",
# measured over a 10-minute rate window, stays below 95% for 10 minutes.
# clamp_min floors the denominator at 0.001, and the trailing `and` clause
# requires a non-zero total request rate, so the alert does not fire when
# there is no traffic at all.
# The block previously began with an orphan `- alert: RouteFailoverSpike`
# line and mixed that rule's failover comparison (and its `for: 1m`) into this
# expression, leaving unbalanced parentheses and a duplicate `for` key; only
# the coherent success-rate rule is kept.
- alert: UserKeyChatSuccessRateLow
  expr: |
    (
      sum(rate(user_key_chat_requests_total{result="ok"}[10m]))
      /
      clamp_min(sum(rate(user_key_chat_requests_total[10m])), 0.001)
    ) < 0.95
    and sum(rate(user_key_chat_requests_total[10m])) > 0
  for: 10m
  labels:
    severity: critical
    team: ops
  annotations:
    summary: "User-key chat success rate below SLO"
    description: "Recent user-key chat success rate is below 95% for 10 minutes."
# User-key chat latency alert.
# Fires when the 95th-percentile duration of requests with
# path="/v1/chat/completions", estimated by histogram_quantile from the
# http_request_duration_seconds buckets over a 10-minute rate window, stays
# above 5 for 10 minutes.
# The annotations previously contained two summary/description pairs; the
# first pair described route failovers rather than latency, so only the
# latency pair is kept.
- alert: UserKeyChatP95LatencyHigh
  expr: |
    histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket{path="/v1/chat/completions"}[10m])) by (le)
    ) > 5
  for: 10m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: "User-key chat P95 latency is high"
    description: "P95 latency for /v1/chat/completions exceeds 5 seconds for 10 minutes."
# User-key create failure alert.
# Fires when user_key_operations_total series with operation="create" and a
# result other than "success" or "rate_limited" have a summed 10-minute rate
# above 0.02 per second (roughly 12 operations per 10 minutes), sustained for
# 10 minutes. Labelled severity=critical, team=ops.
- alert: UserKeyCreateFailures
expr: |
sum(rate(user_key_operations_total{operation="create",result!~"success|rate_limited"}[10m])) > 0.02
for: 10m
labels:
severity: critical
team: ops
annotations:
summary: "User-key create failures detected"
description: "Non-rate-limit create failures are occurring on the self-service path."
# User-key reset failure alert.
# Fires when user_key_operations_total series with operation="reset" and a
# result other than "success" or "rate_limited" have a summed 10-minute rate
# above 0.02 per second (roughly 12 operations per 10 minutes), sustained for
# 10 minutes. Labelled severity=warning, team=ops.
- alert: UserKeyResetFailures
expr: |
sum(rate(user_key_operations_total{operation="reset",result!~"success|rate_limited"}[10m])) > 0.02
for: 10m
labels:
severity: warning
team: ops
annotations:
summary: "User-key reset failures detected"
description: "Non-rate-limit reset failures are occurring on the self-service path."
# Quota-exhaustion alert.
# Fires when user_key_chat_requests_total series with
# result="quota_exhausted" have a summed 10-minute rate above 0.05 per second
# (roughly 30 requests per 10 minutes), sustained for 10 minutes.
# Labelled severity=warning, team=ops.
- alert: UserKeyQuotaExhaustedSpike
expr: |
sum(rate(user_key_chat_requests_total{result="quota_exhausted"}[10m])) > 0.05
for: 10m
labels:
severity: warning
team: ops
annotations:
summary: "Quota exhausted events are rising"
description: "quota_exhausted responses are rising on the public user-key gateway path."
# User-key authentication failure alert.
# Fires when user_key_chat_requests_total series whose result is
# "unauthorized" or "invalid_api_key" have a summed 10-minute rate above 0.05
# per second (roughly 30 requests per 10 minutes), sustained for 10 minutes.
# Labelled severity=warning, team=security.
- alert: UserKeyAuthFailuresSpike
expr: |
sum(rate(user_key_chat_requests_total{result=~"unauthorized|invalid_api_key"}[10m])) > 0.05
for: 10m
labels:
severity: warning
team: security
annotations:
summary: "User-key auth failures are rising"
description: "unauthorized/invalid_api_key outcomes are rising on the public gateway path."
# Route failover share alert.
# Fires when route_decisions_total series with status="failover" account for
# more than 20% of the total route-decision rate over a 10-minute window,
# sustained for 10 minutes. clamp_min floors the denominator at 0.001 and the
# trailing `and` clause requires a non-zero total rate, so the alert does not
# fire when no route decisions are being made.
- alert: RouteFailoverShareHigh
expr: |
(
sum(rate(route_decisions_total{status="failover"}[10m]))
/
clamp_min(sum(rate(route_decisions_total[10m])), 0.001)
) > 0.20
and sum(rate(route_decisions_total[10m])) > 0
for: 10m
labels:
severity: warning
team: ops
annotations:
summary: "Route failover share is high"
description: "More than 20% of recent route decisions are failovers."
# 活跃Provider数量告警
- alert: NoActiveProviders
expr: active_providers == 0
for: 1m
@@ -73,17 +129,6 @@ groups:
summary: "No active providers"
description: "There are no active providers configured. The system cannot route requests."
# Low provider redundancy alert.
# Fires when active_providers stays below 2 for 5 minutes. The description
# interpolates the current expression value through {{ $value }}.
# Labelled severity=warning, team=ops.
- alert: LowActiveProviders
expr: active_providers < 2
for: 5m
labels:
severity: warning
team: ops
annotations:
summary: "Low number of active providers"
description: "Only {{ $value }} active provider(s) detected. Consider adding more for redundancy."
# 活跃Host告警
- alert: NoActiveHosts
expr: active_hosts == 0
for: 1m
@@ -94,31 +139,6 @@ groups:
summary: "No active hosts"
description: "There are no active hosts. The system cannot import providers."
# Database connection alert.
# Fires when db_connections_active stays above 50 for 5 minutes. The
# description interpolates the current expression value through {{ $value }}.
# Labelled severity=warning, team=ops.
- alert: HighDBConnections
expr: db_connections_active > 50
for: 5m
labels:
severity: warning
team: ops
annotations:
summary: "High database connection count"
description: "Active DB connections: {{ $value }}. Consider connection pool tuning."
# Database write-rate alert.
# Fires when any db_operations_total series whose operation is INSERT, UPDATE
# or DELETE exceeds a 5-minute rate of 100 per second for 2 minutes. There is
# no sum(), so the threshold applies to each series separately.
# NOTE(review): the alert name (and the comment this replaces, which read
# "database operation error alert") speak of errors, but the expression
# measures write throughput and the summary says "High database write rate".
# Confirm which is intended before relying on the name.
- alert: DBOperationErrors
expr: |
rate(db_operations_total{operation=~"INSERT|UPDATE|DELETE"}[5m])
> 100
for: 2m
labels:
severity: warning
team: ops
annotations:
summary: "High database write rate"
description: "DB write operations are above threshold: {{ $value }} ops/sec"
# 日志系统告警
- alert: LogFlushErrors
expr: rate(log_flush_errors_total[5m]) > 0
for: 1m
@@ -130,48 +150,17 @@ groups:
description: "Log flush errors have been detected. Check log storage/backend."
# Dropped log events alert.
# Fires when any log_dropped_events_total series has a 5-minute rate above 10
# per second for 1 minute. Labelled severity=warning, team=ops.
# The block previously declared expr twice (block-scalar and inline form of
# the same expression) and description twice; the duplicates are removed,
# keeping the later value of each key.
- alert: LogDroppedEvents
  expr: rate(log_dropped_events_total[5m]) > 10
  for: 1m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: "Log events being dropped"
    description: "Log events are being dropped. Check log buffer capacity."
# Batch import failure alert.
# Fires when failed route decisions make up more than 10% of all route
# decisions over a 5-minute rate window, sustained for 5 minutes.
# Both sides of the division are wrapped in sum(): without aggregation,
# PromQL's one-to-one vector matching pairs each status="failed" series only
# with the identically labelled series in the denominator, so the ratio is
# always 1 (or NaN) and never the real failure share.
# A zero total rate yields NaN, which does not satisfy the comparison.
# NOTE(review): the alert is named for batch imports but is computed from
# route_decisions_total; confirm this is the intended metric.
- alert: BatchImportFailures
  expr: |
    (
      sum(rate(route_decisions_total{status="failed"}[5m]))
      /
      sum(rate(route_decisions_total[5m]))
    ) > 0.1
  for: 5m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: "High batch import failure rate"
    description: "Batch import failure rate is above 10%. Check provider configurations."
# API authentication failure alert.
# Fires when any http_requests_total series with status="401" has a 5-minute
# rate above 10 per second for 2 minutes. There is no sum(), so the threshold
# applies to each series separately rather than to the service-wide total.
# Labelled severity=warning, team=security.
- alert: AuthFailures
expr: |
rate(http_requests_total{status="401"}[5m]) > 10
for: 2m
labels:
severity: warning
team: security
annotations:
summary: "High authentication failure rate"
description: "Auth failures detected. Possible credential issues or attacks."
# 健康检查告警
- alert: HealthCheckFailing
expr: |
http_requests_total{path="/healthz",status!="200"} > 0
expr: http_requests_total{path="/healthz",status!="200"} > 0
for: 30s
labels:
severity: critical