feat: close v3 slo gates and lifecycle metrics
This commit is contained in:
@@ -1,11 +1,10 @@
|
||||
# Prometheus Alerting Rules for sub2api-cn-relay-manager
|
||||
# Place this file in your Prometheus rules directory
|
||||
# Aligned with current vNext.3 metrics semantics (2026-06-08)
|
||||
|
||||
groups:
|
||||
- name: sub2api-relay-manager-alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# 服务可用性告警
|
||||
- alert: ServiceDown
|
||||
expr: up{job="sub2api-relay-manager"} == 0
|
||||
for: 1m
|
||||
@@ -16,53 +15,110 @@ groups:
|
||||
summary: "sub2api-relay-manager service is down"
|
||||
description: "The sub2api-relay-manager service has been down for more than 1 minute."
|
||||
|
||||
# HTTP error-rate alert: share of 4xx/5xx responses among all requests.
- alert: HighErrorRate
  # The denominator is clamped to a minimum of 0.001 so that an idle service
  # (no requests in the window) evaluates to 0 instead of dividing by zero.
  expr: |
    (
      sum(rate(http_requests_total{status=~"4..|5.."}[5m]))
      /
      clamp_min(sum(rate(http_requests_total[5m])), 0.001)
    ) > 0.05
  for: 2m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: "High HTTP error rate detected"
    description: "HTTP 4xx/5xx error rate is above 5% for more than 2 minutes."
|
||||
|
||||
# Request latency alert: overall 95th percentile above 1 second.
- alert: HighLatency
  # Bucket rates are summed by `le` across all series before the quantile
  # is estimated, so this is a service-wide P95 over a 5m window.
  expr: |
    histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
    ) > 1.0
  for: 3m
  annotations:
    summary: 'High request latency'
    description: '95th percentile latency is above 1 second for more than 3 minutes.'
  labels:
    severity: warning
    team: ops
|
||||
|
||||
# Route failover alert: current failover rate versus its recent baseline.
- alert: RouteFailoverSpike
  # Compares the 5m failover rate with twice the average, over the last hour
  # (sampled every 5m), of the 1h failover rate.
  expr: |
    (
      rate(route_failovers_total[5m])
      >
      2 * avg_over_time(rate(route_failovers_total[1h])[1h:5m])
    )
  for: 1m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: "Route failover spike detected"
    description: "Route failovers have spiked above normal levels. Current rate: {{ $value }}"

# User-key chat SLO: share of requests with result="ok" over 10 minutes.
- alert: UserKeyChatSuccessRateLow
  # The trailing `and ... > 0` guard keeps the rule silent when there is no
  # chat traffic at all; clamp_min protects the division in the same case.
  expr: |
    (
      sum(rate(user_key_chat_requests_total{result="ok"}[10m]))
      /
      clamp_min(sum(rate(user_key_chat_requests_total[10m])), 0.001)
    ) < 0.95
    and sum(rate(user_key_chat_requests_total[10m])) > 0
  for: 10m
  labels:
    severity: critical
    team: ops
  annotations:
    summary: "User-key chat success rate below SLO"
    description: "Recent user-key chat success rate is below 95% for 10 minutes."

# User-key chat SLO: P95 latency of the /v1/chat/completions path.
- alert: UserKeyChatP95LatencyHigh
  expr: |
    histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket{path="/v1/chat/completions"}[10m])) by (le)
    ) > 5
  for: 10m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: "User-key chat P95 latency is high"
    description: "P95 latency for /v1/chat/completions exceeds 5 seconds for 10 minutes."
|
||||
|
||||
# Self-service key creation: failures other than rate limiting.
- alert: UserKeyCreateFailures
  # Counts every create outcome except "success" and "rate_limited";
  # threshold is 0.02 operations per second over a 10m window.
  expr: |
    sum(rate(user_key_operations_total{operation="create",result!~"success|rate_limited"}[10m])) > 0.02
  for: 10m
  annotations:
    summary: 'User-key create failures detected'
    description: 'Non-rate-limit create failures are occurring on the self-service path.'
  labels:
    severity: critical
    team: ops
|
||||
|
||||
# Self-service key reset: failures other than rate limiting.
- alert: UserKeyResetFailures
  # Same shape as the create-failure rule, filtered to operation="reset".
  expr: |
    sum(rate(user_key_operations_total{operation="reset",result!~"success|rate_limited"}[10m])) > 0.02
  for: 10m
  annotations:
    summary: 'User-key reset failures detected'
    description: 'Non-rate-limit reset failures are occurring on the self-service path.'
  labels:
    severity: warning
    team: ops
|
||||
|
||||
# Chat requests rejected with result="quota_exhausted".
- alert: UserKeyQuotaExhaustedSpike
  # Absolute rate threshold: more than 0.05 such results per second,
  # measured over 10m and sustained for a further 10m.
  expr: |
    sum(rate(user_key_chat_requests_total{result="quota_exhausted"}[10m])) > 0.05
  for: 10m
  annotations:
    summary: 'Quota exhausted events are rising'
    description: 'quota_exhausted responses are rising on the public user-key gateway path.'
  labels:
    severity: warning
    team: ops
|
||||
|
||||
# Chat requests rejected as unauthorized or with an invalid API key.
- alert: UserKeyAuthFailuresSpike
  # Both result values are summed together; routed to the security team.
  expr: |
    sum(rate(user_key_chat_requests_total{result=~"unauthorized|invalid_api_key"}[10m])) > 0.05
  for: 10m
  annotations:
    summary: 'User-key auth failures are rising'
    description: 'unauthorized/invalid_api_key outcomes are rising on the public gateway path.'
  labels:
    severity: warning
    team: security
|
||||
|
||||
# Share of route decisions that ended in a failover.
- alert: RouteFailoverShareHigh
  # Ratio of status="failover" decisions to all decisions over 10m.
  # The `and ... > 0` clause suppresses the rule when no decisions were made.
  expr: |
    (
      sum(rate(route_decisions_total{status="failover"}[10m]))
      /
      clamp_min(sum(rate(route_decisions_total[10m])), 0.001)
    ) > 0.20
    and sum(rate(route_decisions_total[10m])) > 0
  for: 10m
  annotations:
    summary: 'Route failover share is high'
    description: 'More than 20% of recent route decisions are failovers.'
  labels:
    severity: warning
    team: ops
|
||||
|
||||
# 活跃Provider数量告警
|
||||
- alert: NoActiveProviders
|
||||
expr: active_providers == 0
|
||||
for: 1m
|
||||
@@ -73,17 +129,6 @@ groups:
|
||||
summary: "No active providers"
|
||||
description: "There are no active providers configured. The system cannot route requests."
|
||||
|
||||
# Redundancy warning: some providers are active, but fewer than two.
- alert: LowActiveProviders
  # Zero is excluded on purpose: that case is already covered by the critical
  # NoActiveProviders rule (active_providers == 0), and a bare `< 2` would
  # raise this warning alongside it for the same outage.
  expr: active_providers > 0 and active_providers < 2
  for: 5m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: "Low number of active providers"
    description: "Only {{ $value }} active provider(s) detected. Consider adding more for redundancy."
|
||||
|
||||
# 活跃Host告警
|
||||
- alert: NoActiveHosts
|
||||
expr: active_hosts == 0
|
||||
for: 1m
|
||||
@@ -94,31 +139,6 @@ groups:
|
||||
summary: "No active hosts"
|
||||
description: "There are no active hosts. The system cannot import providers."
|
||||
|
||||
# Database connection alert: active connection gauge above 50.
- alert: HighDBConnections
  expr: db_connections_active > 50
  for: 5m
  annotations:
    summary: 'High database connection count'
    description: 'Active DB connections: {{ $value }}. Consider connection pool tuning.'
  labels:
    severity: warning
    team: ops
|
||||
|
||||
# Database write-volume alert.
# NOTE(review): the rule is named "...Errors", but the expression measures the
# rate of INSERT/UPDATE/DELETE operations, not failures - confirm the intent.
- alert: DBOperationErrors
  # Evaluated per series (no aggregation): each matching series is compared
  # against 100 ops/sec on its own.
  expr: |
    rate(db_operations_total{operation=~"INSERT|UPDATE|DELETE"}[5m])
    > 100
  for: 2m
  annotations:
    summary: 'High database write rate'
    description: 'DB write operations are above threshold: {{ $value }} ops/sec'
  labels:
    severity: warning
    team: ops
|
||||
|
||||
# 日志系统告警
|
||||
- alert: LogFlushErrors
|
||||
expr: rate(log_flush_errors_total[5m]) > 0
|
||||
for: 1m
|
||||
@@ -130,48 +150,17 @@ groups:
|
||||
description: "Log flush errors have been detected. Check log storage/backend."
|
||||
|
||||
# Log pipeline alert: events dropped at more than 10 per second over 5m.
- alert: LogDroppedEvents
  expr: rate(log_dropped_events_total[5m]) > 10
  for: 1m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: "Log events being dropped"
    description: "Log events are being dropped. Check log buffer capacity."
|
||||
|
||||
# Batch import failure alert.
# NOTE(review): the rule name refers to batch imports, but the expression reads
# route_decisions_total - confirm this is the intended metric.
- alert: BatchImportFailures
  # Both sides are aggregated with sum() before dividing. Without aggregation,
  # one-to-one vector matching pairs the status="failed" series only with
  # itself, so the ratio is always 1 (or NaN) rather than a failure share.
  # clamp_min keeps the denominator non-zero when there is no traffic.
  expr: |
    (
      sum(rate(route_decisions_total{status="failed"}[5m]))
      /
      clamp_min(sum(rate(route_decisions_total[5m])), 0.001)
    ) > 0.1
  for: 5m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: "High batch import failure rate"
    description: "Batch import failure rate is above 10%. Check provider configurations."
|
||||
|
||||
# API authentication failure alert: HTTP 401 responses.
- alert: AuthFailures
  # Evaluated per series (no aggregation): any single series whose 401 rate
  # exceeds 10 per second over 5m triggers the rule.
  expr: |
    rate(http_requests_total{status="401"}[5m]) > 10
  for: 2m
  annotations:
    summary: 'High authentication failure rate'
    description: 'Auth failures detected. Possible credential issues or attacks.'
  labels:
    severity: warning
    team: security
|
||||
|
||||
# 健康检查告警
|
||||
- alert: HealthCheckFailing
|
||||
expr: |
|
||||
http_requests_total{path="/healthz",status!="200"} > 0
|
||||
expr: http_requests_total{path="/healthz",status!="200"} > 0
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
Reference in New Issue
Block a user