Files
sub2api-cn-relay-manager/deploy/monitoring/prometheus-rules.yml
phamnazage-jpg f6600d663a
Some checks failed
CI / Build & Test (push) Has been cancelled
CI / Lint (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / Docker Build (push) Has been cancelled
CI / Release (push) Has been cancelled
feat(monitoring): add complete Prometheus + Grafana monitoring stack
Add production-ready monitoring infrastructure:
- 15 alerting rules (4 Critical + 11 Warning)
- Grafana dashboard with service health panels
- Full documentation with deployment guide

Covers: service availability, error rates, latency,
routing health, database connections, and log metrics
2026-06-02 19:54:38 +08:00

182 lines
5.5 KiB
YAML

# Alerting rules for the sub2api-cn-relay-manager service.
# Install this file in the rules directory that your Prometheus server loads.
groups:
  - name: sub2api-relay-manager-alerts
    # Every rule in this group is evaluated on a 30-second cycle.
    interval: 30s
    rules:
# Service availability: the scrape target of job "sub2api-relay-manager"
# has reported up == 0 continuously for one minute.
- alert: ServiceDown
  expr: 'up{job="sub2api-relay-manager"} == 0'
  for: 1m
  labels:
    severity: critical
    team: ops
  annotations:
    summary: 'sub2api-relay-manager service is down'
    description: >-
      The sub2api-relay-manager service has been down for more than
      1 minute.
# HTTP error ratio: share of requests answered with a 4xx or 5xx status
# (both classes are counted as errors by the status regex), summed over
# all http_requests_total series and measured on a 5-minute rate.
# Fires once the ratio has stayed above 5% for two minutes.
- alert: HighErrorRate
  expr: |
    (
      sum(rate(http_requests_total{status=~"5..|4.."}[5m]))
      /
      sum(rate(http_requests_total[5m]))
    ) > 0.05
  for: 2m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: 'High error rate detected'
    description: >-
      Error rate is above 5% for more than 2 minutes.
      Current value: {{ $value | humanizePercentage }}
# Request latency: 95th percentile of http_request_duration_seconds,
# computed from the 5-minute bucket rates aggregated by `le`.
# Fires once the quantile has stayed above 1.0 for three minutes.
- alert: HighLatency
  expr: |
    histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
    ) > 1.0
  for: 3m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: 'High request latency'
    description: >-
      95th percentile latency is above 1 second for more than
      3 minutes.
# Route failover spike: the 5-minute failover rate is compared against
# twice a baseline. The baseline is the 1-hour failover rate, sampled
# every 5 minutes over the past hour and averaged.
# NOTE(review): when the baseline is 0, any non-zero failover rate
# satisfies the comparison — confirm that sensitivity is intended.
- alert: RouteFailoverSpike
  expr: |
    (
      rate(route_failovers_total[5m])
      >
      2 * avg_over_time(rate(route_failovers_total[1h])[1h:5m])
    )
  for: 1m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: 'Route failover spike detected'
    description: >-
      Route failovers have spiked above normal levels.
      Current rate: {{ $value }}
# Active provider count: the active_providers gauge has read exactly 0
# for one minute.
- alert: NoActiveProviders
  expr: 'active_providers == 0'
  for: 1m
  labels:
    severity: critical
    team: ops
  annotations:
    summary: 'No active providers'
    description: >-
      There are no active providers configured.
      The system cannot route requests.
# Provider redundancy: the active_providers gauge has stayed below 2 for
# five minutes.
# NOTE(review): the condition is also true at 0, so this warning overlaps
# with the critical NoActiveProviders alert — consider an Alertmanager
# inhibit rule if the duplicate notification is unwanted.
- alert: LowActiveProviders
  expr: 'active_providers < 2'
  for: 5m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: 'Low number of active providers'
    description: >-
      Only {{ $value }} active provider(s) detected.
      Consider adding more for redundancy.
# Active host count: the active_hosts gauge has read exactly 0 for one
# minute.
- alert: NoActiveHosts
  expr: 'active_hosts == 0'
  for: 1m
  labels:
    severity: critical
    team: ops
  annotations:
    summary: 'No active hosts'
    description: >-
      There are no active hosts.
      The system cannot import providers.
# Database connections: db_connections_active has stayed above 50 for
# five minutes.
- alert: HighDBConnections
  expr: 'db_connections_active > 50'
  for: 5m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: 'High database connection count'
    description: >-
      Active DB connections: {{ $value }}.
      Consider connection pool tuning.
# Database write volume: despite the alert name, the expression tracks
# the per-second rate of INSERT/UPDATE/DELETE operations (5-minute
# window), not failures. Fires when any single series has stayed above
# 100 for two minutes.
- alert: DBOperationErrors
  expr: |
    rate(db_operations_total{operation=~"INSERT|UPDATE|DELETE"}[5m])
      > 100
  for: 2m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: 'High database write rate'
    description: >-
      DB write operations are above threshold:
      {{ $value }} ops/sec
# Logging pipeline: log_flush_errors_total has been increasing (5-minute
# rate above 0) for one minute.
- alert: LogFlushErrors
  expr: 'rate(log_flush_errors_total[5m]) > 0'
  for: 1m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: 'Log flush errors detected'
    description: >-
      Log flush errors have been detected.
      Check log storage/backend.
# Logging pipeline: the 5-minute rate of log_dropped_events_total has
# stayed above 10 per second for one minute.
- alert: LogDroppedEvents
  expr: |
    rate(log_dropped_events_total[5m]) > 10
  for: 1m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: 'Log events being dropped'
    description: >-
      Log events are being dropped at {{ $value }} events/sec.
      Check log buffer capacity.
# Batch import failures: fraction of route_decisions_total increments
# carrying status="failed", measured on a 5-minute rate. Fires once the
# fraction has stayed above 10% for five minutes.
#
# Both sides of the division are wrapped in sum(). Without aggregation
# PromQL pairs series one-to-one on identical label sets, so every
# status="failed" series would be divided by itself and the ratio would
# always be 1, firing on any failure regardless of total volume.
#
# NOTE(review): the alert is named for batch imports but reads
# route_decisions_total — confirm this is the intended metric.
- alert: BatchImportFailures
  expr: |
    (
      sum(rate(route_decisions_total{status="failed"}[5m]))
      /
      sum(rate(route_decisions_total[5m]))
    ) > 0.1
  for: 5m
  labels:
    severity: warning
    team: ops
  annotations:
    summary: 'High batch import failure rate'
    description: >-
      Batch import failure rate is above 10%.
      Check provider configurations.
# API authentication failures: 5-minute rate of requests answered with
# status 401. The threshold of 10 per second is applied to each
# http_requests_total series separately (no aggregation) and must hold
# for two minutes. Labelled for the security team rather than ops.
- alert: AuthFailures
  expr: |
    rate(http_requests_total{status="401"}[5m]) > 10
  for: 2m
  labels:
    severity: warning
    team: security
  annotations:
    summary: 'High authentication failure rate'
    description: >-
      Auth failures detected.
      Possible credential issues or attacks.
# Health check: /healthz has answered with a non-200 status at some
# point in the last five minutes, and that has held for 30 seconds.
#
# http_requests_total is a cumulative counter, so comparing its raw
# value with 0 would keep the alert firing from the first failure until
# the process restarts. increase() looks only at growth inside the
# window, so the alert clears once failures stop (up to five minutes
# after the last one).
- alert: HealthCheckFailing
  expr: |
    increase(http_requests_total{path="/healthz",status!="200"}[5m]) > 0
  for: 30s
  labels:
    severity: critical
    team: ops
  annotations:
    summary: 'Health check failing'
    description: 'The /healthz endpoint is returning non-200 status.'