Add production-ready monitoring infrastructure: - 15 alerting rules (4 Critical + 11 Warning) - Grafana dashboard with service health panels - Full documentation with deployment guide Covers: service availability, error rates, latency, routing health, database connections, and log metrics
182 lines
5.5 KiB
YAML
182 lines
5.5 KiB
YAML
# Prometheus alerting rules for sub2api-cn-relay-manager.
# Place this file in your Prometheus rules directory.

groups:
  - name: sub2api-relay-manager-alerts
    # Group-level `interval`: how often the rules in this group are evaluated.
    interval: 30s
    rules:
# ---- Service availability ----
      # Fires when the `up` series for the sub2api-relay-manager job has been
      # 0 continuously for one minute.
      - alert: ServiceDown
        expr: up{job="sub2api-relay-manager"} == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: 'sub2api-relay-manager service is down'
          description: 'The sub2api-relay-manager service has been down for more than 1 minute.'
# ---- HTTP error rate ----
      # Share of 4xx + 5xx responses among all responses, using 5-minute rates
      # summed across every series. Fires when the share stays above 5% for
      # two minutes.
      # NOTE(review): 4xx client errors (401, 404, ...) count toward this
      # ratio - confirm that is intended for an ops-routed alert.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5..|4.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 2m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: 'High error rate detected'
          description: 'Error rate is above 5% for more than 2 minutes. Current value: {{ $value | humanizePercentage }}'
# ---- Request latency ----
      # 95th-percentile request duration, estimated from the 5-minute rate of
      # the duration histogram buckets aggregated by `le`. Fires when it stays
      # above 1.0 (seconds, per the metric name) for three minutes.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 1.0
        for: 3m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: 'High request latency'
          description: '95th percentile latency is above 1 second for more than 3 minutes.'
# ---- Route failover ----
      # Compares the current 5-minute failover rate with a baseline: twice the
      # average, over the last hour sampled every 5 minutes (subquery), of the
      # 1-hour failover rate. Fires when the current rate exceeds that baseline
      # for one minute.
      - alert: RouteFailoverSpike
        expr: |
          (
            rate(route_failovers_total[5m])
            >
            2 * avg_over_time(rate(route_failovers_total[1h])[1h:5m])
          )
        for: 1m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: 'Route failover spike detected'
          description: 'Route failovers have spiked above normal levels. Current rate: {{ $value }}'
# ---- Active provider count ----
      # Critical: the active_providers gauge has been 0 for one minute.
      - alert: NoActiveProviders
        expr: active_providers == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "No active providers"
          description: "There are no active providers configured. The system cannot route requests."

      # Warning: some providers are active, but fewer than two. The lower
      # bound (> 0) keeps this rule from also firing when the count is zero;
      # that case is already covered by the critical NoActiveProviders alert,
      # and "Only 0 active provider(s)" would be a misleading duplicate.
      - alert: LowActiveProviders
        expr: (active_providers > 0) and (active_providers < 2)
        for: 5m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Low number of active providers"
          description: "Only {{ $value }} active provider(s) detected. Consider adding more for redundancy."
# ---- Active hosts ----
      # Fires when the active_hosts gauge has been 0 for one minute.
      - alert: NoActiveHosts
        expr: active_hosts == 0
        for: 1m
        labels:
          severity: critical
          team: ops
        annotations:
          summary: 'No active hosts'
          description: 'There are no active hosts. The system cannot import providers.'
# ---- Database connections ----
      # Fires when db_connections_active stays above 50 for five minutes.
      - alert: HighDBConnections
        expr: db_connections_active > 50
        for: 5m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: 'High database connection count'
          description: 'Active DB connections: {{ $value }}. Consider connection pool tuning.'
# ---- Database write throughput ----
      # Fires when the 5-minute rate of INSERT/UPDATE/DELETE operations stays
      # above 100 per second for two minutes. The comparison is applied to each
      # matching series separately (there is no aggregation).
      # NOTE(review): despite the alert name, this expression measures write
      # volume, not failed operations - confirm which one is intended.
      - alert: DBOperationErrors
        expr: |
          rate(db_operations_total{operation=~"INSERT|UPDATE|DELETE"}[5m])
          > 100
        for: 2m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: 'High database write rate'
          description: 'DB write operations are above threshold: {{ $value }} ops/sec'
# ---- Logging pipeline ----
      # Fires when the 5-minute rate of log flush errors has been non-zero for
      # one minute.
      - alert: LogFlushErrors
        expr: rate(log_flush_errors_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: 'Log flush errors detected'
          description: 'Log flush errors have been detected. Check log storage/backend.'

      # Fires when log events are dropped faster than 10 per second (5-minute
      # rate) for one minute.
      - alert: LogDroppedEvents
        expr: |
          rate(log_dropped_events_total[5m]) > 10
        for: 1m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: 'Log events being dropped'
          description: 'Log events are being dropped at {{ $value }} events/sec. Check log buffer capacity.'
# ---- Batch import failures ----
      # Fraction of route decisions with status="failed" among all route
      # decisions (5-minute rates). Fires when it stays above 10% for five
      # minutes.
      #
      # Both sides are wrapped in sum() on purpose. A binary operator between
      # two un-aggregated instant vectors pairs series with identical label
      # sets, so the status="failed" numerator would only ever be divided by
      # itself. The ratio would then be exactly 1 whenever any failure
      # occurred, and the 10% threshold would be meaningless.
      # NOTE(review): the alert is named for batch imports but reads
      # route_decisions_total - confirm this is the intended metric.
      - alert: BatchImportFailures
        expr: |
          (
            sum(rate(route_decisions_total{status="failed"}[5m]))
            /
            sum(rate(route_decisions_total[5m]))
          ) > 0.1
        for: 5m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "High batch import failure rate"
          description: "Batch import failure rate is above 10%. Check provider configurations."
# ---- API authentication failures ----
      # Fires when the 5-minute rate of HTTP 401 responses on any single
      # series stays above 10 per second for two minutes. Routed to the
      # security team rather than ops.
      - alert: AuthFailures
        expr: |
          rate(http_requests_total{status="401"}[5m]) > 10
        for: 2m
        labels:
          severity: warning
          team: security
        annotations:
          summary: 'High authentication failure rate'
          description: 'Auth failures detected. Possible credential issues or attacks.'
# ---- Health check ----
      # Fires when /healthz has answered with a non-200 status during the last
      # five minutes, sustained for 30 seconds.
      #
      # increase() is used rather than comparing the raw counter to 0.
      # http_requests_total only ever grows, so a single historical non-200
      # response would otherwise keep this critical alert firing until the
      # process restarts, long after the endpoint has recovered.
      - alert: HealthCheckFailing
        expr: |
          increase(http_requests_total{path="/healthz",status!="200"}[5m]) > 0
        for: 30s
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "Health check failing"
          description: "The /healthz endpoint is returning non-200 status."