# sub2api-cn-relay-manager/deploy/monitoring/prometheus-rules.yml

# Prometheus Alerting Rules for sub2api-cn-relay-manager
# Place this file in your Prometheus rules directory

groups:
  - name: sub2api-relay-manager-alerts
    interval: 30s
    rules:
      # Service availability: fires when the `up` series for the
      # "sub2api-relay-manager" job has been 0 for a full minute.
      - alert: ServiceDown
        expr: >-
          up{job="sub2api-relay-manager"} == 0
        for: 1m
        labels:
          team: ops
          severity: critical
        annotations:
          summary: 'sub2api-relay-manager service is down'
          description: >-
            The sub2api-relay-manager service has been down for more than
            1 minute.

      # HTTP error rate: share of requests answered with a 4xx or 5xx status
      # over the last 5 minutes. Both sides are summed across all series so the
      # ratio is a single global value. Note that client errors (4xx) count
      # towards the 5% threshold as well as server errors (5xx).
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5..|4.."}[5m]))
              / sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 2m
        labels:
          team: ops
          severity: warning
        annotations:
          summary: 'High error rate detected'
          description: >-
            Error rate is above 5% for more than 2 minutes.
            Current value: {{ $value | humanizePercentage }}

      # Request latency: 95th percentile of the request-duration histogram,
      # computed from bucket rates aggregated by `le` over 5 minutes, must stay
      # at or below 1 second.
      - alert: HighLatency
        expr: |
          histogram_quantile(
            0.95,
            sum by (le) (rate(http_request_duration_seconds_bucket[5m]))
          ) > 1.0
        for: 3m
        labels:
          team: ops
          severity: warning
        annotations:
          summary: 'High request latency'
          description: >-
            95th percentile latency is above 1 second for more than
            3 minutes.

      # Route failover spike: the 5-minute failover rate is compared against
      # twice a baseline. The baseline is the average of the 1-hour failover
      # rate, sampled every 5 minutes across the previous hour (subquery).
      # `*` binds tighter than `>`, so no extra parentheses are needed.
      - alert: RouteFailoverSpike
        expr: |
          rate(route_failovers_total[5m])
            > 2 * avg_over_time(rate(route_failovers_total[1h])[1h:5m])
        for: 1m
        labels:
          team: ops
          severity: warning
        annotations:
          summary: 'Route failover spike detected'
          description: >-
            Route failovers have spiked above normal levels.
            Current rate: {{ $value }}

      # Active provider count: critical when the active_providers gauge has
      # been 0 for one minute.
      - alert: NoActiveProviders
        expr: >-
          active_providers == 0
        for: 1m
        labels:
          team: ops
          severity: critical
        annotations:
          summary: 'No active providers'
          description: >-
            There are no active providers configured.
            The system cannot route requests.

      # Redundancy warning: at least one provider is active, but fewer than two.
      # The zero-provider case is excluded on purpose: NoActiveProviders already
      # reports it at critical severity, and a simultaneous "consider adding more
      # for redundancy" warning would only add noise during a full outage.
      # `and` keeps the left-hand sample, so {{ $value }} is the provider count.
      - alert: LowActiveProviders
        expr: active_providers > 0 and active_providers < 2
        for: 5m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "Low number of active providers"
          description: "Only {{ $value }} active provider(s) detected. Consider adding more for redundancy."

      # Active host count: critical when the active_hosts gauge has been 0 for
      # one minute.
      - alert: NoActiveHosts
        expr: >-
          active_hosts == 0
        for: 1m
        labels:
          team: ops
          severity: critical
        annotations:
          summary: 'No active hosts'
          description: >-
            There are no active hosts.
            The system cannot import providers.

      # Database connections: warns when db_connections_active stays above 50
      # for five minutes.
      - alert: HighDBConnections
        expr: >-
          db_connections_active > 50
        for: 5m
        labels:
          team: ops
          severity: warning
        annotations:
          summary: 'High database connection count'
          description: >-
            Active DB connections: {{ $value }}.
            Consider connection pool tuning.

      # Database write rate: warns when the per-series rate of INSERT, UPDATE
      # or DELETE operations exceeds 100 ops/sec for two minutes.
      # NOTE(review): the alert name says "Errors", but the expression measures
      # write throughput, not failures - confirm which one was intended.
      - alert: DBOperationErrors
        expr: >-
          rate(db_operations_total{operation=~"INSERT|UPDATE|DELETE"}[5m]) > 100
        for: 2m
        labels:
          team: ops
          severity: warning
        annotations:
          summary: 'High database write rate'
          description: >-
            DB write operations are above threshold: {{ $value }} ops/sec

      # Logging subsystem: warns when log_flush_errors_total has been
      # increasing (non-zero 5-minute rate) for one minute.
      - alert: LogFlushErrors
        expr: >-
          rate(log_flush_errors_total[5m]) > 0
        for: 1m
        labels:
          team: ops
          severity: warning
        annotations:
          summary: 'Log flush errors detected'
          description: >-
            Log flush errors have been detected.
            Check log storage/backend.

      # Logging subsystem: warns when more than 10 log events per second are
      # dropped (5-minute rate) for one minute.
      - alert: LogDroppedEvents
        expr: >-
          rate(log_dropped_events_total[5m]) > 10
        for: 1m
        labels:
          team: ops
          severity: warning
        annotations:
          summary: 'Log events being dropped'
          description: >-
            Log events are being dropped at {{ $value }} events/sec.
            Check log buffer capacity.

      # Batch import failure ratio: failed decisions as a share of all
      # decisions over the last 5 minutes.
      # Both sides are wrapped in sum() on purpose. Without aggregation the
      # division uses one-to-one vector matching on identical label sets, so
      # the status="failed" series would only ever be divided by themselves:
      # the ratio would be 1 whenever any failure occurred and the 10%
      # threshold would be meaningless.
      # NOTE(review): the expression reads route_decisions_total although the
      # alert is about batch imports - confirm this is the intended metric.
      - alert: BatchImportFailures
        expr: |
          (
            sum(rate(route_decisions_total{status="failed"}[5m]))
            /
            sum(rate(route_decisions_total[5m]))
          ) > 0.1
        for: 5m
        labels:
          severity: warning
          team: ops
        annotations:
          summary: "High batch import failure rate"
          description: "Batch import failure rate is above 10%. Check provider configurations."

      # API authentication failures: warns when HTTP 401 responses exceed
      # 10 per second (5-minute rate) for two minutes. The expression is not
      # aggregated, so the threshold is evaluated separately for each
      # http_requests_total series. Routed to the security team.
      - alert: AuthFailures
        expr: >-
          rate(http_requests_total{status="401"}[5m]) > 10
        for: 2m
        labels:
          team: security
          severity: warning
        annotations:
          summary: 'High authentication failure rate'
          description: >-
            Auth failures detected.
            Possible credential issues or attacks.

      # Health check: fires when /healthz has recently answered with a non-200
      # status.
      # http_requests_total is a cumulative counter, so comparing its raw value
      # with 0 would keep the alert firing forever after the first failure
      # (until the process restarts). increase() over a window only looks at
      # recent failures, which lets the alert resolve once /healthz recovers.
      # The 5m window matches the other rate windows in this group.
      - alert: HealthCheckFailing
        expr: |
          increase(http_requests_total{path="/healthz",status!="200"}[5m]) > 0
        for: 30s
        labels:
          severity: critical
          team: ops
        annotations:
          summary: "Health check failing"
          description: "The /healthz endpoint is returning non-200 status."