# Prometheus alerting rules for the user-management service.
#
# One rule group; every rule carries `severity` and `service` labels plus
# `summary` / `description` annotations. Annotation text is rendered with
# Prometheus templating (`$value`, `$labels`).
groups:
  - name: user-ms-alerts
    # Evaluate every rule in this group every 30 seconds.
    interval: 30s
    rules:
      # High error rate: 5xx responses as a share of all HTTP requests
      # (5-minute rate) stays above 5% for 5 minutes.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高错误率告警"
          description: "过去5分钟错误率超过5%,当前值: {{ $value | humanizePercentage }}"

      # High response time: the 0.95 quantile of request duration, computed
      # per `path`, stays above 1 second for 5 minutes. `path` is kept in the
      # aggregation so the description can reference `$labels.path`.
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
          ) > 1
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高响应时间告警"
          description: "API P95响应时间超过1秒,路径: {{ $labels.path }},当前值: {{ $value }}s"

      # Low cache hit rate: cache hits as a share of all cache operations
      # (5-minute rate) stays below 70% for 10 minutes.
      - alert: LowCacheHitRate
        expr: |
          (
            sum(rate(cache_hits_total[5m]))
            /
            sum(rate(cache_operations_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "低缓存命中率告警"
          description: "缓存命中率低于70%,当前值: {{ $value | humanizePercentage }}"

      # High CPU usage: per-series rate of consumed CPU seconds above 0.8.
      # NOTE(review): this is CPU-seconds per second for each series, so 0.8
      # corresponds to 80% of one core, not of the whole host - confirm that
      # this is the intended threshold on multi-core deployments.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高CPU使用率告警"
          description: "CPU使用率超过80%,当前值: {{ $value | humanizePercentage }}"

      # High memory usage: used bytes divided by total bytes above 85%.
      # NOTE(review): the two operands are different metrics; the division
      # only yields samples for series whose label sets match exactly.
      # Confirm both carry the same labels, otherwise an on(...) /
      # ignoring(...) matcher is needed.
      - alert: HighMemoryUsage
        expr: |
          (
            system_memory_usage_bytes
            /
            (node_memory_MemTotal_bytes)
          ) > 0.85
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高内存使用率告警"
          description: "内存使用率超过85%,当前值: {{ $value | humanizePercentage }}"

      # Database connection pool: active connections above 90% of the
      # configured maximum for 3 minutes.
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          (
            db_connections_active
            /
            db_connections_max
          ) > 0.9
        for: 3m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "数据库连接池耗尽告警"
          description: "数据库连接池使用率超过90%,当前值: {{ $value | humanizePercentage }}"

      # Low online users: the `period="5m"` active-user gauge stays below 10
      # for 30 minutes. Informational only.
      - alert: LowOnlineUsers
        expr: active_users{period="5m"} < 10
        for: 30m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "在线用户数告警"
          description: "过去5分钟活跃用户数低于10,当前值: {{ $value }}"

      # High login failure rate: failed logins as a share of all logins
      # (5-minute rate) stays above 30% for 5 minutes.
      - alert: HighLoginFailureRate
        expr: |
          (
            sum(rate(user_logins_total{status="failed"}[5m]))
            /
            sum(rate(user_logins_total[5m]))
          ) > 0.3
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高登录失败率告警"
          description: "登录失败率超过30%,可能存在暴力破解,当前值: {{ $value | humanizePercentage }}"

      # Unusual API request rate: the current request rate deviates from its
      # one-hour average by more than 50% (relative).
      # The one-hour baseline is a subquery (`[1h:]`) passed to
      # avg_over_time(); aggregation operators such as avg() cannot take a
      # time window.
      - alert: UnusualAPIRequestRate
        expr: |
          abs(
            sum(rate(http_requests_total[5m]))
            -
            avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          )
          /
          avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          > 0.5
        for: 5m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "API请求量异常告警"
          description: "API请求量与1小时平均值偏差超过50%,当前值: {{ $value | humanizePercentage }}"