# Prometheus alerting rules for the user-management service.
groups:
  # Single rule group holding every alert below; `interval` overrides the
  # global evaluation interval so these rules are evaluated every 30 seconds.
  - name: user-ms-alerts
    interval: 30s
    rules:
# High error rate alert.
# Fires when responses with a 5xx status make up more than 5% of all HTTP
# requests (5m rate) and the condition holds for 5 minutes.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高错误率告警"
          description: "过去5分钟错误率超过5%,当前值: {{ $value | humanizePercentage }}"

# High response time alert.
# Fires when the 95th-percentile request duration, computed per `path` from
# the histogram buckets (5m rate), stays above 1 for 5 minutes.
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, path)
          ) > 1
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高响应时间告警"
          description: "API P95响应时间超过1秒,路径: {{ $labels.path }},当前值: {{ $value }}s"

# Low cache hit rate alert.
# Fires when cache hits divided by cache operations (both as 5m rates) stays
# below 0.7 for 10 minutes.
      - alert: LowCacheHitRate
        expr: |
          (
            sum(rate(cache_hits_total[5m]))
            /
            sum(rate(cache_operations_total[5m]))
          ) < 0.7
        for: 10m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "低缓存命中率告警"
          description: "缓存命中率低于70%,当前值: {{ $value | humanizePercentage }}"

# CPU usage alert.
# Fires when the 5m rate of process_cpu_seconds_total stays above 0.8 for
# 5 minutes.
# NOTE(review): the rate of a CPU-seconds counter is presumably "cores in
# use", so 0.8 means 80% of one core, not of the whole host - confirm this is
# the intended threshold for multi-core deployments.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高CPU使用率告警"
          description: "CPU使用率超过80%,当前值: {{ $value | humanizePercentage }}"

# Memory usage alert.
# Fires when system_memory_usage_bytes divided by node_memory_MemTotal_bytes
# stays above 0.85 for 5 minutes.
# NOTE(review): the division matches series one-to-one on identical label
# sets. The two metric names suggest different exporters; confirm both series
# carry the same labels, otherwise the expression returns nothing and the
# alert can never fire.
      - alert: HighMemoryUsage
        expr: |
          (
            system_memory_usage_bytes /
            (node_memory_MemTotal_bytes)
          ) > 0.85
        for: 5m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "高内存使用率告警"
          description: "内存使用率超过85%,当前值: {{ $value | humanizePercentage }}"

# Database connection pool alert.
# Fires when db_connections_active divided by db_connections_max stays above
# 0.9 for 3 minutes.
      - alert: DatabaseConnectionPoolExhausted
        expr: |
          (
            db_connections_active /
            db_connections_max
          ) > 0.9
        for: 3m
        labels:
          severity: critical
          service: user-management
        annotations:
          summary: "数据库连接池耗尽告警"
          description: "数据库连接池使用率超过90%,当前值: {{ $value | humanizePercentage }}"

# Online user count alert.
# Fires when the active_users series labelled period="5m" stays below 10 for
# 30 minutes; informational severity only.
      - alert: LowOnlineUsers
        expr: active_users{period="5m"} < 10
        for: 30m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "在线用户数告警"
          description: "过去5分钟活跃用户数低于10,当前值: {{ $value }}"

# Login failure rate alert.
# Fires when logins with status="failed" make up more than 30% of all logins
# (5m rate) for 5 minutes.
      - alert: HighLoginFailureRate
        expr: |
          (
            sum(rate(user_logins_total{status="failed"}[5m]))
            /
            sum(rate(user_logins_total[5m]))
          ) > 0.3
        for: 5m
        labels:
          severity: warning
          service: user-management
        annotations:
          summary: "高登录失败率告警"
          description: "登录失败率超过30%,可能存在暴力破解,当前值: {{ $value | humanizePercentage }}"

# Abnormal API request rate (QPS) alert.
# Fires when the current total request rate deviates from its 1-hour average
# by more than 50% (in either direction) for 5 minutes.
# The 1-hour average uses avg_over_time over a subquery (`[1h:]`), because
# PromQL has no `avg(... over 1h)` form; the subquery re-evaluates the inner
# sum(rate(...)) across the last hour at the default resolution.
      - alert: UnusualAPIRequestRate
        expr: |
          abs(
            sum(rate(http_requests_total[5m]))
            -
            avg_over_time(sum(rate(http_requests_total[5m]))[1h:])
          ) / avg_over_time(sum(rate(http_requests_total[5m]))[1h:]) > 0.5
        for: 5m
        labels:
          severity: info
          service: user-management
        annotations:
          summary: "API请求量异常告警"
          description: "API请求量与1小时平均值偏差超过50%,当前值: {{ $value | humanizePercentage }}"