Files
user-system/scripts/ops/sre-daily-healthcheck.ps1

185 lines
7.2 KiB
PowerShell
Raw Normal View History

# SRE daily health-check script.
# Intended to run automatically every day and emit a system health report.
#
# Parameters:
#   -BaseURL        Root URL of the service being inspected.
#   -ReportDir      Directory that receives the generated Markdown report.
#   -AlertOnFailure Switch consulted at the end of the script together with
#                   the critical-failure counter to decide the exit code.
param(
    [string]$BaseURL = "http://localhost:8080",
    [string]$ReportDir = "docs\evidence\daily-health",
    [switch]$AlertOnFailure
)

$ErrorActionPreference = "Continue"

# Shared state: collected report rows plus running counters.
# Add-Check mutates these through the script: scope.
$report = @()
$totalChecks = $passedChecks = $criticalFailures = 0

# Timestamped report file name, e.g. HEALTH_CHECK_20240101-093000.md
$date = (Get-Date).ToString("yyyyMMdd-HHmmss")
$reportFile = "{0}\HEALTH_CHECK_{1}.md" -f $ReportDir, $date

# Make sure the report directory exists (-Force makes this a no-op when it already does).
$null = New-Item -ItemType Directory -Force -Path $ReportDir
# Records the outcome of one health check.
#
# Parameters:
#   $name       Display name of the check.
#   $status     "PASS", "WARN", or any other value (treated as a failure).
#   $detail     Free-form explanation shown next to the result.
#   $isCritical When true and the check failed, the critical counter is bumped.
#
# Side effects:
#   - always increments $script:totalChecks
#   - increments $script:passedChecks on PASS
#   - increments $script:criticalFailures on a critical failure
#   - appends one Markdown table row to $script:report
#   - echoes a one-line summary to the console
function Add-Check {
    param($name, $status, $detail, $isCritical = $false)

    $script:totalChecks++

    if ($status -eq "PASS") {
        $script:passedChecks++
        $icon = "✅"
    } elseif ($status -eq "WARN") {
        $icon = "⚠️"
    } else {
        $icon = "❌"
        if ($isCritical) { $script:criticalFailures++ }
    }

    # Details are often raw exception messages; a newline or an unescaped pipe
    # would split the Markdown table row, so flatten and escape them first.
    $cell = "$detail" -replace '\r?\n', ' ' -replace '\|', '\|'
    $script:report += "| $icon | $name | $status | $cell |"

    Write-Host " $icon $name : $status - $detail"
}
Write-Host "=== UMS SRE 日常健康巡检 ===" -ForegroundColor Cyan
Write-Host "时间: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')"
Write-Host "目标: $BaseURL"
Write-Host ""

# 1. Health-check endpoints
Write-Host "[1/6] 健康检查端点" -ForegroundColor Yellow

# Readiness probe: the JSON body is read for .status, .checks.database.status,
# the optional .checks.redis.status and the optional .uptime.
# Invoke-RestMethod throws on a non-success HTTP status, which lands in the catch.
try {
    $health = Invoke-RestMethod -Uri "$BaseURL/health/ready" -TimeoutSec 10
    $dbStatus = $health.checks.database.status

    Add-Check "服务就绪检查 /health/ready" "PASS" "状态: $($health.status)" $true
    Add-Check "数据库连接" $(if ($dbStatus -eq "UP") {"PASS"} else {"FAIL"}) "状态: $dbStatus" $true

    if ($health.checks.redis) {
        # Redis is non-critical: UNKNOWN is downgraded to a warning.
        $redisStatus = $health.checks.redis.status
        $redisResult = if ($redisStatus -eq "UP") {"PASS"} elseif ($redisStatus -eq "UNKNOWN") {"WARN"} else {"FAIL"}
        Add-Check "Redis 连接" $redisResult "状态: $redisStatus"
    }

    if ($health.uptime) {
        Add-Check "服务运行时间" "PASS" $health.uptime
    }
} catch {
    Add-Check "服务就绪检查 /health/ready" "FAIL" $_.Exception.Message $true
}

# Liveness probe.
# -UseBasicParsing keeps Windows PowerShell 5.1 from depending on the Internet
# Explorer engine, which is typically unavailable in unattended/scheduled runs.
try {
    $live = Invoke-WebRequest -Uri "$BaseURL/health/live" -UseBasicParsing -TimeoutSec 5
    Add-Check "存活检查 /health/live" $(if ($live.StatusCode -lt 300) {"PASS"} else {"FAIL"}) "HTTP $($live.StatusCode)" $true
} catch {
    Add-Check "存活检查 /health/live" "FAIL" $_.Exception.Message $true
}
# 2. Response time of key APIs
Write-Host "`n[2/6] 关键 API 响应时间" -ForegroundColor Yellow

# One entry per probed endpoint: request path, display name, latency budget in ms.
$criticalPaths = @(
    @{ path = "/api/v1/auth/capabilities"; desc = "认证能力接口"; threshold = 500 },
    @{ path = "/health";                   desc = "健康检查接口"; threshold = 100 }
)

foreach ($ep in $criticalPaths) {
    $label = "$($ep.desc) $($ep.path)"
    try {
        $timer = [System.Diagnostics.Stopwatch]::StartNew()
        $null = Invoke-RestMethod -Uri ($BaseURL + $ep.path) -TimeoutSec 5
        $timer.Stop()
        $elapsedMs = $timer.ElapsedMilliseconds

        # Within budget -> PASS; up to twice the budget -> WARN; beyond that -> FAIL.
        if ($elapsedMs -gt $ep.threshold * 2) {
            $verdict = "FAIL"
        } elseif ($elapsedMs -gt $ep.threshold) {
            $verdict = "WARN"
        } else {
            $verdict = "PASS"
        }

        Add-Check $label $verdict "${elapsedMs}ms (阈值: $($ep.threshold)ms)"
    } catch {
        # Any request error (timeout, connection failure, non-success status) is a failure.
        Add-Check $label "FAIL" $_.Exception.Message
    }
}
# 3. Prometheus metrics endpoint
Write-Host "`n[3/6] Prometheus 指标端点" -ForegroundColor Yellow
try {
    # -UseBasicParsing avoids the Internet Explorer dependency of Windows PowerShell 5.1.
    $metrics = Invoke-WebRequest -Uri "$BaseURL/metrics" -UseBasicParsing -TimeoutSec 5
    if ($metrics.StatusCode -eq 200) {
        # Look for the metric names in the response body.
        $content = $metrics.Content
        $hasHTTPMetrics = $content -match "http_requests_total"
        $hasDBMetrics = $content -match "db_query"

        Add-Check "指标端点 /metrics" "PASS" "HTTP $($metrics.StatusCode)"
        Add-Check "HTTP 请求指标" $(if ($hasHTTPMetrics) {"PASS"} else {"FAIL"}) $(if ($hasHTTPMetrics) {"存在 http_requests_total"} else {"缺少 http_requests_total — 需要接入 PrometheusMiddleware"})
        Add-Check "数据库指标" $(if ($hasDBMetrics) {"PASS"} else {"WARN"}) $(if ($hasDBMetrics) {"存在 db_query"} else {"缺少 db_query 指标"})
    } else {
        # A success status other than 200 (e.g. 204) carries no usable metrics;
        # record it instead of silently skipping the check.
        Add-Check "指标端点 /metrics" "FAIL" "HTTP $($metrics.StatusCode)" $true
    }
} catch {
    # Keep the original hint but also surface the real error, since a timeout
    # or refused connection is not the same problem as a missing route.
    Add-Check "指标端点 /metrics" "FAIL" "端点不可用 — P0 问题:需要在 router.go 注册 /metrics ($($_.Exception.Message))" $true
}
# 4. Rate-limit verification
# Sends a burst of login requests with deliberately invalid credentials and
# counts how many of them are answered with HTTP 429.
Write-Host "`n[4/6] 速率限制功能验证" -ForegroundColor Yellow
$rateLimitTriggered = $false
$rlTotal = 0; $rl429 = 0
$body = '{"account":"sre_healthcheck","password":"invalid_test_pwd"}'

foreach ($attempt in 1..10) {
    $rlTotal++
    $statusCode = 0
    try {
        $resp = Invoke-WebRequest -Uri "$BaseURL/api/v1/auth/login" -Method POST -Body $body -ContentType "application/json" -UseBasicParsing -TimeoutSec 3
        $statusCode = [int]$resp.StatusCode
    } catch {
        # Invoke-WebRequest raises an error for non-success statuses, so 401 and
        # 429 responses arrive here; the status lives on the exception's Response.
        # Errors without a response (timeout, refused connection) leave the code at 0.
        $errorResponse = $_.Exception.Response
        if ($errorResponse) { $statusCode = [int]$errorResponse.StatusCode }
    }

    if ($statusCode -eq 429) {
        $rl429++
        $rateLimitTriggered = $true
    }
}

Add-Check "速率限制功能" $(if ($rateLimitTriggered) {"PASS"} else {"WARN"}) "${rlTotal} 次请求中触发 ${rl429} 次 429$(if (-not $rateLimitTriggered) {' (10次内未触发可能需要更多请求)'})"
# 5. Swagger documentation (non-blocking: problems are reported as WARN only)
Write-Host "`n[5/6] API 文档" -ForegroundColor Yellow
try {
    # The target is an HTML page; -UseBasicParsing skips the Internet Explorer
    # based DOM parsing that Windows PowerShell 5.1 would otherwise attempt and
    # that fails when the IE engine is unavailable (e.g. unattended runs).
    $swagger = Invoke-WebRequest -Uri "$BaseURL/swagger/index.html" -UseBasicParsing -TimeoutSec 5
    Add-Check "Swagger 文档" $(if ($swagger.StatusCode -eq 200) {"PASS"} else {"WARN"}) "HTTP $($swagger.StatusCode)"
} catch {
    Add-Check "Swagger 文档" "WARN" "不可访问(非阻塞)"
}
# 6. Configuration sanity checks
# Scans the raw text of config\config.yaml (relative to the current directory)
# for a placeholder JWT secret and for a SQLite database type.
Write-Host "`n[6/6] 配置健全性" -ForegroundColor Yellow
$configFile = "config\config.yaml"

if (-not (Test-Path $configFile)) {
    Add-Check "配置文件" "WARN" "config.yaml 不存在,可能使用环境变量配置"
} else {
    $config = Get-Content $configFile -Raw
    $hasDefaultJWT = $config -match "change-me-in-production"
    $isSQLite = $config -match "type: sqlite"

    # A placeholder secret is a failure, and the same flag marks it as critical.
    if ($hasDefaultJWT) {
        Add-Check "JWT Secret 配置" "FAIL" "使用默认 Secret — 生产环境必须替换!" $hasDefaultJWT
    } else {
        Add-Check "JWT Secret 配置" "PASS" "已自定义" $hasDefaultJWT
    }

    # SQLite only produces a warning.
    if ($isSQLite) {
        Add-Check "数据库类型" "WARN" "SQLite — 生产环境应迁移至 PostgreSQL"
    } else {
        Add-Check "数据库类型" "PASS" "PostgreSQL/MySQL"
    }
}
# Generate the report
# Pass rate is a percentage rounded to one decimal; Max(..., 1) guards against
# division by zero when no check was recorded.
$passRate = [math]::Round($passedChecks / [math]::Max($totalChecks, 1) * 100, 1)
$overallStatus = if ($criticalFailures -gt 0) {"🔴 CRITICAL"} elseif ($passedChecks -lt $totalChecks) {"🟡 DEGRADED"} else {"🟢 HEALTHY"}

# Follow-up text mirrors the overall status.
$followUp = if ($criticalFailures -gt 0) {
    "⚠️ **存在 $criticalFailures 个严重问题,需立即处理!**"
} elseif ($passedChecks -lt $totalChecks) {
    "📋 存在非严重警告,请在工作时间内跟进。"
} else {
    "✅ 所有检查通过,系统健康。"
}

# Blank lines separate the Markdown blocks. In particular, the one before '---'
# is required: a paragraph immediately followed by '---' is parsed as a
# setext-style heading rather than a paragraph plus a horizontal rule.
$mdReport = @"
# UMS 日常健康巡检报告

- **检查时间**: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')
- **服务地址**: $BaseURL
- **总体状态**: $overallStatus
- **通过率**: ${passedChecks}/${totalChecks} ($passRate%)
- **严重失败**: $criticalFailures

## 检查详情

| 状态 | 检查项 | 结果 | 说明 |
|------|--------|------|------|
$($report -join "`n")

## 后续行动

$followUp

---

*由 scripts/ops/sre-daily-healthcheck.ps1 自动生成*
"@

# Write the report; -ErrorAction Stop turns a write failure into a catchable
# error so the summary does not claim the file was saved when it was not.
$reportSaved = $true
try {
    $mdReport | Set-Content -Path $reportFile -Encoding UTF8 -ErrorAction Stop
} catch {
    $reportSaved = $false
    $reportError = $_.Exception.Message
}

Write-Host "`n=== 巡检汇总 ===" -ForegroundColor Cyan
Write-Host "总体状态: $overallStatus"
Write-Host "通过率: ${passedChecks}/${totalChecks} ($passRate%)"
if ($reportSaved) {
    Write-Host "报告已保存至: $reportFile"
} else {
    Write-Host "报告保存失败: $reportFile ($reportError)" -ForegroundColor Red
}

# Non-zero exit only when critical failures exist AND the caller asked for alerting.
if ($criticalFailures -gt 0 -and $AlertOnFailure) {
    Write-Host "`n⚠️ 存在严重问题,应触发告警通知!" -ForegroundColor Red
    exit 1
}
exit 0