<#
.SYNOPSIS
    SRE daily health inspection script.

.DESCRIPTION
    Intended to run automatically every day. Probes the service's health
    endpoints, key API latencies, the Prometheus metrics endpoint, login rate
    limiting, the Swagger page and the local config file, then writes a
    Markdown report summarising the results.

.PARAMETER BaseURL
    Base URL of the service under inspection.

.PARAMETER ReportDir
    Directory in which the Markdown report is written (created if missing).

.PARAMETER AlertOnFailure
    When set, the script exits with code 1 if any critical check failed.
#>
param(
    [string]$BaseURL = "http://localhost:8080",
    [string]$ReportDir = "docs\evidence\daily-health",
    [switch]$AlertOnFailure
)

# Keep going after a failed probe so that every check gets recorded.
$ErrorActionPreference = "Continue"

$date = Get-Date -Format "yyyyMMdd-HHmmss"
$reportFile = Join-Path $ReportDir "HEALTH_CHECK_$date.md"

# Make sure the report directory exists.
New-Item -ItemType Directory -Force -Path $ReportDir | Out-Null

# Script-level accumulators updated by Add-Check.
$report = @()
$totalChecks = 0
$passedChecks = 0
$criticalFailures = 0

# Records one check result: updates the counters, appends a Markdown table row
# to $script:report and echoes the result to the console.
#   $name       - display name of the check
#   $status     - "PASS", "WARN", or anything else (treated as a failure)
#   $detail     - free-form detail text shown in the last column
#   $isCritical - when true, a failure increments $script:criticalFailures
function Add-Check {
    param($name, $status, $detail, $isCritical = $false)

    $script:totalChecks++
    if ($status -eq "PASS") {
        $script:passedChecks++
        $icon = "✅"
    } elseif ($status -eq "WARN") {
        $icon = "⚠️"
    } else {
        $icon = "❌"
        if ($isCritical) { $script:criticalFailures++ }
    }

    $line = "| $icon | $name | $status | $detail |"
    $script:report += $line
    Write-Host " $icon $name : $status — $detail"
}

Write-Host "=== UMS SRE 日常健康巡检 ===" -ForegroundColor Cyan
Write-Host "时间: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')"
Write-Host "目标: $BaseURL"
Write-Host ""

# 1. Health check endpoints
Write-Host "[1/6] 健康检查端点" -ForegroundColor Yellow
try {
    $health = Invoke-RestMethod -Uri "$BaseURL/health/ready" -TimeoutSec 10
    $dbStatus = $health.checks.database.status
    Add-Check "服务就绪检查 /health/ready" "PASS" "状态: $($health.status)" $true

    $dbResult = if ($dbStatus -eq "UP") { "PASS" } else { "FAIL" }
    Add-Check "数据库连接" $dbResult "状态: $dbStatus" $true

    # Redis is optional in the readiness payload; UNKNOWN is only a warning.
    if ($health.checks.redis) {
        $redisStatus = $health.checks.redis.status
        $redisResult = if ($redisStatus -eq "UP") {
            "PASS"
        } elseif ($redisStatus -eq "UNKNOWN") {
            "WARN"
        } else {
            "FAIL"
        }
        Add-Check "Redis 连接" $redisResult "状态: $redisStatus"
    }

    if ($health.uptime) {
        Add-Check "服务运行时间" "PASS" $health.uptime
    }
} catch {
    Add-Check "服务就绪检查 /health/ready" "FAIL" $_.Exception.Message $true
}

try {
    $live = Invoke-WebRequest -Uri "$BaseURL/health/live" -TimeoutSec 5
    $liveResult = if ($live.StatusCode -lt 300) { "PASS" } else { "FAIL" }
    Add-Check "存活检查 /health/live" $liveResult "HTTP $($live.StatusCode)" $true
} catch {
    Add-Check "存活检查 /health/live" "FAIL" $_.Exception.Message $true
}

# 2. Response time of key APIs
Write-Host "`n[2/6] 关键 API 响应时间" -ForegroundColor Yellow
$criticalPaths = @(
    @{path="/api/v1/auth/capabilities"; desc="认证能力接口"; threshold=500},
    @{path="/health"; desc="健康检查接口"; threshold=100}
)
foreach ($ep in $criticalPaths) {
    try {
        $sw = [System.Diagnostics.Stopwatch]::StartNew()
        Invoke-RestMethod -Uri "$BaseURL$($ep.path)" -TimeoutSec 5 | Out-Null
        $sw.Stop()
        $ms = $sw.ElapsedMilliseconds

        # Within threshold: PASS; up to twice the threshold: WARN; beyond: FAIL.
        $status = if ($ms -le $ep.threshold) {
            "PASS"
        } elseif ($ms -le $ep.threshold * 2) {
            "WARN"
        } else {
            "FAIL"
        }
        Add-Check "$($ep.desc) $($ep.path)" $status "${ms}ms (阈值: $($ep.threshold)ms)"
    } catch {
        Add-Check "$($ep.desc) $($ep.path)" "FAIL" $_.Exception.Message
    }
}

# 3. Prometheus metrics endpoint
Write-Host "`n[3/6] Prometheus 指标端点" -ForegroundColor Yellow
try {
    $metrics = Invoke-WebRequest -Uri "$BaseURL/metrics" -TimeoutSec 5
    if ($metrics.StatusCode -eq 200) {
        $content = $metrics.Content
        $hasHTTPMetrics = $content -match "http_requests_total"
        $hasDBMetrics = $content -match "db_query"

        Add-Check "指标端点 /metrics" "PASS" "HTTP $($metrics.StatusCode)"

        $httpMetricResult = if ($hasHTTPMetrics) { "PASS" } else { "FAIL" }
        $httpMetricDetail = if ($hasHTTPMetrics) {
            "存在 http_requests_total"
        } else {
            "缺少 http_requests_total — 需要接入 PrometheusMiddleware"
        }
        Add-Check "HTTP 请求指标" $httpMetricResult $httpMetricDetail

        $dbMetricResult = if ($hasDBMetrics) { "PASS" } else { "WARN" }
        $dbMetricDetail = if ($hasDBMetrics) { "存在 db_query" } else { "缺少 db_query 指标" }
        Add-Check "数据库指标" $dbMetricResult $dbMetricDetail
    }
} catch {
    Add-Check "指标端点 /metrics" "FAIL" "端点不可用 — P0 问题:需要在 router.go 注册 /metrics" $true
}

# 4. Rate limiting verification
Write-Host "`n[4/6] 速率限制功能验证" -ForegroundColor Yellow
$rateLimitTriggered = $false
$rlAttempts = 10
$rlTotal = 0
$rl429 = 0
$body = '{"account":"sre_healthcheck","password":"invalid_test_pwd"}'
foreach ($attempt in 1..$rlAttempts) {
    $rlTotal++
    $statusCode = $null
    try {
        $resp = Invoke-WebRequest -Uri "$BaseURL/api/v1/auth/login" -Method POST -Body $body -ContentType "application/json" -TimeoutSec 3
        $statusCode = [int]$resp.StatusCode
    } catch {
        # Invoke-WebRequest raises a terminating error for non-success HTTP
        # statuses (including 429), so the status code has to be read from the
        # exception's response. Errors without a response (timeouts, refused
        # connections) leave $statusCode as $null.
        $errResponse = $_.Exception.Response
        if ($errResponse) {
            $statusCode = [int]$errResponse.StatusCode
        }
    }
    if ($statusCode -eq 429) {
        $rl429++
        $rateLimitTriggered = $true
    }
}
$rlResult = if ($rateLimitTriggered) { "PASS" } else { "WARN" }
$rlNote = if (-not $rateLimitTriggered) { " (${rlAttempts}次内未触发,可能需要更多请求)" } else { "" }
Add-Check "速率限制功能" $rlResult "${rlTotal} 次请求中触发 ${rl429} 次 429${rlNote}"

# 5. Swagger documentation
Write-Host "`n[5/6] API 文档" -ForegroundColor Yellow
try {
    $swagger = Invoke-WebRequest -Uri "$BaseURL/swagger/index.html" -TimeoutSec 5
    $swaggerResult = if ($swagger.StatusCode -eq 200) { "PASS" } else { "WARN" }
    Add-Check "Swagger 文档" $swaggerResult "HTTP $($swagger.StatusCode)"
} catch {
    # Missing docs are reported as a warning only.
    Add-Check "Swagger 文档" "WARN" "不可访问(非阻塞)"
}

# 6. Configuration sanity check
Write-Host "`n[6/6] 配置健全性" -ForegroundColor Yellow
$configFile = "config\config.yaml"
if (Test-Path $configFile) {
    $config = Get-Content $configFile -Raw
    $hasDefaultJWT = $config -match "change-me-in-production"
    $isSQLite = $config -match "type: sqlite"

    # The JWT check is only ever a failure when the default secret is present,
    # and in that case it is flagged as critical.
    $jwtResult = if ($hasDefaultJWT) { "FAIL" } else { "PASS" }
    $jwtDetail = if ($hasDefaultJWT) { "使用默认 Secret — 生产环境必须替换!" } else { "已自定义" }
    Add-Check "JWT Secret 配置" $jwtResult $jwtDetail $hasDefaultJWT

    $dbTypeResult = if ($isSQLite) { "WARN" } else { "PASS" }
    $dbTypeDetail = if ($isSQLite) { "SQLite — 生产环境应迁移至 PostgreSQL" } else { "PostgreSQL/MySQL" }
    Add-Check "数据库类型" $dbTypeResult $dbTypeDetail
} else {
    Add-Check "配置文件" "WARN" "config.yaml 不存在,可能使用环境变量配置"
}

# Generate the report
$passRate = [math]::Round($passedChecks / [math]::Max($totalChecks, 1) * 100, 1)
$overallStatus = if ($criticalFailures -gt 0) {
    "🔴 CRITICAL"
} elseif ($passedChecks -lt $totalChecks) {
    "🟡 DEGRADED"
} else {
    "🟢 HEALTHY"
}

$mdReport = @"
# UMS 日常健康巡检报告

- **检查时间**: $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')
- **服务地址**: $BaseURL
- **总体状态**: $overallStatus
- **通过率**: ${passedChecks}/${totalChecks} ($passRate%)
- **严重失败**: $criticalFailures

## 检查详情

| 状态 | 检查项 | 结果 | 说明 |
|------|--------|------|------|
$($report -join "`n")

## 后续行动

$(if ($criticalFailures -gt 0) {
    "⚠️ **存在 $criticalFailures 个严重问题,需立即处理!**"
} elseif ($passedChecks -lt $totalChecks) {
    "📋 存在非严重警告,请在工作时间内跟进。"
} else {
    "✅ 所有检查通过,系统健康。"
})

---
*由 scripts/ops/sre-daily-healthcheck.ps1 自动生成*
"@

$mdReport | Set-Content -Path $reportFile -Encoding UTF8

Write-Host "`n=== 巡检汇总 ===" -ForegroundColor Cyan
Write-Host "总体状态: $overallStatus"
Write-Host "通过率: ${passedChecks}/${totalChecks} ($passRate%)"
Write-Host "报告已保存至: $reportFile"

# A non-zero exit code is only produced when the caller opted in via -AlertOnFailure.
if ($criticalFailures -gt 0 -and $AlertOnFailure) {
    Write-Host "`n⚠️ 存在严重问题,应触发告警通知!" -ForegroundColor Red
    exit 1
}
exit 0