// Files
// ai-ops/internal/service/healing_engine.go
// 2026-05-12 17:48:22 +08:00
//
// 254 lines
// 7.5 KiB
// Go

package service
import (
	"bytes"
	"context"
	"crypto/rand"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"time"

	"github.com/company/ai-ops/internal/domain/model"
	"github.com/company/ai-ops/internal/domain/repository"
)
// HealingEngine is the self-healing engine. On a fixed interval it lists alert
// events in the "triggered" state and, for each event whose rule configures a
// healing action, records and executes that action.
type HealingEngine struct {
// alertRepo supplies the triggered alert events and the rules they belong to.
alertRepo repository.AlertRepository
// healingRepo persists healing execution records and their final status.
healingRepo HealingRepository
// client performs the outbound HTTP calls to configured healing endpoints.
client *http.Client
// interval is the period of the background polling loop.
interval time.Duration
// stopCh is closed by Stop to make the background loop return.
stopCh chan struct{}
}
// HealingRepository is the storage interface for healing execution records.
type HealingRepository interface {
// CreateHealing stores a new healing record. The engine calls it with a
// freshly built record in the "pending" state before any action is executed.
CreateHealing(ctx context.Context, h *HealingLog) error
// UpdateHealingStatus stores the outcome for the record identified by id.
// The engine passes status "succeeded" or "failed", the result detail (nil
// when execution failed) and an error code (empty on success).
UpdateHealingStatus(ctx context.Context, id, status string, result map[string]any, errCode string) error
}
// HealingLog is the record of a single healing execution.
type HealingLog struct {
// ID is the record identifier; the engine fills it from generateHealingID.
ID string `json:"id"`
// AlertID is the ID of the alert event that triggered this healing.
AlertID string `json:"alert_id"`
// ActionType names the healing action; the engine dispatches on
// "switch_route", "throttle", "restart_instance" and "invoke_script".
ActionType string `json:"action_type"`
// Config is the rule's healing configuration. Keys read in this file are
// "endpoint", "method", "token", "allow_restart" and "script_id".
Config map[string]any `json:"config"`
// Status is set by the engine to "pending", then "succeeded" or "failed".
Status string `json:"status"`
// DryRun is copied from the rule's IsSandboxed flag; when true the action is
// recorded but not executed.
DryRun bool `json:"dry_run"`
// ResultDetail holds the action result; omitted from JSON when empty.
ResultDetail map[string]any `json:"result_detail,omitempty"`
// ErrorCode is set to "HEALING_EXEC_FAILED" when execution fails.
ErrorCode string `json:"error_code,omitempty"`
// StartedAt is the time the record was created by the engine.
StartedAt time.Time `json:"started_at"`
// CompletedAt is optional and is not set anywhere in this file.
// NOTE(review): presumably filled in by the repository on status update — confirm.
CompletedAt *time.Time `json:"completed_at,omitempty"`
}
// NewHealingEngine builds a healing engine backed by the given alert and
// healing repositories.
//
// The engine gets its own HTTP client with a 20-second timeout, a 30-second
// polling interval and a fresh stop channel. The background loop is not
// started here; call Start for that.
func NewHealingEngine(ar repository.AlertRepository, hr HealingRepository) *HealingEngine {
	engine := new(HealingEngine)
	engine.alertRepo = ar
	engine.healingRepo = hr
	engine.client = &http.Client{Timeout: 20 * time.Second}
	engine.interval = 30 * time.Second
	engine.stopCh = make(chan struct{})
	return engine
}
// Start logs the configured polling interval and launches the polling loop in
// a new goroutine. It returns immediately.
func (e *HealingEngine) Start() {
	period := e.interval
	slog.Info("healing_engine_started", "interval", period)
	go e.loop()
}
// Stop signals the background loop to exit by closing stopCh.
//
// A repeated call is a no-op: closing an already-closed channel would panic,
// so Stop first checks whether the channel has been closed. Stop does not wait
// for the loop goroutine or an in-flight processing pass to finish, and the
// check is not synchronised against Stop being called from several goroutines
// at the same time.
func (e *HealingEngine) Stop() {
	select {
	case <-e.stopCh:
		// Already closed by an earlier Stop; nothing left to do.
		return
	default:
	}
	close(e.stopCh)
	slog.Info("healing_engine_stopped")
}
// loop is the body of the background goroutine. It runs one processing pass
// per tick of e.interval and returns once stopCh is closed.
//
// Each pass receives context.Background(), so a pass that is already running
// is not cancelled when the engine is stopped.
func (e *HealingEngine) loop() {
	tick := time.NewTicker(e.interval)
	defer tick.Stop()

	for {
		select {
		case <-e.stopCh:
			return
		case <-tick.C:
			e.process(context.Background())
		}
	}
}
// process runs a single polling pass: it asks the alert repository for events
// in the "triggered" state and hands each one to handleEvent.
//
// A listing failure is logged and ends the pass. A failure on one event is
// logged and does not stop the remaining events from being handled.
//
// NOTE(review): the arguments 1 and 100 are presumably page number and page
// size, so at most 100 events are handled per pass — confirm against the
// repository. Nothing in this file moves an event out of "triggered", so an
// event may be handled again on the next pass unless its status changes
// elsewhere — confirm.
func (e *HealingEngine) process(ctx context.Context) {
	triggered, _, err := e.alertRepo.ListEvents(ctx, "triggered", 1, 100)
	if err != nil {
		slog.Error("list_triggered_events_failed", "error", err)
		return
	}

	for i := range triggered {
		evt := &triggered[i]
		if handleErr := e.handleEvent(ctx, evt); handleErr != nil {
			slog.Error("handle_event_failed", "event_id", evt.ID, "error", handleErr)
		}
	}
}
// handleEvent runs the healing action configured for one alert event.
//
// Steps:
//  1. Load the event's rule; return nil if it has no healing action.
//  2. Persist a "pending" healing record.
//  3. If the rule is sandboxed, mark the record "succeeded" without executing.
//  4. Otherwise execute the action and persist "succeeded" or "failed".
//
// The returned error is a wrapped rule-lookup or record-creation error, or
// whatever UpdateHealingStatus returns. A failing action is logged and stored
// on the record (status "failed", code "HEALING_EXEC_FAILED") but is not itself
// returned to the caller.
func (e *HealingEngine) handleEvent(ctx context.Context, event *model.AlertEvent) error {
	rule, err := e.alertRepo.GetRuleByID(ctx, event.RuleID)
	if err != nil {
		return fmt.Errorf("get rule: %w", err)
	}

	// Rules without a healing action need no work.
	if rule.HealingAction == nil {
		return nil
	}
	actionType := *rule.HealingAction
	if actionType == "" {
		return nil
	}

	// Persist the record before acting so every attempt leaves a trace.
	record := &HealingLog{
		ID:         generateHealingID(),
		AlertID:    event.ID,
		ActionType: actionType,
		Config:     rule.HealingConfig,
		Status:     "pending",
		DryRun:     rule.IsSandboxed,
		StartedAt:  time.Now(),
	}
	if createErr := e.healingRepo.CreateHealing(ctx, record); createErr != nil {
		return fmt.Errorf("create healing log: %w", createErr)
	}

	// Sandbox mode: record the intent only, never call out.
	if record.DryRun {
		slog.Info("healing_dry_run",
			"healing_id", record.ID,
			"action", record.ActionType,
			"alert_id", event.ID,
		)
		record.Status = "succeeded"
		record.ResultDetail = map[string]any{"message": "dry run, no actual action executed"}
		return e.healingRepo.UpdateHealingStatus(ctx, record.ID, record.Status, record.ResultDetail, "")
	}

	detail, execErr := e.executeAction(ctx, record)
	if execErr == nil {
		record.Status = "succeeded"
		record.ResultDetail = detail
		slog.Info("healing_action_succeeded",
			"healing_id", record.ID,
			"action", record.ActionType,
		)
	} else {
		record.Status = "failed"
		record.ErrorCode = "HEALING_EXEC_FAILED"
		slog.Error("healing_action_failed",
			"healing_id", record.ID,
			"action", record.ActionType,
			"error", execErr,
		)
	}

	return e.healingRepo.UpdateHealingStatus(ctx, record.ID, record.Status, record.ResultDetail, record.ErrorCode)
}
// executeAction dispatches a healing record to the handler matching its
// ActionType ("switch_route", "throttle", "restart_instance" or
// "invoke_script") and returns that handler's result. Any other action type
// yields an "unsupported healing action" error.
func (e *HealingEngine) executeAction(ctx context.Context, healing *HealingLog) (map[string]any, error) {
	var handler func(context.Context, *HealingLog) (map[string]any, error)

	switch healing.ActionType {
	case "switch_route":
		handler = e.executeSwitchRoute
	case "throttle":
		handler = e.executeThrottle
	case "restart_instance":
		handler = e.executeRestartInstance
	case "invoke_script":
		handler = e.executeInvokeScript
	}

	if handler == nil {
		return nil, fmt.Errorf("unsupported healing action: %s", healing.ActionType)
	}
	return handler(ctx, healing)
}
// executeSwitchRoute performs the "switch_route" action by calling the
// endpoint configured on the healing record.
func (e *HealingEngine) executeSwitchRoute(ctx context.Context, healing *HealingLog) (map[string]any, error) {
	const action = "switch_route"
	return e.callConfiguredEndpoint(ctx, healing, action)
}
// executeThrottle performs the "throttle" action by calling the endpoint
// configured on the healing record.
func (e *HealingEngine) executeThrottle(ctx context.Context, healing *HealingLog) (map[string]any, error) {
	const action = "throttle"
	return e.callConfiguredEndpoint(ctx, healing, action)
}
// executeRestartInstance performs the "restart_instance" action by calling the
// configured endpoint, but only when the config explicitly opts in with the
// boolean allow_restart=true. A missing, non-boolean or false value is
// rejected with an error.
func (e *HealingEngine) executeRestartInstance(ctx context.Context, healing *HealingLog) (map[string]any, error) {
	optIn, isBool := healing.Config["allow_restart"].(bool)
	if !isBool || !optIn {
		return nil, fmt.Errorf("restart_instance requires allow_restart=true")
	}
	return e.callConfiguredEndpoint(ctx, healing, "restart_instance")
}
// executeInvokeScript performs the "invoke_script" action by calling the
// configured endpoint.
//
// The config must reference a script through a non-empty string "script_id".
// A missing, non-string or empty script_id is rejected, matching how an empty
// "endpoint" is treated by callConfiguredEndpoint.
func (e *HealingEngine) executeInvokeScript(ctx context.Context, healing *HealingLog) (map[string]any, error) {
	scriptID, ok := healing.Config["script_id"].(string)
	if !ok || scriptID == "" {
		return nil, fmt.Errorf("invoke_script requires script_id; raw script content is not allowed")
	}
	return e.callConfiguredEndpoint(ctx, healing, "invoke_script")
}
// callConfiguredEndpoint sends the healing request for action to the HTTP
// endpoint named in healing.Config and reports the outcome.
//
// Config keys read here:
//   - "endpoint" (string, required): target URL.
//   - "method" (string, optional): POST, PUT or PATCH; defaults to POST.
//   - "token" (string, optional): sent as "Authorization: Bearer <token>".
//
// The request body is a JSON object carrying the healing ID, alert ID, action
// type, the full config and the dry-run flag. A response status of 400 or above
// is returned as an error; otherwise the result map contains "message",
// "endpoint" and "status_code".
//
// NOTE(review): healing.Config is forwarded verbatim, so a configured "token"
// also appears in the request body — confirm that is intended.
func (e *HealingEngine) callConfiguredEndpoint(ctx context.Context, healing *HealingLog, action string) (map[string]any, error) {
	// Upper bound on response bytes read solely so the connection can be reused.
	const maxDrainBytes = 64 << 10

	endpoint, ok := healing.Config["endpoint"].(string)
	if !ok || endpoint == "" {
		return nil, fmt.Errorf("%s requires endpoint", action)
	}

	method, _ := healing.Config["method"].(string)
	if method == "" {
		method = http.MethodPost
	}
	// Only mutating verbs that carry a body are accepted.
	switch method {
	case http.MethodPost, http.MethodPut, http.MethodPatch:
	default:
		return nil, fmt.Errorf("%s method %s is not allowed", action, method)
	}

	payload := map[string]any{
		"healing_id":  healing.ID,
		"alert_id":    healing.AlertID,
		"action_type": healing.ActionType,
		"config":      healing.Config,
		"dry_run":     healing.DryRun,
	}
	body, err := json.Marshal(payload)
	if err != nil {
		return nil, fmt.Errorf("marshal healing payload: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, method, endpoint, bytes.NewReader(body))
	if err != nil {
		return nil, fmt.Errorf("create healing request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	if token, _ := healing.Config["token"].(string); token != "" {
		req.Header.Set("Authorization", "Bearer "+token)
	}

	resp, err := e.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("call healing endpoint: %w", err)
	}
	defer func() {
		// The transport can only reuse a keep-alive connection when the body
		// has been read to EOF before Close. The content is not needed, so
		// discard a bounded amount; errors here do not change the outcome.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxDrainBytes))
		_ = resp.Body.Close()
	}()

	if resp.StatusCode >= 400 {
		return nil, fmt.Errorf("healing endpoint returned status %d", resp.StatusCode)
	}
	return map[string]any{
		"message":     action + " executed",
		"endpoint":    endpoint,
		"status_code": resp.StatusCode,
	}, nil
}
// generateHealingID returns a new identifier in canonical UUID text form
// (8-4-4-4-12 lowercase hex digits).
//
// Sixteen bytes are taken from crypto/rand and stamped with the version 4 and
// RFC 4122 variant bits. If the random source fails, a fixed-prefix ID is
// returned whose last group is derived from the current time in nanoseconds.
func generateHealingID() string {
	var raw [16]byte
	if _, err := rand.Read(raw[:]); err != nil {
		return fmt.Sprintf("00000000-0000-4000-8000-%012d", time.Now().UnixNano()%1_000_000_000_000)
	}

	raw[6] = (raw[6] & 0x0f) | 0x40 // version 4
	raw[8] = (raw[8] & 0x3f) | 0x80 // RFC 4122 variant

	// Encode once, then slice the 32 hex digits into the five UUID groups.
	digits := hex.EncodeToString(raw[:])
	return digits[0:8] + "-" + digits[8:12] + "-" + digits[12:16] + "-" + digits[16:20] + "-" + digits[20:32]
}