254 lines
7.5 KiB
Go
254 lines
7.5 KiB
Go
package service
|
|
|
|
import (
	"bytes"
	"context"
	"crypto/rand"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"time"

	"github.com/company/ai-ops/internal/domain/model"
	"github.com/company/ai-ops/internal/domain/repository"
)
|
|
|
|
// HealingEngine is the self-healing engine. A background loop polls alert
// events in the "triggered" state on a fixed interval and runs the healing
// action configured on each event's rule.
type HealingEngine struct {
	alertRepo   repository.AlertRepository // lists triggered alert events and looks up their rules
	healingRepo HealingRepository          // persists healing execution records
	client      *http.Client               // HTTP client used to call configured healing endpoints
	interval    time.Duration              // period of the polling ticker
	stopCh      chan struct{}              // closed by Stop to make the polling loop return
}
|
|
|
|
// HealingRepository is the storage interface for healing execution records.
type HealingRepository interface {
	// CreateHealing stores a new healing record. The engine calls it with a
	// record in "pending" status before any action is executed.
	CreateHealing(ctx context.Context, h *HealingLog) error
	// UpdateHealingStatus records the outcome of the healing identified by id:
	// its final status, an optional result detail map, and an error code that
	// the engine leaves empty on success.
	UpdateHealingStatus(ctx context.Context, id, status string, result map[string]any, errCode string) error
}
|
|
|
|
// HealingLog is the record of a single healing execution.
type HealingLog struct {
	ID           string         `json:"id"`                      // identifier produced by generateHealingID
	AlertID      string         `json:"alert_id"`                // ID of the alert event that triggered the healing
	ActionType   string         `json:"action_type"`             // healing action name taken from the rule
	Config       map[string]any `json:"config"`                  // action configuration taken from the rule
	Status       string         `json:"status"`                  // the engine uses "pending", "succeeded" and "failed"
	DryRun       bool           `json:"dry_run"`                 // when true the action is recorded but not executed
	ResultDetail map[string]any `json:"result_detail,omitempty"` // outcome details of a successful or dry-run execution
	ErrorCode    string         `json:"error_code,omitempty"`    // set to "HEALING_EXEC_FAILED" when execution fails
	StartedAt    time.Time      `json:"started_at"`              // time the record was created
	CompletedAt  *time.Time     `json:"completed_at,omitempty"`  // not set by the engine; presumably filled in by the repository (confirm)
}
|
|
|
|
// NewHealingEngine 创建自愈引擎
|
|
func NewHealingEngine(ar repository.AlertRepository, hr HealingRepository) *HealingEngine {
|
|
return &HealingEngine{
|
|
alertRepo: ar,
|
|
healingRepo: hr,
|
|
client: &http.Client{Timeout: 20 * time.Second},
|
|
interval: 30 * time.Second,
|
|
stopCh: make(chan struct{}),
|
|
}
|
|
}
|
|
|
|
// Start 启动自愈引擎
|
|
func (e *HealingEngine) Start() {
|
|
slog.Info("healing_engine_started", "interval", e.interval)
|
|
go e.loop()
|
|
}
|
|
|
|
// Stop 停止自愈引擎
|
|
func (e *HealingEngine) Stop() {
|
|
close(e.stopCh)
|
|
slog.Info("healing_engine_stopped")
|
|
}
|
|
|
|
func (e *HealingEngine) loop() {
|
|
ticker := time.NewTicker(e.interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
e.process(context.Background())
|
|
case <-e.stopCh:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (e *HealingEngine) process(ctx context.Context) {
|
|
// 查询 triggered 状态的告警事件
|
|
events, _, err := e.alertRepo.ListEvents(ctx, "triggered", 1, 100)
|
|
if err != nil {
|
|
slog.Error("list_triggered_events_failed", "error", err)
|
|
return
|
|
}
|
|
|
|
for _, event := range events {
|
|
if err := e.handleEvent(ctx, &event); err != nil {
|
|
slog.Error("handle_event_failed", "event_id", event.ID, "error", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (e *HealingEngine) handleEvent(ctx context.Context, event *model.AlertEvent) error {
|
|
// 获取规则配置
|
|
rule, err := e.alertRepo.GetRuleByID(ctx, event.RuleID)
|
|
if err != nil {
|
|
return fmt.Errorf("get rule: %w", err)
|
|
}
|
|
|
|
// 检查是否有自愈动作
|
|
if rule.HealingAction == nil || *rule.HealingAction == "" {
|
|
return nil
|
|
}
|
|
|
|
// 创建自愈记录
|
|
healing := &HealingLog{
|
|
ID: generateHealingID(),
|
|
AlertID: event.ID,
|
|
ActionType: *rule.HealingAction,
|
|
Config: rule.HealingConfig,
|
|
Status: "pending",
|
|
DryRun: rule.IsSandboxed,
|
|
StartedAt: time.Now(),
|
|
}
|
|
|
|
if err := e.healingRepo.CreateHealing(ctx, healing); err != nil {
|
|
return fmt.Errorf("create healing log: %w", err)
|
|
}
|
|
|
|
// 沙盒模式:只记录不执行
|
|
if healing.DryRun {
|
|
slog.Info("healing_dry_run",
|
|
"healing_id", healing.ID,
|
|
"action", healing.ActionType,
|
|
"alert_id", event.ID,
|
|
)
|
|
healing.Status = "succeeded"
|
|
healing.ResultDetail = map[string]any{"message": "dry run, no actual action executed"}
|
|
return e.healingRepo.UpdateHealingStatus(ctx, healing.ID, healing.Status, healing.ResultDetail, "")
|
|
}
|
|
|
|
// 执行自愈动作
|
|
result, err := e.executeAction(ctx, healing)
|
|
if err != nil {
|
|
healing.Status = "failed"
|
|
healing.ErrorCode = "HEALING_EXEC_FAILED"
|
|
slog.Error("healing_action_failed",
|
|
"healing_id", healing.ID,
|
|
"action", healing.ActionType,
|
|
"error", err,
|
|
)
|
|
} else {
|
|
healing.Status = "succeeded"
|
|
healing.ResultDetail = result
|
|
slog.Info("healing_action_succeeded",
|
|
"healing_id", healing.ID,
|
|
"action", healing.ActionType,
|
|
)
|
|
}
|
|
|
|
return e.healingRepo.UpdateHealingStatus(ctx, healing.ID, healing.Status, healing.ResultDetail, healing.ErrorCode)
|
|
}
|
|
|
|
func (e *HealingEngine) executeAction(ctx context.Context, healing *HealingLog) (map[string]any, error) {
|
|
switch healing.ActionType {
|
|
case "switch_route":
|
|
return e.executeSwitchRoute(ctx, healing)
|
|
case "throttle":
|
|
return e.executeThrottle(ctx, healing)
|
|
case "restart_instance":
|
|
return e.executeRestartInstance(ctx, healing)
|
|
case "invoke_script":
|
|
return e.executeInvokeScript(ctx, healing)
|
|
default:
|
|
return nil, fmt.Errorf("unsupported healing action: %s", healing.ActionType)
|
|
}
|
|
}
|
|
|
|
func (e *HealingEngine) executeSwitchRoute(ctx context.Context, healing *HealingLog) (map[string]any, error) {
|
|
return e.callConfiguredEndpoint(ctx, healing, "switch_route")
|
|
}
|
|
|
|
func (e *HealingEngine) executeThrottle(ctx context.Context, healing *HealingLog) (map[string]any, error) {
|
|
return e.callConfiguredEndpoint(ctx, healing, "throttle")
|
|
}
|
|
|
|
func (e *HealingEngine) executeRestartInstance(ctx context.Context, healing *HealingLog) (map[string]any, error) {
|
|
if allowed, _ := healing.Config["allow_restart"].(bool); !allowed {
|
|
return nil, fmt.Errorf("restart_instance requires allow_restart=true")
|
|
}
|
|
return e.callConfiguredEndpoint(ctx, healing, "restart_instance")
|
|
}
|
|
|
|
func (e *HealingEngine) executeInvokeScript(ctx context.Context, healing *HealingLog) (map[string]any, error) {
|
|
if _, ok := healing.Config["script_id"].(string); !ok {
|
|
return nil, fmt.Errorf("invoke_script requires script_id; raw script content is not allowed")
|
|
}
|
|
return e.callConfiguredEndpoint(ctx, healing, "invoke_script")
|
|
}
|
|
|
|
func (e *HealingEngine) callConfiguredEndpoint(ctx context.Context, healing *HealingLog, action string) (map[string]any, error) {
|
|
endpoint, ok := healing.Config["endpoint"].(string)
|
|
if !ok || endpoint == "" {
|
|
return nil, fmt.Errorf("%s requires endpoint", action)
|
|
}
|
|
method, _ := healing.Config["method"].(string)
|
|
if method == "" {
|
|
method = http.MethodPost
|
|
}
|
|
if method != http.MethodPost && method != http.MethodPut && method != http.MethodPatch {
|
|
return nil, fmt.Errorf("%s method %s is not allowed", action, method)
|
|
}
|
|
|
|
payload := map[string]any{
|
|
"healing_id": healing.ID,
|
|
"alert_id": healing.AlertID,
|
|
"action_type": healing.ActionType,
|
|
"config": healing.Config,
|
|
"dry_run": healing.DryRun,
|
|
}
|
|
body, err := json.Marshal(payload)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("marshal healing payload: %w", err)
|
|
}
|
|
req, err := http.NewRequestWithContext(ctx, method, endpoint, bytes.NewReader(body))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("create healing request: %w", err)
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
if token, _ := healing.Config["token"].(string); token != "" {
|
|
req.Header.Set("Authorization", "Bearer "+token)
|
|
}
|
|
|
|
resp, err := e.client.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("call healing endpoint: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode >= 400 {
|
|
return nil, fmt.Errorf("healing endpoint returned status %d", resp.StatusCode)
|
|
}
|
|
return map[string]any{
|
|
"message": action + " executed",
|
|
"endpoint": endpoint,
|
|
"status_code": resp.StatusCode,
|
|
}, nil
|
|
}
|
|
|
|
// generateHealingID returns a random identifier in UUID version 4 text form
// (8-4-4-4-12 lowercase hex digits). If the system random source fails, it
// falls back to a fixed UUID-shaped prefix followed by twelve decimal digits
// derived from the current time in nanoseconds.
func generateHealingID() string {
	var raw [16]byte
	if _, err := rand.Read(raw[:]); err != nil {
		return fmt.Sprintf("00000000-0000-4000-8000-%012d", time.Now().UnixNano()%1_000_000_000_000)
	}

	raw[6] = (raw[6] & 0x0f) | 0x40 // version nibble: 4
	raw[8] = (raw[8] & 0x3f) | 0x80 // variant bits: 10xxxxxx

	// Encode all 16 bytes once, then cut the 32 hex digits into UUID groups.
	digits := hex.EncodeToString(raw[:])
	return digits[0:8] + "-" + digits[8:12] + "-" + digits[12:16] + "-" + digits[16:20] + "-" + digits[20:32]
}
|