internal/retry/retry.go

// internal/retry/retry.go

// Package retry re-runs failing operations, waiting with exponential backoff
// between attempts.
package retry

import (
	"context"
	"fmt"
	"math"
	"math/rand"
	"time"
)

// Strategy describes how an operation is retried: how many times, how long to
// wait between attempts, and which errors justify another attempt.
type Strategy struct {
	// MaxRetries is the number of retries allowed after the first attempt.
	// With 0 the operation is attempted once and never retried.
	MaxRetries int

	// BaseDelay is the starting delay that the exponential backoff grows from.
	BaseDelay time.Duration

	// MaxDelay is the upper bound applied to the computed backoff delay.
	MaxDelay time.Duration

	// Multiplier is the factor by which the delay grows with each further
	// attempt. DefaultStrategy sets it to 2.0.
	Multiplier float64

	// Jitter turns on random variation of the computed delay.
	Jitter bool

	// Retryable reports whether an error should lead to another attempt.
	// When it is nil, Do, DoWithResult and DoWithMetrics skip the check and
	// retry on every error.
	Retryable func(error) bool
}

// DefaultStrategy returns the package's stock settings: up to 3 retries,
// a 1s base delay doubling per attempt, a 30s delay cap, jitter enabled, and
// IsRetryable as the retry predicate.
func DefaultStrategy() Strategy {
	var s Strategy

	s.MaxRetries = 3
	s.BaseDelay = time.Second
	s.MaxDelay = 30 * time.Second
	s.Multiplier = 2.0
	s.Jitter = true
	s.Retryable = IsRetryable

	return s
}

// IsRetryable is the default retry predicate. It reports true for every
// non-nil error and false for nil; it does not yet tell error kinds (network
// failures, timeouts, 5xx responses, ...) apart.
func IsRetryable(err error) bool {
	// Extension point: narrow this down when specific error kinds need to be
	// classified differently.
	return err != nil
}

// Do runs fn until it succeeds, fails with a non-retryable error, uses up the
// retry budget, or ctx is done while waiting between attempts.
//
// fn is invoked at most strategy.MaxRetries+1 times. A negative MaxRetries is
// treated as 0, so fn always runs at least once. After a failed attempt that
// is not the last one, the error is passed to strategy.Retryable (when set);
// if it reports false Do returns immediately. The error of the final attempt
// is not passed to Retryable. Between attempts Do waits for the duration
// returned by calculateDelay, ending the wait early when ctx is done.
//
// Do returns nil on success. Otherwise the returned error wraps (%w) either
// the last error from fn or ctx.Err().
func Do(ctx context.Context, strategy Strategy, fn func() error) error {
	// A negative budget would skip the loop entirely: fn would never run and
	// the final error would wrap a nil error.
	maxRetries := strategy.MaxRetries
	if maxRetries < 0 {
		maxRetries = 0
	}

	var lastErr error
	for attempt := 0; attempt <= maxRetries; attempt++ {
		lastErr = fn()
		if lastErr == nil {
			return nil
		}

		// No retry follows the last attempt, so there is nothing to decide
		// and nothing to wait for.
		if attempt == maxRetries {
			break
		}

		if strategy.Retryable != nil && !strategy.Retryable(lastErr) {
			return fmt.Errorf("non-retryable error on attempt %d: %w", attempt+1, lastErr)
		}

		// An explicit timer (instead of time.After) can be stopped as soon as
		// ctx ends the wait, rather than lingering until it would have fired.
		timer := time.NewTimer(calculateDelay(strategy, attempt))
		select {
		case <-ctx.Done():
			timer.Stop()
			return fmt.Errorf("context cancelled after attempt %d: %w", attempt+1, ctx.Err())
		case <-timer.C:
			// Backoff elapsed; go on to the next attempt.
		}
	}

	return fmt.Errorf("all %d attempts failed, last error: %w", maxRetries+1, lastErr)
}

// calculateDelay returns how long to wait after the given zero-based attempt
// before trying again.
//
// The delay is s.BaseDelay * s.Multiplier^attempt, capped at s.MaxDelay.
// Unset or invalid fields fall back to safe values:
//   - a non-positive BaseDelay yields 0;
//   - a non-positive (or NaN) Multiplier falls back to 2.0;
//   - a non-positive MaxDelay means "no explicit cap" (the delay is then only
//     bounded by the largest representable time.Duration).
//
// When s.Jitter is set the delay is spread uniformly over ±25% and capped
// again afterwards, so the result never exceeds the limit. The result is
// never negative.
func calculateDelay(s Strategy, attempt int) time.Duration {
	const (
		defaultMultiplier = 2.0
		jitterFraction    = 0.25
	)

	if s.BaseDelay <= 0 {
		return 0
	}
	if attempt < 0 {
		attempt = 0
	}

	multiplier := s.Multiplier
	if math.IsNaN(multiplier) || multiplier <= 0 {
		multiplier = defaultMultiplier
	}

	ceiling := s.MaxDelay
	if ceiling <= 0 {
		ceiling = math.MaxInt64
	}
	limit := float64(ceiling)

	// Exponential backoff: base * multiplier^attempt. math.Pow may overflow to
	// +Inf for large attempts; math.Min brings that back down to the limit.
	delay := math.Min(float64(s.BaseDelay)*math.Pow(multiplier, float64(attempt)), limit)

	if s.Jitter {
		// Uniform in [delay-25%, delay+25%), then re-capped so jitter cannot
		// push the wait past the limit.
		spread := delay * jitterFraction
		delay = math.Min(delay-spread+2*spread*rand.Float64(), limit)
	}

	// float64(math.MaxInt64) rounds up to 2^63, which does not fit in a
	// Duration; return the exact ceiling instead of converting it.
	if delay >= limit {
		return ceiling
	}
	return time.Duration(delay)
}

// DoWithResult is the value-returning variant of Do: it runs fn until it
// succeeds, fails with a non-retryable error, uses up the retry budget, or
// ctx is done while waiting between attempts.
//
// fn is invoked at most strategy.MaxRetries+1 times. A negative MaxRetries is
// treated as 0, so fn always runs at least once. After a failed attempt that
// is not the last one, the error is passed to strategy.Retryable (when set);
// if it reports false DoWithResult returns immediately. The error of the
// final attempt is not passed to Retryable.
//
// On success it returns fn's value and a nil error. On failure it returns the
// zero value of T and an error that wraps (%w) either the last error from fn
// or ctx.Err().
func DoWithResult[T any](ctx context.Context, strategy Strategy, fn func() (T, error)) (T, error) {
	var zero T

	// A negative budget would skip the loop entirely: fn would never run and
	// the final error would wrap a nil error.
	maxRetries := strategy.MaxRetries
	if maxRetries < 0 {
		maxRetries = 0
	}

	var lastErr error
	for attempt := 0; attempt <= maxRetries; attempt++ {
		result, err := fn()
		if err == nil {
			return result, nil
		}
		lastErr = err

		// No retry follows the last attempt, so there is nothing to decide
		// and nothing to wait for.
		if attempt == maxRetries {
			break
		}

		if strategy.Retryable != nil && !strategy.Retryable(err) {
			return zero, fmt.Errorf("non-retryable error on attempt %d: %w", attempt+1, err)
		}

		// An explicit timer (instead of time.After) can be stopped as soon as
		// ctx ends the wait, rather than lingering until it would have fired.
		timer := time.NewTimer(calculateDelay(strategy, attempt))
		select {
		case <-ctx.Done():
			timer.Stop()
			return zero, fmt.Errorf("context cancelled after attempt %d: %w", attempt+1, ctx.Err())
		case <-timer.C:
			// Backoff elapsed; go on to the next attempt.
		}
	}

	return zero, fmt.Errorf("all %d attempts failed, last error: %w", maxRetries+1, lastErr)
}

// Metrics summarises a single DoWithMetrics call.
type Metrics struct {
	// Attempts is the number of times the operation was invoked.
	Attempts int

	// Success is true when one of the attempts returned a nil error.
	Success bool

	// TotalDelay is the wall-clock time DoWithMetrics measured from its start
	// until it returned. It therefore includes the time spent inside the
	// operation itself, not only the backoff waits.
	TotalDelay time.Duration
}

// DoWithMetrics behaves like Do and additionally reports what happened.
//
// fn is invoked at most strategy.MaxRetries+1 times. A negative MaxRetries is
// treated as 0, so fn always runs at least once and Metrics.Attempts is never
// 0. After a failed attempt that is not the last one, the error is passed to
// strategy.Retryable (when set); if it reports false the call returns
// immediately. The error of the final attempt is not passed to Retryable.
//
// The returned Metrics is filled in on every path, including failures:
// Attempts is the number of fn invocations, Success is true only when fn
// returned nil, and TotalDelay is the wall-clock time from entry to return
// (time spent in fn included). The error is nil on success; otherwise it
// wraps (%w) either the last error from fn or ctx.Err().
func DoWithMetrics(ctx context.Context, strategy Strategy, fn func() error) (Metrics, error) {
	// A negative budget would skip the loop entirely: fn would never run,
	// Attempts would stay 0 and the final error would wrap a nil error.
	maxRetries := strategy.MaxRetries
	if maxRetries < 0 {
		maxRetries = 0
	}

	var (
		m       Metrics
		lastErr error
	)
	start := time.Now()

	for attempt := 0; attempt <= maxRetries; attempt++ {
		m.Attempts = attempt + 1

		lastErr = fn()
		if lastErr == nil {
			m.Success = true
			m.TotalDelay = time.Since(start)
			return m, nil
		}

		// No retry follows the last attempt, so there is nothing to decide
		// and nothing to wait for.
		if attempt == maxRetries {
			break
		}

		if strategy.Retryable != nil && !strategy.Retryable(lastErr) {
			m.TotalDelay = time.Since(start)
			return m, fmt.Errorf("non-retryable error on attempt %d: %w", attempt+1, lastErr)
		}

		// An explicit timer (instead of time.After) can be stopped as soon as
		// ctx ends the wait, rather than lingering until it would have fired.
		timer := time.NewTimer(calculateDelay(strategy, attempt))
		select {
		case <-ctx.Done():
			timer.Stop()
			m.TotalDelay = time.Since(start)
			return m, fmt.Errorf("context cancelled after attempt %d: %w", attempt+1, ctx.Err())
		case <-timer.C:
			// Backoff elapsed; go on to the next attempt.
		}
	}

	m.TotalDelay = time.Since(start)
	return m, fmt.Errorf("all %d attempts failed, last error: %w", maxRetries+1, lastErr)
}