- Remove old review reports (keep latest only) - Move docs/ to deploy/docs-backup/ - Move performance-testing/ to deploy/ - Clean up test output files - Organize root directory
368 lines
8.9 KiB
Markdown
368 lines
8.9 KiB
Markdown
# Sub2API 模块分析报告:运营与监控系统
|
||
|
||
## 1. 模块概述
|
||
|
||
### 1.1 模块定位
|
||
运营与监控系统是Sub2API的运维管理核心,提供系统监控、告警、运维日志、备份恢复等功能,帮助运维人员管理和监控系统运行状态。
|
||
|
||
### 1.2 核心职责
|
||
- **系统监控**:监控服务健康、性能指标
|
||
- **告警管理**:配置和触发告警通知
|
||
- **运维日志**:记录运维操作历史
|
||
- **备份恢复**:数据备份和恢复
|
||
- **系统设置**:系统参数配置
|
||
|
||
## 2. 代码结构分析
|
||
|
||
### 2.1 核心文件
|
||
|
||
| 文件路径 | 职责 | 代码行数 |
|
||
|---------|------|----------|
|
||
| `service/ops_service.go` | 运营服务核心 | ~1000行 |
|
||
| `service/ops_aggregation_service.go` | 数据聚合服务 | ~500行 |
|
||
| `service/backup_service.go` | 备份恢复服务 | ~600行 |
|
||
| `service/alert_service.go` | 告警服务 | ~400行 |
|
||
| `handler/admin/ops_handler.go` | 运营API | ~900行 |
|
||
| `handler/admin/backup_handler.go` | 备份API | ~300行 |
|
||
| `handler/setting_handler.go` | 设置API | ~600行 |
|
||
|
||
## 3. 功能详细分析
|
||
|
||
### 3.1 系统监控
|
||
|
||
#### 3.1.1 实时指标
|
||
|
||
```go
|
||
// service/ops_realtime_service.go - GetRealtimeStats
|
||
func (s *OpsService) GetRealtimeStats() *RealtimeStats {
|
||
return &RealtimeStats{
|
||
// 活跃连接数
|
||
ActiveConnections: s.getActiveConnections(),
|
||
|
||
// QPS
|
||
QPS: s.getQPS(),
|
||
|
||
// 错误率
|
||
ErrorRate: s.getErrorRate(),
|
||
|
||
// 延迟分布
|
||
LatencyP50: s.getLatencyPercentile(50),
|
||
LatencyP95: s.getLatencyPercentile(95),
|
||
LatencyP99: s.getLatencyPercentile(99),
|
||
|
||
// 账户状态分布
|
||
AccountStatus: s.getAccountStatusDistribution(),
|
||
|
||
// 用户活跃度
|
||
ActiveUsers: s.getActiveUsers(),
|
||
}
|
||
}
|
||
```
|
||
|
||
#### 3.1.2 历史趋势
|
||
|
||
```go
|
||
// 趋势数据聚合
|
||
func (s *OpsAggregationService) GetTrendData(period string, limit int) ([]DataPoint, error) {
|
||
var table string
|
||
switch period {
|
||
case "hourly":
|
||
table = "metrics_hourly"
|
||
case "daily":
|
||
table = "metrics_daily"
|
||
default:
|
||
table = "metrics_minute"
|
||
}
|
||
|
||
return s.queryTrendData(table, limit)
|
||
}
|
||
```
|
||
|
||
### 3.2 告警管理
|
||
|
||
#### 3.2.1 告警规则
|
||
|
||
```go
|
||
// 告警规则定义
|
||
type AlertRule struct {
|
||
ID string
|
||
Name string
|
||
Metric string // 监控指标
|
||
Condition string // 条件:gt/lt/eq
|
||
Threshold float64 // 阈值
|
||
Duration int // 持续时间(秒)
|
||
Severity string // critical/warning/info
|
||
Enabled bool
|
||
Actions []AlertAction
|
||
}
|
||
|
||
// AlertAction describes one notification step attached to an alert rule.
type AlertAction struct {
	Type   string // notification channel: email / webhook / slack
	Config string // channel configuration
}
|
||
|
||
// 检查告警
|
||
func (s *AlertService) EvaluateRules() {
|
||
for _, rule := range s.rules {
|
||
if !rule.Enabled {
|
||
continue
|
||
}
|
||
|
||
value := s.getMetricValue(rule.Metric)
|
||
if s.checkCondition(value, rule.Condition, rule.Threshold) {
|
||
s.triggerAlert(rule, value)
|
||
}
|
||
}
|
||
}
|
||
```
|
||
|
||
#### 3.2.2 告警通知
|
||
|
||
```go
|
||
// 触发告警
|
||
func (s *AlertService) triggerAlert(rule *AlertRule, value float64) {
|
||
// 1. 记录告警历史
|
||
alert := &Alert{
|
||
RuleID: rule.ID,
|
||
Metric: rule.Metric,
|
||
Value: value,
|
||
Threshold: rule.Threshold,
|
||
Severity: rule.Severity,
|
||
FiredAt: time.Now(),
|
||
}
|
||
s.saveAlert(alert)
|
||
|
||
// 2. 发送通知
|
||
for _, action := range rule.Actions {
|
||
switch action.Type {
|
||
case "email":
|
||
s.sendEmail(action.Config, alert)
|
||
case "webhook":
|
||
s.sendWebhook(action.Config, alert)
|
||
case "slack":
|
||
s.sendSlack(action.Config, alert)
|
||
}
|
||
}
|
||
}
|
||
```
|
||
|
||
### 3.3 运维日志
|
||
|
||
#### 3.3.1 操作记录
|
||
|
||
```go
|
||
// 记录运维操作
|
||
func (s *OpsService) LogOperation(op *OperationLog) error {
|
||
op.Timestamp = time.Now()
|
||
op.Operator = getCurrentUser()
|
||
op.IP = getClientIP()
|
||
|
||
return s.opsLogRepo.Create(ctx, op)
|
||
}
|
||
|
||
// 运维日志查询
|
||
func (s *OpsService) QueryOperationLogs(filters OperationFilters) ([]OperationLog, int64, error) {
|
||
return s.opsLogRepo.List(ctx, filters)
|
||
}
|
||
|
||
// Operation log types.
const (
	// Lifecycle changes to a resource.
	OpTypeCreate = "create"
	OpTypeUpdate = "update"
	OpTypeDelete = "delete"

	// Toggling a resource on or off.
	OpTypeEnable  = "enable"
	OpTypeDisable = "disable"

	// Session events.
	OpTypeLogin  = "login"
	OpTypeLogout = "logout"
)
|
||
```
|
||
|
||
### 3.4 备份恢复
|
||
|
||
#### 3.4.1 备份
|
||
|
||
```go
|
||
// service/backup_service.go - CreateBackup
|
||
func (s *BackupService) CreateBackup(ctx context.Context, req CreateBackupRequest) (*Backup, error) {
|
||
// 1. 创建备份记录
|
||
backup := &Backup{
|
||
ID: generateBackupID(),
|
||
Type: req.Type, // full/incremental
|
||
Description: req.Description,
|
||
Status: StatusInProgress,
|
||
}
|
||
s.backupRepo.Create(ctx, backup)
|
||
|
||
// 2. 异步执行备份
|
||
go func() {
|
||
err := s.executeBackup(ctx, backup)
|
||
if err != nil {
|
||
backup.Status = StatusFailed
|
||
backup.Error = err.Error()
|
||
} else {
|
||
backup.Status = StatusCompleted
|
||
backup.Size = s.getBackupSize(backup.ID)
|
||
}
|
||
s.backupRepo.Update(ctx, backup)
|
||
}()
|
||
|
||
return backup, nil
|
||
}
|
||
|
||
func (s *BackupService) executeBackup(ctx context.Context, backup *Backup) error {
|
||
// 1. 备份数据库
|
||
err := s.backupDatabase(backup.ID)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
|
||
// 2. 备份配置文件
|
||
err = s.backupConfig(backup.ID)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
|
||
return nil
|
||
}
|
||
```
|
||
|
||
#### 3.4.2 恢复
|
||
|
||
```go
|
||
// 恢复数据
|
||
func (s *BackupService) Restore(ctx context.Context, backupID string, targetType string) error {
|
||
// 1. 验证备份存在
|
||
backup, err := s.backupRepo.GetByID(ctx, backupID)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
|
||
// 2. 执行恢复
|
||
switch targetType {
|
||
case "database":
|
||
return s.restoreDatabase(ctx, backup)
|
||
case "config":
|
||
return s.restoreConfig(ctx, backup)
|
||
case "all":
|
||
if err := s.restoreDatabase(ctx, backup); err != nil {
|
||
return err
|
||
}
|
||
return s.restoreConfig(ctx, backup)
|
||
}
|
||
|
||
return nil
|
||
}
|
||
```
|
||
|
||
### 3.5 系统设置
|
||
|
||
#### 3.5.1 设置管理
|
||
|
||
```go
|
||
// handler/setting_handler.go
|
||
func (h *SettingHandler) GetSettings(c *gin.Context) {
|
||
// 获取所有设置(脱敏)
|
||
settings, _ := h.settingService.GetAllSettings()
|
||
|
||
// 敏感信息脱敏
|
||
for i := range settings {
|
||
if isSensitiveKey(settings[i].Key) {
|
||
settings[i].Value = "***"
|
||
}
|
||
}
|
||
|
||
c.JSON(200, settings)
|
||
}
|
||
|
||
func (h *SettingHandler) UpdateSetting(c *gin.Context) {
|
||
var req UpdateSettingRequest
|
||
if err := c.ShouldBindJSON(&req); err != nil {
|
||
c.JSON(400, gin.H{"error": err.Error()})
|
||
return
|
||
}
|
||
|
||
// 更新设置
|
||
err := h.settingService.UpdateSetting(ctx, req.Key, req.Value)
|
||
if err != nil {
|
||
c.JSON(500, gin.H{"error": err.Error()})
|
||
return
|
||
}
|
||
|
||
// 记录操作日志
|
||
h.opsService.LogOperation(&OperationLog{
|
||
Type: OpTypeUpdate,
|
||
Target: "setting:" + req.Key,
|
||
})
|
||
|
||
c.JSON(200, gin.H{"success": true})
|
||
}
|
||
```
|
||
|
||
## 4. 监控指标
|
||
|
||
### 4.1 系统指标
|
||
|
||
| 指标 | 说明 | 告警阈值 |
|
||
|------|------|----------|
|
||
| `ops_active_connections` | 活跃连接数 | > 10000 |
|
||
| `ops_qps` | 每秒请求数 | > 5000 |
|
||
| `ops_error_rate` | 错误率 | > 1% |
|
||
| `ops_latency_p99` | P99延迟 | > 10s |
|
||
| `ops_account_healthy` | 健康账号比例 | < 80% |
|
||
|
||
### 4.2 资源指标
|
||
|
||
| 指标 | 说明 | 告警阈值 |
|
||
|------|------|----------|
|
||
| `ops_cpu_usage` | CPU使用率 | > 80% |
|
||
| `ops_memory_usage` | 内存使用率 | > 85% |
|
||
| `ops_disk_usage` | 磁盘使用率 | > 90% |
|
||
| `ops_db_connections` | 数据库连接数(占最大连接数的比例) | > 80% |
|
||
|
||
## 5. 修改和扩展指南
|
||
|
||
### 5.1 添加自定义指标
|
||
|
||
```go
|
||
// 添加自定义监控指标
|
||
func (s *OpsService) RegisterCustomMetric(name string, collector MetricCollector) {
|
||
s.customMetrics[name] = collector
|
||
}
|
||
|
||
// 使用
|
||
s.opsService.RegisterCustomMetric("custom_business", func() float64 {
|
||
return getBusinessMetricValue()
|
||
})
|
||
```
|
||
|
||
### 5.2 添加告警渠道
|
||
|
||
```go
|
||
// 添加新的告警渠道
|
||
func (s *AlertService) registerChannel(channelType string, handler AlertChannelHandler) {
|
||
s.channels[channelType] = handler
|
||
}
|
||
|
||
// 使用
|
||
s.alertService.registerChannel("dingtalk", func(alert *Alert) error {
|
||
return sendDingtalkMessage(alert)
|
||
})
|
||
```
|
||
|
||
## 6. 总结
|
||
|
||
运营与监控系统特点:
|
||
|
||
- **全面监控**:覆盖系统、性能、业务多维度
|
||
- **灵活告警**:支持多种告警规则和通知方式
|
||
- **操作审计**:完整的运维日志记录
|
||
- **数据保护**:可靠的备份恢复机制
|
||
|
||
**修改建议:**
|
||
- 告警规则可根据实际需求调整
|
||
- 备份策略应根据数据量合理设置
|
||
|
||
---
|
||
*文档版本:1.0*
|
||
*最后更新:2025-01*
|
||
*分析基于:Sub2API v0.1.104* |