---
# =============================================================================
# Sub2API monitoring stack - Prometheus + Grafana + Loki + Jaeger
# =============================================================================
# Usage:
#   1. Create the data directories (one per writable bind mount below):
#      mkdir -p monitoring/{prometheus-data,grafana-data,loki-data,alertmanager-data}
#   2. Start: docker-compose -f docker-compose.monitoring.yml up -d
#   3. Open Grafana: http://localhost:3000 (admin/admin)
#
# NOTE(review): every bind mount below uses a ./-relative host path, so the
# data and config directories must sit in the directory those paths are
# resolved from - confirm that this matches the monitoring/ prefix above.
# =============================================================================

version: '3.8'

services:
  # ===========================================================================
  # Prometheus - time-series database
  # ===========================================================================
  prometheus:
    image: prom/prometheus:v2.50.0
    container_name: sub2api-prometheus
    restart: unless-stopped
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # Retention is bounded by both age and on-disk size.
      - '--storage.tsdb.retention.time=30d'
      - '--storage.tsdb.retention.size=50GB'
      - '--web.enable-lifecycle'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/rules:/etc/prometheus/rules:ro
      - ./prometheus-data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - monitoring-network
    healthcheck:
      test: ["CMD", "wget", "-q", "-O", "-", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ===========================================================================
  # Grafana - visualization platform
  # ===========================================================================
  grafana:
    image: grafana/grafana:10.3.1
    container_name: sub2api-grafana
    restart: unless-stopped
    environment:
      # Credentials fall back to admin/admin when the GRAFANA_ADMIN_* variables
      # are unset; override them for anything beyond local use.
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3000}
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
    volumes:
      - ./grafana-data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    ports:
      - "3000:3000"
    networks:
      - monitoring-network
    # Short-form depends_on: controls start order only.
    depends_on:
      - prometheus
      - loki
      - jaeger
    healthcheck:
      test: ["CMD", "wget", "-q", "-O", "-", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ===========================================================================
  # Loki - log aggregation
  # ===========================================================================
  loki:
    image: grafana/loki:2.9.4
    container_name: sub2api-loki
    restart: unless-stopped
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./loki/loki-config.yaml:/etc/loki/local-config.yaml:ro
      - ./loki-data:/loki
    ports:
      - "3100:3100"
    networks:
      - monitoring-network
    healthcheck:
      test: ["CMD", "wget", "-q", "-O", "-", "http://localhost:3100/ready"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ===========================================================================
  # Promtail - log collector
  # ===========================================================================
  promtail:
    image: grafana/promtail:2.9.4
    container_name: sub2api-promtail
    restart: unless-stopped
    command: -config.file=/etc/promtail/config.yml
    volumes:
      - ./promtail/promtail-config.yml:/etc/promtail/config.yml:ro
      # Host log locations, mounted read-only.
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    networks:
      - monitoring-network
    depends_on:
      - loki

  # ===========================================================================
  # Jaeger - distributed tracing
  # ===========================================================================
  jaeger:
    image: jaegertracing/all-in-one:1.54
    container_name: sub2api-jaeger
    restart: unless-stopped
    environment:
      - COLLECTOR_OTLP_ENABLED=true
    ports:
      - "16686:16686"  # UI
      - "4317:4317"  # OTLP gRPC
      - "4318:4318"  # OTLP HTTP
      - "14268:14268"  # Jaeger Thrift
    networks:
      - monitoring-network
    healthcheck:
      test: ["CMD", "wget", "-q", "-O", "-", "http://localhost:16686"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ===========================================================================
  # Node Exporter - host metrics
  # ===========================================================================
  node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: sub2api-node-exporter
    restart: unless-stopped
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # "$$" is Compose's escape for a literal "$", so the regex reaching the
      # container ends in "($|/)".
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring-network

  # ===========================================================================
  # cAdvisor - container metrics
  # ===========================================================================
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    container_name: sub2api-cadvisor
    restart: unless-stopped
    privileged: true
    devices:
      - /dev/kmsg:/dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro
    ports:
      # Published on host port 8081; the container side stays on 8080.
      - "8081:8080"
    networks:
      - monitoring-network

  # ===========================================================================
  # Alertmanager - alert management
  # ===========================================================================
  alertmanager:
    image: prom/alertmanager:v0.27.0
    container_name: sub2api-alertmanager
    restart: unless-stopped
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    volumes:
      - ./alertmanager/config.yml:/etc/alertmanager/config.yml:ro
      # Writable state directory; listed in the setup step of the file header.
      - ./alertmanager-data:/alertmanager
    ports:
      - "9093:9093"
    networks:
      - monitoring-network
    healthcheck:
      test: ["CMD", "wget", "-q", "-O", "-", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

# Single bridge network shared by every service in this file.
networks:
  monitoring-network:
    driver: bridge