Add production-ready monitoring infrastructure:
- 15 alerting rules (4 Critical + 11 Warning)
- Grafana dashboard with service health panels
- Full documentation with deployment guide
Covers: service availability, error rates, latency, routing health, database connections, and log metrics
86 lines
2.0 KiB
JSON
86 lines
2.0 KiB
JSON
{
|
|
"dashboard": {
|
|
"id": null,
|
|
"title": "Sub2API Relay Manager",
|
|
"tags": ["sub2api", "relay", "monitoring"],
|
|
"timezone": "UTC",
|
|
"panels": [
|
|
{
|
|
"id": 1,
|
|
"title": "Service Status",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "up{job=\"sub2api-relay-manager\"}",
|
|
"legendFormat": "Service Up"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"thresholds": {
|
|
"steps": [
|
|
{ "color": "red", "value": 0 },
|
|
{ "color": "green", "value": 1 }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }
|
|
},
|
|
{
|
|
"id": 2,
|
|
"title": "Active Providers",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "active_providers",
|
|
"legendFormat": "Providers"
|
|
}
|
|
],
|
|
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }
|
|
},
|
|
{
|
|
"id": 3,
|
|
"title": "Active Hosts",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "active_hosts",
|
|
"legendFormat": "Hosts"
|
|
}
|
|
],
|
|
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }
|
|
},
|
|
{
|
|
"id": 4,
|
|
"title": "Request Rate",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "rate(http_requests_total[5m])",
|
|
"legendFormat": "{{method}} {{path}}"
|
|
}
|
|
],
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }
|
|
},
|
|
{
|
|
"id": 5,
|
|
"title": "Request Duration p95",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
|
|
"legendFormat": "p95"
|
|
}
|
|
],
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }
|
|
}
|
|
],
|
|
"time": {
|
|
"from": "now-1h",
|
|
"to": "now"
|
|
},
|
|
"refresh": "30s"
|
|
}
|
|
}
|