#!/usr/bin/env bash
#
# Gateway health gate: fetches /healthz, /metrics and the gateway
# runtime-status endpoint from BASE_URL, prints them, then prints a JSON
# decision and exits 0 (continue), 1 (pause) or 2 (rollback).
#
# Environment overrides (defaults in parentheses):
#   BASE_URL                 (http://127.0.0.1:8080) base URL of the service
#   CONSUMER                 (gateway) consumer label matched in the metrics
#   APPLIED_RATIO_THRESHOLD  (0.95) pause when applied/(applied+failed) is lower
#   FAILED_BURST_THRESHOLD   (3)    rollback when failed events reach this value
#   PENDING_RETRY_THRESHOLD  (10)   pause when pending retries exceed this value

set -euo pipefail

# Each setting keeps a caller-provided value and falls back to its default
# when the variable is unset or empty.
BASE_URL="${BASE_URL:-http://127.0.0.1:8080}"
CONSUMER="${CONSUMER:-gateway}"
APPLIED_RATIO_THRESHOLD="${APPLIED_RATIO_THRESHOLD:-0.95}"
FAILED_BURST_THRESHOLD="${FAILED_BURST_THRESHOLD:-3}"
PENDING_RETRY_THRESHOLD="${PENDING_RETRY_THRESHOLD:-10}"

#######################################
# Abort the script unless a required external command is available.
# Arguments: $1 - name of the command to look up with `command -v`
# Outputs:   "missing required command: <name>" on stderr when it is absent
# Returns:   0 when the command exists; otherwise exits the shell with
#            status 1 (the same status the decision step uses for "pause").
#######################################
need() {
  local cmd=$1
  if ! command -v "$cmd" >/dev/null 2>&1; then
    printf 'missing required command: %s\n' "$cmd" >&2
    exit 1
  fi
}

# Both tools are used below: curl for the HTTP fetches, python3 for the
# threshold evaluation.
need curl
need python3

# -f turns HTTP error statuses into a non-zero exit, so `set -e` stops the
# script on a bad response; -sS hides the progress meter but keeps error
# messages. --max-time bounds each request so a stalled endpoint cannot hang
# the gate forever; override the limit (in seconds) with CURL_MAX_TIME.
curl_opts=(-fsS --max-time "${CURL_MAX_TIME:-10}")

health=$(curl "${curl_opts[@]}" "$BASE_URL/healthz")
metrics=$(curl "${curl_opts[@]}" "$BASE_URL/metrics")
status=$(curl "${curl_opts[@]}" "$BASE_URL/internal/supply-intelligence/gateway/runtime-status")

# Echo the raw responses so the operator can see what the decision is based on.
# printf '%s\n' is used for the fetched bodies because echo may interpret a
# body that happens to look like an option (for example "-n").
printf '%s\n' "=== healthz ==="
printf '%s\n' "$health"
printf '%s\n' "=== runtime status ==="
printf '%s\n' "$status"
printf '%s\n' "=== metrics excerpt ==="
# grep exits 1 when nothing matches; `|| true` keeps an empty excerpt from
# aborting the script under `set -e`.
grep 'supply_intelligence_gateway_' <<<"$metrics" || true

#######################################
# Keep only the metric sample lines the evaluation step reads.
# Arguments: $1 - full text of the metrics response
# Outputs:   lines of $1 that start with "supply_intelligence_gateway_"
# Returns:   0, also when no line matches
#######################################
gateway_metric_lines() {
  grep -- '^supply_intelligence_gateway_' <<<"$1" || true
}

# The evaluation step receives its inputs through the environment. A single
# environment string is size-limited on Linux (exec fails with "Argument list
# too long"), and a full metrics dump can exceed that, so only the gateway
# sample lines are exported. The evaluation only looks at lines with this
# prefix, so its result is unchanged.
METRICS_TEXT=$(gateway_metric_lines "$metrics")
export METRICS_TEXT
export RUNTIME_STATUS_JSON="$status"
export CONSUMER
export APPLIED_RATIO_THRESHOLD
export FAILED_BURST_THRESHOLD
export PENDING_RETRY_THRESHOLD

#######################################
# Evaluate the exported metrics and runtime status against the thresholds.
# Globals (read from the environment by the embedded Python):
#   METRICS_TEXT, RUNTIME_STATUS_JSON, CONSUMER, APPLIED_RATIO_THRESHOLD,
#   FAILED_BURST_THRESHOLD, PENDING_RETRY_THRESHOLD
# Outputs:   a JSON document with the decision, its reasons and the inputs
# Returns:   0 for "continue", 1 for "pause", 2 for "rollback". An uncaught
#            Python exception (for example invalid JSON) also yields 1.
#######################################
evaluate_gate() {
  python3 <<'PY'
import json
import os
import re
import sys

metrics = os.environ['METRICS_TEXT']
# The parsed status is used as a dict below (status.get).
status = json.loads(os.environ['RUNTIME_STATUS_JSON'])
consumer = os.environ['CONSUMER']
ratio_threshold = float(os.environ['APPLIED_RATIO_THRESHOLD'])
failed_threshold = int(os.environ['FAILED_BURST_THRESHOLD'])
pending_threshold = int(os.environ['PENDING_RETRY_THRESHOLD'])

# Sum the events_processed_total samples per value of their `result` label.
# NOTE(review): unlike the two metrics read further down, these samples are
# not filtered by consumer -- confirm that is intended.
processed = {}
for line in metrics.splitlines():
    if not line.startswith('supply_intelligence_gateway_events_processed_total'):
        continue
    # The sample value is whatever follows the last space on the line.
    head, _, tail = line.rpartition(' ')
    if not tail:
        continue
    # Samples without a {label="value",...} section are skipped.
    m = re.search(r'\{([^}]*)\}$', head)
    if not m:
        continue
    labels = {}
    for part in m.group(1).split(','):
        if '=' not in part:
            continue
        k, v = part.split('=', 1)
        labels[k.strip()] = v.strip().strip('"')
    result_label = labels.get('result')
    if not result_label:
        continue
    processed[result_label] = processed.get(result_label, 0.0) + float(tail)

# For these two metrics only lines carrying consumer="<CONSUMER>" count; when
# several lines match, the last one wins.
consumer_tag = f'consumer="{consumer}"'
pending_retry = 0.0
failed_events = 0.0
for line in metrics.splitlines():
    if line.startswith('supply_intelligence_gateway_pending_retry_events') and consumer_tag in line:
        pending_retry = float(line.rsplit(' ', 1)[-1])
    if line.startswith('supply_intelligence_gateway_failed_events') and consumer_tag in line:
        failed_events = float(line.rsplit(' ', 1)[-1])

# With no applied or failed events yet, the ratio is treated as perfect (1.0).
applied = processed.get('applied', 0.0)
total_terminal = applied + processed.get('failed', 0.0)
applied_ratio = (applied / total_terminal) if total_terminal > 0 else 1.0

decision = 'continue'
reasons = []
if not status.get('started', False):
    decision = 'pause'
    reasons.append('runtime_not_started')
if status.get('last_error'):
    decision = 'pause'
    reasons.append('runtime_last_error')
if pending_retry > pending_threshold:
    decision = 'pause'
    reasons.append('pending_retry_threshold_exceeded')
if applied_ratio < ratio_threshold:
    decision = 'pause'
    reasons.append('applied_ratio_below_threshold')
# Checked last so that 'rollback' overrides any earlier 'pause'.
if failed_events >= failed_threshold:
    decision = 'rollback'
    reasons.append('failed_events_threshold_exceeded')

print(json.dumps({
    'decision': decision,
    'reasons': reasons,
    'applied_ratio': applied_ratio,
    'processed': processed,
    'pending_retry_events': pending_retry,
    'failed_events': failed_events,
    'runtime': status,
}, ensure_ascii=False, indent=2))

if decision == 'rollback':
    sys.exit(2)
if decision == 'pause':
    sys.exit(1)
PY
}

# Last command of the script: its status becomes the script's exit status.
evaluate_gate