#!/usr/bin/env bash
#
# Rollout gate for the supply-intelligence gateway consumer.
#
# Fetches /healthz, /metrics and the gateway runtime-status endpoint from
# BASE_URL, prints them, and then derives a decision that is printed as JSON.
#
# Environment (all optional):
#   BASE_URL                  service base URL       (default http://127.0.0.1:8080)
#   CONSUMER                  consumer label value   (default gateway)
#   APPLIED_RATIO_THRESHOLD   minimum applied ratio  (default 0.95)
#   FAILED_BURST_THRESHOLD    failed-events rollback (default 3)
#   PENDING_RETRY_THRESHOLD   pending-retry pause    (default 10)
#
# Exit status:
#   0  decision "continue"
#   1  decision "pause" (also: a required command is missing, or the Python
#      evaluator raised an exception)
#   2  decision "rollback"
#   A failing curl aborts the script with curl's own exit status (set -e).

set -euo pipefail

BASE_URL="${BASE_URL:-http://127.0.0.1:8080}"
CONSUMER="${CONSUMER:-gateway}"
APPLIED_RATIO_THRESHOLD="${APPLIED_RATIO_THRESHOLD:-0.95}"
FAILED_BURST_THRESHOLD="${FAILED_BURST_THRESHOLD:-3}"
PENDING_RETRY_THRESHOLD="${PENDING_RETRY_THRESHOLD:-10}"

#######################################
# Abort the script unless the given command is available on PATH.
# Arguments: $1 - command name
# Outputs:   error message on stderr when the command is missing
# Returns:   exits the script with status 1 when the command is missing
#######################################
need() {
  if ! command -v "$1" >/dev/null 2>&1; then
    printf 'missing required command: %s\n' "$1" >&2
    exit 1
  fi
}

need curl
need python3

# -f turns HTTP error responses into a non-zero exit, so set -e stops the
# script before a decision is computed from an error page.
health=$(curl -fsS "$BASE_URL/healthz")
metrics=$(curl -fsS "$BASE_URL/metrics")
status=$(curl -fsS "$BASE_URL/internal/supply-intelligence/gateway/runtime-status")

printf '%s\n' "=== healthz ===" "$health"
printf '%s\n' "=== runtime status ===" "$status"
printf '%s\n' "=== metrics excerpt ==="
# grep exits 1 when nothing matches; an empty excerpt is not an error here.
printf '%s\n' "$metrics" | grep 'supply_intelligence_gateway_' || true

# Only the small scalar settings travel through the environment.
export CONSUMER
export APPLIED_RATIO_THRESHOLD
export FAILED_BURST_THRESHOLD
export PENDING_RETRY_THRESHOLD

# The scraped payloads are handed to Python on file descriptors 3 (metrics)
# and 4 (runtime status) instead of environment variables: a single
# environment string is size-limited by the kernel, and a large /metrics
# response would make the exec of python3 fail. stdin carries the program.
python3 - 3<<<"$metrics" 4<<<"$status" <<'PY'
import json
import os
import re
import sys

PROCESSED_METRIC = 'supply_intelligence_gateway_events_processed_total'
PENDING_METRIC = 'supply_intelligence_gateway_pending_retry_events'
FAILED_METRIC = 'supply_intelligence_gateway_failed_events'

# One exposition sample: name, optional {labels}, value, optional timestamp.
# The value is the first token after the name/label set, so a trailing
# timestamp is never mistaken for the sample value.
SAMPLE_RE = re.compile(
    r'^[^\s{]+(?:\{(?P<labels>.*)\})?\s+(?P<value>\S+)(?:\s+\S+)?\s*$'
)


def read_fd(fd):
    """Return all text available on an inherited file descriptor."""
    with open(fd, encoding='utf-8', errors='replace') as handle:
        return handle.read()


def parse_labels(text):
    """Turn the text between the braces into a {label_name: value} dict."""
    labels = {}
    for part in text.split(','):
        key, sep, raw = part.partition('=')
        if sep:
            labels[key.strip()] = raw.strip().strip('"')
    return labels


def gauge_value(line):
    """Return the sample value of a metric line; raise ValueError if absent."""
    sample = SAMPLE_RE.match(line)
    if sample is None:
        raise ValueError(f'unparsable metric line: {line!r}')
    return float(sample.group('value'))


metrics = read_fd(3)
status = json.loads(read_fd(4))
consumer = os.environ['CONSUMER']
ratio_threshold = float(os.environ['APPLIED_RATIO_THRESHOLD'])
failed_threshold = int(os.environ['FAILED_BURST_THRESHOLD'])
pending_threshold = int(os.environ['PENDING_RETRY_THRESHOLD'])

consumer_label = f'consumer="{consumer}"'
processed = {}        # result label -> summed counter value
pending_retry = 0.0
failed_events = 0.0

for line in metrics.splitlines():
    if line.startswith(PROCESSED_METRIC):
        sample = SAMPLE_RE.match(line)
        # Samples without a label set cannot be attributed to a result.
        if sample is not None and sample.group('labels') is not None:
            result_label = parse_labels(sample.group('labels')).get('result')
            if result_label:
                processed[result_label] = (
                    processed.get(result_label, 0.0)
                    + float(sample.group('value'))
                )
    # NOTE(review): metric names are matched by prefix and the consumer label
    # by substring, as before; confirm no other series shares these prefixes.
    if line.startswith(PENDING_METRIC) and consumer_label in line:
        pending_retry = gauge_value(line)
    if line.startswith(FAILED_METRIC) and consumer_label in line:
        failed_events = gauge_value(line)

applied = processed.get('applied', 0.0)
total_terminal = applied + processed.get('failed', 0.0)
# With no terminal events yet there is nothing to judge; treat as healthy.
applied_ratio = (applied / total_terminal) if total_terminal > 0 else 1.0

decision = 'continue'
reasons = []

if not status.get('started', False):
    decision = 'pause'
    reasons.append('runtime_not_started')

if status.get('last_error'):
    decision = 'pause'
    reasons.append('runtime_last_error')

if pending_retry > pending_threshold:
    decision = 'pause'
    reasons.append('pending_retry_threshold_exceeded')

if applied_ratio < ratio_threshold:
    decision = 'pause'
    reasons.append('applied_ratio_below_threshold')

# Evaluated last on purpose: a rollback overrides any earlier pause.
if failed_events >= failed_threshold:
    decision = 'rollback'
    reasons.append('failed_events_threshold_exceeded')

print(json.dumps({
    'decision': decision,
    'reasons': reasons,
    'applied_ratio': applied_ratio,
    'processed': processed,
    'pending_retry_events': pending_retry,
    'failed_events': failed_events,
    'runtime': status,
}, ensure_ascii=False, indent=2))

if decision == 'rollback':
    sys.exit(2)
if decision == 'pause':
    sys.exit(1)
PY