Files
ai-customer-service/scripts/verify_gate_c_rollback.sh

392 lines
12 KiB
Bash

#!/usr/bin/env bash
# Prologue for the gate C rollback drill: strict mode, run identity,
# artifact locations and the mutable counters/handles used by the helpers.
set -euo pipefail

# Parent directory of the directory that contains this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Caller-overridable settings; an empty or unset value selects the default.
: "${RUN_ID:=gatec-rollback-$(date +%Y%m%d%H%M%S)}"
: "${ARTIFACT_DIR:=/tmp/ai-customer-service-gate-c-rollback/$RUN_ID}"
: "${BROKEN_AI_CS_POSTGRES_DSN:=}"

# Scratch directory for the generated Go helper, kept inside the repo tree.
GO_HELPER_DIR="$ROOT_DIR/.tmp/verify_gate_c_rollback/$RUN_ID"

# Files written below the artifact directory.
SUMMARY_FILE="$ARTIFACT_DIR/summary.txt"
BASELINE_LOG_FILE="$ARTIFACT_DIR/baseline-service.log"
BROKEN_LOG_FILE="$ARTIFACT_DIR/broken-service.log"
ROLLED_BACK_LOG_FILE="$ARTIFACT_DIR/rolled-back-service.log"

# Binary under test: APP_BIN from the environment wins over the default path.
DEFAULT_APP_BIN="$ARTIFACT_DIR/ai-customer-service"
: "${APP_BIN:=$DEFAULT_APP_BIN}"

mkdir -p "$ARTIFACT_DIR" "$GO_HELPER_DIR"

# Mutable run state shared by the helper functions.
PASS_COUNT=0
FAIL_COUNT=0
APP_PID=""
BASE_URL=""
# Print one line to stdout and append the same line to $SUMMARY_FILE.
# Arguments: message words, joined into a single line via "$*".
log() {
  local -r line="$*"
  tee -a "$SUMMARY_FILE" <<<"$line"
}
# Record a successful check: bump PASS_COUNT and log a "[PASS] ..." line.
# Arguments: description of the check that passed.
pass() {
  : "$((PASS_COUNT += 1))"
  local -r line="[PASS] $*"
  log "$line"
}
# Record a failed check and abort: bump FAIL_COUNT, log a "[FAIL] ..." line,
# then exit the current shell with status 1.
# Arguments: description of the failure.
fail() {
  : "$((FAIL_COUNT += 1))"
  local -r line="[FAIL] $*"
  log "$line"
  exit 1
}
# Abort through fail() unless the named command can be resolved.
# Arguments: $1 - command name to look up with `command -v`.
require_cmd() {
  local -r tool="$1"
  command -v "$tool" >/dev/null 2>&1 || fail "missing command: $tool"
}
# Abort through fail() when the named variable is unset or empty.
# Arguments: $1 - name of the variable to check (read by indirection).
require_env() {
  local -r var_name="$1"
  [[ -n "${!var_name:-}" ]] || fail "missing required env: $var_name"
}
# Stop the background service tracked in APP_PID, if there is one.
# Globals: APP_PID (read, then reset to the empty string).
# A PID that is empty or no longer alive is skipped; kill/wait errors are
# deliberately ignored so this is safe to call at any time.
stop_service() {
  local -r pid="$APP_PID"
  if [[ -n "$pid" ]]; then
    if kill -0 "$pid" >/dev/null 2>&1; then
      kill "$pid" >/dev/null 2>&1 || true
      wait "$pid" >/dev/null 2>&1 || true
    fi
  fi
  APP_PID=""
}
# EXIT handler: stop any service that is still running and remove the
# per-run Go helper directory so repeated drills do not accumulate scratch
# files. Files under $ARTIFACT_DIR are left in place.
# Globals: GO_HELPER_DIR (read); APP_PID (via stop_service).
cleanup() {
  stop_service
  # Guard against an empty path before the recursive delete; a failed
  # delete must not mask the script's own exit status.
  if [[ -n "${GO_HELPER_DIR:-}" && -d "$GO_HELPER_DIR" ]]; then
    rm -rf -- "$GO_HELPER_DIR" || true
  fi
}
trap cleanup EXIT
# Turn a listen address into the base URL used for local probes.
# Arguments: $1 - listen address, either "host:port" or ":port".
# Outputs:   "http://<host>:<port>" on stdout, without a trailing newline.
# A missing host (":port") and the wildcard host "0.0.0.0" both map to
# 127.0.0.1. Any other shape, or a port that is not all digits, aborts
# through fail().
extract_base_url() {
  local addr="$1"
  local host=""
  local port=""
  if [[ "$addr" == :* ]]; then
    host="127.0.0.1"
    port="${addr#:}"
  else
    host="${addr%:*}"
    port="${addr##*:}"
    if [[ -z "$host" || "$host" == "$addr" ]]; then
      # This function runs inside $(...); send the message to stderr so it
      # reaches the operator instead of being captured as the URL.
      fail "AI_CS_ADDR must be host:port or :port, got: $addr" >&2
    fi
    if [[ "$host" == "0.0.0.0" ]]; then
      host="127.0.0.1"
    fi
  fi
  # An empty or non-numeric port cannot form a usable URL.
  if [[ ! "$port" =~ ^[0-9]+$ ]]; then
    fail "AI_CS_ADDR must be host:port or :port, got: $addr" >&2
  fi
  printf 'http://%s:%s' "$host" "$port"
}
# Derive a deliberately unreachable DSN from $AI_CS_POSTGRES_DSN by pointing
# it at port 1.
# Globals: AI_CS_POSTGRES_DSN (read).
# Outputs: the rewritten DSN on stdout, without a trailing newline.
# URL-style DSNs (postgres:// or postgresql://) get their port replaced, or
# added when the URL has none. Key/value DSNs get their port= setting
# replaced, or " port=1" appended when there is none.
# The DSN is handed to python through the environment rather than argv so
# embedded credentials do not show up in process listings.
derive_broken_dsn() {
  SOURCE_DSN="$AI_CS_POSTGRES_DSN" python3 - <<'PY'
import os
import re
import sys
from urllib.parse import urlsplit, urlunsplit

dsn = os.environ["SOURCE_DSN"]

if dsn.startswith(("postgres://", "postgresql://")):
    parts = urlsplit(dsn)
    # Split "user:pw@host:port" on the last "@" so passwords stay intact.
    userinfo, at, hostport = parts.netloc.rpartition("@")
    if hostport.startswith("["):
        # Bracketed IPv6 literal: keep everything up to the closing bracket.
        host = hostport[: hostport.find("]") + 1]
    else:
        host = hostport.split(":", 1)[0]
    broken = urlunsplit(parts._replace(netloc=f"{userinfo}{at}{host}:1"))
else:
    # \b keeps keys that merely end in "port" from matching.
    broken, replaced = re.subn(r"\bport\s*=\s*'?\d*'?", "port=1", dsn, count=1)
    if not replaced:
        broken = f"{dsn} port=1"

sys.stdout.write(broken)
PY
}
# Generate the Go helper used by db_value into $GO_HELPER_DIR/db_query.go.
# The generated program reads DB_DSN and SQL_QUERY from its environment,
# opens the database with the lib/pq "postgres" driver, pings it, runs the
# query and prints the first column of the first row as a string with no
# trailing newline. Missing inputs or any database error are written to
# stderr and the program exits with status 2.
# NOTE(review): the helper lives under $ROOT_DIR, presumably so `go run`
# resolves github.com/lib/pq through the project's Go module — confirm.
DB_QUERY_HELPER="$GO_HELPER_DIR/db_query.go"
# Quoted delimiter: the Go source below is written verbatim, with no shell
# expansion.
cat >"$DB_QUERY_HELPER" <<'EOF'
package main
import (
"database/sql"
"fmt"
"os"
_ "github.com/lib/pq"
)
func main() {
dsn := os.Getenv("DB_DSN")
query := os.Getenv("SQL_QUERY")
if dsn == "" || query == "" {
fmt.Fprintln(os.Stderr, "DB_DSN and SQL_QUERY are required")
os.Exit(2)
}
db, err := sql.Open("postgres", dsn)
if err != nil {
fmt.Fprintln(os.Stderr, err.Error())
os.Exit(2)
}
defer db.Close()
if err := db.Ping(); err != nil {
fmt.Fprintln(os.Stderr, err.Error())
os.Exit(2)
}
var value string
if err := db.QueryRow(query).Scan(&value); err != nil {
fmt.Fprintln(os.Stderr, err.Error())
os.Exit(2)
}
fmt.Print(value)
}
EOF
# Run one SQL statement against $AI_CS_POSTGRES_DSN through the generated
# Go helper and print whatever the helper prints.
# Globals:   AI_CS_POSTGRES_DSN, DB_QUERY_HELPER (read).
# Arguments: $1 - SQL text, passed to the helper as SQL_QUERY.
db_value() {
  local -r statement="$1"
  DB_DSN="$AI_CS_POSTGRES_DSN" \
    SQL_QUERY="$statement" \
    go run "$DB_QUERY_HELPER"
}
# Compare two strings literally; record a pass on equality, otherwise abort
# through fail() with both values in the message.
# Arguments: $1 - actual value, $2 - expected value, $3 - check description.
assert_eq() {
  local -r got="$1"
  local -r want="$2"
  local -r label="$3"
  [[ "$got" == "$want" ]] || fail "$label (got=$got want=$want)"
  pass "$label"
}
# Record a pass when the value is a positive decimal integer without
# leading zeros; otherwise abort through fail().
# Arguments: $1 - value to check, $2 - check description.
assert_nonzero_count() {
  local -r count="$1"
  local -r label="$2"
  local -r positive_int='^[1-9][0-9]*$'
  if ! [[ "$count" =~ $positive_int ]]; then
    fail "$label (got=$count want>=1)"
    return
  fi
  pass "$label"
}
# Start the app binary in the background with the drill's configuration.
# Globals:   ROOT_DIR, APP_BIN and the AI_CS_* settings (read);
#            APP_PID (written with the PID of the started process).
# Arguments: $1 - Postgres DSN handed to the app as AI_CS_POSTGRES_DSN
#            $2 - file that receives the app's stdout and stderr
# Any previously tracked service is stopped first.
start_service_with_env() {
  local dsn="$1"
  local log_file="$2"
  stop_service
  (
    cd "$ROOT_DIR" || exit 1
    # exec replaces the wrapper subshell with the app, so the PID captured
    # from $! below is the app itself. Without it, stop_service could kill
    # only the wrapper and leave the app holding the listen port.
    AI_CS_RUNTIME_ENV="$AI_CS_RUNTIME_ENV" \
      AI_CS_ADDR="$AI_CS_ADDR" \
      AI_CS_POSTGRES_ENABLED="$AI_CS_POSTGRES_ENABLED" \
      AI_CS_POSTGRES_DSN="$dsn" \
      AI_CS_POSTGRES_MIGRATION_DIR="$AI_CS_POSTGRES_MIGRATION_DIR" \
      AI_CS_WEBHOOK_SECRET="$AI_CS_WEBHOOK_SECRET" \
      AI_CS_WEBHOOK_TIMESTAMP_HEADER="$AI_CS_WEBHOOK_TIMESTAMP_HEADER" \
      AI_CS_WEBHOOK_SIGNATURE_HEADER="$AI_CS_WEBHOOK_SIGNATURE_HEADER" \
      AI_CS_WEBHOOK_MAX_SKEW_SECONDS="$AI_CS_WEBHOOK_MAX_SKEW_SECONDS" \
      exec "$APP_BIN"
  ) >"$log_file" 2>&1 &
  APP_PID=$!
}
# Poll the live and ready health endpoints until both answer successfully.
# Globals:   BASE_URL, SUMMARY_FILE (read).
# Arguments: $1 - service log file, used only for failure diagnostics.
# Makes up to 30 attempts with a one second pause after each failed one.
# On timeout the last 100 log lines are appended to the summary file and
# the script aborts through fail().
wait_ready() {
  local log_file="$1"
  local attempt
  for ((attempt = 1; attempt <= 30; attempt++)); do
    if curl -fsS "$BASE_URL/actuator/health/live" >/dev/null 2>&1 \
      && curl -fsS "$BASE_URL/actuator/health/ready" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  tail -100 "$log_file" | tee -a "$SUMMARY_FILE" >/dev/null || true
  fail "service did not become live+ready"
}
# Confirm that the service started with the broken DSN never turns ready.
# Globals:   APP_PID, BASE_URL, SUMMARY_FILE (read).
# Arguments: $1 - service log file, used only for failure diagnostics.
# Up to 12 rounds, one second apart: a tracked process that has exited is
# recorded as a pass and ends the wait; a successful ready probe appends
# the last 100 log lines to the summary and aborts through fail(). After
# the rounds one final ready probe decides between fail and pass.
wait_broken_startup() {
  local log_file="$1"
  local -r ready_url="$BASE_URL/actuator/health/ready"
  local attempt
  for ((attempt = 0; attempt < 12; attempt++)); do
    if [[ -n "$APP_PID" ]]; then
      if ! kill -0 "$APP_PID" >/dev/null 2>&1; then
        pass "broken release process exited as expected"
        return
      fi
    fi
    if curl -fsS "$ready_url" >/dev/null 2>&1; then
      tail -100 "$log_file" | tee -a "$SUMMARY_FILE" >/dev/null || true
      fail "broken release unexpectedly became ready"
    fi
    sleep 1
  done
  if curl -fsS "$ready_url" >/dev/null 2>&1; then
    tail -100 "$log_file" | tee -a "$SUMMARY_FILE" >/dev/null || true
    fail "broken release unexpectedly became ready after timeout"
  fi
  pass "broken release never became ready"
}
# Build a widget webhook payload, sign it and POST it to the service.
# Globals:   ARTIFACT_DIR, BASE_URL, AI_CS_WEBHOOK_SECRET,
#            AI_CS_WEBHOOK_TIMESTAMP_HEADER, AI_CS_WEBHOOK_SIGNATURE_HEADER
#            (all read).
# Arguments: $1 - message id (also names the body file in $ARTIFACT_DIR)
#            $2 - open id of the sender
#            $3 - file that receives the response body
# Outputs:   the HTTP status code on stdout.
# The signature is the hex HMAC-SHA256 of "<timestamp>.<body bytes>" keyed
# with the webhook secret.
send_signed_webhook() {
  local message_id="$1"
  local open_id="$2"
  local response_file="$3"
  local body_file="$ARTIFACT_DIR/${message_id}.json"

  # Write the body as explicit UTF-8 bytes so the non-ASCII content does
  # not depend on the locale of the calling shell.
  MESSAGE_ID="$message_id" OPEN_ID="$open_id" python3 >"$body_file" <<'PY'
import json
import os
import sys

payload = {
    "message_id": os.environ["MESSAGE_ID"],
    "channel": "widget",
    "open_id": os.environ["OPEN_ID"],
    "content": "我要退款",
}
body = json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
sys.stdout.buffer.write(body.encode("utf-8"))
PY

  local ts
  ts="$(date +%s)"

  # The secret travels through the environment, not argv, so it does not
  # appear in process listings.
  local sig
  sig="$(WEBHOOK_SECRET="$AI_CS_WEBHOOK_SECRET" python3 - "$ts" "$body_file" <<'PY'
import hashlib
import hmac
import os
import sys

timestamp, body_path = sys.argv[1], sys.argv[2]
secret = os.environ["WEBHOOK_SECRET"]
with open(body_path, "rb") as fh:
    body = fh.read()
payload = timestamp.encode("utf-8") + b"." + body
digest = hmac.new(secret.encode("utf-8"), payload, hashlib.sha256).hexdigest()
sys.stdout.write(digest)
PY
)"

  curl -sS -o "$response_file" -w '%{http_code}' \
    -X POST "$BASE_URL/api/v1/customer-service/webhook" \
    -H "Content-Type: application/json" \
    -H "$AI_CS_WEBHOOK_TIMESTAMP_HEADER: $ts" \
    -H "$AI_CS_WEBHOOK_SIGNATURE_HEADER: $sig" \
    --data-binary "@$body_file"
}
# Print one top-level field of a JSON response file.
# Arguments: $1 - path to the JSON file, $2 - field name.
# Outputs:   the field value on stdout, without a trailing newline.
# Booleans print as "true"/"false". A missing field, a JSON null, or a
# document that is not a JSON object all print as the empty string, so
# callers can test for absence with -z.
extract_response_field() {
  local response_file="$1"
  local field="$2"
  python3 - "$response_file" "$field" <<'PY'
import json
import sys

with open(sys.argv[1], "r", encoding="utf-8") as fh:
    data = json.load(fh)

value = data.get(sys.argv[2]) if isinstance(data, dict) else None
if value is None:
    text = ""
elif isinstance(value, bool):
    text = str(value).lower()
else:
    text = str(value)
sys.stdout.write(text)
PY
}
# Run header: record the identity and locations of this drill run.
log "# verify_gate_c_rollback.sh"
log "run_id=$RUN_ID"
log "artifact_dir=$ARTIFACT_DIR"
log "root_dir=$ROOT_DIR"

# Require only the tools this script actually invokes: curl for probes and
# the webhook, go for the build and the DB helper, python3 for payload
# signing and JSON parsing. openssl used to be demanded here although
# nothing in the script calls it, which failed the drill needlessly on
# hosts without it.
require_cmd curl
require_cmd go
require_cmd python3
pass "required commands available"
# Settings that must be supplied by the caller.
require_env AI_CS_RUNTIME_ENV
require_env AI_CS_ADDR
require_env AI_CS_POSTGRES_ENABLED
require_env AI_CS_POSTGRES_DSN
require_env AI_CS_POSTGRES_MIGRATION_DIR
require_env AI_CS_WEBHOOK_SECRET

# Optional webhook settings with their defaults.
AI_CS_WEBHOOK_TIMESTAMP_HEADER="${AI_CS_WEBHOOK_TIMESTAMP_HEADER:-X-CS-Timestamp}"
AI_CS_WEBHOOK_SIGNATURE_HEADER="${AI_CS_WEBHOOK_SIGNATURE_HEADER:-X-CS-Signature}"
AI_CS_WEBHOOK_MAX_SKEW_SECONDS="${AI_CS_WEBHOOK_MAX_SKEW_SECONDS:-300}"

# Base URL for local probes, derived from the listen address.
BASE_URL="$(extract_base_url "$AI_CS_ADDR")"

# Use the caller's broken DSN when given, otherwise derive one.
if [[ -z "$BROKEN_AI_CS_POSTGRES_DSN" ]]; then
  BROKEN_AI_CS_POSTGRES_DSN="$(derive_broken_dsn)"
fi
# A "broken" DSN equal to the healthy one would let the broken release
# start normally; stop here with a clear message instead of failing later
# with "unexpectedly became ready".
if [[ "$BROKEN_AI_CS_POSTGRES_DSN" == "$AI_CS_POSTGRES_DSN" ]]; then
  fail "broken DSN is identical to AI_CS_POSTGRES_DSN; set BROKEN_AI_CS_POSTGRES_DSN to an unreachable DSN"
fi

# The drill only runs against a production-mode, Postgres-backed config.
assert_eq "$AI_CS_RUNTIME_ENV" "production" "runtime env is production"
assert_eq "$AI_CS_POSTGRES_ENABLED" "true" "postgres mode enabled for rollback drill"
if [[ ! -d "$AI_CS_POSTGRES_MIGRATION_DIR" ]]; then
  fail "migration dir not found: $AI_CS_POSTGRES_MIGRATION_DIR"
fi
pass "migration dir exists: $AI_CS_POSTGRES_MIGRATION_DIR"
# Pick the binary under test: a caller-provided APP_BIN must already be
# executable; the default path is built from the current source tree.
if [[ "$APP_BIN" != "$DEFAULT_APP_BIN" ]]; then
  if [[ -x "$APP_BIN" ]]; then
    pass "using provided executable app binary: $APP_BIN"
  else
    fail "app binary is not executable: $APP_BIN"
  fi
else
  (cd "$ROOT_DIR" && go build -o "$APP_BIN" ./cmd/ai-customer-service)
  pass "built current source into rollback drill app binary: $APP_BIN"
fi

# Connectivity probe: any non-empty answer from the helper counts as success.
if [[ -z "$(db_value "SELECT '1'")" ]]; then
  fail "postgres connectivity check returned empty result"
else
  pass "postgres connectivity check passed"
fi
# --- Phase 1: baseline -------------------------------------------------
# Start the service with the healthy DSN, wait for live+ready, send one
# signed webhook and expect HTTP 200 with received=true and handoff=true,
# then stop the service.
BASELINE_MESSAGE_ID="${RUN_ID}-baseline-message"
BASELINE_OPEN_ID="${RUN_ID}-baseline-open"
BASELINE_RESP_FILE="$ARTIFACT_DIR/baseline_webhook_response.json"
start_service_with_env "$AI_CS_POSTGRES_DSN" "$BASELINE_LOG_FILE"
pass "baseline service process started (pid=$APP_PID)"
wait_ready "$BASELINE_LOG_FILE"
pass "baseline service live and ready probes passed"
HTTP_CODE="$(send_signed_webhook "$BASELINE_MESSAGE_ID" "$BASELINE_OPEN_ID" "$BASELINE_RESP_FILE")"
assert_eq "$HTTP_CODE" "200" "baseline signed webhook request returned HTTP 200"
assert_eq "$(extract_response_field "$BASELINE_RESP_FILE" "received")" "true" "baseline webhook response received=true"
assert_eq "$(extract_response_field "$BASELINE_RESP_FILE" "handoff")" "true" "baseline webhook response handoff=true"
stop_service
pass "baseline service stopped before broken release"
# --- Phase 2: broken release -------------------------------------------
# Start the same binary with the broken DSN and require that it either
# exits or never reports ready.
start_service_with_env "$BROKEN_AI_CS_POSTGRES_DSN" "$BROKEN_LOG_FILE"
pass "broken release process started (pid=$APP_PID)"
wait_broken_startup "$BROKEN_LOG_FILE"
# --- Phase 3: rollback -------------------------------------------------
# Restart with the healthy DSN (start_service_with_env stops whatever is
# still tracked in APP_PID first) and repeat the webhook check.
start_service_with_env "$AI_CS_POSTGRES_DSN" "$ROLLED_BACK_LOG_FILE"
pass "rollback restart process started (pid=$APP_PID)"
wait_ready "$ROLLED_BACK_LOG_FILE"
pass "rolled-back service live and ready probes passed"
ROLLED_BACK_MESSAGE_ID="${RUN_ID}-rollback-message"
ROLLED_BACK_OPEN_ID="${RUN_ID}-rollback-open"
ROLLED_BACK_RESP_FILE="$ARTIFACT_DIR/rolled_back_webhook_response.json"
HTTP_CODE="$(send_signed_webhook "$ROLLED_BACK_MESSAGE_ID" "$ROLLED_BACK_OPEN_ID" "$ROLLED_BACK_RESP_FILE")"
assert_eq "$HTTP_CODE" "200" "rolled-back signed webhook request returned HTTP 200"
assert_eq "$(extract_response_field "$ROLLED_BACK_RESP_FILE" "received")" "true" "rolled-back webhook response received=true"
assert_eq "$(extract_response_field "$ROLLED_BACK_RESP_FILE" "handoff")" "true" "rolled-back webhook response handoff=true"
ROLLED_BACK_TICKET_ID="$(extract_response_field "$ROLLED_BACK_RESP_FILE" "ticket_id")"
ROLLED_BACK_SESSION_ID="$(extract_response_field "$ROLLED_BACK_RESP_FILE" "session_id")"
if [[ -z "$ROLLED_BACK_TICKET_ID" || -z "$ROLLED_BACK_SESSION_ID" ]]; then
fail "rolled-back webhook response missing ticket_id or session_id"
fi
pass "rolled-back webhook response returned ticket_id and session_id"
# Persistence checks: the rolled-back webhook must have left an open
# ticket, a dedup row, a message_processed audit entry and a ticket linked
# to the session.
# NOTE(review): the values below (RUN_ID-derived ids and the ticket id read
# from the response) are interpolated straight into SQL text; a quote in
# any of them would break the query — confirm these inputs are trusted.
assert_eq "$(db_value "SELECT status FROM cs_tickets WHERE id = '$ROLLED_BACK_TICKET_ID'::uuid")" "open" "rolled-back webhook created open ticket"
assert_nonzero_count "$(db_value "SELECT COUNT(*)::text FROM cs_message_dedup WHERE channel = 'widget' AND message_id = '$ROLLED_BACK_MESSAGE_ID'")" "rolled-back webhook persisted dedup row"
assert_nonzero_count "$(db_value "SELECT COUNT(*)::text FROM cs_audit_logs WHERE object_type = 'message_processed' AND action = 'process' AND actor_id = '$ROLLED_BACK_OPEN_ID'")" "rolled-back webhook persisted message_processed audit"
assert_nonzero_count "$(db_value "SELECT COUNT(*)::text FROM cs_tickets t JOIN cs_sessions s ON s.id = t.session_id WHERE s.channel = 'widget' AND s.open_id = '$ROLLED_BACK_OPEN_ID'")" "rolled-back webhook persisted ticket linked to session"
pass "gate-c rollback drill completed successfully"
# --- Summary -----------------------------------------------------------
# Record the identifiers and log locations of this run, then the totals.
# fail() exits immediately, so this point is only reached when every check
# passed.
log "baseline_message_id=$BASELINE_MESSAGE_ID"
log "rolled_back_message_id=$ROLLED_BACK_MESSAGE_ID"
log "rolled_back_ticket_id=$ROLLED_BACK_TICKET_ID"
log "rolled_back_session_id=$ROLLED_BACK_SESSION_ID"
log "baseline_log_file=$BASELINE_LOG_FILE"
log "broken_log_file=$BROKEN_LOG_FILE"
log "rolled_back_log_file=$ROLLED_BACK_LOG_FILE"
log "summary: pass=$PASS_COUNT fail=$FAIL_COUNT"