fix(deploy): production CRM deployment improvements

- Fix deploy_crm_only.sh: non-destructive hot reload
  - Enhanced stop logic with pgrep + fuser for port release
  - Added 3-layer verification (process/control/user)
  - Check /proc/$pid/exe for (deleted) marker
  - Never delete DB

- Fix portal script contracts: crm_session → crm_subject
  - deploy_tksea_portal.sh: use $cookie_crm_subject
  - test_tksea_portal_assets.sh: assert crm_subject exists
  - nginx.example.conf: updated trusted subject header

- Add systemd service management
  - sub2api-crm.service.template
  - install_crm_systemd.sh
  - verify_crm_deployment.sh

Update docs/plans/2026-06-04-next-version-plan.md with deployment findings.
This commit is contained in:
phamnazage-jpg
2026-06-10 15:44:45 +08:00
parent 85954e516a
commit 47ced19c7b
10 changed files with 915 additions and 60 deletions

View File

@@ -7,11 +7,19 @@
# Strict mode: abort on command failure, on unset variables, and on a failure
# anywhere in a pipeline.
set -euo pipefail
# Repository root, resolved as two directories above this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# Optional operator settings file; the path itself can be overridden from the
# environment.
DEPLOY_ENV_FILE="${DEPLOY_ENV_FILE:-$ROOT_DIR/scripts/deploy/.env.deploy}"
if [[ -f "$DEPLOY_ENV_FILE" ]]; then
# set -a exports every variable assigned while the file is sourced; set +a
# switches that off again afterwards.
set -a
# shellcheck disable=SC1090
source "$DEPLOY_ENV_FILE"
set +a
fi
# Shared deploy helpers (the library's contents are not visible in this view).
# shellcheck disable=SC1091
source "$ROOT_DIR/scripts/deploy/remote43_patched_stack_lib.sh"
# NOTE(review): KEY and REMOTE are each assigned twice below. The first pair
# bakes in a personal key path and a host address; once it has run, the second
# pair (empty defaults) can no longer take effect. This looks like removed and
# added diff lines shown together - confirm only the empty defaults remain.
KEY="${KEY:-/home/long/下载/zjsea.pem}"
REMOTE="${REMOTE:-ubuntu@43.155.133.187}"
KEY="${KEY:-}"
REMOTE="${REMOTE:-}"
# Remaining defaults; each keeps a value already present in the environment
# (including one exported from the env file above).
STACK_NAME="${STACK_NAME:-crm-only-$(date +%Y%m%d)}"
CRM_PORT="${CRM_PORT:-18190}"
CRM_BINARY="${CRM_BINARY:-$ROOT_DIR/server}"
@@ -120,29 +128,121 @@ if [[ -f "$REMOTE_REPO_BUNDLE" ]]; then
git -C "$REMOTE_REPO_ROOT" config user.email "remote43-crm@tksea.top"
fi
# Non-destructive hot reload: make sure the old process has exited before
# starting the new one.
# Never delete the DB: production data must be kept.
# Improved stop logic: clean up by process name and port, not only by PID file.
echo "Stopping any existing CRM processes..."
# 1. Stop via the PID file (if it exists).
if [[ -f "$CRM_PID_FILE" ]]; then
OLD_PID="$(cat "$CRM_PID_FILE")"
# NOTE(review): the next two lines (a kill guarded by an `if`, then sleep 1)
# leave this span with one more `if` than `fi`; they look like the pre-change
# version of this step - confirm they are not in the deployed script.
if kill "$OLD_PID" >/dev/null 2>&1; then
sleep 1
# Only act when the recorded PID is still alive (kill -0 sends no signal).
if kill -0 "$OLD_PID" >/dev/null 2>&1; then
echo "Stopping PID from pidfile: $OLD_PID"
# Ask politely first; kill without a signal option sends TERM.
kill "$OLD_PID" >/dev/null 2>&1 || true
# Poll up to 20 times, 0.5 s apart, for the process to exit.
for i in {1..20}; do
if ! kill -0 "$OLD_PID" >/dev/null 2>&1; then break; fi
sleep 0.5
done
# Still alive after the grace period: force kill.
if kill -0 "$OLD_PID" >/dev/null 2>&1; then
kill -9 "$OLD_PID" >/dev/null 2>&1 || true
sleep 1
fi
fi
rm -f "$CRM_PID_FILE"
fi
# NOTE(review): the rm below removes $CRM_DB_FILE, which contradicts the
# never-delete-DB rule stated above; it and the nohup start after it look like
# pre-change lines shown alongside their replacements - confirm they are gone.
rm -f "$CRM_DB_FILE" "$CRM_LOG_FILE"
nohup bash -lc 'set -a; source "$CRM_ENV_FILE"; set +a; exec "$CRM_BINARY"' >"$CRM_LOG_FILE" 2>&1 &
echo $! > "$CRM_PID_FILE"
# 2. Stop any leftover CRM processes by command-line pattern.
# pgrep -f matches against full command lines and only leaves out pgrep
# itself, so this script and its parent shell are skipped explicitly in case
# their own path or arguments happen to match a pattern.
for pattern in 'sub2api.*crm' 'sub2api.*relay-manager'; do
  for pid in $(pgrep -f "$pattern" 2>/dev/null); do
    if [[ "$pid" == "$$" || "$pid" == "$PPID" ]]; then
      continue
    fi
    echo "Stopping process by pattern ($pattern): $pid"
    # TERM first; escalate to KILL only if the process survives the pause.
    kill "$pid" 2>/dev/null || true
    sleep 0.5
    if kill -0 "$pid" 2>/dev/null; then
      kill -9 "$pid" 2>/dev/null || true
    fi
  done
done
python3 - "$CRM_PORT" <<'PY'
import subprocess, sys, time
url = f"http://127.0.0.1:{sys.argv[1]}/healthz"
for _ in range(30):
r = subprocess.run(["curl", "-fsS", url], text=True, capture_output=True)
# 3. Force-release the port when fuser is available (best effort).
if command -v fuser >/dev/null 2>&1; then
  fuser -k "${CRM_PORT}/tcp" 2>/dev/null || true
fi

# Drop the old log only; the DB file is deliberately left untouched.
rm -f "$CRM_LOG_FILE"

# Succeeds while ss or netstat still lists a TCP listener on $CRM_PORT.
crm_port_listening() {
  ss -tlnp 2>/dev/null | grep -q ":$CRM_PORT " \
    || netstat -tlnp 2>/dev/null | grep -q ":$CRM_PORT "
}

# Poll up to 10 times, one second apart, until the listener is gone.
for i in {1..10}; do
  crm_port_listening || break
  echo "Waiting for port $CRM_PORT to be released... (attempt $i/10)"
  sleep 1
done

# Refuse to start a second instance on a port that is still taken.
if crm_port_listening; then
  echo "ERROR: Port $CRM_PORT is still in use after cleanup. Cannot start new CRM." >&2
  ss -tlnp 2>/dev/null | grep ":$CRM_PORT " || netstat -tlnp 2>/dev/null | grep ":$CRM_PORT "
  exit 1
fi

echo "Port $CRM_PORT is free. Starting new CRM..."
# Start the new CRM: prefer the systemd unit when it is installed, otherwise
# fall back to a detached nohup process.
if command -v systemctl >/dev/null 2>&1 && [[ -f /etc/systemd/system/sub2api-crm.service ]]; then
  systemctl restart sub2api-crm || exit 1
  # The verification steps that follow read $CRM_PID_FILE. The nohup branch
  # writes it, so record the unit's main PID here too; otherwise those steps
  # fail on a missing file (unless the unit writes it itself - not visible here).
  CRM_MAIN_PID="$(systemctl show --property=MainPID sub2api-crm 2>/dev/null || true)"
  CRM_MAIN_PID="${CRM_MAIN_PID#MainPID=}"
  if [[ "$CRM_MAIN_PID" =~ ^[1-9][0-9]*$ ]]; then
    echo "$CRM_MAIN_PID" > "$CRM_PID_FILE"
  else
    echo "WARNING: could not determine MainPID of sub2api-crm; $CRM_PID_FILE not written" >&2
  fi
else
  # The inner shell exports everything from the env file, then replaces itself
  # with the binary so that $! is the PID of the CRM process.
  nohup bash -lc 'set -a; source "$CRM_ENV_FILE"; set +a; exec "$CRM_BINARY"' >"$CRM_LOG_FILE" 2>&1 &
  echo $! > "$CRM_PID_FILE"
fi
# Post-start verification, run as an inline Python script: argv[1] is the CRM
# port, argv[2] the PID file path.
python3 - "$CRM_PORT" "$CRM_PID_FILE" <<'PY'
import subprocess, sys, time, os
port = sys.argv[1]
pid_file = sys.argv[2]
# 1. Wait for healthz: up to 30 curl attempts, one second apart.
healthz_url = f"http://127.0.0.1:{port}/healthz"
for i in range(30):
r = subprocess.run(["curl", "-fsS", healthz_url], text=True, capture_output=True)
if r.returncode == 0 and r.stdout.strip() == "ok":
# NOTE(review): exiting here would skip checks 2 and 3 below; this line looks
# like the pre-change success path - confirm it was removed.
raise SystemExit(0)
print(f"Health check passed on attempt {i+1}")
break
time.sleep(1)
# NOTE(review): `url` is not assigned anywhere between this script's first line
# and here (only healthz_url is); this also looks like a pre-change leftover.
raise SystemExit(f"crm healthz did not become ready on {url}")
# Presumably the else of the for loop (indentation is not preserved in this
# view): taken only when no attempt hit the break.
else:
raise SystemExit(f"crm healthz did not become ready on {healthz_url}")
# 2. Verify the running binary is not in the (deleted) state.
# NOTE(review): open() raises if pid_file does not exist, so whichever start
# path ran must have written it before this script is invoked.
with open(pid_file) as f:
pid = f.read().strip()
exe_link = f"/proc/{pid}/exe"
# When the link is missing this check is skipped without any message.
if os.path.islink(exe_link):
target = os.readlink(exe_link)
if "deleted" in target:
raise SystemExit(f"ERROR: Binary shows (deleted): {target}")
print(f"Binary OK: {target}")
# 3. Verify the portal session route (the new version is expected to have it).
session_url = f"http://127.0.0.1:{port}/api/portal/session/state"
r = subprocess.run(["curl", "-fsS", session_url], text=True, capture_output=True)
if r.returncode == 0:
print(f"Portal session route OK: {r.stdout.strip()}")
# Exit code 22 with "404" in curl's stderr is treated as an old build;
# any other failure only produces a warning.
elif r.returncode == 22 and "404" in r.stderr:
raise SystemExit(f"ERROR: Portal session route returns 404 - may be running old version")
else:
print(f"Warning: Portal session route check failed: {r.stderr}")
raise SystemExit(0)
PY
# Deployment verification summary, followed by key=value lines describing the
# running instance.
echo "=== Deployment Verification ==="
NEW_PID=$(cat "$CRM_PID_FILE")
echo "New CRM PID: $NEW_PID"
# Resolve the exe symlink directly instead of grepping ls output: the kernel
# appends " (deleted)" to the target when the file on disk was removed or
# replaced. A real if/else keeps the warning from firing for any other reason.
if CRM_EXE_TARGET="$(readlink "/proc/$NEW_PID/exe" 2>/dev/null)" \
  && [[ -n "$CRM_EXE_TARGET" && "$CRM_EXE_TARGET" != *"(deleted)"* ]]; then
  echo "/proc/$NEW_PID/exe -> $CRM_EXE_TARGET"
  echo "Binary state: OK (not deleted)"
else
  echo "WARNING: Binary may be deleted"
fi
printf "crm_base=http://127.0.0.1:%s\n" "$CRM_PORT"
printf "crm_pid_file=%s\n" "$CRM_PID_FILE"
printf "crm_log=%s\n" "$CRM_LOG_FILE"
@@ -156,9 +256,11 @@ BOOTSTRAP_EOF
main() {
# Preflight checks. require_cmd, die and remote43_require_file are helpers
# defined outside this view (presumably in the sourced deploy library -
# confirm).
# NOTE(review): the require_cmd call and both remote43_require_file calls
# appear twice; this looks like removed and added diff lines shown together.
require_cmd bash curl git python3 ssh scp
remote43_require_file "$KEY" "ssh key"
remote43_require_file "$CRM_BINARY" "crm server binary"
require_cmd bash curl git python3 ssh scp
# KEY and REMOTE must be non-empty; the message points the operator at the
# .env.deploy example file.
[[ -n "$KEY" ]] || die "KEY is required; copy scripts/deploy/.env.deploy.example to scripts/deploy/.env.deploy and fill it"
[[ -n "$REMOTE" ]] || die "REMOTE is required; copy scripts/deploy/.env.deploy.example to scripts/deploy/.env.deploy and fill it"
remote43_require_file "$KEY" "ssh key"
remote43_require_file "$CRM_BINARY" "crm server binary"
# Recreate the git bundle of the main branch from scratch.
rm -f "$LOCAL_REPO_BUNDLE"
git -C "$ROOT_DIR" bundle create "$LOCAL_REPO_BUNDLE" main
@@ -187,34 +289,95 @@ main() {
# Keep a local copy of the generated bootstrap script next to the other
# deploy artifacts.
cp "$bootstrap_file" "$LOCAL_DEPLOY_DIR/bootstrap.sh"

# Remote pre-upload cleanup, sent as ONE command string.
# - $(printf '%q' ...) is expanded locally and shell-quotes each path.
# - \$ and \" are escaped so that they are evaluated by the remote shell.
# - The production DB file is never removed; only the PID file, the old log
#   and the old binary are.
# - pgrep -f matches whole command lines. When ssh passes this text as the
#   remote shell's -c argument (ssh_remote is not visible here - confirm), that
#   shell's own command line contains the patterns, so it and its parent are
#   skipped; otherwise the cleanup would kill itself.
# - The port check matches the port followed by a space, so a longer port
#   number sharing the same prefix does not count as busy.
ssh_remote "mkdir -p $(printf '%q' "$REMOTE_ROOT")
# Stop logic: clean up by PID file, then by process name, then by port.
echo 'Stopping any existing CRM processes...'
# 1. Stop by PID file (if present).
if [[ -f $(printf '%q' "$REMOTE_CRM_PID_FILE") ]]; then
  OLDPID=\$(cat $(printf '%q' "$REMOTE_CRM_PID_FILE"))
  if [[ \"\$OLDPID\" =~ ^[0-9]+\$ ]] && kill -0 \"\$OLDPID\" 2>/dev/null; then
    echo \"Stopping PID from pidfile: \$OLDPID\"
    kill \"\$OLDPID\" 2>/dev/null || true
    for i in {1..20}; do
      if ! kill -0 \"\$OLDPID\" 2>/dev/null; then break; fi
      sleep 0.5
    done
    if kill -0 \"\$OLDPID\" 2>/dev/null; then
      kill -9 \"\$OLDPID\" 2>/dev/null || true
      sleep 1
    fi
  fi
  rm -f $(printf '%q' "$REMOTE_CRM_PID_FILE")
fi
# 2. Stop leftover CRM processes by name, skipping this shell and its parent.
for pattern in 'sub2api.*crm' 'sub2api.*relay-manager'; do
  for pid in \$(pgrep -f \"\$pattern\" 2>/dev/null); do
    if [[ \"\$pid\" == \"\$\$\" || \"\$pid\" == \"\$PPID\" ]]; then continue; fi
    echo \"Stopping process by pattern (\$pattern): \$pid\"
    kill \"\$pid\" 2>/dev/null || true
    sleep 0.5
    if kill -0 \"\$pid\" 2>/dev/null; then kill -9 \"\$pid\" 2>/dev/null || true; fi
  done
done
# 3. Force-release the port.
fuser -k $(printf '%q' "$CRM_PORT")/tcp 2>/dev/null || true
# 4. Wait for the port to be released.
for i in {1..5}; do
  if ! ss -tlnp 2>/dev/null | grep -q ':${CRM_PORT} ' &&
     ! netstat -tlnp 2>/dev/null | grep -q ':${CRM_PORT} '; then
    break
  fi
  echo \"Waiting for port release... (\$i/5)\"
  sleep 1
done
# Never delete the DB: only the old log and the old binary are removed.
rm -f $(printf '%q' "$REMOTE_CRM_LOG_FILE") $(printf '%q' "$REMOTE_CRM_BINARY")"
# Upload the CRM binary, the repo bundle, the CRM env file and the bootstrap
# script. scp_remote and ssh_remote are helpers defined outside this view.
scp_remote "$CRM_BINARY" "$REMOTE:$REMOTE_CRM_BINARY"
scp_remote "$LOCAL_REPO_BUNDLE" "$REMOTE:$REMOTE_REPO_BUNDLE"
scp_remote "$crm_env_file" "$REMOTE:$REMOTE_CRM_ENV_FILE"
scp_remote "$bootstrap_file" "$REMOTE:$REMOTE_BOOTSTRAP_FILE"
# Run the uploaded bootstrap script remotely; %q shell-quotes its path.
# NOTE(review): the two invocations below differ only in how the printf format
# is quoted; executing both would run the bootstrap twice. Presumably one of
# them is the pre-change line - confirm.
ssh_remote "bash $(printf "%q" "$REMOTE_BOOTSTRAP_FILE")"
ssh_remote "bash $(printf '%q' "$REMOTE_BOOTSTRAP_FILE")"
cat <<EOF
crm-only stack prepared
remote crm base: http://127.0.0.1:${CRM_PORT}
remote crm env file: ${REMOTE_CRM_ENV_FILE}
remote crm log: ${REMOTE_CRM_LOG_FILE}
remote repo root: ${REMOTE_REPO_ROOT}
local operator env file: ${LOCAL_OPERATOR_ENV_FILE}
local tunnel script: ${LOCAL_TUNNEL_SCRIPT}
local deploy dir: ${LOCAL_DEPLOY_DIR}
echo ""
echo "=== Post-Deployment Verification ==="
next:
1. 在另一终端运行: ${LOCAL_TUNNEL_SCRIPT}
2. 当前终端执行: set -a; source ${LOCAL_OPERATOR_ENV_FILE}; set +a
3. 验证: curl -fsS http://127.0.0.1:${CRM_PORT}/healthz
curl -fsS -H "Authorization: Bearer \$crm_admin_token" http://127.0.0.1:${CRM_PORT}/api/packs
EOF
# Post-deployment verification. Every probe is report-only: the script runs
# under set -e, so each remote command whose failure is an expected outcome is
# guarded to let the matching [WARN]/[UNKNOWN] line be printed instead of
# aborting the deploy.

# Give the service a moment to settle before probing it.
sleep 3

# 1. healthz must answer with exactly "ok".
echo -n "1. Health check: "
if ssh_remote "curl -fsS http://127.0.0.1:${CRM_PORT}/healthz 2>/dev/null" | grep -q "^ok$"; then
  echo "[PASS]"
else
  echo "[FAIL]"
fi

# 2. Portal session route: 200 is taken to mean the new build is serving.
echo -n "2. Portal session route: "
SESSION_RESULT=$(ssh_remote "curl -s -o /dev/null -w '%{http_code}' http://127.0.0.1:${CRM_PORT}/api/portal/session/state 2>/dev/null") || true
if [[ "$SESSION_RESULT" == "200" ]]; then
  echo "[PASS] Returned 200 - new version"
elif [[ "$SESSION_RESULT" == "404" ]]; then
  echo "[WARN] Returned 404 - may be old version"
else
  echo "[UNKNOWN] Returned $SESSION_RESULT"
fi

# 3. Binary state: the exe link of the running process must not point at a
# deleted file. readlink prints the link target (with a " (deleted)" suffix
# when the file is gone); a plain ls of the link would only echo the path back.
echo -n "3. Binary state check: "
PID_VAL=$(ssh_remote "cat $(printf '%q' "$REMOTE_CRM_PID_FILE") 2>/dev/null") || PID_VAL=""
# Only a purely numeric PID is interpolated into the next remote command.
if [[ "$PID_VAL" =~ ^[0-9]+$ ]]; then
  BINARY_LINK=$(ssh_remote "readlink /proc/${PID_VAL}/exe 2>/dev/null") || BINARY_LINK=""
  if [[ -z "$BINARY_LINK" ]]; then
    echo "[WARN] Cannot check binary state"
  elif [[ "$BINARY_LINK" == *"(deleted)"* ]]; then
    echo "[FAIL] Binary shows deleted"
  else
    echo "[OK] Binary not deleted"
  fi
else
  echo "[WARN] Cannot check binary state"
fi
echo ""
}
main "$@"