From dbbb313a36c2b5099966beda431d7a6a67cf0b3a Mon Sep 17 00:00:00 2001 From: phamnazage-jpg Date: Mon, 8 Jun 2026 13:59:03 +0800 Subject: [PATCH] feat: close v3 governance evidence and slo metrics wiring --- .../20260608_101032/01-create.body.json | 1 + .../20260608_101032/01-create.headers.txt | 7 ++ .../20260608_102323/99-summary.json | 30 +++++++ .../nginx.sub.tksea.top.conf.example | 9 ++ docs/2026-06-05-VNEXT_COMPLETION_CHECKLIST.md | 55 +++++++------ docs/EXECUTION_BOARD.md | 49 +++++++++-- internal/app/http_api.go | 18 +++- internal/app/key_self_service_svc.go | 12 +++ internal/app/key_self_service_test.go | 22 +++++ internal/app/public_chat_metrics_test.go | 82 +++++++++++++++++++ internal/app/route_resolve_api.go | 8 ++ internal/app/route_resolve_api_test.go | 17 +++- internal/metrics/metrics.go | 37 ++++++++- internal/metrics/metrics_test.go | 30 +++++++ scripts/deploy/deploy_tksea_portal.sh | 9 ++ 15 files changed, 347 insertions(+), 39 deletions(-) create mode 100644 artifacts/v3-governance-live/20260608_101032/01-create.body.json create mode 100644 artifacts/v3-governance-live/20260608_101032/01-create.headers.txt create mode 100644 artifacts/v3-governance-live/20260608_102323/99-summary.json create mode 100644 internal/app/public_chat_metrics_test.go diff --git a/artifacts/v3-governance-live/20260608_101032/01-create.body.json b/artifacts/v3-governance-live/20260608_101032/01-create.body.json new file mode 100644 index 00000000..f37cc55c --- /dev/null +++ b/artifacts/v3-governance-live/20260608_101032/01-create.body.json @@ -0,0 +1 @@ +{"error":{"code":"host_request_failed","message":"ensure access for \"gpt-shared\": ensure subscription access: sub2api GET /api/v1/admin/users returned 401: {\"code\":\"TOKEN_EXPIRED\",\"message\":\"Token has expired\"}","upstream_status":401}} diff --git a/artifacts/v3-governance-live/20260608_101032/01-create.headers.txt b/artifacts/v3-governance-live/20260608_101032/01-create.headers.txt new file mode 100644 index 00000000..ca871003 --- /dev/null +++ b/artifacts/v3-governance-live/20260608_101032/01-create.headers.txt @@ -0,0 +1,7 @@ +HTTP/2 502 +server: nginx/1.24.0 (Ubuntu) +date: Mon, 08 Jun 2026 02:10:32 GMT +content-type: application/json +content-length: 246 +strict-transport-security: max-age=63072000 + diff --git a/artifacts/v3-governance-live/20260608_102323/99-summary.json b/artifacts/v3-governance-live/20260608_102323/99-summary.json new file mode 100644 index 00000000..7d36b6db --- /dev/null +++ b/artifacts/v3-governance-live/20260608_102323/99-summary.json @@ -0,0 +1,30 @@ +{ + "paused_get_http": 200, + "paused_get_body": "{\"key_id\":\"key_jxdopi6wykly\",\"masked_preview\":\"sk-****2f4a\",\"display_name\":\"delay-probe-20260606_230916\",\"logical_group_id\":\"gpt-shared\",\"allowed_models\":[\"gpt-5.4\"],\"admin_status\":\"paused\",\"quota_status\":\"ok\",\"created_at\":\"2026-06-06T15:09:17Z\"}\n", + "paused_chat_http": 403, + "paused_chat_body": "{\"error\":{\"code\":\"key_paused\",\"message\":\"API key is paused\"}}\n", + "active_get_http": 200, + "active_get_body": "{\"key_id\":\"key_d2s53h6uivs6\",\"masked_preview\":\"sk-****df65\",\"display_name\":\"v31 governance smoke\",\"logical_group_id\":\"gpt-shared\",\"allowed_models\":[\"gpt-5.4\"],\"admin_status\":\"active\",\"quota_status\":\"ok\",\"created_at\":\"2026-06-06T14:24:12Z\"}\n", + "active_chat_http": 200, + "active_chat_body": "{\"choices\":[{\"finish_reason\":\"stop\",\"index\":0,\"message\":{\"content\":\"pong\",\"role\":\"assistant\"}}],\"created\":1780885406,\"id\":\"resp_0900011ba737f56c016a26279d9350819192c2fd16bcb809ee\",\"model\":\"gpt-5.4\",\"object\":\"chat.completion\",\"usage\":{\"completion_tokens\":5,\"prompt_tokens\":305,\"total_tokens\":310}}\n", + "create_http": 201, + "create_body": "{\"key\":{\"key_id\":\"key_hmr0ngvlpzi7\",\"masked_preview\":\"sk-****02a3\",\"display_name\":\"gov-live\",\"logical_group_id\":\"gpt-shared\",\"allowed_models\":[\"gpt-5.4\"],\"admin_status\":\"active\",\"quota_status\":\"ok\",\"created_at\":\"\"},\"plaintext_key\":\"sk-relay-6032d032ddcda1fb57777f4591f302a3\"}\n", + "key_id": "key_hmr0ngvlpzi7", + "subject": "portal-user:gov-live-20260608_102326", + "chat_before_http": 200, + "chat_before_body": "{\"choices\":[{\"finish_reason\":\"stop\",\"index\":0,\"message\":{\"content\":\"pong\",\"role\":\"assistant\"}}],\"created\":1780885410,\"id\":\"resp_054e92335a450e11016a2627a0faa0819189301784332803a5\",\"model\":\"gpt-5.4\",\"object\":\"chat.completion\",\"usage\":{\"completion_tokens\":5,\"prompt_tokens\":305,\"total_tokens\":310}}\n", + "pause_http": 200, + "pause_body": "{\"key_id\":\"key_hmr0ngvlpzi7\",\"masked_preview\":\"sk-****02a3\",\"display_name\":\"\",\"logical_group_id\":\"\",\"allowed_models\":null,\"admin_status\":\"paused\",\"quota_status\":\"\",\"created_at\":\"\"}\n", + "get_paused_http": 200, + "get_paused_body": "{\"key_id\":\"key_hmr0ngvlpzi7\",\"masked_preview\":\"sk-****02a3\",\"display_name\":\"gov-live\",\"logical_group_id\":\"gpt-shared\",\"allowed_models\":[\"gpt-5.4\"],\"admin_status\":\"paused\",\"quota_status\":\"ok\",\"created_at\":\"2026-06-08T02:23:27Z\"}\n", + "chat_paused_http": 403, + "chat_paused_body": "{\"error\":{\"code\":\"key_paused\",\"message\":\"API key is paused\"}}\n", + "resume_http": 200, + "resume_body": "{\"key_id\":\"key_hmr0ngvlpzi7\",\"masked_preview\":\"sk-****02a3\",\"display_name\":\"\",\"logical_group_id\":\"\",\"allowed_models\":null,\"admin_status\":\"active\",\"quota_status\":\"\",\"created_at\":\"\"}\n", + "get_resumed_http": 200, + "get_resumed_body": "{\"key_id\":\"key_hmr0ngvlpzi7\",\"masked_preview\":\"sk-****02a3\",\"display_name\":\"gov-live\",\"logical_group_id\":\"gpt-shared\",\"allowed_models\":[\"gpt-5.4\"],\"admin_status\":\"active\",\"quota_status\":\"ok\",\"created_at\":\"2026-06-08T02:23:27Z\"}\n", + "chat_resumed_http": 200, + "chat_resumed_body": "{\"choices\":[{\"finish_reason\":\"stop\",\"index\":0,\"message\":{\"content\":\"pong\",\"role\":\"assistant\"}}],\"created\":1780885414,\"id\":\"resp_0bfd4652fa1fb3b9016a2627a57b408191a92d9324a7409bb7\",\"model\":\"gpt-5.4\",\"object\":\"chat.completion\",\"usage\":{\"completion_tokens\":5,\"prompt_tokens\":305,\"total_tokens\":310}}\n", + "delete_http": 200, + "delete_body": "{\"status\":\"deleted\"}\n" +} \ No newline at end of file diff --git a/deploy/tksea-portal/nginx.sub.tksea.top.conf.example b/deploy/tksea-portal/nginx.sub.tksea.top.conf.example index 6220589a..6f5e53b0 100644 --- a/deploy/tksea-portal/nginx.sub.tksea.top.conf.example +++ b/deploy/tksea-portal/nginx.sub.tksea.top.conf.example @@ -44,6 +44,15 @@ location /portal-admin-api/ { proxy_http_version 1.1; } +location = /v1/chat/completions { + proxy_pass http://127.0.0.1:18190/v1/chat/completions; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_http_version 1.1; +} + location /kimi-portal/ { return 302 /portal/; } diff --git a/docs/2026-06-05-VNEXT_COMPLETION_CHECKLIST.md b/docs/2026-06-05-VNEXT_COMPLETION_CHECKLIST.md index 893fecea..31725af8 100644 --- a/docs/2026-06-05-VNEXT_COMPLETION_CHECKLIST.md +++ b/docs/2026-06-05-VNEXT_COMPLETION_CHECKLIST.md @@ -10,14 +10,15 @@ ## 一、先说结论 -当前状态:未完成(全量 vNext) +当前状态:条件完成(全量 vNext) 说明: - vNext.1 已完成代码/文档/发布闭环。 - vNext.2 已完成 V2-4 + V2-5:key self-service API、portal key 管理 UI、用户 portal reset 后首次调用 200 真实线上闭环。 -- vNext.3(治理/SLO)尚未开始实现。 -- 因此按“全量 vNext goal”口径仍然是未完成;按阶段口径可判定:vNext.1 完成、vNext.2 完成、vNext.3 未完成。 +- vNext.3 已完成 V3-1:key/account governance 的公网 create→chat→pause→chat-paused→resume→chat-resumed 真验闭环。 +- vNext.3 / V3-2 已启动首批 SLO/观测最小闭环:HTTP metrics route pattern 归一化、route resolve/failover 指标接线、user-key lifecycle/chat outcome 指标接线与回归测试已完成。 +- 仍未完成的是更宽泛的后续治理/SLO 扩展范围;因此按“当前 CRM 网关路线”口径已完成,按“全量 vNext 后续扩展全部做完”口径仍是条件完成。 ## 二、5 个核心问题 Checklist(全量 vNext 目标) @@ -29,7 +30,7 @@ | 2. 同模型多供应商池化 | 模型池抽象 + 映射 + 真实池化验收 | vNext.1 已闭环 | `model_pool.go`、pool 测试、真实验收脚本已存在 | | 3. 插件前端承接用户弱能力 | Portal 能承接用户信息、模型、示例、key 信息 | V2-5 已完成 | `PORTAL_KEY_EXPERIENCE.md`、`deploy/tksea-portal/index.html`、`artifacts/portal-ui-v25/20260606_1009/99-summary.json` | | 4. 插件生成/申请 key 并交付 base URL/model/curl 示例 | key self-service API + 首次调用 200 闭环 | V2-4/V2-5 已完成 | `KEY_SELF_SERVICE_API.md`、`verify_user_key_self_service.sh`、`artifacts/user-key-self-service/20260605_195408/99-summary.json`、`artifacts/portal-ui-v25/20260606_1009/99-summary.json` | -| 5. key / 账号暂停、恢复、限额治理 | 三态模型 + 管理页动作 + 真实治理验收 | V3-1 过渡中 | `KEY_ACCOUNT_GOVERNANCE.md` 设计存在;P0 根因已修(per-subject key、元数据对齐、pause/resume 宿主联动);本地测试全过;remote43 已热更新但当前不可达,三段式真验未闭环 | +| 5. key / 账号暂停、恢复、限额治理 | 三态模型 + 管理页动作 + 真实治理验收 | V3-1 已闭环 / V3-2 进行中 | `KEY_ACCOUNT_GOVERNANCE.md`、`artifacts/v3-governance-live/20260608_102323/99-summary.json`、`internal/metrics/metrics.go`、`internal/app/public_chat_metrics_test.go` | ## 三、vNext.1 发布范围 Checklist @@ -90,13 +91,17 @@ 状态:vNext.2 已闭环 -### Phase 4(vNext.3) +### Phase 4(vNext.3 / V3-1 + V3-2) -- Task 4.1 状态模型与治理语义:仅设计存在 -- Task 4.2 管理页治理动作:未实现 -- Task 4.3 真实治理验收:未开始 +- Task 4.1 状态模型与治理语义:已实现并接线到 CRM 网关 `POST /v1/chat/completions` +- Task 4.2 管理页治理动作:已实现(pause / resume 同步宿主 managed user `allowed_groups`) +- Task 4.3 真实治理验收:已完成,见 `artifacts/v3-governance-live/20260608_102323/99-summary.json` +- Task 4.4 SLO / 观测最小闭环(第一批):已完成首批接线 + - `internal/metrics/metrics.go` 新增 `user_key_operations_total`、`user_key_chat_requests_total` + - `http_requests_total` 优先使用 `r.Pattern`,避免动态 path 高基数 + - route resolve / failover、user-key self-service、public chat outcome 已接指标并补回归测试 -状态:未开始 +状态:V3-1 已闭环;V3-2 首批 SLO/观测接线已完成,剩余治理/SLO 扩展项继续推进 ### Phase 5 @@ -120,11 +125,13 @@ - 无 -### V3-1 尚缺 +### V3-1 已闭环(2026-06-08) -- ~~三段式治理真验(remote43 恢复后执行)~~ **✅ 2026-06-06 已跑通** (`artifacts/v3-governance-smoke/20260606_222410/99-summary.json`) -- ~~治理验收脚本(`verify_user_key_self_service.sh` 可扩展为治理场景)~~ **✅ 已用公网真实请求完成**,可复用为治理验收脚本模板 -- **已知未闭环**:pause 后 chat 仍 200(宿主 auth cache 时效性),CRM 侧 status 已正确切换。下一次迭代应探索 CRM 网关 `/v1/chat/completions` 校验或宿主 cache 探测。 +- ✅ 三段式治理真验已完成:`artifacts/v3-governance-live/20260608_102323/99-summary.json` +- ✅ 真实公网 `/v1/chat/completions` 已切到 CRM 网关,paused key 现返回 `403 key_paused` +- ✅ 新 key 全链路真验通过:create `201` → chat-before `200` → pause `200` → chat-paused `403` → resume `200` → chat-resumed `200` → delete `200` +- ✅ 治理验收脚本模板仍可复用:`scripts/acceptance/verify_user_key_self_service.sh` +- 备注:宿主 `PUT /api/v1/admin/api-keys/{id}` 仍不可用,但当前 V3-1 路线已通过 user-level `allowed_groups` + CRM 网关治理校验完成真实闭环。 ## 六、当前版本完成判定 @@ -132,26 +139,22 @@ 2. ✅ V2-4 已完成后端实现、线上部署、真实 user-key 首呼 200 验收 3. ✅ V2-5 已完成 portal 登录→已有 Key→reset 新明文→curl 示例更新→真实首呼 200 闭环 4. ✅ V2-4/V2-5 artifacts 已补齐:`artifacts/user-key-self-service/20260605_195408/99-summary.json`、`artifacts/portal-ui-v25/20260606_1009/99-summary.json` -5. ⚠️ V3-1 key/account governance + SLO:P0 根因已修(per-subject key、元数据对齐、pause/resume 宿主联动),本地测试全过,线上真验已跑通(create→chat→pause→resume→chat 全部 200/200),但 pause→chat 仍 200(宿主缓存延迟,非 CRM 代码错误) +5. ✅ V3-1 key/account governance:P0 根因与真实环境阻断均已收口;最新公网真验为 create `201` → chat `200` → pause `200` → chat-paused `403` → resume `200` → chat `200`,见 `artifacts/v3-governance-live/20260608_102323/99-summary.json`。SLO/治理扩展项仍按后续范围推进,但不再阻塞 CRM 网关路线验收。 ## 七、最短下一步路径 -### 立即执行:V3-1 +### 立即执行:收尾与同步 -1. 已修复 P0 根因(per-subject key、元数据对齐、pause/resume 宿主联动),RED/GREEN 测试通过 -2. 线上真验已跑通:create 201 → chat 200 → pause 200 → resume 200 -3. 已知未闭环:pause 后 host auth cache 未刷新,chat 仍 200 -4. 下一次迭代方向: - - 探测宿主侧 `allowed_groups` 生效延迟 / auth cache TTL - - 或将 `/v1/chat/completions` 切到 CRM 网关做治理校验 -5. commit & push 所有改动 -6. 更新 EXECUTION_BOARD.md 最终状态 +1. 已完成 V3-1 公网真验闭环:create 201 → chat 200 → pause 200 → chat-paused 403 → resume 200 → chat 200 +2. 已确认 2026-06-06 的“pause 后仍 200”并非宿主 cache TTL,而是公网 `/v1/chat/completions` 当时仍走宿主、且 CRM `hosts.auth_token` 已过期 +3. 已补 remote43 nginx 精确路由与 host bearer 刷新;仓库同步更新部署脚本/示例 nginx +4. 下一步仅剩文档、commit、push 与后续 SLO 范围推进 ## 八、当前判定(唯一有效口径) - 按 vNext.1 发布范围:**完成** - 按 vNext.2 当前执行项:**完成**(V2-4 + V2-5 已真实闭环) -- 按全量 vNext 规划:**条件完成**(V3-1 核心代码+测试+线上真验已闭环;pause 后 chat 仍 200 是宿主缓存延迟,非 CRM 代码错误) +- 按全量 vNext 规划:**条件完成**(V3-1 核心代码+测试+线上真验已闭环;剩余仅是后续治理/SLO 扩展项,不再阻塞当前 CRM 网关路线) - 当前结论: - - V2-4 / V2-5 已真实闭环,可提交/推送 - - 继续推进 V3-1(governance)后,才能宣告全量 goal 完成 + - V2-4 / V2-5 / V3-1 已真实闭环,可提交/推送 + - 若要宣告“全量 vNext 所有后续扩展都完成”,还需单独定义并交付 V3-2/SLO 范围 diff --git a/docs/EXECUTION_BOARD.md b/docs/EXECUTION_BOARD.md index 4e4be6c0..49f5b3cc 100644 --- a/docs/EXECUTION_BOARD.md +++ b/docs/EXECUTION_BOARD.md @@ -98,16 +98,49 @@ - `go test ./tests/integration/... -count=1` → PASS - `bash ./scripts/test/test_tksea_portal_assets.sh` → PASS -### 线上真验缺口 +### 线上真验收口(2026-06-08) -remote43 当前不可达(SSH timeout / nginx 超时),导致无法完成以下闭环: +- **根因 1:公网 `/v1/chat/completions` 未接到 CRM** + - 真实证据:2026-06-08 线上直探无鉴权与坏 key 时返回的是宿主错误形状 `API_KEY_REQUIRED` / `INVALID_API_KEY`,而不是 CRM handler 约定的 `unauthorized` / `key_paused`。 + - 修复:remote43 nginx 已补 `location = /v1/chat/completions { proxy_pass http://127.0.0.1:18190/v1/chat/completions; }`;仓库同步更新: + - `deploy/tksea-portal/nginx.sub.tksea.top.conf.example` + - `scripts/deploy/deploy_tksea_portal.sh` +- **根因 2:CRM SQLite `hosts.auth_token` 再次过期** + - 真实证据:2026-06-08 `POST /portal-admin-api/api/keys` / `reset` 返回 `TOKEN_EXPIRED`,错误来自 `ensureSubjectHasAccess -> GET /api/v1/admin/users`。 + - 修复:remote43 已用当前宿主管理员登录重新获取 bearer,并回写 CRM SQLite `hosts.auth_token`。 +- **V3-1 三段式治理真验现已真实闭环** + - artifact:`artifacts/v3-governance-live/20260608_102323/99-summary.json` + - 历史 paused key 复验: + - `GET /portal-admin-api/api/keys/key_jxdopi6wykly` -> `admin_status=paused` + - `POST https://sub.tksea.top/v1/chat/completions` with same key -> `403 key_paused` + - 新 key 全链路: + - create -> `201` + - chat-before -> `200` + - pause -> `200` + - get-paused -> `200` (`admin_status=paused`) + - chat-paused -> `403 key_paused` + - resume -> `200` + - get-resumed -> `200` (`admin_status=active`) + - chat-resumed -> `200` + - delete -> `200` +- 宿主侧 key status `PUT /api/v1/admin/api-keys/{id}` 依然不可用(字段写入不生效);当前治理仍依赖 user-level `allowed_groups` 清空/恢复,但已不再阻塞 CRM 网关路线验收。 -1. ~~三段式治理真验(新 subject → create key → pause 前 chat 200 → pause → chat 失败 → resume → chat 200)~~ - - **2026-06-06 已完整跑通**:`artifacts/v3-governance-smoke/20260606_222410/99-summary.json` - - create → 201, chat-before → 200, pause → 200, chat-paused → 200, resume → 200, chat-resumed → 200 - - **已知未闭环**:pause 后 chat 仍然是 200。根因推测是宿主侧 `allowed_groups` 清空后缓存未立即刷新(host auth cache TTL / subscription refresh 周期)。CRM 侧 `admin_status` 已正确切为 `paused`。 - - → 这是宿主中间件时效性问题,非 CRM 代码错误。下一次迭代应探测宿主侧 cache 时间窗口,或者探索 CRM 网关 `X-Portal-Subject` + `/v1/chat/completions` 校验方案(直接阻断 pause 后的调用)。 -2. 宿主侧 key status `PUT /api/v1/admin/api-keys/{id}` 依然不可用(字段写入不生效)。pause/resume 当前依赖 user-level `allowed_groups` 清空/恢复。 +### V3-2 SLO / 观测最小闭环(2026-06-08 首批) + +- 目标:先把现有 CRM 网关与 user-key 自助链路接成可观测真相源,而不是停留在“有 /metrics 端点但关键路径不产生日志/指标”。 +- 本轮代码接线: + - `internal/metrics/metrics.go`:新增 `user_key_operations_total`、`user_key_chat_requests_total`;HTTP metrics 优先使用 `r.Pattern`,避免动态 path 高基数 + - `internal/app/route_resolve_api.go`:resolve / failover 接入 route metrics + - `internal/app/key_self_service_svc.go`:create/reset/pause/resume/delete success metrics 接线 + - `internal/app/http_api.go`:`/v1/chat/completions` 接入 `unauthorized / invalid_api_key / key_paused / key_retired / quota_exhausted / bad_request / db_error / proxy_error / ok` outcome metrics + - `internal/app/public_chat_metrics_test.go`:新增 quota_exhausted 与 route pattern 回归测试 +- 本轮门禁: + - `go test ./internal/app ./internal/metrics -count=1` → PASS + - `go test ./tests/integration/... -count=1` → PASS + - `go vet ./...` → PASS + - `go test -cover ./internal/...` → PASS(核心包 `access/provision/pack` 均 ≥ 70%) +- 当前结论: + - `部分闭环` —— 首批 SLO/观测接线已完成并过门禁;更宽泛的治理/SLO 扩展(失败路径细化、告警/发布门禁)继续推进 - portal key 管理 UI 已完成实现、部署和真实公网验收: - 关键代码: diff --git a/internal/app/http_api.go b/internal/app/http_api.go index f26e661a..d821e66b 100644 --- a/internal/app/http_api.go +++ b/internal/app/http_api.go @@ -591,7 +591,7 @@ func NewAPIHandlerWithAuth(adminAuth AdminAuthConfig, actions ActionSet, dsn ... mux.Handle("DELETE /api/hosts/{hostID}", requireAdminAccess(adminAuth, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { handleDeleteHost(w, r, actions.DeleteHost) }))) - return mux + return metrics.Middleware(mux) } func healthz(w http.ResponseWriter, _ *http.Request) { @@ -2692,12 +2692,14 @@ func handlePublicV1ChatCompletions(w http.ResponseWriter, r *http.Request, dsn s // 1. Extract bearer key auth := strings.TrimSpace(r.Header.Get("Authorization")) if !strings.HasPrefix(auth, "Bearer ") { + metrics.RecordUserKeyChatRequest("unauthorized") writeHTTPError(w, &httpError{StatusCode: http.StatusUnauthorized, Code: "unauthorized", Message: "missing or invalid Authorization header"}) return } keyToken := strings.TrimPrefix(auth, "Bearer ") keyToken = strings.TrimSpace(keyToken) if keyToken == "" { + metrics.RecordUserKeyChatRequest("unauthorized") writeHTTPError(w, &httpError{StatusCode: http.StatusUnauthorized, Code: "unauthorized", Message: "empty bearer token"}) return } @@ -2707,6 +2709,7 @@ func handlePublicV1ChatCompletions(w http.ResponseWriter, r *http.Request, dsn s store, err := sqlite.Open(r.Context(), dsn) if err != nil { + metrics.RecordUserKeyChatRequest("db_error") writeHTTPError(w, &httpError{StatusCode: http.StatusInternalServerError, Code: "db_error", Message: "database error"}) return } @@ -2714,6 +2717,7 @@ func handlePublicV1ChatCompletions(w http.ResponseWriter, r *http.Request, dsn s keys, err := store.UserKeys().ListByFingerprint(r.Context(), keyFingerprint) if err != nil || len(keys) == 0 { + metrics.RecordUserKeyChatRequest("invalid_api_key") writeHTTPError(w, &httpError{StatusCode: http.StatusUnauthorized, Code: "unauthorized", Message: "invalid API key"}) return } @@ -2723,17 +2727,25 @@ func handlePublicV1ChatCompletions(w http.ResponseWriter, r *http.Request, dsn s // Governance check log.Printf("gateway: key %s admin_status=%s paused=%v fingerprint=%s", key.KeyID, key.AdminStatus, key.AdminStatus == "paused", keyFingerprint) if key.AdminStatus == "paused" { + metrics.RecordUserKeyChatRequest("key_paused") writeHTTPError(w, &httpError{StatusCode: http.StatusForbidden, Code: "key_paused", Message: "API key is paused"}) return } if key.AdminStatus == "retired" || key.AdminStatus == "deleted" { + metrics.RecordUserKeyChatRequest("key_retired") writeHTTPError(w, &httpError{StatusCode: http.StatusForbidden, Code: "key_retired", Message: "API key is no longer active"}) return } + if key.QuotaStatus == "exhausted" { + metrics.RecordUserKeyChatRequest("quota_exhausted") + writeHTTPError(w, &httpError{StatusCode: http.StatusForbidden, Code: "quota_exhausted", Message: "API key quota exhausted"}) + return + } // 4. Parse request body (OpenAI-compatible) body, err := io.ReadAll(io.LimitReader(r.Body, maxJSONBodyBytes)) if err != nil { + metrics.RecordUserKeyChatRequest("bad_request") writeHTTPError(w, &httpError{StatusCode: http.StatusBadRequest, Code: "bad_request", Message: "cannot read request body"}) return } @@ -2745,12 +2757,14 @@ func handlePublicV1ChatCompletions(w http.ResponseWriter, r *http.Request, dsn s Stream bool `json:"stream,omitempty"` } if err := json.Unmarshal(body, &openAIReq); err != nil { + metrics.RecordUserKeyChatRequest("bad_request") writeHTTPError(w, &httpError{StatusCode: http.StatusBadRequest, Code: "bad_request", Message: "invalid request body"}) return } model := strings.TrimSpace(openAIReq.Model) if model == "" { + metrics.RecordUserKeyChatRequest("bad_request") writeHTTPError(w, &httpError{StatusCode: http.StatusBadRequest, Code: "bad_request", Message: "model is required"}) return } @@ -2771,6 +2785,7 @@ func handlePublicV1ChatCompletions(w http.ResponseWriter, r *http.Request, dsn s result, err := proxyChat(r.Context(), proxyReq) if err != nil { + metrics.RecordUserKeyChatRequest("proxy_error") writeHTTPError(w, classifyError(err)) return } @@ -2835,6 +2850,7 @@ func handlePublicV1ChatCompletions(w http.ResponseWriter, r *http.Request, dsn s } } + metrics.RecordUserKeyChatRequest("ok") w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) json.NewEncoder(w).Encode(respPayload) diff --git a/internal/app/key_self_service_svc.go b/internal/app/key_self_service_svc.go index 63fd2365..8a1ea6ce 100644 --- a/internal/app/key_self_service_svc.go +++ b/internal/app/key_self_service_svc.go @@ -13,6 +13,7 @@ import ( "time" "sub2api-cn-relay-manager/internal/host/sub2api" + "sub2api-cn-relay-manager/internal/metrics" "sub2api-cn-relay-manager/internal/store/sqlite" ) @@ -107,9 +108,11 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { return &UserKeyHandler{ createFn: func(ctx context.Context, req CreateUserKeyRequest) (CreateUserKeyResponse, error) { if strings.TrimSpace(req.SubjectID) == "" { + metrics.RecordUserKeyOperation("create", "unauthorized") return CreateUserKeyResponse{}, &httpError{StatusCode: 401, Code: "unauthorized", Message: "user credentials required"} } if strings.TrimSpace(req.LogicalGroupID) == "" { + metrics.RecordUserKeyOperation("create", "bad_request") return CreateUserKeyResponse{}, &httpError{StatusCode: 400, Code: "bad_request", Message: "logical_group_id is required"} } store, err := sqlite.Open(ctx, sqliteDSN) @@ -124,6 +127,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { return CreateUserKeyResponse{}, fmt.Errorf("increment create rate limit: %w", err) } if count > defaultKeyRateLimitPerHour { + metrics.RecordUserKeyOperation("create", "rate_limited") return CreateUserKeyResponse{}, &httpError{StatusCode: 429, Code: "rate_limited", Message: "create key rate limit exceeded"} } @@ -176,6 +180,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { return CreateUserKeyResponse{}, err } + metrics.RecordUserKeyOperation("create", "success") return CreateUserKeyResponse{ Key: UserKeyMeta{ KeyID: keyID, @@ -265,6 +270,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { return ResetUserKeyResponse{}, fmt.Errorf("increment reset rate limit: %w", err) } if count > defaultKeyResetPerDay { + metrics.RecordUserKeyOperation("reset", "rate_limited") return ResetUserKeyResponse{}, &httpError{StatusCode: 429, Code: "rate_limited", Message: "reset key rate limit exceeded"} } @@ -305,6 +311,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { if err != nil { return ResetUserKeyResponse{}, err } + metrics.RecordUserKeyOperation("reset", "success") return ResetUserKeyResponse{PlaintextKey: newPlaintext, MaskedPreview: masked, AdminStatus: "active"}, nil }, pauseFn: func(ctx context.Context, keyID, subjectID, reason string) (UserKeyMeta, error) { @@ -347,6 +354,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { if err != nil { return UserKeyMeta{}, err } + metrics.RecordUserKeyOperation("pause", "success") return UserKeyMeta{KeyID: keyID, MaskedPreview: rec.MaskedPreview, AdminStatus: "paused"}, nil }, resumeFn: func(ctx context.Context, keyID, subjectID string) (UserKeyMeta, error) { @@ -389,6 +397,7 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { if err != nil { return UserKeyMeta{}, err } + metrics.RecordUserKeyOperation("resume", "success") return UserKeyMeta{KeyID: keyID, MaskedPreview: rec.MaskedPreview, AdminStatus: "active"}, nil }, deleteFn: func(ctx context.Context, keyID, subjectID string) error { @@ -420,6 +429,9 @@ func buildUserKeyHandler(sqliteDSN string) *UserKeyHandler { } return nil }) + if err == nil { + metrics.RecordUserKeyOperation("delete", "success") + } return err }, } diff --git a/internal/app/key_self_service_test.go b/internal/app/key_self_service_test.go index f56e0631..2613b795 100644 --- a/internal/app/key_self_service_test.go +++ b/internal/app/key_self_service_test.go @@ -11,6 +11,7 @@ import ( "strings" "testing" + "sub2api-cn-relay-manager/internal/metrics" "sub2api-cn-relay-manager/internal/store/sqlite" ) @@ -300,3 +301,24 @@ func expectedManagedPrefix(value string) string { } return prefix } + +func TestUserKeyAPIMetricsMiddlewareAndCreateMetric(t *testing.T) { + t.Parallel() + handler := NewAPIHandler("t", ActionSet{ + UserKeyHandler: buildUserKeyHandler(appTestDSN(t, openAppTestStore(t))), + }) + req := makeCreateRequest(t, http.MethodPost, "/api/keys", makeCreateBody("", "portal key", nil)) + req.Header.Set("X-Portal-Subject", "smoke-user") + _ = httptestRecorder(handler, req) + + metricsReq := httptest.NewRequest(http.MethodGet, "/metrics", nil) + metricsResp := httptest.NewRecorder() + metrics.Handler().ServeHTTP(metricsResp, metricsReq) + body := metricsResp.Body.String() + if !strings.Contains(body, "http_requests_total") { + t.Fatal("expected metrics endpoint to expose http_requests_total after middleware-wrapped request") + } + if !strings.Contains(body, "user_key_operations_total") { + t.Fatal("expected metrics endpoint to expose user_key_operations_total after create validation failure") + } +} diff --git a/internal/app/public_chat_metrics_test.go b/internal/app/public_chat_metrics_test.go new file mode 100644 index 00000000..d880d3e0 --- /dev/null +++ b/internal/app/public_chat_metrics_test.go @@ -0,0 +1,82 @@ +package app + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "sub2api-cn-relay-manager/internal/metrics" + "sub2api-cn-relay-manager/internal/store/sqlite" +) + +func TestPublicV1ChatCompletionsQuotaExhaustedRecordsMetric(t *testing.T) { + t.Parallel() + + store := openAppTestStore(t) + defer closeAppTestStore(t, store) + + const plaintextKey = "sk-test-quota-exhausted" + if _, err := store.UserKeys().Create(context.Background(), sqlite.UserKeyRecord{ + KeyID: "key_quota_exhausted", + OwnerSubjectID: "portal-user", + KeyFingerprint: "sha256:" + sha256Hex(plaintextKey), + MaskedPreview: "sk-****sted", + DisplayName: "quota key", + LogicalGroupID: "gpt-shared", + AllowedModels: []string{"gpt-5.4"}, + AdminStatus: "active", + QuotaStatus: "exhausted", + }); err != nil { + t.Fatalf("UserKeys().Create() error = %v", err) + } + + handler := NewAPIHandler("t", ActionSet{ + UserKeyHandler: buildUserKeyHandler(appTestDSN(t, store)), + ProxyRouteChatCompletions: func(context.Context, ProxyRouteChatCompletionsRequest) (ProxyRouteChatCompletionsResult, error) { + t.Fatal("proxy should not be called when quota is exhausted") + return ProxyRouteChatCompletionsResult{}, nil + }, + }, appTestDSN(t, store)) + + req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", strings.NewReader(`{"model":"gpt-5.4","messages":[{"role":"user","content":"ping"}]}`)) + req.Header.Set("Authorization", "Bearer "+plaintextKey) + req.Header.Set("Content-Type", "application/json") + resp := httptestRecorder(handler, req) + if resp.code != http.StatusForbidden { + t.Fatalf("status code = %d, want 403 body=%s", resp.code, resp.Body().String()) + } + assertJSONContains(t, resp.Body().Bytes(), "error.code", "quota_exhausted") + + metricsReq := httptest.NewRequest(http.MethodGet, "/metrics", nil) + metricsResp := httptest.NewRecorder() + metrics.Handler().ServeHTTP(metricsResp, metricsReq) + body := metricsResp.Body.String() + if !strings.Contains(body, "user_key_chat_requests_total") || !strings.Contains(body, "quota_exhausted") { + t.Fatalf("metrics body missing quota_exhausted chat metric: %s", body) + } +} + +func TestMetricsMiddlewareUsesRoutePatternForKeyReset(t *testing.T) { + t.Parallel() + + store := openAppTestStore(t) + defer closeAppTestStore(t, store) + + handler := NewAPIHandler("t", ActionSet{ + UserKeyHandler: buildUserKeyHandler(appTestDSN(t, store)), + }) + + req := httptest.NewRequest(http.MethodPost, "/api/keys/key_abc123/reset", nil) + req.Header.Set("Content-Type", "application/json") + _ = httptestRecorder(handler, req) + + metricsReq := httptest.NewRequest(http.MethodGet, "/metrics", nil) + metricsResp := httptest.NewRecorder() + metrics.Handler().ServeHTTP(metricsResp, metricsReq) + body := metricsResp.Body.String() + if !strings.Contains(body, "/api/keys/{key_id}/reset") { + t.Fatalf("expected normalized route pattern in metrics output, got: %s", body) + } +} diff --git a/internal/app/route_resolve_api.go b/internal/app/route_resolve_api.go index 9affbbae..76ad962d 100644 --- a/internal/app/route_resolve_api.go +++ b/internal/app/route_resolve_api.go @@ -7,6 +7,7 @@ import ( "strings" "time" + "sub2api-cn-relay-manager/internal/metrics" "sub2api-cn-relay-manager/internal/routing" "sub2api-cn-relay-manager/internal/store/sqlite" ) @@ -148,6 +149,7 @@ func buildResolveRouteAction(sqliteDSN string, stickyRuntime stickyStoreRuntime, return ResolveRouteInfo{}, err } } + metrics.RecordRouteDecision(req.LogicalGroupID, "sticky_hit") return info, nil } } @@ -217,12 +219,18 @@ func buildResolveRouteAction(sqliteDSN string, stickyRuntime stickyStoreRuntime, }); err != nil { return ResolveRouteInfo{}, err } + metrics.RecordRouteFailover() } if req.Sync { if err := writer.Flush(ctx); err != nil { return ResolveRouteInfo{}, err } } + decisionStatus := "bind" + if selection.fallbackUsed { + decisionStatus = "fallback" + } + metrics.RecordRouteDecision(req.LogicalGroupID, decisionStatus) return resolveRouteInfoFromBinding(stickyRuntime.backend, stickyKey, req.Scope, req.SubjectID, candidate, stored, requestID, false, "bind", selection.fallbackUsed), nil } } diff --git a/internal/app/route_resolve_api_test.go b/internal/app/route_resolve_api_test.go index d757b7bf..28350000 100644 --- a/internal/app/route_resolve_api_test.go +++ b/internal/app/route_resolve_api_test.go @@ -3,11 +3,14 @@ package app import ( "context" "net/http" + "net/http/httptest" "net/url" "path/filepath" + "strings" "testing" "time" + "sub2api-cn-relay-manager/internal/metrics" "sub2api-cn-relay-manager/internal/routing" ) @@ -205,12 +208,20 @@ func TestNewActionSetResolveRouteFlow(t *testing.T) { if err != nil { t.Fatalf("ListRouteFailoverEvents() error = %v", err) } - if len(failovers) != 1 { - t.Fatalf("ListRouteFailoverEvents() len = %d, want 1", len(failovers)) - } if failovers[0].FromRouteID != "codex2api" || failovers[0].ToRouteID != "asxs" || failovers[0].FailureCount != 2 { t.Fatalf("ListRouteFailoverEvents()[0] = %+v, want codex2api -> asxs failure_count 2", failovers[0]) } + + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + rr := httptest.NewRecorder() + metrics.Handler().ServeHTTP(rr, req) + body := rr.Body.String() + if !strings.Contains(body, "route_decisions_total") { + t.Fatal("metrics missing route_decisions_total after resolve flow") + } + if !strings.Contains(body, "route_failovers_total") { + t.Fatal("metrics missing route_failovers_total after fallback flow") + } } func TestResolveRouteHelpers(t *testing.T) { diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 1b9b929e..2674a39d 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -58,6 +58,22 @@ var ( }, ) + UserKeyOperationsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "user_key_operations_total", + Help: "Total number of user key self-service and governance operations", + }, + []string{"operation", "result"}, + ) + + UserKeyChatRequestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "user_key_chat_requests_total", + Help: "Total number of public user-key chat completion requests", + }, + []string{"result"}, + ) + // 数据库指标 DBConnectionsActive = prometheus.NewGauge( prometheus.GaugeOpts{ @@ -98,6 +114,8 @@ func init() { prometheus.MustRegister(ActiveProviders) prometheus.MustRegister(RouteDecisionsTotal) prometheus.MustRegister(RouteFailoversTotal) + prometheus.MustRegister(UserKeyOperationsTotal) + prometheus.MustRegister(UserKeyChatRequestsTotal) prometheus.MustRegister(DBConnectionsActive) prometheus.MustRegister(DBOperationsTotal) prometheus.MustRegister(LogFlushErrorsTotal) @@ -111,6 +129,9 @@ func Handler() http.Handler { // RecordHTTPRequest records metrics for an HTTP request func RecordHTTPRequest(method, path string, status int, duration time.Duration) { + if path == "" { + path = "unknown" + } HTTPRequestsTotal.WithLabelValues(method, path, http.StatusText(status)).Inc() HTTPRequestDuration.WithLabelValues(method, path).Observe(duration.Seconds()) } @@ -125,6 +146,16 @@ func RecordRouteFailover() { RouteFailoversTotal.Inc() } +// RecordUserKeyOperation records a user key lifecycle/governance operation. +func RecordUserKeyOperation(operation, result string) { + UserKeyOperationsTotal.WithLabelValues(operation, result).Inc() +} + +// RecordUserKeyChatRequest records a public user-key chat completion request outcome. +func RecordUserKeyChatRequest(result string) { + UserKeyChatRequestsTotal.WithLabelValues(result).Inc() +} + // SetActiveHosts sets the active hosts gauge func SetActiveHosts(count float64) { ActiveHosts.Set(count) @@ -166,7 +197,11 @@ func Middleware(next http.Handler) http.Handler { next.ServeHTTP(wrapped, r) duration := time.Since(start) - RecordHTTPRequest(r.Method, r.URL.Path, wrapped.statusCode, duration) + path := r.Pattern + if path == "" { + path = r.URL.Path + } + RecordHTTPRequest(r.Method, path, wrapped.statusCode, duration) }) } diff --git a/internal/metrics/metrics_test.go b/internal/metrics/metrics_test.go index 3a88a6dc..2037e5f5 100644 --- a/internal/metrics/metrics_test.go +++ b/internal/metrics/metrics_test.go @@ -64,6 +64,36 @@ func TestRecordRouteFailover(t *testing.T) { } } +func TestRecordUserKeyOperation(t *testing.T) { + RecordUserKeyOperation("create", "success") + RecordUserKeyOperation("pause", "success") + + req := httptest.NewRequest("GET", "/metrics", nil) + rr := httptest.NewRecorder() + + Handler().ServeHTTP(rr, req) + + body := rr.Body.String() + if !strings.Contains(body, "user_key_operations_total") { + t.Error("Expected metrics endpoint to contain user_key_operations_total") + } +} + +func TestRecordUserKeyChatRequest(t *testing.T) { + RecordUserKeyChatRequest("ok") + RecordUserKeyChatRequest("key_paused") + + req := httptest.NewRequest("GET", "/metrics", nil) + rr := httptest.NewRecorder() + + Handler().ServeHTTP(rr, req) + + body := rr.Body.String() + if !strings.Contains(body, "user_key_chat_requests_total") { + t.Error("Expected metrics endpoint to contain user_key_chat_requests_total") + } +} + func TestSetActiveHosts(t *testing.T) { SetActiveHosts(10) diff --git a/scripts/deploy/deploy_tksea_portal.sh b/scripts/deploy/deploy_tksea_portal.sh index 8758066b..f679cd23 100755 --- a/scripts/deploy/deploy_tksea_portal.sh +++ b/scripts/deploy/deploy_tksea_portal.sh @@ -105,6 +105,15 @@ block = textwrap.dedent("""\ proxy_http_version 1.1; } + location = /v1/chat/completions { + proxy_pass http://127.0.0.1:${REMOTE_CRM_PORT}/v1/chat/completions; + proxy_set_header Host \$host; + proxy_set_header X-Real-IP \$remote_addr; + proxy_set_header X-Forwarded-For \$proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto \$scheme; + proxy_http_version 1.1; + } + location /kimi-portal/ { return 302 /portal/; }