- commit
- c5e269d
- parent
- 926860a
- author
- im_wower
- date
- 2026-03-22 04:26:28 +0800 CST
feat: record T-029 live stability regression
5 files changed,
+778,
-8
1@@ -1,10 +1,10 @@
2 ---
3 task_id: T-029
4 title: 多节点长时间稳定性回归
5-status: todo
6+status: review
7 branch: feat/T-029-stability-regression
8 repo: /Users/george/code/baa-conductor
9-base_ref: main
10+base_ref: main@926860a
11 depends_on:
12 - T-028
13 write_scope:
14@@ -14,7 +14,7 @@ write_scope:
15 - scripts/smoke/**
16 - scripts/failover/**
17 - scripts/runtime/**
18-updated_at: 2026-03-22
19+updated_at: 2026-03-22 04:26:10 CST
20 ---
21
22 # T-029 多节点长时间稳定性回归
23@@ -68,23 +68,58 @@ updated_at: 2026-03-22
24
25 ## files_changed
26
27-- 待填写
28+- coordination/tasks/T-029-stability-regression.md
29+- docs/ops/README.md
30+- docs/ops/real-stability-regression-2026-03-22.md
31+- scripts/smoke/README.md
32+- scripts/smoke/live-regression.mjs
33
34 ## commands_run
35
36-- 待填写
37+- `npx --yes pnpm install`
38+- `./scripts/failover/print-topology.sh --env /Users/george/.config/baa-conductor/ops.env`
39+- `./scripts/failover/rehearsal-check.sh --env /Users/george/.config/baa-conductor/ops.env --basic-auth "$DIRECT_BASIC_AUTH" --bearer-token "$CONTROL_API_OPS_ADMIN_TOKEN" --expect-leader mini`
40+- `node scripts/smoke/live-regression.mjs --env /Users/george/.config/baa-conductor/ops.env --control-secrets /Users/george/.config/baa-conductor/control-api-worker.secrets.env --basic-auth-file /Users/george/.config/baa-conductor/direct-node-basic-auth.env --expect-leader mini`
41+- `./scripts/runtime/check-node.sh --repo-dir /Users/george/code/baa-conductor-T028-v2 --node mini --service conductor --service status-api --install-dir /Users/george/Library/LaunchAgents --local-api-base http://100.71.210.78:4317 --local-api-allowed-hosts 100.71.210.78 --status-api-base http://100.71.210.78:4318 --status-api-host 100.71.210.78 --expected-rolez leader --check-loaded`
42+- `ssh george@100.112.239.13 'cd /Users/george/code/baa-conductor-T028-v2 && ./scripts/runtime/check-node.sh --repo-dir /Users/george/code/baa-conductor-T028-v2 --node mac --service conductor --service status-api --install-dir /Users/george/Library/LaunchAgents --local-api-base http://100.112.239.13:4317 --local-api-allowed-hosts 100.112.239.13 --status-api-base http://100.112.239.13:4318 --status-api-host 100.112.239.13 --expected-rolez standby --check-loaded'`
43+- `for i in 1 2 3 4 5 6 7; do node scripts/smoke/live-regression.mjs ... --expect-leader mini --compact; sleep 300; done`
44+- `ps -axo pid=,command= | grep '/apps/worker-runner/dist/index.js' | grep -v grep || true`
45+- `ssh george@100.112.239.13 "ps -axo pid=,command= | grep '/apps/worker-runner/dist/index.js' | grep -v grep || true"`
46+- `curl -X POST ... https://control-api.makefile.so/v1/system/drain`
47+- `curl -X POST ... https://control-api.makefile.so/v1/system/pause`
48+- `launchctl bootout "gui/$(id -u)" "$HOME/Library/LaunchAgents/so.makefile.baa-conductor.plist"`
49+- `./scripts/failover/rehearsal-check.sh --env /Users/george/.config/baa-conductor/ops.env --basic-auth "$DIRECT_BASIC_AUTH" --bearer-token "$CONTROL_API_OPS_ADMIN_TOKEN" --skip-node mini --expect-leader mac`
50+- `curl -X POST ... https://control-api.makefile.so/v1/system/resume`
51+- `ssh george@100.112.239.13 'launchctl bootout "gui/$(id -u)" "$HOME/Library/LaunchAgents/so.makefile.baa-conductor.plist"'`
52+- `cd /Users/george/code/baa-conductor-T028-v2 && ./scripts/runtime/reload-launchd.sh --service conductor --install-dir /Users/george/Library/LaunchAgents`
53+- `./scripts/failover/rehearsal-check.sh --env /Users/george/.config/baa-conductor/ops.env --basic-auth "$DIRECT_BASIC_AUTH" --bearer-token "$CONTROL_API_OPS_ADMIN_TOKEN" --skip-node mac --expect-leader mini`
54+- `ssh george@100.112.239.13 'cd /Users/george/code/baa-conductor-T028-v2 && ./scripts/runtime/reload-launchd.sh --service conductor --install-dir /Users/george/Library/LaunchAgents'`
55+- `dig @giancarlo.ns.cloudflare.com +short conductor.makefile.so`
56+- `echo | openssl s_client -servername conductor.makefile.so -connect 192.210.137.113:443 2>/dev/null | openssl x509 -noout -dates`
57
58 ## result
59
60-- 待填写
61+- 新增 `scripts/smoke/live-regression.mjs`,把 control-api、公网入口、直连入口、status-api 和鉴权结果收成一份真实环境快照;`scripts/smoke/README.md` 与 `docs/ops/README.md` 已补用法与报告入口。
62+- 在 live `mini/mac/VPS/Cloudflare` 环境上完成了一轮基线 smoke、一轮 `30.3` 分钟持续观察、一次 planned failover 和一次 switchback,并把过程写入 `docs/ops/real-stability-regression-2026-03-22.md`。
63+- steady-state 结论:control-api、公网 conductor、mini/mac 直连 conductor、Basic Auth、mini/mac on-node `check-node.sh` 都可以工作;final steady-state 已回到 `mini-main` leader、`mac-standby` standby、`mode=running`。
64+- failover / switchback 都真实成功,但都观察到了短暂外部不一致窗口:停掉当前 leader 后,公网或恢复中的节点会先返回 `standby`,随后才稳定到新的 `leader`。
65+- 本轮 smoke 没有完全通过:两台节点的 `status-api /v1/status` 全程都停在 `source=empty`, `mode=paused`, `leaderId=null`,与 control-api 真相不一致。
66
67 ## risks
68
69-- 待填写
70+- live `status-api` 目前不是可靠真相源;它不能用来判断自动化是否真的在 `running`,也不能确认当前 leader、queueDepth 或 activeRuns。
71+- mini/mac 两侧 launchd 安装副本仍然指向 `/Users/george/code/baa-conductor-T028-v2`,还没有整理到 canonical repo path `/Users/george/code/baa-conductor`。
72+- switchback 不是“停 mac、起 mini”就完全结束;要恢复 canonical `mini leader + mac standby`,还需要额外显式 reload `mac` conductor。
73+- authoritative DNS 仍是 DNS-only,Cloudflare proxy 没有重新开启;本轮没有尝试把 conductor hosts 切回橙云。
74+- 本机非权威 DNS / 代理层对 `conductor.makefile.so`、`mac-conductor.makefile.so` 的解析与权威结果不一致,直接做 DNS/TLS 诊断时必须区分 authoritative 结果和本地 resolver 结果。
75+- 在 planned failover 前,我有一次把 `GET /v1/system/state` 和 `POST /v1/system/pause` 并行发出,所以那次 `mode=paused` 读数不能单独拿来证明 `drain` 的独立效果;后续状态读取都改回串行。
76
77 ## next_handoff
78
79-- 待填写
80+- 优先排查 live `status-api` 为什么持续返回 `source=empty`,并让 `/v1/status` 跟随 control-api 的真实 leader / mode / queue state。
81+- 规划一次 runtime canonicalization,把 mini/mac 的 launchd、logs、runs、worktrees 路径从 `/Users/george/code/baa-conductor-T028-v2` 收口到 `/Users/george/code/baa-conductor`。
82+- 如果要把 switchback 降为更可重复的运维流程,runbook 或脚本层需要显式补上“恢复 mac standby”的尾步骤。
83+- 如果后续要重新启用 Cloudflare proxy,先把 DNS / TLS / zone SSL 模式的真实控制面整理清楚,再做单独演练,不要在当前 DNS-only 基线之上直接切橙云。
84
85 开始时建议直接把 `status` 改为 `in_progress`。
86
+4,
-0
1@@ -32,6 +32,10 @@
2 - `launchd` 决定节点上的 conductor / status-api 是保持 loopback,还是显式切到节点自己的 Tailscale `100.x`
3 - Nginx 不知道谁持有 leader lease,所以“逻辑 leader 已切走”不等于“公网一定已切走”
4
5+最新真实回归记录:
6+
7+- [`real-stability-regression-2026-03-22.md`](./real-stability-regression-2026-03-22.md)
8+
9 ## 单一来源 inventory
10
11 本任务把公网域名、VPS 公网 IP、内网 Tailscale `100.x` 和 Nginx 安装路径收口到一份 inventory:
1@@ -0,0 +1,236 @@
2+# 2026-03-22 Real Stability Regression
3+
4+## Scope
5+
6+This pass reused the live environment from `T-028` instead of redeploying:
7+
8+- `control-api.makefile.so`
9+- `conductor.makefile.so`
10+- `mini-conductor.makefile.so`
11+- `mac-conductor.makefile.so`
12+- `mini` on Tailscale `100.71.210.78`
13+- `mac` on Tailscale `100.112.239.13`
14+- VPS `192.210.137.113`
15+
16+The goal was a real live-environment regression on smoke, observation, planned failover, switchback, and residual risk capture.
17+
18+## Environment Snapshot
19+
20+- authoritative DNS (queried with `dig @giancarlo.ns.cloudflare.com`) returned `192.210.137.113` for all three conductor hosts during this pass
21+- both installed launchd plists still point at `/Users/george/code/baa-conductor-T028-v2`, not `/Users/george/code/baa-conductor`
22+- `ps` showed no active `worker-runner` process on either node before planned failover
23+- origin TLS on `192.210.137.113:443` currently presents these expiry dates:
24+ - `conductor.makefile.so`: `notAfter=Jun 19 18:17:37 2026 GMT`
25+ - `mini-conductor.makefile.so`: `notAfter=Jun 19 18:18:23 2026 GMT`
26+ - `mac-conductor.makefile.so`: `notAfter=Jun 19 18:19:28 2026 GMT`
27+
28+## Baseline Smoke
29+
30+Executed on 2026-03-22 CST before failover:
31+
32+```bash
33+./scripts/failover/print-topology.sh --env /Users/george/.config/baa-conductor/ops.env
34+
35+./scripts/failover/rehearsal-check.sh \
36+ --env /Users/george/.config/baa-conductor/ops.env \
37+ --basic-auth "$DIRECT_BASIC_AUTH" \
38+ --bearer-token "$CONTROL_API_OPS_ADMIN_TOKEN" \
39+ --expect-leader mini
40+
41+node scripts/smoke/live-regression.mjs \
42+ --env /Users/george/.config/baa-conductor/ops.env \
43+ --control-secrets /Users/george/.config/baa-conductor/control-api-worker.secrets.env \
44+ --basic-auth-file /Users/george/.config/baa-conductor/direct-node-basic-auth.env \
45+ --expect-leader mini
46+
47+./scripts/runtime/check-node.sh \
48+ --repo-dir /Users/george/code/baa-conductor-T028-v2 \
49+ --node mini \
50+ --service conductor \
51+ --service status-api \
52+ --install-dir /Users/george/Library/LaunchAgents \
53+ --local-api-base http://100.71.210.78:4317 \
54+ --local-api-allowed-hosts 100.71.210.78 \
55+ --status-api-base http://100.71.210.78:4318 \
56+ --status-api-host 100.71.210.78 \
57+ --expected-rolez leader \
58+ --check-loaded
59+
60+ssh george@100.112.239.13 \
61+ 'cd /Users/george/code/baa-conductor-T028-v2 && ./scripts/runtime/check-node.sh \
62+ --repo-dir /Users/george/code/baa-conductor-T028-v2 \
63+ --node mac \
64+ --service conductor \
65+ --service status-api \
66+ --install-dir /Users/george/Library/LaunchAgents \
67+ --local-api-base http://100.112.239.13:4317 \
68+ --local-api-allowed-hosts 100.112.239.13 \
69+ --status-api-base http://100.112.239.13:4318 \
70+ --status-api-host 100.112.239.13 \
71+ --expected-rolez standby \
72+ --check-loaded'
73+```
74+
75+Observed baseline:
76+
77+- `control-api /v1/system/state` returned `holder_id=mini-main`, `mode=running`, `term=1`
78+- public ingress returned `healthz=ok`, `readyz=ready`, `rolez=leader`
79+- `mini` direct host returned `healthz=ok`, `readyz=ready`, `rolez=leader`
80+- `mac` direct host returned `healthz=ok`, `readyz=ready`, `rolez=standby`
81+- unauthenticated direct-host probes returned `401`
82+- both `check-node.sh` passes succeeded against the live `T028-v2` runtime
83+
84+Smoke did not fully pass:
85+
86+- both `http://100.71.210.78:4318/v1/status` and `http://100.112.239.13:4318/v1/status` returned `ok=true`
87+- but both payloads stayed at `source=empty`, `mode=paused`, `leaderId=null`
88+- this did not match the live control plane state (`mini-main`, `running`)
89+
90+## 30-Minute Observation Window
91+
92+Sampling method:
93+
94+- command: `node scripts/smoke/live-regression.mjs ... --expect-leader mini --compact`
95+- sample count: `7`
96+- interval: `300s`
97+- raw log: `/tmp/t029-observation-20260322.jsonl`
98+- window: `2026-03-22 03:43:28 CST` to `2026-03-22 04:13:44 CST` (`30.3` minutes)
99+
100+Observed across all 7 samples:
101+
102+- control plane holder stayed `mini-main`
103+- control plane mode stayed `running`
104+- public `/rolez` stayed `leader`
105+- `mini` direct `/rolez` stayed `leader`
106+- `mac` direct `/rolez` stayed `standby`
107+- no `healthz` or `readyz` failure was observed on public or direct conductor hosts
108+- direct-host Basic Auth remained enforced
109+- the only repeated smoke failure was status drift:
110+ - `mini status-api`: `source=empty`, `mode=paused`
111+ - `mac status-api`: `source=empty`, `mode=paused`
112+
113+## Planned Failover
114+
115+Preconditions captured before cutover:
116+
117+- no `worker-runner` process on `mini`
118+- no `worker-runner` process on `mac`
119+- note: because `status-api` was already stale, active-run confirmation had to rely on host process checks instead of `/v1/status`
120+
121+Commands used:
122+
123+```bash
124+curl -X POST \
125+ -H "Authorization: Bearer $CONTROL_API_BROWSER_ADMIN_TOKEN" \
126+ -H 'Content-Type: application/json' \
127+ -d '{"requested_by":"T-029","reason":"planned_failover_rehearsal"}' \
128+ https://control-api.makefile.so/v1/system/drain
129+
130+curl -X POST \
131+ -H "Authorization: Bearer $CONTROL_API_BROWSER_ADMIN_TOKEN" \
132+ -H 'Content-Type: application/json' \
133+ -d '{"requested_by":"T-029","reason":"planned_failover_cutover"}' \
134+ https://control-api.makefile.so/v1/system/pause
135+
136+launchctl bootout "gui/$(id -u)" "$HOME/Library/LaunchAgents/so.makefile.baa-conductor.plist"
137+```
138+
139+Timeline in CST:
140+
141+- `04:14:48`: `drain` accepted
142+- `04:15:03`: control plane confirmed `mode=paused`
143+- `04:15:09`: `mini` conductor booted out from launchd
144+- `04:15:25`: first post-stop probe showed `public /rolez=standby`, `mac /rolez=standby`, control holder still `mini-main`
145+- `04:15:40`: second probe showed `public /rolez=standby`, `mac /rolez=leader`, control holder `mac-standby`, `term=2`
146+- `04:15:54`: third probe showed `public /rolez=leader`, `mac /rolez=leader`, control holder `mac-standby`, `term=2`
147+- `04:16:29`: authenticated `mini-conductor.makefile.so/healthz` returned `502`; `curl --noproxy '*' http://100.71.210.78:4317/healthz` failed with curl `7` / HTTP `000`
148+- `04:17:00`: `resume` accepted and control plane returned `mode=running`, `holder_id=mac-standby`
149+
150+Failover conclusion:
151+
152+- planned failover worked on the real environment
153+- lease moved from `mini-main` to `mac-standby`
154+- public ingress recovered back to `leader` without DNS edits
155+- there was a real transient cutover window after stopping `mini`
156+ - public `/rolez` was observed as `standby`
157+ - the window was still present at `04:15:40`
158+ - it had cleared by `04:15:54`
159+
160+## Switchback
161+
162+Commands used:
163+
164+```bash
165+curl -X POST \
166+ -H "Authorization: Bearer $CONTROL_API_BROWSER_ADMIN_TOKEN" \
167+ -H 'Content-Type: application/json' \
168+ -d '{"requested_by":"T-029","reason":"switchback_prepare"}' \
169+ https://control-api.makefile.so/v1/system/pause
170+
171+ssh george@100.112.239.13 \
172+ 'launchctl bootout "gui/$(id -u)" "$HOME/Library/LaunchAgents/so.makefile.baa-conductor.plist"'
173+
174+cd /Users/george/code/baa-conductor-T028-v2
175+./scripts/runtime/reload-launchd.sh --service conductor --install-dir /Users/george/Library/LaunchAgents
176+```
177+
178+Timeline in CST:
179+
180+- `04:17:17`: switchback `pause` accepted
181+- `04:17:24`: `mac` conductor booted out; `mini` conductor reload completed
182+- `04:17:41`: first probe showed `public /rolez=standby`, `mini /rolez=standby`, control holder still `mac-standby`
183+- by the next probe window, control holder had returned to `mini-main`, `term=3`, and both public/mini `/rolez` were back to `leader`
184+- `04:18:08`: `resume` accepted and control plane returned `mode=running`, `holder_id=mini-main`
185+
186+Switchback was not fully closed at that point:
187+
188+- `04:18:23` snapshot still showed `mac-conductor.makefile.so` returning `502`
189+- that was expected from the runbook sequence because `mac` had been stopped, but it meant the topology was not yet back to `mini leader + mac standby`
190+
191+Canonical topology restoration required one more explicit step:
192+
193+```bash
194+ssh george@100.112.239.13 \
195+ 'cd /Users/george/code/baa-conductor-T028-v2 && ./scripts/runtime/reload-launchd.sh \
196+ --service conductor \
197+ --install-dir /Users/george/Library/LaunchAgents'
198+```
199+
200+Observed after reloading `mac`:
201+
202+- `04:18:41`: remote reload started
203+- first remote `check-node.sh` after reload saw the `mac` conductor process but reported `conductor is not listening on TCP port 4317`
204+- a few seconds later, `rehearsal-check.sh` showed `mac-conductor.makefile.so` back at `rolez=standby`
205+- `04:19:51`: final `live-regression.mjs` snapshot showed public/direct roles back to `leader / leader / standby`
206+- final local and remote `check-node.sh` runs both passed
207+
208+Switchback conclusion:
209+
210+- control plane and public ingress successfully returned to `mini-main`, `term=3`
211+- restoring `mac` as standby required an explicit post-switchback reload
212+- there was a short runtime window where the process existed before `4317` was actually listening
213+
214+## Current Findings
215+
216+### Confirmed working
217+
218+- authoritative DNS still points all conductor hosts at the VPS public IP
219+- public ingress stayed available through planned failover and switchback
220+- leader lease moved `mini -> mac -> mini`
221+- direct-host Basic Auth stayed enforced
222+- both nodes can return to a healthy `leader / standby` split after explicit reloads
223+
224+### Confirmed regression
225+
226+- both `status-api /v1/status` endpoints are stale in live
227+- they continue to report `source=empty`, `mode=paused`, `leaderId=null`
228+- this remained true before failover, during failover, after switchback, and after both nodes re-entered healthy roles
229+
230+## Residual Risks
231+
232+- `status-api` is not a reliable truth source in the live environment right now; it cannot be used to decide whether automation is really `running` or whether leader identity has updated.
233+- installed launchd plists on both nodes still target `/Users/george/code/baa-conductor-T028-v2`; the runtime has not been normalized to the canonical repo path.
234+- switchback is not a one-command return to `mini leader + mac standby`; after moving leadership back to `mini`, `mac` still needed an explicit `reload-launchd.sh --service conductor`.
235+- there is a real short-lived public inconsistency window during both failover and switchback where `/rolez` can read `standby` before the next leader is externally visible as `leader`.
236+- this rollout is still operating in DNS-only mode from the authoritative DNS perspective; Cloudflare proxy is not back in service for the conductor hosts.
237+- local non-authoritative resolver output was inconsistent with authoritative DNS for some hostnames during this task, which is another reminder that local DNS/proxy layers can mislead direct diagnostics.
+22,
-0
1@@ -23,3 +23,25 @@ bash scripts/smoke/stop-stack.sh --state-dir <tmp/smoke-...> --json
2 - 执行一次最小主备切换,确认 lease 从 `smoke-mini` 漂移到 `smoke-mac`
3
4 所有临时状态、数据库和日志都会落到 `tmp/smoke-*` 目录。
5+
6+## Live 环境快照
7+
8+真实环境回归可以直接跑:
9+
10+```bash
11+node scripts/smoke/live-regression.mjs \
12+ --env /Users/george/.config/baa-conductor/ops.env \
13+ --control-secrets /Users/george/.config/baa-conductor/control-api-worker.secrets.env \
14+ --basic-auth-file /Users/george/.config/baa-conductor/direct-node-basic-auth.env \
15+ --expect-leader mini
16+```
17+
18+它会一次性采样这些面:
19+
20+- `GET https://control-api.makefile.so/v1/system/state`
21+- `conductor.makefile.so` 的 `/healthz` `/readyz` `/rolez`
22+- `mini-conductor.makefile.so` / `mac-conductor.makefile.so` 的 Basic Auth 与 `/healthz` `/readyz` `/rolez`
23+- `http://100.71.210.78:4318/v1/status`
24+- `http://100.112.239.13:4318/v1/status`
25+
26+默认输出 JSON 快照,并把 status-api 视图是否和 control-api 当前 leader / mode 对齐也一起标出来。
+473,
-0
1@@ -0,0 +1,473 @@
2+#!/usr/bin/env node
3+
4+import { execFile } from "node:child_process";
5+import { readFileSync } from "node:fs";
6+import { basename } from "node:path";
7+import { promisify } from "node:util";
8+
// Promise-returning wrapper around child_process.execFile; used to spawn curl.
const execFileAsync = promisify(execFile);

// Fallback control-api origin when the inventory file does not set BAA_CONTROL_API_BASE.
const DEFAULT_CONTROL_API_BASE = "https://control-api.makefile.so";

// Fallback conductor port when the inventory file does not set BAA_CONDUCTOR_PORT.
const DEFAULT_CONDUCTOR_PORT = 4_317;

// Port used to build the per-node status-api base URLs.
const DEFAULT_STATUS_API_PORT = 4_318;

// Per-request timeout (milliseconds) used when --timeout-ms is not given.
const DEFAULT_TIMEOUT_MS = 5_000;
14+
/**
 * Write the command-line help text to stdout.
 *
 * The text is assembled line by line; the trailing empty entry yields the
 * final newline.
 */
function usage() {
  const helpLines = [
    "Usage:",
    " node scripts/smoke/live-regression.mjs [options]",
    "",
    "Options:",
    " --env PATH Inventory env file.",
    " --control-secrets PATH Secrets env file with control-api tokens.",
    " --basic-auth-file PATH Env file with direct-node Basic Auth credentials.",
    " --bearer-token TOKEN Override the bearer token used for GET /v1/system/state.",
    " --basic-auth USER:PASS Override the direct-node Basic Auth value.",
    " --expect-leader VALUE mini, mac, or any. Defaults to mini.",
    " --timeout-ms N Per-request timeout in milliseconds. Defaults to 5000.",
    " --compact Emit compact JSON instead of pretty-printed JSON.",
    " --strict Exit non-zero when assertions fail.",
    " --help Show this help text.",
    ""
  ];

  process.stdout.write(helpLines.join("\n"));
}
32+
/**
 * Parse the command-line arguments of the live-regression script.
 *
 * @param {string[]} argv - Arguments after the script path (process.argv.slice(2)).
 * @returns {object} Options with `compact`, `strict`, `expectLeader`, `timeoutMs`
 *   and, when supplied, `envPath`, `controlSecretsPath`, `basicAuthFile`,
 *   `bearerToken` and `basicAuth`.
 * @throws {Error} On an unknown option, a missing option value, a missing
 *   `--env`, a non-integer or non-positive `--timeout-ms`, or an unsupported
 *   `--expect-leader` value.
 */
function parseArgs(argv) {
  const options = {
    compact: false,
    expectLeader: "mini",
    strict: false,
    timeoutMs: DEFAULT_TIMEOUT_MS
  };

  for (let index = 0; index < argv.length; index += 1) {
    const token = argv[index];
    switch (token) {
      case "--env":
        options.envPath = requireValue(argv, ++index, token);
        break;
      case "--control-secrets":
        options.controlSecretsPath = requireValue(argv, ++index, token);
        break;
      case "--basic-auth-file":
        options.basicAuthFile = requireValue(argv, ++index, token);
        break;
      case "--bearer-token":
        options.bearerToken = requireValue(argv, ++index, token);
        break;
      case "--basic-auth":
        options.basicAuth = requireValue(argv, ++index, token);
        break;
      case "--expect-leader":
        options.expectLeader = requireValue(argv, ++index, token);
        break;
      case "--timeout-ms": {
        const rawTimeout = requireValue(argv, ++index, token);
        // Number.parseInt alone stops at the first non-digit, so "5000abc" or
        // "1.9" would be accepted. Require a digits-only string so the
        // validation below really enforces "positive integer".
        options.timeoutMs = /^\d+$/u.test(rawTimeout) ? Number.parseInt(rawTimeout, 10) : Number.NaN;
        break;
      }
      case "--compact":
        options.compact = true;
        break;
      case "--strict":
        options.strict = true;
        break;
      case "--help":
        usage();
        process.exit(0);
        // Unreachable in practice; keeps the case from falling through to
        // `default` if process.exit is ever stubbed.
        break;
      default:
        throw new Error(`Unknown option "${token}".`);
    }
  }

  if (!options.envPath) {
    throw new Error("--env is required.");
  }

  if (!Number.isSafeInteger(options.timeoutMs) || options.timeoutMs <= 0) {
    throw new Error("--timeout-ms must be a positive integer.");
  }

  if (!["mini", "mac", "any"].includes(options.expectLeader)) {
    throw new Error(`Unsupported --expect-leader value "${options.expectLeader}".`);
  }

  return options;
}
93+
/**
 * Return the argument at `index`, insisting that it is present and non-empty.
 *
 * @param {string[]} argv - Argument list being parsed.
 * @param {number} index - Position where the option's value is expected.
 * @param {string} optionName - Option the value belongs to; used in the error text.
 * @returns {string} The argument found at `index`.
 * @throws {Error} When the slot is missing or holds an empty string.
 */
function requireValue(argv, index, optionName) {
  const candidate = argv[index];
  if (candidate) {
    return candidate;
  }

  throw new Error(`${optionName} requires a value.`);
}
101+
/**
 * Read a dotenv-style file into a plain key/value object.
 *
 * Blank lines, `#` comment lines and lines without a `KEY=` part are skipped.
 * A leading `export ` is dropped so files written to be sourced by a shell
 * parse the same way. One pair of matching surrounding quotes is removed from
 * the value; no escape sequences or inline comments are interpreted.
 *
 * @param {string} path - File to read (UTF-8).
 * @returns {Record<string, string>} Parsed entries; a later duplicate key wins.
 * @throws {Error} Whatever readFileSync throws when the file cannot be read.
 */
function loadEnvFile(path) {
  const text = readFileSync(path, "utf8");
  const entries = {};

  for (const rawLine of text.split(/\r?\n/u)) {
    const trimmed = rawLine.trim();
    if (!trimmed || trimmed.startsWith("#")) {
      continue;
    }

    // Accept shell-style `export KEY=value` lines.
    const line = trimmed.replace(/^export\s+/u, "");

    const separatorIndex = line.indexOf("=");
    if (separatorIndex <= 0) {
      continue;
    }

    const key = line.slice(0, separatorIndex).trim();
    let value = line.slice(separatorIndex + 1).trim();

    // Require at least two characters so a lone quote is not treated as a
    // (mis)matched pair and stripped to an empty string.
    const quote = value[0];
    if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
      value = value.slice(1, -1);
    }

    entries[key] = value;
  }

  return entries;
}
132+
/**
 * Choose the bearer token for the control-api request.
 *
 * Candidates are tried in order: the command-line override, then the
 * ops-admin, read-only and browser-admin tokens from the secrets file.
 *
 * @param {object} options - Parsed options; `bearerToken` is the override.
 * @param {object} secrets - Entries loaded from the control secrets file.
 * @returns {string} The first non-empty candidate, or "" when none is set.
 */
function pickBearerToken(options, secrets) {
  const candidates = [
    options.bearerToken,
    secrets.CONTROL_API_OPS_ADMIN_TOKEN,
    secrets.CONTROL_API_READONLY_TOKEN,
    secrets.CONTROL_API_BROWSER_ADMIN_TOKEN
  ];

  for (const candidate of candidates) {
    if (candidate) {
      return candidate;
    }
  }

  return "";
}
142+
/**
 * Choose the `user:password` value for direct-node Basic Auth.
 *
 * @param {object} options - Parsed options; `basicAuth` overrides the file.
 * @param {object} authEnv - Entries loaded from the Basic Auth env file.
 * @returns {string} The override if set, otherwise `user:password` built from
 *   the env entries, or "" when either half is missing.
 */
function pickBasicAuth(options, authEnv) {
  const { basicAuth: override } = options;
  if (override) {
    return override;
  }

  const {
    BAA_DIRECT_NODE_BASIC_AUTH_USER: user,
    BAA_DIRECT_NODE_BASIC_AUTH_PASSWORD: password
  } = authEnv;

  return user && password ? `${user}:${password}` : "";
}
156+
/**
 * Convert an epoch timestamp to an ISO-8601 string.
 *
 * Values greater than 1e12 are taken as milliseconds; anything else is taken
 * as seconds and multiplied by 1000.
 *
 * @param {number|string|null|undefined} value - Epoch timestamp, numeric or numeric string.
 * @returns {string|null} ISO string, or null when the value is missing,
 *   not a finite number, or outside the range a Date can represent.
 */
function toEpochIso(value) {
  if (value === null || value === undefined || value === "") {
    return null;
  }

  const numericValue = Number(value);
  if (!Number.isFinite(numericValue)) {
    return null;
  }

  const millis = numericValue > 1_000_000_000_000 ? numericValue : numericValue * 1000;
  const date = new Date(millis);

  // A finite number can still be outside the Date range; toISOString() would
  // throw RangeError on such an invalid Date, so report "no timestamp" instead.
  if (Number.isNaN(date.getTime())) {
    return null;
  }

  return date.toISOString();
}
170+
/**
 * Reduce a control-api `/v1/system/state` payload to the fields the report uses.
 *
 * Each lease field is looked up under `payload.data` first and then on the
 * payload itself, so both enveloped and flat responses are handled.
 *
 * @param {object|null|undefined} payload - Parsed JSON response body.
 * @returns {object} `holderId`, `leaseExpiresAt`, `leaseExpiresAtIso`, `mode`,
 *   `ok`, `requestId` and `term`; absent fields are null, `ok` is a boolean.
 */
function normalizeControlState(payload) {
  const data = payload?.data ?? payload ?? {};
  const pick = (field) => data[field] ?? payload?.[field] ?? null;
  const leaseExpiresAt = pick("lease_expires_at");

  return {
    holderId: pick("holder_id"),
    leaseExpiresAt,
    leaseExpiresAtIso: toEpochIso(leaseExpiresAt),
    mode: pick("mode"),
    ok: payload?.ok === true,
    requestId: payload?.request_id ?? null,
    term: pick("term")
  };
}
183+
/**
 * Reduce a status-api `/v1/status` payload to the fields the report uses.
 *
 * Fields are read from `payload.data` when present, otherwise from the payload
 * itself; anything absent becomes null.
 *
 * @param {object|null|undefined} payload - Parsed JSON response body.
 * @returns {object} Snapshot summary; `ok` is true only when `payload.ok === true`.
 */
function normalizeStatusSnapshot(payload) {
  const data = payload?.data ?? payload ?? {};
  const field = (name) => data[name] ?? null;

  return {
    activeRuns: field("activeRuns"),
    leaderHost: field("leaderHost"),
    leaderId: field("leaderId"),
    leaseActive: field("leaseActive"),
    leaseExpiresAt: field("leaseExpiresAt"),
    leaseTerm: field("leaseTerm"),
    mode: field("mode"),
    observedAt: field("observedAt"),
    ok: payload?.ok === true,
    queueDepth: field("queueDepth"),
    source: field("source"),
    updatedAt: field("updatedAt")
  };
}
201+
/**
 * Derive the node name from the lease holder id reported by control-api.
 *
 * @param {object|null|undefined} controlState - Normalized control state.
 * @returns {"mini"|"mac"|null} "mini" or "mac" when `holderId` starts with
 *   `mini-` / `mac-`, otherwise null.
 */
function expectedLeaderFromControl(controlState) {
  const holderId = controlState?.holderId ?? "";

  for (const nodeName of ["mini", "mac"]) {
    if (holderId.startsWith(`${nodeName}-`)) {
      return nodeName;
    }
  }

  return null;
}
212+
/**
 * Issue one HTTP GET through the `curl` binary and summarize the outcome.
 *
 * curl runs with `-sS -L`; the HTTP status is recovered from a trailer that
 * `-w` appends after the body. The function never rejects: when curl exits
 * non-zero or cannot be spawned, the failure is reported in the result.
 *
 * @param {string} url - Absolute URL to request.
 * @param {object} [init]
 * @param {Record<string, string>} [init.headers] - Extra request headers.
 * @param {number} [init.timeoutMs] - Per-request timeout in milliseconds.
 * @returns {Promise<object>} `{ body, httpStatus, ok, url }` on completion, or
 *   `{ body: "", error, httpStatus: null, ok: false, url }` on failure.
 */
async function fetchProbe(url, { headers = {}, timeoutMs = DEFAULT_TIMEOUT_MS } = {}) {
  // curl accepts fractional seconds for --max-time, so hand over the requested
  // millisecond budget as-is instead of rounding it up to whole seconds. The
  // floor of 1 ms prevents a "0.000" value, which curl treats as "no timeout".
  const maxTimeSeconds = (Math.max(1, timeoutMs) / 1000).toFixed(3);
  const args = ["-sS", "-L", "--max-time", maxTimeSeconds];

  for (const [key, value] of Object.entries(headers)) {
    args.push("-H", `${key}: ${value}`);
  }

  args.push("-w", "\n__HTTP_STATUS__:%{http_code}", url);

  try {
    const { stdout } = await execFileAsync("curl", args, {
      maxBuffer: 10 * 1024 * 1024,
      // One extra second so curl's own timeout normally fires first and its
      // error message is what gets reported.
      timeout: timeoutMs + 1000
    });
    const marker = "\n__HTTP_STATUS__:";
    const markerIndex = stdout.lastIndexOf(marker);
    const body = markerIndex >= 0 ? stdout.slice(0, markerIndex) : stdout;
    const statusText = markerIndex >= 0 ? stdout.slice(markerIndex + marker.length).trim() : "";
    const httpStatus = Number.parseInt(statusText, 10);

    return {
      // Drop carriage returns and trailing newlines so bodies can be compared
      // against exact strings.
      body: body.replace(/\r/g, "").replace(/\n+$/u, ""),
      httpStatus: Number.isFinite(httpStatus) ? httpStatus : null,
      ok: httpStatus >= 200 && httpStatus < 300,
      url
    };
  } catch (error) {
    return {
      body: "",
      // Prefer curl's stderr text; fall back to the Error message.
      error:
        error && typeof error === "object" && "stderr" in error && typeof error.stderr === "string"
          ? error.stderr.trim() || (error instanceof Error ? error.message : String(error))
          : error instanceof Error
            ? error.message
            : String(error),
      httpStatus: null,
      ok: false,
      url
    };
  }
}
259+
/**
 * Probe the `/healthz`, `/readyz` and `/rolez` endpoints of one conductor
 * base URL concurrently.
 *
 * @param {string} baseUrl - Origin of the conductor, without trailing path.
 * @param {Record<string, string>} headers - Headers sent with every probe.
 * @param {number} timeoutMs - Per-request timeout in milliseconds.
 * @returns {Promise<object>} `{ baseUrl, healthz, readyz, rolez }` with one
 *   probe result per endpoint.
 */
async function probeConductor(baseUrl, headers, timeoutMs) {
  const endpoints = ["healthz", "readyz", "rolez"];
  const probes = await Promise.all(
    endpoints.map((endpoint) => fetchProbe(`${baseUrl}/${endpoint}`, { headers, timeoutMs }))
  );
  const [healthz, readyz, rolez] = probes;

  return { baseUrl, healthz, readyz, rolez };
}
269+
/**
 * Probe a status-api instance: `/healthz` plus `/v1/status`, fetched concurrently.
 *
 * The `/v1/status` body is parsed as JSON when the probe did not error and the
 * body is non-empty; a parse failure is recorded as `status.parseError`.
 *
 * @param {string} baseUrl - Origin of the status-api.
 * @param {number} timeoutMs - Per-request timeout in milliseconds.
 * @returns {Promise<object>} `{ baseUrl, healthz, status, summary }`, where
 *   `summary` is the normalized snapshot or null when nothing usable was parsed.
 */
async function probeStatusApi(baseUrl, timeoutMs) {
  const requests = [
    fetchProbe(`${baseUrl}/healthz`, { timeoutMs }),
    fetchProbe(`${baseUrl}/v1/status`, { timeoutMs })
  ];
  const [healthz, status] = await Promise.all(requests);

  let decoded = null;
  const hasUsableBody = !status.error && Boolean(status.body);
  if (hasUsableBody) {
    try {
      decoded = JSON.parse(status.body);
    } catch (error) {
      status.parseError = error instanceof Error ? error.message : String(error);
    }
  }

  const summary = decoded ? normalizeStatusSnapshot(decoded) : null;
  return { baseUrl, healthz, status, summary };
}
292+
/**
 * Record `message` in `issues` when an expectation was not met.
 *
 * @param {string[]} issues - Accumulator, mutated in place.
 * @param {*} condition - Truthy when the expectation holds.
 * @param {string} message - Text appended when `condition` is falsy.
 */
function addIssue(issues, condition, message) {
  if (condition) {
    return;
  }

  issues.push(message);
}
298+
/**
 * Check that a probe returned exactly the expected HTTP status and body.
 *
 * @param {object|null|undefined} probe - Probe result with `httpStatus` and `body`.
 * @param {number} expectedStatus - Required HTTP status code.
 * @param {string} expectedBody - Required response body (strict equality).
 * @returns {boolean} True only when both status and body match.
 */
function responseMatches(probe, expectedStatus, expectedBody) {
  const statusMatches = probe?.httpStatus === expectedStatus;
  const bodyMatches = probe?.body === expectedBody;
  return statusMatches && bodyMatches;
}
302+
/**
 * Decide whether a status-api summary agrees with the control-api state.
 *
 * @param {object|null|undefined} controlState - Normalized control-api state.
 * @param {object|null|undefined} statusSummary - Normalized status-api summary.
 * @returns {boolean} True when the summary reports `ok`, the same mode, the
 *   same leader as the control-api holder, and a source other than "empty".
 *   False when either side is missing or the control state lacks holder or mode.
 */
function statusMatchesControl(controlState, statusSummary) {
  const holderId = controlState?.holderId;
  const mode = controlState?.mode;

  if (!holderId || !mode || !statusSummary) {
    return false;
  }
  if (statusSummary.ok !== true) {
    return false;
  }
  if (statusSummary.mode !== mode) {
    return false;
  }
  if (statusSummary.leaderId !== holderId) {
    return false;
  }

  return statusSummary.source !== "empty";
}
315+
/**
 * Entry point: take one snapshot of the live environment and print it as JSON.
 *
 * Steps:
 *   1. Parse arguments and load the inventory, secrets and Basic Auth env files.
 *   2. Probe control-api, the public conductor, both direct conductors (with
 *      and without Basic Auth) and both status-api instances concurrently.
 *   3. Compare the results with the expected leader/standby layout and collect
 *      every mismatch in `issues`.
 *   4. Write the snapshot to stdout; with `--strict`, finish with exit code 1
 *      when any issue was recorded.
 *
 * @returns {Promise<void>}
 * @throws {Error} When arguments are invalid, credentials are unavailable, or
 *   the inventory lacks a host/IP needed to build the probe URLs.
 */
async function main() {
  const options = parseArgs(process.argv.slice(2));
  const inventory = loadEnvFile(options.envPath);
  const controlSecrets = options.controlSecretsPath ? loadEnvFile(options.controlSecretsPath) : {};
  const basicAuthEnv = options.basicAuthFile ? loadEnvFile(options.basicAuthFile) : {};

  const bearerToken = pickBearerToken(options, controlSecrets);
  const basicAuth = pickBasicAuth(options, basicAuthEnv);

  if (!bearerToken) {
    throw new Error("No bearer token available. Pass --bearer-token or --control-secrets.");
  }

  if (!basicAuth) {
    throw new Error("No Basic Auth available. Pass --basic-auth or --basic-auth-file.");
  }

  // Without these keys the URLs below would become "https://undefined" and the
  // report would show probe failures instead of the real configuration problem.
  const requiredInventoryKeys = [
    "BAA_CONDUCTOR_HOST",
    "BAA_MINI_DIRECT_HOST",
    "BAA_MAC_DIRECT_HOST",
    "BAA_MINI_TAILSCALE_IP",
    "BAA_MAC_TAILSCALE_IP"
  ];
  const missingInventoryKeys = requiredInventoryKeys.filter((key) => !inventory[key]);
  if (missingInventoryKeys.length > 0) {
    throw new Error(
      `Inventory file "${options.envPath}" is missing required keys: ${missingInventoryKeys.join(", ")}.`
    );
  }

  const controlApiBase = inventory.BAA_CONTROL_API_BASE || DEFAULT_CONTROL_API_BASE;
  const conductorPort = Number.parseInt(inventory.BAA_CONDUCTOR_PORT || "", 10) || DEFAULT_CONDUCTOR_PORT;
  const statusApiPort = DEFAULT_STATUS_API_PORT;
  const publicBaseUrl = `https://${inventory.BAA_CONDUCTOR_HOST}`;
  const miniDirectBaseUrl = `https://${inventory.BAA_MINI_DIRECT_HOST}`;
  const macDirectBaseUrl = `https://${inventory.BAA_MAC_DIRECT_HOST}`;
  const miniStatusBaseUrl = `http://${inventory.BAA_MINI_TAILSCALE_IP}:${statusApiPort}`;
  const macStatusBaseUrl = `http://${inventory.BAA_MAC_TAILSCALE_IP}:${statusApiPort}`;

  const authHeaders = {
    Authorization: `Basic ${Buffer.from(basicAuth).toString("base64")}`
  };

  // All probes run concurrently so the snapshot reflects a single moment as
  // closely as possible.
  const [controlStateProbe, publicConductor, miniNoAuth, miniDirect, macNoAuth, macDirect, miniStatus, macStatus] =
    await Promise.all([
      fetchProbe(`${controlApiBase.replace(/\/+$/u, "")}/v1/system/state`, {
        headers: {
          Accept: "application/json",
          Authorization: `Bearer ${bearerToken}`
        },
        timeoutMs: options.timeoutMs
      }),
      probeConductor(publicBaseUrl, {}, options.timeoutMs),
      fetchProbe(`${miniDirectBaseUrl}/healthz`, { timeoutMs: options.timeoutMs }),
      probeConductor(miniDirectBaseUrl, authHeaders, options.timeoutMs),
      fetchProbe(`${macDirectBaseUrl}/healthz`, { timeoutMs: options.timeoutMs }),
      probeConductor(macDirectBaseUrl, authHeaders, options.timeoutMs),
      probeStatusApi(miniStatusBaseUrl, options.timeoutMs),
      probeStatusApi(macStatusBaseUrl, options.timeoutMs)
    ]);

  let controlState = null;
  if (!controlStateProbe.error && controlStateProbe.body) {
    try {
      controlState = normalizeControlState(JSON.parse(controlStateProbe.body));
    } catch (error) {
      controlStateProbe.parseError = error instanceof Error ? error.message : String(error);
    }
  }

  // With --expect-leader any, the leader is inferred from the control-api
  // holder id; if that fails the value stays "any" and role checks are skipped.
  const inferredLeader = options.expectLeader === "any" ? expectedLeaderFromControl(controlState) : options.expectLeader;
  const expectedLeader = inferredLeader || options.expectLeader;
  const expectedStandby = expectedLeader === "mini" ? "mac" : expectedLeader === "mac" ? "mini" : null;

  const issues = [];

  addIssue(issues, controlStateProbe.httpStatus === 200, `control-api /v1/system/state HTTP ${controlStateProbe.httpStatus ?? "error"}`);
  addIssue(issues, controlState?.ok === true, "control-api /v1/system/state did not return ok=true");
  addIssue(issues, responseMatches(publicConductor.healthz, 200, "ok"), "public /healthz did not return 200(ok)");
  addIssue(issues, responseMatches(publicConductor.readyz, 200, "ready"), "public /readyz did not return 200(ready)");
  addIssue(issues, responseMatches(publicConductor.rolez, 200, "leader"), "public /rolez did not return 200(leader)");
  addIssue(issues, miniNoAuth.httpStatus === 401, `mini direct no-auth /healthz expected 401, got ${miniNoAuth.httpStatus ?? "error"}`);
  addIssue(issues, macNoAuth.httpStatus === 401, `mac direct no-auth /healthz expected 401, got ${macNoAuth.httpStatus ?? "error"}`);
  addIssue(issues, responseMatches(miniDirect.healthz, 200, "ok"), "mini direct /healthz did not return 200(ok)");
  addIssue(issues, responseMatches(miniDirect.readyz, 200, "ready"), "mini direct /readyz did not return 200(ready)");
  addIssue(issues, responseMatches(macDirect.healthz, 200, "ok"), "mac direct /healthz did not return 200(ok)");
  addIssue(issues, responseMatches(macDirect.readyz, 200, "ready"), "mac direct /readyz did not return 200(ready)");

  if (expectedLeader === "mini") {
    addIssue(issues, responseMatches(miniDirect.rolez, 200, "leader"), "mini direct /rolez did not return 200(leader)");
    addIssue(issues, responseMatches(macDirect.rolez, 200, "standby"), "mac direct /rolez did not return 200(standby)");
    addIssue(issues, controlState?.holderId?.startsWith("mini-") === true, `control-api holder_id did not point at mini: ${controlState?.holderId ?? "null"}`);
  } else if (expectedLeader === "mac") {
    addIssue(issues, responseMatches(macDirect.rolez, 200, "leader"), "mac direct /rolez did not return 200(leader)");
    addIssue(issues, responseMatches(miniDirect.rolez, 200, "standby"), "mini direct /rolez did not return 200(standby)");
    addIssue(issues, controlState?.holderId?.startsWith("mac-") === true, `control-api holder_id did not point at mac: ${controlState?.holderId ?? "null"}`);
  }

  for (const [nodeName, statusProbe] of [
    ["mini", miniStatus],
    ["mac", macStatus]
  ]) {
    addIssue(issues, responseMatches(statusProbe.healthz, 200, "ok"), `${nodeName} status-api /healthz did not return 200(ok)`);
    addIssue(issues, statusProbe.status.httpStatus === 200, `${nodeName} status-api /v1/status HTTP ${statusProbe.status.httpStatus ?? "error"}`);
    addIssue(issues, statusProbe.summary?.ok === true, `${nodeName} status-api /v1/status did not return ok=true`);
    addIssue(
      issues,
      statusMatchesControl(controlState, statusProbe.summary),
      `${nodeName} status-api snapshot does not match control-api state`
    );
  }

  const result = {
    ok: issues.length === 0,
    observedAt: new Date().toISOString(),
    inventory: {
      conductorHost: inventory.BAA_CONDUCTOR_HOST,
      envPath: options.envPath,
      macDirectHost: inventory.BAA_MAC_DIRECT_HOST,
      macTailscaleIp: inventory.BAA_MAC_TAILSCALE_IP,
      miniDirectHost: inventory.BAA_MINI_DIRECT_HOST,
      miniTailscaleIp: inventory.BAA_MINI_TAILSCALE_IP,
      publicIpv4: inventory.BAA_PUBLIC_IPV4 || null,
      proxies: {
        conductor: inventory.BAA_CF_PROXY_CONDUCTOR ?? null,
        mac: inventory.BAA_CF_PROXY_MAC ?? null,
        mini: inventory.BAA_CF_PROXY_MINI ?? null
      },
      statusApiPort,
      conductorPort
    },
    expectations: {
      expectedLeader,
      requestedExpectLeader: options.expectLeader,
      expectedStandby
    },
    controlApi: {
      baseUrl: controlApiBase,
      probe: controlStateProbe,
      state: controlState
    },
    conductors: {
      public: publicConductor,
      miniDirect: {
        noAuthHealthz: miniNoAuth,
        auth: miniDirect
      },
      macDirect: {
        noAuthHealthz: macNoAuth,
        auth: macDirect
      }
    },
    statusApis: {
      mini: miniStatus,
      mac: macStatus
    },
    issues
  };

  const json = JSON.stringify(result, null, options.compact ? 0 : 2);
  process.stdout.write(`${json}\n`);

  if (options.strict && !result.ok) {
    // Set the exit code and let the process end on its own, so a pending
    // stdout write is not cut short by a forced process.exit().
    process.exitCode = 1;
  }
}

main().catch((error) => {
  const message = error instanceof Error ? error.message : String(error);
  process.stderr.write(`${basename(process.argv[1])}: ${message}\n`);
  process.exit(1);
});