im_wower
·
2026-03-26
check-node.sh
1#!/usr/bin/env bash
2set -euo pipefail
3
4SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
5# shellcheck source=./common.sh
6source "${SCRIPT_DIR}/common.sh"
7
8usage() {
9 cat <<'EOF'
10Usage:
11 scripts/runtime/check-node.sh [options]
12
13Options:
14 --node mini Select node defaults. Defaults to mini.
15 --scope agent|daemon Expected launchd scope. Defaults to agent.
16 --service NAME Add one service to the runtime check set. Repeatable.
17 --all-services Check conductor, codexd, worker-runner, and status-api.
18 --repo-dir PATH Repo root used to derive runtime paths.
19 --home-dir PATH HOME value expected in installed plist files.
20 --install-dir PATH Validate installed copies under this directory.
21 --shared-token TOKEN Expect this exact token in installed copies.
22 --shared-token-file PATH Read the expected token from a file.
23 --public-api-base URL Expected conductor BAA_CONDUCTOR_PUBLIC_API_BASE in installed copies.
24 --control-api-base URL Legacy alias for --public-api-base.
25 --local-api-base URL Conductor local API base URL. Defaults to 127.0.0.1:4317.
26 --local-api-allowed-hosts CSV
27 Expected BAA_CONDUCTOR_LOCAL_API_ALLOWED_HOSTS in installed copies.
28 --codexd-api-base URL codexd local API base URL. Defaults to 127.0.0.1:4319.
29 --status-api-base URL Status API base URL. Defaults to 127.0.0.1:4318.
30 --status-api-host HOST Expected BAA_STATUS_API_HOST in installed copies.
31 --username NAME Expected UserName for LaunchDaemons.
32 --domain TARGET launchctl domain target for --check-loaded.
33 --check-loaded Also require launchctl print to succeed for each service.
34 --expected-rolez VALUE Expected conductor /rolez body: leader or any.
35 --skip-static-check Skip the underlying check-launchd.sh pass.
36 --skip-port-check Skip local TCP LISTEN checks.
37 --skip-process-check Skip host process command-line checks.
38 --skip-http-check Skip conductor/codexd/status-api HTTP probes.
39 --skip-log-check Skip launchd stdout/stderr file checks.
40 --help Show this help text.
41
42Notes:
43 The default runtime check set is conductor + codexd. Use --service status-api
44 to include the optional local read-only observer, or --all-services to also
45 include worker-runner.
46 status-api defaults to BAA_CONDUCTOR_LOCAL_API /v1/system/state; launchd no
47 longer writes conductor public-api base env vars for it.
48 conductor HTTP probes include /v1/codex to ensure proxy wiring to codexd.
49 For codexd, the HTTP probes only cover /healthz and /v1/codexd/status.
50 /v1/codexd/runs* and codex exec are not part of node verification.
51EOF
52}
53
54require_command awk
55require_command curl
56require_command lsof
57require_command ps
58
59node="mini"
60scope="agent"
61repo_dir="${BAA_RUNTIME_REPO_DIR_DEFAULT}"
62home_dir="$(default_home_dir)"
63install_dir=""
64shared_token=""
65shared_token_file=""
66public_api_base=""
67legacy_control_api_base=""
68local_api_base="http://100.71.210.78:4317"
69local_api_allowed_hosts="${BAA_CONDUCTOR_LOCAL_API_ALLOWED_HOSTS:-100.71.210.78}"
70codexd_api_base="${BAA_RUNTIME_DEFAULT_CODEXD_LOCAL_API}"
71status_api_base="http://100.71.210.78:4318"
72status_api_host="${BAA_STATUS_API_HOST:-100.71.210.78}"
73username="$(default_username)"
74domain_target=""
75check_loaded="0"
76expected_rolez="any"
77skip_static_check="0"
78skip_port_check="0"
79skip_process_check="0"
80skip_http_check="0"
81skip_log_check="0"
82services=()
83
84while [[ $# -gt 0 ]]; do
85 case "$1" in
86 --node)
87 node="$2"
88 shift 2
89 ;;
90 --scope)
91 scope="$2"
92 shift 2
93 ;;
94 --service)
95 validate_service "$2"
96 if ! contains_value "$2" "${services[@]-}"; then
97 services+=("$2")
98 fi
99 shift 2
100 ;;
101 --all-services)
102 while IFS= read -r service; do
103 if ! contains_value "$service" "${services[@]-}"; then
104 services+=("$service")
105 fi
106 done < <(all_services)
107 shift
108 ;;
109 --repo-dir)
110 repo_dir="$2"
111 shift 2
112 ;;
113 --home-dir)
114 home_dir="$2"
115 shift 2
116 ;;
117 --install-dir)
118 install_dir="$2"
119 shift 2
120 ;;
121 --shared-token)
122 shared_token="$2"
123 shift 2
124 ;;
125 --shared-token-file)
126 shared_token_file="$2"
127 shift 2
128 ;;
129 --public-api-base)
130 public_api_base="$2"
131 shift 2
132 ;;
133 --control-api-base)
134 legacy_control_api_base="$2"
135 shift 2
136 ;;
137 --local-api-base)
138 local_api_base="$2"
139 shift 2
140 ;;
141 --local-api-allowed-hosts)
142 local_api_allowed_hosts="$2"
143 shift 2
144 ;;
145 --codexd-api-base)
146 codexd_api_base="$2"
147 shift 2
148 ;;
149 --status-api-base)
150 status_api_base="$2"
151 shift 2
152 ;;
153 --status-api-host)
154 status_api_host="$2"
155 shift 2
156 ;;
157 --username)
158 username="$2"
159 shift 2
160 ;;
161 --domain)
162 domain_target="$2"
163 shift 2
164 ;;
165 --check-loaded)
166 check_loaded="1"
167 shift
168 ;;
169 --expected-rolez)
170 expected_rolez="$2"
171 shift 2
172 ;;
173 --skip-static-check)
174 skip_static_check="1"
175 shift
176 ;;
177 --skip-port-check)
178 skip_port_check="1"
179 shift
180 ;;
181 --skip-process-check)
182 skip_process_check="1"
183 shift
184 ;;
185 --skip-http-check)
186 skip_http_check="1"
187 shift
188 ;;
189 --skip-log-check)
190 skip_log_check="1"
191 shift
192 ;;
193 --help)
194 usage
195 exit 0
196 ;;
197 *)
198 die "Unknown option: $1"
199 ;;
200 esac
201done
202
203validate_node "$node"
204validate_scope "$scope"
205
206if [[ -z "$public_api_base" ]]; then
207 if [[ -n "$legacy_control_api_base" ]]; then
208 public_api_base="$legacy_control_api_base"
209 elif [[ -n "${BAA_CONDUCTOR_PUBLIC_API_BASE:-}" ]]; then
210 public_api_base="${BAA_CONDUCTOR_PUBLIC_API_BASE}"
211 elif [[ -n "${BAA_CONTROL_API_BASE:-}" ]]; then
212 public_api_base="${BAA_CONTROL_API_BASE}"
213 else
214 public_api_base="${BAA_RUNTIME_DEFAULT_PUBLIC_API_BASE}"
215 fi
216fi
217
218case "$expected_rolez" in
219 any | leader) ;;
220 *)
221 die "Unsupported --expected-rolez value: ${expected_rolez}"
222 ;;
223esac
224
225if [[ "${#services[@]}" -eq 0 ]]; then
226 while IFS= read -r service; do
227 services+=("$service")
228 done < <(default_node_verification_services)
229fi
230
231if [[ -z "$install_dir" ]]; then
232 install_dir="$(default_install_dir "$scope" "$home_dir")"
233fi
234
235if [[ "$check_loaded" == "1" && -z "$domain_target" ]]; then
236 domain_target="$(default_domain_target "$scope")"
237fi
238
239set -- $(resolve_node_defaults "$node")
240conductor_host="$1"
241conductor_role="$2"
242node_id="$3"
243
244logs_launchd_dir="${repo_dir}/logs/launchd"
245HTTP_STATUS=""
246HTTP_BODY=""
247
248normalize_base_url() {
249 local value="$1"
250
251 while [[ "$value" == */ ]]; do
252 value="${value%/}"
253 done
254
255 printf '%s\n' "$value"
256}
257
258extract_port_from_url() {
259 local service="$1"
260 local url="$2"
261 local authority
262 local default_port
263
264 authority="${url#*://}"
265 authority="${authority%%/*}"
266
267 if [[ "$authority" == *:* ]]; then
268 printf '%s\n' "${authority##*:}"
269 return 0
270 fi
271
272 default_port="$(service_default_port "$service")"
273 if [[ -n "$default_port" ]]; then
274 printf '%s\n' "$default_port"
275 return 0
276 fi
277
278 case "$url" in
279 https://*)
280 printf '%s\n' "443"
281 ;;
282 http://*)
283 printf '%s\n' "80"
284 ;;
285 *)
286 die "Could not derive a TCP port from URL: ${url}"
287 ;;
288 esac
289}
290
291http_get() {
292 local url="$1"
293 local body_file
294
295 body_file="$(mktemp "${TMPDIR:-/tmp}/baa-runtime-http.XXXXXX")"
296
297 HTTP_STATUS="$(curl -sS --max-time 5 -o "$body_file" -w '%{http_code}' "$url")" || {
298 rm -f "$body_file"
299 die "Request failed: ${url}"
300 }
301
302 HTTP_BODY="$(tr -d '\r' <"$body_file")"
303 rm -f "$body_file"
304
305 while [[ "$HTTP_BODY" == *$'\n' ]]; do
306 HTTP_BODY="${HTTP_BODY%$'\n'}"
307 done
308}
309
310assert_http_equals() {
311 local name="$1"
312 local url="$2"
313 local expected_status="$3"
314 local expected_body="$4"
315
316 http_get "$url"
317
318 if [[ "$HTTP_STATUS" != "$expected_status" ]]; then
319 die "${name} returned HTTP ${HTTP_STATUS}, expected ${expected_status}"
320 fi
321
322 if [[ "$HTTP_BODY" != "$expected_body" ]]; then
323 die "${name} body mismatch: expected '${expected_body}', got '${HTTP_BODY}'"
324 fi
325
326 runtime_log "${name} ok"
327}
328
329assert_http_contains() {
330 local name="$1"
331 local url="$2"
332 local expected_status="$3"
333 local expected_substring="$4"
334
335 http_get "$url"
336
337 if [[ "$HTTP_STATUS" != "$expected_status" ]]; then
338 die "${name} returned HTTP ${HTTP_STATUS}, expected ${expected_status}"
339 fi
340
341 if [[ "$HTTP_BODY" != *"$expected_substring"* ]]; then
342 die "${name} body does not contain expected text: ${expected_substring}"
343 fi
344
345 runtime_log "${name} ok"
346}
347
348check_loaded_services() {
349 require_command launchctl
350
351 for service in "${services[@]}"; do
352 launchctl print "${domain_target}/$(service_label "$service")" >/dev/null
353 runtime_log "launchctl loaded: $(service_label "$service")"
354 done
355}
356
357run_static_checks() {
358 local static_args=()
359 local service
360
361 static_args+=(
362 --node "$node"
363 --scope "$scope"
364 --repo-dir "$repo_dir"
365 --home-dir "$home_dir"
366 --install-dir "$install_dir"
367 --local-api-base "$local_api_base"
368 --local-api-allowed-hosts "$local_api_allowed_hosts"
369 --codexd-local-api-base "$codexd_api_base"
370 --status-api-host "$status_api_host"
371 --username "$username"
372 )
373
374 for service in "${services[@]}"; do
375 if service_uses_public_api_base "$service"; then
376 static_args+=(--public-api-base "$public_api_base")
377 break
378 fi
379 done
380
381 for service in "${services[@]}"; do
382 static_args+=(--service "$service")
383 done
384
385 if [[ -n "$shared_token" ]]; then
386 static_args+=(--shared-token "$shared_token")
387 fi
388
389 if [[ -n "$shared_token_file" ]]; then
390 static_args+=(--shared-token-file "$shared_token_file")
391 fi
392
393 if [[ "$check_loaded" == "1" ]]; then
394 static_args+=(--check-loaded --domain "$domain_target")
395 fi
396
397 "${SCRIPT_DIR}/check-launchd.sh" "${static_args[@]}"
398}
399
400check_service_process() {
401 local service="$1"
402 local process_pattern="$2"
403 local process_lines
404
405 process_lines="$(ps -axo pid=,command= | awk -v pattern="$process_pattern" 'index($0, pattern) > 0 { print }')"
406
407 if [[ -z "$process_lines" ]]; then
408 die "${service} process not found for pattern: ${process_pattern}"
409 fi
410
411 runtime_log "${service} process ok: $(printf '%s\n' "$process_lines" | sed -n '1p')"
412}
413
414check_listen_port() {
415 local service="$1"
416 local port="$2"
417 local socket_lines
418
419 socket_lines="$(lsof -nP -iTCP:"$port" -sTCP:LISTEN 2>/dev/null || true)"
420
421 if [[ -z "$socket_lines" ]]; then
422 die "${service} is not listening on TCP port ${port}"
423 fi
424
425 runtime_log "${service} listening on TCP ${port}"
426}
427
428check_service_logs() {
429 local service="$1"
430 local stdout_path
431 local stderr_path
432
433 stdout_path="$(service_stdout_path "$logs_launchd_dir" "$service")"
434 stderr_path="$(service_stderr_path "$logs_launchd_dir" "$service")"
435
436 assert_directory "$(dirname -- "$stdout_path")"
437 assert_file "$stdout_path"
438 assert_file "$stderr_path"
439
440 runtime_log "${service} log files present"
441}
442
443check_conductor_runtime() {
444 local conductor_base_url="$1"
445 local port
446
447 if [[ "$skip_port_check" != "1" ]]; then
448 port="$(extract_port_from_url "conductor" "$conductor_base_url")"
449 check_listen_port "conductor" "$port"
450 fi
451
452 if [[ "$skip_http_check" != "1" ]]; then
453 assert_http_equals "conductor /healthz" "${conductor_base_url}/healthz" "200" "ok"
454 assert_http_equals "conductor /readyz" "${conductor_base_url}/readyz" "200" "ready"
455 assert_http_contains "conductor /v1/codex" "${conductor_base_url}/v1/codex" "200" "\"backend\": \"independent_codexd\""
456
457 http_get "${conductor_base_url}/rolez"
458 if [[ "$HTTP_STATUS" != "200" ]]; then
459 die "conductor /rolez returned HTTP ${HTTP_STATUS}, expected 200"
460 fi
461
462 case "$expected_rolez" in
463 any)
464 if [[ -z "$HTTP_BODY" ]]; then
465 die "conductor /rolez returned an empty body"
466 fi
467 ;;
468 *)
469 if [[ "$HTTP_BODY" != "$expected_rolez" ]]; then
470 die "conductor /rolez mismatch: expected '${expected_rolez}', got '${HTTP_BODY}'"
471 fi
472 ;;
473 esac
474
475 runtime_log "conductor /rolez ok: ${HTTP_BODY}"
476 fi
477}
478
479check_status_api_runtime() {
480 local status_base_url="$1"
481 local port
482
483 if [[ "$skip_port_check" != "1" ]]; then
484 port="$(extract_port_from_url "status-api" "$status_base_url")"
485 check_listen_port "status-api" "$port"
486 fi
487
488 if [[ "$skip_http_check" != "1" ]]; then
489 assert_http_equals "status-api /healthz" "${status_base_url}/healthz" "200" "ok"
490 assert_http_contains "status-api /v1/status" "${status_base_url}/v1/status" "200" "\"ok\": true"
491 assert_http_contains "status-api /v1/status/ui" "${status_base_url}/v1/status/ui" "200" "BAA Conductor Status"
492 fi
493}
494
495local_api_base="$(normalize_base_url "$local_api_base")"
496codexd_api_base="$(normalize_base_url "$codexd_api_base")"
497status_api_base="$(normalize_base_url "$status_api_base")"
498
499check_codexd_runtime() {
500 local codexd_base_url="$1"
501 local port
502
503 if [[ "$skip_port_check" != "1" ]]; then
504 port="$(extract_port_from_url "codexd" "$codexd_base_url")"
505 check_listen_port "codexd" "$port"
506 fi
507
508 if [[ "$skip_http_check" != "1" ]]; then
509 assert_http_contains "codexd /healthz" "${codexd_base_url}/healthz" "200" "\"status\": \"ok\""
510 assert_http_contains "codexd /v1/codexd/status" "${codexd_base_url}/v1/codexd/status" "200" "\"mode\": \"app-server\""
511 fi
512}
513
514if [[ "$skip_static_check" != "1" ]]; then
515 run_static_checks
516elif [[ "$check_loaded" == "1" ]]; then
517 check_loaded_services
518fi
519
520for service in "${services[@]}"; do
521 if [[ "$skip_process_check" != "1" ]]; then
522 check_service_process "$service" "$(service_process_match "$repo_dir" "$service" "$conductor_host" "$conductor_role")"
523 fi
524
525 if [[ "$skip_log_check" != "1" ]]; then
526 check_service_logs "$service"
527 fi
528
529 case "$service" in
530 conductor)
531 check_conductor_runtime "$local_api_base"
532 ;;
533 codexd)
534 check_codexd_runtime "$codexd_api_base"
535 ;;
536 status-api)
537 check_status_api_runtime "$status_api_base"
538 ;;
539 esac
540done
541
542runtime_log "node checks passed for ${node} (${node_id})"