baa-conductor


baa-conductor / scripts / runtime
im_wower  ·  2026-03-26

check-node.sh

  1#!/usr/bin/env bash
  2set -euo pipefail
  3
  4SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
  5# shellcheck source=./common.sh
  6source "${SCRIPT_DIR}/common.sh"
  7
  8usage() {
  9  cat <<'EOF'
 10Usage:
 11  scripts/runtime/check-node.sh [options]
 12
 13Options:
 14  --node mini                  Select node defaults. Defaults to mini.
 15  --scope agent|daemon         Expected launchd scope. Defaults to agent.
 16  --service NAME               Add one service to the runtime check set. Repeatable.
 17  --all-services               Check conductor, codexd, worker-runner, and status-api.
 18  --repo-dir PATH              Repo root used to derive runtime paths.
 19  --home-dir PATH              HOME value expected in installed plist files.
 20  --install-dir PATH           Validate installed copies under this directory.
 21  --shared-token TOKEN         Expect this exact token in installed copies.
 22  --shared-token-file PATH     Read the expected token from a file.
 23  --public-api-base URL        Expected conductor BAA_CONDUCTOR_PUBLIC_API_BASE in installed copies.
 24  --control-api-base URL       Legacy alias for --public-api-base.
 25  --local-api-base URL         Conductor local API base URL. Defaults to 127.0.0.1:4317.
 26  --local-api-allowed-hosts CSV
 27                               Expected BAA_CONDUCTOR_LOCAL_API_ALLOWED_HOSTS in installed copies.
 28  --codexd-api-base URL        codexd local API base URL. Defaults to 127.0.0.1:4319.
 29  --status-api-base URL        Status API base URL. Defaults to 127.0.0.1:4318.
 30  --status-api-host HOST       Expected BAA_STATUS_API_HOST in installed copies.
 31  --username NAME              Expected UserName for LaunchDaemons.
 32  --domain TARGET              launchctl domain target for --check-loaded.
 33  --check-loaded               Also require launchctl print to succeed for each service.
 34  --expected-rolez VALUE       Expected conductor /rolez body: leader or any.
 35  --skip-static-check          Skip the underlying check-launchd.sh pass.
 36  --skip-port-check            Skip local TCP LISTEN checks.
 37  --skip-process-check         Skip host process command-line checks.
 38  --skip-http-check            Skip conductor/codexd/status-api HTTP probes.
 39  --skip-log-check             Skip launchd stdout/stderr file checks.
 40  --help                       Show this help text.
 41
 42Notes:
 43  The default runtime check set is conductor + codexd. Use --service status-api
 44  to include the optional local read-only observer, or --all-services to also
 45  include worker-runner.
 46  status-api defaults to BAA_CONDUCTOR_LOCAL_API /v1/system/state; launchd no
 47  longer writes conductor public-api base env vars for it.
 48  conductor HTTP probes include /v1/codex to ensure proxy wiring to codexd.
 49  For codexd, the HTTP probes only cover /healthz and /v1/codexd/status.
 50  /v1/codexd/runs* and codex exec are not part of node verification.
 51EOF
 52}
 53
 54require_command awk
 55require_command curl
 56require_command lsof
 57require_command ps
 58
 59node="mini"
 60scope="agent"
 61repo_dir="${BAA_RUNTIME_REPO_DIR_DEFAULT}"
 62home_dir="$(default_home_dir)"
 63install_dir=""
 64shared_token=""
 65shared_token_file=""
 66public_api_base=""
 67legacy_control_api_base=""
 68local_api_base="http://100.71.210.78:4317"
 69local_api_allowed_hosts="${BAA_CONDUCTOR_LOCAL_API_ALLOWED_HOSTS:-100.71.210.78}"
 70codexd_api_base="${BAA_RUNTIME_DEFAULT_CODEXD_LOCAL_API}"
 71status_api_base="http://100.71.210.78:4318"
 72status_api_host="${BAA_STATUS_API_HOST:-100.71.210.78}"
 73username="$(default_username)"
 74domain_target=""
 75check_loaded="0"
 76expected_rolez="any"
 77skip_static_check="0"
 78skip_port_check="0"
 79skip_process_check="0"
 80skip_http_check="0"
 81skip_log_check="0"
 82services=()
 83
 84while [[ $# -gt 0 ]]; do
 85  case "$1" in
 86    --node)
 87      node="$2"
 88      shift 2
 89      ;;
 90    --scope)
 91      scope="$2"
 92      shift 2
 93      ;;
 94    --service)
 95      validate_service "$2"
 96      if ! contains_value "$2" "${services[@]-}"; then
 97        services+=("$2")
 98      fi
 99      shift 2
100      ;;
101    --all-services)
102      while IFS= read -r service; do
103        if ! contains_value "$service" "${services[@]-}"; then
104          services+=("$service")
105        fi
106      done < <(all_services)
107      shift
108      ;;
109    --repo-dir)
110      repo_dir="$2"
111      shift 2
112      ;;
113    --home-dir)
114      home_dir="$2"
115      shift 2
116      ;;
117    --install-dir)
118      install_dir="$2"
119      shift 2
120      ;;
121    --shared-token)
122      shared_token="$2"
123      shift 2
124      ;;
125    --shared-token-file)
126      shared_token_file="$2"
127      shift 2
128      ;;
129    --public-api-base)
130      public_api_base="$2"
131      shift 2
132      ;;
133    --control-api-base)
134      legacy_control_api_base="$2"
135      shift 2
136      ;;
137    --local-api-base)
138      local_api_base="$2"
139      shift 2
140      ;;
141    --local-api-allowed-hosts)
142      local_api_allowed_hosts="$2"
143      shift 2
144      ;;
145    --codexd-api-base)
146      codexd_api_base="$2"
147      shift 2
148      ;;
149    --status-api-base)
150      status_api_base="$2"
151      shift 2
152      ;;
153    --status-api-host)
154      status_api_host="$2"
155      shift 2
156      ;;
157    --username)
158      username="$2"
159      shift 2
160      ;;
161    --domain)
162      domain_target="$2"
163      shift 2
164      ;;
165    --check-loaded)
166      check_loaded="1"
167      shift
168      ;;
169    --expected-rolez)
170      expected_rolez="$2"
171      shift 2
172      ;;
173    --skip-static-check)
174      skip_static_check="1"
175      shift
176      ;;
177    --skip-port-check)
178      skip_port_check="1"
179      shift
180      ;;
181    --skip-process-check)
182      skip_process_check="1"
183      shift
184      ;;
185    --skip-http-check)
186      skip_http_check="1"
187      shift
188      ;;
189    --skip-log-check)
190      skip_log_check="1"
191      shift
192      ;;
193    --help)
194      usage
195      exit 0
196      ;;
197    *)
198      die "Unknown option: $1"
199      ;;
200  esac
201done
202
# Validate the selected node and scope with the shared helpers.
validate_node "$node"
validate_scope "$scope"

# Public API base precedence: --public-api-base, --control-api-base,
# $BAA_CONDUCTOR_PUBLIC_API_BASE, $BAA_CONTROL_API_BASE, then the runtime
# default. Each fallback word is only expanded when everything before it is empty.
public_api_base="${public_api_base:-${legacy_control_api_base:-${BAA_CONDUCTOR_PUBLIC_API_BASE:-${BAA_CONTROL_API_BASE:-${BAA_RUNTIME_DEFAULT_PUBLIC_API_BASE}}}}}"

# Only "any" and "leader" are accepted for --expected-rolez.
if [[ "$expected_rolez" != "any" && "$expected_rolez" != "leader" ]]; then
  die "Unsupported --expected-rolez value: ${expected_rolez}"
fi

# No explicit service selection: use the default node verification set.
if (( ${#services[@]} == 0 )); then
  while IFS= read -r service; do
    services+=("$service")
  done < <(default_node_verification_services)
fi

# Derive the install directory from scope and home when it was not given.
[[ -n "$install_dir" ]] || install_dir="$(default_install_dir "$scope" "$home_dir")"

# A launchctl domain target is only needed when --check-loaded is active.
if [[ "$check_loaded" == "1" ]]; then
  if [[ -z "$domain_target" ]]; then
    domain_target="$(default_domain_target "$scope")"
  fi
fi
238
# Resolve the per-node defaults: conductor host, conductor role and node id.
# The helper's output is captured with a plain assignment first, so a failing
# helper aborts the script under `set -e`. It is then split on whitespace with
# `read`, which avoids glob expansion of the fields and leaves the script's
# positional parameters untouched.
node_defaults_raw="$(resolve_node_defaults "$node")"
node_defaults=()
# `read -d ''` consumes the whole value and reports non-zero at end of input,
# hence the `|| true`.
read -r -d '' -a node_defaults <<<"$node_defaults_raw" || true
if (( ${#node_defaults[@]} < 3 )); then
  die "resolve_node_defaults returned fewer than 3 fields for node: ${node}"
fi
conductor_host="${node_defaults[0]}"
conductor_role="${node_defaults[1]}"
node_id="${node_defaults[2]}"

# Directory holding the launchd stdout/stderr files checked per service.
logs_launchd_dir="${repo_dir}/logs/launchd"
# Result slots written by http_get.
HTTP_STATUS=""
HTTP_BODY=""
247
# Print the given base URL with every trailing "/" removed.
#   $1 - URL to normalise
normalize_base_url() {
  local trimmed="$1"

  until [[ "$trimmed" != */ ]]; do
    trimmed="${trimmed%/}"
  done

  printf '%s\n' "$trimmed"
}
257
# Print the TCP port a service URL refers to.
#   $1 - service name, used to look up a default port via service_default_port
#   $2 - URL to inspect
# Resolution order: an explicit, non-empty port in the URL authority, then the
# service default port, then 443/80 for https/http. Calls die when none apply.
# Userinfo ("user:pw@") and bracketed IPv6 hosts ("[::1]") are skipped so their
# colons are not mistaken for a port separator.
extract_port_from_url() {
  local service="$1"
  local url="$2"
  local authority
  local host_port
  local default_port

  # Isolate the authority: drop the scheme, then everything from the first
  # "/", "?" or "#" onwards, then any userinfo prefix.
  authority="${url#*://}"
  authority="${authority%%[/?#]*}"
  authority="${authority##*@}"

  if [[ "$authority" == \[* ]]; then
    # Bracketed IPv6 literal: a port can only follow the closing bracket.
    host_port="${authority#*\]}"
  else
    host_port="$authority"
  fi

  # An empty port ("host:") is treated the same as no port at all.
  if [[ "$host_port" == *:* && -n "${host_port##*:}" ]]; then
    printf '%s\n' "${host_port##*:}"
    return 0
  fi

  default_port="$(service_default_port "$service")"
  if [[ -n "$default_port" ]]; then
    printf '%s\n' "$default_port"
    return 0
  fi

  case "$url" in
    https://*)
      printf '%s\n' "443"
      ;;
    http://*)
      printf '%s\n' "80"
      ;;
    *)
      die "Could not derive a TCP port from URL: ${url}"
      ;;
  esac
}
290
# Issue a GET request with curl and publish the outcome in two globals:
#   HTTP_STATUS - the value curl reports for %{http_code}
#   HTTP_BODY   - the response body without carriage returns or trailing newlines
# Calls die when curl exits non-zero; the temporary body file is removed on
# both paths.
#   $1 - URL to fetch
http_get() {
  local url="$1"
  local response_file

  response_file="$(mktemp "${TMPDIR:-/tmp}/baa-runtime-http.XXXXXX")"

  if ! HTTP_STATUS="$(curl -sS --max-time 5 -o "$response_file" -w '%{http_code}' "$url")"; then
    rm -f "$response_file"
    die "Request failed: ${url}"
  fi

  HTTP_BODY="$(tr -d '\r' <"$response_file")"
  rm -f "$response_file"

  # Strip any newline still left at the end of the body.
  until [[ "$HTTP_BODY" != *$'\n' ]]; do
    HTTP_BODY="${HTTP_BODY%$'\n'}"
  done
}
309
# Fetch a URL and require both an exact status code and an exact body.
#   $1 - label used in log and error messages
#   $2 - URL to fetch
#   $3 - expected HTTP status
#   $4 - expected body (compared against HTTP_BODY as set by http_get)
# Calls die on the first mismatch, otherwise logs "<label> ok".
assert_http_equals() {
  local label="$1"
  local target_url="$2"
  local want_status="$3"
  local want_body="$4"

  http_get "$target_url"

  [[ "$HTTP_STATUS" == "$want_status" ]] \
    || die "${label} returned HTTP ${HTTP_STATUS}, expected ${want_status}"

  [[ "$HTTP_BODY" == "$want_body" ]] \
    || die "${label} body mismatch: expected '${want_body}', got '${HTTP_BODY}'"

  runtime_log "${label} ok"
}
328
# Fetch a URL and require an exact status code plus a literal substring in
# the body.
#   $1 - label used in log and error messages
#   $2 - URL to fetch
#   $3 - expected HTTP status
#   $4 - text that must occur somewhere in HTTP_BODY
# Calls die on the first mismatch, otherwise logs "<label> ok".
assert_http_contains() {
  local label="$1"
  local target_url="$2"
  local want_status="$3"
  local needle="$4"

  http_get "$target_url"

  if [[ "$HTTP_STATUS" != "$want_status" ]]; then
    die "${label} returned HTTP ${HTTP_STATUS}, expected ${want_status}"
  fi

  # The quoted needle is matched literally, never as a glob pattern.
  case "$HTTP_BODY" in
    *"$needle"*) ;;
    *)
      die "${label} body does not contain expected text: ${needle}"
      ;;
  esac

  runtime_log "${label} ok"
}
347
# Require `launchctl print` to succeed for every selected service.
# Reads the globals `services` and `domain_target`. Calls die, naming the
# failing target, as soon as one service is not loaded.
check_loaded_services() {
  # Kept local so the caller's `service` variable is not overwritten.
  local service
  local label

  require_command launchctl

  for service in "${services[@]}"; do
    label="$(service_label "$service")"
    # Only launchctl's stdout is discarded; its own diagnostics stay visible.
    launchctl print "${domain_target}/${label}" >/dev/null \
      || die "launchctl print failed for ${domain_target}/${label}"
    runtime_log "launchctl loaded: ${label}"
  done
}
356
# Run check-launchd.sh from SCRIPT_DIR, forwarding the settings collected by
# this script. Argument order: fixed settings, --public-api-base (only when a
# selected service uses it), one --service per selected service, then the
# optional token and --check-loaded arguments.
run_static_checks() {
  local -a forwarded
  local svc
  local wants_public_api_base="0"

  forwarded=(
    --node "$node"
    --scope "$scope"
    --repo-dir "$repo_dir"
    --home-dir "$home_dir"
    --install-dir "$install_dir"
    --local-api-base "$local_api_base"
    --local-api-allowed-hosts "$local_api_allowed_hosts"
    --codexd-local-api-base "$codexd_api_base"
    --status-api-host "$status_api_host"
    --username "$username"
  )

  # The public API base is passed at most once, however many services use it.
  for svc in "${services[@]}"; do
    if service_uses_public_api_base "$svc"; then
      wants_public_api_base="1"
      break
    fi
  done
  if [[ "$wants_public_api_base" == "1" ]]; then
    forwarded+=(--public-api-base "$public_api_base")
  fi

  for svc in "${services[@]}"; do
    forwarded+=(--service "$svc")
  done

  if [[ -n "$shared_token" ]]; then
    forwarded+=(--shared-token "$shared_token")
  fi
  if [[ -n "$shared_token_file" ]]; then
    forwarded+=(--shared-token-file "$shared_token_file")
  fi
  if [[ "$check_loaded" == "1" ]]; then
    forwarded+=(--check-loaded --domain "$domain_target")
  fi

  "${SCRIPT_DIR}/check-launchd.sh" "${forwarded[@]}"
}
399
# Require at least one running process whose `ps` line contains a pattern.
#   $1 - service name, used in log and error messages
#   $2 - literal text to look for in each "pid command" line
# Logs the first matching line; calls die when nothing matches.
#
# The process table is captured before any filtering starts, and the filter
# runs inside bash. A filter process that carries the pattern in its own
# arguments could otherwise show up in the `ps` snapshot and satisfy the check
# by itself. Matching in bash also keeps backslashes in the pattern literal.
check_service_process() {
  local service="$1"
  local process_pattern="$2"
  local process_table
  local process_line
  local first_match=""

  process_table="$(ps -axo pid=,command=)"

  # An empty pattern never matches, so it is reported as "not found" below.
  if [[ -n "$process_pattern" ]]; then
    while IFS= read -r process_line; do
      if [[ "$process_line" == *"$process_pattern"* ]]; then
        first_match="$process_line"
        break
      fi
    done <<<"$process_table"
  fi

  if [[ -z "$first_match" ]]; then
    die "${service} process not found for pattern: ${process_pattern}"
  fi

  runtime_log "${service} process ok: ${first_match}"
}
413
# Require some process to be listening on a TCP port, according to lsof.
#   $1 - service name, used in log and error messages
#   $2 - TCP port to look up
# lsof's stderr and exit status are ignored on purpose: only its stdout
# decides, and empty output means nothing is listening.
check_listen_port() {
  local service="$1"
  local port="$2"
  local listeners

  listeners="$(lsof -nP -iTCP:"$port" -sTCP:LISTEN 2>/dev/null)" || true

  [[ -n "$listeners" ]] || die "${service} is not listening on TCP port ${port}"

  runtime_log "${service} listening on TCP ${port}"
}
427
# Require the launchd stdout and stderr log files of a service to exist.
#   $1 - service name
# Paths are derived from the global logs_launchd_dir. The directory of the
# stdout file is asserted first, then the stdout file, then the stderr file.
check_service_logs() {
  local service="$1"
  local out_log
  local err_log
  local log_file

  out_log="$(service_stdout_path "$logs_launchd_dir" "$service")"
  err_log="$(service_stderr_path "$logs_launchd_dir" "$service")"

  assert_directory "$(dirname -- "$out_log")"
  for log_file in "$out_log" "$err_log"; do
    assert_file "$log_file"
  done

  runtime_log "${service} log files present"
}
442
# Runtime probes for the conductor service.
#   $1 - conductor base URL (no trailing slash)
# Unless skip_port_check is "1", the URL's port must be in LISTEN state.
# Unless skip_http_check is "1", /healthz, /readyz and /v1/codex must answer
# as expected, and /rolez must return 200 with a body that is non-empty
# (expected_rolez "any") or equal to expected_rolez.
check_conductor_runtime() {
  local base="$1"
  local listen_port

  if [[ "$skip_port_check" != "1" ]]; then
    listen_port="$(extract_port_from_url "conductor" "$base")"
    check_listen_port "conductor" "$listen_port"
  fi

  if [[ "$skip_http_check" == "1" ]]; then
    return 0
  fi

  assert_http_equals "conductor /healthz" "${base}/healthz" "200" "ok"
  assert_http_equals "conductor /readyz" "${base}/readyz" "200" "ready"
  assert_http_contains "conductor /v1/codex" "${base}/v1/codex" "200" "\"backend\": \"independent_codexd\""

  http_get "${base}/rolez"
  [[ "$HTTP_STATUS" == "200" ]] \
    || die "conductor /rolez returned HTTP ${HTTP_STATUS}, expected 200"

  if [[ "$expected_rolez" == "any" ]]; then
    [[ -n "$HTTP_BODY" ]] || die "conductor /rolez returned an empty body"
  elif [[ "$HTTP_BODY" != "$expected_rolez" ]]; then
    die "conductor /rolez mismatch: expected '${expected_rolez}', got '${HTTP_BODY}'"
  fi

  runtime_log "conductor /rolez ok: ${HTTP_BODY}"
}
478
# Runtime probes for the status-api service.
#   $1 - status-api base URL (no trailing slash)
# Unless skip_port_check is "1", the URL's port must be in LISTEN state.
# Unless skip_http_check is "1", /healthz, /v1/status and /v1/status/ui must
# answer 200 with the expected body text.
check_status_api_runtime() {
  local base="$1"
  local listen_port

  if [[ "$skip_port_check" != "1" ]]; then
    listen_port="$(extract_port_from_url "status-api" "$base")"
    check_listen_port "status-api" "$listen_port"
  fi

  if [[ "$skip_http_check" == "1" ]]; then
    return 0
  fi

  assert_http_equals "status-api /healthz" "${base}/healthz" "200" "ok"
  assert_http_contains "status-api /v1/status" "${base}/v1/status" "200" "\"ok\": true"
  assert_http_contains "status-api /v1/status/ui" "${base}/v1/status/ui" "200" "BAA Conductor Status"
}
494
# Strip trailing slashes from each API base URL so the "/path" suffixes
# appended by the runtime checks never produce a double slash.
for base_var in local_api_base codexd_api_base status_api_base; do
  normalized_base="$(normalize_base_url "${!base_var}")"
  printf -v "$base_var" '%s' "$normalized_base"
done
unset base_var normalized_base
498
# Runtime probes for the codexd service.
#   $1 - codexd base URL (no trailing slash)
# Unless skip_port_check is "1", the URL's port must be in LISTEN state.
# Unless skip_http_check is "1", /healthz and /v1/codexd/status must answer
# 200 with the expected JSON fragments.
check_codexd_runtime() {
  local base="$1"
  local listen_port

  if [[ "$skip_port_check" != "1" ]]; then
    listen_port="$(extract_port_from_url "codexd" "$base")"
    check_listen_port "codexd" "$listen_port"
  fi

  if [[ "$skip_http_check" == "1" ]]; then
    return 0
  fi

  assert_http_contains "codexd /healthz" "${base}/healthz" "200" "\"status\": \"ok\""
  assert_http_contains "codexd /v1/codexd/status" "${base}/v1/codexd/status" "200" "\"mode\": \"app-server\""
}
513
# Static pass: check-launchd.sh normally runs (and receives --check-loaded
# itself). When it is skipped, the launchctl check is done directly if requested.
if [[ "$skip_static_check" == "1" ]]; then
  if [[ "$check_loaded" == "1" ]]; then
    check_loaded_services
  fi
else
  run_static_checks
fi

# Per-service runtime checks: process, log files, then service-specific probes.
for service in "${services[@]}"; do
  if [[ "$skip_process_check" != "1" ]]; then
    check_service_process "$service" "$(service_process_match "$repo_dir" "$service" "$conductor_host" "$conductor_role")"
  fi

  if [[ "$skip_log_check" != "1" ]]; then
    check_service_logs "$service"
  fi

  # Any other service gets no port/HTTP probe here.
  if [[ "$service" == "conductor" ]]; then
    check_conductor_runtime "$local_api_base"
  elif [[ "$service" == "codexd" ]]; then
    check_codexd_runtime "$codexd_api_base"
  elif [[ "$service" == "status-api" ]]; then
    check_status_api_runtime "$status_api_base"
  fi
done

runtime_log "node checks passed for ${node} (${node_id})"