baa-conductor

git clone 

commit
532c4ee
parent
2778b51
author
im_wower
date
2026-03-22 21:22:23 +0800 CST
fix(runtime): recover conductor after launchd reload
3 files changed,  +192, -0
M docs/runtime/README.md
+1, -0
1@@ -63,3 +63,4 @@ Firefox WS 说明:
2   - `./scripts/runtime/start-launchd.sh`
3 - 重启:
4   - `./scripts/runtime/restart-launchd.sh`
5+  - 脚本会在 reload 后等待 `conductor /healthz` 恢复;如果服务处于 loaded 但未提供 HTTP,会自动再做一次 `launchctl kickstart -k`
M docs/runtime/launchd.md
+31, -0
 1@@ -71,6 +71,22 @@
 2 ./scripts/runtime/restart-launchd.sh
 3 ```
 4 
 5+当前 `restart-launchd.sh` / `reload-launchd.sh` 在 `bootstrap + kickstart` 结束后不会只看 `launchctl` 返回码,而会继续检查 `conductor /healthz`。
 6+
 7+- 优先读取已安装 `conductor` plist 里的 `BAA_CONDUCTOR_LOCAL_API`
 8+- 默认检查 `<local-api-base>/healthz`
 9+- 如果第一次 reload 后仍然是 loaded but not serving,会自动再执行一次:
10+
11+```bash
12+launchctl kickstart -k gui/$(id -u)/so.makefile.baa-conductor
13+```
14+
15+- 如果二次 kickstart 后仍未恢复,脚本会返回非零,并输出:
16+  - 最后一次 health probe 的 curl 结果
17+  - `launchctl print` 诊断
18+  - `conductor` stdout/stderr 日志尾部
19+  - 手工兜底命令提示
20+
21 ## 1. 构建
22 
23 ```bash
24@@ -126,6 +142,21 @@ npx --yes pnpm -r build
25   --service status-api
26 ```
27 
28+推荐最小验证:
29+
30+```bash
31+./scripts/runtime/restart-launchd.sh
32+curl -fsS http://100.71.210.78:4317/healthz
33+curl -fsS http://100.71.210.78:4317/rolez
34+```
35+
36+预期:
37+
38+- `restart-launchd.sh` 直接成功返回
39+- `/healthz` 返回 `ok`
40+- `/rolez` 返回当前角色
41+- 不需要再手工执行一次 `launchctl kickstart -k gui/$(id -u)/so.makefile.baa-conductor`
42+
43 ## 6. 节点检查
44 
45 ```bash
M scripts/runtime/reload-launchd.sh
+160, -0
  1@@ -25,6 +25,8 @@ EOF
  2 
  3 require_command launchctl
  4 require_command plutil
  5+require_command curl
  6+assert_file /usr/libexec/PlistBuddy
  7 
  8 scope="agent"
  9 home_dir="$(default_home_dir)"
 10@@ -101,6 +103,10 @@ if [[ -z "$domain_target" ]]; then
 11   domain_target="$(default_domain_target "$scope")"
 12 fi
 13 
 14+CURL_LAST_EXIT_CODE="0"
 15+CURL_LAST_HTTP_STATUS="000"
 16+CURL_LAST_ERROR=""
 17+
 18 bootout_service() {
 19   local plist_path="$1"
 20 
 21@@ -112,6 +118,156 @@ bootout_service() {
 22   launchctl bootout "$domain_target" "$plist_path" 2>/dev/null || true
 23 }
 24 
 25+normalize_url() {
 26+  local value="$1"
 27+
 28+  while [[ "$value" == */ ]]; do
 29+    value="${value%/}"
 30+  done
 31+
 32+  printf '%s\n' "$value"
 33+}
 34+
 35+read_plist_value_or_default() {
 36+  local plist_path="$1"
 37+  local key="$2"
 38+  local default_value="$3"
 39+  local value=""
 40+
 41+  if value="$(/usr/libexec/PlistBuddy -c "Print ${key}" "$plist_path" 2>/dev/null)"; then
 42+    printf '%s\n' "$value"
 43+    return 0
 44+  fi
 45+
 46+  printf '%s\n' "$default_value"
 47+}
 48+
 49+probe_http_healthz() {
 50+  local url="$1"
 51+  local stderr_file
 52+  local http_status="000"
 53+
 54+  stderr_file="$(mktemp "${TMPDIR:-/tmp}/baa-runtime-reload-curl.XXXXXX")"
 55+  CURL_LAST_ERROR=""
 56+
 57+  if http_status="$(curl -sS --max-time 5 -o /dev/null -w '%{http_code}' "$url" 2>"$stderr_file")"; then
 58+    CURL_LAST_EXIT_CODE="0"
 59+  else
 60+    CURL_LAST_EXIT_CODE="$?"
 61+  fi
 62+
 63+  CURL_LAST_HTTP_STATUS="${http_status:-000}"
 64+  CURL_LAST_ERROR="$(tr -d '\r' <"$stderr_file")"
 65+  rm -f "$stderr_file"
 66+
 67+  [[ "$CURL_LAST_EXIT_CODE" == "0" && "$CURL_LAST_HTTP_STATUS" == "200" ]]
 68+}
 69+
 70+wait_for_http_healthz() {
 71+  local name="$1"
 72+  local url="$2"
 73+  local attempts="${3:-30}"
 74+  local delay="${4:-1}"
 75+  local index
 76+
 77+  for ((index = 1; index <= attempts; index += 1)); do
 78+    if probe_http_healthz "$url"; then
 79+      runtime_log "${name} is ready: ${url}"
 80+      return 0
 81+    fi
 82+
 83+    sleep "$delay"
 84+  done
 85+
 86+  return 1
 87+}
 88+
 89+print_log_tail() {
 90+  local label="$1"
 91+  local path="$2"
 92+
 93+  if [[ -z "$path" ]]; then
 94+    runtime_error "${label}: unavailable"
 95+    return 0
 96+  fi
 97+
 98+  if [[ ! -f "$path" ]]; then
 99+    runtime_error "${label}: missing ${path}"
100+    return 0
101+  fi
102+
103+  runtime_error "${label}: ${path} (last 20 lines)"
104+  tail -n 20 "$path" >&2
105+}
106+
107+print_conductor_diagnostics() {
108+  local plist_path="$1"
109+  local label="$2"
110+  local healthz_url="$3"
111+  local stdout_path
112+  local stderr_path
113+
114+  stdout_path="$(read_plist_value_or_default "$plist_path" ":StandardOutPath" "")"
115+  stderr_path="$(read_plist_value_or_default "$plist_path" ":StandardErrorPath" "")"
116+
117+  runtime_error "conductor did not recover after launchd reload"
118+  runtime_error "health probe: ${healthz_url}"
119+  runtime_error "last curl result: exit=${CURL_LAST_EXIT_CODE} http=${CURL_LAST_HTTP_STATUS:-000}"
120+  if [[ -n "$CURL_LAST_ERROR" ]]; then
121+    runtime_error "last curl stderr: ${CURL_LAST_ERROR}"
122+  fi
123+
124+  runtime_error "launchctl print ${domain_target}/${label}:"
125+  if ! launchctl print "${domain_target}/${label}" >&2; then
126+    runtime_error "launchctl print failed for ${domain_target}/${label}"
127+  fi
128+
129+  print_log_tail "conductor stdout" "$stdout_path"
130+  print_log_tail "conductor stderr" "$stderr_path"
131+  runtime_error "manual recovery hint: launchctl kickstart -k ${domain_target}/${label}"
132+}
133+
134+recover_conductor_after_reload() {
135+  local service="conductor"
136+  local label
137+  local plist_path
138+  local local_api_base
139+  local healthz_url
140+
141+  if ! contains_value "$service" "${services[@]}"; then
142+    return 0
143+  fi
144+
145+  label="$(service_label "$service")"
146+  plist_path="$(service_install_path "$install_dir" "$service")"
147+  local_api_base="$(read_plist_value_or_default "$plist_path" ":EnvironmentVariables:BAA_CONDUCTOR_LOCAL_API" "$BAA_RUNTIME_DEFAULT_LOCAL_API")"
148+  healthz_url="$(normalize_url "$local_api_base")/healthz"
149+
150+  if wait_for_http_healthz "$service" "$healthz_url"; then
151+    return 0
152+  fi
153+
154+  if [[ "$skip_kickstart" == "1" ]]; then
155+    print_conductor_diagnostics "$plist_path" "$label" "$healthz_url"
156+    die "conductor stayed unhealthy after reload with --skip-kickstart"
157+  fi
158+
159+  runtime_error "conductor was loaded but not serving; retrying launchctl kickstart -k ${domain_target}/${label}"
160+  if ! launchctl kickstart -k "${domain_target}/${label}"; then
161+    runtime_error "retry kickstart returned non-zero for ${domain_target}/${label}"
162+    print_conductor_diagnostics "$plist_path" "$label" "$healthz_url"
163+    die "conductor retry kickstart failed after reload"
164+  fi
165+
166+  if wait_for_http_healthz "$service" "$healthz_url"; then
167+    runtime_log "conductor recovered after retry kickstart"
168+    return 0
169+  fi
170+
171+  print_conductor_diagnostics "$plist_path" "$label" "$healthz_url"
172+  die "conductor failed to recover after reload"
173+}
174+
175 for service in "${services[@]}"; do
176   plist_path="$(service_install_path "$install_dir" "$service")"
177   assert_file "$plist_path"
178@@ -134,4 +290,8 @@ if [[ "$skip_kickstart" != "1" ]]; then
179   done
180 fi
181 
182+if [[ "$dry_run" != "1" ]]; then
183+  recover_conductor_after_reload
184+fi
185+
186 runtime_log "launchd reload completed for ${domain_target}"