- commit
- 532c4ee
- parent
- 2778b51
- author
- im_wower
- date
- 2026-03-22 21:22:23 +0800 CST
fix(runtime): recover conductor after launchd reload
3 files changed,
+192,
-0
+1,
-0
1@@ -63,3 +63,4 @@ Firefox WS 说明:
2 - `./scripts/runtime/start-launchd.sh`
3 - 重启:
4 - `./scripts/runtime/restart-launchd.sh`
5+ - 脚本会在 reload 后等待 `conductor /healthz` 恢复;如果服务处于 loaded 但未提供 HTTP,会自动再做一次 `launchctl kickstart -k`
+31,
-0
1@@ -71,6 +71,22 @@
2 ./scripts/runtime/restart-launchd.sh
3 ```
4
5+当前 `restart-launchd.sh` / `reload-launchd.sh` 在 `bootstrap + kickstart` 结束后不会只看 `launchctl` 返回码,而会继续检查 `conductor /healthz`。
6+
7+- 优先读取已安装 `conductor` plist 里的 `BAA_CONDUCTOR_LOCAL_API`
8+- 默认检查 `<local-api-base>/healthz`
9+- 如果第一次 reload 后 `/healthz` 在等待窗口内(默认最多探测 30 次,每次间隔 1 秒)仍未返回 HTTP 200,即服务处于 loaded but not serving,会自动再执行一次(使用 `--skip-kickstart` 时不会重试,而是直接输出诊断并返回非零):
10+
11+```bash
12+launchctl kickstart -k gui/$(id -u)/so.makefile.baa-conductor
13+```
14+
15+- 如果二次 kickstart 后仍未恢复,脚本会返回非零,并输出:
16+ - 最后一次 health probe 的 curl 结果
17+ - `launchctl print` 诊断
18+ - `conductor` stdout/stderr 日志尾部
19+ - 手工兜底命令提示
20+
21 ## 1. 构建
22
23 ```bash
24@@ -126,6 +142,21 @@ npx --yes pnpm -r build
25 --service status-api
26 ```
27
28+推荐最小验证:
29+
30+```bash
31+./scripts/runtime/restart-launchd.sh
32+curl -fsS http://100.71.210.78:4317/healthz
33+curl -fsS http://100.71.210.78:4317/rolez
34+```
35+
36+预期:
37+
38+- `restart-launchd.sh` 直接成功返回
39+- `/healthz` 返回 `ok`
40+- `/rolez` 返回当前角色
41+- 不需要再手工执行一次 `launchctl kickstart -k gui/$(id -u)/so.makefile.baa-conductor`
42+
43 ## 6. 节点检查
44
45 ```bash
+160,
-0
1@@ -25,6 +25,8 @@ EOF
2
3 require_command launchctl
4 require_command plutil
5+require_command curl
6+assert_file /usr/libexec/PlistBuddy
7
8 scope="agent"
9 home_dir="$(default_home_dir)"
10@@ -101,6 +103,10 @@ if [[ -z "$domain_target" ]]; then
11 domain_target="$(default_domain_target "$scope")"
12 fi
13
14+CURL_LAST_EXIT_CODE="0"
15+CURL_LAST_HTTP_STATUS="000"
16+CURL_LAST_ERROR=""
17+
18 bootout_service() {
19 local plist_path="$1"
20
21@@ -112,6 +118,156 @@ bootout_service() {
22 launchctl bootout "$domain_target" "$plist_path" 2>/dev/null || true
23 }
24
25+normalize_url() {
26+ local value="$1"
27+
28+ while [[ "$value" == */ ]]; do
29+ value="${value%/}"
30+ done
31+
32+ printf '%s\n' "$value"
33+}
34+
35+read_plist_value_or_default() {
36+ local plist_path="$1"
37+ local key="$2"
38+ local default_value="$3"
39+ local value=""
40+
41+ if value="$(/usr/libexec/PlistBuddy -c "Print ${key}" "$plist_path" 2>/dev/null)"; then
42+ printf '%s\n' "$value"
43+ return 0
44+ fi
45+
46+ printf '%s\n' "$default_value"
47+}
48+
49+probe_http_healthz() {
50+ local url="$1"
51+ local stderr_file
52+ local http_status="000"
53+
54+ stderr_file="$(mktemp "${TMPDIR:-/tmp}/baa-runtime-reload-curl.XXXXXX")"
55+ CURL_LAST_ERROR=""
56+
57+ if http_status="$(curl -sS --max-time 5 -o /dev/null -w '%{http_code}' "$url" 2>"$stderr_file")"; then
58+ CURL_LAST_EXIT_CODE="0"
59+ else
60+ CURL_LAST_EXIT_CODE="$?"
61+ fi
62+
63+ CURL_LAST_HTTP_STATUS="${http_status:-000}"
64+ CURL_LAST_ERROR="$(tr -d '\r' <"$stderr_file")"
65+ rm -f "$stderr_file"
66+
67+ [[ "$CURL_LAST_EXIT_CODE" == "0" && "$CURL_LAST_HTTP_STATUS" == "200" ]]
68+}
69+
70+wait_for_http_healthz() {
71+ local name="$1"
72+ local url="$2"
73+ local attempts="${3:-30}"
74+ local delay="${4:-1}"
75+ local index
76+
77+ for ((index = 1; index <= attempts; index += 1)); do
78+ if probe_http_healthz "$url"; then
79+ runtime_log "${name} is ready: ${url}"
80+ return 0
81+ fi
82+
83+ sleep "$delay"
84+ done
85+
86+ return 1
87+}
88+
89+print_log_tail() {
90+ local label="$1"
91+ local path="$2"
92+
93+ if [[ -z "$path" ]]; then
94+ runtime_error "${label}: unavailable"
95+ return 0
96+ fi
97+
98+ if [[ ! -f "$path" ]]; then
99+ runtime_error "${label}: missing ${path}"
100+ return 0
101+ fi
102+
103+ runtime_error "${label}: ${path} (last 20 lines)"
104+ tail -n 20 "$path" >&2
105+}
106+
107+print_conductor_diagnostics() {
108+ local plist_path="$1"
109+ local label="$2"
110+ local healthz_url="$3"
111+ local stdout_path
112+ local stderr_path
113+
114+ stdout_path="$(read_plist_value_or_default "$plist_path" ":StandardOutPath" "")"
115+ stderr_path="$(read_plist_value_or_default "$plist_path" ":StandardErrorPath" "")"
116+
117+ runtime_error "conductor did not recover after launchd reload"
118+ runtime_error "health probe: ${healthz_url}"
119+ runtime_error "last curl result: exit=${CURL_LAST_EXIT_CODE} http=${CURL_LAST_HTTP_STATUS:-000}"
120+ if [[ -n "$CURL_LAST_ERROR" ]]; then
121+ runtime_error "last curl stderr: ${CURL_LAST_ERROR}"
122+ fi
123+
124+ runtime_error "launchctl print ${domain_target}/${label}:"
125+ if ! launchctl print "${domain_target}/${label}" >&2; then
126+ runtime_error "launchctl print failed for ${domain_target}/${label}"
127+ fi
128+
129+ print_log_tail "conductor stdout" "$stdout_path"
130+ print_log_tail "conductor stderr" "$stderr_path"
131+ runtime_error "manual recovery hint: launchctl kickstart -k ${domain_target}/${label}"
132+}
133+
134+recover_conductor_after_reload() {
135+ local service="conductor"
136+ local label
137+ local plist_path
138+ local local_api_base
139+ local healthz_url
140+
141+ if ! contains_value "$service" "${services[@]}"; then
142+ return 0
143+ fi
144+
145+ label="$(service_label "$service")"
146+ plist_path="$(service_install_path "$install_dir" "$service")"
147+ local_api_base="$(read_plist_value_or_default "$plist_path" ":EnvironmentVariables:BAA_CONDUCTOR_LOCAL_API" "$BAA_RUNTIME_DEFAULT_LOCAL_API")"
148+ healthz_url="$(normalize_url "$local_api_base")/healthz"
149+
150+ if wait_for_http_healthz "$service" "$healthz_url"; then
151+ return 0
152+ fi
153+
154+ if [[ "$skip_kickstart" == "1" ]]; then
155+ print_conductor_diagnostics "$plist_path" "$label" "$healthz_url"
156+ die "conductor stayed unhealthy after reload with --skip-kickstart"
157+ fi
158+
159+ runtime_error "conductor was loaded but not serving; retrying launchctl kickstart -k ${domain_target}/${label}"
160+ if ! launchctl kickstart -k "${domain_target}/${label}"; then
161+ runtime_error "retry kickstart returned non-zero for ${domain_target}/${label}"
162+ print_conductor_diagnostics "$plist_path" "$label" "$healthz_url"
163+ die "conductor retry kickstart failed after reload"
164+ fi
165+
166+ if wait_for_http_healthz "$service" "$healthz_url"; then
167+ runtime_log "conductor recovered after retry kickstart"
168+ return 0
169+ fi
170+
171+ print_conductor_diagnostics "$plist_path" "$label" "$healthz_url"
172+ die "conductor failed to recover after reload"
173+}
174+
175 for service in "${services[@]}"; do
176 plist_path="$(service_install_path "$install_dir" "$service")"
177 assert_file "$plist_path"
178@@ -134,4 +290,8 @@ if [[ "$skip_kickstart" != "1" ]]; then
179 done
180 fi
181
182+if [[ "$dry_run" != "1" ]]; then
183+ recover_conductor_after_reload
184+fi
185+
186 runtime_log "launchd reload completed for ${domain_target}"