codex@macbookpro
·
2026-03-31
validation_real_textbooks.py
1from __future__ import annotations
2
3import argparse
4import json
5import math
6import subprocess
7import sys
8from pathlib import Path
9from typing import Any, Callable, Dict, List, Sequence
10
11from .runtime import CIERuntime, REQUIRED_SNAPSHOT_KEYS
12
13
# Pinned commits: the Branch A baseline under test, and the Branch B run whose
# STAGE_REPORT.md supplies the reference expectations quoted below.
BASE_COMMIT = "419ae8d39150806011c1eb6082c7fc8c6a337735"
BRANCH_B_REFERENCE_COMMIT = "c7342881bb2ebfa5e7f927c91a7806416288573b"
BRANCH_B_REFERENCE_LABEL = "c734288"
# Repository root: two directory levels above this file.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Machine-specific absolute path to the real-textbook markdown dataset.
DATASET_DIR = Path("/Users/george/code/china-text-book-md")
# Default output locations for the JSON report, markdown report, and review note.
DEFAULT_JSON_REPORT_PATH = REPO_ROOT / "reports" / "2026-03-31_branch_a_real_textbook_validation.json"
DEFAULT_MARKDOWN_REPORT_PATH = REPO_ROOT / "reports" / "2026-03-31_branch_a_real_textbook_validation.md"
DEFAULT_REVIEW_REPORT_PATH = REPO_ROOT / "reviews" / "2026-03-31_branch_a_real_textbook_validation.md"
# Closed status vocabulary enforced by _scenario_result() and seeded by _status_counts().
VALID_SCENARIO_STATUSES = ("PASS", "FAIL", "N/A", "STRUCTURAL MISMATCH")
# Sedimentation stage names seeded to zero by _stage_counts().
STAGE_NAMES = ("memory", "experience", "skill_belt", "ability_core")
# Top-level keys a generated report is expected to contain.
# NOTE(review): no validation against this tuple is visible in this chunk —
# presumably checked elsewhere in the module.
REQUIRED_REPORT_KEYS = (
    "branch",
    "base_commit",
    "branch_b_reference_commit",
    "dataset_dir",
    "dataset_files",
    "dataset_check",
    "scenarios",
    "overall_summary",
    "structural_mismatches",
    "known_limitations",
    "recommendation",
)

# Logical textbook label -> markdown filename inside DATASET_DIR.
TEXTBOOKS = {
    "小学语文一上": "小学_语文_统编版_义务教育教科书·语文一年级上册.md",
    "小学数学一上": "小学_数学_人教版_义务教育教科书 · 数学一年级上册.md",
    "初中语文七上": "初中_语文_统编版-人民教育出版社_七年级_义务教育教科书·语文七年级上册.md",
    "初中数学七上": "初中_数学_人教版-人民教育出版社_七年级_义务教育教科书·数学七年级上册.md",
    "高中语文必修上": "高中_语文_统编版-人民教育出版社_普通高中教科书·语文必修 上册.md",
}

# Scenario id -> the Branch B reference outcome each Branch A result is
# reported alongside; attributed to STAGE_REPORT.md at commit c734288.
BRANCH_B_REFERENCE_EXPECTATIONS = {
    "A01": {
        "summary": "Branch B stage report: PASS; 126 nodes, 166 flows on 小学语文一上 pipeline.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A02": {
        "summary": "Branch B stage report: PASS; 58 nodes, has_cn=True on 小学数学一上 mixed text.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A03": {
        "summary": "Branch B stage report: PASS; 276 nodes, 20 sedimentation traces on 初中语文七上.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A04": {
        "summary": "Branch B stage report: PASS; 294 edges, asymmetry ratio 1.00 on 初中数学七上.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A05": {
        "summary": "Branch B stage report: PASS; 397 nodes, phi range [-0.13, 0.15] on 高中语文必修上.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A06": {
        "summary": "Branch B stage report: PASS; 8 new nodes after 语文→数学 subject switch.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A07": {
        "summary": "Branch B stage report: PASS; 182/189 phi entries preserved after reset.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A08": {
        "summary": "Branch B stage report: PASS; confidence 0.333→0.889→0.381 after positive/negative feedback.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A09": {
        "summary": "Branch B stage report: PASS; sedimentation gradient (20,4)→(20,10).",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A10": {
        "summary": "Branch B stage report: PASS; 16 snapshot fields present on real textbook state.",
        "source": "STAGE_REPORT.md @ c734288",
    },
}
88
89
90def _round(value: float) -> float:
91 return round(float(value), 4)
92
93
def _git_stdout(args: Sequence[str], fallback: str) -> str:
    """Run ``git <args>`` in REPO_ROOT and return its stripped stdout.

    Returns *fallback* when git cannot be spawned, exits non-zero, or
    succeeds but prints nothing.
    """
    try:
        completed = subprocess.run(
            ["git", *args],
            cwd=REPO_ROOT,
            check=True,
            capture_output=True,
            text=True,
        )
    # OSError covers FileNotFoundError (git missing or REPO_ROOT gone) plus
    # other spawn failures such as PermissionError; the original only caught
    # FileNotFoundError, letting those escape. CalledProcessError covers a
    # non-zero git exit because check=True.
    except (OSError, subprocess.CalledProcessError):
        return fallback
    output = completed.stdout.strip()
    return output or fallback
107
108
def _current_branch() -> str:
    """Return the checked-out branch name, with a stable review-branch fallback."""
    fallback = "review/branch-a-real-textbook-validation"
    return _git_stdout(["rev-parse", "--abbrev-ref", "HEAD"], fallback)
111
112
def dataset_file_rows() -> List[Dict[str, Any]]:
    """Describe each required textbook file and whether it exists on disk."""
    return [
        {
            "textbook": textbook,
            "filename": filename,
            "path": str(full_path := DATASET_DIR / filename),
            "exists": full_path.is_file(),
        }
        for textbook, filename in TEXTBOOKS.items()
    ]
126
127
def check_dataset() -> Dict[str, Any]:
    """Summarize dataset availability: directory presence, missing files, per-file rows."""
    file_results = dataset_file_rows()
    missing = [entry["path"] for entry in file_results if not entry["exists"]]
    has_directory = DATASET_DIR.is_dir()
    return {
        "directory_exists": has_directory,
        "all_required_files_exist": has_directory and not missing,
        "missing_paths": missing,
        "file_results": file_results,
    }
138
139
def _require_dataset() -> Dict[str, Any]:
    """Return the dataset check, raising FileNotFoundError when it is incomplete."""
    dataset_check = check_dataset()
    if not dataset_check["all_required_files_exist"]:
        missing = dataset_check["missing_paths"] or [str(DATASET_DIR)]
        raise FileNotFoundError(
            "Required textbook dataset is missing:\n" + "\n".join(missing)
        )
    return dataset_check
146
147
def load_textbook_paragraphs(name: str) -> List[str]:
    """Load cleaned paragraph lines for the textbook registered under *name*.

    Keeps stripped, non-empty lines that are not markdown headings, bold
    markers, horizontal rules, or images, are not dominated by control
    characters, and contain at least two CJK characters.
    """
    source_path = DATASET_DIR / TEXTBOOKS[name]
    kept: List[str] = []
    for raw_line in source_path.read_text(encoding="utf-8").splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        # Markdown structure and image lines are never content paragraphs.
        if candidate.startswith(("#", "**", "---", "![")):
            continue
        # Drop lines that are mostly control characters (extraction junk).
        control_count = sum(
            1 for char in candidate if ord(char) < 32 and char not in "\n\t"
        )
        if control_count > len(candidate) * 0.3:
            continue
        cjk_count = sum(1 for char in candidate if "\u4e00" <= char <= "\u9fff")
        if cjk_count >= 2:
            kept.append(candidate)
    return kept
167
168
def _slice_paragraphs(name: str, paragraph_count: int) -> List[str]:
    """Return the first *paragraph_count* cleaned paragraphs of textbook *name*.

    Raises ValueError when the textbook has too few cleaned paragraphs.
    """
    available = load_textbook_paragraphs(name)
    if len(available) < paragraph_count:
        raise ValueError(f"{name} only has {len(available)} cleaned paragraphs; need {paragraph_count}.")
    return available[:paragraph_count]
174
175
def _feed(runtime: CIERuntime, paragraphs: List[str], char_limit: int, step_n: int) -> List[str]:
    """Ingest each paragraph truncated to *char_limit* chars, stepping after each.

    Returns the truncated slices that were actually ingested.
    """
    ingested: List[str] = []
    for text in paragraphs:
        slice_text = text[:char_limit]
        runtime.ingest(slice_text)
        runtime.step(n=step_n)
        ingested.append(slice_text)
    return ingested
184
185
def _stage_counts(runtime: CIERuntime) -> Dict[str, int]:
    """Count sedimentation profiles per stage, seeding every known stage at zero."""
    tally: Dict[str, int] = dict.fromkeys(STAGE_NAMES, 0)
    for profile in runtime.state.sedimentation.values():
        # .get() tolerates stage names outside STAGE_NAMES.
        tally[profile.stage] = tally.get(profile.stage, 0) + 1
    return tally
191
192
def _phi_range(runtime: CIERuntime) -> Dict[str, float] | None:
    """Return the rounded min/max of phi values, or None when phi is empty."""
    phi_values = runtime.state.phi.values()
    if not phi_values:
        return None
    return {"min": _round(min(phi_values)), "max": _round(max(phi_values))}
198
199
def _max_abs(mapping: Dict[Any, float]) -> float:
    """Largest absolute value in *mapping*, rounded; 0.0 for an empty mapping."""
    if not mapping:
        return 0.0
    return _round(max(map(abs, mapping.values())))
204
205
def _all_finite(runtime: CIERuntime) -> bool:
    """True when every phi/mu/J/anchor_nodes value is a finite number."""
    tracked = (runtime.state.phi, runtime.state.mu, runtime.state.J, runtime.state.anchor_nodes)
    return all(
        math.isfinite(value) for mapping in tracked for value in mapping.values()
    )
212
213
214def _contains_chinese(node: str) -> bool:
215 return any("\u4e00" <= char <= "\u9fff" for char in node)
216
217
218def _contains_digit(node: str) -> bool:
219 return any(char.isdigit() for char in node)
220
221
def _top_level_snapshot_metrics(runtime: CIERuntime, snapshot: Dict[str, Any], output: str | None = None) -> Dict[str, Any]:
    """Project a runtime snapshot into the flat metrics dict shared by all scenarios.

    Copies the locked snapshot fields, adds counts derived from the snapshot
    lists, and derives phi range / stage counts / graph sizes from internal
    runtime state. ``emit_output`` is included only when *output* is given.
    """
    metrics: Dict[str, Any] = {
        "phi_summary": snapshot["phi_summary"],
        "mu_summary": snapshot["mu_summary"],
        "J_summary": snapshot["J_summary"],
        "active_region": snapshot["active_region"],
        "active_region_size": len(snapshot["active_region"]),
        "bound_ability_core": snapshot["bound_ability_core"],
        "anchor_pull": snapshot["anchor_pull"],
        "drift_score": snapshot["drift_score"],
        "free_capacity": snapshot["free_capacity"],
        "experience_regions_count": len(snapshot["experience_regions"]),
        "skill_belt_candidates_count": len(snapshot["skill_belt_candidates"]),
        "sedimentation_trace_count": len(snapshot["sedimentation_trace"]),
        "merge_events_count": len(snapshot["merge_events"]),
        "decay_events_count": len(snapshot["decay_events"]),
        "output_mode": snapshot["output_mode"],
        "feedback_effect": snapshot["feedback_effect"],
        # Derived from internal state because the snapshot has no range field.
        "phi_range": _phi_range(runtime),
        "stage_counts": _stage_counts(runtime),
        "graph_node_count": len(runtime.state.graph.nodes()),
        # len(J) counts directed flow entries — a proxy, not true graph edges.
        "graph_edge_count_proxy": len(runtime.state.J),
        "attention_usage": "NOT APPLICABLE: Branch A exposes free_capacity but not attention used/total.",
    }
    if output is not None:
        metrics["emit_output"] = output
    return metrics
249
250
def _scenario_result(
    scenario_id: str,
    title: str,
    textbooks: List[str],
    status: str,
    reason: str,
    *,
    ran_successfully: bool,
    key_metrics: Dict[str, Any],
    fairness_notes: List[str] | None = None,
    structural_mismatch: str | None = None,
) -> Dict[str, Any]:
    """Assemble one scenario record, validating *status* against the locked set.

    Raises ValueError for a status outside VALID_SCENARIO_STATUSES.
    """
    if status not in VALID_SCENARIO_STATUSES:
        raise ValueError(f"Invalid scenario status: {status}")
    record: Dict[str, Any] = {
        "scenario_id": scenario_id,
        "title": title,
        "textbook_used": textbooks,
        "dataset_files": [str(DATASET_DIR / TEXTBOOKS[name]) for name in textbooks],
        "ran_successfully": ran_successfully,
        "status": status,
        "reason": reason,
        "key_metrics": key_metrics,
        "branch_b_reference_expectation": BRANCH_B_REFERENCE_EXPECTATIONS[scenario_id],
        "fairness_notes": [] if not fairness_notes else fairness_notes,
        "structural_mismatch": structural_mismatch,
    }
    return record
278
279
def _scenario_failure(
    scenario_id: str,
    title: str,
    textbooks: List[str],
    exc: Exception,
) -> Dict[str, Any]:
    """Build a FAIL scenario record for an exception raised while running a scenario."""
    failure_reason = f"Scenario execution raised {exc.__class__.__name__}: {exc}"
    return _scenario_result(
        scenario_id,
        title,
        textbooks,
        "FAIL",
        failure_reason,
        ran_successfully=False,
        key_metrics={"exception": str(exc)},
        fairness_notes=[],
        structural_mismatch=None,
    )
297
298
def run_a01() -> Dict[str, Any]:
    """A01: end-to-end pipeline on 小学语文一上 (30 paragraphs x 50 chars, 3 steps each)."""
    runtime = CIERuntime()
    used = _feed(runtime, _slice_paragraphs("小学语文一上", 30), char_limit=50, step_n=3)
    output = runtime.emit()
    snapshot = runtime.snapshot_state()
    # PASS requires both active mu nodes and at least one J flow after the run.
    status = "PASS" if snapshot["mu_summary"]["active_count"] > 0 and snapshot["J_summary"]["edge_count"] > 0 else "FAIL"
    reason = (
        "Pipeline ran on the required real-data slice and produced non-empty phi/mu/J state."
        if status == "PASS"
        else "Pipeline did not retain active state after the required real-data slice."
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot, output)
    metrics.update(
        {
            "input_slice": {"paragraphs": 30, "chars_per_paragraph": 50, "step_n": 3},
            "used_paragraph_count": len(used),
        }
    )
    return _scenario_result(
        "A01",
        "小学语文一上 — pipeline",
        ["小学语文一上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A emit() returns a string, so output mode and active counts come from snapshot_state().",
            "Observed state is much smaller than Branch B's reference counts, but the scenario does complete end-to-end.",
        ],
        structural_mismatch=None,
    )
331
332
def run_a02() -> Dict[str, Any]:
    """A02: mixed Chinese/digit text on 小学数学一上 (20 paragraphs x 40 chars)."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("小学数学一上", 20), char_limit=40, step_n=3)
    output = runtime.emit()
    snapshot = runtime.snapshot_state()
    nodes = runtime.state.graph.nodes()
    chinese_nodes = [node for node in nodes if _contains_chinese(node)]
    digit_nodes = [node for node in nodes if _contains_digit(node)]
    # Only Chinese-bearing nodes gate PASS; digit nodes are reported, not required.
    status = "PASS" if chinese_nodes else "FAIL"
    reason = (
        "Chinese-bearing nodes exist on the mixed textbook slice; digit-bearing nodes are reported separately."
        if status == "PASS"
        else "No Chinese-bearing nodes were formed on the mixed textbook slice."
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot, output)
    metrics.update(
        {
            "input_slice": {"paragraphs": 20, "chars_per_paragraph": 40, "step_n": 3},
            "has_chinese_nodes": bool(chinese_nodes),
            "has_digit_nodes": bool(digit_nodes),
            "chinese_node_count": len(chinese_nodes),
            "digit_node_count": len(digit_nodes),
            "sample_digit_nodes": digit_nodes[:5],
        }
    )
    return _scenario_result(
        "A02",
        "小学数学一上 — mixed text",
        ["小学数学一上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A tokenizes with Unicode word regexes, so digits may be absorbed into coarse tokens or absent from this slice.",
            "The required honest report here is whether Chinese nodes exist and whether digit-bearing nodes were actually observed.",
        ],
        structural_mismatch=None,
    )
372
373
def run_a03() -> Dict[str, Any]:
    """A03: complexity/sedimentation observables on 初中语文七上 (50 paragraphs x 60 chars)."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("初中语文七上", 50), char_limit=60, step_n=3)
    snapshot = runtime.snapshot_state()
    # PASS needs both a non-empty sedimentation trace and experience regions.
    has_sedimentation = bool(snapshot["sedimentation_trace"]) and bool(snapshot["experience_regions"])
    status = "PASS" if has_sedimentation else "FAIL"
    reason = (
        "Sedimentation and experience-region observables are present on the required real-data slice."
        if status == "PASS"
        else "Sedimentation observables did not materialize on the required real-data slice."
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot)
    metrics.update({"input_slice": {"paragraphs": 50, "chars_per_paragraph": 60, "step_n": 3}})
    return _scenario_result(
        "A03",
        "初中语文七上 — complexity / sedimentation",
        ["初中语文七上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A exposes sedimentation_trace and experience_regions, but its tokenized graph remains much smaller than Branch B's reference run.",
            "sedimentation_trace is capped, so count saturation is expected and should not be over-interpreted.",
        ],
        structural_mismatch=None,
    )
401
402
def run_a04() -> Dict[str, Any]:
    """A04: formula/structure on 初中数学七上 — reported as a structural mismatch.

    Branch B's asymmetry-ratio metric is not directly comparable, so this
    scenario always returns STRUCTURAL MISMATCH with a J-flow proxy metric.
    """
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("初中数学七上", 30), char_limit=50, step_n=3)
    snapshot = runtime.snapshot_state()
    # Per-pair normalized |forward - backward| over directed J flows; 1e-9
    # floors the denominator to avoid division by zero.
    asymmetry_proxy = []
    for (left, right), value in runtime.state.J.items():
        reverse = runtime.state.J.get((right, left), 0.0)
        asymmetry_proxy.append(abs(value - reverse) / max(value, reverse, 1e-9))
    metrics = _top_level_snapshot_metrics(runtime, snapshot)
    metrics.update(
        {
            "input_slice": {"paragraphs": 30, "chars_per_paragraph": 50, "step_n": 3},
            "directed_flow_asymmetry_proxy_avg": _round(sum(asymmetry_proxy) / len(asymmetry_proxy)) if asymmetry_proxy else 0.0,
            "top_flows": snapshot["J_summary"]["top_flows"],
        }
    )
    mismatch = (
        "Branch B's A04 metric is based on forward/backward graph edge weights. Branch A only exposes directed J flow, "
        "not a directly comparable directed graph-edge surface, so a fair asymmetry-ratio comparison is a structural mismatch."
    )
    return _scenario_result(
        "A04",
        "初中数学七上 — formula / structure",
        ["初中数学七上"],
        "STRUCTURAL MISMATCH",
        "The scenario ran, but the primary Branch B asymmetry-ratio metric does not map cleanly onto Branch A.",
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Directed J flow can be described, but it is not the same observable as Branch B's directed graph edge weights.",
            "Using the J proxy as if it were the same metric would overstate comparability.",
        ],
        structural_mismatch=mismatch,
    )
437
438
def run_a05() -> Dict[str, Any]:
    """A05: long-text numeric stability on 高中语文必修上 (80 paragraphs x 80 chars)."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("高中语文必修上", 80), char_limit=80, step_n=2)
    snapshot = runtime.snapshot_state()
    finite = _all_finite(runtime)
    # Divergence heuristic: any non-finite value, or any |value| beyond 1000
    # in phi/mu/J, counts as an obvious overflow symptom.
    obvious_divergence = (
        not finite
        or _max_abs(runtime.state.phi) > 1000.0
        or _max_abs(runtime.state.mu) > 1000.0
        or _max_abs(runtime.state.J) > 1000.0
    )
    status = "PASS" if not obvious_divergence else "FAIL"
    reason = (
        "Long-text run stayed finite and showed no obvious overflow/divergence symptom."
        if status == "PASS"
        else "Long-text run showed non-finite values or obvious divergence."
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot)
    metrics.update(
        {
            "input_slice": {"paragraphs": 80, "chars_per_paragraph": 80, "step_n": 2},
            "all_finite": finite,
            "max_abs_phi": _max_abs(runtime.state.phi),
            "max_abs_mu": _max_abs(runtime.state.mu),
            "max_abs_J": _max_abs(runtime.state.J),
        }
    )
    return _scenario_result(
        "A05",
        "高中语文必修上 — long text stability",
        ["高中语文必修上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A does not expose attention.used/total; free_capacity is the closest locked observable.",
            "phi min/max are derived from runtime.state.phi because Branch A's snapshot summary does not include range fields.",
        ],
        structural_mismatch=None,
    )
480
481
def run_a06() -> Dict[str, Any]:
    """A06: cross-subject transfer — 语文 then 数学 with no reset in between."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("小学语文一上", 15), char_limit=40, step_n=3)
    before = runtime.snapshot_state()
    before_active = set(before["active_region"])
    _feed(runtime, _slice_paragraphs("小学数学一上", 15), char_limit=40, step_n=3)
    after = runtime.snapshot_state()
    after_active = set(after["active_region"])
    new_nodes = sorted(after_active - before_active)
    # "Preserved" = previously-active nodes whose phi is still non-negligible.
    preserved = sum(1 for node in before_active if abs(runtime.state.phi.get(node, 0.0)) > 0.001)
    # PASS needs both migration (new nodes) and persistence (surviving phi).
    status = "PASS" if new_nodes and preserved > 0 else "FAIL"
    reason = (
        "Active region changes under subject switch while some earlier structures remain alive."
        if status == "PASS"
        else "Subject switch did not show both migration and persistence under the required schedule."
    )
    metrics = _top_level_snapshot_metrics(runtime, after)
    metrics.update(
        {
            "input_slice": {
                "phase_1": {"textbook": "小学语文一上", "paragraphs": 15, "chars_per_paragraph": 40, "step_n": 3},
                "phase_2": {"textbook": "小学数学一上", "paragraphs": 15, "chars_per_paragraph": 40, "step_n": 3},
            },
            "active_region_before": sorted(before_active),
            "active_region_after": sorted(after_active),
            "new_active_nodes_after_switch": new_nodes,
            "preserved_prior_active_phi_count": preserved,
        }
    )
    return _scenario_result(
        "A06",
        "cross-subject transfer",
        ["小学语文一上", "小学数学一上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "This mirrors Branch B's no-reset subject switch. Branch A does show migration, but on a much smaller token set.",
        ],
        structural_mismatch=None,
    )
524
525
def run_a07() -> Dict[str, Any]:
    """A07: reset_session() must clear session state but keep long-term structure."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("初中语文七上", 30), char_limit=50, step_n=3)
    # Capture pre-reset long-term state for exact-preservation comparison.
    phi_before = dict(runtime.state.phi)
    j_before = dict(runtime.state.J)
    graph_node_count_before = len(runtime.state.graph.nodes())
    stage_counts_before = _stage_counts(runtime)
    runtime.reset_session()
    snapshot = runtime.snapshot_state()
    # Exact (==) preservation: every phi entry must survive the reset unchanged.
    preserved_phi_entries = sum(
        1 for node, value in phi_before.items() if runtime.state.phi.get(node) == value
    )
    # PASS = session activation fully cleared AND phi fully preserved.
    status = "PASS" if snapshot["mu_summary"]["active_count"] == 0 and not snapshot["active_region"] and preserved_phi_entries == len(phi_before) else "FAIL"
    reason = (
        "reset_session() clears session activation while preserving long-term graph/potential structure."
        if status == "PASS"
        else "reset_session() did not cleanly separate session state from long-term structure."
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot)
    metrics.update(
        {
            "input_slice": {"paragraphs": 30, "chars_per_paragraph": 50, "step_n": 3},
            "phi_entries_before_reset": len(phi_before),
            "phi_entries_preserved_exactly": preserved_phi_entries,
            "J_entries_before_reset": len(j_before),
            "J_entries_after_reset": len(runtime.state.J),
            "graph_nodes_before_reset": graph_node_count_before,
            "graph_nodes_after_reset": len(runtime.state.graph.nodes()),
            "stage_counts_before_reset": stage_counts_before,
            "stage_counts_after_reset": _stage_counts(runtime),
        }
    )
    return _scenario_result(
        "A07",
        "session reset preserves long-term structure",
        ["初中语文七上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "This is one of Branch A's clearer matched wins: session clearing and long-term retention separate cleanly.",
        ],
        structural_mismatch=None,
    )
571
572
def run_a08() -> Dict[str, Any]:
    """A08: multi-round feedback — 5 positive rounds then 1 negative on one target.

    Feedback is queued by commit_feedback() and applied on the next step, so
    every feedback call is followed by runtime.step() before observation.
    """
    runtime = CIERuntime()
    paragraph = _slice_paragraphs("小学语文一上", 1)[0][:30]
    runtime.ingest(paragraph)
    runtime.step(n=5)
    output = runtime.emit()
    snapshot = runtime.snapshot_state()
    # Feedback target = first active-region node, if any emerged.
    target = snapshot["active_region"][0] if snapshot["active_region"] else None
    metrics = _top_level_snapshot_metrics(runtime, snapshot, output)
    metrics.update({"input_slice": {"paragraphs": 1, "chars_per_paragraph": 30, "step_n": 5}, "feedback_target": target})
    # Early FAIL exit: no target (or an idle emit) means the loop cannot run.
    if not target or output == "minimal: idle":
        return _scenario_result(
            "A08",
            "multi-round feedback",
            ["小学语文一上"],
            "FAIL",
            "emit() returned no activated output target on the required slice, so the positive/negative feedback loop could not be meaningfully exercised.",
            ran_successfully=True,
            key_metrics=metrics,
            fairness_notes=[
                "This is reported as a real Branch A failure, not normalized away.",
                "Branch A feedback is queued and applied on the next step, but that did not matter here because no target emerged.",
            ],
            structural_mismatch=None,
        )

    initial = {
        "phi": _round(runtime.state.phi.get(target, 0.0)),
        "mu": _round(runtime.state.mu.get(target, 0.0)),
    }
    positive_rounds = []
    for round_index in range(5):
        runtime.commit_feedback({"text": target, "value": 1.0})
        runtime.step()
        positive_rounds.append(
            {
                "round": round_index + 1,
                "phi": _round(runtime.state.phi.get(target, 0.0)),
                "mu": _round(runtime.state.mu.get(target, 0.0)),
                "feedback_effect": dict(runtime.state.feedback_effect),
            }
        )
    # One negative round after the positives.
    runtime.commit_feedback({"text": target, "value": -0.5})
    runtime.step()
    negative_round = {
        "phi": _round(runtime.state.phi.get(target, 0.0)),
        "mu": _round(runtime.state.mu.get(target, 0.0)),
        "feedback_effect": dict(runtime.state.feedback_effect),
    }
    # PASS = positives did not weaken the target AND the negative did not strengthen it.
    positive_improves = positive_rounds[-1]["phi"] >= initial["phi"] and positive_rounds[-1]["mu"] >= initial["mu"]
    negative_reduces = negative_round["phi"] <= positive_rounds[-1]["phi"] and negative_round["mu"] <= positive_rounds[-1]["mu"]
    status = "PASS" if positive_improves and negative_reduces else "FAIL"
    reason = (
        "Positive rounds strengthened the chosen target and the negative round weakened it."
        if status == "PASS"
        else "Feedback rounds did not show the expected positive-then-negative observable change."
    )
    metrics.update(
        {
            "initial_target_state": initial,
            "positive_rounds": positive_rounds,
            "negative_round": negative_round,
        }
    )
    return _scenario_result(
        "A08",
        "multi-round feedback",
        ["小学语文一上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A feedback is asynchronous: commit_feedback() queues a signal that is applied on the next step.",
        ],
        structural_mismatch=None,
    )
650
651
def run_a09() -> Dict[str, Any]:
    """A09: incremental sedimentation — same 10 paragraphs re-fed for 5 rounds."""
    runtime = CIERuntime()
    paragraphs = _slice_paragraphs("小学语文一上", 10)
    round_history = []
    for round_index in range(5):
        _feed(runtime, paragraphs, char_limit=30, step_n=3)
        snapshot = runtime.snapshot_state()
        round_history.append(
            {
                "round": round_index + 1,
                "sedimentation_trace_count": len(snapshot["sedimentation_trace"]),
                "experience_regions_count": len(snapshot["experience_regions"]),
                "skill_belt_candidates_count": len(snapshot["skill_belt_candidates"]),
                "phi_node_count": snapshot["phi_summary"]["node_count"],
                "active_count": snapshot["mu_summary"]["active_count"],
                "stage_counts": _stage_counts(runtime),
            }
        )
    # Progress = growth in the two most-sedimented stages between first and
    # last round, or any intermediate round exceeding round 1's skill_belt.
    initial_complexity = round_history[0]["stage_counts"]["skill_belt"] + round_history[0]["stage_counts"]["ability_core"]
    final_complexity = round_history[-1]["stage_counts"]["skill_belt"] + round_history[-1]["stage_counts"]["ability_core"]
    progressed = final_complexity > initial_complexity or any(
        round_entry["stage_counts"]["skill_belt"] > round_history[0]["stage_counts"]["skill_belt"]
        for round_entry in round_history[1:]
    )
    status = "PASS" if progressed else "FAIL"
    reason = (
        "Repeated rounds show incremental stage progression, even though several observable lists are capped."
        if status == "PASS"
        else "Repeated rounds did not show incremental sedimentation progression."
    )
    final_snapshot = runtime.snapshot_state()
    metrics = _top_level_snapshot_metrics(runtime, final_snapshot)
    metrics.update(
        {
            "input_slice": {"paragraphs": 10, "chars_per_paragraph": 30, "step_n": 3, "rounds": 5},
            "round_history": round_history,
        }
    )
    return _scenario_result(
        "A09",
        "incremental sedimentation",
        ["小学语文一上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "sedimentation_trace and skill_belt_candidates are capped lists in Branch A, so stage_counts are the more honest growth indicator here.",
        ],
        structural_mismatch=None,
    )
703
704
def run_a10() -> Dict[str, Any]:
    """A10: every locked snapshot key must be present on real-textbook-driven state."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("初中数学七上", 20), char_limit=40, step_n=3)
    output = runtime.emit()
    runtime.step()
    runtime.commit_feedback({"text": "validation", "value": 0.2})
    # Extra step so the queued feedback becomes observable in the snapshot.
    runtime.step()
    snapshot = runtime.snapshot_state()
    missing = sorted(REQUIRED_SNAPSHOT_KEYS.difference(snapshot))
    status = "PASS" if not missing else "FAIL"
    reason = (
        "All Branch A locked snapshot fields are present on real textbook-driven state."
        if status == "PASS"
        else f"Snapshot is missing required locked fields: {missing}"
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot, output)
    metrics.update(
        {
            "input_slice": {"paragraphs": 20, "chars_per_paragraph": 40, "step_n": 3},
            "required_snapshot_keys": sorted(REQUIRED_SNAPSHOT_KEYS),
            "observed_snapshot_keys": sorted(snapshot),
            "missing_snapshot_keys": missing,
        }
    )
    return _scenario_result(
        "A10",
        "snapshot completeness on real textbook input",
        ["初中数学七上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A needs one extra step after feedback to observe the applied feedback_effect because feedback is queued.",
            "Branch B's report mentions 16 fields including attention, but Branch A's locked comparable surface is the 15-field spec set.",
        ],
        structural_mismatch=None,
    )
743
744
# Ordered scenario registry. The id/title/textbooks entries mirror each
# runner's own _scenario_result call; _run_scenarios uses them to build a
# failure record when a runner raises.
SCENARIOS: List[Dict[str, Any]] = [
    {"id": "A01", "title": "小学语文一上 — pipeline", "textbooks": ["小学语文一上"], "runner": run_a01},
    {"id": "A02", "title": "小学数学一上 — mixed text", "textbooks": ["小学数学一上"], "runner": run_a02},
    {"id": "A03", "title": "初中语文七上 — complexity / sedimentation", "textbooks": ["初中语文七上"], "runner": run_a03},
    {"id": "A04", "title": "初中数学七上 — formula / structure", "textbooks": ["初中数学七上"], "runner": run_a04},
    {"id": "A05", "title": "高中语文必修上 — long text stability", "textbooks": ["高中语文必修上"], "runner": run_a05},
    {"id": "A06", "title": "cross-subject transfer", "textbooks": ["小学语文一上", "小学数学一上"], "runner": run_a06},
    {"id": "A07", "title": "session reset preserves long-term structure", "textbooks": ["初中语文七上"], "runner": run_a07},
    {"id": "A08", "title": "multi-round feedback", "textbooks": ["小学语文一上"], "runner": run_a08},
    {"id": "A09", "title": "incremental sedimentation", "textbooks": ["小学语文一上"], "runner": run_a09},
    {"id": "A10", "title": "snapshot completeness on real textbook input", "textbooks": ["初中数学七上"], "runner": run_a10},
]
757
758
def _run_scenarios() -> List[Dict[str, Any]]:
    """Execute every registered scenario, converting exceptions to FAIL records."""
    outcomes: List[Dict[str, Any]] = []
    for spec in SCENARIOS:
        try:
            outcome = spec["runner"]()
        except Exception as exc:  # broad on purpose: any failure becomes a record
            outcome = _scenario_failure(spec["id"], spec["title"], spec["textbooks"], exc)
        outcomes.append(outcome)
    return outcomes
767
768
def _status_counts(scenarios: List[Dict[str, Any]]) -> Dict[str, int]:
    """Tally scenario statuses, seeding every valid status at zero."""
    tally = dict.fromkeys(VALID_SCENARIO_STATUSES, 0)
    for entry in scenarios:
        # .get() tolerates statuses outside the seeded vocabulary.
        tally[entry["status"]] = tally.get(entry["status"], 0) + 1
    return tally
774
775
def _collect_structural_mismatches(scenarios: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Gather per-scenario structural mismatches plus the fixed cross-cutting ones."""
    mismatches: List[Dict[str, Any]] = []
    for scenario in scenarios:
        if scenario["structural_mismatch"]:
            mismatches.append(
                {
                    "scenario_id": scenario["scenario_id"],
                    "title": scenario["title"],
                    "detail": scenario["structural_mismatch"],
                }
            )
    # Always-reported surface mismatches, independent of any single run's outcome.
    mismatches.extend(
        [
            {
                "scenario_id": "A01/A08",
                "title": "emit surface mismatch",
                "detail": "Branch A emit() returns a plain string, not Branch B's structured payload with activated nodes and active_count.",
            },
            {
                "scenario_id": "A05/A10",
                "title": "attention surface mismatch",
                "detail": "Branch A does not expose attention used/total. free_capacity is the locked comparable field instead.",
            },
            {
                "scenario_id": "A08/A10",
                "title": "feedback timing mismatch",
                "detail": "Branch A commit_feedback() is queued and becomes observable on the next step, unlike Branch B's more immediate feedback probes.",
            },
        ]
    )
    return mismatches
807
808
def _overall_summary(scenarios: List[Dict[str, Any]], dataset_check: Dict[str, Any]) -> Dict[str, Any]:
    """Aggregate status counts and the fixed top-level verdict for the report."""
    counts = _status_counts(scenarios)
    ran_successfully = sum(1 for scenario in scenarios if scenario["ran_successfully"])
    return {
        "scenario_count": len(scenarios),
        "ran_successfully_count": ran_successfully,
        "status_counts": counts,
        # True only when the full dataset was present and all 10 scenarios were attempted.
        "fairness_gap_reduced": bool(dataset_check["all_required_files_exist"] and len(scenarios) == 10),
        "materially_changes_previous_ab_conclusion": False,
        "summary": (
            "Branch A was run on the same 5 real textbooks and A01-A10 scenario family used by Branch B. "
            "Eight scenarios passed on Branch A's own observable surface, A04 is a structural mismatch, and A08 failed on the required slice."
        ),
    }
823
824
def _known_limitations() -> List[str]:
    """Return the fixed list of known Branch A limitations quoted in the report."""
    return [
        "Branch A tokenizes each ingest call with a Unicode word regex and keeps at most 8 tokens, so long textbook slices compress into coarse tokens instead of Branch B's character/bigram view.",
        "Branch A does not expose attention.used/attention.total; free_capacity is the closest locked observable, and phi min/max must be derived from internal state for diagnostics.",
        "Branch A emit() is string-only and commit_feedback() is asynchronous, so some Branch B feedback/output probes can only be approximated, not matched exactly.",
        "The first cleaned textbook paragraphs include front matter and publishing metadata; this is shared with Branch B's slice definition but is amplified by Branch A's coarse tokenization.",
    ]
832
833
def _recommendation() -> Dict[str, str]:
    """Return the fixed merge-decision recommendation embedded in the report."""
    return {
        "decision": "enough to proceed with merge decision",
        "reason": (
            "The main A/B fairness gap was the unmatched real-data harness. This validation closes that gap enough to make a merge decision on current evidence. "
            "The remaining issues are explicit Branch A results: one failed scenario (A08) and one true structural mismatch (A04), not hidden harness differences."
        ),
    }
842
843
844def _result_textbooks(result: Dict[str, Any]) -> str:
845 return ", ".join(result["textbook_used"])
846
847
848def _branch_a_observed_summary(result: Dict[str, Any]) -> str:
849 metrics = result["key_metrics"]
850 if result["scenario_id"] == "A04":
851 return (
852 f"phi={metrics['phi_summary']['node_count']}, J={metrics['J_summary']['edge_count']}, "
853 f"flow-asym-proxy={metrics['directed_flow_asymmetry_proxy_avg']}"
854 )
855 if result["scenario_id"] == "A08":
856 return f"mode={metrics['output_mode']}, emit={metrics.get('emit_output', '')}, active={metrics['mu_summary']['active_count']}"
857 return (
858 f"phi={metrics['phi_summary']['node_count']}, mu={metrics['mu_summary']['active_count']}, "
859 f"J={metrics['J_summary']['edge_count']}, mode={metrics['output_mode']}"
860 )
861
862
def _render_markdown(report: Dict[str, Any]) -> str:
    """Render the full validation report as the Markdown document written to disk.

    Sections, ordering, and every literal string are part of the report
    contract; the result ends with a trailing newline element so the joined
    document terminates with "\\n".
    """
    checks = report["dataset_check"]
    out: List[str] = [
        "# Branch A Real Textbook Validation",
        "",
        "## Purpose",
        "Run Branch A on the same 5 real textbook files and the same A01-A10 real-data scenario family used by Branch B, then report the result honestly without changing Branch A runtime behavior.",
        "",
        "## Base Commits",
        f"- Branch A base commit: `{report['base_commit']}`",
        f"- Branch B reference commit: `{report['branch_b_reference_commit']}` (`{BRANCH_B_REFERENCE_LABEL}`)",
        f"- Branch under test: `{report['branch']}`",
        "",
        "## Dataset Path And File Check",
        f"- Dataset path: `{report['dataset_dir']}`",
        f"- Directory exists: `{checks['directory_exists']}`",
        f"- All 5 required files present: `{checks['all_required_files_exist']}`",
    ]
    for file_row in report["dataset_files"]:
        marker = "OK" if file_row["exists"] else "MISSING"
        out.append(f"- {marker} `{file_row['path']}`")
    out.append("")
    out.append("## Scenario Results")
    out.append("| ID | Textbook | Status | Branch B Reference | Branch A Observed | Reason |")
    out.append("| --- | --- | --- | --- | --- | --- |")
    for scenario in report["scenarios"]:
        out.append(
            f"| {scenario['scenario_id']} | {_result_textbooks(scenario)} | {scenario['status']} | "
            f"{scenario['branch_b_reference_expectation']['summary']} | {_branch_a_observed_summary(scenario)} | {scenario['reason']} |"
        )
    out.append("")
    out.append("## Explicit Structural Mismatch")
    for mismatch in report["structural_mismatches"]:
        out.append(f"- `{mismatch['scenario_id']}`: {mismatch['detail']}")
    out += [
        "",
        "## Concise Fairness Interpretation",
        "- This run materially reduces the main A/B fairness gap because Branch A was executed on the same dataset, same file set, and same A01-A10 slice family as Branch B.",
        "- It does not erase Branch A's current disadvantages: A08 fails on the mandated slice, A04 is not directly comparable, and most Branch A state sizes remain much smaller than Branch B's reference values.",
        "",
        "## Does This Reduce The Main A/B Fairness Gap?",
        "- Yes. The earlier fairness concern was unmatched real-data coverage. That concern is now materially reduced because Branch A was run on the same real textbooks and scenario family.",
        "",
        "## Recommendation",
        f"- Decision: `{report['recommendation']['decision']}`",
        f"- Reason: {report['recommendation']['reason']}",
        "",
    ]
    return "\n".join(out)
916
917
918def _render_review(report: Dict[str, Any]) -> str:
919 passed = [scenario["scenario_id"] for scenario in report["scenarios"] if scenario["status"] == "PASS"]
920 failed = [scenario["scenario_id"] for scenario in report["scenarios"] if scenario["status"] == "FAIL"]
921 mismatched = [scenario["scenario_id"] for scenario in report["scenarios"] if scenario["status"] == "STRUCTURAL MISMATCH"]
922 lines = [
923 "# Review: Branch A Real Textbook Validation",
924 "",
925 "## What Was Run",
926 f"- Branch A base commit `{report['base_commit']}` on branch `{report['branch']}`.",
927 f"- Branch B reference commit `{report['branch_b_reference_commit']}` for dataset/scenario parity.",
928 f"- Same dataset directory: `{report['dataset_dir']}` with the exact 5 textbook files required by Branch B.",
929 f"- Same real-data scenario family: A01-A10.",
930 "",
931 "## Outcome",
932 f"- Succeeded: {', '.join(passed) if passed else 'none'}",
933 f"- Failed: {', '.join(failed) if failed else 'none'}",
934 f"- Structurally not comparable: {', '.join(mismatched) if mismatched else 'none'}",
935 "",
936 "## Decision Readout",
937 "- The matched real-textbook run materially reduces the earlier fairness gap.",
938 "- It does not materially change a conclusion that Branch B currently has broader and cleaner real-data validation coverage.",
939 f"- Recommendation: `{report['recommendation']['decision']}`",
940 f"- Rationale: {report['recommendation']['reason']}",
941 "",
942 ]
943 return "\n".join(lines)
944
945
def generate_validation_report(
    json_path: Path | str = DEFAULT_JSON_REPORT_PATH,
    markdown_path: Path | str = DEFAULT_MARKDOWN_REPORT_PATH,
    review_path: Path | str = DEFAULT_REVIEW_REPORT_PATH,
) -> Dict[str, Any]:
    """Run the full validation and write the JSON, Markdown, and review artifacts.

    Parameters are the three output paths (str or Path). Parent directories
    are created as needed. Returns the report dict that was serialized to
    ``json_path``. Raises RuntimeError when the assembled report keys deviate
    from REQUIRED_REPORT_KEYS — a guard against accidental schema drift,
    checked before anything is written to disk.
    """
    dataset_check = _require_dataset()
    scenario_results = _run_scenarios()
    # Key insertion order below is part of the contract: it must equal
    # REQUIRED_REPORT_KEYS exactly or the guard raises.
    report: Dict[str, Any] = {
        "branch": _current_branch(),
        "base_commit": BASE_COMMIT,
        "branch_b_reference_commit": BRANCH_B_REFERENCE_COMMIT,
        "dataset_dir": str(DATASET_DIR),
        "dataset_files": dataset_file_rows(),
        "dataset_check": dataset_check,
        "scenarios": scenario_results,
        "overall_summary": _overall_summary(scenario_results, dataset_check),
        "structural_mismatches": _collect_structural_mismatches(scenario_results),
        "known_limitations": _known_limitations(),
        "recommendation": _recommendation(),
    }
    if tuple(report) != REQUIRED_REPORT_KEYS:
        raise RuntimeError(f"Unexpected report key order: {tuple(report)!r}")

    json_target = Path(json_path)
    markdown_target = Path(markdown_path)
    review_target = Path(review_path)
    for target in (json_target, markdown_target, review_target):
        target.parent.mkdir(parents=True, exist_ok=True)
    json_target.write_text(json.dumps(report, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    markdown_target.write_text(_render_markdown(report), encoding="utf-8")
    review_target.write_text(_render_review(report), encoding="utf-8")
    return report
979
980
def _parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse CLI flags selecting the three report output paths."""
    parser = argparse.ArgumentParser(
        description="Branch A real-textbook validation against Branch B dataset/scenarios."
    )
    # Each flag defaults to the module-level report location.
    for flag, default_path in (
        ("--json-out", DEFAULT_JSON_REPORT_PATH),
        ("--markdown-out", DEFAULT_MARKDOWN_REPORT_PATH),
        ("--review-out", DEFAULT_REVIEW_REPORT_PATH),
    ):
        parser.add_argument(flag, default=str(default_path))
    return parser.parse_args(argv)
987
988
def main(argv: Sequence[str] | None = None) -> int:
    """CLI entry point: generate all reports and return the process exit code."""
    if argv is None:
        argv = sys.argv[1:]
    args = _parse_args(argv)
    generate_validation_report(args.json_out, args.markdown_out, args.review_out)
    return 0
993
994
if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return value, same as the
    # explicit raise form.
    sys.exit(main())