codex@macbookpro
·
2026-03-31
validation_real_textbooks.py
1from __future__ import annotations
2
3import argparse
4import json
5import math
6import subprocess
7import sys
8from pathlib import Path
9from typing import Any, Callable, Dict, List, Sequence
10
11from .runtime import CIERuntime, REQUIRED_SNAPSHOT_KEYS
12
13
# Pinned commits: the Branch A baseline under test, and the Branch B run whose
# STAGE_REPORT.md supplies the reference expectations quoted below.
BASE_COMMIT = "419ae8d39150806011c1eb6082c7fc8c6a337735"
BRANCH_B_REFERENCE_COMMIT = "c7342881bb2ebfa5e7f927c91a7806416288573b"
BRANCH_B_REFERENCE_LABEL = "c734288"
# Repository root: two directory levels above this file.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Machine-specific absolute path to the real-textbook markdown dataset.
DATASET_DIR = Path("/Users/george/code/china-text-book-md")
# Default output locations for the JSON report, markdown report, and review note.
DEFAULT_JSON_REPORT_PATH = REPO_ROOT / "reports" / "2026-03-31_branch_a_real_textbook_validation.json"
DEFAULT_MARKDOWN_REPORT_PATH = REPO_ROOT / "reports" / "2026-03-31_branch_a_real_textbook_validation.md"
DEFAULT_REVIEW_REPORT_PATH = REPO_ROOT / "reviews" / "2026-03-31_branch_a_real_textbook_validation.md"
# Closed status vocabulary enforced by _scenario_result() and seeded by _status_counts().
VALID_SCENARIO_STATUSES = ("PASS", "FAIL", "N/A", "STRUCTURAL MISMATCH")
# Sedimentation stage names seeded to zero by _stage_counts().
STAGE_NAMES = ("memory", "experience", "skill_belt", "ability_core")
# Top-level keys a generated report is expected to contain.
# NOTE(review): no validation against this tuple is visible in this chunk —
# presumably checked elsewhere in the module.
REQUIRED_REPORT_KEYS = (
    "branch",
    "base_commit",
    "branch_b_reference_commit",
    "dataset_dir",
    "dataset_files",
    "dataset_check",
    "scenarios",
    "overall_summary",
    "structural_mismatches",
    "known_limitations",
    "recommendation",
)

# Logical textbook label -> markdown filename inside DATASET_DIR.
TEXTBOOKS = {
    "小学语文一上": "小学_语文_统编版_义务教育教科书·语文一年级上册.md",
    "小学数学一上": "小学_数学_人教版_义务教育教科书 · 数学一年级上册.md",
    "初中语文七上": "初中_语文_统编版-人民教育出版社_七年级_义务教育教科书·语文七年级上册.md",
    "初中数学七上": "初中_数学_人教版-人民教育出版社_七年级_义务教育教科书·数学七年级上册.md",
    "高中语文必修上": "高中_语文_统编版-人民教育出版社_普通高中教科书·语文必修 上册.md",
}

# Scenario id -> the Branch B reference outcome each Branch A result is
# reported alongside; attributed to STAGE_REPORT.md at commit c734288.
BRANCH_B_REFERENCE_EXPECTATIONS = {
    "A01": {
        "summary": "Branch B stage report: PASS; 126 nodes, 166 flows on 小学语文一上 pipeline.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A02": {
        "summary": "Branch B stage report: PASS; 58 nodes, has_cn=True on 小学数学一上 mixed text.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A03": {
        "summary": "Branch B stage report: PASS; 276 nodes, 20 sedimentation traces on 初中语文七上.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A04": {
        "summary": "Branch B stage report: PASS; 294 edges, asymmetry ratio 1.00 on 初中数学七上.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A05": {
        "summary": "Branch B stage report: PASS; 397 nodes, phi range [-0.13, 0.15] on 高中语文必修上.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A06": {
        "summary": "Branch B stage report: PASS; 8 new nodes after 语文→数学 subject switch.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A07": {
        "summary": "Branch B stage report: PASS; 182/189 phi entries preserved after reset.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A08": {
        "summary": "Branch B stage report: PASS; confidence 0.333→0.889→0.381 after positive/negative feedback.",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A09": {
        "summary": "Branch B stage report: PASS; sedimentation gradient (20,4)→(20,10).",
        "source": "STAGE_REPORT.md @ c734288",
    },
    "A10": {
        "summary": "Branch B stage report: PASS; 16 snapshot fields present on real textbook state.",
        "source": "STAGE_REPORT.md @ c734288",
    },
}
88
89
90def _round(value: float) -> float:
91 return round(float(value), 4)
92
93
def _git_stdout(args: Sequence[str], fallback: str) -> str:
    """Run ``git <args>`` in REPO_ROOT and return its stripped stdout.

    Returns *fallback* when git cannot be spawned, exits non-zero, or
    succeeds but prints nothing.
    """
    try:
        completed = subprocess.run(
            ["git", *args],
            cwd=REPO_ROOT,
            check=True,
            capture_output=True,
            text=True,
        )
    # OSError covers FileNotFoundError (git missing or REPO_ROOT gone) plus
    # other spawn failures such as PermissionError; the original only caught
    # FileNotFoundError, letting those escape. CalledProcessError covers a
    # non-zero git exit because check=True.
    except (OSError, subprocess.CalledProcessError):
        return fallback
    output = completed.stdout.strip()
    return output or fallback
107
108
def _current_branch() -> str:
    """Return the checked-out branch name, with a stable review-branch fallback."""
    fallback = "review/branch-a-real-textbook-validation"
    return _git_stdout(["rev-parse", "--abbrev-ref", "HEAD"], fallback)
111
112
def dataset_file_rows() -> List[Dict[str, Any]]:
    """Describe each required textbook file and whether it exists on disk."""
    return [
        {
            "textbook": textbook,
            "filename": filename,
            "path": str(full_path := DATASET_DIR / filename),
            "exists": full_path.is_file(),
        }
        for textbook, filename in TEXTBOOKS.items()
    ]
126
127
def check_dataset() -> Dict[str, Any]:
    """Summarize dataset availability: directory presence, missing files, per-file rows."""
    file_results = dataset_file_rows()
    missing = [entry["path"] for entry in file_results if not entry["exists"]]
    has_directory = DATASET_DIR.is_dir()
    return {
        "directory_exists": has_directory,
        "all_required_files_exist": has_directory and not missing,
        "missing_paths": missing,
        "file_results": file_results,
    }
138
139
def _require_dataset() -> Dict[str, Any]:
    """Return the dataset check, raising FileNotFoundError when it is incomplete."""
    dataset_check = check_dataset()
    if not dataset_check["all_required_files_exist"]:
        missing = dataset_check["missing_paths"] or [str(DATASET_DIR)]
        raise FileNotFoundError(
            "Required textbook dataset is missing:\n" + "\n".join(missing)
        )
    return dataset_check
146
147
def load_textbook_paragraphs(name: str) -> List[str]:
    """Load cleaned paragraph lines for the textbook registered under *name*.

    Keeps stripped, non-empty lines that are not markdown headings, bold
    markers, horizontal rules, or images, are not dominated by control
    characters, and contain at least two CJK characters.
    """
    source_path = DATASET_DIR / TEXTBOOKS[name]
    kept: List[str] = []
    for raw_line in source_path.read_text(encoding="utf-8").splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        # Markdown structure and image lines are never content paragraphs.
        if candidate.startswith(("#", "**", "---", "![")):
            continue
        # Drop lines that are mostly control characters (extraction junk).
        control_count = sum(
            1 for char in candidate if ord(char) < 32 and char not in "\n\t"
        )
        if control_count > len(candidate) * 0.3:
            continue
        cjk_count = sum(1 for char in candidate if "\u4e00" <= char <= "\u9fff")
        if cjk_count >= 2:
            kept.append(candidate)
    return kept
167
168
def _slice_paragraphs(name: str, paragraph_count: int) -> List[str]:
    """Return the first *paragraph_count* cleaned paragraphs of textbook *name*.

    Raises ValueError when the textbook has too few cleaned paragraphs.
    """
    available = load_textbook_paragraphs(name)
    if len(available) < paragraph_count:
        raise ValueError(f"{name} only has {len(available)} cleaned paragraphs; need {paragraph_count}.")
    return available[:paragraph_count]
174
175
def _feed(runtime: CIERuntime, paragraphs: List[str], char_limit: int, step_n: int) -> List[str]:
    """Ingest each paragraph truncated to *char_limit* chars, stepping after each.

    Returns the truncated slices that were actually ingested.
    """
    ingested: List[str] = []
    for text in paragraphs:
        slice_text = text[:char_limit]
        runtime.ingest(slice_text)
        runtime.step(n=step_n)
        ingested.append(slice_text)
    return ingested
184
185
def _stage_counts(runtime: CIERuntime) -> Dict[str, int]:
    """Count sedimentation profiles per stage, seeding every known stage at zero."""
    tally: Dict[str, int] = dict.fromkeys(STAGE_NAMES, 0)
    for profile in runtime.state.sedimentation.values():
        # .get() tolerates stage names outside STAGE_NAMES.
        tally[profile.stage] = tally.get(profile.stage, 0) + 1
    return tally
191
192
def _phi_range(runtime: CIERuntime) -> Dict[str, float] | None:
    """Return the rounded min/max of phi values, or None when phi is empty."""
    phi_values = runtime.state.phi.values()
    if not phi_values:
        return None
    return {"min": _round(min(phi_values)), "max": _round(max(phi_values))}
198
199
def _max_abs(mapping: Dict[Any, float]) -> float:
    """Largest absolute value in *mapping*, rounded; 0.0 for an empty mapping."""
    if not mapping:
        return 0.0
    return _round(max(map(abs, mapping.values())))
204
205
def _all_finite(runtime: CIERuntime) -> bool:
    """True when every phi/mu/J/anchor_nodes value is a finite number."""
    tracked = (runtime.state.phi, runtime.state.mu, runtime.state.J, runtime.state.anchor_nodes)
    return all(
        math.isfinite(value) for mapping in tracked for value in mapping.values()
    )
212
213
214def _contains_chinese(node: str) -> bool:
215 return any("\u4e00" <= char <= "\u9fff" for char in node)
216
217
218def _contains_digit(node: str) -> bool:
219 return any(char.isdigit() for char in node)
220
221
def _top_level_snapshot_metrics(runtime: CIERuntime, snapshot: Dict[str, Any], output: str | None = None) -> Dict[str, Any]:
    """Project a runtime snapshot into the flat metrics dict shared by all scenarios.

    Copies the locked snapshot fields, adds counts derived from the snapshot
    lists, and derives phi range / stage counts / graph sizes from internal
    runtime state. ``emit_output`` is included only when *output* is given.
    """
    metrics: Dict[str, Any] = {
        "phi_summary": snapshot["phi_summary"],
        "mu_summary": snapshot["mu_summary"],
        "J_summary": snapshot["J_summary"],
        "active_region": snapshot["active_region"],
        "active_region_size": len(snapshot["active_region"]),
        "bound_ability_core": snapshot["bound_ability_core"],
        "anchor_pull": snapshot["anchor_pull"],
        "drift_score": snapshot["drift_score"],
        "free_capacity": snapshot["free_capacity"],
        "experience_regions_count": len(snapshot["experience_regions"]),
        "skill_belt_candidates_count": len(snapshot["skill_belt_candidates"]),
        "sedimentation_trace_count": len(snapshot["sedimentation_trace"]),
        "merge_events_count": len(snapshot["merge_events"]),
        "decay_events_count": len(snapshot["decay_events"]),
        "output_mode": snapshot["output_mode"],
        "feedback_effect": snapshot["feedback_effect"],
        # Derived from internal state because the snapshot has no range field.
        "phi_range": _phi_range(runtime),
        "stage_counts": _stage_counts(runtime),
        "graph_node_count": len(runtime.state.graph.nodes()),
        # len(J) counts directed flow entries — a proxy, not true graph edges.
        "graph_edge_count_proxy": len(runtime.state.J),
        "attention_usage": "NOT APPLICABLE: Branch A exposes free_capacity but not attention used/total.",
    }
    if output is not None:
        metrics["emit_output"] = output
    return metrics
249
250
def _scenario_result(
    scenario_id: str,
    title: str,
    textbooks: List[str],
    status: str,
    reason: str,
    *,
    ran_successfully: bool,
    key_metrics: Dict[str, Any],
    fairness_notes: List[str] | None = None,
    structural_mismatch: str | None = None,
) -> Dict[str, Any]:
    """Assemble one scenario record, validating *status* against the locked set.

    Raises ValueError for a status outside VALID_SCENARIO_STATUSES.
    """
    if status not in VALID_SCENARIO_STATUSES:
        raise ValueError(f"Invalid scenario status: {status}")
    record: Dict[str, Any] = {
        "scenario_id": scenario_id,
        "title": title,
        "textbook_used": textbooks,
        "dataset_files": [str(DATASET_DIR / TEXTBOOKS[name]) for name in textbooks],
        "ran_successfully": ran_successfully,
        "status": status,
        "reason": reason,
        "key_metrics": key_metrics,
        "branch_b_reference_expectation": BRANCH_B_REFERENCE_EXPECTATIONS[scenario_id],
        "fairness_notes": [] if not fairness_notes else fairness_notes,
        "structural_mismatch": structural_mismatch,
    }
    return record
278
279
def _scenario_failure(
    scenario_id: str,
    title: str,
    textbooks: List[str],
    exc: Exception,
) -> Dict[str, Any]:
    """Build a FAIL scenario record for an exception raised while running a scenario."""
    failure_reason = f"Scenario execution raised {exc.__class__.__name__}: {exc}"
    return _scenario_result(
        scenario_id,
        title,
        textbooks,
        "FAIL",
        failure_reason,
        ran_successfully=False,
        key_metrics={"exception": str(exc)},
        fairness_notes=[],
        structural_mismatch=None,
    )
297
298
def run_a01() -> Dict[str, Any]:
    """A01: end-to-end pipeline on 小学语文一上 (30 paragraphs x 50 chars, 3 steps each)."""
    runtime = CIERuntime()
    used = _feed(runtime, _slice_paragraphs("小学语文一上", 30), char_limit=50, step_n=3)
    output = runtime.emit()
    snapshot = runtime.snapshot_state()
    # PASS requires both active mu nodes and at least one J flow after the run.
    status = "PASS" if snapshot["mu_summary"]["active_count"] > 0 and snapshot["J_summary"]["edge_count"] > 0 else "FAIL"
    reason = (
        "Pipeline ran on the required real-data slice and produced non-empty phi/mu/J state."
        if status == "PASS"
        else "Pipeline did not retain active state after the required real-data slice."
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot, output)
    metrics.update(
        {
            "input_slice": {"paragraphs": 30, "chars_per_paragraph": 50, "step_n": 3},
            "used_paragraph_count": len(used),
        }
    )
    return _scenario_result(
        "A01",
        "小学语文一上 — pipeline",
        ["小学语文一上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A emit() returns a string, so output mode and active counts come from snapshot_state().",
            "Observed state is much smaller than Branch B's reference counts, but the scenario does complete end-to-end.",
        ],
        structural_mismatch=None,
    )
331
332
def run_a02() -> Dict[str, Any]:
    """A02: mixed Chinese/digit text on 小学数学一上 (20 paragraphs x 40 chars)."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("小学数学一上", 20), char_limit=40, step_n=3)
    output = runtime.emit()
    snapshot = runtime.snapshot_state()
    nodes = runtime.state.graph.nodes()
    chinese_nodes = [node for node in nodes if _contains_chinese(node)]
    digit_nodes = [node for node in nodes if _contains_digit(node)]
    # Only Chinese-bearing nodes gate PASS; digit nodes are reported, not required.
    status = "PASS" if chinese_nodes else "FAIL"
    reason = (
        "Chinese-bearing nodes exist on the mixed textbook slice; digit-bearing nodes are reported separately."
        if status == "PASS"
        else "No Chinese-bearing nodes were formed on the mixed textbook slice."
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot, output)
    metrics.update(
        {
            "input_slice": {"paragraphs": 20, "chars_per_paragraph": 40, "step_n": 3},
            "has_chinese_nodes": bool(chinese_nodes),
            "has_digit_nodes": bool(digit_nodes),
            "chinese_node_count": len(chinese_nodes),
            "digit_node_count": len(digit_nodes),
            "sample_digit_nodes": digit_nodes[:5],
        }
    )
    return _scenario_result(
        "A02",
        "小学数学一上 — mixed text",
        ["小学数学一上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A tokenizes with Unicode word regexes, so digits may be absorbed into coarse tokens or absent from this slice.",
            "The required honest report here is whether Chinese nodes exist and whether digit-bearing nodes were actually observed.",
        ],
        structural_mismatch=None,
    )
372
373
def run_a03() -> Dict[str, Any]:
    """A03: complexity/sedimentation observables on 初中语文七上 (50 paragraphs x 60 chars)."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("初中语文七上", 50), char_limit=60, step_n=3)
    snapshot = runtime.snapshot_state()
    # PASS needs both a non-empty sedimentation trace and experience regions.
    has_sedimentation = bool(snapshot["sedimentation_trace"]) and bool(snapshot["experience_regions"])
    status = "PASS" if has_sedimentation else "FAIL"
    reason = (
        "Sedimentation and experience-region observables are present on the required real-data slice."
        if status == "PASS"
        else "Sedimentation observables did not materialize on the required real-data slice."
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot)
    metrics.update({"input_slice": {"paragraphs": 50, "chars_per_paragraph": 60, "step_n": 3}})
    return _scenario_result(
        "A03",
        "初中语文七上 — complexity / sedimentation",
        ["初中语文七上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A exposes sedimentation_trace and experience_regions, but its tokenized graph remains much smaller than Branch B's reference run.",
            "sedimentation_trace is capped, so count saturation is expected and should not be over-interpreted.",
        ],
        structural_mismatch=None,
    )
401
402
def run_a04() -> Dict[str, Any]:
    """A04: formula/structure on 初中数学七上 — reported as a structural mismatch.

    Branch B's asymmetry-ratio metric is not directly comparable, so this
    scenario always returns STRUCTURAL MISMATCH with a J-flow proxy metric.
    """
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("初中数学七上", 30), char_limit=50, step_n=3)
    snapshot = runtime.snapshot_state()
    # Per-pair normalized |forward - backward| over directed J flows; 1e-9
    # floors the denominator to avoid division by zero.
    asymmetry_proxy = []
    for (left, right), value in runtime.state.J.items():
        reverse = runtime.state.J.get((right, left), 0.0)
        asymmetry_proxy.append(abs(value - reverse) / max(value, reverse, 1e-9))
    metrics = _top_level_snapshot_metrics(runtime, snapshot)
    metrics.update(
        {
            "input_slice": {"paragraphs": 30, "chars_per_paragraph": 50, "step_n": 3},
            "directed_flow_asymmetry_proxy_avg": _round(sum(asymmetry_proxy) / len(asymmetry_proxy)) if asymmetry_proxy else 0.0,
            "top_flows": snapshot["J_summary"]["top_flows"],
        }
    )
    mismatch = (
        "Branch B's A04 metric is based on forward/backward graph edge weights. Branch A only exposes directed J flow, "
        "not a directly comparable directed graph-edge surface, so a fair asymmetry-ratio comparison is a structural mismatch."
    )
    return _scenario_result(
        "A04",
        "初中数学七上 — formula / structure",
        ["初中数学七上"],
        "STRUCTURAL MISMATCH",
        "The scenario ran, but the primary Branch B asymmetry-ratio metric does not map cleanly onto Branch A.",
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Directed J flow can be described, but it is not the same observable as Branch B's directed graph edge weights.",
            "Using the J proxy as if it were the same metric would overstate comparability.",
        ],
        structural_mismatch=mismatch,
    )
437
438
def run_a05() -> Dict[str, Any]:
    """A05: long-text numeric stability on 高中语文必修上 (80 paragraphs x 80 chars)."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("高中语文必修上", 80), char_limit=80, step_n=2)
    snapshot = runtime.snapshot_state()
    finite = _all_finite(runtime)
    # Divergence heuristic: any non-finite value, or any |value| beyond 1000
    # in phi/mu/J, counts as an obvious overflow symptom.
    obvious_divergence = (
        not finite
        or _max_abs(runtime.state.phi) > 1000.0
        or _max_abs(runtime.state.mu) > 1000.0
        or _max_abs(runtime.state.J) > 1000.0
    )
    status = "PASS" if not obvious_divergence else "FAIL"
    reason = (
        "Long-text run stayed finite and showed no obvious overflow/divergence symptom."
        if status == "PASS"
        else "Long-text run showed non-finite values or obvious divergence."
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot)
    metrics.update(
        {
            "input_slice": {"paragraphs": 80, "chars_per_paragraph": 80, "step_n": 2},
            "all_finite": finite,
            "max_abs_phi": _max_abs(runtime.state.phi),
            "max_abs_mu": _max_abs(runtime.state.mu),
            "max_abs_J": _max_abs(runtime.state.J),
        }
    )
    return _scenario_result(
        "A05",
        "高中语文必修上 — long text stability",
        ["高中语文必修上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A does not expose attention.used/total; free_capacity is the closest locked observable.",
            "phi min/max are derived from runtime.state.phi because Branch A's snapshot summary does not include range fields.",
        ],
        structural_mismatch=None,
    )
480
481
def run_a06() -> Dict[str, Any]:
    """A06: cross-subject transfer — 语文 then 数学 with no reset in between."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("小学语文一上", 15), char_limit=40, step_n=3)
    before = runtime.snapshot_state()
    before_active = set(before["active_region"])
    _feed(runtime, _slice_paragraphs("小学数学一上", 15), char_limit=40, step_n=3)
    after = runtime.snapshot_state()
    after_active = set(after["active_region"])
    new_nodes = sorted(after_active - before_active)
    # "Preserved" = previously-active nodes whose phi is still non-negligible.
    preserved = sum(1 for node in before_active if abs(runtime.state.phi.get(node, 0.0)) > 0.001)
    # PASS needs both migration (new nodes) and persistence (surviving phi).
    status = "PASS" if new_nodes and preserved > 0 else "FAIL"
    reason = (
        "Active region changes under subject switch while some earlier structures remain alive."
        if status == "PASS"
        else "Subject switch did not show both migration and persistence under the required schedule."
    )
    metrics = _top_level_snapshot_metrics(runtime, after)
    metrics.update(
        {
            "input_slice": {
                "phase_1": {"textbook": "小学语文一上", "paragraphs": 15, "chars_per_paragraph": 40, "step_n": 3},
                "phase_2": {"textbook": "小学数学一上", "paragraphs": 15, "chars_per_paragraph": 40, "step_n": 3},
            },
            "active_region_before": sorted(before_active),
            "active_region_after": sorted(after_active),
            "new_active_nodes_after_switch": new_nodes,
            "preserved_prior_active_phi_count": preserved,
        }
    )
    return _scenario_result(
        "A06",
        "cross-subject transfer",
        ["小学语文一上", "小学数学一上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "This mirrors Branch B's no-reset subject switch. Branch A does show migration, but on a much smaller token set.",
        ],
        structural_mismatch=None,
    )
524
525
def run_a07() -> Dict[str, Any]:
    """A07: reset_session() must clear session state but keep long-term structure."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("初中语文七上", 30), char_limit=50, step_n=3)
    # Capture pre-reset long-term state for exact-preservation comparison.
    phi_before = dict(runtime.state.phi)
    j_before = dict(runtime.state.J)
    graph_node_count_before = len(runtime.state.graph.nodes())
    stage_counts_before = _stage_counts(runtime)
    runtime.reset_session()
    snapshot = runtime.snapshot_state()
    # Exact (==) preservation: every phi entry must survive the reset unchanged.
    preserved_phi_entries = sum(
        1 for node, value in phi_before.items() if runtime.state.phi.get(node) == value
    )
    # PASS = session activation fully cleared AND phi fully preserved.
    status = "PASS" if snapshot["mu_summary"]["active_count"] == 0 and not snapshot["active_region"] and preserved_phi_entries == len(phi_before) else "FAIL"
    reason = (
        "reset_session() clears session activation while preserving long-term graph/potential structure."
        if status == "PASS"
        else "reset_session() did not cleanly separate session state from long-term structure."
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot)
    metrics.update(
        {
            "input_slice": {"paragraphs": 30, "chars_per_paragraph": 50, "step_n": 3},
            "phi_entries_before_reset": len(phi_before),
            "phi_entries_preserved_exactly": preserved_phi_entries,
            "J_entries_before_reset": len(j_before),
            "J_entries_after_reset": len(runtime.state.J),
            "graph_nodes_before_reset": graph_node_count_before,
            "graph_nodes_after_reset": len(runtime.state.graph.nodes()),
            "stage_counts_before_reset": stage_counts_before,
            "stage_counts_after_reset": _stage_counts(runtime),
        }
    )
    return _scenario_result(
        "A07",
        "session reset preserves long-term structure",
        ["初中语文七上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "This is one of Branch A's clearer matched wins: session clearing and long-term retention separate cleanly.",
        ],
        structural_mismatch=None,
    )
571
572
def run_a08() -> Dict[str, Any]:
    """A08: multi-round feedback — 5 positive rounds then 1 negative on one target.

    Feedback is queued by commit_feedback() and applied on the next step, so
    every feedback call is followed by runtime.step() before observation.
    """
    runtime = CIERuntime()
    paragraph = _slice_paragraphs("小学语文一上", 1)[0][:30]
    runtime.ingest(paragraph)
    runtime.step(n=5)
    output = runtime.emit()
    snapshot = runtime.snapshot_state()
    # Feedback target = first active-region node, if any emerged.
    target = snapshot["active_region"][0] if snapshot["active_region"] else None
    metrics = _top_level_snapshot_metrics(runtime, snapshot, output)
    metrics.update({"input_slice": {"paragraphs": 1, "chars_per_paragraph": 30, "step_n": 5}, "feedback_target": target})
    # Early FAIL exit: no target (or an idle emit) means the loop cannot run.
    if not target or output == "minimal: idle":
        return _scenario_result(
            "A08",
            "multi-round feedback",
            ["小学语文一上"],
            "FAIL",
            "emit() returned no activated output target on the required slice, so the positive/negative feedback loop could not be meaningfully exercised.",
            ran_successfully=True,
            key_metrics=metrics,
            fairness_notes=[
                "This is reported as a real Branch A failure, not normalized away.",
                "Branch A feedback is queued and applied on the next step, but that did not matter here because no target emerged.",
            ],
            structural_mismatch=None,
        )

    initial = {
        "phi": _round(runtime.state.phi.get(target, 0.0)),
        "mu": _round(runtime.state.mu.get(target, 0.0)),
    }
    positive_rounds = []
    for round_index in range(5):
        runtime.commit_feedback({"text": target, "value": 1.0})
        runtime.step()
        positive_rounds.append(
            {
                "round": round_index + 1,
                "phi": _round(runtime.state.phi.get(target, 0.0)),
                "mu": _round(runtime.state.mu.get(target, 0.0)),
                "feedback_effect": dict(runtime.state.feedback_effect),
            }
        )
    # One negative round after the positives.
    runtime.commit_feedback({"text": target, "value": -0.5})
    runtime.step()
    negative_round = {
        "phi": _round(runtime.state.phi.get(target, 0.0)),
        "mu": _round(runtime.state.mu.get(target, 0.0)),
        "feedback_effect": dict(runtime.state.feedback_effect),
    }
    # PASS = positives did not weaken the target AND the negative did not strengthen it.
    positive_improves = positive_rounds[-1]["phi"] >= initial["phi"] and positive_rounds[-1]["mu"] >= initial["mu"]
    negative_reduces = negative_round["phi"] <= positive_rounds[-1]["phi"] and negative_round["mu"] <= positive_rounds[-1]["mu"]
    status = "PASS" if positive_improves and negative_reduces else "FAIL"
    reason = (
        "Positive rounds strengthened the chosen target and the negative round weakened it."
        if status == "PASS"
        else "Feedback rounds did not show the expected positive-then-negative observable change."
    )
    metrics.update(
        {
            "initial_target_state": initial,
            "positive_rounds": positive_rounds,
            "negative_round": negative_round,
        }
    )
    return _scenario_result(
        "A08",
        "multi-round feedback",
        ["小学语文一上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A feedback is asynchronous: commit_feedback() queues a signal that is applied on the next step.",
        ],
        structural_mismatch=None,
    )
650
651
def run_a09() -> Dict[str, Any]:
    """A09: incremental sedimentation — same 10 paragraphs re-fed for 5 rounds."""
    runtime = CIERuntime()
    paragraphs = _slice_paragraphs("小学语文一上", 10)
    round_history = []
    for round_index in range(5):
        _feed(runtime, paragraphs, char_limit=30, step_n=3)
        snapshot = runtime.snapshot_state()
        round_history.append(
            {
                "round": round_index + 1,
                "sedimentation_trace_count": len(snapshot["sedimentation_trace"]),
                "experience_regions_count": len(snapshot["experience_regions"]),
                "skill_belt_candidates_count": len(snapshot["skill_belt_candidates"]),
                "phi_node_count": snapshot["phi_summary"]["node_count"],
                "active_count": snapshot["mu_summary"]["active_count"],
                "stage_counts": _stage_counts(runtime),
            }
        )
    # Progress = growth in the two most-sedimented stages between first and
    # last round, or any intermediate round exceeding round 1's skill_belt.
    initial_complexity = round_history[0]["stage_counts"]["skill_belt"] + round_history[0]["stage_counts"]["ability_core"]
    final_complexity = round_history[-1]["stage_counts"]["skill_belt"] + round_history[-1]["stage_counts"]["ability_core"]
    progressed = final_complexity > initial_complexity or any(
        round_entry["stage_counts"]["skill_belt"] > round_history[0]["stage_counts"]["skill_belt"]
        for round_entry in round_history[1:]
    )
    status = "PASS" if progressed else "FAIL"
    reason = (
        "Repeated rounds show incremental stage progression, even though several observable lists are capped."
        if status == "PASS"
        else "Repeated rounds did not show incremental sedimentation progression."
    )
    final_snapshot = runtime.snapshot_state()
    metrics = _top_level_snapshot_metrics(runtime, final_snapshot)
    metrics.update(
        {
            "input_slice": {"paragraphs": 10, "chars_per_paragraph": 30, "step_n": 3, "rounds": 5},
            "round_history": round_history,
        }
    )
    return _scenario_result(
        "A09",
        "incremental sedimentation",
        ["小学语文一上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "sedimentation_trace and skill_belt_candidates are capped lists in Branch A, so stage_counts are the more honest growth indicator here.",
        ],
        structural_mismatch=None,
    )
703
704
def run_a10() -> Dict[str, Any]:
    """A10: every locked snapshot key must be present on real-textbook-driven state."""
    runtime = CIERuntime()
    _feed(runtime, _slice_paragraphs("初中数学七上", 20), char_limit=40, step_n=3)
    output = runtime.emit()
    runtime.step()
    runtime.commit_feedback({"text": "validation", "value": 0.2})
    # Extra step so the queued feedback becomes observable in the snapshot.
    runtime.step()
    snapshot = runtime.snapshot_state()
    missing = sorted(REQUIRED_SNAPSHOT_KEYS.difference(snapshot))
    status = "PASS" if not missing else "FAIL"
    reason = (
        "All Branch A locked snapshot fields are present on real textbook-driven state."
        if status == "PASS"
        else f"Snapshot is missing required locked fields: {missing}"
    )
    metrics = _top_level_snapshot_metrics(runtime, snapshot, output)
    metrics.update(
        {
            "input_slice": {"paragraphs": 20, "chars_per_paragraph": 40, "step_n": 3},
            "required_snapshot_keys": sorted(REQUIRED_SNAPSHOT_KEYS),
            "observed_snapshot_keys": sorted(snapshot),
            "missing_snapshot_keys": missing,
        }
    )
    return _scenario_result(
        "A10",
        "snapshot completeness on real textbook input",
        ["初中数学七上"],
        status,
        reason,
        ran_successfully=True,
        key_metrics=metrics,
        fairness_notes=[
            "Branch A needs one extra step after feedback to observe the applied feedback_effect because feedback is queued.",
            "Branch B's report mentions 16 fields including attention, but Branch A's locked comparable surface is the 15-field spec set.",
        ],
        structural_mismatch=None,
    )
743
744
# Ordered scenario registry. The id/title/textbooks entries mirror each
# runner's own _scenario_result call; _run_scenarios uses them to build a
# failure record when a runner raises.
SCENARIOS: List[Dict[str, Any]] = [
    {"id": "A01", "title": "小学语文一上 — pipeline", "textbooks": ["小学语文一上"], "runner": run_a01},
    {"id": "A02", "title": "小学数学一上 — mixed text", "textbooks": ["小学数学一上"], "runner": run_a02},
    {"id": "A03", "title": "初中语文七上 — complexity / sedimentation", "textbooks": ["初中语文七上"], "runner": run_a03},
    {"id": "A04", "title": "初中数学七上 — formula / structure", "textbooks": ["初中数学七上"], "runner": run_a04},
    {"id": "A05", "title": "高中语文必修上 — long text stability", "textbooks": ["高中语文必修上"], "runner": run_a05},
    {"id": "A06", "title": "cross-subject transfer", "textbooks": ["小学语文一上", "小学数学一上"], "runner": run_a06},
    {"id": "A07", "title": "session reset preserves long-term structure", "textbooks": ["初中语文七上"], "runner": run_a07},
    {"id": "A08", "title": "multi-round feedback", "textbooks": ["小学语文一上"], "runner": run_a08},
    {"id": "A09", "title": "incremental sedimentation", "textbooks": ["小学语文一上"], "runner": run_a09},
    {"id": "A10", "title": "snapshot completeness on real textbook input", "textbooks": ["初中数学七上"], "runner": run_a10},
]
757
758
def _run_scenarios() -> List[Dict[str, Any]]:
    """Execute every registered scenario, converting exceptions to FAIL records."""
    outcomes: List[Dict[str, Any]] = []
    for spec in SCENARIOS:
        try:
            outcome = spec["runner"]()
        except Exception as exc:  # broad on purpose: any failure becomes a record
            outcome = _scenario_failure(spec["id"], spec["title"], spec["textbooks"], exc)
        outcomes.append(outcome)
    return outcomes
767
768
def _status_counts(scenarios: List[Dict[str, Any]]) -> Dict[str, int]:
    """Tally scenario statuses, seeding every valid status at zero."""
    tally = dict.fromkeys(VALID_SCENARIO_STATUSES, 0)
    for entry in scenarios:
        # .get() tolerates statuses outside the seeded vocabulary.
        tally[entry["status"]] = tally.get(entry["status"], 0) + 1
    return tally
774
775
def _collect_structural_mismatches(scenarios: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Gather per-scenario structural mismatches plus the fixed cross-cutting ones."""
    mismatches: List[Dict[str, Any]] = []
    for scenario in scenarios:
        if scenario["structural_mismatch"]:
            mismatches.append(
                {
                    "scenario_id": scenario["scenario_id"],
                    "title": scenario["title"],
                    "detail": scenario["structural_mismatch"],
                }
            )
    # Always-reported surface mismatches, independent of any single run's outcome.
    mismatches.extend(
        [
            {
                "scenario_id": "A01/A08",
                "title": "emit surface mismatch",
                "detail": "Branch A emit() returns a plain string, not Branch B's structured payload with activated nodes and active_count.",
            },
            {
                "scenario_id": "A05/A10",
                "title": "attention surface mismatch",
                "detail": "Branch A does not expose attention used/total. free_capacity is the locked comparable field instead.",
            },
            {
                "scenario_id": "A08/A10",
                "title": "feedback timing mismatch",
                "detail": "Branch A commit_feedback() is queued and becomes observable on the next step, unlike Branch B's more immediate feedback probes.",
            },
        ]
    )
    return mismatches
807
808
def _overall_summary(scenarios: List[Dict[str, Any]], dataset_check: Dict[str, Any]) -> Dict[str, Any]:
    """Aggregate status counts and the fixed top-level verdict for the report."""
    counts = _status_counts(scenarios)
    ran_successfully = sum(1 for scenario in scenarios if scenario["ran_successfully"])
    return {
        "scenario_count": len(scenarios),
        "ran_successfully_count": ran_successfully,
        "status_counts": counts,
        # True only when the full dataset was present and all 10 scenarios were attempted.
        "fairness_gap_reduced": bool(dataset_check["all_required_files_exist"] and len(scenarios) == 10),
        "materially_changes_previous_ab_conclusion": False,
        "summary": (
            "Branch A was run on the same 5 real textbooks and A01-A10 scenario family used by Branch B. "
            "Eight scenarios passed on Branch A's own observable surface, A04 is a structural mismatch, and A08 failed on the required slice."
        ),
    }
823
824
def _known_limitations() -> List[str]:
    """Return the fixed list of known Branch A limitations quoted in the report."""
    return [
        "Branch A tokenizes each ingest call with a Unicode word regex and keeps at most 8 tokens, so long textbook slices compress into coarse tokens instead of Branch B's character/bigram view.",
        "Branch A does not expose attention.used/attention.total; free_capacity is the closest locked observable, and phi min/max must be derived from internal state for diagnostics.",
        "Branch A emit() is string-only and commit_feedback() is asynchronous, so some Branch B feedback/output probes can only be approximated, not matched exactly.",
        "The first cleaned textbook paragraphs include front matter and publishing metadata; this is shared with Branch B's slice definition but is amplified by Branch A's coarse tokenization.",
    ]
832
833
def _recommendation() -> Dict[str, str]:
    """Return the fixed merge-decision recommendation embedded in the report."""
    return {
        "decision": "enough to proceed with merge decision",
        "reason": (
            "The main A/B fairness gap was the unmatched real-data harness. This validation closes that gap enough to make a merge decision on current evidence. "
            "The remaining issues are explicit Branch A results: one failed scenario (A08) and one true structural mismatch (A04), not hidden harness differences."
        ),
    }
842
843
844def _result_textbooks(result: Dict[str, Any]) -> str:
845 return ", ".join(result["textbook_used"])
846
847
848def _branch_a_observed_summary(result: Dict[str, Any]) -> str:
849 metrics = result["key_metrics"]
850 if result["scenario_id"] == "A04":
851 return (
852 f"phi={metrics['phi_summary']['node_count']}, J={metrics['J_summary']['edge_count']}, "
853 f"flow-asym-proxy={metrics['directed_flow_asymmetry_proxy_avg']}"
854 )
855 if result["scenario_id"] == "A08":
856 return f"mode={metrics['output_mode']}, emit={metrics.get('emit_output', '')}, active={metrics['mu_summary']['active_count']}"
857 return (
858 f"phi={metrics['phi_summary']['node_count']}, mu={metrics['mu_summary']['active_count']}, "
859 f"J={metrics['J_summary']['edge_count']}, mode={metrics['output_mode']}"
860 )
861
862
def _render_markdown(report: Dict[str, Any]) -> str:
    """Render the full validation report as the Markdown document written to disk.

    Sections, ordering, and every literal string are part of the report
    contract; the result ends with a trailing newline element so the joined
    document terminates with "\\n".
    """
    checks = report["dataset_check"]
    out: List[str] = [
        "# Branch A Real Textbook Validation",
        "",
        "## Purpose",
        "Run Branch A on the same 5 real textbook files and the same A01-A10 real-data scenario family used by Branch B, then report the result honestly without changing Branch A runtime behavior.",
        "",
        "## Base Commits",
        f"- Branch A base commit: `{report['base_commit']}`",
        f"- Branch B reference commit: `{report['branch_b_reference_commit']}` (`{BRANCH_B_REFERENCE_LABEL}`)",
        f"- Branch under test: `{report['branch']}`",
        "",
        "## Dataset Path And File Check",
        f"- Dataset path: `{report['dataset_dir']}`",
        f"- Directory exists: `{checks['directory_exists']}`",
        f"- All 5 required files present: `{checks['all_required_files_exist']}`",
    ]
    for file_row in report["dataset_files"]:
        marker = "OK" if file_row["exists"] else "MISSING"
        out.append(f"- {marker} `{file_row['path']}`")
    out.append("")
    out.append("## Scenario Results")
    out.append("| ID | Textbook | Status | Branch B Reference | Branch A Observed | Reason |")
    out.append("| --- | --- | --- | --- | --- | --- |")
    for scenario in report["scenarios"]:
        out.append(
            f"| {scenario['scenario_id']} | {_result_textbooks(scenario)} | {scenario['status']} | "
            f"{scenario['branch_b_reference_expectation']['summary']} | {_branch_a_observed_summary(scenario)} | {scenario['reason']} |"
        )
    out.append("")
    out.append("## Explicit Structural Mismatch")
    for mismatch in report["structural_mismatches"]:
        out.append(f"- `{mismatch['scenario_id']}`: {mismatch['detail']}")
    out += [
        "",
        "## Concise Fairness Interpretation",
        "- This run materially reduces the main A/B fairness gap because Branch A was executed on the same dataset, same file set, and same A01-A10 slice family as Branch B.",
        "- It does not erase Branch A's current disadvantages: A08 fails on the mandated slice, A04 is not directly comparable, and most Branch A state sizes remain much smaller than Branch B's reference values.",
        "",
        "## Does This Reduce The Main A/B Fairness Gap?",
        "- Yes. The earlier fairness concern was unmatched real-data coverage. That concern is now materially reduced because Branch A was run on the same real textbooks and scenario family.",
        "",
        "## Recommendation",
        f"- Decision: `{report['recommendation']['decision']}`",
        f"- Reason: {report['recommendation']['reason']}",
        "",
    ]
    return "\n".join(out)
916
917
918def _render_review(report: Dict[str, Any]) -> str:
919 passed = [scenario["scenario_id"] for scenario in report["scenarios"] if scenario["status"] == "PASS"]
920 failed = [scenario["scenario_id"] for scenario in report["scenarios"] if scenario["status"] == "FAIL"]
921 mismatched = [scenario["scenario_id"] for scenario in report["scenarios"] if scenario["status"] == "STRUCTURAL MISMATCH"]
922 lines = [
923 "# Review: Branch A Real Textbook Validation",
924 "",
925 "## What Was Run",
926 f"- Branch A base commit `{report['base_commit']}` on branch `{report['branch']}`.",
927 f"- Branch B reference commit `{report['branch_b_reference_commit']}` for dataset/scenario parity.",
928 f"- Same dataset directory: `{report['dataset_dir']}` with the exact 5 textbook files required by Branch B.",
929 f"- Same real-data scenario family: A01-A10.",
930 "",
931 "## Outcome",
932 f"- Succeeded: {', '.join(passed) if passed else 'none'}",
933 f"- Failed: {', '.join(failed) if failed else 'none'}",
934 f"- Structurally not comparable: {', '.join(mismatched) if mismatched else 'none'}",
935 "",
936 "## Decision Readout",
937 "- The matched real-textbook run materially reduces the earlier fairness gap.",
938 "- It does not materially change a conclusion that Branch B currently has broader and cleaner real-data validation coverage.",
939 f"- Recommendation: `{report['recommendation']['decision']}`",
940 f"- Rationale: {report['recommendation']['reason']}",
941 "",
942 ]
943 return "\n".join(lines)
944
945
def generate_validation_report(
    json_path: Path | str = DEFAULT_JSON_REPORT_PATH,
    markdown_path: Path | str = DEFAULT_MARKDOWN_REPORT_PATH,
    review_path: Path | str = DEFAULT_REVIEW_REPORT_PATH,
) -> Dict[str, Any]:
    """Run the full validation and write the JSON, Markdown, and review artifacts.

    Parameters are the three output paths (str or Path). Parent directories
    are created as needed. Returns the report dict that was serialized to
    ``json_path``. Raises RuntimeError when the assembled report keys deviate
    from REQUIRED_REPORT_KEYS — a guard against accidental schema drift,
    checked before anything is written to disk.
    """
    dataset_check = _require_dataset()
    scenario_results = _run_scenarios()
    # Key insertion order below is part of the contract: it must equal
    # REQUIRED_REPORT_KEYS exactly or the guard raises.
    report: Dict[str, Any] = {
        "branch": _current_branch(),
        "base_commit": BASE_COMMIT,
        "branch_b_reference_commit": BRANCH_B_REFERENCE_COMMIT,
        "dataset_dir": str(DATASET_DIR),
        "dataset_files": dataset_file_rows(),
        "dataset_check": dataset_check,
        "scenarios": scenario_results,
        "overall_summary": _overall_summary(scenario_results, dataset_check),
        "structural_mismatches": _collect_structural_mismatches(scenario_results),
        "known_limitations": _known_limitations(),
        "recommendation": _recommendation(),
    }
    if tuple(report) != REQUIRED_REPORT_KEYS:
        raise RuntimeError(f"Unexpected report key order: {tuple(report)!r}")

    json_target = Path(json_path)
    markdown_target = Path(markdown_path)
    review_target = Path(review_path)
    for target in (json_target, markdown_target, review_target):
        target.parent.mkdir(parents=True, exist_ok=True)
    json_target.write_text(json.dumps(report, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    markdown_target.write_text(_render_markdown(report), encoding="utf-8")
    review_target.write_text(_render_review(report), encoding="utf-8")
    return report
979
980
def _parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse CLI flags selecting the three report output paths."""
    parser = argparse.ArgumentParser(
        description="Branch A real-textbook validation against Branch B dataset/scenarios."
    )
    # Each flag defaults to the module-level report location.
    for flag, default_path in (
        ("--json-out", DEFAULT_JSON_REPORT_PATH),
        ("--markdown-out", DEFAULT_MARKDOWN_REPORT_PATH),
        ("--review-out", DEFAULT_REVIEW_REPORT_PATH),
    ):
        parser.add_argument(flag, default=str(default_path))
    return parser.parse_args(argv)
987
988
def main(argv: Sequence[str] | None = None) -> int:
    """CLI entry point: generate all reports and return the process exit code."""
    if argv is None:
        argv = sys.argv[1:]
    args = _parse_args(argv)
    generate_validation_report(args.json_out, args.markdown_out, args.review_out)
    return 0
993
994
if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return value, same as the
    # explicit raise form.
    sys.exit(main())