- commit
- c110916
- parent
- c734288
- author
- codex@macbookpro
- date
- 2026-03-31 20:23:54 +0800 CST
review: strict rerun branch-b formal validation on cie-datasets
4 files changed,
+2329,
-0
1@@ -0,0 +1,1370 @@
2+{
3+ "claude_claim_commit": "379feb2ed4324020ee48a97a6edb8ec99ce39f1a",
4+ "claude_claim_commit_exists": true,
5+ "chosen_runnable_branch_b_commit": "c7342881bb2ebfa5e7f927c91a7806416288573b",
6+ "dataset_repo_status": {
7+ "path": "/Users/george/code/cie-datasets",
8+ "is_git_repo": true,
9+ "git_lfs_version": "git-lfs/3.7.1 (GitHub; darwin arm64; go 1.25.3)",
10+ "status_short": [],
11+ "lfs_required_entries": {},
12+ "remote_v": [
13+ "origin\tgit@github.com:imwower/cie-datasets.git (fetch)",
14+ "origin\tgit@github.com:imwower/cie-datasets.git (push)"
15+ ]
16+ },
17+ "required_dataset_paths": [
18+ {
19+ "stage": "小学",
20+ "subject": "语文",
21+ "relative_path": "splits/by_stage_subject/小学/语文.jsonl",
22+ "path": "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/小学/语文.jsonl",
23+ "pre_pull_worktree_state": "pointer_stub",
24+ "exists": true,
25+ "is_pointer_stub_now": false,
26+ "hydrated": true,
27+ "line_count": 1597,
28+ "first_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:小学_语文_统编版_义务教育教科书·语文一年级上册:1\", \"lines\": [\"[非正文页]\"], \"error\": null, \"page\":",
29+ "second_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:小学_语文_统编版_义务教育教科书·语文一年级上册:2\", \"lines\": [\"[非正文页]\"], \"error\": null, \"page\":"
30+ },
31+ {
32+ "stage": "小学",
33+ "subject": "数学",
34+ "relative_path": "splits/by_stage_subject/小学/数学.jsonl",
35+ "path": "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/小学/数学.jsonl",
36+ "pre_pull_worktree_state": "pointer_stub",
37+ "exists": true,
38+ "is_pointer_stub_now": false,
39+ "hydrated": true,
40+ "line_count": 7459,
41+ "first_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:小学_数学_人教版_义务教育教科书 · 数学一年级上册:1\", \"lines\": [\"[非正文页]\"], \"error\": null, \"page",
42+ "second_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:小学_数学_人教版_义务教育教科书 · 数学一年级上册:2\", \"lines\": [\"[非正文页]\"], \"error\": null, \"page"
43+ },
44+ {
45+ "stage": "小学",
46+ "subject": "科学",
47+ "relative_path": "splits/by_stage_subject/小学/科学.jsonl",
48+ "path": "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/小学/科学.jsonl",
49+ "pre_pull_worktree_state": "pointer_stub",
50+ "exists": true,
51+ "is_pointer_stub_now": false,
52+ "hydrated": true,
53+ "line_count": 5032,
54+ "first_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:小学_科学_人教鄂教版_义务教育教科书·科学一年级上册:1\", \"lines\": [\"[非正文页]\"], \"error\": null, \"page",
55+ "second_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:小学_科学_人教鄂教版_义务教育教科书·科学一年级上册:2\", \"lines\": [\"[非正文页]\"], \"error\": null, \"page"
56+ },
57+ {
58+ "stage": "初中",
59+ "subject": "语文",
60+ "relative_path": "splits/by_stage_subject/初中/语文.jsonl",
61+ "path": "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/初中/语文.jsonl",
62+ "pre_pull_worktree_state": "pointer_stub",
63+ "exists": true,
64+ "is_pointer_stub_now": false,
65+ "hydrated": true,
66+ "line_count": 961,
67+ "first_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:初中_语文_统编版-人民教育出版社_七年级_义务教育教科书·语文七年级上册:1\", \"lines\": [\"[非正文页]\"], \"error\": n",
68+ "second_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:初中_语文_统编版-人民教育出版社_七年级_义务教育教科书·语文七年级上册:2\", \"lines\": [\"[非正文页]\"], \"error\": n"
69+ },
70+ {
71+ "stage": "初中",
72+ "subject": "数学",
73+ "relative_path": "splits/by_stage_subject/初中/数学.jsonl",
74+ "path": "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/初中/数学.jsonl",
75+ "pre_pull_worktree_state": "pointer_stub",
76+ "exists": true,
77+ "is_pointer_stub_now": false,
78+ "hydrated": true,
79+ "line_count": 5559,
80+ "first_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:初中_数学_人教版-人民教育出版社_七年级_义务教育教科书·数学七年级上册:1\", \"lines\": [\"[非正文页]\"], \"error\": n",
81+ "second_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:初中_数学_人教版-人民教育出版社_七年级_义务教育教科书·数学七年级上册:2\", \"lines\": [\"[非正文页]\"], \"error\": n"
82+ },
83+ {
84+ "stage": "高中",
85+ "subject": "语文",
86+ "relative_path": "splits/by_stage_subject/高中/语文.jsonl",
87+ "path": "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/高中/语文.jsonl",
88+ "pre_pull_worktree_state": "pointer_stub",
89+ "exists": true,
90+ "is_pointer_stub_now": false,
91+ "hydrated": true,
92+ "line_count": 694,
93+ "first_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:高中_语文_统编版-人民教育出版社_普通高中教科书·语文必修 上册:1\", \"lines\": [\"[非正文页]\"], \"error\": null,",
94+ "second_line_preview": "{\"concept\": \"教材非正文页\", \"layer\": \"boundary\", \"provider\": \"china-text-book-md\", \"model\": \"md_textbook_rules_v1\", \"timestamp\": \"md:高中_语文_统编版-人民教育出版社_普通高中教科书·语文必修 上册:2\", \"lines\": [\"[非正文页]\"], \"error\": null,"
95+ }
96+ ],
97+ "hydration_status": {
98+ "status": "SUCCESS",
99+ "all_required_files_hydrated": true
100+ },
101+ "script_used": "strict_rerun_port",
102+ "script_provenance": {
103+ "source_commit": "379feb2ed4324020ee48a97a6edb8ec99ce39f1a",
104+ "source_path": "tests/formal_validation.py",
105+ "port_path": "tests/formal_validation_strict_rerun.py",
106+ "port_rationale": "origin/branch-b is the runnable Branch B source tree, but it does not contain tests/formal_validation.py from 379feb2."
107+ },
108+ "compatibility_deviations": [
109+ {
110+ "type": "script_path",
111+ "detail": "Ported logic from 379feb2:tests/formal_validation.py into tests/formal_validation_strict_rerun.py because origin/branch-b lacks the original file."
112+ },
113+ {
114+ "type": "report_output",
115+ "detail": "Writes the strict rerun audit outputs to the required reports/ and reviews/ files instead of /tmp/formal_val_results.json, creating those parent directories on the runnable Branch B ref because they do not exist on origin/branch-b."
116+ },
117+ {
118+ "type": "structured_metadata",
119+ "detail": "Adds dataset hydration verification and structured comparison metadata; the validation ingest/step/emit logic and pass criteria remain aligned with 379feb2."
120+ }
121+ ],
122+ "rerun_performed": true,
123+ "rerun_command": "python3 tests/formal_validation_strict_rerun.py",
124+ "per_subject_results": [
125+ {
126+ "name": "小学语文_pipeline+stability",
127+ "type": "subject_pipeline",
128+ "stage": "小学",
129+ "subject": "语文",
130+ "input_files": [
131+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/小学/语文.jsonl"
132+ ],
133+ "source_files_pre_pull_state": [
134+ "pointer_stub"
135+ ],
136+ "line_count": 1597,
137+ "loaded_records": 300,
138+ "fed_records": 200,
139+ "available_content_records_seen_before_cap": 300,
140+ "available_valid_text_records_seen_before_cap": 300,
141+ "node_count": 1268,
142+ "edge_count": 6338,
143+ "anchor_count": 62,
144+ "core_count": 17,
145+ "experience_region_count": 1233,
146+ "skill_belt_candidate_count": 1141,
147+ "sedimentation_events": 2374,
148+ "merge_events": 9331,
149+ "decay_events": 8378,
150+ "phi_range": [
151+ -0.113,
152+ 0.598
153+ ],
154+ "top_words": [
155+ {
156+ "phrase": "课文",
157+ "ratio": 2706.0
158+ },
159+ {
160+ "phrase": "本文",
161+ "ratio": 2633.3
162+ },
163+ {
164+ "phrase": "改动",
165+ "ratio": 2037.2
166+ },
167+ {
168+ "phrase": "有改",
169+ "ratio": 2003.8
170+ },
171+ {
172+ "phrase": "什么",
173+ "ratio": 1989.9
174+ },
175+ {
176+ "phrase": "作者",
177+ "ratio": 1845.4
178+ },
179+ {
180+ "phrase": "文时",
181+ "ratio": 1843.6
182+ },
183+ {
184+ "phrase": "作课",
185+ "ratio": 1841.0
186+ }
187+ ],
188+ "circuit_count": 5,
189+ "top_circuits": [
190+ {
191+ "phrase": "对我们",
192+ "delta": 20.9
193+ },
194+ {
195+ "phrase": "学校小",
196+ "delta": 19.4
197+ },
198+ {
199+ "phrase": "地一个",
200+ "delta": 18.1
201+ },
202+ {
203+ "phrase": "金色的",
204+ "delta": 17.5
205+ },
206+ {
207+ "phrase": "语中的",
208+ "delta": 9.7
209+ }
210+ ],
211+ "elapsed_seconds": 1.57,
212+ "output_mode": "full",
213+ "pass": true,
214+ "status": "PASS",
215+ "reason": "nodes=1268, edges=6338, phi=[-0.113,0.598], attn=100.0, mode=full, time=1.57s, recs=200",
216+ "claim_word_comparison": null,
217+ "claim_metric_comparison": {
218+ "claimed": {
219+ "nodes": 1268,
220+ "edges": 6338,
221+ "phi_range": [
222+ -0.113,
223+ 0.598
224+ ]
225+ },
226+ "actual": {
227+ "nodes": 1268,
228+ "edges": 6338,
229+ "phi_range": [
230+ -0.113,
231+ 0.598
232+ ]
233+ },
234+ "exact_match": true
235+ }
236+ },
237+ {
238+ "name": "小学数学_pipeline+stability",
239+ "type": "subject_pipeline",
240+ "stage": "小学",
241+ "subject": "数学",
242+ "input_files": [
243+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/小学/数学.jsonl"
244+ ],
245+ "source_files_pre_pull_state": [
246+ "pointer_stub"
247+ ],
248+ "line_count": 7459,
249+ "loaded_records": 300,
250+ "fed_records": 200,
251+ "available_content_records_seen_before_cap": 300,
252+ "available_valid_text_records_seen_before_cap": 300,
253+ "node_count": 803,
254+ "edge_count": 4367,
255+ "anchor_count": 54,
256+ "core_count": 9,
257+ "experience_region_count": 782,
258+ "skill_belt_candidate_count": 754,
259+ "sedimentation_events": 1536,
260+ "merge_events": 10769,
261+ "decay_events": 8374,
262+ "phi_range": [
263+ -8.3,
264+ 0.579
265+ ],
266+ "top_words": [
267+ {
268+ "phrase": "多少",
269+ "ratio": 2827.3
270+ },
271+ {
272+ "phrase": "一共",
273+ "ratio": 2201.2
274+ },
275+ {
276+ "phrase": "什么",
277+ "ratio": 2171.0
278+ },
279+ {
280+ "phrase": "下面",
281+ "ratio": 2074.8
282+ },
283+ {
284+ "phrase": "可以",
285+ "ratio": 1774.6
286+ },
287+ {
288+ "phrase": "算式",
289+ "ratio": 1720.2
290+ },
291+ {
292+ "phrase": "怎样",
293+ "ratio": 1711.8
294+ },
295+ {
296+ "phrase": "问题",
297+ "ratio": 1594.4
298+ }
299+ ],
300+ "circuit_count": 5,
301+ "top_circuits": [
302+ {
303+ "phrase": "的多少",
304+ "delta": 29.0
305+ },
306+ {
307+ "phrase": "的后面",
308+ "delta": 25.6
309+ },
310+ {
311+ "phrase": "学习数",
312+ "delta": 23.7
313+ },
314+ {
315+ "phrase": "习数学",
316+ "delta": 23.7
317+ },
318+ {
319+ "phrase": "数学习",
320+ "delta": 23.7
321+ }
322+ ],
323+ "elapsed_seconds": 1.14,
324+ "output_mode": "full",
325+ "pass": true,
326+ "status": "PASS",
327+ "reason": "nodes=803, edges=4367, phi=[-8.300,0.579], attn=100.0, mode=full, time=1.14s, recs=200",
328+ "claim_word_comparison": {
329+ "claimed_examples": [
330+ "多少",
331+ "一共"
332+ ],
333+ "actual_top_words": [
334+ "多少",
335+ "一共",
336+ "什么",
337+ "下面",
338+ "可以",
339+ "算式",
340+ "怎样",
341+ "问题"
342+ ],
343+ "matched": [
344+ "多少",
345+ "一共"
346+ ],
347+ "missing": []
348+ },
349+ "claim_metric_comparison": {
350+ "claimed": {
351+ "nodes": 803,
352+ "edges": 4367,
353+ "phi_range": [
354+ -8.3,
355+ 0.579
356+ ]
357+ },
358+ "actual": {
359+ "nodes": 803,
360+ "edges": 4367,
361+ "phi_range": [
362+ -8.3,
363+ 0.579
364+ ]
365+ },
366+ "exact_match": true
367+ }
368+ },
369+ {
370+ "name": "初中语文_pipeline+stability",
371+ "type": "subject_pipeline",
372+ "stage": "初中",
373+ "subject": "语文",
374+ "input_files": [
375+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/初中/语文.jsonl"
376+ ],
377+ "source_files_pre_pull_state": [
378+ "pointer_stub"
379+ ],
380+ "line_count": 961,
381+ "loaded_records": 300,
382+ "fed_records": 200,
383+ "available_content_records_seen_before_cap": 300,
384+ "available_valid_text_records_seen_before_cap": 300,
385+ "node_count": 1702,
386+ "edge_count": 9372,
387+ "anchor_count": 58,
388+ "core_count": 11,
389+ "experience_region_count": 1671,
390+ "skill_belt_candidate_count": 1600,
391+ "sedimentation_events": 3271,
392+ "merge_events": 21531,
393+ "decay_events": 12421,
394+ "phi_range": [
395+ -0.085,
396+ 0.553
397+ ],
398+ "top_words": [
399+ {
400+ "phrase": "阅读",
401+ "ratio": 5541.6
402+ },
403+ {
404+ "phrase": "单元",
405+ "ratio": 3998.7
406+ },
407+ {
408+ "phrase": "读第",
409+ "ratio": 3299.3
410+ },
411+ {
412+ "phrase": "我们",
413+ "ratio": 2034.8
414+ },
415+ {
416+ "phrase": "第一",
417+ "ratio": 2016.7
418+ },
419+ {
420+ "phrase": "学习",
421+ "ratio": 1780.7
422+ },
423+ {
424+ "phrase": "先生",
425+ "ratio": 1597.9
426+ },
427+ {
428+ "phrase": "一个",
429+ "ratio": 1590.0
430+ }
431+ ],
432+ "circuit_count": 5,
433+ "top_circuits": [
434+ {
435+ "phrase": "一定的",
436+ "delta": 21.6
437+ },
438+ {
439+ "phrase": "自己的",
440+ "delta": 17.7
441+ },
442+ {
443+ "phrase": "第十二",
444+ "delta": 11.9
445+ },
446+ {
447+ "phrase": "自然而",
448+ "delta": 10.3
449+ },
450+ {
451+ "phrase": "一定写",
452+ "delta": 10.2
453+ }
454+ ],
455+ "elapsed_seconds": 2.79,
456+ "output_mode": "full",
457+ "pass": true,
458+ "status": "PASS",
459+ "reason": "nodes=1702, edges=9372, phi=[-0.085,0.553], attn=100.0, mode=full, time=2.79s, recs=200",
460+ "claim_word_comparison": {
461+ "claimed_examples": [
462+ "阅读",
463+ "单元"
464+ ],
465+ "actual_top_words": [
466+ "阅读",
467+ "单元",
468+ "读第",
469+ "我们",
470+ "第一",
471+ "学习",
472+ "先生",
473+ "一个"
474+ ],
475+ "matched": [
476+ "阅读",
477+ "单元"
478+ ],
479+ "missing": []
480+ },
481+ "claim_metric_comparison": {
482+ "claimed": {
483+ "nodes": 1702,
484+ "edges": 9372,
485+ "phi_range": [
486+ -0.085,
487+ 0.553
488+ ]
489+ },
490+ "actual": {
491+ "nodes": 1702,
492+ "edges": 9372,
493+ "phi_range": [
494+ -0.085,
495+ 0.553
496+ ]
497+ },
498+ "exact_match": true
499+ }
500+ },
501+ {
502+ "name": "初中数学_pipeline+stability",
503+ "type": "subject_pipeline",
504+ "stage": "初中",
505+ "subject": "数学",
506+ "input_files": [
507+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/初中/数学.jsonl"
508+ ],
509+ "source_files_pre_pull_state": [
510+ "pointer_stub"
511+ ],
512+ "line_count": 5559,
513+ "loaded_records": 300,
514+ "fed_records": 200,
515+ "available_content_records_seen_before_cap": 300,
516+ "available_valid_text_records_seen_before_cap": 300,
517+ "node_count": 886,
518+ "edge_count": 5437,
519+ "anchor_count": 58,
520+ "core_count": 4,
521+ "experience_region_count": 878,
522+ "skill_belt_candidate_count": 861,
523+ "sedimentation_events": 1739,
524+ "merge_events": 15284,
525+ "decay_events": 7820,
526+ "phi_range": [
527+ -0.045,
528+ 0.479
529+ ],
530+ "top_words": [
531+ {
532+ "phrase": "方程",
533+ "ratio": 3990.8
534+ },
535+ {
536+ "phrase": "可以",
537+ "ratio": 2289.4
538+ },
539+ {
540+ "phrase": "图形",
541+ "ratio": 2147.0
542+ },
543+ {
544+ "phrase": "问题",
545+ "ratio": 2122.0
546+ },
547+ {
548+ "phrase": "我们",
549+ "ratio": 2096.1
550+ },
551+ {
552+ "phrase": "表示",
553+ "ratio": 1976.2
554+ },
555+ {
556+ "phrase": "二次",
557+ "ratio": 1863.4
558+ },
559+ {
560+ "phrase": "函数",
561+ "ratio": 1809.4
562+ }
563+ ],
564+ "circuit_count": 5,
565+ "top_circuits": [
566+ {
567+ "phrase": "学习数",
568+ "delta": 25.5
569+ },
570+ {
571+ "phrase": "数学习",
572+ "delta": 25.5
573+ },
574+ {
575+ "phrase": "的理数",
576+ "delta": 24.2
577+ },
578+ {
579+ "phrase": "数的理",
580+ "delta": 24.2
581+ },
582+ {
583+ "phrase": "的表示",
584+ "delta": 22.6
585+ }
586+ ],
587+ "elapsed_seconds": 1.49,
588+ "output_mode": "full",
589+ "pass": true,
590+ "status": "PASS",
591+ "reason": "nodes=886, edges=5437, phi=[-0.045,0.479], attn=100.0, mode=full, time=1.49s, recs=200",
592+ "claim_word_comparison": {
593+ "claimed_examples": [
594+ "方程",
595+ "图形",
596+ "问题"
597+ ],
598+ "actual_top_words": [
599+ "方程",
600+ "可以",
601+ "图形",
602+ "问题",
603+ "我们",
604+ "表示",
605+ "二次",
606+ "函数"
607+ ],
608+ "matched": [
609+ "方程",
610+ "图形",
611+ "问题"
612+ ],
613+ "missing": []
614+ },
615+ "claim_metric_comparison": {
616+ "claimed": {
617+ "nodes": 886,
618+ "edges": 5437,
619+ "phi_range": [
620+ -0.045,
621+ 0.48
622+ ]
623+ },
624+ "actual": {
625+ "nodes": 886,
626+ "edges": 5437,
627+ "phi_range": [
628+ -0.045,
629+ 0.479
630+ ]
631+ },
632+ "exact_match": false
633+ }
634+ },
635+ {
636+ "name": "高中语文_pipeline+stability",
637+ "type": "subject_pipeline",
638+ "stage": "高中",
639+ "subject": "语文",
640+ "input_files": [
641+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/高中/语文.jsonl"
642+ ],
643+ "source_files_pre_pull_state": [
644+ "pointer_stub"
645+ ],
646+ "line_count": 694,
647+ "loaded_records": 300,
648+ "fed_records": 200,
649+ "available_content_records_seen_before_cap": 300,
650+ "available_valid_text_records_seen_before_cap": 300,
651+ "node_count": 1857,
652+ "edge_count": 9983,
653+ "anchor_count": 56,
654+ "core_count": 25,
655+ "experience_region_count": 1829,
656+ "skill_belt_candidate_count": 1747,
657+ "sedimentation_events": 3576,
658+ "merge_events": 21824,
659+ "decay_events": 18607,
660+ "phi_range": [
661+ -8.266,
662+ 0.239
663+ ],
664+ "top_words": [
665+ {
666+ "phrase": "单元",
667+ "ratio": 4606.7
668+ },
669+ {
670+ "phrase": "语文",
671+ "ratio": 4125.0
672+ },
673+ {
674+ "phrase": "必修",
675+ "ratio": 4103.1
676+ },
677+ {
678+ "phrase": "文必",
679+ "ratio": 4068.6
680+ },
681+ {
682+ "phrase": "上册",
683+ "ratio": 3347.0
684+ },
685+ {
686+ "phrase": "修上",
687+ "ratio": 3327.4
688+ },
689+ {
690+ "phrase": "我们",
691+ "ratio": 2167.8
692+ },
693+ {
694+ "phrase": "第一",
695+ "ratio": 1929.2
696+ }
697+ ],
698+ "circuit_count": 5,
699+ "top_circuits": [
700+ {
701+ "phrase": "长大嫂",
702+ "delta": 4.2
703+ },
704+ {
705+ "phrase": "放和你",
706+ "delta": 4.1
707+ },
708+ {
709+ "phrase": "东去的",
710+ "delta": 3.2
711+ },
712+ {
713+ "phrase": "地平对",
714+ "delta": 3.2
715+ },
716+ {
717+ "phrase": "在地平",
718+ "delta": 3.0
719+ }
720+ ],
721+ "elapsed_seconds": 3.15,
722+ "output_mode": "full",
723+ "pass": true,
724+ "status": "PASS",
725+ "reason": "nodes=1857, edges=9983, phi=[-8.266,0.239], attn=100.0, mode=full, time=3.15s, recs=200",
726+ "claim_word_comparison": null,
727+ "claim_metric_comparison": {
728+ "claimed": {
729+ "nodes": 1857,
730+ "edges": 9983,
731+ "phi_range": [
732+ -8.266,
733+ 0.239
734+ ]
735+ },
736+ "actual": {
737+ "nodes": 1857,
738+ "edges": 9983,
739+ "phi_range": [
740+ -8.266,
741+ 0.239
742+ ]
743+ },
744+ "exact_match": true
745+ }
746+ },
747+ {
748+ "name": "cross_stage_语文",
749+ "type": "cross_stage",
750+ "input_files": [
751+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/小学/语文.jsonl",
752+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/初中/语文.jsonl",
753+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/高中/语文.jsonl"
754+ ],
755+ "source_files_pre_pull_state": [
756+ "pointer_stub",
757+ "pointer_stub",
758+ "pointer_stub"
759+ ],
760+ "loaded_records_per_input": [
761+ 40,
762+ 40,
763+ 40
764+ ],
765+ "fed_records": 120,
766+ "node_count": 1128,
767+ "edge_count": 4139,
768+ "phi_range": [
769+ -0.071,
770+ 0.126
771+ ],
772+ "top_words": [
773+ {
774+ "phrase": "单元",
775+ "ratio": 2393.7
776+ },
777+ {
778+ "phrase": "阅读",
779+ "ratio": 1911.0
780+ },
781+ {
782+ "phrase": "第一",
783+ "ratio": 1751.4
784+ },
785+ {
786+ "phrase": "语文",
787+ "ratio": 1750.3
788+ },
789+ {
790+ "phrase": "一单",
791+ "ratio": 1656.9
792+ },
793+ {
794+ "phrase": "课文",
795+ "ratio": 1387.8
796+ },
797+ {
798+ "phrase": "修上",
799+ "ratio": 1334.6
800+ },
801+ {
802+ "phrase": "必修",
803+ "ratio": 1330.0
804+ }
805+ ],
806+ "pass": true,
807+ "status": "PASS",
808+ "reason": "nodes=1128, phi_max=0.126",
809+ "claim_metric_comparison": {
810+ "claimed": {
811+ "nodes": 1128,
812+ "phi_max": 0.112
813+ },
814+ "actual": {
815+ "nodes": 1128,
816+ "edges": 4139,
817+ "phi_max": 0.126
818+ },
819+ "exact_match": false
820+ }
821+ },
822+ {
823+ "name": "cross_subject_小学",
824+ "type": "cross_subject",
825+ "input_files": [
826+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/小学/语文.jsonl",
827+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/小学/数学.jsonl",
828+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/小学/科学.jsonl"
829+ ],
830+ "source_files_pre_pull_state": [
831+ "pointer_stub",
832+ "pointer_stub",
833+ "pointer_stub"
834+ ],
835+ "loaded_records_per_input": [
836+ 40,
837+ 40,
838+ 40
839+ ],
840+ "fed_records": 120,
841+ "node_count": 708,
842+ "edge_count": 2866,
843+ "phi_range": [
844+ -0.061,
845+ 10.0
846+ ],
847+ "top_words": [
848+ {
849+ "phrase": "食物",
850+ "ratio": 1599.4
851+ },
852+ {
853+ "phrase": "怎样",
854+ "ratio": 1539.1
855+ },
856+ {
857+ "phrase": "什么",
858+ "ratio": 1484.0
859+ },
860+ {
861+ "phrase": "我们",
862+ "ratio": 1329.0
863+ },
864+ {
865+ "phrase": "本文",
866+ "ratio": 1303.4
867+ },
868+ {
869+ "phrase": "电路",
870+ "ratio": 1239.6
871+ },
872+ {
873+ "phrase": "溶解",
874+ "ratio": 1139.3
875+ },
876+ {
877+ "phrase": "哪些",
878+ "ratio": 1111.8
879+ }
880+ ],
881+ "pass": true,
882+ "status": "PASS",
883+ "reason": "nodes=708, phi_max=10.000, cores=21",
884+ "claim_metric_comparison": {
885+ "claimed": {
886+ "nodes": 708,
887+ "phi_max": 10.0
888+ },
889+ "actual": {
890+ "nodes": 708,
891+ "edges": 2866,
892+ "phi_max": 10.0
893+ },
894+ "exact_match": true
895+ }
896+ },
897+ {
898+ "name": "all_in_one_5subjects",
899+ "type": "all_in_one",
900+ "input_files": [
901+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/小学/语文.jsonl",
902+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/小学/数学.jsonl",
903+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/初中/语文.jsonl",
904+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/初中/数学.jsonl",
905+ "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28/splits/by_stage_subject/高中/语文.jsonl"
906+ ],
907+ "source_files_pre_pull_state": [
908+ "pointer_stub",
909+ "pointer_stub",
910+ "pointer_stub",
911+ "pointer_stub",
912+ "pointer_stub"
913+ ],
914+ "loaded_records_per_input": [
915+ 50,
916+ 50,
917+ 50,
918+ 50,
919+ 50
920+ ],
921+ "fed_records": 250,
922+ "node_count": 1373,
923+ "edge_count": 6863,
924+ "phi_range": [
925+ -0.081,
926+ 0.198
927+ ],
928+ "top_words": [
929+ {
930+ "phrase": "单元",
931+ "ratio": 2789.9
932+ },
933+ {
934+ "phrase": "阅读",
935+ "ratio": 2354.9
936+ },
937+ {
938+ "phrase": "语文",
939+ "ratio": 2125.5
940+ },
941+ {
942+ "phrase": "第一",
943+ "ratio": 1878.4
944+ },
945+ {
946+ "phrase": "一单",
947+ "ratio": 1622.3
948+ },
949+ {
950+ "phrase": "什么",
951+ "ratio": 1617.2
952+ },
953+ {
954+ "phrase": "修上",
955+ "ratio": 1591.9
956+ },
957+ {
958+ "phrase": "必修",
959+ "ratio": 1571.5
960+ }
961+ ],
962+ "pass": true,
963+ "status": "PASS",
964+ "reason": "fed=250, nodes=1373, edges=6863, phi_max=0.198",
965+ "claim_metric_comparison": {
966+ "claimed": {
967+ "nodes": 1373,
968+ "edges": 6863,
969+ "phi_max": 0.167
970+ },
971+ "actual": {
972+ "nodes": 1373,
973+ "edges": 6863,
974+ "phi_max": 0.198
975+ },
976+ "exact_match": false
977+ }
978+ }
979+ ],
980+ "analysis": {
981+ "小学语文": {
982+ "nodes": 1268,
983+ "edges": 6338,
984+ "anchors": 62,
985+ "cores": 17,
986+ "exp": 1233,
987+ "belts": 1141,
988+ "top_words": [
989+ {
990+ "phrase": "课文",
991+ "ratio": 2706.0
992+ },
993+ {
994+ "phrase": "本文",
995+ "ratio": 2633.3
996+ },
997+ {
998+ "phrase": "改动",
999+ "ratio": 2037.2
1000+ },
1001+ {
1002+ "phrase": "有改",
1003+ "ratio": 2003.8
1004+ },
1005+ {
1006+ "phrase": "什么",
1007+ "ratio": 1989.9
1008+ },
1009+ {
1010+ "phrase": "作者",
1011+ "ratio": 1845.4
1012+ },
1013+ {
1014+ "phrase": "文时",
1015+ "ratio": 1843.6
1016+ },
1017+ {
1018+ "phrase": "作课",
1019+ "ratio": 1841.0
1020+ }
1021+ ],
1022+ "circuits": 5,
1023+ "top_circuits": [
1024+ {
1025+ "phrase": "对我们",
1026+ "delta": 20.9
1027+ },
1028+ {
1029+ "phrase": "学校小",
1030+ "delta": 19.4
1031+ },
1032+ {
1033+ "phrase": "地一个",
1034+ "delta": 18.1
1035+ },
1036+ {
1037+ "phrase": "金色的",
1038+ "delta": 17.5
1039+ },
1040+ {
1041+ "phrase": "语中的",
1042+ "delta": 9.7
1043+ }
1044+ ],
1045+ "sed": 2374,
1046+ "merges": 9331,
1047+ "decays": 8378,
1048+ "phi_range": [
1049+ -0.113,
1050+ 0.598
1051+ ]
1052+ },
1053+ "小学数学": {
1054+ "nodes": 803,
1055+ "edges": 4367,
1056+ "anchors": 54,
1057+ "cores": 9,
1058+ "exp": 782,
1059+ "belts": 754,
1060+ "top_words": [
1061+ {
1062+ "phrase": "多少",
1063+ "ratio": 2827.3
1064+ },
1065+ {
1066+ "phrase": "一共",
1067+ "ratio": 2201.2
1068+ },
1069+ {
1070+ "phrase": "什么",
1071+ "ratio": 2171.0
1072+ },
1073+ {
1074+ "phrase": "下面",
1075+ "ratio": 2074.8
1076+ },
1077+ {
1078+ "phrase": "可以",
1079+ "ratio": 1774.6
1080+ },
1081+ {
1082+ "phrase": "算式",
1083+ "ratio": 1720.2
1084+ },
1085+ {
1086+ "phrase": "怎样",
1087+ "ratio": 1711.8
1088+ },
1089+ {
1090+ "phrase": "问题",
1091+ "ratio": 1594.4
1092+ }
1093+ ],
1094+ "circuits": 5,
1095+ "top_circuits": [
1096+ {
1097+ "phrase": "的多少",
1098+ "delta": 29.0
1099+ },
1100+ {
1101+ "phrase": "的后面",
1102+ "delta": 25.6
1103+ },
1104+ {
1105+ "phrase": "学习数",
1106+ "delta": 23.7
1107+ },
1108+ {
1109+ "phrase": "习数学",
1110+ "delta": 23.7
1111+ },
1112+ {
1113+ "phrase": "数学习",
1114+ "delta": 23.7
1115+ }
1116+ ],
1117+ "sed": 1536,
1118+ "merges": 10769,
1119+ "decays": 8374,
1120+ "phi_range": [
1121+ -8.3,
1122+ 0.579
1123+ ]
1124+ },
1125+ "初中语文": {
1126+ "nodes": 1702,
1127+ "edges": 9372,
1128+ "anchors": 58,
1129+ "cores": 11,
1130+ "exp": 1671,
1131+ "belts": 1600,
1132+ "top_words": [
1133+ {
1134+ "phrase": "阅读",
1135+ "ratio": 5541.6
1136+ },
1137+ {
1138+ "phrase": "单元",
1139+ "ratio": 3998.7
1140+ },
1141+ {
1142+ "phrase": "读第",
1143+ "ratio": 3299.3
1144+ },
1145+ {
1146+ "phrase": "我们",
1147+ "ratio": 2034.8
1148+ },
1149+ {
1150+ "phrase": "第一",
1151+ "ratio": 2016.7
1152+ },
1153+ {
1154+ "phrase": "学习",
1155+ "ratio": 1780.7
1156+ },
1157+ {
1158+ "phrase": "先生",
1159+ "ratio": 1597.9
1160+ },
1161+ {
1162+ "phrase": "一个",
1163+ "ratio": 1590.0
1164+ }
1165+ ],
1166+ "circuits": 5,
1167+ "top_circuits": [
1168+ {
1169+ "phrase": "一定的",
1170+ "delta": 21.6
1171+ },
1172+ {
1173+ "phrase": "自己的",
1174+ "delta": 17.7
1175+ },
1176+ {
1177+ "phrase": "第十二",
1178+ "delta": 11.9
1179+ },
1180+ {
1181+ "phrase": "自然而",
1182+ "delta": 10.3
1183+ },
1184+ {
1185+ "phrase": "一定写",
1186+ "delta": 10.2
1187+ }
1188+ ],
1189+ "sed": 3271,
1190+ "merges": 21531,
1191+ "decays": 12421,
1192+ "phi_range": [
1193+ -0.085,
1194+ 0.553
1195+ ]
1196+ },
1197+ "初中数学": {
1198+ "nodes": 886,
1199+ "edges": 5437,
1200+ "anchors": 58,
1201+ "cores": 4,
1202+ "exp": 878,
1203+ "belts": 861,
1204+ "top_words": [
1205+ {
1206+ "phrase": "方程",
1207+ "ratio": 3990.8
1208+ },
1209+ {
1210+ "phrase": "可以",
1211+ "ratio": 2289.4
1212+ },
1213+ {
1214+ "phrase": "图形",
1215+ "ratio": 2147.0
1216+ },
1217+ {
1218+ "phrase": "问题",
1219+ "ratio": 2122.0
1220+ },
1221+ {
1222+ "phrase": "我们",
1223+ "ratio": 2096.1
1224+ },
1225+ {
1226+ "phrase": "表示",
1227+ "ratio": 1976.2
1228+ },
1229+ {
1230+ "phrase": "二次",
1231+ "ratio": 1863.4
1232+ },
1233+ {
1234+ "phrase": "函数",
1235+ "ratio": 1809.4
1236+ }
1237+ ],
1238+ "circuits": 5,
1239+ "top_circuits": [
1240+ {
1241+ "phrase": "学习数",
1242+ "delta": 25.5
1243+ },
1244+ {
1245+ "phrase": "数学习",
1246+ "delta": 25.5
1247+ },
1248+ {
1249+ "phrase": "的理数",
1250+ "delta": 24.2
1251+ },
1252+ {
1253+ "phrase": "数的理",
1254+ "delta": 24.2
1255+ },
1256+ {
1257+ "phrase": "的表示",
1258+ "delta": 22.6
1259+ }
1260+ ],
1261+ "sed": 1739,
1262+ "merges": 15284,
1263+ "decays": 7820,
1264+ "phi_range": [
1265+ -0.045,
1266+ 0.479
1267+ ]
1268+ },
1269+ "高中语文": {
1270+ "nodes": 1857,
1271+ "edges": 9983,
1272+ "anchors": 56,
1273+ "cores": 25,
1274+ "exp": 1829,
1275+ "belts": 1747,
1276+ "top_words": [
1277+ {
1278+ "phrase": "单元",
1279+ "ratio": 4606.7
1280+ },
1281+ {
1282+ "phrase": "语文",
1283+ "ratio": 4125.0
1284+ },
1285+ {
1286+ "phrase": "必修",
1287+ "ratio": 4103.1
1288+ },
1289+ {
1290+ "phrase": "文必",
1291+ "ratio": 4068.6
1292+ },
1293+ {
1294+ "phrase": "上册",
1295+ "ratio": 3347.0
1296+ },
1297+ {
1298+ "phrase": "修上",
1299+ "ratio": 3327.4
1300+ },
1301+ {
1302+ "phrase": "我们",
1303+ "ratio": 2167.8
1304+ },
1305+ {
1306+ "phrase": "第一",
1307+ "ratio": 1929.2
1308+ }
1309+ ],
1310+ "circuits": 5,
1311+ "top_circuits": [
1312+ {
1313+ "phrase": "长大嫂",
1314+ "delta": 4.2
1315+ },
1316+ {
1317+ "phrase": "放和你",
1318+ "delta": 4.1
1319+ },
1320+ {
1321+ "phrase": "东去的",
1322+ "delta": 3.2
1323+ },
1324+ {
1325+ "phrase": "地平对",
1326+ "delta": 3.2
1327+ },
1328+ {
1329+ "phrase": "在地平",
1330+ "delta": 3.0
1331+ }
1332+ ],
1333+ "sed": 3576,
1334+ "merges": 21824,
1335+ "decays": 18607,
1336+ "phi_range": [
1337+ -8.266,
1338+ 0.239
1339+ ]
1340+ },
1341+ "全灌": {
1342+ "fed": 250,
1343+ "nodes": 1373,
1344+ "edges": 6863,
1345+ "phi_range": [
1346+ -0.081,
1347+ 0.198
1348+ ],
1349+ "anchors": 58,
1350+ "cores": 21,
1351+ "sed": 2563,
1352+ "merges": 16921,
1353+ "decays": 8503
1354+ }
1355+ },
1356+ "summary": {
1357+ "passed": 8,
1358+ "failed": 0,
1359+ "total": 8,
1360+ "dataset": "/Users/george/code/cie-datasets/china_text_book_md/v2026-03-28",
1361+ "status": "PASS"
1362+ },
1363+ "reproduced_8_of_8_pass": true,
1364+ "blockers": [],
1365+ "overall_assessment": {
1366+ "conclusion": "partially reproduced",
1367+ "effect_on_prior_a_vs_b_comparison": "strengthens",
1368+ "trust_call": "trust the rerun outcome, not the exact original-commit execution claim"
1369+ },
1370+ "recommendation": "trust the rerun outcome but not Claude's exact original execution claim verbatim"
1371+}
1@@ -0,0 +1,89 @@
2+# Branch B formal validation strict rerun
3+
4+## 1. purpose
5+
6+Strictly rerun Claude's claimed Branch B formal dataset validation once on `/Users/george/code/cie-datasets`,
7+using a runnable Branch B source tree without changing runtime behavior.
8+
9+## 2. claimed result being rerun
10+
11+- Claim commit: `379feb2ed4324020ee48a97a6edb8ec99ce39f1a`
12+- Claimed result: `8/8 PASS` on `/Users/george/code/cie-datasets`.
13+- Claimed examples: 小学数学 `多少`, `一共`; 初中数学 `方程`, `图形`, `问题`; 初中语文 `阅读`, `单元`.
14+
15+## 3. chosen runnable Branch B ref
16+
17+- Runnable Branch B base: `c7342881bb2ebfa5e7f927c91a7806416288573b` (`origin/branch-b` / `c734288`).
18+- This ref contains the Branch B runtime source tree (`cie/__init__.py`, `cie/runtime.py`, `cie/graph.py`, `cie/state.py`, `cie/dynamics.py`).
19+
20+## 4. dataset hydration audit
21+
22+- Dataset repo: `/Users/george/code/cie-datasets`
23+- Git LFS: `git-lfs/3.7.1 (GitHub; darwin arm64; go 1.25.3)`
24+- Hydration status: `SUCCESS`
25+
26+| Stage | Subject | Relative path | Pre-pull state | Hydrated now | Line count |
27+|---|---|---|---|---:|---:|
28+| 小学 | 语文 | `splits/by_stage_subject/小学/语文.jsonl` | pointer_stub | yes | 1597 |
29+| 小学 | 数学 | `splits/by_stage_subject/小学/数学.jsonl` | pointer_stub | yes | 7459 |
30+| 小学 | 科学 | `splits/by_stage_subject/小学/科学.jsonl` | pointer_stub | yes | 5032 |
31+| 初中 | 语文 | `splits/by_stage_subject/初中/语文.jsonl` | pointer_stub | yes | 961 |
32+| 初中 | 数学 | `splits/by_stage_subject/初中/数学.jsonl` | pointer_stub | yes | 5559 |
33+| 高中 | 语文 | `splits/by_stage_subject/高中/语文.jsonl` | pointer_stub | yes | 694 |
34+
35+## 5. script provenance audit
36+
37+- Script used: `strict_rerun_port`
38+- Source: `379feb2ed4324020ee48a97a6edb8ec99ce39f1a:tests/formal_validation.py`
39+- Port path: `tests/formal_validation_strict_rerun.py`
40+- Compatibility deviations:
41+ - Ported logic from 379feb2:tests/formal_validation.py into tests/formal_validation_strict_rerun.py because origin/branch-b lacks the original file.
42+ - Writes the strict rerun audit outputs to the required reports/ and reviews/ files instead of /tmp/formal_val_results.json, creating those parent directories on the runnable Branch B ref because they do not exist on origin/branch-b.
43+ - Adds dataset hydration verification and structured comparison metadata; the validation ingest/step/emit logic and pass criteria remain aligned with 379feb2.
44+
45+## 6. exact rerun command
46+
47+- `python3 tests/formal_validation_strict_rerun.py`
48+
49+## 7. per-subject results table
50+
51+| Scenario | Subject/domain | Status | Nodes | Edges | Top emergent words | Reason |
52+|---|---|---|---:|---:|---|---|
53+| 小学语文_pipeline+stability | 语文 | PASS | 1268 | 6338 | 课文, 本文, 改动, 有改, 什么 | nodes=1268, edges=6338, phi=[-0.113,0.598], attn=100.0, mode=full, time=1.57s, recs=200 |
54+| 小学数学_pipeline+stability | 数学 | PASS | 803 | 4367 | 多少, 一共, 什么, 下面, 可以 | nodes=803, edges=4367, phi=[-8.300,0.579], attn=100.0, mode=full, time=1.14s, recs=200 |
55+| 初中语文_pipeline+stability | 语文 | PASS | 1702 | 9372 | 阅读, 单元, 读第, 我们, 第一 | nodes=1702, edges=9372, phi=[-0.085,0.553], attn=100.0, mode=full, time=2.79s, recs=200 |
56+| 初中数学_pipeline+stability | 数学 | PASS | 886 | 5437 | 方程, 可以, 图形, 问题, 我们 | nodes=886, edges=5437, phi=[-0.045,0.479], attn=100.0, mode=full, time=1.49s, recs=200 |
57+| 高中语文_pipeline+stability | 语文 | PASS | 1857 | 9983 | 单元, 语文, 必修, 文必, 上册 | nodes=1857, edges=9983, phi=[-8.266,0.239], attn=100.0, mode=full, time=3.15s, recs=200 |
58+| cross_stage_语文 | cross_stage_语文 | PASS | 1128 | 4139 | 单元, 阅读, 第一, 语文, 一单 | nodes=1128, phi_max=0.126 |
59+| cross_subject_小学 | cross_subject_小学 | PASS | 708 | 2866 | 食物, 怎样, 什么, 我们, 本文 | nodes=708, phi_max=10.000, cores=21 |
60+| all_in_one_5subjects | all_in_one_5subjects | PASS | 1373 | 6863 | 单元, 阅读, 语文, 第一, 一单 | fed=250, nodes=1373, edges=6863, phi_max=0.198 |
61+
62+## 8. whether 8/8 PASS was reproduced
63+
64+- Rerun performed: `True`
65+- Result: `8/8 PASS`
66+- Exact 8/8 reproduction: `True`
67+
68+## 9. comparison to Claude's claimed numbers/words
69+
70+- 小学语文_pipeline+stability: claimed metrics {'nodes': 1268, 'edges': 6338, 'phi_range': [-0.113, 0.598]}; actual {'nodes': 1268, 'edges': 6338, 'phi_range': [-0.113, 0.598]}; exact_match=True.
71+- 小学数学_pipeline+stability: claimed words ['多少', '一共']; matched ['多少', '一共']; missing [].
72+- 小学数学_pipeline+stability: claimed metrics {'nodes': 803, 'edges': 4367, 'phi_range': [-8.3, 0.579]}; actual {'nodes': 803, 'edges': 4367, 'phi_range': [-8.3, 0.579]}; exact_match=True.
73+- 初中语文_pipeline+stability: claimed words ['阅读', '单元']; matched ['阅读', '单元']; missing [].
74+- 初中语文_pipeline+stability: claimed metrics {'nodes': 1702, 'edges': 9372, 'phi_range': [-0.085, 0.553]}; actual {'nodes': 1702, 'edges': 9372, 'phi_range': [-0.085, 0.553]}; exact_match=True.
75+- 初中数学_pipeline+stability: claimed words ['方程', '图形', '问题']; matched ['方程', '图形', '问题']; missing [].
76+- 初中数学_pipeline+stability: claimed metrics {'nodes': 886, 'edges': 5437, 'phi_range': [-0.045, 0.48]}; actual {'nodes': 886, 'edges': 5437, 'phi_range': [-0.045, 0.479]}; exact_match=False.
77+- 高中语文_pipeline+stability: claimed metrics {'nodes': 1857, 'edges': 9983, 'phi_range': [-8.266, 0.239]}; actual {'nodes': 1857, 'edges': 9983, 'phi_range': [-8.266, 0.239]}; exact_match=True.
78+- cross_stage_语文: claimed metrics {'nodes': 1128, 'phi_max': 0.112}; actual {'nodes': 1128, 'edges': 4139, 'phi_max': 0.126}; exact_match=False.
79+- cross_subject_小学: claimed metrics {'nodes': 708, 'phi_max': 10.0}; actual {'nodes': 708, 'edges': 2866, 'phi_max': 10.0}; exact_match=True.
80+- all_in_one_5subjects: claimed metrics {'nodes': 1373, 'edges': 6863, 'phi_max': 0.167}; actual {'nodes': 1373, 'edges': 6863, 'phi_max': 0.198}; exact_match=False.
81+
82+## 10. conclusion
83+
84+- `partially reproduced`
85+- Effect on prior A/B comparison: `strengthens`
86+- Recommendation: `trust the rerun outcome but not Claude's exact original execution claim verbatim`
87+
88+### blockers
89+
90+- none
1@@ -0,0 +1,5 @@
2+# Branch B formal validation strict rerun summary
3+
4+- Dataset actually hydrated: yes.
5+- Formal validation actually rerun: yes.
6+- Claude's claimed result should now be trusted: trust the rerun outcome, not the exact original-commit execution claim.
+865,
-0
1@@ -0,0 +1,865 @@
2+#!/usr/bin/env python3
3+"""Strict rerun port of 379feb2:tests/formal_validation.py on runnable Branch B."""
4+
5+from __future__ import annotations
6+
7+import json
8+import math
9+import os
10+import subprocess
11+import sys
12+import time
13+from pathlib import Path
14+from typing import Any
15+
# Repository root: two levels up from this file (the script lives in tests/).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Put the repository root first on sys.path, once, so the package import that
# follows resolves against this checkout.
if not any(entry == str(REPO_ROOT) for entry in sys.path):
    sys.path.insert(0, str(REPO_ROOT))
19+
20+from cie import CIERuntime
21+
22+
# Commit carrying the claim under audit, the validation script inside that
# commit, and the path of this port.
CLAUDE_CLAIM_COMMIT = "379feb2ed4324020ee48a97a6edb8ec99ce39f1a"
SCRIPT_SOURCE_PATH = "tests/formal_validation.py"
STRICT_PORT_PATH = "tests/formal_validation_strict_rerun.py"

# Artifacts produced by write_reports(); all three share one base name.
_REPORT_BASENAME = "2026-03-31_branch_b_formal_validation_strict_rerun"
REPORT_JSON_PATH = REPO_ROOT / "reports" / (_REPORT_BASENAME + ".json")
REPORT_MD_PATH = REPO_ROOT / "reports" / (_REPORT_BASENAME + ".md")
REVIEW_MD_PATH = REPO_ROOT / "reviews" / (_REPORT_BASENAME + ".md")

# Dataset checkout and the dataset version directory read by the scenarios.
DATASET_REPO = Path("/Users/george/code/cie-datasets")
DS = DATASET_REPO.joinpath("china_text_book_md", "v2026-03-28")

# (stage, subject) pairs that each get their own pipeline+stability scenario.
COMBOS = [
    ("小学", "语文"),
    ("小学", "数学"),
    ("初中", "语文"),
    ("初中", "数学"),
    ("高中", "语文"),
]
# Every (stage, subject) file that must be hydrated before any scenario runs.
REQUIRED_DATASET_PATHS = [
    ("小学", "语文"),
    ("小学", "数学"),
    ("小学", "科学"),
    ("初中", "语文"),
    ("初中", "数学"),
    ("高中", "语文"),
]
# Hard-coded record of each required file's worktree state before hydration;
# this script does not measure it, it only copies it into the report.
PRE_PULL_POINTER_STATE = {
    f"splits/by_stage_subject/{stage}/{subject}.jsonl": "pointer_stub"
    for stage, subject in REQUIRED_DATASET_PATHS
}
# Claimed numbers, keyed by scenario name, checked by compare_claim_metrics().
CLAIMED_METRICS = {
    "小学语文_pipeline+stability": {
        "nodes": 1268,
        "edges": 6338,
        "phi_range": [-0.113, 0.598],
    },
    "小学数学_pipeline+stability": {
        "nodes": 803,
        "edges": 4367,
        "phi_range": [-8.3, 0.579],
    },
    "初中语文_pipeline+stability": {
        "nodes": 1702,
        "edges": 9372,
        "phi_range": [-0.085, 0.553],
    },
    "初中数学_pipeline+stability": {
        "nodes": 886,
        "edges": 5437,
        "phi_range": [-0.045, 0.48],
    },
    "高中语文_pipeline+stability": {
        "nodes": 1857,
        "edges": 9983,
        "phi_range": [-8.266, 0.239],
    },
    "cross_stage_语文": {"nodes": 1128, "phi_max": 0.112},
    "cross_subject_小学": {"nodes": 708, "phi_max": 10.0},
    "all_in_one_5subjects": {"nodes": 1373, "edges": 6863, "phi_max": 0.167},
}
# Claimed example words, keyed by "<stage><subject>" label, checked by
# compare_claim_words().
CLAIMED_WORDS = {
    "小学数学": ["多少", "一共"],
    "初中数学": ["方程", "图形", "问题"],
    "初中语文": ["阅读", "单元"],
}
# Ways in which this port differs from the original script; rendered verbatim
# into the report.
COMPATIBILITY_DEVIATIONS = [
    {"type": kind, "detail": text}
    for kind, text in (
        (
            "script_path",
            "Ported logic from 379feb2:tests/formal_validation.py into "
            "tests/formal_validation_strict_rerun.py because origin/branch-b lacks the original file.",
        ),
        (
            "report_output",
            "Writes the strict rerun audit outputs to the required reports/ and reviews/ files "
            "instead of /tmp/formal_val_results.json, creating those parent directories on the "
            "runnable Branch B ref because they do not exist on origin/branch-b.",
        ),
        (
            "structured_metadata",
            "Adds dataset hydration verification and structured comparison metadata; "
            "the validation ingest/step/emit logic and pass criteria remain aligned with 379feb2.",
        ),
    )
]
95+
96+
def git(cmd: list[str], cwd: Path = REPO_ROOT, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Run ``cmd`` as a subprocess and return the completed process.

    Args:
        cmd: Full argument vector, including the executable itself.
        cwd: Working directory for the command; defaults to the repo root.
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.

    Returns:
        The ``CompletedProcess`` with stdout and stderr captured as text.
    """
    run_options = {"cwd": cwd, "text": True, "capture_output": True, "check": check}
    return subprocess.run(cmd, **run_options)
105+
106+
def rel_dataset_path(stage: str, subject: str) -> str:
    """Return the split file path for ``stage``/``subject``, relative to the dataset version directory."""
    return "splits/by_stage_subject/{}/{}.jsonl".format(stage, subject)
109+
110+
def abs_dataset_path(stage: str, subject: str) -> Path:
    """Return the split file path for ``stage``/``subject`` joined onto the dataset version directory ``DS``."""
    relative = rel_dataset_path(stage, subject)
    return DS / relative
113+
114+
def is_pointer_stub(path: Path) -> bool:
    """Report whether ``path`` is still a Git LFS pointer file.

    A file counts as a pointer when its first line, stripped of surrounding
    whitespace, equals the LFS spec version header. A missing file is not a
    pointer.
    """
    if path.exists():
        with path.open("r", encoding="utf-8", errors="replace") as handle:
            header = handle.readline()
        return header.strip() == "version https://git-lfs.github.com/spec/v1"
    return False
121+
122+
def count_lines(path: Path) -> int:
    """Count the lines of ``path``, decoding as UTF-8 with undecodable bytes replaced."""
    total = 0
    with path.open("r", encoding="utf-8", errors="replace") as handle:
        for _line in handle:
            total += 1
    return total
126+
127+
def collect_dataset_repo_status() -> dict[str, Any]:
    """Gather git and Git LFS facts about the dataset checkout.

    Runs five git commands against ``DATASET_REPO`` and returns a dict with:
    ``path``, ``is_git_repo``, ``git_lfs_version``, ``status_short`` (non-blank
    lines of ``git status --short``), ``lfs_required_entries`` (LFS listing
    entries for the required split files, keyed like ``PRE_PULL_POINTER_STATE``)
    and ``remote_v`` (non-blank lines of ``git remote -v``).

    Raises:
        subprocess.CalledProcessError: if any of the git commands fails.
    """
    git_repo = git(["git", "rev-parse", "--is-inside-work-tree"], cwd=DATASET_REPO)
    lfs_version = git(["git", "lfs", "version"], cwd=DATASET_REPO)
    status_short = git(["git", "-C", str(DATASET_REPO), "status", "--short"], cwd=REPO_ROOT)
    lfs_ls = git(["git", "-C", str(DATASET_REPO), "lfs", "ls-files"], cwd=REPO_ROOT)
    remote_v = git(["git", "-C", str(DATASET_REPO), "remote", "-v"], cwd=REPO_ROOT)

    # PRE_PULL_POINTER_STATE is keyed relative to the dataset version directory
    # (DS), whereas the LFS listing is produced at the dataset repo root, so
    # its paths carry the DS-relative prefix. Strip that prefix before the
    # lookup; otherwise no entry can ever match.
    # NOTE(review): assumes `git lfs ls-files` prints "<oid> <marker> <path>"
    # with unquoted UTF-8 paths — confirm against the installed git-lfs.
    version_prefix = DS.relative_to(DATASET_REPO).as_posix() + "/"

    relevant_lfs = {}
    for line in lfs_ls.stdout.splitlines():
        if not line.strip():
            continue
        parts = line.split(maxsplit=2)
        if len(parts) < 3:
            continue
        oid, marker, listed_path = parts
        if listed_path.startswith(version_prefix):
            rel = listed_path[len(version_prefix):]
        else:
            # Keep accepting paths that already are version-relative.
            rel = listed_path
        if rel in PRE_PULL_POINTER_STATE:
            relevant_lfs[rel] = {"oid_prefix": oid, "worktree_marker": marker}

    return {
        "path": str(DATASET_REPO),
        "is_git_repo": git_repo.stdout.strip() == "true",
        "git_lfs_version": lfs_version.stdout.strip(),
        "status_short": [line for line in status_short.stdout.splitlines() if line.strip()],
        "lfs_required_entries": relevant_lfs,
        "remote_v": [line for line in remote_v.stdout.splitlines() if line.strip()],
    }
154+
155+
def verify_required_dataset_paths() -> list[dict[str, Any]]:
    """Inspect every file in ``REQUIRED_DATASET_PATHS`` and describe its state.

    Each returned entry records where the file is, whether it exists, whether
    it is still an LFS pointer, its line count and a preview (first 200
    characters) of its first two lines. ``hydrated`` is true only for an
    existing non-pointer file whose first line starts with ``{``.
    """

    def inspect(stage: str, subject: str) -> dict[str, Any]:
        rel_path = rel_dataset_path(stage, subject)
        path = abs_dataset_path(stage, subject)
        present = path.exists()
        stub_now = False
        head = ["", ""]
        n_lines = 0
        if present:
            stub_now = is_pointer_stub(path)
            with path.open("r", encoding="utf-8", errors="replace") as handle:
                head = [handle.readline().rstrip("\n") for _ in range(2)]
            n_lines = count_lines(path)
        return {
            "stage": stage,
            "subject": subject,
            "relative_path": rel_path,
            "path": str(path),
            "pre_pull_worktree_state": PRE_PULL_POINTER_STATE.get(rel_path, "unknown"),
            "exists": present,
            "is_pointer_stub_now": stub_now,
            "hydrated": present and not stub_now and head[0].startswith("{"),
            "line_count": n_lines,
            "first_line_preview": head[0][:200],
            "second_line_preview": head[1][:200],
        }

    return [inspect(stage, subject) for stage, subject in REQUIRED_DATASET_PATHS]
187+
188+
def ensure_hydrated(required_paths: list[dict[str, Any]]) -> list[str]:
    """Turn dataset path entries into blocker messages.

    Each entry yields at most one message, chosen in this order: the file is
    missing, the file is still an LFS pointer, the file is not usable JSONL.
    An empty list means every entry is usable.
    """

    def problem(entry):
        if not entry["exists"]:
            return f"missing dataset file: {entry['path']}"
        if entry["is_pointer_stub_now"]:
            return f"LFS pointer still present: {entry['path']}"
        if not entry["hydrated"]:
            return f"dataset file is not usable JSONL: {entry['path']}"
        return None

    return [message for message in map(problem, required_paths) if message is not None]
199+
200+
def load_recs(stage: str, subject: str, max_n: int = 200) -> tuple[list[str], dict[str, Any]]:
    """Load up to ``max_n`` content texts from one stage/subject split file.

    A record is kept when its ``is_content`` field is truthy and its ``text``
    is at least four characters long. Reading stops as soon as ``max_n`` texts
    have been collected, so the counters below only cover the records seen up
    to that point.

    Args:
        stage: Stage name used to build the split path.
        subject: Subject name used to build the split path.
        max_n: Cap on the number of texts returned.

    Returns:
        ``(texts, meta)`` where ``meta`` holds ``path``,
        ``available_content_records``, ``available_valid_text_records`` and
        ``loaded_records``. A missing file yields an empty list and zero counts.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    path = abs_dataset_path(stage, subject)
    recs: list[str] = []
    seen_content = 0
    seen_valid_text = 0
    if not path.exists():
        return recs, {
            "path": str(path),
            "available_content_records": 0,
            "available_valid_text_records": 0,
            "loaded_records": 0,
        }

    with path.open("r", encoding="utf-8", errors="replace") as handle:
        for line in handle:
            # Blank lines (for example a trailing newline pair) carry no
            # record; skipping them avoids a JSONDecodeError.
            if not line.strip():
                continue
            rec = json.loads(line)
            if not rec.get("is_content"):
                continue
            seen_content += 1
            # An explicit null text is treated like a missing one.
            text = rec.get("text") or ""
            if len(text) < 4:
                continue
            seen_valid_text += 1
            recs.append(text)
            if len(recs) >= max_n:
                break

    return recs, {
        "path": str(path),
        "available_content_records": seen_content,
        "available_valid_text_records": seen_valid_text,
        "loaded_records": len(recs),
    }
234+
235+
def extract_top_words(rt: CIERuntime) -> list[dict[str, Any]]:
    """Return the eight forward edges with the highest forward/backward weight ratio.

    Only edges whose source and destination both compare within
    ``"\\u4e00"``..``"\\u9fff"`` are considered. The ratio is the forward
    weight divided by the backward weight, or the forward weight times 100
    when the backward weight is 0.01 or less. Each item is
    ``{"phrase": src + dst, "ratio": <rounded to one decimal>}``.
    """

    def in_range(token: str) -> bool:
        # Plain string comparison against the two bounds.
        # presumably node labels are single characters — verify against the graph
        return "\u4e00" <= token <= "\u9fff"

    graph = rt.graph
    scored = []
    for outgoing in graph.fwd_edges.values():
        for dst, edge in outgoing.items():
            if not (in_range(edge.src) and in_range(dst)):
                continue
            reverse_weight = graph.get_bwd_weight(edge.src, dst)
            if reverse_weight > 0.01:
                ratio = edge.weight / reverse_weight
            else:
                ratio = edge.weight * 100
            scored.append({"phrase": edge.src + dst, "ratio": round(ratio, 1)})
    # Stable sort, highest ratio first; ties keep discovery order.
    return sorted(scored, key=lambda item: -item["ratio"])[:8]
247+
248+
def extract_circuits(rt: CIERuntime) -> list[dict[str, Any]]:
    """Return up to five three-node cycles with the largest directional imbalance.

    Starting from the first 25 graph nodes that compare within
    ``"\\u4e00"``..``"\\u9fff"``, follows at most six forward neighbours per
    hop looking for cycles a -> b -> c -> a. A cycle is reported when the
    circulation along it differs from the circulation along the reversed
    order by more than 0.5. Each item is
    ``{"phrase": a + b + c, "delta": <difference rounded to one decimal>}``.
    """

    def in_range(token: str) -> bool:
        return "\u4e00" <= token <= "\u9fff"

    graph = rt.graph
    starts = [node for node in graph.nodes if in_range(node)][:25]
    found = []
    for a in starts:
        for b in graph.neighbors_fwd(a)[:6]:
            if not in_range(b):
                continue
            for c in graph.neighbors_fwd(b)[:6]:
                # Skip unless c is in range and closes the cycle back to a.
                if not in_range(c) or a not in graph.neighbors_fwd(c):
                    continue
                forward = graph.circulation([a, b, c, a])
                backward = graph.circulation([a, c, b, a])
                imbalance = abs(forward - backward)
                if imbalance > 0.5:
                    found.append({"phrase": a + b + c, "delta": round(imbalance, 1)})
    # Stable sort, largest imbalance first.
    return sorted(found, key=lambda item: -item["delta"])[:5]
267+
268+
def compare_claim_words(label: str, top_words: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Check the claimed example words for ``label`` against the rerun's top words.

    Returns ``None`` when ``CLAIMED_WORDS`` has no (or an empty) entry for
    ``label``. Otherwise returns the claimed words, the phrases actually
    found, and the claimed words split into ``matched`` and ``missing``.
    """
    claimed = CLAIMED_WORDS.get(label)
    if not claimed:
        return None
    actual_words = [item["phrase"] for item in top_words]
    matched = []
    missing = []
    for word in claimed:
        bucket = matched if word in actual_words else missing
        bucket.append(word)
    return {
        "claimed_examples": claimed,
        "actual_top_words": actual_words,
        "matched": matched,
        "missing": missing,
    }
282+
283+
def compare_claim_metrics(name: str, actual: dict[str, Any]) -> dict[str, Any] | None:
    """Compare the claimed metrics for scenario ``name`` with the rerun's metrics.

    Returns ``None`` when ``CLAIMED_METRICS`` has no (or an empty) entry for
    ``name``. ``exact_match`` is true only if every claimed key has an equal
    value in ``actual``; keys present only in ``actual`` are ignored.
    """
    claimed = CLAIMED_METRICS.get(name)
    if not claimed:
        return None
    return {
        "claimed": claimed,
        "actual": actual,
        "exact_match": all(actual.get(key) == value for key, value in claimed.items()),
    }
296+
297+
def run_subject_combo(stage: str, subject: str, required_paths_map: dict[str, dict[str, Any]]) -> tuple[dict[str, Any], dict[str, Any]]:
    """Run the pipeline+stability scenario for one stage/subject pair.

    Loads up to 300 texts, feeds the first 200 (each truncated to 80
    characters) into a fresh ``CIERuntime(seed=42)`` with one step per text,
    emits once, commits positive feedback for the first activated node if
    there is one, and then checks the state snapshot.

    The scenario passes when the snapshot reports more than 20 phi entries,
    phi min and max within +/-10.1, attention used not above the total plus
    0.01, and every phi value finite.

    Args:
        stage: Stage name.
        subject: Subject name.
        required_paths_map: Dataset path entries keyed by relative path.

    Returns:
        ``(result, analysis)``. ``analysis`` is empty when no data was loaded.
    """
    label = f"{stage}{subject}"
    scenario_name = f"{label}_pipeline+stability"
    recs, load_meta = load_recs(stage, subject, 300)
    dataset_entry = required_paths_map[rel_dataset_path(stage, subject)]
    # Fields shared by the no-data result and the full result.
    provenance = {
        "name": scenario_name,
        "type": "subject_pipeline",
        "stage": stage,
        "subject": subject,
        "input_files": [dataset_entry["path"]],
        "source_files_pre_pull_state": [dataset_entry["pre_pull_worktree_state"]],
        "line_count": dataset_entry["line_count"],
    }
    if not recs:
        result = {
            **provenance,
            "loaded_records": 0,
            "fed_records": 0,
            "node_count": 0,
            "edge_count": 0,
            "phi_range": None,
            "top_words": [],
            "pass": False,
            "status": "FAIL",
            "reason": "no data",
            "claim_word_comparison": compare_claim_words(label, []),
            "claim_metric_comparison": compare_claim_metrics(
                scenario_name,
                {"nodes": 0, "edges": 0, "phi_range": None},
            ),
        }
        return result, {}

    fed = recs[:200]
    rt = CIERuntime(seed=42)
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments during the run.
    started = time.perf_counter()
    for rec in fed:
        rt.ingest(rec[:80])
        rt.step(n=1)
    elapsed = time.perf_counter() - started

    output = rt.emit()
    if output["activated"]:
        rt.commit_feedback({"correct": [output["activated"][0]["node"]], "reward": 1.0})

    snap = rt.snapshot_state()
    ok = True
    ok &= snap["phi_summary"]["count"] > 20
    ok &= abs(snap["phi_summary"]["max"]) <= 10.1
    ok &= abs(snap["phi_summary"]["min"]) <= 10.1
    ok &= snap["attention"]["used"] <= snap["attention"]["total"] + 0.01
    ok &= all(math.isfinite(value) for value in rt.state.phi.values())

    detail = (
        f"nodes={snap['phi_summary']['count']}, edges={snap['graph']['edge_count']}, "
        f"phi=[{snap['phi_summary']['min']:.3f},{snap['phi_summary']['max']:.3f}], "
        f"attn={snap['attention']['used']:.1f}, mode={output['mode']}, "
        f"time={elapsed:.2f}s, recs={len(fed)}"
    )

    top_words = extract_top_words(rt)
    circuits = extract_circuits(rt)
    actual_metrics = {
        "nodes": snap["phi_summary"]["count"],
        "edges": snap["graph"]["edge_count"],
        "phi_range": [round(snap["phi_summary"]["min"], 3), round(snap["phi_summary"]["max"], 3)],
    }
    result = {
        **provenance,
        "loaded_records": load_meta["loaded_records"],
        "fed_records": len(fed),
        "available_content_records_seen_before_cap": load_meta["available_content_records"],
        "available_valid_text_records_seen_before_cap": load_meta["available_valid_text_records"],
        "node_count": snap["phi_summary"]["count"],
        "edge_count": snap["graph"]["edge_count"],
        "anchor_count": len(rt.state.anchor_nodes),
        "core_count": len(rt.state.ability_cores),
        "experience_region_count": len(rt.state.experience_regions.get("experience", set())),
        "skill_belt_candidate_count": len(rt.state.skill_belt_candidates),
        "sedimentation_events": len(rt.state.sedimentation_trace),
        "merge_events": len(rt.state.merge_events),
        "decay_events": len(rt.state.decay_events),
        "phi_range": actual_metrics["phi_range"],
        "top_words": top_words,
        "circuit_count": len(circuits),
        "top_circuits": circuits,
        "elapsed_seconds": round(elapsed, 2),
        "output_mode": output["mode"],
        "pass": bool(ok),
        "status": "PASS" if ok else "FAIL",
        "reason": detail,
        "claim_word_comparison": compare_claim_words(label, top_words),
        "claim_metric_comparison": compare_claim_metrics(scenario_name, actual_metrics),
    }
    return result, {
        "nodes": rt.graph.node_count,
        "edges": rt.graph.edge_count,
        "anchors": len(rt.state.anchor_nodes),
        "cores": len(rt.state.ability_cores),
        "exp": len(rt.state.experience_regions.get("experience", set())),
        "belts": len(rt.state.skill_belt_candidates),
        "top_words": top_words,
        "circuits": len(circuits),
        "top_circuits": circuits,
        "sed": len(rt.state.sedimentation_trace),
        "merges": len(rt.state.merge_events),
        "decays": len(rt.state.decay_events),
        "phi_range": actual_metrics["phi_range"],
    }
409+
410+
def summarize_runtime(
    *,
    name: str,
    scenario_type: str,
    input_files: list[str],
    pre_pull_states: list[str],
    load_counts: list[int],
    fed_records: int,
    rt: CIERuntime,
    snap: dict[str, Any],
    ok: bool,
    reason: str,
) -> dict[str, Any]:
    """Build the result record for a multi-input scenario.

    Combines the caller's bookkeeping (inputs, counts, verdict, reason) with
    node/edge counts and the phi range read from ``snap``, the top words
    extracted from ``rt``, and the comparison against the claimed metrics
    registered under ``name``.
    """
    top_words = extract_top_words(rt)
    phi = snap["phi_summary"]
    node_count = phi["count"]
    edge_count = snap["graph"]["edge_count"]
    phi_max = round(phi["max"], 3)
    claim_check = compare_claim_metrics(
        name,
        {"nodes": node_count, "edges": edge_count, "phi_max": phi_max},
    )
    return {
        "name": name,
        "type": scenario_type,
        "input_files": input_files,
        "source_files_pre_pull_state": pre_pull_states,
        "loaded_records_per_input": load_counts,
        "fed_records": fed_records,
        "node_count": node_count,
        "edge_count": edge_count,
        "phi_range": [round(phi["min"], 3), phi_max],
        "top_words": top_words,
        "pass": bool(ok),
        "status": "PASS" if ok else "FAIL",
        "reason": reason,
        "claim_metric_comparison": claim_check,
    }
446+
447+
def run_cross_stage(required_paths_map: dict[str, dict[str, Any]]) -> dict[str, Any]:
    """Feed 语文 texts from three stages into one runtime and summarize it.

    For each of 小学, 初中 and 高中, loads up to 60 texts and feeds the first
    40 (truncated to 60 characters, one step each) into a shared
    ``CIERuntime(seed=42)``. Passes when phi max is within +/-10.1 and the
    snapshot reports more than 30 phi entries.
    """
    rt = CIERuntime(seed=42)
    input_files, pre_pull_states, load_counts = [], [], []
    for stage in ("小学", "初中", "高中"):
        recs, _ = load_recs(stage, "语文", 60)
        entry = required_paths_map[rel_dataset_path(stage, "语文")]
        input_files.append(entry["path"])
        pre_pull_states.append(entry["pre_pull_worktree_state"])
        batch = recs[:40]
        load_counts.append(len(batch))
        for rec in batch:
            rt.ingest(rec[:60])
            rt.step(n=1)

    snap = rt.snapshot_state()
    phi = snap["phi_summary"]
    passed = abs(phi["max"]) <= 10.1 and phi["count"] > 30
    return summarize_runtime(
        name="cross_stage_语文",
        scenario_type="cross_stage",
        input_files=input_files,
        pre_pull_states=pre_pull_states,
        load_counts=load_counts,
        fed_records=sum(load_counts),
        rt=rt,
        snap=snap,
        ok=passed,
        reason=f"nodes={phi['count']}, phi_max={phi['max']:.3f}",
    )
476+
477+
def run_cross_subject(required_paths_map: dict[str, dict[str, Any]]) -> dict[str, Any]:
    """Feed three 小学 subjects into one runtime, anchored by subject, and summarize it.

    For each of 语文, 数学 and 科学, loads up to 60 texts and feeds the first
    40 (truncated to 60 characters, one step each) with the subject name as
    the anchor. Passes when phi max is within +/-10.1 and the graph has a
    node for each of the three subject names.
    """
    subjects = ["语文", "数学", "科学"]
    rt = CIERuntime(seed=42)
    input_files, pre_pull_states, load_counts = [], [], []
    for subject in subjects:
        recs, _ = load_recs("小学", subject, 60)
        entry = required_paths_map[rel_dataset_path("小学", subject)]
        input_files.append(entry["path"])
        pre_pull_states.append(entry["pre_pull_worktree_state"])
        batch = recs[:40]
        load_counts.append(len(batch))
        for rec in batch:
            rt.ingest(rec[:60], anchors=[subject])
            rt.step(n=1)

    snap = rt.snapshot_state()
    phi = snap["phi_summary"]
    passed = abs(phi["max"]) <= 10.1 and all(rt.graph.has_node(subject) for subject in subjects)
    summary_reason = (
        f"nodes={phi['count']}, phi_max={phi['max']:.3f}, "
        f"cores={len(rt.state.ability_cores)}"
    )
    return summarize_runtime(
        name="cross_subject_小学",
        scenario_type="cross_subject",
        input_files=input_files,
        pre_pull_states=pre_pull_states,
        load_counts=load_counts,
        fed_records=sum(load_counts),
        rt=rt,
        snap=snap,
        ok=passed,
        reason=summary_reason,
    )
509+
510+
def run_all_in_one(required_paths_map: dict[str, dict[str, Any]]) -> tuple[dict[str, Any], dict[str, Any]]:
    """Feed every pair in ``COMBOS`` into one runtime and summarize it.

    For each pair, loads up to 80 texts and feeds the first 50 (truncated to
    60 characters, one step each) into a shared ``CIERuntime(seed=42)``.
    Passes when phi max is within +/-10.1.

    Returns:
        ``(result, analysis)`` where ``analysis`` holds the fed count, graph
        size, phi range and counts read from the runtime state.
    """
    rt = CIERuntime(seed=42)
    fed_total = 0
    input_files, pre_pull_states, load_counts = [], [], []
    for stage, subject in COMBOS:
        recs, _ = load_recs(stage, subject, 80)
        entry = required_paths_map[rel_dataset_path(stage, subject)]
        input_files.append(entry["path"])
        pre_pull_states.append(entry["pre_pull_worktree_state"])
        batch = recs[:50]
        load_counts.append(len(batch))
        for rec in batch:
            rt.ingest(rec[:60])
            rt.step(n=1)
        fed_total += len(batch)

    snap = rt.snapshot_state()
    phi = snap["phi_summary"]
    edge_count = snap["graph"]["edge_count"]
    summary_reason = (
        f"fed={fed_total}, nodes={phi['count']}, "
        f"edges={edge_count}, phi_max={phi['max']:.3f}"
    )
    result = summarize_runtime(
        name="all_in_one_5subjects",
        scenario_type="all_in_one",
        input_files=input_files,
        pre_pull_states=pre_pull_states,
        load_counts=load_counts,
        fed_records=fed_total,
        rt=rt,
        snap=snap,
        ok=abs(phi["max"]) <= 10.1,
        reason=summary_reason,
    )
    state = rt.state
    analysis = {
        "fed": fed_total,
        "nodes": phi["count"],
        "edges": edge_count,
        "phi_range": [round(phi["min"], 3), round(phi["max"], 3)],
        "anchors": len(state.anchor_nodes),
        "cores": len(state.ability_cores),
        "sed": len(state.sedimentation_trace),
        "merges": len(state.merge_events),
        "decays": len(state.decay_events),
    }
    return result, analysis
557+
558+
def build_overall_assessment(
    report: dict[str, Any],
    scenario_results: list[dict[str, Any]],
    blockers: list[str],
) -> tuple[dict[str, Any], str]:
    """Derive the overall verdict of the rerun.

    Any blocker yields "still blocked". Otherwise the verdict is:
    "reproduced" when all scenarios passed, at least 7 claimed words were
    matched and exactly 8 scenarios matched their claimed metrics;
    "partially reproduced" when all scenarios passed but those counts were
    not reached; "not reproduced" when not all scenarios passed.

    Returns:
        ``(assessment, conclusion)`` where ``assessment`` carries the
        conclusion, its effect on the prior A/B comparison and the trust call.
    """
    if blockers:
        conclusion, effect, trust = "still blocked", "leaves unchanged", "do not trust"
    else:
        all_pass = report["reproduced_8_of_8_pass"]
        word_hits = 0
        exact_metric_scenarios = []
        for result in scenario_results:
            word_check = result.get("claim_word_comparison")
            if word_check:
                word_hits += len(word_check["matched"])
            metric_check = result.get("claim_metric_comparison")
            if metric_check and metric_check.get("exact_match"):
                exact_metric_scenarios.append(result["name"])

        if not all_pass:
            conclusion = "not reproduced"
            effect = "weakens"
            trust = "do not trust the claimed 8/8 PASS as stated"
        elif word_hits >= 7 and len(exact_metric_scenarios) == 8:
            conclusion = "reproduced"
            effect = "strengthens"
            trust = "trust with the documented port caveat"
        else:
            conclusion = "partially reproduced"
            effect = "strengthens"
            trust = "trust the rerun outcome, not the exact original-commit execution claim"

    return (
        {
            "conclusion": conclusion,
            "effect_on_prior_a_vs_b_comparison": effect,
            "trust_call": trust,
        },
        conclusion,
    )
597+
598+
def render_markdown(report: dict[str, Any]) -> str:
    """Render the full audit report as Markdown.

    Produces ten numbered sections plus a blockers subsection from the report
    dict: purpose, claimed result, chosen ref, dataset hydration table, script
    provenance, rerun command, per-scenario results table, 8/8 verdict,
    claim comparison and conclusion.

    Compatibility deviations are rendered as a nested list under their label
    bullet. Deviations, comparisons and blockers each fall back to a single
    "none" bullet when empty, so no section is left blank.

    Args:
        report: Report dict as assembled by this script.

    Returns:
        The Markdown text, without a trailing newline.
    """
    scenario_rows = []
    for result in report["per_subject_results"]:
        # At most five top phrases per row; "-" when there are none.
        top_words = ", ".join(item["phrase"] for item in result.get("top_words", [])[:5]) or "-"
        # Multi-input scenarios have no single subject; show their name instead.
        subject = result.get("subject", result["name"])
        scenario_rows.append(
            f"| {result['name']} | {subject} | {result['status']} | "
            f"{result.get('node_count', 0)} | {result.get('edge_count', 0)} | "
            f"{top_words} | {result['reason']} |"
        )

    dataset_rows = []
    for entry in report["required_dataset_paths"]:
        dataset_rows.append(
            f"| {entry['stage']} | {entry['subject']} | `{entry['relative_path']}` | "
            f"{entry['pre_pull_worktree_state']} | {'yes' if entry['hydrated'] else 'no'} | {entry['line_count']} |"
        )

    comparison_lines = []
    for result in report["per_subject_results"]:
        word_comp = result.get("claim_word_comparison")
        metric_comp = result.get("claim_metric_comparison")
        if word_comp:
            comparison_lines.append(
                f"- {result['name']}: claimed words {word_comp['claimed_examples']}; "
                f"matched {word_comp['matched']}; missing {word_comp['missing']}."
            )
        if metric_comp:
            comparison_lines.append(
                f"- {result['name']}: claimed metrics {metric_comp['claimed']}; "
                f"actual {metric_comp['actual']}; exact_match={metric_comp['exact_match']}."
            )
    if not comparison_lines:
        comparison_lines = ["- none"]

    blockers_lines = [f"- {blocker}" for blocker in report["blockers"]] or ["- none"]
    # Two-space indent nests these items under the "Compatibility deviations:" bullet.
    deviations_lines = [
        f"  - {item['detail']}" for item in report["compatibility_deviations"]
    ] or ["  - none"]

    return "\n".join(
        [
            "# Branch B formal validation strict rerun",
            "",
            "## 1. purpose",
            "",
            "Strictly rerun Claude's claimed Branch B formal dataset validation once on `/Users/george/code/cie-datasets`,",
            "using a runnable Branch B source tree without changing runtime behavior.",
            "",
            "## 2. claimed result being rerun",
            "",
            f"- Claim commit: `{report['claude_claim_commit']}`",
            "- Claimed result: `8/8 PASS` on `/Users/george/code/cie-datasets`.",
            "- Claimed examples: 小学数学 `多少`, `一共`; 初中数学 `方程`, `图形`, `问题`; 初中语文 `阅读`, `单元`.",
            "",
            "## 3. chosen runnable Branch B ref",
            "",
            f"- Runnable Branch B base: `{report['chosen_runnable_branch_b_commit']}` (`origin/branch-b` / `c734288`).",
            "- This ref contains the Branch B runtime source tree (`cie/__init__.py`, `cie/runtime.py`, `cie/graph.py`, `cie/state.py`, `cie/dynamics.py`).",
            "",
            "## 4. dataset hydration audit",
            "",
            f"- Dataset repo: `{report['dataset_repo_status']['path']}`",
            f"- Git LFS: `{report['dataset_repo_status']['git_lfs_version']}`",
            f"- Hydration status: `{report['hydration_status']['status']}`",
            "",
            "| Stage | Subject | Relative path | Pre-pull state | Hydrated now | Line count |",
            "|---|---|---|---|---:|---:|",
            *dataset_rows,
            "",
            "## 5. script provenance audit",
            "",
            f"- Script used: `{report['script_used']}`",
            f"- Source: `{report['script_provenance']['source_commit']}:{report['script_provenance']['source_path']}`",
            f"- Port path: `{report['script_provenance']['port_path']}`",
            "- Compatibility deviations:",
            *deviations_lines,
            "",
            "## 6. exact rerun command",
            "",
            f"- `{report['rerun_command']}`",
            "",
            "## 7. per-subject results table",
            "",
            "| Scenario | Subject/domain | Status | Nodes | Edges | Top emergent words | Reason |",
            "|---|---|---|---:|---:|---|---|",
            *scenario_rows,
            "",
            "## 8. whether 8/8 PASS was reproduced",
            "",
            f"- Rerun performed: `{report['rerun_performed']}`",
            f"- Result: `{report['summary']['passed']}/{report['summary']['total']} PASS`",
            f"- Exact 8/8 reproduction: `{report['reproduced_8_of_8_pass']}`",
            "",
            "## 9. comparison to Claude's claimed numbers/words",
            "",
            *comparison_lines,
            "",
            "## 10. conclusion",
            "",
            f"- `{report['overall_assessment']['conclusion']}`",
            f"- Effect on prior A/B comparison: `{report['overall_assessment']['effect_on_prior_a_vs_b_comparison']}`",
            f"- Recommendation: `{report['recommendation']}`",
            "",
            "### blockers",
            "",
            *blockers_lines,
        ]
    )
704+
705+
def render_review_summary(report: dict[str, Any]) -> str:
    """Render the short review summary: a heading plus three bullets.

    The bullets state whether the dataset was hydrated, whether the rerun
    was performed, and the trust call from the overall assessment.
    """

    def yes_no(flag: Any) -> str:
        return "yes" if flag else "no"

    hydrated = yes_no(report["hydration_status"]["all_required_files_hydrated"])
    rerun = yes_no(report["rerun_performed"])
    trust_call = report["overall_assessment"]["trust_call"]
    return (
        "# Branch B formal validation strict rerun summary\n"
        "\n"
        f"- Dataset actually hydrated: {hydrated}.\n"
        f"- Formal validation actually rerun: {rerun}.\n"
        f"- Claude's claimed result should now be trusted: {trust_call}."
    )
716+
717+
def write_reports(report: dict[str, Any]) -> None:
    """Write the JSON report, the Markdown report and the review summary.

    Creates the parent directories of the JSON report and of the review
    summary if needed. Each file is UTF-8 text ending in a newline. The files
    are rendered and written one after another, in that order.
    """
    for directory in (REPORT_JSON_PATH.parent, REVIEW_MD_PATH.parent):
        directory.mkdir(parents=True, exist_ok=True)

    # Renderers are invoked lazily so each file is written before the next
    # one is rendered.
    outputs = (
        (REPORT_JSON_PATH, lambda: json.dumps(report, ensure_ascii=False, indent=2)),
        (REPORT_MD_PATH, lambda: render_markdown(report)),
        (REVIEW_MD_PATH, lambda: render_review_summary(report)),
    )
    for target, render in outputs:
        target.write_text(render() + "\n", encoding="utf-8")
727+
728+
def build_blocked_report(required_paths: list[dict[str, Any]], blockers: list[str], dataset_status: dict[str, Any]) -> dict[str, Any]:
    """Build the report emitted when the rerun cannot start.

    The report carries the dataset findings and the blockers, marks hydration
    as failed and the rerun as not performed, and leaves all result fields
    empty. Whether the claim commit exists is checked against the local
    repository rather than assumed.

    Args:
        required_paths: Dataset path entries from the hydration check.
        blockers: Messages explaining why the rerun is blocked.
        dataset_status: Git/LFS status of the dataset checkout.

    Returns:
        The report dict, in the shape expected by the report writers.

    Raises:
        subprocess.CalledProcessError: if ``git rev-parse HEAD`` fails.
    """
    chosen_commit = git(["git", "rev-parse", "HEAD"]).stdout.strip()
    # `git cat-file -e` exits non-zero when the object is absent; check=False
    # turns that into a False flag instead of an exception.
    claim_commit_check = git(
        ["git", "cat-file", "-e", f"{CLAUDE_CLAIM_COMMIT}^{{commit}}"],
        check=False,
    )
    assessment, _ = build_overall_assessment(
        {
            "reproduced_8_of_8_pass": False,
        },
        [],
        blockers,
    )
    return {
        "claude_claim_commit": CLAUDE_CLAIM_COMMIT,
        "claude_claim_commit_exists": claim_commit_check.returncode == 0,
        "chosen_runnable_branch_b_commit": chosen_commit,
        "dataset_repo_status": dataset_status,
        "required_dataset_paths": required_paths,
        "hydration_status": {
            "status": "FAILED",
            "all_required_files_hydrated": False,
        },
        "script_used": "none",
        "script_provenance": {
            "source_commit": CLAUDE_CLAIM_COMMIT,
            "source_path": SCRIPT_SOURCE_PATH,
            "port_path": None,
        },
        "compatibility_deviations": [],
        "rerun_performed": False,
        "rerun_command": None,
        "per_subject_results": [],
        "summary": {"passed": 0, "failed": 0, "total": 0, "status": "BLOCKED"},
        "reproduced_8_of_8_pass": False,
        "overall_assessment": assessment,
        "blockers": blockers,
        "recommendation": "do not trust the claimed result until the strict rerun blockers are removed",
    }
764+
765+
def main() -> int:
    """Run the strict Branch B formal-validation rerun and write the reports.

    Flow:
        1. Collect dataset repository status and verify/hydrate required files.
        2. If hydration yields blockers, write a blocked report and return 1.
        3. Otherwise run every (stage, subject) combo from ``COMBOS`` plus the
           cross-stage, cross-subject and all-in-one scenarios.
        4. Build the report, attach the overall assessment and recommendation,
           write the reports and print a console summary.

    Returns:
        1 when the rerun is blocked; 0 once the rerun completed and the
        reports were written, whether or not every scenario passed.
    """
    dataset_status = collect_dataset_repo_status()
    required_paths = verify_required_dataset_paths()
    blockers = ensure_hydrated(required_paths)
    if blockers:
        report = build_blocked_report(required_paths, blockers, dataset_status)
        write_reports(report)
        print("\n".join(f"[BLOCKED] {blocker}" for blocker in blockers))
        return 1

    required_paths_map = {entry["relative_path"]: entry for entry in required_paths}
    chosen_commit = git(["git", "rev-parse", "HEAD"]).stdout.strip()

    tests_output = []
    analysis = {}
    scenario_results = []

    for stage, subject in COMBOS:
        result, analysis_entry = run_subject_combo(stage, subject, required_paths_map)
        tests_output.append(
            {"name": result["name"], "status": result["status"], "detail": result["reason"]}
        )
        scenario_results.append(result)
        if analysis_entry:
            analysis[f"{stage}{subject}"] = analysis_entry

    cross_stage_result = run_cross_stage(required_paths_map)
    cross_subject_result = run_cross_subject(required_paths_map)
    all_in_one_result, all_in_one_analysis = run_all_in_one(required_paths_map)
    for result in [cross_stage_result, cross_subject_result, all_in_one_result]:
        tests_output.append({"name": result["name"], "status": result["status"], "detail": result["reason"]})
        scenario_results.append(result)
    # Stored unconditionally so the report always carries this key; the
    # console loop below tolerates a falsy value.
    analysis["全灌"] = all_in_one_analysis

    passed = sum(1 for item in tests_output if item["status"] == "PASS")
    failed = sum(1 for item in tests_output if item["status"] == "FAIL")
    # An exact 8/8 PASS reproduction needs eight scenarios that all PASSED.
    # Checking only `failed == 0` would also accept a scenario whose status is
    # neither PASS nor FAIL.
    reproduced = passed == 8 and len(tests_output) == 8

    report = {
        "claude_claim_commit": CLAUDE_CLAIM_COMMIT,
        "claude_claim_commit_exists": True,
        "chosen_runnable_branch_b_commit": chosen_commit,
        "dataset_repo_status": dataset_status,
        "required_dataset_paths": required_paths,
        "hydration_status": {
            "status": "SUCCESS",
            "all_required_files_hydrated": True,
        },
        "script_used": "strict_rerun_port",
        "script_provenance": {
            "source_commit": CLAUDE_CLAIM_COMMIT,
            "source_path": SCRIPT_SOURCE_PATH,
            "port_path": STRICT_PORT_PATH,
            "port_rationale": (
                "origin/branch-b is the runnable Branch B source tree, but it does not contain "
                "tests/formal_validation.py from 379feb2."
            ),
        },
        "compatibility_deviations": COMPATIBILITY_DEVIATIONS,
        "rerun_performed": True,
        "rerun_command": "python3 tests/formal_validation_strict_rerun.py",
        "per_subject_results": scenario_results,
        "analysis": analysis,
        "summary": {
            "passed": passed,
            "failed": failed,
            "total": len(tests_output),
            "dataset": str(DS),
            "status": "PASS" if failed == 0 else "FAIL",
        },
        "reproduced_8_of_8_pass": reproduced,
        "blockers": [],
    }
    # The assessment is computed from the assembled report, then attached to it.
    assessment, conclusion = build_overall_assessment(report, scenario_results, [])
    report["overall_assessment"] = assessment
    if conclusion == "reproduced":
        recommendation = "trust the rerun result, with the explicit note that execution used a faithful port on runnable Branch B"
    elif conclusion == "partially reproduced":
        recommendation = "trust the rerun outcome but not Claude's exact original execution claim verbatim"
    else:
        recommendation = "do not trust Claude's claimed 8/8 PASS as stated"
    report["recommendation"] = recommendation

    write_reports(report)

    for test in tests_output:
        print(f" [{test['status']}] {test['name']}: {test['detail']}")
    print(f"\n总计: {passed} passed, {failed} failed, {len(tests_output)} total")
    print("\nAnalysis:")
    for key, value in analysis.items():
        # A falsy analysis entry (e.g. None from the all-in-one run) must not
        # crash the console summary after the reports are already on disk.
        entry = value or {}
        print(
            f" {key}: nodes={entry.get('nodes', '?')}, "
            f"words={entry.get('top_words', [])[:5]}, circuits={entry.get('circuits', '?')}"
        )
    print(f"\n结论: {report['overall_assessment']['conclusion']}")
    print(f"报告: {REPORT_JSON_PATH}")
    return 0
863+
864+
# Script entry point: when executed directly, run main() and raise SystemExit
# with its integer return value so it becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())