- commit
- c734288
- parent
- 0025475
- author
- im_wower
- date
- 2026-03-31 17:30:53 +0800 CST
phase8: 词结构涌现验证——语文→课文→蜗牛等词从纯bigram自动浮出 (42/42)
2 files changed,
+369,
-0
+77,
-0
1@@ -304,3 +304,80 @@ K=3 分量从无差别变为有语义:
2 ### 提交历史更新
3
4
5+
6+
7+---
8+
9+## Phase 8: 词结构涌现验证(核心命题实证)
10+
11+> 2026-03-31 | 小学语文一上, 6772 字符灌入
12+
13+### 实验设计
14+
15+把课本文本按字符灌入 CIE,不做任何分词预处理,纯靠 bigram 的非对称权重判断哪些字组合是"词内部"(强方向性)、哪些是"词边界"(无方向偏好)。
16+
17+### 核心发现
18+
19+#### 非对称比完美识别了真实词汇
20+
21+| bigram | 非对称比 | 对应词汇 |
22+|--------|---------|----------|
23+| 语→文 | 2417.8 | 语文 |
24+| 课→文 | 1906.3 | 课文 |
25+| 教→科 | 1115.3 | 教科 |
26+| 识→字 | 1058.0 | 识字 |
27+| 白→菜 | 939.7 | 白菜 |
28+| 出→版 | 937.5 | 出版 |
29+| 小→学 | 926.6 | 小学 |
30+| 蜗→牛 | 874.6 | 蜗牛 |
31+| 作→者 | 1436.4 | 作者 |
32+
33+**系统从纯字符流动中,自动发现了这些是"词"——不需要词典,不需要分词器。**
34+
35+这直接验证了 README 的核心命题:"词、属性、概念边界,都不是预制的;它们是流动在流形上反复重叠后涌现出的稳定结构。"
36+
37+#### 低非对称比 = 词边界
38+
39+| bigram | 非对称比 | 含义 |
40+|--------|---------|------|
41+| 年→一 | 0.13 | 词边界("年"和"一"不属于同一个词) |
42+| 的→书 | 0.30 | 助词边界 |
43+| 妈→妈 | 1.03 | 叠词(对称) |
44+| 天→天 | 1.01 | 叠词(对称) |
45+
46+叠词(妈妈、天天)的非对称比≈1.0,因为正反方向频率相等——这也是正确的!
47+
48+#### 自动切分涌现了真实语言单元
49+
50+涌现的片段包括:"义务教育"、"教科书"、"口语交际"、"汉语拼音"、"青蛙写诗"、"大小多少"、"日月水火"
51+
52+这些不是单个词,而是课本的章节标题和教学单元名——系统从字符级流动中涌现出了语义单元。
53+
54+#### 闭环 = 固化的语言模式
55+
56+| 三字环 | 正向环流 | 含义 |
57+|--------|---------|------|
58+| 一起读 | 17.4 | "一起读"——教学指令 |
59+| 一个人 | 10.0 | 常用量词结构 |
60+| 一家人 | 8.9 | 家庭主题 |
61+| 一学期 | 3.3 | 时间单位 |
62+| 一头牛 | 3.2 | 量词结构 |
63+
64+以"一"开头的闭环特别多——因为一年级语文中"一个/一只/一头"等量词结构是高频模式,系统自动捕获了这些。
65+
66+### 图的最终状态
67+
68+| 指标 | 数值 |
69+|------|------|
70+| 节点 | 814 |
71+| 边 | 3,172 |
72+| 锚点核 | 66 |
73+| 能力核 | 14 |
74+| 经验层 | 776 |
75+| 技能带 | 691 |
76+
77+### 结论
78+
79+**纯 bigram 非对称权重,不借助任何词典或预训练模型,从小学语文课本中自动涌现出了词级结构、叠词特征、章节单元、量词模式。**
80+
81+这是 CIE "流动的图" 范式的第一个实质性验证:结构确实从流动中涌现。
+292,
-0
1@@ -0,0 +1,292 @@
2+"""
3+Phase 8: 词结构涌现验证
4+========================
5+核心命题验证:纯字符 bigram 注入后,图上能否涌现出词级结构?
6+
7+方法:
8+ 1. 把课本文本按字符灌入 CIE
9+ 2. 分析哪些 bigram 的非对称比(fwd/bwd)最高——这些是"词内部"连接
10+ 3. 哪些 bigram 的非对称比最接近 1——这些是"词边界"
11+ 4. 用非对称比自动切分文本,与已知词边界对比
12+
13+这是 CIE-Ref 中"no-segmentation verification"的 runtime 版本。
14+"""
15+
16+import sys
17+import os
18+import math
19+from collections import defaultdict, Counter
20+
21+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
22+from cie import CIERuntime
23+
24+DATA_DIR = "/Users/george/code/china-text-book-md"
25+
26+
27+def load_clean_text(filename, max_chars=50000):
28+ path = os.path.join(DATA_DIR, filename)
29+ with open(path, "r", encoding="utf-8") as f:
30+ raw = f.read()
31+ chars = []
32+ for line in raw.split("\n"):
33+ line = line.strip()
34+ if not line or line.startswith("#") or line.startswith("**") or line.startswith("---"):
35+ continue
36+ if line.startswith("!["):
37+ continue
38+ ctrl = sum(1 for c in line if ord(c) < 32 and c not in '\n\t')
39+ if ctrl > len(line) * 0.3:
40+ continue
41+ cn = sum(1 for c in line if '\u4e00' <= c <= '\u9fff')
42+ if cn >= 2:
43+ chars.extend(list(line))
44+ chars.append('\n')
45+ if len(chars) >= max_chars:
46+ break
47+ return chars[:max_chars]
48+
49+
50+def bigram_asymmetry_analysis(rt):
51+ """分析所有 bigram 的非对称比 fwd/bwd"""
52+ g = rt.graph
53+ results = []
54+
55+ for src_edges in g.fwd_edges.values():
56+ for dst, edge in src_edges.items():
57+ fwd = edge.weight
58+ bwd = g.get_bwd_weight(edge.src, dst)
59+ if bwd > 0.01:
60+ ratio = fwd / bwd
61+ else:
62+ ratio = fwd * 100 # bwd ≈ 0 means very asymmetric
63+
64+ results.append({
65+ 'src': edge.src,
66+ 'dst': dst,
67+ 'fwd': fwd,
68+ 'bwd': bwd,
69+ 'ratio': ratio,
70+ 'bigram': edge.src + dst,
71+ })
72+
73+ results.sort(key=lambda x: -x['ratio'])
74+ return results
75+
76+
77+def word_boundary_detection(rt, text_chars):
78+ """
79+ 用非对称比检测词边界:
80+ - ratio 高 → 词内部(强方向性)
81+ - ratio 接近 1 → 词边界(无方向偏好)
82+
83+ 返回切分结果。
84+ """
85+ g = rt.graph
86+ segments = []
87+ current_word = []
88+
89+ threshold = 1.3 # ratio < 1.3 判为边界
90+
91+ for i in range(len(text_chars) - 1):
92+ a, b = text_chars[i], text_chars[i + 1]
93+ current_word.append(a)
94+
95+ # 跳过非中文字符
96+ if not ('\u4e00' <= a <= '\u9fff') or not ('\u4e00' <= b <= '\u9fff'):
97+ if current_word:
98+ segments.append(''.join(current_word))
99+ current_word = []
100+ continue
101+
102+ fwd = g.get_edge_weight(a, b)
103+ bwd = g.get_bwd_weight(a, b)
104+
105+ if bwd > 0.01:
106+ ratio = fwd / bwd
107+ else:
108+ ratio = fwd * 10 if fwd > 0 else 1.0
109+
110+ if ratio < threshold:
111+ # 词边界
112+ if current_word:
113+ segments.append(''.join(current_word))
114+ current_word = []
115+
116+ if current_word:
117+ current_word.append(text_chars[-1])
118+ segments.append(''.join(current_word))
119+
120+ return segments
121+
122+
123+def evaluate_segmentation(segments, known_words):
124+ """评估切分质量:命中已知词的比例"""
125+ found = 0
126+ total_known = len(known_words)
127+
128+ seg_set = set(segments)
129+ for w in known_words:
130+ if w in seg_set:
131+ found += 1
132+
133+ # 也检查 segments 中有多少是 2-4 字的中文片段
134+ cn_segs = [s for s in segments if all('\u4e00' <= c <= '\u9fff' for c in s) and 2 <= len(s) <= 4]
135+
136+ return {
137+ 'total_segments': len(segments),
138+ 'cn_word_segments': len(cn_segs),
139+ 'known_word_hits': found,
140+ 'known_word_total': total_known,
141+ 'hit_rate': round(found / max(total_known, 1) * 100, 1),
142+ 'sample_cn_segs': cn_segs[:30],
143+ }
144+
145+
146+def run_experiment():
147+ """主实验:小学语文一上"""
148+ print("=" * 60)
149+ print(" 实验: 纯 bigram 流动能否涌现词结构")
150+ print("=" * 60)
151+
152+ # ── 准备数据 ──
153+ filename = "小学_语文_统编版_义务教育教科书·语文一年级上册.md"
154+ chars = load_clean_text(filename, max_chars=30000)
155+ print(f"\n数据: 小学语文一上, {len(chars)} 字符")
156+
157+ # ── 灌入 CIE ──
158+ rt = CIERuntime(seed=42)
159+
160+ # 分批灌入
161+ batch_size = 100
162+ for i in range(0, len(chars), batch_size):
163+ batch = chars[i:i+batch_size]
164+ rt.ingest(batch)
165+ rt.step(n=2)
166+
167+ print(f"图规模: {rt.graph.node_count} nodes, {rt.graph.edge_count} edges")
168+
169+ # ── 分析非对称比 ──
170+ bigrams = bigram_asymmetry_analysis(rt)
171+
172+ print(f"\n{'='*60}")
173+ print(f" 1. 非对称比最高的 bigram(词内部连接)")
174+ print(f"{'='*60}")
175+
176+ # 只看中文 bigram
177+ cn_bigrams = [b for b in bigrams
178+ if '\u4e00' <= b['src'] <= '\u9fff' and '\u4e00' <= b['dst'] <= '\u9fff']
179+
180+ print(f"\nTop 20 高非对称比(强方向性 = 词内部):")
181+ for b in cn_bigrams[:20]:
182+ print(f" {b['src']}→{b['dst']}: ratio={b['ratio']:.2f} (fwd={b['fwd']:.1f}, bwd={b['bwd']:.1f})")
183+
184+ print(f"\nTop 20 低非对称比(弱方向性 = 词边界):")
185+ cn_bigrams_low = sorted(cn_bigrams, key=lambda x: x['ratio'])
186+ for b in cn_bigrams_low[:20]:
187+ print(f" {b['src']}→{b['dst']}: ratio={b['ratio']:.2f} (fwd={b['fwd']:.1f}, bwd={b['bwd']:.1f})")
188+
189+ # ── 词边界检测 ──
190+ print(f"\n{'='*60}")
191+ print(f" 2. 基于非对称比的自动切分")
192+ print(f"{'='*60}")
193+
194+ # 取一段中文文本做切分
195+ test_sentences = []
196+ cn_text = []
197+ for c in chars:
198+ if '\u4e00' <= c <= '\u9fff':
199+ cn_text.append(c)
200+ elif cn_text:
201+ if len(cn_text) >= 4:
202+ test_sentences.append(''.join(cn_text))
203+ cn_text = []
204+
205+ # 一年级语文的已知常见词
206+ known_words_grade1 = [
207+ "老师", "学生", "同学", "小鸟", "大小", "上下", "左右",
208+ "学校", "花朵", "天地", "日月", "我们", "你们", "他们",
209+ "爸爸", "妈妈", "太阳", "月亮", "星星", "白云",
210+ "小朋友", "春天", "秋天", "冬天", "夏天",
211+ "美丽", "快乐", "高兴", "漂亮", "可爱",
212+ "什么", "为什", "这个", "那个", "不是",
213+ "语文", "数学", "读书", "写字", "画画",
214+ "自己", "大家", "小学", "中国", "北京",
215+ ]
216+
217+ all_segments = []
218+ for sent in test_sentences[:50]:
219+ segs = word_boundary_detection(rt, list(sent))
220+ all_segments.extend(segs)
221+
222+ eval_result = evaluate_segmentation(all_segments, known_words_grade1)
223+
224+ print(f"\n切分统计:")
225+ print(f" 总片段: {eval_result['total_segments']}")
226+ print(f" 中文词片段(2-4字): {eval_result['cn_word_segments']}")
227+ print(f" 命中已知词: {eval_result['known_word_hits']}/{eval_result['known_word_total']} ({eval_result['hit_rate']}%)")
228+
229+ print(f"\n涌现的中文片段(前30个):")
230+ for seg in eval_result['sample_cn_segs']:
231+ print(f" 「{seg}」", end="")
232+ print()
233+
234+ # ── Hub 分析:哪些字是连接中心 ──
235+ print(f"\n{'='*60}")
236+ print(f" 3. Hub 节点分析(高 κ = 语言骨架)")
237+ print(f"{'='*60}")
238+
239+ kappas = {}
240+ for nid in rt.graph.nodes:
241+ if '\u4e00' <= nid <= '\u9fff':
242+ kappas[nid] = rt.graph.convergence(nid)
243+
244+ top_hubs = sorted(kappas.items(), key=lambda x: -x[1])[:30]
245+ print(f"\nTop 30 中文 Hub 字:")
246+ for nid, k in top_hubs:
247+ hits = rt.state.experience_hits.get(nid, 0)
248+ c = rt.state.get_confidence(nid)
249+ print(f" '{nid}': κ={k:.2f}, hits={hits}, c={c:.3f}")
250+
251+ # ── 闭环分析:哪些字形成技能环路 ──
252+ print(f"\n{'='*60}")
253+ print(f" 4. 闭环分析(非对称环流 = 固化的模式)")
254+ print(f"{'='*60}")
255+
256+ circuits = []
257+ cn_nodes = [n for n in rt.graph.nodes if '\u4e00' <= n <= '\u9fff']
258+ for a in cn_nodes[:30]:
259+ for b in rt.graph.neighbors_fwd(a):
260+ if not ('\u4e00' <= b <= '\u9fff'):
261+ continue
262+ for c in rt.graph.neighbors_fwd(b):
263+ if not ('\u4e00' <= c <= '\u9fff'):
264+ continue
265+ if a in rt.graph.neighbors_fwd(c):
266+ circ_fwd = rt.graph.circulation([a, b, c, a])
267+ circ_rev = rt.graph.circulation([a, c, b, a])
268+ asym = abs(circ_fwd - circ_rev)
269+ if asym > 0.5:
270+ circuits.append((f"{a}{b}{c}", circ_fwd, circ_rev, asym))
271+
272+ circuits.sort(key=lambda x: -x[3])
273+ print(f"\nTop 20 中文三字闭环:")
274+ for tri, fwd, rev, asym in circuits[:20]:
275+ print(f" 「{tri}」: fwd={fwd:.1f}, rev={rev:.1f}, asym={asym:.1f}")
276+
277+ # ── 最终快照 ──
278+ snap = rt.snapshot_state()
279+ print(f"\n{'='*60}")
280+ print(f" 最终状态")
281+ print(f"{'='*60}")
282+ print(f" nodes: {snap['phi_summary']['count']}")
283+ print(f" edges: {snap['graph']['edge_count']}")
284+ print(f" anchors: {len(rt.state.anchor_nodes)}")
285+ print(f" ability_cores: {len(rt.state.ability_cores)}")
286+ print(f" experience: {len(rt.state.experience_regions.get('experience', set()))}")
287+ print(f" skill_belts: {len(rt.state.skill_belt_candidates)}")
288+ print(f" phi_range: [{snap['phi_summary']['min']:.3f}, {snap['phi_summary']['max']:.3f}]")
289+ print(f" attention: {snap['attention']['used']:.1f}/{snap['attention']['total']:.0f}")
290+
291+
292+if __name__ == '__main__':
293+ run_experiment()