fill_missing_fields.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 批量补录族谱人员缺失字段:name_word(字辈)、former_name(曾用名)、childhood_name(幼名/乳名)
  5. 用法:
  6. python3 fill_missing_fields.py # dry-run,仅输出预览 CSV,不写数据库
  7. python3 fill_missing_fields.py --commit # 真正写入数据库
  8. python3 fill_missing_fields.py --stats # 仅统计字段填写率,不做任何修改
  9. """
  10. from __future__ import annotations
  11. import sys
  12. import re
  13. import json
  14. import csv
  15. import pymysql
  16. from datetime import datetime
  17. # ─── 数据库配置 ───────────────────────────────────────────────────────────────
  18. DB_CONFIG = {
  19. "host": "rm-f8ze60yirdj8786u2wo.mysql.rds.aliyuncs.com",
  20. "port": 3306,
  21. "user": "root",
  22. "password": "csqz@20255",
  23. "db": "csqz-client",
  24. "charset": "utf8mb4",
  25. "cursorclass": pymysql.cursors.DictCursor,
  26. }
  27. COMMIT_MODE = "--commit" in sys.argv
  28. STATS_ONLY = "--stats" in sys.argv
  29. # ─── 族谱原文正则规则 ────────────────────────────────────────────────────────
  30. _BOUNDARY = r'(?=[,,。.\s公字幼又乳小二三四五六七八九十\))\]】]|$)'
  31. REGEX_RULES = {
  32. "former_name": [
  33. re.compile(r'曾用名[::,,\s]*([^\s,,。.公字幼又乳小]{1,10})' + _BOUNDARY),
  34. re.compile(r'又名[::,,\s]*([^\s,,。.公字幼乳小]{1,10})' + _BOUNDARY),
  35. re.compile(r'曾名[::,,\s]*([^\s,,。.公字幼乳小]{1,10})' + _BOUNDARY),
  36. re.compile(r'一名[::,,\s]*([^\s,,。.公字幼乳小]{1,10})' + _BOUNDARY),
  37. ],
  38. "childhood_name": [
  39. re.compile(r'幼名[::,,\s]*([^\s,,。.公字又曾]{1,10})' + _BOUNDARY),
  40. re.compile(r'乳名[::,,\s]*([^\s,,。.公字又曾]{1,10})' + _BOUNDARY),
  41. re.compile(r'小名[::,,\s]*([^\s,,。.公字又曾]{1,10})' + _BOUNDARY),
  42. ],
  43. }
  44. def _normalize_text(text):
  45. """全角标点→半角,统一处理"""
  46. if not text:
  47. return ""
  48. text = text.replace('\uff1a', ':').replace('\uff0c', ',').replace('\u3002', '.')
  49. text = text.replace('\u3000', ' ').replace('\xa0', ' ')
  50. return text
  51. def _extract_from_original(text_trad, text_simp):
  52. """从族谱原文中提取 former_name / childhood_name,返回 dict"""
  53. result = {}
  54. for field, patterns in REGEX_RULES.items():
  55. for text in [text_trad or "", text_simp or ""]:
  56. norm = _normalize_text(text)
  57. for pat in patterns:
  58. m = pat.search(norm)
  59. if m:
  60. val = m.group(1).strip().rstrip('公')
  61. if val:
  62. result[field] = val
  63. break
  64. if field in result:
  65. break
  66. return result
  67. def _match_ai_name_word(ai_content_str, member_name, member_simp):
  68. """从 ai_content JSON 中按姓名匹配 name_word,返回字符串或 None"""
  69. if not ai_content_str:
  70. return None
  71. try:
  72. records = json.loads(ai_content_str)
  73. if not isinstance(records, list):
  74. records = [records]
  75. for r in records:
  76. orig = (r.get('original_name') or '').strip()
  77. nm = (r.get('name') or '').strip()
  78. nw = (r.get('name_word') or '').strip()
  79. if not nw:
  80. continue
  81. targets = set()
  82. if member_name: targets.add(member_name)
  83. if member_simp: targets.add(member_simp)
  84. # 精确匹配
  85. if orig in targets or nm in targets:
  86. return nw
  87. # 模糊匹配(名字互含)
  88. for t in targets:
  89. if t and (t in orig or t in nm or orig in t or nm in t):
  90. return nw
  91. except Exception:
  92. pass
  93. return None
  94. def _guess_name_word(simplified_name, name):
  95. """从姓名推算字辈:去掉"公"后缀,取第1个字;返回字符串或 None"""
  96. raw = (simplified_name or name or "").strip()
  97. raw = re.sub(r'公$', '', raw)
  98. raw = re.sub(r'\s+', '', raw)
  99. if len(raw) >= 2:
  100. return raw[0]
  101. return None
  102. def get_conn():
  103. return pymysql.connect(**DB_CONFIG)
  104. def print_stats(conn):
  105. with conn.cursor() as cur:
  106. cur.execute("SELECT COUNT(*) AS total FROM family_member_info")
  107. total = cur.fetchone()['total']
  108. for field, label in [
  109. ('name_word', '字辈'),
  110. ('former_name', '曾用名'),
  111. ('childhood_name', '幼名/乳名'),
  112. ]:
  113. cur.execute("""
  114. SELECT COUNT(*) AS cnt FROM family_member_info
  115. WHERE {} IS NOT NULL AND TRIM({}) != ''
  116. """.format(field, field))
  117. filled = cur.fetchone()['cnt']
  118. pct = filled / total * 100 if total else 0
  119. print(" {:<12s} ({:<15s}): 已填 {:4d} / {} ({:.1f}%)".format(
  120. label, field, filled, total, pct))
  121. def collect_updates(conn):
  122. """
  123. 遍历所有成员,计算需要补录的字段。
  124. 返回 list of dict。
  125. """
  126. print(" 正在加载成员数据…")
  127. with conn.cursor() as cur:
  128. cur.execute("""
  129. SELECT id, name, simplified_name,
  130. name_word, former_name, childhood_name,
  131. genealogy_original_traditional,
  132. genealogy_original_simplified,
  133. source_record_id
  134. FROM family_member_info
  135. """)
  136. members = cur.fetchall()
  137. print(" 成员总数:{}".format(len(members)))
  138. print(" 正在加载 AI 解析记录…")
  139. cur.execute("""
  140. SELECT id, ai_content
  141. FROM genealogy_records
  142. WHERE ai_status = 2 AND ai_content IS NOT NULL
  143. """)
  144. ai_map = {r['id']: r['ai_content'] for r in cur.fetchall()}
  145. print(" AI 记录数:{}".format(len(ai_map)))
  146. updates = []
  147. for m in members:
  148. mid = m['id']
  149. name = (m['name'] or '').strip()
  150. simp = (m['simplified_name'] or '').strip()
  151. cur_nw = (m['name_word'] or '').strip()
  152. cur_fn = (m['former_name'] or '').strip()
  153. cur_cn = (m['childhood_name'] or '').strip()
  154. orig_trad = m['genealogy_original_traditional'] or ''
  155. orig_simp = m['genealogy_original_simplified'] or ''
  156. source_rec_id = m['source_record_id']
  157. new_nw = None
  158. nw_source = ''
  159. new_fn = None
  160. new_cn = None
  161. # ── 字辈:优先 ai_content ─────────────────────────────────────────
  162. if not cur_nw:
  163. if source_rec_id and source_rec_id in ai_map:
  164. ai_nw = _match_ai_name_word(ai_map[source_rec_id], name, simp)
  165. if ai_nw:
  166. new_nw = ai_nw
  167. nw_source = 'ai_content'
  168. # 兜底:从姓名推算
  169. if not new_nw:
  170. guessed = _guess_name_word(simp, name)
  171. if guessed:
  172. new_nw = guessed
  173. nw_source = 'name_guess'
  174. # ── 曾用名 / 幼名:从族谱原文提取 ──────────────────────────────
  175. extracted = _extract_from_original(orig_trad, orig_simp)
  176. if not cur_fn and extracted.get('former_name'):
  177. new_fn = extracted['former_name']
  178. if not cur_cn and extracted.get('childhood_name'):
  179. new_cn = extracted['childhood_name']
  180. # 只收录有变化的记录
  181. if new_nw or new_fn or new_cn:
  182. updates.append({
  183. 'id': mid,
  184. 'name': name,
  185. 'simplified_name': simp,
  186. 'name_word_current': cur_nw,
  187. 'name_word_new': new_nw or '',
  188. 'name_word_source': nw_source,
  189. 'former_name_current': cur_fn,
  190. 'former_name_new': new_fn or '',
  191. 'childhood_name_current': cur_cn,
  192. 'childhood_name_new': new_cn or '',
  193. })
  194. return updates
  195. def write_csv(updates, path):
  196. fieldnames = [
  197. 'id', 'name', 'simplified_name',
  198. 'name_word_current', 'name_word_new', 'name_word_source',
  199. 'former_name_current', 'former_name_new',
  200. 'childhood_name_current', 'childhood_name_new',
  201. ]
  202. with open(path, 'w', newline='', encoding='utf-8-sig') as f:
  203. writer = csv.DictWriter(f, fieldnames=fieldnames)
  204. writer.writeheader()
  205. writer.writerows(updates)
  206. print(" 已写出预览 CSV:{}".format(path))
  207. def apply_updates(conn, updates):
  208. """执行 UPDATE,仅覆盖当前为空的字段"""
  209. stats = {'name_word': 0, 'former_name': 0, 'childhood_name': 0, 'total_members': 0}
  210. with conn.cursor() as cur:
  211. for u in updates:
  212. sets = []
  213. params = []
  214. if u['name_word_new']:
  215. sets.append("name_word = %s")
  216. params.append(u['name_word_new'])
  217. stats['name_word'] += 1
  218. if u['former_name_new']:
  219. sets.append("former_name = %s")
  220. params.append(u['former_name_new'])
  221. stats['former_name'] += 1
  222. if u['childhood_name_new']:
  223. sets.append("childhood_name = %s")
  224. params.append(u['childhood_name_new'])
  225. stats['childhood_name'] += 1
  226. if sets:
  227. params.append(u['id'])
  228. cur.execute(
  229. "UPDATE family_member_info SET {} WHERE id = %s".format(', '.join(sets)),
  230. params
  231. )
  232. stats['total_members'] += 1
  233. conn.commit()
  234. return stats
  235. def main():
  236. print("=" * 60)
  237. mode_label = '[COMMIT 模式 - 真正写入]' if COMMIT_MODE else '[DRY-RUN 模式 - 仅预览]'
  238. print(" 族谱字段批量补录工具 {}".format(mode_label))
  239. print(" 运行时间:{}".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
  240. print("=" * 60)
  241. conn = get_conn()
  242. try:
  243. print("\n【字段填写率统计(运行前)】")
  244. print_stats(conn)
  245. if STATS_ONLY:
  246. return
  247. print("\n【扫描成员数据…】")
  248. updates = collect_updates(conn)
  249. ai_nw = [u for u in updates if u['name_word_source'] == 'ai_content']
  250. guess_nw = [u for u in updates if u['name_word_source'] == 'name_guess']
  251. fn_hits = [u for u in updates if u['former_name_new']]
  252. cn_hits = [u for u in updates if u['childhood_name_new']]
  253. print("\n 字辈(来自 ai_content) : {:4d} 条 ← 高可信度".format(len(ai_nw)))
  254. print(" 字辈(来自姓名推算) : {:4d} 条 ← 建议人工抽检".format(len(guess_nw)))
  255. print(" 曾用名(来自族谱原文) : {:4d} 条".format(len(fn_hits)))
  256. print(" 幼名/乳名(来自族谱原文): {:4d} 条".format(len(cn_hits)))
  257. print(" 涉及成员总计 : {:4d} 名".format(len(updates)))
  258. ts = datetime.now().strftime('%Y%m%d_%H%M%S')
  259. # 全量预览 CSV
  260. all_csv = "fill_preview_{}.csv".format(ts)
  261. write_csv(updates, all_csv)
  262. # 推算字辈单独一个 CSV(供重点核查)
  263. if guess_nw:
  264. guess_csv = "fill_preview_guess_nameword_{}.csv".format(ts)
  265. write_csv(guess_nw, guess_csv)
  266. print(" ⚠ 推算字辈单独导出:{}(请优先人工核查)".format(guess_csv))
  267. if COMMIT_MODE:
  268. print("\n【开始写入数据库…】")
  269. stats = apply_updates(conn, updates)
  270. print(" 字辈写入 : {} 条".format(stats['name_word']))
  271. print(" 曾用名写入 : {} 条".format(stats['former_name']))
  272. print(" 幼名/乳名写入 : {} 条".format(stats['childhood_name']))
  273. print(" 影响成员 : {} 名".format(stats['total_members']))
  274. print("\n【字段填写率统计(运行后)】")
  275. print_stats(conn)
  276. else:
  277. print("\n DRY-RUN 完成,未写入数据库。")
  278. print(" 确认 CSV 内容无误后,执行真正写入:")
  279. print(" python3 fill_missing_fields.py --commit")
  280. finally:
  281. conn.close()
  282. print("\n" + "=" * 60)
  283. if __name__ == '__main__':
  284. main()