audit_rendered_placeholder_integrity.php 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. <?php
  2. /**
  3. * 全库题干「下划线占位 + 句点小黑点」流水线校验(与 paper-body 选择/填空口径对齐)。
  4. *
  5. * 用法:
 * php scripts/audit_rendered_placeholder_integrity.php [--connection=mysql] [--table=questions]
 * [--chunk=2000] [--out-dir=storage/app/audit_placeholder]
 * [--types=choice,fill]
  9. * [--check-unbalanced-dollars]
  10. *
  11. * 默认:仅扫描 choice + fill;输出 summary JSON + ndjson 明细。
  12. * 「$ 个数奇偶」默认不测(题库脏数据多时可加 --check-unbalanced-dollars)。
  13. * 重点排查项单独写入 *priority_issues*.ndjson(空位夹在双 $…$ 段之间、公式段以运算符结尾紧邻空位)。
  14. */
  15. declare(strict_types=1);
  16. require __DIR__.'/../vendor/autoload.php';
  17. $app = require __DIR__.'/../bootstrap/app.php';
  18. $kernel = $app->make(Illuminate\Contracts\Console\Kernel::class);
  19. $kernel->bootstrap();
  20. use App\Support\BlankPlaceholderRenderer;
  21. use Illuminate\Support\Facades\DB;
  22. $options = getopt('', [
  23. 'table::',
  24. 'connection::',
  25. 'chunk::',
  26. 'out-dir::',
  27. 'types::',
  28. 'check-unbalanced-dollars::',
  29. ]);
  30. $checkUnbalancedDollars = array_key_exists('check-unbalanced-dollars', $options);
  31. $table = isset($options['table']) ? trim((string) $options['table']) : 'questions';
  32. $connection = isset($options['connection']) ? trim((string) $options['connection']) : config('database.default');
  33. $chunk = isset($options['chunk']) ? max(100, (int) $options['chunk']) : 2000;
  34. $defaultOut = dirname(__DIR__).'/storage/app/audit_placeholder';
  35. $outDir = isset($options['out-dir']) ? rtrim((string) $options['out-dir'], '/') : $defaultOut;
  36. // 默认仅选择与填空(与用户需求一致);若要全题型可传 --types=all 并在下方解析
  37. $typeFilter = isset($options['types']) ? trim((string) $options['types']) : 'choice,fill';
  38. $types = [];
  39. if (strtolower($typeFilter) === 'all') {
  40. $types = [];
  41. } elseif ($typeFilter !== '') {
  42. $types = array_values(array_filter(array_map('trim', explode(',', $typeFilter)), static fn($v) => $v !== ''));
  43. }
  44. @mkdir($outDir, 0777, true);
  45. $stamp = date('Ymd_His');
  46. $summaryPath = "{$outDir}/rendered_placeholder_audit_summary_{$stamp}.json";
  47. $detailPath = "{$outDir}/rendered_placeholder_audit_details_{$stamp}.ndjson";
  48. $priorityDetailPath = "{$outDir}/rendered_placeholder_audit_priority_issues_{$stamp}.ndjson";
  49. $detailFp = fopen($detailPath, 'wb');
  50. if ($detailFp === false) {
  51. fwrite(STDERR, "Failed to open detail file: {$detailPath}\n");
  52. exit(1);
  53. }
  54. $priorityIssueTypes = [
  55. 'blank_between_math_segments',
  56. 'math_ends_with_operator_before_blank',
  57. ];
  58. $priorityFp = fopen($priorityDetailPath, 'wb');
  59. if ($priorityFp === false) {
  60. fwrite(STDERR, "Failed to open priority detail file: {$priorityDetailPath}\n");
  61. exit(1);
  62. }
  63. $issues = [];
  64. $examples = [];
  65. $scanned = 0;
  66. $startedAt = microtime(true);
  67. $recordIssue = static function (string $type, object $row, string $reason, string $rendered) use (&$issues, &$examples, $detailFp, $priorityFp, $priorityIssueTypes): void {
  68. if (! isset($issues[$type])) {
  69. $issues[$type] = 0;
  70. $examples[$type] = [];
  71. }
  72. $issues[$type]++;
  73. $entry = [
  74. 'issue' => $type,
  75. 'id' => (int) $row->id,
  76. 'question_type' => (string) ($row->question_type ?? ''),
  77. 'reason' => $reason,
  78. 'stem_preview' => mb_substr((string) $row->stem, 0, 220),
  79. 'rendered_preview' => mb_substr($rendered, 0, 260),
  80. ];
  81. fwrite($detailFp, json_encode($entry, JSON_UNESCAPED_UNICODE)."\n");
  82. if (in_array($type, $priorityIssueTypes, true)) {
  83. fwrite($priorityFp, json_encode($entry, JSON_UNESCAPED_UNICODE)."\n");
  84. }
  85. if (count($examples[$type]) < 20) {
  86. $examples[$type][] = [
  87. 'id' => (int) $row->id,
  88. 'question_type' => (string) ($row->question_type ?? ''),
  89. 'reason' => $reason,
  90. ];
  91. }
  92. };
  93. $blankSpan = BlankPlaceholderRenderer::defaultBlankSpan();
  94. $query = DB::connection($connection)
  95. ->table($table)
  96. ->select('id', 'question_type', 'stem')
  97. ->whereNotNull('stem')
  98. ->orderBy('id');
  99. if ($types !== []) {
  100. $query->whereIn('question_type', $types);
  101. }
  102. $query->chunkById($chunk, function ($rows) use (&$scanned, $recordIssue, $blankSpan, $checkUnbalancedDollars): void {
  103. foreach ($rows as $row) {
  104. $stem = (string) $row->stem;
  105. $type = strtolower(trim((string) ($row->question_type ?? '')));
  106. [$rendered, $hasPlaceholders] = BlankPlaceholderRenderer::replaceToBlankSpan($stem, $blankSpan, false, false);
  107. // 与当前 paper-body 渲染口径一致(只覆盖选择/填空)
  108. if ($type === 'choice') {
  109. $rendered = BlankPlaceholderRenderer::normalizeTerminalPunctuation($rendered, 'remove');
  110. } elseif ($type === 'fill') {
  111. if (! $hasPlaceholders) {
  112. $rendered .= ' '.$blankSpan;
  113. }
  114. $rendered = BlankPlaceholderRenderer::normalizeTerminalPunctuation($rendered, 'dot');
  115. $rendered = BlankPlaceholderRenderer::normalizePeriodBeforeTrailingParentheticalNote($rendered, '.');
  116. $rendered = BlankPlaceholderRenderer::appendTerminalPunctuationIfMissing($rendered, '.');
  117. }
  118. // 1) 30949 类:\left( + 空位 + \right) 被拆成多个数学段
  119. if (preg_match('/\$\\s*\\\\left[\\(\\[]\\s*\$\\s*<span[^>]*>.*?<\\/span>\\s*\$\\s*\\\\right[\\)\\]]\\s*\$/u', $rendered)) {
  120. $recordIssue('broken_left_right_split', $row, 'left/right wrapped blank split into separate math segments', $rendered);
  121. }
  122. // 2) 空位夹在两个数学段中(高风险结构,常导致公式语义断裂)
  123. if (preg_match('/\$[^$]*\$\\s*<span[^>]*>.*?<\\/span>\\s*\$[^$]*\$/u', $rendered)) {
  124. $recordIssue('blank_between_math_segments', $row, 'blank span inserted between two $...$ segments', $rendered);
  125. }
  126. // 3) 渲染后「可见文本」里 $ 个数奇数 — 默认跳过(原始题干脏数据多);需要时加 --check-unbalanced-dollars
  127. if ($checkUnbalancedDollars) {
  128. $visibleForDollar = html_entity_decode(strip_tags($rendered), ENT_QUOTES | ENT_HTML5, 'UTF-8');
  129. if ((substr_count($visibleForDollar, '$') % 2) !== 0) {
  130. $recordIssue('unbalanced_dollar_after_render', $row, 'odd number of $ in visible text after rendering', $rendered);
  131. }
  132. }
  133. // 4) 数学段在空位前以操作符结束(语义可能不完整)
  134. if (preg_match('/\$[^$]*[=+\-×÷*\\\\cdot]\\s*\$\\s*<span[^>]*>.*?<\\/span>/u', $rendered)) {
  135. $recordIssue('math_ends_with_operator_before_blank', $row, 'math segment ends with operator right before blank span', $rendered);
  136. }
  137. // 5) 2562 类回归:空位 span 后紧跟孤立 $ + 汉字(错误插 $)
  138. if (preg_match('/<\\/span>\s*\$\s*[\p{Han}]/u', $rendered)) {
  139. $recordIssue('span_then_dollar_before_han', $row, 'blank span followed by stray $ before Chinese (formula boundary break)', $rendered);
  140. }
  141. // 6) 占位 token 泄漏(不应出现在最终 HTML)
  142. if (preg_match('/<<<|BLANK_IN_MATH|LATEX_BLANK|LR_PAIR_/u', $rendered)) {
  143. $recordIssue('internal_placeholder_token_leak', $row, 'placeholder token not restored in output', $rendered);
  144. }
  145. $scanned++;
  146. if (($scanned % 5000) === 0) {
  147. fwrite(STDERR, "scanned={$scanned}\n");
  148. }
  149. }
  150. }, 'id');
  151. fclose($detailFp);
  152. fclose($priorityFp);
  153. $elapsed = round(microtime(true) - $startedAt, 3);
  154. $investigationFocus = [
  155. 'rules' => $priorityIssueTypes,
  156. 'issue_counts' => [
  157. 'blank_between_math_segments' => $issues['blank_between_math_segments'] ?? 0,
  158. 'math_ends_with_operator_before_blank' => $issues['math_ends_with_operator_before_blank'] ?? 0,
  159. ],
  160. ];
  161. $summary = [
  162. 'table' => $table,
  163. 'connection' => $connection,
  164. 'chunk' => $chunk,
  165. 'types_filter' => $types,
  166. 'scanned_rows' => $scanned,
  167. 'investigation_focus' => $investigationFocus,
  168. 'checks_disabled_by_default' => array_values(array_filter([
  169. $checkUnbalancedDollars ? null : 'unbalanced_dollar_after_render ($ odd/even in visible text)',
  170. ])),
  171. 'issue_counts' => $issues,
  172. 'example_ids' => array_map(static fn(array $list) => array_column($list, 'id'), array_filter($examples, 'is_array')),
  173. 'elapsed_seconds' => $elapsed,
  174. 'generated_at' => date('c'),
  175. 'detail_path' => $detailPath,
  176. 'priority_issues_detail_path' => $priorityDetailPath,
  177. ];
  178. file_put_contents($summaryPath, json_encode($summary, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT));
  179. echo json_encode([
  180. 'summary_path' => $summaryPath,
  181. 'detail_path' => $detailPath,
  182. 'priority_issues_detail_path' => $priorityDetailPath,
  183. 'summary' => $summary,
  184. ], JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)."\n";