|
|
@@ -0,0 +1,220 @@
|
|
|
+<?php
|
|
|
+
|
|
|
+/**
|
|
|
+ * 全库题干「下划线占位 + 句点小黑点」流水线校验(与 paper-body 选择/填空口径对齐)。
|
|
|
+ *
|
|
|
+ * 用法:
|
|
|
+ * php scripts/audit_rendered_placeholder_integrity.php [--connection mysql] [--table questions]
|
|
|
+ * [--chunk 2000] [--out-dir storage/app/audit_placeholder]
|
|
|
+ * [--types choice,fill]
|
|
|
+ * [--check-unbalanced-dollars]
|
|
|
+ *
|
|
|
+ * 默认:仅扫描 choice + fill;输出 summary JSON + ndjson 明细。
|
|
|
+ * 「$ 个数奇偶」默认不测(题库脏数据多时可加 --check-unbalanced-dollars)。
|
|
|
+ * 重点排查项单独写入 *priority_issues*.ndjson(空位夹在双 $…$ 段之间、公式段以运算符结尾紧邻空位)。
|
|
|
+ */
|
|
|
+
|
|
|
+declare(strict_types=1);
|
|
|
+
|
|
|
+require __DIR__.'/../vendor/autoload.php';
|
|
|
+$app = require __DIR__.'/../bootstrap/app.php';
|
|
|
+$kernel = $app->make(Illuminate\Contracts\Console\Kernel::class);
|
|
|
+$kernel->bootstrap();
|
|
|
+
|
|
|
+use App\Support\BlankPlaceholderRenderer;
|
|
|
+use Illuminate\Support\Facades\DB;
|
|
|
+
|
|
|
// Parse CLI options.
// Value-taking options use a single ":" (required value) so that both "--table=questions" and
// the space-separated "--table questions" form shown in the usage text are accepted; getopt()
// does not accept a space separator for optional-value ("::") options.
// The boolean flag keeps "::" so "--check-unbalanced-dollars" and
// "--check-unbalanced-dollars=1" both continue to enable the check.
$options = getopt('', [
    'table:',
    'connection:',
    'chunk:',
    'out-dir:',
    'types:',
    'check-unbalanced-dollars::',
]);
if ($options === false) {
    // getopt() signals a parse failure with false; continue with defaults only.
    $options = [];
}

/**
 * Reads one CLI option as a trimmed string.
 *
 * @param string $name Long option name without the leading dashes.
 *
 * @return string|null The trimmed value, or null when the option is absent, empty,
 *                     or not a string. If the option was repeated, the last value wins.
 */
$optionValue = static function (string $name) use ($options): ?string {
    if (! array_key_exists($name, $options)) {
        return null;
    }

    $raw = $options[$name];
    if (is_array($raw)) {
        // getopt() returns an array when the same option is passed more than once.
        $raw = end($raw);
    }
    if (! is_string($raw)) {
        return null;
    }

    $value = trim($raw);

    return $value === '' ? null : $value;
};

// Presence alone enables the optional "$ parity" check.
$checkUnbalancedDollars = array_key_exists('check-unbalanced-dollars', $options);

$table = $optionValue('table') ?? 'questions';
$connection = $optionValue('connection') ?? config('database.default');

// Chunk size is clamped to a minimum of 100 rows.
$chunkOption = $optionValue('chunk');
$chunk = $chunkOption !== null ? max(100, (int) $chunkOption) : 2000;

$defaultOut = dirname(__DIR__).'/storage/app/audit_placeholder';
$outDirOption = $optionValue('out-dir');
$outDir = $outDirOption !== null ? rtrim($outDirOption, '/') : $defaultOut;

// Only "choice" and "fill" are scanned by default; pass --types=all to scan every question type.
// An empty --types value falls back to the default instead of silently scanning everything.
$typeFilter = $optionValue('types') ?? 'choice,fill';

// An empty $types list means "no question_type filter".
$types = [];
if (strtolower($typeFilter) !== 'all') {
    $types = array_values(array_filter(
        array_map('trim', explode(',', $typeFilter)),
        static fn (string $v): bool => $v !== '',
    ));
}
|
|
|
+
|
|
|
// Make sure the output directory exists. The second is_dir() check tolerates another process
// creating the directory between our check and the mkdir() call.
// An empty $outDir (e.g. "--out-dir=/" after trimming) is left alone; the paths below then
// resolve against the filesystem root.
if ($outDir !== '' && ! is_dir($outDir) && ! mkdir($outDir, 0777, true) && ! is_dir($outDir)) {
    fwrite(STDERR, "Failed to create output directory: {$outDir}\n");
    exit(1);
}

// All three output files of one run share the same timestamp suffix.
$stamp = date('Ymd_His');
$summaryPath = "{$outDir}/rendered_placeholder_audit_summary_{$stamp}.json";
$detailPath = "{$outDir}/rendered_placeholder_audit_details_{$stamp}.ndjson";
$priorityDetailPath = "{$outDir}/rendered_placeholder_audit_priority_issues_{$stamp}.ndjson";

// NDJSON file receiving one line per recorded issue.
$detailFp = fopen($detailPath, 'wb');
if ($detailFp === false) {
    fwrite(STDERR, "Failed to open detail file: {$detailPath}\n");
    exit(1);
}

// Issue types that are additionally copied into the priority NDJSON file.
$priorityIssueTypes = [
    'blank_between_math_segments',
    'math_ends_with_operator_before_blank',
];

// NDJSON file receiving only the priority issue types listed above.
$priorityFp = fopen($priorityDetailPath, 'wb');
if ($priorityFp === false) {
    fclose($detailFp);
    fwrite(STDERR, "Failed to open priority detail file: {$priorityDetailPath}\n");
    exit(1);
}
|
|
|
+
|
|
|
// Per-issue-type hit counters and up to 20 sample rows per type (used for the summary).
$issues = [];
$examples = [];

$scanned = 0;
$startedAt = microtime(true);

/**
 * Records one finding: bumps the counter for $type, appends an NDJSON line to the detail file
 * (and to the priority file when $type is a priority issue type), and keeps the first 20
 * findings per type as examples.
 *
 * @param string $type     Issue type key.
 * @param object $row      Row exposing id, stem and (optionally) question_type.
 * @param string $reason   Human-readable explanation of the finding.
 * @param string $rendered Rendered stem the check was run against.
 */
$recordIssue = static function (string $type, object $row, string $reason, string $rendered) use (&$issues, &$examples, $detailFp, $priorityFp, $priorityIssueTypes): void {
    if (! isset($issues[$type])) {
        $issues[$type] = 0;
        $examples[$type] = [];
    }
    $issues[$type]++;

    $id = (int) $row->id;
    $questionType = (string) ($row->question_type ?? '');

    $entry = [
        'issue' => $type,
        'id' => $id,
        'question_type' => $questionType,
        'reason' => $reason,
        'stem_preview' => mb_substr((string) $row->stem, 0, 220),
        'rendered_preview' => mb_substr($rendered, 0, 260),
    ];

    // Stems may contain malformed UTF-8; without JSON_INVALID_UTF8_SUBSTITUTE json_encode()
    // returns false and the finding would be written as an empty line.
    $line = json_encode($entry, JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE);
    if ($line === false) {
        fwrite(STDERR, "Failed to encode issue {$type} for id={$id}: ".json_last_error_msg()."\n");
    } else {
        fwrite($detailFp, $line."\n");

        if (in_array($type, $priorityIssueTypes, true)) {
            fwrite($priorityFp, $line."\n");
        }
    }

    if (count($examples[$type]) < 20) {
        $examples[$type][] = [
            'id' => $id,
            'question_type' => $questionType,
            'reason' => $reason,
        ];
    }
};
|
|
|
+
|
|
|
// Blank-span markup used when replacing placeholders; also appended to "fill" stems that
// contain no placeholder at all.
$blankSpan = BlankPlaceholderRenderer::defaultBlankSpan();

$query = DB::connection($connection)
    ->table($table)
    ->select('id', 'question_type', 'stem')
    ->whereNotNull('stem')
    ->orderBy('id');

// An empty $types list means every question type is scanned.
if ($types !== []) {
    $query->whereIn('question_type', $types);
}

// Walk the table in id order, render each stem and run the structural checks on the result.
$query->chunkById($chunk, function ($rows) use (&$scanned, $recordIssue, $blankSpan, $checkUnbalancedDollars): void {
    foreach ($rows as $row) {
        $stem = (string) $row->stem;
        $type = strtolower(trim((string) ($row->question_type ?? '')));

        [$rendered, $hasPlaceholders] = BlankPlaceholderRenderer::replaceToBlankSpan($stem, $blankSpan, false, false);

        // Post-processing per question type. Per the original author's note this mirrors the
        // paper-body rendering, which only covers "choice" and "fill"; other types are checked
        // on the raw replaceToBlankSpan() output.
        if ($type === 'choice') {
            $rendered = BlankPlaceholderRenderer::normalizeTerminalPunctuation($rendered, 'remove');
        } elseif ($type === 'fill') {
            if (! $hasPlaceholders) {
                $rendered .= ' '.$blankSpan;
            }
            $rendered = BlankPlaceholderRenderer::normalizeTerminalPunctuation($rendered, 'dot');
            $rendered = BlankPlaceholderRenderer::normalizePeriodBeforeTrailingParentheticalNote($rendered, '.');
            $rendered = BlankPlaceholderRenderer::appendTerminalPunctuationIfMissing($rendered, '.');
        }

        // 1) Cases like #30949: "\left(" + blank + "\right)" split into separate math segments.
        if (preg_match('/\$\\s*\\\\left[\\(\\[]\\s*\$\\s*<span[^>]*>.*?<\\/span>\\s*\$\\s*\\\\right[\\)\\]]\\s*\$/u', $rendered)) {
            $recordIssue('broken_left_right_split', $row, 'left/right wrapped blank split into separate math segments', $rendered);
        }

        // 2) Blank sandwiched between two $...$ segments (high-risk structure that often
        //    breaks the meaning of the formula).
        if (preg_match('/\$[^$]*\$\\s*<span[^>]*>.*?<\\/span>\\s*\$[^$]*\$/u', $rendered)) {
            $recordIssue('blank_between_math_segments', $row, 'blank span inserted between two $...$ segments', $rendered);
        }

        // 3) Odd number of "$" in the visible text after rendering. Skipped unless
        //    --check-unbalanced-dollars is given, because raw stems contain a lot of dirty data.
        if ($checkUnbalancedDollars) {
            $visibleForDollar = html_entity_decode(strip_tags($rendered), ENT_QUOTES | ENT_HTML5, 'UTF-8');
            if ((substr_count($visibleForDollar, '$') % 2) !== 0) {
                $recordIssue('unbalanced_dollar_after_render', $row, 'odd number of $ in visible text after rendering', $rendered);
            }
        }

        // 4) Math segment ends with an operator right before the blank (possibly incomplete).
        //    "\cdot" must be an alternation branch: inside a character class it would match the
        //    single characters "\", "c", "d", "o", "t" and flag segments ending in those letters.
        if (preg_match('/\$[^$]*(?:[=+\-×÷*]|\\\\cdot)\\s*\$\\s*<span[^>]*>.*?<\\/span>/u', $rendered)) {
            $recordIssue('math_ends_with_operator_before_blank', $row, 'math segment ends with operator right before blank span', $rendered);
        }

        // 5) Regression guard for cases like #2562: blank span directly followed by a stray "$"
        //    and a Han character (a "$" inserted at the wrong place).
        if (preg_match('/<\\/span>\s*\$\s*[\p{Han}]/u', $rendered)) {
            $recordIssue('span_then_dollar_before_han', $row, 'blank span followed by stray $ before Chinese (formula boundary break)', $rendered);
        }

        // 6) Internal placeholder tokens leaking into the final HTML.
        if (preg_match('/<<<|BLANK_IN_MATH|LATEX_BLANK|LR_PAIR_/u', $rendered)) {
            $recordIssue('internal_placeholder_token_leak', $row, 'placeholder token not restored in output', $rendered);
        }

        // Progress heartbeat on STDERR every 5000 rows.
        $scanned++;
        if (($scanned % 5000) === 0) {
            fwrite(STDERR, "scanned={$scanned}\n");
        }
    }
}, 'id');
|
|
|
+
|
|
|
// The scan is finished: flush and release both NDJSON handles.
fclose($detailFp);
fclose($priorityFp);

$elapsed = round(microtime(true) - $startedAt, 3);

// Counts for the priority issue types, derived from $priorityIssueTypes so the list of types
// and the reported counts cannot drift apart. Types with no hits are reported as 0.
$priorityCounts = [];
foreach ($priorityIssueTypes as $priorityType) {
    $priorityCounts[$priorityType] = $issues[$priorityType] ?? 0;
}

$investigationFocus = [
    'rules' => $priorityIssueTypes,
    'issue_counts' => $priorityCounts,
];

$summary = [
    'table' => $table,
    'connection' => $connection,
    'chunk' => $chunk,
    'types_filter' => $types,
    'scanned_rows' => $scanned,
    'investigation_focus' => $investigationFocus,
    // Lists the optional check only while it is switched off.
    'checks_disabled_by_default' => array_values(array_filter([
        $checkUnbalancedDollars ? null : 'unbalanced_dollar_after_render ($ odd/even in visible text)',
    ])),
    'issue_counts' => $issues,
    // Up to 20 sample row ids per issue type.
    'example_ids' => array_map(
        static fn (array $list): array => array_column($list, 'id'),
        array_filter($examples, 'is_array'),
    ),
    'elapsed_seconds' => $elapsed,
    'generated_at' => date('c'),
    'detail_path' => $detailPath,
    'priority_issues_detail_path' => $priorityDetailPath,
];

// Persist the summary. A failure is reported and reflected in the exit code, but the summary
// is still echoed below so the result of a long scan is not lost.
$exitCode = 0;
$summaryJson = json_encode($summary, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
if ($summaryJson === false || file_put_contents($summaryPath, $summaryJson) === false) {
    fwrite(STDERR, "Failed to write summary file: {$summaryPath}\n");
    $exitCode = 1;
}

echo json_encode([
    'summary_path' => $summaryPath,
    'detail_path' => $detailPath,
    'priority_issues_detail_path' => $priorityDetailPath,
    'summary' => $summary,
], JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)."\n";

if ($exitCode !== 0) {
    exit($exitCode);
}
|