make(Illuminate\Contracts\Console\Kernel::class);
$kernel->bootstrap();

use App\Support\BlankPlaceholderRenderer;
use Illuminate\Support\Facades\DB;

/*
 * Rendered-placeholder audit.
 *
 * Streams rows (id, question_type, stem) from a table, renders each stem
 * through BlankPlaceholderRenderer, and pattern-checks the rendered HTML for
 * structures where a blank span breaks up a math ($...$) expression.
 *
 * CLI options (all optional):
 *   --table=NAME                 table to scan (default "questions")
 *   --connection=NAME            DB connection (default config('database.default'))
 *   --chunk=N                    chunk size, clamped to a minimum of 100 (default 2000)
 *   --out-dir=PATH               output directory (default <parent dir>/storage/app/audit_placeholder)
 *   --types=a,b | all            question_type filter (default "choice,fill")
 *   --check-unbalanced-dollars   enable the odd-"$"-count check (off by default)
 *
 * Outputs: one NDJSON file with every issue, one NDJSON file with only the
 * priority issue types, one JSON summary file, and the summary echoed to stdout.
 * Exit code is 1 when an output file or directory cannot be created/written.
 */

$options = getopt('', [
    'table::',
    'connection::',
    'chunk::',
    'out-dir::',
    'types::',
    'check-unbalanced-dollars::',
]);
if ($options === false) {
    // getopt() reports failure as false; array_key_exists() below needs an array.
    $options = [];
}

$checkUnbalancedDollars = array_key_exists('check-unbalanced-dollars', $options);
$table = isset($options['table']) ? trim((string) $options['table']) : 'questions';
$connection = isset($options['connection']) ? trim((string) $options['connection']) : config('database.default');
$chunk = isset($options['chunk']) ? max(100, (int) $options['chunk']) : 2000;
$defaultOut = dirname(__DIR__).'/storage/app/audit_placeholder';
$outDir = isset($options['out-dir']) ? rtrim((string) $options['out-dir'], '/') : $defaultOut;

// By default only "choice" and "fill" questions are audited; pass --types=all
// to scan every question type. An empty $types list means "no filter".
$typeFilter = isset($options['types']) ? trim((string) $options['types']) : 'choice,fill';
$types = [];
if (strtolower($typeFilter) !== 'all' && $typeFilter !== '') {
    $types = array_values(array_filter(
        array_map('trim', explode(',', $typeFilter)),
        static fn($v) => $v !== ''
    ));
}

// Create the output directory explicitly instead of suppressing errors with "@".
// The trailing is_dir() tolerates another process creating it concurrently.
// An empty $outDir (from --out-dir=/) is left to the fopen() checks below.
if ($outDir !== '' && ! is_dir($outDir) && ! mkdir($outDir, 0777, true) && ! is_dir($outDir)) {
    fwrite(STDERR, "Failed to create output directory: {$outDir}\n");
    exit(1);
}

$stamp = date('Ymd_His');
$summaryPath = "{$outDir}/rendered_placeholder_audit_summary_{$stamp}.json";
$detailPath = "{$outDir}/rendered_placeholder_audit_details_{$stamp}.ndjson";
$priorityDetailPath = "{$outDir}/rendered_placeholder_audit_priority_issues_{$stamp}.ndjson";

$detailFp = fopen($detailPath, 'wb');
if ($detailFp === false) {
    fwrite(STDERR, "Failed to open detail file: {$detailPath}\n");
    exit(1);
}

// Issue types that are additionally copied to the priority NDJSON file.
$priorityIssueTypes = [
    'blank_between_math_segments',
    'math_ends_with_operator_before_blank',
];

$priorityFp = fopen($priorityDetailPath, 'wb');
if ($priorityFp === false) {
    fwrite(STDERR, "Failed to open priority detail file: {$priorityDetailPath}\n");
    exit(1);
}

// Invalid UTF-8 in a stem would otherwise make json_encode() return false and
// produce an empty NDJSON line; substitute the offending bytes instead.
$jsonFlags = JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE;

$issues = [];    // issue type => number of occurrences
$examples = [];  // issue type => up to 20 sample rows (id, question_type, reason)
$scanned = 0;
$startedAt = microtime(true);

/*
 * Record one finding: bump the per-type counter, append an NDJSON line to the
 * detail file (and to the priority file for priority types), and keep the
 * first 20 occurrences per type as examples for the summary.
 */
$recordIssue = static function (string $type, object $row, string $reason, string $rendered) use (
    &$issues,
    &$examples,
    $detailFp,
    $priorityFp,
    $priorityIssueTypes,
    $jsonFlags
): void {
    if (! isset($issues[$type])) {
        $issues[$type] = 0;
        $examples[$type] = [];
    }
    $issues[$type]++;

    $id = (int) $row->id;
    $questionType = (string) ($row->question_type ?? '');

    $line = json_encode([
        'issue' => $type,
        'id' => $id,
        'question_type' => $questionType,
        'reason' => $reason,
        'stem_preview' => mb_substr((string) $row->stem, 0, 220),
        'rendered_preview' => mb_substr($rendered, 0, 260),
    ], $jsonFlags);

    if ($line === false) {
        // Still counted above; report instead of silently writing a blank line.
        fwrite(STDERR, "Failed to encode issue entry for id={$id}: ".json_last_error_msg()."\n");
    } else {
        fwrite($detailFp, $line."\n");
        if (in_array($type, $priorityIssueTypes, true)) {
            fwrite($priorityFp, $line."\n");
        }
    }

    if (count($examples[$type]) < 20) {
        $examples[$type][] = [
            'id' => $id,
            'question_type' => $questionType,
            'reason' => $reason,
        ];
    }
};

$blankSpan = BlankPlaceholderRenderer::defaultBlankSpan();

// Regex fragments shared by the checks below (built once, not per row).
// NOTE(review): assumes the rendered blank is a <span ...>...</span> element,
// as the closing-tag match in the original patterns implies; confirm against
// BlankPlaceholderRenderer::defaultBlankSpan().
$blankSpanRe = '<span[^>]*>.*?<\\/span>';
// A trailing operator: one of the single-character operators, or the LaTeX
// command \cdot as a whole token (not as individual letters in a class).
$operatorRe = '(?:[=+\\-×÷*]|\\\\cdot)';

$brokenLeftRightRe = '/\$\\s*\\\\left[\\(\\[]\\s*\$\\s*'.$blankSpanRe.'\\s*\$\\s*\\\\right[\\)\\]]\\s*\$/u';
$blankBetweenMathRe = '/\$[^$]*\$\\s*'.$blankSpanRe.'\\s*\$[^$]*\$/u';
$operatorBeforeBlankRe = '/\$[^$]*'.$operatorRe.'\\s*\$\\s*'.$blankSpanRe.'/u';
$spanThenDollarHanRe = '/<\\/span>\s*\$\s*[\p{Han}]/u';
$tokenLeakRe = '/<<<|BLANK_IN_MATH|LATEX_BLANK|LR_PAIR_/u';

$query = DB::connection($connection)
    ->table($table)
    ->select('id', 'question_type', 'stem')
    ->whereNotNull('stem')
    ->orderBy('id');

if ($types !== []) {
    $query->whereIn('question_type', $types);
}

$query->chunkById($chunk, static function ($rows) use (
    &$scanned,
    $recordIssue,
    $blankSpan,
    $checkUnbalancedDollars,
    $brokenLeftRightRe,
    $blankBetweenMathRe,
    $operatorBeforeBlankRe,
    $spanThenDollarHanRe,
    $tokenLeakRe
): void {
    foreach ($rows as $row) {
        $stem = (string) $row->stem;
        $type = strtolower(trim((string) ($row->question_type ?? '')));

        [$rendered, $hasPlaceholders] = BlankPlaceholderRenderer::replaceToBlankSpan($stem, $blankSpan, false, false);

        // Mirror the current paper-body rendering rules (choice/fill only).
        if ($type === 'choice') {
            $rendered = BlankPlaceholderRenderer::normalizeTerminalPunctuation($rendered, 'remove');
        } elseif ($type === 'fill') {
            if (! $hasPlaceholders) {
                // A fill question without any placeholder gets a blank appended.
                $rendered .= ' '.$blankSpan;
            }
            $rendered = BlankPlaceholderRenderer::normalizeTerminalPunctuation($rendered, 'dot');
            $rendered = BlankPlaceholderRenderer::normalizePeriodBeforeTrailingParentheticalNote($rendered, '.');
            $rendered = BlankPlaceholderRenderer::appendTerminalPunctuationIfMissing($rendered, '.');
        }

        // 1) Case 30949 class: \left( + blank + \right) split into separate math segments.
        if (preg_match($brokenLeftRightRe, $rendered)) {
            $recordIssue('broken_left_right_split', $row, 'left/right wrapped blank split into separate math segments', $rendered);
        }

        // 2) Blank sandwiched between two math segments (high-risk structure
        //    that often breaks the formula's meaning).
        if (preg_match($blankBetweenMathRe, $rendered)) {
            $recordIssue('blank_between_math_segments', $row, 'blank span inserted between two $...$ segments', $rendered);
        }

        // 3) Odd number of "$" in the visible text after rendering. Skipped by
        //    default because raw stems contain a lot of dirty data; enable
        //    with --check-unbalanced-dollars.
        if ($checkUnbalancedDollars) {
            $visibleForDollar = html_entity_decode(strip_tags($rendered), ENT_QUOTES | ENT_HTML5, 'UTF-8');
            if ((substr_count($visibleForDollar, '$') % 2) !== 0) {
                $recordIssue('unbalanced_dollar_after_render', $row, 'odd number of $ in visible text after rendering', $rendered);
            }
        }

        // 4) Math segment ends with an operator right before the blank
        //    (the expression may be semantically incomplete).
        if (preg_match($operatorBeforeBlankRe, $rendered)) {
            $recordIssue('math_ends_with_operator_before_blank', $row, 'math segment ends with operator right before blank span', $rendered);
        }

        // 5) Case 2562 class regression: blank span followed by a stray "$"
        //    and then a Han character (a "$" was inserted in the wrong place).
        if (preg_match($spanThenDollarHanRe, $rendered)) {
            $recordIssue('span_then_dollar_before_han', $row, 'blank span followed by stray $ before Chinese (formula boundary break)', $rendered);
        }

        // 6) Internal placeholder token leaked into the final HTML.
        if (preg_match($tokenLeakRe, $rendered)) {
            $recordIssue('internal_placeholder_token_leak', $row, 'placeholder token not restored in output', $rendered);
        }

        $scanned++;
        if (($scanned % 5000) === 0) {
            // Progress goes to STDERR so stdout stays pure JSON.
            fwrite(STDERR, "scanned={$scanned}\n");
        }
    }
}, 'id');

fclose($detailFp);
fclose($priorityFp);

$elapsed = round(microtime(true) - $startedAt, 3);

$investigationFocus = [
    'rules' => $priorityIssueTypes,
    'issue_counts' => [
        'blank_between_math_segments' => $issues['blank_between_math_segments'] ?? 0,
        'math_ends_with_operator_before_blank' => $issues['math_ends_with_operator_before_blank'] ?? 0,
    ],
];

$summary = [
    'table' => $table,
    'connection' => $connection,
    'chunk' => $chunk,
    'types_filter' => $types,
    'scanned_rows' => $scanned,
    'investigation_focus' => $investigationFocus,
    'checks_disabled_by_default' => array_values(array_filter([
        $checkUnbalancedDollars ? null : 'unbalanced_dollar_after_render ($ odd/even in visible text)',
    ])),
    'issue_counts' => $issues,
    'example_ids' => array_map(
        static fn(array $list) => array_column($list, 'id'),
        array_filter($examples, 'is_array')
    ),
    'elapsed_seconds' => $elapsed,
    'generated_at' => date('c'),
    'detail_path' => $detailPath,
    'priority_issues_detail_path' => $priorityDetailPath,
];

// Check the summary write: the paths reported below should actually exist.
$summaryJson = json_encode($summary, $jsonFlags | JSON_PRETTY_PRINT);
$summaryWritten = $summaryJson !== false && file_put_contents($summaryPath, $summaryJson) !== false;
if (! $summaryWritten) {
    fwrite(STDERR, "Failed to write summary file: {$summaryPath}\n");
}

echo json_encode([
    'summary_path' => $summaryPath,
    'detail_path' => $detailPath,
    'priority_issues_detail_path' => $priorityDetailPath,
    'summary' => $summary,
], $jsonFlags | JSON_PRETTY_PRINT)."\n";

if (! $summaryWritten) {
    exit(1);
}