| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788 |
<?php
declare(strict_types=1);

/**
 * Question-stem quality scan.
 *
 * Reads the most recent rows (highest ids first) that have a non-null `stem`
 * from a table on the `remote_mysql` connection, runs a handful of textual
 * heuristics over each stem, and writes two JSON files:
 *
 *   <outDir>/question_stem_quality_summary_<stamp>.json   per-issue totals
 *   <outDir>/question_stem_quality_details_<stamp>.json   sample rows per issue
 *
 * The same summary (plus both paths) is echoed to STDOUT as JSON.
 *
 * CLI arguments (all optional, positional):
 *   1. limit   maximum number of rows to scan, clamped to >= 1 (default 50000)
 *   2. outDir  output directory, trailing slashes stripped   (default /tmp)
 *   3. table   table name to scan                            (default questions)
 *
 * Exits with status 1 (message on STDERR) when the output directory cannot be
 * created, when JSON encoding fails, or when a report file cannot be written.
 */

use Illuminate\Support\Facades\DB;

require __DIR__.'/../vendor/autoload.php';
$app = require __DIR__.'/../bootstrap/app.php';
$kernel = $app->make(Illuminate\Contracts\Console\Kernel::class);
$kernel->bootstrap();

// ---------------------------------------------------------------------------
// Arguments
// ---------------------------------------------------------------------------
$limit = isset($argv[1]) ? max(1, (int) $argv[1]) : 50000;
$outDir = isset($argv[2]) ? rtrim($argv[2], '/') : '/tmp';
$table = isset($argv[3]) ? trim((string) $argv[3]) : 'questions';

// Detail buckets keep at most this many sample rows each; totals are tracked
// separately so the summary is not limited by this cap.
$maxSamplesPerIssue = 2000;

// Abort helper: report on STDERR and stop with a non-zero status.
$fail = static function (string $message): void {
    fwrite(STDERR, $message."\n");
    exit(1);
};

// Make sure the output directory is usable *before* running the query.
// An $outDir of '' means the argument was "/" (rtrim removed it), i.e. the
// filesystem root; the report paths below still resolve to "/<file>".
$targetDir = $outDir === '' ? '/' : $outDir;
// The trailing is_dir() re-check tolerates another process creating the
// directory between our first check and the mkdir() call.
if (!is_dir($targetDir) && !@mkdir($targetDir, 0777, true) && !is_dir($targetDir)) {
    $fail("cannot create output directory: $targetDir");
}

// ---------------------------------------------------------------------------
// Load rows
// ---------------------------------------------------------------------------
$rows = DB::connection('remote_mysql')
    ->table($table)
    ->select('id', 'question_type', 'stem')
    ->whereNotNull('stem')
    ->orderByDesc('id')
    ->limit($limit)
    ->get();

// ---------------------------------------------------------------------------
// Issue bookkeeping
// ---------------------------------------------------------------------------
// $issues holds the (capped) sample rows written to the details file.
$issues = [
    'unbalanced_dollar' => [],
    'suspicious_latex_env' => [],
    'mixed_placeholder_inside_math' => [],
    'compare_blank_between_math_tokens' => [],
    'tail_backslash_dollar_marker' => [],
];
// $totals holds the real number of hits per issue, independent of the cap.
$totals = array_fill_keys(array_keys($issues), 0);

/**
 * Records one hit: always bumps the total, and stores a sample row while the
 * bucket is below the cap.
 */
$record = static function (string $issue, object $row, string $reason) use (&$issues, &$totals, $maxSamplesPerIssue): void {
    $totals[$issue]++;
    if (count($issues[$issue]) >= $maxSamplesPerIssue) {
        return;
    }
    $issues[$issue][] = [
        'id' => (int) $row->id,
        'question_type' => (string) $row->question_type,
        'reason' => $reason,
        'stem_preview' => mb_substr((string) $row->stem, 0, 220),
    ];
};

// preg_match() returns false (not 0) when it cannot evaluate the pattern, e.g.
// for a subject that is not valid UTF-8 under /u or when the backtrack limit is
// hit. Count those instead of silently treating them as "no match".
$regexFailures = 0;
$matches = static function (string $pattern, string $subject) use (&$regexFailures): bool {
    $result = preg_match($pattern, $subject);
    if ($result === false) {
        $regexFailures++;
        return false;
    }
    return $result === 1;
};

// NOTE(review): the doubled "(" / ")" inside the character classes and the
// repeated blank alternatives in `(?:\s| | | )` look like they were meant to
// list full-width brackets / special space characters — confirm the literals
// in this file are the intended ones.
$patterns = [
    // "\begin{" or "\end{" whose brace is never closed before the end of the stem.
    'begin_truncated' => '/\\\\begin\{[^}]*$/u',
    'end_truncated' => '/\\\\end\{[^}]*$/u',
    // A $...$ span containing \underline{...}, a run of 2+ underscores, or an
    // empty pair of brackets.
    'placeholder_in_math' => '/\$(?:[^$]|\\\\.)*(?:\\\\underline\{[^}]*\}|_{2,}|[((](?:\s| | | )*[))])(?:[^$]|\\\\.)*\$/u',
    // $...$ <blank> $...$ where the blank is a run of underscores ...
    'underscore_between_math' => '/\$[^$]*\$\s*_{2,}\s*\$[^$]*\$/u',
    // ... or an empty pair of brackets.
    'bracket_between_math' => '/\$[^$]*\$\s*[((](?:\s| | | )*[))]\s*\$[^$]*\$/u',
    // One or more backslashes followed by "$", with only whitespace and
    // HTML-like tags after it up to the end of the stem.
    'tail_backslash_dollar' => '/\\\\+\$(?=\s*(?:<[^>]+>\s*)*$)/u',
];

// ---------------------------------------------------------------------------
// Scan
// ---------------------------------------------------------------------------
foreach ($rows as $row) {
    $stem = (string) $row->stem;

    // Every "$" character is counted, including ones preceded by a backslash.
    if ((substr_count($stem, '$') % 2) !== 0) {
        $record('unbalanced_dollar', $row, 'odd number of $ delimiters');
    }

    if ($matches($patterns['begin_truncated'], $stem) || $matches($patterns['end_truncated'], $stem)) {
        $record('suspicious_latex_env', $row, 'truncated \\begin/\\end block');
    }

    if ($matches($patterns['placeholder_in_math'], $stem)) {
        $record('mixed_placeholder_inside_math', $row, 'placeholder token appears inside $...$');
    }

    if ($matches($patterns['underscore_between_math'], $stem) || $matches($patterns['bracket_between_math'], $stem)) {
        $record('compare_blank_between_math_tokens', $row, 'blank token inserted between two math segments');
    }

    if ($matches($patterns['tail_backslash_dollar'], $stem)) {
        $record('tail_backslash_dollar_marker', $row, 'tail backslash-dollar marker used as blank placeholder');
    }
}

if ($regexFailures > 0) {
    fwrite(STDERR, "warning: $regexFailures regex evaluation(s) failed and were treated as no match\n");
}

// ---------------------------------------------------------------------------
// Report
// ---------------------------------------------------------------------------
// One clock reading so `generated_at` and the file-name stamp always agree.
$now = time();

$summary = [
    'table' => $table,
    'scan_limit' => $limit,
    'scanned_rows' => count($rows),
    'counts' => $totals,
    'generated_at' => date('c', $now),
];

$stamp = date('Ymd_His', $now);
$summaryPath = "$outDir/question_stem_quality_summary_$stamp.json";
$detailPath = "$outDir/question_stem_quality_details_$stamp.json";

// JSON_INVALID_UTF8_SUBSTITUTE keeps one malformed stem preview from making
// json_encode() fail for the whole report.
$jsonFlags = JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT | JSON_INVALID_UTF8_SUBSTITUTE;
$encode = static function (array $payload) use ($jsonFlags, $fail): string {
    $json = json_encode($payload, $jsonFlags);
    if ($json === false) {
        $fail('JSON encoding failed: '.json_last_error_msg());
    }
    return (string) $json;
};

$write = static function (string $path, string $contents) use ($fail): void {
    if (file_put_contents($path, $contents) === false) {
        $fail("cannot write report file: $path");
    }
};

$write($summaryPath, $encode($summary));
$write($detailPath, $encode($issues));

echo $encode([
    'summary_path' => $summaryPath,
    'detail_path' => $detailPath,
    'summary' => $summary,
]), "\n";
|