audit_question_stem_quality.php 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. <?php
  2. declare(strict_types=1);
  3. require __DIR__.'/../vendor/autoload.php';
  4. $app = require __DIR__.'/../bootstrap/app.php';
  5. $kernel = $app->make(Illuminate\Contracts\Console\Kernel::class);
  6. $kernel->bootstrap();
  7. use Illuminate\Support\Facades\DB;
  8. $limit = isset($argv[1]) ? max(1, (int) $argv[1]) : 50000;
  9. $outDir = isset($argv[2]) ? rtrim($argv[2], '/'): '/tmp';
  10. $table = isset($argv[3]) ? trim((string) $argv[3]) : 'questions';
  11. @mkdir($outDir, 0777, true);
  12. $rows = DB::connection('remote_mysql')
  13. ->table($table)
  14. ->select('id', 'question_type', 'stem')
  15. ->whereNotNull('stem')
  16. ->orderByDesc('id')
  17. ->limit($limit)
  18. ->get();
  19. $issues = [
  20. 'unbalanced_dollar' => [],
  21. 'suspicious_latex_env' => [],
  22. 'mixed_placeholder_inside_math' => [],
  23. 'compare_blank_between_math_tokens' => [],
  24. 'tail_backslash_dollar_marker' => [],
  25. ];
  26. $push = static function(array &$bucket, object $row, string $reason): void {
  27. if (count($bucket) >= 2000) {
  28. return;
  29. }
  30. $bucket[] = [
  31. 'id' => (int) $row->id,
  32. 'question_type' => (string) $row->question_type,
  33. 'reason' => $reason,
  34. 'stem_preview' => mb_substr((string) $row->stem, 0, 220),
  35. ];
  36. };
  37. foreach ($rows as $row) {
  38. $stem = (string) $row->stem;
  39. $dollarCount = substr_count($stem, '$');
  40. if (($dollarCount % 2) !== 0) {
  41. $push($issues['unbalanced_dollar'], $row, 'odd number of $ delimiters');
  42. }
  43. if (preg_match('/\\\\begin\{[^}]*$/u', $stem) || preg_match('/\\\\end\{[^}]*$/u', $stem)) {
  44. $push($issues['suspicious_latex_env'], $row, 'truncated \\begin/\\end block');
  45. }
  46. if (preg_match('/\$(?:[^$]|\\\\.)*(?:\\\\underline\{[^}]*\}|_{2,}|[((](?:\s|&nbsp;|&#160;| )*[))])(?:[^$]|\\\\.)*\$/u', $stem)) {
  47. $push($issues['mixed_placeholder_inside_math'], $row, 'placeholder token appears inside $...$');
  48. }
  49. if (preg_match('/\$[^$]*\$\s*_{2,}\s*\$[^$]*\$/u', $stem) || preg_match('/\$[^$]*\$\s*[((](?:\s|&nbsp;|&#160;| )*[))]\s*\$[^$]*\$/u', $stem)) {
  50. $push($issues['compare_blank_between_math_tokens'], $row, 'blank token inserted between two math segments');
  51. }
  52. if (preg_match('/\\\\+\$(?=\s*(?:<[^>]+>\s*)*$)/u', $stem)) {
  53. $push($issues['tail_backslash_dollar_marker'], $row, 'tail backslash-dollar marker used as blank placeholder');
  54. }
  55. }
  56. $summary = [
  57. 'table' => $table,
  58. 'scan_limit' => $limit,
  59. 'scanned_rows' => count($rows),
  60. 'counts' => array_map('count', $issues),
  61. 'generated_at' => date('c'),
  62. ];
  63. $stamp = date('Ymd_His');
  64. $summaryPath = "$outDir/question_stem_quality_summary_$stamp.json";
  65. $detailPath = "$outDir/question_stem_quality_details_$stamp.json";
  66. file_put_contents($summaryPath, json_encode($summary, JSON_UNESCAPED_UNICODE|JSON_PRETTY_PRINT));
  67. file_put_contents($detailPath, json_encode($issues, JSON_UNESCAPED_UNICODE|JSON_PRETTY_PRINT));
  68. echo json_encode([
  69. 'summary_path' => $summaryPath,
  70. 'detail_path' => $detailPath,
  71. 'summary' => $summary,
  72. ], JSON_UNESCAPED_UNICODE|JSON_PRETTY_PRINT), "\n";