QuestionExtractorService.php 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. <?php
  2. namespace App\Services;
  3. use App\Models\MarkdownImport;
  4. use App\Models\PaperPart;
  5. use App\Models\PaperQuestionRef;
  6. use App\Models\PreQuestionCandidate;
  7. use App\Models\SourcePaper;
  8. use Illuminate\Support\Collection;
  9. use Illuminate\Support\Facades\DB;
  10. use Illuminate\Support\Str;
  11. class QuestionExtractorService
  12. {
  13. /**
  14. * 使用严格题号正则拆分题目,写入候选表与绑定表。
  15. */
  16. public function extractAndPersist(PaperPart $part, ?MarkdownImport $import = null, int &$sequenceStart = 1): Collection
  17. {
  18. $paper = $part->paper;
  19. $file = $paper->file;
  20. $import = $import ?? $this->ensureSyntheticImport($paper);
  21. $blocks = app(\App\Services\AsyncMarkdownSplitter::class)->split($part->raw_markdown);
  22. return DB::transaction(function () use ($blocks, $part, $paper, $file, $import, &$sequenceStart) {
  23. $created = collect();
  24. foreach ($blocks as $i => $block) {
  25. $questionNumber = (string) ($block['question_number'] ?? ($i + 1));
  26. $order = $i + 1;
  27. $raw = $block['raw_markdown'];
  28. $clean = $this->cleanMarkdown($raw);
  29. $sequence = $sequenceStart++;
  30. $reuse = $this->findReusableCandidate($raw, $clean);
  31. $reuseConfidence = $reuse?->confidence ?? $reuse?->ai_confidence;
  32. $candidate = PreQuestionCandidate::updateOrCreate(
  33. [
  34. 'import_id' => $import->id,
  35. 'sequence' => $sequence,
  36. ],
  37. [
  38. 'source_file_id' => $file?->id,
  39. 'source_paper_id' => $paper?->id,
  40. 'part_id' => $part?->id,
  41. 'index' => (int) $questionNumber,
  42. 'question_number' => $questionNumber,
  43. 'order' => $order,
  44. 'sequence' => $sequence,
  45. 'raw_markdown' => $raw,
  46. 'clean_markdown' => $clean,
  47. 'structured_json' => $reuse?->structured_json,
  48. 'stem' => $reuse?->stem,
  49. 'options' => $reuse?->options,
  50. 'images' => !empty($reuse?->images) ? $reuse->images : ($block['images'] ?? []),
  51. 'tables' => !empty($reuse?->tables) ? $reuse->tables : ($block['tables'] ?? []),
  52. 'is_question_candidate' => $reuse?->is_question_candidate ?? true,
  53. 'ai_confidence' => $reuse?->ai_confidence,
  54. 'confidence' => $reuseConfidence,
  55. 'formula_detected' => $this->detectFormula($raw),
  56. 'is_valid_question' => $reuse?->is_valid_question ?? true,
  57. 'status' => PreQuestionCandidate::STATUS_PENDING,
  58. ]
  59. );
  60. PaperQuestionRef::updateOrCreate(
  61. [
  62. 'source_paper_id' => $paper->id,
  63. 'part_id' => $part->id,
  64. 'candidate_id' => $candidate->id,
  65. ],
  66. [
  67. 'question_number' => $questionNumber,
  68. 'order' => $order,
  69. 'raw_markdown' => $raw,
  70. 'metadata' => [
  71. 'source_file_uuid' => $file?->uuid,
  72. 'paper_uuid' => $paper?->uuid,
  73. 'part_order' => $part?->order,
  74. ],
  75. ]
  76. );
  77. $created->push($candidate);
  78. }
  79. $part->update(['question_count' => $created->count()]);
  80. return $created;
  81. });
  82. }
  83. /**
  84. * 严格按题号拆分,题号正则:^\s*(\d+)(\.|、|\)|)|\]|】)?\s+
  85. */
  86. public function splitQuestions(string $markdown): array
  87. {
  88. $pattern = '/^\s*(\d+)(?:[\\.、\\))\\]】])?\s+/m';
  89. preg_match_all($pattern, $markdown, $matches, PREG_OFFSET_CAPTURE);
  90. if (empty($matches[0])) {
  91. return [[
  92. 'question_number' => 1,
  93. 'raw_markdown' => trim($markdown),
  94. ]];
  95. }
  96. $blocks = [];
  97. $positions = array_map(fn($m) => $m[1], $matches[0]);
  98. $numbers = array_map(fn($m) => $m[0], $matches[1]);
  99. foreach ($positions as $idx => $start) {
  100. $end = $positions[$idx + 1] ?? strlen($markdown);
  101. $slice = substr($markdown, $start, $end - $start);
  102. $blocks[] = [
  103. 'question_number' => $numbers[$idx] ?? ($idx + 1),
  104. 'raw_markdown' => trim($slice),
  105. ];
  106. }
  107. return $blocks;
  108. }
  109. protected function detectFormula(string $markdown): bool
  110. {
  111. return (bool) preg_match('/\\$\\$.*?\\$\\$|\\$[^$]+\\$|\\\\frac|\\\\sum|\\\\int|\\\\sqrt/u', $markdown);
  112. }
  113. protected function cleanMarkdown(string $markdown): string
  114. {
  115. return trim(Str::of($markdown)->replace("\r", '')->toString());
  116. }
  117. /**
  118. * 查找可复用的高置信度候选题,避免重复 AI 解析。
  119. */
  120. protected function findReusableCandidate(string $raw, string $clean): ?PreQuestionCandidate
  121. {
  122. return PreQuestionCandidate::query()
  123. ->where(function ($query) use ($raw, $clean) {
  124. $query->where('raw_markdown', $raw)
  125. ->orWhere('clean_markdown', $clean);
  126. })
  127. ->where(function ($query) {
  128. $query->where('confidence', '>=', 0.85)
  129. ->orWhere('ai_confidence', '>=', 0.85);
  130. })
  131. ->orderByDesc('confidence')
  132. ->orderByDesc('ai_confidence')
  133. ->first();
  134. }
  135. protected function ensureSyntheticImport(SourcePaper $paper): MarkdownImport
  136. {
  137. return MarkdownImport::firstOrCreate(
  138. [
  139. 'file_name' => $paper->file?->original_filename ?? ('paper-' . $paper->id),
  140. 'source_name' => $paper->title,
  141. ],
  142. [
  143. 'original_markdown' => $paper->raw_markdown,
  144. 'status' => MarkdownImport::STATUS_PARSED,
  145. 'progress_stage' => MarkdownImport::STAGE_PARSED,
  146. 'progress_message' => 'Auto generated from source_papers',
  147. 'progress_current' => 0,
  148. 'progress_total' => 0,
  149. 'progress_updated_at' => now(),
  150. ]
  151. );
  152. }
  153. }