QuestionExtractorService.php 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. <?php
  2. namespace App\Services;
  3. use App\Models\MarkdownImport;
  4. use App\Models\PaperPart;
  5. use App\Models\PaperQuestionRef;
  6. use App\Models\PreQuestionCandidate;
  7. use App\Models\SourcePaper;
  8. use Illuminate\Support\Collection;
  9. use Illuminate\Support\Facades\DB;
  10. use Illuminate\Support\Str;
  /**
   * Turns the raw markdown of a paper part into persisted question candidates.
   *
   * For every block returned by the markdown splitter the service writes one
   * PreQuestionCandidate row and one PaperQuestionRef row linking that candidate
   * to the paper and part it came from. Previously parsed, high-confidence
   * candidates with the same content are reused so the parsed fields do not have
   * to be produced again.
   */
  class QuestionExtractorService
  {
      /** Minimum confidence / ai_confidence for an existing candidate to be reused. */
      private const REUSE_CONFIDENCE_THRESHOLD = 0.85;

      /**
       * Matches `$$...$$`, `$...$` and the LaTeX commands \frac, \sum, \int, \sqrt.
       * Stored without modifiers so it can be run in both UTF-8 and byte mode.
       */
      private const FORMULA_PATTERN = '/\\$\\$.*?\\$\\$|\\$[^$]+\\$|\\\\frac|\\\\sum|\\\\int|\\\\sqrt/';

      /**
       * Split the part's markdown into question blocks and persist them to the
       * candidate table and the paper/question binding table.
       *
       * All writes happen in a single database transaction.
       *
       * @param PaperPart           $part          Part whose raw_markdown is split.
       * @param MarkdownImport|null $import        Import the candidates belong to; when
       *                                           null a synthetic import is looked up or
       *                                           created for the paper.
       * @param int                 $sequenceStart Sequence number for the first candidate.
       *                                           Passed by reference: after a successful
       *                                           commit it holds the next free sequence.
       *                                           It is left untouched if the transaction
       *                                           fails, so the caller's counter never
       *                                           runs ahead of what was actually stored.
       *
       * @return Collection The PreQuestionCandidate models, in block order.
       *
       * @throws \InvalidArgumentException When the part has no associated paper.
       */
      public function extractAndPersist(PaperPart $part, ?MarkdownImport $import = null, int &$sequenceStart = 1): Collection
      {
          $paper = $part->paper;
          if ($paper === null) {
              // Everything below needs the paper (its id, file and import), so fail
              // with a clear message instead of a null-property error further down.
              throw new \InvalidArgumentException(
                  'PaperPart ' . $part->id . ' has no associated paper; cannot extract questions.'
              );
          }

          $file = $paper->file; // may be null; only accessed through ?-> below
          $import ??= $this->ensureSyntheticImport($paper);
          $blocks = app(\App\Services\AsyncMarkdownSplitter::class)->split($part->raw_markdown);

          [$created, $nextSequence] = DB::transaction(
              function () use ($blocks, $part, $paper, $file, $import, $sequenceStart): array {
                  // Work on a local cursor; the caller's counter is only advanced
                  // once the transaction has committed.
                  $cursor = $sequenceStart;
                  $created = collect();

                  foreach ($blocks as $i => $block) {
                      $order = $i + 1;
                      $questionNumber = (string) ($block['question_number'] ?? $order);
                      $raw = $block['raw_markdown'];
                      $clean = $this->cleanMarkdown($raw);
                      $sequence = $cursor++;

                      // Parsed fields are copied from a matching high-confidence
                      // candidate when one exists; otherwise they stay null/default.
                      $reuse = $this->findReusableCandidate($raw, $clean);

                      $candidate = PreQuestionCandidate::updateOrCreate(
                          [
                              'import_id' => $import->id,
                              'sequence' => $sequence,
                          ],
                          [
                              'source_file_id' => $file?->id,
                              'source_paper_id' => $paper->id,
                              'part_id' => $part->id,
                              'index' => (int) $questionNumber,
                              'question_number' => $questionNumber,
                              'order' => $order,
                              'raw_markdown' => $raw,
                              'raw_hash' => $this->hashContent($raw),
                              'clean_markdown' => $clean,
                              'clean_hash' => $this->hashContent($clean),
                              'structured_json' => $reuse?->structured_json,
                              'stem' => $reuse?->stem,
                              'options' => $reuse?->options,
                              'images' => !empty($reuse?->images) ? $reuse->images : ($block['images'] ?? []),
                              'tables' => !empty($reuse?->tables) ? $reuse->tables : ($block['tables'] ?? []),
                              'is_question_candidate' => $reuse?->is_question_candidate ?? true,
                              'ai_confidence' => $reuse?->ai_confidence,
                              'confidence' => $reuse?->confidence ?? $reuse?->ai_confidence,
                              'formula_detected' => $this->detectFormula($raw),
                              'is_valid_question' => $reuse?->is_valid_question ?? true,
                              'status' => PreQuestionCandidate::STATUS_PENDING,
                          ]
                      );

                      PaperQuestionRef::updateOrCreate(
                          [
                              'source_paper_id' => $paper->id,
                              'part_id' => $part->id,
                              'candidate_id' => $candidate->id,
                          ],
                          [
                              'question_number' => $questionNumber,
                              'order' => $order,
                              'raw_markdown' => $raw,
                              'metadata' => [
                                  'source_file_uuid' => $file?->uuid,
                                  'paper_uuid' => $paper->uuid,
                                  'part_order' => $part->order,
                              ],
                          ]
                      );

                      $created->push($candidate);
                  }

                  $part->update(['question_count' => $created->count()]);

                  return [$created, $cursor];
              }
          );

          $sequenceStart = $nextSequence;

          return $created;
      }

      /**
       * Report whether the markdown contains something that looks like a formula:
       * `$$...$$`, `$...$`, or one of \frac, \sum, \int, \sqrt.
       *
       * @param string $markdown Markdown text to inspect.
       *
       * @return bool True when the formula pattern matches.
       */
      protected function detectFormula(string $markdown): bool
      {
          $matched = preg_match(self::FORMULA_PATTERN . 'u', $markdown);

          if ($matched === false) {
              // With /u, preg_match() returns false for subjects that are not valid
              // UTF-8. The pattern is pure ASCII, so byte mode gives the same answer
              // and keeps a stray bad byte from hiding a formula.
              $matched = preg_match(self::FORMULA_PATTERN, $markdown);
          }

          return $matched === 1;
      }

      /**
       * Normalise markdown for comparison: remove every carriage return, then trim
       * leading and trailing whitespace.
       *
       * @param string $markdown Raw markdown.
       *
       * @return string The cleaned markdown.
       */
      protected function cleanMarkdown(string $markdown): string
      {
          return trim(str_replace("\r", '', $markdown));
      }

      /**
       * SHA-1 hex digest of the trimmed content.
       *
       * @param string|null $content Text to hash.
       *
       * @return string|null The 40-character digest, or null when the content is
       *                     null or empty after trimming.
       */
      protected function hashContent(?string $content): ?string
      {
          $content = trim((string) $content);

          return $content === '' ? null : hash('sha1', $content);
      }

      /**
       * Find a reusable high-confidence candidate with the same content, to avoid
       * repeating AI parsing.
       *
       * A candidate matches when its raw/clean markdown or raw/clean hash equals
       * the given content, and it either reaches the confidence threshold or has
       * meta.ai_parsed set to true. The highest `confidence`, then highest
       * `ai_confidence`, wins.
       *
       * NOTE(review): the meta check uses JSON_EXTRACT/JSON_UNQUOTE in raw SQL;
       * confirm the target database supports these functions.
       *
       * @param string $raw   Raw markdown of the block.
       * @param string $clean Cleaned markdown of the block.
       *
       * @return PreQuestionCandidate|null The best match, or null when there is
       *                                   none or the content is blank.
       */
      protected function findReusableCandidate(string $raw, string $clean): ?PreQuestionCandidate
      {
          $rawHash = $this->hashContent($raw);
          $cleanHash = $this->hashContent($clean);

          if ($rawHash === null && $cleanHash === null) {
              // Blank content would otherwise match any stored row with empty
              // markdown and inherit its unrelated parse result.
              return null;
          }

          return PreQuestionCandidate::query()
              ->where(function ($query) use ($raw, $clean, $rawHash, $cleanHash) {
                  $query->where('raw_markdown', $raw)
                      ->orWhere('clean_markdown', $clean);

                  if ($rawHash !== null) {
                      $query->orWhere('raw_hash', $rawHash);
                  }

                  if ($cleanHash !== null) {
                      $query->orWhere('clean_hash', $cleanHash);
                  }
              })
              ->where(function ($query) {
                  $query->where('confidence', '>=', self::REUSE_CONFIDENCE_THRESHOLD)
                      ->orWhere('ai_confidence', '>=', self::REUSE_CONFIDENCE_THRESHOLD)
                      ->orWhereRaw("JSON_UNQUOTE(JSON_EXTRACT(meta, '$.ai_parsed')) = 'true'");
              })
              ->orderByDesc('confidence')
              ->orderByDesc('ai_confidence')
              ->first();
      }

      /**
       * Look up, or create, the MarkdownImport that stands in for a paper which
       * was not produced by a real markdown import.
       *
       * NOTE(review): the lookup key is only file_name + source_name, so two
       * papers sharing both values would resolve to the same import — confirm
       * this is intended, since candidates are keyed by (import_id, sequence).
       *
       * @param SourcePaper $paper Paper to find or create the import for.
       *
       * @return MarkdownImport The existing or newly created import.
       */
      protected function ensureSyntheticImport(SourcePaper $paper): MarkdownImport
      {
          return MarkdownImport::firstOrCreate(
              [
                  'file_name' => $paper->file?->original_filename ?? ('paper-' . $paper->id),
                  'source_name' => $paper->title,
              ],
              [
                  'original_markdown' => $paper->raw_markdown,
                  'status' => MarkdownImport::STATUS_PARSED,
                  'progress_stage' => MarkdownImport::STAGE_PARSED,
                  'progress_message' => 'Auto generated from source_papers',
                  'progress_current' => 0,
                  'progress_total' => 0,
                  'progress_updated_at' => now(),
              ]
          );
      }
  }