| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173 |
- <?php
- namespace App\Services;
- use App\Models\MarkdownImport;
- use App\Models\PaperPart;
- use App\Models\PaperQuestionRef;
- use App\Models\PreQuestionCandidate;
- use App\Models\SourcePaper;
- use Illuminate\Support\Collection;
- use Illuminate\Support\Facades\DB;
- use Illuminate\Support\Str;
/**
 * Splits the markdown of a paper part into per-question blocks and persists
 * every block as a PreQuestionCandidate plus a PaperQuestionRef binding row.
 */
class QuestionExtractorService
{
    /**
     * Question-number prefix at the start of a line: optional whitespace, ASCII
     * digits, at most one delimiter (. 、 ) ） ] 】) and mandatory whitespace.
     *
     * The "u" modifier makes PCRE read the multi-byte delimiters as whole
     * characters; without it each of their bytes becomes a separate member of
     * the character class and prefixes such as "1、 " never match.
     * [0-9] is used instead of \d so the captured number is always ASCII and
     * can safely be cast with (int).
     */
    private const QUESTION_NUMBER_PATTERN = '/^\s*([0-9]+)(?:[.、)）\]】])?\s+/mu';

    /**
     * Byte-wise variant (ASCII delimiters only) used when the subject is not
     * valid UTF-8 and the "u" pattern therefore cannot run.
     */
    private const QUESTION_NUMBER_PATTERN_BYTES = '/^\s*([0-9]+)(?:[.)\]])?\s+/m';

    /**
     * Split the part's markdown into question blocks and upsert one candidate
     * row and one paper/part binding row per block, inside a single transaction.
     *
     * The blocks come from AsyncMarkdownSplitter (resolved from the container),
     * not from splitQuestions(). Each block must provide "raw_markdown";
     * "question_number", "images" and "tables" are optional.
     *
     * @param PaperPart           $part          Part whose raw_markdown is split.
     * @param MarkdownImport|null $import        Import to attach candidates to; when null a
     *                                           synthetic import is looked up or created.
     * @param int                 $sequenceStart Next sequence number, passed by reference and
     *                                           incremented once per block. Being a plain PHP
     *                                           reference, it stays advanced even when the
     *                                           transaction is rolled back.
     *
     * @return Collection The candidates that were created or updated, in block order.
     */
    public function extractAndPersist(PaperPart $part, ?MarkdownImport $import = null, int &$sequenceStart = 1): Collection
    {
        $paper = $part->paper;
        $file = $paper->file;
        $import ??= $this->ensureSyntheticImport($paper);
        $blocks = app(\App\Services\AsyncMarkdownSplitter::class)->split($part->raw_markdown);

        return DB::transaction(function () use ($blocks, $part, $paper, $file, $import, &$sequenceStart) {
            $created = collect();

            foreach ($blocks as $i => $block) {
                // Fall back to the 1-based position when the splitter gave no number.
                $questionNumber = (string) ($block['question_number'] ?? ($i + 1));
                $order = $i + 1;
                $raw = $block['raw_markdown'];
                $clean = $this->cleanMarkdown($raw);
                $sequence = $sequenceStart++;

                // Copy already-parsed fields from an identical, high-confidence candidate if one exists.
                $reuse = $this->findReusableCandidate($raw, $clean);
                $reuseConfidence = $reuse?->confidence ?? $reuse?->ai_confidence;

                // (import_id, sequence) identifies the candidate, so re-running updates rows in place.
                $candidate = PreQuestionCandidate::updateOrCreate(
                    [
                        'import_id' => $import->id,
                        'sequence' => $sequence,
                    ],
                    [
                        'source_file_id' => $file?->id,
                        'source_paper_id' => $paper?->id,
                        'part_id' => $part->id,
                        'index' => (int) $questionNumber,
                        'question_number' => $questionNumber,
                        'order' => $order,
                        'sequence' => $sequence,
                        'raw_markdown' => $raw,
                        'clean_markdown' => $clean,
                        'structured_json' => $reuse?->structured_json,
                        'stem' => $reuse?->stem,
                        'options' => $reuse?->options,
                        'images' => !empty($reuse?->images) ? $reuse->images : ($block['images'] ?? []),
                        'tables' => !empty($reuse?->tables) ? $reuse->tables : ($block['tables'] ?? []),
                        'is_question_candidate' => $reuse?->is_question_candidate ?? true,
                        'ai_confidence' => $reuse?->ai_confidence,
                        'confidence' => $reuseConfidence,
                        'formula_detected' => $this->detectFormula($raw),
                        'is_valid_question' => $reuse?->is_valid_question ?? true,
                        // The status is reset to pending on every run, including updates.
                        'status' => PreQuestionCandidate::STATUS_PENDING,
                    ]
                );

                PaperQuestionRef::updateOrCreate(
                    [
                        'source_paper_id' => $paper->id,
                        'part_id' => $part->id,
                        'candidate_id' => $candidate->id,
                    ],
                    [
                        'question_number' => $questionNumber,
                        'order' => $order,
                        'raw_markdown' => $raw,
                        'metadata' => [
                            'source_file_uuid' => $file?->uuid,
                            'paper_uuid' => $paper?->uuid,
                            'part_order' => $part->order,
                        ],
                    ]
                );

                $created->push($candidate);
            }

            $part->update(['question_count' => $created->count()]);

            return $created;
        });
    }

    /**
     * Split markdown strictly on question-number prefixes at the start of a line.
     *
     * A prefix is ASCII digits, an optional delimiter (. 、 ) ） ] 】) and at least
     * one whitespace character. Each block runs from one prefix to the next (or
     * to the end of the text) and is trimmed. Text before the first prefix is
     * not part of any block. When no prefix is found the whole trimmed text is
     * returned as a single block numbered 1.
     *
     * @return array<int, array{question_number: int|string, raw_markdown: string}>
     *         question_number is the captured digit string, or int 1 for the fallback block.
     */
    public function splitQuestions(string $markdown): array
    {
        $matched = preg_match_all(self::QUESTION_NUMBER_PATTERN, $markdown, $matches, PREG_OFFSET_CAPTURE);
        if ($matched === false) {
            // The "u" pattern refuses subjects that are not valid UTF-8; retry byte-wise
            // so ASCII-numbered questions are still split.
            $matched = preg_match_all(self::QUESTION_NUMBER_PATTERN_BYTES, $markdown, $matches, PREG_OFFSET_CAPTURE);
        }

        if ($matched === false || $matched === 0) {
            return [[
                'question_number' => 1,
                'raw_markdown' => trim($markdown),
            ]];
        }

        // PREG_OFFSET_CAPTURE yields [text, byteOffset] pairs. The offsets are bytes even
        // with "u", so the byte-wise strlen()/substr() below are the correct counterparts.
        $positions = array_column($matches[0], 1);
        $numbers = array_column($matches[1], 0);

        $blocks = [];
        foreach ($positions as $idx => $start) {
            $end = $positions[$idx + 1] ?? strlen($markdown);
            $blocks[] = [
                'question_number' => $numbers[$idx] ?? ($idx + 1),
                'raw_markdown' => trim(substr($markdown, $start, $end - $start)),
            ];
        }

        return $blocks;
    }

    /**
     * Heuristically report whether the markdown contains a formula: a $$...$$ or
     * $...$ span, or one of the commands \frac, \sum, \int, \sqrt.
     *
     * Returns false as well when the pattern cannot be evaluated (preg_match
     * returning false, e.g. for a subject that is not valid UTF-8).
     */
    protected function detectFormula(string $markdown): bool
    {
        return preg_match('/\\$\\$.*?\\$\\$|\\$[^$]+\\$|\\\\frac|\\\\sum|\\\\int|\\\\sqrt/u', $markdown) === 1;
    }

    /**
     * Normalise markdown for comparison: remove every carriage return and trim
     * surrounding whitespace.
     */
    protected function cleanMarkdown(string $markdown): string
    {
        return trim(str_replace("\r", '', $markdown));
    }

    /**
     * Find an existing high-confidence candidate that can be reused, so the same
     * question text does not have to be parsed by the AI again.
     *
     * A candidate qualifies when its raw_markdown equals $raw or its
     * clean_markdown equals $clean, and its confidence or ai_confidence is at
     * least 0.85. The highest confidence (then ai_confidence) wins.
     */
    protected function findReusableCandidate(string $raw, string $clean): ?PreQuestionCandidate
    {
        return PreQuestionCandidate::query()
            ->where(function ($query) use ($raw, $clean) {
                $query->where('raw_markdown', $raw)
                    ->orWhere('clean_markdown', $clean);
            })
            ->where(function ($query) {
                $query->where('confidence', '>=', 0.85)
                    ->orWhere('ai_confidence', '>=', 0.85);
            })
            ->orderByDesc('confidence')
            ->orderByDesc('ai_confidence')
            ->first();
    }

    /**
     * Look up, or create, the MarkdownImport that stands in for a paper that was
     * not produced by a real import.
     *
     * The lookup key is (file_name, source_name); file_name falls back to
     * "paper-<id>" when the paper has no file or the file has no original name.
     * A newly created row is stored as already parsed with zeroed progress.
     */
    protected function ensureSyntheticImport(SourcePaper $paper): MarkdownImport
    {
        return MarkdownImport::firstOrCreate(
            [
                'file_name' => $paper->file?->original_filename ?? ('paper-' . $paper->id),
                'source_name' => $paper->title,
            ],
            [
                'original_markdown' => $paper->raw_markdown,
                'status' => MarkdownImport::STATUS_PARSED,
                'progress_stage' => MarkdownImport::STAGE_PARSED,
                'progress_message' => 'Auto generated from source_papers',
                'progress_current' => 0,
                'progress_total' => 0,
                'progress_updated_at' => now(),
            ]
        );
    }
}
|