paper; $file = $paper->file; $import = $import ?? $this->ensureSyntheticImport($paper); $blocks = app(\App\Services\AsyncMarkdownSplitter::class)->split($part->raw_markdown); return DB::transaction(function () use ($blocks, $part, $paper, $file, $import, &$sequenceStart) { $created = collect(); foreach ($blocks as $i => $block) { $questionNumber = (string) ($block['question_number'] ?? ($i + 1)); $order = $i + 1; $raw = $block['raw_markdown']; $clean = $this->cleanMarkdown($raw); $sequence = $sequenceStart++; $reuse = $this->findReusableCandidate($raw, $clean); $reuseConfidence = $reuse?->confidence ?? $reuse?->ai_confidence; $candidate = PreQuestionCandidate::updateOrCreate( [ 'import_id' => $import->id, 'sequence' => $sequence, ], [ 'source_file_id' => $file?->id, 'source_paper_id' => $paper?->id, 'part_id' => $part?->id, 'index' => (int) $questionNumber, 'question_number' => $questionNumber, 'order' => $order, 'sequence' => $sequence, 'raw_markdown' => $raw, 'clean_markdown' => $clean, 'structured_json' => $reuse?->structured_json, 'stem' => $reuse?->stem, 'options' => $reuse?->options, 'images' => !empty($reuse?->images) ? $reuse->images : ($block['images'] ?? []), 'tables' => !empty($reuse?->tables) ? $reuse->tables : ($block['tables'] ?? []), 'is_question_candidate' => $reuse?->is_question_candidate ?? true, 'ai_confidence' => $reuse?->ai_confidence, 'confidence' => $reuseConfidence, 'formula_detected' => $this->detectFormula($raw), 'is_valid_question' => $reuse?->is_valid_question ?? 
true, 'status' => PreQuestionCandidate::STATUS_PENDING, ] ); PaperQuestionRef::updateOrCreate( [ 'source_paper_id' => $paper->id, 'part_id' => $part->id, 'candidate_id' => $candidate->id, ], [ 'question_number' => $questionNumber, 'order' => $order, 'raw_markdown' => $raw, 'metadata' => [ 'source_file_uuid' => $file?->uuid, 'paper_uuid' => $paper?->uuid, 'part_order' => $part?->order, ], ] ); $created->push($candidate); } $part->update(['question_count' => $created->count()]); return $created; }); }

    /**
     * Split markdown into question blocks strictly by leading question numbers.
     *
     * A question starts on a line made of: optional whitespace, ASCII digits, an
     * optional separator (one of . 、 ) ) ] 】) and at least one whitespace
     * character. Each block runs from its number up to the next numbered line, or
     * to the end of the input. Text before the first numbered line is not returned.
     *
     * @param string $markdown Raw markdown to split.
     *
     * @return array<int, array{question_number: int|string, raw_markdown: string}>
     *         Matched blocks carry the captured digits as a string. When no
     *         numbered line is found, a single block numbered 1 (int) holding the
     *         whole trimmed input is returned.
     */
    public function splitQuestions(string $markdown): array
    {
        // The "u" modifier makes PCRE read the full-width separators (、 ) 】) as
        // whole characters. Without it the character class only holds their
        // individual UTF-8 bytes, so "1、 ..." style numbering never matched.
        // [0-9] keeps the captured number ASCII-only so it stays castable to int.
        $pattern = '/^\s*([0-9]+)(?:[\\.、\\))\\]】])?\s+/mu';

        $found = preg_match_all($pattern, $markdown, $matches, PREG_OFFSET_CAPTURE);

        if ($found === false) {
            // Most likely invalid UTF-8, which "u" rejects: retry with the
            // byte-wise pattern so ASCII-numbered input is still split.
            $found = preg_match_all(
                '/^\s*(\d+)(?:[\\.、\\))\\]】])?\s+/m',
                $markdown,
                $matches,
                PREG_OFFSET_CAPTURE
            );
        }

        if ($found === false || $found === 0) {
            return [[
                'question_number' => 1,
                'raw_markdown' => trim($markdown),
            ]];
        }

        // PREG_OFFSET_CAPTURE yields [text, offset] pairs. Offsets are byte
        // offsets even in UTF-8 mode, so byte-wise substr()/strlen() is correct.
        $positions = array_column($matches[0], 1);
        $numbers = array_column($matches[1], 0);
        $totalBytes = strlen($markdown);

        $blocks = [];
        foreach ($positions as $idx => $start) {
            $end = $positions[$idx + 1] ?? $totalBytes;

            $blocks[] = [
                'question_number' => $numbers[$idx] ?? ($idx + 1),
                'raw_markdown' => trim(substr($markdown, $start, $end - $start)),
            ];
        }

        return $blocks;
    }

    /**
     * Report whether the markdown appears to contain a LaTeX formula.
     *
     * Looks for $$...$$, $...$, or one of the commands \frac, \sum, \int, \sqrt.
     */
    protected function detectFormula(string $markdown): bool
    {
        $formulaPattern = '/\\$\\$.*?\\$\\$|\\$[^$]+\\$|\\\\frac|\\\\sum|\\\\int|\\\\sqrt/u';

        // preg_match() yields 1 on a hit, 0 on a miss and false on error; only a hit counts.
        return preg_match($formulaPattern, $markdown) === 1;
    }

    /**
     * Normalise markdown by removing every carriage return and trimming both ends.
     */
    protected function cleanMarkdown(string $markdown): string
    {
        $withoutCarriageReturns = Str::of($markdown)->replace("\r", '')->toString();

        return trim($withoutCarriageReturns);
    }

    /**
     * Look up an existing high-confidence candidate with the same text, so that
     * AI parsing does not have to be repeated for it.
     *
     * A row qualifies when its raw_markdown equals $raw or its clean_markdown
     * equals $clean, and its confidence or ai_confidence is at least 0.85.
     * Rows are ordered by confidence, then ai_confidence, both descending, and
     * the first one is returned.
     *
     * @return PreQuestionCandidate|null The first qualifying row, or null when there is none.
     */
    protected function findReusableCandidate(string $raw, string $clean): ?PreQuestionCandidate
    {
        // Each closure becomes one parenthesised group in the WHERE clause.
        $sameText = fn ($q) => $q
            ->where('raw_markdown', $raw)
            ->orWhere('clean_markdown', $clean);

        $highConfidence = fn ($q) => $q
            ->where('confidence', '>=', 0.85)
            ->orWhere('ai_confidence', '>=', 0.85);

        return PreQuestionCandidate::query()
            ->where($sameText)
            ->where($highConfidence)
            ->orderByDesc('confidence')
            ->orderByDesc('ai_confidence')
            ->first();
    }

    /**
     * Fetch, or create on first use, a MarkdownImport record representing the paper.
     *
     * The record is looked up by file_name (the paper file's original filename,
     * or "paper-<id>" when that is unavailable) and source_name (the paper
     * title). The remaining attributes are only applied when a new row is created.
     */
    protected function ensureSyntheticImport(SourcePaper $paper): MarkdownImport
    {
        $lookup = [
            'file_name' => $paper->file?->original_filename ?? ('paper-' . $paper->id),
            'source_name' => $paper->title,
        ];

        $defaults = [
            'original_markdown' => $paper->raw_markdown,
            'status' => MarkdownImport::STATUS_PARSED,
            'progress_stage' => MarkdownImport::STAGE_PARSED,
            'progress_message' => 'Auto generated from source_papers',
            'progress_current' => 0,
            'progress_total' => 0,
            'progress_updated_at' => now(),
        ];

        return MarkdownImport::firstOrCreate($lookup, $defaults);
    }
}