paper; $file = $paper->file; $import = $import ?? $this->ensureSyntheticImport($paper); $blocks = app(\App\Services\AsyncMarkdownSplitter::class)->split($part->raw_markdown); return DB::transaction(function () use ($blocks, $part, $paper, $file, $import, &$sequenceStart) { $created = collect(); foreach ($blocks as $i => $block) { $questionNumber = (string) ($block['question_number'] ?? ($i + 1)); $order = $i + 1; $raw = $block['raw_markdown']; $clean = $this->cleanMarkdown($raw); $rawHash = $this->hashContent($raw); $cleanHash = $this->hashContent($clean); $sequence = $sequenceStart++; $reuse = $this->findReusableCandidate($raw, $clean); $reuseConfidence = $reuse?->confidence ?? $reuse?->ai_confidence; $candidate = PreQuestionCandidate::updateOrCreate( [ 'import_id' => $import->id, 'sequence' => $sequence, ], [ 'source_file_id' => $file?->id, 'source_paper_id' => $paper?->id, 'part_id' => $part?->id, 'index' => (int) $questionNumber, 'question_number' => $questionNumber, 'order' => $order, 'sequence' => $sequence, 'raw_markdown' => $raw, 'raw_hash' => $rawHash, 'clean_markdown' => $clean, 'clean_hash' => $cleanHash, 'structured_json' => $reuse?->structured_json, 'stem' => $reuse?->stem, 'options' => $reuse?->options, 'images' => !empty($reuse?->images) ? $reuse->images : ($block['images'] ?? []), 'tables' => !empty($reuse?->tables) ? $reuse->tables : ($block['tables'] ?? []), 'is_question_candidate' => $reuse?->is_question_candidate ?? true, 'ai_confidence' => $reuse?->ai_confidence, 'confidence' => $reuseConfidence, 'formula_detected' => $this->detectFormula($raw), 'is_valid_question' => $reuse?->is_valid_question ?? 
true, 'status' => PreQuestionCandidate::STATUS_PENDING, ] ); PaperQuestionRef::updateOrCreate( [ 'source_paper_id' => $paper->id, 'part_id' => $part->id, 'candidate_id' => $candidate->id, ], [ 'question_number' => $questionNumber, 'order' => $order, 'raw_markdown' => $raw, 'metadata' => [ 'source_file_uuid' => $file?->uuid, 'paper_uuid' => $paper?->uuid, 'part_order' => $part?->order, ], ] ); $created->push($candidate); } $part->update(['question_count' => $created->count()]); return $created; }); }

    /**
     * Heuristically reports whether the markdown contains LaTeX math.
     *
     * A match is any of: a `$$...$$` span, an inline `$...$` span, or one of
     * the commands \frac, \sum, \int, \sqrt. The inline rule also accepts any
     * ordinary text that happens to sit between two dollar signs.
     *
     * @param string $markdown Markdown text to inspect.
     * @return bool True when at least one of the patterns is found.
     */
    protected function detectFormula(string $markdown): bool
    {
        // The pattern is ASCII-only, so it is matched byte-wise. With the `u`
        // modifier preg_match() returns false for a subject that is not valid
        // UTF-8, and that failure would silently be reported as "no formula".
        return preg_match('/\\$\\$.*?\\$\\$|\\$[^$]+\\$|\\\\frac|\\\\sum|\\\\int|\\\\sqrt/', $markdown) === 1;
    }

    /**
     * Normalises markdown by removing every carriage return and trimming the
     * surrounding whitespace.
     *
     * @param string $markdown Markdown text to normalise.
     * @return string The normalised text.
     */
    protected function cleanMarkdown(string $markdown): string
    {
        return trim(Str::of($markdown)->replace("\r", '')->toString());
    }

    /**
     * Returns the SHA-1 hex digest of the trimmed content.
     *
     * @param string|null $content Text to hash; null is treated as an empty string.
     * @return string|null The digest, or null when the content is blank after trimming.
     */
    protected function hashContent(?string $content): ?string
    {
        $content = trim((string) $content);

        if ($content === '') {
            return null;
        }

        return hash('sha1', $content);
    }

    /**
     * Finds a reusable high-confidence candidate question so the same content
     * does not have to be parsed by the AI again.
     *
     * A row qualifies when its raw/clean markdown or its raw/clean hash equals
     * the given content AND it has confidence >= 0.85, or ai_confidence >= 0.85,
     * or meta.ai_parsed equal to 'true'. The qualifying row with the highest
     * confidence (then ai_confidence) is returned.
     *
     * @param string $raw   Raw markdown of the block being imported.
     * @param string $clean Cleaned markdown of the same block.
     * @return PreQuestionCandidate|null The best match, or null when there is none
     *                                   or when both inputs are blank.
     */
    protected function findReusableCandidate(string $raw, string $clean): ?PreQuestionCandidate
    {
        $rawHash = $this->hashContent($raw);
        $cleanHash = $this->hashContent($clean);

        // hashContent() yields null for blank text. Blank text must not be
        // matched by plain equality either, otherwise an empty block would
        // inherit the parsed data of any unrelated row stored with empty markdown.
        if ($rawHash === null && $cleanHash === null) {
            return null;
        }

        return PreQuestionCandidate::query()
            ->where(static function ($query) use ($raw, $clean, $rawHash, $cleanHash) {
                // All content predicates are OR-ed together inside one group.
                if ($rawHash !== null) {
                    $query->orWhere('raw_markdown', $raw)
                        ->orWhere('raw_hash', $rawHash);
                }

                if ($cleanHash !== null) {
                    $query->orWhere('clean_markdown', $clean)
                        ->orWhere('clean_hash', $cleanHash);
                }
            })
            ->where(static function ($query) {
                $query->where('confidence', '>=', 0.85)
                    ->orWhere('ai_confidence', '>=', 0.85)
                    // Relies on the database providing JSON_EXTRACT / JSON_UNQUOTE.
                    ->orWhereRaw("JSON_UNQUOTE(JSON_EXTRACT(meta, '$.ai_parsed')) = 'true'");
            })
            ->orderByDesc('confidence')
            ->orderByDesc('ai_confidence')
            ->first();
    }

    /**
     * Returns the MarkdownImport that stands in for the given paper, creating
     * it when no matching record exists yet.
     *
     * The record is looked up by file name (the original filename of the
     * paper's file, or "paper-<id>" when that is unavailable) together with the
     * paper title. A newly created record is marked as parsed and stores the
     * paper's raw markdown.
     *
     * NOTE(review): the lookup key does not include the paper id, so two papers
     * with the same filename and title would resolve to the same import -
     * confirm this is intended.
     *
     * @param SourcePaper $paper Paper that needs a backing import record.
     * @return MarkdownImport The existing or newly created import.
     */
    protected function ensureSyntheticImport(SourcePaper $paper): MarkdownImport
    {
        return MarkdownImport::firstOrCreate(
            [
                'file_name' => $paper->file?->original_filename ?? ('paper-' . $paper->id),
                'source_name' => $paper->title,
            ],
            [
                'original_markdown' => $paper->raw_markdown,
                'status' => MarkdownImport::STATUS_PARSED,
                'progress_stage' => MarkdownImport::STAGE_PARSED,
                'progress_message' => 'Auto generated from source_papers',
                'progress_current' => 0,
                'progress_total' => 0,
                'progress_updated_at' => now(),
            ]
        );
    }
}