| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
- <?php
- namespace App\Services;
- use App\Models\MarkdownImport;
- use App\Models\PaperPart;
- use App\Models\PaperQuestionRef;
- use App\Models\PreQuestionCandidate;
- use App\Models\SourcePaper;
- use Illuminate\Support\Collection;
- use Illuminate\Support\Facades\DB;
- use Illuminate\Support\Str;
class QuestionExtractorService
{
    /**
     * Splits a paper part into question blocks and persists each block as a
     * candidate row plus a paper/question reference row.
     *
     * The part's raw markdown is split by AsyncMarkdownSplitter (the original
     * note on this method describes that as a strict question-number regex
     * split). Every block is upserted into PreQuestionCandidate, keyed by
     * (import_id, sequence), and into PaperQuestionRef, keyed by
     * (source_paper_id, part_id, candidate_id). When a previously parsed
     * candidate with the same content is found, its parsed fields are copied
     * onto the new candidate. All writes happen inside one DB transaction,
     * after which the part's question_count is set to the number of blocks.
     *
     * @param PaperPart           $part          Part whose raw_markdown is split.
     * @param MarkdownImport|null $import        Import to attach candidates to; when null, one is
     *                                           looked up or created from the part's paper.
     * @param int                 $sequenceStart Sequence number for the first block. Passed by
     *                                           reference: after a successful run it holds the next
     *                                           free sequence number. It is left untouched when the
     *                                           transaction throws, so a rolled-back run does not
     *                                           consume sequence numbers.
     *
     * @return Collection The PreQuestionCandidate models written, in block order.
     */
    public function extractAndPersist(PaperPart $part, ?MarkdownImport $import = null, int &$sequenceStart = 1): Collection
    {
        $paper = $part->paper;
        $file = $paper->file;
        $import = $import ?? $this->ensureSyntheticImport($paper);
        $blocks = app(\App\Services\AsyncMarkdownSplitter::class)->split($part->raw_markdown);

        // Work on a private copy of the counter; the caller's reference is only
        // advanced once the transaction has returned successfully.
        $firstSequence = $sequenceStart;

        $created = DB::transaction(function () use ($blocks, $part, $paper, $file, $import, $firstSequence) {
            $created = collect();
            $sequence = $firstSequence;

            foreach ($blocks as $i => $block) {
                // Fall back to the 1-based block position when the splitter gave no number.
                $questionNumber = (string) ($block['question_number'] ?? ($i + 1));
                $order = $i + 1;
                $raw = $block['raw_markdown'];
                $clean = $this->cleanMarkdown($raw);
                $rawHash = $this->hashContent($raw);
                $cleanHash = $this->hashContent($clean);

                // Reuse parsed fields from an earlier candidate with the same content, if any.
                $reuse = $this->findReusableCandidate($raw, $clean);
                $reuseConfidence = $reuse?->confidence ?? $reuse?->ai_confidence;

                $candidate = PreQuestionCandidate::updateOrCreate(
                    [
                        'import_id' => $import->id,
                        'sequence' => $sequence,
                    ],
                    [
                        'source_file_id' => $file?->id,
                        'source_paper_id' => $paper->id,
                        'part_id' => $part->id,
                        'index' => (int) $questionNumber,
                        'question_number' => $questionNumber,
                        'order' => $order,
                        'sequence' => $sequence,
                        'raw_markdown' => $raw,
                        'raw_hash' => $rawHash,
                        'clean_markdown' => $clean,
                        'clean_hash' => $cleanHash,
                        'structured_json' => $reuse?->structured_json,
                        'stem' => $reuse?->stem,
                        'options' => $reuse?->options,
                        // Prefer the reused candidate's assets; otherwise take the splitter's.
                        'images' => !empty($reuse?->images) ? $reuse->images : ($block['images'] ?? []),
                        'tables' => !empty($reuse?->tables) ? $reuse->tables : ($block['tables'] ?? []),
                        'is_question_candidate' => $reuse?->is_question_candidate ?? true,
                        'ai_confidence' => $reuse?->ai_confidence,
                        'confidence' => $reuseConfidence,
                        'formula_detected' => $this->detectFormula($raw),
                        'is_valid_question' => $reuse?->is_valid_question ?? true,
                        'status' => PreQuestionCandidate::STATUS_PENDING,
                    ]
                );

                PaperQuestionRef::updateOrCreate(
                    [
                        'source_paper_id' => $paper->id,
                        'part_id' => $part->id,
                        'candidate_id' => $candidate->id,
                    ],
                    [
                        'question_number' => $questionNumber,
                        'order' => $order,
                        'raw_markdown' => $raw,
                        'metadata' => [
                            'source_file_uuid' => $file?->uuid,
                            'paper_uuid' => $paper->uuid,
                            'part_order' => $part->order,
                        ],
                    ]
                );

                $created->push($candidate);
                $sequence++;
            }

            $part->update(['question_count' => $created->count()]);

            return $created;
        });

        // Exactly one sequence number is consumed per persisted block.
        $sequenceStart = $firstSequence + $created->count();

        return $created;
    }

    /**
     * Reports whether the markdown appears to contain a math formula.
     *
     * Matches `$$...$$`, `$...$`, or one of the LaTeX commands \frac, \sum,
     * \int, \sqrt. A failed match (preg_match returning false) is reported
     * as "no formula".
     */
    protected function detectFormula(string $markdown): bool
    {
        return preg_match('/\\$\\$.*?\\$\\$|\\$[^$]+\\$|\\\\frac|\\\\sum|\\\\int|\\\\sqrt/u', $markdown) === 1;
    }

    /**
     * Normalises markdown by removing every carriage return and trimming
     * surrounding whitespace.
     */
    protected function cleanMarkdown(string $markdown): string
    {
        return trim(str_replace("\r", '', $markdown));
    }

    /**
     * Returns the SHA-1 hex digest of the trimmed content, or null when the
     * content is null or blank after trimming.
     */
    protected function hashContent(?string $content): ?string
    {
        $content = trim((string) $content);

        if ($content === '') {
            return null;
        }

        return hash('sha1', $content);
    }

    /**
     * Finds a reusable high-confidence candidate question so the same content
     * does not have to be AI-parsed again.
     *
     * A candidate matches when its raw or clean markdown equals the given
     * text, or when its raw/clean hash equals the hash of the given text.
     * It must additionally have confidence >= 0.85, ai_confidence >= 0.85,
     * or a `meta.ai_parsed` JSON value of 'true'. The match with the highest
     * confidence (then ai_confidence) is returned.
     *
     * NOTE(review): the JSON_EXTRACT / JSON_UNQUOTE expression is raw SQL and
     * presumably needs a MySQL-compatible database - confirm before running
     * against another driver.
     *
     * @return PreQuestionCandidate|null Null when nothing qualifies.
     */
    protected function findReusableCandidate(string $raw, string $clean): ?PreQuestionCandidate
    {
        $rawHash = $this->hashContent($raw);
        $cleanHash = $this->hashContent($clean);

        return PreQuestionCandidate::query()
            ->where(function ($query) use ($raw, $clean, $rawHash, $cleanHash) {
                $query->where('raw_markdown', $raw)
                    ->orWhere('clean_markdown', $clean);

                // Hashes are null for blank content; only add the clauses that exist.
                if ($rawHash || $cleanHash) {
                    $query->orWhere(function ($inner) use ($rawHash, $cleanHash) {
                        if ($rawHash) {
                            $inner->where('raw_hash', $rawHash);
                        }
                        if ($cleanHash) {
                            $inner->orWhere('clean_hash', $cleanHash);
                        }
                    });
                }
            })
            ->where(function ($query) {
                $query->where('confidence', '>=', 0.85)
                    ->orWhere('ai_confidence', '>=', 0.85)
                    ->orWhereRaw("JSON_UNQUOTE(JSON_EXTRACT(meta, '$.ai_parsed')) = 'true'");
            })
            ->orderByDesc('confidence')
            ->orderByDesc('ai_confidence')
            ->first();
    }

    /**
     * Returns the MarkdownImport standing in for a paper that has no explicit
     * import, creating it on first use.
     *
     * The lookup is keyed by file_name (the paper file's original filename, or
     * "paper-<id>" when that is unavailable) and source_name (the paper title),
     * so papers sharing both values resolve to the same import row. The
     * remaining attributes are only applied when the row is created.
     */
    protected function ensureSyntheticImport(SourcePaper $paper): MarkdownImport
    {
        return MarkdownImport::firstOrCreate(
            [
                'file_name' => $paper->file?->original_filename ?? ('paper-' . $paper->id),
                'source_name' => $paper->title,
            ],
            [
                'original_markdown' => $paper->raw_markdown,
                'status' => MarkdownImport::STATUS_PARSED,
                'progress_stage' => MarkdownImport::STAGE_PARSED,
                'progress_message' => 'Auto generated from source_papers',
                'progress_current' => 0,
                'progress_total' => 0,
                'progress_updated_at' => now(),
            ]
        );
    }
}
|