splitIntoPapers($sourceFile->raw_markdown); return DB::transaction(function () use ($sourceFile, $segments) { $sourceFile->papers()->delete(); $papers = collect(); foreach ($segments as $idx => $segment) { $papers->push( SourcePaper::create([ 'uuid' => (string) Str::uuid(), 'source_file_id' => $sourceFile->id, 'order' => $idx + 1, 'title' => $segment['title'] ?? null, 'full_title' => $segment['full_title'] ?? null, 'chapter' => $segment['chapter'] ?? $sourceFile->extracted_metadata['chapter'] ?? null, 'grade' => $segment['grade'] ?? $sourceFile->extracted_metadata['grade'] ?? null, 'term' => $segment['term'] ?? $sourceFile->extracted_metadata['term'] ?? null, 'edition' => $segment['edition'] ?? $sourceFile->extracted_metadata['edition'] ?? null, 'textbook_series' => $segment['textbook_series'] ?? $sourceFile->extracted_metadata['textbook_series'] ?? null, 'source_type' => $segment['source_type'] ?? null, 'source_year' => $segment['source_year'] ?? $sourceFile->extracted_metadata['year'] ?? null, 'raw_markdown' => $segment['raw'], 'detected_metadata' => $segment['meta'] ?? 
[],
                    ])
                );
            }

            return $papers;
        });
    }

    /**
     * Split a Markdown document into per-paper segments.
     *
     * The document is scanned line by line. A new segment starts when a line is
     * recognised as (1) a hidden paper marker, (2) a plain-text line that looks
     * like a paper title, or (3) a Markdown heading whose text looks like a
     * paper title. Every other line is appended to the current segment.
     * Adjacent segments are then merged by mergeAdjacentSegments(), and
     * untitled segments with fewer than 80 characters of content are dropped.
     *
     * @param string $markdown Raw Markdown of the whole source file.
     *
     * @return array<int, array{title: ?string, full_title: ?string, raw: string, meta: array}>
     *         Segments in document order. If no segment could be collected at
     *         all, a single untitled segment holding the whole trimmed document
     *         is returned.
     */
    public function splitIntoPapers(string $markdown): array
    {
        $lines = preg_split('/\r\n|\r|\n/', $markdown);
        if ($lines === false) {
            // Splitting failed: leave nothing to scan so the whole-document
            // fallback below is used instead of iterating over `false`.
            $lines = [];
        }

        $headingPattern = '/^(#{1,2})\s*(.+)$/u';
        $paperKeywords = '/(期中|期末|专项|模拟|基础卷|提升卷|练习卷|单元卷|测试卷|套卷|试卷)/u';
        $sectionPrefix = '/^(卷\\s*[一二三四五六七八九十0-9IVX]+|第\\s*[一二三四五六七八九十0-9IVX]+\\s*卷)/u';
        $chapterPaperPattern = '/^(第\\s*[一二三四五六七八九十0-9]+\\s*[章节单元]).*(质量检测卷|能力提优检测卷|基础过关检测卷|检测卷|训练卷|专项训练卷)/u';
        $paperLinePattern = '/(质量检测卷|能力提优检测卷|基础过关检测卷|检测卷|训练卷|专项训练卷|期中|期末|专项|模拟|基础卷|提升卷|练习卷|单元卷|测试卷|套卷|试卷)/u';
        $questionLinePattern = '/^\\s*(\\d+|[A-D])\\s*[\\.、\\)]/u';
        $partHeadingPattern = '/^(选择题|填空题|解答题|综合题|计算题|应用题)/u';
        $excludeKeywords = '/(答题卡|参考答案|扫描全能王|解析|来源)/u';
        // NOTE(review): as it stands this pattern is empty and has no capture
        // group, yet the marker branch reads $cm[1]. The marker syntax was
        // presumably lost somewhere upstream - confirm and restore it.
        $commentPattern = '//i';

        // Builds the stored segment from the accumulator state.
        $toSegment = fn (array $state): array => [
            'title' => $state['title'],
            'full_title' => $state['title'],
            'raw' => trim(implode("\n", $state['buffer'])),
            'meta' => $this->detectMetaFromTitle($state['title']),
        ];

        $segments = [];
        $current = ['title' => null, 'buffer' => []];

        foreach ($lines as $line) {
            $trimmed = trim($line);

            // Hidden paper markers take priority. Requiring the capture group
            // keeps an empty or group-less pattern from matching every line
            // and reading an undefined $cm[1].
            if (preg_match($commentPattern, $trimmed, $cm) === 1 && isset($cm[1])) {
                if (!empty($current['buffer'])) {
                    $segments[] = $toSegment($current);
                }
                $current = ['title' => trim($cm[1]), 'buffer' => [$line]];
                continue;
            }

            // Match the heading against the trimmed line in one place, so an
            // indented heading is treated as a heading rather than falling
            // between the plain-line and heading branches.
            // NOTE(review): because `\s*` may match nothing, a line starting
            // with three or more `#` also matches, leaving the extra `#` in
            // the captured title - confirm whether that is intended.
            $isHeading = preg_match($headingPattern, $trimmed, $m) === 1;

            if (!$isHeading) {
                $isChapterPaper = preg_match($chapterPaperPattern, $trimmed) === 1;

                // A plain line opens a new paper only if it carries a paper
                // keyword and is not a section prefix, a question-type
                // heading, a numbered question/option, an excluded line
                // (answer sheet, answers, ...), an over-long sentence, or a
                // repeat of the current title.
                $startsPaper = $trimmed !== ''
                    && ($isChapterPaper || preg_match($paperLinePattern, $trimmed) === 1)
                    && preg_match($sectionPrefix, $trimmed) !== 1
                    && preg_match($partHeadingPattern, $trimmed) !== 1
                    && preg_match($questionLinePattern, $trimmed) !== 1
                    && preg_match($excludeKeywords, $trimmed) !== 1
                    && ($isChapterPaper || mb_strlen($trimmed) <= 80)
                    && !$this->isSameTitle($current['title'], $trimmed);

                if (!$startsPaper) {
                    $current['buffer'][] = $line;
                    continue;
                }

                if (!empty($current['buffer'])) {
                    $segments[] = $toSegment($current);
                }
                $current = ['title' => $this->sanitizeTitle($trimmed), 'buffer' => [$line]];
                continue;
            }

            $title = $this->sanitizeTitle(trim($m[2]));
            $isSectionPrefix = preg_match($sectionPrefix, $title) === 1;
            $isPaper = preg_match($paperKeywords, $title) === 1
                || (str_contains($title, '卷') && !$isSectionPrefix);

            // Headings that are not paper titles, or that repeat the current
            // title, stay inside the current segment.
            if (!$isPaper || $this->isSameTitle($current['title'], $title)) {
                $current['buffer'][] = $line;
                continue;
            }

            if (!empty($current['buffer'])) {
                $segments[] = $toSegment($current);
            }
            $current = ['title' => $title, 'buffer' => [$line]];
        }

        if (!empty($current['buffer'])) {
            $segments[] = $toSegment($current);
        }

        if (empty($segments)) {
            return [[
                'title' => null,
                'full_title' => null,
                'raw' => trim($markdown),
                'meta' => [],
            ]];
        }

        $segments = $this->mergeAdjacentSegments($segments);

        // Drop untitled leftovers with fewer than 80 characters of content.
        return array_values(array_filter($segments, static function ($segment): bool {
            $title = trim((string) ($segment['title'] ?? ''));
            $raw = trim((string) ($segment['raw'] ?? ''));

            return !($title === '' && mb_strlen($raw) < 80);
        }));
    }

    /**
     * Normalise a candidate title.
     *
     * Strips leading bullet/decoration characters, a leading circled or plain
     * number followed by a separator, and a trailing "<digits> / 答 <digits>"
     * suffix, then trims and caps the result at 200 characters.
     *
     * @param string $title Raw title text.
     *
     * @return string The cleaned title (possibly empty).
     */
    protected function sanitizeTitle(string $title): string
    {
        $stripPatterns = [
            '/^[◎◆•·\\*\\-\\s]+/u',
            '/^[①②③④⑤⑥⑦⑧⑨⑩\\d]+[\\s\\.、]+/u',
            '/\\s*\\d+\\s*\\/\\s*答\\s*\\d+$/u',
        ];

        $title = trim($title);
        foreach ($stripPatterns as $pattern) {
            // preg_replace() returns null on a PCRE error (for example
            // malformed UTF-8 under the `u` modifier); keep the previous value
            // instead of silently emptying the title.
            $title = preg_replace($pattern, '', $title) ?? $title;
        }
        $title = trim($title);

        return mb_strlen($title) > 200 ? mb_substr($title, 0, 200) : $title;
    }

    /**
     * Tell whether two titles are equal after sanitizeTitle() normalisation.
     *
     * A null or empty title on either side is never considered "the same".
     *
     * @param string|null $currentTitle Title of the segment being built.
     * @param string|null $nextTitle    Candidate title to compare against.
     *
     * @return bool True only when both titles are present and normalise to the same string.
     */
    protected function isSameTitle(?string $currentTitle, ?string $nextTitle): bool
    {
        // Explicit checks instead of truthiness, so a literal "0" title is
        // not mistaken for a missing one.
        if ($currentTitle === null || $currentTitle === '' || $nextTitle === null || $nextTitle === '') {
            return false;
        }

        return $this->sanitizeTitle($currentTitle) === $this->sanitizeTitle($nextTitle);
    }

    /**
     * Fold fragment segments into the segment that precedes them.
     *
     * A segment is appended to the previous one (raw text joined with a blank
     * line, the previous title and meta kept) when any of these holds:
     *  1. it has the same title as the previous segment;
     *  2. it has no title and fewer than 500 characters of content;
     *  3. its title is shorter than 5 characters and it has fewer than 300
     *     characters of content.
     * The first segment is always kept as is.
     *
     * @param array $segments Segments as produced by splitIntoPapers().
     *
     * @return array The merged segment list, in the original order.
     */
    protected function mergeAdjacentSegments(array $segments): array
    {
        $merged = [];

        foreach ($segments as $segment) {
            $lastIndex = count($merged) - 1;

            if ($lastIndex >= 0) {
                $title = $segment['title'] ?? null;
                $raw = $segment['raw'] ?? '';
                $rawLength = mb_strlen(trim($raw));

                // NOTE(review): the original comment on rule 3 said the short
                // title should also lack a core keyword, but no keyword check
                // is performed here - confirm which is intended.
                $isFragment = $this->isSameTitle($merged[$lastIndex]['title'] ?? null, $title)
                    || (empty($title) && $rawLength < 500)
                    || (!empty($title) && mb_strlen($title) < 5 && $rawLength < 300);

                if ($isFragment) {
                    $merged[$lastIndex]['raw'] = trim($merged[$lastIndex]['raw'] . "\n\n" . $raw);
                    continue;
                }
            }

            $merged[] = $segment;
        }

        return $merged;
    }

    /**
     * Derive metadata from a paper title.
     *
     * Looks for a "第…章" chapter marker, a four-digit year starting with 20,
     * and the keywords 期中 / 期末 / 模拟 (mapped to midterm / final / mock,
     * first match wins in that order).
     *
     * @param string|null $title Paper title, or null when the segment is untitled.
     *
     * @return array<string, string> Any of the keys chapter, source_year and
     *                               source_type; empty when nothing was detected.
     */
    protected function detectMetaFromTitle(?string $title): array
    {
        if (!$title) {
            return [];
        }

        $meta = [];

        if (preg_match('/第[一二三四五六七八九十0-9]+章/u', $title, $m)) {
            $meta['chapter'] = $m[0];
        }

        if (preg_match('/20[0-9]{2}/', $title, $m)) {
            $meta['source_year'] = $m[0];
        }

        $sourceType = match (true) {
            str_contains($title, '期中') => 'midterm',
            str_contains($title, '期末') => 'final',
            str_contains($title, '模拟') => 'mock',
            default => null,
        };
        if ($sourceType !== null) {
            $meta['source_type'] = $sourceType;
        }

        return $meta;
    }
}