| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269 |
- <?php
- namespace App\Services;
- use App\Models\SourceFile;
- use App\Models\SourcePaper;
- use Illuminate\Support\Collection;
- use Illuminate\Support\Facades\DB;
- use Illuminate\Support\Str;
/**
 * Splits the Markdown of one source file into individual exam papers and
 * persists each paper as a SourcePaper row.
 *
 * A new paper starts at (in priority order):
 *   1. an explicit HTML comment marker of the form "<!-- paper: Title -->";
 *   2. a plain-text line that looks like a paper title;
 *   3. a Markdown heading whose text looks like a paper title.
 */
class SourcePaperExtractorService
{
    /**
     * One or two leading '#' followed by the heading text (capture group 2).
     * NOTE(review): a line starting with three or more '#' also matches, with the
     * surplus '#' captured as part of the text - confirm whether that is intended.
     */
    protected const HEADING_PATTERN = '/^(#{1,2})\s*(.+)$/u';

    /** Keywords that make a heading count as the start of a paper. */
    protected const PAPER_KEYWORDS = '/(期中|期末|专项|模拟|基础卷|提升卷|练习卷|单元卷|测试卷|套卷|试卷)/u';

    /**
     * In-paper section labels such as "卷一" or "第二卷". A plain-text line that
     * matches is never treated as a paper title; for headings it only disables
     * the generic "contains 卷" fallback.
     */
    protected const SECTION_PREFIX = '/^(卷\\s*[一二三四五六七八九十0-9IVX]+|第\\s*[一二三四五六七八九十0-9IVX]+\\s*卷)/u';

    /** Chapter/unit prefix followed later on the same line by a "…卷" paper-type suffix. */
    protected const CHAPTER_PAPER_PATTERN = '/^(第\\s*[一二三四五六七八九十0-9]+\\s*[章节单元]).*(质量检测卷|能力提优检测卷|基础过关检测卷|检测卷|训练卷|专项训练卷)/u';

    /** Keywords that make a plain-text (non-heading) line a paper-title candidate. */
    protected const PAPER_LINE_PATTERN = '/(质量检测卷|能力提优检测卷|基础过关检测卷|检测卷|训练卷|专项训练卷|期中|期末|专项|模拟|基础卷|提升卷|练习卷|单元卷|测试卷|套卷|试卷)/u';

    /** Lines that open with a question number or an option letter A-D plus a delimiter. */
    protected const QUESTION_LINE_PATTERN = '/^\\s*(\\d+|[A-D])\\s*[\\.、\\)]/u';

    /** Candidate lines containing any of these words are kept as body text. */
    protected const EXCLUDE_KEYWORDS = '/(答题卡|参考答案|扫描全能王|解析|来源)/u';

    /** Explicit paper marker written as an HTML comment; group 1 is the title. */
    protected const COMMENT_PATTERN = '/<!--\s*paper:\s*(.+?)\s*-->/i';

    /** Question-type headings inside a paper (multiple choice, fill-in, ...). */
    protected const PART_HEADING_PATTERN = '/^(选择题|填空题|解答题|综合题|计算题|应用题)/u';

    /** Plain-text candidates longer than this are kept as body text unless they match the chapter-paper pattern. */
    protected const MAX_PLAIN_TITLE_LENGTH = 80;

    /**
     * Cut every paper out of a single Markdown source file and persist them.
     *
     * Runs inside one database transaction: the papers previously attached to
     * the file are deleted, then one SourcePaper is created per segment with a
     * 1-based `order`.
     *
     * @param SourceFile $sourceFile File whose `raw_markdown` is split.
     *
     * @return Collection The SourcePaper models created, in document order.
     */
    public function extract(SourceFile $sourceFile): Collection
    {
        $segments = $this->splitIntoPapers($sourceFile->raw_markdown);

        return DB::transaction(function () use ($sourceFile, $segments) {
            // Re-extraction replaces whatever was stored for this file before.
            $sourceFile->papers()->delete();

            $fileMeta = $sourceFile->extracted_metadata;
            $papers = collect();

            foreach ($segments as $idx => $segment) {
                $papers->push(SourcePaper::create(array_merge(
                    [
                        'uuid' => (string) Str::uuid(),
                        'source_file_id' => $sourceFile->id,
                        'order' => $idx + 1,
                        'title' => $segment['title'] ?? null,
                        'full_title' => $segment['full_title'] ?? null,
                    ],
                    $this->resolveSegmentAttributes($segment, $fileMeta),
                    [
                        'raw_markdown' => $segment['raw'],
                        'detected_metadata' => $segment['meta'] ?? [],
                    ]
                )));
            }

            return $papers;
        });
    }

    /**
     * Resolve the descriptive columns of one paper.
     *
     * Each column takes the first non-null value of:
     *   1. an explicit key on the segment itself,
     *   2. the metadata detected from the segment title (`$segment['meta']`),
     *   3. the file-level metadata (`source_year` reads the file key `year`;
     *      `source_type` has no file-level fallback).
     *
     * Step 2 is what lets the values found by detectMetaFromTitle() reach the
     * columns: the segments built by splitIntoPapers() only carry them under
     * `meta`, never as top-level keys.
     *
     * @param array $segment  Segment as produced by splitIntoPapers().
     * @param mixed $fileMeta File-level metadata (array, ArrayAccess or null).
     *
     * @return array<string, mixed> Values keyed by column name.
     */
    protected function resolveSegmentAttributes(array $segment, mixed $fileMeta): array
    {
        $detected = $segment['meta'] ?? [];

        // column name => key in the file-level metadata (null = no fallback).
        $columns = [
            'chapter' => 'chapter',
            'grade' => 'grade',
            'term' => 'term',
            'edition' => 'edition',
            'textbook_series' => 'textbook_series',
            'source_type' => null,
            'source_year' => 'year',
        ];

        $resolved = [];
        foreach ($columns as $column => $fileKey) {
            $resolved[$column] = $segment[$column]
                ?? $detected[$column]
                ?? ($fileKey !== null ? ($fileMeta[$fileKey] ?? null) : null);
        }

        return $resolved;
    }

    /**
     * Split a Markdown document into paper segments.
     *
     * Lines are scanned in order; each line either opens a new segment or is
     * appended to the current one. Afterwards adjacent fragments are merged and
     * untitled segments with fewer than 80 characters of content are dropped.
     *
     * @param string $markdown Full Markdown text of the source file.
     *
     * @return array<int, array{title: ?string, full_title: ?string, raw: string, meta: array}>
     */
    public function splitIntoPapers(string $markdown): array
    {
        // preg_split() returns false on a PCRE failure; treating that as "no
        // lines" routes the document to the whole-text fallback below instead
        // of iterating over a boolean.
        $lines = preg_split('/\r\n|\r|\n/', $markdown) ?: [];

        $segments = [];
        $current = ['title' => null, 'buffer' => []];

        foreach ($lines as $line) {
            $newTitle = $this->detectPaperStart($line, $current['title']);

            if ($newTitle === null) {
                $current['buffer'][] = $line;
                continue;
            }

            // Close the running segment before opening the next one.
            if (!empty($current['buffer'])) {
                $segments[] = $this->buildSegment($current['title'], $current['buffer']);
            }
            $current = ['title' => $newTitle, 'buffer' => [$line]];
        }

        if (!empty($current['buffer'])) {
            $segments[] = $this->buildSegment($current['title'], $current['buffer']);
        }

        if (empty($segments)) {
            // Nothing could be segmented: hand back the document as one untitled paper.
            return [[
                'title' => null,
                'full_title' => null,
                'raw' => trim($markdown),
                'meta' => [],
            ]];
        }

        $segments = $this->mergeAdjacentSegments($segments);

        // Drop untitled leftovers that are too short to be a paper.
        return array_values(array_filter($segments, static function (array $segment): bool {
            $title = trim((string) ($segment['title'] ?? ''));
            $raw = trim((string) ($segment['raw'] ?? ''));

            return !($title === '' && mb_strlen($raw) < 80);
        }));
    }

    /**
     * Decide whether a line opens a new paper.
     *
     * @param string      $line         Raw line (without its line terminator).
     * @param string|null $currentTitle Title of the segment being collected.
     *
     * @return string|null Title of the new paper, or null when the line belongs
     *                     to the current segment.
     */
    protected function detectPaperStart(string $line, ?string $currentTitle): ?string
    {
        $trimmed = trim($line);

        // 1. Explicit marker wins over every heuristic; its title is used as written.
        if (preg_match(self::COMMENT_PATTERN, $trimmed, $cm)) {
            return trim($cm[1]);
        }

        // 2. Plain-text line that looks like a paper title.
        if ($trimmed !== '' && !preg_match(self::HEADING_PATTERN, $trimmed)) {
            $isChapterPaper = preg_match(self::CHAPTER_PAPER_PATTERN, $trimmed) === 1;
            $isCandidate = ($isChapterPaper || preg_match(self::PAPER_LINE_PATTERN, $trimmed) === 1)
                && preg_match(self::SECTION_PREFIX, $trimmed) !== 1
                && preg_match(self::PART_HEADING_PATTERN, $trimmed) !== 1
                && preg_match(self::QUESTION_LINE_PATTERN, $trimmed) !== 1;

            if ($isCandidate) {
                $keepAsBody = preg_match(self::EXCLUDE_KEYWORDS, $trimmed)
                    // Long lines are treated as prose unless they match the chapter-paper shape.
                    || (!$isChapterPaper && mb_strlen($trimmed) > self::MAX_PLAIN_TITLE_LENGTH)
                    // A repeated title does not reopen the paper.
                    || $this->isSameTitle($currentTitle, $trimmed);

                return $keepAsBody ? null : $this->sanitizeTitle($trimmed);
            }
        }

        // 3. Markdown heading. Matched against the raw line, so a heading with
        //    leading whitespace is not recognised here.
        if (!preg_match(self::HEADING_PATTERN, $line, $m)) {
            return null;
        }

        $title = $this->sanitizeTitle(trim($m[2]));
        $isSectionPrefix = preg_match(self::SECTION_PREFIX, $title) === 1;
        $isPaper = preg_match(self::PAPER_KEYWORDS, $title) === 1
            || (str_contains($title, '卷') && !$isSectionPrefix);

        if (!$isPaper || $this->isSameTitle($currentTitle, $title)) {
            return null;
        }

        return $title;
    }

    /**
     * Build the segment record for a finished buffer of lines.
     *
     * @param string|null   $title  Title of the segment (null for leading untitled text).
     * @param array<string> $buffer Lines collected for the segment.
     *
     * @return array{title: ?string, full_title: ?string, raw: string, meta: array}
     */
    protected function buildSegment(?string $title, array $buffer): array
    {
        return [
            'title' => $title,
            'full_title' => $title,
            'raw' => trim(implode("\n", $buffer)),
            'meta' => $this->detectMetaFromTitle($title),
        ];
    }

    /**
     * Normalise a candidate title.
     *
     * Strips leading bullet characters, a leading numeric or circled-number
     * index, and a trailing "<n> / 答 <n>" marker, then caps the result at 200
     * characters.
     *
     * @param string $title Raw title text.
     *
     * @return string Cleaned title (may be empty).
     */
    protected function sanitizeTitle(string $title): string
    {
        $title = trim($title);

        $strip = [
            '/^[◎◆•·\\*\\-\\s]+/u',
            '/^[①②③④⑤⑥⑦⑧⑨⑩\\d]+[\\s\\.、]+/u',
            '/\\s*\\d+\\s*\\/\\s*答\\s*\\d+$/u',
        ];
        foreach ($strip as $pattern) {
            // preg_replace() yields null on a PCRE error; keep the text in that case.
            $title = preg_replace($pattern, '', $title) ?? $title;
        }

        $title = trim($title);

        return mb_strlen($title) > 200 ? mb_substr($title, 0, 200) : $title;
    }

    /**
     * Tell whether two titles are equal once both are sanitised.
     *
     * A null or otherwise falsy title never equals anything.
     *
     * @param string|null $currentTitle Title of the running segment.
     * @param string|null $nextTitle    Candidate title.
     */
    protected function isSameTitle(?string $currentTitle, ?string $nextTitle): bool
    {
        if (!$currentTitle || !$nextTitle) {
            return false;
        }

        return $this->sanitizeTitle($currentTitle) === $this->sanitizeTitle($nextTitle);
    }

    /**
     * Fold fragments into the segment that precedes them.
     *
     * A segment is appended to the previous one (joined by a blank line; only
     * the previous segment's `raw` changes) when any of these holds:
     *   - both carry the same title;
     *   - it has no title and fewer than 500 characters of content;
     *   - its title is shorter than 5 characters and its content is shorter
     *     than 300 characters.
     *
     * NOTE(review): the original comment on the last rule also mentioned "does
     * not contain core keywords", but no keyword check exists in the code -
     * confirm which one is intended.
     *
     * @param array<int, array> $segments Segments in document order.
     *
     * @return array<int, array> Segments after merging.
     */
    protected function mergeAdjacentSegments(array $segments): array
    {
        $merged = [];

        foreach ($segments as $segment) {
            $title = $segment['title'] ?? null;
            $raw = $segment['raw'] ?? '';
            $lastIndex = count($merged) - 1;

            if ($lastIndex >= 0) {
                $length = mb_strlen(trim($raw));

                $sameTitle = $this->isSameTitle($merged[$lastIndex]['title'] ?? null, $title);
                $untitledFragment = empty($title) && $length < 500;
                $shortTitledFragment = $title && mb_strlen($title) < 5 && $length < 300;

                if ($sameTitle || $untitledFragment || $shortTitledFragment) {
                    $merged[$lastIndex]['raw'] = trim($merged[$lastIndex]['raw'] . "\n\n" . $raw);
                    continue;
                }
            }

            $merged[] = $segment;
        }

        return $merged;
    }

    /**
     * Detect metadata that is spelled out in a paper title.
     *
     * @param string|null $title Paper title.
     *
     * @return array Subset of the keys `chapter` (e.g. "第三章"), `source_year`
     *               (first "20xx" found) and `source_type` ("midterm", "final"
     *               or "mock"); empty when the title is empty.
     */
    protected function detectMetaFromTitle(?string $title): array
    {
        if (!$title) {
            return [];
        }

        $meta = [];

        if (preg_match('/第[一二三四五六七八九十0-9]+章/u', $title, $m)) {
            $meta['chapter'] = $m[0];
        }

        if (preg_match('/20[0-9]{2}/', $title, $m)) {
            $meta['source_year'] = $m[0];
        }

        // First matching keyword wins, in this order.
        if (str_contains($title, '期中')) {
            $meta['source_type'] = 'midterm';
        } elseif (str_contains($title, '期末')) {
            $meta['source_type'] = 'final';
        } elseif (str_contains($title, '模拟')) {
            $meta['source_type'] = 'mock';
        }

        return $meta;
    }
}
|