yms
/
math_cms


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
							<?php

namespace App\Services;

use App\Models\SourceFile;
use App\Models\SourcePaper;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Str;

/**
 * Splits the raw Markdown of a source file into individual papers and persists them.
 *
 * Paper boundaries are detected heuristically from, in order of priority:
 *   1. hidden markers of the form `<!-- paper: Title -->`,
 *   2. plain text lines that contain a paper keyword,
 *   3. Markdown headings whose text contains a paper keyword or the character "卷".
 */
class SourcePaperExtractorService
{
    /** A plain (non-heading) line longer than this is treated as body text, not as a paper title. */
    private const MAX_PLAIN_TITLE_LENGTH = 80;

    /** Titles are truncated to this many characters by sanitizeTitle(). */
    private const MAX_TITLE_LENGTH = 200;

    /** Untitled segments whose body is shorter than this are dropped from the final result. */
    private const MIN_UNTITLED_RAW_LENGTH = 80;

    /** Untitled segments whose body is shorter than this are appended to the previous segment. */
    private const UNTITLED_FRAGMENT_RAW_LENGTH = 500;

    /** Titled segments are appended to the previous one when the title is shorter than this ... */
    private const SHORT_TITLE_LENGTH = 5;

    /** ... and the body is shorter than this. */
    private const SHORT_TITLE_RAW_LENGTH = 300;

    /**
     * Cut the Markdown of a single source file into papers and persist them.
     *
     * Runs inside one DB transaction: the papers currently attached to the file are
     * deleted via `$sourceFile->papers()->delete()` and one SourcePaper row is created
     * per detected segment, numbered from 1 in document order.
     *
     * @param SourceFile $sourceFile File whose `raw_markdown` is split.
     *
     * @return Collection The SourcePaper models that were created, in order.
     */
    public function extract(SourceFile $sourceFile): Collection
    {
        $segments = $this->splitIntoPapers($sourceFile->raw_markdown);

        return DB::transaction(function () use ($sourceFile, $segments) {
            $sourceFile->papers()->delete();

            // File-level metadata is the fallback for every per-paper field below.
            $fileMeta = $sourceFile->extracted_metadata ?? [];

            // NOTE(review): segments produced by splitIntoPapers() in this class only carry
            // title / full_title / raw / meta, so the per-segment lookups for chapter, grade,
            // term, edition, textbook_series, source_type and source_year currently always
            // fall through to the file-level value (or null). The values detected from the
            // title are only stored under `detected_metadata` — confirm this is intended.
            $papers = collect();
            foreach ($segments as $idx => $segment) {
                $papers->push(
                    SourcePaper::create([
                        'uuid' => (string) Str::uuid(),
                        'source_file_id' => $sourceFile->id,
                        'order' => $idx + 1,
                        'title' => $segment['title'] ?? null,
                        'full_title' => $segment['full_title'] ?? null,
                        'chapter' => $segment['chapter'] ?? $fileMeta['chapter'] ?? null,
                        'grade' => $segment['grade'] ?? $fileMeta['grade'] ?? null,
                        'term' => $segment['term'] ?? $fileMeta['term'] ?? null,
                        'edition' => $segment['edition'] ?? $fileMeta['edition'] ?? null,
                        'textbook_series' => $segment['textbook_series'] ?? $fileMeta['textbook_series'] ?? null,
                        'source_type' => $segment['source_type'] ?? null,
                        'source_year' => $segment['source_year'] ?? $fileMeta['year'] ?? null,
                        'raw_markdown' => $segment['raw'],
                        'detected_metadata' => $segment['meta'] ?? [],
                    ])
                );
            }

            return $papers;
        });
    }

    /**
     * Split a Markdown document into paper segments.
     *
     * A new segment starts at a hidden `<!-- paper: ... -->` marker, at a plain line that
     * looks like a paper title, or at a Markdown heading that looks like a paper title.
     * Every other line is appended to the segment currently being collected. Adjacent
     * segments are then merged by mergeAdjacentSegments(), and untitled segments with a
     * very short body are dropped.
     *
     * @param string $markdown Raw Markdown text.
     *
     * @return array<int, array{title: ?string, full_title: ?string, raw: string, meta: array<string, string>}>
     */
    public function splitIntoPapers(string $markdown): array
    {
        // preg_split() returns false on a PCRE failure; treat that as "no lines" so the
        // whole-document fallback below is used instead of iterating over false.
        $lines = preg_split('/\r\n|\r|\n/', $markdown) ?: [];
        $segments = [];
        $current = ['title' => null, 'buffer' => []];

        $headingPattern = '/^(#{1,2})\s*(.+)$/u';
        $paperKeywords = '/(期中|期末|专项|模拟|基础卷|提升卷|练习卷|单元卷|测试卷|套卷|试卷)/u';
        $sectionPrefix = '/^(卷\\s*[一二三四五六七八九十0-9IVX]+|第\\s*[一二三四五六七八九十0-9IVX]+\\s*卷)/u';
        $chapterPaperPattern = '/^(第\\s*[一二三四五六七八九十0-9]+\\s*[章节单元]).*(质量检测卷|能力提优检测卷|基础过关检测卷|检测卷|训练卷|专项训练卷)/u';
        $paperLinePattern = '/(质量检测卷|能力提优检测卷|基础过关检测卷|检测卷|训练卷|专项训练卷|期中|期末|专项|模拟|基础卷|提升卷|练习卷|单元卷|测试卷|套卷|试卷)/u';
        $questionLinePattern = '/^\\s*(\\d+|[A-D])\\s*[\\.、\\)]/u';
        $excludeKeywords = '/(答题卡|参考答案|扫描全能王|解析|来源)/u';
        $commentPattern = '/<!--\s*paper:\s*(.+?)\s*-->/i';
        $partHeadingPattern = '/^(选择题|填空题|解答题|综合题|计算题|应用题)/u';

        foreach ($lines as $line) {
            $trimmed = trim($line);

            // Hidden paper markers take priority over every heuristic below.
            if (preg_match($commentPattern, $trimmed, $cm)) {
                if (!empty($current['buffer'])) {
                    $segments[] = $this->buildSegment($current['title'], $current['buffer']);
                }
                $current = ['title' => trim($cm[1]), 'buffer' => [$line]];
                continue;
            }

            // Match against the trimmed line so that a heading with leading whitespace is
            // still recognised as a heading instead of being skipped by both branches.
            $isHeading = preg_match($headingPattern, $trimmed, $m) === 1;

            // Plain text line that may itself be a paper title.
            if ($trimmed !== '' && !$isHeading) {
                $isChapterPaper = preg_match($chapterPaperPattern, $trimmed) === 1;
                $isPaperLine = $isChapterPaper || preg_match($paperLinePattern, $trimmed) === 1;

                // Section prefixes, question-type headings and numbered question / option
                // lines are never paper titles, even when they contain a paper keyword.
                $isCandidate = $isPaperLine
                    && preg_match($sectionPrefix, $trimmed) !== 1
                    && preg_match($partHeadingPattern, $trimmed) !== 1
                    && preg_match($questionLinePattern, $trimmed) !== 1;

                if ($isCandidate) {
                    // A candidate stays in the current segment when it contains an excluded
                    // keyword, is too long to be a title (chapter-style titles are exempt),
                    // or merely repeats the current title.
                    $staysInCurrent = preg_match($excludeKeywords, $trimmed) === 1
                        || (!$isChapterPaper && mb_strlen($trimmed) > self::MAX_PLAIN_TITLE_LENGTH)
                        || $this->isSameTitle($current['title'], $trimmed);

                    if ($staysInCurrent) {
                        $current['buffer'][] = $line;
                        continue;
                    }

                    if (!empty($current['buffer'])) {
                        $segments[] = $this->buildSegment($current['title'], $current['buffer']);
                    }
                    $current = ['title' => $this->sanitizeTitle($trimmed), 'buffer' => [$line]];
                    continue;
                }
            }

            if (!$isHeading) {
                $current['buffer'][] = $line;
                continue;
            }

            // Markdown heading: it opens a new paper only when its text looks like a paper title.
            $title = $this->sanitizeTitle(trim($m[2]));
            $isPaper = preg_match($paperKeywords, $title) === 1
                || (str_contains($title, '卷') && preg_match($sectionPrefix, $title) !== 1);

            if (!$isPaper || $this->isSameTitle($current['title'], $title)) {
                $current['buffer'][] = $line;
                continue;
            }

            if (!empty($current['buffer'])) {
                $segments[] = $this->buildSegment($current['title'], $current['buffer']);
            }
            $current = ['title' => $title, 'buffer' => [$line]];
        }

        if (!empty($current['buffer'])) {
            $segments[] = $this->buildSegment($current['title'], $current['buffer']);
        }

        // Nothing was collected at all: return the whole document as one untitled paper.
        if (empty($segments)) {
            return [[
                'title' => null,
                'full_title' => null,
                'raw' => trim($markdown),
                'meta' => [],
            ]];
        }

        $segments = $this->mergeAdjacentSegments($segments);

        // Drop untitled leftovers that are too short to be a paper; array_values() re-indexes
        // the list so callers can rely on sequential keys.
        return array_values(array_filter($segments, static function (array $segment): bool {
            $title = trim((string) ($segment['title'] ?? ''));
            $raw = trim((string) ($segment['raw'] ?? ''));

            return !($title === '' && mb_strlen($raw) < self::MIN_UNTITLED_RAW_LENGTH);
        }));
    }

    /**
     * Normalise a candidate title: strip leading bullet characters, a leading enumeration
     * marker (circled or ASCII digits followed by whitespace, "." or "、"), a trailing
     * "<digits> / 答 <digits>" suffix, and truncate to MAX_TITLE_LENGTH characters.
     *
     * @param string $title Raw title text.
     *
     * @return string The cleaned title (may be empty).
     */
    protected function sanitizeTitle(string $title): string
    {
        $title = trim($title);

        // preg_replace() returns null on a PCRE error (e.g. malformed UTF-8 under the /u
        // modifier); keep the previous value in that case instead of passing null along.
        $title = preg_replace('/^[◎◆•·\\*\\-\\s]+/u', '', $title) ?? $title;
        $title = preg_replace('/^[①②③④⑤⑥⑦⑧⑨⑩\\d]+[\\s\\.、]+/u', '', $title) ?? $title;
        $title = preg_replace('/\\s*\\d+\\s*\\/\\s*答\\s*\\d+$/u', '', $title) ?? $title;
        $title = trim($title);

        if (mb_strlen($title) > self::MAX_TITLE_LENGTH) {
            $title = mb_substr($title, 0, self::MAX_TITLE_LENGTH);
        }

        return $title;
    }

    /**
     * Tell whether two titles are equal after sanitizeTitle() has been applied to both.
     *
     * A null or otherwise falsy title on either side never counts as equal.
     *
     * @param string|null $currentTitle Title of the segment being collected.
     * @param string|null $nextTitle    Title of the candidate segment.
     */
    protected function isSameTitle(?string $currentTitle, ?string $nextTitle): bool
    {
        $currentTitle = $currentTitle ? $this->sanitizeTitle($currentTitle) : null;
        $nextTitle = $nextTitle ? $this->sanitizeTitle($nextTitle) : null;

        return $currentTitle !== null && $nextTitle !== null && $currentTitle === $nextTitle;
    }

    /**
     * Fold segments into their predecessor when they are duplicates or small fragments.
     *
     * A segment is appended to the previously kept one (bodies joined by a blank line) when:
     *   1. both have the same title according to isSameTitle();
     *   2. it has no title and its body is shorter than UNTITLED_FRAGMENT_RAW_LENGTH; or
     *   3. its title is shorter than SHORT_TITLE_LENGTH and its body is shorter than
     *      SHORT_TITLE_RAW_LENGTH.
     * Only the body is merged; the predecessor keeps its own title and meta.
     *
     * NOTE(review): the original comment on rule 3 also mentioned "does not contain a core
     * keyword", but no keyword check is performed — confirm which one is intended.
     *
     * @param array<int, array<string, mixed>> $segments Segments in document order.
     *
     * @return array<int, array<string, mixed>> Merged segments, sequentially indexed.
     */
    protected function mergeAdjacentSegments(array $segments): array
    {
        $merged = [];
        foreach ($segments as $segment) {
            $title = $segment['title'] ?? null;
            $raw = $segment['raw'] ?? '';
            $lastIndex = count($merged) - 1;

            if ($lastIndex >= 0) {
                $rawLength = mb_strlen(trim($raw));

                $foldIntoPrevious = $this->isSameTitle($merged[$lastIndex]['title'] ?? null, $title)
                    || (empty($title) && $rawLength < self::UNTITLED_FRAGMENT_RAW_LENGTH)
                    || ($title
                        && mb_strlen($title) < self::SHORT_TITLE_LENGTH
                        && $rawLength < self::SHORT_TITLE_RAW_LENGTH);

                if ($foldIntoPrevious) {
                    $merged[$lastIndex]['raw'] = trim($merged[$lastIndex]['raw'] . "\n\n" . $raw);
                    continue;
                }
            }

            $merged[] = $segment;
        }

        return $merged;
    }

    /**
     * Derive metadata from a paper title.
     *
     * Keys are only present when detected:
     *   - `chapter`:     the first "第…章" occurrence;
     *   - `source_year`: the first "20" followed by two digits;
     *   - `source_type`: `midterm` (期中), `final` (期末) or `mock` (模拟), first match wins.
     *
     * @param string|null $title Paper title, or null for an untitled segment.
     *
     * @return array<string, string>
     */
    protected function detectMetaFromTitle(?string $title): array
    {
        if (!$title) {
            return [];
        }

        $meta = [];
        if (preg_match('/第[一二三四五六七八九十0-9]+章/u', $title, $m)) {
            $meta['chapter'] = $m[0];
        }
        if (preg_match('/20[0-9]{2}/', $title, $m)) {
            $meta['source_year'] = $m[0];
        }

        $sourceType = match (true) {
            str_contains($title, '期中') => 'midterm',
            str_contains($title, '期末') => 'final',
            str_contains($title, '模拟') => 'mock',
            default => null,
        };
        if ($sourceType !== null) {
            $meta['source_type'] = $sourceType;
        }

        return $meta;
    }

    /**
     * Build the segment record for the lines collected under one title.
     *
     * @param string|null        $title  Title of the segment, or null when none was seen.
     * @param array<int, string> $buffer Original lines belonging to the segment.
     *
     * @return array{title: ?string, full_title: ?string, raw: string, meta: array<string, string>}
     */
    private function buildSegment(?string $title, array $buffer): array
    {
        return [
            'title' => $title,
            'full_title' => $title,
            'raw' => trim(implode("\n", $buffer)),
            'meta' => $this->detectMetaFromTitle($title),
        ];
    }
}