| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123 |
- <?php
- namespace App\Services;
- use App\Models\SourceFile;
- use App\Models\SourcePaper;
- use Illuminate\Support\Collection;
- use Illuminate\Support\Facades\DB;
- use Illuminate\Support\Str;
class SourcePaperExtractorService
{
    /**
     * Heading keywords mapped to the source_type they indicate.
     *
     * Order matters: the first keyword found in the heading wins, so a
     * heading containing both "期中" and "模拟" is classified as a midterm.
     */
    private const SOURCE_TYPE_KEYWORDS = [
        '期中' => 'midterm',
        '期末' => 'final',
        '模拟' => 'mock',
    ];

    /**
     * Split one Markdown source file into papers and persist them.
     *
     * Inside a single DB transaction this deletes whatever
     * `$sourceFile->papers()` currently returns and creates one SourcePaper
     * per segment produced by splitIntoPapers(), numbered from 1.
     *
     * Each descriptive column is resolved in this order: an explicit key on
     * the segment, then the metadata detected from the segment heading
     * (`$segment['meta']`), then the file-level `extracted_metadata`, then null.
     *
     * @param SourceFile $sourceFile File whose `raw_markdown` is split.
     *
     * @return Collection The SourcePaper models created, in document order.
     */
    public function extract(SourceFile $sourceFile): Collection
    {
        $segments = $this->splitIntoPapers($sourceFile->raw_markdown);
        $fileMeta = $sourceFile->extracted_metadata;

        return DB::transaction(function () use ($sourceFile, $segments, $fileMeta) {
            $sourceFile->papers()->delete();

            $papers = collect();

            foreach (array_values($segments) as $idx => $segment) {
                $papers->push(SourcePaper::create([
                    'uuid' => (string) Str::uuid(),
                    'source_file_id' => $sourceFile->id,
                    'order' => $idx + 1,
                    'title' => $segment['title'] ?? null,
                    'full_title' => $segment['full_title'] ?? null,
                    'chapter' => $this->resolveAttribute($segment, $fileMeta, 'chapter', 'chapter'),
                    'grade' => $this->resolveAttribute($segment, $fileMeta, 'grade', 'grade'),
                    'term' => $this->resolveAttribute($segment, $fileMeta, 'term', 'term'),
                    'edition' => $this->resolveAttribute($segment, $fileMeta, 'edition', 'edition'),
                    'textbook_series' => $this->resolveAttribute(
                        $segment,
                        $fileMeta,
                        'textbook_series',
                        'textbook_series'
                    ),
                    // source_type has no file-level fallback.
                    'source_type' => $this->resolveAttribute($segment, $fileMeta, 'source_type'),
                    // The file-level metadata stores the year under "year".
                    'source_year' => $this->resolveAttribute($segment, $fileMeta, 'source_year', 'year'),
                    'raw_markdown' => $segment['raw'],
                    'detected_metadata' => $segment['meta'] ?? [],
                ]));
            }

            return $papers;
        });
    }

    /**
     * Split Markdown into papers at level-1/level-2 headings that look like
     * a paper title.
     *
     * A heading starts a new paper when it has one or two leading `#` and its
     * text either contains "卷" or begins with one of 期中 / 期末 / 专项 / 模拟.
     * Deeper headings (`###` and below) never start a paper. Text before the
     * first paper heading becomes a segment with a null title, unless it is
     * blank, in which case it is dropped.
     *
     * @param string $markdown Full Markdown text of the source file.
     *
     * @return array List of segments, each with keys:
     *               `title` (?string, the part of the heading matched by the
     *               paper pattern), `full_title` (?string, the whole heading
     *               text), `raw` (string, trimmed Markdown including the
     *               heading line) and `meta` (array from detectMetaFromTitle()).
     *               Always contains at least one segment; when nothing
     *               non-blank is found it is a single untitled segment holding
     *               the trimmed input.
     */
    public function splitIntoPapers(string $markdown): array
    {
        $lines = preg_split('/\r\n|\r|\n/', $markdown);
        if ($lines === false) {
            $lines = [$markdown];
        }

        // (?!#) stops "###"+ headings from matching with the extra "#"
        // swallowed by ".+" and leaking into the title.
        $paperPattern = '/^(#{1,2})(?!#)\s*(.+卷|期中|期末|专项|模拟|基础卷|提升卷|练习卷)/u';

        $segments = [];
        $title = null;
        $fullTitle = null;
        $buffer = [];

        foreach ($lines as $line) {
            if (preg_match($paperPattern, $line, $m) !== 1) {
                $buffer[] = $line;
                continue;
            }

            // A new paper starts here: flush what was collected so far.
            $segment = $this->buildSegment($title, $fullTitle, $buffer);
            if ($segment !== null) {
                $segments[] = $segment;
            }

            $title = trim($m[2]);
            // The pattern capture stops at the keyword (or the last "卷"), so
            // take the complete heading text separately.
            $fullTitle = preg_replace('/^#{1,2}\s*|\s+$/u', '', $line) ?? $title;
            $buffer = [$line];
        }

        $segment = $this->buildSegment($title, $fullTitle, $buffer);
        if ($segment !== null) {
            $segments[] = $segment;
        }

        if ($segments === []) {
            return [[
                'title' => null,
                'full_title' => null,
                'raw' => trim($markdown),
                'meta' => [],
            ]];
        }

        return $segments;
    }

    /**
     * Detect chapter, year and source type from a paper heading.
     *
     * @param string|null $title Heading text to inspect.
     *
     * @return array Subset of the keys `chapter` (e.g. "第三章"),
     *               `source_year` (four-digit string starting with "20") and
     *               `source_type` ("midterm", "final" or "mock"); empty when
     *               the title is empty or nothing is recognised.
     */
    protected function detectMetaFromTitle(?string $title): array
    {
        if ($title === null || $title === '') {
            return [];
        }

        $meta = [];

        if (preg_match('/第[一二三四五六七八九十0-9]+章/u', $title, $m)) {
            $meta['chapter'] = $m[0];
        }

        if (preg_match('/20[0-9]{2}/', $title, $m)) {
            $meta['source_year'] = $m[0];
        }

        foreach (self::SOURCE_TYPE_KEYWORDS as $keyword => $type) {
            // Byte-wise search is safe for UTF-8 substrings and avoids a
            // framework call in otherwise pure string logic.
            if (strpos($title, $keyword) !== false) {
                $meta['source_type'] = $type;
                break;
            }
        }

        return $meta;
    }

    /**
     * Build one segment from the lines collected for it.
     *
     * @param string|null $title     Short title captured by the paper pattern.
     * @param string|null $fullTitle Complete heading text; metadata is detected from it.
     * @param array       $buffer    Lines belonging to the segment.
     *
     * @return array|null The segment, or null when the lines are blank.
     */
    private function buildSegment(?string $title, ?string $fullTitle, array $buffer): ?array
    {
        $raw = trim(implode("\n", $buffer));
        if ($raw === '') {
            return null;
        }

        return [
            'title' => $title,
            'full_title' => $fullTitle,
            'raw' => $raw,
            'meta' => $this->detectMetaFromTitle($fullTitle ?? $title),
        ];
    }

    /**
     * Resolve one paper attribute: segment key, then detected meta, then
     * (when $fileKey is given) the file-level metadata.
     *
     * @param array       $segment  Segment as returned by splitIntoPapers().
     * @param mixed       $fileMeta File-level metadata; may be null.
     * @param string      $key      Key to look up on the segment and its meta.
     * @param string|null $fileKey  Key to look up in $fileMeta, or null for no file fallback.
     *
     * @return mixed The first non-null value found, or null.
     */
    private function resolveAttribute(array $segment, $fileMeta, string $key, ?string $fileKey = null)
    {
        $value = $segment[$key] ?? $segment['meta'][$key] ?? null;

        if ($value !== null || $fileKey === null) {
            return $value;
        }

        return $fileMeta[$fileKey] ?? null;
    }
}
|