SourcePaperExtractorService.php 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. <?php
  2. namespace App\Services;
  3. use App\Models\SourceFile;
  4. use App\Models\SourcePaper;
  5. use Illuminate\Support\Collection;
  6. use Illuminate\Support\Facades\DB;
  7. use Illuminate\Support\Str;
  /**
   * Splits the Markdown of a source file into individual exam papers and
   * persists each of them as a SourcePaper row.
   */
  class SourcePaperExtractorService
  {
      /**
       * Matches a level-1/level-2 Markdown heading that starts a new paper.
       *
       * Group 1 is the complete heading text. Group 2 is the short title: the
       * heading text up to the last "卷", or the leading keyword itself when the
       * heading begins with one of the listed keywords.
       */
      private const PAPER_HEADING_PATTERN = '/^#{1,2}\s*((.+卷|期中|期末|专项|模拟|基础卷|提升卷|练习卷).*)$/u';

      /**
       * Title keyword => source_type value. Order matters: the first keyword
       * found in the title wins.
       */
      private const SOURCE_TYPE_KEYWORDS = [
          '期中' => 'midterm',
          '期末' => 'final',
          '模拟' => 'mock',
      ];

      /**
       * Cuts the Markdown of one source file into papers and persists them.
       *
       * Papers previously attached to the file are deleted and re-created; both
       * steps run inside a single DB::transaction() call.
       *
       * Every metadata column is resolved in this order: an explicit key on the
       * segment, then the metadata detected from the segment's heading, then the
       * file-level extracted metadata (where a file-level key exists).
       *
       * @param SourceFile $sourceFile File whose raw_markdown is split.
       *
       * @return Collection The SourcePaper models created, in document order.
       */
      public function extract(SourceFile $sourceFile): Collection
      {
          // The cast guards against a NULL column: splitIntoPapers() only accepts a string.
          $segments = $this->splitIntoPapers((string) $sourceFile->raw_markdown);

          // Read the attribute once instead of once per field.
          $fileMeta = $sourceFile->extracted_metadata;

          return DB::transaction(function () use ($sourceFile, $segments, $fileMeta) {
              $sourceFile->papers()->delete();

              $papers = collect();

              foreach ($segments as $idx => $segment) {
                  $papers->push(
                      SourcePaper::create([
                          'uuid' => (string) Str::uuid(),
                          'source_file_id' => $sourceFile->id,
                          'order' => $idx + 1,
                          'title' => $segment['title'] ?? null,
                          'full_title' => $segment['full_title'] ?? null,
                          'chapter' => $this->resolveAttribute($segment, $fileMeta, 'chapter', 'chapter'),
                          'grade' => $this->resolveAttribute($segment, $fileMeta, 'grade', 'grade'),
                          'term' => $this->resolveAttribute($segment, $fileMeta, 'term', 'term'),
                          'edition' => $this->resolveAttribute($segment, $fileMeta, 'edition', 'edition'),
                          'textbook_series' => $this->resolveAttribute($segment, $fileMeta, 'textbook_series', 'textbook_series'),
                          // There is no file-level fallback for the source type.
                          'source_type' => $this->resolveAttribute($segment, $fileMeta, 'source_type'),
                          // The file-level metadata stores the year under "year".
                          'source_year' => $this->resolveAttribute($segment, $fileMeta, 'source_year', 'year'),
                          'raw_markdown' => $segment['raw'],
                          'detected_metadata' => $segment['meta'] ?? [],
                      ])
                  );
              }

              return $papers;
          });
      }

      /**
       * Splits Markdown into papers, using paper headings as boundaries.
       *
       * A line starts a new paper when it matches PAPER_HEADING_PATTERN. Text
       * before the first such heading becomes an untitled segment, unless it is
       * blank, in which case it is dropped so that no empty paper is produced.
       * Line endings inside "raw" are normalised to "\n".
       *
       * @param string $markdown Complete Markdown text of the source file.
       *
       * @return array List of segments, each with the keys "title" (short
       *               title), "full_title" (complete heading text), "raw" (the
       *               segment's Markdown including its heading line) and "meta"
       *               (metadata detected from the heading). When nothing usable
       *               is found, a single untitled segment holding the trimmed
       *               input is returned.
       */
      public function splitIntoPapers(string $markdown): array
      {
          $lines = preg_split('/\r\n|\r|\n/', $markdown) ?: [];

          // Pass 1: group the lines into chunks, opening a new chunk at every paper heading.
          $chunks = [];
          $current = ['title' => null, 'full_title' => null, 'buffer' => []];

          foreach ($lines as $line) {
              // "=== 1" also treats a PCRE failure (e.g. invalid UTF-8) as "not a heading".
              if (preg_match(self::PAPER_HEADING_PATTERN, $line, $m) === 1) {
                  $chunks[] = $current;
                  $current = [
                      'title' => trim($m[2]),
                      'full_title' => trim($m[1]),
                      'buffer' => [$line],
                  ];

                  continue;
              }

              $current['buffer'][] = $line;
          }

          $chunks[] = $current;

          // Pass 2: turn every non-blank chunk into a segment.
          $segments = [];

          foreach ($chunks as $chunk) {
              $raw = trim(implode("\n", $chunk['buffer']));

              // Only the untitled leading chunk can be blank; a titled chunk always
              // contains at least its heading line.
              if ($raw === '') {
                  continue;
              }

              $segments[] = [
                  'title' => $chunk['title'],
                  'full_title' => $chunk['full_title'],
                  'raw' => $raw,
                  // Array union: values detected from the short title take precedence,
                  // the complete heading only fills in what the short title lacks.
                  'meta' => $this->detectMetaFromTitle($chunk['title'])
                      + $this->detectMetaFromTitle($chunk['full_title']),
              ];
          }

          if ($segments === []) {
              return [[
                  'title' => null,
                  'full_title' => null,
                  'raw' => trim($markdown),
                  'meta' => [],
              ]];
          }

          return $segments;
      }

      /**
       * Detects paper metadata from a heading text.
       *
       * @param string|null $title Heading text; null or empty yields no metadata.
       *
       * @return array Subset of the keys "chapter" (e.g. "第三章"), "source_year"
       *               (four-digit string starting with "20") and "source_type"
       *               ("midterm", "final" or "mock"), always in that key order.
       */
      protected function detectMetaFromTitle(?string $title): array
      {
          if (!$title) {
              return [];
          }

          $meta = [];

          if (preg_match('/第[一二三四五六七八九十0-9]+章/u', $title, $m) === 1) {
              $meta['chapter'] = $m[0];
          }

          // The look-arounds stop a match inside a longer digit run such as "120234".
          if (preg_match('/(?<![0-9])20[0-9]{2}(?![0-9])/', $title, $m) === 1) {
              $meta['source_year'] = $m[0];
          }

          foreach (self::SOURCE_TYPE_KEYWORDS as $keyword => $type) {
              // A byte-wise search is sufficient for substring containment in UTF-8 and
              // keeps this parser free of framework helpers, so it can be unit-tested alone.
              if (strpos($title, $keyword) !== false) {
                  $meta['source_type'] = $type;

                  break;
              }
          }

          return $meta;
      }

      /**
       * Resolves one paper attribute from the available metadata sources.
       *
       * @param array       $segment  Segment as produced by splitIntoPapers().
       * @param mixed       $fileMeta File-level metadata; anything that is not
       *                              array-accessible with $fileKey yields null.
       * @param string      $key      Key looked up on the segment and in its "meta".
       * @param string|null $fileKey  Key looked up in $fileMeta, or null to skip
       *                              the file-level fallback.
       *
       * @return mixed The first non-null value found, or null.
       */
      private function resolveAttribute(array $segment, $fileMeta, string $key, ?string $fileKey = null)
      {
          return $segment[$key]
              ?? $segment['meta'][$key]
              ?? ($fileKey === null ? null : ($fileMeta[$fileKey] ?? null));
      }
  }