SourcePaperExtractorService.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. <?php
  2. namespace App\Services;
  3. use App\Models\SourceFile;
  4. use App\Models\SourcePaper;
  5. use Illuminate\Support\Collection;
  6. use Illuminate\Support\Facades\DB;
  7. use Illuminate\Support\Str;
  8. class SourcePaperExtractorService
  9. {
  10. /**
  11. * 从单个 Markdown 文件中切出多套卷子,并持久化。
  12. */
  13. public function extract(SourceFile $sourceFile): Collection
  14. {
  15. $segments = $this->splitIntoPapers($sourceFile->raw_markdown);
  16. return DB::transaction(function () use ($sourceFile, $segments) {
  17. $sourceFile->papers()->delete();
  18. $papers = collect();
  19. foreach ($segments as $idx => $segment) {
  20. $papers->push(
  21. SourcePaper::create([
  22. 'uuid' => (string) Str::uuid(),
  23. 'source_file_id' => $sourceFile->id,
  24. 'order' => $idx + 1,
  25. 'title' => $segment['title'] ?? null,
  26. 'full_title' => $segment['full_title'] ?? null,
  27. 'chapter' => $segment['chapter'] ?? $sourceFile->extracted_metadata['chapter'] ?? null,
  28. 'grade' => $segment['grade'] ?? $sourceFile->extracted_metadata['grade'] ?? null,
  29. 'term' => $segment['term'] ?? $sourceFile->extracted_metadata['term'] ?? null,
  30. 'edition' => $segment['edition'] ?? $sourceFile->extracted_metadata['edition'] ?? null,
  31. 'textbook_series' => $segment['textbook_series'] ?? $sourceFile->extracted_metadata['textbook_series'] ?? null,
  32. 'source_type' => $segment['source_type'] ?? null,
  33. 'source_year' => $segment['source_year'] ?? $sourceFile->extracted_metadata['year'] ?? null,
  34. 'raw_markdown' => $segment['raw'],
  35. 'detected_metadata' => $segment['meta'] ?? [],
  36. ])
  37. );
  38. }
  39. return $papers;
  40. });
  41. }
  42. /**
  43. * 基于 Markdown 标题拆分卷子。
  44. */
  45. public function splitIntoPapers(string $markdown): array
  46. {
  47. $lines = preg_split('/\r\n|\r|\n/', $markdown);
  48. $segments = [];
  49. $current = ['title' => null, 'buffer' => []];
  50. $headingPattern = '/^(#{1,2})\s*(.+)$/u';
  51. $paperKeywords = '/(期中|期末|专项|模拟|基础卷|提升卷|练习卷|单元卷|测试卷|套卷|试卷)/u';
  52. $sectionPrefix = '/^(卷\\s*[一二三四五六七八九十0-9IVX]+|第\\s*[一二三四五六七八九十0-9IVX]+\\s*卷)/u';
  53. $chapterPaperPattern = '/^(第\\s*[一二三四五六七八九十0-9]+\\s*[章节单元]).*(质量检测卷|能力提优检测卷|基础过关检测卷|检测卷|训练卷|专项训练卷)/u';
  54. $paperLinePattern = '/(质量检测卷|能力提优检测卷|基础过关检测卷|检测卷|训练卷|专项训练卷|期中|期末|专项|模拟|基础卷|提升卷|练习卷|单元卷|测试卷|套卷|试卷)/u';
  55. $questionLinePattern = '/^\\s*(\\d+|[A-D])\\s*[\\.、\\)]/u';
  56. $excludeKeywords = '/(答题卡|参考答案|扫描全能王|解析|来源)/u';
  57. $commentPattern = '/<!--\s*paper:\s*(.+?)\s*-->/i';
  58. foreach ($lines as $line) {
  59. $trimmed = trim($line);
  60. // 优先支持隐藏的卷子标记
  61. if (preg_match($commentPattern, $trimmed, $cm)) {
  62. if (!empty($current['buffer'])) {
  63. $segments[] = [
  64. 'title' => $current['title'],
  65. 'full_title' => $current['title'],
  66. 'raw' => trim(implode("\n", $current['buffer'])),
  67. 'meta' => $this->detectMetaFromTitle($current['title']),
  68. ];
  69. }
  70. $current = [
  71. 'title' => trim($cm[1]),
  72. 'buffer' => [$line],
  73. ];
  74. continue;
  75. }
  76. if ($trimmed !== '' && !preg_match($headingPattern, $trimmed)) {
  77. $isSectionPrefix = preg_match($sectionPrefix, $trimmed) === 1;
  78. $isPartHeading = preg_match('/^(选择题|填空题|解答题|综合题|计算题|应用题)/u', $trimmed) === 1;
  79. $isChapterPaper = preg_match($chapterPaperPattern, $trimmed) === 1;
  80. $isPaperLine = $isChapterPaper || preg_match($paperLinePattern, $trimmed) === 1;
  81. $isQuestionLine = preg_match($questionLinePattern, $trimmed) === 1;
  82. $lineLength = mb_strlen($trimmed);
  83. if ($isPaperLine && !$isSectionPrefix && !$isPartHeading && !$isQuestionLine) {
  84. if (preg_match($excludeKeywords, $trimmed)) {
  85. $current['buffer'][] = $line;
  86. continue;
  87. }
  88. if (!$isChapterPaper && $lineLength > 80) {
  89. $current['buffer'][] = $line;
  90. continue;
  91. }
  92. if ($this->isSameTitle($current['title'], $trimmed)) {
  93. $current['buffer'][] = $line;
  94. continue;
  95. }
  96. if (!empty($current['buffer'])) {
  97. $segments[] = [
  98. 'title' => $current['title'],
  99. 'full_title' => $current['title'],
  100. 'raw' => trim(implode("\n", $current['buffer'])),
  101. 'meta' => $this->detectMetaFromTitle($current['title']),
  102. ];
  103. }
  104. $current = [
  105. 'title' => $this->sanitizeTitle($trimmed),
  106. 'buffer' => [$line],
  107. ];
  108. continue;
  109. }
  110. }
  111. if (preg_match($headingPattern, $line, $m)) {
  112. $title = $this->sanitizeTitle(trim($m[2]));
  113. $isSectionPrefix = preg_match($sectionPrefix, $title) === 1;
  114. $isPaper = preg_match($paperKeywords, $title) === 1;
  115. $isPaper = $isPaper || (str_contains($title, '卷') && !$isSectionPrefix);
  116. if (!$isPaper) {
  117. $current['buffer'][] = $line;
  118. continue;
  119. }
  120. if ($this->isSameTitle($current['title'], $title)) {
  121. $current['buffer'][] = $line;
  122. continue;
  123. }
  124. if (!empty($current['buffer'])) {
  125. $segments[] = [
  126. 'title' => $current['title'],
  127. 'full_title' => $current['title'],
  128. 'raw' => trim(implode("\n", $current['buffer'])),
  129. 'meta' => $this->detectMetaFromTitle($current['title']),
  130. ];
  131. }
  132. $current = [
  133. 'title' => $title,
  134. 'buffer' => [$line],
  135. ];
  136. } else {
  137. $current['buffer'][] = $line;
  138. }
  139. }
  140. if (!empty($current['buffer'])) {
  141. $segments[] = [
  142. 'title' => $current['title'],
  143. 'full_title' => $current['title'],
  144. 'raw' => trim(implode("\n", $current['buffer'])),
  145. 'meta' => $this->detectMetaFromTitle($current['title']),
  146. ];
  147. }
  148. if (empty($segments)) {
  149. return [[
  150. 'title' => null,
  151. 'full_title' => null,
  152. 'raw' => trim($markdown),
  153. 'meta' => [],
  154. ]];
  155. }
  156. $segments = $this->mergeAdjacentSegments($segments);
  157. return array_values(array_filter($segments, function ($segment) {
  158. $title = trim((string) ($segment['title'] ?? ''));
  159. $raw = trim((string) ($segment['raw'] ?? ''));
  160. if ($title === '' && mb_strlen($raw) < 80) {
  161. return false;
  162. }
  163. return true;
  164. }));
  165. }
  166. protected function sanitizeTitle(string $title): string
  167. {
  168. $title = trim($title);
  169. $title = preg_replace('/^[◎◆•·\\*\\-\\s]+/u', '', $title);
  170. $title = preg_replace('/^[①②③④⑤⑥⑦⑧⑨⑩\\d]+[\\s\\.、]+/u', '', $title);
  171. $title = preg_replace('/\\s*\\d+\\s*\\/\\s*答\\s*\\d+$/u', '', $title);
  172. $title = trim($title);
  173. if (mb_strlen($title) > 200) {
  174. $title = mb_substr($title, 0, 200);
  175. }
  176. return $title;
  177. }
  178. protected function isSameTitle(?string $currentTitle, ?string $nextTitle): bool
  179. {
  180. $currentTitle = $currentTitle ? $this->sanitizeTitle($currentTitle) : null;
  181. $nextTitle = $nextTitle ? $this->sanitizeTitle($nextTitle) : null;
  182. return $currentTitle !== null && $nextTitle !== null && $currentTitle === $nextTitle;
  183. }
  184. protected function mergeAdjacentSegments(array $segments): array
  185. {
  186. $merged = [];
  187. foreach ($segments as $segment) {
  188. $title = $segment['title'] ?? null;
  189. $raw = $segment['raw'] ?? '';
  190. $lastIndex = count($merged) - 1;
  191. if ($lastIndex >= 0) {
  192. // 1. 同名合并
  193. if ($this->isSameTitle($merged[$lastIndex]['title'] ?? null, $title)) {
  194. $merged[$lastIndex]['raw'] = trim($merged[$lastIndex]['raw'] . "\n\n" . $raw);
  195. continue;
  196. }
  197. // 2. 碎片合并:当前片段无标题,且长度较短(归纳为前一个卷子的尾部或干扰项)
  198. if (empty($title) && mb_strlen(trim($raw)) < 500) {
  199. $merged[$lastIndex]['raw'] = trim($merged[$lastIndex]['raw'] . "\n\n" . $raw);
  200. continue;
  201. }
  202. // 3. 碎片合并:当前片段标题太短且不含核心关键词,且其 Markdown 内容也不长
  203. if ($title && mb_strlen($title) < 5 && mb_strlen(trim($raw)) < 300) {
  204. $merged[$lastIndex]['raw'] = trim($merged[$lastIndex]['raw'] . "\n\n" . $raw);
  205. continue;
  206. }
  207. }
  208. $merged[] = $segment;
  209. }
  210. return $merged;
  211. }
  212. protected function detectMetaFromTitle(?string $title): array
  213. {
  214. if (!$title) {
  215. return [];
  216. }
  217. $meta = [];
  218. if (preg_match('/第[一二三四五六七八九十0-9]+章/u', $title, $m)) {
  219. $meta['chapter'] = $m[0];
  220. }
  221. if (preg_match('/20[0-9]{2}/', $title, $m)) {
  222. $meta['source_year'] = $m[0];
  223. }
  224. if (Str::contains($title, '期中')) {
  225. $meta['source_type'] = 'midterm';
  226. } elseif (Str::contains($title, '期末')) {
  227. $meta['source_type'] = 'final';
  228. } elseif (Str::contains($title, '模拟')) {
  229. $meta['source_type'] = 'mock';
  230. }
  231. return $meta;
  232. }
  233. }