PaperPartExtractorService.php 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. <?php
  2. namespace App\Services;
  3. use App\Models\PaperPart;
  4. use App\Models\SourcePaper;
  5. use Illuminate\Support\Collection;
  6. use Illuminate\Support\Facades\DB;
  7. use Illuminate\Support\Str;
  /**
   * Splits the raw Markdown of a source paper into question-type sections
   * ("parts") and persists them as PaperPart rows.
   */
  class PaperPartExtractorService
  {
      /**
       * Level-2/3 Markdown heading that opens a new part: either a numbered
       * part/volume marker or one of the known question-type names.
       */
      protected const PART_HEADING_PATTERN = '/^(#{2,3})\s*(第? ?[一二三四五六七八九十0-9IVX]+[部分卷]|选择题|填空题|解答题|综合题|计算题|应用题)/u';

      /**
       * Hidden HTML comment marker of the form `<!-- part: Title -->`.
       */
      protected const PART_COMMENT_PATTERN = '/<!--\s*part:\s*(.+?)\s*-->/i';

      /**
       * Title keywords mapped to part types; the first type with a matching
       * keyword wins, so the order of this table matters.
       */
      protected const TYPE_KEYWORDS = [
          'choice' => ['选择'],
          'fill' => ['填空'],
          'answer' => ['解答', '简答', '分析'],
          'calc' => ['计算', '推导'],
      ];

      /**
       * Splits the paper's Markdown into question-type sections and replaces
       * the paper's stored parts with the newly detected ones.
       *
       * The delete and the inserts run inside a single DB::transaction()
       * callback.
       *
       * @param SourcePaper $paper Paper whose `raw_markdown` is split.
       * @return Collection The created PaperPart models, in document order.
       */
      public function extract(SourcePaper $paper): Collection
      {
          // Cast so a null raw_markdown is handled as an empty document
          // instead of raising a TypeError on the string parameter.
          $parts = $this->splitIntoParts((string) $paper->raw_markdown);

          return DB::transaction(function () use ($paper, $parts) {
              $paper->parts()->delete();

              $result = collect();
              foreach ($parts as $idx => $part) {
                  $result->push(PaperPart::create([
                      'source_paper_id' => $paper->id,
                      'order' => $idx + 1,
                      'title' => $part['title'] ?? null,
                      'type' => $part['type'] ?? null,
                      'raw_markdown' => $part['raw'],
                      'question_count' => $part['question_count'] ?? null,
                      'detected_features' => $part['detected_features'] ?? [],
                  ]));
              }

              return $result;
          });
      }

      /**
       * Splits Markdown into parts at part headings and hidden part markers.
       *
       * Each marker line starts a new part and is kept as the first line of
       * that part's raw text. Text before the first marker becomes an untitled
       * part unless it is blank. When the document contains no marker at all,
       * the whole document is returned as a single untitled 'mixed' part.
       *
       * @param string $markdown Raw Markdown of the paper.
       * @return list<array{title: ?string, type: ?string, raw: string, detected_features: array}>
       */
      public function splitIntoParts(string $markdown): array
      {
          $lines = preg_split('/\r\n|\r|\n/', $markdown);
          if ($lines === false) {
              $lines = [$markdown];
          }

          $segments = [];
          $current = ['title' => null, 'buffer' => []];
          $markerFound = false;

          foreach ($lines as $line) {
              $title = $this->matchPartTitle($line);

              if ($title === null) {
                  $current['buffer'][] = $line;
                  continue;
              }

              $markerFound = true;
              // Close the running segment, but never emit a blank one (for
              // example blank lines preceding the first heading).
              if ($this->hasContent($current['buffer'])) {
                  $segments[] = $this->finalizeSegment($current);
              }
              $current = ['title' => $title, 'buffer' => [$line]];
          }

          if (!$markerFound) {
              return [[
                  'title' => null,
                  'type' => 'mixed',
                  'raw' => trim($markdown),
                  'detected_features' => [],
              ]];
          }

          if ($this->hasContent($current['buffer'])) {
              $segments[] = $this->finalizeSegment($current);
          }

          return $segments;
      }

      /**
       * Returns the part title announced by a line, or null when the line
       * does not open a new part.
       *
       * Hidden comment markers are checked first, against the trimmed line.
       * Headings are matched against the line as written, so a heading with
       * leading whitespace is not treated as a part heading.
       */
      protected function matchPartTitle(string $line): ?string
      {
          if (preg_match(self::PART_COMMENT_PATTERN, trim($line), $cm)) {
              return trim($cm[1]);
          }

          if (preg_match(self::PART_HEADING_PATTERN, $line)) {
              // Use the whole heading text, not only the matched prefix, so
              // "## 第一部分 选择题" keeps its question-type name in the title.
              return trim($line, "# \t");
          }

          return null;
      }

      /**
       * Tells whether the buffered lines contain anything besides whitespace.
       *
       * @param list<string> $buffer
       */
      protected function hasContent(array $buffer): bool
      {
          return trim(implode("\n", $buffer)) !== '';
      }

      /**
       * Converts an accumulated segment into the part array used by extract().
       *
       * @param array{title: ?string, buffer: list<string>} $segment
       * @return array{title: ?string, type: ?string, raw: string, detected_features: array}
       */
      protected function finalizeSegment(array $segment): array
      {
          $raw = trim(implode("\n", $segment['buffer']));
          $title = $segment['title'];

          return [
              'title' => $title,
              'type' => $this->detectType($title),
              'raw' => $raw,
              'detected_features' => [
                  'title' => $title,
              ],
          ];
      }

      /**
       * Maps a part title to a part type using the keyword table.
       *
       * @return ?string null for a missing or empty title, 'mixed' when no
       *                 keyword matches, otherwise the first matching type.
       */
      protected function detectType(?string $title): ?string
      {
          if ($title === null || $title === '') {
              return null;
          }

          foreach (self::TYPE_KEYWORDS as $type => $keywords) {
              foreach ($keywords as $keyword) {
                  if (str_contains($title, $keyword)) {
                      return $type;
                  }
              }
          }

          return 'mixed';
      }
  }