SourceFileParserService.php 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. <?php
  2. namespace App\Services;
  3. use App\Models\MarkdownImport;
  4. use App\Models\SourceFile;
  5. use Illuminate\Support\Arr;
  6. use Illuminate\Support\Facades\DB;
  7. use Illuminate\Support\Str;
  /**
   * Stores uploaded markdown documents as SourceFile records and derives
   * textbook metadata (edition, grade, term, chapter, ...) from their file names.
   */
  class SourceFileParserService
  {
      /**
       * Publisher keyword => short edition code.
       *
       * "北师" also covers the longer spellings "北师大" and "北师大版", so those
       * do not need entries of their own.
       */
      private const EDITION_CODES = [
          '人教' => 'PEP',
          '苏教' => 'SJ',
          '浙教' => 'ZJ',
          '北师' => 'BS',
          '沪教' => 'HJ',
      ];

      /**
       * Persist a markdown document as a SourceFile, reusing an existing record
       * when one with the same file name and identical content is already stored.
       *
       * A stored record is considered a duplicate when its normalized or original
       * file name matches and the SHA-1 of its raw_markdown equals the SHA-1 of
       * $rawMarkdown. When $import is given, its source_name / file_name columns
       * are updated to point at the returned record's names.
       *
       * @param string              $filename     Original file name, extension included.
       * @param string              $rawMarkdown  Full markdown content of the file.
       * @param MarkdownImport|null $import       Import row to update, if any.
       * @param array               $fileMetadata Stored as-is in file_metadata.
       * @param string|null         $storagePath  Stored as-is in storage_path.
       *
       * @return SourceFile The reused or newly created record.
       */
      public function storeFromMarkdown(
          string $filename,
          string $rawMarkdown,
          ?MarkdownImport $import = null,
          array $fileMetadata = [],
          ?string $storagePath = null
      ): SourceFile {
          $normalized = $this->normalizeFilename($filename);
          $extension = pathinfo($filename, PATHINFO_EXTENSION) ?: null;
          $extracted = $this->extractMetadataFromFilename($filename);
          $rawHash = sha1($rawMarkdown);

          // De-duplicate: same name and identical content => reuse the stored
          // record instead of creating a second source_file row.
          $existing = SourceFile::query()
              ->where('normalized_filename', $normalized)
              ->orWhere('original_filename', $filename)
              ->get()
              ->first(static function (SourceFile $file) use ($rawHash): bool {
                  return sha1((string) $file->raw_markdown) === $rawHash;
              });

          if ($existing) {
              // Back-fill the hash into the stored metadata so the record can be
              // recognised later without re-hashing its content.
              $meta = $existing->extracted_metadata ?? [];
              if (empty($meta['raw_hash'])) {
                  $meta['raw_hash'] = $rawHash;
                  $existing->update(['extracted_metadata' => $meta]);
              }

              $this->linkImport($import, $existing->normalized_filename, $existing->original_filename);

              return $existing;
          }

          return DB::transaction(function () use (
              $filename,
              $normalized,
              $extension,
              $storagePath,
              $rawMarkdown,
              $rawHash,
              $fileMetadata,
              $extracted,
              $import
          ) {
              $sourceFile = SourceFile::create([
                  'uuid' => (string) Str::uuid(),
                  'original_filename' => $filename,
                  'normalized_filename' => $normalized,
                  'extension' => $extension,
                  'storage_path' => $storagePath,
                  'raw_markdown' => $rawMarkdown,
                  'file_metadata' => $fileMetadata,
                  'extracted_metadata' => array_merge($extracted, [
                      'raw_hash' => $rawHash,
                  ]),
              ]);

              $this->linkImport($import, $normalized, $filename);

              return $sourceFile;
          });
      }

      /**
       * Extract textbook information from a file name.
       *
       * Only keys whose pattern matched are present in the result:
       *  - edition:     short code from EDITION_CODES (e.g. "PEP")
       *  - grade:       matched grade text (e.g. "高一", "七年级") or "<n>年级"
       *  - term:        matched text ("上册", "第1学期", ...), "上册"/"下册" for a
       *                 trailing _1/_2, or "0" for a full-volume file
       *  - chapter:     matched "第…章" text
       *  - year:        first "20xx" sequence in the name
       *  - source_type: matched paper-type keyword (e.g. "期末")
       *  - subject:     "数学" when present in the name
       *
       * @param string $filename File name, with or without extension.
       *
       * @return array<string, string>
       */
      public function extractMetadataFromFilename(string $filename): array
      {
          $info = [];

          // Drop ASCII, no-break and full-width (ideographic) spaces.
          $basename = str_replace(
              [' ', "\u{00A0}", "\u{3000}"],
              '',
              pathinfo($filename, PATHINFO_FILENAME)
          );

          // Use "_" as the single separator so the patterns below stay simple.
          // Both ASCII and full-width parentheses are treated as separators.
          $normalized = str_replace(['-', '(', ')', "\u{FF08}", "\u{FF09}"], '_', $basename);

          foreach (self::EDITION_CODES as $keyword => $code) {
              if (str_contains($normalized, $keyword)) {
                  $info['edition'] = $code;
                  break;
              }
          }

          // Grade: Chinese numerals and Arabic digits are both supported.
          if (preg_match('/高[一二三]|高[123]/u', $normalized, $m)) {
              $info['grade'] = $m[0];
          } elseif (preg_match('/初[一二三]|初[123]|初中/u', $normalized, $m)) {
              $info['grade'] = $m[0];
          } elseif (preg_match('/(?<![0-9])([1-9])[_年]?\s*年级?/u', $normalized, $m)) {
              // The lookbehind stops the last digit of a year ("2023年") or of a
              // multi-digit number from being read as a grade.
              $info['grade'] = $m[1] . '年级';
          } elseif (preg_match('/[一二三四五六七八九]年级/u', $normalized, $m)) {
              $info['grade'] = $m[0];
          }

          // Term / volume: explicit wording wins, then a trailing _1 / _2
          // (first / second volume), then "0" for a full-volume file.
          if (preg_match('/上册|下册|第[12]学期|第[12]册/u', $normalized, $m)) {
              $info['term'] = $m[0];
          } elseif (preg_match('/[_-](1|2)(?:\.md)?$/', $normalized, $m)) {
              $info['term'] = $m[1] === '1' ? '上册' : '下册';
          } elseif (str_contains($normalized, '全册') || str_contains($normalized, '全集')) {
              $info['term'] = '0'; // full volume
          }

          if (preg_match('/第[一二三四五六七八九十0-9]+章/u', $normalized, $m)) {
              $info['chapter'] = $m[0];
          }

          if (preg_match('/20[0-9]{2}/', $basename, $m)) {
              $info['year'] = $m[0];
          }

          if (preg_match('/培优|专项|期中|期末|模拟|基础卷|提升卷|练习卷/u', $normalized, $m)) {
              $info['source_type'] = $m[0];
          }

          if (str_contains($normalized, '数学')) {
              $info['subject'] = '数学';
          }

          return $info;
      }

      /**
       * Build the normalized file name: the base name (no extension) with
       * parentheses and spaces turned into "_" and then slugged with "_".
       *
       * NOTE(review): the search list repeats "(" and ")"; the second pair was
       * presumably the full-width variants before an encoding normalization.
       * It is kept as-is because the result is persisted and used for duplicate
       * lookups, so changing it would alter names for new uploads - confirm.
       */
      protected function normalizeFilename(string $filename): string
      {
          $basename = pathinfo($filename, PATHINFO_FILENAME);

          return Str::of($basename)
              ->replace(['(', ')', '(', ')', ' '], ['_', '_', '_', '_', '_'])
              ->slug('_')
              ->toString();
      }

      /**
       * Point an import row at the given source file names; no-op without an import.
       */
      private function linkImport(?MarkdownImport $import, ?string $sourceName, ?string $fileName): void
      {
          if ($import === null) {
              return;
          }

          $import->update([
              'source_name' => $sourceName,
              'file_name' => $fileName,
          ]);
      }
  }