| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 |
- <?php
- namespace App\Services;
- use App\Models\MarkdownImport;
- use App\Models\SourceFile;
- use Illuminate\Support\Arr;
- use Illuminate\Support\Facades\DB;
- use Illuminate\Support\Str;
/**
 * Stores uploaded Markdown documents as SourceFile records and derives
 * textbook metadata (edition, grade, term, chapter, ...) from their filenames.
 */
class SourceFileParserService
{
    /**
     * Persist a Markdown document as a SourceFile, reusing an existing record
     * when one with the same name and identical content is already stored.
     *
     * When $import is given, its `source_name` / `file_name` columns are
     * updated to point at the SourceFile that is returned (new or reused).
     *
     * @param string              $filename     Original filename, including extension.
     * @param string              $rawMarkdown  Full Markdown content of the file.
     * @param MarkdownImport|null $import       Import record to link to the source file, if any.
     * @param array               $fileMetadata Stored as-is in the `file_metadata` column.
     * @param string|null         $storagePath  Stored as-is in the `storage_path` column.
     *
     * @return SourceFile The reused or newly created record.
     */
    public function storeFromMarkdown(
        string $filename,
        string $rawMarkdown,
        ?MarkdownImport $import = null,
        array $fileMetadata = [],
        ?string $storagePath = null
    ): SourceFile {
        $normalized = $this->normalizeFilename($filename);
        $extension = pathinfo($filename, PATHINFO_EXTENSION) ?: null;
        $extracted = $this->extractMetadataFromFilename($filename);
        $rawHash = sha1($rawMarkdown);

        // De-duplicate: a record with the same (normalized or original) name AND
        // identical content is reused instead of creating a second source file.
        // The content comparison hashes each candidate's stored markdown in PHP.
        $existing = SourceFile::query()
            ->where('normalized_filename', $normalized)
            ->orWhere('original_filename', $filename)
            ->get()
            ->first(static function (SourceFile $file) use ($rawHash): bool {
                return sha1((string) $file->raw_markdown) === $rawHash;
            });

        if ($existing) {
            // Backfill the content hash into the existing record's metadata so
            // it can be identified later.
            $meta = $existing->extracted_metadata ?? [];
            if (empty($meta['raw_hash'])) {
                $meta['raw_hash'] = $rawHash;
                $existing->update(['extracted_metadata' => $meta]);
            }

            if ($import) {
                $import->update([
                    'source_name' => $existing->normalized_filename,
                    'file_name' => $existing->original_filename,
                ]);
            }

            return $existing;
        }

        // Create the source file and link the import atomically.
        return DB::transaction(function () use (
            $filename,
            $normalized,
            $extension,
            $storagePath,
            $rawMarkdown,
            $rawHash,
            $fileMetadata,
            $extracted,
            $import
        ) {
            $sourceFile = SourceFile::create([
                'uuid' => (string) Str::uuid(),
                'original_filename' => $filename,
                'normalized_filename' => $normalized,
                'extension' => $extension,
                'storage_path' => $storagePath,
                'raw_markdown' => $rawMarkdown,
                'file_metadata' => $fileMetadata,
                'extracted_metadata' => array_merge($extracted, [
                    'raw_hash' => $rawHash,
                ]),
            ]);

            if ($import) {
                $import->update([
                    'source_name' => $normalized,
                    'file_name' => $filename,
                ]);
            }

            return $sourceFile;
        });
    }

    /**
     * Extract textbook information (edition, grade, term, chapter, ...) from a filename.
     *
     * Only keys whose pattern matched are present in the result:
     *  - edition:     publisher code ('PEP', 'SJ', 'ZJ', 'BS', 'HJ')
     *  - grade:       matched grade text (e.g. '高一', '初中', '七年级') or '<digit>年级'
     *  - term:        matched term text (e.g. '上册', '第2学期'); '上册' / '下册' for a
     *                 trailing `_1` / `_2`; or '0' when the name says 全册 / 全集 (whole book)
     *  - chapter:     matched '第…章' text
     *  - year:        first four-digit run starting with '20'
     *  - source_type: matched paper-type keyword (e.g. '期末', '模拟')
     *  - subject:     '数学' when the name contains it
     *
     * @param string $filename Filename, with or without directory and extension.
     *
     * @return array{edition?: string, grade?: string, term?: string, chapter?: string, year?: string, source_type?: string, subject?: string}
     */
    public function extractMetadataFromFilename(string $filename): array
    {
        $info = [];

        // Strip ASCII spaces and full-width (ideographic, U+3000) spaces.
        $basename = str_replace([' ', "\u{3000}"], '', pathinfo($filename, PATHINFO_FILENAME));

        // Unify separators to underscores so the patterns below stay simple.
        // Covers hyphens plus ASCII and full-width (U+FF08 / U+FF09) parentheses.
        $normalized = str_replace(['-', '(', ')', "\u{FF08}", "\u{FF09}"], '_', $basename);

        // First matching publisher keyword wins. '北师' also covers the longer
        // spellings '北师大' and '北师大版', which map to the same code.
        $editionPatterns = [
            '人教' => 'PEP',
            '苏教' => 'SJ',
            '浙教' => 'ZJ',
            '北师' => 'BS',
            '沪教' => 'HJ',
        ];
        foreach ($editionPatterns as $key => $code) {
            if (str_contains($normalized, $key)) {
                $info['edition'] = $code;
                break;
            }
        }

        // Grade: supports both Chinese and Arabic numerals.
        if (preg_match('/高[一二三]|高[123]/u', $normalized, $m)) {
            $info['grade'] = $m[0];
        } elseif (preg_match('/初[一二三]|初[123]|初中/u', $normalized, $m)) {
            $info['grade'] = $m[0];
        } elseif (preg_match('/(?<![0-9])([1-9])[_年]?\s*年级?/u', $normalized, $m)) {
            // The lookbehind keeps the last digit of a year such as "2023年"
            // from being read as a grade ("3年级").
            $info['grade'] = $m[1] . '年级';
        } elseif (preg_match('/[一二三四五六七八九]年级/u', $normalized, $m)) {
            $info['grade'] = $m[0];
        }

        // Term / volume: explicit wording first, then a trailing _1 / _2 suffix
        // (1 = first volume, 2 = second volume), then "whole book" stored as '0'.
        if (preg_match('/上册|下册|第[12]学期|第[12]册/u', $normalized, $m)) {
            $info['term'] = $m[0];
        } elseif (preg_match('/[_-](1|2)(?:\\.md)?$/', $normalized, $m)) {
            $info['term'] = $m[1] === '1' ? '上册' : '下册';
        } elseif (str_contains($normalized, '全册') || str_contains($normalized, '全集')) {
            $info['term'] = '0'; // whole book
        }

        if (preg_match('/第[一二三四五六七八九十0-9]+章/u', $normalized, $m)) {
            $info['chapter'] = $m[0];
        }

        if (preg_match('/20[0-9]{2}/', $basename, $m)) {
            $info['year'] = $m[0];
        }

        if (preg_match('/培优|专项|期中|期末|模拟|基础卷|提升卷|练习卷/u', $normalized, $m)) {
            $info['source_type'] = $m[0];
        }

        if (str_contains($normalized, '数学')) {
            $info['subject'] = '数学';
        }

        return $info;
    }

    /**
     * Build the normalized name used for the `normalized_filename` column.
     *
     * Takes the filename without directory and extension, turns parentheses
     * (ASCII and full-width) and spaces into underscores, then passes the
     * result through Str::slug with '_' as the separator.
     *
     * @param string $filename Original filename.
     *
     * @return string Slugified filename.
     */
    protected function normalizeFilename(string $filename): string
    {
        $basename = pathinfo($filename, PATHINFO_FILENAME);

        return Str::of($basename)
            ->replace(['(', ')', "\u{FF08}", "\u{FF09}", ' '], '_')
            ->slug('_')
            ->toString();
    }
}
|