normalizeFilename($filename);
        $extension = pathinfo($filename, PATHINFO_EXTENSION) ?: null;
        $extracted = $this->extractMetadataFromFilename($filename);
        $rawHash = sha1($rawMarkdown);

        // Deduplicate: when a record with the same name and identical content
        // already exists, reuse it instead of creating another source_file row.
        $existing = SourceFile::query()
            ->where('normalized_filename', $normalized)
            ->orWhere('original_filename', $filename)
            ->get()
            ->first(function (SourceFile $file) use ($rawHash) {
                $oldHash = sha1((string) $file->raw_markdown);

                return $oldHash === $rawHash;
            });

        if ($existing) {
            // Backfill the content hash into the existing record's metadata so
            // it can be recognised later.
            $meta = $existing->extracted_metadata ?? [];
            if (empty($meta['raw_hash'])) {
                $meta['raw_hash'] = $rawHash;
                $existing->update(['extracted_metadata' => $meta]);
            }

            if ($import) {
                $import->update([
                    'source_name' => $existing->normalized_filename,
                    'file_name' => $existing->original_filename,
                ]);
            }

            return $existing;
        }

        return DB::transaction(function () use ($filename, $normalized, $extension, $storagePath, $rawMarkdown, $fileMetadata, $extracted, $import) {
            $sourceFile = SourceFile::create([
                'uuid' => (string) Str::uuid(),
                'original_filename' => $filename,
                'normalized_filename' => $normalized,
                'extension' => $extension,
                'storage_path' => $storagePath,
                'raw_markdown' => $rawMarkdown,
                'file_metadata' => $fileMetadata,
                'extracted_metadata' => array_merge($extracted, [
                    'raw_hash' => sha1($rawMarkdown),
                ]),
            ]);

            if ($import) {
                $import->update([
                    'source_name' => $normalized,
                    'file_name' => $filename,
                ]);
            }

            return $sourceFile;
        });
    }

    /**
     * Extract textbook information from a file name.
     *
     * Only keys whose pattern is found are present in the result:
     *  - edition:     publisher code (PEP, SJ, ZJ, BS, HJ)
     *  - grade:       matched grade text, or "<digit>年级" for Arabic-digit grades
     *  - term:        matched term/volume text, '上册'/'下册' for a trailing _1/_2,
     *                 or '0' for a whole-year volume (全册 / 全集)
     *  - chapter:     matched "第…章" text
     *  - year:        first "20xx" sequence in the name
     *  - source_type: matched paper-type keyword (培优, 期中, 期末, ...)
     *  - subject:     '数学' when the name contains it
     *
     * @param string $filename File name, with or without directory and extension.
     *
     * @return array<string, string> Extracted metadata keyed as described above.
     */
    public function extractMetadataFromFilename(string $filename): array
    {
        $info = [];

        // Strip ASCII and ideographic (U+3000) spaces so patterns match contiguously.
        $basename = str_replace([' ', "\u{3000}"], '', pathinfo($filename, PATHINFO_FILENAME));

        // Use underscore as the single separator to simplify pattern matching.
        // Both ASCII and full-width (U+FF08 / U+FF09) parentheses are covered.
        $normalized = str_replace(['-', '(', ')', "\u{FF08}", "\u{FF09}"], '_', $basename);

        // '北师' also covers names containing '北师大' / '北师大版'.
        $editionPatterns = [
            '人教' => 'PEP',
            '苏教' => 'SJ',
            '浙教' => 'ZJ',
            '北师' => 'BS',
            '沪教' => 'HJ',
        ];
        foreach ($editionPatterns as $key => $code) {
            if (Str::contains($normalized, $key)) {
                $info['edition'] = $code;
                break;
            }
        }

        // Grade: supports both Chinese numerals and Arabic digits.
        if (preg_match('/高[一二三]|高[123]/u', $normalized, $m)) {
            $info['grade'] = $m[0];
        } elseif (preg_match('/初[一二三]|初[123]|初中/u', $normalized, $m)) {
            $info['grade'] = $m[0];
        } elseif (preg_match('/(?<![0-9])([1-9])[_年]?\s*年级?/u', $normalized, $m)) {
            // The lookbehind rejects a digit that is the tail of a longer number,
            // so a year such as "2023年" is not mistaken for grade 3.
            $num = (int) ($m[1] ?? 0);
            if ($num > 0) {
                $info['grade'] = $num . '年级';
            }
        } elseif (preg_match('/[一二三四五六七八九]年级/u', $normalized, $m)) {
            $info['grade'] = $m[0];
        }

        // Term / volume: an explicit marker is stored verbatim; a trailing _1/_2
        // maps to 上册/下册; a whole-year volume is stored as '0'.
        if (preg_match('/上册|下册|第[12]学期|第[12]册/u', $normalized, $m)) {
            $info['term'] = $m[0];
        } elseif (preg_match('/[_-](1|2)(?:\\.md)?$/', $normalized, $m)) {
            $info['term'] = $m[1] === '1' ? '上册' : '下册';
        } elseif (Str::contains($normalized, ['全册', '全集'])) {
            $info['term'] = '0'; // whole-year volume
        }

        if (preg_match('/第[一二三四五六七八九十0-9]+章/u', $normalized, $m)) {
            $info['chapter'] = $m[0];
        }

        if (preg_match('/20[0-9]{2}/', $basename, $m)) {
            $info['year'] = $m[0];
        }

        if (preg_match('/培优|专项|期中|期末|模拟|基础卷|提升卷|练习卷/u', $normalized, $m)) {
            $info['source_type'] = $m[0];
        }

        if (Str::contains($normalized, '数学')) {
            $info['subject'] = '数学';
        }

        return $info;
    }

    /**
     * Build the normalized form of a file name: the base name (directory and
     * extension removed) with parentheses and spaces turned into underscores,
     * then passed through Str::slug() with '_' as the separator.
     *
     * @param string $filename File name, with or without directory and extension.
     *
     * @return string Normalized name.
     */
    protected function normalizeFilename(string $filename): string
    {
        $basename = pathinfo($filename, PATHINFO_FILENAME);

        // NOTE(review): '(' and ')' appear twice in the needle list; the second
        // pair was presumably meant to be full-width. Left unchanged so that
        // normalized names already stored keep matching; confirm before editing.
        return Str::of($basename)
            ->replace(['(', ')', '(', ')', ' '], ['_', '_', '_', '_', '_'])
            ->slug('_')
            ->toString();
    }
}