| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140 |
- <?php
- namespace App\Services;
- use Illuminate\Support\Facades\Log;
/**
 * Splits a Markdown document into per-question blocks and offers light
 * validation and length statistics over the resulting blocks.
 */
class AsyncMarkdownSplitter
{
    /**
     * Numbered marker at the start of a line: optional leading whitespace,
     * 1-4 ASCII digits, then exactly one explicit delimiter:
     *   "."  U+FF0E (full-width full stop)  U+3001 (ideographic comma)
     *   ")"  U+FF09 (full-width right parenthesis)  "]"  U+3011 (right lenticular bracket)
     *
     * The `u` modifier is required: without it the multibyte delimiters in the
     * character class are matched byte-by-byte, so any character sharing a
     * lead byte with them (e.g. kana) is mistaken for a delimiter.
     * There is deliberately no trailing whitespace in the pattern; consuming it
     * would swallow the indentation of a marker on the following line.
     */
    private const NUMBERED_MARKER_PATTERN = '/^\s*([0-9]{1,4})[.\x{FF0E}\x{3001})\x{FF09}\]\x{3011}]/mu';

    /**
     * Byte-mode fallback used when the input is not valid UTF-8 (the `u`
     * pattern cannot run on such input). Only ASCII delimiters are recognised.
     */
    private const NUMBERED_MARKER_PATTERN_ASCII = '/^\s*([0-9]{1,4})[.)\]]/m';

    /** Explicit marker of the form `<!-- question: N -->` (case-insensitive). */
    private const COMMENT_MARKER_PATTERN = '/<!--\s*question:\s*(\d+)\s*-->/i';

    /**
     * Split Markdown into an array of question blocks.
     *
     * A block starts at a marker (numbered line or `<!-- question: N -->`
     * comment) and runs up to the next marker or the end of the document.
     * Text that precedes the first marker is not part of any block.
     * When no marker is found, the whole trimmed document is returned as a
     * single block with index 1.
     *
     * @param string $markdown Raw Markdown text
     * @return array List of blocks; each element has `sequence` (1-based
     *               position of its marker in document order), `index` (the
     *               number read from the marker) and `raw_markdown` (trimmed
     *               block text, marker included)
     */
    public function split(string $markdown): array
    {
        $markers = $this->findMarkers(self::NUMBERED_MARKER_PATTERN, $markdown);
        if ($markers === null) {
            // Invalid UTF-8: retry in byte mode so ASCII-delimited markers still split.
            $markers = $this->findMarkers(self::NUMBERED_MARKER_PATTERN_ASCII, $markdown) ?? [];
        }
        $markers = array_merge(
            $markers,
            $this->findMarkers(self::COMMENT_MARKER_PATTERN, $markdown) ?? []
        );

        if ($markers === []) {
            // No marker at all: treat the whole document as one block. The shape
            // matches regular candidates so callers can read `sequence` safely.
            return [
                [
                    'sequence' => 1,
                    'index' => 1,
                    'raw_markdown' => trim($markdown),
                ],
            ];
        }

        // Both marker kinds are interleaved; order them by byte offset.
        usort($markers, fn($a, $b) => $a['pos'] <=> $b['pos']);

        $candidates = [];
        $markerCount = count($markers);
        $documentEnd = strlen($markdown);

        foreach ($markers as $i => $marker) {
            $end = $i + 1 < $markerCount ? $markers[$i + 1]['pos'] : $documentEnd;
            // Offsets come from PREG_OFFSET_CAPTURE and are byte offsets, so the
            // byte-oriented substr() is the correct slicing function here.
            $block = trim(substr($markdown, $marker['pos'], $end - $marker['pos']));

            if ($block !== '') {
                $candidates[] = [
                    'sequence' => $i + 1,
                    'index' => $marker['index'],
                    'raw_markdown' => $block,
                ];
            }
        }

        return $candidates;
    }

    /**
     * Validate a split result.
     *
     * Duplicate indexes are only logged (they are expected when several
     * papers or chapters are merged into one document). The result is
     * rejected only when a candidate has no content.
     *
     * @param array $candidates Split result
     * @return bool False if any candidate's `raw_markdown` is missing, not a
     *              string, or the empty string; true otherwise
     */
    public function validate(array $candidates): bool
    {
        $indexes = array_map(fn($item) => $item['index'] ?? null, $candidates);
        $total = count($indexes);
        $uniqueCount = count(array_unique($indexes));

        if ($total > 0 && $uniqueCount !== $total) {
            // Log counts only; dumping the full index list would flood the log.
            Log::warning('Duplicate question indexes detected', [
                'total' => $total,
                'unique' => $uniqueCount,
            ]);
        }

        foreach ($candidates as $candidate) {
            $raw = $candidate['raw_markdown'] ?? null;
            // Explicit comparison instead of empty(): empty("0") is true, which
            // would reject a block whose entire content is "0".
            if (!is_string($raw) || $raw === '') {
                Log::warning('Empty markdown content detected', [
                    'index' => $candidate['index'] ?? null,
                ]);
                return false;
            }
        }

        return true;
    }

    /**
     * Compute length statistics for a split result.
     *
     * Lengths are measured with strlen(), i.e. in bytes, not characters.
     *
     * @param array $candidates Split result
     * @return array `total_candidates`, `avg_length` (rounded to 2 decimals),
     *               `max_length`, `min_length`; all zero for an empty input
     */
    public function getStatistics(array $candidates): array
    {
        $lengths = array_map(fn($candidate) => strlen($candidate['raw_markdown']), $candidates);
        $total = count($lengths);

        if ($total === 0) {
            return [
                'total_candidates' => 0,
                'avg_length' => 0,
                'max_length' => 0,
                'min_length' => 0,
            ];
        }

        return [
            'total_candidates' => $total,
            'avg_length' => round(array_sum($lengths) / $total, 2),
            'max_length' => max($lengths),
            'min_length' => min($lengths),
        ];
    }

    /**
     * Find every marker matched by $pattern.
     *
     * @param string $pattern  Regex whose first capture group is the question number
     * @param string $markdown Text to scan
     * @return array|null List of ['pos' => byte offset of the match, 'index' => int],
     *                    or null when the regex engine fails (e.g. invalid UTF-8
     *                    input with a `u` pattern)
     */
    private function findMarkers(string $pattern, string $markdown): ?array
    {
        $found = preg_match_all($pattern, $markdown, $matches, PREG_OFFSET_CAPTURE);
        if ($found === false) {
            return null;
        }

        $markers = [];
        foreach ($matches[0] as $i => $whole) {
            $markers[] = [
                'pos' => $whole[1],
                'index' => (int) $matches[1][$i][0],
            ];
        }

        return $markers;
    }
}
|