/i';

        // Locate every split point in one pass
        preg_match_all($pattern, $markdown, $matches, PREG_OFFSET_CAPTURE);
        preg_match_all($commentPattern, $markdown, $commentMatches, PREG_OFFSET_CAPTURE);

        // Merge both match sets and sort them by offset
        $allMatches = [];
        foreach ($matches[1] as $idx => $m) {
            $allMatches[] = [
                'pos' => $matches[0][$idx][1],
                'length' => strlen($matches[0][$idx][0]),
                'index' => (int)$m[0],
            ];
        }
        foreach ($commentMatches[1] as $idx => $m) {
            $allMatches[] = [
                'pos' => $commentMatches[0][$idx][1],
                'length' => strlen($commentMatches[0][$idx][0]),
                'index' => (int)$m[0],
            ];
        }

        usort($allMatches, fn($a, $b) => $a['pos'] <=> $b['pos']);

        $candidates = [];

        if (empty($allMatches)) {
            // No question number found: return the whole input as a single chunk
            return [
                [
                    'index' => 1,
                    'raw_markdown' => trim($markdown)
                ]
            ];
        }

        for ($i = 0; $i < count($allMatches); $i++) {
            $start = $allMatches[$i]['pos'];
            $end = $i + 1 < count($allMatches) ? $allMatches[$i+1]['pos'] : strlen($markdown);

            $block = substr($markdown, $start, $end - $start);
            $block = trim($block);

            if (!empty($block)) {
                $candidates[] = [
                    'sequence' => $i + 1,
                    'index' => $allMatches[$i]['index'],
                    'raw_markdown' => $block
                ];
            }
        }

        return $candidates;
    }

    /**
     * Validate a split result.
     *
     * Duplicate question indexes only produce a warning log entry and never
     * fail validation. Validation fails as soon as one candidate has no
     * content in `raw_markdown`.
     *
     * @param array $candidates Split result; each item is read via its
     *                          `index` and `raw_markdown` keys
     * @return bool True when every candidate carries content, false otherwise
     */
    public function validate(array $candidates): bool
    {
        // Repeated question numbers are normal when several papers or chapters
        // are merged, so they must not be treated as a failure.
        // Log counts only, to avoid flooding the log with a long index list.
        $indexes = array_map(fn($item) => $item['index'], $candidates);
        $uniqueCount = count(array_unique($indexes));
        $total = count($indexes);
        if ($total > 0 && $uniqueCount !== $total) {
            Log::warning('Duplicate question indexes detected', [
                'total' => $total,
                'unique' => $uniqueCount,
            ]);
        }

        // Every candidate must carry some content
        foreach ($candidates as $candidate) {
            $content = $candidate['raw_markdown'] ?? null;

            // empty() on its own also rejects the literal string "0", which is
            // real content; exempt that one value and keep every other rule.
            if ($content !== '0' && empty($content)) {
                Log::warning('Empty markdown content detected', [
                    'index' => $candidate['index'] ?? null
                ]);
                return false;
            }
        }

        return true;
    }

    /**
     * Compute summary statistics for a split result.
     *
     * Lengths are measured with strlen(), i.e. in bytes rather than characters.
     *
     * @param array $candidates Split result; each item must provide `raw_markdown`
     * @return array Keys: `total_candidates`, `avg_length` (rounded to two
     *               decimals), `max_length`, `min_length`; all values are 0
     *               when there are no candidates
     */
    public function getStatistics(array $candidates): array
    {
        $lengths = array_map(
            static fn($candidate) => strlen($candidate['raw_markdown']),
            $candidates
        );
        $total = count($lengths);

        // min()/max() reject an empty array, so report zeros explicitly
        if ($total === 0) {
            return [
                'total_candidates' => 0,
                'avg_length' => 0,
                'max_length' => 0,
                'min_length' => 0
            ];
        }

        return [
            'total_candidates' => $total,
            'avg_length' => round(array_sum($lengths) / $total, 2),
            'max_length' => max($lengths),
            'min_length' => min($lengths)
        ];
    }
}