extractTextBlocks($ocrData);

        // Detect question numbers and group the blocks by coordinate
        $questionUnits = $this->groupBlocksByQuestionNumber($textBlocks);

        // Build the structured content of each question
        $structuredQuestions = $this->processQuestionUnits($questionUnits);

        // When paper info is supplied, refine the result against it
        if ($paperInfo) {
            $structuredQuestions = $this->optimizeWithPaperInfo($structuredQuestions, $paperInfo);
        }

        return $structuredQuestions;
    }

    /**
     * Extract every non-empty text block, plus page metadata, from an OCR payload.
     *
     * Nested 'raw' wrappers are unwrapped, a JSON-encoded 'data' string is decoded, and each
     * entry of data.page_list[*].answer_list that carries text becomes one block with the keys
     * text, position, confidence, type and ids. Blocks are sorted by the Y of their first
     * position point.
     *
     * @param array $ocrData OCR response, optionally wrapped in one or more 'raw' keys
     * @return array ['blocks' => list of blocks, 'meta' => ['height' => page height]]
     */
    private function extractTextBlocksAndMeta(array $ocrData): array
    {
        $blocks = [];
        $meta = ['height' => 2000]; // Default height, used when no page reports one

        // Unwrap 'raw' key if present
        while (isset($ocrData['raw']) && is_array($ocrData['raw'])) {
            $ocrData = $ocrData['raw'];
        }

        if (isset($ocrData['data']) && is_string($ocrData['data'])) {
            $ocrData['data'] = json_decode($ocrData['data'], true);
        }

        $pages = $ocrData['data']['page_list'] ?? null;
        if (!is_array($pages)) {
            $pages = [];
        }

        foreach ($pages as $page) {
            // The last page that reports a height overwrites earlier ones
            if (isset($page['height'])) {
                $meta['height'] = $page['height'];
            }

            foreach ($page['answer_list'] ?? [] as $item) {
                $rawText = $item['text'] ?? null;
                if (!is_scalar($rawText)) {
                    continue;
                }

                // Compare with '' instead of empty(): a block whose text is "0" is real content
                $text = trim((string) $rawText);
                if ($text === '') {
                    continue;
                }

                $position = null;
                if (!empty($item['content_list_info'])) {
                    $position = $item['content_list_info'][0]['pos'] ?? null;
                }

                $blocks[] = [
                    'text' => $text,
                    'position' => $position,
                    'confidence' => $item['confidence'] ?? null,
                    'type' => $this->detectTextType((string) $rawText),
                    'ids' => $item['ids'] ?? [],
                ];
            }
        }

        // Sort top-to-bottom. A block without a position is ranked at Y = 0 so that the
        // comparator is a consistent ordering (answering "equal" for such blocks is not
        // transitive and leaves the sort result undefined).
        usort($blocks, static function (array $a, array $b): int {
            return ($a['position'][0]['y'] ?? 0) <=> ($b['position'][0]['y'] ?? 0);
        });

        return ['blocks' => $blocks, 'meta' => $meta];
    }

    /**
     * Classify a text block by pattern.
     *
     * Checks run in this order and the first hit wins: section_header, question_number,
     * option, answer_area, title, student_info; anything else is 'content'.
     *
     * @param string $text Block text
     * @return string One of the type names listed above
     */
    private function detectTextType(string $text): string
    {
        // Section headers first, because they may also begin with a numeral
        if (preg_match('/(一、选择题|二、填空题|三、解答题|本大题)/u', $text)) {
            return 'section_header';
        }

        // Question-number prefixes such as "1.", "1、", "1)" or "(1)"
        if (preg_match('/^[\((]?[一二三四五六七八九十\d]+[\.\)))、]/u', $text)) {
            // A bare "(digits)" or "(digits)=" fragment is not a question number
            if (preg_match('/^[\((]\d+[\))]=?\s*$/u', trim($text))) {
                return 'content';
            }

            // Require real content after the number, or at least a score marker
            if (mb_strlen($text) > 3 || preg_match('/\d+分/u', $text)) {
                return 'question_number';
            }
        }

        // Option prefixes such as "A." or "A)"
        if (preg_match('/^[A-Da-d][\.\))]/', $text)) {
            return 'option';
        }

        // Answer-area markers
        if (preg_match('/(得分|评卷人|答案|填空|解答|______|____)/u', $text)) {
            return 'answer_area';
        }

        // Paper title
        if (preg_match('/(试卷|测试卷|考试|试题)/u', $text)) {
            return 'title';
        }

        // Student information
        if (preg_match('/(姓名|班级|学号|年级)/u', $text)) {
            return 'student_info';
        }

        return 'content';
    }

    /**
     * Backward-compatible wrapper returning only the blocks of extractTextBlocksAndMeta().
     *
     * @param array $ocrData OCR response
     * @return array List of blocks
     */
    private function extractTextBlocks(array $ocrData): array
    {
        return $this->extractTextBlocksAndMeta($ocrData)['blocks'];
    }

    /**
     * Match every system paper question to an OCR question and extract the student's answer.
     *
     * For each matched question a vertical answer region [y_min, y_max] is derived from the
     * OCR question position, the question type, the next unused OCR question and any section
     * header in between; the blocks whose centre lies inside that region feed the answer
     * extraction.
     *
     * @param array $ocrRawData     OCR payload, optionally wrapped in a 'raw' key
     * @param mixed $paperQuestions Iterable of question objects that also offers count(); each
     *                              item is read through question_number, question_text and
     *                              question_type.
     *                              NOTE(review): presumably a collection of models - confirm.
     * @return array Map of question_number => result with the keys student_answer, confidence,
     *               coordinates, debug_info and question_text
     */
    public function matchWithSystemPaper(array $ocrRawData, $paperQuestions): array
    {
        // Use the wrapped structure when a 'raw' array is present
        $dataToParse = (isset($ocrRawData['raw']) && is_array($ocrRawData['raw']))
            ? $ocrRawData['raw']
            : $ocrRawData;

        // 1. Extract all text blocks and the page metadata
        $extracted = $this->extractTextBlocksAndMeta($dataToParse);
        $blocks = $extracted['blocks'];
        $pageHeight = $extracted['meta']['height'];

        \Log::info('OCR匹配开始', [
            'total_blocks' => count($blocks),
            'page_height' => $pageHeight,
            'system_questions' => $paperQuestions->count()
        ]);

        // 2. Split the OCR blocks into questions
        $ocrQuestions = $this->extractOCRQuestions($blocks);

        \Log::info('提取OCR题目', [
            'ocr_questions_count' => count($ocrQuestions),
            'ocr_question_numbers' => array_column($ocrQuestions, 'ocr_question_number')
        ]);

        // Top Y of every section header, in block order. Computed once because it does not
        // depend on the question being matched.
        $sectionHeaderTops = [];
        foreach ($blocks as $block) {
            if ($this->detectTextType($block['text']) === 'section_header') {
                $sectionHeaderTops[] = $this->getBlockTopY($block);
            }
        }

        // 3. For every system question, find the best matching OCR question
        $results = [];
        $usedOcrQuestions = []; // OCR question numbers already consumed, to avoid double matching

        foreach ($paperQuestions as $paperQuestion) {
            $bestMatch = $this->findBestMatchingOCRQuestion($ocrQuestions, $paperQuestion);

            $alreadyUsed = $bestMatch !== null
                && in_array($bestMatch['ocr_question_number'], $usedOcrQuestions, true);

            if ($bestMatch === null || $alreadyUsed) {
                // No usable match for this system question
                \Log::warning("系统Q{$paperQuestion->question_number} 未找到匹配的OCR题目");

                $results[$paperQuestion->question_number] = [
                    'student_answer' => '',
                    'confidence' => 0,
                    'coordinates' => [
                        'y_min' => 0,
                        'y_max' => 0
                    ],
                    'debug_info' => [
                        'error' => '未找到匹配的OCR题目'
                    ],
                    'question_text' => strip_tags($paperQuestion->question_text ?? '')
                ];
                continue;
            }

            $usedOcrQuestions[] = $bestMatch['ocr_question_number'];

            // Vertical extent of the matched OCR question
            $blockTop = $bestMatch['y_start'] ?? 0;
            $blockBottom = $bestMatch['y_end'] ?? $blockTop;

            // Start slightly above the question number to catch same-line choice answers
            $yStart = max(0, $blockTop - 10);

            // Start of the nearest not-yet-used OCR question below this one
            $nextOcrY = null;
            foreach ($ocrQuestions as $nextOcrQ) {
                if ($nextOcrQ['y_start'] > $blockTop
                    && !in_array($nextOcrQ['ocr_question_number'], $usedOcrQuestions, true)
                    && ($nextOcrY === null || $nextOcrQ['y_start'] < $nextOcrY)
                ) {
                    $nextOcrY = $nextOcrQ['y_start'];
                }
            }

            // Extension below the question depends on its type (short for choice/fill, long for answer)
            $questionType = $paperQuestion->question_type ?? null;
            $extend = match ($questionType) {
                'choice' => 120,
                'fill' => 180,
                'answer' => 360,
                default => 220,
            };

            // End: 5px before the next question or the default extension, whichever is smaller
            $yEndCandidate = min($pageHeight, $blockBottom + $extend);
            if ($nextOcrY !== null) {
                $yEnd = min($yEndCandidate, $nextOcrY - 5);
                $yEnd = max($yEnd, $yStart + 120); // guaranteed minimum height
            } else {
                $yEnd = $yEndCandidate;
            }

            // The first section header (in block order) inside the region becomes the boundary
            foreach ($sectionHeaderTops as $headerTop) {
                if ($headerTop > $yStart && $headerTop < $yEnd) {
                    $yEnd = min($yEnd, $headerTop - 5);
                    break;
                }
            }

            // Minimum-height fallback
            if ($yEnd - $yStart < 80) {
                $yEnd = min($pageHeight, $yStart + 80);
            }

            // Make sure the range is valid
            if ($yEnd <= $yStart) {
                $yEnd = $yStart + 200; // give at least 200px of room
            }

            \Log::debug("Q{$paperQuestion->question_number} Y范围", [
                'y_start' => $yStart,
                'y_end' => $yEnd,
                'range' => $yEnd - $yStart
            ]);

            // Collect the blocks whose centre lies strictly inside the region;
            // blocks without a usable position have no centre and are left out
            $answerBlocks = [];
            foreach ($blocks as $block) {
                $blockY = $this->getBlockCenterY($block);
                if ($blockY !== null && $blockY > $yStart && $blockY < $yEnd) {
                    $answerBlocks[] = $block;
                }
            }

            // System question stem without markup
            $systemQuestionText = strip_tags($paperQuestion->question_text ?? '');

            // Student answer
            $studentAnswer = $this->extractAnswerFromBlocks(
                $answerBlocks,
                $systemQuestionText,
                $paperQuestion->question_type ?? null
            );

            // Average OCR confidence of the answer blocks (debug information only)
            $confidences = [];
            foreach ($answerBlocks as $block) {
                if (isset($block['confidence'])) {
                    $confidences[] = $block['confidence'];
                }
            }
            $avgConfidence = !empty($confidences) ? array_sum($confidences) / count($confidences) : 0;

            $results[$paperQuestion->question_number] = [
                'student_answer' => trim($studentAnswer),
                'confidence' => $bestMatch['similarity'], // the match similarity serves as confidence
                'coordinates' => [
                    'y_min' => $yStart,
                    'y_max' => $yEnd
                ],
                'debug_info' => [
                    'y_start' => $yStart,
                    'y_end' => $yEnd,
                    'block_count' => count($answerBlocks),
                    'system_question_length' => mb_strlen($systemQuestionText),
                    'ocr_question_number' => $bestMatch['ocr_question_number'],
                    'match_similarity' => round($bestMatch['similarity'] * 100, 2) . '%',
                    'ocr_confidence' => round($avgConfidence * 100, 2) . '%'
                ],
                'question_text' => $systemQuestionText
            ];

            \Log::info("系统Q{$paperQuestion->question_number} 匹配到 OCR Q{$bestMatch['ocr_question_number']}", [
                'similarity' => round($bestMatch['similarity'] * 100, 2) . '%',
                'student_answer_preview' => mb_substr($studentAnswer, 0, 50)
            ]);
        }

        \Log::info('OCR匹配完成', [
            // Compare with '' so that an extracted answer of "0" still counts as matched
            'matched_count' => count(array_filter($results, fn($r) => $r['student_answer'] !== '')),
            'total_count' => count($results)
        ]);

        return $results;
    }

    /**
     * Cluster blocks into question units along the Y axis, using question-number blocks as cuts.
     *
     * A unit spans from the centre Y of its question-number block up to (excluding) the centre Y
     * of the next question-number block; the last unit is open-ended.
     *
     * @param array $blocks Blocks carrying a precomputed 'type'
     * @return array List of units with question_number, question_text, blocks and y_range
     */
    private function groupBlocksByQuestionNumber(array $blocks): array
    {
        // Step 1: collect every question-number block
        $questionNumbers = [];
        foreach ($blocks as $index => $block) {
            if ($block['type'] === 'question_number') {
                $questionNumbers[] = [
                    'index' => $index,
                    'text' => $block['text'],
                    'y' => $this->getBlockCenterY($block),
                    'number' => $this->extractQuestionNumber($block['text'])
                ];
            }
        }

        // Step 2: assign blocks to the question whose Y range contains them
        $questionUnits = [];
        foreach ($questionNumbers as $i => $currentQN) {
            $nextQN = $questionNumbers[$i + 1] ?? null;

            $yStart = $currentQN['y'];
            $yEnd = $nextQN ? $nextQN['y'] : PHP_INT_MAX;

            $questionBlocks = [];
            foreach ($blocks as $block) {
                $y = $this->getBlockCenterY($block);
                if ($y >= $yStart && ($nextQN === null || $y < $yEnd)) {
                    $questionBlocks[] = $block;
                }
            }

            $questionUnits[] = [
                'question_number' => $currentQN['number'],
                'question_text' => $currentQN['text'],
                'blocks' => $questionBlocks,
                'y_range' => ['start' => $yStart, 'end' => $yEnd]
            ];
        }

        return $questionUnits;
    }

    /**
     * Turn question units into structured questions.
     *
     * Only blocks typed 'content', 'option' and 'answer_area' contribute; other types are ignored.
     *
     * @param array $questionUnits Output of groupBlocksByQuestionNumber()
     * @return array List of questions with question_number, content, options, answer, confidence
     */
    private function processQuestionUnits(array $questionUnits): array
    {
        $structuredQuestions = [];

        foreach ($questionUnits as $unit) {
            $contentParts = [];
            $options = [];
            $answerAreas = [];

            foreach ($unit['blocks'] as $block) {
                switch ($block['type']) {
                    case 'content':
                        $contentParts[] = $block['text'];
                        break;

                    case 'option':
                        // Split the option into its letter and its content
                        if (preg_match('/^([A-Da-d])[\.\))]\s*(.*)/', $block['text'], $matches)) {
                            $options[] = [
                                'letter' => strtoupper($matches[1]),
                                'content' => trim($matches[2])
                            ];
                        } else {
                            $options[] = [
                                'letter' => '',
                                'content' => $block['text']
                            ];
                        }
                        break;

                    case 'answer_area':
                        // Text found in the answer area is examined for an answer afterwards
                        $answerAreas[] = $block['text'];
                        break;
                }
            }

            $structuredQuestions[] = [
                'question_number' => $unit['question_number'],
                'content' => $this->mergeContentParts($contentParts),
                'options' => $options,
                'answer' => $this->extractAnswerFromAnswerAreas($answerAreas),
                'confidence' => $this->calculateConfidence($unit['blocks'])
            ];
        }

        return $structuredQuestions;
    }

    /**
     * Join the stem fragments of a question with single spaces.
     *
     * Fragments that start with a digit-based question number prefix are skipped.
     *
     * @param array $contentParts Text fragments in reading order
     * @return string Merged, trimmed stem
     */
    private function mergeContentParts(array $contentParts): string
    {
        $kept = [];
        foreach ($contentParts as $part) {
            // Question numbers are handled elsewhere
            if (preg_match('/^[\((]?[\d]+[\.\)))、]/', $part)) {
                continue;
            }
            $kept[] = $part;
        }

        // implode() always inserts the separator, including after a fragment that is "0"
        return trim(implode(' ', $kept));
    }

    /**
     * Pull an answer out of the answer-area texts.
     *
     * The first area containing a letter A-D (either case) decides the answer immediately.
     * Otherwise the first non-blank token of the last area that does not mention the score
     * markers is returned.
     *
     * @param array $answerAreas Texts of the answer-area blocks
     * @return string Answer, or '' when nothing was found
     */
    private function extractAnswerFromAnswerAreas(array $answerAreas): string
    {
        $answer = '';

        foreach ($answerAreas as $area) {
            // A choice letter anywhere in the area wins
            if (preg_match('/([A-Da-d])/', $area, $matches)) {
                return strtoupper($matches[1]);
            }

            // Fill-in answer: first token of an area that is not a score box
            if (!preg_match('/(得分|评卷人)/', $area) && preg_match('/\S+/', $area, $matches)) {
                $answer = trim($matches[0]);
            }
        }

        return $answer;
    }

    /**
     * Average the confidence of the blocks that report one.
     *
     * @param array $blocks Blocks of one question
     * @return float Mean confidence, or 0.8 when no block reports a confidence
     */
    private function calculateConfidence(array $blocks): float
    {
        $totalConfidence = 0;
        $count = 0;

        foreach ($blocks as $block) {
            $confidence = $block['confidence'] ?? null;
            if ($confidence !== null) {
                $totalConfidence += $confidence;
                $count++;
            }
        }

        return $count > 0 ? $totalConfidence / $count : 0.8;
    }

    /**
     * Vertical centre of a block: midpoint between the smallest and largest 'y' of its points.
     *
     * @param array $block Block with a 'position' list of points
     * @return int|null Centre Y truncated to an integer, or null when no point carries a 'y'
     */
    private function getBlockCenterY(array $block): ?int
    {
        $position = $block['position'] ?? null;
        if (!is_array($position) || $position === []) {
            return null;
        }

        $yValues = [];
        foreach ($position as $point) {
            if (isset($point['y'])) {
                $yValues[] = $point['y'];
            }
        }

        if ($yValues === []) {
            return null;
        }

        // Explicit cast: the midpoint may be fractional, and relying on the implicit
        // float-to-int conversion of the return type is deprecated since PHP 8.1
        return (int) ((min($yValues) + max($yValues)) / 2);
    }

    /**
     * Read the first run of ASCII digits in a text as the question number.
     *
     * @param string $text Block text
     * @return int Question number, or 0 when the text holds no ASCII digit
     *             (so a number written with Chinese numerals yields 0)
     */
    private function extractQuestionNumber(string $text): int
    {
        if (preg_match('/[\d]+/', $text, $matches)) {
            return (int) $matches[0];
        }

        return 0;
    }

    /**
     * Refine parsed questions with the system paper definition.
     *
     * @param array $questions Structured questions from processQuestionUnits()
     * @param array $paperInfo Paper definition; its 'questions' entries are looked up by
     *                         'question_number' and may carry 'question_type' and 'correct_answer'
     * @return array Questions in their original order, refined where a system question exists
     */
    private function optimizeWithPaperInfo(array $questions, array $paperInfo): array
    {
        // Index the system questions by number; entries without a number cannot be matched
        $systemMap = [];
        foreach ($paperInfo['questions'] ?? [] as $sysQ) {
            if (isset($sysQ['question_number'])) {
                $systemMap[$sysQ['question_number']] = $sysQ;
            }
        }

        $optimized = [];
        foreach ($questions as $question) {
            $sysQuestion = $systemMap[$question['question_number']] ?? null;

            if ($sysQuestion !== null) {
                // Type-specific refinement
                if (isset($sysQuestion['question_type'])) {
                    $question = $this->optimizeByQuestionType($question, $sysQuestion['question_type']);
                }

                // Answer normalisation
                if (isset($sysQuestion['correct_answer'])) {
                    $question = $this->optimizeAnswer($question, $sysQuestion['correct_answer']);
                }
            }

            $optimized[] = $question;
        }

        return $optimized;
    }

    /**
     * Apply the refinement that belongs to a question type.
     *
     * 'choice' fills missing options from the content, 'fill' adds 'blanks' and 'answer'
     * adds 'full_solution'; other types leave the question untouched.
     *
     * @param array  $question     Structured question
     * @param string $questionType System question type
     * @return array Refined question
     */
    private function optimizeByQuestionType(array $question, string $questionType): array
    {
        switch ($questionType) {
            case 'choice':
                // Choice question: recover the options from the content when none were detected
                if (empty($question['options']) && preg_match('/[A-Da-d]/', $question['content'])) {
                    $question['options'] = $this->extractOptionsFromContent($question['content']);
                }
                break;

            case 'fill':
                // Fill-in question: locate the blanks
                $question['blanks'] = $this->findFillBlanks($question['content']);
                break;

            case 'answer':
                // Free-response question: keep the complete content
                $question['full_solution'] = $question['content'];
                break;
        }

        return $question;
    }

    /**
     * Normalise the answer of a question that has options.
     *
     * @param array  $question      Structured question
     * @param string $correctAnswer Reference answer of the system question
     * @return array Question with a normalised 'answer' when it has options
     */
    private function optimizeAnswer(array $question, string $correctAnswer): array
    {
        if (!empty($question['options'])) {
            $question['answer'] = $this->normalizeChoiceAnswer($question['answer'], $correctAnswer);
        }

        return $question;
    }

    /**
     * Map a choice answer written as a circled or plain digit 1-4 to the letter A-D.
     *
     * @param string $studentAnswer Raw answer
     * @param string $correctAnswer Reference answer (currently not consulted)
     * @return string Mapped letter, or the upper-cased trimmed answer when no mapping applies
     */
    private function normalizeChoiceAnswer(string $studentAnswer, string $correctAnswer): string
    {
        $map = [
            '①' => 'A', '②' => 'B', '③' => 'C', '④' => 'D',
            '1' => 'A', '2' => 'B', '3' => 'C', '4' => 'D'
        ];

        $studentAnswer = trim($studentAnswer);

        return $map[$studentAnswer] ?? strtoupper($studentAnswer);
    }

    /**
     * Extract options from content in which every option sits on its own line.
     *
     * @param string $content Question content
     * @return array List of ['letter' => 'A'..'D', 'content' => string]
     */
    private function extractOptionsFromContent(string $content): array
    {
        $options = [];

        foreach (explode("\n", $content) as $line) {
            if (preg_match('/^([A-Da-d])[\.\))]\s*(.*)/', trim($line), $matches)) {
                $options[] = [
                    'letter' => strtoupper($matches[1]),
                    'content' => trim($matches[2])
                ];
            }
        }

        return $options;
    }

    /**
     * Locate the blanks of a fill-in question.
     *
     * A blank is a run of two or more underscores, or a bracket pair - full-width
     * (U+FF08 ... U+FF09) or ASCII - together with its content.
     *
     * @param string $content Question content
     * @return array Matched blank strings in order of appearance; empty when there are none
     */
    private function findFillBlanks(string $content): array
    {
        // The full-width brackets are written as code point escapes. The former alternative
        // "([\s\S]*?)" was a plain group that matches the empty string, which kept the bracket
        // alternative from ever being tried and filled the result with empty matches.
        $pattern = '/_{2,}|\x{FF08}[\s\S]*?\x{FF09}|\([\s\S]*?\)/u';

        if (preg_match_all($pattern, $content, $matches)) {
            return $matches[0];
        }

        return [];
    }

    /**
     * Build a human-readable debug report of the parsing result.
     *
     * @param array $ocrData             OCR payload
     * @param array $structuredQuestions Structured questions to report on
     * @return string Multi-line report
     */
    public function generateDebugOutput(array $ocrData, array $structuredQuestions): string
    {
        $output = "=== OCR数据解析调试输出 ===\n\n";

        // Raw block statistics
        $blocks = $this->extractTextBlocks($ocrData);
        $output .= "1. 原始文本块数量: " . count($blocks) . "\n";

        $typeStats = [];
        foreach ($blocks as $block) {
            $type = $block['type'];
            $typeStats[$type] = ($typeStats[$type] ?? 0) + 1;
        }
        $output .= " 类型分布: " . json_encode($typeStats, JSON_UNESCAPED_UNICODE) . "\n\n";

        // Structured questions
        $output .= "2. 识别到的题目数量: " . count($structuredQuestions) . "\n";
        foreach ($structuredQuestions as $i => $q) {
            $output .= "\n题目 " . ($i + 1) . " (题号: {$q['question_number']}):\n";
            // mb_substr keeps multibyte characters intact; a byte-wise cut could split one
            $output .= " - 内容: " . mb_substr($q['content'], 0, 100) . "...\n";
            $output .= " - 选项数: " . count($q['options']) . "\n";
            $output .= " - 答案: " . ($q['answer'] ?: '未识别') . "\n";
            $output .= " - 置信度: " . round($q['confidence'] * 100, 2) . "%\n";
        }

        return $output;
    }

    /**
     * Find the block that anchors a system question on the page.
     *
     * Strategies, in order: a block starting with the question number followed by a separator;
     * a short block that is the bare number (optionally followed by whitespace); a block that
     * contains, or closely resembles, the first 15 characters of the question text.
     *
     * @param array $blocks        OCR blocks
     * @param mixed $paperQuestion Question object read through question_number and question_text
     * @return array|null Coordinates from getBlockCoordinates(), or null when nothing matches
     */
    public function findQuestionAnchor(array $blocks, $paperQuestion): ?array
    {
        $qNum = $paperQuestion->question_number;
        // Quote the number so that it can never be interpreted as regex syntax
        $qNumPattern = preg_quote((string) $qNum, '/');

        $cleanContent = strip_tags($paperQuestion->question_text ?? '');
        $cleanContent = preg_replace('/\s+/', '', $cleanContent);

        // Strategy 1: a block that starts with "<number><separator>", e.g. "1." or "(1)"
        foreach ($blocks as $block) {
            if (preg_match('/^[\((]?' . $qNumPattern . '[\.\)))、]/', $block['text'])) {
                return $this->getBlockCoordinates($block);
            }
        }

        // Strategy 1.5: a stand-alone number. OCR sometimes splits "1." into "1" and ".".
        foreach ($blocks as $block) {
            $looksLikeNumber = preg_match('/^' . $qNumPattern . '\s+/', $block['text'])
                || $block['text'] === (string) $qNum;

            // Only short blocks are accepted as a bare question number (length measured in bytes)
            if ($looksLikeNumber && strlen($block['text']) < 5) {
                return $this->getBlockCoordinates($block);
            }
        }

        // Strategy 2: fall back to the first characters of the question text
        $prefix = mb_substr($cleanContent, 0, 15);
        if (mb_strlen($prefix) > 2) {
            foreach ($blocks as $block) {
                $blockText = preg_replace('/\s+/', '', $block['text']);

                // Plain containment
                if (mb_strpos($blockText, $prefix) !== false) {
                    return $this->getBlockCoordinates($block);
                }

                // Fuzzy comparison against the start of the block
                similar_text($prefix, mb_substr($blockText, 0, mb_strlen($prefix) + 5), $percent);
                if ($percent > 80) {
                    return $this->getBlockCoordinates($block);
                }
            }
        }

        return null;
    }

    /**
     * Bounding coordinates of a block.
     *
     * @param array $block Block with a 'position' list of points carrying 'x' and 'y'
     * @return array ['y_top', 'y_bottom', 'x_left', 'x_right']; an edge is 0 when no point
     *               provides the corresponding coordinate
     */
    private function getBlockCoordinates(array $block): array
    {
        $position = $block['position'] ?? null;
        $ys = is_array($position) ? array_column($position, 'y') : [];
        $xs = is_array($position) ? array_column($position, 'x') : [];

        // min()/max() throw on an empty array, hence the explicit fallbacks
        return [
            'y_top' => $ys === [] ? 0 : min($ys),
            'y_bottom' => $ys === [] ? 0 : max($ys),
            'x_left' => $xs === [] ? 0 : min($xs),
            'x_right' => $xs === [] ? 0 : max($xs),
        ];
    }

    /**
     * Extract the answer from the OCR result of a cropped region by removing the question text.
     *
     * @param array  $cropResult         OCR result; read from 'content', or from the 'content'
     *                                   of each entry of 'questions'
     * @param string $systemQuestionText Question text known to the system
     * @return string Answer text; the complete OCR text when no strategy applies, '' when the
     *                OCR text is empty
     */
    public function extractAnswerFromCrop(array $cropResult, string $systemQuestionText): string
    {
        // 1. Complete OCR text
        $ocrText = '';
        if (isset($cropResult['content'])) {
            $ocrText = $cropResult['content'];
        } elseif (isset($cropResult['questions'])) {
            $texts = array_column($cropResult['questions'], 'content');
            $ocrText = implode("\n", $texts);
        }

        // Compare with '' instead of empty(): the text "0" is not empty
        if (!is_scalar($ocrText) || (string) $ocrText === '') {
            return '';
        }
        $ocrText = (string) $ocrText;

        // 2. Normalise both texts (no punctuation, no whitespace, lower case)
        $normalizedOcr = $this->normalizeTextForComparison($ocrText);
        $normalizedSystem = $this->normalizeTextForComparison($systemQuestionText);

        // 3. Try to remove the question part
        // Strategy A: the OCR text starts with the system question
        if (str_starts_with($normalizedOcr, $normalizedSystem)) {
            // Simplified handling; a real alignment algorithm would be more precise
            $cleanOcr = $this->removePrefixFuzzy($ocrText, $systemQuestionText);
            return trim($cleanOcr);
        }

        // Strategy B: cut behind the tail of the system question when it occurs in the OCR text.
        // findSplitIndex() reports a character index, so the cut must be character-based too.
        $splitIndex = $this->findSplitIndex($ocrText, $systemQuestionText);
        if ($splitIndex > 0) {
            return trim(mb_substr($ocrText, $splitIndex));
        }

        // Strategy C: heuristics.
        // With a solution/answer keyword, return the text starting at that keyword.
        if (preg_match('/(解[::]|答[::])(.*)/s', $ocrText, $matches)) {
            return trim($matches[0]);
        }

        // For fill-in questions such as "... = 3", take the token after the last equals sign
        if (preg_match('/=\s*(\S+)$/', $ocrText, $matches)) {
            return trim($matches[1]);
        }

        return $ocrText;
    }

    /**
     * Normalise text for comparison: strip tags, whitespace and ASCII punctuation, lower-case.
     *
     * @param string $text Input text
     * @return string Normalised text
     */
    private function normalizeTextForComparison(string $text): string
    {
        $text = strip_tags($text);
        $text = preg_replace('/\s+/', '', $text);
        $text = preg_replace('/[[:punct:]]/', '', $text); // remove punctuation

        return strtolower($text);
    }

    /**
     * Remove a leading copy of $prefix from $fullText when the beginning resembles it.
     *
     * The beginning of $fullText (byte length of the prefix plus 10) is compared with the
     * prefix; above 80% similarity the text is cut after the byte length of the prefix.
     *
     * @param string $fullText OCR text
     * @param string $prefix   Text expected at the beginning
     * @return string Remainder after the prefix, or $fullText unchanged when it does not match
     */
    private function removePrefixFuzzy(string $fullText, string $prefix): string
    {
        // Assumption of this simple approach: the question part of the OCR text is about as
        // long as the system question text.
        $prefixBytes = strlen($prefix);

        // mb_strcut() works with byte offsets like substr() but never splits a multibyte
        // character, so neither the probe nor the remainder can be invalid UTF-8.
        $potentialPrefix = mb_strcut($fullText, 0, $prefixBytes + 10, 'UTF-8'); // take a little more

        similar_text($potentialPrefix, $prefix, $percent);
        if ($percent > 80) {
            return mb_strcut($fullText, $prefixBytes, null, 'UTF-8'); // simple truncation
        }

        return $fullText;
    }

    /**
     * Find the character index in the OCR text at which the system question text ends.
     *
     * The last 10 characters of the system text serve as anchor (the whole text when that
     * anchor is shorter than 5 characters).
     *
     * @param string $ocrText    OCR text
     * @param string $systemText System question text
     * @return int Character index directly behind the anchor, or 0 when the anchor is absent
     */
    private function findSplitIndex(string $ocrText, string $systemText): int
    {
        // OCR may contain misread characters, so only the tail of the system text is searched
        $anchor = mb_substr($systemText, -10);
        if (mb_strlen($anchor) < 5) {
            $anchor = $systemText;
        }

        $pos = mb_strpos($ocrText, $anchor);
        if ($pos !== false) {
            return $pos + mb_strlen($anchor);
        }

        return 0;
    }

    /**
     * Find the index of the first block that belongs to the answer by accumulating block texts
     * until they resemble the system question stem.
     *
     * @param array  $blocks     Blocks in reading order
     * @param string $systemText System question stem
     * @return int Index of the first answer block, or 0 when the stem was never matched
     */
    private function findAnswerStartIndex(array $blocks, string $systemText): int
    {
        $accumulated = '';
        $normalizedSystem = $this->normalizeTextForComparison($systemText);

        foreach ($blocks as $index => $block) {
            $accumulated .= $block['text'];
            $normalizedAccumulated = $this->normalizeTextForComparison($accumulated);

            similar_text($normalizedAccumulated, $normalizedSystem, $percent);

            \Log::debug("Block {$index} accumulated similarity: {$percent}%", [
                'accumulated_length' => mb_strlen($accumulated),
                'system_length' => mb_strlen($systemText)
            ]);

            // Above 80% the stem counts as completely matched
            if ($percent > 80) {
                return $index + 1; // the answer starts with the next block
            }

            // The accumulated text is clearly longer than the stem but only moderately similar
            // (possibly many OCR errors): fall back to the length criterion
            if (mb_strlen($normalizedAccumulated) > mb_strlen($normalizedSystem) * 1.2 && $percent > 60) {
                return $index + 1;
            }
        }

        return 0; // no match: start from the beginning (conservative)
    }

    /**
     * Extract the student's answer from the blocks of an answer region, excluding the stem.
     *
     * Strategies, in order: text after an explicit answer keyword; for 'choice' a block that is
     * a single option letter; for 'fill' a short block with a digit or '=' that is not part of
     * the stem; finally the shortest block that is not part of the stem.
     *
     * @param array       $blocks             Blocks inside the answer region
     * @param string      $systemQuestionText Stem of the system question
     * @param string|null $questionType       'choice', 'fill', another type or null
     * @return string Answer, or '' when nothing qualifies
     */
    private function extractAnswerFromBlocks(array $blocks, string $systemQuestionText, ?string $questionType = null): string
    {
        if (empty($blocks)) {
            return '';
        }

        // Strategy 1: explicit answer keyword; return what follows its first occurrence
        // (up to the next occurrence of the same keyword)
        $answerKeywords = ['答:', '答案:', '解:', '解答:'];
        foreach ($blocks as $block) {
            foreach ($answerKeywords as $keyword) {
                if (str_contains($block['text'], $keyword)) {
                    // The keyword is a literal, so a plain split is sufficient
                    return trim(explode($keyword, $block['text'])[1]);
                }
            }
        }

        // Blocks of these types are structure, never an answer
        $isStructural = function (string $text): bool {
            return in_array(
                $this->detectTextType($text),
                ['question_number', 'section_header', 'option'],
                true
            );
        };

        $normalizedQuestion = $this->normalizeTextForComparison($systemQuestionText);

        // Strategy 2: choice question, prefer a single letter
        if ($questionType === 'choice') {
            foreach ($blocks as $block) {
                $text = trim($block['text']);
                if ($isStructural($text)) {
                    continue;
                }
                if (preg_match('/^[A-Da-d][\.\))]?$/u', $text)) {
                    return strtoupper(preg_replace('/[^A-D]/i', '', $text));
                }
            }
        }

        // Strategy 3: fill-in question, prefer a short number or equation
        if ($questionType === 'fill') {
            foreach ($blocks as $block) {
                $text = trim($block['text']);
                if ($isStructural($text)) {
                    continue;
                }
                if (mb_strlen($text) <= 25 && preg_match('/[\\d=]/u', $text)) {
                    $normalizedText = $this->normalizeTextForComparison($text);
                    if (!str_contains($normalizedQuestion, $normalizedText)) {
                        return $text;
                    }
                }
            }
        }

        // Strategy 4: generic, collect blocks that are not part of the stem and take the shortest
        $candidates = [];
        foreach ($blocks as $block) {
            $text = trim($block['text']);
            if ($isStructural($text)) {
                continue;
            }
            // Skip empty blocks and blocks containing the score character
            if ($text === '' || str_contains($text, '分')) {
                continue;
            }

            $normalizedText = $this->normalizeTextForComparison($text);
            if ($normalizedText !== '' && !str_contains($normalizedQuestion, $normalizedText)) {
                $candidates[] = $text;
            }
        }

        if (!empty($candidates)) {
            usort($candidates, fn($a, $b) => mb_strlen($a) <=> mb_strlen($b));
            return trim($candidates[0]);
        }

        return '';
    }

    /**
     * Split the OCR blocks into questions, using question-number blocks as starting points.
     *
     * A section header closes the current question; blocks before the first question number
     * or directly after a section header belong to no question.
     *
     * @param array $blocks Blocks in reading order
     * @return array List of questions with ocr_question_number, question_text, blocks,
     *               y_start and y_end
     */
    private function extractOCRQuestions(array $blocks): array
    {
        $ocrQuestions = [];
        $currentQuestion = null;

        foreach ($blocks as $block) {
            $type = $this->detectTextType($block['text']);

            if ($type === 'question_number') {
                // A new question number closes the previous question
                if ($currentQuestion !== null) {
                    $ocrQuestions[] = $currentQuestion;
                }

                $currentQuestion = [
                    'ocr_question_number' => $this->extractQuestionNumber($block['text']),
                    'question_text' => $block['text'],
                    'blocks' => [$block],
                    'y_start' => $this->getBlockTopY($block),
                    'y_end' => $this->getBlockBottomY($block),
                ];
            } elseif ($type === 'section_header') {
                // A section header closes the current question and resets the state
                if ($currentQuestion !== null) {
                    $ocrQuestions[] = $currentQuestion;
                    $currentQuestion = null;
                }
            } elseif ($currentQuestion !== null) {
                // Accumulate the content of the current question
                $currentQuestion['blocks'][] = $block;
                $currentQuestion['question_text'] .= ' ' . $block['text'];
                // Only ever extend downwards: a block without a position reports a bottom of 0
                // and must not pull the end of the question back up
                $currentQuestion['y_end'] = max(
                    $currentQuestion['y_end'],
                    $this->getBlockBottomY($block)
                );
            }
        }

        // Close the last question
        if ($currentQuestion !== null) {
            $ocrQuestions[] = $currentQuestion;
        }

        return $ocrQuestions;
    }

    /**
     * Find the OCR question that best matches a system question.
     *
     * An OCR question with the same number is returned immediately with similarity 0.6.
     * Otherwise the OCR question with the highest text similarity is returned, provided it
     * reaches 30%.
     *
     * @param array $ocrQuestions  Output of extractOCRQuestions()
     * @param mixed $paperQuestion Question object read through question_text and question_number
     * @return array|null The OCR question extended by 'similarity' (0..1 scale), or null
     */
    private function findBestMatchingOCRQuestion(array $ocrQuestions, $paperQuestion): ?array
    {
        $systemTextRaw = strip_tags($paperQuestion->question_text ?? '');
        $systemText = $this->normalizeTextForMatching($systemTextRaw);
        $targetNumber = (int) ($paperQuestion->question_number ?? 0);

        // A direct hit on the question number takes precedence
        foreach ($ocrQuestions as $ocrQ) {
            if (($ocrQ['ocr_question_number'] ?? null) === $targetNumber) {
                $ocrQ['similarity'] = 0.6; // baseline similarity
                \Log::info("Found match by number for Q{$paperQuestion->question_number}", [
                    'ocr_question_number' => $ocrQ['ocr_question_number']
                ]);
                return $ocrQ;
            }
        }

        $bestMatch = null;
        $bestSimilarity = 0;

        foreach ($ocrQuestions as $ocrQ) {
            $ocrText = $this->normalizeTextForMatching($ocrQ['question_text']);

            // NOTE(review): this boost cannot trigger here - any OCR question with the target
            // number has already been returned by the loop above.
            $numberBoost = ($ocrQ['ocr_question_number'] ?? null) === $targetNumber ? 20 : 0;

            // Text similarity in percent
            similar_text($systemText, $ocrText, $percent);
            $percent += $numberBoost;

            \Log::debug("Matching Q{$paperQuestion->question_number} with OCR Q{$ocrQ['ocr_question_number']}", [
                'similarity' => round($percent, 2),
                'number_boost' => $numberBoost,
                'system_text_preview' => mb_substr($systemTextRaw, 0, 50),
                'ocr_text_preview' => mb_substr($ocrQ['question_text'], 0, 50)
            ]);

            if ($percent > $bestSimilarity) {
                $bestSimilarity = $percent;
                $bestMatch = $ocrQ;
            }
        }

        // Only matches that reach the threshold are returned; the threshold is low to tolerate
        // OCR errors and LaTeX differences
        if ($bestSimilarity >= 30) {
            $bestMatch['similarity'] = $bestSimilarity / 100;
            \Log::info("Found match for Q{$paperQuestion->question_number}", [
                'ocr_question_number' => $bestMatch['ocr_question_number'],
                'similarity' => round($bestSimilarity, 2) . '%'
            ]);
            return $bestMatch;
        }

        \Log::warning("No match found for Q{$paperQuestion->question_number}", [
            'best_similarity' => round($bestSimilarity, 2) . '%'
        ]);

        return null;
    }

    /**
     * Normalise text for matching: drop $-delimited LaTeX, tags and everything that is not a
     * letter or digit, then lower-case.
     *
     * @param string $text Input text
     * @return string Normalised text
     */
    private function normalizeTextForMatching(string $text): string
    {
        // preg_replace() yields null when a /u pattern meets invalid UTF-8; in that case the
        // step is skipped and the text is carried forward unchanged.

        // Remove LaTeX sections (delimited by $ or $$)
        $text = preg_replace('/\$\$?[^\$]+\$\$?/s', '', $text) ?? $text;

        // Remove HTML tags
        $text = strip_tags($text);

        // Remove punctuation and special characters
        $text = preg_replace('/[[:punct:]]/u', '', $text) ?? $text;
        $text = preg_replace('/[^\p{L}\p{N}]/u', '', $text) ?? $text;

        // Remove whitespace
        $text = preg_replace('/\s+/u', '', $text) ?? $text;

        return mb_strtolower($text);
    }

    /**
     * Top Y of a block: the smallest 'y' among its points.
     *
     * @param array $block Block with a 'position' list of points
     * @return int Top Y, or 0 when no point provides a 'y'
     */
    private function getBlockTopY(array $block): int
    {
        $position = $block['position'] ?? null;
        $ys = is_array($position) ? array_column($position, 'y') : [];

        // min() throws on an empty array; the cast makes the integer conversion explicit
        return $ys === [] ? 0 : (int) min($ys);
    }

    /**
     * Bottom Y of a block: the largest 'y' among its points.
     *
     * @param array $block Block with a 'position' list of points
     * @return int Bottom Y, or 0 when no point provides a 'y'
     */
    private function getBlockBottomY(array $block): int
    {
        $position = $block['position'] ?? null;
        $ys = is_array($position) ? array_column($position, 'y') : [];

        // max() throws on an empty array; the cast makes the integer conversion explicit
        return $ys === [] ? 0 : (int) max($ys);
    }
}