| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149 |
- <?php
- namespace App\Services;
- use Illuminate\Support\Facades\Log;
- use Illuminate\Support\Regex;
- class OCRDataParser
- {
/**
 * Turn a raw OCR payload into a list of structured question records.
 *
 * Pipeline: extract the text blocks, group them under the blocks typed as
 * question numbers, build one record per group and, when paper metadata is
 * supplied, refine the records against it.
 *
 * @param array      $ocrData   Raw data returned by the (Aliyun) OCR service.
 * @param array|null $paperInfo Optional system paper information used to refine the result.
 * @return array List of structured questions.
 */
public function parseStructuredQuestions(array $ocrData, ?array $paperInfo = null): array
{
    $questions = $this->processQuestionUnits(
        $this->groupBlocksByQuestionNumber(
            $this->extractTextBlocks($ocrData)
        )
    );

    // A null or empty $paperInfo skips the refinement step entirely.
    return $paperInfo
        ? $this->optimizeWithPaperInfo($questions, $paperInfo)
        : $questions;
}
/**
 * Extract every text block plus page metadata from an OCR payload.
 *
 * The payload may be wrapped in one or more nested 'raw' keys and its 'data'
 * entry may still be a JSON string; both are unwrapped here. Blocks are read
 * from data.page_list[*].answer_list[*] and returned ordered by the Y of
 * their first position point.
 *
 * @param array $ocrData Raw OCR payload.
 * @return array{blocks: array<int, array>, meta: array{height: mixed}}
 *               Each block has the keys text, position, confidence, type, ids.
 *               meta.height defaults to 2000 and is overwritten by every page
 *               that reports a height (the last page wins).
 */
private function extractTextBlocksAndMeta(array $ocrData): array
{
    $blocks = [];
    $meta = ['height' => 2000]; // Default height

    // Unwrap 'raw' envelopes, however deeply nested.
    while (isset($ocrData['raw']) && is_array($ocrData['raw'])) {
        $ocrData = $ocrData['raw'];
    }

    if (isset($ocrData['data']) && is_string($ocrData['data'])) {
        $ocrData['data'] = json_decode($ocrData['data'], true);
    }

    $pages = $ocrData['data']['page_list'] ?? [];
    if (!is_array($pages)) {
        $pages = [];
    }

    foreach ($pages as $page) {
        if (isset($page['height'])) {
            $meta['height'] = $page['height'];
        }

        $items = $page['answer_list'] ?? [];
        if (!is_array($items)) {
            continue;
        }

        foreach ($items as $item) {
            if (!isset($item['text']) || !is_scalar($item['text'])) {
                continue;
            }

            // Compare against '' instead of using empty(): empty('0') is true,
            // which silently dropped a recognised answer of "0".
            $text = trim((string) $item['text']);
            if ($text === '') {
                continue;
            }

            $position = null;
            if (!empty($item['content_list_info'])) {
                $position = $item['content_list_info'][0]['pos'] ?? null;
            }

            $blocks[] = [
                'text' => $text,
                'position' => $position,
                'confidence' => $item['confidence'] ?? null,
                // Classify the trimmed text: the patterns are anchored at the
                // start of the string, so leading whitespace would defeat them.
                'type' => $this->detectTextType($text),
                'ids' => $item['ids'] ?? [],
            ];
        }
    }

    // Sort by Y. A block without a position inherits the Y of the block that
    // precedes it in OCR order, and ties keep OCR order. This gives a total
    // order; returning 0 for "unknown" made the comparator non-transitive.
    $sortKeys = [];
    $inheritedY = 0;
    foreach ($blocks as $index => $block) {
        if (!empty($block['position'])) {
            $inheritedY = $block['position'][0]['y'] ?? 0;
        }
        $sortKeys[$index] = [$inheritedY, $index];
    }

    $order = array_keys($blocks);
    usort($order, static fn ($a, $b) => $sortKeys[$a] <=> $sortKeys[$b]);

    $sorted = [];
    foreach ($order as $index) {
        $sorted[] = $blocks[$index];
    }

    return ['blocks' => $sorted, 'meta' => $meta];
}
/**
 * Classify a text block by its leading marker or by keywords it contains.
 *
 * Checks run in this order and the first hit wins: section_header,
 * question_number, option, answer_area, title, student_info; anything else
 * is 'content'.
 *
 * Full-width punctuation is written as \x{....} escapes so the patterns
 * accept both the ASCII and the full-width form of brackets and periods:
 * FF08/FF09 are the full-width parentheses, FF0E the full-width full stop.
 *
 * @param string $text Text of one OCR block.
 * @return string One of the type names listed above.
 */
private function detectTextType(string $text): string
{
    // Section headers first, because they may also contain numbers.
    if (preg_match('/(一、选择题|二、填空题|三、解答题|本大题)/u', $text)) {
        return 'section_header';
    }

    // Question markers such as "1.", "1、", "1)", "(1)" or "三、".
    if (preg_match('/^[(\x{FF08}]?[一二三四五六七八九十\d]+[.\x{FF0E})\x{FF09}、]/u', $text)) {
        // A lone "(2)" or "(2)=" is part of an expression, not a question marker.
        if (preg_match('/^[(\x{FF08}]\d+[)\x{FF09}]=?\s*$/u', trim($text))) {
            return 'content';
        }

        // Require real content after the marker, or at least a score tag ("5分").
        if (mb_strlen($text) > 3 || preg_match('/\d+分/u', $text)) {
            return 'question_number';
        }
    }

    // Option markers such as "A.", "b)".
    if (preg_match('/^[A-Da-d][.\x{FF0E})\x{FF09}]/u', $text)) {
        return 'option';
    }

    // Answer-area labels and blank lines made of underscores.
    if (preg_match('/(得分|评卷人|答案|填空|解答|______|____)/u', $text)) {
        return 'answer_area';
    }

    // Paper title.
    if (preg_match('/(试卷|测试卷|考试|试题)/u', $text)) {
        return 'title';
    }

    // Student information fields.
    if (preg_match('/(姓名|班级|学号|年级)/u', $text)) {
        return 'student_info';
    }

    return 'content';
}
/**
 * Backward-compatible wrapper: returns only the 'blocks' part of
 * extractTextBlocksAndMeta() and discards the page metadata.
 */
private function extractTextBlocks(array $ocrData): array
{
    ['blocks' => $blocks] = $this->extractTextBlocksAndMeta($ocrData);

    return $blocks;
}
/**
 * Match every system question to a question detected in the OCR output and
 * extract the student's answer from the page region below the matched
 * question.
 *
 * @param array $ocrRawData     Raw OCR payload, optionally wrapped in a 'raw' key.
 * @param mixed $paperQuestions Iterable of system questions; each item is read as an
 *                              object with question_number, question_text and
 *                              question_type properties.
 * @return array Map of question_number => result with the keys student_answer,
 *               confidence, coordinates (y_min / y_max), debug_info and question_text.
 */
public function matchWithSystemPaper(array $ocrRawData, $paperQuestions): array
{
    // Only unwrap 'raw' when it really is an array; a scalar 'raw' used to be
    // passed to an array-typed parameter and raised a TypeError.
    $dataToParse = (isset($ocrRawData['raw']) && is_array($ocrRawData['raw']))
        ? $ocrRawData['raw']
        : $ocrRawData;

    // 1. Extract all text blocks and the page metadata.
    $extracted = $this->extractTextBlocksAndMeta($dataToParse);
    $blocks = $extracted['blocks'];
    $pageHeight = $extracted['meta']['height'];

    \Log::info('OCR匹配开始', [
        'total_blocks' => count($blocks),
        'page_height' => $pageHeight,
        // count() accepts plain arrays as well as Countable collections.
        'system_questions' => is_countable($paperQuestions) ? count($paperQuestions) : null,
    ]);

    // 2. Extract every question found in the OCR blocks.
    $ocrQuestions = $this->extractOCRQuestions($blocks);

    \Log::info('提取OCR题目', [
        'ocr_questions_count' => count($ocrQuestions),
        'ocr_question_numbers' => array_column($ocrQuestions, 'ocr_question_number'),
    ]);

    // 3. For every system question, find the best matching OCR question.
    $results = [];
    $usedOcrQuestions = [];    // OCR question numbers already consumed by a match
    $sectionHeaderTops = null; // computed lazily, once, on the first successful match

    foreach ($paperQuestions as $paperQuestion) {
        $questionNumber = $paperQuestion->question_number;
        $questionType = $paperQuestion->question_type ?? null;
        $systemQuestionText = strip_tags($paperQuestion->question_text ?? '');

        $bestMatch = $this->findBestMatchingOCRQuestion($ocrQuestions, $paperQuestion);
        $alreadyUsed = $bestMatch
            && in_array($bestMatch['ocr_question_number'], $usedOcrQuestions, true);

        if (!$bestMatch || $alreadyUsed) {
            \Log::warning("系统Q{$questionNumber} 未找到匹配的OCR题目");

            $results[$questionNumber] = [
                'student_answer' => '',
                'confidence' => 0,
                'coordinates' => [
                    'y_min' => 0,
                    'y_max' => 0,
                ],
                'debug_info' => [
                    'error' => '未找到匹配的OCR题目',
                ],
                'question_text' => $systemQuestionText,
            ];
            continue;
        }

        $usedOcrQuestions[] = $bestMatch['ocr_question_number'];

        $sectionHeaderTops ??= $this->collectSectionHeaderTops($blocks);
        [$yStart, $yEnd] = $this->resolveAnswerRegion(
            $bestMatch,
            $ocrQuestions,
            $usedOcrQuestions,
            $questionType,
            $pageHeight,
            $sectionHeaderTops
        );

        \Log::debug("Q{$questionNumber} Y范围", [
            'y_start' => $yStart,
            'y_end' => $yEnd,
            'range' => $yEnd - $yStart,
        ]);

        // Blocks whose vertical centre lies strictly inside the region.
        $answerBlocks = [];
        foreach ($blocks as $block) {
            $blockY = $this->getBlockCenterY($block);
            if ($blockY > $yStart && $blockY < $yEnd) {
                $answerBlocks[] = $block;
            }
        }

        $studentAnswer = $this->extractAnswerFromBlocks(
            $answerBlocks,
            $systemQuestionText,
            $questionType
        );

        // Average OCR confidence of the answer blocks (reported in debug_info only).
        $confidences = [];
        foreach ($answerBlocks as $block) {
            if (isset($block['confidence'])) {
                $confidences[] = $block['confidence'];
            }
        }
        $avgConfidence = !empty($confidences) ? array_sum($confidences) / count($confidences) : 0;

        $results[$questionNumber] = [
            'student_answer' => trim($studentAnswer),
            'confidence' => $bestMatch['similarity'], // match similarity is used as the confidence
            'coordinates' => [
                'y_min' => $yStart,
                'y_max' => $yEnd,
            ],
            'debug_info' => [
                'y_start' => $yStart,
                'y_end' => $yEnd,
                'block_count' => count($answerBlocks),
                'system_question_length' => mb_strlen($systemQuestionText),
                'ocr_question_number' => $bestMatch['ocr_question_number'],
                'match_similarity' => round($bestMatch['similarity'] * 100, 2) . '%',
                'ocr_confidence' => round($avgConfidence * 100, 2) . '%',
            ],
            'question_text' => $systemQuestionText,
        ];

        \Log::info("系统Q{$questionNumber} 匹配到 OCR Q{$bestMatch['ocr_question_number']}", [
            'similarity' => round($bestMatch['similarity'] * 100, 2) . '%',
            'student_answer_preview' => mb_substr($studentAnswer, 0, 50),
        ]);
    }

    \Log::info('OCR匹配完成', [
        // Compare against '' rather than empty(): an answer of "0" is a real answer.
        'matched_count' => count(array_filter($results, fn ($r) => $r['student_answer'] !== '')),
        'total_count' => count($results),
    ]);

    return $results;
}

/**
 * Top Y of every block classified as a section header, in block order.
 */
private function collectSectionHeaderTops(array $blocks): array
{
    $tops = [];
    foreach ($blocks as $block) {
        if ($this->detectTextType($block['text']) === 'section_header') {
            $tops[] = $this->getBlockTopY($block);
        }
    }

    return $tops;
}

/**
 * Compute the vertical range [yStart, yEnd] searched for one question's answer.
 *
 * @param array $ocrQuestion       Matched OCR question (y_start / y_end are read).
 * @param array $ocrQuestions      All OCR questions.
 * @param array $usedNumbers       OCR question numbers already matched; questions
 *                                 carrying one of them are not used as lower boundary.
 * @param mixed $questionType      System question type ('choice', 'fill', 'answer' or other).
 * @param mixed $pageHeight        Page height taken from the OCR metadata.
 * @param array $sectionHeaderTops Top Y of every section header block.
 * @return array Two-element list: yStart, yEnd.
 */
private function resolveAnswerRegion(
    array $ocrQuestion,
    array $ocrQuestions,
    array $usedNumbers,
    $questionType,
    $pageHeight,
    array $sectionHeaderTops
): array {
    $blockTop = $ocrQuestion['y_start'] ?? 0;
    $blockBottom = $ocrQuestion['y_end'] ?? $blockTop;

    // Start slightly above the question marker so an answer written on the
    // same line as the marker is still inside the region.
    $yStart = max(0, $blockTop - 10);

    // Closest not-yet-matched OCR question below the current one.
    $nextOcrY = null;
    foreach ($ocrQuestions as $candidate) {
        if ($candidate['y_start'] > $blockTop
            && !in_array($candidate['ocr_question_number'], $usedNumbers, true)
            && ($nextOcrY === null || $candidate['y_start'] < $nextOcrY)
        ) {
            $nextOcrY = $candidate['y_start'];
        }
    }

    // How far to extend below the question, by type (short for choice / fill,
    // long for worked answers).
    $heightMap = [
        'choice' => 120,
        'fill' => 180,
        'answer' => 360,
    ];
    $extend = $heightMap[$questionType ?? ''] ?? 220;

    // End 5px above the next question or at the default extension, whichever
    // is smaller, but never less than 120px below the start.
    $yEnd = min($pageHeight, $blockBottom + $extend);
    if ($nextOcrY !== null) {
        $yEnd = max(min($yEnd, $nextOcrY - 5), $yStart + 120);
    }

    // The first section header (in block order) inside the region becomes the boundary.
    foreach ($sectionHeaderTops as $headerTop) {
        if ($headerTop > $yStart && $headerTop < $yEnd) {
            $yEnd = min($yEnd, $headerTop - 5);
            break;
        }
    }

    // Minimum height fallback.
    if ($yEnd - $yStart < 80) {
        $yEnd = min($pageHeight, $yStart + 80);
    }

    // Make sure the range is valid: give it 200px if it collapsed.
    if ($yEnd <= $yStart) {
        $yEnd = $yStart + 200;
    }

    return [$yStart, $yEnd];
}
/**
 * Partition the blocks into per-question groups along the Y axis.
 *
 * Every block typed 'question_number' is an anchor. A group holds all blocks
 * whose vertical centre lies in [anchor Y, next anchor Y); the last group is
 * open-ended.
 *
 * Blocks without coordinates have no centre and are not assigned to any
 * group (comparing null against a Y value used to attach them to whichever
 * question happened to start at Y = 0). An anchor without coordinates cannot
 * delimit a range, so its group contains only the anchor block itself.
 *
 * @param array $blocks Blocks carrying the keys type, text and position.
 * @return array List of units with the keys question_number, question_text,
 *               blocks and y_range (start / end).
 */
private function groupBlocksByQuestionNumber(array $blocks): array
{
    // Compute each centre once instead of once per (question, block) pair.
    $centers = [];
    foreach ($blocks as $index => $block) {
        $centers[$index] = $this->getBlockCenterY($block);
    }

    // Step 1: collect the anchors.
    $anchors = [];
    foreach ($blocks as $index => $block) {
        if ($block['type'] === 'question_number') {
            $anchors[] = [
                'index' => $index,
                'text' => $block['text'],
                'y' => $centers[$index],
                'number' => $this->extractQuestionNumber($block['text']),
            ];
        }
    }

    // Step 2: assign blocks to anchors.
    $questionUnits = [];
    $anchorCount = count($anchors);

    foreach ($anchors as $i => $anchor) {
        if ($anchor['y'] === null) {
            $questionUnits[] = [
                'question_number' => $anchor['number'],
                'question_text' => $anchor['text'],
                'blocks' => [$blocks[$anchor['index']]],
                'y_range' => ['start' => null, 'end' => null],
            ];
            continue;
        }

        // The range ends at the next anchor that has a known Y.
        $yEnd = PHP_INT_MAX;
        for ($j = $i + 1; $j < $anchorCount; $j++) {
            if ($anchors[$j]['y'] !== null) {
                $yEnd = $anchors[$j]['y'];
                break;
            }
        }

        $members = [];
        foreach ($blocks as $index => $block) {
            $y = $centers[$index];
            if ($y !== null && $y >= $anchor['y'] && $y < $yEnd) {
                $members[] = $block;
            }
        }

        $questionUnits[] = [
            'question_number' => $anchor['number'],
            'question_text' => $anchor['text'],
            'blocks' => $members,
            'y_range' => ['start' => $anchor['y'], 'end' => $yEnd],
        ];
    }

    return $questionUnits;
}
/**
 * Build one structured record per question unit.
 *
 * Only blocks typed 'content', 'option' and 'answer_area' contribute; blocks
 * of any other type are ignored here.
 *
 * @param array $questionUnits Units with the keys question_number and blocks.
 * @return array List of records with the keys question_number, content,
 *               options, answer and confidence.
 */
private function processQuestionUnits(array $questionUnits): array
{
    $structured = [];

    foreach ($questionUnits as $unit) {
        $stemParts = [];
        $choices = [];
        $answerTexts = [];

        foreach ($unit['blocks'] as $block) {
            $kind = $block['type'];

            if ($kind === 'content') {
                $stemParts[] = $block['text'];
            } elseif ($kind === 'option') {
                // Split "A. text" into letter and text; keep the raw text when
                // the marker cannot be parsed.
                $parsed = preg_match('/^([A-Da-d])[\.\))]\s*(.*)/', $block['text'], $hit);
                $choices[] = $parsed
                    ? ['letter' => strtoupper($hit[1]), 'content' => trim($hit[2])]
                    : ['letter' => '', 'content' => $block['text']];
            } elseif ($kind === 'answer_area') {
                // Text found in answer areas is searched for the answer later.
                $answerTexts[] = $block['text'];
            }
        }

        $structured[] = [
            'question_number' => $unit['question_number'],
            'content' => $this->mergeContentParts($stemParts),
            'options' => $choices,
            'answer' => $this->extractAnswerFromAnswerAreas($answerTexts),
            'confidence' => $this->calculateConfidence($unit['blocks']),
        ];
    }

    return $structured;
}
/**
 * Join the content fragments of a question into one string.
 *
 * Fragments that start with a question marker ("1.", "(2)", "3、" ...) are
 * dropped. The (?!\d) look-ahead keeps short decimals such as "0.5", which
 * start with the same characters as a marker. Remaining fragments are joined
 * with single spaces.
 *
 * @param array $contentParts Text fragments in reading order.
 * @return string Merged, trimmed content.
 */
private function mergeContentParts(array $contentParts): string
{
    // /u is required: without it "、" inside the class is treated as three
    // separate bytes and matches unrelated multibyte characters.
    $markerPattern = '/^[(\x{FF08}]?\d+[.\x{FF0E})\x{FF09}、](?!\d)/u';

    $kept = [];
    foreach ($contentParts as $part) {
        if (preg_match($markerPattern, $part)) {
            continue;
        }
        $kept[] = $part;
    }

    // implode() always inserts the separator; the previous truthiness test on
    // the accumulated string skipped it when the text so far was "0".
    return trim(implode(' ', $kept));
}
/**
 * Pick an answer out of the texts found in answer areas.
 *
 * The first area containing a letter A-D (either case) decides the answer
 * immediately. Otherwise the first non-blank token of the last area that is
 * not a score box ("得分" / "评卷人") is used.
 *
 * @param array $answerAreas Answer-area texts in reading order.
 * @return string The answer, or '' when nothing usable was found.
 */
private function extractAnswerFromAnswerAreas(array $answerAreas): string
{
    $fallback = '';

    foreach ($answerAreas as $areaText) {
        // A choice letter wins outright.
        if (preg_match('/([A-Da-d])/', $areaText, $letter)) {
            return strtoupper($letter[1]);
        }

        // Fill-in answer: remember the first token, later areas override it.
        $isScoreBox = preg_match('/(得分|评卷人)/', $areaText);
        if (!$isScoreBox && preg_match('/\S+/', $areaText, $token)) {
            $fallback = trim($token[0]);
        }
    }

    return $fallback;
}
/**
 * Average the OCR confidence of the given blocks.
 *
 * Blocks whose confidence is null are left out of the average. When no block
 * carries a confidence, 0.8 is returned.
 *
 * @param array $blocks Blocks with a 'confidence' key.
 * @return float Mean confidence, or 0.8 as the fallback.
 */
private function calculateConfidence(array $blocks): float
{
    $scores = array_filter(
        array_map(static fn (array $block) => $block['confidence'], $blocks),
        static fn ($score) => $score !== null
    );

    return $scores === []
        ? 0.8
        : array_sum($scores) / count($scores);
}
/**
 * Vertical centre of a block: the midpoint between the smallest and the
 * largest 'y' among its position points, truncated to an integer.
 *
 * @param array $block Block with a 'position' list of points.
 * @return int|null The centre, or null when the block has no usable 'y' values.
 */
private function getBlockCenterY(array $block): ?int
{
    $position = $block['position'] ?? null;
    if (!is_array($position) || $position === []) {
        return null;
    }

    $yValues = [];
    foreach ($position as $point) {
        if (isset($point['y'])) {
            $yValues[] = $point['y'];
        }
    }

    if ($yValues === []) {
        return null;
    }

    // The midpoint is fractional whenever min + max is odd. Cast explicitly:
    // returning a fractional float from an int-typed method relies on an
    // implicit conversion that is deprecated since PHP 8.1.
    return (int) ((min($yValues) + max($yValues)) / 2);
}
/**
 * Extract the question number from the text of a question marker.
 *
 * A leading marker is read first, either Arabic ("12.", "(3)") or a Chinese
 * numeral from 一 to 九十九 ("三、", "十二、"). Without this step a text like
 * "三、计算 12+5" yielded 12, the first digits of the question body. When no
 * leading marker is present, the first run of digits anywhere in the text is
 * used.
 *
 * @param string $text Marker text.
 * @return int The question number, or 0 when none can be determined.
 */
private function extractQuestionNumber(string $text): int
{
    // \x{FF08} is the full-width opening parenthesis.
    if (preg_match('/^\s*[(\x{FF08}]?\s*(?:([0-9]+)|([一二三四五六七八九十]+))/u', $text, $lead)) {
        if ($lead[1] !== '') {
            return (int) $lead[1];
        }

        $digitValues = [
            '一' => 1, '二' => 2, '三' => 3, '四' => 4, '五' => 5,
            '六' => 6, '七' => 7, '八' => 8, '九' => 9,
        ];
        $numeral = $lead[2] ?? '';
        $value = 0;

        if (str_contains($numeral, '十')) {
            // "<tens>十<ones>": both sides are optional ("十" = 10, "二十" = 20, "十三" = 13).
            [$tens, $ones] = explode('十', $numeral, 2);
            $tensValue = $tens === '' ? 1 : ($digitValues[$tens] ?? null);
            $onesValue = $ones === '' ? 0 : ($digitValues[$ones] ?? null);
            if ($tensValue !== null && $onesValue !== null) {
                $value = $tensValue * 10 + $onesValue;
            }
        } else {
            $value = $digitValues[$numeral] ?? 0;
        }

        if ($value > 0) {
            return $value;
        }
    }

    // Fallback: first digit run anywhere in the text.
    if (preg_match('/[\d]+/', $text, $matches)) {
        return (int) $matches[0];
    }

    return 0;
}
/**
 * Refine parsed questions with the system paper's question list.
 *
 * Parsed questions are paired with system questions by question_number.
 * A paired question is passed through the type-specific and answer-specific
 * refinements when the system question defines question_type and
 * correct_answer respectively; unpaired questions are returned unchanged.
 *
 * @param array $questions Parsed questions.
 * @param array $paperInfo Paper information; its 'questions' list is read.
 * @return array Refined questions, in the original order.
 */
private function optimizeWithPaperInfo(array $questions, array $paperInfo): array
{
    // Index the system questions by their number.
    $byNumber = [];
    foreach ($paperInfo['questions'] ?? [] as $systemQuestion) {
        $byNumber[$systemQuestion['question_number']] = $systemQuestion;
    }

    $refined = [];
    foreach ($questions as $question) {
        $reference = $byNumber[$question['question_number']] ?? null;

        if ($reference !== null) {
            if (isset($reference['question_type'])) {
                $question = $this->optimizeByQuestionType($question, $reference['question_type']);
            }
            if (isset($reference['correct_answer'])) {
                $question = $this->optimizeAnswer($question, $reference['correct_answer']);
            }
        }

        $refined[] = $question;
    }

    return $refined;
}
/**
 * Apply the refinement that belongs to the system question type.
 *
 * - 'choice': when no options were parsed and the content contains a letter
 *   A-D, try to extract the options from the content.
 * - 'fill':   record the blanks found in the content under 'blanks'.
 * - 'answer': copy the content to 'full_solution'.
 * Any other type leaves the question untouched.
 *
 * @param array  $question     Parsed question.
 * @param string $questionType System question type.
 * @return array The refined question.
 */
private function optimizeByQuestionType(array $question, string $questionType): array
{
    if ($questionType === 'choice') {
        $mayContainOptions = empty($question['options'])
            && preg_match('/[A-Da-d]/', $question['content']);
        if ($mayContainOptions) {
            $question['options'] = $this->extractOptionsFromContent($question['content']);
        }
    } elseif ($questionType === 'fill') {
        $question['blanks'] = $this->findFillBlanks($question['content']);
    } elseif ($questionType === 'answer') {
        $question['full_solution'] = $question['content'];
    }

    return $question;
}
/**
 * Normalise the answer of a question that has options (a choice question).
 * Questions without options are returned unchanged.
 *
 * @param array  $question      Parsed question.
 * @param string $correctAnswer Correct answer of the system question; forwarded
 *                              to the normaliser.
 * @return array The question, with a normalised 'answer' when it has options.
 */
private function optimizeAnswer(array $question, string $correctAnswer): array
{
    if (empty($question['options'])) {
        return $question;
    }

    $question['answer'] = $this->normalizeChoiceAnswer($question['answer'], $correctAnswer);

    return $question;
}
/**
 * Normalise a student's choice answer to a capital letter.
 *
 * Circled numbers ①-④ and the digits 1-4 map to A-D; any other value is
 * trimmed and upper-cased. $correctAnswer is accepted but not consulted.
 *
 * @param string $studentAnswer Raw answer.
 * @param string $correctAnswer Correct answer (unused).
 * @return string Normalised answer.
 */
private function normalizeChoiceAnswer(string $studentAnswer, string $correctAnswer): string
{
    $answer = trim($studentAnswer);

    return match ($answer) {
        '①', '1' => 'A',
        '②', '2' => 'B',
        '③', '3' => 'C',
        '④', '4' => 'D',
        default => strtoupper($answer),
    };
}
/**
 * Extract options from multi-line content.
 *
 * The content is split on "\n"; every trimmed line that starts with a letter
 * A-D (either case) followed by "." or ")" becomes one option.
 *
 * @param string $content Question content.
 * @return array List of options with the keys letter (upper-cased) and content.
 */
private function extractOptionsFromContent(string $content): array
{
    $parsedLines = array_map(
        static function (string $line): ?array {
            if (!preg_match('/^([A-Da-d])[\.\))]\s*(.*)/', trim($line), $hit)) {
                return null;
            }

            return [
                'letter' => strtoupper($hit[1]),
                'content' => trim($hit[2]),
            ];
        },
        explode("\n", $content)
    );

    // Drop the lines that were not options and reindex from 0.
    return array_values(array_filter($parsedLines));
}
/**
 * Find the fill-in positions of a question: runs of two or more underscores
 * and bracket pairs (ASCII or full-width) together with their contents.
 *
 * The full-width brackets are written as \x{FF08} / \x{FF09}. The previous
 * pattern contained a bare "([\s\S]*?)" alternative, which matches the empty
 * string and flooded the result with empty and one-character matches.
 *
 * @param string $content Question content.
 * @return array List of matched blank markers, in order of appearance.
 */
private function findFillBlanks(string $content): array
{
    $pattern = '/_{2,}|\x{FF08}.*?\x{FF09}|\(.*?\)/us';

    if (preg_match_all($pattern, $content, $matches)) {
        return $matches[0];
    }

    return [];
}
/**
 * Build a human-readable debug report of a parsing run.
 *
 * The report lists the number of raw text blocks, how many blocks there are
 * of each type, and a short summary of every structured question.
 *
 * @param array $ocrData             Raw OCR payload.
 * @param array $structuredQuestions Structured questions with the keys
 *                                   question_number, content, options, answer, confidence.
 * @return string The report text.
 */
public function generateDebugOutput(array $ocrData, array $structuredQuestions): string
{
    $output = "=== OCR数据解析调试输出 ===\n\n";

    // Raw data statistics.
    $blocks = $this->extractTextBlocks($ocrData);
    $output .= "1. 原始文本块数量: " . count($blocks) . "\n";

    $typeStats = [];
    foreach ($blocks as $block) {
        $type = $block['type'];
        $typeStats[$type] = ($typeStats[$type] ?? 0) + 1;
    }
    $output .= " 类型分布: " . json_encode($typeStats, JSON_UNESCAPED_UNICODE) . "\n\n";

    // Structured questions.
    $output .= "2. 识别到的题目数量: " . count($structuredQuestions) . "\n";
    foreach ($structuredQuestions as $i => $q) {
        // mb_substr cuts on character boundaries; byte-wise substr() could
        // split a multibyte character and emit invalid UTF-8.
        $preview = mb_substr($q['content'], 0, 100);
        // Compare against '' so that an answer of "0" is shown, not replaced.
        $answer = ((string) $q['answer']) !== '' ? $q['answer'] : '未识别';

        $output .= "\n题目 " . ($i + 1) . " (题号: {$q['question_number']}):\n";
        $output .= " - 内容: " . $preview . "...\n";
        $output .= " - 选项数: " . count($q['options']) . "\n";
        $output .= " - 答案: " . $answer . "\n";
        $output .= " - 置信度: " . round($q['confidence'] * 100, 2) . "%\n";
    }

    return $output;
}
/**
 * Locate the block that marks the start of a system question.
 *
 * Strategies, first hit wins:
 *  1.   a block starting with the question number followed by a delimiter
 *       ("1.", "1、", "(1)" ...);
 *  1.5  a short block that is the bare number, or the number followed by
 *       whitespace (OCR sometimes splits "1." into separate blocks);
 *  2.   a block containing, or closely resembling, the first 15 characters
 *       of the question text.
 *
 * @param array $blocks        OCR blocks with the keys text and position.
 * @param mixed $paperQuestion Object with question_number and question_text.
 * @return array|null Coordinates of the anchor block, or null when none is found.
 */
public function findQuestionAnchor(array $blocks, $paperQuestion): ?array
{
    $qNum = (string) ($paperQuestion->question_number ?? '');
    // Quote the number: a value such as "1.1" would otherwise act as a pattern.
    $qNumPattern = preg_quote($qNum, '/');

    $cleanContent = strip_tags($paperQuestion->question_text ?? '');
    $cleanContent = preg_replace('/\s+/', '', $cleanContent);

    if ($qNum !== '') {
        // Strategy 1: number followed by a delimiter. /u makes "、" and the
        // full-width escapes single characters instead of byte sequences.
        $withDelimiter = '/^[(\x{FF08}]?' . $qNumPattern . '[.\x{FF0E})\x{FF09}、]/u';
        foreach ($blocks as $block) {
            if (preg_match($withDelimiter, $block['text'])) {
                return $this->getBlockCoordinates($block);
            }
        }

        // Strategy 1.5: bare number. Only short blocks qualify, so that "1"
        // is not picked up from the start of a longer text.
        $bareNumber = '/^' . $qNumPattern . '\s+/';
        foreach ($blocks as $block) {
            $looksLikeNumber = preg_match($bareNumber, $block['text']) || $block['text'] === $qNum;
            if ($looksLikeNumber && strlen($block['text']) < 5) {
                return $this->getBlockCoordinates($block);
            }
        }
    }

    // Strategy 2: match on the first characters of the question text.
    $prefix = mb_substr($cleanContent, 0, 15);
    if (mb_strlen($prefix) > 2) {
        $prefixLength = mb_strlen($prefix);

        foreach ($blocks as $block) {
            $blockText = preg_replace('/\s+/', '', $block['text']);

            // Plain containment check.
            if (mb_strpos($blockText, $prefix) !== false) {
                return $this->getBlockCoordinates($block);
            }

            // Fuzzy comparison against the start of the block.
            similar_text($prefix, mb_substr($blockText, 0, $prefixLength + 5), $percent);
            if ($percent > 80) {
                return $this->getBlockCoordinates($block);
            }
        }
    }

    return null;
}
/**
 * Bounding coordinates of a block.
 *
 * The result always has the same four keys. An extent is 0 when the block
 * has no position, or when none of its points carries that coordinate
 * (min() / max() on an empty list throw a ValueError in PHP 8).
 *
 * @param array $block Block with a 'position' list of points (x / y).
 * @return array{y_top: mixed, y_bottom: mixed, x_left: mixed, x_right: mixed}
 */
private function getBlockCoordinates(array $block): array
{
    $position = $block['position'] ?? null;
    $points = is_array($position) ? $position : [];

    $ys = array_column($points, 'y');
    $xs = array_column($points, 'x');

    return [
        'y_top' => $ys === [] ? 0 : min($ys),
        'y_bottom' => $ys === [] ? 0 : max($ys),
        'x_left' => $xs === [] ? 0 : min($xs),
        'x_right' => $xs === [] ? 0 : max($xs),
    ];
}
/**
 * Extract the student's answer from the OCR result of a cropped region by
 * removing the printed question text.
 *
 * Strategies, first hit wins:
 *  A. the normalised OCR text starts with the normalised question text:
 *     strip that prefix;
 *  B. the tail of the question text is found inside the OCR text: keep what
 *     follows it;
 *  C. heuristics: text from a "解:" / "答:" label onwards (the label itself
 *     is kept), or the token after a trailing "=".
 * When nothing applies, the OCR text is returned unchanged.
 *
 * @param array  $cropResult         OCR result; 'content' or the 'content' of each
 *                                   entry in 'questions' is read.
 * @param string $systemQuestionText Question text stored in the system.
 * @return string The extracted answer; '' when the OCR result has no text.
 */
public function extractAnswerFromCrop(array $cropResult, string $systemQuestionText): string
{
    // 1. Full OCR text.
    $ocrText = '';
    if (isset($cropResult['content']) && is_scalar($cropResult['content'])) {
        $ocrText = (string) $cropResult['content'];
    } elseif (isset($cropResult['questions']) && is_array($cropResult['questions'])) {
        $ocrText = implode("\n", array_column($cropResult['questions'], 'content'));
    }

    // Compare against '': empty('0') is true and discarded an answer of "0".
    if ($ocrText === '') {
        return '';
    }

    // 2. Normalised forms for comparison.
    $normalizedOcr = $this->normalizeTextForComparison($ocrText);
    $normalizedSystem = $this->normalizeTextForComparison($systemQuestionText);

    // 3A. OCR text starts with the question text.
    if (str_starts_with($normalizedOcr, $normalizedSystem)) {
        return trim($this->removePrefixFuzzy($ocrText, $systemQuestionText));
    }

    // 3B. Split after the tail of the question text. findSplitIndex() returns
    // a character offset, so it has to be applied with mb_substr(); byte-wise
    // substr() cut at the wrong place in multibyte text.
    $splitIndex = $this->findSplitIndex($ocrText, $systemQuestionText);
    if ($splitIndex > 0) {
        return trim(mb_substr($ocrText, $splitIndex));
    }

    // 3C. Heuristic: explicit solution / answer label, with an ASCII or a
    // full-width (\x{FF1A}) colon.
    if (preg_match('/(解[:\x{FF1A}]|答[:\x{FF1A}])(.*)/su', $ocrText, $matches)) {
        return trim($matches[0]);
    }

    // 3C. Heuristic: "... = 3" at the end; take what follows the equals sign.
    if (preg_match('/=\s*(\S+)$/', $ocrText, $matches)) {
        return trim($matches[1]);
    }

    return $ocrText;
}
/**
 * Reduce a text to a comparable form: HTML tags, all whitespace and all
 * punctuation / symbols are removed and letters are lower-cased.
 *
 * Punctuation is matched by Unicode property, so CJK and full-width
 * punctuation is removed as well; the previous ASCII-only class left it in
 * place and made otherwise equal texts differ.
 *
 * @param string $text Raw text.
 * @return string Normalised text.
 */
private function normalizeTextForComparison(string $text): string
{
    $text = strip_tags($text);

    // With /u, preg_replace() returns null for input that is not valid UTF-8;
    // fall back to the ASCII-only cleanup in that case.
    $clean = preg_replace(['/\s+/u', '/[\p{P}\p{S}]/u'], '', $text);
    if ($clean === null) {
        $clean = preg_replace(['/\s+/', '/[[:punct:]]/'], '', $text) ?? $text;
        return strtolower($clean);
    }

    return mb_strtolower($clean, 'UTF-8');
}
/**
 * Remove a leading question text from the full OCR text.
 *
 * The start of $fullText (the byte length of $prefix plus 10 bytes) is
 * compared with $prefix; when they are more than 80% similar, the first
 * strlen($prefix) bytes are cut off. Otherwise $fullText is returned as is.
 *
 * @param string $fullText OCR text expected to start with the question.
 * @param string $prefix   Question text to remove.
 * @return string The text with the prefix removed.
 */
private function removePrefixFuzzy(string $fullText, string $prefix): string
{
    // Assumption: the question part of the OCR text is about as long as the
    // system question text.
    $prefixLen = strlen($prefix);
    $potentialPrefix = substr($fullText, 0, $prefixLen + 10); // take a little more

    similar_text($potentialPrefix, $prefix, $percent);
    if ($percent > 80) {
        // mb_strcut() moves the cut back to the start of the character when
        // the byte offset falls inside a multibyte character; substr() would
        // return a broken character there.
        return mb_strcut($fullText, $prefixLen, null, 'UTF-8');
    }

    return $fullText;
}
/**
 * Find where the question text ends inside the OCR text.
 *
 * The last 10 characters of $systemText (the whole text when it is shorter)
 * serve as the anchor; the result is the character offset just behind the
 * first occurrence of that anchor in $ocrText.
 *
 * @param string $ocrText    OCR text.
 * @param string $systemText Question text stored in the system.
 * @return int Character offset (as used by the mb_* functions), or 0 when
 *             the anchor does not occur.
 */
private function findSplitIndex(string $ocrText, string $systemText): int
{
    // mb_substr() with a negative start already yields the whole string when
    // the text has fewer than 10 characters.
    $anchor = mb_strlen($systemText) > 10
        ? mb_substr($systemText, -10)
        : $systemText;

    $anchorStart = mb_strpos($ocrText, $anchor);

    return $anchorStart === false
        ? 0
        : $anchorStart + mb_strlen($anchor);
}
/**
 * Find the index of the first block that belongs to the answer by matching
 * the leading blocks against the system question text.
 *
 * Block texts are accumulated one by one. The question is considered covered
 * once the normalised accumulation is more than 80% similar to the normalised
 * question text, or once it is over 1.2 times as long while still more than
 * 60% similar (a length-based fallback for noisy OCR).
 *
 * @param array  $blocks     Blocks with a 'text' key, in reading order.
 * @param string $systemText Question text stored in the system.
 * @return int Index of the first answer block; 0 when no match was found.
 */
private function findAnswerStartIndex(array $blocks, string $systemText): int
{
    $target = $this->normalizeTextForComparison($systemText);
    $targetLength = mb_strlen($target);
    $runningText = '';

    foreach ($blocks as $index => $block) {
        $runningText .= $block['text'];
        $candidate = $this->normalizeTextForComparison($runningText);

        similar_text($candidate, $target, $percent);

        \Log::debug("Block {$index} accumulated similarity: {$percent}%", [
            'accumulated_length' => mb_strlen($runningText),
            'system_length' => mb_strlen($systemText)
        ]);

        $stemCovered = $percent > 80;
        $overshotButClose = $percent > 60 && mb_strlen($candidate) > $targetLength * 1.2;

        if ($stemCovered || $overshotButClose) {
            return $index + 1; // the answer starts with the next block
        }
    }

    // No match: start from the first block (conservative choice).
    return 0;
}
/**
 * Extract the student's answer from the blocks of an answer region, leaving
 * out the printed question text.
 *
 * Strategies, first hit wins:
 *  1. a block containing an answer label ("答:", "答案:", "解:", "解答:",
 *     with an ASCII or a full-width colon): the text after the label;
 *  2. choice questions: a block that is a single letter A-D;
 *  3. fill-in questions: a short block (<= 25 characters) containing a digit
 *     or "=" that does not occur in the question text;
 *  4. otherwise: the shortest block that does not occur in the question text.
 * Strategies 2-4 skip blocks classified as question number, section header
 * or option.
 *
 * @param array       $blocks             Blocks of the answer region.
 * @param string      $systemQuestionText Question text stored in the system.
 * @param string|null $questionType       'choice', 'fill' or anything else.
 * @return string The answer, or '' when none was found.
 */
private function extractAnswerFromBlocks(array $blocks, string $systemQuestionText, ?string $questionType = null): string
{
    if (empty($blocks)) {
        return '';
    }

    // Strategy 1: explicit answer label.
    foreach ($blocks as $block) {
        foreach (['答', '答案', '解', '解答'] as $label) {
            foreach ([':', "\u{FF1A}"] as $colon) {
                $keyword = $label . $colon;
                if (!str_contains($block['text'], $keyword)) {
                    continue;
                }

                // explode() treats the keyword literally (mb_split() takes a
                // regular expression) and, with limit 2, keeps everything
                // after the first label.
                $remainder = trim(explode($keyword, $block['text'], 2)[1]);

                // A label with nothing behind it is not an answer; keep
                // looking instead of returning an empty string.
                if ($remainder !== '') {
                    return $remainder;
                }
            }
        }
    }

    // Classify each block once; all remaining strategies share the result.
    $texts = [];
    foreach ($blocks as $block) {
        $text = trim($block['text']);
        $type = $this->detectTextType($text);
        if ($type === 'question_number' || $type === 'section_header' || $type === 'option') {
            continue;
        }
        $texts[] = $text;
    }

    $normalizedQuestion = $this->normalizeTextForComparison($systemQuestionText);

    // Strategy 2: choice question, single letter.
    if ($questionType === 'choice') {
        foreach ($texts as $text) {
            if (preg_match('/^[A-Da-d][.\x{FF0E})\x{FF09}]?$/u', $text)) {
                return strtoupper(preg_replace('/[^A-D]/i', '', $text));
            }
        }
    }

    // Strategy 3: fill-in question, short number or equation.
    if ($questionType === 'fill') {
        foreach ($texts as $text) {
            if (mb_strlen($text) <= 25 && preg_match('/[\\d=]/u', $text)) {
                $normalizedText = $this->normalizeTextForComparison($text);
                if (!str_contains($normalizedQuestion, $normalizedText)) {
                    return $text;
                }
            }
        }
    }

    // Strategy 4: shortest fragment that is not part of the question text.
    // Fragments containing "分" (score tags) are ignored.
    $best = null;
    foreach ($texts as $text) {
        if ($text === '' || str_contains($text, '分')) {
            continue;
        }

        $normalizedText = $this->normalizeTextForComparison($text);
        if ($normalizedText === '' || str_contains($normalizedQuestion, $normalizedText)) {
            continue;
        }

        // Strict "<" keeps the earliest fragment among equally short ones.
        if ($best === null || mb_strlen($text) < mb_strlen($best)) {
            $best = $text;
        }
    }

    return $best === null ? '' : trim($best);
}
/**
 * Split the OCR blocks into questions, using the blocks classified as
 * question numbers as delimiters.
 *
 * A question starts at a question-number block and collects every following
 * block until the next question number or section header. Blocks that appear
 * before the first question, or after a section header, are not collected.
 *
 * @param array $blocks OCR blocks in reading order.
 * @return array List of questions with the keys ocr_question_number,
 *               question_text, blocks, y_start and y_end.
 */
private function extractOCRQuestions(array $blocks): array
{
    $ocrQuestions = [];
    $currentQuestion = null;

    foreach ($blocks as $block) {
        $type = $this->detectTextType($block['text']);

        if ($type === 'question_number') {
            // A new question starts: store the previous one.
            if ($currentQuestion !== null) {
                $ocrQuestions[] = $currentQuestion;
            }

            $currentQuestion = [
                'ocr_question_number' => $this->extractQuestionNumber($block['text']),
                'question_text' => $block['text'],
                'blocks' => [$block],
                'y_start' => $this->getBlockTopY($block),
                'y_end' => $this->getBlockBottomY($block),
            ];
        } elseif ($type === 'section_header') {
            // A section header closes the current question.
            if ($currentQuestion !== null) {
                $ocrQuestions[] = $currentQuestion;
                $currentQuestion = null;
            }
        } elseif ($currentQuestion !== null) {
            $currentQuestion['blocks'][] = $block;
            $currentQuestion['question_text'] .= ' ' . $block['text'];
            // Keep the lowest extent seen so far. Overwriting it with the
            // latest block's bottom let a block without coordinates (bottom
            // 0) or a shorter block move the end of the question upwards.
            $currentQuestion['y_end'] = max(
                $currentQuestion['y_end'],
                $this->getBlockBottomY($block)
            );
        }
    }

    // Store the last question.
    if ($currentQuestion !== null) {
        $ocrQuestions[] = $currentQuestion;
    }

    return $ocrQuestions;
}
-
/**
 * Find the OCR question that best matches a system question.
 *
 * An OCR question carrying the same number is returned immediately with a
 * base similarity of 0.6. Otherwise the OCR question whose normalised text
 * is most similar to the normalised system text is returned, provided the
 * similarity reaches 30%.
 *
 * @param array $ocrQuestions  Questions produced from the OCR blocks.
 * @param mixed $paperQuestion Object with question_number and question_text.
 * @return array|null The matched OCR question with an added 'similarity'
 *                    key (0..1), or null when nothing matches.
 */
private function findBestMatchingOCRQuestion(array $ocrQuestions, $paperQuestion): ?array
{
    $systemTextRaw = strip_tags($paperQuestion->question_text ?? '');
    $systemText = $this->normalizeTextForMatching($systemTextRaw);
    $targetNumber = (int) ($paperQuestion->question_number ?? 0);

    // Direct hit on the question number. 0 means "no usable number" on either
    // side, so it must not pair a numberless system question with an OCR
    // question whose number could not be parsed.
    if ($targetNumber > 0) {
        foreach ($ocrQuestions as $ocrQ) {
            if (($ocrQ['ocr_question_number'] ?? null) === $targetNumber) {
                $ocrQ['similarity'] = 0.6; // base similarity for a number match
                \Log::info("Found match by number for Q{$paperQuestion->question_number}", [
                    'ocr_question_number' => $ocrQ['ocr_question_number']
                ]);
                return $ocrQ;
            }
        }
    }

    // Text similarity. No candidate here shares the system number (it would
    // have been returned above), so no number bonus applies.
    $bestMatch = null;
    $bestSimilarity = 0;

    foreach ($ocrQuestions as $ocrQ) {
        $ocrText = $this->normalizeTextForMatching($ocrQ['question_text']);
        similar_text($systemText, $ocrText, $percent);

        \Log::debug("Matching Q{$paperQuestion->question_number} with OCR Q{$ocrQ['ocr_question_number']}", [
            'similarity' => round($percent, 2),
            'system_text_preview' => mb_substr($systemTextRaw, 0, 50),
            'ocr_text_preview' => mb_substr($ocrQ['question_text'], 0, 50)
        ]);

        if ($percent > $bestSimilarity) {
            $bestSimilarity = $percent;
            $bestMatch = $ocrQ;
        }
    }

    // The threshold is deliberately low to tolerate OCR errors and LaTeX differences.
    if ($bestMatch !== null && $bestSimilarity >= 30) {
        $bestMatch['similarity'] = $bestSimilarity / 100;
        \Log::info("Found match for Q{$paperQuestion->question_number}", [
            'ocr_question_number' => $bestMatch['ocr_question_number'],
            'similarity' => round($bestSimilarity, 2) . '%'
        ]);
        return $bestMatch;
    }

    \Log::warning("No match found for Q{$paperQuestion->question_number}", [
        'best_similarity' => round($bestSimilarity, 2) . '%'
    ]);

    return null;
}
-
/**
 * Normalise a text for similarity matching.
 *
 * Steps, in order: drop LaTeX spans delimited by $ or $$, strip HTML tags,
 * remove punctuation, remove everything that is neither a letter nor a
 * number, remove whitespace, lower-case.
 *
 * @param string $text Raw text.
 * @return string Normalised text.
 */
private function normalizeTextForMatching(string $text): string
{
    $withoutLatex = preg_replace('/\$\$?[^\$]+\$\$?/s', '', $text);
    $plain = strip_tags($withoutLatex);

    // The patterns of the list are applied one after another, in this order.
    $compact = preg_replace(
        ['/[[:punct:]]/u', '/[^\p{L}\p{N}]/u', '/\s+/u'],
        '',
        $plain
    );

    return mb_strtolower($compact);
}
-
/**
 * Top edge of a block: the smallest 'y' among its position points.
 *
 * @param array $block Block with a 'position' list of points.
 * @return int The top Y; 0 when the block has no position or none of its
 *             points carries a 'y' (min() on an empty list throws in PHP 8).
 */
private function getBlockTopY(array $block): int
{
    $position = $block['position'] ?? null;
    $ys = is_array($position) ? array_column($position, 'y') : [];

    // Explicit cast: coordinates may arrive as floats.
    return $ys === [] ? 0 : (int) min($ys);
}
-
/**
 * Bottom edge of a block: the largest 'y' among its position points.
 *
 * @param array $block Block with a 'position' list of points.
 * @return int The bottom Y; 0 when the block has no position or none of its
 *             points carries a 'y' (max() on an empty list throws in PHP 8).
 */
private function getBlockBottomY(array $block): int
{
    $position = $block['position'] ?? null;
    $ys = is_array($position) ? array_column($position, 'y') : [];

    // Explicit cast: coordinates may arrive as floats.
    return $ys === [] ? 0 : (int) max($ys);
}
- }
|