|
@@ -0,0 +1,1149 @@
|
|
|
|
|
+<?php
|
|
|
|
|
+
|
|
|
|
|
+namespace App\Services;
|
|
|
|
|
+
|
|
|
|
|
+use Illuminate\Support\Facades\Log;
|
|
|
|
|
+use Illuminate\Support\Regex;
|
|
|
|
|
+
|
|
|
|
|
+class OCRDataParser
|
|
|
|
|
+{
|
|
|
|
|
/**
 * Parse a raw OCR payload into a structured list of questions.
 *
 * Pipeline: extract text blocks -> group them by question-number anchors ->
 * turn each group into a question record -> optionally refine with paper info.
 *
 * @param array      $ocrData   Raw OCR payload (the original doc describes it as the Aliyun OCR response).
 * @param array|null $paperInfo Optional system paper info used to refine the parsed questions.
 * @return array Structured question list.
 */
public function parseStructuredQuestions(array $ocrData, ?array $paperInfo = null): array
{
    $questions = $this->processQuestionUnits(
        $this->groupBlocksByQuestionNumber(
            $this->extractTextBlocks($ocrData)
        )
    );

    // No paper info (null or an empty array): return the parsed questions untouched.
    if (!$paperInfo) {
        return $questions;
    }

    return $this->optimizeWithPaperInfo($questions, $paperInfo);
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Extract every non-empty text block plus page metadata from an OCR payload.
 *
 * The payload may be wrapped in one or more nested 'raw' keys, and its 'data'
 * member may be either a decoded array or a JSON string.
 *
 * Each returned block has the keys: text (trimmed), position (the 'pos' of the
 * first content_list_info entry, or null), confidence, type (see detectTextType)
 * and ids. Blocks are sorted ascending by the Y of their first position point;
 * blocks without a position are placed after all positioned blocks.
 *
 * NOTE(review): when the payload has several pages, the height of the last page
 * that declares one wins and the Y coordinates of all pages are sorted together —
 * confirm whether multi-page input is expected here.
 *
 * @param array $ocrData Raw OCR payload.
 * @return array{blocks: array, meta: array{height: mixed}}
 */
private function extractTextBlocksAndMeta(array $ocrData): array
{
    $blocks = [];
    $meta = ['height' => 2000]; // fallback page height when the payload does not carry one

    // Unwrap nested 'raw' envelopes.
    while (isset($ocrData['raw']) && is_array($ocrData['raw'])) {
        $ocrData = $ocrData['raw'];
    }

    // 'data' may arrive as a JSON string; a decode failure yields null and simply produces no blocks.
    if (isset($ocrData['data']) && is_string($ocrData['data'])) {
        $ocrData['data'] = json_decode($ocrData['data'], true);
    }

    $pages = $ocrData['data']['page_list'] ?? [];
    if (!is_array($pages)) {
        $pages = [];
    }

    foreach ($pages as $page) {
        if (!is_array($page)) {
            continue;
        }

        if (isset($page['height'])) {
            $meta['height'] = $page['height'];
        }

        $items = $page['answer_list'] ?? [];
        if (!is_array($items)) {
            continue;
        }

        foreach ($items as $item) {
            $rawText = is_array($item) ? ($item['text'] ?? null) : null;
            if (!is_string($rawText) && !is_int($rawText) && !is_float($rawText)) {
                continue;
            }

            // Compare against '' explicitly: empty() would also discard the legitimate text "0".
            $text = trim((string) $rawText);
            if ($text === '') {
                continue;
            }

            $blocks[] = [
                'text' => $text,
                'position' => $item['content_list_info'][0]['pos'] ?? null,
                'confidence' => $item['confidence'] ?? null,
                // Classify the trimmed text: the patterns are anchored at the start of the string.
                'type' => $this->detectTextType($text),
                'ids' => $item['ids'] ?? [],
            ];
        }
    }

    // Sort top-to-bottom. The comparator stays consistent when positions are missing:
    // positioned blocks come first, position-less blocks compare equal to each other.
    usort($blocks, static function (array $a, array $b): int {
        $aHasPosition = !empty($a['position']);
        $bHasPosition = !empty($b['position']);

        if (!$aHasPosition || !$bHasPosition) {
            return $bHasPosition <=> $aHasPosition;
        }

        return ($a['position'][0]['y'] ?? 0) <=> ($b['position'][0]['y'] ?? 0);
    });

    return ['blocks' => $blocks, 'meta' => $meta];
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Classify a text block by pattern matching.
 *
 * Possible results, checked in this order: 'section_header', 'question_number'
 * (or 'content' for a bare "(n)" / "(n)=" fragment), 'option', 'answer_area',
 * 'title', 'student_info', and finally 'content' as the fallback.
 *
 * @param string $text Block text.
 * @return string One of the type labels listed above.
 */
private function detectTextType(string $text): string
{
    // Section instructions are tested first because they may also contain numbers.
    if (preg_match('/(一、选择题|二、填空题|三、解答题|本大题)/u', $text)) {
        return 'section_header';
    }

    // Leading question-number marker such as "1." / "1)" / "(1)" / "一、".
    $startsWithNumber = preg_match('/^[\((]?[一二三四五六七八九十\d]+[\.\)))、]/u', $text);
    if ($startsWithNumber) {
        // A lone "(n)" or "(n)=" is treated as ordinary content, not as a question number.
        $isBareIndex = preg_match('/^[\((]\d+[\))]=?\s*$/u', trim($text));
        if ($isBareIndex) {
            return 'content';
        }

        // Require some text after the marker, or at least a score annotation.
        $hasBody = mb_strlen($text) > 3;
        if ($hasBody || preg_match('/\d+分/u', $text)) {
            return 'question_number';
        }
    }

    // Remaining categories are simple "first pattern that matches wins" checks.
    $patternsByType = [
        'option' => '/^[A-Da-d][\.\))]/',
        'answer_area' => '/(得分|评卷人|答案|填空|解答|______|____)/u',
        'title' => '/(试卷|测试卷|考试|试题)/u',
        'student_info' => '/(姓名|班级|学号|年级)/u',
    ];

    foreach ($patternsByType as $type => $pattern) {
        if (preg_match($pattern, $text)) {
            return $type;
        }
    }

    return 'content';
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Backward-compatible accessor: returns only the text blocks and discards
 * the page metadata produced by extractTextBlocksAndMeta().
 *
 * @param array $ocrData Raw OCR payload.
 * @return array List of text blocks.
 */
private function extractTextBlocks(array $ocrData): array
{
    $extracted = $this->extractTextBlocksAndMeta($ocrData);

    return $extracted['blocks'];
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Match each system (paper) question to an OCR-detected question and extract the student's answer.
 *
 * For every paper question the best OCR question is looked up. When one is found and has not
 * already been claimed by an earlier paper question, a vertical answer region is derived from
 * the OCR question position, the blocks whose centre lies strictly inside that region are
 * collected, and the answer is extracted from them. Otherwise a placeholder entry with an
 * empty answer and zero confidence is produced.
 *
 * @param array    $ocrRawData     Raw OCR payload, optionally wrapped in a 'raw' key.
 * @param iterable $paperQuestions Paper questions; each item is read through the properties
 *                                 question_number, question_text and question_type.
 * @return array Map keyed by question_number; each entry holds student_answer, confidence,
 *               coordinates (y_min / y_max), debug_info and question_text.
 */
public function matchWithSystemPaper(array $ocrRawData, $paperQuestions): array
{
    // Accept the wrapped structure; a non-array 'raw' value is ignored instead of raising a TypeError.
    $dataToParse = (isset($ocrRawData['raw']) && is_array($ocrRawData['raw']))
        ? $ocrRawData['raw']
        : $ocrRawData;

    // 1. Extract all text blocks and the page metadata.
    $extracted = $this->extractTextBlocksAndMeta($dataToParse);
    $blocks = $extracted['blocks'];
    $pageHeight = $extracted['meta']['height'];

    Log::info('OCR匹配开始', [
        'total_blocks' => count($blocks),
        'page_height' => $pageHeight,
        // count() works for arrays and Countable collections alike.
        'system_questions' => is_countable($paperQuestions) ? count($paperQuestions) : null,
    ]);

    // 2. Extract the questions recognised in the OCR blocks.
    $ocrQuestions = $this->extractOCRQuestions($blocks);

    Log::info('提取OCR题目', [
        'ocr_questions_count' => count($ocrQuestions),
        'ocr_question_numbers' => array_column($ocrQuestions, 'ocr_question_number'),
    ]);

    // Loop-invariant geometry, computed once instead of once per paper question.
    $blockCenters = [];
    $sectionHeaderTops = [];
    foreach ($blocks as $index => $block) {
        $blockCenters[$index] = $this->getBlockCenterY($block);

        if ($this->detectTextType($block['text']) === 'section_header') {
            $headerTop = $this->getBlockTopY($block);
            if ($headerTop !== null) {
                $sectionHeaderTops[] = $headerTop;
            }
        }
    }

    // Extra region height below the matched question, by question type (short for choice/fill, long for answer).
    $heightMap = [
        'choice' => 120,
        'fill' => 180,
        'answer' => 360,
    ];

    // 3. For every system question, find the best matching OCR question.
    $results = [];
    $usedOcrQuestions = []; // OCR question numbers already claimed, to avoid duplicate matches

    foreach ($paperQuestions as $paperQuestion) {
        $questionNumber = $paperQuestion->question_number;
        $systemQuestionText = strip_tags($paperQuestion->question_text ?? '');

        $bestMatch = $this->findBestMatchingOCRQuestion($ocrQuestions, $paperQuestion);
        $isUsable = $bestMatch
            && !in_array($bestMatch['ocr_question_number'], $usedOcrQuestions, true);

        if (!$isUsable) {
            // No match, or the best match was already claimed by an earlier paper question.
            Log::warning("系统Q{$questionNumber} 未找到匹配的OCR题目");

            $results[$questionNumber] = [
                'student_answer' => '',
                'confidence' => 0,
                'coordinates' => [
                    'y_min' => 0,
                    'y_max' => 0,
                ],
                'debug_info' => [
                    'error' => '未找到匹配的OCR题目',
                ],
                'question_text' => $systemQuestionText,
            ];
            continue;
        }

        $usedOcrQuestions[] = $bestMatch['ocr_question_number'];

        // Vertical extent of the matched OCR question.
        $blockTop = $bestMatch['y_start'] ?? 0;
        $blockBottom = $bestMatch['y_end'] ?? $blockTop;

        // Start slightly above the question number so same-line choice answers are covered.
        $yStart = max(0, $blockTop - 10);

        // Closest OCR question that starts below the current one.
        // NOTE(review): questions already claimed are skipped as boundaries, as in the original
        // code — confirm this is intended when paper questions are matched out of page order.
        $nextOcrY = null;
        foreach ($ocrQuestions as $candidate) {
            $candidateTop = $candidate['y_start'] ?? null;
            if ($candidateTop === null || $candidateTop <= $blockTop) {
                continue;
            }
            if (in_array($candidate['ocr_question_number'], $usedOcrQuestions, true)) {
                continue;
            }
            if ($nextOcrY === null || $candidateTop < $nextOcrY) {
                $nextOcrY = $candidateTop;
            }
        }

        $questionType = $paperQuestion->question_type ?? null;
        $extend = $questionType !== null ? ($heightMap[$questionType] ?? 220) : 220;

        // End: 5px before the next question or the default extension, whichever is smaller.
        $yEndCandidate = min($pageHeight, $blockBottom + $extend);
        if ($nextOcrY !== null) {
            $yEnd = min($yEndCandidate, $nextOcrY - 5);
            $yEnd = max($yEnd, $yStart + 120); // guaranteed minimum height (may reach past the next question)
        } else {
            $yEnd = $yEndCandidate;
        }

        // A section header inside the region acts as a tighter lower boundary.
        foreach ($sectionHeaderTops as $headerTop) {
            if ($headerTop > $yStart && $headerTop < $yEnd) {
                $yEnd = min($yEnd, $headerTop - 5);
                break;
            }
        }

        // Minimum-height fallback.
        if ($yEnd - $yStart < 80) {
            $yEnd = min($pageHeight, $yStart + 80);
        }

        // Make sure the range is valid: give it at least 200px.
        if ($yEnd <= $yStart) {
            $yEnd = $yStart + 200;
        }

        Log::debug("Q{$questionNumber} Y范围", [
            'y_start' => $yStart,
            'y_end' => $yEnd,
            'range' => $yEnd - $yStart,
        ]);

        // Collect the blocks whose centre lies strictly inside the answer region.
        $answerBlocks = [];
        $confidences = [];
        foreach ($blocks as $index => $block) {
            $centerY = $blockCenters[$index];
            if ($centerY === null || $centerY <= $yStart || $centerY >= $yEnd) {
                continue;
            }

            $answerBlocks[] = $block;
            if (isset($block['confidence'])) {
                $confidences[] = $block['confidence'];
            }
        }

        $studentAnswer = $this->extractAnswerFromBlocks(
            $answerBlocks,
            $systemQuestionText,
            $questionType
        );

        $avgConfidence = $confidences !== [] ? array_sum($confidences) / count($confidences) : 0;
        $similarityLabel = round($bestMatch['similarity'] * 100, 2) . '%';

        $results[$questionNumber] = [
            'student_answer' => trim($studentAnswer),
            'confidence' => $bestMatch['similarity'], // the match similarity is reported as the confidence
            'coordinates' => [
                'y_min' => $yStart,
                'y_max' => $yEnd,
            ],
            'debug_info' => [
                'y_start' => $yStart,
                'y_end' => $yEnd,
                'block_count' => count($answerBlocks),
                'system_question_length' => mb_strlen($systemQuestionText),
                'ocr_question_number' => $bestMatch['ocr_question_number'],
                'match_similarity' => $similarityLabel,
                'ocr_confidence' => round($avgConfidence * 100, 2) . '%',
            ],
            'question_text' => $systemQuestionText,
        ];

        Log::info("系统Q{$questionNumber} 匹配到 OCR Q{$bestMatch['ocr_question_number']}", [
            'similarity' => $similarityLabel,
            'student_answer_preview' => mb_substr($studentAnswer, 0, 50),
        ]);
    }

    Log::info('OCR匹配完成', [
        'matched_count' => count(array_filter($results, fn ($r) => !empty($r['student_answer']))),
        'total_count' => count($results),
    ]);

    return $results;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Split all blocks into per-question groups along the Y axis, using the
 * blocks typed 'question_number' as anchors.
 *
 * A question owns every block whose centre Y lies in
 * [its own centre Y, centre Y of the next positioned anchor). The last
 * question is open-ended (its range end is PHP_INT_MAX).
 *
 * Blocks without a usable position are never assigned by coordinate. An
 * anchor without a position cannot define a range, so it yields a unit that
 * contains only the anchor block itself, with a null y_range.
 *
 * @param array $blocks Text blocks as produced by extractTextBlocks().
 * @return array List of units with keys question_number, question_text, blocks, y_range.
 */
private function groupBlocksByQuestionNumber(array $blocks): array
{
    // Centre Y of every block, computed once.
    $centerYs = [];
    foreach ($blocks as $index => $block) {
        $centerYs[$index] = $this->getBlockCenterY($block);
    }

    // Step 1: collect the question-number anchors in block order.
    $anchors = [];
    foreach ($blocks as $index => $block) {
        if (($block['type'] ?? null) !== 'question_number') {
            continue;
        }

        $anchors[] = [
            'index' => $index,
            'text' => $block['text'],
            'y' => $centerYs[$index],
            'number' => $this->extractQuestionNumber($block['text']),
        ];
    }

    // Step 2: assign blocks to each anchor's Y range.
    $questionUnits = [];
    $anchorCount = count($anchors);

    foreach ($anchors as $i => $anchor) {
        $yStart = $anchor['y'];

        if ($yStart === null) {
            // Comparing null against coordinates would match unrelated blocks; keep only the anchor itself.
            $questionUnits[] = [
                'question_number' => $anchor['number'],
                'question_text' => $anchor['text'],
                'blocks' => [$blocks[$anchor['index']]],
                'y_range' => ['start' => null, 'end' => null],
            ];
            continue;
        }

        // The range ends at the next anchor that actually has a position.
        $yEnd = PHP_INT_MAX;
        for ($j = $i + 1; $j < $anchorCount; $j++) {
            if ($anchors[$j]['y'] !== null) {
                $yEnd = $anchors[$j]['y'];
                break;
            }
        }

        $questionBlocks = [];
        foreach ($blocks as $index => $block) {
            $y = $centerYs[$index];
            if ($y !== null && $y >= $yStart && $y < $yEnd) {
                $questionBlocks[] = $block;
            }
        }

        $questionUnits[] = [
            'question_number' => $anchor['number'],
            'question_text' => $anchor['text'],
            'blocks' => $questionBlocks,
            'y_range' => ['start' => $yStart, 'end' => $yEnd],
        ];
    }

    return $questionUnits;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Turn each grouped question unit into a structured question record.
 *
 * Blocks are routed by type: 'content' feeds the stem, 'option' is parsed into
 * letter/content pairs, 'answer_area' feeds answer extraction; every other
 * type is ignored.
 *
 * @param array $questionUnits Units as produced by groupBlocksByQuestionNumber().
 * @return array List of records with keys question_number, content, options, answer, confidence.
 */
private function processQuestionUnits(array $questionUnits): array
{
    $records = [];

    foreach ($questionUnits as $unit) {
        $stemParts = [];
        $choices = [];
        $answerTexts = [];

        foreach ($unit['blocks'] as $block) {
            $kind = $block['type'];
            $text = $block['text'];

            if ($kind === 'content') {
                $stemParts[] = $text;
                continue;
            }

            if ($kind === 'answer_area') {
                // Raw answer-area text; the answer itself is picked out later.
                $answerTexts[] = $text;
                continue;
            }

            if ($kind !== 'option') {
                continue;
            }

            // Split "A. text" into its letter and its content; keep the raw text when it does not parse.
            $pieces = [];
            $parsed = preg_match('/^([A-Da-d])[\.\))]\s*(.*)/', $text, $pieces);
            $choices[] = $parsed
                ? ['letter' => strtoupper($pieces[1]), 'content' => trim($pieces[2])]
                : ['letter' => '', 'content' => $text];
        }

        $records[] = [
            'question_number' => $unit['question_number'],
            'content' => $this->mergeContentParts($stemParts),
            'options' => $choices,
            'answer' => $this->extractAnswerFromAnswerAreas($answerTexts),
            'confidence' => $this->calculateConfidence($unit['blocks']),
        ];
    }

    return $records;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Join the stem fragments of one question into a single space-separated string.
 *
 * Fragments that start with a numeric question marker ("1.", "1)", "(1)", "1、")
 * are dropped, because the question number is handled elsewhere.
 *
 * @param array $contentParts Stem fragments, in reading order.
 * @return string The merged, trimmed stem.
 */
private function mergeContentParts(array $contentParts): string
{
    $kept = [];

    foreach ($contentParts as $part) {
        // The /u modifier makes the multibyte "、" in the class match as one character
        // instead of as three independent bytes.
        if (preg_match('/^[\((]?[\d]+[\.\)))、]/u', $part)) {
            continue;
        }

        $kept[] = $part;
    }

    // implode() also separates correctly after a fragment such as "0",
    // which a truthiness test on the accumulated string would glue to the next one.
    return trim(implode(' ', $kept));
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Pick an answer out of the answer-area texts of one question.
 *
 * The first area containing a letter A-D (either case) wins immediately and
 * that letter is returned upper-cased. Otherwise the first whitespace-delimited
 * token of each area that is not a score / grader label is remembered, and the
 * last such token is returned.
 *
 * @param array $answerAreas Answer-area texts.
 * @return string The extracted answer, or '' when nothing qualifies.
 */
private function extractAnswerFromAnswerAreas(array $answerAreas): string
{
    $picked = '';

    foreach ($answerAreas as $areaText) {
        // Choice answer: any A-D letter in the area.
        $letter = [];
        if (preg_match('/([A-Da-d])/', $areaText, $letter)) {
            return strtoupper($letter[1]);
        }

        // Fill-in answer: first token, unless the area is a score / grader label.
        $token = [];
        $hasToken = preg_match('/\S+/', $areaText, $token);
        if ($hasToken && !preg_match('/(得分|评卷人)/', $areaText)) {
            $picked = trim($token[0]);
        }
    }

    return $picked;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Average the confidence values of the given blocks.
 *
 * Blocks whose confidence is null are left out; when no block carries a
 * confidence the fixed fallback 0.8 is returned.
 *
 * @param array $blocks Text blocks.
 * @return float Mean confidence, or 0.8 when none is available.
 */
private function calculateConfidence(array $blocks): float
{
    $known = array_filter(
        array_column($blocks, 'confidence'),
        static fn ($value): bool => $value !== null
    );

    if ($known === []) {
        return 0.8;
    }

    return array_sum($known) / count($known);
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Vertical centre of a block: the midpoint between the smallest and the
 * largest Y among its position points.
 *
 * @param array $block Text block; 'position' is read as a list of points with a 'y' key.
 * @return int|null Midpoint truncated to an integer, or null when the block has no usable Y.
 */
private function getBlockCenterY(array $block): ?int
{
    $points = $block['position'] ?? null;
    if (empty($points) || !is_array($points)) {
        return null;
    }

    $yValues = [];
    foreach ($points as $point) {
        if (isset($point['y'])) {
            $yValues[] = $point['y'];
        }
    }

    if ($yValues === []) {
        return null;
    }

    // The midpoint can be fractional; cast explicitly rather than relying on the implicit
    // float-to-int conversion of the return type (deprecated when lossy, a TypeError under strict_types).
    return (int) ((min($yValues) + max($yValues)) / 2);
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Read the question number from a text: the first run of digits it contains.
 *
 * @param string $text Block text.
 * @return int The number, or 0 when the text contains no digit.
 */
private function extractQuestionNumber(string $text): int
{
    $digits = [];

    return preg_match('/[\d]+/', $text, $digits) ? (int) $digits[0] : 0;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Refine parsed questions with the system paper's own question list.
 *
 * Questions are paired by question_number. A paired question is passed through
 * optimizeByQuestionType() when the system question has a question_type, and
 * through optimizeAnswer() when it has a correct_answer. Unpaired questions
 * are returned unchanged.
 *
 * @param array $questions Parsed questions.
 * @param array $paperInfo Paper info; its 'questions' entry is the system question list.
 * @return array Refined questions, re-indexed as a list.
 */
private function optimizeWithPaperInfo(array $questions, array $paperInfo): array
{
    // Index the system questions by their number.
    $systemByNumber = [];
    foreach ($paperInfo['questions'] ?? [] as $systemQuestion) {
        $systemByNumber[$systemQuestion['question_number']] = $systemQuestion;
    }

    $refined = array_map(function ($question) use ($systemByNumber) {
        $number = $question['question_number'];
        if (!isset($systemByNumber[$number])) {
            return $question;
        }

        $reference = $systemByNumber[$number];

        if (isset($reference['question_type'])) {
            $question = $this->optimizeByQuestionType($question, $reference['question_type']);
        }

        if (isset($reference['correct_answer'])) {
            $question = $this->optimizeAnswer($question, $reference['correct_answer']);
        }

        return $question;
    }, $questions);

    return array_values($refined);
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Apply question-type specific refinements to one parsed question.
 *
 * - 'choice': when no options were parsed and the stem contains a letter A-D,
 *   try to extract the options from the stem.
 * - 'fill': record the blank markers found in the stem under 'blanks'.
 * - 'answer': copy the stem to 'full_solution'.
 * Any other type leaves the question unchanged.
 *
 * @param array  $question     Parsed question.
 * @param string $questionType System question type.
 * @return array The refined question.
 */
private function optimizeByQuestionType(array $question, string $questionType): array
{
    if ($questionType === 'choice') {
        $lacksOptions = empty($question['options']);
        if ($lacksOptions && preg_match('/[A-Da-d]/', $question['content'])) {
            $question['options'] = $this->extractOptionsFromContent($question['content']);
        }

        return $question;
    }

    if ($questionType === 'fill') {
        $question['blanks'] = $this->findFillBlanks($question['content']);

        return $question;
    }

    if ($questionType === 'answer') {
        // Free-response question: keep the complete content.
        $question['full_solution'] = $question['content'];
    }

    return $question;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Normalise the answer of a question that has options (i.e. a choice question).
 * Questions without options are returned unchanged.
 *
 * @param array  $question      Parsed question.
 * @param string $correctAnswer Correct answer from the system paper.
 * @return array The question, with a normalised 'answer' when it has options.
 */
private function optimizeAnswer(array $question, string $correctAnswer): array
{
    if (empty($question['options'])) {
        return $question;
    }

    $question['answer'] = $this->normalizeChoiceAnswer($question['answer'], $correctAnswer);

    return $question;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Normalise a choice answer to an upper-case letter.
 *
 * Circled digits ①-④ and the digits 1-4 map to A-D; anything else is
 * trimmed and upper-cased. $correctAnswer is accepted but not consulted.
 *
 * @param string $studentAnswer Answer as recognised.
 * @param string $correctAnswer Correct answer from the system paper (unused).
 * @return string Normalised answer.
 */
private function normalizeChoiceAnswer(string $studentAnswer, string $correctAnswer): string
{
    $cleaned = trim($studentAnswer);

    return match ($cleaned) {
        '①', '1' => 'A',
        '②', '2' => 'B',
        '③', '3' => 'C',
        '④', '4' => 'D',
        default => strtoupper($cleaned),
    };
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Extract choice options from a multi-line stem.
 *
 * Every line that, once trimmed, starts with a letter A-D followed by "." or ")"
 * becomes one option.
 *
 * @param string $content Stem text.
 * @return array List of ['letter' => upper-case letter, 'content' => trimmed option text].
 */
private function extractOptionsFromContent(string $content): array
{
    $parsed = [];

    foreach (explode("\n", $content) as $rawLine) {
        $hit = [];
        if (!preg_match('/^([A-Da-d])[\.\))]\s*(.*)/', trim($rawLine), $hit)) {
            continue;
        }

        $parsed[] = [
            'letter' => strtoupper($hit[1]),
            'content' => trim($hit[2]),
        ];
    }

    return $parsed;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Find the fill-in blank markers in a stem.
 *
 * A blank is a run of two or more underscores, or a bracketed span written
 * with half-width "( ... )" or full-width "（ ... ）" brackets.
 *
 * @param string $content Stem text.
 * @return array The matched blank markers in order of appearance; [] when there is none.
 */
private function findFillBlanks(string $content): array
{
    $found = [];

    // The bracket alternatives must use literal brackets: an unescaped "( ... )" is a capture
    // group around a lazy "anything", which matches the empty string at every offset.
    $count = preg_match_all('/_{2,}|\([\s\S]*?\)|（[\s\S]*?）/u', $content, $found);

    return $count ? $found[0] : [];
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Build a human-readable debug report of a parse run.
 *
 * The report lists the number of raw text blocks, their distribution by type,
 * and a summary line group for every structured question.
 *
 * @param array $ocrData             Raw OCR payload.
 * @param array $structuredQuestions Result of parseStructuredQuestions().
 * @return string The report text.
 */
public function generateDebugOutput(array $ocrData, array $structuredQuestions): string
{
    $output = "=== OCR数据解析调试输出 ===\n\n";

    // Raw block statistics.
    $blocks = $this->extractTextBlocks($ocrData);
    $output .= "1. 原始文本块数量: " . count($blocks) . "\n";

    $typeStats = [];
    foreach ($blocks as $block) {
        $type = $block['type'];
        $typeStats[$type] = ($typeStats[$type] ?? 0) + 1;
    }
    $output .= "   类型分布: " . json_encode($typeStats, JSON_UNESCAPED_UNICODE) . "\n\n";

    // Structured questions.
    $output .= "2. 识别到的题目数量: " . count($structuredQuestions) . "\n";
    foreach ($structuredQuestions as $i => $q) {
        $answer = (string) ($q['answer'] ?? '');

        $output .= "\n题目 " . ($i + 1) . " (题号: {$q['question_number']}):\n";
        // mb_substr keeps multibyte characters intact; a byte-wise cut can end inside one.
        $output .= "  - 内容: " . mb_substr($q['content'], 0, 100) . "...\n";
        $output .= "  - 选项数: " . count($q['options']) . "\n";
        // Compare against '' so that the answer "0" is not reported as unrecognised.
        $output .= "  - 答案: " . ($answer !== '' ? $answer : '未识别') . "\n";
        $output .= "  - 置信度: " . round($q['confidence'] * 100, 2) . "%\n";
    }

    return $output;
}
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
/**
 * Locate the block that anchors a paper question on the page.
 *
 * Strategies, tried in order over all blocks:
 *  1.   a block starting with the question number followed by ".", ")" or "、"
 *       (optionally preceded by an opening bracket);
 *  1.5  a short block that is exactly the number, or the number followed by whitespace
 *       (OCR sometimes splits "1." into separate blocks);
 *  2.   a block that contains, or closely resembles, the first 15 characters of the
 *       whitespace-stripped question text.
 *
 * @param array $blocks        Text blocks.
 * @param mixed $paperQuestion Paper question; question_number and question_text are read from it.
 * @return array|null Coordinates from getBlockCoordinates(), or null when nothing matches.
 */
public function findQuestionAnchor(array $blocks, $paperQuestion): ?array
{
    $qNum = (string) $paperQuestion->question_number;
    // Quote the number: it is spliced into patterns and may contain regex metacharacters (e.g. "1.1").
    $quotedNum = preg_quote($qNum, '/');

    $cleanContent = strip_tags((string) ($paperQuestion->question_text ?? ''));
    $cleanContent = (string) preg_replace('/\s+/', '', $cleanContent);

    // Strategy 1: "<number>." style markers. The /u modifier makes "、" match as one character.
    foreach ($blocks as $block) {
        if (preg_match('/^[\((]?' . $quotedNum . '[\.\)))、]/u', $block['text'])) {
            return $this->getBlockCoordinates($block);
        }
    }

    // Strategy 1.5: a stand-alone number. Only short blocks are accepted; the length is
    // measured in characters so that CJK text is not penalised for its byte width.
    foreach ($blocks as $block) {
        $isBareNumber = $block['text'] === $qNum
            || preg_match('/^' . $quotedNum . '\s+/', $block['text']);

        if ($isBareNumber && mb_strlen($block['text']) < 5) {
            return $this->getBlockCoordinates($block);
        }
    }

    // Strategy 2: fall back to the beginning of the question text.
    $prefix = mb_substr($cleanContent, 0, 15);
    if (mb_strlen($prefix) <= 2) {
        return null;
    }

    foreach ($blocks as $block) {
        $blockText = (string) preg_replace('/\s+/', '', $block['text']);

        // Plain containment check.
        if (mb_strpos($blockText, $prefix) !== false) {
            return $this->getBlockCoordinates($block);
        }

        // Fuzzy comparison against the start of the block.
        similar_text($prefix, mb_substr($blockText, 0, mb_strlen($prefix) + 5), $percent);
        if ($percent > 80) {
            return $this->getBlockCoordinates($block);
        }
    }

    return null;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Bounding box of a block, derived from its position points.
 *
 * Every key is always present; an axis for which the block has no coordinates
 * (missing position, or points without that key) is reported as 0.
 *
 * @param array $block Text block; 'position' is read as a list of points with 'x' / 'y' keys.
 * @return array{y_top: mixed, y_bottom: mixed, x_left: mixed, x_right: mixed}
 */
private function getBlockCoordinates(array $block): array
{
    $points = $block['position'] ?? null;
    if (!is_array($points)) {
        $points = [];
    }

    $ys = array_column($points, 'y');
    $xs = array_column($points, 'x');

    // min() / max() throw a ValueError on an empty array, so each axis is guarded.
    return [
        'y_top' => $ys === [] ? 0 : min($ys),
        'y_bottom' => $ys === [] ? 0 : max($ys),
        'x_left' => $xs === [] ? 0 : min($xs),
        'x_right' => $xs === [] ? 0 : max($xs),
    ];
}
|
|
|
|
|
/**
 * Extract the student's answer from the OCR result of a cropped region by
 * removing the printed question text from it.
 *
 * Strategies, in order:
 *  A. the normalised OCR text starts with the normalised question text:
 *     strip that prefix (fuzzy);
 *  B. the tail of the question text is found inside the OCR text:
 *     return what follows it;
 *  C. heuristics: text from a "解:" / "答:" marker onwards, or the token after a
 *     trailing "=".
 * When nothing applies the OCR text is returned unchanged.
 *
 * @param array  $cropResult         OCR result; either 'content' or a 'questions' list with 'content' entries.
 * @param string $systemQuestionText Question text from the system paper (may contain HTML tags).
 * @return string The extracted answer, or '' when the crop has no text.
 */
public function extractAnswerFromCrop(array $cropResult, string $systemQuestionText): string
{
    // 1. Full OCR text of the crop.
    $ocrText = '';
    if (isset($cropResult['content'])) {
        $ocrText = $cropResult['content'];
    } elseif (isset($cropResult['questions'])) {
        $ocrText = implode("\n", array_column($cropResult['questions'], 'content'));
    }

    // Compare against '' explicitly: empty() would also discard the legitimate text "0".
    if (!is_string($ocrText) || $ocrText === '') {
        return '';
    }

    // 2. Normalise both texts for comparison. Tags are removed from the question text once,
    //    so the prefix / anchor searches below run on what is actually printed.
    $systemPlain = strip_tags($systemQuestionText);
    $normalizedOcr = $this->normalizeTextForComparison($ocrText);
    $normalizedSystem = $this->normalizeTextForComparison($systemPlain);

    // 3A. OCR text starts with the question text. An empty question text would trivially
    //     "match" and skip every later heuristic, so it is excluded here.
    if ($normalizedSystem !== '' && str_starts_with($normalizedOcr, $normalizedSystem)) {
        return trim($this->removePrefixFuzzy($ocrText, $systemPlain));
    }

    // 3B. Split after the tail of the question text. findSplitIndex() returns a character
    //     offset, so the cut must be character-based as well.
    $splitIndex = $this->findSplitIndex($ocrText, $systemPlain);
    if ($splitIndex > 0) {
        return trim(mb_substr($ocrText, $splitIndex));
    }

    // 3C. Heuristic: keep everything from a "解:" / "答:" marker onwards (marker included).
    if (preg_match('/(解[::]|答[::])(.*)/s', $ocrText, $matches)) {
        return trim($matches[0]);
    }

    // Heuristic for fill-in questions such as ".... = 3": take the token after the trailing "=".
    if (preg_match('/=\s*(\S+)$/', $ocrText, $matches)) {
        return trim($matches[1]);
    }

    return $ocrText;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Reduce a text to a comparison key: HTML tags, whitespace and punctuation
 * removed, ASCII letters lower-cased.
 *
 * Whitespace and punctuation are matched in Unicode mode, so full-width
 * punctuation and ideographic spaces are removed as well.
 *
 * @param string $text Any text.
 * @return string The normalised text.
 */
private function normalizeTextForComparison(string $text): string
{
    $text = strip_tags($text);

    $stripped = preg_replace('/[\s\p{Z}\p{P}[:punct:]]+/u', '', $text);
    if ($stripped === null) {
        // The subject is not valid UTF-8: fall back to byte-wise ASCII stripping.
        $stripped = preg_replace('/[\s[:punct:]]+/', '', $text) ?? $text;
    }

    return strtolower($stripped);
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Remove an approximately matching prefix from the start of a text.
 *
 * The head of $fullText (the byte length of $prefix plus 10 bytes of slack)
 * is compared with $prefix using similar_text(); above 80% similarity the
 * first strlen($prefix) bytes are cut off. This assumes the question part of
 * the OCR text is about as long as the prefix.
 *
 * Both cuts are aligned to character boundaries, so the result is never left
 * with a partial multibyte character.
 *
 * @param string $fullText Text that may start with the prefix.
 * @param string $prefix   Prefix to remove.
 * @return string $fullText without the prefix, or unchanged when the head is not similar enough.
 */
private function removePrefixFuzzy(string $fullText, string $prefix): string
{
    $prefixBytes = strlen($prefix);

    // Take slightly more than the prefix length for the comparison.
    $head = mb_strcut($fullText, 0, $prefixBytes + 10, 'UTF-8');

    similar_text($head, $prefix, $percent);
    if ($percent > 80) {
        return mb_strcut($fullText, $prefixBytes, null, 'UTF-8');
    }

    return $fullText;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Find where the question text ends inside the OCR text.
 *
 * The last 10 characters of $systemText serve as the anchor (the whole text
 * when it is shorter than that). The search is exact, so OCR misreads in the
 * anchor make it fail.
 *
 * @param string $ocrText    OCR text.
 * @param string $systemText Question text from the system paper.
 * @return int Character offset (not byte offset) just past the anchor, or 0 when it is not found.
 */
private function findSplitIndex(string $ocrText, string $systemText): int
{
    $tail = mb_substr($systemText, -10);
    if (mb_strlen($tail) < 5) {
        $tail = $systemText;
    }

    $offset = mb_strpos($ocrText, $tail);

    return $offset === false ? 0 : $offset + mb_strlen($tail);
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Find the index of the first block that belongs to the answer, by
 * accumulating block texts until they resemble the system question text.
 *
 * @param array  $blocks     Text blocks in reading order.
 * @param string $systemText Question text from the system paper.
 * @return int Index of the block following the matched question text,
 *             or 0 when no match is found (conservative: start from the beginning).
 */
private function findAnswerStartIndex(array $blocks, string $systemText): int
{
    $target = $this->normalizeTextForComparison($systemText);
    $targetLength = mb_strlen($target);
    $systemLength = mb_strlen($systemText);
    $running = '';

    foreach ($blocks as $position => $block) {
        $running .= $block['text'];
        $runningNormalized = $this->normalizeTextForComparison($running);

        similar_text($runningNormalized, $target, $similarity);

        \Log::debug("Block {$position} accumulated similarity: {$similarity}%", [
            'accumulated_length' => mb_strlen($running),
            'system_length' => $systemLength,
        ]);

        // Above 80% the question text is considered fully consumed.
        $stemMatched = $similarity > 80;

        // Fallback for noisy OCR: the accumulated text is clearly longer than the
        // question text while still being reasonably similar to it.
        $overrunButClose = mb_strlen($runningNormalized) > $targetLength * 1.2
            && $similarity > 60;

        if ($stemMatched || $overrunButClose) {
            return $position + 1; // the answer starts at the next block
        }
    }

    return 0;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Extract the answer text from a question's OCR blocks, leaving out the question stem.
 *
 * Strategies are tried in order and the first one that produces a value wins:
 *  1. Explicit answer marker (答:/答案:/解:/解答:): everything after the first
 *     marker found in a block.
 *  2. Choice questions: a block that is a single option letter A-D, optionally
 *     followed by ".", ")" or a full-width ")"; returned upper-cased.
 *  3. Fill-in questions: a short block (at most 25 characters) containing a digit
 *     or "=" whose normalized text does not occur in the normalized stem.
 *  4. Fallback: the shortest block that is not structural, does not contain "分",
 *     and whose normalized text does not occur in the normalized stem.
 *
 * @param array       $blocks             OCR blocks; each entry must have a 'text' string.
 * @param string      $systemQuestionText Question stem as stored by the system.
 * @param string|null $questionType       'choice' or 'fill' enable strategies 2 / 3;
 *                                        any other value goes straight to the fallback.
 * @return string The extracted answer, or '' when no strategy matched.
 */
private function extractAnswerFromBlocks(array $blocks, string $systemQuestionText, ?string $questionType = null): string
{
    if (empty($blocks)) {
        return '';
    }

    // Strategy 1: explicit answer marker.
    // The markers are literals, so locate them with mb_strpos() instead of the
    // regex-based mb_split(); this also keeps the whole remainder of the block
    // when the marker happens to appear more than once.
    $answerKeywords = ['答:', '答案:', '解:', '解答:'];
    foreach ($blocks as $block) {
        foreach ($answerKeywords as $keyword) {
            $markerPos = mb_strpos($block['text'], $keyword);
            if ($markerPos !== false) {
                return trim(mb_substr($block['text'], $markerPos + mb_strlen($keyword)));
            }
        }
    }

    // Blocks classified as question numbers, section headers or options never hold an answer.
    $isStructural = fn (string $text): bool => in_array(
        $this->detectTextType($text),
        ['question_number', 'section_header', 'option'],
        true
    );

    // Strategy 2: choice question, prefer a lone option letter.
    if ($questionType === 'choice') {
        foreach ($blocks as $block) {
            $text = trim($block['text']);
            if ($isStructural($text)) {
                continue;
            }

            if (preg_match('/^[A-Da-d][\.\))]?$/u', $text)) {
                return strtoupper(preg_replace('/[^A-D]/i', '', $text));
            }
        }
    }

    // The normalized stem is loop-invariant; compute it once for strategies 3 and 4.
    $normalizedQuestion = $this->normalizeTextForComparison($systemQuestionText);

    // Strategy 3: fill-in question, prefer a short number / equation.
    if ($questionType === 'fill') {
        foreach ($blocks as $block) {
            $text = trim($block['text']);
            if ($isStructural($text)) {
                continue;
            }

            if (mb_strlen($text) > 25 || !preg_match('/[\\d=]/u', $text)) {
                continue;
            }

            $normalizedText = $this->normalizeTextForComparison($text);
            // Same empty-needle guard as the fallback below: an empty needle
            // must never count as "not part of the stem".
            if ($normalizedText !== '' && mb_strpos($normalizedQuestion, $normalizedText) === false) {
                return $text;
            }
        }
    }

    // Strategy 4: generic fallback, shortest block that is not part of the stem.
    $shortest = null;
    foreach ($blocks as $block) {
        $text = trim($block['text']);
        if ($isStructural($text)) {
            continue;
        }
        // Skip empty blocks and anything containing "分".
        // NOTE(review): presumably meant to drop score annotations such as "(5分)" - confirm.
        if ($text === '' || mb_strpos($text, '分') !== false) {
            continue;
        }

        $normalizedText = $this->normalizeTextForComparison($text);
        if ($normalizedText === '' || mb_strpos($normalizedQuestion, $normalizedText) !== false) {
            continue;
        }

        // Strict "<" keeps the first of several equally short candidates.
        if ($shortest === null || mb_strlen($text) < mb_strlen($shortest)) {
            $shortest = $text;
        }
    }

    return $shortest ?? '';
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Group a flat list of OCR blocks into questions, using question-number blocks as delimiters.
 *
 * A block classified as 'question_number' starts a new question; a 'section_header'
 * block closes the current question without starting a new one; every other block
 * is appended to the question currently being collected. Blocks that appear while
 * no question is open are dropped.
 *
 * @param array $blocks OCR blocks; each entry must have a 'text' string and may have 'position'.
 * @return array List of questions, each with the keys:
 *               - 'ocr_question_number': value returned by extractQuestionNumber()
 *               - 'question_text': block texts joined with single spaces
 *               - 'blocks': the blocks belonging to the question
 *               - 'y_start': top Y of the question-number block
 *               - 'y_end': bottom Y of the last block that carries position data
 */
private function extractOCRQuestions(array $blocks): array
{
    $ocrQuestions = [];
    $currentQuestion = null;

    foreach ($blocks as $block) {
        $type = $this->detectTextType($block['text']);

        if ($type === 'question_number') {
            // A new question number closes the previous question, if any.
            if ($currentQuestion !== null) {
                $ocrQuestions[] = $currentQuestion;
            }

            $currentQuestion = [
                'ocr_question_number' => $this->extractQuestionNumber($block['text']),
                'question_text' => $block['text'],
                'blocks' => [$block],
                'y_start' => $this->getBlockTopY($block),
                'y_end' => $this->getBlockBottomY($block),
            ];
            continue;
        }

        if ($type === 'section_header') {
            // A section header ends the current question; following blocks are
            // ignored until the next question number appears.
            if ($currentQuestion !== null) {
                $ocrQuestions[] = $currentQuestion;
                $currentQuestion = null;
            }
            continue;
        }

        if ($currentQuestion === null) {
            continue;
        }

        // Continuation of the current question.
        $currentQuestion['blocks'][] = $block;
        $currentQuestion['question_text'] .= ' ' . $block['text'];

        // getBlockBottomY() returns 0 for a block without position data; do not
        // let such a block reset an extent that has already been measured.
        if (!empty($block['position'])) {
            $currentQuestion['y_end'] = $this->getBlockBottomY($block);
        }
    }

    // Flush the question that was still open when the input ended.
    if ($currentQuestion !== null) {
        $ocrQuestions[] = $currentQuestion;
    }

    return $ocrQuestions;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Find the OCR question that best corresponds to a system (paper) question.
 *
 * Matching happens in two stages:
 *  1. Question number: the first OCR question whose 'ocr_question_number' is
 *     strictly equal to the integer question number is returned immediately with
 *     a fixed similarity of 0.6.
 *  2. Text similarity: otherwise every OCR question is scored with similar_text()
 *     on the normalized texts, and the best one is returned if it reaches 30%.
 *
 * @param array  $ocrQuestions  OCR questions; each entry must have 'question_text'
 *                              and 'ocr_question_number'.
 * @param object $paperQuestion Object whose question_text and question_number
 *                              properties are read.
 * @return array|null The matched OCR question with an added 'similarity' key
 *                    (0.6 for a number hit, otherwise percent / 100), or null
 *                    when nothing reaches the threshold.
 */
private function findBestMatchingOCRQuestion(array $ocrQuestions, $paperQuestion): ?array
{
    $systemTextRaw = strip_tags($paperQuestion->question_text ?? '');
    $systemText = $this->normalizeTextForMatching($systemTextRaw);
    $targetNumber = (int) ($paperQuestion->question_number ?? 0);

    // Stage 1: a direct question-number hit wins outright.
    foreach ($ocrQuestions as $ocrQ) {
        if (($ocrQ['ocr_question_number'] ?? null) === $targetNumber) {
            $ocrQ['similarity'] = 0.6; // baseline similarity assigned to number hits
            Log::info("Found match by number for Q{$paperQuestion->question_number}", [
                'ocr_question_number' => $ocrQ['ocr_question_number']
            ]);
            return $ocrQ;
        }
    }

    // Stage 2: no OCR question carries the target number, so rank by text
    // similarity alone. (A per-candidate number bonus would always be zero here.)
    $bestMatch = null;
    $bestSimilarity = 0;

    foreach ($ocrQuestions as $ocrQ) {
        $ocrText = $this->normalizeTextForMatching($ocrQ['question_text']);

        similar_text($systemText, $ocrText, $percent);

        Log::debug("Matching Q{$paperQuestion->question_number} with OCR Q{$ocrQ['ocr_question_number']}", [
            'similarity' => round($percent, 2),
            'system_text_preview' => mb_substr($systemTextRaw, 0, 50),
            'ocr_text_preview' => mb_substr($ocrQ['question_text'], 0, 50)
        ]);

        if ($percent > $bestSimilarity) {
            $bestSimilarity = $percent;
            $bestMatch = $ocrQ;
        }
    }

    // The threshold is deliberately low to tolerate OCR noise and LaTeX differences.
    if ($bestSimilarity >= 30) {
        $bestMatch['similarity'] = $bestSimilarity / 100;
        Log::info("Found match for Q{$paperQuestion->question_number}", [
            'ocr_question_number' => $bestMatch['ocr_question_number'],
            'similarity' => round($bestSimilarity, 2) . '%'
        ]);
        return $bestMatch;
    }

    Log::warning("No match found for Q{$paperQuestion->question_number}", [
        'best_similarity' => round($bestSimilarity, 2) . '%'
    ]);
    return null;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Reduce text to a canonical form for fuzzy matching.
 *
 * Steps, in order: replace malformed UTF-8 bytes, drop "$...$" / "$$...$$" spans
 * together with their content, strip HTML tags, remove every character that is
 * not a Unicode letter or number (punctuation, symbols and whitespace alike),
 * and lower-case the result.
 *
 * @param string $text Raw question text.
 * @return string Lower-cased string containing only letters and digits.
 */
private function normalizeTextForMatching(string $text): string
{
    // preg_replace() with the /u flag returns null on malformed UTF-8, which
    // would silently discard the whole text; substitute bad bytes up front.
    $text = mb_scrub($text, 'UTF-8');

    // Remove LaTeX spans delimited by $ or $$, including their content.
    $text = preg_replace('/\$\$?[^\$]+\$\$?/s', '', $text) ?? $text;

    $text = strip_tags($text);

    // Keeping only letters and numbers already removes punctuation and
    // whitespace, so no separate passes are needed for those.
    $text = preg_replace('/[^\p{L}\p{N}]/u', '', $text) ?? '';

    return mb_strtolower($text);
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Return the top edge of a block: the smallest 'y' among its position points.
 *
 * @param array $block OCR block; 'position' is read as a list of points with a 'y' key.
 * @return int Smallest y value truncated to an integer, or 0 when the block has
 *             no usable position data.
 */
private function getBlockTopY(array $block): int
{
    if (empty($block['position']) || !is_array($block['position'])) {
        return 0;
    }

    // Points without a numeric 'y' are ignored; min() throws on an empty array
    // in PHP 8, so that case is handled explicitly.
    $ys = array_filter(
        array_column($block['position'], 'y'),
        static fn ($y): bool => is_numeric($y)
    );

    return $ys === [] ? 0 : (int) min($ys);
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Return the bottom edge of a block: the largest 'y' among its position points.
 *
 * @param array $block OCR block; 'position' is read as a list of points with a 'y' key.
 * @return int Largest y value truncated to an integer, or 0 when the block has
 *             no usable position data.
 */
private function getBlockBottomY(array $block): int
{
    if (empty($block['position']) || !is_array($block['position'])) {
        return 0;
    }

    // Points without a numeric 'y' are ignored; max() throws on an empty array
    // in PHP 8, so that case is handled explicitly.
    $ys = array_filter(
        array_column($block['position'], 'y'),
        static fn ($y): bool => is_numeric($y)
    );

    return $ys === [] ? 0 : (int) max($ys);
}
|
|
|
|
|
+}
|