parseNestedJson($ocrData);

        // Extract every non-empty text block from the OCR payload.
        $blocks = $this->extractAllTextBlocks($data);

        // Group the blocks by the question number they belong to.
        $questionGroups = $this->groupBlocksByQuestionNumber($blocks);

        // Assemble the structure of each question.
        $structuredQuestions = $this->assembleQuestions($questionGroups);

        return $structuredQuestions;
    }

    /**
     * Recursively decode JSON documents that are embedded as strings.
     *
     * Only strings that decode to a JSON object or array are replaced, and the
     * decoded structure is itself walked so deeper embedded JSON is decoded
     * too. Strings that merely happen to be valid JSON scalars ("1.0", "12",
     * "true", "null") are returned untouched; decoding them would silently
     * turn OCR text into numbers/booleans/null (e.g. "1.0" -> 1.0 -> "1").
     *
     * @param mixed $data A string, an array (walked recursively) or any other value.
     * @return mixed The input with embedded JSON objects/arrays decoded to PHP arrays.
     */
    private function parseNestedJson($data)
    {
        if (is_string($data)) {
            // json_decode() yields null on a syntax error, so is_array() covers
            // both "invalid JSON" and "valid JSON, but only a scalar".
            $decoded = json_decode($data, true);

            return is_array($decoded) ? $this->parseNestedJson($decoded) : $data;
        }

        if (is_array($data)) {
            foreach ($data as $key => $value) {
                $data[$key] = $this->parseNestedJson($value);
            }
        }

        return $data;
    }

    /**
     * Flatten data.page_list[].answer_list[].content_list_info[] into a list of text blocks.
     *
     * Entries whose text is missing, not textual, or empty after trim() are
     * skipped. Levels that are missing or are not arrays are skipped as well
     * instead of being iterated.
     *
     * @param array $data Decoded OCR payload.
     * @return array List of blocks with the keys text, ids, position,
     *               confidence, doc_index and is_multipage.
     */
    private function extractAllTextBlocks(array $data): array
    {
        $blocks = [];

        $pages = $data['data']['page_list'] ?? null;
        if (!is_array($pages)) {
            return $blocks;
        }

        foreach ($pages as $page) {
            $answers = $page['answer_list'] ?? null;
            if (!is_array($answers)) {
                continue;
            }

            foreach ($answers as $item) {
                $contents = $item['content_list_info'] ?? null;
                if (!is_array($contents)) {
                    continue;
                }

                foreach ($contents as $content) {
                    $text = $content['text'] ?? '';
                    // trim() needs a string; ignore arrays/objects/booleans but
                    // keep raw numbers, which are cast to their textual form.
                    if (!is_string($text) && !is_numeric($text)) {
                        continue;
                    }

                    $text = trim((string) $text);
                    if ($text === '') {
                        continue;
                    }

                    $blocks[] = [
                        'text' => $text,
                        'ids' => $item['ids'] ?? [],
                        'position' => $content['pos'] ?? null,
                        'confidence' => $content['confidence'] ?? null,
                        'doc_index' => $content['doc_index'] ?? 1,
                        'is_multipage' => $item['is_multipage'] ?? false
                    ];
                }
            }
        }

        return $blocks;
    }

    /**
     * Return the leading question number of a text line, if it has one.
     *
     * Recognised forms are digits followed by ".", "、" (U+3001), ")" or
     * ")" (U+FF09), e.g. "1.", "1、", "1)". Forms such as "(1)" or "①" are
     * not recognised.
     *
     * The pattern uses the "u" modifier so the multi-byte separators are
     * matched as whole characters; without it the character class degrades to
     * single bytes and unrelated characters sharing a lead byte (for example
     * a full-width comma) are accepted as separators. With "u", a subject that
     * is not valid UTF-8 never matches.
     *
     * @param string $text
     * @return int|null The question number, or null when the line does not start with one.
     */
    private function parseQuestionNumber(string $text)
    {
        if (preg_match('/^\s*(\d+)\s*[.\x{3001})\x{FF09}]/u', $text, $matches) === 1) {
            return (int) $matches[1];
        }

        return null;
    }

    /**
     * Group text blocks by question number.
     *
     * Blocks that start with a question number act as anchors. Anchors are
     * sorted by number, and every block whose vertical centre lies in
     * [anchor Y, next anchor Y) is assigned to that anchor's group; the last
     * anchor's range is open-ended (PHP_INT_MAX).
     *
     * NOTE(review): only the Y coordinate is compared; doc_index is not taken
     * into account, so blocks from different pages appear to share one
     * coordinate space here - confirm against multi-page input.
     *
     * @param array $blocks Blocks as produced by extractAllTextBlocks().
     * @return array List of groups with the keys question_number,
     *               question_text, blocks and y_range.
     */
    private function groupBlocksByQuestionNumber(array $blocks): array
    {
        // Compute every block's vertical centre once instead of once per question.
        $centers = [];
        foreach ($blocks as $index => $block) {
            $centers[$index] = $this->getBlockCenterY($block);
        }

        // Step 1: find the blocks that start with a question number.
        $questionNumbers = [];
        foreach ($blocks as $index => $block) {
            $number = $this->parseQuestionNumber($block['text']);
            if ($number === null) {
                continue;
            }

            $questionNumbers[] = [
                'index' => $index,
                'number' => $number,
                'text' => $block['text'],
                'y' => $centers[$index]
            ];
        }

        // Order the anchors by question number.
        usort($questionNumbers, static function (array $a, array $b): int {
            return $a['number'] <=> $b['number'];
        });

        // Step 2: assign blocks to the Y range opened by each anchor.
        $groups = [];
        foreach ($questionNumbers as $i => $currentQN) {
            $yStart = $currentQN['y'];
            $yEnd = isset($questionNumbers[$i + 1]) ? $questionNumbers[$i + 1]['y'] : PHP_INT_MAX;

            $groupBlocks = [];
            foreach ($blocks as $index => $block) {
                if ($centers[$index] >= $yStart && $centers[$index] < $yEnd) {
                    $groupBlocks[] = $block;
                }
            }

            $groups[] = [
                'question_number' => $currentQN['number'],
                'question_text' => $currentQN['text'],
                'blocks' => $groupBlocks,
                'y_range' => ['start' => $yStart, 'end' => $yEnd]
            ];
        }

        return $groups;
    }

    /**
     * Build the final structure of every question from its block group.
     *
     * Within a group, blocks that start with a question number are skipped,
     * blocks that start with an option letter become options, and everything
     * else is joined with spaces into the question text. When no option block
     * was found, the question text is scanned for options merged into it.
     *
     * @param array $questionGroups Groups as produced by groupBlocksByQuestionNumber().
     * @return array List of questions with the keys q, text, options and blocks.
     */
    private function assembleQuestions(array $questionGroups): array
    {
        $questions = [];

        foreach ($questionGroups as $group) {
            $questionText = [];
            $options = [];

            foreach ($group['blocks'] as $block) {
                $text = $block['text'];

                // Skip the block that carries the question number itself.
                if ($this->parseQuestionNumber($text) !== null) {
                    continue;
                }

                // An option line starts with A-D (either case), optionally
                // followed by "." or "、" (U+3001). The lookahead rejects
                // ordinary words such as "Because" or "apple", which would
                // otherwise be taken for options.
                if (preg_match('/^([A-Da-d])(?![A-Za-z])[.\x{3001}]?/u', $text, $optionMatch) === 1) {
                    // Cut after exactly what was matched: the separator may be
                    // absent or multi-byte, so a fixed offset would corrupt
                    // the option text.
                    $options[strtoupper($optionMatch[1])] = substr($text, strlen($optionMatch[0]));
                    continue;
                }

                // Question stem or any other content.
                $questionText[] = $text;
            }

            // Drop empty fragments only; a bare "0" is legitimate text.
            $stem = implode(' ', array_filter($questionText, static function ($part): bool {
                return (string) $part !== '';
            }));

            // No separate option blocks: the options may be merged into the stem.
            if (count($options) === 0) {
                $options = $this->splitMergedOptions($stem);
            }

            $questions[] = [
                'q' => $group['question_number'],
                'text' => $stem,
                'options' => $options,
                'blocks' => $group['blocks']
            ];
        }

        return $questions;
    }

    /**
     * Split options that were merged into one string, e.g. "A.foo B.bar C.baz".
     *
     * An option starts at a letter A-D (either case) that is not adjacent to
     * another ASCII letter or digit, optionally followed by "." or "、"
     * (U+3001). Its content runs up to the next such uppercase marker or to
     * the end of the text. Options whose content is empty after trimming are
     * omitted; if a letter occurs more than once the last occurrence wins.
     *
     * @param string $text
     * @return array Map of uppercase option letter => option content.
     */
    private function splitMergedOptions(string $text): array
    {
        $pattern = '/(?<![A-Za-z0-9])([A-Da-d])(?![A-Za-z0-9])[.\x{3001}]?'
            . '(.*?)(?=(?<![A-Za-z0-9])[A-D](?![A-Za-z0-9])|\z)/us';

        $options = [];

        // preg_match_all() returns 0 for "no match" and false on error (for
        // example a subject that is not valid UTF-8); both mean "no options".
        if (!preg_match_all($pattern, $text, $matches, PREG_SET_ORDER)) {
            return $options;
        }

        // PREG_SET_ORDER: each entry is one match as [full, letter, content].
        foreach ($matches as $match) {
            $content = trim($match[2]);
            if ($content !== '') {
                $options[strtoupper($match[1])] = $content;
            }
        }

        return $options;
    }

    /**
     * Vertical centre of a block: the mean of the "y" values of its position points.
     *
     * @param array $block Block with an optional "position" list of points.
     * @return int The mean Y truncated to int, or 0 when the block has no
     *             usable position data.
     */
    private function getBlockCenterY(array $block): int
    {
        $points = $block['position'] ?? null;
        if (!is_array($points)) {
            return 0;
        }

        $yValues = [];
        foreach ($points as $point) {
            if (isset($point['y'])) {
                $yValues[] = $point['y'];
            }
        }

        if (count($yValues) === 0) {
            return 0;
        }

        return (int) (array_sum($yValues) / count($yValues));
    }
}