| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248 |
- <?php
- namespace App\Services;
/**
 * Rebuilds question structures (number, stem, options) from the fragmented
 * text blocks of an OCR response.
 *
 * The response layout read by this class is
 * data.page_list[].answer_list[].content_list_info[], where each content entry
 * may carry "text", "pos", "confidence" and "doc_index".
 */
class OCRStructureParser
{
    /**
     * Question marker at the start of a block: ASCII digits followed by one of
     * ". 、 ) )". The "u" modifier makes the multibyte separators match as whole
     * characters instead of as individual bytes. [0-9] is used instead of \d so
     * that full-width digits (which "(int)" cannot convert) are not accepted.
     */
    private const QUESTION_NUMBER_PATTERN = '/^\s*([0-9]+)\s*[.、))]\s*/u';

    /** Option block: a leading A-D letter (either case), an optional "." or "、", then the option text. */
    private const OPTION_PATTERN = '/^([A-Da-d])[.、]?(.*)$/su';

    /** One option inside a run of merged options; its text ends at the next upper-case A-D. */
    private const MERGED_OPTION_PATTERN = '/([A-Da-d])[.、]?([^A-D]*)/u';

    /** Keys whose values are recognised text and must never be JSON-decoded. */
    private const RAW_TEXT_KEYS = ['text'];

    /**
     * Parses the OCR response into a list of structured questions.
     *
     * @param array $ocrData Raw OCR response; nested parts may still be JSON-encoded strings.
     * @return array List of ['q' => int, 'text' => string, 'options' => array, 'blocks' => array].
     */
    public function parse(array $ocrData): array
    {
        // Decode JSON strings embedded in the payload.
        $data = $this->parseNestedJson($ocrData);
        // Flatten the payload into text blocks.
        $blocks = $this->extractAllTextBlocks($data);
        // Group the blocks by question number.
        $questionGroups = $this->groupBlocksByQuestionNumber($blocks);
        // Build the final structure for every question.
        return $this->assembleQuestions($questionGroups);
    }

    /**
     * Recursively decodes JSON strings embedded in the payload.
     *
     * Only strings that decode to an array (JSON object or list) are replaced.
     * Scalar-looking strings such as "true", "null" or "1.50" stay untouched, and
     * values stored under RAW_TEXT_KEYS are never decoded, so recognised text like
     * "[0,1]" survives as text. Decoded payloads are processed recursively as well.
     *
     * @param mixed $data
     * @return mixed
     */
    private function parseNestedJson($data)
    {
        if (is_string($data)) {
            $decoded = json_decode($data, true);
            if (!is_array($decoded)) {
                return $data;
            }
            $data = $decoded;
        }

        if (is_array($data)) {
            foreach ($data as $key => $value) {
                if (in_array($key, self::RAW_TEXT_KEYS, true)) {
                    continue;
                }
                $data[$key] = $this->parseNestedJson($value);
            }
        }

        return $data;
    }

    /**
     * Collects every non-empty text entry of the payload as a flat list of blocks.
     *
     * @param array $data Decoded OCR response.
     * @return array List of blocks with the keys text, ids, position, confidence,
     *               doc_index and is_multipage.
     */
    private function extractAllTextBlocks(array $data): array
    {
        $blocks = [];

        $pages = $data['data']['page_list'] ?? null;
        if (!is_array($pages)) {
            return $blocks;
        }

        foreach ($pages as $page) {
            $answers = is_array($page) ? ($page['answer_list'] ?? null) : null;
            if (!is_array($answers)) {
                continue;
            }

            foreach ($answers as $item) {
                $contents = is_array($item) ? ($item['content_list_info'] ?? null) : null;
                if (!is_array($contents)) {
                    continue;
                }

                foreach ($contents as $content) {
                    $rawText = is_array($content) ? ($content['text'] ?? '') : '';
                    // Anything that is not plain text or a number cannot be a text block.
                    if (!is_string($rawText) && !is_int($rawText) && !is_float($rawText)) {
                        continue;
                    }

                    $text = trim((string) $rawText);
                    if ($text === '') {
                        continue;
                    }

                    $blocks[] = [
                        'text' => $text,
                        'ids' => $item['ids'] ?? [],
                        'position' => $content['pos'] ?? null,
                        'confidence' => $content['confidence'] ?? null,
                        'doc_index' => $content['doc_index'] ?? 1,
                        'is_multipage' => $item['is_multipage'] ?? false,
                    ];
                }
            }
        }

        return $blocks;
    }

    /**
     * Groups the blocks by the question marker they belong to.
     *
     * Every block that starts with a question marker opens a group. Markers are
     * ordered by question number, and a group owns all blocks whose centre Y lies
     * in [own marker Y, next marker Y); the last group is open-ended.
     *
     * NOTE(review): Y values are compared without regard to page or column, so
     * this presumably expects a single-column page whose numbers increase
     * downwards — confirm against real multi-page responses.
     *
     * @param array $blocks Blocks as produced by extractAllTextBlocks().
     * @return array List of ['question_number', 'question_text', 'blocks', 'y_range'].
     */
    private function groupBlocksByQuestionNumber(array $blocks): array
    {
        $centers = [];
        $markers = [];

        // Step 1: compute every block centre once and collect the question markers.
        foreach ($blocks as $index => $block) {
            $centers[$index] = $this->getBlockCenterY($block);

            if (preg_match(self::QUESTION_NUMBER_PATTERN, $block['text'], $matches) === 1) {
                $markers[] = [
                    'index' => $index,
                    'number' => (int) $matches[1],
                    'text' => $block['text'],
                    'y' => $centers[$index],
                ];
            }
        }

        // Order the markers by question number.
        usort($markers, function ($a, $b) {
            return $a['number'] <=> $b['number'];
        });

        // Step 2: assign the blocks to the Y range of each marker.
        $groups = [];
        foreach ($markers as $i => $marker) {
            $yStart = $marker['y'];
            $yEnd = isset($markers[$i + 1]) ? $markers[$i + 1]['y'] : PHP_INT_MAX;

            $groupBlocks = [];
            foreach ($blocks as $index => $block) {
                if ($centers[$index] >= $yStart && $centers[$index] < $yEnd) {
                    $groupBlocks[] = $block;
                }
            }

            $groups[] = [
                'question_number' => $marker['number'],
                'question_text' => $marker['text'],
                'blocks' => $groupBlocks,
                'y_range' => ['start' => $yStart, 'end' => $yEnd],
            ];
        }

        return $groups;
    }

    /**
     * Builds the final structure of every question group.
     *
     * Blocks starting with an A-D letter become options, everything else is part
     * of the stem. Text that shares a block with the group's own question marker
     * ("1. Which ...") is kept as stem; marker blocks of other questions are
     * skipped. If no option block was found but the stem contains an A-D letter,
     * the stem is scanned for options that were merged into one block.
     *
     * Known limitation: a stem block that itself starts with a letter A-D is
     * taken for an option.
     *
     * @param array $questionGroups Groups as produced by groupBlocksByQuestionNumber().
     * @return array List of ['q', 'text', 'options', 'blocks'].
     */
    private function assembleQuestions(array $questionGroups): array
    {
        $questions = [];

        foreach ($questionGroups as $group) {
            $stemParts = [];
            $options = [];

            foreach ($group['blocks'] as $block) {
                $text = $block['text'];

                if (preg_match(self::QUESTION_NUMBER_PATTERN, $text, $numberMatch) === 1) {
                    if ((int) $numberMatch[1] === $group['question_number']) {
                        // Drop only the marker; the rest of the block is the stem.
                        $rest = trim(substr($text, strlen($numberMatch[0])));
                        if ($rest !== '') {
                            $stemParts[] = $rest;
                        }
                    }
                    continue;
                }

                if (preg_match(self::OPTION_PATTERN, $text, $optionMatch) === 1) {
                    // The pattern strips the letter and the (possibly multibyte) separator.
                    $options[strtoupper($optionMatch[1])] = trim($optionMatch[2]);
                    continue;
                }

                $stemParts[] = $text;
            }

            // Blocks are never empty here, so nothing is filtered out (a bare "0" is valid text).
            $stem = implode(' ', $stemParts);

            // Fallback: options merged into a single block end up in the stem.
            if (count($options) === 0 && preg_match('/[A-Da-d]/', $stem) === 1) {
                $options = $this->splitMergedOptions($stem);
            }

            $questions[] = [
                'q' => $group['question_number'],
                'text' => $stem,
                'options' => $options,
                'blocks' => $group['blocks'],
            ];
        }

        return $questions;
    }

    /**
     * Splits options that were recognised as one run of text, e.g. "A.cat B.dog".
     *
     * @param string $text Text that may contain several options.
     * @return array Map of upper-case option letter => option text; options
     *               without text are omitted.
     */
    private function splitMergedOptions(string $text): array
    {
        $options = [];

        // PREG_SET_ORDER yields one [full match, letter, content] entry per option.
        if (preg_match_all(self::MERGED_OPTION_PATTERN, $text, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $match) {
                $content = trim($match[2]);
                if ($content !== '') {
                    $options[strtoupper($match[1])] = $content;
                }
            }
        }

        return $options;
    }

    /**
     * Returns the vertical centre of a block: the mean of the "y" values of its
     * position points, truncated to an integer. Blocks without usable points
     * yield 0.
     *
     * @param array $block Block as produced by extractAllTextBlocks().
     */
    private function getBlockCenterY(array $block): int
    {
        $points = $block['position'] ?? null;
        if (!is_array($points)) {
            return 0;
        }

        $yValues = [];
        foreach ($points as $point) {
            if (is_array($point) && isset($point['y']) && is_numeric($point['y'])) {
                $yValues[] = (float) $point['y'];
            }
        }

        if (count($yValues) === 0) {
            return 0;
        }

        return (int) (array_sum($yValues) / count($yValues));
    }
}
|