OCRStructureParser.php 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. <?php
  2. namespace App\Services;
  3. class OCRStructureParser
  4. {
  5. /**
  6. * 解析阿里云OCR返回的碎片化blocks,重构为题目结构
  7. */
  8. public function parse(array $ocrData): array
  9. {
  10. // 递归解析data字段(防止嵌套字符串)
  11. $data = $this->parseNestedJson($ocrData);
  12. // 提取所有文本块
  13. $blocks = $this->extractAllTextBlocks($data);
  14. // 根据题号分组
  15. $questionGroups = $this->groupBlocksByQuestionNumber($blocks);
  16. // 组装每道题的结构
  17. $structuredQuestions = $this->assembleQuestions($questionGroups);
  18. return $structuredQuestions;
  19. }
  20. /**
  21. * 递归解析嵌套的JSON字符串
  22. */
  23. private function parseNestedJson($data)
  24. {
  25. if (is_string($data)) {
  26. $decoded = json_decode($data, true);
  27. if (json_last_error() === JSON_ERROR_NONE) {
  28. return $decoded;
  29. }
  30. return $data;
  31. }
  32. // 递归处理嵌套结构
  33. if (is_array($data)) {
  34. foreach ($data as $key => $value) {
  35. $data[$key] = $this->parseNestedJson($value);
  36. }
  37. }
  38. return $data;
  39. }
  40. /**
  41. * 提取所有文本块
  42. */
  43. private function extractAllTextBlocks(array $data): array
  44. {
  45. $blocks = [];
  46. if (!isset($data['data']['page_list'])) {
  47. return $blocks;
  48. }
  49. foreach ($data['data']['page_list'] as $page) {
  50. if (!isset($page['answer_list'])) {
  51. continue;
  52. }
  53. foreach ($page['answer_list'] as $item) {
  54. if (!isset($item['content_list_info'])) {
  55. continue;
  56. }
  57. foreach ($item['content_list_info'] as $content) {
  58. $text = $content['text'] ?? '';
  59. $text = trim($text);
  60. if ($text !== '') {
  61. $blocks[] = [
  62. 'text' => $text,
  63. 'ids' => $item['ids'] ?? [],
  64. 'position' => $content['pos'] ?? null,
  65. 'confidence' => $content['confidence'] ?? null,
  66. 'doc_index' => $content['doc_index'] ?? 1,
  67. 'is_multipage' => $item['is_multipage'] ?? false
  68. ];
  69. }
  70. }
  71. }
  72. }
  73. return $blocks;
  74. }
  75. /**
  76. * 根据题号将文本块分组
  77. */
  78. private function groupBlocksByQuestionNumber(array $blocks): array
  79. {
  80. $questionNumbers = [];
  81. $groups = [];
  82. // 第一步:识别所有题号
  83. foreach ($blocks as $index => $block) {
  84. $text = $block['text'];
  85. // 匹配题号格式:1. 1、 1)、(1) ①等
  86. if (preg_match('/^\s*(\d+)\s*[\.\、\)\)]/', $text, $matches)) {
  87. $questionNum = (int)$matches[1];
  88. $y = $this->getBlockCenterY($block);
  89. $questionNumbers[] = [
  90. 'index' => $index,
  91. 'number' => $questionNum,
  92. 'text' => $text,
  93. 'y' => $y
  94. ];
  95. }
  96. }
  97. // 按题号排序
  98. usort($questionNumbers, function($a, $b) {
  99. return $a['number'] <=> $b['number'];
  100. });
  101. // 第二步:根据题号Y坐标分组
  102. foreach ($questionNumbers as $i => $currentQN) {
  103. $nextQN = $questionNumbers[$i + 1] ?? null;
  104. $yStart = $currentQN['y'];
  105. $yEnd = $nextQN ? $nextQN['y'] : PHP_INT_MAX;
  106. // 收集这个题号范围内的所有blocks
  107. $groupBlocks = [];
  108. foreach ($blocks as $block) {
  109. $blockY = $this->getBlockCenterY($block);
  110. if ($blockY >= $yStart && ($blockY < $yEnd)) {
  111. $groupBlocks[] = $block;
  112. }
  113. }
  114. $groups[] = [
  115. 'question_number' => $currentQN['number'],
  116. 'question_text' => $currentQN['text'],
  117. 'blocks' => $groupBlocks,
  118. 'y_range' => ['start' => $yStart, 'end' => $yEnd]
  119. ];
  120. }
  121. return $groups;
  122. }
  123. /**
  124. * 组装每道题的结构
  125. */
  126. private function assembleQuestions(array $questionGroups): array
  127. {
  128. $questions = [];
  129. foreach ($questionGroups as $group) {
  130. $question = [
  131. 'q' => $group['question_number'],
  132. 'text' => '',
  133. 'options' => [],
  134. 'blocks' => $group['blocks']
  135. ];
  136. $questionText = [];
  137. $options = [];
  138. $questionNumbers = [];
  139. foreach ($group['blocks'] as $block) {
  140. $text = $block['text'];
  141. // 识别题号
  142. if (preg_match('/^\s*(\d+)\s*[\.\、\)\)]/', $text, $matches)) {
  143. continue; // 跳过题号本身
  144. }
  145. // 识别选择题选项
  146. if (preg_match('/^([A-Da-d])[\.\、]?/', $text, $optionMatch)) {
  147. $optionLetter = strtoupper($optionMatch[1]);
  148. $options[$optionLetter] = substr($text, 2);
  149. } elseif (in_array(substr($text, 0, 1), ['A', 'B', 'C', 'D'])) {
  150. // 单字母选项
  151. $options[substr($text, 0, 1)] = substr($text, 1);
  152. } else {
  153. // 题干或其他内容
  154. $questionText[] = $text;
  155. }
  156. }
  157. // 合并题干文本
  158. $question['text'] = implode(' ', array_filter($questionText));
  159. // 处理选项:如果有多个选项连在一起,需要拆分
  160. if (count($options) === 0 && preg_match('/([A-Da-d])/', $question['text'])) {
  161. $options = $this->splitMergedOptions($question['text']);
  162. }
  163. $question['options'] = $options;
  164. $questions[] = $question;
  165. }
  166. return $questions;
  167. }
  168. /**
  169. * 拆分连在一起的选项
  170. */
  171. private function splitMergedOptions(string $text): array
  172. {
  173. $options = [];
  174. // 匹配选项模式
  175. if (preg_match_all('/([A-Da-d])[\.\、]?([^A-D]*)/', $text, $matches, PREG_SET_ORDER)) {
  176. for ($i = 0; $i < count($matches[1]); $i++) {
  177. $letter = strtoupper($matches[1][$i]);
  178. $content = trim($matches[2][$i]);
  179. if ($content) {
  180. $options[$letter] = $content;
  181. }
  182. }
  183. }
  184. return $options;
  185. }
  186. /**
  187. * 获取block的Y坐标中心
  188. */
  189. private function getBlockCenterY(array $block): int
  190. {
  191. if (!isset($block['position']) || empty($block['position'])) {
  192. return 0;
  193. }
  194. $yValues = [];
  195. foreach ($block['position'] as $point) {
  196. if (isset($point['y'])) {
  197. $yValues[] = $point['y'];
  198. }
  199. }
  200. if (empty($yValues)) {
  201. return 0;
  202. }
  203. return (int)(array_sum($yValues) / count($yValues));
  204. }
  205. }