OCRDataParser.php 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149
  1. <?php
  2. namespace App\Services;
  3. use Illuminate\Support\Facades\Log;
  4. use Illuminate\Support\Regex;
  5. class OCRDataParser
  6. {
  /**
   * Parse raw OCR data into a list of structured questions.
   *
   * Pipeline: extract the text blocks, group them by detected question number,
   * convert each group into a structured question and, only when a non-empty
   * $paperInfo is given, refine the result against the system paper.
   *
   * @param array      $ocrData   Raw data returned by the OCR service.
   * @param array|null $paperInfo Optional system paper information used to refine the match.
   * @return array Structured question list.
   */
  public function parseStructuredQuestions(array $ocrData, ?array $paperInfo = null): array
  {
      $questions = $this->processQuestionUnits(
          $this->groupBlocksByQuestionNumber(
              $this->extractTextBlocks($ocrData)
          )
      );

      // A null or empty paper description leaves nothing to refine against.
      if (!$paperInfo) {
          return $questions;
      }

      return $this->optimizeWithPaperInfo($questions, $paperInfo);
  }
  /**
   * Extract every non-blank text block, plus page metadata, from an OCR payload.
   *
   * Nested 'raw' wrappers are unwrapped and a JSON-encoded 'data' string is decoded.
   * Each entry of data.page_list[*].answer_list whose trimmed text is not the empty
   * string becomes one block (a text of "0" is kept: it is a legitimate answer).
   *
   * @param array $ocrData OCR payload, optionally wrapped under 'raw'.
   * @return array{blocks: array, meta: array} 'blocks' are ordered by the y of their first
   *         position point, with position-less blocks placed last in their original order;
   *         'meta' holds 'height' (default 2000, overwritten by every page reporting one).
   */
  private function extractTextBlocksAndMeta(array $ocrData): array
  {
      $blocks = [];
      $meta = ['height' => 2000]; // fallback page height when the payload reports none

      // Unwrap any number of nested 'raw' envelopes.
      while (isset($ocrData['raw']) && is_array($ocrData['raw'])) {
          $ocrData = $ocrData['raw'];
      }

      // 'data' may arrive as a JSON string; a failed decode yields null and no blocks.
      if (isset($ocrData['data']) && is_string($ocrData['data'])) {
          $ocrData['data'] = json_decode($ocrData['data'], true);
      }

      $pages = $ocrData['data']['page_list'] ?? [];
      foreach (is_array($pages) ? $pages : [] as $page) {
          if (isset($page['height'])) {
              $meta['height'] = $page['height'];
          }

          $items = $page['answer_list'] ?? [];
          foreach (is_array($items) ? $items : [] as $item) {
              if (!isset($item['text']) || !is_scalar($item['text'])) {
                  continue;
              }

              // Compare against '' explicitly: empty("0") is true and would drop the answer "0".
              $text = trim((string) $item['text']);
              if ($text === '') {
                  continue;
              }

              $blocks[] = [
                  'text' => $text,
                  // Only the first content entry's polygon is used as the block position.
                  'position' => $item['content_list_info'][0]['pos'] ?? null,
                  'confidence' => $item['confidence'] ?? null,
                  // Detect on the trimmed text so the stored type agrees with callers that
                  // re-run detection on $block['text'].
                  'type' => $this->detectTextType($text),
                  'ids' => $item['ids'] ?? [],
              ];
          }
      }

      // Order by y. Returning 0 whenever one side lacks a position (as before) is not a
      // consistent ordering, so position-less blocks are explicitly sorted after the rest.
      usort($blocks, static function (array $a, array $b): int {
          $aHasPosition = !empty($a['position']);
          $bHasPosition = !empty($b['position']);

          if ($aHasPosition !== $bHasPosition) {
              return $aHasPosition ? -1 : 1;
          }
          if (!$aHasPosition) {
              return 0;
          }

          return ($a['position'][0]['y'] ?? 0) <=> ($b['position'][0]['y'] ?? 0);
      });

      return ['blocks' => $blocks, 'meta' => $meta];
  }
  /**
   * Classify a text fragment by the first matching pattern.
   *
   * Checks run in this order: section header, question number, option,
   * answer area, paper title, student info; anything else is 'content'.
   *
   * @param string $text Text of one OCR block.
   * @return string One of 'section_header', 'question_number', 'option',
   *                'answer_area', 'title', 'student_info', 'content'.
   */
  private function detectTextType(string $text): string
  {
      // Section headers are tested first because they can also contain digits.
      if (preg_match('/(一、选择题|二、填空题|三、解答题|本大题)/u', $text)) {
          return 'section_header';
      }

      // Leading number marker such as "1.", "1)", "(1)" or "1、".
      $startsWithNumber = preg_match('/^[\((]?[一二三四五六七八九十\d]+[\.\)))、]/u', $text);
      if ($startsWithNumber) {
          // A bare "(n)" or "(n)=" is treated as content, not as a question number.
          if (preg_match('/^[\((]\d+[\))]=?\s*$/u', trim($text))) {
              return 'content';
          }
          // Require some text after the marker, or at least a score marker.
          if (mb_strlen($text) > 3 || preg_match('/\d+分/u', $text)) {
              return 'question_number';
          }
      }

      // Remaining categories, tried in declaration order.
      $patternsByType = [
          'option' => '/^[A-Da-d][\.\))]/',
          'answer_area' => '/(得分|评卷人|答案|填空|解答|______|____)/u',
          'title' => '/(试卷|测试卷|考试|试题)/u',
          'student_info' => '/(姓名|班级|学号|年级)/u',
      ];
      foreach ($patternsByType as $type => $pattern) {
          if (preg_match($pattern, $text)) {
              return $type;
          }
      }

      return 'content';
  }
  /**
   * Backward-compatible accessor: returns only the 'blocks' part of
   * extractTextBlocksAndMeta() and discards the page metadata.
   *
   * @param array $ocrData OCR payload.
   * @return array List of text blocks.
   */
  private function extractTextBlocks(array $ocrData): array
  {
      $extracted = $this->extractTextBlocksAndMeta($ocrData);

      return $extracted['blocks'];
  }
  /**
   * Match every system paper question to an OCR-detected question and extract the student's answer.
   *
   * For a matched question the vertical answer band starts 10px above the OCR question and ends
   * at the smallest of: page height, question bottom plus a per-type extension, 5px above the next
   * unclaimed OCR question (never less than 120px tall in that case), or 5px above the first
   * section header found inside the band. Blocks whose centre lies strictly inside the band
   * are handed to extractAnswerFromBlocks().
   *
   * @param array $ocrRawData     OCR payload, optionally wrapped under a 'raw' key.
   * @param mixed $paperQuestions Iterable exposing count(); each item is read for
   *                              question_number, question_type and question_text.
   * @return array Map keyed by system question number; each entry holds student_answer,
   *               confidence, coordinates (y_min / y_max), debug_info and question_text.
   */
  public function matchWithSystemPaper(array $ocrRawData, $paperQuestions): array
  {
      // Accept both the bare payload and the variant wrapped under 'raw'.
      $payload = $ocrRawData['raw'] ?? $ocrRawData;

      // 1. Text blocks and page metadata.
      ['blocks' => $blocks, 'meta' => $pageMeta] = $this->extractTextBlocksAndMeta($payload);
      $pageHeight = $pageMeta['height'];

      \Log::info('OCR匹配开始', [
          'total_blocks' => count($blocks),
          'page_height' => $pageHeight,
          'system_questions' => $paperQuestions->count()
      ]);

      // 2. Questions as seen by the OCR.
      $ocrQuestions = $this->extractOCRQuestions($blocks);

      \Log::info('提取OCR题目', [
          'ocr_questions_count' => count($ocrQuestions),
          'ocr_question_numbers' => array_column($ocrQuestions, 'ocr_question_number')
      ]);

      // Extra height (px) granted below a question, by question type: short for
      // choice/fill questions, long for worked answers.
      $extraHeightByType = [
          'choice' => 120,
          'fill' => 180,
          'answer' => 360,
      ];

      // 3. Pair each system question with its best OCR candidate.
      $results = [];
      $claimedNumbers = []; // OCR question numbers already paired, to avoid double matching

      foreach ($paperQuestions as $paperQuestion) {
          $bestMatch = $this->findBestMatchingOCRQuestion($ocrQuestions, $paperQuestion);

          // No candidate, or the candidate was already claimed by an earlier question.
          if (!$bestMatch || in_array($bestMatch['ocr_question_number'], $claimedNumbers)) {
              \Log::warning("系统Q{$paperQuestion->question_number} 未找到匹配的OCR题目");
              $results[$paperQuestion->question_number] = [
                  'student_answer' => '',
                  'confidence' => 0,
                  'coordinates' => [
                      'y_min' => 0,
                      'y_max' => 0
                  ],
                  'debug_info' => [
                      'error' => '未找到匹配的OCR题目'
                  ],
                  'question_text' => strip_tags($paperQuestion->question_text ?? '')
              ];
              continue;
          }

          $claimedNumbers[] = $bestMatch['ocr_question_number'];

          $matchTop = $bestMatch['y_start'] ?? 0;
          $matchBottom = $bestMatch['y_end'] ?? $matchTop;

          // Start slightly above the question number so same-line choice answers are covered.
          $yStart = max(0, $matchTop - 10);

          // Top of the nearest unclaimed OCR question located below this one.
          $laterTops = [];
          foreach ($ocrQuestions as $otherQuestion) {
              $isBelow = $otherQuestion['y_start'] > $matchTop;
              if ($isBelow && !in_array($otherQuestion['ocr_question_number'], $claimedNumbers)) {
                  $laterTops[] = $otherQuestion['y_start'];
              }
          }
          $nextTop = $laterTops === [] ? null : min($laterTops);

          $questionType = $paperQuestion->question_type ?? null;
          $extend = $extraHeightByType[$questionType] ?? 220;

          // End: default extension capped by the page, then by the next question (minus 5px),
          // with a 120px floor when a next question exists.
          $yEnd = min($pageHeight, $matchBottom + $extend);
          if ($nextTop !== null) {
              $yEnd = max(min($yEnd, $nextTop - 5), $yStart + 120);
          }

          // The first section header inside the band becomes the lower boundary.
          foreach ($blocks as $block) {
              if ($this->detectTextType($block['text']) !== 'section_header') {
                  continue;
              }
              $headerTop = $this->getBlockTopY($block);
              if ($headerTop > $yStart && $headerTop < $yEnd) {
                  $yEnd = min($yEnd, $headerTop - 5);
                  break;
              }
          }

          // Minimum band height.
          if ($yEnd - $yStart < 80) {
              $yEnd = min($pageHeight, $yStart + 80);
          }
          // Last resort: guarantee a non-empty band.
          if ($yEnd <= $yStart) {
              $yEnd = $yStart + 200;
          }

          \Log::debug("Q{$paperQuestion->question_number} Y范围", [
              'y_start' => $yStart,
              'y_end' => $yEnd,
              'range' => $yEnd - $yStart
          ]);

          // Blocks whose vertical centre lies strictly inside the band.
          $answerBlocks = array_values(array_filter(
              $blocks,
              function ($block) use ($yStart, $yEnd) {
                  $centerY = $this->getBlockCenterY($block);

                  return $centerY > $yStart && $centerY < $yEnd;
              }
          ));

          $systemQuestionText = strip_tags($paperQuestion->question_text ?? '');

          $studentAnswer = $this->extractAnswerFromBlocks(
              $answerBlocks,
              $systemQuestionText,
              $paperQuestion->question_type ?? null
          );

          // Mean OCR confidence over the band (reported in debug_info only).
          $confidences = array_filter(
              array_column($answerBlocks, 'confidence'),
              static fn ($value) => $value !== null
          );
          $avgConfidence = !empty($confidences) ? array_sum($confidences) / count($confidences) : 0;

          $results[$paperQuestion->question_number] = [
              'student_answer' => trim($studentAnswer),
              'confidence' => $bestMatch['similarity'], // match similarity is used as the confidence
              'coordinates' => [
                  'y_min' => $yStart,
                  'y_max' => $yEnd
              ],
              'debug_info' => [
                  'y_start' => $yStart,
                  'y_end' => $yEnd,
                  'block_count' => count($answerBlocks),
                  'system_question_length' => mb_strlen($systemQuestionText),
                  'ocr_question_number' => $bestMatch['ocr_question_number'],
                  'match_similarity' => round($bestMatch['similarity'] * 100, 2) . '%',
                  'ocr_confidence' => round($avgConfidence * 100, 2) . '%'
              ],
              'question_text' => $systemQuestionText
          ];

          \Log::info("系统Q{$paperQuestion->question_number} 匹配到 OCR Q{$bestMatch['ocr_question_number']}", [
              'similarity' => round($bestMatch['similarity'] * 100, 2) . '%',
              'student_answer_preview' => mb_substr($studentAnswer, 0, 50)
          ]);
      }

      \Log::info('OCR匹配完成', [
          'matched_count' => count(array_filter($results, fn($r) => !empty($r['student_answer']))),
          'total_count' => count($results)
      ]);

      return $results;
  }
  /**
   * Partition blocks into per-question units along the y axis.
   *
   * Every block typed 'question_number' opens a unit that spans from its own
   * centre y (inclusive) to the centre y of the next question-number block
   * (exclusive); the last unit is open-ended.
   *
   * @param array $blocks Text blocks carrying 'type', 'text' and 'position'.
   * @return array List of units with 'question_number', 'question_text', 'blocks' and 'y_range'.
   */
  private function groupBlocksByQuestionNumber(array $blocks): array
  {
      // Pass 1: collect the question-number anchors.
      $anchors = [];
      foreach ($blocks as $blockIndex => $candidate) {
          if ($candidate['type'] !== 'question_number') {
              continue;
          }
          $anchors[] = [
              'index' => $blockIndex,
              'text' => $candidate['text'],
              'y' => $this->getBlockCenterY($candidate),
              'number' => $this->extractQuestionNumber($candidate['text'])
          ];
      }

      // Pass 2: assign every block to the anchor band that contains its centre.
      $units = [];
      foreach ($anchors as $i => $anchor) {
          $following = $anchors[$i + 1] ?? null;
          $bandTop = $anchor['y'];
          $bandBottom = $following ? $following['y'] : PHP_INT_MAX;

          $members = array_values(array_filter(
              $blocks,
              function ($block) use ($bandTop, $bandBottom, $following) {
                  $centerY = $this->getBlockCenterY($block);

                  return $centerY >= $bandTop && ($following === null || $centerY < $bandBottom);
              }
          ));

          $units[] = [
              'question_number' => $anchor['number'],
              'question_text' => $anchor['text'],
              'blocks' => $members,
              'y_range' => ['start' => $bandTop, 'end' => $bandBottom]
          ];
      }

      return $units;
  }
  /**
   * Turn grouped question units into structured question records.
   *
   * Within a unit, 'content' blocks feed the question body, 'option' blocks are
   * split into letter/content pairs and 'answer_area' blocks are scanned for the
   * answer; blocks of any other type are ignored here.
   *
   * @param array $questionUnits Units produced by groupBlocksByQuestionNumber().
   * @return array List of records with question_number, content, options, answer, confidence.
   */
  private function processQuestionUnits(array $questionUnits): array
  {
      $structured = [];

      foreach ($questionUnits as $unit) {
          $bodyParts = [];
          $choices = [];
          $answerTexts = [];

          foreach ($unit['blocks'] as $block) {
              switch ($block['type']) {
                  case 'content':
                      $bodyParts[] = $block['text'];
                      break;

                  case 'option':
                      // Split "A. text" into its letter and content; keep the raw text otherwise.
                      $parsed = preg_match('/^([A-Da-d])[\.\))]\s*(.*)/', $block['text'], $pieces);
                      $choices[] = $parsed
                          ? ['letter' => strtoupper($pieces[1]), 'content' => trim($pieces[2])]
                          : ['letter' => '', 'content' => $block['text']];
                      break;

                  case 'answer_area':
                      $answerTexts[] = $block['text'];
                      break;
              }
          }

          $structured[] = [
              'question_number' => $unit['question_number'],
              'content' => $this->mergeContentParts($bodyParts),
              'options' => $choices,
              'answer' => $this->extractAnswerFromAnswerAreas($answerTexts),
              'confidence' => $this->calculateConfidence($unit['blocks'])
          ];
      }

      return $structured;
  }
  /**
   * Join question-body fragments with single spaces, skipping bare number markers.
   *
   * A fragment is skipped when it starts with a number marker such as "1.", "1)",
   * "(1)" or "1、". The pattern carries the `u` modifier so the multibyte "、" is
   * compared as one character; without it any character sharing its first byte
   * (for example "㎡" in "5㎡") was mistaken for a marker and the fragment dropped.
   *
   * @param array $contentParts Text fragments in reading order.
   * @return string Merged, trimmed text.
   */
  private function mergeContentParts(array $contentParts): string
  {
      $kept = [];

      foreach ($contentParts as $part) {
          if (preg_match('/^[\((]?\d+[\.\)))、]/u', $part)) {
              continue;
          }
          $kept[] = $part;
      }

      // implode() always inserts the separator, so a fragment of "0" no longer
      // swallows the following space the way the truthiness test did.
      return trim(implode(' ', $kept));
  }
  /**
   * Pick an answer out of the answer-area texts.
   *
   * The first area containing a letter A-D (either case) wins and yields that
   * letter upper-cased. Otherwise the result is the first non-blank token of the
   * last area that does not mention a score box ("得分" / "评卷人").
   *
   * @param array $answerAreas Texts of the answer-area blocks.
   * @return string Extracted answer, or '' when nothing qualifies.
   */
  private function extractAnswerFromAnswerAreas(array $answerAreas): string
  {
      $fallback = '';

      foreach ($answerAreas as $areaText) {
          // Choice letter: return immediately.
          if (preg_match('/([A-Da-d])/', $areaText, $letter)) {
              return strtoupper($letter[1]);
          }

          // Fill-in candidate: later areas overwrite earlier ones.
          $isScoreBox = preg_match('/(得分|评卷人)/', $areaText);
          if (!$isScoreBox && preg_match('/\S+/', $areaText, $token)) {
              $fallback = trim($token[0]);
          }
      }

      return $fallback;
  }
  /**
   * Average the non-null 'confidence' values of the given blocks.
   *
   * @param array $blocks Blocks each carrying a 'confidence' entry (possibly null).
   * @return float Mean confidence, or 0.8 when no block reports one.
   */
  private function calculateConfidence(array $blocks): float
  {
      $sum = 0;
      $samples = 0;

      foreach ($blocks as $block) {
          if ($block['confidence'] === null) {
              continue;
          }
          $sum += $block['confidence'];
          $samples++;
      }

      if ($samples === 0) {
          return 0.8;
      }

      return $sum / $samples;
  }
  /**
   * Vertical centre of a block: midpoint between the smallest and largest y of its points.
   *
   * The midpoint is truncated to an integer with an explicit cast. The declared return
   * type is ?int, and returning the fractional float directly relies on implicit
   * float-to-int coercion, which is deprecated since PHP 8.1.
   *
   * @param array $block Block whose 'position' is a list of points with a 'y' entry.
   * @return int|null Centre y, or null when the block has no usable position.
   */
  private function getBlockCenterY(array $block): ?int
  {
      if (empty($block['position']) || !is_array($block['position'])) {
          return null;
      }

      $ys = [];
      foreach ($block['position'] as $point) {
          if (isset($point['y'])) {
              $ys[] = $point['y'];
          }
      }

      if ($ys === []) {
          return null;
      }

      return (int) ((min($ys) + max($ys)) / 2);
  }
  /**
   * Read the first run of ASCII digits in the text as the question number.
   *
   * @param string $text Text of a question-number block.
   * @return int The number, or 0 when the text contains no digit.
   */
  private function extractQuestionNumber(string $text): int
  {
      $found = preg_match('/[\d]+/', $text, $digits);

      return $found ? (int) $digits[0] : 0;
  }
  /**
   * Refine parsed questions using the system paper's question list.
   *
   * Questions are paired with system questions by question number. A paired
   * question is refined by question type and by correct answer, each only when
   * the system question provides that field. Unpaired questions pass through.
   *
   * @param array $questions Parsed questions.
   * @param array $paperInfo System paper info; its 'questions' entry is used when present.
   * @return array Re-indexed list of (possibly refined) questions.
   */
  private function optimizeWithPaperInfo(array $questions, array $paperInfo): array
  {
      // Index the system questions by their number.
      $systemByNumber = [];
      foreach ($paperInfo['questions'] ?? [] as $systemQuestion) {
          $systemByNumber[$systemQuestion['question_number']] = $systemQuestion;
      }

      $refine = function ($question) use ($systemByNumber) {
          $number = $question['question_number'];
          if (!isset($systemByNumber[$number])) {
              return $question;
          }

          $reference = $systemByNumber[$number];
          if (isset($reference['question_type'])) {
              $question = $this->optimizeByQuestionType($question, $reference['question_type']);
          }
          if (isset($reference['correct_answer'])) {
              $question = $this->optimizeAnswer($question, $reference['correct_answer']);
          }

          return $question;
      };

      return array_values(array_map($refine, $questions));
  }
  /**
   * Apply type-specific post-processing to a parsed question.
   *
   * - 'choice': when no options were parsed and the content contains a letter
   *   A-D, options are extracted from the content.
   * - 'fill':   the blank markers found in the content are stored under 'blanks'.
   * - 'answer': the full content is copied to 'full_solution'.
   * Any other type leaves the question untouched.
   *
   * @param array  $question     Parsed question.
   * @param string $questionType Question type taken from the system paper.
   * @return array The question, possibly augmented.
   */
  private function optimizeByQuestionType(array $question, string $questionType): array
  {
      if ($questionType === 'choice') {
          if (empty($question['options']) && preg_match('/[A-Da-d]/', $question['content'])) {
              $question['options'] = $this->extractOptionsFromContent($question['content']);
          }

          return $question;
      }

      if ($questionType === 'fill') {
          $question['blanks'] = $this->findFillBlanks($question['content']);

          return $question;
      }

      if ($questionType === 'answer') {
          $question['full_solution'] = $question['content'];
      }

      return $question;
  }
  /**
   * Normalise the answer of a question that has options (treated as a choice question).
   *
   * @param array  $question      Parsed question.
   * @param string $correctAnswer Correct answer from the system paper, forwarded to the normaliser.
   * @return array The question, with 'answer' normalised when options are present.
   */
  private function optimizeAnswer(array $question, string $correctAnswer): array
  {
      if (empty($question['options'])) {
          return $question;
      }

      $question['answer'] = $this->normalizeChoiceAnswer($question['answer'], $correctAnswer);

      return $question;
  }
  /**
   * Normalise a choice answer to a letter.
   *
   * Circled numbers ①-④ and digits 1-4 map to A-D; any other trimmed value is
   * upper-cased. $correctAnswer is accepted for interface compatibility and is not read.
   *
   * @param string $studentAnswer Answer as recognised.
   * @param string $correctAnswer Correct answer (unused).
   * @return string Normalised answer.
   */
  private function normalizeChoiceAnswer(string $studentAnswer, string $correctAnswer): string
  {
      $candidate = trim($studentAnswer);

      return match ($candidate) {
          '①', '1' => 'A',
          '②', '2' => 'B',
          '③', '3' => 'C',
          '④', '4' => 'D',
          default => strtoupper($candidate),
      };
  }
  /**
   * Extract choice options from multi-line content.
   *
   * Each line that, once trimmed, starts with a letter A-D followed by "." or ")"
   * yields one option; other lines are ignored.
   *
   * @param string $content Question content, one candidate option per line.
   * @return array List of ['letter' => upper-case letter, 'content' => trimmed text].
   */
  private function extractOptionsFromContent(string $content): array
  {
      $found = [];

      foreach (explode("\n", $content) as $rawLine) {
          if (!preg_match('/^([A-Da-d])[\.\))]\s*(.*)/', trim($rawLine), $pieces)) {
              continue;
          }
          $found[] = [
              'letter' => strtoupper($pieces[1]),
              'content' => trim($pieces[2])
          ];
      }

      return $found;
  }
  /**
   * Locate the blank markers of a fill-in question.
   *
   * A blank is a run of two or more underscores, or a parenthesised span in
   * ASCII "( )" or full-width "（ ）" form. The previous pattern contained the
   * alternative `([\s\S]*?)`, a lazy group that matches the empty string at every
   * offset, so the result was flooded with '' entries.
   * NOTE(review): the full-width alternative is presumed to be what that group
   * was meant to express - confirm against the upstream file.
   *
   * @param string $content Question content.
   * @return array Matched blank markers in order of appearance; empty when none.
   */
  private function findFillBlanks(string $content): array
  {
      if (preg_match_all('/_{2,}|（[\s\S]*?）|\([\s\S]*?\)/u', $content, $matches)) {
          return $matches[0];
      }

      return [];
  }
  /**
   * Build a human-readable debug report of the parse result.
   *
   * Section 1 reports the number of raw text blocks and their type distribution;
   * section 2 lists every structured question with a content preview, option
   * count, answer and confidence.
   *
   * @param array $ocrData             Raw OCR payload.
   * @param array $structuredQuestions Output of parseStructuredQuestions().
   * @return string Multi-line report.
   */
  public function generateDebugOutput(array $ocrData, array $structuredQuestions): string
  {
      $output = "=== OCR数据解析调试输出 ===\n\n";

      // Raw block statistics.
      $blocks = $this->extractTextBlocks($ocrData);
      $output .= "1. 原始文本块数量: " . count($blocks) . "\n";

      $typeStats = [];
      foreach ($blocks as $block) {
          $type = $block['type'];
          $typeStats[$type] = ($typeStats[$type] ?? 0) + 1;
      }
      $output .= " 类型分布: " . json_encode($typeStats, JSON_UNESCAPED_UNICODE) . "\n\n";

      // Structured questions.
      $output .= "2. 识别到的题目数量: " . count($structuredQuestions) . "\n";
      foreach ($structuredQuestions as $i => $q) {
          // mb_substr cuts on character boundaries; byte-wise substr could split a
          // multibyte character and emit invalid UTF-8.
          $preview = mb_substr($q['content'], 0, 100);
          // Compare against '' so an answer of "0" is not reported as unrecognised.
          $answer = ($q['answer'] ?? '') !== '' ? $q['answer'] : '未识别';

          $output .= "\n题目 " . ($i + 1) . " (题号: {$q['question_number']}):\n";
          $output .= " - 内容: " . $preview . "...\n";
          $output .= " - 选项数: " . count($q['options']) . "\n";
          $output .= " - 答案: " . $answer . "\n";
          $output .= " - 置信度: " . round($q['confidence'] * 100, 2) . "%\n";
      }

      return $output;
  }
  /**
   * Locate the block that anchors a system question on the page.
   *
   * Strategies, in order:
   *  1.   a block starting with the question number followed by ".", ")" or "、"
   *       (optionally preceded by an opening parenthesis);
   *  1.5. a block shorter than 5 bytes that is the bare number, or the number
   *       followed by whitespace;
   *  2.   a block containing the first 15 characters of the question text, or
   *       whose start is more than 80% similar to them.
   *
   * @param array $blocks        Text blocks with 'text' and 'position'.
   * @param mixed $paperQuestion Object read for question_number and question_text.
   * @return array|null Coordinates of the anchor block, or null when none is found.
   */
  public function findQuestionAnchor(array $blocks, $paperQuestion): ?array
  {
      $rawNumber = (string) $paperQuestion->question_number;
      // Quote the number before embedding it in a pattern so regex metacharacters
      // in it cannot alter the expression.
      $qNum = preg_quote($rawNumber, '/');

      // question_text may be null; strip_tags() requires a string.
      $cleanContent = strip_tags($paperQuestion->question_text ?? '');
      $cleanContent = (string) preg_replace('/\s+/', '', $cleanContent);

      // Strategy 1: "<number>." style marker. The `u` modifier makes the multibyte
      // "、" match as one character instead of as individual bytes.
      foreach ($blocks as $block) {
          if (preg_match('/^[\((]?' . $qNum . '[\.\)))、]/u', $block['text'])) {
              return $this->getBlockCoordinates($block);
          }
      }

      // Strategy 1.5: bare number, as produced when OCR splits "1." into "1" and ".".
      // The length limit keeps e.g. "100" from being taken for question 1.
      foreach ($blocks as $block) {
          $startsWithNumber = preg_match('/^' . $qNum . '\s+/', $block['text']);
          if (($startsWithNumber || $block['text'] === $rawNumber) && strlen($block['text']) < 5) {
              return $this->getBlockCoordinates($block);
          }
      }

      // Strategy 2: fall back to the leading characters of the question text.
      $prefix = mb_substr($cleanContent, 0, 15);
      if (mb_strlen($prefix) > 2) {
          foreach ($blocks as $block) {
              $blockText = preg_replace('/\s+/', '', $block['text']);

              if (mb_strpos($blockText, $prefix) !== false) {
                  return $this->getBlockCoordinates($block);
              }

              // Fuzzy comparison against the start of the block.
              similar_text($prefix, mb_substr($blockText, 0, mb_strlen($prefix) + 5), $percent);
              if ($percent > 80) {
                  return $this->getBlockCoordinates($block);
              }
          }
      }

      return null;
  }
  /**
   * Bounding box of a block computed from its position points.
   *
   * All four keys are always present. An axis for which no point carries a
   * coordinate reports 0 for both bounds; previously min()/max() were called on
   * an empty list (a ValueError in PHP 8) and the empty-position branch omitted
   * the x keys.
   *
   * @param array $block Block whose 'position' is a list of points with 'x' / 'y'.
   * @return array{y_top: int|float, y_bottom: int|float, x_left: int|float, x_right: int|float}
   */
  private function getBlockCoordinates(array $block): array
  {
      $points = $block['position'] ?? null;
      $ys = is_array($points) ? array_column($points, 'y') : [];
      $xs = is_array($points) ? array_column($points, 'x') : [];

      return [
          'y_top' => $ys === [] ? 0 : min($ys),
          'y_bottom' => $ys === [] ? 0 : max($ys),
          'x_left' => $xs === [] ? 0 : min($xs),
          'x_right' => $xs === [] ? 0 : max($xs),
      ];
  }
  /**
   * Extract the student's answer from the OCR result of a cropped region by
   * removing the printed question text.
   *
   * Strategies, in order:
   *  A. the normalised OCR text starts with the normalised question: strip it as a prefix;
   *  B. the tail of the question text is found in the OCR text: keep what follows it;
   *  C. heuristics: text from a "解:" / "答:" marker onwards, else the token after a
   *     trailing "=", else the whole OCR text.
   *
   * @param array  $cropResult         OCR result holding 'content', or 'questions' with 'content' entries.
   * @param string $systemQuestionText Question text from the system paper.
   * @return string Extracted answer ('' when the crop has no text).
   */
  public function extractAnswerFromCrop(array $cropResult, string $systemQuestionText): string
  {
      // 1. Full recognised text.
      $ocrText = '';
      if (isset($cropResult['content'])) {
          $ocrText = $cropResult['content'];
      } elseif (isset($cropResult['questions'])) {
          $ocrText = implode("\n", array_column($cropResult['questions'], 'content'));
      }

      // Compare against '' rather than empty(): a crop reading just "0" is a valid answer.
      if (!is_string($ocrText) || $ocrText === '') {
          return '';
      }

      // 2. Normalise both texts for comparison.
      $normalizedOcr = $this->normalizeTextForComparison($ocrText);
      $normalizedSystem = $this->normalizeTextForComparison($systemQuestionText);

      // 3A. The OCR text begins with the question.
      if (str_starts_with($normalizedOcr, $normalizedSystem)) {
          return trim($this->removePrefixFuzzy($ocrText, $systemQuestionText));
      }

      // 3B. Split after the question's tail. findSplitIndex() returns a character
      // offset, so the cut must use mb_substr(); byte-wise substr() lands inside
      // the question or mid-character on multibyte text.
      $splitIndex = $this->findSplitIndex($ocrText, $systemQuestionText);
      if ($splitIndex > 0) {
          return trim(mb_substr($ocrText, $splitIndex));
      }

      // 3C. Heuristic: keep the text from the solution/answer marker onwards (marker included).
      if (preg_match('/(解[::]|答[::])(.*)/s', $ocrText, $matches)) {
          return trim($matches[0]);
      }

      // Heuristic for fill-in answers such as "... = 3": take what follows the final "=".
      if (preg_match('/=\s*(\S+)$/', $ocrText, $matches)) {
          return trim($matches[1]);
      }

      return $ocrText;
  }
  /**
   * Reduce text to a canonical form for fuzzy comparison: tags, whitespace and
   * punctuation removed, ASCII letters lower-cased.
   *
   * The pattern uses the `u` modifier so that Unicode whitespace (e.g. the
   * full-width space) and CJK punctuation (e.g. "，" "。") are stripped too; the
   * byte-wise pattern only removed their ASCII counterparts. Input that is not
   * valid UTF-8 falls back to the byte-wise pattern.
   *
   * @param string $text Raw text, possibly containing HTML tags.
   * @return string Normalised text.
   */
  private function normalizeTextForComparison(string $text): string
  {
      $text = strip_tags($text);

      $stripped = preg_replace('/[\s\p{P}[:punct:]]+/u', '', $text);
      if ($stripped === null) {
          // preg_replace() returns null when the subject is not valid UTF-8.
          $stripped = (string) preg_replace('/[\s[:punct:]]+/', '', $text);
      }

      return strtolower($stripped);
  }
  /**
   * Strip a fuzzily matching prefix from the start of a text.
   *
   * The first (prefix length + 10) characters of $fullText are compared with
   * $prefix; when they are more than 80% similar, the first (prefix length)
   * characters are removed. Lengths are counted in characters, so the cut never
   * falls inside a multibyte character even when the OCR text and the prefix
   * differ in byte width (e.g. "," versus "，").
   *
   * @param string $fullText Text assumed to start with the prefix.
   * @param string $prefix   Expected prefix.
   * @return string $fullText without the prefix, or unchanged when the match is too weak.
   */
  private function removePrefixFuzzy(string $fullText, string $prefix): string
  {
      $prefixLength = mb_strlen($prefix);
      // Take a little more than the prefix length to tolerate OCR insertions.
      $candidate = mb_substr($fullText, 0, $prefixLength + 10);

      similar_text($candidate, $prefix, $percent);
      if ($percent > 80) {
          return mb_substr($fullText, $prefixLength);
      }

      return $fullText;
  }
  /**
   * Find where the question text ends inside the OCR text.
   *
   * The last 10 characters of $systemText serve as the anchor (the whole text
   * when that tail is shorter than 5 characters). The result is a character
   * offset, not a byte offset.
   *
   * @param string $ocrText    Recognised text.
   * @param string $systemText Question text from the system paper.
   * @return int Character offset just past the anchor, or 0 when the anchor is not found.
   */
  private function findSplitIndex(string $ocrText, string $systemText): int
  {
      $tail = mb_substr($systemText, -10);
      $needle = mb_strlen($tail) < 5 ? $systemText : $tail;

      $foundAt = mb_strpos($ocrText, $needle);

      return $foundAt === false ? 0 : $foundAt + mb_strlen($needle);
  }
  /**
   * Find the index of the first block belonging to the answer by accumulating
   * block texts until they resemble the system question text.
   *
   * @param array  $blocks     Text blocks in reading order.
   * @param string $systemText Question text from the system paper.
   * @return int Index of the first answer block, or 0 when no match is reached.
   */
  private function findAnswerStartIndex(array $blocks, string $systemText): int
  {
      $normalizedSystem = $this->normalizeTextForComparison($systemText);
      $systemLength = mb_strlen($normalizedSystem);
      $accumulated = '';

      foreach ($blocks as $index => $block) {
          $accumulated .= $block['text'];
          $normalizedAccumulated = $this->normalizeTextForComparison($accumulated);

          similar_text($normalizedAccumulated, $normalizedSystem, $percent);
          \Log::debug("Block {$index} accumulated similarity: {$percent}%", [
              'accumulated_length' => mb_strlen($accumulated),
              'system_length' => mb_strlen($systemText)
          ]);

          // Above 80% the question text is considered fully consumed.
          $questionConsumed = $percent > 80;
          // Fallback for noisy OCR: clearly longer than the question yet still fairly similar.
          $overrunButSimilar = mb_strlen($normalizedAccumulated) > $systemLength * 1.2 && $percent > 60;

          if ($questionConsumed || $overrunButSimilar) {
              return $index + 1; // the answer starts with the next block
          }
      }

      // No match: conservatively start from the first block.
      return 0;
  }
  /**
   * Extract the student's answer from the blocks of an answer band, leaving out
   * the printed question text.
   *
   * Strategies, in order:
   *  1. text following an explicit answer marker ("答:", "答案:", "解:", "解答:");
   *  2. choice questions: a block consisting of a single letter A-D;
   *  3. fill questions: a short block (<= 25 chars) with a digit or "=" that is
   *     not part of the question text;
   *  4. otherwise: the shortest block that is neither part of the question text
   *     nor contains "分".
   *
   * @param array       $blocks             Blocks inside the answer band.
   * @param string      $systemQuestionText Question text from the system paper.
   * @param string|null $questionType       'choice', 'fill' or anything else.
   * @return string Extracted answer, or '' when nothing qualifies.
   */
  private function extractAnswerFromBlocks(array $blocks, string $systemQuestionText, ?string $questionType = null): string
  {
      if (empty($blocks)) {
          return '';
      }

      // Strategy 1: explicit answer marker.
      foreach ($blocks as $block) {
          foreach (['答:', '答案:', '解:', '解答:'] as $keyword) {
              if (mb_strpos($block['text'], $keyword) === false) {
                  continue;
              }
              $parts = mb_split($keyword, $block['text']);
              if (count($parts) > 1) {
                  return trim($parts[1]);
              }
          }
      }

      // Block types that are structural and never carry an answer.
      $structuralTypes = ['question_number', 'section_header', 'option'];
      $normalizedQuestion = $this->normalizeTextForComparison($systemQuestionText);

      // Strategy 2: single letter for choice questions.
      if ($questionType === 'choice') {
          foreach ($blocks as $block) {
              $text = trim($block['text']);
              if (in_array($this->detectTextType($text), $structuralTypes, true)) {
                  continue;
              }
              if (preg_match('/^[A-Da-d][\.\))]?$/u', $text)) {
                  return strtoupper(preg_replace('/[^A-D]/i', '', $text));
              }
          }
      }

      // Strategy 3: short number / equation for fill questions.
      if ($questionType === 'fill') {
          foreach ($blocks as $block) {
              $text = trim($block['text']);
              if (in_array($this->detectTextType($text), $structuralTypes, true)) {
                  continue;
              }
              if (mb_strlen($text) > 25 || !preg_match('/[\\d=]/u', $text)) {
                  continue;
              }
              $normalizedText = $this->normalizeTextForComparison($text);
              if (mb_strpos($normalizedQuestion, $normalizedText) === false) {
                  return $text;
              }
          }
      }

      // Strategy 4: shortest fragment that is not part of the question text.
      $candidates = [];
      foreach ($blocks as $block) {
          $text = trim($block['text']);
          if (in_array($this->detectTextType($text), $structuralTypes, true)) {
              continue;
          }
          if ($text === '' || mb_strpos($text, '分') !== false) {
              continue;
          }
          $normalizedText = $this->normalizeTextForComparison($text);
          if ($normalizedText !== '' && mb_strpos($normalizedQuestion, $normalizedText) === false) {
              $candidates[] = $text;
          }
      }

      if (empty($candidates)) {
          return '';
      }

      usort($candidates, fn($a, $b) => mb_strlen($a) <=> mb_strlen($b));

      return trim($candidates[0]);
  }
  /**
   * Split the block stream into OCR questions, using question-number blocks as boundaries.
   *
   * A 'question_number' block closes the open question (if any) and opens a new
   * one; a 'section_header' block closes the open question without opening
   * another; any other block is appended to the open question, or ignored when
   * none is open.
   *
   * @param array $blocks Text blocks in reading order.
   * @return array List of questions with ocr_question_number, question_text, blocks, y_start, y_end.
   */
  private function extractOCRQuestions(array $blocks): array
  {
      $collected = [];
      $open = null;

      foreach ($blocks as $block) {
          $type = $this->detectTextType($block['text']);

          if ($type === 'question_number' || $type === 'section_header') {
              // Either marker terminates the question in progress.
              if ($open !== null) {
                  $collected[] = $open;
              }

              $open = $type === 'question_number'
                  ? [
                      'ocr_question_number' => $this->extractQuestionNumber($block['text']),
                      'question_text' => $block['text'],
                      'blocks' => [$block],
                      'y_start' => $this->getBlockTopY($block),
                      'y_end' => $this->getBlockBottomY($block),
                  ]
                  : null;
              continue;
          }

          if ($open === null) {
              continue;
          }

          // Accumulate the block into the open question.
          $open['blocks'][] = $block;
          $open['question_text'] .= ' ' . $block['text'];
          $open['y_end'] = $this->getBlockBottomY($block);
      }

      // Flush the trailing question.
      if ($open !== null) {
          $collected[] = $open;
      }

      return $collected;
  }
  /**
   * Find the OCR question that best matches a system (paper) question.
   *
   * Matching is attempted in two stages:
   *  1. The first OCR question whose 'ocr_question_number' is strictly equal to
   *     the paper question's number (cast to int) is returned immediately with a
   *     fixed baseline similarity of 0.6.
   *  2. Otherwise every OCR question is scored with similar_text() on the
   *     normalised texts, and the highest-scoring one is returned if it reaches
   *     the 30% threshold.
   *
   * @param array $ocrQuestions Questions as produced by extractOCRQuestions().
   * @param mixed $paperQuestion Object exposing 'question_text' and 'question_number'.
   * @return array|null The matched OCR question with an added 'similarity' entry
   *                    (0..1), or null when nothing reaches the threshold.
   */
  private function findBestMatchingOCRQuestion(array $ocrQuestions, $paperQuestion): ?array
  {
      $targetNumber = (int) ($paperQuestion->question_number ?? 0);

      // Stage 1: direct hit on the question number.
      foreach ($ocrQuestions as $ocrQ) {
          if (($ocrQ['ocr_question_number'] ?? null) === $targetNumber) {
              $ocrQ['similarity'] = 0.6; // baseline similarity for a number-only match
              \Log::info("Found match by number for Q{$paperQuestion->question_number}", [
                  'ocr_question_number' => $ocrQ['ocr_question_number']
              ]);
              return $ocrQ;
          }
      }

      // Stage 2: text similarity. The system text is normalised only now, since a
      // number match above never needs it.
      $systemTextRaw = strip_tags($paperQuestion->question_text ?? '');
      $systemText = $this->normalizeTextForMatching($systemTextRaw);

      // Threshold is deliberately low to absorb OCR recognition errors and LaTeX differences.
      $minSimilarityPercent = 30;

      $bestMatch = null;
      $bestSimilarity = 0;

      foreach ($ocrQuestions as $ocrQ) {
          $ocrText = $this->normalizeTextForMatching($ocrQ['question_text']);

          // No per-number bonus is applied here: any OCR question whose number equals
          // the target was already returned by stage 1, so such a bonus could never fire.
          // NOTE(review): similar_text() is not multibyte-aware, so for CJK text the
          // percentage is a byte-level approximation.
          similar_text($systemText, $ocrText, $percent);

          \Log::debug("Matching Q{$paperQuestion->question_number} with OCR Q{$ocrQ['ocr_question_number']}", [
              'similarity' => round($percent, 2),
              'system_text_preview' => mb_substr($systemTextRaw, 0, 50),
              'ocr_text_preview' => mb_substr($ocrQ['question_text'], 0, 50)
          ]);

          if ($percent > $bestSimilarity) {
              $bestSimilarity = $percent;
              $bestMatch = $ocrQ;
          }
      }

      // Only accept a candidate that reaches the threshold.
      if ($bestMatch !== null && $bestSimilarity >= $minSimilarityPercent) {
          $bestMatch['similarity'] = $bestSimilarity / 100;
          \Log::info("Found match for Q{$paperQuestion->question_number}", [
              'ocr_question_number' => $bestMatch['ocr_question_number'],
              'similarity' => round($bestSimilarity, 2) . '%'
          ]);
          return $bestMatch;
      }

      \Log::warning("No match found for Q{$paperQuestion->question_number}", [
          'best_similarity' => round($bestSimilarity, 2) . '%'
      ]);

      return null;
  }
  /**
   * Reduce a text to a canonical form for fuzzy matching.
   *
   * Steps, in order: drop `$...$` / `$$...$$` LaTeX spans, strip HTML tags,
   * remove every character that is not a Unicode letter or number (this covers
   * punctuation, symbols and whitespace in a single pass), then lowercase.
   *
   * @param string $text Raw question text (may contain HTML, LaTeX, OCR noise).
   * @return string Lowercased string consisting only of letters and numbers.
   */
  private function normalizeTextForMatching(string $text): string
  {
      // Malformed UTF-8 makes preg_replace() with the /u modifier return null,
      // so invalid byte sequences are replaced up front.
      $text = mb_scrub($text, 'UTF-8');

      // Remove LaTeX spans delimited by $ or $$.
      $withoutLatex = preg_replace('/\$\$?[^\$]+\$\$?/s', '', $text) ?? $text;

      // Remove HTML tags.
      $plain = strip_tags($withoutLatex);

      // Keep letters and numbers only; punctuation, symbols and whitespace are all
      // outside \p{L}\p{N}, so one pass replaces the separate stripping steps.
      $alphanumeric = preg_replace('/[^\p{L}\p{N}]+/u', '', $plain) ?? $plain;

      return mb_strtolower($alphanumeric, 'UTF-8');
  }
  /**
   * Get the top Y coordinate of a block, i.e. the smallest 'y' among its position vertices.
   *
   * @param array $block OCR block; 'position' is read as a list of vertices with a 'y' entry.
   * @return int Smallest numeric 'y' value truncated to int, or 0 when the block has
   *             no position or none of its vertices carries a numeric 'y'.
   */
  private function getBlockTopY(array $block): int
  {
      $position = $block['position'] ?? null;
      if (!is_array($position) || $position === []) {
          return 0;
      }

      // Vertices without a usable 'y' are skipped; min() on an empty list would throw on PHP 8.
      $ys = array_filter(
          array_column($position, 'y'),
          static fn ($y): bool => is_numeric($y)
      );
      if ($ys === []) {
          return 0;
      }

      // Coordinates may be floats; cast explicitly to honour the int return type.
      return (int) min($ys);
  }
  /**
   * Get the bottom Y coordinate of a block, i.e. the largest 'y' among its position vertices.
   *
   * @param array $block OCR block; 'position' is read as a list of vertices with a 'y' entry.
   * @return int Largest numeric 'y' value truncated to int, or 0 when the block has
   *             no position or none of its vertices carries a numeric 'y'.
   */
  private function getBlockBottomY(array $block): int
  {
      $position = $block['position'] ?? null;
      if (!is_array($position) || $position === []) {
          return 0;
      }

      // Vertices without a usable 'y' are skipped; max() on an empty list would throw on PHP 8.
      $ys = array_filter(
          array_column($position, 'y'),
          static fn ($y): bool => is_numeric($y)
      );
      if ($ys === []) {
          return 0;
      }

      // Coordinates may be floats; cast explicitly to honour the int return type.
      return (int) max($ys);
  }
  970. }