OCRService.php 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714
  1. <?php
  2. namespace App\Services;
  3. use App\Models\OCRRecord;
  4. use App\Models\OCRQuestionResult;
  5. use App\Services\ImageProcessingService;
  6. use Illuminate\Http\UploadedFile;
  7. use Illuminate\Support\Facades\Http;
  8. use Illuminate\Support\Facades\Storage;
  9. use Illuminate\Support\Str;
  10. class OCRService
  11. {
  /** @var mixed OCR driver instance returned by OCRFactory::create(). */
  protected $ocrDriver;

  /** @var LearningAnalyticsService Used by submitToAnalysis() to submit OCR analysis payloads. */
  protected $learningAnalyticsService;

  /** @var ImageProcessingService Used by performEnhancedMatching() to crop answer regions. */
  protected $imageProcessingService;

  /**
   * Keep the injected collaborators and obtain the OCR driver from the factory.
   *
   * @param LearningAnalyticsService $learningAnalyticsService Analysis submission service.
   * @param ImageProcessingService   $imageProcessingService   Image cropping service.
   */
  public function __construct(
      LearningAnalyticsService $learningAnalyticsService,
      ImageProcessingService $imageProcessingService
  ) {
      $this->imageProcessingService = $imageProcessingService;
      $this->learningAnalyticsService = $learningAnalyticsService;
      $this->ocrDriver = \App\Services\OCR\OCRFactory::create();
  }
  /**
   * Store an uploaded exam-paper photo, create its OCR record and start recognition.
   *
   * @param UploadedFile $image     Uploaded photo; checked by validateImage() first.
   * @param string       $studentId Written to both `user_id` and `student_id`.
   * @return OCRRecord The created record; dispatchToOcrService() has already been invoked on it.
   * @throws \Exception When validation fails or the upload cannot be read or stored.
   */
  public function uploadExamPaper(UploadedFile $image, string $studentId): OCRRecord
  {
      $this->validateImage($image);

      // The UUID only names the stored file; it is not passed to OCRRecord::create().
      $recordId = 'ocr_' . Str::uuid()->toString();
      $imagePath = 'uploads/ocr/' . $recordId . '.' . $image->getClientOriginalExtension();

      $contents = file_get_contents($image->getPathName());
      if ($contents === false) {
          throw new \RuntimeException('文件保存失败');
      }

      // Write through getDisk() so the upload lands on the same disk the OCR step reads from.
      if (Storage::disk($this->getDisk())->put($imagePath, $contents) === false) {
          throw new \RuntimeException('文件保存失败');
      }

      $ocrRecord = OCRRecord::create([
          'user_id' => $studentId,
          'student_id' => $studentId, // kept in sync with user_id
          'file_path' => $imagePath,
          'paper_title' => $image->getClientOriginalName(),
          'status' => 'pending',
      ]);

      // Recognition is triggered directly from this call.
      $this->dispatchToOcrService($ocrRecord);

      return $ocrRecord;
  }
  /**
   * Validate an uploaded image: upload status, size limit and file extension.
   *
   * Limits come from `ocr.upload.max_size` (default 10 MiB) and
   * `ocr.upload.allowed_types` (default jpg, jpeg, png, webp). Only the
   * client-supplied extension is checked; file contents are not inspected here.
   *
   * @param UploadedFile $image File to validate.
   * @throws \Exception When the upload is invalid, too large, or has a disallowed extension.
   */
  protected function validateImage(UploadedFile $image): void
  {
      $maxSize = config('ocr.upload.max_size', 10 * 1024 * 1024);
      $allowedTypes = config('ocr.upload.allowed_types', ['jpg', 'jpeg', 'png', 'webp']);

      if (!$image->isValid()) {
          throw new \Exception('文件上传失败');
      }

      if ($image->getSize() > $maxSize) {
          throw new \Exception('文件大小超出限制(' . ($maxSize / 1024 / 1024) . 'MB)');
      }

      // The extension is lower-cased, so compare against lower-cased configured types,
      // and compare strictly so numeric-looking strings are never juggled.
      $extension = strtolower($image->getClientOriginalExtension());
      $normalizedTypes = array_map(
          static fn ($type): string => strtolower((string) $type),
          (array) $allowedTypes
      );

      if (!in_array($extension, $normalizedTypes, true)) {
          throw new \Exception('不支持的文件类型,仅支持:' . implode(', ', $allowedTypes));
      }
  }
  /**
   * Run OCR on the record's stored image and persist the recognised questions.
   *
   * Flow: resolve the image path, mark the record `processing`, call the OCR
   * driver, build a legacy question list (trailing A-D letter = student answer),
   * then try the structured parser (or enhanced matching when the linked paper
   * is `auto_generated`). If that structured path throws, the legacy list is
   * persisted instead. Any other failure marks the record `failed` with an
   * error message; nothing is rethrown to the caller.
   *
   * @param OCRRecord $ocrRecord Record whose image should be recognised.
   */
  protected function dispatchToOcrService(OCRRecord $ocrRecord): void
  {
      try {
          // uploadExamPaper() stores the path under `file_path`; use it when `image_path` is empty.
          $relativePath = $ocrRecord->image_path ?: $ocrRecord->file_path;
          if (empty($relativePath)) {
              throw new \Exception('OCR记录缺少图片路径,record_id: ' . $ocrRecord->id);
          }

          $imagePath = Storage::disk($this->getDisk())->path($relativePath);
          if (empty($imagePath)) {
              throw new \Exception('无法获取图片路径: ' . $relativePath);
          }
          if (!file_exists($imagePath)) {
              throw new \Exception('图片文件不存在: ' . $imagePath);
          }

          $ocrRecord->update(['status' => 'processing']);

          // Single API call with cutType "answer" (returns both question and answer).
          \Log::info('OCR: Extracting questions and answers', ['record_id' => $ocrRecord->id]);
          $result = $this->ocrDriver->recognize($imagePath, [
              'cutType' => 'answer',
              'subject' => 'Math',
              'ocr_record_id' => $ocrRecord->id
          ]);

          $items = $result['questions'] ?? [];
          \Log::info('OCR extraction complete', ['item_count' => count($items)]);

          // Legacy result, kept as the fallback if structured parsing fails.
          \Log::info('Parsing student answers from question text');
          $parsedQuestions = $this->parseTrailingChoiceAnswers($items);

          try {
              $finalQuestions = [];
              $paper = null;

              if ($ocrRecord->analysis_id) {
                  $paper = \App\Models\Paper::where('paper_id', $ocrRecord->analysis_id)->first();
              }

              $parser = new \App\Services\OCRDataParser();

              if ($paper && $paper->paper_type === 'auto_generated') {
                  // System-generated paper: match against its known questions.
                  $paperQuestions = \App\Models\PaperQuestion::where('paper_id', $paper->paper_id)
                      ->orderBy('question_number')
                      ->get();
                  $finalQuestions = $this->performEnhancedMatching($ocrRecord, $result, $paperQuestions);
              } else {
                  // Generic structured parsing, optionally guided by the paper's questions.
                  $paperInfo = null;
                  if ($paper) {
                      $paperQuestionsArr = \App\Models\PaperQuestion::where('paper_id', $paper->paper_id)
                          ->get()
                          ->map(function ($q) {
                              return [
                                  'question_number' => $q->question_number,
                                  'question_type' => $q->question_type,
                                  'correct_answer' => $q->correct_answer,
                                  'content' => $q->question_text
                              ];
                          })
                          ->toArray();
                      $paperInfo = ['questions' => $paperQuestionsArr];
                  }

                  $structuredQuestions = $parser->parseStructuredQuestions($result, $paperInfo);
                  foreach ($structuredQuestions as $q) {
                      $finalQuestions[] = [
                          'question_number' => $q['question_number'],
                          'content' => $q['content'],
                          'student_answer' => $q['answer'],
                          'confidence' => $q['confidence'],
                          'raw_data' => [
                              'options' => $q['options'] ?? [],
                              'blocks' => $q['blocks'] ?? []
                          ]
                      ];
                  }
              }

              $this->processOcrResult($ocrRecord, [
                  'questions' => $finalQuestions,
                  'raw' => $result
              ]);
          } catch (\Exception $e) {
              \Log::warning('OCR: 解析失败,回退到原有逻辑', [
                  'record_id' => $ocrRecord->id,
                  'error' => $e->getMessage()
              ]);

              // The failed attempt may already have inserted some question rows;
              // remove them so the fallback run does not produce duplicates.
              OCRQuestionResult::where('ocr_record_id', $ocrRecord->id)->delete();

              $this->processOcrResult($ocrRecord, [
                  'questions' => $parsedQuestions,
                  'raw' => $result
              ]);
          }
      } catch (\Exception $e) {
          \Log::error('OCR服务调用失败', [
              'record_id' => $ocrRecord->id,
              'error' => $e->getMessage(),
          ]);

          $ocrRecord->update([
              'status' => 'failed',
              'error_message' => 'OCR服务调用失败:' . $e->getMessage(),
          ]);
      }
  }

  /**
   * Legacy parser: treat a trailing A-D letter in each item's text as the student's choice.
   *
   * Text such as "…option D  B" yields answer "B" and the text without that
   * final letter. Items without a trailing A-D letter keep their full text and
   * an empty answer. An item without `question_number` gets null rather than
   * aborting the whole run.
   *
   * @param iterable $items Items from the driver result's `questions` list.
   * @return array List of question_number / content / student_answer / confidence / raw_data rows.
   */
  private function parseTrailingChoiceAnswers($items): array
  {
      $parsed = [];

      foreach ($items as $item) {
          $questionNumber = $item['question_number'] ?? null;
          $fullText = $item['content'] ?? '';
          $questionText = $fullText;
          $studentAnswer = '';

          if (preg_match('/([A-D])\s*$/u', $fullText, $matches)) {
              $studentAnswer = $matches[1];
              // Drop the answer letter from the question text.
              $questionText = preg_replace('/\s*[A-D]\s*$/', '', $fullText);

              \Log::info('Extracted student answer', [
                  'question_number' => $questionNumber,
                  'answer' => $studentAnswer,
                  'original_text_length' => mb_strlen($fullText),
                  'cleaned_text_length' => mb_strlen($questionText)
              ]);
          }

          $parsed[] = [
              'question_number' => $questionNumber,
              'content' => trim($questionText),
              'student_answer' => $studentAnswer,
              'confidence' => $item['confidence'] ?? 0.0,
              'raw_data' => $item['raw_data'] ?? null
          ];
      }

      return $parsed;
  }
  /**
   * Attach answers to questions by question number.
   *
   * Answers are indexed by `question_number`; when several answers share a
   * number the last one wins. Questions without a matching answer get an empty
   * `student_answer`. Question number 0 / "0" is a valid key; only null or an
   * empty string counts as "no number".
   *
   * @param array $questions Rows with question_number, content, and optional confidence / raw_data.
   * @param array $answers   Rows with question_number and content (the answer text).
   * @return array One row per input question, in input order.
   */
  protected function matchAnswersToQuestions(array $questions, array $answers): array
  {
      $answerMap = [];
      foreach ($answers as $answer) {
          $questionNumber = $answer['question_number'] ?? null;
          if ($questionNumber === null || $questionNumber === '') {
              continue;
          }
          $answerMap[$questionNumber] = $answer['content'] ?? '';
      }

      $matched = [];
      foreach ($questions as $question) {
          $questionNumber = $question['question_number'] ?? null;
          $hasNumber = $questionNumber !== null && $questionNumber !== '';

          $matched[] = [
              'question_number' => $questionNumber,
              'content' => $question['content'] ?? '',
              'student_answer' => $hasNumber ? ($answerMap[$questionNumber] ?? '') : '',
              'confidence' => $question['confidence'] ?? 0.0,
              'raw_data' => $question['raw_data'] ?? null
          ];
      }

      return $matched;
  }
  /**
   * Persist an OCR result: debug dumps, raw response row, per-question rows, record summary.
   *
   * Side effects, in order:
   *  1. writes the whole `$result` to storage/logs/ocr_raw_data_{id}_{time}.json;
   *  2. upserts a row in `ocr_raw_data` (best effort: failures are logged, not rethrown);
   *  3. writes the question list to storage/logs/ocr_questions_{id}_{time}.json when non-empty;
   *  4. cleans LaTeX in each question and creates one OCRQuestionResult per question;
   *  5. marks the record `completed` with counts and average confidence.
   *
   * @param OCRRecord $ocrRecord Record being processed.
   * @param array     $result    Expected keys: `questions` (list of question rows) and `raw`
   *                             (driver response), as passed by dispatchToOcrService().
   */
  protected function processOcrResult(OCRRecord $ocrRecord, array $result): void
  {
      // OCR text may contain broken byte sequences; substitute them instead of
      // letting json_encode() return false and silently write empty dumps.
      $jsonFlags = JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE;

      $logFile = storage_path("logs/ocr_raw_data_{$ocrRecord->id}_" . date('Y-m-d_H-i-s') . ".json");
      file_put_contents($logFile, json_encode([
          'timestamp' => now()->toISOString(),
          'record_id' => $ocrRecord->id,
          'paper_title' => $ocrRecord->paper_title,
          'student_id' => $ocrRecord->student_id,
          'file_path' => $ocrRecord->file_path,
          'aliyun_response' => $result
      ], JSON_PRETTY_PRINT | $jsonFlags));

      \Log::info('OCR: 完整API数据已写入文件', [
          'record_id' => $ocrRecord->id,
          'log_file' => basename($logFile)
      ]);

      // Callers wrap the driver response under `raw`, so request metadata is read
      // from there first and from the top level only as a fallback.
      $raw = is_array($result['raw'] ?? null) ? $result['raw'] : [];

      try {
          // NOTE(review): updateOrInsert applies these values on update too, so
          // `created_at` is refreshed whenever the record is reprocessed.
          \Illuminate\Support\Facades\DB::table('ocr_raw_data')->updateOrInsert(
              ['ocr_record_id' => $ocrRecord->id],
              [
                  'raw_response' => json_encode($result, $jsonFlags),
                  'api_request_id' => $raw['requestId'] ?? $result['requestId'] ?? null,
                  'algo_version' => $raw['data']['algo_version'] ?? $result['data']['algo_version'] ?? null,
                  'total_blocks' => count($result['questions'] ?? []),
                  'metadata' => json_encode([
                      'saved_at' => now()->toISOString(),
                      'source' => 'OCRService'
                  ]),
                  'created_at' => now(),
                  'updated_at' => now(),
              ]
          );

          \Log::info('OCR: 原始数据已保存到数据库', ['record_id' => $ocrRecord->id]);
      } catch (\Exception $e) {
          \Log::error('OCR: 保存原始数据到数据库失败', [
              'record_id' => $ocrRecord->id,
              'error' => $e->getMessage()
          ]);
      }

      $questions = $result['questions'] ?? [];

      // Dump the recognised question list to its own file for inspection.
      if (!empty($questions)) {
          $questionsLogFile = storage_path("logs/ocr_questions_{$ocrRecord->id}_" . date('Y-m-d_H-i-s') . ".json");
          file_put_contents($questionsLogFile, json_encode([
              'timestamp' => now()->toISOString(),
              'record_id' => $ocrRecord->id,
              'paper_title' => $ocrRecord->paper_title,
              'total_questions' => count($questions),
              'questions' => $questions
          ], JSON_PRETTY_PRINT | $jsonFlags));

          \Log::info('OCR: 题目列表已写入文件', [
              'record_id' => $ocrRecord->id,
              'questions_count' => count($questions),
              'log_file' => basename($questionsLogFile)
          ]);
      }

      // Pre-clean every formula with the LaTeX cleaner service.
      $latexCleaner = app(\App\Services\LatexCleanerService::class);
      $questions = $latexCleaner->cleanArray($questions, ['content', 'student_answer']);
      \Log::info('LaTeX formulas cleaned', ['question_count' => count($questions)]);

      $processedCount = 0;
      foreach ($questions as $question) {
          // Second cleaning pass on the individual fields (belt and braces).
          $questionText = $latexCleaner->clean($question['content'] ?? '');
          $studentAnswer = $latexCleaner->clean($question['student_answer'] ?? '');

          // Validation problems are only logged; the row is still stored.
          $validation = $latexCleaner->validate($questionText);
          if (!$validation['valid']) {
              \Log::warning('LaTeX validation warnings', [
                  'question_number' => $question['question_number'],
                  'errors' => $validation['errors']
              ]);
          }

          OCRQuestionResult::create([
              'ocr_record_id' => $ocrRecord->id,
              'question_number' => $question['question_number'],
              'question_text' => $questionText,
              'student_answer' => $studentAnswer,
              'score_value' => 0, // Will be filled by AI grading
              'mark_detected' => null,
              'score_confidence' => $question['confidence'] ?? 0,
          ]);
          $processedCount++;
      }

      $ocrRecord->update([
          'status' => 'completed',
          'processed_at' => now(),
          'total_questions' => $processedCount,
          'processed_questions' => $processedCount,
          'confidence_avg' => collect($questions)->avg('confidence') ?? 0,
      ]);

      \Log::info('OCR processing complete', [
          'record_id' => $ocrRecord->id,
          'questions_processed' => $processedCount
      ]);

      // Analysis is intentionally not submitted automatically any more: the user
      // first generates question-bank items on the OCR detail page
      // (ocr-record-view) and then submits the analysis manually, so
      // submitToAnalysis() is not called from here.
  }
  /**
   * Send the recognised questions of a record to the learning-analytics service.
   *
   * On a successful response the record's `ai_analyzed_at` and
   * `ai_analysis_count` are updated. Exceptions are logged and swallowed so the
   * record's OCR completion status is not affected.
   *
   * @param OCRRecord $ocrRecord Record the questions belong to.
   * @param array     $questions Question rows (question_number, content, student_answer, …).
   */
  protected function submitToAnalysis(OCRRecord $ocrRecord, array $questions): void
  {
      try {
          $payloadQuestions = [];
          foreach ($questions as $key => $q) {
              // A manually corrected answer takes precedence over the OCR answer.
              $studentAnswer = !empty($q['manual_answer'])
                  ? $q['manual_answer']
                  : ($q['student_answer'] ?? '');

              $payloadQuestions[$key] = [
                  'question_id' => $q['question_number'], // question number doubles as a temporary id
                  'question_number' => (string)$q['question_number'],
                  'kp_code' => $q['kp_code'] ?? null,
                  'score_value' => $q['score_value'] ?? 0,
                  'student_answer' => $studentAnswer,
                  'ocr_confidence' => $q['confidence'] ?? 0,
                  'question_text' => $q['content'] ?? '', // question text for the AI analysis
                  'teacher_validated' => $q['answer_verified'] ?? false,
              ];
          }

          $analysisData = [
              // Falls back to an id derived from the OCR record when no exam id is set.
              'exam_id' => $ocrRecord->exam_id ?? ('ocr_' . $ocrRecord->id),
              'student_id' => $ocrRecord->student_id,
              'ocr_record_id' => $ocrRecord->id,
              'teacher_name' => 'System',
              'analysis_type' => 'mastery',
              'questions' => $payloadQuestions
          ];

          $response = $this->learningAnalyticsService->submitOCRAnalysis($analysisData);

          if (!empty($response['success'])) {
              $ocrRecord->update([
                  'ai_analyzed_at' => now(),
                  'ai_analysis_count' => ($ocrRecord->ai_analysis_count ?? 0) + 1
              ]);
          }
      } catch (\Exception $e) {
          // Deliberately not rethrown: the OCR flow must still finish.
          \Log::error('Failed to submit to analysis service', [
              'record_id' => $ocrRecord->id,
              'error' => $e->getMessage()
          ]);
      }
  }
  /**
   * Reset an OCR record and run recognition on it again.
   *
   * Clears the previous status, error and counters, deletes the record's
   * existing question results, then calls dispatchToOcrService().
   *
   * @param OCRRecord $ocrRecord Record to process again.
   * @return bool Always true; failures are recorded on the record by dispatchToOcrService().
   */
  public function reprocess(OCRRecord $ocrRecord): bool
  {
      $resetState = [
          'status' => 'pending',
          'error_message' => null,
          'processed_at' => null,
          'total_questions' => 0,
          'processed_questions' => 0,
          'confidence_avg' => null,
      ];
      $ocrRecord->update($resetState);

      // Old question rows would otherwise pile up next to the new ones.
      OCRQuestionResult::where('ocr_record_id', $ocrRecord->id)->delete();

      $this->dispatchToOcrService($ocrRecord);

      return true;
  }
  /**
   * Count OCR records overall and per status.
   *
   * @return array Keys `total`, `pending`, `processing`, `completed`, `failed`, each a record count.
   */
  public function getStatistics(): array
  {
      $stats = ['total' => OCRRecord::count()];

      // One count query per status, in the same order as the returned keys.
      foreach (['pending', 'processing', 'completed', 'failed'] as $status) {
          $stats[$status] = OCRRecord::where('status', $status)->count();
      }

      return $stats;
  }
  /**
   * Match OCR output against a system paper's questions and refine each answer
   * with a second OCR pass over the cropped answer region.
   *
   * For every match that carries `coordinates`, the region between `y_min` and
   * `y_max` is cropped to uploads/ocr/crops/{recordId}_q{n}.jpg and recognised
   * again (handwriting recognition when the driver offers it, the standard
   * `recognize` call otherwise). If that pass fails or finds nothing, the answer
   * from the initial match is kept. Answers are then passed through
   * cleanHandwritingAnswer() and the LaTeX cleaner.
   *
   * @param OCRRecord $ocrRecord      Record whose image is being processed.
   * @param array     $ocrResult      Driver result from the first OCR pass.
   * @param mixed     $paperQuestions Paper questions handed to OCRDataParser::matchWithSystemPaper().
   * @return array List of question rows (question_number, content, student_answer,
   *               confidence, student_answer_bbox, raw_data).
   */
  public function performEnhancedMatching(OCRRecord $ocrRecord, array $ocrResult, $paperQuestions): array
  {
      $parser = new \App\Services\OCRDataParser();
      $latexCleaner = app(\App\Services\LatexCleanerService::class);
      $matchedResults = $parser->matchWithSystemPaper($ocrResult, $paperQuestions);

      // uploadExamPaper() stores the path under `file_path`; use it when `image_path` is empty.
      $relativePath = (string) ($ocrRecord->image_path ?: $ocrRecord->file_path);
      $imagePath = Storage::disk($this->getDisk())->path($relativePath);

      $finalQuestions = [];
      foreach ($matchedResults as $qNum => $match) {
          $questionText = (string) ($match['question_text'] ?? '');
          $secondaryAnswer = $match['student_answer'] ?? ''; // default: answer from the initial match

          if (isset($match['coordinates'])) {
              $secondaryAnswer = $this->recognizeCroppedAnswer(
                  $ocrRecord,
                  $parser,
                  $imagePath,
                  $qNum,
                  $match,
                  $secondaryAnswer
              );
          }

          // cleanHandwritingAnswer() only accepts strings; a null or array here would
          // raise a TypeError, which the callers' catch (\Exception) blocks do not catch.
          $rawAnswer = is_scalar($secondaryAnswer) ? (string) $secondaryAnswer : '';

          // Strip residual question text / noise so only the handwritten answer remains.
          $cleanedAnswer = $this->cleanHandwritingAnswer($rawAnswer, $questionText);
          $cleanedAnswer = $latexCleaner->clean($cleanedAnswer);

          $finalQuestions[] = [
              'question_number' => $qNum,
              'content' => '系统题目', // placeholder; the real text lives on the paper question
              'student_answer' => $cleanedAnswer,
              'confidence' => $match['confidence'] ?? 0,
              'student_answer_bbox' => $match['coordinates'] ?? null,
              'raw_data' => $match['debug_info'] ?? []
          ];
      }

      \Log::info('OCR: 使用增强匹配完成 (含手写识别)', [
          'record_id' => $ocrRecord->id,
          'matched_count' => count($finalQuestions)
      ]);

      return $finalQuestions;
  }

  /**
   * Crop one question's answer region and run the secondary OCR pass on it.
   *
   * Directory creation, cropping and recognition all run inside one try block,
   * so any failure is logged as a warning and `$fallbackAnswer` is returned.
   *
   * @param OCRRecord $ocrRecord      Record being processed (id used for the crop name and OCR options).
   * @param mixed     $parser         OCRDataParser instance used to extract the answer from the crop result.
   * @param string    $imagePath      Absolute path of the full exam image.
   * @param mixed     $qNum           Question number (key of the match list).
   * @param mixed     $match          Match row; must contain `coordinates` with `y_min` / `y_max`.
   * @param mixed     $fallbackAnswer Answer to keep when the secondary pass yields nothing.
   * @return mixed The answer from the secondary pass, or `$fallbackAnswer`.
   */
  private function recognizeCroppedAnswer(OCRRecord $ocrRecord, $parser, string $imagePath, $qNum, $match, $fallbackAnswer)
  {
      $cropPath = 'uploads/ocr/crops/' . $ocrRecord->id . "_q{$qNum}.jpg";
      $absoluteCropPath = Storage::disk($this->getDisk())->path($cropPath);

      try {
          // The second is_dir() tolerates another process creating the directory
          // first. The 0777 mode is still subject to the process umask.
          $cropDir = dirname($absoluteCropPath);
          if (!is_dir($cropDir) && !mkdir($cropDir, 0777, true) && !is_dir($cropDir)) {
              throw new \RuntimeException('Unable to create crop directory: ' . $cropDir);
          }

          $cropped = $this->imageProcessingService->cropImage(
              $imagePath,
              $match['coordinates']['y_min'],
              $match['coordinates']['y_max'],
              $absoluteCropPath
          );
          if (!$cropped) {
              return $fallbackAnswer;
          }

          \Log::info("Secondary OCR for Q{$qNum} (Handwriting)", ['crop_path' => $cropPath]);

          if (method_exists($this->ocrDriver, 'recognizeHandwriting')) {
              $handwritingResult = $this->ocrDriver->recognizeHandwriting($absoluteCropPath, [
                  'subject' => 'Math',
                  'ocr_record_id' => $ocrRecord->id
              ]);

              if (empty($handwritingResult['texts'])) {
                  \Log::info("No handwriting detected for Q{$qNum}, using original answer");
                  return $fallbackAnswer;
              }

              // Wrap the handwriting texts in the structure extractAnswerFromCrop() expects.
              $combinedText = implode(' ', array_column($handwritingResult['texts'], 'text'));
              $cropResult = [
                  'questions' => [
                      [
                          'question_number' => $qNum,
                          'content' => $combinedText,
                          'student_answer' => $combinedText, // whole text treated as the answer for now
                          'confidence' => 1, // handwriting pass is assumed to be high confidence
                          'bounding_box' => [ // placeholder box covering the whole crop
                              'x_min' => 0, 'y_min' => 0, 'x_max' => 1, 'y_max' => 1
                          ]
                      ]
                  ]
              ];

              $answer = $parser->extractAnswerFromCrop($cropResult, $match['question_text'] ?? '');
              \Log::info("Handwriting OCR Result for Q{$qNum}", [
                  'raw_answer' => $answer,
                  'texts_count' => count($handwritingResult['texts'])
              ]);

              return $answer;
          }

          // Driver has no handwriting support: use the standard recognition call.
          \Log::warning("Handwriting recognition not supported, using standard OCR");
          $cropResult = $this->ocrDriver->recognize($absoluteCropPath, [
              'cutType' => 'answer',
              'subject' => 'Math',
              'ocr_record_id' => $ocrRecord->id
          ]);

          if (empty($cropResult['questions'])) {
              return $fallbackAnswer;
          }

          $answer = $parser->extractAnswerFromCrop($cropResult, $match['question_text'] ?? '');
          \Log::info("Standard OCR Result for Q{$qNum}: {$answer}");

          return $answer;
      } catch (\Exception $e) {
          \Log::warning("Secondary OCR failed for Q{$qNum}: " . $e->getMessage());

          return $fallbackAnswer;
      }
  }
  /**
   * Name of the storage disk used for OCR files.
   *
   * @return string Always 'public'.
   */
  protected function getDisk(): string
  {
      // OCR uploads are stored in the public disk.
      return 'public';
  }
  /**
   * Reduce handwriting-OCR text to the student's answer by stripping question
   * numbering, "answer:" style prefixes and echoed question text.
   *
   * Punctuation is matched in both ASCII and full-width forms. Heuristics, in order:
   *  1. collapse whitespace; drop a leading question marker such as "3." or "12、"
   *     (a separator is required, so plain numbers and decimals like "42" or "3.5" survive);
   *  2. drop a leading 答案 / 解 / 答 prefix with optional colon;
   *  3. return '' when the text is mostly the question itself (similar_text >= 70%);
   *  4. cut away the question's tail (last 12 chars), the whole question, or its
   *     head (first 18 chars) when found in the text;
   *  5. keep what follows "得" (long text only) or a trailing "= value";
   *  6. return '' if the remainder still resembles the question (>= 65%);
   *  7. when several ";"-separated segments remain, keep the last non-empty one (<= 50 chars).
   *
   * @param string $rawAnswer    Full text returned by handwriting recognition.
   * @param string $questionText Text of the corresponding question (may be empty).
   * @return string The cleaned answer, or '' when nothing but question text was found.
   */
  private function cleanHandwritingAnswer(string $rawAnswer, string $questionText = ''): string
  {
      // Invalid UTF-8 would make every /u regex below return null; scrub it first.
      $rawAnswer = mb_scrub($rawAnswer, 'UTF-8');
      $questionText = mb_scrub($questionText, 'UTF-8');

      $answer = trim(preg_replace('/\s+/', ' ', $rawAnswer));
      if ($answer === '') {
          return '';
      }

      // Leading question marker: optional OCR-noise "O/0/〇", digits, then a separator.
      // The look-ahead requires a following non-digit so "3.5" is not read as "question 3".
      $answer = ltrim(preg_replace('/^[O0〇]?\s*\d+[.\x{FF0E}、)\x{FF09}]\s*(?=\D)/u', '', $answer));

      // Longest alternative first, otherwise "答" matches and leaves "案" behind.
      $answer = preg_replace('/^(?:答案|解|答)[:\x{FF1A}]?\s*/u', '', $answer);

      // Normalised form used only for similarity checks: no tags, whitespace or ASCII punctuation.
      $normalize = static function (string $text): string {
          $text = strip_tags($text);
          $text = preg_replace('/\s+/', '', $text);
          $text = preg_replace('/[[:punct:]]/u', '', $text);
          return mb_strtolower($text);
      };

      $normAnswer = $normalize($answer);
      $normQuestion = $normalize($questionText);

      // The text is essentially the question itself: no answer was written.
      if ($normQuestion !== '') {
          similar_text($normAnswer, $normQuestion, $similarity);
          if ($similarity >= 70 && mb_strlen($normAnswer) <= mb_strlen($normQuestion) * 1.2) {
              return '';
          }
      }

      if ($questionText !== '') {
          // Anchor on the end of the question and keep what follows it.
          $anchor = mb_substr($questionText, -12);
          if ($anchor !== '') {
              $pos = mb_stripos($answer, $anchor);
              if ($pos !== false) {
                  $answer = trim(mb_substr($answer, $pos + mb_strlen($anchor)));
                  $normAnswer = $normalize($answer);
              }
          }

          // Still starts with the question: crudely cut off the question's length.
          if ($normQuestion !== '' && str_starts_with($normAnswer, $normQuestion)) {
              $answer = trim(mb_substr($answer, mb_strlen($questionText)));
              $normAnswer = $normalize($answer);
          }

          // Cut once more after the question's head (works better for short questions).
          $prefix = mb_substr($questionText, 0, 18);
          if ($prefix !== '') {
              $pos = mb_stripos($answer, $prefix);
              if ($pos !== false && $pos + mb_strlen($prefix) <= mb_strlen($answer)) {
                  $answer = trim(mb_substr($answer, $pos + mb_strlen($prefix)));
                  $normAnswer = $normalize($answer);
              }
          }
      }

      $trailingEquals = '/=\s*([^\s,\x{FF0C}。;\x{FF1B}]+)\s*$/u';
      if (mb_strlen($answer) > 40) {
          // Long text: prefer what follows "得" (compared with '' so an answer of "0" is
          // kept), otherwise the value after a trailing "=".
          if (preg_match('/得[:\x{FF1A}]?\s*([^,\x{FF0C}。;\x{FF1B}]*)/u', $answer, $matches)
              && trim($matches[1]) !== ''
          ) {
              $answer = trim($matches[1]);
          } elseif (preg_match($trailingEquals, $answer, $matches)) {
              $answer = trim($matches[1]);
          }
      } elseif (preg_match($trailingEquals, $answer, $matches)) {
          // Short text: a simple cut after the trailing "=" is enough.
          $answer = trim($matches[1]);
      }

      // Final similarity check so leftover question text is not reported as an answer.
      $normAnswer = $normalize($answer);
      if ($normQuestion !== '') {
          similar_text($normAnswer, $normQuestion, $finalSim);
          if ($finalSim >= 65 && mb_strlen($normAnswer) > 0) {
              return '';
          }
      }

      // Several segments: keep the last non-empty short one. Trim before filtering so
      // a whitespace-only trailing segment cannot win.
      $parts = preg_split('/[\n;\x{FF1B}]/u', $answer);
      if (is_array($parts)) {
          $parts = array_filter(array_map('trim', $parts), static fn ($p) => $p !== '');
          if (!empty($parts)) {
              $candidate = end($parts);
              if (mb_strlen($candidate) <= 50) {
                  $answer = $candidate;
              }
          }
      }

      return trim($answer);
  }
  618. }