OCRService.php 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. <?php
  2. namespace App\Services;
  3. use App\Models\OCRRecord;
  4. use App\Models\OCRQuestionResult;
  5. use Illuminate\Http\UploadedFile;
  6. use Illuminate\Support\Facades\Http;
  7. use Illuminate\Support\Facades\Storage;
  8. use Illuminate\Support\Str;
  9. class OCRService
  10. {
  /**
   * OCR driver obtained from OCRFactory::create(); this class calls recognize() on it.
   *
   * @var mixed
   */
  protected $ocrDriver;

  /**
   * Analytics service that receives OCR results through submitOCRAnalysis().
   *
   * @var LearningAnalyticsService
   */
  protected $learningAnalyticsService;

  /**
   * Wire up the collaborators used by the service.
   *
   * The analytics service is injected by the caller; the OCR driver is built
   * by the OCR factory.
   *
   * @param LearningAnalyticsService $learningAnalyticsService
   */
  public function __construct(LearningAnalyticsService $learningAnalyticsService)
  {
      $this->learningAnalyticsService = $learningAnalyticsService;
      $this->ocrDriver = \App\Services\OCR\OCRFactory::create();
  }
  /**
   * Store an uploaded exam-paper photo, create its OCR record and run OCR on it.
   *
   * The upload is validated, written under uploads/ocr/ on the disk returned by
   * getDisk(), recorded as a `pending` OCRRecord and then passed to
   * dispatchToOcrService(), which updates the record's status itself.
   *
   * @param UploadedFile $image     Uploaded photo of the exam paper.
   * @param string       $studentId Saved in the record's `user_id` attribute.
   *
   * @return OCRRecord The newly created record.
   *
   * @throws \Exception        When validateImage() rejects the upload.
   * @throws \RuntimeException When the upload cannot be read or written to storage.
   */
  public function uploadExamPaper(UploadedFile $image, string $studentId): OCRRecord
  {
      $this->validateImage($image);

      $recordId = 'ocr_' . Str::uuid()->toString();

      // Lower-case the extension so the stored name agrees with what
      // validateImage() compared against the allow-list.
      $extension = strtolower($image->getClientOriginalExtension());
      $imagePath = 'uploads/ocr/' . $recordId . '.' . $extension;

      $contents = file_get_contents($image->getPathname());
      if ($contents === false) {
          throw new \RuntimeException('无法读取上传的文件');
      }

      // Write through getDisk() so the file lands on the same disk the OCR step
      // reads from, and fail before creating a record that points at nothing.
      if (!Storage::disk($this->getDisk())->put($imagePath, $contents)) {
          throw new \RuntimeException('图片保存失败: ' . $imagePath);
      }

      // NOTE(review): the path is written to `file_path`; confirm that is the
      // attribute the OCR step resolves the image from.
      $ocrRecord = OCRRecord::create([
          'user_id' => $studentId,
          'file_path' => $imagePath,
          'paper_title' => $image->getClientOriginalName(),
          'status' => 'pending',
      ]);

      $this->dispatchToOcrService($ocrRecord);

      return $ocrRecord;
  }
  /**
   * Reject an upload that failed, is too large, or has a disallowed extension.
   *
   * The size limit is read from `ocr.upload.max_size` (bytes, default 10 MiB)
   * and the extension allow-list from `ocr.upload.allowed_types` (default
   * jpg, jpeg, png, webp). Only the client-supplied extension is inspected,
   * not the file contents.
   *
   * @param UploadedFile $image Upload to check.
   *
   * @return void
   *
   * @throws \Exception Carrying a message for the first check that fails.
   */
  protected function validateImage(UploadedFile $image): void
  {
      if (!$image->isValid()) {
          throw new \Exception('文件上传失败');
      }

      $sizeLimit = config('ocr.upload.max_size', 10 * 1024 * 1024);
      if ($image->getSize() > $sizeLimit) {
          $limitInMb = $sizeLimit / 1024 / 1024;

          throw new \Exception('文件大小超出限制(' . $limitInMb . 'MB)');
      }

      $permitted = config('ocr.upload.allowed_types', ['jpg', 'jpeg', 'png', 'webp']);
      $suffix = strtolower($image->getClientOriginalExtension());
      if (in_array($suffix, $permitted) === false) {
          throw new \Exception('不支持的文件类型,仅支持:' . implode(', ', $permitted));
      }
  }
  /**
   * Run OCR on a record's stored image and persist the recognised questions.
   *
   * Steps: resolve the image on the disk returned by getDisk(), mark the record
   * `processing`, call the OCR driver once with cutType `answer`, split each
   * returned item into question text and a trailing A-D answer, then hand the
   * parsed list to processOcrResult().
   *
   * Failures are not rethrown: they are logged and written to the record as
   * status `failed` with an `error_message`. \Throwable is caught (not just
   * \Exception) so that a PHP \Error cannot leave the record in `processing`.
   *
   * @param OCRRecord $ocrRecord Record whose image should be recognised.
   *
   * @return void
   */
  protected function dispatchToOcrService(OCRRecord $ocrRecord): void
  {
      try {
          // uploadExamPaper() stores the path under `file_path`, so use it when
          // `image_path` is empty.
          $relativePath = $ocrRecord->image_path ?: $ocrRecord->file_path;
          if (empty($relativePath)) {
              throw new \Exception('OCR记录缺少图片路径,record_id: ' . $ocrRecord->id);
          }

          $imagePath = Storage::disk($this->getDisk())->path($relativePath);
          if (empty($imagePath)) {
              throw new \Exception('无法获取图片路径: ' . $relativePath);
          }
          if (!file_exists($imagePath)) {
              throw new \Exception('图片文件不存在: ' . $imagePath);
          }

          $ocrRecord->update(['status' => 'processing']);

          // A single call with cutType "answer" returns question and answer together.
          \Log::info('OCR: Extracting questions and answers', ['record_id' => $ocrRecord->id]);
          $result = $this->ocrDriver->recognize($imagePath, [
              'cutType' => 'answer',
              'subject' => 'Math',
          ]);

          $items = $result['questions'] ?? [];
          if (!is_array($items)) {
              // Treat an unexpected payload shape as "no questions found".
              $items = [];
          }
          \Log::info('OCR extraction complete', ['item_count' => count($items)]);

          \Log::info('Parsing student answers from question text');
          $parsedQuestions = [];
          foreach (array_values($items) as $index => $item) {
              // Fall back to the 1-based position when the driver omits the number.
              $questionNumber = $item['question_number'] ?? ($index + 1);
              $fullText = (string) ($item['content'] ?? '');

              [$questionText, $studentAnswer] = $this->splitTrailingChoice($fullText);

              if ($studentAnswer !== '') {
                  \Log::info('Extracted student answer', [
                      'question_number' => $questionNumber,
                      'answer' => $studentAnswer,
                      'original_text_length' => mb_strlen($fullText),
                      'cleaned_text_length' => mb_strlen($questionText),
                  ]);
              }

              $parsedQuestions[] = [
                  'question_number' => $questionNumber,
                  'content' => trim($questionText),
                  'student_answer' => $studentAnswer,
                  'confidence' => $item['confidence'] ?? 0.0,
                  'raw_data' => $item['raw_data'] ?? null,
              ];
          }

          $this->processOcrResult($ocrRecord, [
              'questions' => $parsedQuestions,
              'raw' => $result,
          ]);
      } catch (\Throwable $e) {
          \Log::error('OCR服务调用失败', [
              'record_id' => $ocrRecord->id,
              'error' => $e->getMessage(),
          ]);

          $ocrRecord->update([
              'status' => 'failed',
              'error_message' => 'OCR服务调用失败:' . $e->getMessage(),
          ]);
      }
  }

  /**
   * Split OCR text into the question body and a trailing A-D choice letter.
   *
   * The last non-whitespace character is taken as the student's answer when it
   * is A, B, C or D. This is a heuristic: a question whose own text ends in one
   * of those letters is split the same way. Detection and removal use one
   * Unicode-aware pattern so they cannot disagree about trailing whitespace.
   *
   * @param string $text Full recognised text of one item.
   *
   * @return array{0: string, 1: string} Question text and answer letter ('' when none).
   */
  private function splitTrailingChoice(string $text): array
  {
      if (preg_match('/^(.*?)\s*([A-D])\s*$/us', $text, $matches) === 1) {
          return [$matches[1], $matches[2]];
      }

      return [$text, ''];
  }
  /**
   * Attach each question's answer by looking it up under the question number.
   *
   * Answers without a question number are ignored; when several answers share
   * a number the last one wins. Questions with no matching answer get an empty
   * `student_answer`.
   *
   * @param array $questions Items with `question_number`, `content` and optional
   *                         `confidence` / `raw_data`.
   * @param array $answers   Items with `question_number` and `content`.
   *
   * @return array List of rows with keys question_number, content,
   *               student_answer, confidence and raw_data, in question order.
   */
  protected function matchAnswersToQuestions(array $questions, array $answers): array
  {
      $answerMap = [];
      foreach ($answers as $answer) {
          $questionNumber = $answer['question_number'] ?? null;
          // Compare against null / '' explicitly: a truthiness test would also
          // discard a legitimate question number of 0 or '0'.
          if ($questionNumber !== null && $questionNumber !== '') {
              $answerMap[$questionNumber] = $answer['content'] ?? '';
          }
      }

      $matched = [];
      foreach ($questions as $question) {
          $questionNumber = $question['question_number'] ?? null;
          $studentAnswer = $questionNumber === null
              ? ''
              : ($answerMap[$questionNumber] ?? '');

          $matched[] = [
              'question_number' => $questionNumber,
              'content' => $question['content'] ?? '',
              'student_answer' => $studentAnswer,
              'confidence' => $question['confidence'] ?? 0.0,
              'raw_data' => $question['raw_data'] ?? null,
          ];
      }

      return $matched;
  }
  /**
   * Persist parsed OCR questions and mark the record completed.
   *
   * Every question's `content` and `student_answer` go through
   * LatexCleanerService (cleanArray() on the whole list, then clean() per field
   * as a deliberate second pass), and the cleaned question text is validated;
   * validation problems are only logged. The question rows and the record's
   * `completed` status are then written in one database transaction, so a
   * failure part-way through does not leave partial rows behind.
   *
   * Analysis is intentionally NOT submitted from here: submitToAnalysis() is
   * left for the user to trigger from the OCR record view after generating
   * question-bank items.
   *
   * @param OCRRecord $ocrRecord Record the questions belong to.
   * @param array     $result    Payload whose `questions` key holds the parsed items.
   *
   * @return void
   *
   * @throws \Throwable Whatever the cleaner or the database layer throws; the
   *                    transaction is rolled back first.
   */
  protected function processOcrResult(OCRRecord $ocrRecord, array $result): void
  {
      $questions = $result['questions'] ?? [];
      \Log::info('OCR Result received', ['question_count' => count($questions)]);

      $latexCleaner = app(\App\Services\LatexCleanerService::class);
      $questions = $latexCleaner->cleanArray($questions, ['content', 'student_answer']);
      \Log::info('LaTeX formulas cleaned', ['question_count' => count($questions)]);

      // Prepare every row before touching the database so the transaction below
      // only contains writes.
      $rows = [];
      foreach ($questions as $question) {
          // Fall back to the 1-based position when no number was supplied.
          $questionNumber = $question['question_number'] ?? (count($rows) + 1);

          $questionText = $latexCleaner->clean($question['content'] ?? '');
          $studentAnswer = $latexCleaner->clean($question['student_answer'] ?? '');

          $validation = $latexCleaner->validate($questionText);
          if (!$validation['valid']) {
              \Log::warning('LaTeX validation warnings', [
                  'question_number' => $questionNumber,
                  'errors' => $validation['errors'],
              ]);
          }

          $rows[] = [
              'ocr_record_id' => $ocrRecord->id,
              'question_number' => $questionNumber,
              'question_text' => $questionText,
              'student_answer' => $studentAnswer,
              'score_value' => 0, // filled in later by AI grading
              'mark_detected' => null,
              'score_confidence' => $question['confidence'] ?? 0,
          ];
      }

      $processedCount = count($rows);
      $confidenceAvg = collect($questions)->avg('confidence') ?? 0;

      $ocrRecord->getConnection()->transaction(
          static function () use ($ocrRecord, $rows, $processedCount, $confidenceAvg): void {
              foreach ($rows as $row) {
                  OCRQuestionResult::create($row);
              }

              $ocrRecord->update([
                  'status' => 'completed',
                  'processed_at' => now(),
                  'total_questions' => $processedCount,
                  'processed_questions' => $processedCount,
                  'confidence_avg' => $confidenceAvg,
              ]);
          }
      );

      \Log::info('OCR processing complete', [
          'record_id' => $ocrRecord->id,
          'questions_processed' => $processedCount,
      ]);
  }
  /**
   * Send a record's questions to the learning-analytics service.
   *
   * Builds a `mastery` analysis payload and passes it to
   * LearningAnalyticsService::submitOCRAnalysis(). When the service reports
   * success the record's `ai_analyzed_at` and `ai_analysis_count` are updated.
   * This is best-effort: exceptions are logged and swallowed so they do not
   * affect the OCR record's completion status.
   *
   * @param OCRRecord $ocrRecord Record being analysed.
   * @param array     $questions Question items; reads question_number,
   *                             student_answer, manual_answer, kp_code,
   *                             score_value, confidence, content and
   *                             answer_verified.
   *
   * @return void
   */
  protected function submitToAnalysis(OCRRecord $ocrRecord, array $questions): void
  {
      try {
          $payloadQuestions = array_map(static function ($q) {
              // A manually corrected answer takes priority over the OCR one.
              // empty() alone would discard a legitimate answer of 0 / '0'.
              $studentAnswer = $q['student_answer'] ?? '';
              $manualAnswer = $q['manual_answer'] ?? null;
              if (!empty($manualAnswer) || is_numeric($manualAnswer)) {
                  $studentAnswer = $manualAnswer;
              }

              $questionNumber = $q['question_number'] ?? null;

              return [
                  'question_id' => $questionNumber, // question number doubles as a temporary id
                  'question_number' => (string) $questionNumber,
                  'kp_code' => $q['kp_code'] ?? null,
                  'score_value' => $q['score_value'] ?? 0,
                  'student_answer' => $studentAnswer,
                  'ocr_confidence' => $q['confidence'] ?? 0,
                  'question_text' => $q['content'] ?? '', // question text for the AI analysis
                  'teacher_validated' => $q['answer_verified'] ?? false,
              ];
          }, $questions);

          $analysisData = [
              // The OCR record id is the fallback when no exam id is set.
              'exam_id' => $ocrRecord->exam_id ?? ('ocr_' . $ocrRecord->id),
              // uploadExamPaper() saves the student under `user_id`, so use it
              // when `student_id` is not populated.
              'student_id' => $ocrRecord->student_id ?? $ocrRecord->user_id,
              'ocr_record_id' => $ocrRecord->id,
              'teacher_name' => 'System', // could instead be the uploader's name
              'analysis_type' => 'mastery',
              'questions' => $payloadQuestions,
          ];

          $result = $this->learningAnalyticsService->submitOCRAnalysis($analysisData);

          if (!empty($result['success'])) {
              $ocrRecord->update([
                  'ai_analyzed_at' => now(),
                  'ai_analysis_count' => ($ocrRecord->ai_analysis_count ?? 0) + 1,
              ]);
          }
      } catch (\Exception $e) {
          // Deliberately not rethrown: analysis failure must not change the
          // outcome of the OCR flow.
          \Log::error('Failed to submit to analysis service', [
              'record_id' => $ocrRecord->id,
              'error' => $e->getMessage(),
          ]);
      }
  }
  /**
   * Discard a record's previous OCR output and run OCR on it again.
   *
   * The record is reset to `pending` with its counters and error cleared, its
   * existing OCRQuestionResult rows are deleted, and the image is sent through
   * dispatchToOcrService() again.
   *
   * @param OCRRecord $ocrRecord Record to process again.
   *
   * @return bool False when the new run ended with the record marked `failed`,
   *              true otherwise.
   */
  public function reprocess(OCRRecord $ocrRecord): bool
  {
      $ocrRecord->update([
          'status' => 'pending',
          'error_message' => null,
          'processed_at' => null,
          'total_questions' => 0,
          'processed_questions' => 0,
          'confidence_avg' => null,
      ]);

      OCRQuestionResult::where('ocr_record_id', $ocrRecord->id)->delete();

      $this->dispatchToOcrService($ocrRecord);

      // dispatchToOcrService() records failures on the record instead of
      // throwing, so the outcome has to be read back from its status rather
      // than reported as an unconditional success.
      return $ocrRecord->status !== 'failed';
  }
  /**
   * Count OCR records overall and per status.
   *
   * @return array Keys total, pending, processing, completed and failed, each
   *               holding the count returned by the corresponding query.
   */
  public function getStatistics(): array
  {
      $statistics = ['total' => OCRRecord::count()];

      foreach (['pending', 'processing', 'completed', 'failed'] as $status) {
          $statistics[$status] = OCRRecord::where('status', $status)->count();
      }

      return $statistics;
  }
  /**
   * Name of the filesystem disk used for OCR uploads.
   *
   * @return string Always "public".
   */
  protected function getDisk(): string
  {
      // OCR uploads live on the "public" disk.
      $disk = 'public';

      return $disk;
  }
  316. }