| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318 |
- <?php
- namespace App\Services\OCR\Drivers;
- use App\Services\OCR\OCRInterface;
- use AlibabaCloud\SDK\Ocrapi\V20210707\Ocrapi;
- use AlibabaCloud\Tea\Utils\Utils\RuntimeOptions;
- use Darabonba\OpenApi\Models\Config;
- use AlibabaCloud\SDK\Ocrapi\V20210707\Models\RecognizeEduPaperOcrRequest;
- use AlibabaCloud\SDK\Ocrapi\V20210707\Models\RecognizeEduPaperCutRequest;
- use Illuminate\Support\Facades\Log;
/**
 * OCR driver backed by the Aliyun (Alibaba Cloud) OCR API.
 *
 * Exposes two operations:
 *  - recognize():            RecognizeEduPaperCut, returning one entry per cut question/answer.
 *  - recognizeHandwriting(): RecognizeEduPaperOcr with textType "2", returning recognized text.
 */
class AliyunOCRDriver implements OCRInterface
{
    /**
     * Client used to call the OCR API. Either an injected object (useful for
     * tests) or an Ocrapi instance built from the supplied configuration.
     *
     * @var Ocrapi|mixed
     */
    protected $client;

    /**
     * @param array      $config Used only when no client is injected; must then contain
     *                           'access_key_id', 'access_key_secret' and 'endpoint'.
     * @param mixed|null $client Optional pre-built client. When truthy it is used as-is
     *                           and $config is not read.
     */
    public function __construct(array $config, $client = null)
    {
        if ($client) {
            $this->client = $client;
            return;
        }

        $apiConfig = new Config([
            'accessKeyId' => $config['access_key_id'],
            'accessKeySecret' => $config['access_key_secret'],
            'endpoint' => $config['endpoint'],
        ]);

        $this->client = new Ocrapi($apiConfig);
    }

    /**
     * Cut an exam-paper image into questions/answers via RecognizeEduPaperCut.
     *
     * Recognized options:
     *  - 'cutType'       (default 'question'); 'answer' selects each page's answer_list
     *                    when present, otherwise subject_list is used.
     *  - 'subject'       (default 'Math').
     *  - 'ocr_record_id' (default null); passed to OCRRawData::saveRawResponse (0 when null).
     *
     * @param string $imagePath Path of the image file to upload.
     * @param array  $options   See above.
     * @return array ['raw' => decoded response body, 'questions' => list of question
     *               arrays, 'cut_type' => the cut type used].
     * @throws \Exception When the image cannot be found/opened or the API call fails
     *                    (the exception is logged and rethrown).
     */
    public function recognize(string $imagePath, array $options = []): array
    {
        try {
            $cutType = $options['cutType'] ?? 'question';
            $subject = $options['subject'] ?? 'Math';
            $ocrRecordId = $options['ocr_record_id'] ?? null;

            $fileStream = $this->openImage($imagePath);
            try {
                $request = new RecognizeEduPaperCutRequest([
                    'body' => \GuzzleHttp\Psr7\Utils::streamFor($fileStream),
                    'cutType' => $cutType,
                    'imageType' => 'photo',
                    'subject' => $subject,
                    'outputOricoord' => false
                ]);
                $runtime = new RuntimeOptions([]);

                $response = $this->client->recognizeEduPaperCutWithOptions($request, $runtime);
            } finally {
                // Release the file handle even when the API call throws.
                if (is_resource($fileStream)) {
                    fclose($fileStream);
                }
            }

            $body = $this->decodeBody($response);

            Log::info('Aliyun EduPaperCut Full Response', [
                'cutType' => $cutType,
                'has_data' => isset($body['data']),
                'request_id' => $body['requestId'] ?? null,
                'code' => $body['code'] ?? null,
                'message' => $body['message'] ?? null,
                'body_keys' => $this->bodyKeys($body)
            ]);

            if (isset($body['data'])) {
                Log::info('Aliyun Data Preview', ['data' => $this->previewData($body['data'])]);
            }

            $data = $this->decodeData($body);
            $questions = $data === null ? [] : $this->extractQuestions($data, $cutType);

            // Persist the full API response, but only when something was recognized.
            // Failure to persist is logged and does not fail the recognition.
            if (isset($body['requestId']) && !empty($questions)) {
                try {
                    \App\Models\OCRRawData::saveRawResponse($ocrRecordId ?? 0, $body);
                    Log::info('Aliyun OCR: 原始数据已保存', [
                        'request_id' => $body['requestId'],
                        'questions_count' => count($questions),
                        'ocr_record_id' => $ocrRecordId
                    ]);
                } catch (\Exception $e) {
                    Log::error('Aliyun OCR: 保存原始数据失败', [
                        'error' => $e->getMessage(),
                        'request_id' => $body['requestId'] ?? null
                    ]);
                }
            }

            return [
                'raw' => $body,
                'questions' => $questions,
                'cut_type' => $cutType
            ];
        } catch (\Exception $e) {
            Log::error('Aliyun OCR Error', [
                'message' => $e->getMessage(),
                'trace' => $e->getTraceAsString(),
            ]);
            throw $e;
        }
    }

    /**
     * Recognize handwritten content via the RecognizeEduPaperOcr endpoint.
     *
     * Recognized options:
     *  - 'subject' (default 'Math').
     *
     * @param string $imagePath Path of the image file to upload.
     * @param array  $options   See above.
     * @return array ['raw' => decoded response body, 'texts' => list of
     *               ['text' => string, 'confidence' => float], 'type' => 'handwriting'].
     * @throws \Exception When the image cannot be found/opened or the API call fails
     *                    (the exception is logged and rethrown).
     */
    public function recognizeHandwriting(string $imagePath, array $options = []): array
    {
        try {
            $subject = $options['subject'] ?? 'Math';

            $fileStream = $this->openImage($imagePath);
            try {
                $request = new RecognizeEduPaperOcrRequest([
                    'body' => \GuzzleHttp\Psr7\Utils::streamFor($fileStream),
                    'imageType' => 'photo',
                    'subject' => $subject,
                    'textType' => '2', // 2 = handwriting
                    'outputOricoord' => true // include coordinate information
                ]);
                $runtime = new RuntimeOptions([]);

                $response = $this->client->recognizeEduPaperOcrWithOptions($request, $runtime);
            } finally {
                // Release the file handle even when the API call throws.
                if (is_resource($fileStream)) {
                    fclose($fileStream);
                }
            }

            $body = $this->decodeBody($response);

            Log::info('Aliyun EduPaperOcr (Handwriting) Response', [
                'has_data' => isset($body['data']),
                'request_id' => $body['requestId'] ?? null,
                'code' => $body['code'] ?? null,
                'message' => $body['message'] ?? null,
                'body_keys' => $this->bodyKeys($body)
            ]);

            $data = $this->decodeData($body);
            $recognizedTexts = $data === null ? [] : $this->extractHandwritingTexts($data);

            Log::info('Handwriting recognition result', [
                'texts_count' => count($recognizedTexts),
                'preview' => !empty($recognizedTexts) ? mb_substr($recognizedTexts[0]['text'], 0, 100) : 'N/A'
            ]);

            return [
                'raw' => $body,
                'texts' => $recognizedTexts,
                'type' => 'handwriting'
            ];
        } catch (\Exception $e) {
            Log::error('Aliyun Handwriting OCR Error', [
                'message' => $e->getMessage(),
                'trace' => $e->getTraceAsString(),
            ]);
            throw $e;
        }
    }

    /**
     * Open the image for binary reading.
     *
     * @param string $imagePath
     * @return resource Open file handle; the caller is responsible for closing it.
     * @throws \Exception When the path is not a regular file or cannot be opened.
     */
    private function openImage(string $imagePath)
    {
        // is_file() rather than file_exists(): a directory is not a usable image.
        if (!is_file($imagePath)) {
            throw new \Exception("Image file not found: {$imagePath}");
        }

        $handle = fopen($imagePath, 'rb');
        if ($handle === false) {
            // Without this check a failed open would be uploaded as an empty body.
            throw new \Exception("Unable to open image file: {$imagePath}");
        }

        return $handle;
    }

    /**
     * Convert the SDK response body into plain PHP data via a JSON round trip.
     *
     * @param mixed $response Object exposing a `body` property.
     * @return mixed Usually an associative array; null or a scalar if the body
     *               does not encode to a JSON object/array.
     */
    private function decodeBody($response)
    {
        return json_decode(json_encode($response->body), true);
    }

    /**
     * Top-level keys of the decoded body, for logging.
     *
     * @param mixed $body
     * @return array Empty when the body is not an array.
     */
    private function bodyKeys($body): array
    {
        return is_array($body) ? array_keys($body) : [];
    }

    /**
     * Build the value logged as the data preview: the first 500 characters of a
     * string payload (multibyte-safe), or the JSON encoding of anything else.
     *
     * @param mixed $data
     * @return string|false
     */
    private function previewData($data)
    {
        return is_string($data) ? mb_substr($data, 0, 500) : json_encode($data);
    }

    /**
     * Extract the `data` payload from the decoded body. The field may arrive
     * either as a JSON string or as an already-decoded structure.
     *
     * @param mixed $body
     * @return array|null Null when `data` is absent or does not decode to an array.
     */
    private function decodeData($body)
    {
        if (!isset($body['data'])) {
            return null;
        }

        $data = is_string($body['data']) ? json_decode($body['data'], true) : $body['data'];

        return is_array($data) ? $data : null;
    }

    /**
     * Flatten every page's item list into a single list of question entries.
     *
     * @param array $data    Decoded `data` payload.
     * @param mixed $cutType Cut type from the caller's options; compared strictly to 'answer'.
     * @return array List of entries as produced by buildQuestion().
     */
    private function extractQuestions(array $data, $cutType): array
    {
        $questions = [];

        if (!isset($data['page_list']) || !is_array($data['page_list'])) {
            return $questions;
        }

        foreach ($data['page_list'] as $page) {
            // answer_list is used only for cutType 'answer'; otherwise subject_list.
            if ($cutType === 'answer' && isset($page['answer_list'])) {
                $itemList = $page['answer_list'];
            } elseif (isset($page['subject_list'])) {
                $itemList = $page['subject_list'];
            } else {
                continue;
            }

            if (!is_array($itemList)) {
                continue;
            }

            foreach ($itemList as $item) {
                // Default number is the running 1-based position across all pages.
                $questions[] = $this->buildQuestion($item, count($questions) + 1, $cutType);
            }
        }

        return $questions;
    }

    /**
     * Build one question/answer entry from an API item.
     *
     * @param mixed $item           Item from answer_list/subject_list.
     * @param int   $fallbackNumber Number used when ids[0] is missing or non-numeric.
     * @param mixed $cutType        Echoed into the entry as 'cut_type'.
     * @return array ['question_number', 'content', 'cut_type', 'confidence', 'raw_data'].
     */
    private function buildQuestion($item, int $fallbackNumber, $cutType): array
    {
        $questionNumber = $fallbackNumber;
        if (isset($item['ids']) && is_array($item['ids'])) {
            // Only a numeric first id is trusted as the question number.
            $idValue = $item['ids'][0] ?? null;
            if (is_numeric($idValue)) {
                $questionNumber = (int) $idValue;
            }
        }

        $wordsInfo = null;
        if (isset($item['prism_wordsInfo']) && is_array($item['prism_wordsInfo'])) {
            $wordsInfo = $item['prism_wordsInfo'];
        }

        // Fall back to the word list only when no text was supplied. A strict
        // comparison keeps a literal "0" from being treated as missing.
        $text = $item['text'] ?? '';
        if ($text === '' && $wordsInfo !== null) {
            $text = $this->joinWords($wordsInfo);
        }

        return [
            'question_number' => $questionNumber,
            'content' => $text,
            'cut_type' => $cutType,
            'confidence' => $wordsInfo !== null ? $this->averageConfidence($wordsInfo, false) : 0.0,
            'raw_data' => $item
        ];
    }

    /**
     * Extract recognized text from a handwriting payload. The first matching
     * shape wins: a `content` string, then a top-level `prism_wordsInfo` list,
     * then a `page_list` whose pages carry `prism_wordsInfo`.
     *
     * @param array $data Decoded `data` payload.
     * @return array List of ['text' => string, 'confidence' => float].
     */
    private function extractHandwritingTexts(array $data): array
    {
        $texts = [];

        if (isset($data['content']) && is_string($data['content'])) {
            // Plain content string: no per-word probabilities are read, confidence is 1.0.
            $texts[] = [
                'text' => $data['content'],
                'confidence' => 1.0
            ];
        } elseif (isset($data['prism_wordsInfo']) && is_array($data['prism_wordsInfo'])) {
            $joined = $this->joinWords($data['prism_wordsInfo']);
            if ($joined !== '') {
                $texts[] = [
                    'text' => $joined,
                    // Only entries that carry a word contribute to the average here.
                    'confidence' => $this->averageConfidence($data['prism_wordsInfo'], true)
                ];
            }
        } elseif (isset($data['page_list']) && is_array($data['page_list'])) {
            foreach ($data['page_list'] as $page) {
                if (!isset($page['prism_wordsInfo']) || !is_array($page['prism_wordsInfo'])) {
                    continue;
                }

                $words = $this->collectWords($page['prism_wordsInfo']);
                if (!empty($words)) {
                    // Per-page results report a fixed confidence of 1.0.
                    $texts[] = [
                        'text' => implode('', $words),
                        'confidence' => 1.0
                    ];
                }
            }
        }

        return $texts;
    }

    /**
     * Collect the `word` value of every entry that has one.
     *
     * @param array $wordsInfo
     * @return array
     */
    private function collectWords(array $wordsInfo): array
    {
        $words = [];
        foreach ($wordsInfo as $wordInfo) {
            if (isset($wordInfo['word'])) {
                $words[] = $wordInfo['word'];
            }
        }

        return $words;
    }

    /**
     * Concatenate the `word` values of a word-info list without a separator.
     *
     * @param array $wordsInfo
     * @return string
     */
    private function joinWords(array $wordsInfo): string
    {
        return implode('', $this->collectWords($wordsInfo));
    }

    /**
     * Mean of the numeric `prob` values in a word-info list, divided by 100
     * (presumably `prob` is a 0-100 percentage -- confirm against the API docs).
     *
     * @param array $wordsInfo
     * @param bool  $onlyWithWord When true, entries without a `word` are ignored.
     * @return float 0.0 when no entry contributes.
     */
    private function averageConfidence(array $wordsInfo, bool $onlyWithWord): float
    {
        $totalProb = 0;
        $count = 0;

        foreach ($wordsInfo as $wordInfo) {
            if ($onlyWithWord && !isset($wordInfo['word'])) {
                continue;
            }
            // Skip non-numeric values so a malformed entry cannot break the sum.
            if (isset($wordInfo['prob']) && is_numeric($wordInfo['prob'])) {
                $totalProb += $wordInfo['prob'];
                $count++;
            }
        }

        return $count > 0 ? ($totalProb / $count) / 100 : 0.0;
    }
}
|