<?php
// Source: yms/math_cms

namespace App\Services;

use Illuminate\Support\Facades\Log;
use Illuminate\Support\Regex;

class OCRDataParser
{
    /**
     * Turn a raw OCR payload into a list of structured questions.
     *
     * Pipeline: extract the text blocks, group them by question number,
     * build one structured record per group and, when paper info is given,
     * refine the records against it.
     *
     * @param array      $ocrData   Raw data returned by the Aliyun OCR service.
     * @param array|null $paperInfo Optional system paper info used to refine the
     *                              result; the refinement is skipped when this is
     *                              null or an empty array.
     * @return array List of structured question records.
     */
    public function parseStructuredQuestions(array $ocrData, ?array $paperInfo = null): array
    {
        $questions = $this->processQuestionUnits(
            $this->groupBlocksByQuestionNumber(
                $this->extractTextBlocks($ocrData)
            )
        );

        if (!$paperInfo) {
            return $questions;
        }

        return $this->optimizeWithPaperInfo($questions, $paperInfo);
    }

    /**
     * Extract every non-empty text block plus page metadata from an OCR payload.
     *
     * The payload may be wrapped in one or more nested 'raw' envelopes and its
     * 'data' entry may be a JSON string; both are unwrapped here. Blocks are read
     * from data.page_list[*].answer_list[*].
     *
     * @param array $ocrData Raw (possibly wrapped) OCR response.
     * @return array{blocks: array<int, array>, meta: array{height: mixed}}
     *         'blocks' is sorted by the Y of the first position point, with
     *         position-less blocks placed last; each block has the keys
     *         text, position, confidence, type and ids. 'meta.height' is the
     *         height of the last page that reports one, or 2000 by default.
     */
    private function extractTextBlocksAndMeta(array $ocrData): array
    {
        $blocks = [];
        $meta = ['height' => 2000]; // Fallback when no page reports a height.

        // Unwrap nested 'raw' envelopes.
        while (isset($ocrData['raw']) && is_array($ocrData['raw'])) {
            $ocrData = $ocrData['raw'];
        }

        // 'data' may arrive as a JSON string.
        if (isset($ocrData['data']) && is_string($ocrData['data'])) {
            $ocrData['data'] = json_decode($ocrData['data'], true);
        }

        $pages = $ocrData['data']['page_list'] ?? [];
        if (!is_array($pages)) {
            $pages = [];
        }

        foreach ($pages as $page) {
            if (isset($page['height'])) {
                $meta['height'] = $page['height'];
            }

            foreach ($page['answer_list'] ?? [] as $item) {
                if (!isset($item['text'])) {
                    continue;
                }

                // Compare against '' explicitly: empty() would also discard "0",
                // which is a perfectly valid recognised answer.
                $text = trim($item['text']);
                if ($text === '') {
                    continue;
                }

                $position = null;
                if (!empty($item['content_list_info'])) {
                    $position = $item['content_list_info'][0]['pos'] ?? null;
                }

                $blocks[] = [
                    'text' => $text,
                    'position' => $position,
                    'confidence' => $item['confidence'] ?? null,
                    // Classify the trimmed text: the patterns are anchored at the
                    // start, so leading whitespace would defeat them.
                    'type' => $this->detectTextType($text),
                    'ids' => $item['ids'] ?? [],
                ];
            }
        }

        // Sort top-to-bottom. Blocks without a position are ordered after all
        // positioned blocks so that the comparator stays consistent.
        usort($blocks, static function (array $a, array $b): int {
            $aHasPosition = !empty($a['position']);
            $bHasPosition = !empty($b['position']);

            if (!$aHasPosition || !$bHasPosition) {
                return $bHasPosition <=> $aHasPosition;
            }

            return ($a['position'][0]['y'] ?? 0) <=> ($b['position'][0]['y'] ?? 0);
        });

        return ['blocks' => $blocks, 'meta' => $meta];
    }

    /**
     * Classify a piece of OCR text.
     *
     * Checks run in a fixed order and the first match wins:
     * section_header, question_number, option, answer_area, title,
     * student_info; anything else is 'content'.
     *
     * @param string $text Text of a single OCR block.
     * @return string One of: section_header, question_number, option,
     *                answer_area, title, student_info, content.
     */
    private function detectTextType(string $text): string
    {
        // Section headers are checked first because they may contain numerals.
        if (preg_match('/(一、选择题|二、填空题|三、解答题|本大题)/u', $text)) {
            return 'section_header';
        }

        // Question-number prefixes such as "1.", "1、", "1)", "(1)", "（一）".
        if (preg_match('/^[\(（]?[一二三四五六七八九十\d]+[\.\)）、]/u', $text)) {
            // A bare "(n)" or "(n)=" is treated as content, not a question number.
            if (preg_match('/^[\(（]\d+[\)）]=?\s*$/u', trim($text))) {
                return 'content';
            }
            // Require some text after the number, or a score marker such as "5分".
            if (mb_strlen($text) > 3 || preg_match('/\d+分/u', $text)) {
                return 'question_number';
            }
        }

        // Option prefixes such as "A.", "B)", "C）". The "u" modifier is required:
        // without it the full-width "）" is matched byte by byte, so any text like
        // "A：..." whose second character shares its lead byte counted as an option.
        if (preg_match('/^[A-Da-d][\.\)）]/u', $text)) {
            return 'option';
        }

        // Answer-area markers (score box, answer labels, underlines).
        if (preg_match('/(得分|评卷人|答案|填空|解答|______|____)/u', $text)) {
            return 'answer_area';
        }

        // Paper title.
        if (preg_match('/(试卷|测试卷|考试|试题)/u', $text)) {
            return 'title';
        }

        // Student information fields.
        if (preg_match('/(姓名|班级|学号|年级)/u', $text)) {
            return 'student_info';
        }

        return 'content';
    }

    /**
     * Backward-compatible wrapper: returns only the 'blocks' part of
     * extractTextBlocksAndMeta().
     */
    private function extractTextBlocks(array $ocrData): array
    {
        $extracted = $this->extractTextBlocksAndMeta($ocrData);

        return $extracted['blocks'];
    }

    /**
     * Match the questions of a system paper against OCR output and extract the
     * student's answer for each of them.
     *
     * For every paper question the best matching OCR question is looked up; a
     * vertical answer region is derived from that match, the blocks whose centre
     * falls inside the region are collected, and the answer is extracted from them.
     * An OCR question is used at most once; a paper question whose best match is
     * missing or already taken receives an empty placeholder result.
     *
     * @param array $ocrRawData    OCR response, optionally wrapped under a 'raw' key.
     * @param mixed $paperQuestions Iterable that also offers count(); each item is read
     *                              through the properties question_number, question_type
     *                              and question_text. NOTE(review): presumably a Laravel
     *                              collection of question models - confirm against callers.
     * @return array Results keyed by question_number; each entry has the keys
     *               student_answer, confidence, coordinates (y_min / y_max),
     *               debug_info and question_text.
     */
    public function matchWithSystemPaper(array $ocrRawData, $paperQuestions): array
    {
        // Check if we have the wrapped structure with 'raw' key
        $dataToParse = isset($ocrRawData['raw']) ? $ocrRawData['raw'] : $ocrRawData;

        // 1. Extract all text blocks and the page metadata
        $extracted = $this->extractTextBlocksAndMeta($dataToParse);
        $blocks = $extracted['blocks'];
        $pageHeight = $extracted['meta']['height'];
        
        \Log::info('OCR匹配开始', [
            'total_blocks' => count($blocks),
            'page_height' => $pageHeight,
            'system_questions' => $paperQuestions->count()
        ]);
        
        // 2. Extract all questions from the OCR text blocks
        $ocrQuestions = $this->extractOCRQuestions($blocks);
        
        \Log::info('提取OCR题目', [
            'ocr_questions_count' => count($ocrQuestions),
            'ocr_question_numbers' => array_column($ocrQuestions, 'ocr_question_number')
        ]);
        
        // 3. For each system question, find the best matching OCR question
        $results = [];
        $usedOcrQuestions = []; // OCR question numbers already consumed, to avoid matching one twice
        
        foreach ($paperQuestions as $paperQuestion) {
            $bestMatch = $this->findBestMatchingOCRQuestion($ocrQuestions, $paperQuestion);
            
            if ($bestMatch && !in_array($bestMatch['ocr_question_number'], $usedOcrQuestions)) {
            $usedOcrQuestions[] = $bestMatch['ocr_question_number'];
            
            // Compute the Y range of the answer region
            $blockTop = $bestMatch['y_start'] ?? 0;
            $blockBottom = $bestMatch['y_end'] ?? $blockTop;

            // Start: slightly above the question number, to cover choice answers written on the same line
            $yStart = max(0, $blockTop - 10);

            // Start of the next question: the smallest y_start below this one
            // among OCR questions that have not been consumed yet
            $nextOcrY = null;
            foreach ($ocrQuestions as $nextOcrQ) {
                if ($nextOcrQ['y_start'] > $blockTop && 
                    !in_array($nextOcrQ['ocr_question_number'], $usedOcrQuestions)) {
                    if ($nextOcrY === null || $nextOcrQ['y_start'] < $nextOcrY) {
                        $nextOcrY = $nextOcrQ['y_start'];
                    }
                }
            }

            // Extend the height by question type (short for choice/fill, long for
            // worked answers); unknown types fall back to 220
            $questionType = $paperQuestion->question_type ?? null;
            $heightMap = [
                'choice' => 120,
                'fill' => 180,
                'answer' => 360,
            ];
            $extend = $heightMap[$questionType] ?? 220;

            // End: 5px before the next question or the default extension, whichever is
            // smaller, to avoid spilling into the next question
            $yEndCandidate = min($pageHeight, $blockBottom + $extend);
            if ($nextOcrY !== null) {
                $yEnd = min($yEndCandidate, $nextOcrY - 5);
                $yEnd = max($yEnd, $yStart + 120); // guaranteed minimum height
            } else {
                $yEnd = $yEndCandidate;
            }

            // Look for a section header; if one lies inside the current range, use it
            // as the lower boundary (only the first such header in block order is used)
            foreach ($blocks as $block) {
                $textType = $this->detectTextType($block['text']);
                if ($textType === 'section_header') {
                    $blockYTop = $this->getBlockTopY($block);
                    if ($blockYTop > $yStart && $blockYTop < $yEnd) {
                        $yEnd = min($yEnd, $blockYTop - 5);
                        break;
                    }
                }
            }

            // Minimum-height safety net
            if ($yEnd - $yStart < 80) {
                $yEnd = min($pageHeight, $yStart + 80);
            }

            // Make sure the Y range is valid
            if ($yEnd <= $yStart) {
                $yEnd = $yStart + 200; // give at least 200 pixels of room
            }
            
            \Log::debug("Q{$paperQuestion->question_number} Y范围", [
                'y_start' => $yStart,
                'y_end' => $yEnd,
                'range' => $yEnd - $yStart
            ]);
            
            // Collect the text blocks whose centre lies strictly inside the answer region
            $answerBlocks = [];
            foreach ($blocks as $block) {
                $blockY = $this->getBlockCenterY($block);
                if ($blockY > $yStart && $blockY < $yEnd) {
                    $answerBlocks[] = $block;
                }
            }
            
            // System question stem, with HTML tags removed
            $systemQuestionText = strip_tags($paperQuestion->question_text ?? '');
            
            // Extract the student's answer
            $studentAnswer = $this->extractAnswerFromBlocks(
                $answerBlocks,
                $systemQuestionText,
                $paperQuestion->question_type ?? null
            );
            
            // Average OCR confidence of the answer blocks (reported in debug_info only)
            $confidences = [];
            foreach ($answerBlocks as $block) {
                if (isset($block['confidence'])) {
                    $confidences[] = $block['confidence'];
                }
            }
            
            $avgConfidence = !empty($confidences) ? array_sum($confidences) / count($confidences) : 0;
            
            $results[$paperQuestion->question_number] = [
                'student_answer' => trim($studentAnswer),
                'confidence' => $bestMatch['similarity'], // the match similarity is used as the confidence
                'coordinates' => [
                    'y_min' => $yStart,
                    'y_max' => $yEnd
                ],
                'debug_info' => [
                    'y_start' => $yStart,
                    'y_end' => $yEnd,
                    'block_count' => count($answerBlocks),
                    'system_question_length' => mb_strlen($systemQuestionText),
                    'ocr_question_number' => $bestMatch['ocr_question_number'],
                    'match_similarity' => round($bestMatch['similarity'] * 100, 2) . '%',
                    'ocr_confidence' => round($avgConfidence * 100, 2) . '%'
                ],
                'question_text' => $systemQuestionText
            ];
            
            \Log::info("系统Q{$paperQuestion->question_number} 匹配到 OCR Q{$bestMatch['ocr_question_number']}", [
                'similarity' => round($bestMatch['similarity'] * 100, 2) . '%',
                'student_answer_preview' => mb_substr($studentAnswer, 0, 50)
            ]);
        } else {
                // No match found (or the best match was already consumed)
                \Log::warning("系统Q{$paperQuestion->question_number} 未找到匹配的OCR题目");
                
                $results[$paperQuestion->question_number] = [
                    'student_answer' => '',
                    'confidence' => 0,
                    'coordinates' => [
                        'y_min' => 0,
                        'y_max' => 0
                    ],
                    'debug_info' => [
                        'error' => '未找到匹配的OCR题目'
                    ],
                    'question_text' => strip_tags($paperQuestion->question_text ?? '')
                ];
            }
        }
        
        // 'matched_count' counts results whose student_answer is non-empty
        \Log::info('OCR匹配完成', [
            'matched_count' => count(array_filter($results, fn($r) => !empty($r['student_answer']))),
            'total_count' => count($results)
        ]);

        return $results;
    }

    /**
     * Split the blocks into per-question groups along the Y axis.
     *
     * Every block typed 'question_number' opens a segment that runs from its
     * centre Y up to (but excluding) the centre Y of the next question-number
     * block; the last segment is open-ended. A block belongs to a segment when
     * its centre Y falls inside it.
     *
     * @param array $blocks Text blocks carrying 'type', 'text' and 'position'.
     * @return array List of units with the keys question_number, question_text,
     *               blocks and y_range (start / end).
     */
    private function groupBlocksByQuestionNumber(array $blocks): array
    {
        // Pass 1: collect the question-number blocks in block order.
        $anchors = [];
        foreach ($blocks as $blockIndex => $candidate) {
            if ($candidate['type'] !== 'question_number') {
                continue;
            }

            $anchors[] = [
                'index' => $blockIndex,
                'text' => $candidate['text'],
                'y' => $this->getBlockCenterY($candidate),
                'number' => $this->extractQuestionNumber($candidate['text']),
            ];
        }

        // Pass 2: assign blocks to the segment opened by each anchor.
        $units = [];
        foreach ($anchors as $anchorIndex => $anchor) {
            $following = $anchors[$anchorIndex + 1] ?? null;

            $from = $anchor['y'];
            $to = $following ? $following['y'] : PHP_INT_MAX;

            $members = [];
            foreach ($blocks as $candidate) {
                $centerY = $this->getBlockCenterY($candidate);
                $insideSegment = $centerY >= $from && ($following === null || $centerY < $to);
                if ($insideSegment) {
                    $members[] = $candidate;
                }
            }

            $units[] = [
                'question_number' => $anchor['number'],
                'question_text' => $anchor['text'],
                'blocks' => $members,
                'y_range' => ['start' => $from, 'end' => $to],
            ];
        }

        return $units;
    }

    /**
     * Build one structured question record per grouped unit.
     *
     * Blocks are routed by their 'type': 'content' feeds the question text,
     * 'option' is parsed into letter + content, 'answer_area' feeds the answer
     * extraction. Blocks of any other type are ignored here.
     *
     * @param array $questionUnits Units as produced by groupBlocksByQuestionNumber().
     * @return array List of records with the keys question_number, content,
     *               options, answer and confidence.
     */
    private function processQuestionUnits(array $questionUnits): array
    {
        $structuredQuestions = [];

        foreach ($questionUnits as $unit) {
            $contentParts = [];
            $options = [];
            $answerAreas = [];

            foreach ($unit['blocks'] as $block) {
                switch ($block['type']) {
                    case 'content':
                        $contentParts[] = $block['text'];
                        break;

                    case 'option':
                        // Split "A. text" into letter and content. The "u" modifier is
                        // required: without it the full-width "）" is matched one byte at
                        // a time and its remaining bytes leak into the captured content.
                        if (preg_match('/^([A-Da-d])[\.\)）]\s*(.*)/u', $block['text'], $matches)) {
                            $options[] = [
                                'letter' => strtoupper($matches[1]),
                                'content' => trim($matches[2]),
                            ];
                        } else {
                            // Keep the raw text when no option letter can be isolated.
                            $options[] = [
                                'letter' => '',
                                'content' => $block['text'],
                            ];
                        }
                        break;

                    case 'answer_area':
                        $answerAreas[] = $block['text'];
                        break;
                }
            }

            $structuredQuestions[] = [
                'question_number' => $unit['question_number'],
                'content' => $this->mergeContentParts($contentParts),
                'options' => $options,
                'answer' => $this->extractAnswerFromAnswerAreas($answerAreas),
                'confidence' => $this->calculateConfidence($unit['blocks']),
            ];
        }

        return $structuredQuestions;
    }

    /**
     * Join the content fragments of a question into one string.
     *
     * Fragments that start with a question-number prefix ("1.", "1、", "(1)",
     * "（1）", ...) are dropped; the remaining fragments are joined with a single
     * space and the result is trimmed.
     *
     * @param array $contentParts Content fragments in reading order.
     * @return string Merged question text.
     */
    private function mergeContentParts(array $contentParts): string
    {
        $kept = [];

        foreach ($contentParts as $part) {
            // The "u" modifier makes the full-width brackets and "、" match as whole
            // characters; byte-wise matching missed "（1）" and mistook "1：" for a prefix.
            if (preg_match('/^[\(（]?\d+[\.\)）、]/u', $part)) {
                continue;
            }

            $kept[] = $part;
        }

        // implode() inserts the separator unconditionally, so a leading "0"
        // fragment no longer swallows the following space.
        return trim(implode(' ', $kept));
    }

    /**
     * Pick an answer out of the answer-area texts.
     *
     * The first area containing a letter A-D (either case) decides the answer
     * immediately (upper-cased). Otherwise the first whitespace-delimited token
     * of the last area that is not a score box ("得分" / "评卷人") is used.
     *
     * @param array $answerAreas Texts of the answer-area blocks.
     * @return string Extracted answer, or '' when nothing usable was found.
     */
    private function extractAnswerFromAnswerAreas(array $answerAreas): string
    {
        $fallback = '';

        foreach ($answerAreas as $text) {
            // A choice letter wins over everything collected so far.
            if (preg_match('/([A-Da-d])/', $text, $letter)) {
                return strtoupper($letter[1]);
            }

            // Fill-in candidate: first token of an area that is not a score box.
            $hasToken = preg_match('/\S+/', $text, $token);
            if ($hasToken && !preg_match('/(得分|评卷人)/', $text)) {
                $fallback = trim($token[0]);
            }
        }

        return $fallback;
    }

    /**
     * Average the non-null 'confidence' values of the given blocks.
     *
     * @param array $blocks Blocks carrying a 'confidence' entry (possibly null).
     * @return float Mean confidence, or 0.8 when no block has a confidence value.
     */
    private function calculateConfidence(array $blocks): float
    {
        $sum = 0;
        $seen = 0;

        foreach ($blocks as $entry) {
            if ($entry['confidence'] === null) {
                continue;
            }

            $sum += $entry['confidence'];
            ++$seen;
        }

        if ($seen === 0) {
            return 0.8;
        }

        return $sum / $seen;
    }

    /**
     * Vertical centre of a block: the midpoint between the smallest and the
     * largest 'y' among its position points, truncated to an integer.
     *
     * @param array $block Block whose 'position' is a list of points with a 'y' entry.
     * @return int|null Centre Y, or null when the block has no usable position.
     */
    private function getBlockCenterY(array $block): ?int
    {
        if (empty($block['position']) || !is_array($block['position'])) {
            return null;
        }

        $yValues = [];
        foreach ($block['position'] as $point) {
            if (isset($point['y'])) {
                $yValues[] = $point['y'];
            }
        }

        if ($yValues === []) {
            return null;
        }

        // The midpoint can be fractional (e.g. 12.5). Cast explicitly: relying on
        // the ?int return type to convert a fractional float is deprecated since
        // PHP 8.1. The cast truncates, which is what the implicit conversion did.
        return (int) ((min($yValues) + max($yValues)) / 2);
    }

    /**
     * Read the first run of ASCII digits in the text as the question number.
     *
     * @param string $text Text of a question-number block.
     * @return int The number, or 0 when the text contains no digits.
     */
    private function extractQuestionNumber(string $text): int
    {
        $found = preg_match('/[\d]+/', $text, $digits);

        return $found ? (int) $digits[0] : 0;
    }

    /**
     * Refine parsed questions using the system paper definition.
     *
     * Each parsed question is looked up by its question number in
     * $paperInfo['questions']; when a definition exists, its question_type and
     * correct_answer (if set) drive the type-specific and answer refinements.
     * Questions without a definition pass through unchanged.
     *
     * @param array $questions Parsed question records.
     * @param array $paperInfo System paper info; its 'questions' list is optional.
     * @return array Refined question records, re-indexed as a list.
     */
    private function optimizeWithPaperInfo(array $questions, array $paperInfo): array
    {
        // Index the system definitions by question number.
        $definitions = [];
        foreach ($paperInfo['questions'] ?? [] as $definition) {
            $definitions[$definition['question_number']] = $definition;
        }

        $refined = [];
        foreach ($questions as $item) {
            $number = $item['question_number'];

            if (isset($definitions[$number])) {
                $definition = $definitions[$number];

                // Type-specific refinement.
                if (isset($definition['question_type'])) {
                    $item = $this->optimizeByQuestionType($item, $definition['question_type']);
                }

                // Answer refinement.
                if (isset($definition['correct_answer'])) {
                    $item = $this->optimizeAnswer($item, $definition['correct_answer']);
                }
            }

            $refined[] = $item;
        }

        return $refined;
    }

    /**
     * Apply the refinement that belongs to the given question type.
     *
     * - 'choice': when no options were parsed but the content contains a letter
     *   A-D, try to extract the options from the content.
     * - 'fill':   record the blank markers found in the content under 'blanks'.
     * - 'answer': copy the content to 'full_solution'.
     * Any other type leaves the question untouched.
     *
     * @param array  $question     Parsed question record.
     * @param string $questionType Question type from the system paper.
     * @return array The (possibly extended) question record.
     */
    private function optimizeByQuestionType(array $question, string $questionType): array
    {
        if ($questionType === 'choice') {
            $lacksOptions = empty($question['options']);
            if ($lacksOptions && preg_match('/[A-Da-d]/', $question['content'])) {
                $question['options'] = $this->extractOptionsFromContent($question['content']);
            }
        } elseif ($questionType === 'fill') {
            $question['blanks'] = $this->findFillBlanks($question['content']);
        } elseif ($questionType === 'answer') {
            $question['full_solution'] = $question['content'];
        }

        return $question;
    }

    /**
     * Normalise the answer of a question that has options.
     *
     * Questions without options are returned unchanged.
     *
     * @param array  $question      Parsed question record.
     * @param string $correctAnswer Correct answer from the system paper.
     * @return array The question record with a normalised 'answer' where applicable.
     */
    private function optimizeAnswer(array $question, string $correctAnswer): array
    {
        if (empty($question['options'])) {
            return $question;
        }

        $question['answer'] = $this->normalizeChoiceAnswer($question['answer'], $correctAnswer);

        return $question;
    }

    /**
     * Normalise a choice answer to an upper-case letter.
     *
     * Circled numbers ①-④ and the digits 1-4 map to A-D; any other value is
     * trimmed and upper-cased.
     *
     * @param string $studentAnswer Raw student answer.
     * @param string $correctAnswer Correct answer; not used by the current implementation.
     * @return string Normalised answer.
     */
    private function normalizeChoiceAnswer(string $studentAnswer, string $correctAnswer): string
    {
        $candidate = trim($studentAnswer);

        return match ($candidate) {
            '①', '1' => 'A',
            '②', '2' => 'B',
            '③', '3' => 'C',
            '④', '4' => 'D',
            default => strtoupper($candidate),
        };
    }

    /**
     * Extract choice options from multi-line content.
     *
     * Every line that starts with a letter A-D (either case) followed by ".",
     * ")" or the full-width "）" yields one option.
     *
     * @param string $content Question content, one candidate option per line.
     * @return array List of ['letter' => string, 'content' => string].
     */
    private function extractOptionsFromContent(string $content): array
    {
        $options = [];

        foreach (explode("\n", $content) as $line) {
            // The "u" modifier is required: without it the full-width "）" is matched
            // one byte at a time and its remaining bytes end up in the option content.
            if (preg_match('/^([A-Da-d])[\.\)）]\s*(.*)/u', trim($line), $matches)) {
                $options[] = [
                    'letter' => strtoupper($matches[1]),
                    'content' => trim($matches[2]),
                ];
            }
        }

        return $options;
    }

    /**
     * Locate fill-in-the-blank markers in the content.
     *
     * A marker is a run of two or more underscores, or a pair of full-width or
     * ASCII parentheses together with whatever they enclose.
     *
     * @param string $content Question content.
     * @return array The matched marker strings, in order of appearance.
     */
    private function findFillBlanks(string $content): array
    {
        $found = preg_match_all('/(_{2,})|（[\s\S]*?）|\([\s\S]*?\)/u', $content, $hits);

        return $found ? $hits[0] : [];
    }

    /**
     * Render a human-readable debug report of a parsing run.
     *
     * The report lists the number of raw text blocks, their distribution by
     * type, and for each structured question its number, the first 100
     * characters of its content, option count, answer and confidence.
     *
     * @param array $ocrData             Raw OCR payload.
     * @param array $structuredQuestions Records with the keys question_number,
     *                                   content, options, answer and confidence.
     * @return string Multi-line report.
     */
    public function generateDebugOutput(array $ocrData, array $structuredQuestions): string
    {
        $output = "=== OCR数据解析调试输出 ===\n\n";

        // Raw block statistics.
        $blocks = $this->extractTextBlocks($ocrData);
        $output .= "1. 原始文本块数量: " . count($blocks) . "\n";

        $typeStats = [];
        foreach ($blocks as $block) {
            $type = $block['type'];
            $typeStats[$type] = ($typeStats[$type] ?? 0) + 1;
        }
        $output .= "   类型分布: " . json_encode($typeStats, JSON_UNESCAPED_UNICODE) . "\n\n";

        // Structured questions.
        $output .= "2. 识别到的题目数量: " . count($structuredQuestions) . "\n";
        foreach ($structuredQuestions as $i => $q) {
            // mb_substr() cuts on character boundaries; byte-wise substr() could
            // split a multibyte character and emit invalid UTF-8.
            $preview = mb_substr($q['content'], 0, 100);

            // Compare against '' so that a recognised answer of "0" is still shown.
            $answer = $q['answer'] ?? '';
            $answerLabel = ($answer !== '' && $answer !== null) ? $answer : '未识别';

            $output .= "\n题目 " . ($i + 1) . " (题号: {$q['question_number']}):\n";
            $output .= "   - 内容: " . $preview . "...\n";
            $output .= "   - 选项数: " . count($q['options']) . "\n";
            $output .= "   - 答案: " . $answerLabel . "\n";
            $output .= "   - 置信度: " . round($q['confidence'] * 100, 2) . "%\n";
        }

        return $output;
    }


    /**
     * Find the block that anchors a paper question and return its coordinates.
     *
     * Strategies, tried in order:
     *  1. a block starting with the question number followed by ".", ")", "）"
     *     or "、" (optionally preceded by an opening bracket);
     *  2. a short block (under 5 bytes) that is just the number, or the number
     *     followed by whitespace;
     *  3. a block that contains, or closely resembles (> 80% by similar_text),
     *     the first 15 characters of the question text.
     *
     * @param array $blocks        Text blocks with 'text' and 'position'.
     * @param mixed $paperQuestion Object exposing question_number and question_text.
     * @return array|null Coordinates of the anchor block, or null when none matched.
     */
    public function findQuestionAnchor(array $blocks, $paperQuestion): ?array
    {
        $rawNumber = (string) $paperQuestion->question_number;
        // Quote the number before embedding it in a pattern so that characters
        // such as "." or "(" in it cannot change the meaning of the regex.
        $quotedNumber = preg_quote($rawNumber, '/');

        // question_text may be null; strip_tags() requires a string.
        $cleanContent = strip_tags($paperQuestion->question_text ?? '');
        $cleanContent = preg_replace('/\s+/', '', $cleanContent) ?? '';

        // Strategy 1: "<number>." style prefixes. The "u" modifier makes the
        // full-width brackets and "、" match as whole characters instead of bytes.
        foreach ($blocks as $block) {
            if (preg_match('/^[\(（]?' . $quotedNumber . '[\.\)）、]/u', $block['text'])) {
                return $this->getBlockCoordinates($block);
            }
        }

        // Strategy 2: a stand-alone number. OCR sometimes splits "1." into "1" and
        // ".", or yields "1 text"; only short blocks are accepted.
        foreach ($blocks as $block) {
            $startsWithNumber = preg_match('/^' . $quotedNumber . '\s+/', $block['text']);
            if ($startsWithNumber || $block['text'] === $rawNumber) {
                if (strlen($block['text']) < 5) {
                    return $this->getBlockCoordinates($block);
                }
            }
        }

        // Strategy 3: match on the beginning of the question text.
        $prefix = mb_substr($cleanContent, 0, 15);

        if (mb_strlen($prefix) > 2) {
            foreach ($blocks as $block) {
                $blockText = preg_replace('/\s+/', '', $block['text']) ?? '';

                // Plain containment.
                if (mb_strpos($blockText, $prefix) !== false) {
                    return $this->getBlockCoordinates($block);
                }

                // Fuzzy comparison against the start of the block.
                similar_text($prefix, mb_substr($blockText, 0, mb_strlen($prefix) + 5), $percent);
                if ($percent > 80) {
                    return $this->getBlockCoordinates($block);
                }
            }
        }

        return null;
    }

    /**
     * Bounding coordinates of a block, derived from its position points.
     *
     * @param array $block Block whose 'position' is a list of points with 'x' / 'y'.
     * @return array{y_top: mixed, y_bottom: mixed, x_left: mixed, x_right: mixed}
     *         Extremes of the available coordinates; an axis without any value
     *         (including a block without position) is reported as 0.
     */
    private function getBlockCoordinates(array $block): array
    {
        $points = (isset($block['position']) && is_array($block['position']))
            ? $block['position']
            : [];

        $ys = array_column($points, 'y');
        $xs = array_column($points, 'x');

        // min()/max() throw a ValueError on an empty array, so each axis is
        // guarded separately. All four keys are always present.
        return [
            'y_top' => $ys === [] ? 0 : min($ys),
            'y_bottom' => $ys === [] ? 0 : max($ys),
            'x_left' => $xs === [] ? 0 : min($xs),
            'x_right' => $xs === [] ? 0 : max($xs),
        ];
    }
    /**
     * Extract the answer from the OCR result of a cropped region by removing
     * the question text from it.
     *
     * The OCR text is taken from $cropResult['content'], or from the 'content'
     * of each entry in $cropResult['questions'] joined by newlines. Strategies,
     * tried in order:
     *  A. the normalised OCR text starts with the normalised question text:
     *     cut the question off the front;
     *  B. the tail of the question text occurs in the OCR text: return what follows;
     *  C. an "解:" / "答:" marker is present: return the text from the marker on;
     *  D. the text ends with "= value": return the value.
     * When nothing applies the OCR text is returned unchanged.
     *
     * @param array  $cropResult         OCR result of the cropped region.
     * @param string $systemQuestionText Question text from the system (may contain HTML).
     * @return string Extracted answer, or '' when the crop holds no text.
     */
    public function extractAnswerFromCrop(array $cropResult, string $systemQuestionText): string
    {
        // 1. Collect the recognised text.
        $ocrText = '';
        if (isset($cropResult['content'])) {
            $ocrText = $cropResult['content'];
        } elseif (isset($cropResult['questions'])) {
            $ocrText = implode("\n", array_column($cropResult['questions'], 'content'));
        }

        if (!is_scalar($ocrText)) {
            return '';
        }
        $ocrText = (string) $ocrText;

        // Compare against '' explicitly: empty() would also discard the answer "0".
        if ($ocrText === '') {
            return '';
        }

        // The comparison below ignores HTML tags, so the cutting must ignore them too.
        $plainQuestion = strip_tags($systemQuestionText);

        // 2. Normalise both texts for comparison.
        $normalizedOcr = $this->normalizeTextForComparison($ocrText);
        $normalizedSystem = $this->normalizeTextForComparison($systemQuestionText);

        // 3A. OCR text begins with the question text.
        if (str_starts_with($normalizedOcr, $normalizedSystem)) {
            return trim($this->removePrefixFuzzy($ocrText, $plainQuestion));
        }

        // 3B. Locate the end of the question inside the OCR text. findSplitIndex()
        // returns a character index, so the cut must use mb_substr(); byte-wise
        // substr() would cut multibyte text at the wrong place.
        $splitIndex = $this->findSplitIndex($ocrText, $plainQuestion);
        if ($splitIndex > 0) {
            return trim(mb_substr($ocrText, $splitIndex));
        }

        // 3C. Solution / answer marker: return the text starting at the marker.
        if (preg_match('/(解[:：]|答[:：])(.*)/su', $ocrText, $matches)) {
            return trim($matches[0]);
        }

        // 3D. Fill-in style "... = 3": return what follows the final equals sign.
        if (preg_match('/=\s*(\S+)$/', $ocrText, $matches)) {
            return trim($matches[1]);
        }

        return $ocrText;
    }

    /**
     * Reduce text to a form suitable for fuzzy comparison: HTML tags,
     * whitespace, punctuation and symbols are removed and letters are
     * lower-cased.
     *
     * @param string $text Text to normalise.
     * @return string Normalised text.
     */
    private function normalizeTextForComparison(string $text): string
    {
        $text = strip_tags($text);

        // Unicode-aware stripping, so that full-width punctuation and spaces
        // (common in Chinese text) are removed as well as the ASCII ones.
        $stripped = preg_replace('/[\s\p{Z}\p{P}\p{S}]+/u', '', $text);
        if ($stripped !== null) {
            return mb_strtolower($stripped, 'UTF-8');
        }

        // preg_replace() yields null when the input is not valid UTF-8; fall
        // back to byte-wise ASCII stripping instead of failing.
        $stripped = preg_replace('/\s+|[[:punct:]]/', '', $text) ?? $text;

        return strtolower($stripped);
    }

    /**
     * Remove a leading, approximately matching prefix from a text.
     *
     * The start of $fullText (the prefix's byte length plus 10 bytes) is compared
     * with $prefix via similar_text(); above 80% similarity as many characters as
     * $prefix contains are cut off the front. Otherwise the text is returned as is.
     *
     * @param string $fullText Text that is expected to start with the prefix.
     * @param string $prefix   Prefix to remove.
     * @return string Text without the prefix, or the unchanged text.
     */
    private function removePrefixFuzzy(string $fullText, string $prefix): string
    {
        // similar_text() works on bytes, so the comparison window is byte-based.
        $window = substr($fullText, 0, strlen($prefix) + 10);

        similar_text($window, $prefix, $percent);
        if ($percent <= 80) {
            return $fullText;
        }

        // Cut by character count rather than byte count: when OCR renders a
        // character with a different byte width (e.g. ":" for "："), a byte-based
        // cut lands in the wrong place or in the middle of a character.
        return mb_substr($fullText, mb_strlen($prefix));
    }

    /**
     * Find where the question text ends inside the OCR text.
     *
     * The last 10 characters of $systemText serve as an anchor (the whole text
     * when the anchor would be shorter than 5 characters). The result is the
     * character index just past the first occurrence of the anchor in $ocrText.
     *
     * @param string $ocrText    Recognised text.
     * @param string $systemText Question text from the system.
     * @return int Character index after the anchor, or 0 when it does not occur.
     */
    private function findSplitIndex(string $ocrText, string $systemText): int
    {
        $tail = mb_substr($systemText, -10);
        $tail = mb_strlen($tail) >= 5 ? $tail : $systemText;

        $offset = mb_strpos($ocrText, $tail);

        return $offset === false ? 0 : $offset + mb_strlen($tail);
    }

    /**
     * Find the index of the first block that belongs to the answer, by
     * accumulating block texts until they match the system question text.
     *
     * After each block the normalised accumulated text is compared with the
     * normalised question text. The question is considered complete when the
     * similarity exceeds 80%, or when the accumulated text is more than 1.2
     * times as long as the question text and the similarity exceeds 60%.
     *
     * @param array  $blocks     Text blocks in reading order.
     * @param string $systemText Question text from the system.
     * @return int Index of the block after the question, or 0 when no match was found.
     */
    private function findAnswerStartIndex(array $blocks, string $systemText): int
    {
        $target = $this->normalizeTextForComparison($systemText);
        $targetLength = mb_strlen($target);
        $running = '';

        foreach ($blocks as $index => $block) {
            $running .= $block['text'];
            $candidate = $this->normalizeTextForComparison($running);

            similar_text($candidate, $target, $percent);

            \Log::debug("Block {$index} accumulated similarity: {$percent}%", [
                'accumulated_length' => mb_strlen($running),
                'system_length' => mb_strlen($systemText)
            ]);

            // Question text fully covered.
            $covered = $percent > 80;

            // Length-based fallback for noisy OCR: clearly longer than the
            // question text while still reasonably similar.
            $overshot = mb_strlen($candidate) > $targetLength * 1.2 && $percent > 60;

            if ($covered || $overshot) {
                return $index + 1;
            }
        }

        // No match: start from the first block (conservative choice).
        return 0;
    }

    /**
     * Extract the answer text from OCR blocks, leaving out the question stem.
     *
     * Strategies are tried in order and the first hit is returned:
     *  1. An explicit answer marker ('答:', '答案:', '解:', '解答:') inside a
     *     block: the text between the first and second occurrence of the marker.
     *  2. For 'choice' questions: a block consisting of a single letter A-D
     *     (optionally followed by '.', ')' or '）'), returned upper-cased.
     *  3. For 'fill' questions: a block of at most 25 characters containing a
     *     digit or '=' whose normalized form does not occur in the normalized
     *     question text.
     *  4. Otherwise: the shortest block that is not structural, does not contain
     *     '分', and whose normalized form does not occur in the normalized
     *     question text.
     *
     * NOTE(review): the markers use an ASCII colon only; a full-width '：' would
     * not be recognized here — confirm whether the text is normalized upstream.
     *
     * @param array       $blocks             OCR blocks; each is read through its 'text' key.
     * @param string      $systemQuestionText Question stem stored in the system.
     * @param string|null $questionType       'choice', 'fill', or anything else for the generic path.
     * @return string The extracted answer, or '' when nothing qualifies.
     */
    private function extractAnswerFromBlocks(array $blocks, string $systemQuestionText, ?string $questionType = null): string
    {
        if (empty($blocks)) {
            return '';
        }

        // Blocks classified as question numbers, section headers or options are
        // never treated as answer content.
        $isStructural = fn (string $candidate): bool => in_array(
            $this->detectTextType($candidate),
            ['question_number', 'section_header', 'option'],
            true
        );

        // Strategy 1: explicit answer markers.
        foreach ($blocks as $block) {
            foreach (['答:', '答案:', '解:', '解答:'] as $keyword) {
                if (mb_strpos($block['text'], $keyword) === false) {
                    continue;
                }

                // Marker found: take what follows it.
                $parts = mb_split($keyword, $block['text']);
                if (count($parts) > 1) {
                    return trim($parts[1]);
                }
            }
        }

        // Strategy 2: multiple choice, prefer a lone option letter.
        if ($questionType === 'choice') {
            foreach ($blocks as $block) {
                $text = trim($block['text']);

                if ($isStructural($text)) {
                    continue;
                }

                if (preg_match('/^[A-Da-d][\.\)）]?$/u', $text)) {
                    return strtoupper(preg_replace('/[^A-D]/i', '', $text));
                }
            }
        }

        // Strategy 3: fill-in-the-blank, prefer a short number or equation.
        if ($questionType === 'fill') {
            foreach ($blocks as $block) {
                $text = trim($block['text']);

                if ($isStructural($text)) {
                    continue;
                }

                if (mb_strlen($text) > 25 || !preg_match('/[\\d=]/u', $text)) {
                    continue;
                }

                $normalizedText = $this->normalizeTextForComparison($text);
                $normalizedQuestion = $this->normalizeTextForComparison($systemQuestionText);

                // Only accept text that is not simply part of the question.
                if (mb_strpos($normalizedQuestion, $normalizedText) === false) {
                    return $text;
                }
            }
        }

        // Strategy 4: generic, collect non-stem snippets and prefer the shortest.
        $normalizedQuestion = $this->normalizeTextForComparison($systemQuestionText);
        $candidates = [];

        foreach ($blocks as $block) {
            $text = trim($block['text']);

            if ($isStructural($text)) {
                continue;
            }

            // Skip empty blocks and anything mentioning '分'.
            if ($text === '' || mb_strpos($text, '分') !== false) {
                continue;
            }

            $normalizedText = $this->normalizeTextForComparison($text);
            if ($normalizedText === '' || mb_strpos($normalizedQuestion, $normalizedText) !== false) {
                continue;
            }

            $candidates[] = $text;
        }

        if (empty($candidates)) {
            return '';
        }

        usort($candidates, fn ($left, $right) => mb_strlen($left) <=> mb_strlen($right));

        return trim($candidates[0]);
    }

    /**
     * Group OCR text blocks into questions, using question-number blocks as
     * delimiters.
     *
     * Blocks are scanned in order and classified with detectTextType():
     *  - 'question_number' closes the question in progress (if any) and opens a
     *    new one seeded with this block;
     *  - 'section_header' closes the question in progress and opens nothing, so
     *    the header and anything up to the next question number is dropped;
     *  - any other block is appended to the question in progress, or ignored
     *    when there is none.
     *
     * Each returned question has the keys 'ocr_question_number',
     * 'question_text' (block texts joined with a single space), 'blocks',
     * 'y_start' and 'y_end'.
     *
     * 'y_end' is the largest bottom Y seen across the question's blocks. It is
     * kept as a running maximum so that a later block that is shorter, out of
     * order, or has no position data (bottom Y of 0) cannot shrink the extent.
     *
     * @param array $blocks OCR blocks; each is read through its 'text' key and
     *                      passed to getBlockTopY()/getBlockBottomY().
     * @return array List of question arrays in the order they were encountered.
     */
    private function extractOCRQuestions(array $blocks): array
    {
        $ocrQuestions = [];
        $currentQuestion = null;

        foreach ($blocks as $block) {
            $type = $this->detectTextType($block['text']);

            if ($type === 'question_number') {
                // A new question number: flush the previous question first.
                if ($currentQuestion !== null) {
                    $ocrQuestions[] = $currentQuestion;
                }

                $currentQuestion = [
                    'ocr_question_number' => $this->extractQuestionNumber($block['text']),
                    'question_text' => $block['text'],
                    'blocks' => [$block],
                    'y_start' => $this->getBlockTopY($block),
                    'y_end' => $this->getBlockBottomY($block),
                ];
            } elseif ($type === 'section_header') {
                // A section header ends the current question and resets state.
                if ($currentQuestion !== null) {
                    $ocrQuestions[] = $currentQuestion;
                    $currentQuestion = null;
                }
            } elseif ($currentQuestion !== null) {
                // Any other block extends the question in progress.
                $currentQuestion['blocks'][] = $block;
                $currentQuestion['question_text'] .= ' ' . $block['text'];
                $currentQuestion['y_end'] = max(
                    $currentQuestion['y_end'],
                    $this->getBlockBottomY($block)
                );
            }
        }

        // Flush the last question.
        if ($currentQuestion !== null) {
            $ocrQuestions[] = $currentQuestion;
        }

        return $ocrQuestions;
    }
    
    /**
     * Pick the OCR question that best corresponds to a system paper question.
     *
     * Two passes are made over $ocrQuestions:
     *  1. The first entry whose 'ocr_question_number' is identical (===) to the
     *     integer question number is returned at once with a fixed baseline
     *     'similarity' of 0.6.
     *  2. Otherwise every entry is scored with similar_text() on the normalized
     *     texts (plus 20 points when the number matches); the best entry is
     *     returned with 'similarity' set to score / 100 if the score is at
     *     least 30.
     *
     * NOTE(review): the number comparison is strict against an int, so a
     * numeric-string 'ocr_question_number' would never match — confirm what
     * extractQuestionNumber() returns.
     * NOTE(review): any entry that would receive the 20-point boost in pass 2
     * has already been returned by pass 1, so the boost is effectively 0.
     *
     * @param array $ocrQuestions  Questions as produced from the OCR blocks; each
     *                             is read through 'ocr_question_number' and 'question_text'.
     * @param mixed $paperQuestion Object exposing question_text and question_number.
     * @return array|null The matched OCR question with an added 'similarity'
     *                    key, or null when no candidate reaches the threshold.
     */
    private function findBestMatchingOCRQuestion(array $ocrQuestions, $paperQuestion): ?array
    {
        $systemTextRaw = strip_tags($paperQuestion->question_text ?? '');
        $systemText = $this->normalizeTextForMatching($systemTextRaw);
        $targetNumber = (int) ($paperQuestion->question_number ?? 0);

        // Pass 1: a direct hit on the question number wins outright.
        foreach ($ocrQuestions as $ocrQ) {
            if (($ocrQ['ocr_question_number'] ?? null) !== $targetNumber) {
                continue;
            }

            $ocrQ['similarity'] = 0.6; // baseline similarity for number matches
            \Log::info("Found match by number for Q{$paperQuestion->question_number}", [
                'ocr_question_number' => $ocrQ['ocr_question_number']
            ]);

            return $ocrQ;
        }

        // Pass 2: score every candidate by text similarity and keep the best.
        $bestMatch = null;
        $bestSimilarity = 0;

        foreach ($ocrQuestions as $ocrQ) {
            $ocrText = $this->normalizeTextForMatching($ocrQ['question_text']);

            // A matching question number earns a bonus on top of the text score.
            $numberBoost = 0;
            if (($ocrQ['ocr_question_number'] ?? null) === $targetNumber) {
                $numberBoost = 20;
            }

            similar_text($systemText, $ocrText, $percent);
            $percent += $numberBoost; // a number match can offset small text differences

            \Log::debug("Matching Q{$paperQuestion->question_number} with OCR Q{$ocrQ['ocr_question_number']}", [
                'similarity' => round($percent, 2),
                'number_boost' => $numberBoost,
                'system_text_preview' => mb_substr($systemTextRaw, 0, 50),
                'ocr_text_preview' => mb_substr($ocrQ['question_text'], 0, 50)
            ]);

            if ($percent <= $bestSimilarity) {
                continue;
            }

            $bestSimilarity = $percent;
            $bestMatch = $ocrQ;
        }

        // The threshold is deliberately low to tolerate OCR errors and LaTeX
        // differences; anything below it is reported as no match.
        if ($bestSimilarity < 30) {
            \Log::warning("No match found for Q{$paperQuestion->question_number}", [
                'best_similarity' => round($bestSimilarity, 2) . '%'
            ]);

            return null;
        }

        $bestMatch['similarity'] = $bestSimilarity / 100;
        \Log::info("Found match for Q{$paperQuestion->question_number}", [
            'ocr_question_number' => $bestMatch['ocr_question_number'],
            'similarity' => round($bestSimilarity, 2) . '%'
        ]);

        return $bestMatch;
    }
    
    /**
     * Normalize text for fuzzy matching: drop LaTeX, HTML tags, punctuation,
     * symbols and whitespace, then lower-case what is left.
     *
     * Steps:
     *  1. Malformed UTF-8 byte sequences are replaced with mb_scrub(). Without
     *     this, the /u pattern below fails on bad input and preg_replace()
     *     returns null, which would wipe out the whole text.
     *  2. Inline/display math delimited by `$` or `$$` is removed together with
     *     its content.
     *  3. HTML tags are stripped.
     *  4. Every character that is not a Unicode letter or number is removed;
     *     this covers punctuation, symbols and whitespace in a single pass.
     *  5. The result is lower-cased.
     *
     * @param string $text Raw text (system question text or OCR output).
     * @return string Letters and digits only, lower-cased.
     */
    private function normalizeTextForMatching(string $text): string
    {
        // Make sure the Unicode-aware pattern below always sees valid UTF-8.
        $text = mb_scrub($text, 'UTF-8');

        // Remove LaTeX markup (both $$...$$ and $...$).
        $text = preg_replace('/\$\$?[^\$]+\$\$?/s', '', $text) ?? $text;

        // Remove HTML tags.
        $text = strip_tags($text);

        // Keep letters and numbers only (drops punctuation, symbols, spaces).
        $text = preg_replace('/[^\p{L}\p{N}]/u', '', $text) ?? '';

        return mb_strtolower($text);
    }
    
    /**
     * Get the top Y coordinate of a block: the smallest 'y' among the entries of
     * its 'position' list.
     *
     * @param array $block OCR block; 'position' is read as a list of arrays
     *                     that may carry a 'y' key.
     * @return int The smallest 'y' cast to int, or 0 when the block has no
     *             position data or none of its entries has a 'y' key.
     */
    private function getBlockTopY(array $block): int
    {
        if (empty($block['position'])) {
            return 0;
        }

        $ys = array_column($block['position'], 'y');
        if ($ys === []) {
            // min() on an empty array throws on PHP 8; treat as "no position".
            return 0;
        }

        // Explicit cast: coordinates may arrive as floats or numeric strings.
        return (int) min($ys);
    }
    
    /**
     * Get the bottom Y coordinate of a block: the largest 'y' among the entries
     * of its 'position' list.
     *
     * @param array $block OCR block; 'position' is read as a list of arrays
     *                     that may carry a 'y' key.
     * @return int The largest 'y' cast to int, or 0 when the block has no
     *             position data or none of its entries has a 'y' key.
     */
    private function getBlockBottomY(array $block): int
    {
        if (empty($block['position'])) {
            return 0;
        }

        $ys = array_column($block['position'], 'y');
        if ($ys === []) {
            // max() on an empty array throws on PHP 8; treat as "no position".
            return 0;
        }

        // Explicit cast: coordinates may arrive as floats or numeric strings.
        return (int) max($ys);
    }
}