|
@@ -0,0 +1,400 @@
|
|
|
|
|
+<?php
|
|
|
|
|
+
|
|
|
|
|
+namespace App\Services;
|
|
|
|
|
+
|
|
|
|
|
+use App\Support\LogContext;
|
|
|
|
|
+use Illuminate\Support\Facades\Http;
|
|
|
|
|
+use Illuminate\Support\Facades\Log;
|
|
|
|
|
+
|
|
|
|
|
+class MarkdownQuestionParser
|
|
|
|
|
+{
|
|
|
|
|
+ private string $aiDriver;
|
|
|
|
|
+ private string $deepseekBaseUrl;
|
|
|
|
|
+ private string $deepseekModel;
|
|
|
|
|
+ private int $deepseekTimeout;
|
|
|
|
|
+ private string $openAiBaseUrl;
|
|
|
|
|
+ private string $openAiModel;
|
|
|
|
|
+ private int $openAiTimeout;
|
|
|
|
|
+
|
|
|
|
|
+ public function __construct()
|
|
|
|
|
+ {
|
|
|
|
|
+ $this->aiDriver = config('ai.driver', env('AI_DRIVER', 'deepseek'));
|
|
|
|
|
+
|
|
|
|
|
+ $this->deepseekBaseUrl = rtrim((string) config('ai.deepseek.base_url', 'https://api.deepseek.com/v1'), '/');
|
|
|
|
|
+ $this->deepseekModel = (string) config('ai.deepseek.model', 'deepseek-chat');
|
|
|
|
|
+ $this->deepseekTimeout = (int) config('ai.deepseek.timeout', 30);
|
|
|
|
|
+
|
|
|
|
|
+ $this->openAiBaseUrl = rtrim((string) config('ai.openai.base_url', 'https://api.openai.com/v1'), '/');
|
|
|
|
|
+ $this->openAiModel = (string) config('ai.openai.model', 'gpt-3.5-turbo');
|
|
|
|
|
+ $this->openAiTimeout = (int) config('ai.openai.timeout', 30);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 解析 Markdown 文本,返回候选题数组
|
|
|
|
|
+ */
|
|
|
|
|
+ public function parse(string $markdown): array
|
|
|
|
|
+ {
|
|
|
|
|
+ $splitter = app(AsyncMarkdownSplitter::class);
|
|
|
|
|
+ $blocks = $splitter->split($markdown);
|
|
|
|
|
+ if (!$splitter->validate($blocks)) {
|
|
|
|
|
+ Log::warning('Markdown split validation failed; continue with best-effort parsing', [
|
|
|
|
|
+ 'blocks_count' => count($blocks),
|
|
|
|
|
+ ]);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ $candidates = [];
|
|
|
|
|
+
|
|
|
|
|
+ foreach ($blocks as $block) {
|
|
|
|
|
+ $candidates[] = $this->parseRawMarkdown(
|
|
|
|
|
+ (string) ($block['raw_markdown'] ?? ''),
|
|
|
|
|
+ (int) ($block['index'] ?? 0),
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return $candidates;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 解析单题 raw_markdown,返回候选题结构
|
|
|
|
|
+ */
|
|
|
|
|
+ public function parseRawMarkdown(string $rawMarkdown, int $index): array
|
|
|
|
|
+ {
|
|
|
|
|
+ Log::debug('Parse raw_markdown start', [
|
|
|
|
|
+ 'index' => $index,
|
|
|
|
|
+ 'raw_len' => strlen($rawMarkdown),
|
|
|
|
|
+ 'raw_sha1' => LogContext::sha1($rawMarkdown),
|
|
|
|
|
+ 'raw_excerpt' => LogContext::excerpt($rawMarkdown),
|
|
|
|
|
+ ]);
|
|
|
|
|
+
|
|
|
|
|
+ $candidate = $this->parseBlock($rawMarkdown, $index);
|
|
|
|
|
+
|
|
|
|
|
+ // AI 结构化解析(失败则回退为启发式提取 + AI 判题)
|
|
|
|
|
+ $aiStructured = $this->parseWithAi($candidate['raw_markdown'], $candidate['index']);
|
|
|
|
|
+ if ($aiStructured !== null) {
|
|
|
|
|
+ Log::debug('Parse raw_markdown done (ai_structured)', [
|
|
|
|
|
+ 'index' => $index,
|
|
|
|
|
+ 'keys' => array_keys($aiStructured),
|
|
|
|
|
+ 'is_question_candidate' => $aiStructured['is_question_candidate'] ?? null,
|
|
|
|
|
+ 'ai_confidence' => $aiStructured['ai_confidence'] ?? null,
|
|
|
|
|
+ 'options_count' => is_array($aiStructured['options'] ?? null) ? count($aiStructured['options']) : 0,
|
|
|
|
|
+ 'images_count' => is_array($aiStructured['images'] ?? null) ? count($aiStructured['images']) : 0,
|
|
|
|
|
+ 'tables_count' => is_array($aiStructured['tables'] ?? null) ? count($aiStructured['tables']) : 0,
|
|
|
|
|
+ ]);
|
|
|
|
|
+ return array_merge($candidate, $aiStructured);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ $this->enhanceWithAi($candidate);
|
|
|
|
|
+
|
|
|
|
|
+ Log::debug('Parse raw_markdown done (heuristic+detect)', [
|
|
|
|
|
+ 'index' => $index,
|
|
|
|
|
+ 'is_question_candidate' => $candidate['is_question_candidate'] ?? null,
|
|
|
|
|
+ 'ai_confidence' => $candidate['ai_confidence'] ?? null,
|
|
|
|
|
+ ]);
|
|
|
|
|
+
|
|
|
|
|
+ return $candidate;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 解析单个题目块
|
|
|
|
|
+ */
|
|
|
|
|
+ private function parseBlock(string $block, int $index): array
|
|
|
|
|
+ {
|
|
|
|
|
+ $candidate = [
|
|
|
|
|
+ 'index' => $index,
|
|
|
|
|
+ 'raw_markdown' => $block,
|
|
|
|
|
+ 'stem' => null,
|
|
|
|
|
+ 'options' => null,
|
|
|
|
|
+ 'images' => [],
|
|
|
|
|
+ 'tables' => [],
|
|
|
|
|
+ 'is_question_candidate' => false,
|
|
|
|
|
+ 'ai_confidence' => null,
|
|
|
|
|
+ ];
|
|
|
|
|
+
|
|
|
|
|
+ // ② Stem 提取
|
|
|
|
|
+ $candidate['stem'] = $this->extractStem($block);
|
|
|
|
|
+
|
|
|
|
|
+ // ③ 选项识别
|
|
|
|
|
+ $candidate['options'] = $this->extractOptions($block);
|
|
|
|
|
+
|
|
|
|
|
+ // ④ 图片识别
|
|
|
|
|
+ $candidate['images'] = $this->extractImages($block);
|
|
|
|
|
+
|
|
|
|
|
+ // ⑤ 表格识别
|
|
|
|
|
+ $candidate['tables'] = $this->extractTables($block);
|
|
|
|
|
+
|
|
|
|
|
+ return $candidate;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * AI 结构化解析:返回符合候选库字段的结构化数组,失败返回 null
|
|
|
|
|
+ *
|
|
|
|
|
+ * @return array{
|
|
|
|
|
+ * index:int,
|
|
|
|
|
+ * stem:?string,
|
|
|
|
|
+ * options:?array,
|
|
|
|
|
+ * images:array,
|
|
|
|
|
+ * tables:array,
|
|
|
|
|
+ * is_question_candidate:bool,
|
|
|
|
|
+ * ai_confidence:?float
|
|
|
|
|
+ * }|null
|
|
|
|
|
+ */
|
|
|
|
|
+ private function parseWithAi(string $rawMarkdown, int $index): ?array
|
|
|
|
|
+ {
|
|
|
|
|
+ $template = (string) config('ai.question_parse_prompt');
|
|
|
|
|
+ if (trim($template) === '') {
|
|
|
|
|
+ return null;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ $prompt = str_replace(['{index}', '{content}'], [(string) $index, $rawMarkdown], $template);
|
|
|
|
|
+
|
|
|
|
|
+ try {
|
|
|
|
|
+ Log::debug('AI structured parse request', [
|
|
|
|
|
+ 'driver' => $this->aiDriver,
|
|
|
|
|
+ 'index' => $index,
|
|
|
|
|
+ 'prompt_len' => strlen($prompt),
|
|
|
|
|
+ 'raw_sha1' => LogContext::sha1($rawMarkdown),
|
|
|
|
|
+ ]);
|
|
|
|
|
+
|
|
|
|
|
+ $result = $this->callAiApi($prompt);
|
|
|
|
|
+
|
|
|
|
|
+ $normalized = [
|
|
|
|
|
+ 'index' => (int) ($result['index'] ?? $index),
|
|
|
|
|
+ 'stem' => isset($result['stem']) ? (string) $result['stem'] : null,
|
|
|
|
|
+ 'options' => isset($result['options']) && is_array($result['options']) ? $result['options'] : null,
|
|
|
|
|
+ 'images' => isset($result['images']) && is_array($result['images']) ? $result['images'] : [],
|
|
|
|
|
+ 'tables' => isset($result['tables']) && is_array($result['tables']) ? $result['tables'] : [],
|
|
|
|
|
+ 'is_question_candidate' => (bool) ($result['is_question_candidate'] ?? $result['is_question'] ?? false),
|
|
|
|
|
+ 'ai_confidence' => isset($result['ai_confidence']) ? (float) $result['ai_confidence'] : (isset($result['confidence']) ? (float) $result['confidence'] : null),
|
|
|
|
|
+ ];
|
|
|
|
|
+
|
|
|
|
|
+ Log::debug('AI structured parse response', [
|
|
|
|
|
+ 'driver' => $this->aiDriver,
|
|
|
|
|
+ 'index' => $index,
|
|
|
|
|
+ 'response_keys' => array_keys($result),
|
|
|
|
|
+ 'normalized' => [
|
|
|
|
|
+ 'index' => $normalized['index'],
|
|
|
|
|
+ 'is_question_candidate' => $normalized['is_question_candidate'],
|
|
|
|
|
+ 'ai_confidence' => $normalized['ai_confidence'],
|
|
|
|
|
+ 'options_count' => is_array($normalized['options']) ? count($normalized['options']) : 0,
|
|
|
|
|
+ 'images_count' => is_array($normalized['images']) ? count($normalized['images']) : 0,
|
|
|
|
|
+ 'tables_count' => is_array($normalized['tables']) ? count($normalized['tables']) : 0,
|
|
|
|
|
+ ],
|
|
|
|
|
+ ]);
|
|
|
|
|
+
|
|
|
|
|
+ return $normalized;
|
|
|
|
|
+ } catch (\Throwable $e) {
|
|
|
|
|
+ Log::warning('AI structured parse failed, fallback to heuristic', [
|
|
|
|
|
+ 'index' => $index,
|
|
|
|
|
+ 'error' => $e->getMessage(),
|
|
|
|
|
+ 'raw_sha1' => LogContext::sha1($rawMarkdown),
|
|
|
|
|
+ ]);
|
|
|
|
|
+ return null;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 提取题目主干
|
|
|
|
|
+ */
|
|
|
|
|
+ private function extractStem(string $block): ?string
|
|
|
|
|
+ {
|
|
|
|
|
+ $lines = explode("\n", $block);
|
|
|
|
|
+ $stemLines = [];
|
|
|
|
|
+
|
|
|
|
|
+ foreach ($lines as $line) {
|
|
|
|
|
+ $line = trim($line);
|
|
|
|
|
+
|
|
|
|
|
+ // 跳过选项行
|
|
|
|
|
+ if (preg_match('/^[A-D]\.\s+/', $line)) {
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // 跳过空行和图片行
|
|
|
|
|
+ if (empty($line) || preg_match('/^<img/', $line)) {
|
|
|
|
|
+ continue;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ $stemLines[] = $line;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return empty($stemLines) ? null : implode("\n", $stemLines);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 提取选项
|
|
|
|
|
+ */
|
|
|
|
|
+ private function extractOptions(string $block): ?array
|
|
|
|
|
+ {
|
|
|
|
|
+ $options = [];
|
|
|
|
|
+
|
|
|
|
|
+ preg_match_all('/^([A-D])\.\s+(.+)$/m', $block, $matches, PREG_SET_ORDER);
|
|
|
|
|
+
|
|
|
|
|
+ foreach ($matches as $match) {
|
|
|
|
|
+ $label = $match[1];
|
|
|
|
|
+ $content = trim($match[2]);
|
|
|
|
|
+ $options[$label] = $content;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return empty($options) ? null : $options;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 提取图片
|
|
|
|
|
+ */
|
|
|
|
|
+ private function extractImages(string $block): array
|
|
|
|
|
+ {
|
|
|
|
|
+ $images = [];
|
|
|
|
|
+
|
|
|
|
|
+ preg_match_all('/<img[^>]+src=["\']([^"\']+)["\'][^>]*>/i', $block, $matches);
|
|
|
|
|
+
|
|
|
|
|
+ foreach ($matches[1] as $src) {
|
|
|
|
|
+ $images[] = $src;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return $images;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 提取表格
|
|
|
|
|
+ */
|
|
|
|
|
+ private function extractTables(string $block): array
|
|
|
|
|
+ {
|
|
|
|
|
+ $tables = [];
|
|
|
|
|
+
|
|
|
|
|
+ // 简单匹配 HTML 表格标签
|
|
|
|
|
+ preg_match_all('/<table[^>]*>.*?<\/table>/s', $block, $matches);
|
|
|
|
|
+
|
|
|
|
|
+ foreach ($matches[0] as $table) {
|
|
|
|
|
+ $tables[] = $table;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return $tables;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * AI 增强:判断是否为题目
|
|
|
|
|
+ */
|
|
|
|
|
+ private function enhanceWithAi(array &$candidate): void
|
|
|
|
|
+ {
|
|
|
|
|
+ $prompt = $this->buildQuestionDetectionPrompt($candidate['raw_markdown']);
|
|
|
|
|
+
|
|
|
|
|
+ try {
|
|
|
|
|
+ $result = $this->callAiApi($prompt);
|
|
|
|
|
+
|
|
|
|
|
+ if (isset($result['is_question'])) {
|
|
|
|
|
+ $candidate['is_question_candidate'] = $result['is_question'];
|
|
|
|
|
+ $candidate['ai_confidence'] = $result['confidence'] ?? null;
|
|
|
|
|
+ }
|
|
|
|
|
+ } catch (\Exception $e) {
|
|
|
|
|
+ Log::error('AI question detection failed', [
|
|
|
|
|
+ 'error' => $e->getMessage(),
|
|
|
|
|
+ 'block' => substr($candidate['raw_markdown'], 0, 200),
|
|
|
|
|
+ ]);
|
|
|
|
|
+
|
|
|
|
|
+ // 默认值:不是题目
|
|
|
|
|
+ $candidate['is_question_candidate'] = false;
|
|
|
|
|
+ $candidate['ai_confidence'] = 0.0;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 构建题目检测 Prompt
|
|
|
|
|
+ */
|
|
|
|
|
+ private function buildQuestionDetectionPrompt(string $rawMarkdown): string
|
|
|
|
|
+ {
|
|
|
|
|
+ $template = (string) config('ai.question_detection_prompt');
|
|
|
|
|
+ if (trim($template) === '') {
|
|
|
|
|
+ $template = "请判断下面这段 Markdown 是否是一道数学题目。\n\n题目内容:\n{content}\n\n请输出 JSON:{\"is_question\":true|false,\"confidence\":0~1}";
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return str_replace('{content}', $rawMarkdown, $template);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 调用 AI API
|
|
|
|
|
+ */
|
|
|
|
|
+ private function callAiApi(string $prompt): array
|
|
|
|
|
+ {
|
|
|
|
|
+ if ($this->aiDriver === 'deepseek') {
|
|
|
|
|
+ return $this->callDeepSeek($prompt);
|
|
|
|
|
+ } elseif ($this->aiDriver === 'openai') {
|
|
|
|
|
+ return $this->callOpenAI($prompt);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ throw new \Exception("Unsupported AI driver: {$this->aiDriver}");
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * DeepSeek API 调用
|
|
|
|
|
+ */
|
|
|
|
|
+ private function callDeepSeek(string $prompt): array
|
|
|
|
|
+ {
|
|
|
|
|
+ $apiKey = config('ai.deepseek.api_key', env('DEEPSEEK_API_KEY'));
|
|
|
|
|
+
|
|
|
|
|
+ $response = Http::withHeaders([
|
|
|
|
|
+ 'Authorization' => "Bearer {$apiKey}",
|
|
|
|
|
+ 'Content-Type' => 'application/json',
|
|
|
|
|
+ ])->timeout($this->deepseekTimeout)->post($this->deepseekBaseUrl . '/chat/completions', [
|
|
|
|
|
+ 'model' => $this->deepseekModel,
|
|
|
|
|
+ 'messages' => [
|
|
|
|
|
+ ['role' => 'user', 'content' => $prompt]
|
|
|
|
|
+ ],
|
|
|
|
|
+ 'temperature' => 0.1,
|
|
|
|
|
+ ]);
|
|
|
|
|
+
|
|
|
|
|
+ if (!$response->successful()) {
|
|
|
|
|
+ throw new \Exception('DeepSeek API error: ' . $response->body());
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ $content = $response->json('choices.0.message.content');
|
|
|
|
|
+
|
|
|
|
|
+ return $this->parseJsonResponse($content);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * OpenAI API 调用
|
|
|
|
|
+ */
|
|
|
|
|
+ private function callOpenAI(string $prompt): array
|
|
|
|
|
+ {
|
|
|
|
|
+ $apiKey = config('ai.openai.api_key', env('OPENAI_API_KEY'));
|
|
|
|
|
+
|
|
|
|
|
+ $response = Http::withHeaders([
|
|
|
|
|
+ 'Authorization' => "Bearer {$apiKey}",
|
|
|
|
|
+ 'Content-Type' => 'application/json',
|
|
|
|
|
+ ])->timeout($this->openAiTimeout)->post($this->openAiBaseUrl . '/chat/completions', [
|
|
|
|
|
+ 'model' => $this->openAiModel,
|
|
|
|
|
+ 'messages' => [
|
|
|
|
|
+ ['role' => 'user', 'content' => $prompt]
|
|
|
|
|
+ ],
|
|
|
|
|
+ 'temperature' => 0.1,
|
|
|
|
|
+ ]);
|
|
|
|
|
+
|
|
|
|
|
+ if (!$response->successful()) {
|
|
|
|
|
+ throw new \Exception('OpenAI API error: ' . $response->body());
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ $content = $response->json('choices.0.message.content');
|
|
|
|
|
+
|
|
|
|
|
+ return $this->parseJsonResponse($content);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ /**
|
|
|
|
|
+ * 解析 AI 返回的 JSON
|
|
|
|
|
+ */
|
|
|
|
|
+ private function parseJsonResponse(string $content): array
|
|
|
|
|
+ {
|
|
|
|
|
+ // 提取 JSON 部分
|
|
|
|
|
+ preg_match('/\{.*\}/s', $content, $matches);
|
|
|
|
|
+
|
|
|
|
|
+ if (empty($matches[0])) {
|
|
|
|
|
+ throw new \Exception('No JSON found in response');
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ $json = json_decode($matches[0], true);
|
|
|
|
|
+
|
|
|
|
|
+ if (json_last_error() !== JSON_ERROR_NONE) {
|
|
|
|
|
+ throw new \Exception('Invalid JSON: ' . json_last_error_msg());
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return $json;
|
|
|
|
|
+ }
|
|
|
|
|
+}
|