| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504 |
- <?php
- namespace App\Services;
- use App\Support\LogContext;
- use Illuminate\Support\Facades\Http;
- use Illuminate\Support\Facades\Log;
- class MarkdownQuestionParser
- {
/** Name of the AI backend to call; callAiApi() accepts 'deepseek' and 'openai'. */
private string $aiDriver;

/** DeepSeek base URL with any trailing slash removed. */
private string $deepseekBaseUrl;

/** Model name sent in DeepSeek chat-completion requests. */
private string $deepseekModel;

/** Value handed to the HTTP client's timeout() for DeepSeek requests. */
private int $deepseekTimeout;

/** OpenAI base URL with any trailing slash removed. */
private string $openAiBaseUrl;

/** Model name sent in OpenAI chat-completion requests. */
private string $openAiModel;

/** Value handed to the HTTP client's timeout() for OpenAI requests. */
private int $openAiTimeout;

/** @var string[] Trimmed, de-duplicated DeepSeek API keys (may be empty). */
private array $deepseekApiKeys;

/** @var string[] Trimmed, de-duplicated OpenAI API keys (may be empty). */
private array $openAiApiKeys;

/** Becomes true once an AI call fails with a rate-limit style error. */
private bool $rateLimited = false;

/**
 * Load driver, endpoint, model, timeout and API-key settings from the
 * `ai.*` configuration, falling back to environment variables / defaults.
 */
public function __construct()
{
    // Cast like every other setting below: the property is typed `string`,
    // so a null configuration value would otherwise raise a TypeError here.
    $this->aiDriver = (string) config('ai.driver', env('AI_DRIVER', 'deepseek'));

    $this->deepseekBaseUrl = rtrim((string) config('ai.deepseek.base_url', 'https://api.deepseek.com/v1'), '/');
    $this->deepseekModel = (string) config('ai.deepseek.model', 'deepseek-chat');
    $this->deepseekTimeout = (int) config('ai.deepseek.timeout', 30);
    $this->deepseekApiKeys = $this->splitApiKeys(
        (string) config('ai.deepseek.api_keys', env('DEEPSEEK_API_KEYS', ''))
    );

    $this->openAiBaseUrl = rtrim((string) config('ai.openai.base_url', 'https://api.openai.com/v1'), '/');
    $this->openAiModel = (string) config('ai.openai.model', 'gpt-3.5-turbo');
    $this->openAiTimeout = (int) config('ai.openai.timeout', 30);
    $this->openAiApiKeys = $this->splitApiKeys(
        (string) config('ai.openai.api_keys', env('OPENAI_API_KEYS', ''))
    );
}
/**
 * Parse a Markdown document and return one candidate-question array per block.
 *
 * The document is split by AsyncMarkdownSplitter; a failed split validation is
 * only logged, and parsing continues on a best-effort basis.
 *
 * @param string $markdown Full Markdown text.
 * @return array List of candidate structures as produced by parseRawMarkdown().
 */
public function parse(string $markdown): array
{
    $markdownSplitter = app(AsyncMarkdownSplitter::class);
    $questionBlocks = $markdownSplitter->split($markdown);

    $splitLooksValid = $markdownSplitter->validate($questionBlocks);
    if (!$splitLooksValid) {
        Log::warning('Markdown split validation failed; continue with best-effort parsing', [
            'blocks_count' => count($questionBlocks),
        ]);
    }

    $results = [];
    foreach ($questionBlocks as $questionBlock) {
        $rawMarkdown = (string) ($questionBlock['raw_markdown'] ?? '');
        $blockIndex = (int) ($questionBlock['index'] ?? 0);

        $results[] = $this->parseRawMarkdown($rawMarkdown, $blockIndex);
    }

    return $results;
}
/**
 * Parse the raw Markdown of a single question into a candidate structure.
 *
 * Behaviour depends on the `ai.parse_mode` configuration value:
 *  - 'heuristic': regex-based extraction only;
 *  - 'detect':    heuristic extraction plus the AI "is this a question" check;
 *  - otherwise:   AI structured parsing, merged over the heuristic result. When
 *                 that fails, the heuristic result is returned, with the AI
 *                 detection check applied unless a rate limit was recorded.
 *
 * @param string $rawMarkdown Markdown of one question block.
 * @param int    $index       Position of the block in the source document.
 * @return array Candidate structure.
 */
public function parseRawMarkdown(string $rawMarkdown, int $index): array
{
    Log::debug('Parse raw_markdown start', [
        'index' => $index,
        'raw_len' => strlen($rawMarkdown),
        'raw_sha1' => LogContext::sha1($rawMarkdown),
        'raw_excerpt' => LogContext::excerpt($rawMarkdown),
    ]);

    $heuristic = $this->parseBlock($rawMarkdown, $index);
    $parseMode = (string) config('ai.parse_mode', 'structured');

    if ($parseMode === 'heuristic') {
        Log::debug('Parse raw_markdown done (heuristic-only)', [
            'index' => $index,
        ]);

        return $heuristic;
    }

    if ($parseMode === 'detect') {
        $this->enhanceWithAi($heuristic);
        Log::debug('Parse raw_markdown done (detect-only)', [
            'index' => $index,
            'is_question_candidate' => $heuristic['is_question_candidate'] ?? null,
            'ai_confidence' => $heuristic['ai_confidence'] ?? null,
        ]);

        return $heuristic;
    }

    // Structured AI parsing; on failure fall back to heuristic extraction + AI detection.
    $structured = $this->parseWithAi($heuristic['raw_markdown'], $heuristic['index']);

    if ($structured === null) {
        if (!$this->rateLimited) {
            $this->enhanceWithAi($heuristic);
        }
        Log::debug('Parse raw_markdown done (heuristic+detect)', [
            'index' => $index,
            'is_question_candidate' => $heuristic['is_question_candidate'] ?? null,
            'ai_confidence' => $heuristic['ai_confidence'] ?? null,
        ]);

        return $heuristic;
    }

    $sizeOf = static function ($value): int {
        return is_array($value) ? count($value) : 0;
    };

    Log::debug('Parse raw_markdown done (ai_structured)', [
        'index' => $index,
        'keys' => array_keys($structured),
        'is_question_candidate' => $structured['is_question_candidate'] ?? null,
        'ai_confidence' => $structured['ai_confidence'] ?? null,
        'options_count' => $sizeOf($structured['options'] ?? null),
        'images_count' => $sizeOf($structured['images'] ?? null),
        'tables_count' => $sizeOf($structured['tables'] ?? null),
    ]);

    return array_merge($heuristic, $structured);
}
/**
 * Build the heuristic candidate for one question block.
 *
 * Stem, options, images and tables come from the regex-based extractors;
 * the question flag starts as false and the confidence as null.
 *
 * @param string $block Raw Markdown of the block.
 * @param int    $index Position of the block in the source document.
 * @return array Candidate structure.
 */
private function parseBlock(string $block, int $index): array
{
    return [
        'index' => $index,
        'raw_markdown' => $block,
        'stem' => $this->extractStem($block),
        'options' => $this->extractOptions($block),
        'images' => $this->extractImages($block),
        'tables' => $this->extractTables($block),
        'is_question_candidate' => false,
        'ai_confidence' => null,
    ];
}
/**
 * Structured AI parsing: ask the configured AI driver to turn the raw Markdown
 * into candidate fields. Returns null when no prompt template is configured or
 * when anything in the request/normalisation path throws.
 *
 * A rate-limit style failure is additionally recorded in $this->rateLimited.
 *
 * @param string $rawMarkdown Markdown of one question block.
 * @param int    $index       Block index, substituted for `{index}` in the prompt.
 * @return array{
 *   index:int,
 *   stem:?string,
 *   options:?array,
 *   images:array,
 *   tables:array,
 *   is_question_candidate:bool,
 *   ai_confidence:?float,
 *   answer:mixed,
 *   solution:mixed,
 *   solution_steps:mixed
 * }|null
 */
private function parseWithAi(string $rawMarkdown, int $index): ?array
{
    $promptTemplate = (string) config('ai.question_parse_prompt');
    if (trim($promptTemplate) === '') {
        return null;
    }

    $prompt = str_replace(
        ['{index}', '{content}'],
        [(string) $index, $rawMarkdown],
        $promptTemplate
    );

    try {
        Log::debug('AI structured parse request', [
            'driver' => $this->aiDriver,
            'index' => $index,
            'prompt_len' => strlen($prompt),
            'raw_sha1' => LogContext::sha1($rawMarkdown),
        ]);

        $result = $this->callAiApi($prompt);

        // Accept both the candidate-library key names and the shorter aliases.
        $confidence = $result['ai_confidence'] ?? $result['confidence'] ?? null;

        $normalized = [
            'index' => (int) ($result['index'] ?? $index),
            'stem' => isset($result['stem']) ? (string) $result['stem'] : null,
            'options' => is_array($result['options'] ?? null) ? $result['options'] : null,
            'images' => is_array($result['images'] ?? null) ? $result['images'] : [],
            'tables' => is_array($result['tables'] ?? null) ? $result['tables'] : [],
            'is_question_candidate' => (bool) ($result['is_question_candidate'] ?? $result['is_question'] ?? false),
            'ai_confidence' => $confidence === null ? null : (float) $confidence,
            'answer' => $result['answer'] ?? null,
            'solution' => $result['solution'] ?? null,
            'solution_steps' => $result['solution_steps'] ?? $result['steps'] ?? [],
        ];

        // A question without options and without step-by-step solution gets a
        // second, dedicated prompt to fill in the steps (when enabled).
        $lacksOptionsAndSteps = empty($normalized['options']) && empty($normalized['solution_steps']);
        if ($lacksOptionsAndSteps
            && $normalized['is_question_candidate']
            && config('ai.enable_solution_steps', true)
        ) {
            $refined = $this->refineSolutionSteps($rawMarkdown);
            if ($refined) {
                $normalized['solution'] = $refined['solution'] ?? $normalized['solution'];
                $normalized['solution_steps'] = $refined['steps'] ?? [];
            }
        }

        Log::debug('AI structured parse response', [
            'driver' => $this->aiDriver,
            'index' => $index,
            'response_keys' => array_keys($result),
            'normalized' => [
                'index' => $normalized['index'],
                'is_question_candidate' => $normalized['is_question_candidate'],
                'ai_confidence' => $normalized['ai_confidence'],
                'options_count' => is_array($normalized['options']) ? count($normalized['options']) : 0,
                'images_count' => is_array($normalized['images']) ? count($normalized['images']) : 0,
                'tables_count' => is_array($normalized['tables']) ? count($normalized['tables']) : 0,
            ],
        ]);

        return $normalized;
    } catch (\Throwable $failure) {
        if ($this->isRateLimited($failure)) {
            $this->rateLimited = true;
        }

        Log::warning('AI structured parse failed, fallback to heuristic', [
            'index' => $index,
            'error' => $failure->getMessage(),
            'raw_sha1' => LogContext::sha1($rawMarkdown),
        ]);

        return null;
    }
}
/**
 * Extract the question stem: every non-blank, non-image line that appears
 * before the first option line ("A. ", "B. ", "C. " or "D. ").
 *
 * Lines are trimmed before being tested and collected.
 *
 * @param string $block Raw Markdown of one question block.
 * @return string|null Stem lines joined with "\n", or null when there are none.
 */
private function extractStem(string $block): ?string
{
    $stemLines = [];

    foreach (explode("\n", $block) as $line) {
        $line = trim($line);

        // The first option line marks the end of the stem.
        if (preg_match('/^[A-D]\.\s+/', $line) === 1) {
            break;
        }

        // Skip blank lines and image lines. Compare against '' instead of using
        // empty(): empty('0') is true, which used to drop a line consisting
        // solely of "0" from the stem.
        if ($line === '' || preg_match('/^<img/', $line) === 1) {
            continue;
        }

        $stemLines[] = $line;
    }

    return $stemLines === [] ? null : implode("\n", $stemLines);
}
/**
 * Extract multiple-choice options of the form "A. text" (labels A-D).
 *
 * Leading spaces/tabs before the label are tolerated so that this agrees with
 * extractStem(), which trims each line before testing for an option; without
 * that, an indented option ended the stem but was never captured here.
 * When a label occurs more than once, the last occurrence wins.
 *
 * @param string $block Raw Markdown of one question block.
 * @return array<string, string>|null Map of label => trimmed text, or null when no option is found.
 */
private function extractOptions(string $block): ?array
{
    $options = [];

    preg_match_all('/^[ \t]*([A-D])\.\s+(.+)$/m', $block, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $options[$match[1]] = trim($match[2]);
    }

    return $options === [] ? null : $options;
}
/**
 * Collect the `src` attribute of every HTML <img> tag in the block
 * (tag name matched case-insensitively), in order of appearance.
 *
 * @param string $block Raw Markdown of one question block.
 * @return string[] Image sources; empty when the block contains no <img> tag.
 */
private function extractImages(string $block): array
{
    preg_match_all('/<img[^>]+src=["\']([^"\']+)["\'][^>]*>/i', $block, $found);

    // Group 1 holds the captured src values; re-index to a plain list.
    return array_values($found[1]);
}
/**
 * Collect every HTML <table>...</table> fragment contained in the block.
 *
 * The tag name is matched case-insensitively (as the <img> extractor does) and
 * must end at a word boundary, so `<TABLE>` is found while `<tablefoo>` is not.
 * Matching is non-greedy, so consecutive tables are returned separately.
 *
 * @param string $block Raw Markdown of one question block.
 * @return string[] Table fragments including their tags, in order of appearance.
 */
private function extractTables(string $block): array
{
    preg_match_all('/<table\b[^>]*>.*?<\/table>/is', $block, $matches);

    return array_values($matches[0]);
}
/**
 * AI enhancement: ask the AI whether the block is a question and store the
 * verdict on the candidate (modified in place).
 *
 * On success with an `is_question` key, `is_question_candidate` and
 * `ai_confidence` are overwritten. On any failure the candidate is marked as
 * "not a question" with confidence 0.0, and a rate-limit style error is
 * recorded in $this->rateLimited.
 *
 * @param array $candidate Candidate structure; must contain `raw_markdown`.
 */
private function enhanceWithAi(array &$candidate): void
{
    $prompt = $this->buildQuestionDetectionPrompt($candidate['raw_markdown']);

    try {
        $result = $this->callAiApi($prompt);

        if (isset($result['is_question'])) {
            // Normalise types the same way parseWithAi() does.
            $candidate['is_question_candidate'] = (bool) $result['is_question'];
            $candidate['ai_confidence'] = isset($result['confidence']) ? (float) $result['confidence'] : null;
        }
    } catch (\Throwable $e) {
        // Catch \Throwable, not just \Exception: a malformed AI response can
        // surface as a TypeError, which must not abort the whole parse run.
        if ($this->isRateLimited($e)) {
            $this->rateLimited = true;
        }

        Log::error('AI question detection failed', [
            'error' => $e->getMessage(),
            // Multibyte-safe cut so the excerpt never ends inside a UTF-8 sequence.
            'block' => mb_substr($candidate['raw_markdown'], 0, 200),
        ]);

        // Default: not a question.
        $candidate['is_question_candidate'] = false;
        $candidate['ai_confidence'] = 0.0;
    }
}
/**
 * Build the question-detection prompt by substituting the raw Markdown for
 * `{content}` in the configured template (`ai.question_detection_prompt`).
 * A built-in template is used when the configured one is blank.
 *
 * @param string $rawMarkdown Markdown of one question block.
 * @return string Prompt text to send to the AI.
 */
private function buildQuestionDetectionPrompt(string $rawMarkdown): string
{
    $configured = (string) config('ai.question_detection_prompt');

    $template = trim($configured) !== ''
        ? $configured
        : "请判断下面这段 Markdown 是否是一道数学题目。\n\n题目内容:\n{content}\n\n请输出 JSON:{\"is_question\":true|false,\"confidence\":0~1}";

    return str_replace('{content}', $rawMarkdown, $template);
}
/**
 * Send the prompt to the backend selected by $this->aiDriver and return the
 * decoded JSON answer.
 *
 * @param string $prompt Prompt text.
 * @return array Decoded JSON object from the AI response.
 * @throws \Exception When the driver is neither 'deepseek' nor 'openai'.
 */
private function callAiApi(string $prompt): array
{
    switch ($this->aiDriver) {
        case 'deepseek':
            return $this->callDeepSeek($prompt);

        case 'openai':
            return $this->callOpenAI($prompt);

        default:
            throw new \Exception("Unsupported AI driver: {$this->aiDriver}");
    }
}
/**
 * Call the DeepSeek chat-completions endpoint and return the decoded JSON answer.
 *
 * @param string $prompt Prompt text sent as a single user message.
 * @return array Decoded JSON object found in the model's reply.
 * @throws \Exception On a non-successful HTTP status or an unusable reply.
 */
private function callDeepSeek(string $prompt): array
{
    $apiKey = $this->resolveApiKey(
        $this->deepseekApiKeys,
        (string) config('ai.deepseek.api_key', env('DEEPSEEK_API_KEY'))
    );

    return $this->requestChatCompletion(
        'DeepSeek',
        $this->deepseekBaseUrl,
        $this->deepseekModel,
        $this->deepseekTimeout,
        $apiKey,
        $prompt
    );
}

/**
 * Call the OpenAI chat-completions endpoint and return the decoded JSON answer.
 *
 * @param string $prompt Prompt text sent as a single user message.
 * @return array Decoded JSON object found in the model's reply.
 * @throws \Exception On a non-successful HTTP status or an unusable reply.
 */
private function callOpenAI(string $prompt): array
{
    $apiKey = $this->resolveApiKey(
        $this->openAiApiKeys,
        (string) config('ai.openai.api_key', env('OPENAI_API_KEY'))
    );

    return $this->requestChatCompletion(
        'OpenAI',
        $this->openAiBaseUrl,
        $this->openAiModel,
        $this->openAiTimeout,
        $apiKey,
        $prompt
    );
}

/**
 * Shared request logic for both providers: POST the prompt to
 * `{baseUrl}/chat/completions` and decode the JSON contained in the reply.
 *
 * @param string $provider Provider label used as prefix in error messages.
 * @param string $baseUrl  API base URL without trailing slash.
 * @param string $model    Model name.
 * @param int    $timeout  Value passed to the HTTP client's timeout().
 * @param string $apiKey   Bearer token.
 * @param string $prompt   Prompt text.
 * @return array Decoded JSON object.
 * @throws \Exception On a non-successful HTTP status, missing message content, or invalid JSON.
 */
private function requestChatCompletion(
    string $provider,
    string $baseUrl,
    string $model,
    int $timeout,
    string $apiKey,
    string $prompt
): array {
    $response = Http::withHeaders([
        'Authorization' => "Bearer {$apiKey}",
        'Content-Type' => 'application/json',
    ])->timeout($timeout)->post($baseUrl . '/chat/completions', [
        'model' => $model,
        'messages' => [
            ['role' => 'user', 'content' => $prompt],
        ],
        'temperature' => 0.1,
    ]);

    if (!$response->successful()) {
        // Keep the "HTTP <status>" wording: isRateLimited() looks for "HTTP 429".
        throw new \Exception("{$provider} API error: HTTP " . $response->status() . ' ' . $response->body());
    }

    $content = $response->json('choices.0.message.content');

    // A successful status does not guarantee the expected body shape. Without
    // this guard a missing content value reached parseJsonResponse(string) as
    // null and raised a TypeError instead of a regular exception.
    if (!is_string($content)) {
        throw new \Exception("{$provider} API error: response contains no message content");
    }

    return $this->parseJsonResponse($content);
}
/**
 * Pick the API key to use for a request.
 *
 * With an empty key list the fallback is returned. Otherwise the key is chosen
 * from the CRC32 of the current process id modulo the number of keys, so one
 * process keeps using the same key.
 *
 * @param string[] $keys     Candidate keys (list).
 * @param string   $fallback Key to use when $keys is empty.
 * @return string Selected key.
 */
private function resolveApiKey(array $keys, string $fallback): string
{
    $keyCount = count($keys);
    if ($keyCount === 0) {
        return $fallback;
    }

    $processHash = crc32((string) getmypid());
    $slot = (int) ($processHash % $keyCount);

    return $keys[$slot];
}
/**
 * Split a comma-separated key string into a list of trimmed, non-empty,
 * unique keys, keeping the order of first appearance.
 *
 * @param string $raw Comma-separated keys.
 * @return string[] Zero-indexed list of keys.
 */
private function splitApiKeys(string $raw): array
{
    $uniqueKeys = [];

    foreach (explode(',', $raw) as $piece) {
        $key = trim($piece);

        if ($key === '' || in_array($key, $uniqueKeys, true)) {
            continue;
        }

        $uniqueKeys[] = $key;
    }

    return $uniqueKeys;
}
/**
 * Ask the AI for step-by-step solution data for a question without options,
 * using the prompt built by QuestionPromptService.
 *
 * Failures of the AI call are logged and reported as null.
 *
 * @param string $rawMarkdown Markdown of one question block.
 * @return array|null Decoded AI answer, or null when the AI call failed.
 */
private function refineSolutionSteps(string $rawMarkdown): ?array
{
    $promptService = app(QuestionPromptService::class);
    $stepsPrompt = $promptService->buildSolutionStepsPrompt($rawMarkdown);

    try {
        $steps = $this->callAiApi($stepsPrompt);
    } catch (\Throwable $failure) {
        Log::warning('Refine solution steps failed', [
            'error' => $failure->getMessage(),
        ]);

        return null;
    }

    return $steps;
}
/**
 * Decode the JSON object embedded in an AI reply.
 *
 * Everything from the first `{` to the last `}` is treated as the JSON
 * payload, which lets replies wrapped in prose or code fences through.
 *
 * @param string $content Raw text returned by the model.
 * @return array Decoded JSON as an associative array.
 * @throws \Exception When no `{...}` span exists or the span is not valid JSON.
 */
private function parseJsonResponse(string $content): array
{
    // Greedy, dot-matches-newline: spans from the first "{" to the last "}".
    $matched = preg_match('/\{.*\}/s', $content, $found);
    if ($matched !== 1) {
        throw new \Exception('No JSON found in response');
    }

    $decoded = json_decode($found[0], true);
    if (json_last_error() === JSON_ERROR_NONE) {
        return $decoded;
    }

    throw new \Exception('Invalid JSON: ' . json_last_error_msg());
}
/**
 * Tell whether a failure looks like a rate-limit response, judged by a
 * case-insensitive search of the exception message for known markers.
 *
 * @param \Throwable $e Failure raised by an AI call.
 * @return bool True when the message contains one of the markers.
 */
private function isRateLimited(\Throwable $e): bool
{
    $message = $e->getMessage();

    foreach (['HTTP 429', 'rate limit', 'too many requests'] as $marker) {
        if (stripos($message, $marker) !== false) {
            return true;
        }
    }

    return false;
}
- }
|