MarkdownQuestionParser.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484
  1. <?php
  2. namespace App\Services;
  3. use App\Support\LogContext;
  4. use Illuminate\Support\Facades\Http;
  5. use Illuminate\Support\Facades\Log;
  6. class MarkdownQuestionParser
  7. {
  8. private string $aiDriver;
  9. private string $deepseekBaseUrl;
  10. private string $deepseekModel;
  11. private int $deepseekTimeout;
  12. private string $openAiBaseUrl;
  13. private string $openAiModel;
  14. private int $openAiTimeout;
  15. private array $deepseekApiKeys;
  16. private array $openAiApiKeys;
  17. private bool $rateLimited = false;
  /**
   * Load the AI driver settings from configuration: driver name, base URLs, models,
   * timeouts and the comma-separated API key pools for DeepSeek and OpenAI.
   *
   * Every string setting is cast explicitly. The properties are typed, so a config key
   * that exists but holds null would otherwise raise a TypeError on assignment.
   */
  public function __construct()
  {
      // NOTE(review): the env() fallbacks below only take effect while the configuration
      // is not cached; Laravel recommends calling env() from config files only.
      $this->aiDriver = (string) config('ai.driver', env('AI_DRIVER', 'deepseek'));

      $this->deepseekBaseUrl = rtrim((string) config('ai.deepseek.base_url', 'https://api.deepseek.com/v1'), '/');
      $this->deepseekModel = (string) config('ai.deepseek.model', 'deepseek-chat');
      $this->deepseekTimeout = (int) config('ai.deepseek.timeout', 30);
      $this->deepseekApiKeys = $this->splitApiKeys(
          (string) config('ai.deepseek.api_keys', env('DEEPSEEK_API_KEYS', ''))
      );

      $this->openAiBaseUrl = rtrim((string) config('ai.openai.base_url', 'https://api.openai.com/v1'), '/');
      $this->openAiModel = (string) config('ai.openai.model', 'gpt-3.5-turbo');
      $this->openAiTimeout = (int) config('ai.openai.timeout', 30);
      $this->openAiApiKeys = $this->splitApiKeys(
          (string) config('ai.openai.api_keys', env('OPENAI_API_KEYS', ''))
      );
  }
  /**
   * Split a Markdown document into blocks and parse every block into a candidate question.
   *
   * A failed split validation is only logged as a warning; parsing continues on a
   * best-effort basis.
   *
   * @param string $markdown Full Markdown text.
   *
   * @return array List with one candidate array per block, in the order the splitter returned them.
   */
  public function parse(string $markdown): array
  {
      $splitter = app(AsyncMarkdownSplitter::class);
      $blocks = $splitter->split($markdown);

      $blocksAreValid = $splitter->validate($blocks);
      if (!$blocksAreValid) {
          Log::warning('Markdown split validation failed; continue with best-effort parsing', [
              'blocks_count' => count($blocks),
          ]);
      }

      $results = [];
      foreach ($blocks as $entry) {
          // Missing fields fall back to an empty block at index 0.
          $raw = (string) ($entry['raw_markdown'] ?? '');
          $position = (int) ($entry['index'] ?? 0);
          $results[] = $this->parseRawMarkdown($raw, $position);
      }

      return $results;
  }
  /**
   * Parse the raw Markdown of a single question block into a candidate structure.
   *
   * The heuristic extraction always runs first. If the AI structured parse succeeds, its
   * fields are merged over the heuristic candidate. Otherwise the heuristic candidate is
   * returned, after an AI "is this a question" check unless a rate limit was already hit.
   *
   * @param string $rawMarkdown Raw Markdown of one block.
   * @param int    $index       Index of the block.
   *
   * @return array Candidate question structure.
   */
  public function parseRawMarkdown(string $rawMarkdown, int $index): array
  {
      Log::debug('Parse raw_markdown start', [
          'index' => $index,
          'raw_len' => strlen($rawMarkdown),
          'raw_sha1' => LogContext::sha1($rawMarkdown),
          'raw_excerpt' => LogContext::excerpt($rawMarkdown),
      ]);

      $heuristic = $this->parseBlock($rawMarkdown, $index);
      $structured = $this->parseWithAi($heuristic['raw_markdown'], $heuristic['index']);

      if ($structured === null) {
          // Fallback path: heuristic fields plus AI question detection.
          if ($this->rateLimited === false) {
              $this->enhanceWithAi($heuristic);
          }

          Log::debug('Parse raw_markdown done (heuristic+detect)', [
              'index' => $index,
              'is_question_candidate' => $heuristic['is_question_candidate'] ?? null,
              'ai_confidence' => $heuristic['ai_confidence'] ?? null,
          ]);

          return $heuristic;
      }

      // Element count for the debug log; anything that is not an array counts as 0.
      $sizeOf = static function ($value): int {
          return is_array($value) ? count($value) : 0;
      };

      Log::debug('Parse raw_markdown done (ai_structured)', [
          'index' => $index,
          'keys' => array_keys($structured),
          'is_question_candidate' => $structured['is_question_candidate'] ?? null,
          'ai_confidence' => $structured['ai_confidence'] ?? null,
          'options_count' => $sizeOf($structured['options'] ?? null),
          'images_count' => $sizeOf($structured['images'] ?? null),
          'tables_count' => $sizeOf($structured['tables'] ?? null),
      ]);

      return array_merge($heuristic, $structured);
  }
  /**
   * Build the heuristic candidate for one question block.
   *
   * Stem, options, images and tables come from the regex-based extractors. The block is
   * marked as "not a question" with no confidence until an AI step says otherwise.
   *
   * @param string $block Raw Markdown of the block.
   * @param int    $index Index of the block.
   *
   * @return array Candidate with keys index, raw_markdown, stem, options, images, tables,
   *               is_question_candidate and ai_confidence.
   */
  private function parseBlock(string $block, int $index): array
  {
      return [
          'index' => $index,
          'raw_markdown' => $block,
          'stem' => $this->extractStem($block),
          'options' => $this->extractOptions($block),
          'images' => $this->extractImages($block),
          'tables' => $this->extractTables($block),
          'is_question_candidate' => false,
          'ai_confidence' => null,
      ];
  }
  /**
   * Ask the configured AI driver for a structured parse of one raw Markdown block.
   *
   * The prompt template is read from config key `ai.question_parse_prompt`; its `{index}`
   * and `{content}` placeholders are substituted. The AI response is normalised
   * defensively, because the model may return values of unexpected types:
   *  - `index` is only trusted when numeric, otherwise the caller's index is used;
   *  - `stem` is only kept when scalar;
   *  - string flags such as "false" / "no" are interpreted as booleans instead of being
   *    truthy non-empty strings;
   *  - the confidence is only kept when numeric.
   *
   * For a question without options and without solution steps, a second prompt is used
   * to try to fill in the solution and its steps.
   *
   * @param string $rawMarkdown Raw Markdown of a single block.
   * @param int    $index       Index of the block; used in the prompt, the logs and as fallback.
   *
   * @return array{
   *     index:int,
   *     stem:?string,
   *     options:?array,
   *     images:array,
   *     tables:array,
   *     is_question_candidate:bool,
   *     ai_confidence:?float,
   *     answer:mixed,
   *     solution:mixed,
   *     solution_steps:mixed
   * }|null Null when no prompt template is configured or when the AI call or parse fails.
   */
  private function parseWithAi(string $rawMarkdown, int $index): ?array
  {
      $template = (string) config('ai.question_parse_prompt');
      if (trim($template) === '') {
          return null;
      }

      $prompt = str_replace(['{index}', '{content}'], [(string) $index, $rawMarkdown], $template);

      try {
          Log::debug('AI structured parse request', [
              'driver' => $this->aiDriver,
              'index' => $index,
              'prompt_len' => strlen($prompt),
              'raw_sha1' => LogContext::sha1($rawMarkdown),
          ]);

          $result = $this->callAiApi($prompt);

          $aiIndex = $result['index'] ?? null;
          $aiStem = $result['stem'] ?? null;

          // A JSON string such as "false" must not become true through a plain bool cast.
          $flag = $result['is_question_candidate'] ?? $result['is_question'] ?? false;
          if (is_string($flag)) {
              $flag = filter_var($flag, FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE) ?? (bool) $flag;
          }

          $aiConfidence = $result['ai_confidence'] ?? $result['confidence'] ?? null;

          $normalized = [
              'index' => is_numeric($aiIndex) ? (int) $aiIndex : $index,
              'stem' => is_scalar($aiStem) ? (string) $aiStem : null,
              'options' => isset($result['options']) && is_array($result['options']) ? $result['options'] : null,
              'images' => isset($result['images']) && is_array($result['images']) ? $result['images'] : [],
              'tables' => isset($result['tables']) && is_array($result['tables']) ? $result['tables'] : [],
              'is_question_candidate' => (bool) $flag,
              'ai_confidence' => is_numeric($aiConfidence) ? (float) $aiConfidence : null,
              'answer' => $result['answer'] ?? null,
              'solution' => $result['solution'] ?? null,
              'solution_steps' => $result['solution_steps'] ?? $result['steps'] ?? [],
          ];

          // Open-ended question (no options) without step-by-step solution: try the
          // dedicated solution-steps prompt to fill the gap.
          if (empty($normalized['options']) && empty($normalized['solution_steps']) && $normalized['is_question_candidate']) {
              $stepResult = $this->refineSolutionSteps($rawMarkdown);
              if ($stepResult) {
                  $normalized['solution'] = $stepResult['solution'] ?? $normalized['solution'];
                  $normalized['solution_steps'] = $stepResult['steps'] ?? [];
              }
          }

          Log::debug('AI structured parse response', [
              'driver' => $this->aiDriver,
              'index' => $index,
              'response_keys' => array_keys($result),
              'normalized' => [
                  'index' => $normalized['index'],
                  'is_question_candidate' => $normalized['is_question_candidate'],
                  'ai_confidence' => $normalized['ai_confidence'],
                  'options_count' => is_array($normalized['options']) ? count($normalized['options']) : 0,
                  'images_count' => is_array($normalized['images']) ? count($normalized['images']) : 0,
                  'tables_count' => is_array($normalized['tables']) ? count($normalized['tables']) : 0,
              ],
          ]);

          return $normalized;
      } catch (\Throwable $e) {
          // Remember a rate limit so the caller can skip the follow-up detection call.
          if ($this->isRateLimited($e)) {
              $this->rateLimited = true;
          }

          Log::warning('AI structured parse failed, fallback to heuristic', [
              'index' => $index,
              'error' => $e->getMessage(),
              'raw_sha1' => LogContext::sha1($rawMarkdown),
          ]);

          return null;
      }
  }
  /**
   * Extract the question stem: every non-blank, non-image line that precedes the first
   * option line ("A. " through "D. ").
   *
   * Lines are trimmed before they are inspected. Blank lines are detected with a strict
   * comparison against '' so that a line consisting only of "0" (a legitimate stem line in
   * a maths question) is kept; `empty()` would treat it as blank.
   *
   * @param string $block Raw Markdown of one block.
   *
   * @return string|null Stem lines joined with "\n", or null when there are none.
   */
  private function extractStem(string $block): ?string
  {
      $stemLines = [];

      foreach (explode("\n", $block) as $line) {
          $line = trim($line);

          // The first option line ends the stem.
          if (preg_match('/^[A-D]\.\s+/', $line) === 1) {
              break;
          }

          // Skip blank lines and lines that start with an <img tag.
          if ($line === '' || preg_match('/^<img/', $line) === 1) {
              continue;
          }

          $stemLines[] = $line;
      }

      return $stemLines === [] ? null : implode("\n", $stemLines);
  }
  /**
   * Extract multiple-choice options labelled "A." through "D." (one per line).
   *
   * Compared with a plain `^([A-D])\.\s+(.+)$` match, the pattern:
   *  - allows leading spaces/tabs, matching extractStem(), which trims each line before
   *    looking for an option marker;
   *  - requires the gap after the dot to be spaces/tabs only, so an option line with no
   *    content cannot swallow the following line as its text.
   *
   * @param string $block Raw Markdown of one block.
   *
   * @return array|null Map of option label => trimmed option text, or null when no option is found.
   */
  private function extractOptions(string $block): ?array
  {
      $options = [];

      preg_match_all('/^[ \t]*([A-D])\.[ \t]+(.+)$/m', $block, $matches, PREG_SET_ORDER);

      foreach ($matches as $match) {
          // trim() also removes a trailing "\r" left over from CRLF line endings.
          $options[$match[1]] = trim($match[2]);
      }

      return $options === [] ? null : $options;
  }
  /**
   * Collect the `src` attribute of every HTML <img> tag in the block.
   *
   * @param string $block Raw Markdown of one block.
   *
   * @return array List of image sources in document order; empty when there are none.
   */
  private function extractImages(string $block): array
  {
      $pattern = '/<img[^>]+src=["\']([^"\']+)["\'][^>]*>/i';

      preg_match_all($pattern, $block, $found);

      // Capture group 1 holds the src values.
      return array_values($found[1] ?? []);
  }
  /**
   * Collect every HTML <table>…</table> fragment in the block.
   *
   * The match is non-greedy and spans newlines. It is case-insensitive, like the <img>
   * extraction, because HTML tag names are case-insensitive (`<TABLE>` is a table too).
   *
   * @param string $block Raw Markdown of one block.
   *
   * @return array List of table fragments in document order; empty when there are none.
   */
  private function extractTables(string $block): array
  {
      preg_match_all('/<table[^>]*>.*?<\/table>/si', $block, $matches);

      // Index 0 holds the full matches.
      return array_values($matches[0] ?? []);
  }
  /**
   * AI enhancement: ask the model whether the block is a question and store the verdict
   * on the candidate (`is_question_candidate`, `ai_confidence`).
   *
   * Any failure is caught, including PHP `Error`s such as a TypeError from a malformed
   * API response, so a single bad block cannot abort the whole parse. On failure the
   * candidate is marked as "not a question" with confidence 0.0, and a rate-limit failure
   * is recorded in `$this->rateLimited`.
   *
   * @param array $candidate Candidate to update in place; must contain `raw_markdown`.
   */
  private function enhanceWithAi(array &$candidate): void
  {
      $prompt = $this->buildQuestionDetectionPrompt($candidate['raw_markdown']);

      try {
          $result = $this->callAiApi($prompt);

          if (isset($result['is_question'])) {
              // A JSON string such as "false" must not become true through a plain bool cast.
              $flag = $result['is_question'];
              if (is_string($flag)) {
                  $flag = filter_var($flag, FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE) ?? (bool) $flag;
              }

              $confidence = $result['confidence'] ?? null;

              $candidate['is_question_candidate'] = (bool) $flag;
              $candidate['ai_confidence'] = is_numeric($confidence) ? (float) $confidence : null;
          }
      } catch (\Throwable $e) {
          if ($this->isRateLimited($e)) {
              $this->rateLimited = true;
          }

          Log::error('AI question detection failed', [
              'error' => $e->getMessage(),
              // Character-wise cut so multibyte text is not split in the middle of a character.
              'block' => mb_substr($candidate['raw_markdown'], 0, 200),
          ]);

          // Default verdict: not a question.
          $candidate['is_question_candidate'] = false;
          $candidate['ai_confidence'] = 0.0;
      }
  }
  /**
   * Build the "is this a question?" prompt for a block.
   *
   * Uses the template from config key `ai.question_detection_prompt`, or a built-in
   * default when that template is blank, and substitutes the `{content}` placeholder.
   *
   * @param string $rawMarkdown Raw Markdown of one block.
   *
   * @return string Prompt text.
   */
  private function buildQuestionDetectionPrompt(string $rawMarkdown): string
  {
      $configured = (string) config('ai.question_detection_prompt');

      $template = trim($configured) !== ''
          ? $configured
          : "请判断下面这段 Markdown 是否是一道数学题目。\n\n题目内容:\n{content}\n\n请输出 JSON:{\"is_question\":true|false,\"confidence\":0~1}";

      return str_replace('{content}', $rawMarkdown, $template);
  }
  /**
   * Send a prompt to the configured AI driver and return the decoded JSON answer.
   *
   * @param string $prompt Prompt text.
   *
   * @return array Decoded JSON object from the model's answer.
   *
   * @throws \Exception When the configured driver is neither "deepseek" nor "openai",
   *                    or when the driver call itself fails.
   */
  private function callAiApi(string $prompt): array
  {
      $driver = $this->aiDriver;

      if ($driver !== 'deepseek' && $driver !== 'openai') {
          throw new \Exception("Unsupported AI driver: {$this->aiDriver}");
      }

      return $driver === 'deepseek'
          ? $this->callDeepSeek($prompt)
          : $this->callOpenAI($prompt);
  }
  /**
   * Call the DeepSeek chat-completions endpoint with a single user message.
   *
   * @param string $prompt Prompt text.
   *
   * @return array Decoded JSON object found in the model's answer.
   *
   * @throws \Exception On a non-2xx HTTP status (the message contains "HTTP <status>",
   *                    which isRateLimited() inspects), when the response carries no
   *                    textual completion, or when the answer contains no valid JSON.
   */
  private function callDeepSeek(string $prompt): array
  {
      $apiKey = $this->resolveApiKey(
          $this->deepseekApiKeys,
          (string) config('ai.deepseek.api_key', env('DEEPSEEK_API_KEY'))
      );

      $response = Http::withHeaders([
          'Authorization' => "Bearer {$apiKey}",
          'Content-Type' => 'application/json',
      ])->timeout($this->deepseekTimeout)->post($this->deepseekBaseUrl . '/chat/completions', [
          'model' => $this->deepseekModel,
          'messages' => [
              ['role' => 'user', 'content' => $prompt]
          ],
          'temperature' => 0.1,
      ]);

      if (!$response->successful()) {
          throw new \Exception('DeepSeek API error: HTTP ' . $response->status() . ' ' . $response->body());
      }

      $content = $response->json('choices.0.message.content');

      // A 2xx response without a string completion would otherwise hit the `string`
      // parameter of parseJsonResponse() and raise a TypeError instead of an Exception.
      if (!is_string($content)) {
          throw new \Exception('DeepSeek API error: response contains no completion content');
      }

      return $this->parseJsonResponse($content);
  }
  /**
   * Call the OpenAI chat-completions endpoint with a single user message.
   *
   * @param string $prompt Prompt text.
   *
   * @return array Decoded JSON object found in the model's answer.
   *
   * @throws \Exception On a non-2xx HTTP status (the message contains "HTTP <status>",
   *                    which isRateLimited() inspects), when the response carries no
   *                    textual completion, or when the answer contains no valid JSON.
   */
  private function callOpenAI(string $prompt): array
  {
      $apiKey = $this->resolveApiKey(
          $this->openAiApiKeys,
          (string) config('ai.openai.api_key', env('OPENAI_API_KEY'))
      );

      $response = Http::withHeaders([
          'Authorization' => "Bearer {$apiKey}",
          'Content-Type' => 'application/json',
      ])->timeout($this->openAiTimeout)->post($this->openAiBaseUrl . '/chat/completions', [
          'model' => $this->openAiModel,
          'messages' => [
              ['role' => 'user', 'content' => $prompt]
          ],
          'temperature' => 0.1,
      ]);

      if (!$response->successful()) {
          throw new \Exception('OpenAI API error: HTTP ' . $response->status() . ' ' . $response->body());
      }

      $content = $response->json('choices.0.message.content');

      // A 2xx response without a string completion would otherwise hit the `string`
      // parameter of parseJsonResponse() and raise a TypeError instead of an Exception.
      if (!is_string($content)) {
          throw new \Exception('OpenAI API error: response contains no completion content');
      }

      return $this->parseJsonResponse($content);
  }
  /**
   * Pick the API key this process should use.
   *
   * With an empty pool the fallback key is returned. Otherwise the key is chosen from a
   * CRC32 hash of the process id, so a given process keeps using the same key.
   *
   * @param array  $keys     Pool of API keys.
   * @param string $fallback Key to use when the pool is empty.
   *
   * @return string The selected key.
   */
  private function resolveApiKey(array $keys, string $fallback): string
  {
      if ($keys === []) {
          return $fallback;
      }

      // Re-index so the numeric slot below is valid even if the pool is not a plain list.
      $pool = array_values($keys);

      // crc32() can be negative on 32-bit builds; abs() keeps the slot inside the pool.
      $slot = abs(crc32((string) getmypid()) % count($pool));

      return $pool[$slot];
  }
  /**
   * Turn a comma-separated string of API keys into a clean list.
   *
   * Each entry is trimmed; empty entries and duplicates are dropped. The first
   * occurrence of a key determines its position.
   *
   * @param string $raw Comma-separated keys.
   *
   * @return array Zero-indexed list of unique, non-empty keys.
   */
  private function splitApiKeys(string $raw): array
  {
      $unique = [];

      foreach (explode(',', $raw) as $piece) {
          $key = trim($piece);

          if ($key === '' || in_array($key, $unique, true)) {
              continue;
          }

          $unique[] = $key;
      }

      return $unique;
  }
  /**
   * Refine the solution steps of an open-ended question with a dedicated prompt.
   *
   * The prompt comes from QuestionPromptService::buildSolutionStepsPrompt(). A failure is
   * logged and reported as null. As at the other AI call sites, a rate-limit failure is
   * recorded in `$this->rateLimited` so later follow-up calls can be skipped.
   *
   * @param string $rawMarkdown Raw Markdown of one block.
   *
   * @return array|null Decoded AI answer, or null when the call fails.
   */
  private function refineSolutionSteps(string $rawMarkdown): ?array
  {
      $prompt = app(QuestionPromptService::class)->buildSolutionStepsPrompt($rawMarkdown);

      try {
          return $this->callAiApi($prompt);
      } catch (\Throwable $e) {
          if ($this->isRateLimited($e)) {
              $this->rateLimited = true;
          }

          Log::warning('Refine solution steps failed', [
              'error' => $e->getMessage(),
          ]);

          return null;
      }
  }
  /**
   * Decode the JSON object embedded in an AI answer.
   *
   * The candidate JSON is the span from the first "{" to the last "}" in the text, so
   * surrounding prose or code fences are ignored.
   *
   * @param string $content Raw text returned by the model.
   *
   * @return array Decoded JSON as an associative array.
   *
   * @throws \Exception When no "{…}" span is found or the span is not valid JSON.
   */
  private function parseJsonResponse(string $content): array
  {
      $hit = [];
      if (preg_match('/\{.*\}/s', $content, $hit) !== 1) {
          throw new \Exception('No JSON found in response');
      }

      $decoded = json_decode($hit[0], true);

      if (json_last_error() === JSON_ERROR_NONE) {
          return $decoded;
      }

      throw new \Exception('Invalid JSON: ' . json_last_error_msg());
  }
  /**
   * Tell whether an error looks like an API rate limit.
   *
   * The check is a case-insensitive search of the error message for "HTTP 429",
   * "rate limit" or "too many requests".
   *
   * @param \Throwable $e Error raised by an AI call.
   *
   * @return bool True when the message contains one of the rate-limit markers.
   */
  private function isRateLimited(\Throwable $e): bool
  {
      $message = $e->getMessage();

      foreach (['HTTP 429', 'rate limit', 'too many requests'] as $marker) {
          if (stripos($message, $marker) !== false) {
              return true;
          }
      }

      return false;
  }
  407. }