MarkdownQuestionParser.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504
  1. <?php
  2. namespace App\Services;
  3. use App\Support\LogContext;
  4. use Illuminate\Support\Facades\Http;
  5. use Illuminate\Support\Facades\Log;
  6. class MarkdownQuestionParser
  7. {
  8. private string $aiDriver;
  9. private string $deepseekBaseUrl;
  10. private string $deepseekModel;
  11. private int $deepseekTimeout;
  12. private string $openAiBaseUrl;
  13. private string $openAiModel;
  14. private int $openAiTimeout;
  15. private array $deepseekApiKeys;
  16. private array $openAiApiKeys;
  17. private bool $rateLimited = false;
  18. public function __construct()
  19. {
  20. $this->aiDriver = config('ai.driver', env('AI_DRIVER', 'deepseek'));
  21. $this->deepseekBaseUrl = rtrim((string) config('ai.deepseek.base_url', 'https://api.deepseek.com/v1'), '/');
  22. $this->deepseekModel = (string) config('ai.deepseek.model', 'deepseek-chat');
  23. $this->deepseekTimeout = (int) config('ai.deepseek.timeout', 30);
  24. $this->deepseekApiKeys = $this->splitApiKeys(
  25. (string) config('ai.deepseek.api_keys', env('DEEPSEEK_API_KEYS', ''))
  26. );
  27. $this->openAiBaseUrl = rtrim((string) config('ai.openai.base_url', 'https://api.openai.com/v1'), '/');
  28. $this->openAiModel = (string) config('ai.openai.model', 'gpt-3.5-turbo');
  29. $this->openAiTimeout = (int) config('ai.openai.timeout', 30);
  30. $this->openAiApiKeys = $this->splitApiKeys(
  31. (string) config('ai.openai.api_keys', env('OPENAI_API_KEYS', ''))
  32. );
  33. }
  34. /**
  35. * 解析 Markdown 文本,返回候选题数组
  36. */
  37. public function parse(string $markdown): array
  38. {
  39. $splitter = app(AsyncMarkdownSplitter::class);
  40. $blocks = $splitter->split($markdown);
  41. if (!$splitter->validate($blocks)) {
  42. Log::warning('Markdown split validation failed; continue with best-effort parsing', [
  43. 'blocks_count' => count($blocks),
  44. ]);
  45. }
  46. $candidates = [];
  47. foreach ($blocks as $block) {
  48. $candidates[] = $this->parseRawMarkdown(
  49. (string) ($block['raw_markdown'] ?? ''),
  50. (int) ($block['index'] ?? 0),
  51. );
  52. }
  53. return $candidates;
  54. }
  55. /**
  56. * 解析单题 raw_markdown,返回候选题结构
  57. */
  58. public function parseRawMarkdown(string $rawMarkdown, int $index): array
  59. {
  60. Log::debug('Parse raw_markdown start', [
  61. 'index' => $index,
  62. 'raw_len' => strlen($rawMarkdown),
  63. 'raw_sha1' => LogContext::sha1($rawMarkdown),
  64. 'raw_excerpt' => LogContext::excerpt($rawMarkdown),
  65. ]);
  66. $candidate = $this->parseBlock($rawMarkdown, $index);
  67. $mode = (string) config('ai.parse_mode', 'structured');
  68. if ($mode === 'detect') {
  69. $this->enhanceWithAi($candidate);
  70. Log::debug('Parse raw_markdown done (detect-only)', [
  71. 'index' => $index,
  72. 'is_question_candidate' => $candidate['is_question_candidate'] ?? null,
  73. 'ai_confidence' => $candidate['ai_confidence'] ?? null,
  74. ]);
  75. return $candidate;
  76. }
  77. if ($mode === 'heuristic') {
  78. Log::debug('Parse raw_markdown done (heuristic-only)', [
  79. 'index' => $index,
  80. ]);
  81. return $candidate;
  82. }
  83. // AI 结构化解析(失败则回退为启发式提取 + AI 判题)
  84. $aiStructured = $this->parseWithAi($candidate['raw_markdown'], $candidate['index']);
  85. if ($aiStructured !== null) {
  86. Log::debug('Parse raw_markdown done (ai_structured)', [
  87. 'index' => $index,
  88. 'keys' => array_keys($aiStructured),
  89. 'is_question_candidate' => $aiStructured['is_question_candidate'] ?? null,
  90. 'ai_confidence' => $aiStructured['ai_confidence'] ?? null,
  91. 'options_count' => is_array($aiStructured['options'] ?? null) ? count($aiStructured['options']) : 0,
  92. 'images_count' => is_array($aiStructured['images'] ?? null) ? count($aiStructured['images']) : 0,
  93. 'tables_count' => is_array($aiStructured['tables'] ?? null) ? count($aiStructured['tables']) : 0,
  94. ]);
  95. return array_merge($candidate, $aiStructured);
  96. }
  97. if (!$this->rateLimited) {
  98. $this->enhanceWithAi($candidate);
  99. }
  100. Log::debug('Parse raw_markdown done (heuristic+detect)', [
  101. 'index' => $index,
  102. 'is_question_candidate' => $candidate['is_question_candidate'] ?? null,
  103. 'ai_confidence' => $candidate['ai_confidence'] ?? null,
  104. ]);
  105. return $candidate;
  106. }
  107. /**
  108. * 解析单个题目块
  109. */
  110. private function parseBlock(string $block, int $index): array
  111. {
  112. $candidate = [
  113. 'index' => $index,
  114. 'raw_markdown' => $block,
  115. 'stem' => null,
  116. 'options' => null,
  117. 'images' => [],
  118. 'tables' => [],
  119. 'is_question_candidate' => false,
  120. 'ai_confidence' => null,
  121. ];
  122. // ② Stem 提取
  123. $candidate['stem'] = $this->extractStem($block);
  124. // ③ 选项识别
  125. $candidate['options'] = $this->extractOptions($block);
  126. // ④ 图片识别
  127. $candidate['images'] = $this->extractImages($block);
  128. // ⑤ 表格识别
  129. $candidate['tables'] = $this->extractTables($block);
  130. return $candidate;
  131. }
  132. /**
  133. * AI 结构化解析:返回符合候选库字段的结构化数组,失败返回 null
  134. *
  135. * @return array{
  136. * index:int,
  137. * stem:?string,
  138. * options:?array,
  139. * images:array,
  140. * tables:array,
  141. * is_question_candidate:bool,
  142. * ai_confidence:?float
  143. * }|null
  144. */
  145. private function parseWithAi(string $rawMarkdown, int $index): ?array
  146. {
  147. $template = (string) config('ai.question_parse_prompt');
  148. if (trim($template) === '') {
  149. return null;
  150. }
  151. $prompt = str_replace(['{index}', '{content}'], [(string) $index, $rawMarkdown], $template);
  152. try {
  153. Log::debug('AI structured parse request', [
  154. 'driver' => $this->aiDriver,
  155. 'index' => $index,
  156. 'prompt_len' => strlen($prompt),
  157. 'raw_sha1' => LogContext::sha1($rawMarkdown),
  158. ]);
  159. $result = $this->callAiApi($prompt);
  160. $normalized = [
  161. 'index' => (int) ($result['index'] ?? $index),
  162. 'stem' => isset($result['stem']) ? (string) $result['stem'] : null,
  163. 'options' => isset($result['options']) && is_array($result['options']) ? $result['options'] : null,
  164. 'images' => isset($result['images']) && is_array($result['images']) ? $result['images'] : [],
  165. 'tables' => isset($result['tables']) && is_array($result['tables']) ? $result['tables'] : [],
  166. 'is_question_candidate' => (bool) ($result['is_question_candidate'] ?? $result['is_question'] ?? false),
  167. 'ai_confidence' => isset($result['ai_confidence']) ? (float) $result['ai_confidence'] : (isset($result['confidence']) ? (float) $result['confidence'] : null),
  168. 'answer' => $result['answer'] ?? null,
  169. 'solution' => $result['solution'] ?? null,
  170. 'solution_steps' => $result['solution_steps'] ?? $result['steps'] ?? [],
  171. ];
  172. // 如果是简答题(没有选项)且没有分步解析,尝试使用专门的 Prompt 补全
  173. if (empty($normalized['options']) && empty($normalized['solution_steps']) && $normalized['is_question_candidate']) {
  174. if (config('ai.enable_solution_steps', true)) {
  175. $stepResult = $this->refineSolutionSteps($rawMarkdown);
  176. if ($stepResult) {
  177. $normalized['solution'] = $stepResult['solution'] ?? $normalized['solution'];
  178. $normalized['solution_steps'] = $stepResult['steps'] ?? [];
  179. }
  180. }
  181. }
  182. Log::debug('AI structured parse response', [
  183. 'driver' => $this->aiDriver,
  184. 'index' => $index,
  185. 'response_keys' => array_keys($result),
  186. 'normalized' => [
  187. 'index' => $normalized['index'],
  188. 'is_question_candidate' => $normalized['is_question_candidate'],
  189. 'ai_confidence' => $normalized['ai_confidence'],
  190. 'options_count' => is_array($normalized['options']) ? count($normalized['options']) : 0,
  191. 'images_count' => is_array($normalized['images']) ? count($normalized['images']) : 0,
  192. 'tables_count' => is_array($normalized['tables']) ? count($normalized['tables']) : 0,
  193. ],
  194. ]);
  195. return $normalized;
  196. } catch (\Throwable $e) {
  197. if ($this->isRateLimited($e)) {
  198. $this->rateLimited = true;
  199. }
  200. Log::warning('AI structured parse failed, fallback to heuristic', [
  201. 'index' => $index,
  202. 'error' => $e->getMessage(),
  203. 'raw_sha1' => LogContext::sha1($rawMarkdown),
  204. ]);
  205. return null;
  206. }
  207. }
  208. /**
  209. * 提取题目主干
  210. */
  211. private function extractStem(string $block): ?string
  212. {
  213. $lines = explode("\n", $block);
  214. $stemLines = [];
  215. foreach ($lines as $line) {
  216. $line = trim($line);
  217. // 跳过选项行
  218. if (preg_match('/^[A-D]\.\s+/', $line)) {
  219. break;
  220. }
  221. // 跳过空行和图片行
  222. if (empty($line) || preg_match('/^<img/', $line)) {
  223. continue;
  224. }
  225. $stemLines[] = $line;
  226. }
  227. return empty($stemLines) ? null : implode("\n", $stemLines);
  228. }
  229. /**
  230. * 提取选项
  231. */
  232. private function extractOptions(string $block): ?array
  233. {
  234. $options = [];
  235. preg_match_all('/^([A-D])\.\s+(.+)$/m', $block, $matches, PREG_SET_ORDER);
  236. foreach ($matches as $match) {
  237. $label = $match[1];
  238. $content = trim($match[2]);
  239. $options[$label] = $content;
  240. }
  241. return empty($options) ? null : $options;
  242. }
  243. /**
  244. * 提取图片
  245. */
  246. private function extractImages(string $block): array
  247. {
  248. $images = [];
  249. preg_match_all('/<img[^>]+src=["\']([^"\']+)["\'][^>]*>/i', $block, $matches);
  250. foreach ($matches[1] as $src) {
  251. $images[] = $src;
  252. }
  253. return $images;
  254. }
  255. /**
  256. * 提取表格
  257. */
  258. private function extractTables(string $block): array
  259. {
  260. $tables = [];
  261. // 简单匹配 HTML 表格标签
  262. preg_match_all('/<table[^>]*>.*?<\/table>/s', $block, $matches);
  263. foreach ($matches[0] as $table) {
  264. $tables[] = $table;
  265. }
  266. return $tables;
  267. }
  268. /**
  269. * AI 增强:判断是否为题目
  270. */
  271. private function enhanceWithAi(array &$candidate): void
  272. {
  273. $prompt = $this->buildQuestionDetectionPrompt($candidate['raw_markdown']);
  274. try {
  275. $result = $this->callAiApi($prompt);
  276. if (isset($result['is_question'])) {
  277. $candidate['is_question_candidate'] = $result['is_question'];
  278. $candidate['ai_confidence'] = $result['confidence'] ?? null;
  279. }
  280. } catch (\Exception $e) {
  281. if ($this->isRateLimited($e)) {
  282. $this->rateLimited = true;
  283. }
  284. Log::error('AI question detection failed', [
  285. 'error' => $e->getMessage(),
  286. 'block' => substr($candidate['raw_markdown'], 0, 200),
  287. ]);
  288. // 默认值:不是题目
  289. $candidate['is_question_candidate'] = false;
  290. $candidate['ai_confidence'] = 0.0;
  291. }
  292. }
  293. /**
  294. * 构建题目检测 Prompt
  295. */
  296. private function buildQuestionDetectionPrompt(string $rawMarkdown): string
  297. {
  298. $template = (string) config('ai.question_detection_prompt');
  299. if (trim($template) === '') {
  300. $template = "请判断下面这段 Markdown 是否是一道数学题目。\n\n题目内容:\n{content}\n\n请输出 JSON:{\"is_question\":true|false,\"confidence\":0~1}";
  301. }
  302. return str_replace('{content}', $rawMarkdown, $template);
  303. }
  304. /**
  305. * 调用 AI API
  306. */
  307. private function callAiApi(string $prompt): array
  308. {
  309. if ($this->aiDriver === 'deepseek') {
  310. return $this->callDeepSeek($prompt);
  311. } elseif ($this->aiDriver === 'openai') {
  312. return $this->callOpenAI($prompt);
  313. }
  314. throw new \Exception("Unsupported AI driver: {$this->aiDriver}");
  315. }
  316. /**
  317. * DeepSeek API 调用
  318. */
  319. private function callDeepSeek(string $prompt): array
  320. {
  321. $apiKey = $this->resolveApiKey(
  322. $this->deepseekApiKeys,
  323. (string) config('ai.deepseek.api_key', env('DEEPSEEK_API_KEY'))
  324. );
  325. $response = Http::withHeaders([
  326. 'Authorization' => "Bearer {$apiKey}",
  327. 'Content-Type' => 'application/json',
  328. ])->timeout($this->deepseekTimeout)->post($this->deepseekBaseUrl . '/chat/completions', [
  329. 'model' => $this->deepseekModel,
  330. 'messages' => [
  331. ['role' => 'user', 'content' => $prompt]
  332. ],
  333. 'temperature' => 0.1,
  334. ]);
  335. if (!$response->successful()) {
  336. throw new \Exception('DeepSeek API error: HTTP ' . $response->status() . ' ' . $response->body());
  337. }
  338. $content = $response->json('choices.0.message.content');
  339. return $this->parseJsonResponse($content);
  340. }
  341. /**
  342. * OpenAI API 调用
  343. */
  344. private function callOpenAI(string $prompt): array
  345. {
  346. $apiKey = $this->resolveApiKey(
  347. $this->openAiApiKeys,
  348. (string) config('ai.openai.api_key', env('OPENAI_API_KEY'))
  349. );
  350. $response = Http::withHeaders([
  351. 'Authorization' => "Bearer {$apiKey}",
  352. 'Content-Type' => 'application/json',
  353. ])->timeout($this->openAiTimeout)->post($this->openAiBaseUrl . '/chat/completions', [
  354. 'model' => $this->openAiModel,
  355. 'messages' => [
  356. ['role' => 'user', 'content' => $prompt]
  357. ],
  358. 'temperature' => 0.1,
  359. ]);
  360. if (!$response->successful()) {
  361. throw new \Exception('OpenAI API error: HTTP ' . $response->status() . ' ' . $response->body());
  362. }
  363. $content = $response->json('choices.0.message.content');
  364. return $this->parseJsonResponse($content);
  365. }
  366. private function resolveApiKey(array $keys, string $fallback): string
  367. {
  368. if (empty($keys)) {
  369. return $fallback;
  370. }
  371. $index = (int) (crc32((string) getmypid()) % count($keys));
  372. return $keys[$index];
  373. }
  374. private function splitApiKeys(string $raw): array
  375. {
  376. $items = array_map('trim', explode(',', $raw));
  377. $items = array_filter($items, static fn (string $key) => $key !== '');
  378. return array_values(array_unique($items));
  379. }
  380. /**
  381. * 精修简答题的解题步骤
  382. */
  383. private function refineSolutionSteps(string $rawMarkdown): ?array
  384. {
  385. $prompt = app(QuestionPromptService::class)->buildSolutionStepsPrompt($rawMarkdown);
  386. try {
  387. return $this->callAiApi($prompt);
  388. } catch (\Throwable $e) {
  389. Log::warning('Refine solution steps failed', [
  390. 'error' => $e->getMessage(),
  391. ]);
  392. return null;
  393. }
  394. }
  395. /**
  396. * 解析 AI 返回的 JSON
  397. */
  398. private function parseJsonResponse(string $content): array
  399. {
  400. // 提取 JSON 部分
  401. preg_match('/\{.*\}/s', $content, $matches);
  402. if (empty($matches[0])) {
  403. throw new \Exception('No JSON found in response');
  404. }
  405. $json = json_decode($matches[0], true);
  406. if (json_last_error() !== JSON_ERROR_NONE) {
  407. throw new \Exception('Invalid JSON: ' . json_last_error_msg());
  408. }
  409. return $json;
  410. }
  411. private function isRateLimited(\Throwable $e): bool
  412. {
  413. $message = $e->getMessage();
  414. if (stripos($message, 'HTTP 429') !== false) {
  415. return true;
  416. }
  417. if (stripos($message, 'rate limit') !== false) {
  418. return true;
  419. }
  420. if (stripos($message, 'too many requests') !== false) {
  421. return true;
  422. }
  423. return false;
  424. }
  425. }