MarkdownQuestionParser.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. <?php
  2. namespace App\Services;
  3. use App\Support\LogContext;
  4. use Illuminate\Support\Facades\Http;
  5. use Illuminate\Support\Facades\Log;
  6. class MarkdownQuestionParser
  7. {
  8. private string $aiDriver;
  9. private string $deepseekBaseUrl;
  10. private string $deepseekModel;
  11. private int $deepseekTimeout;
  12. private string $openAiBaseUrl;
  13. private string $openAiModel;
  14. private int $openAiTimeout;
  /**
   * Load the AI driver name, base URLs, model names and timeouts from the `ai` config.
   *
   * Every value is cast to the declared property type so that a null or
   * mistyped config entry cannot raise a TypeError on assignment.
   */
  public function __construct()
  {
      // A config key that exists but holds null bypasses config()'s default
      // argument, so the fallback is applied explicitly before casting.
      $this->aiDriver = (string) (config('ai.driver') ?? env('AI_DRIVER', 'deepseek'));

      // Base URLs are stored without a trailing slash; request paths are appended with a leading one.
      $this->deepseekBaseUrl = rtrim((string) config('ai.deepseek.base_url', 'https://api.deepseek.com/v1'), '/');
      $this->deepseekModel = (string) config('ai.deepseek.model', 'deepseek-chat');
      $this->deepseekTimeout = (int) config('ai.deepseek.timeout', 30);

      $this->openAiBaseUrl = rtrim((string) config('ai.openai.base_url', 'https://api.openai.com/v1'), '/');
      $this->openAiModel = (string) config('ai.openai.model', 'gpt-3.5-turbo');
      $this->openAiTimeout = (int) config('ai.openai.timeout', 30);
  }
  /**
   * Parse a Markdown document into an array of candidate questions.
   *
   * The text is split into blocks by AsyncMarkdownSplitter. A failed split
   * validation is only logged as a warning; every block is still parsed.
   *
   * @param string $markdown Markdown text to parse.
   * @return array One candidate array per block, in iteration order.
   */
  public function parse(string $markdown): array
  {
      $splitter = app(AsyncMarkdownSplitter::class);
      $blocks = $splitter->split($markdown);

      $splitLooksValid = $splitter->validate($blocks);
      if (!$splitLooksValid) {
          Log::warning('Markdown split validation failed; continue with best-effort parsing', [
              'blocks_count' => count($blocks),
          ]);
      }

      $results = [];
      foreach ($blocks as $entry) {
          // Missing keys degrade to an empty body / index 0 rather than failing.
          $raw = (string) ($entry['raw_markdown'] ?? '');
          $position = (int) ($entry['index'] ?? 0);
          $results[] = $this->parseRawMarkdown($raw, $position);
      }

      return $results;
  }
  /**
   * Parse the raw Markdown of a single question into a candidate structure.
   *
   * Heuristic extraction always runs first. If the AI structured parse
   * succeeds, its fields are merged over the heuristic candidate; otherwise
   * the heuristic candidate is kept and only the AI question detection is applied.
   *
   * @param string $rawMarkdown Markdown of one block.
   * @param int    $index       Position of the block.
   * @return array Candidate question array.
   */
  public function parseRawMarkdown(string $rawMarkdown, int $index): array
  {
      Log::debug('Parse raw_markdown start', [
          'index' => $index,
          'raw_len' => strlen($rawMarkdown),
          'raw_sha1' => LogContext::sha1($rawMarkdown),
          'raw_excerpt' => LogContext::excerpt($rawMarkdown),
      ]);

      $heuristic = $this->parseBlock($rawMarkdown, $index);
      $structured = $this->parseWithAi($heuristic['raw_markdown'], $heuristic['index']);

      // Fallback path: no structured result, so keep the heuristic fields and ask the AI only for detection.
      if ($structured === null) {
          $this->enhanceWithAi($heuristic);

          Log::debug('Parse raw_markdown done (heuristic+detect)', [
              'index' => $index,
              'is_question_candidate' => $heuristic['is_question_candidate'] ?? null,
              'ai_confidence' => $heuristic['ai_confidence'] ?? null,
          ]);

          return $heuristic;
      }

      $sizeOf = static function ($value): int {
          return is_array($value) ? count($value) : 0;
      };

      Log::debug('Parse raw_markdown done (ai_structured)', [
          'index' => $index,
          'keys' => array_keys($structured),
          'is_question_candidate' => $structured['is_question_candidate'] ?? null,
          'ai_confidence' => $structured['ai_confidence'] ?? null,
          'options_count' => $sizeOf($structured['options'] ?? null),
          'images_count' => $sizeOf($structured['images'] ?? null),
          'tables_count' => $sizeOf($structured['tables'] ?? null),
      ]);

      // Keys present in the AI result overwrite the heuristic values.
      return array_merge($heuristic, $structured);
  }
  /**
   * Build the heuristic candidate for a single question block.
   *
   * Stem, options, images and tables come from the local extractors; the
   * question flag and confidence start at false / null.
   *
   * @param string $block Raw Markdown of the block.
   * @param int    $index Position of the block.
   * @return array Candidate array with the keys index, raw_markdown, stem,
   *               options, images, tables, is_question_candidate, ai_confidence.
   */
  private function parseBlock(string $block, int $index): array
  {
      return [
          'index' => $index,
          'raw_markdown' => $block,
          'stem' => $this->extractStem($block),
          'options' => $this->extractOptions($block),
          'images' => $this->extractImages($block),
          'tables' => $this->extractTables($block),
          'is_question_candidate' => false,
          'ai_confidence' => null,
      ];
  }
  /**
   * AI structured parse: ask the configured model for a structured candidate.
   *
   * Returns null when no `ai.question_parse_prompt` template is configured or
   * when the request / response handling throws, so the caller can fall back
   * to heuristic extraction.
   *
   * The model output is untrusted, so each field is normalised:
   * - a non-scalar stem becomes null instead of being cast to the string "Array";
   * - string flags such as "false" are interpreted as booleans, not by truthiness;
   * - a non-numeric confidence becomes null;
   * - a non-numeric index falls back to the caller's index.
   *
   * @return array{
   *   index:int,
   *   stem:?string,
   *   options:?array,
   *   images:array,
   *   tables:array,
   *   is_question_candidate:bool,
   *   ai_confidence:?float,
   *   answer:mixed,
   *   solution:mixed,
   *   solution_steps:mixed
   * }|null
   */
  private function parseWithAi(string $rawMarkdown, int $index): ?array
  {
      $template = (string) config('ai.question_parse_prompt');
      if (trim($template) === '') {
          return null;
      }

      $prompt = str_replace(['{index}', '{content}'], [(string) $index, $rawMarkdown], $template);

      try {
          Log::debug('AI structured parse request', [
              'driver' => $this->aiDriver,
              'index' => $index,
              'prompt_len' => strlen($prompt),
              'raw_sha1' => LogContext::sha1($rawMarkdown),
          ]);

          $result = $this->callAiApi($prompt);

          // Both the canonical key and the shorter alias are accepted for these fields.
          $rawIndex = $result['index'] ?? null;
          $rawStem = $result['stem'] ?? null;
          $rawFlag = $result['is_question_candidate'] ?? $result['is_question'] ?? false;
          $rawConfidence = $result['ai_confidence'] ?? $result['confidence'] ?? null;

          $normalized = [
              'index' => is_numeric($rawIndex) ? (int) $rawIndex : $index,
              'stem' => is_scalar($rawStem) ? (string) $rawStem : null,
              'options' => isset($result['options']) && is_array($result['options']) ? $result['options'] : null,
              'images' => isset($result['images']) && is_array($result['images']) ? $result['images'] : [],
              'tables' => isset($result['tables']) && is_array($result['tables']) ? $result['tables'] : [],
              // (bool) "false" is true, so string flags go through the boolean filter instead.
              'is_question_candidate' => is_string($rawFlag)
                  ? filter_var($rawFlag, FILTER_VALIDATE_BOOLEAN)
                  : (bool) $rawFlag,
              'ai_confidence' => is_numeric($rawConfidence) ? (float) $rawConfidence : null,
              'answer' => $result['answer'] ?? null,
              'solution' => $result['solution'] ?? null,
              'solution_steps' => $result['solution_steps'] ?? $result['steps'] ?? [],
          ];

          // A question without options and without step-by-step solution gets a
          // second, dedicated prompt to fill in the solution steps.
          if (empty($normalized['options']) && empty($normalized['solution_steps']) && $normalized['is_question_candidate']) {
              $stepResult = $this->refineSolutionSteps($rawMarkdown);
              if ($stepResult) {
                  $normalized['solution'] = $stepResult['solution'] ?? $normalized['solution'];
                  $normalized['solution_steps'] = $stepResult['steps'] ?? $stepResult['solution_steps'] ?? [];
              }
          }

          Log::debug('AI structured parse response', [
              'driver' => $this->aiDriver,
              'index' => $index,
              'response_keys' => array_keys($result),
              'normalized' => [
                  'index' => $normalized['index'],
                  'is_question_candidate' => $normalized['is_question_candidate'],
                  'ai_confidence' => $normalized['ai_confidence'],
                  'options_count' => is_array($normalized['options']) ? count($normalized['options']) : 0,
                  'images_count' => is_array($normalized['images']) ? count($normalized['images']) : 0,
                  'tables_count' => is_array($normalized['tables']) ? count($normalized['tables']) : 0,
              ],
          ]);

          return $normalized;
      } catch (\Throwable $e) {
          // Deliberate best-effort: any failure degrades to the heuristic path.
          Log::warning('AI structured parse failed, fallback to heuristic', [
              'index' => $index,
              'error' => $e->getMessage(),
              'raw_sha1' => LogContext::sha1($rawMarkdown),
          ]);

          return null;
      }
  }
  /**
   * Extract the question stem: the text that precedes the first option line.
   *
   * Lines are trimmed; blank lines and lines starting with an `<img` tag
   * (any letter case) are skipped. Collection stops at the first line that
   * looks like an option (`A.` to `D.` followed by whitespace).
   *
   * @param string $block Raw Markdown of the block.
   * @return string|null Stem lines joined with "\n", or null when none were found.
   */
  private function extractStem(string $block): ?string
  {
      $stemLines = [];

      foreach (explode("\n", $block) as $line) {
          $line = trim($line);

          // The first option line ends the stem.
          if (preg_match('/^[A-D]\.\s+/', $line)) {
              break;
          }

          // Compare against '' explicitly: empty() would also drop a line that is exactly "0".
          if ($line === '' || strncasecmp($line, '<img', 4) === 0) {
              continue;
          }

          $stemLines[] = $line;
      }

      return $stemLines === [] ? null : implode("\n", $stemLines);
  }
  /**
   * Extract the answer options from a block.
   *
   * An option is a line that starts with a label `A` to `D`, a dot and
   * whitespace. When a label occurs more than once, the last occurrence wins.
   *
   * @param string $block Raw Markdown of the block.
   * @return array|null Map of label => trimmed option text, or null when no option was found.
   */
  private function extractOptions(string $block): ?array
  {
      preg_match_all('/^([A-D])\.\s+(.+)$/m', $block, $found);

      $labels = $found[1] ?? [];
      $texts = array_map('trim', $found[2] ?? []);

      // array_combine keeps the last value for a repeated label, like sequential assignment does.
      $byLabel = array_combine($labels, $texts);

      if (empty($byLabel)) {
          return null;
      }

      return $byLabel;
  }
  /**
   * Collect the `src` attribute of every HTML `<img>` tag in the block.
   *
   * Tag matching is case-insensitive.
   *
   * @param string $block Raw Markdown of the block.
   * @return array List of image sources in document order; empty when there are none.
   */
  private function extractImages(string $block): array
  {
      preg_match_all('/<img[^>]+src=["\']([^"\']+)["\'][^>]*>/i', $block, $found);

      // Capture group 1 already holds the sources as a zero-based list.
      return array_values($found[1] ?? []);
  }
  /**
   * Collect every HTML `<table>...</table>` fragment in the block.
   *
   * This is a simple tag match, not an HTML parser: each match runs from an
   * opening `<table` tag to the nearest closing `</table>`. Matching is
   * case-insensitive, because HTML tag names are, and to stay consistent
   * with the `<img>` extraction.
   *
   * @param string $block Raw Markdown of the block.
   * @return array List of table HTML fragments in document order; empty when there are none.
   */
  private function extractTables(string $block): array
  {
      preg_match_all('/<table[^>]*>.*?<\/table>/si', $block, $found);

      return array_values($found[0] ?? []);
  }
  /**
   * AI enhancement: ask the model whether the block is a question.
   *
   * On success, `is_question_candidate` and `ai_confidence` are written into
   * $candidate when the response carries an `is_question` key. On any
   * failure the candidate is marked as "not a question" with confidence 0.0.
   *
   * @param array $candidate Candidate array, modified in place; must contain `raw_markdown`.
   */
  private function enhanceWithAi(array &$candidate): void
  {
      $prompt = $this->buildQuestionDetectionPrompt($candidate['raw_markdown']);

      try {
          $result = $this->callAiApi($prompt);

          if (isset($result['is_question'])) {
              $flag = $result['is_question'];
              $confidence = $result['confidence'] ?? null;

              // (bool) "false" is true, so string flags go through the boolean filter instead.
              $candidate['is_question_candidate'] = is_string($flag)
                  ? filter_var($flag, FILTER_VALIDATE_BOOLEAN)
                  : (bool) $flag;
              $candidate['ai_confidence'] = is_numeric($confidence) ? (float) $confidence : null;
          }
      } catch (\Throwable $e) {
          // \Throwable rather than \Exception: a malformed API payload can surface
          // as a TypeError, which must not abort parsing of the whole document.
          Log::error('AI question detection failed', [
              'error' => $e->getMessage(),
              // mb_substr keeps multibyte characters intact in the log excerpt.
              'block' => mb_substr($candidate['raw_markdown'], 0, 200),
          ]);

          // Default: not a question.
          $candidate['is_question_candidate'] = false;
          $candidate['ai_confidence'] = 0.0;
      }
  }
  /**
   * Build the prompt used for question detection.
   *
   * Uses the `ai.question_detection_prompt` config template, or a built-in
   * default when that template is blank, and substitutes the `{content}`
   * placeholder with the block's Markdown.
   *
   * @param string $rawMarkdown Raw Markdown of the block.
   * @return string Prompt text ready to send to the model.
   */
  private function buildQuestionDetectionPrompt(string $rawMarkdown): string
  {
      $configured = (string) config('ai.question_detection_prompt');

      $template = trim($configured) !== ''
          ? $configured
          : "请判断下面这段 Markdown 是否是一道数学题目。\n\n题目内容:\n{content}\n\n请输出 JSON:{\"is_question\":true|false,\"confidence\":0~1}";

      return strtr($template, ['{content}' => $rawMarkdown]);
  }
  /**
   * Send a prompt to the configured AI driver and return the decoded JSON reply.
   *
   * @param string $prompt Prompt text.
   * @return array Decoded JSON object from the model's reply.
   * @throws \Exception When the configured driver is neither `deepseek` nor `openai`.
   */
  private function callAiApi(string $prompt): array
  {
      switch ($this->aiDriver) {
          case 'deepseek':
              return $this->callDeepSeek($prompt);

          case 'openai':
              return $this->callOpenAI($prompt);

          default:
              throw new \Exception("Unsupported AI driver: {$this->aiDriver}");
      }
  }
  /**
   * Call the DeepSeek chat-completions endpoint with a single user message.
   *
   * @param string $prompt Prompt text.
   * @return array Decoded JSON object extracted from the first choice's message content.
   * @throws \Exception When the HTTP response is not successful, when the
   *                    response carries no textual message content, or when
   *                    no valid JSON can be extracted from it.
   */
  private function callDeepSeek(string $prompt): array
  {
      $apiKey = config('ai.deepseek.api_key', env('DEEPSEEK_API_KEY'));

      $response = Http::withHeaders([
          'Authorization' => "Bearer {$apiKey}",
          'Content-Type' => 'application/json',
      ])->timeout($this->deepseekTimeout)->post($this->deepseekBaseUrl . '/chat/completions', [
          'model' => $this->deepseekModel,
          'messages' => [
              ['role' => 'user', 'content' => $prompt],
          ],
          'temperature' => 0.1,
      ]);

      if (!$response->successful()) {
          throw new \Exception('DeepSeek API error: ' . $response->body());
      }

      $content = $response->json('choices.0.message.content');

      // A 2xx body without the expected path yields null; fail with a regular
      // exception instead of a TypeError on parseJsonResponse(string).
      if (!is_string($content)) {
          throw new \Exception('DeepSeek API error: response has no message content');
      }

      return $this->parseJsonResponse($content);
  }
  /**
   * Call the OpenAI chat-completions endpoint with a single user message.
   *
   * @param string $prompt Prompt text.
   * @return array Decoded JSON object extracted from the first choice's message content.
   * @throws \Exception When the HTTP response is not successful, when the
   *                    response carries no textual message content, or when
   *                    no valid JSON can be extracted from it.
   */
  private function callOpenAI(string $prompt): array
  {
      $apiKey = config('ai.openai.api_key', env('OPENAI_API_KEY'));

      $response = Http::withHeaders([
          'Authorization' => "Bearer {$apiKey}",
          'Content-Type' => 'application/json',
      ])->timeout($this->openAiTimeout)->post($this->openAiBaseUrl . '/chat/completions', [
          'model' => $this->openAiModel,
          'messages' => [
              ['role' => 'user', 'content' => $prompt],
          ],
          'temperature' => 0.1,
      ]);

      if (!$response->successful()) {
          throw new \Exception('OpenAI API error: ' . $response->body());
      }

      $content = $response->json('choices.0.message.content');

      // A 2xx body without the expected path yields null; fail with a regular
      // exception instead of a TypeError on parseJsonResponse(string).
      if (!is_string($content)) {
          throw new \Exception('OpenAI API error: response has no message content');
      }

      return $this->parseJsonResponse($content);
  }
  /**
   * Refine the solution steps of a question via a dedicated prompt.
   *
   * This is a best-effort step: any failure, including a failure while
   * building the prompt, is logged and reported as null so that the caller
   * keeps the result it already has.
   *
   * @param string $rawMarkdown Raw Markdown of the question.
   * @return array|null Decoded AI reply, or null on failure.
   */
  private function refineSolutionSteps(string $rawMarkdown): ?array
  {
      try {
          // Prompt construction sits inside the try so that an error here does not
          // propagate to the caller and discard its already-parsed result.
          $prompt = app(QuestionPromptService::class)->buildSolutionStepsPrompt($rawMarkdown);

          return $this->callAiApi($prompt);
      } catch (\Throwable $e) {
          Log::warning('Refine solution steps failed', [
              'error' => $e->getMessage(),
          ]);

          return null;
      }
  }
  /**
   * Extract and decode the JSON object contained in an AI reply.
   *
   * The reply may wrap the JSON in prose or a code fence. The span from the
   * first `{` to the last `}` is tried first. If that span does not decode
   * (for example because trailing text contains a `}`), the first
   * brace-balanced object starting at the first `{` is tried instead.
   *
   * @param string $content Raw message content returned by the model.
   * @return array Decoded JSON object as an associative array.
   * @throws \Exception 'No JSON found in response' when there is no `{...}` span,
   *                    'Invalid JSON: ...' when no candidate span decodes.
   */
  private function parseJsonResponse(string $content): array
  {
      $start = strpos($content, '{');
      $end = strrpos($content, '}');
      if ($start === false || $end === false || $end < $start) {
          throw new \Exception('No JSON found in response');
      }

      $span = substr($content, $start, $end - $start + 1);
      $decoded = json_decode($span, true);
      if (is_array($decoded)) {
          return $decoded;
      }

      // Remember the error of the primary attempt for the exception message.
      $error = json_last_error_msg();

      $balanced = $this->extractFirstJsonObject($content, $start);
      if ($balanced !== null && $balanced !== $span) {
          $decoded = json_decode($balanced, true);
          if (is_array($decoded)) {
              return $decoded;
          }
      }

      throw new \Exception('Invalid JSON: ' . $error);
  }

  /**
   * Return the brace-balanced `{...}` substring that starts at $start, or null if it never closes.
   *
   * Braces inside double-quoted JSON strings are ignored. Scanning is
   * byte-wise, which is safe for UTF-8 because the structural characters
   * are ASCII and never occur inside a multibyte sequence.
   */
  private function extractFirstJsonObject(string $content, int $start): ?string
  {
      $depth = 0;
      $inString = false;
      $escaped = false;
      $length = strlen($content);

      for ($i = $start; $i < $length; $i++) {
          $char = $content[$i];

          if ($inString) {
              if ($escaped) {
                  $escaped = false;
              } elseif ($char === '\\') {
                  $escaped = true;
              } elseif ($char === '"') {
                  $inString = false;
              }
              continue;
          }

          if ($char === '"') {
              $inString = true;
          } elseif ($char === '{') {
              $depth++;
          } elseif ($char === '}') {
              $depth--;
              if ($depth === 0) {
                  return substr($content, $start, $i - $start + 1);
              }
          }
      }

      return null;
  }
  356. }