aiDriver = config('ai.driver', env('AI_DRIVER', 'deepseek'));
$this->deepseekBaseUrl = rtrim((string) config('ai.deepseek.base_url', 'https://api.deepseek.com/v1'), '/');
$this->deepseekModel = (string) config('ai.deepseek.model', 'deepseek-chat');
$this->deepseekTimeout = (int) config('ai.deepseek.timeout', 30);
$this->openAiBaseUrl = rtrim((string) config('ai.openai.base_url', 'https://api.openai.com/v1'), '/');
$this->openAiModel = (string) config('ai.openai.model', 'gpt-3.5-turbo');
$this->openAiTimeout = (int) config('ai.openai.timeout', 30);
}
/**
 * Parse a Markdown document into an array of candidate questions.
 *
 * The text is split into blocks by AsyncMarkdownSplitter and every block is
 * handed to parseRawMarkdown(). A failed split validation is only logged as a
 * warning; parsing then continues on a best-effort basis.
 *
 * @param string $markdown Markdown text to parse.
 * @return array One candidate structure per block, in iteration order of the split result.
 */
public function parse(string $markdown): array
{
    $markdownSplitter = app(AsyncMarkdownSplitter::class);
    $questionBlocks = $markdownSplitter->split($markdown);

    $splitIsValid = $markdownSplitter->validate($questionBlocks);
    if (!$splitIsValid) {
        // Deliberately non-fatal: the blocks are still parsed below.
        Log::warning('Markdown split validation failed; continue with best-effort parsing', [
            'blocks_count' => count($questionBlocks),
        ]);
    }

    $parsed = [];
    foreach ($questionBlocks as $questionBlock) {
        // Missing keys fall back to an empty body / position 0.
        $body = (string) ($questionBlock['raw_markdown'] ?? '');
        $position = (int) ($questionBlock['index'] ?? 0);

        $parsed[] = $this->parseRawMarkdown($body, $position);
    }

    return $parsed;
}
/**
 * Parse the raw_markdown of a single question into a candidate structure.
 *
 * A heuristic candidate is always built first via parseBlock(). If the AI
 * structured parse succeeds, its fields are merged over the heuristic
 * candidate (AI values win on key collisions). Otherwise the heuristic
 * candidate is passed to enhanceWithAi() and returned.
 *
 * @param string $rawMarkdown Raw Markdown of one question block.
 * @param int    $index       Position of the block within the source document.
 * @return array Candidate question structure.
 */
public function parseRawMarkdown(string $rawMarkdown, int $index): array
{
    Log::debug('Parse raw_markdown start', [
        'index' => $index,
        'raw_len' => strlen($rawMarkdown),
        'raw_sha1' => LogContext::sha1($rawMarkdown),
        'raw_excerpt' => LogContext::excerpt($rawMarkdown),
    ]);

    $heuristic = $this->parseBlock($rawMarkdown, $index);
    $structured = $this->parseWithAi($heuristic['raw_markdown'], $heuristic['index']);

    if ($structured === null) {
        // Structured parse unavailable: fall back to heuristic extraction plus AI detection.
        // NOTE(review): enhanceWithAi() is defined elsewhere; presumably it updates the
        // candidate in place (by reference) — confirm against its signature.
        $this->enhanceWithAi($heuristic);

        Log::debug('Parse raw_markdown done (heuristic+detect)', [
            'index' => $index,
            'is_question_candidate' => $heuristic['is_question_candidate'] ?? null,
            'ai_confidence' => $heuristic['ai_confidence'] ?? null,
        ]);

        return $heuristic;
    }

    // Element count for the log context; anything that is not an array counts as 0.
    $sizeOf = static function ($value): int {
        return is_array($value) ? count($value) : 0;
    };

    Log::debug('Parse raw_markdown done (ai_structured)', [
        'index' => $index,
        'keys' => array_keys($structured),
        'is_question_candidate' => $structured['is_question_candidate'] ?? null,
        'ai_confidence' => $structured['ai_confidence'] ?? null,
        'options_count' => $sizeOf($structured['options'] ?? null),
        'images_count' => $sizeOf($structured['images'] ?? null),
        'tables_count' => $sizeOf($structured['tables'] ?? null),
    ]);

    return array_merge($heuristic, $structured);
}
/**
 * Build the heuristic candidate structure for a single question block.
 *
 * Stem, options, images and tables come from the corresponding extract*()
 * helpers, called in that order. The block is never marked as a question
 * candidate here and no AI confidence is set.
 *
 * @param string $block Raw Markdown of the block.
 * @param int    $index Position of the block.
 * @return array Candidate with keys index, raw_markdown, stem, options, images,
 *               tables, is_question_candidate, ai_confidence.
 */
private function parseBlock(string $block, int $index): array
{
    return [
        'index' => $index,
        'raw_markdown' => $block,
        // Stem extraction
        'stem' => $this->extractStem($block),
        // Option detection
        'options' => $this->extractOptions($block),
        // Image detection
        'images' => $this->extractImages($block),
        // Table detection
        'tables' => $this->extractTables($block),
        // Left at their defaults in this method.
        'is_question_candidate' => false,
        'ai_confidence' => null,
    ];
}
/**
 * AI structured parse: returns an array shaped like a candidate record, or null on failure.
 *
 * The prompt is the template from config('ai.question_parse_prompt') with the
 * {index} and {content} placeholders substituted. The response of callAiApi()
 * is normalized into fixed keys, accepting the alternative response keys
 * is_question, confidence and steps. Returns null when the template is blank
 * or when anything inside the try block throws (the error is logged as a warning).
 *
 * @param string $rawMarkdown Raw Markdown of one question block.
 * @param int    $index       Position of the block; used when the response carries no index.
 * @return array{
 *     index:int,
 *     stem:?string,
 *     options:?array,
 *     images:array,
 *     tables:array,
 *     is_question_candidate:bool,
 *     ai_confidence:?float,
 *     answer:mixed,
 *     solution:mixed,
 *     solution_steps:mixed
 * }|null
 */
private function parseWithAi(string $rawMarkdown, int $index): ?array
{
    $template = (string) config('ai.question_parse_prompt');
    if (trim($template) === '') {
        return null;
    }

    $prompt = str_replace(['{index}', '{content}'], [(string) $index, $rawMarkdown], $template);

    try {
        Log::debug('AI structured parse request', [
            'driver' => $this->aiDriver,
            'index' => $index,
            'prompt_len' => strlen($prompt),
            'raw_sha1' => LogContext::sha1($rawMarkdown),
        ]);

        $result = $this->callAiApi($prompt);

        // A plain (bool) cast turns the strings "false", "no" and "off" into true.
        // Parse string flags explicitly; strings that are not a recognised boolean
        // keep the previous cast behaviour, and non-string values are cast as before.
        $candidateFlag = $result['is_question_candidate'] ?? $result['is_question'] ?? false;
        if (is_string($candidateFlag)) {
            $candidateFlag = filter_var($candidateFlag, FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE)
                ?? (bool) $candidateFlag;
        }

        $normalized = [
            'index' => (int) ($result['index'] ?? $index),
            'stem' => isset($result['stem']) ? (string) $result['stem'] : null,
            'options' => isset($result['options']) && is_array($result['options']) ? $result['options'] : null,
            'images' => isset($result['images']) && is_array($result['images']) ? $result['images'] : [],
            'tables' => isset($result['tables']) && is_array($result['tables']) ? $result['tables'] : [],
            'is_question_candidate' => (bool) $candidateFlag,
            'ai_confidence' => isset($result['ai_confidence']) ? (float) $result['ai_confidence'] : (isset($result['confidence']) ? (float) $result['confidence'] : null),
            'answer' => $result['answer'] ?? null,
            'solution' => $result['solution'] ?? null,
            'solution_steps' => $result['solution_steps'] ?? $result['steps'] ?? [],
        ];

        // Short-answer question (no options) without step-by-step solution:
        // try to fill the steps in through refineSolutionSteps().
        if (empty($normalized['options']) && empty($normalized['solution_steps']) && $normalized['is_question_candidate']) {
            $stepResult = $this->refineSolutionSteps($rawMarkdown);
            if ($stepResult) {
                $normalized['solution'] = $stepResult['solution'] ?? $normalized['solution'];
                $normalized['solution_steps'] = $stepResult['steps'] ?? [];
            }
        }

        Log::debug('AI structured parse response', [
            'driver' => $this->aiDriver,
            'index' => $index,
            'response_keys' => array_keys($result),
            'normalized' => [
                'index' => $normalized['index'],
                'is_question_candidate' => $normalized['is_question_candidate'],
                'ai_confidence' => $normalized['ai_confidence'],
                'options_count' => is_array($normalized['options']) ? count($normalized['options']) : 0,
                'images_count' => is_array($normalized['images']) ? count($normalized['images']) : 0,
                'tables_count' => is_array($normalized['tables']) ? count($normalized['tables']) : 0,
            ],
        ]);

        return $normalized;
    } catch (\Throwable $e) {
        // Deliberate best-effort: the caller treats null as "use the heuristic path".
        Log::warning('AI structured parse failed, fallback to heuristic', [
            'index' => $index,
            'error' => $e->getMessage(),
            'raw_sha1' => LogContext::sha1($rawMarkdown),
        ]);

        return null;
    }
}
/**
* 提取题目主干
*/
private function extractStem(string $block): ?string
{
$lines = explode("\n", $block);
$stemLines = [];
foreach ($lines as $line) {
$line = trim($line);
// Stop scanning at the first option line (A.–D.)
if (preg_match('/^[A-D]\.\s+/', $line)) {
break;
}
// 跳过空行和图片行
if (empty($line) || preg_match('/^]+src=["\']([^"\']+)["\'][^>]*>/i', $block, $matches);
foreach ($matches[1] as $src) {
$images[] = $src;
}
return $images;
}
/**
* 提取表格
*/
private function extractTables(string $block): array
{
$tables = [];
// 简单匹配 HTML 表格标签
preg_match_all('/