markdownImportId);
        if (!$import) {
            Log::error('MarkdownImport not found for batch', [
                'id' => $this->markdownImportId,
                'sequence_start' => $this->sequenceStart,
                'sequence_end' => $this->sequenceEnd,
            ]);

            return;
        }

        Log::info('Markdown batch started', [
            'import_id' => $this->markdownImportId,
            'sequence_start' => $this->sequenceStart,
            'sequence_end' => $this->sequenceEnd,
        ]);

        // Load this batch's slice of candidates in document order.
        $records = PreQuestionCandidate::query()
            ->where('import_id', $this->markdownImportId)
            ->whereBetween('sequence', [$this->sequenceStart, $this->sequenceEnd])
            ->orderBy('sequence')
            ->get();

        $processed = 0;
        $failed = 0;

        foreach ($records as $record) {
            try {
                $meta = $record->meta ?? [];

                // Already handled by an earlier run of this batch.
                if (!empty($meta['ai_parsed'])) {
                    $processed++;
                    continue;
                }

                $existingConfidence = $record->confidence ?? $record->ai_confidence;

                // High confidence (>= 0.85): skip another AI pass and count it as progress.
                if ($existingConfidence !== null && (float) $existingConfidence >= 0.85) {
                    $this->markParsed($record);
                    $processed++;
                    continue;
                }

                // Cheap pre-filter for paper/section headings so they are not sent to the AI parser.
                if (!$this->isLikelyQuestion((string) $record->raw_markdown)) {
                    // Flag the record as filtered out and rejected. `ai_parsed` is set as well,
                    // so the early-exit at the top of the loop skips it on later runs.
                    $meta['ai_parsed'] = true;
                    $meta['ai_parsed_at'] = now()->toDateTimeString();
                    $meta['filtered_out'] = true;

                    $record->update([
                        'is_question_candidate' => false,
                        'ai_confidence' => 0.0,
                        'confidence' => 0.0,
                        'is_valid_question' => false,
                        'status' => 'rejected',
                        'meta' => $meta,
                    ]);

                    $processed++;
                    continue;
                }

                // Records that already carry a stem and a known status are not parsed again.
                if (in_array($record->status, ['pending', 'reviewed', 'accepted', 'rejected'], true) && $record->stem !== null) {
                    $this->markParsed($record);
                    continue;
                }

                $parsed = $parser->parseRawMarkdown((string) $record->raw_markdown, (int) $record->index);

                $meta = $record->meta ?? [];
                $meta['ai_parsed'] = true;
                $meta['ai_parsed_at'] = now()->toDateTimeString();

                // Re-upload images as soon as the record has been structured.
                // NOTE(review): a failed download is suppressed and an empty string is uploaded;
                // what `put()` returns in that case is not visible here — confirm.
                $uploadedImages = [];
                if (!empty($parsed['images'])) {
                    foreach ($parsed['images'] as $idx => $imgUrl) {
                        $path = "imports/images/{$record->id}_{$idx}.jpg";
                        $uploadedImages[] = $uploader->put($path, (string)@file_get_contents($imgUrl)) ?: $imgUrl;
                    }
                }
                $meta['images_uploaded'] = !empty($uploadedImages);

                $record->update([
                    'stem' => $parsed['stem'] ?? null,
                    'options' => $parsed['options'] ?? null,
                    // Fall back to the parser's own image list. The previous fallback read
                    // `$parsedImages`, which is never assigned in the visible code.
                    'images' => !empty($uploadedImages) ? $uploadedImages : ($parsed['images'] ?? []),
                    'tables' => $parsed['tables'] ?? [],
                    'is_question_candidate' => (bool) ($parsed['is_question_candidate'] ?? false),
                    'ai_confidence' => $parsed['ai_confidence'] ?? null,
                    'status' => 'pending',
                    'meta' => $meta,
                ]);

                $processed++;
            } catch (\Throwable $e) {
                // One bad record must not abort the batch: count it, log it, move on.
                $failed++;
                Log::warning('Markdown batch item failed', [
                    'import_id' => $this->markdownImportId,
                    'candidate_id' => $record->id,
                    'sequence' => $record->sequence,
                    'index' => $record->index,
                    'error' => $e->getMessage(),
                ]);
            }
        }

        $this->refreshProgress();

        Log::info('Markdown batch finished', [
            'import_id' => $this->markdownImportId,
            'sequence_start' => $this->sequenceStart,
            'sequence_end' => $this->sequenceEnd,
            'processed' => $processed,
            'failed' => $failed,
        ]);

        $this->finalizeIfDone();
    }

    /**
     * Move the import to the "parsed" state once every reviewable candidate has a parse result.
     *
     * Totals are computed with the same rules as refreshProgress(): superseded and
     * filtered-out candidates are excluded. Filtered-out rows are written with
     * ai_confidence = 0.0, so they can never satisfy the "parsed" criteria; counting
     * them in the total would keep the import from ever finalizing.
     */
    private function finalizeIfDone(): void
    {
        $import = MarkdownImport::find($this->markdownImportId);
        if (!$import) {
            return;
        }

        $total = $this->reviewableCandidates()->count();
        $parsed = $this->parsedCandidates()->count();

        // Nothing to finalize yet: no candidates, or some are still unparsed.
        if ($total <= 0 || $parsed < $total) {
            return;
        }

        // Only an import that is still "processing" is transitioned; the affected-row
        // count tells us whether this call performed the transition.
        $updated = DB::table('markdown_imports')
            ->where('id', $this->markdownImportId)
            ->where('status', 'processing')
            ->update([
                'status' => 'parsed',
                'progress_stage' => MarkdownImport::STAGE_PARSED,
                'progress_message' => "解析完成,等待人工校对 ({$parsed}/{$total})",
                'progress_total' => $total,
                'progress_current' => $parsed,
                'progress_updated_at' => now(),
                'processing_finished_at' => now(),
            ]);

        if ($updated) {
            Log::info('Markdown import finalized', [
                'import_id' => $this->markdownImportId,
                'total_candidates' => $total,
                'parsed_candidates' => $parsed,
            ]);
        }
    }

    /**
     * Recompute the progress counters of the import and store them on its row.
     *
     * Writes progress_total, progress_current (capped at the total), the AI-parsing
     * stage, and a message that also reports stem-only and filtered-out counts.
     */
    private function refreshProgress(): void
    {
        // Total number of candidates, excluding filtered-out ones.
        $total = $this->reviewableCandidates()->count();

        // Candidates with a parse result: a stem, or a positive ai_confidence.
        $parsed = $this->parsedCandidates()->count();

        // Candidates with a stem but no positive ai_confidence
        // (possibly non-questions that were parsed by mistake).
        $stemOnlyCount = $this->reviewableCandidates()
            ->whereNotNull('stem')
            ->where('stem', '!=', '')
            ->where(function ($query) {
                $query->whereNull('ai_confidence')
                    ->orWhere('ai_confidence', '=', 0);
            })
            ->count();

        // Candidates dropped by the heuristic pre-filter.
        $filteredCount = PreQuestionCandidate::query()
            ->where('import_id', $this->markdownImportId)
            ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED)
            ->whereRaw("JSON_UNQUOTE(JSON_EXTRACT(meta, '$.filtered_out')) = 'true'")
            ->count();

        $message = "AI 解析中… {$parsed}/{$total}"
            . ($stemOnlyCount > 0 ? " (含{$stemOnlyCount}个待筛选)" : '')
            . ($filteredCount > 0 ? " (已过滤{$filteredCount}个非题目)" : '');

        DB::table('markdown_imports')
            ->where('id', $this->markdownImportId)
            ->update([
                'progress_total' => $total,
                'progress_current' => min($parsed, $total),
                'progress_updated_at' => now(),
                'progress_stage' => MarkdownImport::STAGE_AI_PARSING,
                'progress_message' => $message,
            ]);
    }

    /**
     * Base query for this import's candidates that are neither superseded nor filtered out.
     *
     * The COALESCE matters: JSON_EXTRACT yields SQL NULL when `meta` is NULL or has no
     * `filtered_out` key, and `NULL != 'true'` is not true in SQL. Without it, every
     * candidate whose meta lacks that key would silently drop out of the counts.
     *
     * @return \Illuminate\Database\Eloquent\Builder presumably, as returned by PreQuestionCandidate::query()
     */
    private function reviewableCandidates()
    {
        return PreQuestionCandidate::query()
            ->where('import_id', $this->markdownImportId)
            ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED)
            ->whereRaw("COALESCE(JSON_UNQUOTE(JSON_EXTRACT(meta, '$.filtered_out')), 'false') != 'true'");
    }

    /**
     * Reviewable candidates that carry a parse result: a non-empty stem, or a positive ai_confidence.
     *
     * @return \Illuminate\Database\Eloquent\Builder presumably, as returned by PreQuestionCandidate::query()
     */
    private function parsedCandidates()
    {
        return $this->reviewableCandidates()
            ->where(function ($query) {
                $query->whereNotNull('stem')
                    ->where('stem', '!=', '')
                    ->orWhere(function ($q) {
                        $q->whereNotNull('ai_confidence')
                            ->where('ai_confidence', '>', 0);
                    });
            });
    }

    /**
     * Stamp `ai_parsed` / `ai_parsed_at` into the record's meta.
     *
     * Nothing is written unless the record has a parse result (a non-empty stem or
     * a positive ai_confidence), and nothing is written if it is already stamped.
     */
    private function markParsed(PreQuestionCandidate $record): void
    {
        $hasStem = !empty($record->stem);
        $hasConfidence = !empty($record->ai_confidence) && $record->ai_confidence > 0;
        if (!$hasStem && !$hasConfidence) {
            return;
        }

        $meta = $record->meta ?? [];
        if (!empty($meta['ai_parsed'])) {
            return;
        }

        $meta['ai_parsed'] = true;
        $meta['ai_parsed_at'] = now()->toDateTimeString();
        $record->update(['meta' => $meta]);
    }

    /**
     * Lightweight heuristic: does the raw markdown look like an exam question?
     *
     * Used to filter out paper/section headings and instruction text.
     *
     * @param string $raw Raw markdown of one candidate.
     * @return bool false when the text looks like a heading, a section label, or a
     *              fragment too short to be a question; true otherwise.
     */
    private function isLikelyQuestion(string $raw): bool
    {
        $text = trim(strip_tags($raw));
        $length = mb_strlen($text);

        // Any line that is a Markdown heading marks the candidate as a non-question.
        if (preg_match('/^#+\\s+/m', $raw)) {
            return false;
        }

        // Short text mentioning a paper/part number, question type, instructions or "exam paper".
        if (preg_match('/(第[一二三四五六七八九十IVX]+[卷部分]|题型|说明|试卷)/u', $text) && $length <= 80) {
            return false;
        }

        // Very short text with no question mark, task verb, option label or score marker.
        // The opening parenthesis sits inside a character class: a bare "(" left the pattern
        // unbalanced, so it failed to compile and every short text was rejected. Both ASCII
        // and full-width forms of "?" and "(" are accepted.
        if ($length < 25 && !preg_match('/[\\??求解求证计算]|[A-D]\\.|[((]本小题满分/u', $text)) {
            return false;
        }

        return true;
    }
}