markdownImportId);

        if (!$import) {
            Log::error('MarkdownImport not found for batch', [
                'id' => $this->markdownImportId,
                'sequence_start' => $this->sequenceStart,
                'sequence_end' => $this->sequenceEnd,
            ]);

            return;
        }

        Log::info('Markdown batch started', [
            'import_id' => $this->markdownImportId,
            'sequence_start' => $this->sequenceStart,
            'sequence_end' => $this->sequenceEnd,
        ]);

        $records = PreQuestionCandidate::query()
            ->where('import_id', $this->markdownImportId)
            ->whereBetween('sequence', [$this->sequenceStart, $this->sequenceEnd])
            ->orderBy('sequence')
            ->get();

        $processed = 0;
        $failed = 0;
        $totalRecords = $records->count();

        // Publish an initial progress snapshot before any record is touched.
        $this->refreshProgress();

        foreach ($records as $index => $record) {
            try {
                $meta = $record->meta ?? [];
                if (!empty($meta['ai_parsed'])) {
                    $processed++;
                    continue;
                }

                $existingConfidence = $record->confidence ?? $record->ai_confidence;
                // High confidence (>= 0.85): skip another AI pass and count the record as done.
                if ($existingConfidence !== null && (float) $existingConfidence >= 0.85) {
                    $this->markParsed($record);
                    $processed++;
                    continue;
                }

                // Cheap heuristic filter for paper/section headings so they are not sent to the AI parser.
                if (!$this->isLikelyQuestion((string) $record->raw_markdown)) {
                    // Flag the record as filtered out and reject it.
                    // NOTE(review): the original comment said the record would NOT be marked as parsed,
                    // yet ai_parsed is set to true here — confirm which one is intended.
                    $meta['ai_parsed'] = true;
                    $meta['ai_parsed_at'] = now()->toDateTimeString();
                    $meta['filtered_out'] = true; // filter marker read by the progress queries
                    $record->update([
                        'is_question_candidate' => false,
                        'ai_confidence' => 0.0,
                        'confidence' => 0.0,
                        'is_valid_question' => false,
                        'status' => 'rejected',
                        'meta' => $meta,
                    ]);
                    $processed++;
                    continue;
                }

                // Records that already carry a stem and a known status are not parsed again.
                if (in_array($record->status, ['pending', 'reviewed', 'accepted', 'rejected'], true) && $record->stem !== null) {
                    $this->markParsed($record);
                    continue;
                }

                $parsed = $parser->parseRawMarkdown((string) $record->raw_markdown, (int) $record->index);
                $meta = $record->meta ?? [];
                $meta['ai_parsed'] = true;
                $meta['ai_parsed_at'] = now()->toDateTimeString();

                // Re-upload the images right after structuring; fall back to the source URL
                // when the uploader returns a falsy value.
                // NOTE(review): download errors are silenced with "@", so a failed fetch
                // hands an empty string to the uploader — confirm this is acceptable.
                $uploadedImages = [];
                if (!empty($parsed['images'])) {
                    foreach ($parsed['images'] as $idx => $imgUrl) {
                        $path = "imports/images/{$record->id}_{$idx}.jpg";
                        $uploadedImages[] = $uploader->put($path, (string)@file_get_contents($imgUrl)) ?: $imgUrl;
                    }
                }
                $meta['images_uploaded'] = !empty($uploadedImages);

                $record->update([
                    'stem' => $parsed['stem'] ?? null,
                    'options' => $parsed['options'] ?? null,
                    'images' => !empty($uploadedImages) ? $uploadedImages : ($parsed['images'] ?? []),
                    'tables' => $parsed['tables'] ?? [],
                    'is_question_candidate' => (bool) ($parsed['is_question_candidate'] ?? false),
                    'ai_confidence' => $parsed['ai_confidence'] ?? null,
                    'status' => 'pending',
                    'meta' => $meta,
                ]);
                $processed++;

                // Refresh progress every 5 records (and on the last one) to limit write frequency.
                if (($index + 1) % 5 === 0 || ($index + 1) === $totalRecords) {
                    $this->refreshProgress();
                }
            } catch (\Throwable $e) {
                $failed++;
                Log::error('Markdown batch item failed', [
                    'import_id' => $this->markdownImportId,
                    'candidate_id' => $record->id,
                    'sequence' => $record->sequence,
                    'index' => $record->index,
                    'error' => $e->getMessage(),
                    'trace' => $e->getTraceAsString(),
                ]);

                // Keep the progress display moving on failures as well.
                if (($index + 1) % 5 === 0 || ($index + 1) === $totalRecords) {
                    $this->refreshProgress();
                }
            }
        }

        $this->refreshProgress();

        Log::info('Markdown batch finished', [
            'import_id' => $this->markdownImportId,
            'sequence_start' => $this->sequenceStart,
            'sequence_end' => $this->sequenceEnd,
            'processed' => $processed,
            'failed' => $failed,
        ]);

        $this->finalizeIfDone();
    }

    /**
     * Handles the permanent failure of this batch job.
     *
     * Logs the failure and, when the import row still exists, marks it as failed
     * so that it does not stay stuck in an in-progress state.
     *
     * @param \Throwable $exception The exception that caused the job to fail.
     */
    public function failed(\Throwable $exception): void
    {
        Log::error('Markdown batch job failed permanently', [
            'import_id' => $this->markdownImportId,
            'sequence_start' => $this->sequenceStart,
            'sequence_end' => $this->sequenceEnd,
            'attempts' => $this->attempts(),
            'error' => $exception->getMessage(),
            'trace' => $exception->getTraceAsString(),
        ]);

        $import = MarkdownImport::find($this->markdownImportId);
        if (!$import) {
            return;
        }

        // Move the import into its failed state so it is not left hanging.
        $import->update([
            'status' => MarkdownImport::STATUS_FAILED,
            'progress_stage' => MarkdownImport::STAGE_FAILED,
            'progress_message' => 'AI 解析任务失败',
            'error_message' => '队列任务执行失败,已超过最大重试次数',
            'processing_finished_at' => now(),
        ]);
    }

    /**
     * Marks the import as parsed once every counted candidate has a parse result.
     *
     * Uses the same counters as refreshProgress() so both agree on what "done" means.
     */
    private function finalizeIfDone(): void
    {
        $import = MarkdownImport::find($this->markdownImportId);
        if (!$import) {
            return;
        }

        [$total, $parsed] = $this->calculateProgress();

        // NOTE(review): when $total is 0 (for example every candidate was filtered out)
        // the import is never finalized here — confirm that this is intended.
        if ($total <= 0 || $parsed < $total) {
            return;
        }

        $updated = $import->update([
            'status' => MarkdownImport::STATUS_PARSED,
            'progress_stage' => MarkdownImport::STAGE_PARSED,
            'progress_message' => "解析完成,等待人工校对 ({$parsed}/{$total})",
            'progress_total' => $total,
            'progress_current' => $parsed,
            'progress_updated_at' => now(),
            'processing_finished_at' => now(),
        ]);

        if ($updated) {
            Log::info('Markdown import finalized', [
                'import_id' => $this->markdownImportId,
                'total_candidates' => $total,
                'parsed_candidates' => $parsed,
            ]);
        }
    }

    /**
     * Computes the progress counters for the whole import.
     *
     * Superseded candidates and candidates flagged meta.filtered_out = true are
     * excluded from both counters. A candidate counts as parsed when it has a
     * non-empty stem or an ai_confidence greater than 0.
     *
     * @return array [int $total, int $parsed, string $batchInfo] where $batchInfo
     *               is a label for the sequence range handled by this job.
     */
    private function calculateProgress(): array
    {
        // JSON_EXTRACT yields NULL when meta is NULL or has no "filtered_out" key, and
        // "NULL != 'true'" is NULL (not true) in SQL. Without COALESCE every row whose
        // meta exists but lacks the key — which is what a successful parse writes —
        // would silently drop out of the counters.
        $notFilteredSql = "COALESCE(JSON_UNQUOTE(JSON_EXTRACT(meta, '$.filtered_out')), 'false') != 'true'";

        // Total candidates, excluding filtered-out and superseded rows.
        $total = PreQuestionCandidate::query()
            ->where('import_id', $this->markdownImportId)
            ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED)
            ->whereRaw($notFilteredSql)
            ->count();

        // Same population, restricted to rows that carry a parse result.
        $parsed = PreQuestionCandidate::query()
            ->where('import_id', $this->markdownImportId)
            ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED)
            ->whereRaw($notFilteredSql)
            ->where(function ($query) {
                $query->where(function ($q) {
                    // Has question text.
                    $q->whereNotNull('stem')
                        ->where('stem', '!=', '');
                })->orWhere(function ($q) {
                    // Or has a positive AI confidence.
                    $q->whereNotNull('ai_confidence')
                        ->where('ai_confidence', '>', 0);
                });
            })
            ->count();

        // Label describing the batch this job instance is working on.
        $batchInfo = sprintf(
            '批次 %d-%d',
            $this->sequenceStart,
            $this->sequenceEnd
        );

        return [$total, $parsed, $batchInfo];
    }

    /**
     * Writes the current progress counters and a status message to the import row.
     *
     * The message also reports how many candidates have a stem but no positive
     * AI confidence, and how many candidates were filtered out as non-questions.
     */
    private function refreshProgress(): void
    {
        [$total, $parsed, $batchInfo] = $this->calculateProgress();

        // COALESCE keeps rows whose meta is NULL or lacks "filtered_out" in the result;
        // a bare "!= 'true'" comparison against NULL would exclude them.
        $notFilteredSql = "COALESCE(JSON_UNQUOTE(JSON_EXTRACT(meta, '$.filtered_out')), 'false') != 'true'";

        // Rows with a stem but a missing/zero AI confidence (possibly non-questions parsed by mistake).
        $stemOnlyCount = PreQuestionCandidate::query()
            ->where('import_id', $this->markdownImportId)
            ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED)
            ->whereRaw($notFilteredSql)
            ->whereNotNull('stem')
            ->where('stem', '!=', '')
            ->where(function ($query) {
                $query->whereNull('ai_confidence')
                    ->orWhere('ai_confidence', '=', 0);
            })
            ->count();

        // Rows that the heuristic filter rejected.
        $filteredCount = PreQuestionCandidate::query()
            ->where('import_id', $this->markdownImportId)
            ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED)
            ->whereRaw("JSON_UNQUOTE(JSON_EXTRACT(meta, '$.filtered_out')) = 'true'")
            ->count();

        $import = MarkdownImport::find($this->markdownImportId);
        if (!$import) {
            return;
        }

        $message = "{$batchInfo} | AI 解析中… {$parsed}/{$total}";
        if ($stemOnlyCount > 0) {
            $message .= " (含{$stemOnlyCount}个待筛选)";
        }
        if ($filteredCount > 0) {
            $message .= " (已过滤{$filteredCount}个非题目)";
        }

        $import->update([
            'progress_total' => $total,
            // Never report more finished items than the total.
            'progress_current' => min($parsed, $total),
            'progress_updated_at' => now(),
            'progress_stage' => MarkdownImport::STAGE_AI_PARSING,
            'progress_message' => $message,
        ]);
    }

    /**
     * Sets the ai_parsed marker in the record's meta, but only when the record
     * actually carries a parse result (non-empty stem or positive ai_confidence)
     * and the marker is not already present.
     *
     * @param PreQuestionCandidate $record The candidate to mark.
     */
    private function markParsed(PreQuestionCandidate $record): void
    {
        $hasStem = !empty($record->stem);
        $hasConfidence = !empty($record->ai_confidence) && $record->ai_confidence > 0;
        if (!$hasStem && !$hasConfidence) {
            return;
        }

        $meta = $record->meta ?? [];
        if (!empty($meta['ai_parsed'])) {
            return;
        }

        $meta['ai_parsed'] = true;
        $meta['ai_parsed_at'] = now()->toDateTimeString();
        $record->update(['meta' => $meta]);
    }

    /**
     * Lightweight heuristic deciding whether a raw Markdown fragment looks like a
     * question, used to filter out paper/section headings and instruction text.
     *
     * @param string $raw Raw Markdown of the candidate.
     *
     * @return bool False when the fragment looks like a heading or boilerplate, true otherwise.
     */
    private function isLikelyQuestion(string $raw): bool
    {
        $text = trim(strip_tags($raw));
        $length = mb_strlen($text);

        // Any line that is a Markdown heading ("# ...") marks the fragment as a non-question.
        if (preg_match('/^#+\\s+/m', $raw)) {
            return false;
        }

        // Short text mentioning a paper part, question type, instructions or "exam paper".
        if (preg_match('/(第[一二三四五六七八九十IVX]+[卷部分]|题型|说明|试卷)/u', $text) && $length <= 80) {
            return false;
        }

        // Very short text without any question mark, imperative keyword, option label
        // or "(本小题满分" marker. The parenthesis is matched as a literal (ASCII or
        // full-width): a bare "(" opened an unclosed group, so the pattern failed to
        // compile, preg_match() returned false, and every short text was rejected.
        if ($length < 25 && !preg_match('/[??求解证计算]|[A-D]\\.|[((]本小题满分/u', $text)) {
            return false;
        }

        return true;
    }
}