| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354 |
- <?php
- namespace App\Jobs;
- use App\Models\MarkdownImport;
- use App\Models\PreQuestionCandidate;
- use App\Services\MarkdownQuestionParser;
- use App\Services\PdfStorageService;
- use Illuminate\Bus\Queueable;
- use Illuminate\Contracts\Queue\ShouldQueue;
- use Illuminate\Foundation\Bus\Dispatchable;
- use Illuminate\Queue\InteractsWithQueue;
- use Illuminate\Queue\SerializesModels;
- use Illuminate\Support\Facades\DB;
- use Illuminate\Support\Facades\Log;
- class ProcessMarkdownCandidateBatch implements ShouldQueue
- {
- use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
- public int $timeout = 300; // 5分钟超时
- public int $tries = 3;
/**
 * Create a job for one slice of an import's candidates.
 *
 * @param int $markdownImportId Primary key of the MarkdownImport being processed.
 * @param int $sequenceStart    First candidate sequence number in this batch (inclusive).
 * @param int $sequenceEnd      Last candidate sequence number in this batch (inclusive).
 */
public function __construct(
    public int $markdownImportId,
    public int $sequenceStart,
    public int $sequenceEnd,
) {}
/**
 * Process every candidate whose sequence falls inside this batch.
 *
 * Each candidate is handled independently: a failure on one record is logged
 * and counted, and the loop moves on to the next record. Import-level progress
 * is refreshed before the loop, after every fifth record, and once more after
 * the loop, and finally the import is finalized if nothing is left to parse.
 *
 * @param MarkdownQuestionParser $parser   Parser used to structure raw markdown.
 * @param PdfStorageService      $uploader Storage service used to persist images.
 */
public function handle(MarkdownQuestionParser $parser, PdfStorageService $uploader): void
{
    $import = MarkdownImport::find($this->markdownImportId);
    if (!$import) {
        Log::error('MarkdownImport not found for batch', [
            'id' => $this->markdownImportId,
            'sequence_start' => $this->sequenceStart,
            'sequence_end' => $this->sequenceEnd,
        ]);

        return;
    }

    Log::info('Markdown batch started', [
        'import_id' => $this->markdownImportId,
        'sequence_start' => $this->sequenceStart,
        'sequence_end' => $this->sequenceEnd,
    ]);

    $records = PreQuestionCandidate::query()
        ->where('import_id', $this->markdownImportId)
        ->whereBetween('sequence', [$this->sequenceStart, $this->sequenceEnd])
        ->orderBy('sequence')
        ->get();

    $processed = 0;
    $failed = 0;

    // Publish an initial progress snapshot before any work is done.
    $this->refreshProgress();

    foreach ($records as $index => $record) {
        try {
            $this->processCandidate($record, $parser, $uploader);
            $processed++;
        } catch (\Throwable $e) {
            $failed++;
            Log::error('Markdown batch item failed', [
                'import_id' => $this->markdownImportId,
                'candidate_id' => $record->id,
                'sequence' => $record->sequence,
                'index' => $record->index,
                'error' => $e->getMessage(),
                'trace' => $e->getTraceAsString(),
            ]);
        }

        // Refresh every 5 records whatever the outcome (skipped, parsed or
        // failed); the refresh after the loop covers the remaining tail.
        if (($index + 1) % 5 === 0) {
            $this->refreshProgress();
        }
    }

    $this->refreshProgress();

    Log::info('Markdown batch finished', [
        'import_id' => $this->markdownImportId,
        'sequence_start' => $this->sequenceStart,
        'sequence_end' => $this->sequenceEnd,
        'processed' => $processed,
        'failed' => $failed,
    ]);

    $this->finalizeIfDone();
}

/**
 * Handle a single candidate: skip it, reject it as a non-question, or run it
 * through the parser and persist the structured result.
 *
 * Returns normally for every handled record (including skips); any exception
 * propagates to the caller, which counts the record as failed.
 */
private function processCandidate(
    PreQuestionCandidate $record,
    MarkdownQuestionParser $parser,
    PdfStorageService $uploader
): void {
    $meta = $record->meta ?? [];

    // Already parsed on a previous run/attempt.
    if (!empty($meta['ai_parsed'])) {
        return;
    }

    // High existing confidence (>= 0.85): do not spend another AI call.
    $existingConfidence = $record->confidence ?? $record->ai_confidence;
    if ($existingConfidence !== null && (float) $existingConfidence >= 0.85) {
        $this->markParsed($record);

        return;
    }

    $rawMarkdown = (string) $record->raw_markdown;

    // Cheap heuristic filter for paper/section headings so they are not sent
    // to the AI parser; such records are rejected and flagged as filtered out.
    if (!$this->isLikelyQuestion($rawMarkdown)) {
        $meta['ai_parsed'] = true;
        $meta['ai_parsed_at'] = now()->toDateTimeString();
        $meta['filtered_out'] = true;

        $record->update([
            'is_question_candidate' => false,
            'ai_confidence' => 0.0,
            'confidence' => 0.0,
            'is_valid_question' => false,
            'status' => 'rejected',
            'meta' => $meta,
        ]);

        return;
    }

    // A record that already has a stem and a known status was structured
    // before; only make sure its meta flag is set.
    if (in_array($record->status, ['pending', 'reviewed', 'accepted', 'rejected'], true) && $record->stem !== null) {
        $this->markParsed($record);

        return;
    }

    $parsed = $parser->parseRawMarkdown($rawMarkdown, (int) $record->index);

    $meta['ai_parsed'] = true;
    $meta['ai_parsed_at'] = now()->toDateTimeString();

    // Persist images right after structuring.
    $uploadedImages = [];
    if (!empty($parsed['images'])) {
        $uploadedImages = $this->storeImages($record, $parsed['images'], $uploader);
    }
    $meta['images_uploaded'] = !empty($uploadedImages);

    $record->update([
        'stem' => $parsed['stem'] ?? null,
        'options' => $parsed['options'] ?? null,
        'images' => !empty($uploadedImages) ? $uploadedImages : ($parsed['images'] ?? []),
        'tables' => $parsed['tables'] ?? [],
        'is_question_candidate' => (bool) ($parsed['is_question_candidate'] ?? false),
        'ai_confidence' => $parsed['ai_confidence'] ?? null,
        'status' => 'pending',
        'meta' => $meta,
    ]);
}

/**
 * Download each image and hand it to the storage service, best effort.
 *
 * When the download fails or yields no bytes, or when the storage call
 * returns a falsy value, the original URL is kept for that image so a
 * working reference is never replaced by an empty stored file.
 *
 * @param iterable<int|string, mixed> $imageUrls Image locations reported by the parser.
 *
 * @return array<int, mixed> One entry per input image: the stored value or the original URL.
 */
private function storeImages(PreQuestionCandidate $record, iterable $imageUrls, PdfStorageService $uploader): array
{
    $stored = [];

    foreach ($imageUrls as $idx => $imgUrl) {
        // NOTE(review): the URL originates from parsed document content;
        // fetching it server-side without an allow-list is a potential SSRF
        // vector. Confirm the parser only emits trusted locations.
        // "@" keeps the download best-effort: a failed fetch must not turn
        // into a warning/exception that aborts the whole candidate.
        $contents = @file_get_contents($imgUrl);

        if ($contents === false || $contents === '') {
            Log::warning('Markdown candidate image download failed', [
                'import_id' => $this->markdownImportId,
                'candidate_id' => $record->id,
                'image' => $imgUrl,
            ]);
            $stored[] = $imgUrl;

            continue;
        }

        $path = "imports/images/{$record->id}_{$idx}.jpg";
        $stored[] = $uploader->put($path, $contents) ?: $imgUrl;
    }

    return $stored;
}
/**
 * Called by the queue once the job has failed for good.
 *
 * Logs the failure and flags the import as failed so it does not stay
 * stuck in a processing state.
 */
public function failed(\Throwable $exception): void
{
    $context = [
        'import_id' => $this->markdownImportId,
        'sequence_start' => $this->sequenceStart,
        'sequence_end' => $this->sequenceEnd,
        'attempts' => $this->attempts(),
        'error' => $exception->getMessage(),
        'trace' => $exception->getTraceAsString(),
    ];
    Log::error('Markdown batch job failed permanently', $context);

    // Nothing to update when the import row no longer exists.
    MarkdownImport::find($this->markdownImportId)?->update([
        'status' => MarkdownImport::STATUS_FAILED,
        'progress_stage' => MarkdownImport::STAGE_FAILED,
        'progress_message' => 'AI 解析任务失败',
        'error_message' => '队列任务执行失败,已超过最大重试次数',
        'processing_finished_at' => now(),
    ]);
}
/**
 * Move the import to the "parsed" state once every counted candidate has
 * been handled, using the same counters as refreshProgress().
 */
private function finalizeIfDone(): void
{
    $import = MarkdownImport::find($this->markdownImportId);
    if ($import === null) {
        return;
    }

    [$total, $parsed] = $this->calculateProgress();

    // Only finalize when there is at least one candidate and none is pending.
    $everythingHandled = $total > 0 && $parsed >= $total;
    if (!$everythingHandled) {
        return;
    }

    $saved = $import->update([
        'status' => MarkdownImport::STATUS_PARSED,
        'progress_stage' => MarkdownImport::STAGE_PARSED,
        'progress_message' => "解析完成,等待人工校对 ({$parsed}/{$total})",
        'progress_total' => $total,
        'progress_current' => $parsed,
        'progress_updated_at' => now(),
        'processing_finished_at' => now(),
    ]);
    if (!$saved) {
        return;
    }

    Log::info('Markdown import finalized', [
        'import_id' => $this->markdownImportId,
        'total_candidates' => $total,
        'parsed_candidates' => $parsed,
    ]);
}
/**
 * Compute import-wide progress counters.
 *
 * Both counters ignore superseded candidates and candidates flagged
 * `meta.filtered_out = true`. A candidate counts as parsed when it has a
 * non-empty stem or an ai_confidence greater than zero.
 *
 * @return array{0: int, 1: int, 2: string} [total, parsed, label of the current batch]
 */
private function calculateProgress(): array
{
    // JSON_EXTRACT yields SQL NULL when meta is NULL or has no `filtered_out`
    // key, and `NULL != 'true'` is NULL (not TRUE), which would silently drop
    // every such row. COALESCE treats a missing flag as "not filtered".
    $notFilteredOut = "COALESCE(JSON_UNQUOTE(JSON_EXTRACT(meta, '$.filtered_out')), 'false') != 'true'";

    $countable = fn () => PreQuestionCandidate::query()
        ->where('import_id', $this->markdownImportId)
        ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED)
        ->whereRaw($notFilteredOut);

    // Total candidates that are expected to be parsed.
    $total = $countable()->count();

    // Candidates that carry an actual parse result.
    $parsed = $countable()
        ->where(function ($query) {
            $query->where(function ($q) {
                // Has question content.
                $q->whereNotNull('stem')
                    ->where('stem', '!=', '');
            })->orWhere(function ($q) {
                // Or has a positive AI confidence.
                $q->whereNotNull('ai_confidence')
                    ->where('ai_confidence', '>', 0);
            });
        })
        ->count();

    // Human-readable label of the batch this job is working on.
    $batchInfo = sprintf(
        '批次 %d-%d',
        $this->sequenceStart,
        $this->sequenceEnd
    );

    return [$total, $parsed, $batchInfo];
}
/**
 * Write the current progress counters and a status message to the import.
 *
 * The message also reports how many candidates have a stem but no positive
 * AI confidence, and how many were filtered out as non-questions.
 */
private function refreshProgress(): void
{
    [$total, $parsed, $batchInfo] = $this->calculateProgress();

    // Candidates with a stem but a NULL/zero AI confidence (possibly
    // non-questions that were structured anyway).
    // COALESCE is required here: JSON_EXTRACT yields SQL NULL when meta is
    // NULL or lacks the `filtered_out` key, and `NULL != 'true'` would drop
    // those rows instead of keeping them.
    $stemOnlyCount = PreQuestionCandidate::query()
        ->where('import_id', $this->markdownImportId)
        ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED)
        ->whereRaw("COALESCE(JSON_UNQUOTE(JSON_EXTRACT(meta, '$.filtered_out')), 'false') != 'true'")
        ->whereNotNull('stem')
        ->where('stem', '!=', '')
        ->where(function ($query) {
            $query->whereNull('ai_confidence')
                ->orWhere('ai_confidence', '=', 0);
        })
        ->count();

    // Candidates flagged as filtered out.
    $filteredCount = PreQuestionCandidate::query()
        ->where('import_id', $this->markdownImportId)
        ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED)
        ->whereRaw("JSON_UNQUOTE(JSON_EXTRACT(meta, '$.filtered_out')) = 'true'")
        ->count();

    $import = MarkdownImport::find($this->markdownImportId);
    if ($import) {
        $import->update([
            'progress_total' => $total,
            // Never report more progress than the total.
            'progress_current' => min($parsed, $total),
            'progress_updated_at' => now(),
            'progress_stage' => MarkdownImport::STAGE_AI_PARSING,
            'progress_message' => "{$batchInfo} | AI 解析中… {$parsed}/{$total}" .
                ($stemOnlyCount > 0 ? " (含{$stemOnlyCount}个待筛选)" : '') .
                ($filteredCount > 0 ? " (已过滤{$filteredCount}个非题目)" : ''),
        ]);
    }
}
/**
 * Set the `ai_parsed` meta flag on a record that already carries a parse
 * result (a non-empty stem or a positive ai_confidence). Records without a
 * result, or already flagged, are left untouched.
 */
private function markParsed(PreQuestionCandidate $record): void
{
    // No parse result at all: nothing to mark.
    if (empty($record->stem) && (empty($record->ai_confidence) || !($record->ai_confidence > 0))) {
        return;
    }

    $meta = $record->meta ?? [];
    if (!empty($meta['ai_parsed'])) {
        return;
    }

    $meta['ai_parsed'] = true;
    $meta['ai_parsed_at'] = now()->toDateTimeString();
    $record->update(['meta' => $meta]);
}
/**
 * Lightweight heuristic: does this raw markdown look like a question?
 *
 * Used to drop paper titles, section headings and instruction text before
 * they reach the AI parser.
 *
 * @param string $raw Raw markdown of one candidate.
 *
 * @return bool False when the text looks like a heading/instruction, true otherwise.
 */
private function isLikelyQuestion(string $raw): bool
{
    $text = trim(strip_tags($raw));
    $length = mb_strlen($text);

    // Any line that is a Markdown heading marks the candidate as a non-question.
    // NOTE(review): this applies regardless of text length, so a long chunk
    // containing one heading line is rejected too — confirm this is intended.
    if (preg_match('/^#+\\s+/m', $raw)) {
        return false;
    }

    // Short text mentioning a volume/part/section, question type, instructions or "exam paper".
    if (preg_match('/(第[一二三四五六七八九十IVX]+[卷部分]|题型|说明|试卷)/u', $text) && $length <= 80) {
        return false;
    }

    // Very short text with no question mark (ASCII or full-width), no
    // solve/prove/compute character, no "A."-style option and no
    // "(本小题满分" score marker. The parenthesis is matched literally via a
    // character class (ASCII or full-width): a bare "(" opens an unclosed
    // group, which makes the pattern fail to compile.
    if ($length < 25 && !preg_match('/[??求解证计算]|[A-D]\\.|[((]本小题满分/u', $text)) {
        return false;
    }

    return true;
}
- }
|