ProcessMarkdownCandidateBatch.php 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. <?php
  2. namespace App\Jobs;
  3. use App\Models\MarkdownImport;
  4. use App\Models\PreQuestionCandidate;
  5. use App\Services\MarkdownQuestionParser;
  6. use App\Services\PdfStorageService;
  7. use Illuminate\Bus\Queueable;
  8. use Illuminate\Contracts\Queue\ShouldQueue;
  9. use Illuminate\Foundation\Bus\Dispatchable;
  10. use Illuminate\Queue\InteractsWithQueue;
  11. use Illuminate\Queue\SerializesModels;
  12. use Illuminate\Support\Facades\DB;
  13. use Illuminate\Support\Facades\Log;
  /**
   * Queued job that structures one contiguous slice of a Markdown import's
   * pre-question candidates.
   *
   * For every PreQuestionCandidate of the import whose `sequence` lies in
   * [sequenceStart, sequenceEnd] the job either skips it (already parsed, high
   * existing confidence, already carrying a stem), rejects it as a non-question
   * (heading / instruction text), or sends its raw Markdown through
   * MarkdownQuestionParser and stores the structured result. Afterwards the
   * import's progress columns are refreshed and, once every countable candidate
   * is parsed, the import is moved to the "parsed" state.
   *
   * Per-record failures are logged and counted but not rethrown, so one bad
   * record does not abort the rest of the batch.
   */
  class ProcessMarkdownCandidateBatch implements ShouldQueue
  {
      use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

      /** Existing confidence at or above this value skips another AI parse. */
      private const HIGH_CONFIDENCE_THRESHOLD = 0.85;

      /**
       * SQL predicate: the candidate is NOT flagged as filtered out.
       *
       * JSON_EXTRACT yields NULL when the key is absent, and `NULL != 'true'`
       * is NULL (not true), which would silently drop every row whose meta has
       * no `filtered_out` key. COALESCE makes "key absent" mean "not filtered".
       */
      private const NOT_FILTERED_SQL = "COALESCE(JSON_UNQUOTE(JSON_EXTRACT(meta, '$.filtered_out')), 'false') != 'true'";

      /** SQL predicate: the candidate is flagged as filtered out. */
      private const FILTERED_SQL = "JSON_UNQUOTE(JSON_EXTRACT(meta, '$.filtered_out')) = 'true'";

      /** Per-attempt timeout in seconds (5 minutes). */
      public int $timeout = 300;

      /** Maximum number of attempts for this job. */
      public int $tries = 3;

      /**
       * @param int $markdownImportId Primary key of the MarkdownImport being processed.
       * @param int $sequenceStart    First candidate sequence handled by this batch (inclusive).
       * @param int $sequenceEnd      Last candidate sequence handled by this batch (inclusive).
       */
      public function __construct(
          public int $markdownImportId,
          public int $sequenceStart,
          public int $sequenceEnd
      ) {
      }

      /**
       * Process every candidate of the batch, then refresh and possibly finalize
       * the import's progress.
       *
       * @param MarkdownQuestionParser $parser   Turns raw Markdown into structured question data.
       * @param PdfStorageService      $uploader Storage used to persist images referenced by a question.
       */
      public function handle(MarkdownQuestionParser $parser, PdfStorageService $uploader): void
      {
          if (MarkdownImport::find($this->markdownImportId) === null) {
              Log::error('MarkdownImport not found for batch', [
                  'id' => $this->markdownImportId,
                  'sequence_start' => $this->sequenceStart,
                  'sequence_end' => $this->sequenceEnd,
              ]);

              return;
          }

          Log::info('Markdown batch started', [
              'import_id' => $this->markdownImportId,
              'sequence_start' => $this->sequenceStart,
              'sequence_end' => $this->sequenceEnd,
          ]);

          $records = PreQuestionCandidate::query()
              ->where('import_id', $this->markdownImportId)
              ->whereBetween('sequence', [$this->sequenceStart, $this->sequenceEnd])
              ->orderBy('sequence')
              ->get();

          $processed = 0;
          $failed = 0;

          foreach ($records as $record) {
              try {
                  $this->processRecord($record, $parser, $uploader);
                  // Every path that returns without throwing counts as processed,
                  // including records skipped because they were handled earlier.
                  $processed++;
              } catch (\Throwable $e) {
                  // Best effort: log and continue with the next record.
                  $failed++;
                  Log::error('Markdown batch item failed', [
                      'import_id' => $this->markdownImportId,
                      'candidate_id' => $record->id,
                      'sequence' => $record->sequence,
                      'index' => $record->index,
                      'error' => $e->getMessage(),
                      'trace' => $e->getTraceAsString(),
                  ]);
              }
          }

          $this->refreshProgress();

          Log::info('Markdown batch finished', [
              'import_id' => $this->markdownImportId,
              'sequence_start' => $this->sequenceStart,
              'sequence_end' => $this->sequenceEnd,
              'processed' => $processed,
              'failed' => $failed,
          ]);

          $this->finalizeIfDone();
      }

      /**
       * Called when the job fails permanently: logs the failure and marks the
       * import as failed so it does not stay stuck in a processing state.
       */
      public function failed(\Throwable $exception): void
      {
          Log::error('Markdown batch job failed permanently', [
              'import_id' => $this->markdownImportId,
              'sequence_start' => $this->sequenceStart,
              'sequence_end' => $this->sequenceEnd,
              'attempts' => $this->attempts(),
              'error' => $exception->getMessage(),
              'trace' => $exception->getTraceAsString(),
          ]);

          $import = MarkdownImport::find($this->markdownImportId);
          if ($import) {
              $import->update([
                  'status' => MarkdownImport::STATUS_FAILED,
                  'progress_stage' => MarkdownImport::STAGE_FAILED,
                  'progress_message' => 'AI 解析任务失败',
                  'error_message' => '队列任务执行失败,已超过最大重试次数',
                  'processing_finished_at' => now(),
              ]);
          }
      }

      /**
       * Handle a single candidate: skip, reject as non-question, or AI-parse it.
       *
       * The order of the checks matters and mirrors the original flow:
       * already parsed -> high confidence -> heading/instruction filter ->
       * already carrying a stem -> AI parse.
       *
       * @throws \Throwable Anything raised by the parser, storage or database
       *                    propagates to handle(), which logs it.
       */
      private function processRecord(
          PreQuestionCandidate $record,
          MarkdownQuestionParser $parser,
          PdfStorageService $uploader
      ): void {
          $meta = $record->meta ?? [];

          // Already parsed by an earlier run or attempt.
          if (!empty($meta['ai_parsed'])) {
              return;
          }

          // High existing confidence: skip another AI pass and just count it.
          $existingConfidence = $record->confidence ?? $record->ai_confidence;
          if ($existingConfidence !== null && (float) $existingConfidence >= self::HIGH_CONFIDENCE_THRESHOLD) {
              $this->markParsed($record);

              return;
          }

          // Cheap heuristic filter for paper/section headings, so they are not
          // sent through the AI parser as if they were questions.
          if (!$this->isLikelyQuestion((string) $record->raw_markdown)) {
              $this->rejectAsNonQuestion($record, $meta);

              return;
          }

          // Records that already carry a stem are not parsed again.
          if (in_array($record->status, ['pending', 'reviewed', 'accepted', 'rejected'], true) && $record->stem !== null) {
              $this->markParsed($record);

              return;
          }

          $parsed = $parser->parseRawMarkdown((string) $record->raw_markdown, (int) $record->index);

          $meta['ai_parsed'] = true;
          $meta['ai_parsed_at'] = now()->toDateTimeString();

          // Push referenced images to storage right after structuring.
          $uploadedImages = !empty($parsed['images'])
              ? $this->storeImages($parsed['images'], $record, $uploader)
              : [];
          $meta['images_uploaded'] = !empty($uploadedImages);

          $record->update([
              'stem' => $parsed['stem'] ?? null,
              'options' => $parsed['options'] ?? null,
              'images' => !empty($uploadedImages) ? $uploadedImages : ($parsed['images'] ?? []),
              'tables' => $parsed['tables'] ?? [],
              'is_question_candidate' => (bool) ($parsed['is_question_candidate'] ?? false),
              'ai_confidence' => $parsed['ai_confidence'] ?? null,
              'status' => 'pending',
              'meta' => $meta,
          ]);
      }

      /**
       * Persist a candidate as "filtered out, not a question".
       *
       * `ai_parsed` is set as well so later runs skip the record; `filtered_out`
       * is what the progress queries use to keep it out of the totals.
       *
       * @param array $meta The record's current meta array.
       */
      private function rejectAsNonQuestion(PreQuestionCandidate $record, array $meta): void
      {
          $meta['ai_parsed'] = true;
          $meta['ai_parsed_at'] = now()->toDateTimeString();
          $meta['filtered_out'] = true;

          $record->update([
              'is_question_candidate' => false,
              'ai_confidence' => 0.0,
              'confidence' => 0.0,
              'is_valid_question' => false,
              'status' => 'rejected',
              'meta' => $meta,
          ]);
      }

      /**
       * Download each parsed image and hand it to the storage service.
       *
       * Returns one entry per input image, in input order: the value returned by
       * the storage service, or the original URL when the download fails, comes
       * back empty, or the storage call returns a falsy value.
       *
       * NOTE(review): the image location comes from imported Markdown and is
       * passed to file_get_contents() unchecked; consider restricting it to
       * http(s) URLs if imports are untrusted.
       *
       * @param iterable $images Image locations keyed by their index in the parsed result.
       * @return array
       */
      private function storeImages(iterable $images, PreQuestionCandidate $record, PdfStorageService $uploader): array
      {
          $stored = [];

          foreach ($images as $idx => $imgUrl) {
              $path = "imports/images/{$record->id}_{$idx}.jpg";

              // "@" is deliberate: an unreachable image must not fail the whole
              // record. The result is checked explicitly instead of being cast,
              // so a failed download is never uploaded as an empty file.
              $contents = @file_get_contents($imgUrl);
              if ($contents === false || $contents === '') {
                  Log::warning('Markdown batch image download failed', [
                      'import_id' => $this->markdownImportId,
                      'candidate_id' => $record->id,
                      'image' => $imgUrl,
                  ]);
                  $stored[] = $imgUrl;

                  continue;
              }

              $stored[] = $uploader->put($path, $contents) ?: $imgUrl;
          }

          return $stored;
      }

      /**
       * Move the import to the "parsed" state once every countable candidate
       * has been parsed. Does nothing while work remains or when there are no
       * countable candidates at all.
       */
      private function finalizeIfDone(): void
      {
          $import = MarkdownImport::find($this->markdownImportId);
          if (!$import) {
              return;
          }

          // Same counting logic as refreshProgress().
          [$total, $parsed] = $this->calculateProgress();

          if ($total <= 0 || $parsed < $total) {
              return;
          }

          $updated = $import->update([
              'status' => MarkdownImport::STATUS_PARSED,
              'progress_stage' => MarkdownImport::STAGE_PARSED,
              'progress_message' => "解析完成,等待人工校对 ({$parsed}/{$total})",
              'progress_total' => $total,
              'progress_current' => $parsed,
              'progress_updated_at' => now(),
              'processing_finished_at' => now(),
          ]);

          if ($updated) {
              Log::info('Markdown import finalized', [
                  'import_id' => $this->markdownImportId,
                  'total_candidates' => $total,
                  'parsed_candidates' => $parsed,
              ]);
          }
      }

      /**
       * Base query for this import's candidates, excluding superseded ones.
       */
      private function importCandidates(): \Illuminate\Database\Eloquent\Builder
      {
          return PreQuestionCandidate::query()
              ->where('import_id', $this->markdownImportId)
              ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED);
      }

      /**
       * Candidates that count towards progress: not superseded and not flagged
       * as filtered out (a null meta or a missing flag both mean "not filtered").
       */
      private function countableCandidates(): \Illuminate\Database\Eloquent\Builder
      {
          return $this->importCandidates()
              ->where(static function ($query): void {
                  $query->whereNull('meta')
                      ->orWhereRaw(self::NOT_FILTERED_SQL);
              });
      }

      /**
       * Compute progress for the import.
       *
       * A countable candidate is considered parsed when it has a non-empty stem
       * or an ai_confidence greater than zero.
       *
       * @return array{0: int, 1: int} [total countable candidates, parsed candidates]
       */
      private function calculateProgress(): array
      {
          $total = $this->countableCandidates()->count();

          $parsed = $this->countableCandidates()
              ->where(static function ($query): void {
                  $query->where(static function ($q): void {
                      // Has question content.
                      $q->whereNotNull('stem')
                          ->where('stem', '!=', '');
                  })->orWhere(static function ($q): void {
                      // Or has a positive AI confidence.
                      $q->whereNotNull('ai_confidence')
                          ->where('ai_confidence', '>', 0);
                  });
              })
              ->count();

          return [$total, $parsed];
      }

      /**
       * Write the current progress counters and a human-readable progress
       * message onto the import, keeping it in the AI-parsing stage.
       */
      private function refreshProgress(): void
      {
          [$total, $parsed] = $this->calculateProgress();

          // Candidates with a stem but null/zero AI confidence
          // (possibly non-questions that were parsed by mistake).
          $stemOnlyCount = $this->countableCandidates()
              ->whereNotNull('stem')
              ->where('stem', '!=', '')
              ->where(static function ($query): void {
                  $query->whereNull('ai_confidence')
                      ->orWhere('ai_confidence', '=', 0);
              })
              ->count();

          // Candidates flagged as filtered out.
          $filteredCount = $this->importCandidates()
              ->whereRaw(self::FILTERED_SQL)
              ->count();

          $import = MarkdownImport::find($this->markdownImportId);
          if (!$import) {
              return;
          }

          $message = "AI 解析中… {$parsed}/{$total}";
          if ($stemOnlyCount > 0) {
              $message .= " (含{$stemOnlyCount}个待筛选)";
          }
          if ($filteredCount > 0) {
              $message .= " (已过滤{$filteredCount}个非题目)";
          }

          $import->update([
              'progress_total' => $total,
              'progress_current' => min($parsed, $total),
              'progress_updated_at' => now(),
              'progress_stage' => MarkdownImport::STAGE_AI_PARSING,
              'progress_message' => $message,
          ]);
      }

      /**
       * Flag a record as AI-parsed in its meta, but only when it really carries
       * a parse result (a non-empty stem or a positive ai_confidence) and is not
       * flagged already.
       */
      private function markParsed(PreQuestionCandidate $record): void
      {
          $hasResult = !empty($record->stem)
              || (!empty($record->ai_confidence) && $record->ai_confidence > 0);
          if (!$hasResult) {
              return;
          }

          $meta = $record->meta ?? [];
          if (!empty($meta['ai_parsed'])) {
              return;
          }

          $meta['ai_parsed'] = true;
          $meta['ai_parsed_at'] = now()->toDateTimeString();
          $record->update(['meta' => $meta]);
      }

      /**
       * Lightweight heuristic: does the raw Markdown look like a question, as
       * opposed to a paper/section heading or an instruction line?
       *
       * @param string $raw Raw Markdown of the candidate.
       * @return bool False when the text should be filtered out as a non-question.
       */
      private function isLikelyQuestion(string $raw): bool
      {
          $text = trim(strip_tags($raw));
          $length = mb_strlen($text);

          // Any Markdown heading line in the raw text marks it as a non-question.
          if (preg_match('/^#+\\s+/m', $raw)) {
              return false;
          }

          // Short text mentioning a volume/part marker, "question type",
          // "instructions" or "exam paper".
          if (preg_match('/(第[一二三四五六七八九十IVX]+[卷部分]|题型|说明|试卷)/u', $text) && $length <= 80) {
              return false;
          }

          // Very short text without a question mark, command word, option marker
          // or score marker. The parenthesis is matched through a character class
          // (ASCII or full-width U+FF08): a bare "(" would be an unclosed group
          // and make the whole pattern fail to compile. U+FF1F is the full-width
          // question mark.
          if ($length < 25 && !preg_match('/[?\x{FF1F}求解证计算]|[A-D]\\.|[(\x{FF08}]本小题满分/u', $text)) {
              return false;
          }

          return true;
      }
  }