ProcessMarkdownCandidateBatch.php 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. <?php
  2. namespace App\Jobs;
  3. use App\Models\MarkdownImport;
  4. use App\Models\PreQuestionCandidate;
  5. use App\Services\MarkdownQuestionParser;
  6. use App\Services\PdfStorageService;
  7. use Illuminate\Bus\Queueable;
  8. use Illuminate\Contracts\Queue\ShouldQueue;
  9. use Illuminate\Foundation\Bus\Dispatchable;
  10. use Illuminate\Queue\InteractsWithQueue;
  11. use Illuminate\Queue\SerializesModels;
  12. use Illuminate\Support\Facades\DB;
  13. use Illuminate\Support\Facades\Log;
  14. class ProcessMarkdownCandidateBatch implements ShouldQueue
  15. {
  16. use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
  17. public int $timeout = 300; // 5分钟超时
  18. public int $tries = 3;
  19. public function __construct(
  20. public int $markdownImportId,
  21. public int $sequenceStart,
  22. public int $sequenceEnd
  23. ) {
  24. //
  25. }
  26. public function handle(MarkdownQuestionParser $parser, PdfStorageService $uploader): void
  27. {
  28. $import = MarkdownImport::find($this->markdownImportId);
  29. if (!$import) {
  30. Log::error('MarkdownImport not found for batch', [
  31. 'id' => $this->markdownImportId,
  32. 'sequence_start' => $this->sequenceStart,
  33. 'sequence_end' => $this->sequenceEnd,
  34. ]);
  35. return;
  36. }
  37. Log::info('Markdown batch started', [
  38. 'import_id' => $this->markdownImportId,
  39. 'sequence_start' => $this->sequenceStart,
  40. 'sequence_end' => $this->sequenceEnd,
  41. ]);
  42. $records = PreQuestionCandidate::query()
  43. ->where('import_id', $this->markdownImportId)
  44. ->whereBetween('sequence', [$this->sequenceStart, $this->sequenceEnd])
  45. ->orderBy('sequence')
  46. ->get();
  47. $processed = 0;
  48. $failed = 0;
  49. foreach ($records as $record) {
  50. try {
  51. $meta = $record->meta ?? [];
  52. if (!empty($meta['ai_parsed'])) {
  53. $processed++;
  54. continue;
  55. }
  56. $existingConfidence = $record->confidence ?? $record->ai_confidence;
  57. // 置信度高(>=0.85)时跳过再次 AI 解析,直接计入进度
  58. if ($existingConfidence !== null && (float) $existingConfidence >= 0.85) {
  59. $this->markParsed($record);
  60. $processed++;
  61. continue;
  62. }
  63. // 快速过滤卷子/区块标题,避免误判为题目再次走 AI
  64. if (!$this->isLikelyQuestion((string) $record->raw_markdown)) {
  65. $meta['ai_parsed'] = true;
  66. $meta['ai_parsed_at'] = now()->toDateTimeString();
  67. $record->update([
  68. 'is_question_candidate' => false,
  69. 'ai_confidence' => 0.0,
  70. 'confidence' => 0.0,
  71. 'is_valid_question' => false,
  72. 'status' => 'rejected',
  73. 'meta' => $meta,
  74. ]);
  75. $processed++;
  76. continue;
  77. }
  78. // 已经处理过的不重复处理
  79. if (in_array($record->status, ['pending', 'reviewed', 'accepted', 'rejected'], true) && $record->stem !== null) {
  80. $this->markParsed($record);
  81. continue;
  82. }
  83. $parsed = $parser->parseRawMarkdown((string) $record->raw_markdown, (int) $record->index);
  84. $meta = $record->meta ?? [];
  85. $meta['ai_parsed'] = true;
  86. $meta['ai_parsed_at'] = now()->toDateTimeString();
  87. // 结构化后立即回传图片
  88. $uploadedImages = [];
  89. if (!empty($parsed['images'])) {
  90. foreach ($parsed['images'] as $idx => $imgUrl) {
  91. $path = "imports/images/{$record->id}_{$idx}.jpg";
  92. $uploadedImages[] = $uploader->put($path, (string)@file_get_contents($imgUrl)) ?: $imgUrl;
  93. }
  94. }
  95. $meta['images_uploaded'] = !empty($uploadedImages);
  96. $record->update([
  97. 'stem' => $parsed['stem'] ?? null,
  98. 'options' => $parsed['options'] ?? null,
  99. 'images' => !empty($uploadedImages) ? $uploadedImages : $parsedImages,
  100. 'tables' => $parsed['tables'] ?? [],
  101. 'is_question_candidate' => (bool) ($parsed['is_question_candidate'] ?? false),
  102. 'ai_confidence' => $parsed['ai_confidence'] ?? null,
  103. 'status' => 'pending',
  104. 'meta' => $meta,
  105. ]);
  106. $processed++;
  107. } catch (\Throwable $e) {
  108. $failed++;
  109. Log::warning('Markdown batch item failed', [
  110. 'import_id' => $this->markdownImportId,
  111. 'candidate_id' => $record->id,
  112. 'sequence' => $record->sequence,
  113. 'index' => $record->index,
  114. 'error' => $e->getMessage(),
  115. ]);
  116. }
  117. }
  118. $this->refreshProgress();
  119. Log::info('Markdown batch finished', [
  120. 'import_id' => $this->markdownImportId,
  121. 'sequence_start' => $this->sequenceStart,
  122. 'sequence_end' => $this->sequenceEnd,
  123. 'processed' => $processed,
  124. 'failed' => $failed,
  125. ]);
  126. $this->finalizeIfDone();
  127. }
  128. private function finalizeIfDone(): void
  129. {
  130. $import = MarkdownImport::find($this->markdownImportId);
  131. if (!$import) {
  132. return;
  133. }
  134. $total = (int) ($import->progress_total ?? 0);
  135. $current = (int) ($import->progress_current ?? 0);
  136. if ($total <= 0 || $current < $total) {
  137. return;
  138. }
  139. // 只要有一个 batch 到达“完成条件”,就尝试做一次幂等的最终状态更新
  140. $updated = DB::table('markdown_imports')
  141. ->where('id', $this->markdownImportId)
  142. ->where('status', 'processing')
  143. ->update([
  144. 'status' => 'parsed',
  145. 'progress_stage' => MarkdownImport::STAGE_PARSED,
  146. 'progress_message' => '解析完成,等待人工校对',
  147. 'progress_updated_at' => now(),
  148. 'processing_finished_at' => now(),
  149. ]);
  150. if ($updated) {
  151. Log::info('Markdown import finalized', [
  152. 'import_id' => $this->markdownImportId,
  153. 'progress_total' => $total,
  154. 'progress_current' => $current,
  155. ]);
  156. }
  157. }
  158. private function refreshProgress(): void
  159. {
  160. $total = PreQuestionCandidate::query()
  161. ->where('import_id', $this->markdownImportId)
  162. ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED)
  163. ->count();
  164. $parsed = PreQuestionCandidate::query()
  165. ->where('import_id', $this->markdownImportId)
  166. ->where('status', '!=', PreQuestionCandidate::STATUS_SUPERSEDED)
  167. ->where(function ($query) {
  168. $query->whereRaw("JSON_UNQUOTE(JSON_EXTRACT(meta, '$.ai_parsed')) = 'true'")
  169. ->orWhereNotNull('stem')
  170. ->orWhereNotNull('ai_confidence')
  171. ->orWhereNotNull('confidence');
  172. })
  173. ->count();
  174. DB::table('markdown_imports')
  175. ->where('id', $this->markdownImportId)
  176. ->update([
  177. 'progress_total' => $total,
  178. 'progress_current' => min($parsed, $total),
  179. 'progress_updated_at' => now(),
  180. 'progress_stage' => MarkdownImport::STAGE_AI_PARSING,
  181. 'progress_message' => 'AI 解析中…',
  182. ]);
  183. }
  184. private function markParsed(PreQuestionCandidate $record): void
  185. {
  186. $meta = $record->meta ?? [];
  187. if (!empty($meta['ai_parsed'])) {
  188. return;
  189. }
  190. $meta['ai_parsed'] = true;
  191. $meta['ai_parsed_at'] = now()->toDateTimeString();
  192. $record->update(['meta' => $meta]);
  193. }
  194. /**
  195. * 轻量启发式判断是否像一道题目,过滤卷子/部分标题和说明文字。
  196. */
  197. private function isLikelyQuestion(string $raw): bool
  198. {
  199. $text = trim(strip_tags($raw));
  200. $length = mb_strlen($text);
  201. // Markdown 标题或“卷/部分/说明”且文本很短,视为非题
  202. if (preg_match('/^#+\\s+/m', $raw)) {
  203. return false;
  204. }
  205. if (preg_match('/(第[一二三四五六七八九十IVX]+[卷部分]|题型|说明|试卷)/u', $text) && $length <= 80) {
  206. return false;
  207. }
  208. // 过短且无问句/命令词/选项特征
  209. if ($length < 25 && !preg_match('/[\\??求解求证计算]|[A-D]\\.|(本小题满分/u', $text)) {
  210. return false;
  211. }
  212. return true;
  213. }
  214. }