AsyncMarkdownSplitter.php 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. <?php
  2. namespace App\Services;
  3. use Illuminate\Support\Facades\Log;
  /**
   * Splits a Markdown document into per-question chunks and offers light
   * validation / statistics helpers for the resulting candidate list.
   */
  class AsyncMarkdownSplitter
  {
      /**
       * Line-anchored question number: 1-4 ASCII digits followed by exactly one
       * explicit separator.
       *
       * The pattern runs in byte mode (no "u" modifier), so the multi-byte
       * separators are spelled out as complete UTF-8 byte sequences in an
       * alternation. Putting them inside a character class would make PCRE treat
       * each of their bytes as an individual separator, so that digits followed
       * by any character starting with byte 0xE3 would count as a question number.
       *
       * Accepted separators:
       *   .  )  ]            ASCII
       *   \xE3\x80\x81       U+3001 ideographic comma
       *   \xE3\x80\x91       U+3011 right black lenticular bracket
       *   \xEF\xBC\x8E       U+FF0E full-width full stop
       *   \xEF\xBC\x89       U+FF09 full-width right parenthesis
       *
       * NOTE(review): the two full-width forms are assumed to be intended as the
       * CJK counterparts of "." and ")" - confirm against real input.
       */
      private const NUMBER_PATTERN =
          '/^\s*(\d{1,4})(?:[.)\]]|\xE3\x80\x81|\xE3\x80\x91|\xEF\xBC\x8E|\xEF\xBC\x89)\s*/m';

      /** Explicit marker comment, e.g. "<!-- question: 12 -->" (case-insensitive). */
      private const COMMENT_PATTERN = '/<!--\s*question:\s*(\d+)\s*-->/i';

      /**
       * Split Markdown into an array of question candidates.
       *
       * Split points are numbered line starts and question marker comments. Each
       * candidate runs from its marker up to the next marker (or the end of the
       * document); any text before the first marker is not part of any candidate.
       * When no marker is found, the whole trimmed document is returned as a
       * single candidate with index 1.
       *
       * @param string $markdown Raw Markdown text.
       * @return array<int, array{sequence: int, index: int, raw_markdown: string}>
       *               Candidates in document order. "sequence" is the 1-based
       *               position of the marker, "index" the number parsed from it.
       */
      public function split(string $markdown): array
      {
          $markers = array_merge(
              $this->findMarkers(self::NUMBER_PATTERN, $markdown),
              $this->findMarkers(self::COMMENT_PATTERN, $markdown)
          );

          if ($markers === []) {
              // No question number found: treat the whole document as one chunk.
              // Same shape as regular candidates so consumers can rely on "sequence".
              return [
                  [
                      'sequence' => 1,
                      'index' => 1,
                      'raw_markdown' => trim($markdown),
                  ],
              ];
          }

          // Interleave both marker kinds in document order.
          usort($markers, static fn($a, $b) => $a['pos'] <=> $b['pos']);

          $candidates = [];
          $markerCount = count($markers);
          $documentEnd = strlen($markdown);

          foreach ($markers as $i => $marker) {
              // Offsets are byte offsets, so byte-based substr() is the right tool here.
              $end = $i + 1 < $markerCount ? $markers[$i + 1]['pos'] : $documentEnd;
              $block = trim(substr($markdown, $marker['pos'], $end - $marker['pos']));

              if ($block !== '') {
                  $candidates[] = [
                      'sequence' => $i + 1,
                      'index' => $marker['index'],
                      'raw_markdown' => $block,
                  ];
              }
          }

          return $candidates;
      }

      /**
       * Validate a split result.
       *
       * Duplicate question numbers are normal when several papers or chapters
       * are merged, so they only produce a compact warning (counts only, not the
       * full index list). A candidate without textual content makes the result
       * invalid.
       *
       * @param array $candidates Split result as produced by split().
       * @return bool False as soon as one candidate has blank content, true otherwise.
       */
      public function validate(array $candidates): bool
      {
          $indexes = array_map(static fn($item) => $item['index'], $candidates);
          $total = count($indexes);
          $uniqueCount = count(array_unique($indexes));

          if ($total > 0 && $uniqueCount !== $total) {
              Log::warning('Duplicate question indexes detected', [
                  'total' => $total,
                  'unique' => $uniqueCount,
              ]);
          }

          foreach ($candidates as $candidate) {
              $content = $candidate['raw_markdown'] ?? '';

              // Explicit blank check instead of empty(): empty('0') is true, which
              // would reject a candidate whose content is the literal string "0".
              if (!is_string($content) || trim($content) === '') {
                  Log::warning('Empty markdown content detected', [
                      'index' => $candidate['index'] ?? null,
                  ]);

                  return false;
              }
          }

          return true;
      }

      /**
       * Compute size statistics for a split result.
       *
       * Lengths are measured with strlen(), i.e. in bytes, not characters.
       *
       * @param array $candidates Split result as produced by split().
       * @return array{total_candidates: int, avg_length: int|float, max_length: int, min_length: int}
       *               All values are 0 for an empty candidate list; otherwise
       *               avg_length is rounded to two decimals.
       */
      public function getStatistics(array $candidates): array
      {
          $lengths = array_map(
              static fn($candidate) => strlen($candidate['raw_markdown']),
              $candidates
          );
          $total = count($lengths);

          if ($total === 0) {
              return [
                  'total_candidates' => 0,
                  'avg_length' => 0,
                  'max_length' => 0,
                  'min_length' => 0,
              ];
          }

          return [
              'total_candidates' => $total,
              'avg_length' => round(array_sum($lengths) / $total, 2),
              'max_length' => max($lengths),
              'min_length' => min($lengths),
          ];
      }

      /**
       * Run one marker pattern over the document.
       *
       * @param string $pattern  PCRE pattern whose first capture group is the question number.
       * @param string $markdown Text to scan.
       * @return array<int, array{pos: int, index: int}> Byte offset of each full match
       *               and the parsed number; empty when nothing matched or the
       *               regex engine reported an error.
       */
      private function findMarkers(string $pattern, string $markdown): array
      {
          $found = preg_match_all($pattern, $markdown, $matches, PREG_OFFSET_CAPTURE);
          if ($found === false || $found === 0) {
              return [];
          }

          $markers = [];
          foreach ($matches[1] as $n => $numberCapture) {
              $markers[] = [
                  'pos' => $matches[0][$n][1],
                  'index' => (int) $numberCapture[0],
              ];
          }

          return $markers;
      }
  }