AsyncMarkdownSplitter.php 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. <?php
  2. namespace App\Services;
  3. use Illuminate\Support\Facades\Log;
  /**
   * Splits a Markdown document into per-question chunks and reports on the result.
   *
   * A chunk starts at every line that begins with a question marker
   * (1-4 ASCII digits followed by an explicit separator) and runs up to the
   * next marker or the end of the document.
   */
  class AsyncMarkdownSplitter
  {
      /**
       * Question marker: optional leading whitespace, 1-4 digits, one explicit
       * separator ('.', ')', ']', '、' or '】'), then optional whitespace.
       *
       * "digits + whitespace" is deliberately NOT accepted: it would also split on
       * ordinary lists, numbered steps, years and similar text inside a question.
       *
       * The multi-byte separators are alternation branches rather than members of
       * the character class. Without the `u` modifier PCRE works on bytes, so a
       * multi-byte character inside [...] adds each of its bytes to the class and
       * the class would accept any character whose encoding starts with the same
       * lead byte (for example '。' or '《' right after the digits).
       */
      private const QUESTION_MARKER_PATTERN = '/^\s*(\d{1,4})(?:[.)\]]|、|】)\s*/m';

      /**
       * Split Markdown into an array of question candidates.
       *
       * Text that precedes the first question marker is not part of any candidate.
       * When no marker is found at all (or the regex engine reports an error), the
       * whole trimmed document is returned as a single candidate.
       *
       * @param string $markdown Raw Markdown text.
       * @return array List of candidates; each one has:
       *               - 'sequence'     (int)    1-based position of the chunk in the document,
       *               - 'index'        (int)    question number parsed from the marker,
       *               - 'raw_markdown' (string) trimmed chunk text including its marker.
       */
      public function split(string $markdown): array
      {
          $markerCount = preg_match_all(
              self::QUESTION_MARKER_PATTERN,
              $markdown,
              $matches,
              PREG_OFFSET_CAPTURE
          );

          // false signals a PCRE failure, 0 means no marker: keep the document as one chunk.
          if ($markerCount === false || $markerCount === 0) {
              return [
                  [
                      'sequence' => 1,
                      'index' => 1,
                      'raw_markdown' => trim($markdown),
                  ],
              ];
          }

          // PREG_OFFSET_CAPTURE reports byte offsets, so byte-based strlen/substr are the right tools.
          $documentLength = strlen($markdown);
          $candidates = [];

          for ($i = 0; $i < $markerCount; $i++) {
              $start = $matches[0][$i][1];
              $end = $i + 1 < $markerCount ? $matches[0][$i + 1][1] : $documentLength;

              $block = trim(substr($markdown, $start, $end - $start));
              if ($block === '') {
                  continue;
              }

              $candidates[] = [
                  // sequence is the in-document order; it stays unique even when
                  // the same question number occurs more than once.
                  'sequence' => $i + 1,
                  // Capture group 1 already holds the digits of the marker.
                  'index' => (int) $matches[1][$i][0],
                  'raw_markdown' => $block,
              ];
          }

          return $candidates;
      }

      /**
       * Validate a split result.
       *
       * Duplicate question numbers only produce a warning: they are normal when
       * several papers or chapters are merged into one document. A candidate
       * without text makes the whole result invalid.
       *
       * @param array $candidates Result of split().
       * @return bool False if any candidate has no 'raw_markdown' text, true otherwise.
       */
      public function validate(array $candidates): bool
      {
          $indexes = array_map(static fn ($item) => $item['index'], $candidates);
          $total = count($indexes);
          $uniqueCount = count(array_unique($indexes));

          // Log only the counts; dumping the full index list would flood the log.
          if ($total > 0 && $uniqueCount !== $total) {
              Log::warning('Duplicate question indexes detected', [
                  'total' => $total,
                  'unique' => $uniqueCount,
              ]);
          }

          foreach ($candidates as $candidate) {
              $raw = $candidate['raw_markdown'] ?? null;

              // Explicit comparison instead of empty(): empty('0') is true, which
              // would reject a chunk whose entire text is the single character "0".
              if (!is_string($raw) || $raw === '') {
                  Log::warning('Empty markdown content detected', [
                      'index' => $candidate['index'] ?? null,
                  ]);

                  return false;
              }
          }

          return true;
      }

      /**
       * Compute size statistics for a split result.
       *
       * Lengths are byte counts (strlen), not character counts.
       *
       * @param array $candidates Result of split().
       * @return array Map with 'total_candidates', 'avg_length' (rounded to two
       *               decimals; integer 0 when there are no candidates),
       *               'max_length' and 'min_length' (both 0 when there are no candidates).
       */
      public function getStatistics(array $candidates): array
      {
          $lengths = array_map(
              static fn ($candidate) => strlen($candidate['raw_markdown']),
              $candidates
          );
          $total = count($lengths);

          if ($total === 0) {
              return [
                  'total_candidates' => 0,
                  'avg_length' => 0,
                  'max_length' => 0,
                  'min_length' => 0,
              ];
          }

          return [
              'total_candidates' => $total,
              'avg_length' => round(array_sum($lengths) / $total, 2),
              'max_length' => max($lengths),
              'min_length' => min($lengths),
          ];
      }
  }