yms
/
math_cms


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
							<?php

namespace App\Services;

use Illuminate\Support\Facades\Log;

/**
 * Splits a Markdown document into per-question candidate blocks.
 *
 * Split points are either a line-leading question number followed by an explicit
 * delimiter ("1.", "2、", "3)", "4】", ...) or an HTML comment marker of the form
 * "<!-- question: N -->". Each candidate spans from one split point up to (but not
 * including) the next one, or to the end of the document for the last one.
 *
 * Note: text located before the first split point is not part of any candidate.
 */
class AsyncMarkdownSplitter
{
    /**
     * Line-leading question number: 1-4 digits followed by exactly one delimiter.
     *
     * The delimiters are written as an alternation instead of a character class on
     * purpose. The pattern runs in byte mode (no "u" modifier), where a character
     * class containing multi-byte characters degrades into a set of single bytes and
     * would accept any character sharing a lead byte with a delimiter (for example
     * "2（" or "1。"). An alternation of literal byte sequences matches the intended
     * delimiters only, and keeps working on input that is not valid UTF-8.
     */
    private const NUMBERED_PATTERN = '/^\s*(\d{1,4})(?:\.|．|、|\)|）|\]|】)\s*/m';

    /** Explicit marker comment, e.g. "<!-- question: 12 -->" (case-insensitive). */
    private const COMMENT_PATTERN = '/<!--\s*question:\s*(\d+)\s*-->/i';

    /**
     * Split Markdown into an array of question candidates.
     *
     * When no split point is found, the whole (trimmed) document is returned as a
     * single candidate with sequence 1 and index 1.
     *
     * @param string $markdown Raw Markdown text.
     * @return array<int, array{sequence:int, index:int, raw_markdown:string}>
     *         Candidates in document order. "sequence" is the 1-based position of the
     *         split point, "index" is the question number parsed from the text, and
     *         "raw_markdown" is the trimmed block including its number/marker.
     */
    public function split(string $markdown): array
    {
        $splitPoints = array_merge(
            $this->findSplitPoints(self::NUMBERED_PATTERN, $markdown),
            $this->findSplitPoints(self::COMMENT_PATTERN, $markdown)
        );

        if ($splitPoints === []) {
            // No question number found: treat the whole document as one block.
            // Same shape as regular candidates so callers can rely on every key.
            return [
                [
                    'sequence' => 1,
                    'index' => 1,
                    'raw_markdown' => trim($markdown),
                ],
            ];
        }

        // Both pattern results are merged, so restore document order by byte offset.
        usort($splitPoints, static fn ($a, $b) => $a['pos'] <=> $b['pos']);

        $candidates = [];
        $pointCount = count($splitPoints);
        $documentLength = strlen($markdown);

        foreach ($splitPoints as $i => $point) {
            // Offsets come from PREG_OFFSET_CAPTURE and are byte offsets, hence the
            // byte-oriented strlen()/substr() here.
            $end = $i + 1 < $pointCount ? $splitPoints[$i + 1]['pos'] : $documentLength;
            $block = trim(substr($markdown, $point['pos'], $end - $point['pos']));

            if ($block !== '') {
                $candidates[] = [
                    'sequence' => $i + 1,
                    'index' => $point['index'],
                    'raw_markdown' => $block,
                ];
            }
        }

        return $candidates;
    }

    /**
     * Validate a split result.
     *
     * Duplicate question indexes are only logged, not rejected: they are expected
     * when several papers/chapters are merged into one document. Only the counts are
     * logged to avoid flooding the log with a long index list.
     *
     * @param array $candidates Split result as returned by split().
     * @return bool False when any candidate has missing or empty "raw_markdown", true otherwise.
     */
    public function validate(array $candidates): bool
    {
        $indexes = array_map(static fn ($item) => $item['index'] ?? null, $candidates);
        $total = count($indexes);
        $uniqueCount = count(array_unique($indexes));

        if ($total > 0 && $uniqueCount !== $total) {
            Log::warning('Duplicate question indexes detected', [
                'total' => $total,
                'unique' => $uniqueCount,
            ]);
        }

        foreach ($candidates as $candidate) {
            $raw = $candidate['raw_markdown'] ?? null;

            // Strict check instead of empty(): a block whose text is "0" is real content.
            if (!is_string($raw) || $raw === '') {
                Log::warning('Empty markdown content detected', [
                    'index' => $candidate['index'] ?? null,
                ]);

                return false;
            }
        }

        return true;
    }

    /**
     * Compute size statistics for a split result.
     *
     * Lengths are measured with strlen(), i.e. in bytes, not characters.
     *
     * @param array $candidates Split result as returned by split().
     * @return array{total_candidates:int, avg_length:int|float, max_length:int, min_length:int}
     *         All values are 0 when $candidates is empty; otherwise avg_length is
     *         rounded to two decimals.
     */
    public function getStatistics(array $candidates): array
    {
        $lengths = array_map(
            static fn ($candidate) => strlen($candidate['raw_markdown']),
            $candidates
        );
        $total = count($lengths);

        if ($total === 0) {
            return [
                'total_candidates' => 0,
                'avg_length' => 0,
                'max_length' => 0,
                'min_length' => 0,
            ];
        }

        return [
            'total_candidates' => $total,
            'avg_length' => round(array_sum($lengths) / $total, 2),
            'max_length' => max($lengths),
            'min_length' => min($lengths),
        ];
    }

    /**
     * Collect the split points produced by one pattern.
     *
     * @param string $pattern  PCRE pattern whose first capture group is the question number.
     * @param string $markdown Text to scan.
     * @return array<int, array{pos:int, index:int}> Byte offset of each whole match and
     *         the captured number; empty when nothing matched or the match failed.
     */
    private function findSplitPoints(string $pattern, string $markdown): array
    {
        $found = preg_match_all($pattern, $markdown, $matches, PREG_OFFSET_CAPTURE);

        if ($found === false) {
            // PCRE failure (e.g. backtrack limit): report it and behave as "no match"
            // instead of iterating over an undefined match set.
            Log::warning('Question split pattern failed', [
                'preg_error' => preg_last_error(),
            ]);

            return [];
        }

        if ($found === 0) {
            return [];
        }

        $points = [];
        foreach ($matches[1] as $i => $numberMatch) {
            $points[] = [
                'pos' => $matches[0][$i][1],
                'index' => (int) $numberMatch[0],
            ];
        }

        return $points;
    }
}