| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231 |
- <?php
- namespace App\Services;
/**
 * LaTeX cleaning service.
 *
 * Repairs common mistakes in LaTeX produced by OCR recognition. Intended to be
 * run as a pre-processing step before the text is stored in the database.
 *
 * All patterns are ASCII-only and run in byte mode, so multi-byte UTF-8 text
 * around the formulas is passed through untouched.
 */
class LatexCleanerService
{
    /**
     * Clean a LaTeX string.
     *
     * Runs, in order: HTML cleanup, whitespace normalisation, "$" delimiter
     * cleanup, command repair and brace balancing, then trims the result.
     *
     * @param string $latex Raw LaTeX text
     * @return string Cleaned LaTeX text
     */
    public function clean(string $latex): string
    {
        // Explicit comparison instead of empty(): "0" is a legitimate answer.
        if ($latex === '') {
            return $latex;
        }

        $latex = $this->basicCleanup($latex);
        $latex = $this->normalizeWhitespace($latex);
        $latex = $this->cleanDelimiters($latex);
        $latex = $this->fixCommonCommands($latex);
        $latex = $this->fixBraces($latex);

        return trim($latex);
    }

    /**
     * Basic cleanup: decode HTML entities and remove HTML markup.
     *
     * @param string $latex Text to clean
     * @return string Text without HTML entities, tags or comments
     */
    protected function basicCleanup(string $latex): string
    {
        // Decode until stable so multiply-encoded entities ("&amp;lt;") resolve fully.
        do {
            $previous = $latex;
            $latex = html_entity_decode($previous, ENT_QUOTES, 'UTF-8');
        } while ($latex !== $previous);

        // strip_tags() is deliberately not used: it treats every "<" that is not
        // followed by whitespace as the start of a tag, so "x<5" would be cut
        // down to "x". Only text that is actually shaped like a tag
        // (<name>, </name>, <name attr=...>, <name/>) or an HTML comment is removed.
        return $this->replace(
            '~<!--.*?-->|</?[a-zA-Z][a-zA-Z0-9]*(?:\s+[a-zA-Z_:][^<>]*)?\s*/{0,1}>~s',
            '',
            $latex
        );
    }

    /**
     * Whitespace normalisation for typical OCR spacing problems.
     *
     * @param string $latex Text to normalise
     * @return string Text with redundant whitespace removed or collapsed
     */
    protected function normalizeWhitespace(string $latex): string
    {
        // Applied top to bottom; the order matters.
        $rules = [
            // Space between a command and its argument: "\frac {" -> "\frac{"
            '/\\\\([a-zA-Z]+)\s+\{/' => '\\\\$1{',
            // Padding inside braces: "{ 1 }" -> "{1}"
            '/\{\s+/' => '{',
            '/\s+\}/' => '}',
            // Space between consecutive groups: "} {" -> "}{"
            '/\}\s+\{/' => '}{',
            // Space around braced super/subscripts: "x ^ { a" -> "x^{a"
            '/\s*\^\s*\{\s*/' => '^{',
            '/\s*_\s*\{\s*/' => '_{',
            // Space after \left / \right: "\left (" -> "\left("
            '/\\\\(left|right)\s+/' => '\\\\$1',
            // Padding inside parentheses: "( x )" -> "(x)"
            '/\(\s+/' => '(',
            '/\s+\)/' => ')',
            // Any remaining whitespace run becomes a single space
            '/\s+/' => ' ',
        ];

        foreach ($rules as $pattern => $replacement) {
            $latex = $this->replace($pattern, $replacement, $latex);
        }

        return $latex;
    }

    /**
     * Clean up wrong "$" delimiters, a common OCR error.
     *
     * An escaped dollar sign ("\$") is literal text and is never touched.
     *
     * @param string $latex Text to clean
     * @return string Text with stray "$" delimiters removed
     */
    protected function cleanDelimiters(string $latex): string
    {
        // 1. Remove every unescaped "$" inside a brace group: "{a$b$}" -> "{ab}".
        //    An escaped "\{" does not open a group.
        $withoutInner = preg_replace_callback(
            '/(?<!\\\\)\{[^}]*\}/',
            function (array $match): string {
                return $this->removeUnescapedDollars($match[0]);
            },
            $latex
        );
        $latex = is_string($withoutInner) ? $withoutInner : $latex;

        // 2. Strip the leading/trailing "$" runs only when nothing else in the
        //    text is delimited ("$x^2$" -> "x^2", "x$$$" -> "x"). With several
        //    formulas ("$a$ and $b$") stripping the outer pair would swap what
        //    is math and what is text, so the delimiters are kept.
        $bare = $this->replace('/^\s*\$+|(?<!\\\\)\$+\s*$/', '', $latex);
        if ($this->countUnescaped($bare, '$') === 0) {
            $latex = $bare;
        }

        // 3. Runs of three or more "$" collapse to "$$".
        $latex = $this->replace('/\$\$\$+/', '$$', $latex);

        // 4. A single remaining "$" cannot delimit anything; treat it as OCR noise.
        if ($this->countUnescaped($latex, '$') === 1) {
            $latex = $this->removeUnescapedDollars($latex);
        }

        return $latex;
    }

    /**
     * Repair common LaTeX commands.
     *
     * Adds the missing backslash to bare command names and undoes multiple
     * escaping ("\\frac" -> "\frac") without destroying "\\" line breaks.
     *
     * @param string $latex Text to repair
     * @return string Text with repaired commands
     */
    protected function fixCommonCommands(string $latex): string
    {
        // Known command names.
        $commands = [
            'frac', 'sqrt', 'sum', 'int', 'lim', 'prod',
            'sin', 'cos', 'tan', 'log', 'ln', 'exp',
            'alpha', 'beta', 'gamma', 'delta', 'theta', 'pi', 'sigma', 'omega',
            'leq', 'geq', 'neq', 'approx', 'infty', 'partial',
            'times', 'div', 'pm', 'mp', 'cdot',
            'left', 'right', 'big', 'Big', 'bigg', 'Bigg'
        ];

        $names = implode('|', array_map(function (string $command): string {
            return preg_quote($command, '/');
        }, $commands));

        // Add a backslash to whole-word command names that do not have one yet.
        // NOTE: this is a heuristic; ordinary words such as "left" or "sum" in
        // prose are prefixed as well.
        $latex = $this->replace('/(?<!\\\\)\b(?:' . $names . ')\b/', '\\\\$0', $latex);

        // Undo multiple escaping: an even-length run of backslashes directly in
        // front of a known command becomes a single backslash. Every other run
        // is left alone, because "\\" is the LaTeX line break: "x=1\\y=2" and
        // "a\\\frac{1}{2}" (line break followed by \frac) must survive.
        $latex = $this->replace(
            '/(?<!\\\\)(?:\\\\\\\\)+(?=(?:' . $names . ')(?![a-zA-Z]))/',
            '\\\\',
            $latex
        );

        return $latex;
    }

    /**
     * Repair unbalanced braces.
     *
     * Only grouping braces are counted; the escaped literals "\{" and "\}"
     * (as in "\left\{ ... \right.") are ignored.
     *
     * @param string $latex Text to repair
     * @return string Text with missing "}" appended or surplus trailing "}" removed
     */
    protected function fixBraces(string $latex): string
    {
        $openCount = $this->countUnescaped($latex, '{');
        $closeCount = $this->countUnescaped($latex, '}');

        if ($openCount > $closeCount) {
            // Closing braces are missing: append them at the end.
            return $latex . str_repeat('}', $openCount - $closeCount);
        }

        // Surplus closing braces: drop them from the end, one per iteration.
        // If the text does not end with an unescaped "}" nothing is removed.
        for ($surplus = $closeCount - $openCount; $surplus > 0; $surplus--) {
            $latex = $this->replace('/(?<!\\\\)\}\s*$/', '', $latex, 1);
        }

        return $latex;
    }

    /**
     * Clean selected string fields in a list of records.
     *
     * Items that are not arrays, and fields that are missing or not strings,
     * are left untouched.
     *
     * @param array $texts Records to clean
     * @param array $keys  Names of the fields to clean
     * @return array The records with the selected fields cleaned
     */
    public function cleanArray(array $texts, array $keys = ['content', 'question_text', 'student_answer', 'answer']): array
    {
        foreach ($texts as &$item) {
            if (!is_array($item)) {
                continue;
            }

            foreach ($keys as $key) {
                if (isset($item[$key]) && is_string($item[$key])) {
                    $item[$key] = $this->clean($item[$key]);
                }
            }
        }
        // Break the reference so a later write to $item cannot alter $texts.
        unset($item);

        return $texts;
    }

    /**
     * Check whether cleaned LaTeX looks structurally valid.
     *
     * Delimiters are compared by count only. Escaped delimiters ("\{", "\$",
     * "\(" ...) are ignored, and a half-open interval such as "[0, 1)" or
     * "(0, 1]" is not reported as a mismatch.
     *
     * @param string $latex Cleaned LaTeX
     * @return array ['valid' => bool, 'errors' => array]
     */
    public function validate(string $latex): array
    {
        $errors = [];
        $plain = $this->stripEscapes($latex);

        if (substr_count($plain, '{') !== substr_count($plain, '}')) {
            $errors[] = '花括号不匹配';
        }

        $roundDiff = substr_count($plain, '(') - substr_count($plain, ')');
        $squareDiff = substr_count($plain, '[') - substr_count($plain, ']');
        // Round and square imbalances that cancel out come from half-open intervals.
        $halfOpenInterval = $roundDiff !== 0 && $roundDiff + $squareDiff === 0;

        if ($roundDiff !== 0 && !$halfOpenInterval) {
            $errors[] = '圆括号不匹配';
        }

        if ($squareDiff !== 0 && !$halfOpenInterval) {
            $errors[] = '方括号不匹配';
        }

        // Math delimiters must come in pairs.
        if (substr_count($plain, '$') % 2 !== 0) {
            $errors[] = '$ 定界符不匹配';
        }

        return [
            'valid' => empty($errors),
            'errors' => $errors
        ];
    }

    /**
     * preg_replace() that never loses the subject: when PCRE reports an error
     * (null result) the text is returned unchanged.
     */
    private function replace(string $pattern, string $replacement, string $subject, int $limit = -1): string
    {
        $result = preg_replace($pattern, $replacement, $subject, $limit);

        return is_string($result) ? $result : $subject;
    }

    /**
     * Remove every backslash together with the character it escapes, so that
     * only unescaped characters remain. Used for counting only.
     */
    private function stripEscapes(string $latex): string
    {
        return $this->replace('/\\\\./s', '', $latex);
    }

    /**
     * Count the occurrences of $char that are not escaped by a backslash.
     */
    private function countUnescaped(string $latex, string $char): int
    {
        return substr_count($this->stripEscapes($latex), $char);
    }

    /**
     * Remove every "$" that is not escaped by a backslash.
     */
    private function removeUnescapedDollars(string $latex): string
    {
        // Escape pairs are matched first and put back unchanged via $1.
        return $this->replace('/(\\\\.)|\$/s', '$1', $latex);
    }
}
|