BlankPlaceholderRenderer.php 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. <?php
  2. namespace App\Support;
  3. class BlankPlaceholderRenderer
  4. {
  5. private const DEFAULT_BLANK_SPAN = '<span style="display:inline-block; min-width:80px; border-bottom:1.2px dashed #444; vertical-align:bottom;">&nbsp;</span>';
  6. // 仅匹配“空白占位”型 underline,不匹配 \underline{\frac{...}} 这类有内容公式下划线
  7. private const BLANK_UNDERLINE_PATTERN = '/\\\\+underline\{\s*(?:(?:\\\\+qquad+|\\\\+quad+|\\\\+hspace\{[^{}]*\}|\\\\+hphantom\{\s*(?:(?:\\\\+qquad+|\\\\+quad+|\\\\+hspace\{[^{}]*\}|_{2,}|&nbsp;|&#160;|\s| |\\\\+\s+)*)\s*\}|_{2,}|&nbsp;|&#160;|\s| |\\\\+\s+)*)\s*\}/u';
  8. /**
  9. * 将题干中的空括号/下划线/部分异常占位符统一替换为标准空位样式。
  10. *
  11. * @return array{0:string,1:bool} [renderedContent, replacedAnyPlaceholder]
  12. */
  13. public static function replaceToBlankSpan(
  14. string $content,
  15. ?string $blankSpan = null,
  16. bool $collapseAdjacentBlanks = false,
  17. bool $normalizeChineseTerminalPeriod = true
  18. ): array
  19. {
  20. $blankSpan = $blankSpan ?: self::DEFAULT_BLANK_SPAN;
  21. $renderedContent = $content;
  22. $latexPlaceholders = [];
  23. $counter = 0;
  24. $renderedContent = preg_replace_callback('/\$(?:[^\$]|\\\\.)*\$/u', function ($matches) use (&$latexPlaceholders, &$counter, $blankSpan) {
  25. $latexContent = $matches[0];
  26. $inner = mb_substr($latexContent, 1, mb_strlen($latexContent) - 2);
  27. // 数学环境内也可能包含填空占位符(如 $\\underline{\\qquad}$ / $\\angle A=\\underline{\\quad}$)
  28. $blankToken = '<<<BLANK_IN_MATH_'.$counter.'>>>';
  29. $innerWithBlanks = preg_replace(
  30. [
  31. self::BLANK_UNDERLINE_PATTERN,
  32. '/\\\\+qquad+/u',
  33. '/\\\\+quad+/u',
  34. '/[((](?:\s|&nbsp;|&#160;| )*[))]/u',
  35. '/_{2,}/u',
  36. ],
  37. $blankToken,
  38. $inner,
  39. -1,
  40. $blankCount
  41. );
  42. if ($blankCount > 0) {
  43. $parts = explode($blankToken, $innerWithBlanks);
  44. $rebuilt = '';
  45. $lastIndex = count($parts) - 1;
  46. foreach ($parts as $index => $part) {
  47. if ($part !== '') {
  48. // 纯标点不再包进数学环境,避免生成 "$.$" 这类尾部格式。
  49. if (preg_match('/^[\..。]$/u', $part)) {
  50. $rebuilt .= $part;
  51. } else {
  52. $rebuilt .= htmlspecialchars('$'.$part.'$', ENT_QUOTES | ENT_HTML5, 'UTF-8');
  53. }
  54. }
  55. if ($index < $lastIndex) {
  56. $rebuilt .= $blankSpan;
  57. }
  58. }
  59. return $rebuilt === '' ? $blankSpan : $rebuilt;
  60. }
  61. $placeholder = '<<<LATEX_BLANK_'.$counter.'>>>';
  62. $latexPlaceholders[$placeholder] = $latexContent;
  63. $counter++;
  64. return $placeholder;
  65. }, $renderedContent);
  66. // 兼容常见空位写法:\underline{...}、\qquad、空括号(含 nbsp 等空白)、连续下划线、尾部 \\$
  67. $patterns = [
  68. self::BLANK_UNDERLINE_PATTERN,
  69. '/\\\\+qquad+/u',
  70. '/[((](?:\s|&nbsp;|&#160;| )*[))]/u',
  71. '/_{2,}/u',
  72. '/\\\\+\$(?=\s*$)/u',
  73. ];
  74. $renderedContent = preg_replace($patterns, $blankSpan, $renderedContent);
  75. if ($collapseAdjacentBlanks) {
  76. $quotedBlankSpan = preg_quote($blankSpan, '/');
  77. $renderedContent = preg_replace('/(?:'.$quotedBlankSpan.'(?:\s|&nbsp;|&#160;| )*){2,}/u', $blankSpan, $renderedContent);
  78. }
  79. // 兼容脏数据:空位后紧跟孤立 "$" 且位于句尾(如 "...=____$."),移除该孤立 "$"。
  80. // 仅作用在“标准空位 + 句尾”场景,不影响正常数学公式分隔符。
  81. $quotedBlankSpan = preg_quote($blankSpan, '/');
  82. $renderedContent = preg_replace(
  83. '/('.$quotedBlankSpan.')\s*\$(?=\s*[\..。]?(?:\s*(?:(?:<\/[^>]+>|<[^>]+\/>)\s*)*)$)/u',
  84. '$1',
  85. $renderedContent
  86. ) ?? $renderedContent;
  87. foreach ($latexPlaceholders as $placeholder => $latexContent) {
  88. if (preg_match('/^\$(.*?)(\\\\+)\$$/u', $latexContent, $match)) {
  89. $inner = rtrim($match[1]);
  90. if ($inner === '' || preg_match('/[=::]\s*$/u', $inner)) {
  91. if ($inner === '') {
  92. $replacement = $blankSpan;
  93. } else {
  94. $replacement = htmlspecialchars('$'.$inner.'$', ENT_QUOTES | ENT_HTML5, 'UTF-8').' '.$blankSpan;
  95. }
  96. $renderedContent = str_replace($placeholder, $replacement, $renderedContent);
  97. continue;
  98. }
  99. }
  100. $encodedLatex = htmlspecialchars($latexContent, ENT_QUOTES | ENT_HTML5, 'UTF-8');
  101. $renderedContent = str_replace($placeholder, $encodedLatex, $renderedContent);
  102. }
  103. if ($normalizeChineseTerminalPeriod) {
  104. $renderedContent = self::normalizeChineseTerminalPeriod($renderedContent);
  105. }
  106. return [$renderedContent, $renderedContent !== $content];
  107. }
  108. public static function defaultBlankSpan(): string
  109. {
  110. return self::DEFAULT_BLANK_SPAN;
  111. }
  112. /**
  113. * 统一句尾标点(仅处理句尾,不影响中间小数/表达式)
  114. *
  115. * $mode:
  116. * - remove: 去掉句尾句号
  117. * - dot: 句尾统一为英文实心点 "."
  118. * - cn: 句尾统一为中文句号 "。"
  119. */
  120. public static function normalizeTerminalPunctuation(string $content, string $mode): string
  121. {
  122. $replacement = match ($mode) {
  123. 'remove' => '',
  124. 'dot' => '.',
  125. 'cn' => '。',
  126. default => null,
  127. };
  128. if ($replacement === null) {
  129. return $content;
  130. }
  131. // 仅处理句尾最后一个标点(允许句尾带 HTML 标签,如 <image .../>)。
  132. // 1) 先处理数学片段尾点(如 "$.$" / "$。$" / "$.$")。
  133. if (preg_match('/^(.*)\$\s*[\..。]\s*\$(\s*(?:(?:<\/[^>]+>|<[^>]+\/>)\s*)*)$/us', $content, $m)) {
  134. return $m[1].$replacement.$m[2];
  135. }
  136. // 2) 再处理普通句尾点(只替换最后一个,不影响中间文本)。
  137. if (preg_match('/^(.*?)([\..。])(\s*(?:(?:<\/[^>]+>|<[^>]+\/>)\s*)*)$/us', $content, $m)) {
  138. return $m[1].$replacement.$m[3];
  139. }
  140. return $content;
  141. }
  142. /**
  143. * 仅当句尾不存在句号类标点时,追加目标标点。
  144. * 不会覆盖已存在的句尾标点,也不处理正文中间内容。
  145. */
  146. public static function appendTerminalPunctuationIfMissing(string $content, string $punctuation): string
  147. {
  148. if ($punctuation === '') {
  149. return $content;
  150. }
  151. // 句尾若已有终止符号(中英文句号/问号/叹号/分号/冒号),则不再追加
  152. if (preg_match('/[\..。!!\??;;::](\s*(?:(?:<\/[^>]+>|<[^>]+\/>)\s*)*)$/us', $content)) {
  153. return $content;
  154. }
  155. return rtrim($content).$punctuation;
  156. }
  157. private static function normalizeChineseTerminalPeriod(string $content): string
  158. {
  159. // 仅在存在中文语境时,把句末英文句号统一为中文句号。
  160. if (! preg_match('/\p{Han}/u', $content)) {
  161. return $content;
  162. }
  163. return self::normalizeTerminalPunctuation($content, 'cn');
  164. }
  165. }