TextEncoding.php 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. <?php
  2. namespace App\Support;
  class TextEncoding
  {
      /**
       * Normalize unknown-encoding text to valid UTF-8 for safe JSON/JS rendering.
       *
       * NUL bytes are removed first. Text that is already valid UTF-8 is returned
       * as is. Anything else is converted from a legacy encoding chosen by
       * mbstring detection when that extension is loaded, or by the first strict
       * iconv conversion that succeeds. If no conversion works, invalid byte
       * sequences are stripped instead.
       *
       * The returned string is always valid UTF-8, whichever of mbstring and
       * iconv are available at runtime.
       *
       * @param string $content Raw bytes in an unknown encoding.
       * @return string Valid UTF-8; empty when the input is empty or nothing
       *                usable remains after stripping.
       */
      public static function toUtf8(string $content): string
      {
          // Remove null bytes that can break downstream processing.
          $content = str_replace("\0", '', $content);
          if ($content === '' || self::isValidUtf8($content)) {
              return $content;
          }

          // Multi-byte encodings come first because they reject most foreign
          // byte sequences. ISO-8859-1 accepts every byte, so it has to be last;
          // Windows-1252 precedes it so that bytes 0x80-0x9F become printable
          // punctuation rather than C1 control characters.
          $legacyEncodings = [
              'GB18030',
              'GBK',
              'GB2312',
              'BIG5',
              'Windows-1252',
              'ISO-8859-1',
          ];

          $converted = self::convertWithMbstring($content, $legacyEncodings);
          if ($converted === '') {
              $converted = self::convertWithIconv($content, $legacyEncodings);
          }

          return $converted !== '' ? $converted : self::stripInvalidUtf8($content);
      }

      /**
       * Tell whether $text is well-formed UTF-8 without requiring mbstring.
       *
       * @param string $text Bytes to validate.
       * @return bool True when $text is valid UTF-8.
       */
      private static function isValidUtf8(string $text): bool
      {
          if (function_exists('mb_check_encoding')) {
              return mb_check_encoding($text, 'UTF-8');
          }

          // With the "u" modifier PCRE validates the subject as UTF-8, and
          // preg_match() returns false instead of 1 when it is malformed.
          return preg_match('//u', $text) === 1;
      }

      /**
       * Detect the source encoding with mbstring and convert to UTF-8.
       *
       * @param string   $content   Bytes that are not valid UTF-8.
       * @param string[] $encodings Candidate source encodings, in priority order.
       * @return string Converted text, or '' when mbstring is unavailable,
       *                detection fails, or the conversion is unusable.
       */
      private static function convertWithMbstring(string $content, array $encodings): string
      {
          if (!function_exists('mb_detect_encoding') || !function_exists('mb_convert_encoding')) {
              return '';
          }

          $detected = mb_detect_encoding($content, array_merge(['UTF-8'], $encodings), true);
          if (!is_string($detected) || $detected === '' || strtoupper($detected) === 'UTF-8') {
              return '';
          }

          $converted = @mb_convert_encoding($content, 'UTF-8', $detected);

          return is_string($converted) && self::isValidUtf8($converted) ? $converted : '';
      }

      /**
       * Convert to UTF-8 with iconv, using strict conversion as the detector.
       *
       * @param string   $content   Bytes that are not valid UTF-8.
       * @param string[] $encodings Candidate source encodings, in priority order.
       * @return string Converted text, or '' when iconv is unavailable or every
       *                candidate rejects the input.
       */
      private static function convertWithIconv(string $content, array $encodings): string
      {
          if (!function_exists('iconv')) {
              return '';
          }

          foreach ($encodings as $from) {
              // No //IGNORE here: a strict conversion fails on bytes that are
              // illegal in $from, so a wrong guess is rejected instead of being
              // accepted with data silently dropped. iconv() reports such
              // failures as notices/warnings, hence the suppression.
              $converted = @iconv($from, 'UTF-8', $content);
              if (is_string($converted) && $converted !== '' && self::isValidUtf8($converted)) {
                  return $converted;
              }
          }

          return '';
      }

      /**
       * Last resort: drop whatever prevents $content from being valid UTF-8.
       *
       * @param string $content Bytes that could not be converted.
       * @return string Valid UTF-8, possibly empty.
       */
      private static function stripInvalidUtf8(string $content): string
      {
          if (function_exists('iconv')) {
              $converted = @iconv('UTF-8', 'UTF-8//IGNORE', $content);
              if (is_string($converted) && self::isValidUtf8($converted)) {
                  return $converted;
              }
          }

          // iconv is missing or refused the input: keep only the ASCII bytes,
          // which are valid UTF-8 on their own, rather than returning nothing.
          return (string) preg_replace('/[^\x00-\x7F]/', '', $content);
      }
  }