Iconv.php 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. <?php
  2. declare(strict_types=1);
  3. namespace Laminas\Stdlib\StringWrapper;
  4. use Laminas\Stdlib\Exception;
  5. use function assert;
  6. use function extension_loaded;
  7. use function iconv;
  8. use function iconv_strlen;
  9. use function iconv_strpos;
  10. use function iconv_substr;
  /**
   * String wrapper that delegates length, substring, position and conversion
   * operations to PHP's iconv extension.
   *
   * The working encodings are obtained from getEncoding() and
   * getConvertEncoding(), which this class inherits (they are not defined here).
   */
  class Iconv extends AbstractStringWrapper
  {
      /**
       * List of supported character sets (upper case)
       *
       * Every entry must be spelled in upper case, as stated above; keep that
       * invariant when adding new names.
       *
       * @link http://www.gnu.org/software/libiconv/
       *
       * @var string[]
       */
      protected static $encodings = [
          // European languages
          'ASCII',
          'ISO-8859-1',
          'ISO-8859-2',
          'ISO-8859-3',
          'ISO-8859-4',
          'ISO-8859-5',
          'ISO-8859-7',
          'ISO-8859-9',
          'ISO-8859-10',
          'ISO-8859-13',
          'ISO-8859-14',
          'ISO-8859-15',
          'ISO-8859-16',
          'KOI8-R',
          'KOI8-U',
          'KOI8-RU',
          'CP1250',
          'CP1251',
          'CP1252',
          'CP1253',
          'CP1254',
          'CP1257',
          'CP850',
          'CP866',
          'CP1131',
          'MACROMAN',
          'MACCENTRALEUROPE',
          'MACICELAND',
          'MACCROATIAN',
          'MACROMANIA',
          'MACCYRILLIC',
          'MACUKRAINE',
          'MACGREEK',
          'MACTURKISH',
          'MACINTOSH',

          // Semitic languages
          'ISO-8859-6',
          'ISO-8859-8',
          'CP1255',
          'CP1256',
          'CP862',
          'MACHEBREW',
          'MACARABIC',

          // Japanese
          'EUC-JP',
          'SHIFT_JIS',
          'CP932',
          'ISO-2022-JP',
          'ISO-2022-JP-2',
          'ISO-2022-JP-1',

          // Chinese
          'EUC-CN',
          'HZ',
          'GBK',
          'CP936',
          'GB18030',
          'EUC-TW',
          'BIG5',
          'CP950',
          'BIG5-HKSCS',
          'BIG5-HKSCS:2004',
          'BIG5-HKSCS:2001',
          'BIG5-HKSCS:1999',
          'ISO-2022-CN',
          'ISO-2022-CN-EXT',

          // Korean
          'EUC-KR',
          'CP949',
          'ISO-2022-KR',
          'JOHAB',

          // Armenian
          'ARMSCII-8',

          // Georgian
          'GEORGIAN-ACADEMY',
          'GEORGIAN-PS',

          // Tajik
          'KOI8-T',

          // Kazakh
          'PT154',
          'RK1048',

          // Thai
          'ISO-8859-11',
          'TIS-620',
          'CP874',
          'MACTHAI',

          // Laotian
          'MULELAO-1',
          'CP1133',

          // Vietnamese
          'VISCII',
          'TCVN',
          'CP1258',

          // Platform specifics
          'HP-ROMAN8',
          'NEXTSTEP',

          // Full Unicode
          'UTF-8',
          'UCS-2',
          'UCS-2BE',
          'UCS-2LE',
          'UCS-4',
          'UCS-4BE',
          'UCS-4LE',
          'UTF-16',
          'UTF-16BE',
          'UTF-16LE',
          'UTF-32',
          'UTF-32BE',
          'UTF-32LE',
          'UTF-7',
          'C99',
          'JAVA',

          /* Commented out because these are internal encodings that do not exist in the real world
          // Full Unicode, in terms of uint16_t or uint32_t (with machine dependent endianness and alignment)
          'UCS-2-INTERNAL',
          'UCS-4-INTERNAL',

          // Locale dependent, in terms of `char' or `wchar_t' (with machine dependent endianness and alignment,
          // and with OS and locale dependent semantics)
          'char',
          'wchar_t',
          '', // The empty encoding name is equivalent to "char": it denotes the locale dependent character encoding.
          */

          // When configured with the option --enable-extra-encodings,
          // it also provides support for a few extra encodings:

          // European languages
          'CP437',
          'CP737',
          'CP775',
          'CP852',
          'CP853',
          'CP855',
          'CP857',
          'CP858',
          'CP860',
          'CP861',
          'CP863',
          'CP865',
          'CP869',
          'CP1125',

          // Semitic languages
          'CP864',

          // Japanese
          'EUC-JISX0213',
          // Spelled in upper case like every other entry (was 'Shift_JISX0213'),
          // so the list honours its documented "upper case" invariant.
          'SHIFT_JISX0213',
          'ISO-2022-JP-3',

          // Chinese
          'BIG5-2003', // (experimental)

          // Turkmen
          'TDS565',

          // Platform specifics
          'ATARIST',
          'RISCOS-LATIN1',
      ];

      /**
       * Get a list of supported character encodings
       *
       * Uses late static binding, so a subclass that redeclares $encodings
       * reports its own list.
       *
       * @return string[]
       */
      public static function getSupportedEncodings()
      {
          return static::$encodings;
      }

      /**
       * Constructor
       *
       * Fails fast when the iconv extension is not available, because every
       * other method of this wrapper calls an iconv function.
       *
       * @throws Exception\ExtensionNotLoadedException If ext-iconv is not loaded.
       */
      public function __construct()
      {
          if (! extension_loaded('iconv')) {
              throw new Exception\ExtensionNotLoadedException(
                  'PHP extension "iconv" is required for this wrapper'
              );
          }
      }

      /**
       * Returns the length of the given string
       *
       * The length is whatever iconv_strlen() reports for the wrapper's
       * current encoding.
       *
       * @param string $str
       * @return int|false The result of iconv_strlen(); false signals failure.
       */
      public function strlen($str)
      {
          return iconv_strlen($str, $this->getEncoding());
      }

      /**
       * Returns the portion of string specified by the start and length parameters
       *
       * When $length is null, the full length of $str (as reported by strlen())
       * is used, so the result runs from $offset to the end of the string.
       *
       * @param string   $str
       * @param int      $offset
       * @param int|null $length
       * @return string|false The result of iconv_substr(), or false when the
       *                      length of $str could not be determined.
       */
      public function substr($str, $offset = 0, $length = null)
      {
          $length ??= $this->strlen($str);

          // strlen() may return false. Handle that explicitly instead of relying
          // on assert(): assertions can be disabled at runtime, and passing false
          // as the integer length under strict_types would raise a TypeError
          // rather than produce the documented "false" result.
          if ($length === false) {
              return false;
          }

          return iconv_substr($str, $offset, $length, $this->getEncoding());
      }

      /**
       * Find the position of the first occurrence of a substring in a string
       *
       * @param string $haystack
       * @param string $needle
       * @param int    $offset
       * @return int|false The result of iconv_strpos(); false when not found.
       */
      public function strpos($haystack, $needle, $offset = 0)
      {
          $encoding = $this->getEncoding();

          // Documents the expectation that an encoding has been configured.
          // NOTE(review): assert() is a no-op when assertions are disabled, so
          // this is not a runtime guard - confirm that is intended.
          assert($encoding !== null);

          return iconv_strpos($haystack, $needle, $offset, $encoding);
      }

      /**
       * Convert a string from defined encoding to the defined convert encoding
       *
       * With $reverse set, the direction is swapped: the string is converted
       * from the convert encoding back to the defined encoding.
       *
       * @param string $str
       * @param bool   $reverse
       * @return string|false The converted string; $str unchanged when both
       *                      encodings are identical or no encoding is set;
       *                      otherwise whatever iconv() returns.
       * @throws Exception\LogicException If no convert encoding is defined.
       */
      public function convert($str, $reverse = false)
      {
          $encoding        = $this->getEncoding();
          $convertEncoding = $this->getConvertEncoding();

          if ($convertEncoding === null) {
              throw new Exception\LogicException(
                  'No convert encoding defined'
              );
          }

          // Nothing to do when source and target encodings are the same.
          if ($encoding === $convertEncoding) {
              return $str;
          }

          $fromEncoding = $reverse ? $convertEncoding : $encoding;
          $toEncoding   = $reverse ? $encoding : $convertEncoding;

          // $convertEncoding is known to be non-null here, so this only triggers
          // when no encoding is set; the input is then passed through untouched.
          if (null === $toEncoding || null === $fromEncoding) {
              return $str;
          }

          // Automatically add "//IGNORE" so that conversion does not stop on
          // invalid characters; invalid characters trigger a notice anyway.
          return iconv($fromEncoding, $toEncoding . '//IGNORE', $str);
      }
  }
  263. }