| [ Index ] |
PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008] |
[Summary view] [Print] [Text view]
1 <?php 2 /*************************************************************** 3 * Copyright notice 4 * 5 * (c) 2003-2006 Kasper Skaarhoj (kasperYYYY@typo3.com) 6 * All rights reserved 7 * 8 * This script is part of the Typo3 project. The Typo3 project is 9 * free software; you can redistribute it and/or modify 10 * it under the terms of the GNU General Public License as published by 11 * the Free Software Foundation; either version 2 of the License, or 12 * (at your option) any later version. 13 * 14 * The GNU General Public License can be found at 15 * http://www.gnu.org/copyleft/gpl.html. 16 * 17 * This script is distributed in the hope that it will be useful, 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 * GNU General Public License for more details. 21 * 22 * This copyright notice MUST APPEAR in all copies of the script! 23 ***************************************************************/ 24 /** 25 * Class for conversion between charsets. 
26 * 27 * Typo Id: class.t3lib_cs.php,v 1.56 2006/05/03 08:47:30 masi Exp $ 28 * Moodle $Id: class.t3lib_cs.php,v 1.7 2006/08/11 09:48:35 stronk7 Exp $ 29 * 30 * @author Kasper Skaarhoj <kasperYYYY@typo3.com> 31 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 32 */ 33 /** 34 * [CLASS/FUNCTION INDEX of SCRIPT] 35 * 36 * 37 * 38 * 136: class t3lib_cs 39 * 488: function parse_charset($charset) 40 * 507: function get_locale_charset($locale) 41 * 42 * SECTION: Charset Conversion functions 43 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) 44 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) 45 * 617: function utf8_encode($str,$charset) 46 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0) 47 * 706: function utf8_to_entities($str) 48 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0) 49 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0) 50 * 823: function UnumberToChar($cbyte) 51 * 868: function utf8CharToUnumber($str,$hex=0) 52 * 53 * SECTION: Init functions 54 * 911: function initCharset($charset) 55 * 973: function initUnicodeData($mode=null) 56 * 1198: function initCaseFolding($charset) 57 * 1260: function initToASCII($charset) 58 * 59 * SECTION: String operation functions 60 * 1331: function substr($charset,$string,$start,$len=null) 61 * 1384: function strlen($charset,$string) 62 * 1414: function crop($charset,$string,$len,$crop='') 63 * 1467: function strtrunc($charset,$string,$len) 64 * 1501: function conv_case($charset,$string,$case) 65 * 1527: function specCharsToASCII($charset,$string) 66 * 67 * SECTION: Internal string operation functions 68 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='') 69 * 70 * SECTION: Internal UTF-8 string operation functions 71 * 1622: function utf8_substr($str,$start,$len=null) 72 * 1655: function utf8_strlen($str) 73 * 1676: function utf8_strtrunc($str,$len) 74 * 1698: function utf8_strpos($haystack,$needle,$offset=0) 75 * 1723: 
function utf8_strrpos($haystack,$needle) 76 * 1745: function utf8_char2byte_pos($str,$pos) 77 * 1786: function utf8_byte2char_pos($str,$pos) 78 * 1809: function utf8_char_mapping($str,$mode,$opt='') 79 * 80 * SECTION: Internal EUC string operation functions 81 * 1885: function euc_strtrunc($str,$len,$charset) 82 * 1914: function euc_substr($str,$start,$charset,$len=null) 83 * 1939: function euc_strlen($str,$charset) 84 * 1966: function euc_char2byte_pos($str,$pos,$charset) 85 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='') 86 * 87 * TOTAL FUNCTIONS: 35 88 * (This index is automatically created/updated by the extension "extdeveval") 89 * 90 */ 91 92 93 94 95 96 97 98 99 /** 100 * Notes on UTF-8 101 * 102 * Functions working on UTF-8 strings: 103 * 104 * - strchr/strstr 105 * - strrchr 106 * - substr_count 107 * - implode/explode/join 108 * 109 * Functions nearly working on UTF-8 strings: 110 * 111 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen 112 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII 113 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos 114 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0 115 * 116 * Functions NOT working on UTF-8 strings: 117 * 118 * - str*cmp 119 * - stristr 120 * - stripos 121 * - substr 122 * - strrev 123 * - ereg/eregi 124 * - split/spliti 125 * - preg_* 126 * - ... 127 * 128 */ 129 /** 130 * Class for conversion between charsets 131 * 132 * @author Kasper Skaarhoj <kasperYYYY@typo3.com> 133 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 134 * @package TYPO3 135 * @subpackage t3lib 136 */ 137 class t3lib_cs { 138 var $noCharByteVal=63; // ASCII Value for chars with no equivalent. 
// Cache of parsed conversion tables, keyed by charset name (filled by initCharset())
    var $parsedCharsets = array();

    // An array where case folding data will be stored (cached)
    var $caseFolding = array();

    // An array where charset-to-ASCII mappings are stored (cached)
    var $toASCII = array();

    // This tells the converter which charsets have two bytes per char:
    var $twoByteSets = array(
        'ucs-2' => 1,       // 2-byte Unicode
    );

    // This tells the converter which charsets have four bytes per char:
    var $fourByteSets = array(
        'ucs-4' => 1,       // 4-byte Unicode
        'utf-32' => 1,      // 4-byte Unicode (limited to the 21-bits of UTF-16)
    );

    // This tells the converter which charsets use a scheme like the Extended Unix Code:
    var $eucBasedSets = array(
        'gb2312' => 1,      // Chinese, simplified.
        'big5' => 1,        // Chinese, traditional.
        'euc-kr' => 1,      // Korean
        'shift_jis' => 1,   // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
    );

    // Charset aliases, consulted by parse_charset(). Keys must be lowercase.
    // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
    // http://czyborra.com/charsets/iso8859.html
    var $synonyms = array(
        'us' => 'ascii',
        'us-ascii' => 'ascii',
        'cp819' => 'iso-8859-1',
        'ibm819' => 'iso-8859-1',
        'iso-ir-100' => 'iso-8859-1',
        'iso-ir-109' => 'iso-8859-2',
        'iso-ir-148' => 'iso-8859-9',
        'iso-ir-199' => 'iso-8859-14',
        'iso-ir-203' => 'iso-8859-15',
        'csisolatin1' => 'iso-8859-1',
        'csisolatin2' => 'iso-8859-2',
        'csisolatin3' => 'iso-8859-3',
        'csisolatin5' => 'iso-8859-9',
        'csisolatin8' => 'iso-8859-14',
        'csisolatin9' => 'iso-8859-15',
        'csisolatingreek' => 'iso-8859-7',
        'iso-celtic' => 'iso-8859-14',
        'latin1' => 'iso-8859-1',
        'latin2' => 'iso-8859-2',
        'latin3' => 'iso-8859-3',
        'latin5' => 'iso-8859-9',
        'latin6' => 'iso-8859-10',
        'latin8' => 'iso-8859-14',
        'latin9' => 'iso-8859-15',
        'l1' => 'iso-8859-1',
        'l2' => 'iso-8859-2',
        'l3' => 'iso-8859-3',
        'l5' => 'iso-8859-9',
        'l6' => 'iso-8859-10',
        'l8' => 'iso-8859-14',
        'l9' => 'iso-8859-15',
        'cyrillic' => 'iso-8859-5',
        'arabic' => 'iso-8859-6',
        'tis-620' => 'iso-8859-11',
        'win874' => 'windows-874',
        'win1250' => 'windows-1250',
        'win1251' => 'windows-1251',
        'win1252' => 'windows-1252',
        'win1253' => 'windows-1253',
        'win1254' => 'windows-1254',
        'win1255' => 'windows-1255',
        'win1256' => 'windows-1256',
        'win1257' => 'windows-1257',
        'win1258' => 'windows-1258',
        'cp1250' => 'windows-1250',
        'cp1251' => 'windows-1251',
        'cp1252' => 'windows-1252',
        'ms-ee' => 'windows-1250',
        'ms-ansi' => 'windows-1252',
        'ms-greek' => 'windows-1253',
        'ms-turk' => 'windows-1254',
        'winbaltrim' => 'windows-1257',
        'koi-8ru' => 'koi-8r',
        'koi8r' => 'koi-8r',
        'cp878' => 'koi-8r',
        'mac' => 'macroman',
        'macintosh' => 'macroman',
        'euc-cn' => 'gb2312',
        'x-euc-cn' => 'gb2312',
        'euccn' => 'gb2312',
        'cp936' => 'gb2312',
        'big-5' => 'big5',
        'cp950' => 'big5',
        'eucjp' => 'euc-jp',
        'sjis' => 'shift_jis',
        'shift-jis' => 'shift_jis',
        'cp932' => 'shift_jis',
        'cp949' => 'euc-kr',
        'utf7' => 'utf-7',
        'utf8' => 'utf-8',      // (was listed twice; the duplicate key has been dropped)
        'utf16' => 'utf-16',
        'utf32' => 'utf-32',
        'ucs2' => 'ucs-2',
        'ucs4' => 'ucs-4',
    );

    // mapping of iso-639:2 language codes to script names
    var $lang_to_script = array(
        // iso-639:2 language codes, see:
        // http://www.w3.org/WAI/ER/IG/ert/iso639.htm
        // http://www.loc.gov/standards/iso639-2/langcodes.html
        // http://www.unicode.org/onlinedat/languages.html
        'ar' => 'arabic',
        'bg' => 'cyrillic',         // Bulgarian
        'bs' => 'east_european',    // Bosnian
        'cs' => 'east_european',    // Czech
        'da' => 'west_european',    // Danish
        'de' => 'west_european',    // German
        'es' => 'west_european',    // Spanish
        'et' => 'estonian',
        'eo' => 'unicode',          // Esperanto
        'eu' => 'west_european',    // Basque
        'fa' => 'arabic',           // Persian
        'fi' => 'west_european',    // Finnish
        'fo' => 'west_european',    // Faroese
        'fr' => 'west_european',    // French
        'el' => 'greek',            // Greek (the actual ISO 639 code)
        'gr' => 'greek',            // kept for backward compatibility ('gr' is not an ISO 639 language code)
        'he' => 'hebrew',           // Hebrew (since 1998)
        'hi' => 'unicode',          // Hindi
        'hr' => 'east_european',    // Croatian
        'hu' => 'east_european',    // Hungarian
        'iw' => 'hebrew',           // Hebrew (til 1998)
        'is' => 'west_european',    // Icelandic
        'it' => 'west_european',    // Italian
        'ja' => 'japanese',
        'kl' => 'west_european',    // Greenlandic
        'ko' => 'korean',
        'lt' => 'lithuanian',
        'lv' => 'west_european',    // Latvian/Lettish
        'nl' => 'west_european',    // Dutch
        'no' => 'west_european',    // Norwegian
        'pl' => 'east_european',    // Polish
        'pt' => 'west_european',    // Portuguese
        'ro' => 'east_european',    // Romanian
        'ru' => 'cyrillic',         // Russian
        'sk' => 'east_european',    // Slovak
        'sl' => 'east_european',    // Slovenian
        'sr' => 'cyrillic',         // Serbian
        'sv' => 'west_european',    // Swedish
        'th' => 'thai',
        'tr' => 'turkish',          // Turkish (was missing although 'trk' and 'turkish' are mapped)
        'uk' => 'cyrillic',         // Ukrainian
        'vi' => 'vietnamese',
        'zh' => 'chinese',
        // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
        // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
        'ara' => 'arabic',
        'bgr' => 'cyrillic',        // Bulgarian
        'cat' => 'west_european',   // Catalan
        'chs' => 'simpl_chinese',
        'cht' => 'trad_chinese',
        'csy' => 'east_european',   // Czech
        'dan' => 'west_european',   // Danish
        'deu' => 'west_european',   // German
        'dea' => 'west_european',   // German (Austrian)
        'des' => 'west_european',   // German (Swiss)
        'ena' => 'west_european',   // English (Australian)
        'enc' => 'west_european',   // English (Canadian)
        'eng' => 'west_european',   // English
        'enz' => 'west_european',   // English (New Zealand)
        'enu' => 'west_european',   // English (United States)
        'euq' => 'west_european',   // Basque
        'fos' => 'west_european',   // Faroese
        'far' => 'arabic',          // Persian
        'fin' => 'west_european',   // Finnish
        'fra' => 'west_european',   // French
        'frb' => 'west_european',   // French (Belgian)
        'frc' => 'west_european',   // French (Canadian)
        'frs' => 'west_european',   // French (Swiss)
        'ell' => 'greek',
        'heb' => 'hebrew',
        'hin' => 'unicode',         // Hindi
        'hun' => 'east_european',   // Hungarian
        'isl' => 'west_european',   // Icelandic (value was misspelled 'west_euorpean')
        'ita' => 'west_european',   // Italian
        'its' => 'west_european',   // Italian (Swiss)
        'jpn' => 'japanese',
        'kor' => 'korean',
        'lth' => 'lithuanian',
        'lvi' => 'west_european',   // Latvian/Lettish
        'msl' => 'west_european',   // Malay
        'nlb' => 'west_european',   // Dutch (Belgian)
        'nld' => 'west_european',   // Dutch
        'nor' => 'west_european',   // Norwegian (bokmal)
        'non' => 'west_european',   // Norwegian (nynorsk)
        'plk' => 'east_european',   // Polish
        'ptg' => 'west_european',   // Portuguese
        'ptb' => 'west_european',   // Portuguese (Brazil)
        'rom' => 'east_european',   // Romanian
        'rus' => 'cyrillic',        // Russian
        'slv' => 'east_european',   // Slovenian
        'sky' => 'east_european',   // Slovak
        'srl' => 'east_european',   // Serbian (Latin)
        'srb' => 'cyrillic',        // Serbian (Cyrillic)
        'esp' => 'west_european',   // Spanish (trad. sort)
        'esm' => 'west_european',   // Spanish (Mexican)
        'esn' => 'west_european',   // Spanish (internat. sort)
        'sve' => 'west_european',   // Swedish
        'tha' => 'thai',
        'trk' => 'turkish',
        'ukr' => 'cyrillic',        // Ukrainian
        // English language names
        'arabic' => 'arabic',
        'basque' => 'west_european',
        'bosnian' => 'east_european',
        'bulgarian' => 'cyrillic',  // was 'east_european', contradicting 'bg'/'bgr' above
        'catalan' => 'west_european',
        'croatian' => 'east_european',
        'czech' => 'east_european',
        'danish' => 'west_european',
        'dutch' => 'west_european',
        'english' => 'west_european',
        'esperanto' => 'unicode',
        'estonian' => 'estonian',
        'faroese' => 'west_european',
        'farsi' => 'arabic',
        'finnish' => 'west_european',
        'french' => 'west_european',
        'galician' => 'west_european',
        'german' => 'west_european',
        'greek' => 'greek',
        'greenlandic' => 'west_european',
        'hebrew' => 'hebrew',
        'hindi' => 'unicode',
        'hungarian' => 'east_european',
        'icelandic' => 'west_european',
        'italian' => 'west_european',
        'latvian' => 'west_european',
        'lettish' => 'west_european',
        'lithuanian' => 'lithuanian',
        'malay' => 'west_european',
        'norwegian' => 'west_european',
        'persian' => 'arabic',
        'polish' => 'east_european',
        'portuguese' => 'west_european',
        'russian' => 'cyrillic',
        'romanian' => 'east_european',
        'serbian' => 'cyrillic',
        'slovak' => 'east_european',
        'slovenian' => 'east_european',
        'spanish' => 'west_european',
        'swedish' => 'west_european',
        'svedish' => 'west_european',   // historic misspelling of 'swedish', kept as alias
        'thai' => 'thai',
        'that' => 'thai',               // historic misspelling of 'thai', kept as alias
        'turkish' => 'turkish',
        'ukrainian' => 'cyrillic',
    );

    // mapping of language (family) names to charsets on Unix
    var $script_to_charset_unix = array(
        'west_european' => 'iso-8859-1',
        'estonian' => 'iso-8859-1',
        'east_european' => 'iso-8859-2',
        'baltic' => 'iso-8859-4',
        'cyrillic' => 'iso-8859-5',
        'arabic' => 'iso-8859-6',
        'greek' => 'iso-8859-7',
        'hebrew' => 'iso-8859-8',
        'turkish' => 'iso-8859-9',
        'thai' => 'iso-8859-11',        // = TIS-620
        'lithuanian' => 'iso-8859-13',
        'chinese' => 'gb2312',          // = euc-cn
        'japanese' => 'euc-jp',
        'korean' => 'euc-kr',
        'simpl_chinese' => 'gb2312',
        'trad_chinese' => 'big5',
        'vietnamese' => '',
        'unicode' => 'utf-8',
    );

    // mapping of language (family) names to charsets on Windows
    var $script_to_charset_windows = array(
        'east_european' => 'windows-1250',
        'cyrillic' => 'windows-1251',
        'west_european' => 'windows-1252',
        'greek' => 'windows-1253',
        'turkish' => 'windows-1254',
        'hebrew' => 'windows-1255',
        'arabic' => 'windows-1256',
        'baltic' => 'windows-1257',
        'estonian' => 'windows-1257',
        'lithuanian' => 'windows-1257',
        'vietnamese' => 'windows-1258',
        'thai' => 'cp874',
        'korean' => 'cp949',
        'chinese' => 'gb2312',
        'japanese' => 'shift_jis',
        'simpl_chinese' => 'gb2312',
        'trad_chinese' => 'big5',
    );

    // mapping of locale names to charsets
    // Keys must be lowercase: get_locale_charset() lowercases the locale before looking it up here.
    var $locale_to_charset = array(
        'japanese.euc' => 'euc-jp',
        'ja_jp.ujis' => 'euc-jp',
        'korean.euc' => 'euc-kr',
        'sr@latn' => 'iso-8859-2',      // was 'sr@Latn', which could never match a lowercased locale
        'zh_cn' => 'gb2312',
        'zh_hk' => 'big5',
        'zh_tw' => 'big5',
    );

    // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
    // Empty values means "iso-8859-1"
    var $charSetArray = array(
        'dk' => '',
        'de' => '',
        'no' => '',
        'it' => '',
        'fr' => '',
        'es' => '',
        'nl' => '',
        'cz' => 'windows-1250',
        'pl' => 'iso-8859-2',
        'si' => 'windows-1250',
        'fi' => '',
        'tr' => 'iso-8859-9',
        'se' => '',
        'pt' => '',
        'ru' => 'windows-1251',
        'ro' => 'iso-8859-2',
        'ch' => 'gb2312',
        'sk' => 'windows-1250',
        'lt' => 'windows-1257',
        'is' => 'utf-8',
        'hr' => 'windows-1250',
        'hu' => 'iso-8859-2',
        'gl' => '',
        'th' => 'iso-8859-11',
        'gr' => 'iso-8859-7',
        'hk' => 'big5',
        'eu' => '',
        'bg' => 'windows-1251',
        'br' => '',
        'et' => 'iso-8859-4',
        'ar' => 'iso-8859-6',
        'he' => 'utf-8',
        'ua' => 'windows-1251',
        'jp' => 'shift_jis',
        'lv' => 'utf-8',
        'vn' => 'utf-8',
        'ca' => 'iso-8859-15',
        'ba' => 'iso-8859-2',
        'kr' => 'euc-kr',
        'eo' => 'utf-8',
        'my' => '',
        'hi' => 'utf-8',
        'fo' => 'utf-8',
        'fa' => 'utf-8',
        'sr' => 'utf-8'
    );

    // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
    // Missing keys means: same as Typo3
    var $isoArray = array(
        'ba' => 'bs',
        'br' => 'pt_BR',
        'ch' => 'zh_CN',
        'cz' => 'cs',
        'dk' => 'da',
        'si' => 'sl',
        'se' => 'sv',
        'gl' => 'kl',
        'gr' => 'el',
        'hk' => 'zh_HK',
        'kr' => 'ko',
        'ua' => 'uk',
        'jp' => 'ja',
        'vn' => 'vi',
    );

    /**
     * Normalize a charset name: trims surrounding whitespace, lowercases it and
     * replaces known aliases (see $synonyms) by their canonical name.
     * Names without a registered alias are returned lowercased and otherwise unchanged.
     *
     * @param   string      Input charset
     * @return  string      Normalized charset
     * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
     */
    function parse_charset($charset) {
        $charset = strtolower(trim($charset));

        return isset($this->synonyms[$charset]) ? $this->synonyms[$charset] : $charset;
    }

    /**
     * Get the charset of a locale.
*
     * Accepted locale formats:
     *
     *   ln              language
     *   ln_CN           language / country
     *   ln_CN.cs        language / country / charset
     *   ln_CN.cs@mod    language / country / charset / modifier
     *
     * Resolution order: exact match in $locale_to_charset, an explicit charset
     * part, the "euro" modifier, and finally the script of the language looked
     * up in the script table of the current OS. Unknown languages fall back to
     * windows-1252 (Windows) or iso-8859-1 (everything else).
     *
     * @param   string      Locale string
     * @return  string      Charset resolved for locale string
     * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
     */
    function get_locale_charset($locale) {
        $locale = strtolower($locale);

        // exact locale specific charset?
        if (isset($this->locale_to_charset[$locale])) {
            return $this->locale_to_charset[$locale];
        }

        // get modifier (array_pad keeps list() from reading a missing index when there is no '@')
        list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

        // locale contains charset: use it
        list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
        if ($charset) {
            return $this->parse_charset($charset);
        }

        // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
        if ($modifier == 'euro') {
            return 'iso-8859-15';
        }

        // get language; an unknown language yields an empty script and thereby the OS default below
        list($language) = explode('_', $locale);
        $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

        // TYPO3_OS is defined outside this class; treat "not defined" like "not Windows"
        if (defined('TYPO3_OS') && TYPO3_OS == 'WIN') {
            $table = $this->script_to_charset_windows;
            $default = 'windows-1252';      // was misspelled 'window-1252'
        } else {
            $table = $this->script_to_charset_unix;
            $default = 'iso-8859-1';
        }

        // empty table values (e.g. 'vietnamese' on Unix) also fall back to the default
        return !empty($table[$script]) ? $table[$script] : $default;
    }









    /********************************************
     *
     * Charset Conversion functions
     *
     ********************************************/

    /**
     * Convert from one charset to another charset.
*
     * When the target is UTF-8 or no entity fallback is requested, the library
     * named in $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod'] ('mbstring',
     * 'iconv' or 'recode') is tried first. If no method is configured, the
     * extension is not loaded or the library reports failure, the built-in
     * table driven conversion (via UTF-8) is used.
     *
     * @param   string      Input string
     * @param   string      From charset (the current charset of the string)
     * @param   string      To charset (the output charset wanted)
     * @param   boolean     If set, then characters that are not available in the destination character set will be encoded as numeric entities
     * @return  string      Converted string
     * @see convArray()
     */
    function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
        if ($fromCS==$toCS) return $str;

        // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
        if ($toCS=='utf-8' || !$useEntityForNoChar) {
            // The setting may be absent altogether; treat that as "no library configured"
            $method = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
                ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
                : '';

            $conv_str = false;
            // function_exists() guards: a configured but unavailable extension must not be fatal
            if ($method == 'mbstring' && function_exists('mb_convert_encoding')) {
                $conv_str = mb_convert_encoding($str,$toCS,$fromCS);    // returns false for unsupported charsets
            } elseif ($method == 'iconv' && function_exists('iconv')) {
                $conv_str = iconv($fromCS,$toCS.'//IGNORE',$str);
            } elseif ($method == 'recode' && function_exists('recode_string')) {
                $conv_str = recode_string($fromCS.'..'.$toCS,$str);
            }
            if (false !== $conv_str) return $conv_str;
            // fallback to TYPO3 conversion
        }

        if ($fromCS!='utf-8') $str = $this->utf8_encode($str,$fromCS);
        if ($toCS!='utf-8') $str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);
        return $str;
    }

    /**
     * Convert all elements in ARRAY from one charset to another charset.
     * NOTICE: Array is passed by reference!
*
     * Nested arrays are converted recursively; every other value is passed to conv().
     *
     * @param   array       Input array, possibly multidimensional
     * @param   string      From charset (the current charset of the string)
     * @param   string      To charset (the output charset wanted)
     * @param   boolean     If set, then characters that are not available in the destination character set will be encoded as numeric entities
     * @return  void
     * @see conv()
     */
    function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
        if (!is_array($array)) return;  // nothing to traverse

        foreach (array_keys($array) as $key) {
            if (is_array($array[$key])) {
                $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
            } else {
                $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
            }
        }
    }

    /**
     * Converts $str from $charset to UTF-8
     *
     * Bytes 0-127 are copied unchanged (except for two-byte charsets, where
     * every byte pair is looked up). Characters without an entry in the
     * conversion table are replaced by chr($this->noCharByteVal).
     *
     * @param   string      String in local charset to convert to UTF-8
     * @param   string      Charset, lowercase. Must be found in csconvtbl/ folder.
     * @return  string      Output string, converted to UTF-8. NULL if the conversion table for $charset could not be initialized.
     */
    function utf8_encode($str,$charset) {

        if ($charset === 'utf-8') return $str;

        // Parse conv. table if not already...
        if (!$this->initCharset($charset)) return null;

        $noChar = chr($this->noCharByteVal);
        $isTwoByteSet = isset($this->twoByteSets[$charset]);
        $isEucSet = isset($this->eucBasedSets[$charset]);
        $strLen = strlen($str);
        $outStr = '';

        for ($a=0; $a<$strLen; $a++) {      // Traverse each char in string.
            $chr = substr($str,$a,1);
            $ord = ord($chr);

            if ($isTwoByteSet) {    // If the charset has two bytes per char
                // assume big endian; a dangling last byte is paired with 0 instead of reading past the end
                $ord2 = ($a+1 < $strLen) ? ord(substr($str,$a+1,1)) : 0;
                $ord = $ord<<8 | $ord2;
                $a++;
            } elseif ($ord>127) {   // Above 127: needs a table lookup
                // EUC uses two bytes above 127; fetch both, advance the pointer and make $ord a 16bit int.
                // Shift-JIS: bytes between 160 and 223 are single byte characters
                if ($isEucSet && ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))) {
                    $a++;
                    $ord2 = ord(substr($str,$a,1));
                    $ord = $ord*256+$ord2;
                }
            } else {                // ASCII 0-127, one byte. Transparent
                $outStr .= $chr;
                continue;
            }

            // Use the table entry for the local char number, otherwise the "no char" replacement
            $outStr .= isset($this->parsedCharsets[$charset]['local'][$ord])
                ? $this->parsedCharsets[$charset]['local'][$ord]
                : $noChar;
        }
        return $outStr;
    }

    /**
     * Converts $str from UTF-8 to $charset
     *
     * @param   string      String in UTF-8 to convert to local charset
     * @param   string      Charset, lowercase. Must be found in csconvtbl/ folder.
     * @param   boolean     If set, then characters that are not available in the destination character set will be encoded as numeric entities
     * @return  string      Output string, converted to local charset. NULL if the conversion table for $charset could not be initialized.
     */
    function utf8_decode($str,$charset,$useEntityForNoChar=0) {

        // Parse conv. table if not already...
        if (!$this->initCharset($charset)) return null;

        $noChar = chr($this->noCharByteVal);
        $strLen = strlen($str);
        $outStr = '';

        for ($a=0; $a<$strLen; $a++) {      // Traverse each char in UTF-8 string.
            $chr = substr($str,$a,1);
            $ord = ord($chr);

            if ($ord<=127) {        // ASCII 0-127, one byte. Transparent
                $outStr .= $chr;
                continue;
            }
            if (!($ord & 64)) {     // A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
                $outStr .= $noChar;
                continue;
            }

            // Collect the sequence: each further set high bit of the lead byte announces one more byte
            $buf = $chr;
            for ($b=0; $b<8; $b++) {
                $ord = $ord << 1;
                if (!($ord & 128)) break;
                $a++;
                $buf .= substr($str,$a,1);
            }

            if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
                $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];     // The local number
                if ($mByte>255) {   // Needs two bytes in the local charset (16bit word assumed)
                    $outStr .= chr(($mByte >> 8) & 255).chr($mByte & 255);
                } else {
                    $outStr .= chr($mByte);
                }
            } elseif ($useEntityForNoChar) {    // Create num entity:
                $outStr .= '&#'.$this->utf8CharToUnumber($buf,1).';';
            } else {
                $outStr .= $noChar;     // No char exists
            }
        }
        return $outStr;
    }

    /**
     * Converts all chars > 127 to numeric (hexadecimal) entities.
     *
     * @param   string      Input string
     * @return  string      Output string
     */
    function utf8_to_entities($str) {
        $noChar = chr($this->noCharByteVal);
        $strLen = strlen($str);
        $outStr = '';

        for ($a=0; $a<$strLen; $a++) {      // Traverse each char in UTF-8 string.
            $chr = substr($str,$a,1);
            $ord = ord($chr);

            if ($ord<=127) {        // ASCII 0-127, one byte. Transparent
                $outStr .= $chr;
                continue;
            }
            if (!($ord & 64)) {     // No lead byte: MIDDLE of a multibyte sequence
                $outStr .= $noChar;
                continue;
            }

            // Collect the sequence announced by the lead byte
            $buf = $chr;
            for ($b=0; $b<8; $b++) {
                $ord = $ord << 1;
                if (!($ord & 128)) break;
                $a++;
                $buf .= substr($str,$a,1);
            }

            $outStr .= '&#'.$this->utf8CharToUnumber($buf,1).';';
        }

        return $outStr;
    }

    /**
     * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
     *
     * Entities that are neither well-formed numeric entities nor (if requested)
     * known HTML entities are left untouched.
     *
     * @param   string      Input string, UTF-8
     * @param   boolean     If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
     * @return  string      Output string
     */
    function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
        $trans_tbl = array();
        if ($alsoStdHtmlEnt) {
            // The table must be in iso-8859-1 because the characters are run through utf8_encode() below.
            // Since PHP 5.4 the default table is UTF-8, so ask for Latin-1 explicitly where the
            // third parameter exists (PHP >= 5.3.4); older versions always return Latin-1.
            if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
                $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1');
            } else {
                $table = get_html_translation_table(HTML_ENTITIES);
            }
            $trans_tbl = array_flip($table);
        }

        // Odd indexes hold the entity names (without "&" and ";"), even indexes the text in between
        $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
        foreach ($parts as $k => $v) {
            if (!($k%2)) continue;

            $match = array();
            if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {    // Hex entity
                $parts[$k] = $this->UnumberToChar(hexdec($match[1]));
            } elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {        // Dec entity
                $parts[$k] = $this->UnumberToChar($match[1]);
            } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {   // Other entities:
                $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
            } else {    // No conversion:
                $parts[$k] = '&'.$v.';';
            }
        }

        return implode('',$parts);
    }

    /**
     * Converts all chars in the input UTF-8 string into integer numbers returned in an array
     *
     * @param   string      Input string, UTF-8
     * @param   boolean     If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
     * @param   boolean     If set, then instead of integer numbers the real UTF-8 char is returned.
*
     * @return  array       Output array with the char numbers
     */
    function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
        // If entities must be registered as well...:
        if ($convEntities) {
            $str = $this->entities_to_utf8($str,1);
        }

        // Do conversion:
        $strLen = strlen($str);
        $outArr = array();
        for ($a=0; $a<$strLen; $a++) {      // Traverse each char in UTF-8 string.
            $chr = substr($str,$a,1);
            $ord = ord($chr);

            if ($ord<=127) {        // ASCII 0-127, one byte.
                $outArr[] = $retChar ? chr($ord) : $ord;
                continue;
            }
            if (!($ord & 64)) {     // No lead byte: MIDDLE of a multibyte sequence
                $outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
                continue;
            }

            // Collect the sequence: each further set high bit of the lead byte announces one more byte
            $buf = $chr;
            for ($b=0; $b<8; $b++) {
                $ord = $ord << 1;
                if (!($ord & 128)) break;
                $a++;
                $buf .= substr($str,$a,1);
            }

            $outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
        }

        return $outArr;
    }

    /**
     * Converts a UNICODE number to a UTF-8 multibyte character
     * Algorithm based on script found at From: http://czyborra.com/utf/
     * Unit-tested by Kasper
     *
     * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
     *
     *  bytes | bits | representation
     *      1 |    7 | 0vvvvvvv
     *      2 |   11 | 110vvvvv 10vvvvvv
     *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
     *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
     *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
     *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
     *
     * Values that are not numeric, negative or need more than 31 bits are
     * answered with chr($this->noCharByteVal).
     *
     * @param   integer     UNICODE integer (numeric strings are accepted)
     * @return  string      UTF-8 multibyte character string
     * @see utf8CharToUnumber()
     */
    function UnumberToChar($cbyte) {
        // Cannot express non-numbers, negative numbers or 32-bit characters in UTF-8
        if (!is_numeric($cbyte) || $cbyte < 0 || $cbyte >= 0x80000000) {
            return chr($this->noCharByteVal);
        }
        $cbyte = (int)$cbyte;

        if ($cbyte < 0x80) {
            return chr($cbyte);
        }

        // Number of continuation bytes and the marker bits of the lead byte
        if ($cbyte < 0x800) {
            $extra = 1; $lead = 0xC0;
        } elseif ($cbyte < 0x10000) {
            $extra = 2; $lead = 0xE0;
        } elseif ($cbyte < 0x200000) {
            $extra = 3; $lead = 0xF0;
        } elseif ($cbyte < 0x4000000) {
            $extra = 4; $lead = 0xF8;
        } else {
            $extra = 5; $lead = 0xFC;
        }

        // Lead byte carries the topmost bits, every continuation byte the next 6 bits
        $str = chr($lead | ($cbyte >> (6*$extra)));
        for ($shift = 6*($extra-1); $shift >= 0; $shift -= 6) {
            $str .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
        }
        return $str;
    }

    /**
     * Converts a UTF-8 Multibyte character to a UNICODE number
     * Unit-tested by Kasper
     *
     * If the first byte is not a lead byte of a multibyte sequence, its byte
     * value is returned. Missing continuation bytes count as zero bits.
     *
     * @param   string      UTF-8 multibyte character string
     * @param   boolean     If set, then a hex. number is returned (as string with leading "x").
     * @return  integer     UNICODE integer
     * @see UnumberToChar()
     */
    function utf8CharToUnumber($str,$hex=0) {
        $first = ord(substr($str,0,1));     // First char

        if (($first & 192) == 192) {    // This verifies that it IS a multi byte string
            // Count the continuation bytes announced by the high bits of the lead byte
            $count = 0;
            $shifted = $first;
            for ($b=0; $b<8; $b++) {
                $shifted = $shifted << 1;
                if (!($shifted & 128)) break;
                $count++;
            }

            // The lead byte contributes its lowest (6 - count) bits; invalid lead bytes
            // (0xFE/0xFF) contribute none instead of leaking their marker bits into the value
            $leadBits = 6-$count;
            $int = $leadBits > 0 ? ($first & ((1 << $leadBits) - 1)) : 0;

            // Each continuation byte contributes its lowest 6 bits
            for ($i=1; $i<=$count; $i++) {
                $int = ($int << 6) | (ord(substr($str,$i,1)) & 0x3F);
            }
        } else {
            $int = $first;
        }

        return $hex ? 'x'.dechex($int) : $int;
    }









    /********************************************
     *
     * Init functions
     *
     ********************************************/

    /**
     * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
     * This function is automatically called by the conversion functions
     *
     * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
     *
     * @param   string      The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
     * @return  integer     Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
* @access private
	 *
	 * Side effects visible here: fills $this->parsedCharsets[$charset] with the two maps
	 * 'local' (byte value => UTF-8 char) and 'utf8' (UTF-8 char => byte value); only byte
	 * values above 127 are stored. A serialized copy is written to typo3temp/cs/.
	 */
	function initCharset($charset)	{
		// Only process if the charset is not yet loaded:
		if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
			return 1;
		}

		// Conversion table filename:
		$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// Without a conversion table there is nothing to initialize:
		if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
			return false;
		}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
		$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
		if ($cacheFile && @is_file($cacheFile)) {
			// NOTE(review): unserialize() of a file from typo3temp/ - make sure that directory is not writable by untrusted parties.
			$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
			return 2;
		}

		// The "whitespaced" type is on the syntax "0x0A	0x000A	#LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		// (PCRE replaces the POSIX ereg()/split() functions, which no longer exist in PHP 7+.)
		$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

		// Parse conversion table into lines:
		$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
		$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

		$detectedType = '';
		foreach ($lines as $value) {
			// Comment lines or blanks are ignored.
			if (!trim($value) || substr($value,0,1)=='#') continue;

			// Detect type if not done yet: (Done on first real line)
			if (!$detectedType) {
				$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
			}

			$hexbyte = '';
			$utf8 = '';
			if ($detectedType=='ms-token') {
				$parts = preg_split('/=|:/',$value,3);
				$hexbyte = $parts[0];
				$utf8 = isset($parts[1]) ? $parts[1] : '';
			} elseif ($detectedType=='whitespaced') {
				$regA = array();
				if (preg_match($whitespacedPattern,$value,$regA)) {
					$hexbyte = $regA[1];
					$utf8 = 'U+'.$regA[2];
				}
			}

			$decval = hexdec(trim($hexbyte));
			if ($decval>127) {
				// Strip the leading "U+" before converting the code point.
				$utf8decval = hexdec(substr(trim($utf8),2));
				$utf8Char = $this->UnumberToChar($utf8decval);
				$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
				$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
			}
		}

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
		}

		return 2;
	}

	/**
	 * This function initializes all UTF-8 character data tables.
	 *
	 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
	 *
	 * @param	string		Mode ("case", "ascii", ...)
	 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
* @access private
	 *
	 * Reads unidata/UnicodeData.txt (BMP only) and, when present, unidata/SpecialCasing.txt
	 * and unidata/Translit.txt below PATH_t3lib. Both $this->caseFolding['utf-8'] and
	 * $this->toASCII['utf-8'] are rebuilt and cached, regardless of the requested mode.
	 */
	function initUnicodeData($mode=null)	{
		// cache files
		$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
		$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
		switch($mode) {
			case 'case':
				if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible
				if ($cacheFileCase && @is_file($cacheFileCase)) {
					$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
					return 2;
				}
				break;

			case 'ascii':
				if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible
				if ($cacheFileASCII && @is_file($cacheFileASCII)) {
					$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
					return 2;
				}
				break;
		}

		// process main Unicode data file
		$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
		if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

		$fh = fopen($unicodeDataFile,'rb');
		if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
		$this->caseFolding['utf-8'] = array();
		$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
		$utf8CaseFolding['toUpper'] = array();
		$utf8CaseFolding['toLower'] = array();
		$utf8CaseFolding['toTitle'] = array();

		$decomposition = array();	// array of temp. decompositions
		$mark = array();		// array of chars that are marks (eg. composing accents)
		$number = array();		// array of chars that are numbers (eg. digits)
		$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

		while (!feof($fh)) {
			$line = fgets($fh,4096);
			if ($line === false) break;		// read error / EOF
			if (trim($line) == '') continue;	// blank lines carry no data

			// has a lot of info; pad so that short lines do not produce undefined offsets
			list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title) = array_pad(explode(';', rtrim($line)), 15, '');

			$ord = hexdec($char);
			if ($ord > 0xFFFF) break;	// only process the BMP

			$utf8_char = $this->UnumberToChar($ord);

			if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
			if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
			if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

			// first letter of the general category
			switch (substr($cat,0,1)) {
				case 'M':	// mark (accent, umlaut, ...)
					$mark["U+$char"] = 1;
					break;

				case 'N':	// numeric value
					if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
					break;
			}

			// accented Latin letters without "official" decomposition
			$match = array();
			if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
				$c = ord($match[2]);
				if ($match[1] == 'SMALL') $c += 32;

				$decomposition["U+$char"] = array(dechex($c));
				continue;
			}

			$match = array();
			if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
				switch($match[1]) {
					case '<circle>':	// add parenthesis as circle replacement, eg (1)
						$match[2] = '0028 '.$match[2].' 0029';
						break;

					case '<square>':	// add square brackets as square replacement, eg [1]
						$match[2] = '005B '.$match[2].' 005D';
						break;

					case '<compat>':	// ignore multi char decompositions that start with a space
						if (preg_match('/^0020 /',$match[2])) continue 2;
						break;

					// ignore Arabic and vertical layout presentation decomposition
					case '<initial>':
					case '<medial>':
					case '<final>':
					case '<isolated>':
					case '<vertical>':
						continue 2;
				}
				$decomposition["U+$char"] = explode(' ',$match[2]);
			}
		}
		fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
		$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
		if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
			$fh = fopen($specialCasingFile,'rb');
			if ($fh) {
				while (!feof($fh)) {
					$line = fgets($fh,4096);
					if ($line === false) break;
					if (substr($line,0,1) != '#' && trim($line) != '') {

						list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
						// only unconditional mappings (no condition, or the field is already the trailing comment)
						if ($cond == '' || substr($cond,0,1) == '#') {
							$utf8_char = $this->UnumberToChar(hexdec($char));
							if ($char != $lower) {
								$arr = explode(' ',$lower);
								for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
								$utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
							}
							if ($char != $title && $title != $upper) {
								$arr = explode(' ',$title);
								for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
								$utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
							}
							if ($char != $upper) {
								$arr = explode(' ',$upper);
								for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
								$utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
							}
						}
					}
				}
				fclose($fh);
			}
		}

		// process custom decompositions
		$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
		if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
			$fh = fopen($customTranslitFile,'rb');
			if ($fh) {
				while (!feof($fh)) {
					$line = fgets($fh,4096);
					if ($line === false) break;
					if (substr($line,0,1) != '#' && trim($line) != '') {
						list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
						if (!$translit) $omit["U+$char"] = 1;
						$decomposition["U+$char"] = explode(' ', $translit);
					}
				}
				fclose($fh);
			}
		}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
		foreach($decomposition as $from => $to) {
			$code_decomp = array();

			// NOTE: the loop deliberately stops on an empty code value (empty transliteration)
			while ($code_value = array_shift($to)) {
				if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
					foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
						array_unshift($to, $cv);
					}
				} elseif (!isset($mark["U+$code_value"])) {	// remove mark
					array_push($code_decomp, $code_value);
				}
			}
			if (count($code_decomp) || isset($omit[$from])) {
				$decomposition[$from] = $code_decomp;
			} else {
				unset($decomposition[$from]);
			}
		}

		// create ascii only mapping
		$this->toASCII['utf-8'] = array();
		$ascii =& $this->toASCII['utf-8'];

		foreach($decomposition as $from => $to) {
			$code_decomp = array();
			while ($code_value = array_shift($to)) {
				$ord = hexdec($code_value);
				if ($ord > 127)
					continue 2;	// skip decompositions containing non-ASCII chars
				else
					array_push($code_decomp,chr($ord));
			}
			// keys look like "U+XXXX": strip the prefix instead of relying on hexdec() ignoring it
			$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
		}

		// add numeric decompositions
		foreach($number as $from => $to) {
			$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
			if (!isset($ascii[$utf8_char])) {
				$ascii[$utf8_char] = $to;
			}
		}

		if ($cacheFileCase) {
			t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
		}

		if ($cacheFileASCII) {
			t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
		}

		return 3;
	}

	/**
	 * This function initializes the folding table for a charset other than UTF-8.
	 * This function is automatically called by the case folding functions.
	 *
	 * The table is derived from the UTF-8 case folding table: every character of the
	 * charset's conversion table is folded in UTF-8 and converted back. Mappings that
	 * cannot be expressed in the charset are dropped.
	 *
	 * @param	string		Charset for which to initialize case folding.
	 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
	 * @access private
	 */
	function initCaseFolding($charset)	{
		// Only process if the case table is not yet loaded:
		if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

		// Use cached version if possible
		$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
		if ($cacheFile && @is_file($cacheFile)) {
			$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
			return 2;
		}

		// init UTF-8 conversion for this charset
		if (!$this->initCharset($charset)) {
			return false;
		}

		// UTF-8 case folding is used as the base conversion table
		if (!$this->initUnicodeData('case')) {
			return false;
		}

		$nochar = chr($this->noCharByteVal);
		foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);

			foreach (array('toUpper','toLower','toTitle') as $caseType) {
				// characters without a folding entry yield an empty string and are skipped below
				$folded = isset($this->caseFolding['utf-8'][$caseType][$utf8]) ? $this->caseFolding['utf-8'][$caseType][$utf8] : '';
				$cc = $this->utf8_decode($folded, $charset);
				if ($cc != '' && $cc != $nochar) $this->caseFolding[$charset][$caseType][$c] = $cc;
			}
		}

		// add the ASCII case table
		for ($i=ord('a'); $i<=ord('z'); $i++) {
			$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
		}
		for ($i=ord('A'); $i<=ord('Z'); $i++) {
			$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
		}

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
		}

		return 3;
	}

	/**
	 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
	 * This function is automatically called by the ASCII transliteration functions.
	 *
	 * @param	string		Charset for which to initialize conversion.
	 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
* @access private
	 *
	 * The table maps each character of the charset (in its local encoding) to the ASCII
	 * replacement taken from the UTF-8 transliteration table. It is always initialized to
	 * an array, so a charset without any transliterable character is not re-parsed on
	 * every call.
	 */
	function initToASCII($charset)	{
		// Only process if the transliteration table is not yet loaded:
		if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

		// Use cached version if possible
		$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
		if ($cacheFile && @is_file($cacheFile)) {
			$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
			return 2;
		}

		// init UTF-8 conversion for this charset
		if (!$this->initCharset($charset)) {
			return false;
		}

		// UTF-8/ASCII transliteration is used as the base conversion table
		if (!$this->initUnicodeData('ascii')) {
			return false;
		}

		$this->toASCII[$charset] = array();
		foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);

			if (isset($this->toASCII['utf-8'][$utf8])) {
				$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
			}
		}

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
		}

		return 3;
	}

	/********************************************
	 *
	 * String operation functions
	 *
	 ********************************************/

	/**
	 * Returns a part of a string.
* Unit-tested by Kasper (single byte charsets only)
	 *
	 * The backend is chosen by $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
	 * ('mbstring' or 'iconv'); otherwise the class' own implementations are used.
	 *
	 * @param	string		The character set
	 * @param	string		Character string
	 * @param	integer		Start position (character position)
	 * @param	integer		Length (in characters); when omitted the rest of the string is returned
	 * @return	string		The substring
	 * @see substr(), mb_substr()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function substr($charset,$string,$start,$len=null)	{
		if ($len===0) return '';

		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

		if ($utils == 'mbstring') {
			// $len cannot be omitted when specifying the charset: use the full character
			// length instead of temporarily switching the global internal encoding.
			if ($len==null) {
				$len = mb_strlen($string,$charset);
			}
			return mb_substr($string,$start,$len,$charset);
		} elseif ($utils == 'iconv') {
			// same approach; iconv_set_encoding() is deprecated and changes global state
			if ($len==null) {
				$len = iconv_strlen($string,$charset);
				if ($len === false) return false;	// string is not valid in $charset
			}
			return iconv_substr($string,$start,$len,$charset);
		} elseif ($charset == 'utf-8') {
			return $this->utf8_substr($string,$start,$len);
		} elseif (!empty($this->eucBasedSets[$charset])) {
			return $this->euc_substr($string,$start,$charset,$len);
		} elseif (!empty($this->twoByteSets[$charset])) {
			// an omitted length must not be multiplied into a zero length
			return $len==null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
		} elseif (!empty($this->fourByteSets[$charset])) {
			return $len==null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
		}

		// treat everything else as single-byte encoding
		return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
	}

	/**
	 * Counts the number of characters.
	 * Unit-tested by Kasper (single byte charsets only)
	 *
	 * @param	string		The character set
	 * @param	string		Character string
	 * @return	integer		The number of characters
	 * @see strlen()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function strlen($charset,$string)	{
		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

		if ($utils == 'mbstring') {
			return mb_strlen($string,$charset);
		} elseif ($utils == 'iconv') {
			return iconv_strlen($string,$charset);
		} elseif ($charset == 'utf-8') {
			return $this->utf8_strlen($string);
		} elseif (!empty($this->eucBasedSets[$charset])) {
			return $this->euc_strlen($string,$charset);
		} elseif (!empty($this->twoByteSets[$charset])) {
			return strlen($string)/2;
		} elseif (!empty($this->fourByteSets[$charset])) {
			return strlen($string)/4;
		}
		// treat everything else as single-byte encoding
		return strlen($string);
	}

	/**
	 * Truncates a string and pre-/appends a string.
* Unit tested by Kasper
	 *
	 * A positive length keeps the first $len characters and appends $crop; a negative
	 * length keeps the last abs($len) characters and prepends $crop. Strings that are
	 * already short enough are returned unchanged (without $crop).
	 *
	 * @param	string		The character set
	 * @param	string		Character string
	 * @param	integer		Length (in characters)
	 * @param	string		Crop signifier
	 * @return	string		The shortened string
	 * @see substr(), mb_strimwidth()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function crop($charset,$string,$len,$crop='')	{
		if (intval($len) == 0) return $string;

		// Translate the character length into a byte position.
		if ($charset == 'utf-8') {
			$i = $this->utf8_char2byte_pos($string,$len);
		} elseif (!empty($this->eucBasedSets[$charset])) {
			$i = $this->euc_char2byte_pos($string,$len,$charset);
		} else {
			// everything else is treated as single-byte encoding
			if ($len > 0) {
				$i = $len;
			} else {
				$i = strlen($string)+$len;
				if ($i<=0) $i = false;
			}
		}

		if ($i === false) {	// $len outside actual string length
			return $string;
		}

		$byteLen = strlen($string);
		if ($len > 0) {
			// only crop if there is something left behind the cut position
			if ($i >= 0 && $i < $byteLen) {
				return substr($string,0,$i).$crop;
			}
		} else {
			// only crop if there is something in front of the cut position
			if ($i >= 1 && $i <= $byteLen) {
				return $crop.substr($string,$i);
			}
		}

		return $string;
	}

	/**
	 * Cuts a string short at a given byte length.
*
	 * The cut never splits a multibyte character: if the byte length falls inside a
	 * character, that character is dropped entirely.
	 *
	 * @param	string		The character set
	 * @param	string		Character string
	 * @param	integer		The byte length
	 * @return	string		The shortened string
	 * @see mb_strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function strtrunc($charset,$string,$len)	{
		if ($len <= 0) return '';

		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

		if ($utils == 'mbstring') {
			return mb_strcut($string,0,$len,$charset);
		} elseif ($charset == 'utf-8') {
			return $this->utf8_strtrunc($string,$len);
		} elseif (!empty($this->eucBasedSets[$charset])) {
			// euc_strtrunc() expects ($str,$len,$charset)
			return $this->euc_strtrunc($string,$len,$charset);
		} elseif (!empty($this->twoByteSets[$charset])) {
			if ($len % 2) $len--;		// don't cut at odd positions
		} elseif (!empty($this->fourByteSets[$charset])) {
			$len -= $len % 4;	// realign to position dividable by four
		}
		// treat everything else as single-byte encoding
		return substr($string,0,$len);
	}

	/**
	 * Translates all characters of a string into their respective case values.
	 * Unlike strtolower() and strtoupper() this method is locale independent.
	 * Note that the string length may change!
	 * eg. lower case German sharp s (U+00DF) becomes upper case "SS"
	 * Unit-tested by Kasper
	 * Real case folding is language dependent, this method ignores this fact.
*
	 * @param	string		Character set of string
	 * @param	string		Input string to convert case for
	 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
	 * @return	string		The converted string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 * @see strtolower(), strtoupper()
	 */
	function conv_case($charset,$string,$case)	{
		// mbstring is only used when configured and the PHP version is at least 4.3
		$useMbstring = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && (float)phpversion() >= 4.3;

		if ($useMbstring) {
			return $case == 'toLower' ? mb_strtolower($string,$charset) : mb_strtoupper($string,$charset);
		}

		if ($charset == 'utf-8') {
			return $this->utf8_char_mapping($string,'case',$case);
		}

		if (isset($this->eucBasedSets[$charset])) {
			return $this->euc_char_mapping($string,$charset,'case',$case);
		}

		// treat everything else as single-byte encoding
		return $this->sb_char_mapping($string,$charset,'case',$case);
	}

	/**
	 * Converts special chars (accented letters, ligatures, umlauts etc) to ascii equivalents (usually double-bytes, like the ae ligature (U+00E6) => ae etc.)
*
	 * @param	string		Character set of string
	 * @param	string		Input string to convert
	 * @return	string		The converted string
	 */
	function specCharsToASCII($charset,$string)	{
		if ($charset == 'utf-8') {
			$string = $this->utf8_char_mapping($string,'ascii');
		} elseif (isset($this->eucBasedSets[$charset])) {
			$string = $this->euc_char_mapping($string,$charset,'ascii');
		} else {
			// treat everything else as single-byte encoding
			$string = $this->sb_char_mapping($string,$charset,'ascii');
		}

		return $string;
	}

	/********************************************
	 *
	 * Internal string operation functions
	 *
	 ********************************************/

	/**
	 * Maps all characters of a string in a single byte charset.
	 * Bytes without an entry in the mapping table are copied unchanged. If the table
	 * cannot be initialized, or the mode is unknown, the input is returned as is.
	 *
	 * @param	string		the string
	 * @param	string		the charset
	 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param	string		'case': conversion 'toLower' or 'toUpper'
	 * @return	string		the converted string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function sb_char_mapping($str,$charset,$mode,$opt='')	{
		switch($mode) {
			case 'case':
				if (!$this->initCaseFolding($charset)) return $str;	// do nothing
				$map =& $this->caseFolding[$charset][$opt];
				break;

			case 'ascii':
				if (!$this->initToASCII($charset)) return $str;	// do nothing
				$map =& $this->toASCII[$charset];
				break;

			default:
				return $str;
		}

		$str = (string)$str;
		$byteLen = strlen($str);
		$out = '';
		// explicit length bound instead of probing offsets beyond the end of the string
		for ($i=0; $i<$byteLen; $i++) {
			$c = $str[$i];
			$out .= isset($map[$c]) ? $map[$c] : $c;
		}

		return $out;
	}

	/********************************************
	 *
	 * Internal UTF-8 string operation functions
*
	 ********************************************/

	/**
	 * Returns a part of a UTF-8 string.
	 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		Start position (character position)
	 * @param	integer		Length (in characters)
	 * @return	string		The substring; FALSE if a positive $start lies outside the string
	 * @see substr()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_substr($str,$start,$len=null)	{
		// a length of zero (0 or '0') always yields an empty string
		if ((string)$len === '0') return '';

		$byte_start = $this->utf8_char2byte_pos($str,$start);
		if ($byte_start === false) {
			if ($start > 0) {
				return false;	// $start outside string length
			}
			// a negative $start reaching beyond the beginning means "from the first byte"
			$byte_start = 0;
		}

		$str = substr($str,$byte_start);

		if ($len!=null) {
			$byte_end = $this->utf8_char2byte_pos($str,$len);
			if ($byte_end === false) {	// $len outside actual string length
				return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
			}
			return substr($str,0,$byte_end);
		}

		return $str;
	}

	/**
	 * Counts the number of characters of a string in UTF-8.
	 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
	 *
	 * Every byte that is not a continuation byte (10xxxxxx) starts a character.
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @return	integer		The number of characters
	 * @see strlen()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strlen($str)	{
		$str = (string)$str;
		$byteLen = strlen($str);
		$n = 0;
		for ($i=0; $i<$byteLen; $i++) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
		}
		return $n;
	}

	/**
	 * Truncates a string in UTF-8 short at a given byte length.
*
	 * A character that does not fit completely into the byte length is dropped. A
	 * character that starts at the very first byte and fits is kept (it used to be
	 * discarded).
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @param	integer		the byte length
	 * @return	string		the shortened string
	 * @see mb_strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strtrunc($str,$len)	{
		$str = (string)$str;
		$byteLen = strlen($str);

		if ($len <= 0) return '';
		if ($len > $byteLen) return $str;	// nothing to cut

		$i = $len-1;
		if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to find the first byte
			while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) $i--;

			$lead = ord($str[$i]);
			if (($lead & 0xC0) != 0xC0) {
				// malformed input: continuation bytes without a lead byte - drop them
				return substr($str,0,($lead & 0x80) ? $i : $i+1);
			}

			// the number of leading 1-bits of the first byte is the length of the sequence
			for ($bc=0, $mbs=$lead; $mbs & 0x80; $mbs = $mbs << 1) $bc++;
			if ($bc+$i > $len) return substr($str,0,$i);
			// fallthru: multibyte char fits into length
		}
		return substr($str,0,$len);
	}

	/**
	 * Find position of first occurrence of a string, both arguments are in UTF-8.
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 string to search for
	 * @param	integer		Position to start the search
	 * @return	integer		The character position; FALSE if not found or the offset is beyond the string
	 * @see strpos()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strpos($haystack,$needle,$offset=0)	{
		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

		if ($utils == 'mbstring') {
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		} elseif ($utils == 'iconv') {
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
		}

		$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
		if ($byte_offset === false) return false;	// offset beyond string length

		$byte_pos = strpos($haystack,$needle,$byte_offset);
		if ($byte_pos === false) return false;	// needle not found

		return $this->utf8_byte2char_pos($haystack,$byte_pos);
	}

	/**
	 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
*
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 character to search for (single character)
	 * @return	integer		The character position
	 * @see strrpos()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strrpos($haystack,$needle)	{
		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

		if ($utils == 'mbstring') {
			return mb_strrpos($haystack,$needle,'utf-8');
		} elseif ($utils == 'iconv') {
			return iconv_strrpos($haystack,$needle,'utf-8');
		}

		$byte_pos = strrpos($haystack,$needle);
		if ($byte_pos === false) return false;	// needle not found

		return $this->utf8_byte2char_pos($haystack,$byte_pos);
	}

	/**
	 * Translates a character position into an 'absolute' byte position.
	 * Unit tested by Kasper.
	 *
	 * All offsets are checked against the string length explicitly, so the result does
	 * not depend on how PHP treats out-of-range or negative string offsets.
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		Character position (negative values start from the end)
	 * @return	integer		Byte position; FALSE if the position is outside the string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_char2byte_pos($str,$pos)	{
		$str = (string)$str;
		$byteLen = strlen($str);
		$n = 0;			// number of characters found
		$p = abs($pos);		// number of characters wanted

		if ($pos >= 0) {
			$i = 0;
			$d = 1;
		} else {
			$i = $byteLen-1;
			$d = -1;
		}

		// count character start bytes: single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		for ( ; $i>=0 && $i<$byteLen && $n<$p; $i+=$d) {
			if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
		}
		if ($i<0 || $i>=$byteLen) return false;	// offset beyond string length

		if ($pos >= 0) {
			// skip trailing multi-byte data bytes
			while ($i<$byteLen && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }
		} else {
			// correct offset
			$i++;
		}

		return $i;
	}

	/**
	 * Translates an 'absolute' byte position into a character position.
* Unit tested by Kasper.
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		byte position
	 * @return	integer		character position; FALSE for an empty string or a position outside the string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_byte2char_pos($str,$pos)	{
		$str = (string)$str;
		$byteLen = strlen($str);
		if ($byteLen == 0 || $pos < 0 || $pos > $byteLen) return false;	// offset beyond string length

		$n = 0;	// number of characters
		for ($i=$pos; $i>0; $i--) {
			// the position just behind the last byte counts like a character start
			$c = $i<$byteLen ? ord($str[$i]) : 0;
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
			if (($c & 0xC0) != 0x80) $n++;
		}

		return $n;
	}

	/**
	 * Maps all characters of an UTF-8 string.
	 * Characters without an entry in the mapping table are copied unchanged; a stray
	 * continuation byte (malformed input) is passed through as a single byte.
	 *
	 * @param	string		UTF-8 string
	 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param	string		'case': conversion 'toLower' or 'toUpper'
	 * @return	string		the converted string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_char_mapping($str,$mode,$opt='')	{
		if (!$this->initUnicodeData($mode)) return $str;	// do nothing

		switch($mode) {
			case 'case':
				$map =& $this->caseFolding['utf-8'][$opt];
				break;

			case 'ascii':
				$map =& $this->toASCII['utf-8'];
				break;

			default:
				return $str;
		}

		$str = (string)$str;
		$byteLen = strlen($str);
		$out = '';
		for ($i=0; $i<$byteLen; $i++) {
			$c = ord($str[$i]);
			if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
				for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
				$mbc = substr($str,$i,$bc);
				$i += $bc-1;
			} else {	// single-byte (0xxxxxxx) or stray continuation byte
				$mbc = $str[$i];
			}

			$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
		}

		return $out;
	}

	/********************************************
	 *
	 *
Internal EUC string operation functions
 *
 * Extended Unix Code:
 *  ASCII compatible 7bit single bytes chars
 *  8bit two byte chars
 *
 * Shift-JIS is treated as a special case.
 *
 ********************************************/

/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * A double-byte character is never cut in half: if the limit falls between its two
 * bytes, the whole character is dropped and the result is one byte shorter than $len.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	$str_len = strlen($str);

	// Decide up front whether anything has to be cut. Formerly this was derived from
	// the loop index, which jumps past the end when a double-byte character straddles
	// the limit at the very end of the string; such a string came back untruncated.
	if ($str_len == 0 || $str_len <= $len) return $str; // string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++; // advance a double-byte char
		}
		else {
			if ($c >= 0x80) $i++; // advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str,0,$len-1); // we ended on a first byte
	}
	return substr($str,0,$len);
}

/**
 * Returns a part of a string in the EUC charset family.
1946 * 1947 * @param string EUC multibyte character string 1948 * @param integer start position (character position) 1949 * @param string the charset 1950 * @param integer length (in characters) 1951 * @return string the substring 1952 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 1953 */ 1954 function euc_substr($str,$start,$charset,$len=null) { 1955 $byte_start = $this->euc_char2byte_pos($str,$start,$charset); 1956 if ($byte_start === false) return false; // $start outside string length 1957 1958 $str = substr($str,$byte_start); 1959 1960 if ($len!=null) { 1961 $byte_end = $this->euc_char2byte_pos($str,$len,$charset); 1962 if ($byte_end === false) // $len outside actual string length 1963 return $str; 1964 else 1965 return substr($str,0,$byte_end); 1966 } 1967 else return $str; 1968 } 1969 1970 /** 1971 * Counts the number of characters of a string in the EUC charset family. 1972 * 1973 * @param string EUC multibyte character string 1974 * @param string the charset 1975 * @return integer the number of characters 1976 * @see strlen() 1977 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 1978 */ 1979 function euc_strlen($str,$charset) { 1980 $sjis = ($charset == 'shift_jis'); 1981 $n=0; 1982 for ($i=0; strlen($str{$i}); $i++) { 1983 $c = ord($str{$i}); 1984 if ($sjis) { 1985 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++; // advance a double-byte char 1986 } 1987 else { 1988 if ($c >= 0x80) $i++; // advance a double-byte char 1989 } 1990 1991 $n++; 1992 } 1993 1994 return $n; 1995 } 1996 1997 /** 1998 * Translates a character position into an 'absolute' byte position. 
1999 * 2000 * @param string EUC multibyte character string 2001 * @param integer character position (negative values start from the end) 2002 * @param string the charset 2003 * @return integer byte position 2004 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 2005 */ 2006 function euc_char2byte_pos($str,$pos,$charset) { 2007 $sjis = ($charset == 'shift_jis'); 2008 $n = 0; // number of characters seen 2009 $p = abs($pos); // number of characters wanted 2010 2011 if ($pos >= 0) { 2012 $i = 0; 2013 $d = 1; 2014 } else { 2015 $i = strlen($str)-1; 2016 $d = -1; 2017 } 2018 2019 for ( ; strlen($str{$i}) && $n<$p; $i+=$d) { 2020 $c = ord($str{$i}); 2021 if ($sjis) { 2022 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i+=$d; // advance a double-byte char 2023 } 2024 else { 2025 if ($c >= 0x80) $i+=$d; // advance a double-byte char 2026 } 2027 2028 $n++; 2029 } 2030 if (!strlen($str{$i})) return false; // offset beyond string length 2031 2032 if ($pos < 0) $i++; // correct offset 2033 2034 return $i; 2035 } 2036 2037 /** 2038 * Maps all characters of a string in the EUC charset family. 
2039 * 2040 * @param string EUC multibyte character string 2041 * @param string the charset 2042 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration) 2043 * @param string 'case': conversion 'toLower' or 'toUpper' 2044 * @return string the converted string 2045 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 2046 */ 2047 function euc_char_mapping($str,$charset,$mode,$opt='') { 2048 switch($mode) { 2049 case 'case': 2050 if (!$this->initCaseFolding($charset)) return $str; // do nothing 2051 $map =& $this->caseFolding[$charset][$opt]; 2052 break; 2053 2054 case 'ascii': 2055 if (!$this->initToASCII($charset)) return $str; // do nothing 2056 $map =& $this->toASCII[$charset]; 2057 break; 2058 2059 default: 2060 return $str; 2061 } 2062 2063 $sjis = ($charset == 'shift_jis'); 2064 $out = ''; 2065 for($i=0; strlen($str{$i}); $i++) { 2066 $mbc = $str{$i}; 2067 $c = ord($mbc); 2068 2069 if ($sjis) { 2070 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char 2071 $mbc = substr($str,$i,2); 2072 $i++; 2073 } 2074 } 2075 else { 2076 if ($c >= 0x80) { // a double-byte char 2077 $mbc = substr($str,$i,2); 2078 $i++; 2079 } 2080 } 2081 2082 if (isset($map[$mbc])) { 2083 $out .= $map[$mbc]; 2084 } else { 2085 $out .= $mbc; 2086 } 2087 } 2088 2089 return $out; 2090 } 2091 2092 } 2093 2094 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) { 2095 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']); 2096 } 2097 ?>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Wed Jan 14 11:33:29 2009 | Cross-referenced by PHPXref 0.7 |