[ Index ]

PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008]

title

Body

[close]

/lib/typo3/ -> class.t3lib_cs.php (source)

   1  <?php
   2  /***************************************************************
   3  *  Copyright notice
   4  *
   5  *  (c) 2003-2006 Kasper Skaarhoj (kasperYYYY@typo3.com)
   6  *  All rights reserved
   7  *
   8  *  This script is part of the Typo3 project. The Typo3 project is
   9  *  free software; you can redistribute it and/or modify
  10  *  it under the terms of the GNU General Public License as published by
  11  *  the Free Software Foundation; either version 2 of the License, or
  12  *  (at your option) any later version.
  13  *
  14  *  The GNU General Public License can be found at
  15  *  http://www.gnu.org/copyleft/gpl.html.
  16  *
  17  *  This script is distributed in the hope that it will be useful,
  18  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  *  GNU General Public License for more details.
  21  *
  22  *  This copyright notice MUST APPEAR in all copies of the script!
  23  ***************************************************************/
  24  /**
  25   * Class for conversion between charsets.
  26   *
  27   *    Typo Id: class.t3lib_cs.php,v 1.56 2006/05/03 08:47:30 masi Exp $
  28   * Moodle $Id: class.t3lib_cs.php,v 1.7 2006/08/11 09:48:35 stronk7 Exp $
  29   *
  30   * @author    Kasper Skaarhoj <kasperYYYY@typo3.com>
  31   * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
  32   */
  33  /**
  34   * [CLASS/FUNCTION INDEX of SCRIPT]
  35   *
  36   *
  37   *
  38   *  136: class t3lib_cs
  39   *  488:     function parse_charset($charset)
  40   *  507:     function get_locale_charset($locale)
  41   *
  42   *              SECTION: Charset Conversion functions
  43   *  560:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
  44   *  600:     function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
  45   *  617:     function utf8_encode($str,$charset)
  46   *  663:     function utf8_decode($str,$charset,$useEntityForNoChar=0)
  47   *  706:     function utf8_to_entities($str)
  48   *  739:     function entities_to_utf8($str,$alsoStdHtmlEnt=0)
  49   *  773:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
  50   *  823:     function UnumberToChar($cbyte)
  51   *  868:     function utf8CharToUnumber($str,$hex=0)
  52   *
  53   *              SECTION: Init functions
  54   *  911:     function initCharset($charset)
  55   *  973:     function initUnicodeData($mode=null)
  56   * 1198:     function initCaseFolding($charset)
  57   * 1260:     function initToASCII($charset)
  58   *
  59   *              SECTION: String operation functions
  60   * 1331:     function substr($charset,$string,$start,$len=null)
  61   * 1384:     function strlen($charset,$string)
  62   * 1414:     function crop($charset,$string,$len,$crop='')
  63   * 1467:     function strtrunc($charset,$string,$len)
  64   * 1501:     function conv_case($charset,$string,$case)
  65   * 1527:     function specCharsToASCII($charset,$string)
  66   *
  67   *              SECTION: Internal string operation functions
  68   * 1567:     function sb_char_mapping($str,$charset,$mode,$opt='')
  69   *
  70   *              SECTION: Internal UTF-8 string operation functions
  71   * 1622:     function utf8_substr($str,$start,$len=null)
  72   * 1655:     function utf8_strlen($str)
  73   * 1676:     function utf8_strtrunc($str,$len)
  74   * 1698:     function utf8_strpos($haystack,$needle,$offset=0)
  75   * 1723:     function utf8_strrpos($haystack,$needle)
  76   * 1745:     function utf8_char2byte_pos($str,$pos)
  77   * 1786:     function utf8_byte2char_pos($str,$pos)
  78   * 1809:     function utf8_char_mapping($str,$mode,$opt='')
  79   *
  80   *              SECTION: Internal EUC string operation functions
  81   * 1885:     function euc_strtrunc($str,$len,$charset)
  82   * 1914:     function euc_substr($str,$start,$charset,$len=null)
  83   * 1939:     function euc_strlen($str,$charset)
  84   * 1966:     function euc_char2byte_pos($str,$pos,$charset)
  85   * 2007:     function euc_char_mapping($str,$charset,$mode,$opt='')
  86   *
  87   * TOTAL FUNCTIONS: 35
  88   * (This index is automatically created/updated by the extension "extdeveval")
  89   *
  90   */
  91  
  92  
  93  
  94  
  95  
  96  
  97  
  98  
  99  /**
 100   * Notes on UTF-8
 101   *
 102   * Functions working on UTF-8 strings:
 103   *
 104   * - strchr/strstr
 105   * - strrchr
 106   * - substr_count
 107   * - implode/explode/join
 108   *
 109   * Functions nearly working on UTF-8 strings:
 110   *
 111   * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
 112   * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
 113   * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
 114   * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
 115   *
 116   * Functions NOT working on UTF-8 strings:
 117   *
 118   * - str*cmp
 119   * - stristr
 120   * - stripos
 121   * - substr
 122   * - strrev
 123   * - ereg/eregi
 124   * - split/spliti
 125   * - preg_*
 126   * - ...
 127   *
 128   */
 129  /**
 130   * Class for conversion between charsets
 131   *
 132   * @author    Kasper Skaarhoj <kasperYYYY@typo3.com>
 133   * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
 134   * @package TYPO3
 135   * @subpackage t3lib
 136   */
 137  class t3lib_cs {
    var $noCharByteVal=63;        // Byte value emitted for characters with no equivalent in the target charset (63 is ASCII '?').

        // Cache of parsed conversion tables, keyed by charset name.
        // utf8_encode() reads the ['local'] sub-array (local char number => UTF-8 sequence),
        // utf8_decode() reads the ['utf8'] sub-array (UTF-8 sequence => local char number).
    var $parsedCharsets=array();

        // An array where case folding data will be stored (cached)
    var $caseFolding=array();

        // An array where charset-to-ASCII mappings are stored (cached)
    var $toASCII=array();

        // This tells the converter which charsets have two bytes per char.
        // utf8_encode() consumes two bytes at a time (big endian) for these charsets.
    var $twoByteSets=array(
        'ucs-2'=>1,    // 2-byte Unicode
    );

        // This tells the converter which charsets have four bytes per char:
    var $fourByteSets=array(
        'ucs-4'=>1,    // 4-byte Unicode
        'utf-32'=>1,    // 4-byte Unicode (limited to the 21-bits of UTF-16)
    );

        // This tells the converter which charsets use a scheme like the Extended Unix Code.
        // For these, utf8_encode() treats a byte above 127 as the first byte of a two-byte character.
    var $eucBasedSets=array(
        'gb2312'=>1,        // Chinese, simplified.
        'big5'=>1,        // Chinese, traditional.
        'euc-kr'=>1,        // Korean
        'shift_jis'=>1,        // Japanese - WARNING: Shift-JIS includes half-width katakana single-byte characters above 0x80!
    );
 167  
 168          // see    http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 169          // http://czyborra.com/charsets/iso8859.html
 170      var $synonyms=array(
 171          'us' => 'ascii',
 172          'us-ascii'=> 'ascii',
 173          'cp819' => 'iso-8859-1',
 174          'ibm819' => 'iso-8859-1',
 175          'iso-ir-100' => 'iso-8859-1',
 176          'iso-ir-109' => 'iso-8859-2',
 177          'iso-ir-148' => 'iso-8859-9',
 178          'iso-ir-199' => 'iso-8859-14',
 179          'iso-ir-203' => 'iso-8859-15',
 180          'csisolatin1' => 'iso-8859-1',
 181          'csisolatin2' => 'iso-8859-2',
 182          'csisolatin3' => 'iso-8859-3',
 183          'csisolatin5' => 'iso-8859-9',
 184          'csisolatin8' => 'iso-8859-14',
 185          'csisolatin9' => 'iso-8859-15',
 186          'csisolatingreek' => 'iso-8859-7',
 187          'iso-celtic' => 'iso-8859-14',
 188          'latin1' => 'iso-8859-1',
 189          'latin2' => 'iso-8859-2',
 190          'latin3' => 'iso-8859-3',
 191          'latin5' => 'iso-8859-9',
 192          'latin6' => 'iso-8859-10',
 193          'latin8' => 'iso-8859-14',
 194          'latin9' => 'iso-8859-15',
 195          'l1' => 'iso-8859-1',
 196          'l2' => 'iso-8859-2',
 197          'l3' => 'iso-8859-3',
 198          'l5' => 'iso-8859-9',
 199          'l6' => 'iso-8859-10',
 200          'l8' => 'iso-8859-14',
 201          'l9' => 'iso-8859-15',
 202          'cyrillic' => 'iso-8859-5',
 203          'arabic' => 'iso-8859-6',
 204          'tis-620' => 'iso-8859-11',
 205          'win874' => 'windows-874',
 206          'win1250' => 'windows-1250',
 207          'win1251' => 'windows-1251',
 208          'win1252' => 'windows-1252',
 209          'win1253' => 'windows-1253',
 210          'win1254' => 'windows-1254',
 211          'win1255' => 'windows-1255',
 212          'win1256' => 'windows-1256',
 213          'win1257' => 'windows-1257',
 214          'win1258' => 'windows-1258',
 215          'cp1250' => 'windows-1250',
 216          'cp1251' => 'windows-1251',
 217          'cp1252' => 'windows-1252',
 218          'ms-ee' => 'windows-1250',
 219          'ms-ansi' => 'windows-1252',
 220          'ms-greek' => 'windows-1253',
 221          'ms-turk' => 'windows-1254',
 222          'winbaltrim' => 'windows-1257',
 223          'koi-8ru' => 'koi-8r',
 224          'koi8r' => 'koi-8r',
 225          'cp878' => 'koi-8r',
 226          'mac' => 'macroman',
 227          'macintosh' => 'macroman',
 228          'euc-cn' => 'gb2312',
 229          'x-euc-cn' => 'gb2312',
 230          'euccn' => 'gb2312',
 231          'cp936' => 'gb2312',
 232          'big-5' => 'big5',
 233          'cp950' => 'big5',
 234          'eucjp' => 'euc-jp',
 235          'sjis' => 'shift_jis',
 236          'shift-jis' => 'shift_jis',
 237          'cp932' => 'shift_jis',
 238          'cp949' => 'euc-kr',
 239          'utf7' => 'utf-7',
 240          'utf8' => 'utf-8',
 241          'utf16' => 'utf-16',
 242          'utf32' => 'utf-32',
 243          'utf8' => 'utf-8',
 244          'ucs2' => 'ucs-2',
 245          'ucs4' => 'ucs-4',
 246      );
 247  
 248          // mapping of iso-639:2 language codes to script names
 249      var $lang_to_script=array(
 250              // iso-639:2 language codes, see:
 251              //  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
 252              //  http://www.loc.gov/standards/iso639-2/langcodes.html
 253              //  http://www.unicode.org/onlinedat/languages.html
 254          'ar' => 'arabic',
 255          'bg' => 'cyrillic',        // Bulgarian
 256          'bs' => 'east_european',    // Bosnian
 257          'cs' => 'east_european',    // Czech
 258          'da' => 'west_european',    // Danish
 259          'de' => 'west_european',    // German
 260          'es' => 'west_european',    // Spanish
 261          'et' => 'estonian',
 262          'eo' => 'unicode',        // Esperanto
 263          'eu' => 'west_european',    // Basque
 264          'fa' => 'arabic',    // Persian
 265          'fi' => 'west_european',    // Finish
 266          'fo' => 'west_european',    // Faroese
 267          'fr' => 'west_european',    // French
 268          'gr' => 'greek',
 269          'he' => 'hebrew',        // Hebrew (since 1998)
 270          'hi' => 'unicode',        // Hindi
 271          'hr' => 'east_european',    // Croatian
 272          'hu' => 'east_european',    // Hungarian
 273          'iw' => 'hebrew',        // Hebrew (til 1998)
 274          'is' => 'west_european',    // Icelandic
 275          'it' => 'west_european',    // Italian
 276          'ja' => 'japanese',
 277          'kl' => 'west_european',    // Greenlandic
 278          'ko' => 'korean',
 279          'lt' => 'lithuanian',
 280          'lv' => 'west_european',    // Latvian/Lettish
 281          'nl' => 'west_european',    // Dutch
 282          'no' => 'west_european',    // Norwegian
 283          'pl' => 'east_european',    // Polish
 284          'pt' => 'west_european',    // Portuguese
 285          'ro' => 'east_european',    // Romanian
 286          'ru' => 'cyrillic',        // Russian
 287          'sk' => 'east_european',    // Slovak
 288          'sl' => 'east_european',    // Slovenian
 289          'sr' => 'cyrillic',        // Serbian
 290          'sv' => 'west_european',    // Swedish
 291          'th' => 'thai',
 292          'uk' => 'cyrillic',        // Ukranian
 293          'vi' => 'vietnamese',
 294          'zh' => 'chinese',
 295              // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
 296              // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
 297          'ara' => 'arabic',
 298          'bgr' => 'cyrillic',        // Bulgarian
 299          'cat' => 'west_european',    // Catalan
 300          'chs' => 'simpl_chinese',
 301          'cht' => 'trad_chinese',
 302          'csy' => 'east_european',    // Czech
 303          'dan' => 'west_european',    // Danisch
 304          'deu' => 'west_european',    // German
 305          'dea' => 'west_european',    // German (Austrian)
 306          'des' => 'west_european',    // German (Swiss)
 307          'ena' => 'west_european',    // English (Australian)
 308          'enc' => 'west_european',    // English (Canadian)
 309          'eng' => 'west_european',    // English
 310          'enz' => 'west_european',    // English (New Zealand)
 311          'enu' => 'west_european',    // English (United States)
 312          'euq' => 'west_european',    // Basque
 313          'fos' => 'west_european',    // Faroese
 314          'far' => 'arabic',    // Persian
 315          'fin' => 'west_european',    // Finish
 316          'fra' => 'west_european',    // French
 317          'frb' => 'west_european',    // French (Belgian)
 318          'frc' => 'west_european',    // French (Canadian)
 319          'frs' => 'west_european',    // French (Swiss)
 320          'ell' => 'greek',
 321          'heb' => 'hebrew',
 322          'hin' => 'unicode',    // Hindi
 323          'hun' => 'east_european',    // Hungarian
 324          'isl' => 'west_euorpean',    // Icelandic
 325          'ita' => 'west_european',    // Italian
 326          'its' => 'west_european',    // Italian (Swiss)
 327          'jpn' => 'japanese',
 328          'kor' => 'korean',
 329          'lth' => 'lithuanian',
 330          'lvi' => 'west_european',    // Latvian/Lettish
 331          'msl' => 'west_european',    // Malay
 332          'nlb' => 'west_european',    // Dutch (Belgian)
 333          'nld' => 'west_european',    // Dutch
 334          'nor' => 'west_european',    // Norwegian (bokmal)
 335          'non' => 'west_european',    // Norwegian (nynorsk)
 336          'plk' => 'east_european',    // Polish
 337          'ptg' => 'west_european',    // Portuguese
 338          'ptb' => 'west_european',    // Portuguese (Brazil)
 339          'rom' => 'east_european',    // Romanian
 340          'rus' => 'cyrillic',        // Russian
 341          'slv' => 'east_european',    // Slovenian
 342          'sky' => 'east_european',    // Slovak
 343          'srl' => 'east_european',    // Serbian (Latin)
 344          'srb' => 'cyrillic',        // Serbian (Cyrillic)
 345          'esp' => 'west_european',    // Spanish (trad. sort)
 346          'esm' => 'west_european',    // Spanish (Mexican)
 347          'esn' => 'west_european',    // Spanish (internat. sort)
 348          'sve' => 'west_european',    // Swedish
 349          'tha' => 'thai',
 350          'trk' => 'turkish',
 351          'ukr' => 'cyrillic',    // Ukrainian
 352              // English language names
 353          'arabic' => 'arabic',
 354          'basque' => 'west_european',
 355          'bosnian' => 'east_european',
 356          'bulgarian' => 'east_european',
 357          'catalan' => 'west_european',
 358          'croatian' => 'east_european',
 359          'czech' => 'east_european',
 360          'danish' => 'west_european',
 361          'dutch' => 'west_european',
 362          'english' => 'west_european',
 363          'esperanto' => 'unicode',
 364          'estonian' => 'estonian',
 365          'faroese' => 'west_european',
 366          'farsi' => 'arabic',
 367          'finnish' => 'west_european',
 368          'french' => 'west_european',
 369          'galician' => 'west_european',
 370          'german' => 'west_european',
 371          'greek' => 'greek',
 372          'greenlandic' => 'west_european',
 373          'hebrew' => 'hebrew',
 374          'hindi' => 'unicode',
 375          'hungarian' => 'east_european',
 376          'icelandic' => 'west_european',
 377          'italian' => 'west_european',
 378          'latvian' => 'west_european',
 379          'lettish' => 'west_european',
 380          'lithuanian' => 'lithuanian',
 381          'malay' => 'west_european',
 382          'norwegian' => 'west_european',
 383          'persian' => 'arabic',
 384          'polish' => 'east_european',
 385          'portuguese' => 'west_european',
 386          'russian' => 'cyrillic',
 387          'romanian' => 'east_european',
 388          'serbian' => 'cyrillic',
 389          'slovak' => 'east_european',
 390          'slovenian' => 'east_european',
 391          'spanish' => 'west_european',
 392          'svedish' => 'west_european',
 393          'that' => 'thai',
 394          'turkish' => 'turkish',
 395          'ukrainian' => 'cyrillic',
 396      );
 397  
        // Mapping of language (family) names to charsets on Unix.
        // get_locale_charset() uses this table whenever TYPO3_OS is not 'WIN'; an empty or
        // missing entry makes it fall back to 'iso-8859-1'.
    var $script_to_charset_unix=array(
        'west_european' => 'iso-8859-1',
        'estonian' => 'iso-8859-1',
        'east_european' => 'iso-8859-2',
        'baltic' => 'iso-8859-4',
        'cyrillic' => 'iso-8859-5',
        'arabic' => 'iso-8859-6',
        'greek' => 'iso-8859-7',
        'hebrew' => 'iso-8859-8',
        'turkish' => 'iso-8859-9',
        'thai' => 'iso-8859-11', // = TIS-620
        'lithuanian' => 'iso-8859-13',
        'chinese' => 'gb2312', // = euc-cn
        'japanese' => 'euc-jp',
        'korean' => 'euc-kr',
        'simpl_chinese' => 'gb2312',
        'trad_chinese' => 'big5',
        'vietnamese' => '',    // empty: get_locale_charset() falls back to its default for this script
        'unicode' => 'utf-8',
    );

        // Mapping of language (family) names to charsets on Windows.
        // get_locale_charset() uses this table when TYPO3_OS is 'WIN'; scripts without an entry
        // get that function's Windows default charset.
        // NOTE(review): there is no 'unicode' entry here, so languages mapped to the 'unicode'
        // script resolve to the Windows default instead of 'utf-8' - confirm this is intended.
    var $script_to_charset_windows=array(
        'east_european' => 'windows-1250',
        'cyrillic' => 'windows-1251',
        'west_european' => 'windows-1252',
        'greek' => 'windows-1253',
        'turkish' => 'windows-1254',
        'hebrew' => 'windows-1255',
        'arabic' => 'windows-1256',
        'baltic' => 'windows-1257',
        'estonian' => 'windows-1257',
        'lithuanian' => 'windows-1257',
        'vietnamese' => 'windows-1258',
        'thai' => 'cp874',
        'korean' => 'cp949',
        'chinese' => 'gb2312',
        'japanese' => 'shift_jis',
        'simpl_chinese' => 'gb2312',
        'trad_chinese' => 'big5',
    );
 440  
 441          // mapping of locale names to charsets
 442      var $locale_to_charset=array(
 443          'japanese.euc' => 'euc-jp',
 444          'ja_jp.ujis' => 'euc-jp',
 445          'korean.euc' => 'euc-kr',
 446          'sr@Latn' => 'iso-8859-2',
 447          'zh_cn' => 'gb2312',
 448          'zh_hk' => 'big5',
 449          'zh_tw' => 'big5',
 450      );
 451  
        // TYPO3 specific: Array with the system charset used for each system language key in TYPO3.
        // An empty value means "iso-8859-1".
    var $charSetArray = array(
        'dk' => '',
        'de' => '',
        'no' => '',
        'it' => '',
        'fr' => '',
        'es' => '',
        'nl' => '',
        'cz' => 'windows-1250',
        'pl' => 'iso-8859-2',
        'si' => 'windows-1250',
        'fi' => '',
        'tr' => 'iso-8859-9',
        'se' => '',
        'pt' => '',
        'ru' => 'windows-1251',
        'ro' => 'iso-8859-2',
        'ch' => 'gb2312',
        'sk' => 'windows-1250',
        'lt' => 'windows-1257',
        'is' => 'utf-8',
        'hr' => 'windows-1250',
        'hu' => 'iso-8859-2',
        'gl' => '',
        'th' => 'iso-8859-11',
        'gr' => 'iso-8859-7',
        'hk' => 'big5',
        'eu' => '',
        'bg' => 'windows-1251',
        'br' => '',
        'et' => 'iso-8859-4',
        'ar' => 'iso-8859-6',
        'he' => 'utf-8',
        'ua' => 'windows-1251',
        'jp' => 'shift_jis',
        'lv' => 'utf-8',
        'vn' => 'utf-8',
        'ca' => 'iso-8859-15',
        'ba' => 'iso-8859-2',
        'kr' => 'euc-kr',
        'eo' => 'utf-8',
        'my' => '',
        'hi' => 'utf-8',
        'fo' => 'utf-8',
        'fa' => 'utf-8',
        'sr' => 'utf-8'
    );

        // TYPO3 specific: Array with the ISO names used for each system language key in TYPO3.
        // A missing key means the ISO name is the same as the TYPO3 key.
    var $isoArray = array(
        'ba' => 'bs',
        'br' => 'pt_BR',
        'ch' => 'zh_CN',
        'cz' => 'cs',
        'dk' => 'da',
        'si' => 'sl',
        'se' => 'sv',
        'gl' => 'kl',
        'gr' => 'el',
        'hk' => 'zh_HK',
        'kr' => 'ko',
        'ua' => 'uk',
        'jp' => 'ja',
        'vn' => 'vi',
    );
 520  
 521      /**
 522       * Normalize - changes input character set to lowercase letters.
 523       *
 524       * @param    string        Input charset
 525       * @return    string        Normalized charset
 526       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
 527       */
 528  	function parse_charset($charset)    {
 529          $charset = strtolower($charset);
 530          if (isset($this->synonyms[$charset]))    $charset = $this->synonyms[$charset];
 531  
 532          return $charset;
 533      }
 534  
 535      /**
 536       * Get the charset of a locale.
 537       *
 538       * ln            language
 539       * ln_CN         language / country
 540       * ln_CN.cs      language / country / charset
 541       * ln_CN.cs@mod  language / country / charset / modifier
 542       *
 543       * @param    string        Locale string
 544       * @return    string        Charset resolved for locale string
 545       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
 546       */
 547  	function get_locale_charset($locale)    {
 548          $locale = strtolower($locale);
 549  
 550              // exact locale specific charset?
 551          if (isset($this->locale_to_charset[$locale]))    return $this->locale_to_charset[$locale];
 552  
 553              // get modifier
 554          list($locale,$modifier) = explode('@',$locale);
 555  
 556              // locale contains charset: use it
 557          list($locale,$charset) = explode('.',$locale);
 558          if ($charset)    return $this->parse_charset($charset);
 559  
 560              // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
 561          if ($modifier == 'euro')    return 'iso-8859-15';
 562  
 563              // get language
 564          list($language,$country) = explode('_',$locale);
 565          if (isset($this->lang_to_script[$language]))    $script = $this->lang_to_script[$language];
 566  
 567          if (TYPO3_OS == 'WIN')    {
 568              $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'window-1252';
 569          } else {
 570              $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
 571          }
 572  
 573          return $cs;
 574      }
 575  
 576  
 577  
 578  
 579  
 580  
 581  
 582  
 583  
 584      /********************************************
 585       *
 586       * Charset Conversion functions
 587       *
 588       ********************************************/
 589  
 590      /**
 591       * Convert from one charset to another charset.
 592       *
 593       * @param    string        Input string
 594       * @param    string        From charset (the current charset of the string)
 595       * @param    string        To charset (the output charset wanted)
 596       * @param    boolean        If set, then characters that are not available in the destination character set will be encoded as numeric entities
 597       * @return    string        Converted string
 598       * @see convArray()
 599       */
 600  	function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)    {
 601          if ($fromCS==$toCS)    return $str;
 602  
 603              // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
 604          if ($toCS=='utf-8' || !$useEntityForNoChar)    {
 605              switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])    {
 606              case 'mbstring':
 607                  $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
 608                  if (false !== $conv_str)    return $conv_str; // returns false for unsupported charsets
 609                  break;
 610  
 611              case 'iconv':
 612                  $conv_str = iconv($fromCS,$toCS.'//IGNORE',$str);
 613                  if (false !== $conv_str)    return $conv_str;
 614                  break;
 615  
 616              case 'recode':
 617                  $conv_str = recode_string($fromCS.'..'.$toCS,$str);
 618                  if (false !== $conv_str)    return $conv_str;
 619                  break;
 620              }
 621              // fallback to TYPO3 conversion
 622          }
 623  
 624          if ($fromCS!='utf-8')    $str=$this->utf8_encode($str,$fromCS);
 625          if ($toCS!='utf-8')    $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
 626          return $str;
 627      }
 628  
 629      /**
 630       * Convert all elements in ARRAY from one charset to another charset.
 631       * NOTICE: Array is passed by reference!
 632       *
 633       * @param    string        Input array, possibly multidimensional
 634       * @param    string        From charset (the current charset of the string)
 635       * @param    string        To charset (the output charset wanted)
 636       * @param    boolean        If set, then characters that are not available in the destination character set will be encoded as numeric entities
 637       * @return    void
 638       * @see conv()
 639       */
 640  	function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)    {
 641          foreach($array as $key => $value)    {
 642              if (is_array($array[$key]))    {
 643                  $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
 644              } else {
 645                  $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
 646              }
 647          }
 648      }
 649  
 650      /**
 651       * Converts $str from $charset to UTF-8
 652       *
 653       * @param    string        String in local charset to convert to UTF-8
 654       * @param    string        Charset, lowercase. Must be found in csconvtbl/ folder.
 655       * @return    string        Output string, converted to UTF-8
 656       */
 657  	function utf8_encode($str,$charset)    {
 658  
 659          if ($charset === 'utf-8')    return $str;
 660  
 661              // Charset is case-insensitive.
 662          if ($this->initCharset($charset))    {    // Parse conv. table if not already...
 663              $strLen = strlen($str);
 664              $outStr='';
 665  
 666              for ($a=0;$a<$strLen;$a++)    {    // Traverse each char in string.
 667                  $chr=substr($str,$a,1);
 668                  $ord=ord($chr);
 669                  if (isset($this->twoByteSets[$charset]))    {    // If the charset has two bytes per char
 670                      $ord2 = ord($str{$a+1});
 671                      $ord = $ord<<8 | $ord2; // assume big endian
 672  
 673                      if (isset($this->parsedCharsets[$charset]['local'][$ord]))    {    // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 674                          $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
 675                      } else $outStr.=chr($this->noCharByteVal);    // No char exists
 676                      $a++;
 677                  } elseif ($ord>127)    {    // If char has value over 127 it's a multibyte char in UTF-8
 678                      if (isset($this->eucBasedSets[$charset]))    {    // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
 679                          if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))    {    // Shift-JIS: chars between 160 and 223 are single byte
 680                              $a++;
 681                              $ord2=ord(substr($str,$a,1));
 682                              $ord = $ord*256+$ord2;
 683                          }
 684                      }
 685  
 686                      if (isset($this->parsedCharsets[$charset]['local'][$ord]))    {    // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 687                          $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
 688                      } else $outStr.= chr($this->noCharByteVal);    // No char exists
 689                  } else $outStr.= $chr;    // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 690              }
 691              return $outStr;
 692          }
 693      }
 694  
 695      /**
 696       * Converts $str from UTF-8 to $charset
 697       *
 698       * @param    string        String in UTF-8 to convert to local charset
 699       * @param    string        Charset, lowercase. Must be found in csconvtbl/ folder.
 700       * @param    boolean        If set, then characters that are not available in the destination character set will be encoded as numeric entities
 701       * @return    string        Output string, converted to local charset
 702       */
 703  	function utf8_decode($str,$charset,$useEntityForNoChar=0)    {
 704  
 705              // Charset is case-insensitive.
 706          if ($this->initCharset($charset))    {    // Parse conv. table if not already...
 707              $strLen = strlen($str);
 708              $outStr='';
 709              $buf='';
 710              for ($a=0,$i=0;$a<$strLen;$a++,$i++)    {    // Traverse each char in UTF-8 string.
 711                  $chr=substr($str,$a,1);
 712                  $ord=ord($chr);
 713                  if ($ord>127)    {    // This means multibyte! (first byte!)
 714                      if ($ord & 64)    {    // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 715  
 716                          $buf=$chr;    // Add first byte
 717                          for ($b=0;$b<8;$b++)    {    // for each byte in multibyte string...
 718                              $ord = $ord << 1;    // Shift it left and ...
 719                              if ($ord & 128)    {    // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 720                                  $a++;    // Increase pointer...
 721                                  $buf.=substr($str,$a,1);    // ... and add the next char.
 722                              } else break;
 723                          }
 724  
 725                          if (isset($this->parsedCharsets[$charset]['utf8'][$buf]))    {    // If the UTF-8 char-sequence is found then...
 726                              $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];    // The local number
 727                              if ($mByte>255)    {    // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
 728                                  $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
 729                              } else $outStr.= chr($mByte);
 730                          } elseif ($useEntityForNoChar) {    // Create num entity:
 731                              $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
 732                          } else $outStr.=chr($this->noCharByteVal);    // No char exists
 733                      } else $outStr.=chr($this->noCharByteVal);    // No char exists (MIDDLE of MB sequence!)
 734                  } else $outStr.=$chr;    // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 735              }
 736              return $outStr;
 737          }
 738      }
 739  
 740      /**
 741       * Converts all chars > 127 to numeric entities.
 742       *
 743       * @param    string        Input string
 744       * @return    string        Output string
 745       */
 746  	function utf8_to_entities($str)    {
 747          $strLen = strlen($str);
 748          $outStr='';
 749          $buf='';
 750          for ($a=0;$a<$strLen;$a++)    {    // Traverse each char in UTF-8 string.
 751              $chr=substr($str,$a,1);
 752              $ord=ord($chr);
 753              if ($ord>127)    {    // This means multibyte! (first byte!)
 754                  if ($ord & 64)    {    // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 755                      $buf=$chr;    // Add first byte
 756                      for ($b=0;$b<8;$b++)    {    // for each byte in multibyte string...
 757                          $ord = $ord << 1;    // Shift it left and ...
 758                          if ($ord & 128)    {    // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 759                              $a++;    // Increase pointer...
 760                              $buf.=substr($str,$a,1);    // ... and add the next char.
 761                          } else break;
 762                      }
 763  
 764                      $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
 765                  } else $outStr.=chr($this->noCharByteVal);    // No char exists (MIDDLE of MB sequence!)
 766              } else $outStr.=$chr;    // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 767          }
 768  
 769          return $outStr;
 770      }
 771  
 772      /**
 773       * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 774       *
 775       * @param    string        Input string, UTF-8
 776       * @param    boolean        If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 777       * @return    string        Output string
 778       */
 779  	function entities_to_utf8($str,$alsoStdHtmlEnt=0)    {
 780          if ($alsoStdHtmlEnt)    {
 781              $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));        // Getting them in iso-8859-1 - but thats ok since this is observed below.
 782          }
 783  
 784          $token = md5(microtime());
 785          $parts = explode($token,ereg_replace('(&([#[:alnum:]]*);)',$token.'\2'.$token,$str));
 786          foreach($parts as $k => $v)    {
 787              if ($k%2)    {
 788                  if (substr($v,0,1)=='#')    {    // Dec or hex entities:
 789                      if (substr($v,1,1)=='x')    {
 790                          $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
 791                      } else {
 792                          $parts[$k] = $this->UnumberToChar(substr($v,1));
 793                      }
 794                  } elseif ($alsoStdHtmlEnt && $trans_tbl['&'.$v.';']) {    // Other entities:
 795                      $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
 796                  } else {    // No conversion:
 797                      $parts[$k] ='&'.$v.';';
 798                  }
 799              }
 800          }
 801  
 802          return implode('',$parts);
 803      }
 804  
 805      /**
 806       * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 807       *
 808       * @param    string        Input string, UTF-8
 809       * @param    boolean        If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 810       * @param    boolean        If set, then instead of integer numbers the real UTF-8 char is returned.
 811       * @return    array        Output array with the char numbers
 812       */
 813  	function utf8_to_numberarray($str,$convEntities=0,$retChar=0)    {
 814              // If entities must be registered as well...:
 815          if ($convEntities)    {
 816              $str = $this->entities_to_utf8($str,1);
 817          }
 818              // Do conversion:
 819          $strLen = strlen($str);
 820          $outArr=array();
 821          $buf='';
 822          for ($a=0;$a<$strLen;$a++)    {    // Traverse each char in UTF-8 string.
 823              $chr=substr($str,$a,1);
 824              $ord=ord($chr);
 825              if ($ord>127)    {    // This means multibyte! (first byte!)
 826                  if ($ord & 64)    {    // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 827                      $buf=$chr;    // Add first byte
 828                      for ($b=0;$b<8;$b++)    {    // for each byte in multibyte string...
 829                          $ord = $ord << 1;    // Shift it left and ...
 830                          if ($ord & 128)    {    // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 831                              $a++;    // Increase pointer...
 832                              $buf.=substr($str,$a,1);    // ... and add the next char.
 833                          } else break;
 834                      }
 835  
 836                      $outArr[]=$retChar?$buf:$this->utf8CharToUnumber($buf);
 837                  } else $outArr[]=$retChar?chr($this->noCharByteVal):$this->noCharByteVal;    // No char exists (MIDDLE of MB sequence!)
 838              } else $outArr[]=$retChar?chr($ord):$ord;    // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 839          }
 840  
 841          return $outArr;
 842      }
 843  
 844      /**
 845       * Converts a UNICODE number to a UTF-8 multibyte character
 846       * Algorithm based on script found at From: http://czyborra.com/utf/
 847       * Unit-tested by Kasper
 848       *
 849       * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 850       *
 851       *  bytes | bits | representation
 852       *      1 |    7 | 0vvvvvvv
 853       *      2 |   11 | 110vvvvv 10vvvvvv
 854       *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 855       *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 856       *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 857       *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 858       *
 859       * @param    integer        UNICODE integer
 860       * @return    string        UTF-8 multibyte character string
 861       * @see utf8CharToUnumber()
 862       */
 863  	function UnumberToChar($cbyte)    {
 864          $str='';
 865  
 866          if ($cbyte < 0x80) {
 867              $str.=chr($cbyte);
 868          } else if ($cbyte < 0x800) {
 869              $str.=chr(0xC0 | ($cbyte >> 6));
 870              $str.=chr(0x80 | ($cbyte & 0x3F));
 871          } else if ($cbyte < 0x10000) {
 872              $str.=chr(0xE0 | ($cbyte >> 12));
 873              $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 874              $str.=chr(0x80 | ($cbyte & 0x3F));
 875          } else if ($cbyte < 0x200000) {
 876              $str.=chr(0xF0 | ($cbyte >> 18));
 877              $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
 878              $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 879              $str.=chr(0x80 | ($cbyte & 0x3F));
 880          } else if ($cbyte < 0x4000000) {
 881              $str.=chr(0xF8 | ($cbyte >> 24));
 882              $str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
 883              $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
 884              $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 885              $str.=chr(0x80 | ($cbyte & 0x3F));
 886          } else if ($cbyte < 0x80000000) {
 887              $str.=chr(0xFC | ($cbyte >> 30));
 888              $str.=chr(0x80 | (($cbyte >> 24) & 0x3F));
 889              $str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
 890              $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
 891              $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 892              $str.=chr(0x80 | ($cbyte & 0x3F));
 893          } else { // Cannot express a 32-bit character in UTF-8
 894              $str .= chr($this->noCharByteVal);
 895          }
 896          return $str;
 897      }
 898  
 899      /**
 900       * Converts a UTF-8 Multibyte character to a UNICODE number
 901       * Unit-tested by Kasper
 902       *
 903       * @param    string        UTF-8 multibyte character string
 904       * @param    boolean        If set, then a hex. number is returned.
 905       * @return    integer        UNICODE integer
 906       * @see UnumberToChar()
 907       */
 908  	function utf8CharToUnumber($str,$hex=0)    {
 909          $ord=ord(substr($str,0,1));    // First char
 910  
 911          if (($ord & 192) == 192)    {    // This verifyes that it IS a multi byte string
 912              $binBuf='';
 913              for ($b=0;$b<8;$b++)    {    // for each byte in multibyte string...
 914                  $ord = $ord << 1;    // Shift it left and ...
 915                  if ($ord & 128)    {    // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 916                      $binBuf.=substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6);
 917                  } else break;
 918              }
 919              $binBuf=substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf;
 920  
 921              $int = bindec($binBuf);
 922          } else $int = $ord;
 923  
 924          return $hex ? 'x'.dechex($int) : $int;
 925      }
 926  
 927  
 928  
 929  
 930  
 931  
 932  
 933  
 934  
 935      /********************************************
 936       *
 937       * Init functions
 938       *
 939       ********************************************/
 940  
 941      /**
 942       * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 943       * This function is automatically called by the conversion functions
 944       *
 945       * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 946       *
 947       * @param    string        The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 948       * @return    integer        Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 949       * @access private
 950       */
 951  	function initCharset($charset)    {
 952              // Only process if the charset is not yet loaded:
 953          if (!is_array($this->parsedCharsets[$charset]))    {
 954  
 955                  // Conversion table filename:
 956              $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
 957  
 958                  // If the conversion table is found:
 959              if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))    {
 960                      // Cache file for charsets:
 961                      // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
 962                  $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
 963                  if ($cacheFile && @is_file($cacheFile))    {
 964                      $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
 965                  } else {
 966                          // Parse conversion table into lines:
 967                      $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
 968                          // Initialize the internal variable holding the conv. table:
 969                      $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());
 970                          // traverse the lines:
 971                      $detectedType='';
 972                      foreach($lines as $value)    {
 973                          if (trim($value) && substr($value,0,1)!='#')    {    // Comment line or blanks are ignored.
 974  
 975                                  // Detect type if not done yet: (Done on first real line)
 976                                  // The "whitespaced" type is on the syntax     "0x0A    0x000A    #LINE FEED"     while     "ms-token" is like         "B9 = U+00B9 : SUPERSCRIPT ONE"
 977                              if (!$detectedType)        $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ? 'whitespaced' : 'ms-token';
 978  
 979                              if ($detectedType=='ms-token')    {
 980                                  list($hexbyte,$utf8) = split('=|:',$value,3);
 981                              } elseif ($detectedType=='whitespaced')    {
 982                                  $regA=array();
 983                                  ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA);
 984                                  $hexbyte = $regA[1];
 985                                  $utf8 = 'U+'.$regA[2];
 986                              }
 987                              $decval = hexdec(trim($hexbyte));
 988                              if ($decval>127)    {
 989                                  $utf8decval = hexdec(substr(trim($utf8),2));
 990                                  $this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);
 991                                  $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval;
 992                              }
 993                          }
 994                      }
 995                      if ($cacheFile)    {
 996                          t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
 997                      }
 998                  }
 999                  return 2;
1000              } else return false;
1001          } else return 1;
1002      }
1003  
1004      /**
1005       * This function initializes all UTF-8 character data tables.
1006       *
1007       * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1008       *
1009       * @param    string        Mode ("case", "ascii", ...)
1010       * @return    integer        Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1011       * @access private
1012       */
1013  	function initUnicodeData($mode=null)    {
1014              // cache files
1015          $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1016          $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1017  
1018              // Only process if the tables are not yet loaded
1019          switch($mode)    {
1020              case 'case':
1021                  if (is_array($this->caseFolding['utf-8']))    return 1;
1022  
1023                      // Use cached version if possible
1024                  if ($cacheFileCase && @is_file($cacheFileCase))    {
1025                      $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
1026                      return 2;
1027                  }
1028                  break;
1029  
1030              case 'ascii':
1031                  if (is_array($this->toASCII['utf-8']))    return 1;
1032  
1033                      // Use cached version if possible
1034                  if ($cacheFileASCII && @is_file($cacheFileASCII))    {
1035                      $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
1036                      return 2;
1037                  }
1038                  break;
1039          }
1040  
1041              // process main Unicode data file
1042          $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
1043          if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile)))    return false;
1044  
1045          $fh = fopen($unicodeDataFile,'rb');
1046          if (!$fh)    return false;
1047  
1048              // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1049              // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1050          $this->caseFolding['utf-8'] = array();
1051          $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
1052          $utf8CaseFolding['toUpper'] = array();
1053          $utf8CaseFolding['toLower'] = array();
1054          $utf8CaseFolding['toTitle'] = array();
1055  
1056          $decomposition = array();    // array of temp. decompositions
1057          $mark = array();        // array of chars that are marks (eg. composing accents)
1058          $number = array();        // array of chars that are numbers (eg. digits)
1059          $omit = array();        // array of chars to be omitted (eg. Russian hard sign)
1060  
1061          while (!feof($fh))    {
1062              $line = fgets($fh,4096);
1063                  // has a lot of info
1064              list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line));
1065  
1066              $ord = hexdec($char);
1067              if ($ord > 0xFFFF)    break;    // only process the BMP
1068  
1069              $utf8_char = $this->UnumberToChar($ord);
1070  
1071              if ($upper)    $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1072              if ($lower)    $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1073                  // store "title" only when different from "upper" (only a few)
1074              if ($title && $title != $upper)    $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1075  
1076              switch ($cat{0})    {
1077                  case 'M':    // mark (accent, umlaut, ...)
1078                      $mark["U+$char"] = 1;
1079                      break;
1080  
1081                  case 'N':    // numeric value
1082                      if ($ord > 0x80 && $num != '')    $number["U+$char"] = $num;
1083              }
1084  
1085                  // accented Latin letters without "official" decomposition
1086              $match = array();
1087              if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp)    {
1088                  $c = ord($match[2]);
1089                  if ($match[1] == 'SMALL')    $c += 32;
1090  
1091                  $decomposition["U+$char"] = array(dechex($c));
1092                  continue;
1093              }
1094  
1095              $match = array();
1096              if (ereg('(<.*>)? *(.+)',$decomp,$match))    {
1097                  switch($match[1])    {
1098                      case '<circle>':    // add parenthesis as circle replacement, eg (1)
1099                          $match[2] = '0028 '.$match[2].' 0029';
1100                          break;
1101  
1102                      case '<square>':    // add square brackets as square replacement, eg [1]
1103                          $match[2] = '005B '.$match[2].' 005D';
1104                          break;
1105  
1106                      case '<compat>':    // ignore multi char decompositions that start with a space
1107                          if (ereg('^0020 ',$match[2]))    continue 2;
1108                          break;
1109  
1110                          // ignore Arabic and vertical layout presentation decomposition
1111                      case '<initial>':
1112                      case '<medial>':
1113                      case '<final>':
1114                      case '<isolated>':
1115                      case '<vertical>':
1116                          continue 2;
1117                  }
1118                  $decomposition["U+$char"] = split(' ',$match[2]);
1119              }
1120          }
1121          fclose($fh);
1122  
1123              // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1124          $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
1125          if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))    {
1126              $fh = fopen($specialCasingFile,'rb');
1127              if ($fh)    {
1128                  while (!feof($fh))    {
1129                      $line = fgets($fh,4096);
1130                      if ($line{0} != '#' && trim($line) != '')    {
1131  
1132                          list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
1133                          if ($cond == '' || $cond{0} == '#')    {
1134                              $utf8_char = $this->UnumberToChar(hexdec($char));
1135                              if ($char != $lower)    {
1136                                  $arr = split(' ',$lower);
1137                                  for ($i=0; isset($arr[$i]); $i++)    $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1138                                  $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
1139                              }
1140                              if ($char != $title && $title != $upper)    {
1141                                  $arr = split(' ',$title);
1142                                  for ($i=0; isset($arr[$i]); $i++)    $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1143                                  $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
1144                              }
1145                              if ($char != $upper)    {
1146                                      $arr = split(' ',$upper);
1147                                  for ($i=0; isset($arr[$i]); $i++)    $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1148                                  $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
1149                              }
1150                          }
1151                      }
1152                  }
1153                  fclose($fh);
1154              }
1155          }
1156  
1157              // process custom decompositions
1158          $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
1159          if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))    {
1160              $fh = fopen($customTranslitFile,'rb');
1161              if ($fh)    {
1162                  while (!feof($fh))    {
1163                      $line = fgets($fh,4096);
1164                      if ($line{0} != '#' && trim($line) != '')    {
1165                          list($char,$translit) = t3lib_div::trimExplode(';', $line);
1166                          if (!$translit)    $omit["U+$char"] = 1;
1167                          $decomposition["U+$char"] = split(' ', $translit);
1168  
1169                      }
1170                  }
1171                  fclose($fh);
1172              }
1173          }
1174  
1175              // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1176          foreach($decomposition as $from => $to)    {
1177              $code_decomp = array();
1178  
1179              while ($code_value = array_shift($to))    {
1180                  if (isset($decomposition["U+$code_value"]))    {    // do recursive decomposition
1181                      foreach(array_reverse($decomposition["U+$code_value"]) as $cv)    {
1182                          array_unshift($to, $cv);
1183                      }
1184                  } elseif (!isset($mark["U+$code_value"])) {    // remove mark
1185                      array_push($code_decomp, $code_value);
1186                  }
1187              }
1188              if (count($code_decomp) || isset($omit[$from]))    {
1189                  $decomposition[$from] = $code_decomp;
1190              } else {
1191                  unset($decomposition[$from]);
1192              }
1193          }
1194  
1195              // create ascii only mapping
1196          $this->toASCII['utf-8'] = array();
1197          $ascii =& $this->toASCII['utf-8'];
1198  
1199          foreach($decomposition as $from => $to)    {
1200              $code_decomp = array();
1201              while ($code_value = array_shift($to))    {
1202                  $ord = hexdec($code_value);
1203                  if ($ord > 127)
1204                      continue 2;    // skip decompositions containing non-ASCII chars
1205                  else
1206                      array_push($code_decomp,chr($ord));
1207              }
1208              $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
1209          }
1210  
1211              // add numeric decompositions
1212          foreach($number as $from => $to)    {
1213              $utf8_char = $this->UnumberToChar(hexdec($from));
1214              if (!isset($ascii[$utf8_char]))    {
1215                  $ascii[$utf8_char] = $to;
1216              }
1217          }
1218  
1219          if ($cacheFileCase)    {
1220                  t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
1221          }
1222  
1223          if ($cacheFileASCII)    {
1224                  t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
1225          }
1226  
1227          return 3;
1228      }
1229  
1230      /**
1231       * This function initializes the folding table for a charset other than UTF-8.
1232       * This function is automatically called by the case folding functions.
1233       *
1234       * @param    string        Charset for which to initialize case folding.
1235       * @return    integer        Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1236       * @access private
1237       */
1238  	function initCaseFolding($charset)    {
1239              // Only process if the case table is not yet loaded:
1240          if (is_array($this->caseFolding[$charset]))    return 1;
1241  
1242              // Use cached version if possible
1243          $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
1244          if ($cacheFile && @is_file($cacheFile))    {
1245              $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1246              return 2;
1247          }
1248  
1249              // init UTF-8 conversion for this charset
1250          if (!$this->initCharset($charset))    {
1251              return false;
1252          }
1253  
1254              // UTF-8 case folding is used as the base conversion table
1255          if (!$this->initUnicodeData('case'))    {
1256              return false;
1257          }
1258  
1259          $nochar = chr($this->noCharByteVal);
1260          foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)    {
1261                  // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1262              $c = $this->utf8_decode($utf8, $charset);
1263  
1264                  // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
1265              $cc = $this->utf8_decode($this->caseFolding['utf-8']['toUpper'][$utf8], $charset);
1266              if ($cc != '' && $cc != $nochar)    $this->caseFolding[$charset]['toUpper'][$c] = $cc;
1267  
1268                  // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
1269              $cc = $this->utf8_decode($this->caseFolding['utf-8']['toLower'][$utf8], $charset);
1270              if ($cc != '' && $cc != $nochar)    $this->caseFolding[$charset]['toLower'][$c] = $cc;
1271  
1272                  // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
1273              $cc = $this->utf8_decode($this->caseFolding['utf-8']['toTitle'][$utf8], $charset);
1274              if ($cc != '' && $cc != $nochar)    $this->caseFolding[$charset]['toTitle'][$c] = $cc;
1275          }
1276  
1277              // add the ASCII case table
1278          for ($i=ord('a'); $i<=ord('z'); $i++)    {
1279              $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
1280          }
1281          for ($i=ord('A'); $i<=ord('Z'); $i++)    {
1282              $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
1283          }
1284  
1285          if ($cacheFile)    {
1286                  t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
1287          }
1288  
1289          return 3;
1290      }
1291  
1292      /**
1293       * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1294       * This function is automatically called by the ASCII transliteration functions.
1295       *
1296       * @param    string        Charset for which to initialize conversion.
1297       * @return    integer        Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1298       * @access private
1299       */
1300  	function initToASCII($charset)    {
1301              // Only process if the case table is not yet loaded:
1302          if (is_array($this->toASCII[$charset]))    return 1;
1303  
1304              // Use cached version if possible
1305          $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
1306          if ($cacheFile && @is_file($cacheFile))    {
1307              $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1308              return 2;
1309          }
1310  
1311              // init UTF-8 conversion for this charset
1312          if (!$this->initCharset($charset))    {
1313              return false;
1314          }
1315  
1316              // UTF-8/ASCII transliteration is used as the base conversion table
1317          if (!$this->initUnicodeData('ascii'))    {
1318              return false;
1319          }
1320  
1321          $nochar = chr($this->noCharByteVal);
1322          foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)    {
1323                  // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1324              $c = $this->utf8_decode($utf8, $charset);
1325  
1326              if (isset($this->toASCII['utf-8'][$utf8]))    {
1327                  $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
1328              }
1329          }
1330  
1331          if ($cacheFile)    {
1332                  t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
1333          }
1334  
1335          return 3;
1336      }
1337  
1338  
1339  
1340  
1341  
1342  
1343  
1344  
1345  
1346  
1347  
1348  
1349  
1350  
1351  
1352  
1353      /********************************************
1354       *
1355       * String operation functions
1356       *
1357       ********************************************/
1358  
1359      /**
1360       * Returns a part of a string.
1361       * Unit-tested by Kasper (single byte charsets only)
1362       *
1363       * @param    string        The character set
1364       * @param    string        Character string
1365       * @param    integer        Start position (character position)
1366       * @param    integer        Length (in characters)
1367       * @return    string        The substring
1368       * @see substr(), mb_substr()
1369       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1370       */
1371  	function substr($charset,$string,$start,$len=null)    {
1372          if ($len===0)    return '';
1373  
1374          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')    {
1375                  // cannot omit $len, when specifying charset
1376              if ($len==null)    {
1377                  $enc = mb_internal_encoding();    // save internal encoding
1378                  mb_internal_encoding($charset);
1379                  $str = mb_substr($string,$start);
1380                  mb_internal_encoding($enc);    // restore internal encoding
1381  
1382                  return $str;
1383              }
1384              else {
1385                  return mb_substr($string,$start,$len,$charset);
1386              }
1387          } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')    {
1388                  // cannot omit $len, when specifying charset
1389              if ($len==null)    {
1390                  $enc = iconv_get_encoding('internal_encoding');    // save internal encoding
1391                  iconv_set_encoding('internal_encoding',$charset);
1392                  $str = iconv_substr($string,$start);
1393                  iconv_set_encoding('internal_encoding',$enc);    // restore internal encoding
1394  
1395                  return $str;
1396              }
1397              else {
1398                  return iconv_substr($string,$start,$len,$charset);
1399              }
1400          } elseif ($charset == 'utf-8')    {
1401              return $this->utf8_substr($string,$start,$len);
1402          } elseif ($this->eucBasedSets[$charset])    {
1403              return $this->euc_substr($string,$start,$charset,$len);
1404          } elseif ($this->twoByteSets[$charset])    {
1405              return substr($string,$start*2,$len*2);
1406          } elseif ($this->fourByteSets[$charset])    {
1407              return substr($string,$start*4,$len*4);
1408          }
1409  
1410          // treat everything else as single-byte encoding
1411          return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
1412      }
1413  
1414      /**
1415       * Counts the number of characters.
1416       * Unit-tested by Kasper (single byte charsets only)
1417       *
1418       * @param    string        The character set
1419       * @param    string        Character string
1420       * @return    integer        The number of characters
1421       * @see strlen()
1422       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1423       */
1424  	function strlen($charset,$string)    {
1425          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')    {
1426              return mb_strlen($string,$charset);
1427          } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')    {
1428              return iconv_strlen($string,$charset);
1429          } elseif ($charset == 'utf-8')    {
1430              return $this->utf8_strlen($string);
1431          } elseif ($this->eucBasedSets[$charset])    {
1432              return $this->euc_strlen($string,$charset);
1433          } elseif ($this->twoByteSets[$charset])    {
1434              return strlen($string)/2;
1435          } elseif ($this->fourByteSets[$charset])    {
1436              return strlen($string)/4;
1437          }
1438          // treat everything else as single-byte encoding
1439          return strlen($string);
1440      }
1441  
1442      /**
1443       * Truncates a string and pre-/appends a string.
1444       * Unit tested by Kasper
1445       *
1446       * @param    string        The character set
1447       * @param    string        Character string
1448       * @param    integer        Length (in characters)
1449       * @param    string        Crop signifier
1450       * @return    string        The shortened string
1451       * @see substr(), mb_strimwidth()
1452       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1453       */
1454  	function crop($charset,$string,$len,$crop='')    {
1455          if (intval($len) == 0)    return $string;
1456  
1457          if ($charset == 'utf-8')    {
1458              $i = $this->utf8_char2byte_pos($string,$len);
1459          } elseif ($this->eucBasedSets[$charset])    {
1460              $i = $this->euc_char2byte_pos($string,$len,$charset);
1461          } else {
1462              if ($len > 0)    {
1463                  $i = $len;
1464              } else {
1465                  $i = strlen($string)+$len;
1466                  if ($i<=0)    $i = false;
1467              }
1468          }
1469  
1470          if ($i === false)    {    // $len outside actual string length
1471              return $string;
1472          } else    {
1473              if ($len > 0)    {
1474                  if (strlen($string{$i}))    {
1475                      return substr($string,0,$i).$crop;
1476  
1477                  }
1478              } else {
1479                  if (strlen($string{$i-1}))    {
1480                      return $crop.substr($string,$i);
1481                  }
1482              }
1483  
1484  /*
1485              if (abs($len)<$this->strlen($charset,$string))    {    // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return true here (which is not a catastrophe, but...)
1486                  if ($len > 0)    {
1487                      return substr($string,0,$i).$crop;
1488                  } else {
1489                      return $crop.substr($string,$i);
1490                  }
1491              }
1492  */
1493          }
1494          return $string;
1495      }
1496  
1497      /**
1498       * Cuts a string short at a given byte length.
1499       *
1500       * @param    string        The character set
1501       * @param    string        Character string
1502       * @param    integer        The byte length
1503       * @return    string        The shortened string
1504       * @see mb_strcut()
1505       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1506       */
1507  	function strtrunc($charset,$string,$len)    {
1508          if ($len <= 0)    return '';
1509  
1510          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')    {
1511              return mb_strcut($string,0,$len,$charset);
1512          } elseif ($charset == 'utf-8')    {
1513              return $this->utf8_strtrunc($string,$len);
1514          } elseif ($this->eucBasedSets[$charset])    {
1515              return $this->euc_strtrunc($string,$charset);
1516          } elseif ($this->twoByteSets[$charset])    {
1517              if ($len % 2)    $len--;        // don't cut at odd positions
1518          } elseif ($this->fourByteSets[$charset])    {
1519              $x = $len % 4;
1520              $len -= $x;    // realign to position dividable by four
1521          }
1522          // treat everything else as single-byte encoding
1523          return substr($string,0,$len);
1524      }
1525  
1526      /**
1527       * Translates all characters of a string into their respective case values.
1528       * Unlike strtolower() and strtoupper() this method is locale independent.
1529       * Note that the string length may change!
1530       * eg. lower case German �(sharp S) becomes upper case "SS"
1531       * Unit-tested by Kasper
1532       * Real case folding is language dependent, this method ignores this fact.
1533       *
1534       * @param    string        Character set of string
1535       * @param    string        Input string to convert case for
1536       * @param    string        Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1537       * @return    string        The converted string
1538       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1539       * @see strtolower(), strtoupper()
1540       */
1541  	function conv_case($charset,$string,$case)    {
1542          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && (float)phpversion() >= 4.3)    {
1543              if ($case == 'toLower')    {
1544                  $string = mb_strtolower($string,$charset);
1545              } else {
1546                  $string = mb_strtoupper($string,$charset);
1547              }
1548          } elseif ($charset == 'utf-8')    {
1549              $string = $this->utf8_char_mapping($string,'case',$case);
1550          } elseif (isset($this->eucBasedSets[$charset]))    {
1551              $string = $this->euc_char_mapping($string,$charset,'case',$case);
1552          } else {
1553                  // treat everything else as single-byte encoding
1554              $string = $this->sb_char_mapping($string,$charset,'case',$case);
1555          }
1556  
1557          return $string;
1558      }
1559  
1560      /**
1561       * Converts special chars (like ���, umlauts etc) to ascii equivalents (usually double-bytes, like �=> ae etc.)
1562       *
1563       * @param    string        Character set of string
1564       * @param    string        Input string to convert
1565       * @return    string        The converted string
1566       */
1567  	function specCharsToASCII($charset,$string)    {
1568          if ($charset == 'utf-8')    {
1569              $string = $this->utf8_char_mapping($string,'ascii');
1570          } elseif (isset($this->eucBasedSets[$charset]))    {
1571              $string = $this->euc_char_mapping($string,$charset,'ascii');
1572          } else {
1573                  // treat everything else as single-byte encoding
1574              $string = $this->sb_char_mapping($string,$charset,'ascii');
1575          }
1576  
1577          return $string;
1578      }
1579  
1580  
1581  
1582  
1583  
1584  
1585  
1586  
1587  
1588  
1589  
1590  
1591      /********************************************
1592       *
1593       * Internal string operation functions
1594       *
1595       ********************************************/
1596  
1597      /**
1598       * Maps all characters of a string in a single byte charset.
1599       *
1600       * @param    string        the string
1601       * @param    string        the charset
1602       * @param    string        mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1603       * @param    string        'case': conversion 'toLower' or 'toUpper'
1604       * @return    string        the converted string
1605       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1606       */
1607  	function sb_char_mapping($str,$charset,$mode,$opt='')    {
1608          switch($mode)    {
1609              case 'case':
1610                  if (!$this->initCaseFolding($charset))    return $str;    // do nothing
1611                  $map =& $this->caseFolding[$charset][$opt];
1612                  break;
1613  
1614              case 'ascii':
1615                  if (!$this->initToASCII($charset))    return $str;    // do nothing
1616                  $map =& $this->toASCII[$charset];
1617                  break;
1618  
1619              default:
1620                  return $str;
1621          }
1622  
1623          $out = '';
1624          for($i=0; strlen($str{$i}); $i++)    {
1625              $c = $str{$i};
1626              if (isset($map[$c]))    {
1627                  $out .= $map[$c];
1628              } else {
1629                  $out .= $c;
1630              }
1631          }
1632  
1633          return $out;
1634      }
1635  
1636  
1637  
1638  
1639  
1640  
1641  
1642  
1643  
1644  
1645      /********************************************
1646       *
1647       * Internal UTF-8 string operation functions
1648       *
1649       ********************************************/
1650  
1651      /**
1652       * Returns a part of a UTF-8 string.
1653       * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1654       *
1655       * @param    string        UTF-8 string
1656       * @param    integer        Start position (character position)
1657       * @param    integer        Length (in characters)
1658       * @return    string        The substring
1659       * @see substr()
1660       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1661       */
1662  	function utf8_substr($str,$start,$len=null)    {
1663          if (!strcmp($len,'0'))    return '';
1664  
1665          $byte_start = $this->utf8_char2byte_pos($str,$start);
1666          if ($byte_start === false)    {
1667              if ($start > 0)    {
1668                  return false;    // $start outside string length
1669              } else {
1670                  $start = 0;
1671              }
1672          }
1673  
1674          $str = substr($str,$byte_start);
1675  
1676          if ($len!=null)    {
1677              $byte_end = $this->utf8_char2byte_pos($str,$len);
1678              if ($byte_end === false)    // $len outside actual string length
1679                  return $len<0 ? '' : $str;    // When length is less than zero and exceeds, then we return blank string.
1680              else
1681                  return substr($str,0,$byte_end);
1682          }
1683          else    return $str;
1684      }
1685  
1686      /**
1687       * Counts the number of characters of a string in UTF-8.
1688       * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1689       *
1690       * @param    string        UTF-8 multibyte character string
1691       * @return    integer        The number of characters
1692       * @see strlen()
1693       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1694       */
1695  	function utf8_strlen($str)    {
1696          $n=0;
1697          for($i=0; strlen($str{$i}); $i++)    {
1698              $c = ord($str{$i});
1699              if (!($c & 0x80))    // single-byte (0xxxxxx)
1700                  $n++;
1701              elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
1702                  $n++;
1703          }
1704          return $n;
1705      }
1706  
1707      /**
1708       * Truncates a string in UTF-8 short at a given byte length.
1709       *
1710       * @param    string        UTF-8 multibyte character string
1711       * @param    integer        the byte length
1712       * @return    string        the shortened string
1713       * @see mb_strcut()
1714       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1715       */
1716  	function utf8_strtrunc($str,$len)    {
1717          $i = $len-1;
1718          if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
1719              for (; $i>0 && !(ord($str{$i}) & 0x40); $i--)    ;    // find the first byte
1720              if ($i <= 0)    return ''; // sanity check
1721              for ($bc=0, $mbs=ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1)    $bc++;    // calculate number of bytes
1722              if ($bc+$i > $len)    return substr($str,0,$i);
1723                          // fallthru: multibyte char fits into length
1724          }
1725          return substr($str,0,$len);
1726      }
1727  
1728      /**
1729       * Find position of first occurrence of a string, both arguments are in UTF-8.
1730       *
1731       * @param    string        UTF-8 string to search in
1732       * @param    string        UTF-8 string to search for
1733       * @param    integer        Positition to start the search
1734       * @return    integer        The character position
1735       * @see strpos()
1736       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1737       */
1738  	function utf8_strpos($haystack,$needle,$offset=0)    {
1739          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')    {
1740              return mb_strpos($haystack,$needle,$offset,'utf-8');
1741          } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')    {
1742              return iconv_strpos($haystack,$needle,$offset,'utf-8');
1743          }
1744  
1745          $byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
1746          if ($byte_offset === false)    return false; // offset beyond string length
1747  
1748          $byte_pos = strpos($haystack,$needle,$byte_offset);
1749          if ($byte_pos === false)    return false; // needle not found
1750  
1751          return $this->utf8_byte2char_pos($haystack,$byte_pos);
1752      }
1753  
1754      /**
1755       * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1756       *
1757       * @param    string        UTF-8 string to search in
1758       * @param    string        UTF-8 character to search for (single character)
1759       * @return    integer        The character position
1760       * @see strrpos()
1761       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1762       */
1763  	function utf8_strrpos($haystack,$needle)    {
1764          if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')    {
1765              return mb_strrpos($haystack,$needle,'utf-8');
1766          } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')    {
1767              return iconv_strrpos($haystack,$needle,'utf-8');
1768          }
1769  
1770          $byte_pos = strrpos($haystack,$needle);
1771          if ($byte_pos === false)    return false; // needle not found
1772  
1773          return $this->utf8_byte2char_pos($haystack,$byte_pos);
1774      }
1775  
1776      /**
1777       * Translates a character position into an 'absolute' byte position.
1778       * Unit tested by Kasper.
1779       *
1780       * @param    string        UTF-8 string
1781       * @param    integer        Character position (negative values start from the end)
1782       * @return    integer        Byte position
1783       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1784       */
1785  	function utf8_char2byte_pos($str,$pos)    {
1786          $n = 0;                // number of characters found
1787          $p = abs($pos);        // number of characters wanted
1788  
1789          if ($pos >= 0)    {
1790              $i = 0;
1791              $d = 1;
1792          } else {
1793              $i = strlen($str)-1;
1794              $d = -1;
1795          }
1796  
1797          for( ; strlen($str{$i}) && $n<$p; $i+=$d)    {
1798              $c = (int)ord($str{$i});
1799              if (!($c & 0x80))    // single-byte (0xxxxxx)
1800                  $n++;
1801              elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
1802                  $n++;
1803          }
1804          if (!strlen($str{$i}))    return false; // offset beyond string length
1805  
1806          if ($pos >= 0)    {
1807                  // skip trailing multi-byte data bytes
1808              while ((ord($str{$i}) & 0x80) && !(ord($str{$i}) & 0x40)) { $i++; }
1809          } else {
1810                  // correct offset
1811              $i++;
1812          }
1813  
1814          return $i;
1815      }
1816  
1817      /**
1818       * Translates an 'absolute' byte position into a character position.
1819       * Unit tested by Kasper.
1820       *
1821       * @param    string        UTF-8 string
1822       * @param    integer        byte position
1823       * @return    integer        character position
1824       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1825       */
1826  	function utf8_byte2char_pos($str,$pos)    {
1827          $n = 0;    // number of characters
1828          for($i=$pos; $i>0; $i--)    {
1829              $c = (int)ord($str{$i});
1830              if (!($c & 0x80))    // single-byte (0xxxxxx)
1831                  $n++;
1832              elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
1833                  $n++;
1834          }
1835          if (!strlen($str{$i}))    return false; // offset beyond string length
1836  
1837          return $n;
1838      }
1839  
1840      /**
1841       * Maps all characters of an UTF-8 string.
1842       *
1843       * @param    string        UTF-8 string
1844       * @param    string        mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1845       * @param    string        'case': conversion 'toLower' or 'toUpper'
1846       * @return    string        the converted string
1847       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1848       */
1849  	function utf8_char_mapping($str,$mode,$opt='')    {
1850          if (!$this->initUnicodeData($mode))    return $str;    // do nothing
1851  
1852          $out = '';
1853          switch($mode)    {
1854              case 'case':
1855                  $map =& $this->caseFolding['utf-8'][$opt];
1856                  break;
1857  
1858              case 'ascii':
1859                  $map =& $this->toASCII['utf-8'];
1860                  break;
1861  
1862              default:
1863                  return $str;
1864          }
1865  
1866          for($i=0; strlen($str{$i}); $i++)    {
1867              $c = ord($str{$i});
1868              if (!($c & 0x80))    // single-byte (0xxxxxx)
1869                  $mbc = $str{$i};
1870              elseif (($c & 0xC0) == 0xC0)    {    // multi-byte starting byte (11xxxxxx)
1871                  for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }    // calculate number of bytes
1872                  $mbc = substr($str,$i,$bc);
1873                  $i += $bc-1;
1874              }
1875  
1876              if (isset($map[$mbc]))    {
1877                  $out .= $map[$mbc];
1878              } else {
1879                  $out .= $mbc;
1880              }
1881          }
1882  
1883          return $out;
1884      }
1885  
1886  
1887  
1888  
1889  
1890  
1891  
1892  
1893  
1894  
1895  
1896  
1897  
1898  
1899  
1900  
1901  
1902  
1903      /********************************************
1904       *
1905       * Internal EUC string operation functions
1906       *
1907       * Extended Unix Code:
1908       *  ASCII compatible 7bit single bytes chars
1909       *  8bit two byte chars
1910       *
1911       * Shift-JIS is treated as a special case.
1912       *
1913       ********************************************/
1914  
1915      /**
1916       * Cuts a string in the EUC charset family short at a given byte length.
1917       *
1918       * @param    string        EUC multibyte character string
1919       * @param    integer        the byte length
1920       * @param    string        the charset
1921       * @return    string        the shortened string
1922       * @see mb_strcut()
1923       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1924       */
1925  	function euc_strtrunc($str,$len,$charset)     {
1926          $sjis = ($charset == 'shift_jis');
1927          for ($i=0; strlen($str{$i}) && $i<$len; $i++) {
1928              $c = ord($str{$i});
1929              if ($sjis)    {
1930                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))    $i++;    // advance a double-byte char
1931              }
1932              else    {
1933                  if ($c >= 0x80)    $i++;    // advance a double-byte char
1934              }
1935          }
1936          if (!strlen($str{$i}))    return $str;    // string shorter than supplied length
1937  
1938          if ($i>$len)
1939              return substr($str,0,$len-1);    // we ended on a first byte
1940          else
1941              return substr($str,0,$len);
1942          }
1943  
1944      /**
1945       * Returns a part of a string in the EUC charset family.
1946       *
1947       * @param    string        EUC multibyte character string
1948       * @param    integer        start position (character position)
1949       * @param    string        the charset
1950       * @param    integer        length (in characters)
1951       * @return    string        the substring
1952       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1953       */
1954  	function euc_substr($str,$start,$charset,$len=null)    {
1955          $byte_start = $this->euc_char2byte_pos($str,$start,$charset);
1956          if ($byte_start === false)    return false;    // $start outside string length
1957  
1958          $str = substr($str,$byte_start);
1959  
1960          if ($len!=null)    {
1961              $byte_end = $this->euc_char2byte_pos($str,$len,$charset);
1962              if ($byte_end === false)    // $len outside actual string length
1963                  return $str;
1964              else
1965                  return substr($str,0,$byte_end);
1966          }
1967          else    return $str;
1968      }
1969  
1970      /**
1971       * Counts the number of characters of a string in the EUC charset family.
1972       *
1973       * @param    string        EUC multibyte character string
1974       * @param    string        the charset
1975       * @return    integer        the number of characters
1976       * @see strlen()
1977       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
1978       */
1979  	function euc_strlen($str,$charset)     {
1980          $sjis = ($charset == 'shift_jis');
1981          $n=0;
1982          for ($i=0; strlen($str{$i}); $i++) {
1983              $c = ord($str{$i});
1984              if ($sjis)    {
1985                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))    $i++;    // advance a double-byte char
1986              }
1987              else    {
1988                  if ($c >= 0x80)    $i++;    // advance a double-byte char
1989              }
1990  
1991              $n++;
1992          }
1993  
1994          return $n;
1995      }
1996  
1997      /**
1998       * Translates a character position into an 'absolute' byte position.
1999       *
2000       * @param    string        EUC multibyte character string
2001       * @param    integer        character position (negative values start from the end)
2002       * @param    string        the charset
2003       * @return    integer        byte position
2004       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
2005       */
2006  	function euc_char2byte_pos($str,$pos,$charset)    {
2007          $sjis = ($charset == 'shift_jis');
2008          $n = 0; // number of characters seen
2009          $p = abs($pos);    // number of characters wanted
2010  
2011          if ($pos >= 0)    {
2012              $i = 0;
2013              $d = 1;
2014          } else {
2015              $i = strlen($str)-1;
2016              $d = -1;
2017          }
2018  
2019          for ( ; strlen($str{$i}) && $n<$p; $i+=$d) {
2020              $c = ord($str{$i});
2021              if ($sjis)    {
2022                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))    $i+=$d;    // advance a double-byte char
2023              }
2024              else    {
2025                  if ($c >= 0x80)    $i+=$d;    // advance a double-byte char
2026              }
2027  
2028              $n++;
2029          }
2030          if (!strlen($str{$i}))    return false; // offset beyond string length
2031  
2032          if ($pos < 0)    $i++;    // correct offset
2033  
2034          return $i;
2035      }
2036  
2037      /**
2038       * Maps all characters of a string in the EUC charset family.
2039       *
2040       * @param    string        EUC multibyte character string
2041       * @param    string        the charset
2042       * @param    string        mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2043       * @param    string        'case': conversion 'toLower' or 'toUpper'
2044       * @return    string        the converted string
2045       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
2046       */
2047  	function euc_char_mapping($str,$charset,$mode,$opt='')    {
2048          switch($mode)    {
2049              case 'case':
2050                  if (!$this->initCaseFolding($charset))    return $str;    // do nothing
2051                  $map =& $this->caseFolding[$charset][$opt];
2052                  break;
2053  
2054              case 'ascii':
2055                  if (!$this->initToASCII($charset))    return $str;    // do nothing
2056                  $map =& $this->toASCII[$charset];
2057                  break;
2058  
2059              default:
2060                  return $str;
2061          }
2062  
2063          $sjis = ($charset == 'shift_jis');
2064          $out = '';
2065          for($i=0; strlen($str{$i}); $i++)    {
2066              $mbc = $str{$i};
2067              $c = ord($mbc);
2068  
2069              if ($sjis)    {
2070                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))    {    // a double-byte char
2071                      $mbc = substr($str,$i,2);
2072                      $i++;
2073                  }
2074              }
2075              else    {
2076                  if ($c >= 0x80)    {    // a double-byte char
2077                      $mbc = substr($str,$i,2);
2078                      $i++;
2079                  }
2080              }
2081  
2082              if (isset($map[$mbc]))    {
2083                  $out .= $map[$mbc];
2084              } else {
2085                  $out .= $mbc;
2086              }
2087          }
2088  
2089          return $out;
2090      }
2091  
2092  }
2093  
2094  if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])    {
2095      include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2096  }
2097  ?>


Generated: Wed Jan 14 11:33:29 2009 Cross-referenced by PHPXref 0.7