Main Page | Directories | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages | Examples

class.t3lib_cs.php

Go to the documentation of this file.
00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2003-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the Typo3 project. The Typo3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *
00017 *  This script is distributed in the hope that it will be useful,
00018 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00019 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020 *  GNU General Public License for more details.
00021 *
00022 *  This copyright notice MUST APPEAR in all copies of the script!
00023 ***************************************************************/
00136 class t3lib_cs {
00137    var $noCharByteVal=63;     // ASCII Value for chars with no equivalent.
00138 
00139       // This is the array where parsed conversion tables are stored (cached)
00140    var $parsedCharsets=array();
00141 
00142       // An array where case folding data will be stored (cached)
00143    var $caseFolding=array();
00144 
00145       // An array where charset-to-ASCII mappings are stored (cached)
00146    var $toASCII=array();
00147 
00148       // This tells the converter which charsets has two bytes per char:
00149    var $twoByteSets=array(
00150       'ucs-2'=>1, // 2-byte Unicode
00151    );
00152 
00153       // This tells the converter which charsets has four bytes per char:
00154    var $fourByteSets=array(
00155       'ucs-4'=>1, // 4-byte Unicode
00156       'utf-32'=>1,   // 4-byte Unicode (limited to the 21-bits of UTF-16)
00157    );
00158 
00159       // This tells the converter which charsets use a scheme like the Extended Unix Code:
00160    var $eucBasedSets=array(
00161       'gb2312'=>1,      // Chinese, simplified.
00162       'big5'=>1,     // Chinese, traditional.
00163       'euc-kr'=>1,      // Korean
00164       'shift_jis'=>1,      // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
00165    );
00166 
00167       // see   http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
00168       // http://czyborra.com/charsets/iso8859.html
00169    var $synonyms=array(
00170       'us' => 'ascii',
00171       'us-ascii'=> 'ascii',
00172       'cp819' => 'iso-8859-1',
00173       'ibm819' => 'iso-8859-1',
00174       'iso-ir-100' => 'iso-8859-1',
00175       'iso-ir-109' => 'iso-8859-2',
00176       'iso-ir-148' => 'iso-8859-9',
00177       'iso-ir-199' => 'iso-8859-14',
00178       'iso-ir-203' => 'iso-8859-15',
00179       'csisolatin1' => 'iso-8859-1',
00180       'csisolatin2' => 'iso-8859-2',
00181       'csisolatin3' => 'iso-8859-3',
00182       'csisolatin5' => 'iso-8859-9',
00183       'csisolatin8' => 'iso-8859-14',
00184       'csisolatin9' => 'iso-8859-15',
00185       'csisolatingreek' => 'iso-8859-7',
00186       'iso-celtic' => 'iso-8859-14',
00187       'latin1' => 'iso-8859-1',
00188       'latin2' => 'iso-8859-2',
00189       'latin3' => 'iso-8859-3',
00190       'latin5' => 'iso-8859-9',
00191       'latin6' => 'iso-8859-10',
00192       'latin8' => 'iso-8859-14',
00193       'latin9' => 'iso-8859-15',
00194       'l1' => 'iso-8859-1',
00195       'l2' => 'iso-8859-2',
00196       'l3' => 'iso-8859-3',
00197       'l5' => 'iso-8859-9',
00198       'l6' => 'iso-8859-10',
00199       'l8' => 'iso-8859-14',
00200       'l9' => 'iso-8859-15',
00201       'cyrillic' => 'iso-8859-5',
00202       'arabic' => 'iso-8859-6',
00203       'tis-620' => 'iso-8859-11',
00204       'win874' => 'windows-874',
00205       'win1250' => 'windows-1250',
00206       'win1251' => 'windows-1251',
00207       'win1252' => 'windows-1252',
00208       'win1253' => 'windows-1253',
00209       'win1254' => 'windows-1254',
00210       'win1255' => 'windows-1255',
00211       'win1256' => 'windows-1256',
00212       'win1257' => 'windows-1257',
00213       'win1258' => 'windows-1258',
00214       'cp1250' => 'windows-1250',
00215       'cp1251' => 'windows-1251',
00216       'cp1252' => 'windows-1252',
00217       'ms-ee' => 'windows-1250',
00218       'ms-ansi' => 'windows-1252',
00219       'ms-greek' => 'windows-1253',
00220       'ms-turk' => 'windows-1254',
00221       'winbaltrim' => 'windows-1257',
00222       'koi-8ru' => 'koi-8r',
00223       'koi8r' => 'koi-8r',
00224       'cp878' => 'koi-8r',
00225       'mac' => 'macroman',
00226       'macintosh' => 'macroman',
00227       'euc-cn' => 'gb2312',
00228       'x-euc-cn' => 'gb2312',
00229       'euccn' => 'gb2312',
00230       'cp936' => 'gb2312',
00231       'big-5' => 'big5',
00232       'cp950' => 'big5',
00233       'eucjp' => 'euc-jp',
00234       'sjis' => 'shift_jis',
00235       'shift-jis' => 'shift_jis',
00236       'cp932' => 'shift_jis',
00237       'cp949' => 'euc-kr',
00238       'utf7' => 'utf-7',
00239       'utf8' => 'utf-8',
00240       'utf16' => 'utf-16',
00241       'utf32' => 'utf-32',
00242       'utf8' => 'utf-8',
00243       'ucs2' => 'ucs-2',
00244       'ucs4' => 'ucs-4',
00245    );
00246 
00247       // mapping of iso-639:2 language codes to language (family) names
00248    var $lang_to_langfamily=array(
00249          // iso-639:2 language codes, see:
00250          //  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
00251          //  http://www.unicode.org/onlinedat/languages.html
00252       'ar' => 'arabic',
00253       'bg' => 'cyrillic',
00254       'cs' => 'east_european',
00255       'da' => 'west_european',
00256       'de' => 'west_european',
00257       'es' => 'west_european',
00258       'et' => 'estonian',
00259       'eu' => 'west_european',
00260       'fi' => 'west_european',
00261       'fr' => 'west_european',
00262       'gr' => 'greek',
00263       'hr' => 'east_european',
00264       'hu' => 'east_european',
00265       'iw' => 'hebrew',
00266       'is' => 'west_european',
00267       'it' => 'west_european',
00268       'ja' => 'japanese',
00269       'kl' => 'west_european',
00270       'ko' => 'korean',
00271       'lt' => 'lithuanian',
00272       'lv' => 'west_european', // Latvian/Lettish
00273       'nl' => 'west_european',
00274       'no' => 'west_european',
00275       'pl' => 'east_european',
00276       'pt' => 'west_european',
00277       'ro' => 'east_european',
00278       'ru' => 'cyrillic',
00279       'sk' => 'east_european',
00280       'sl' => 'east_european',
00281       'sv' => 'west_european',
00282       'th' => 'thai',
00283       'uk' => 'cyrillic',
00284       'vi' => 'vietnamese',
00285       'zh' => 'chinese',
00286          // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
00287       'chs' => 'simpl_chinese',
00288       'cht' => 'trad_chinese',
00289       'csy' => 'east_european',
00290       'dan' => 'west_european',
00291       'deu' => 'west_european',
00292       'dea' => 'west_european',
00293       'des' => 'west_european',
00294       'ena' => 'west_european',
00295       'enc' => 'west_european',
00296       'eng' => 'west_european',
00297       'enz' => 'west_european',
00298       'enu' => 'west_european',
00299       'nld' => 'west_european',
00300       'nlb' => 'west_european',
00301       'fin' => 'west_european',
00302       'fra' => 'west_european',
00303       'frb' => 'west_european',
00304       'frc' => 'west_european',
00305       'frs' => 'west_european',
00306       'ell' => 'greek',
00307       'hun' => 'east_european',
00308       'isl' => 'west_euorpean',
00309       'ita' => 'west_european',
00310       'its' => 'west_european',
00311       'jpn' => 'japanese',
00312       'kor' => 'korean',
00313       'nor' => 'west_european',
00314       'non' => 'west_european',
00315       'plk' => 'east_european',
00316       'ptg' => 'west_european',
00317       'ptb' => 'west_european',
00318       'rus' => 'east_european',
00319       'sky' => 'east_european',
00320       'esp' => 'west_european',
00321       'esm' => 'west_european',
00322       'esn' => 'west_european',
00323       'sve' => 'west_european',
00324       'trk' => 'turkish',
00325          // English language names
00326       'bulgarian' => 'east_european',
00327       'catalan' => 'west_european',
00328       'croatian' => 'east_european',
00329       'czech' => 'east_european',
00330       'danish' => 'west_european',
00331       'dutch' => 'west_european',
00332       'english' => 'west_european',
00333       'finnish' => 'west_european',
00334       'french' => 'west_european',
00335       'galician' => 'west_european',
00336       'german' => 'west_european',
00337       'hungarian' => 'east_european',
00338       'icelandic' => 'west_european',
00339       'italian' => 'west_european',
00340       'latvian' => 'west_european',
00341       'lettish' => 'west_european',
00342       'norwegian' => 'west_european',
00343       'polish' => 'east_european',
00344       'portuguese' => 'west_european',
00345       'russian' => 'cyrillic',
00346       'romanian' => 'east_european',
00347       'slovak' => 'east_european',
00348       'slovenian' => 'east_european',
00349       'spanish' => 'west_european',
00350       'svedish' => 'west_european',
00351       'turkish' => 'east_european',
00352       'ukrainian' => 'cyrillic',
00353    );
00354 
00355       // mapping of language (family) names to charsets on Unix
00356    var $lang_to_charset_unix=array(
00357       'west_european' => 'iso-8859-1',
00358       'estonian' => 'iso-8859-1',
00359       'east_european' => 'iso-8859-2',
00360       'baltic' => 'iso-8859-4',
00361       'cyrillic' => 'iso-8859-5',
00362       'arabic' => 'iso-8859-6',
00363       'greek' => 'iso-8859-7',
00364       'hebrew' => 'iso-8859-8',
00365       'turkish' => 'iso-8859-9',
00366       'thai' => 'iso-8859-11', // = TIS-620
00367       'lithuanian' => 'iso-8859-13',
00368       'chinese' => 'gb2312', // = euc-cn
00369       'japanese' => 'euc-jp',
00370       'korean' => 'euc-kr',
00371       'simpl_chinese' => 'gb2312',
00372       'trad_chinese' => 'big5',
00373       'vietnamese' => '',
00374    );
00375 
00376       // mapping of language (family) names to charsets on Windows
00377    var $lang_to_charset_windows=array(
00378       'east_european' => 'windows-1250',
00379       'cyrillic' => 'windows-1251',
00380       'west_european' => 'windows-1252',
00381       'greek' => 'windows-1253',
00382       'turkish' => 'windows-1254',
00383       'hebrew' => 'windows-1255',
00384       'arabic' => 'windows-1256',
00385       'baltic' => 'windows-1257',
00386       'estonian' => 'windows-1257',
00387       'lithuanian' => 'windows-1257',
00388       'vietnamese' => 'windows-1258',
00389       'thai' => 'cp874',
00390       'korean' => 'cp949',
00391       'chinese' => 'gb2312',
00392       'japanese' => 'shift_jis',
00393       'simpl_chinese' => 'gb2312',
00394       'trad_chinese' => 'big5',
00395    );
00396 
00397       // mapping of locale names to charsets
00398    var $locale_to_charset=array(
00399       'japanese.euc' => 'euc-jp',
00400       'ja_jp.ujis' => 'euc-jp',
00401       'korean.euc' => 'euc-kr',
00402       'zh_cn' => 'gb2312',
00403       'zh_hk' => 'big5',
00404       'zh_tw' => 'big5',
00405    );
00406 
00407       // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
00408       // Empty values means "iso-8859-1"
00409    var $charSetArray = array(
00410       'dk' => '',
00411       'de' => '',
00412       'no' => '',
00413       'it' => '',
00414       'fr' => '',
00415       'es' => '',
00416       'nl' => '',
00417       'cz' => 'windows-1250',
00418       'pl' => 'iso-8859-2',
00419       'si' => 'windows-1250',
00420       'fi' => '',
00421       'tr' => 'iso-8859-9',
00422       'se' => '',
00423       'pt' => '',
00424       'ru' => 'windows-1251',
00425       'ro' => 'iso-8859-2',
00426       'ch' => 'gb2312',
00427       'sk' => 'windows-1250',
00428       'lt' => 'windows-1257',
00429       'is' => 'utf-8',
00430       'hr' => 'windows-1250',
00431       'hu' => 'iso-8859-2',
00432       'gl' => '',
00433       'th' => 'iso-8859-11',
00434       'gr' => 'iso-8859-7',
00435       'hk' => 'big5',
00436       'eu' => '',
00437       'bg' => 'windows-1251',
00438       'br' => '',
00439       'et' => 'iso-8859-4',
00440       'ar' => 'iso-8859-6',
00441       'he' => 'utf-8',
00442       'ua' => 'windows-1251',
00443       'jp' => 'shift_jis',
00444       'lv' => 'utf-8',
00445       'vn' => 'utf-8',
00446       'ca' => 'iso-8859-15',
00447       'ba' => 'iso-8859-2',
00448       'kr' => 'euc-kr',
00449    );
00450 
00451       // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
00452       // Empty values means sames as Typo3
00453    var $isoArray = array(
00454       'dk' => 'da',
00455       'de' => '',
00456       'no' => '',
00457       'it' => '',
00458       'fr' => '',
00459       'es' => '',
00460       'nl' => '',
00461       'cz' => 'cs',
00462       'pl' => '',
00463       'si' => 'sl',
00464       'fi' => '',
00465       'tr' => '',
00466       'se' => 'sv',
00467       'pt' => '',
00468       'ru' => '',
00469       'ro' => '',
00470       'ch' => 'zh_CN',
00471       'sk' => '',
00472       'lt' => '',
00473       'is' => '',
00474       'hr' => '',
00475       'hu' => '',
00476       'gl' => '', // Greenlandic
00477       'th' => '',
00478       'gr' => 'el',
00479       'hk' => 'zh_HK',
00480       'eu' => '',
00481       'bg' => '',
00482       'br' => 'pt_BR',
00483       'et' => '',
00484       'ar' => '',
00485       'he' => 'iw',
00486       'ua' => 'uk',
00487       'jp' => 'ja',
00488       'lv' => '',
00489       'vn' => 'vi',
00490       'ca' => '',
00491       'ba' => '', // Bosnian
00492       'kr' => '',
00493    );
00494 
00502    function parse_charset($charset) {
00503       $charset = strtolower($charset);
00504       if (isset($this->synonyms[$charset]))  $charset = $this->synonyms[$charset];
00505 
00506       return $charset;
00507    }
00508 
00521    function get_locale_charset($locale)   {
00522       $locale = strtolower($locale);
00523 
00524          // exact locale specific charset?
00525       if (isset($this->locale_to_charset[$locale]))   return $this->locale_to_charset[$locale];
00526 
00527          // get modifier
00528       list($locale,$modifier) = explode('@',$locale);
00529 
00530          // locale contains charset: use it
00531       list($locale,$charset) = explode('.',$locale);
00532       if ($charset)  return $this->parse_charset($charset);
00533 
00534          // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
00535       if ($modifier == 'euro')   return 'iso-8859-15';
00536 
00537          // get language
00538       list($language,$country) = explode('_',$locale);
00539       if (isset($this->lang_to_langfamily[$language]))   $language = $this->lang_to_langfamily[$language];
00540 
00541       if (TYPO3_OS == 'WIN')  {
00542          $cs = $this->lang_to_charset_windows[$language];
00543       } else {
00544          $cs = $this->lang_to_charset_unix[$language];
00545       }
00546 
00547       return $cs ? $cs : 'iso-8859-1';
00548    }
00549 
00550 
00551 
00552 
00553 
00554 
00555 
00556 
00557 
00558    /********************************************
00559     *
00560     * Charset Conversion functions
00561     *
00562     ********************************************/
00563 
00574    function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)  {
00575       if ($fromCS==$toCS)  return $str;
00576 
00577          // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
00578       if ($toCS=='utf-8' || !$useEntityForNoChar)  {
00579          switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
00580          case 'mbstring':
00581             $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
00582             if (false !== $conv_str)   return $conv_str; // returns false for unsupported charsets
00583             break;
00584 
00585          case 'iconv':
00586             $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
00587             if (false !== $conv_str)   return $conv_str;
00588             break;
00589 
00590          case 'recode':
00591             $conv_str = recode_string($fromCS.'..'.$toCS,$str);
00592             if (false !== $conv_str)   return $conv_str;
00593             break;
00594          }
00595          // fallback to TYPO3 conversion
00596       }
00597 
00598       if ($fromCS!='utf-8')   $str=$this->utf8_encode($str,$fromCS);
00599       if ($toCS!='utf-8')  $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
00600       return $str;
00601    }
00602 
00614    function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)   {
00615       foreach($array as $key => $value)   {
00616          if (is_array($array[$key]))   {
00617             $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
00618          } else {
00619             $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
00620          }
00621       }
00622    }
00623 
00631    function utf8_encode($str,$charset) {
00632 
00633          // Charset is case-insensitive.
00634       if ($this->initCharset($charset))   {  // Parse conv. table if not already...
00635          $strLen = strlen($str);
00636          $outStr='';
00637 
00638          for ($a=0;$a<$strLen;$a++) {  // Traverse each char in string.
00639             $chr=substr($str,$a,1);
00640             $ord=ord($chr);
00641             if (isset($this->twoByteSets[$charset]))  {  // If the charset has two bytes per char
00642                $ord2 = ord($str{$a+1});
00643                $ord = $ord<<8 & $ord2; // assume big endian
00644 
00645                if (isset($this->parsedCharsets[$charset]['local'][$ord]))  {  // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
00646                   $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
00647                } else $outStr.=chr($this->noCharByteVal);   // No char exists
00648                $a++;
00649             } elseif ($ord>127)  {  // If char has value over 127 it's a multibyte char in UTF-8
00650                if (isset($this->eucBasedSets[$charset])) {  // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
00651                   $a++;
00652                   $ord2=ord(substr($str,$a,1));
00653                   $ord = $ord*256+$ord2;
00654                }
00655                elseif ($charset == 'shift_jis' && ($ord <160 || $ord>223)) {  // Shift-JIS is like EUC, but chars between 160 and 223 are single byte
00656                   $a++;
00657                   $ord2=ord(substr($str,$a,1));
00658                   $ord = $ord*256+$ord2;
00659                }
00660 
00661                if (isset($this->parsedCharsets[$charset]['local'][$ord]))  {  // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
00662                   $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
00663                } else $outStr.=chr($this->noCharByteVal);   // No char exists
00664             } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00665          }
00666          return $outStr;
00667       }
00668    }
00669 
00678    function utf8_decode($str,$charset,$useEntityForNoChar=0)   {
00679 
00680          // Charset is case-insensitive.
00681       if ($this->initCharset($charset))   {  // Parse conv. table if not already...
00682          $strLen = strlen($str);
00683          $outStr='';
00684          $buf='';
00685          for ($a=0,$i=0;$a<$strLen;$a++,$i++)   {  // Traverse each char in UTF-8 string.
00686             $chr=substr($str,$a,1);
00687             $ord=ord($chr);
00688             if ($ord>127)  {  // This means multibyte! (first byte!)
00689                if ($ord & 64) {  // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00690 
00691                   $buf=$chr;  // Add first byte
00692                   for ($b=0;$b<8;$b++) {  // for each byte in multibyte string...
00693                      $ord = $ord << 1; // Shift it left and ...
00694                      if ($ord & 128)   {  // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00695                         $a++; // Increase pointer...
00696                         $buf.=substr($str,$a,1);   // ... and add the next char.
00697                      } else break;
00698                   }
00699 
00700                   if (isset($this->parsedCharsets[$charset]['utf8'][$buf]))   {  // If the UTF-8 char-sequence is found then...
00701                      $mByte = $this->parsedCharsets[$charset]['utf8'][$buf];  // The local number
00702                      if ($mByte>255)   {  // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
00703                         $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
00704                      } else $outStr.= chr($mByte);
00705                   } elseif ($useEntityForNoChar) { // Create num entity:
00706                      $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
00707                   } else $outStr.=chr($this->noCharByteVal);   // No char exists
00708                } else $outStr.=chr($this->noCharByteVal);   // No char exists (MIDDLE of MB sequence!)
00709             } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00710          }
00711          return $outStr;
00712       }
00713    }
00714 
00721    function utf8_to_entities($str)  {
00722       $strLen = strlen($str);
00723       $outStr='';
00724       $buf='';
00725       for ($a=0;$a<$strLen;$a++) {  // Traverse each char in UTF-8 string.
00726          $chr=substr($str,$a,1);
00727          $ord=ord($chr);
00728          if ($ord>127)  {  // This means multibyte! (first byte!)
00729             if ($ord & 64) {  // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00730                $buf=$chr;  // Add first byte
00731                for ($b=0;$b<8;$b++) {  // for each byte in multibyte string...
00732                   $ord = $ord << 1; // Shift it left and ...
00733                   if ($ord & 128)   {  // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00734                      $a++; // Increase pointer...
00735                      $buf.=substr($str,$a,1);   // ... and add the next char.
00736                   } else break;
00737                }
00738 
00739                $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
00740             } else $outStr.=chr($this->noCharByteVal);   // No char exists (MIDDLE of MB sequence!)
00741          } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00742       }
00743 
00744       return $outStr;
00745    }
00746 
00754    function entities_to_utf8($str,$alsoStdHtmlEnt=0)  {
00755       if ($alsoStdHtmlEnt) {
00756          $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));     // Getting them in iso-8859-1 - but thats ok since this is observed below.
00757       }
00758 
00759       $token = md5(microtime());
00760       $parts = explode($token,ereg_replace('(&([#[:alnum:]]*);)',$token.'\2'.$token,$str));
00761       foreach($parts as $k => $v)   {
00762          if ($k%2)   {
00763             if (substr($v,0,1)=='#')   {  // Dec or hex entities:
00764                if (substr($v,1,1)=='x')   {
00765                   $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
00766                } else {
00767                   $parts[$k] = $this->UnumberToChar(substr($v,1));
00768                }
00769             } elseif ($alsoStdHtmlEnt && $trans_tbl['&'.$v.';']) {   // Other entities:
00770                $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
00771             } else { // No conversion:
00772                $parts[$k] ='&'.$v.';';
00773             }
00774          }
00775       }
00776 
00777       return implode('',$parts);
00778    }
00779 
00788    function utf8_to_numberarray($str,$convEntities=0,$retChar=0)  {
00789          // If entities must be registered as well...:
00790       if ($convEntities)   {
00791          $str = $this->entities_to_utf8($str,1);
00792       }
00793          // Do conversion:
00794       $strLen = strlen($str);
00795       $outArr=array();
00796       $buf='';
00797       for ($a=0;$a<$strLen;$a++) {  // Traverse each char in UTF-8 string.
00798          $chr=substr($str,$a,1);
00799          $ord=ord($chr);
00800          if ($ord>127)  {  // This means multibyte! (first byte!)
00801             if ($ord & 64) {  // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00802                $buf=$chr;  // Add first byte
00803                for ($b=0;$b<8;$b++) {  // for each byte in multibyte string...
00804                   $ord = $ord << 1; // Shift it left and ...
00805                   if ($ord & 128)   {  // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00806                      $a++; // Increase pointer...
00807                      $buf.=substr($str,$a,1);   // ... and add the next char.
00808                   } else break;
00809                }
00810 
00811                $outArr[]=$retChar?$buf:$this->utf8CharToUnumber($buf);
00812             } else $outArr[]=$retChar?chr($this->noCharByteVal):$this->noCharByteVal;  // No char exists (MIDDLE of MB sequence!)
00813          } else $outArr[]=$retChar?chr($ord):$ord; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00814       }
00815 
00816       return $outArr;
00817    }
00818 
00838    function UnumberToChar($cbyte)   {
00839       $str='';
00840 
00841       if ($cbyte < 0x80) {
00842          $str.=chr($cbyte);
00843       } else if ($cbyte < 0x800) {
00844          $str.=chr(0xC0 | ($cbyte >> 6));
00845          $str.=chr(0x80 | ($cbyte & 0x3F));
00846       } else if ($cbyte < 0x10000) {
00847          $str.=chr(0xE0 | ($cbyte >> 12));
00848          $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
00849          $str.=chr(0x80 | ($cbyte & 0x3F));
00850       } else if ($cbyte < 0x200000) {
00851          $str.=chr(0xF0 | ($cbyte >> 18));
00852          $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
00853          $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
00854          $str.=chr(0x80 | ($cbyte & 0x3F));
00855       } else if ($cbyte < 0x4000000) {
00856          $str.=chr(0xF8 | ($cbyte >> 24));
00857          $str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
00858          $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
00859          $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
00860          $str.=chr(0x80 | ($cbyte & 0x3F));
00861       } else if ($cbyte < 0x80000000) {
00862          $str.=chr(0xFC | ($cbyte >> 30));
00863          $str.=chr(0x80 | (($cbyte >> 24) & 0x3F));
00864          $str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
00865          $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
00866          $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
00867          $str.=chr(0x80 | ($cbyte & 0x3F));
00868       } else { // Cannot express a 32-bit character in UTF-8
00869          $str .= chr($this->noCharByteVal);
00870       }
00871       return $str;
00872    }
00873 
00883    function utf8CharToUnumber($str,$hex=0)   {
00884       $ord=ord(substr($str,0,1));   // First char
00885 
00886       if (($ord & 192) == 192)   {  // This verifyes that it IS a multi byte string
00887          $binBuf='';
00888          for ($b=0;$b<8;$b++) {  // for each byte in multibyte string...
00889             $ord = $ord << 1; // Shift it left and ...
00890             if ($ord & 128)   {  // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00891                $binBuf.=substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6);
00892             } else break;
00893          }
00894          $binBuf=substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf;
00895 
00896          $int = bindec($binBuf);
00897       } else $int = $ord;
00898 
00899       return $hex ? 'x'.dechex($int) : $int;
00900    }
00901 
00902 
00903 
00904 
00905 
00906 
00907 
00908 
00909 
00910    /********************************************
00911     *
00912     * Init functions
00913     *
00914     ********************************************/
00915 
00926    function initCharset($charset)   {
00927          // Only process if the charset is not yet loaded:
00928       if (!is_array($this->parsedCharsets[$charset])) {
00929 
00930             // Conversion table filename:
00931          $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
00932 
00933             // If the conversion table is found:
00934          if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) {
00935                // Cache file for charsets:
00936                // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
00937             $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
00938             if ($cacheFile && @is_file($cacheFile))   {
00939                $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
00940             } else {
00941                   // Parse conversion table into lines:
00942                $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
00943                   // Initialize the internal variable holding the conv. table:
00944                $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());
00945                   // traverse the lines:
00946                $detectedType='';
00947                foreach($lines as $value)  {
00948                   if (trim($value) && substr($value,0,1)!='#') {  // Comment line or blanks are ignored.
00949 
00950                         // Detect type if not done yet: (Done on first real line)
00951                         // The "whitespaced" type is on the syntax   "0x0A 0x000A   #LINE FEED"    while    "ms-token" is like      "B9 = U+00B9 : SUPERSCRIPT ONE"
00952                      if (!$detectedType)     $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ? 'whitespaced' : 'ms-token';
00953 
00954                      if ($detectedType=='ms-token')   {
00955                         list($hexbyte,$utf8) = split('=|:',$value,3);
00956                      } elseif ($detectedType=='whitespaced')   {
00957                         $regA=array();
00958                         ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA);
00959                         $hexbyte = $regA[1];
00960                         $utf8 = 'U+'.$regA[2];
00961                      }
00962                      $decval = hexdec(trim($hexbyte));
00963                      if ($decval>127)  {
00964                         $utf8decval = hexdec(substr(trim($utf8),2));
00965                         $this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);
00966                         $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval;
00967                      }
00968                   }
00969                }
00970                if ($cacheFile)   {
00971                   t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
00972                }
00973             }
00974             return 2;
00975          } else return false;
00976       } else return 1;
00977    }
00978 
/**
 * Makes the UTF-8 case folding table ($this->caseFolding['utf-8']) and the
 * UTF-8-to-ASCII transliteration table ($this->toASCII['utf-8']) available.
 *
 * If the requested table is neither in memory nor cached, BOTH tables are rebuilt from
 * unidata/UnicodeData.txt (plus SpecialCasing.txt and Translit.txt when present) and
 * both cache files are written.
 *
 * @param   string      Table that is needed: 'case' or 'ascii'. Any other value skips the memory/cache lookup and rebuilds.
 * @return  integer     1 = table was already loaded, 2 = table read from cache file, 3 = tables built from the data files, false = UnicodeData.txt missing or unreadable
 */
function initUnicodeData($mode=null)   {
      // cache files
   $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
   $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

      // Only process if the tables are not yet loaded
   switch($mode)  {
      case 'case':
         if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']))   return 1;

            // Use cached version if possible
         if ($cacheFileCase && @is_file($cacheFileCase)) {
            $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
            return 2;
         }
         break;

      case 'ascii':
         if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8']))  return 1;

            // Use cached version if possible
         if ($cacheFileASCII && @is_file($cacheFileASCII))  {
            $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
            return 2;
         }
         break;
   }

      // process main Unicode data file
   $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
   if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile)))  return false;

   $fh = fopen($unicodeDataFile,'rb');
   if (!$fh)   return false;

      // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
      // note: the UTF-8 characters are used as keys (not the Unicode numbers) to avoid a conversion roundtrip when mapping
   $this->caseFolding['utf-8'] = array();
   $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
   $utf8CaseFolding['toUpper'] = array();
   $utf8CaseFolding['toLower'] = array();
   $utf8CaseFolding['toTitle'] = array();

   $decomposition = array();  // temporary decompositions, key "U+XXXX" => array of hex codes
   $mark = array();           // chars that are marks (eg. composing accents)
   $number = array();         // chars that are numbers (eg. digits)
   $omit = array();           // chars to be omitted (empty transliteration)

   while (!feof($fh))   {
      $line = fgets($fh,4096);
      if ($line === false) break;   // nothing more to read (EOF directly after the last newline)

      $line = rtrim($line);
      if ($line === '') continue;   // blank lines carry no data

         // UnicodeData.txt has 15 semicolon separated fields; pad so short lines do not raise notices
      $field = array_pad(explode(';', $line), 15, '');
      $char = $field[0];
      $name = $field[1];
      $cat = $field[2];
      $decomp = $field[5];
      $num = $field[8];
      $upper = $field[12];
      $lower = $field[13];
      $title = $field[14];

      $ord = hexdec($char);
      if ($ord > 0xFFFF)   break;   // only process the BMP

      $utf8_char = $this->UnumberToChar($ord);

      if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
      if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
         // store "title" only when different from "upper" (only a few)
      if ($title && $title != $upper)  $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

         // first letter of the general category
      $catClass = isset($cat[0]) ? $cat[0] : '';
      if ($catClass == 'M')   {  // mark (accent, umlaut, ...)
         $mark["U+$char"] = 1;
      } elseif ($catClass == 'N')   {  // numeric value
         if ($ord > 0x80 && $num != '')   $number["U+$char"] = $num;
      }

         // accented Latin letters without "official" decomposition
      $match = array();
      if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match)) {
         $c = ord($match[2]);
         if ($match[1] == 'SMALL')  $c += 32;

         $decomposition["U+$char"] = array(dechex($c));
         continue;
      }

      $match = array();
      if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
         switch($match[1]) {
            case '<circle>':  // add parenthesis as circle replacement, eg (1)
               $match[2] = '0028 '.$match[2].' 0029';
               break;

            case '<square>':  // add square brackets as square replacement, eg [1]
               $match[2] = '005B '.$match[2].' 005D';
               break;

            case '<compat>':  // ignore multi char decompositions that start with a space
               if (strncmp($match[2],'0020 ',5) == 0) continue 2;   // "2": leave the switch AND skip to the next line
               break;

               // ignore Arabic and vertical layout presentation decomposition
            case '<initial>':
            case '<medial>':
            case '<final>':
            case '<isolated>':
            case '<vertical>':
               continue 2;
         }
         $decomposition["U+$char"] = explode(' ',$match[2]);
      }
   }
   fclose($fh);

      // process additional Unicode data for casing (allow folded characters to expand into a sequence)
   $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
   if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
      $fh = fopen($specialCasingFile,'rb');
      if ($fh) {
         while (!feof($fh))   {
            $line = fgets($fh,4096);
            if ($line === false) break;

            if ($line[0] != '#' && trim($line) != '') {
               $field = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
               $char = $field[0];
               $lower = $field[1];
               $title = $field[2];
               $upper = $field[3];
               $cond = $field[4];

                  // only unconditional mappings: the 5th field is empty or already the trailing comment
               if ($cond == '' || $cond[0] == '#') {
                  $utf8_char = $this->UnumberToChar(hexdec($char));

                  if ($char != $lower) {
                     $seq = '';
                     foreach (explode(' ',$lower) as $hex) $seq .= $this->UnumberToChar(hexdec($hex));
                     $utf8CaseFolding['toLower'][$utf8_char] = $seq;
                  }
                  if ($char != $title && $title != $upper)  {
                     $seq = '';
                     foreach (explode(' ',$title) as $hex) $seq .= $this->UnumberToChar(hexdec($hex));
                     $utf8CaseFolding['toTitle'][$utf8_char] = $seq;
                  }
                  if ($char != $upper) {
                     $seq = '';
                     foreach (explode(' ',$upper) as $hex) $seq .= $this->UnumberToChar(hexdec($hex));
                     $utf8CaseFolding['toUpper'][$utf8_char] = $seq;
                  }
               }
            }
         }
         fclose($fh);
      }
   }

      // process custom decompositions
   $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
   if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))  {
      $fh = fopen($customTranslitFile,'rb');
      if ($fh) {
         while (!feof($fh))   {
            $line = fgets($fh,4096);
            if ($line === false) break;

            if ($line[0] != '#' && trim($line) != '') {
               $field = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
               $char = $field[0];
               $translit = $field[1];

               if (!$translit)   $omit["U+$char"] = 1;   // empty transliteration: character is dropped
               $decomposition["U+$char"] = explode(' ', $translit);
            }
         }
         fclose($fh);
      }
   }

      // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
   foreach($decomposition as $from => $to)   {
      $code_decomp = array();

      while ($code_value = array_shift($to)) {
         if (isset($decomposition["U+$code_value"]))  {  // do recursive decomposition
            foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
               array_unshift($to, $cv);
            }
         } elseif (!isset($mark["U+$code_value"])) {  // remove mark
            array_push($code_decomp, $code_value);
         }
      }
      if (count($code_decomp) || isset($omit[$from])) {
         $decomposition[$from] = $code_decomp;
      } else {
         unset($decomposition[$from]);
      }
   }

      // create ascii only mapping
   $this->toASCII['utf-8'] = array();
   $ascii =& $this->toASCII['utf-8'];

   foreach($decomposition as $from => $to)   {
      $code_decomp = array();
      while ($code_value = array_shift($to)) {
         $ord = hexdec($code_value);
         if ($ord > 127)   continue 2; // skip decompositions containing non-ASCII chars
         array_push($code_decomp,chr($ord));
      }
         // keys look like "U+00C0": strip the prefix instead of feeding non-hex characters to hexdec()
      $ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = join('',$code_decomp);
   }

      // add numeric decompositions
   foreach($number as $from => $to) {
      $utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
      if (!isset($ascii[$utf8_char]))  {
         $ascii[$utf8_char] = $to;
      }
   }

   if ($cacheFileCase)  {
      t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
   }

   if ($cacheFileASCII) {
      t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
   }

   return 3;
}
01204 
/**
 * Makes the case folding table for a charset available in $this->caseFolding[$charset].
 *
 * The table is derived from the UTF-8 case folding table: every character of the charset's
 * conversion table is looked up there and the result is converted back to the charset.
 * The ASCII letters a-z / A-Z are always added.
 *
 * @param   string      Charset name, as used for the parsed conversion tables
 * @return  integer     1 = table was already loaded, 2 = table read from cache file, 3 = table built, false = charset or Unicode data could not be initialised
 */
function initCaseFolding($charset)  {
      // Only process if the case table is not yet loaded:
   if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))  return 1;

      // Use cached version if possible
   $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
   if ($cacheFile && @is_file($cacheFile))   {
      $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
      return 2;
   }

      // init UTF-8 conversion for this charset
   if (!$this->initCharset($charset))  {
      return false;
   }

      // UTF-8 case folding is used as the base conversion table
   if (!$this->initUnicodeData('case'))   {
      return false;
   }

   $nochar = chr($this->noCharByteVal);
   $directions = array('toUpper','toLower','toTitle');

      // always end up with a well-formed table, even if nothing maps
   $table = array('toUpper' => array(), 'toLower' => array(), 'toTitle' => array());

   $local = array();
   if (isset($this->parsedCharsets[$charset]['local']) && is_array($this->parsedCharsets[$charset]['local']))  {
      $local = $this->parsedCharsets[$charset]['local'];
   }

   foreach ($local as $utf8)  {
         // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
      $c = $this->utf8_decode($utf8, $charset);

      foreach ($directions as $direction) {
         if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) continue;  // character has no mapping in this direction

         $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // skip results that cannot be represented in the charset
         if ($cc != '' && $cc != $nochar) $table[$direction][$c] = $cc;
      }
   }

      // add the ASCII case table
   for ($i=ord('a'); $i<=ord('z'); $i++)  {
      $table['toUpper'][chr($i)] = chr($i-32);
   }
   for ($i=ord('A'); $i<=ord('Z'); $i++)  {
      $table['toLower'][chr($i)] = chr($i+32);
   }

   $this->caseFolding[$charset] = $table;

   if ($cacheFile)   {
      t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
   }

   return 3;
}
01266 
/**
 * Makes the charset-to-ASCII transliteration table available in $this->toASCII[$charset].
 *
 * The table is derived from the UTF-8-to-ASCII table: every character of the charset's
 * conversion table that has a UTF-8 transliteration gets the same transliteration.
 *
 * @param   string      Charset name, as used for the parsed conversion tables
 * @return  integer     1 = table was already loaded, 2 = table read from cache file, 3 = table built, false = charset or Unicode data could not be initialised
 */
function initToASCII($charset)   {
      // Only process if the transliteration table is not yet loaded:
   if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset]))   return 1;

      // Use cached version if possible
   $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
   if ($cacheFile && @is_file($cacheFile))   {
      $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
      return 2;
   }

      // init UTF-8 conversion for this charset
   if (!$this->initCharset($charset))  {
      return false;
   }

      // UTF-8/ASCII transliteration is used as the base conversion table
   if (!$this->initUnicodeData('ascii'))  {
      return false;
   }

      // start from an empty array so the table is an array even when no character maps;
      // otherwise the "already loaded" check above never succeeds and null gets cached
   $table = array();

   $local = array();
   if (isset($this->parsedCharsets[$charset]['local']) && is_array($this->parsedCharsets[$charset]['local']))  {
      $local = $this->parsedCharsets[$charset]['local'];
   }

   foreach ($local as $utf8)  {
      if (isset($this->toASCII['utf-8'][$utf8]))   {
            // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
         $c = $this->utf8_decode($utf8, $charset);
         $table[$c] = $this->toASCII['utf-8'][$utf8];
      }
   }

   $this->toASCII[$charset] = $table;

   if ($cacheFile)   {
      t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
   }

   return 3;
}
01312 
01313 
01314 
01315 
01316 
01317 
01318 
01319 
01320 
01321 
01322 
01323 
01324 
01325 
01326 
01327 
01328    /********************************************
01329     *
01330     * String operation functions
01331     *
01332     ********************************************/
01333 
/**
 * Returns a part of a string, counted in characters of the given charset.
 * Dispatches to mbstring (when configured), the UTF-8 / EUC implementations of this
 * class, or plain byte arithmetic for fixed-width and single-byte charsets.
 *
 * @param   string      The character set
 * @param   string      Character string
 * @param   integer     Start position (character position)
 * @param   integer     Length (in characters); omit or pass null for "to the end of the string"
 * @return  string      The substring
 */
function substr($charset,$string,$start,$len=null) {
   if ($len===0)  return '';

      // integer 0 was handled above, so a loose comparison only catches "no length given"
   $noLen = ($len == null);

   if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
         // mb_substr() cannot omit $len when the charset is passed as argument,
         // so switch the internal encoding temporarily instead
      if ($noLen) {
         $enc = mb_internal_encoding();   // save internal encoding
         mb_internal_encoding($charset);
         $str = mb_substr($string,$start);
         mb_internal_encoding($enc);   // restore internal encoding

         return $str;
      }
      return mb_substr($string,$start,$len,$charset);
   } elseif ($charset == 'utf-8')   {
      return $this->utf8_substr($string,$start,$len);
   } elseif (!empty($this->eucBasedSets[$charset]))   {
      return $this->euc_substr($string,$start,$charset,$len);
   } elseif (!empty($this->twoByteSets[$charset])) {
         // without a length, $len*2 would be 0 and yield an empty result
      return $noLen ? substr($string,$start*2) : substr($string,$start*2,$len*2);
   } elseif (!empty($this->fourByteSets[$charset]))   {
      return $noLen ? substr($string,$start*4) : substr($string,$start*4,$len*4);
   }

      // treat everything else as single-byte encoding
   return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
01373 
/**
 * Counts the number of characters of a string in the given charset.
 *
 * @param   string      The character set
 * @param   string      Character string
 * @return  integer     The number of characters
 */
function strlen($charset,$string)   {
   if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
      return mb_strlen($string,$charset);
   } elseif ($charset == 'utf-8')   {
      return $this->utf8_strlen($string);
   } elseif (!empty($this->eucBasedSets[$charset]))   {
      return $this->euc_strlen($string,$charset);
   } elseif (!empty($this->twoByteSets[$charset])) {
         // whole characters only: an odd trailing byte is not counted as half a character
      return (int)(strlen($string)/2);
   } elseif (!empty($this->fourByteSets[$charset]))   {
      return (int)(strlen($string)/4);
   }

      // treat everything else as single-byte encoding
   return strlen($string);
}
01399 
/**
 * Crops a string to a number of characters and adds a crop marker if something was cut off.
 * A positive length keeps the first characters (marker appended), a negative length keeps
 * the last characters (marker prepended). Strings that are not longer than the requested
 * length are returned unchanged.
 *
 * @param   string      The character set
 * @param   string      Character string
 * @param   integer     Length (in characters); 0 returns the string unchanged
 * @param   string      Crop marker, eg. "..."
 * @return  string      The (possibly) shortened string
 */
function crop($charset,$string,$len,$crop='')   {
   if (intval($len) == 0)  return $string;

      // translate the character position into a byte position
   if ($charset == 'utf-8')   {
      $i = $this->utf8_char2byte_pos($string,$len);
   } elseif (!empty($this->eucBasedSets[$charset]))   {
      $i = $this->euc_char2byte_pos($string,$len,$charset);
   } else {
         // everything else is treated as single-byte encoding
      if ($len > 0)  {
         $i = $len;
      } else {
         $i = strlen($string)+$len;
         if ($i<=0)  $i = false;
      }
   }

   if ($i === false) return $string;   // $len outside actual string length

   $i = intval($i);
   $byteLen = strlen($string);

   if ($len > 0)  {
         // only crop if there is at least one byte behind the cut position
      if ($i >= 0 && $i < $byteLen) return substr($string,0,$i).$crop;
   } else {
         // only crop if there is at least one byte in front of the cut position
      if ($i >= 1 && $i <= $byteLen)   return $crop.substr($string,$i);
   }

   return $string;
}
01454 
/**
 * Truncates a string to at most $len BYTES without cutting a multibyte character in half.
 *
 * @param   string      The character set
 * @param   string      Character string
 * @param   integer     Maximum length (in bytes)
 * @return  string      The shortened string
 */
function strtrunc($charset,$string,$len)  {
   if ($len <= 0) return '';

   if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
      return mb_strcut($string,0,$len,$charset);
   } elseif ($charset == 'utf-8')   {
      return $this->utf8_strtrunc($string,$len);
   } elseif (!empty($this->eucBasedSets[$charset]))   {
         // euc_strtrunc() takes ($str,$len,$charset): the length must not be left out
      return $this->euc_strtrunc($string,$len,$charset);
   } elseif (!empty($this->twoByteSets[$charset])) {
      if ($len % 2)  $len--;     // don't cut at odd positions
   } elseif (!empty($this->fourByteSets[$charset]))   {
      $len -= $len % 4; // realign to position dividable by four
   }

      // fixed-width sets (with realigned $len) and single-byte encodings
   return substr($string,0,$len);
}
01483 
/**
 * Converts the case of a string in the given charset.
 *
 * @param   string      The character set
 * @param   string      Character string
 * @param   string      Case keyword: 'toLower' or 'toUpper'. With mbstring anything other than 'toLower' means upper case; otherwise the keyword selects the case folding table.
 * @return  string      The converted string
 */
function conv_case($charset,$string,$case)   {
      // mb_strtolower()/mb_strtoupper() exist from PHP 4.3.0 on
   if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && version_compare(phpversion(),'4.3.0','>=')) {
      if ($case == 'toLower') {
         return mb_strtolower($string,$charset);
      }
      return mb_strtoupper($string,$charset);
   }

   if ($charset == 'utf-8')   {
      return $this->utf8_char_mapping($string,'case',$case);
   }
   if (isset($this->eucBasedSets[$charset])) {
      return $this->euc_char_mapping($string,$charset,'case',$case);
   }

      // treat everything else as single-byte encoding
   return $this->sb_char_mapping($string,$charset,'case',$case);
}
01517 
/**
 * Transliterates the special characters of a string to their ASCII equivalents,
 * using the mapping routine that fits the charset.
 *
 * @param   string      The character set
 * @param   string      Character string
 * @return  string      The string with the mapped characters replaced
 */
function specCharsToASCII($charset,$string)  {
   if ($charset == 'utf-8')   {
      $converted = $this->utf8_char_mapping($string,'ascii');
   } elseif (isset($this->eucBasedSets[$charset])) {
      $converted = $this->euc_char_mapping($string,$charset,'ascii');
   } else {
         // every other charset is handled as a single-byte encoding
      $converted = $this->sb_char_mapping($string,$charset,'ascii');
   }

   return $converted;
}
01537 
01538 
01539 
01540 
01541 
01542 
01543 
01544 
01545 
01546 
01547 
01548 
01549    /********************************************
01550     *
01551     * Internal string operation functions
01552     *
01553     ********************************************/
01554 
/**
 * Maps every byte of a single-byte encoded string through a case folding or
 * ASCII transliteration table. Bytes without a table entry are copied unchanged.
 *
 * @param   string      Character string
 * @param   string      The character set
 * @param   string      Mode: 'case' (case folding) or 'ascii' (transliteration); any other value returns the string unchanged
 * @param   string      For mode 'case': key of the case folding table to use (eg. 'toLower', 'toUpper')
 * @return  string      The converted string
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
   $map = array();
   switch($mode)  {
      case 'case':
         if (!$this->initCaseFolding($charset)) return $str;   // do nothing
            // copy instead of reference: a reference would create an empty entry for unknown $opt
         if (isset($this->caseFolding[$charset][$opt]))  $map = $this->caseFolding[$charset][$opt];
         break;

      case 'ascii':
         if (!$this->initToASCII($charset))  return $str;   // do nothing
         if (isset($this->toASCII[$charset]))   $map = $this->toASCII[$charset];
         break;

      default:
         return $str;
   }

   $str = (string)$str;
   $length = strlen($str);
   $out = '';
   for ($i=0; $i<$length; $i++)  {
      $c = $str[$i];
      $out .= isset($map[$c]) ? $map[$c] : $c;
   }

   return $out;
}
01593 
01594 
01595 
01596 
01597 
01598 
01599 
01600 
01601 
01602 
01603    /********************************************
01604     *
01605     * Internal UTF-8 string operation functions
01606     *
01607     ********************************************/
01608 
/**
 * Returns a part of a UTF-8 string, with positions counted in characters.
 *
 * @param   string      UTF-8 string
 * @param   integer     Start position (character position); negative values count from the end
 * @param   integer     Length (in characters); omit or pass null for "to the end of the string"
 * @return  string      The substring, or false if a positive $start lies outside the string
 */
function utf8_substr($str,$start,$len=null)  {
   if ($len !== null && (string)$len === '0')   return '';

   $byte_start = $this->utf8_char2byte_pos($str,$start);
   if ($byte_start === false) {
      if ($start > 0)   {
         return false;  // $start outside string length
      }
         // zero or a negative start reaching beyond the beginning: start at the first byte
      $byte_start = 0;
   }

   $str = (string)substr($str,$byte_start);

   if ($len != null) {
      $byte_end = $this->utf8_char2byte_pos($str,$len);
      if ($byte_end === false)   {
            // $len outside actual string length: a negative length leaves nothing, a positive one everything
         return $len<0 ? '' : $str;
      }
      return substr($str,0,$byte_end);
   }

   return $str;
}
01643 
/**
 * Counts the characters of a UTF-8 string.
 * Every byte that is not a continuation byte (10xxxxxx) starts a character.
 *
 * @param   string      UTF-8 string
 * @return  integer     The number of characters
 */
function utf8_strlen($str) {
   $str = (string)$str;
   $length = strlen($str);
   $n = 0;
   for ($i=0; $i<$length; $i++)  {
         // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
      if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
   }
   return $n;
}
01664 
/**
 * Truncates a UTF-8 string to at most $len BYTES without cutting a multibyte
 * character in half: a character that does not fit completely is dropped.
 *
 * @param   string      UTF-8 string
 * @param   integer     Maximum length (in bytes)
 * @return  string      The shortened string
 */
function utf8_strtrunc($str,$len)   {
   $str = (string)$str;
   $length = strlen($str);
   if ($len > $length)  $len = $length;
   if ($len <= 0) return '';

   $i = $len-1;
   if (ord($str[$i]) & 0x80) { // last kept byte is part of a multibyte sequence
         // walk back over continuation bytes (10xxxxxx) to the first byte of the sequence
      while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)   $i--;

      $lead = ord($str[$i]);
      if (($lead & 0xC0) != 0xC0)   {
            // malformed input: continuation bytes without a starting byte; keep only what precedes them
         return ($lead & 0x80) ? '' : substr($str,0,$i+1);
      }

         // number of bytes of the sequence = number of leading 1-bits of its first byte
      for ($bc=0; $lead & 0x80; $lead = $lead << 1)   $bc++;

      if ($bc+$i > $len)   return substr($str,0,$i);   // character does not fit: cut in front of it
         // otherwise the multibyte char fits into the length
   }

   return substr($str,0,$len);
}
01685 
/**
 * Finds the first occurrence of a string in a UTF-8 string, counted in characters.
 *
 * @param   string      UTF-8 string to search in
 * @param   string      UTF-8 string to search for
 * @param   integer     Position (in characters) to start the search at
 * @return  integer     The character position, or false if the needle was not found or the offset is outside the string
 */
function utf8_strpos($haystack,$needle,$offset=0)  {
   if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
         // third argument is the offset, the encoding comes fourth
      return mb_strpos($haystack,$needle,$offset,'utf-8');
   }

   $byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
   if ($byte_offset === false)   return false; // offset beyond string length

   $byte_pos = strpos($haystack,$needle,$byte_offset);
   if ($byte_pos === false)   return false; // needle not found

   return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
01709 
/**
 * Finds the last occurrence of a string in a UTF-8 string, counted in characters.
 *
 * @param   string      UTF-8 string to search in
 * @param   string      UTF-8 string to search for
 * @return  integer     The character position, or false if the needle was not found
 */
function utf8_strrpos($haystack,$needle)  {
   if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
         // explicit offset 0: the encoding is the fourth argument of mb_strrpos()
      return mb_strrpos($haystack,$needle,0,'utf-8');
   }

   $byte_pos = strrpos($haystack,$needle);
   if ($byte_pos === false)   return false; // needle not found

   return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
01729 
/**
 * Translates a character position into a byte position within a UTF-8 string.
 * A negative position is counted from the end of the string.
 *
 * @param   string      UTF-8 string
 * @param   integer     Character position (negative values start from the end)
 * @return  integer     Byte position, or false if the position is outside the string (this includes a position equal to the number of characters when the string ends in a single-byte character, and a negative position reaching the very first character)
 */
function utf8_char2byte_pos($str,$pos) {
   $str = (string)$str;
   $length = strlen($str);
   $n = 0;           // number of characters found
   $p = abs($pos);   // number of characters wanted

   if ($pos >= 0) {
      $i = 0;
      $d = 1;
   } else {
      $i = $length-1;
      $d = -1;
   }

      // explicit bounds: do not depend on what an out-of-range (or negative) string offset evaluates to
   for ( ; $i>=0 && $i<$length && $n<$p; $i+=$d)  {
         // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
      if ((ord($str[$i]) & 0xC0) != 0x80) $n++;
   }
   if ($i<0 || $i>=$length)   return false; // offset beyond string length

   if ($pos >= 0) {
         // skip trailing multi-byte data bytes
      while ($i<$length && (ord($str[$i]) & 0xC0) == 0x80)  $i++;
   } else {
         // correct offset: the scan stopped one byte in front of the wanted character
      $i++;
   }

   return $i;
}
01770 
/**
 * Translates a byte position into a character position within a UTF-8 string.
 * The byte position is expected to point at the first byte of a character.
 *
 * @param   string      UTF-8 string
 * @param   integer     Byte position
 * @return  integer     Character position, or false if the string is empty or the byte position lies outside the string
 */
function utf8_byte2char_pos($str,$pos) {
   $str = (string)$str;
   $length = strlen($str);

      // check the requested position itself; positions past the end would otherwise count phantom characters
   if ($length == 0 || $pos < 0 || $pos > $length) return false;

   $n = 0;  // number of characters
   for ($i=$pos; $i>0; $i--)  {
         // the end-of-string boundary counts like a character start;
         // otherwise count single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx)
      if ($i == $length || (ord($str[$i]) & 0xC0) != 0x80)  $n++;
   }

   return $n;
}
01793 
/**
 * Maps every character of a UTF-8 string through the UTF-8 case folding or
 * ASCII transliteration table. Characters without a table entry are copied unchanged.
 *
 * @param   string      UTF-8 string
 * @param   string      Mode: 'case' (case folding) or 'ascii' (transliteration); any other value returns the string unchanged
 * @param   string      For mode 'case': key of the case folding table to use (eg. 'toLower', 'toUpper')
 * @return  string      The converted string
 */
function utf8_char_mapping($str,$mode,$opt='')  {
   if (!$this->initUnicodeData($mode)) return $str;   // do nothing

   $map = array();
   switch($mode)  {
      case 'case':
            // copy instead of reference: a reference would create an empty entry for unknown $opt
         if (isset($this->caseFolding['utf-8'][$opt]))   $map = $this->caseFolding['utf-8'][$opt];
         break;

      case 'ascii':
         if (isset($this->toASCII['utf-8'])) $map = $this->toASCII['utf-8'];
         break;

      default:
         return $str;
   }

   $str = (string)$str;
   $length = strlen($str);
   $out = '';
   for ($i=0; $i<$length; $i++)  {
      $c = ord($str[$i]);
      if (($c & 0xC0) == 0xC0)   {  // multi-byte starting byte (11xxxxxx)
            // number of bytes of the sequence = number of leading 1-bits
         for ($bc=0; $c & 0x80; $c = $c << 1)   $bc++;
         $mbc = substr($str,$i,$bc);
         $i += $bc-1;
      } else {
            // single-byte char (0xxxxxxx); a stray continuation byte (10xxxxxx) is passed through as it is
         $mbc = $str[$i];
      }

      $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
   }

   return $out;
}
01839 
01840 
01841 
01842 
01843 
01844 
01845 
01846 
01847 
01848 
01849 
01850 
01851 
01852 
01853 
01854 
01855 
01856 
01857    /********************************************
01858     *
01859     * Internal EUC string operation functions
01860     *
01861     * Extended Unix Code:
01862     *  ASCII compatible 7bit single bytes chars
01863     *  8bit two byte chars
01864     *
01865     * Shift-JIS is treated as a special case.
01866     *
01867     ********************************************/
01868 
/**
 * Truncates an EUC (or Shift-JIS) string to at most $len BYTES without cutting
 * a double-byte character in half.
 *
 * @param   string      EUC multibyte character string
 * @param   integer     Maximum length (in bytes)
 * @param   string      The charset; 'shift_jis' selects the Shift-JIS first-byte ranges
 * @return  string      The shortened string
 */
function euc_strtrunc($str,$len,$charset)  {
   $str = (string)$str;
   if ($len <= 0) return '';

   $sjis = ($charset == 'shift_jis');
   $length = strlen($str);
   for ($i=0; $i<$length && $i<$len; $i++)   {
      $c = ord($str[$i]);
      if ($sjis)  {
         if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++; // advance a double-byte char
      } else {
         if ($c >= 0x80)   $i++; // advance a double-byte char
      }
   }

      // must be tested first: the straddling char may also be the last one of the string
   if ($i > $len) return substr($str,0,$len-1); // a double-byte char straddles the limit: cut in front of it

   if ($i >= $length)   return $str;   // string not longer than supplied length

   return substr($str,0,$len);
}
01897 
/**
 * Returns a part of an EUC (or Shift-JIS) string, with positions counted in characters.
 *
 * @param   string      EUC multibyte character string
 * @param   integer     Start position (character position); negative values count from the end
 * @param   string      The charset; 'shift_jis' selects the Shift-JIS first-byte ranges
 * @param   integer     Length (in characters); omit or pass null for "to the end of the string"
 * @return  string      The substring, or false if $start lies outside the string
 */
function euc_substr($str,$start,$charset,$len=null)   {
      // same convention as utf8_substr(): an explicit zero length yields an empty string
   if ($len !== null && (string)$len === '0')   return '';

   $byte_start = $this->euc_char2byte_pos($str,$start,$charset);
   if ($byte_start === false) return false;  // $start outside string length

   $str = (string)substr($str,$byte_start);

   if ($len != null) {
      $byte_end = $this->euc_char2byte_pos($str,$len,$charset);
      if ($byte_end === false)   {
            // $len outside actual string length: a negative length leaves nothing, a positive one everything
         return $len<0 ? '' : $str;
      }
      return substr($str,0,$byte_end);
   }

   return $str;
}
01923 
/**
 * Counts the characters of an EUC (or Shift-JIS) string.
 *
 * @param   string      EUC multibyte character string
 * @param   string      The charset; 'shift_jis' selects the Shift-JIS first-byte ranges
 * @return  integer     The number of characters
 */
function euc_strlen($str,$charset)   {
   $str = (string)$str;
   $sjis = ($charset == 'shift_jis');
   $length = strlen($str);
   $n = 0;
   for ($i=0; $i<$length; $i++)  {
      $c = ord($str[$i]);
      if ($sjis)  {
         if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++; // advance a double-byte char
      } else {
         if ($c >= 0x80)   $i++; // advance a double-byte char
      }

      $n++;
   }

   return $n;
}
01950 
/**
 * Translates a character position into a byte position within an EUC (or Shift-JIS) string.
 * A negative position is counted from the end of the string.
 *
 * @param   string      EUC multibyte character string
 * @param   integer     Character position (negative values start from the end)
 * @param   string      The charset; 'shift_jis' selects the Shift-JIS first-byte ranges
 * @return  integer     Byte position, or false if the position is outside the string (this includes a position equal to the number of characters)
 */
function euc_char2byte_pos($str,$pos,$charset)  {
   $str = (string)$str;
   $sjis = ($charset == 'shift_jis');
   $length = strlen($str);
   $n = 0;           // number of characters seen
   $p = abs($pos);   // number of characters wanted

   if ($pos >= 0) {
      $i = 0;
      $d = 1;
   } else {
      $i = $length-1;
      $d = -1;
   }

      // explicit bounds: do not depend on what an out-of-range (or negative) string offset evaluates to
   for ( ; $i>=0 && $i<$length && $n<$p; $i+=$d)  {
      $c = ord($str[$i]);
      if ($sjis)  {
         if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i+=$d;  // advance a double-byte char
      } else {
         if ($c >= 0x80)   $i+=$d;  // advance a double-byte char
      }

      $n++;
   }
   if ($i<0 || $i>=$length)   return false; // offset beyond string length

   if ($pos < 0)  $i++; // correct offset: the scan stopped one byte in front of the wanted character

   return $i;
}
01990 
/**
 * Maps every character of an EUC (or Shift-JIS) string through a case folding or
 * ASCII transliteration table. Characters without a table entry are copied unchanged.
 *
 * @param   string      EUC multibyte character string
 * @param   string      The charset; 'shift_jis' selects the Shift-JIS first-byte ranges
 * @param   string      Mode: 'case' (case folding) or 'ascii' (transliteration); any other value returns the string unchanged
 * @param   string      For mode 'case': key of the case folding table to use (eg. 'toLower', 'toUpper')
 * @return  string      The converted string
 */
function euc_char_mapping($str,$charset,$mode,$opt='')   {
   $map = array();
   switch($mode)  {
      case 'case':
         if (!$this->initCaseFolding($charset)) return $str;   // do nothing
            // copy instead of reference: a reference would create an empty entry for unknown $opt
         if (isset($this->caseFolding[$charset][$opt]))  $map = $this->caseFolding[$charset][$opt];
         break;

      case 'ascii':
         if (!$this->initToASCII($charset))  return $str;   // do nothing
         if (isset($this->toASCII[$charset]))   $map = $this->toASCII[$charset];
         break;

      default:
         return $str;
   }

   $str = (string)$str;
   $sjis = ($charset == 'shift_jis');
   $length = strlen($str);
   $out = '';
   for ($i=0; $i<$length; $i++)  {
      $mbc = $str[$i];
      $c = ord($mbc);

      if ($sjis)  {
         $isDouble = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
      } else {
         $isDouble = ($c >= 0x80);
      }

      if ($isDouble) {  // a double-byte char: take the following byte along
         $mbc = substr($str,$i,2);
         $i++;
      }

      $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
   }

   return $out;
}
02045 
02046 }
02047 
// Load an extension class (XCLASS) for this file if one is configured for the current TYPO3 mode.
// !empty() avoids an undefined-index notice when no XCLASS is registered.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']))   {
   include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
02051 ?>

Generated on Sun Oct 3 01:05:46 2004 for TYPO3core 3.7.0 dev by  doxygen 1.3.8-20040913