From 705e118cb1c1506151ff707a7648a5fa2978cda9 Mon Sep 17 00:00:00 2001 From: Fil <fil@rezo.net> Date: Wed, 12 Mar 2003 01:12:28 +0000 Subject: [PATCH] gros boulot sur les charsets. --- ecrire/inc_charsets.php3 | 510 ++++++++++++++++---------------------- ecrire/inc_documents.php3 | 2 +- ecrire/inc_index.php3 | 4 +- ecrire/inc_sites.php3 | 7 +- ecrire/inc_version.php3 | 4 +- spip_image.php3 | 5 +- 6 files changed, 226 insertions(+), 306 deletions(-) diff --git a/ecrire/inc_charsets.php3 b/ecrire/inc_charsets.php3 index a87de32a9f..b4155dca8d 100644 --- a/ecrire/inc_charsets.php3 +++ b/ecrire/inc_charsets.php3 @@ -5,182 +5,210 @@ if (defined("_ECRIRE_INC_CHARSETS")) return; define("_ECRIRE_INC_CHARSETS", "1"); -function filtrer_entites($texte) { // html -> texte, a completer - - // NB en php4 il suffirait d'utiliser get_html_translation_table/array_flip - // HTML_ENTITIES - $trans_iso = array( - '¡' => "\xa1", - '¢' => "\xa2", - '£' => "\xa3", - '¤' => "\xa4", - '¥' => "\xa5", - '¦' => "\xa6", - '§' => "\xa7", - '¨' => "\xa8", - 'ª' => "\xaa", - '«' => "\xab", - '¬' => "\xac", - '­' => "\xad", - '¯' => "\xaf", - '°' => "\xb0", - '±' => "\xb1", - '²' => "\xb2", - '³' => "\xb3", - '´' => "\xb4", - 'µ' => "\xb5", - '¶' => "\xb6", - '·' => "\xb7", - '¸' => "\xb8", - '¹' => "\xb9", - 'º' => "\xba", - '»' => "\xbb", - '¿' => "\xbf", - 'À' => "\xc0", - 'Á' => "\xc1", - 'Â' => "\xc2", - 'Ã' => "\xc3", - 'Ä' => "\xc4", - 'Å' => "\xc5", - 'Æ' => "\xc6", - 'Ç' => "\xc7", - 'È' => "\xc8", - 'É' => "\xc9", - 'Ê' => "\xca", - 'Ë' => "\xcb", - 'Ì' => "\xcc", - 'Í' => "\xcd", - 'Î' => "\xce", - 'Ï' => "\xcf", - 'Ð' => "\xd0", - 'Ñ' => "\xd1", - 'Ò' => "\xd2", - 'Ó' => "\xd3", - 'Ô' => "\xd4", - 'Õ' => "\xd5", - 'Ö' => "\xd6", - '×' => "\xd7", - 'Ø' => "\xd8", - 'Ù' => "\xd9", - 'Ú' => "\xda", - 'Û' => "\xdb", - 'Ü' => "\xdc", - 'Ý' => "\xdd", - 'Þ' => "\xde", - 'ß' => "\xdf", - 'à' => "\xe0", - 'á' => "\xe1", - 'â' => "\xe2", - 'ã' => "\xe3", - 'ä' => "\xe4", - 'å' => "\xe5", - 'æ' => "\xe6", - 'ç' => "\xe7", - 'è' => "\xe8", - 'é' => "\xe9", - 'ê' => "\xea", - 'ë' => "\xeb", - 'ì' => "\xec", - 'í' => "\xed", - 'î' => "\xee", - 'ï' => "\xef", - 'ð' => "\xf0", - 'ñ' => "\xf1", - 'ò' => "\xf2", - 'ó' => "\xf3", - 'ô' => "\xf4", - 'õ' => "\xf5", - 'ö' => "\xf6", - '÷' => "\xf7", - 'ø' => "\xf8", - 'ù' => "\xf9", - 'ú' => "\xfa", - 'û' => "\xfb", - 'ü' => "\xfc", - 'ý' => "\xfd", - 'þ' => "\xfe" - ); - - $trans = array ( - ' ' => " ", - '©' => "(c)", - '®' => "(r)", - '¼' => "1/4", - '½' => "1/2", - '¾' => "3/4", - '&' => '&', - '"' => '"', - ''' => "'", - '<' => '<', - '>' => '>' - ); - - $texte = strtr2 ($texte, $trans); - - $charset = lire_meta('charset'); - if ($charset == 'iso-8859-1') // recuperer les caracteres iso-latin - $texte = strtr2 ($texte, $trans_iso); - else if (($charset == 'utf-8') OR ($charset == 'windows-1251')) // autres charsets supportes - { - // 1. recuperer les caracteres binaires en Ӓ - $texte = entites_unicode($texte); - // 2. les é en iso-8859-1 - $texte = strtr2 ($texte, $trans_iso); - // 3. les iso en é - $texte = iso_8859_1_to_unicode($texte); - // 4. le tout dans le charset cible - $texte = unicode2charset($texte); - } - return $texte; -} +/* charsets supportes : + iso-8859-1 ; utf-8 ; + windows-1251 = CP1251 ; +*/ +function load_charset ($charset = 'AUTO') { + if ($charset == 'AUTO') + $charset = lire_meta('charset'); -// strtr (string $texte, array $trans) = emuler le php4 -function strtr2 ($texte, $trans) { - global $flag_strtr2; + if (is_array($GLOBALS['CHARSET'][$charset])) + return $GLOBALS['CHARSET'][$charset]; + + switch($charset) { + case 'utf-8': + return $GLOBALS['CHARSET'][$charset] = array(); + + // iso latin 1 + case 'iso-8859-1': + case '': + return $GLOBALS['CHARSET'][$charset] = array ( + 128=>128, 129=>129, 130=>130, 131=>131, 132=>132, 133=>133, 134=>134, 135=>135, + 136=>136, 137=>137, 138=>138, 139=>139, 140=>140, 141=>141, 142=>142, 143=>143, + 144=>144, 145=>145, 146=>146, 147=>147, 148=>148, 149=>149, 150=>150, 151=>151, + 152=>152, 153=>153, 154=>154, 155=>155, 156=>156, 157=>157, 158=>158, 159=>159, + 160=>160, 161=>161, 162=>162, 163=>163, 164=>164, 165=>165, 166=>166, 167=>167, + 168=>168, 169=>169, 170=>170, 171=>171, 172=>172, 173=>173, 174=>174, 175=>175, + 176=>176, 177=>177, 178=>178, 179=>179, 180=>180, 181=>181, 182=>182, 183=>183, + 184=>184, 185=>185, 186=>186, 187=>187, 188=>188, 189=>189, 190=>190, 191=>191, + 192=>192, 193=>193, 194=>194, 195=>195, 196=>196, 197=>197, 198=>198, 199=>199, + 200=>200, 201=>201, 202=>202, 203=>203, 204=>204, 205=>205, 206=>206, 207=>207, + 208=>208, 209=>209, 210=>210, 211=>211, 212=>212, 213=>213, 214=>214, 215=>215, + 216=>216, 217=>217, 218=>218, 219=>219, 220=>220, 221=>221, 222=>222, 223=>223, + 224=>224, 225=>225, 226=>226, 227=>227, 228=>228, 229=>229, 230=>230, 231=>231, + 232=>232, 233=>233, 234=>234, 235=>235, 236=>236, 237=>237, 238=>238, 239=>239, + 240=>240, 241=>241, 242=>242, 243=>243, 244=>244, 245=>245, 246=>246, 247=>247, + 248=>248, 249=>249, 250=>250, 251=>251, 252=>252, 253=>253, 254=>254, 255=>255, + 256=>256 + ); + + + // iso latin 15 - Gaetan Ryckeboer <gryckeboer@virtual-net.fr> + case 'iso-8859-15': + $trans = load_charset('iso-8859-1'); + $trans[164]=8364; + $trans[166]=352; + $trans[168]=353; + $trans[180]=381; + $trans[184]=382; + $trans[188]=338; + $trans[189]=339; + $trans[190]=376; + return $GLOBALS['CHARSET'][$charset] = $trans; + + + // cyrillic - ref. http://czyborra.com/charsets/cyrillic.html + case 'windows-1251': + case 'CP1251': + return $GLOBALS['CHARSET'][$charset] = array ( + 0x80=>0x0402, 0x81=>0x0403, 0x82=>0x201A, 0x83=>0x0453, 0x84=>0x201E, + 0x85=>0x2026, 0x86=>0x2020, 0x87=>0x2021, 0x88=>0x20AC, 0x89=>0x2030, + 0x8A=>0x0409, 0x8B=>0x2039, 0x8C=>0x040A, 0x8D=>0x040C, 0x8E=>0x040B, + 0x8F=>0x040F, 0x90=>0x0452, 0x91=>0x2018, 0x92=>0x2019, 0x93=>0x201C, + 0x94=>0x201D, 0x95=>0x2022, 0x96=>0x2013, 0x97=>0x2014, 0x99=>0x2122, + 0x9A=>0x0459, 0x9B=>0x203A, 0x9C=>0x045A, 0x9D=>0x045C, 0x9E=>0x045B, + 0x9F=>0x045F, 0xA0=>0x00A0, 0xA1=>0x040E, 0xA2=>0x045E, 0xA3=>0x0408, + 0xA4=>0x00A4, 0xA5=>0x0490, 0xA6=>0x00A6, 0xA7=>0x00A7, 0xA8=>0x0401, + 0xA9=>0x00A9, 0xAA=>0x0404, 0xAB=>0x00AB, 0xAC=>0x00AC, 0xAD=>0x00AD, + 0xAE=>0x00AE, 0xAF=>0x0407, 0xB0=>0x00B0, 0xB1=>0x00B1, 0xB2=>0x0406, + 0xB3=>0x0456, 0xB4=>0x0491, 0xB5=>0x00B5, 0xB6=>0x00B6, 0xB7=>0x00B7, + 0xB8=>0x0451, 0xB9=>0x2116, 0xBA=>0x0454, 0xBB=>0x00BB, 0xBC=>0x0458, + 0xBD=>0x0405, 0xBE=>0x0455, 0xBF=>0x0457, 0xC0=>0x0410, 0xC1=>0x0411, + 0xC2=>0x0412, 0xC3=>0x0413, 0xC4=>0x0414, 0xC5=>0x0415, 0xC6=>0x0416, + 0xC7=>0x0417, 0xC8=>0x0418, 0xC9=>0x0419, 0xCA=>0x041A, 0xCB=>0x041B, + 0xCC=>0x041C, 0xCD=>0x041D, 0xCE=>0x041E, 0xCF=>0x041F, 0xD0=>0x0420, + 0xD1=>0x0421, 0xD2=>0x0422, 0xD3=>0x0423, 0xD4=>0x0424, 0xD5=>0x0425, + 0xD6=>0x0426, 0xD7=>0x0427, 0xD8=>0x0428, 0xD9=>0x0429, 0xDA=>0x042A, + 0xDB=>0x042B, 0xDC=>0x042C, 0xDD=>0x042D, 0xDE=>0x042E, 0xDF=>0x042F, + 0xE0=>0x0430, 0xE1=>0x0431, 0xE2=>0x0432, 0xE3=>0x0433, 0xE4=>0x0434, + 0xE5=>0x0435, 0xE6=>0x0436, 0xE7=>0x0437, 0xE8=>0x0438, 0xE9=>0x0439, + 0xEA=>0x043A, 0xEB=>0x043B, 0xEC=>0x043C, 0xED=>0x043D, 0xEE=>0x043E, + 0xEF=>0x043F, 0xF0=>0x0440, 0xF1=>0x0441, 0xF2=>0x0442, 0xF3=>0x0443, + 0xF4=>0x0444, 0xF5=>0x0445, 0xF6=>0x0446, 0xF7=>0x0447, 0xF8=>0x0448, + 0xF9=>0x0449, 0xFA=>0x044A, 0xFB=>0x044B, 0xFC=>0x044C, 0xFD=>0x044D, + 0xFE=>0x044E, 0xFF=>0x044F); // fin windows-1251 + + + // ------------------------------------------------------------------ + + // cas particulier pour les entites html (a completer eventuellement) + case 'html': + return $GLOBALS['CHARSET'][$charset] = array ( + 'cent'=>'¢', 'pound'=>'£', 'curren'=>'¤', 'yen'=>'¥', 'brvbar'=>'¦', + 'sect'=>'§', 'uml'=>'¨', 'ordf'=>'ª', 'laquo'=>'«', 'not'=>'¬', + 'shy'=>'­', 'macr'=>'¯', 'deg'=>'°', 'plusmn'=>'±', 'sup2'=>'²', + 'sup3'=>'³', 'acute'=>'´', 'micro'=>'µ', 'para'=>'¶', 'middot'=>'·', + 'cedil'=>'¸', 'sup1'=>'¹', 'ordm'=>'º', 'raquo'=>'»', 'iquest'=>'¿', + 'Agrave'=>'À', 'Aacute'=>'Á', 'Acirc'=>'Â', 'Atilde'=>'Ã', 'Auml'=>'Ä', + 'Aring'=>'Å', 'AElig'=>'Æ', 'Ccedil'=>'Ç', 'Egrave'=>'È', 'Eacute'=>'É', + 'Ecirc'=>'Ê', 'Euml'=>'Ë', 'Igrave'=>'Ì', 'Iacute'=>'Í', 'Icirc'=>'Î', + 'Iuml'=>'Ï', 'ETH'=>'Ð', 'Ntilde'=>'Ñ', 'Ograve'=>'Ò', 'Oacute'=>'Ó', + 'Ocirc'=>'Ô', 'Otilde'=>'Õ', 'Ouml'=>'Ö', 'times'=>'×', 'Oslash'=>'Ø', + 'Ugrave'=>'Ù', 'Uacute'=>'Ú', 'Ucirc'=>'Û', 'Uuml'=>'Ü', 'Yacute'=>'Ý', + 'THORN'=>'Þ', 'szlig'=>'ß', 'agrave'=>'à', 'aacute'=>'á', 'acirc'=>'â', + 'atilde'=>'ã', 'auml'=>'ä', 'aring'=>'å', 'aelig'=>'æ', 'ccedil'=>'ç', + 'egrave'=>'è', 'eacute'=>'é', 'ecirc'=>'ê', 'euml'=>'ë', 'igrave'=>'ì', + 'iacute'=>'í', 'icirc'=>'î', 'iuml'=>'ï', 'eth'=>'ð', 'ntilde'=>'ñ', + 'ograve'=>'ò', 'oacute'=>'ó', 'ocirc'=>'ô', 'otilde'=>'õ', 'ouml'=>'ö', + 'divide'=>'÷', 'oslash'=>'ø', 'ugrave'=>'ù', 'uacute'=>'ú', + 'ucirc'=>'û', 'uuml'=>'ü', 'yacute'=>'ý', 'thorn'=>'þ', + 'nbsp' => " ", 'copy' => "(c)", 'reg' => "(r)", 'frac14' => "1/4", + 'frac12' => "1/2", 'frac34' => "3/4", 'amp' => '&', 'quot' => '"', + 'apos' => "'", 'lt' => '<', 'gt' => '>' + ); + + // cas particulier pour la translitteration + case 'translit': + return $GLOBALS['CHARSET'][$charset] = array ( + // latin + 128=>'euro', 131=>'f', 140=>'OE', 153=>'TM', 156=>'oe', 159=>'Y', 160=>' ', + 161=>'�', 162=>'c', 163=>'L', 164=>'O', 165=>'yen',166=>'|', + 167=>'p',169=>'(c)', 171=>'<<',172=>'-',173=>'-',174=>'(R)', + 176=>'o',177=>'+-',181=>'mu',182=>'p',183=>'.',187=>'>>', 192=>'A', + 193=>'A', 194=>'A', 195=>'A', 196=>'A', 197=>'A', 198=>'AE', 199=>'C', + 200=>'E', 201=>'E', 202=>'E', 203=>'E', 204=>'I', 205=>'I', 206=>'I', + 207=>'I', 209=>'N', 210=>'O', 211=>'O', 212=>'O', 213=>'O', 214=>'O', + 216=>'O', 217=>'U', 218=>'U', 219=>'U', 220=>'U', 223=>'B', 224=>'a', + 225=>'a', 226=>'a', 227=>'a', 228=>'a', 229=>'a', 230=>'ae', 231=>'c', + 232=>'e', 233=>'e', 234=>'e', 235=>'e', 236=>'i', 237=>'i', 238=>'i', + 239=>'i', 241=>'n', 242=>'o', 243=>'o', 244=>'o', 245=>'o', 246=>'o', + 248=>'o', 249=>'u', 250=>'u', 251=>'u', 252=>'u', 255=>'y', + + // cyrillique + 1026=>'D%', 1027=>'G%', 8218=>'\'', 1107=>'g%', 8222=>'"', 8230=>'...', + 8224=>'/-', 8225=>'/=', 8364=>'EUR', 8240=>'0/00', 1033=>'LJ', + 8249=>'<', 1034=>'NJ', 1036=>'KJ', 1035=>'Ts', 1039=>'DZ', 1106=>'d%', + 8216=>'`', 8217=>'\'', 8220=>'"', 8221=>'"', 8226=>' o ', 8211=>'-', + 8212=>'--', 8212=>'~', 8482=>'(TM)', 1113=>'lj', 8250=>'>', 1114=>'nj', + 1116=>'kj', 1115=>'ts', 1119=>'dz', 1038=>'V%', 1118=>'v%', 1032=>'J%', + 1168=>'G3', 1025=>'IO', 1028=>'IE', 1031=>'YI', 1030=>'II', + 1110=>'ii', 1169=>'g3', 1105=>'io', 8470=>'No.', 1108=>'ie', + 1112=>'j%', 1029=>'DS', 1109=>'ds', 1111=>'yi', 1040=>'A', 1041=>'B', + 1042=>'V', 1043=>'G', 1044=>'D', 1045=>'E', 1046=>'ZH', 1047=>'Z', + 1048=>'I', 1049=>'J', 1050=>'K', 1051=>'L', 1052=>'M', 1053=>'N', + 1054=>'O', 1055=>'P', 1056=>'R', 1057=>'S', 1058=>'T', 1059=>'U', + 1060=>'F', 1061=>'H', 1062=>'C', 1063=>'CH', 1064=>'SH', 1065=>'SCH', + 1066=>'"', 1067=>'Y', 1068=>'\'', 1069=>'`E', 1070=>'YU', 1071=>'YA', + 1072=>'a', 1073=>'b', 1074=>'v', 1075=>'g', 1076=>'d', 1077=>'e', + 1078=>'zh', 1079=>'z', 1080=>'i', 1081=>'j', 1082=>'k', 1083=>'l', + 1084=>'m', 1085=>'n', 1086=>'o', 1087=>'p', 1088=>'r', 1089=>'s', + 1090=>'t', 1091=>'u', 1092=>'f', 1093=>'h', 1094=>'c', 1095=>'ch', + 1096=>'sh', 1097=>'sch', 1098=>'"', 1099=>'y', 1100=>'\'', 1101=>'`e', + 1102=>'yu', 1103=>'ya' + ); + + default: + spip_log("erreur charset $charset non supporte"); + return $GLOBALS['CHARSET'][$charset] = array(); - if ($flag_strtr2) - return strtr($texte,$trans); - else { - reset ($trans); - while (list($entite, $remplace) = each ($trans)) - $texte = ereg_replace($entite, $remplace, $texte); - return $texte; } } +// transformer les é en unicode +function filtrer_entites($texte) { + $trans = load_charset('html'); + while (eregi('&([a-z][a-z0-9]+);', $texte, $regs) AND !$vu[$i = $regs[1]]) { + $vu[$i] = true; + if ($s = $trans[$i]) + $texte = ereg_replace($regs[0], $s, $texte); + } + + // remettre le tout dans le charset cible + $texte = unicode2charset($texte); + return $texte; +} + // transforme une chaine en entites unicode  -function entites_unicode($chaine, $charset='AUTO') { +function entites_unicode($chaine, $charset='AUTO', $forcer = false) { if ($charset == 'AUTO') $charset=lire_meta('charset'); switch($charset) { - case 'iso-8859-1': - // On commente cet appel tant qu'il reste des spip v<1.5 dans la nature - // $chaine = iso_8859_1_to_unicode($chaine); - break; - // FORCE-iso-8859-1 passe le message suivant : on VEUT la conversion, meme - // si elle est desactivee dans entites_unicode pour maintenir (temporairement) - // la lisibilite de notre backend sur des SPIP v<1.5 - case 'FORCE-iso-8859-1': - $chaine = iso_8859_1_to_unicode($chaine); - break; - case 'utf-8': - $chaine = utf_8_to_unicode($chaine); + return utf_8_to_unicode($chaine); break; - case 'windows-1251': - $chaine = windows_1251_to_unicode($chaine); - break; - - default: - break; + case 'iso-8859-1': + // On commente cet appel tant qu'il reste des spip v<1.5 dans la nature + // pour que le filtre |entites_unicode donne des backends lisibles sur ces spips. + if (!$forcer) return $chaine; + default: + $trans = load_charset($charset); + $s = ''; + $len = strlen($chaine); + for ($p = 0; $p <= $len; $c = substr($chaine,$p++,1)) { + if ((($i=ord($c))>127) and ($j=$trans[$i])) + $s .= "&#$j;"; + else + $s .= $c; + } + return $s; } - return $chaine; } // transforme les entites unicode  dans le charset courant @@ -190,45 +218,23 @@ function unicode2charset($chaine, $charset='AUTO') { switch($charset) { - case 'iso-8859-1': - $chaine = unicode_to_iso_8859_1($chaine); - break; - case 'utf-8': - $chaine = unicode_to_utf_8($chaine); + return unicode_to_utf_8($chaine); break; - case 'windows-1251': - $chaine = unicode_to_windows_1251($chaine); - break; - default: - break; + $trans = load_charset($charset); + while (list($chr,$uni) = each($trans)) // array_flip + $ttrans[$uni] = $chr; + while (ereg('�*([0-9]+);', $chaine, $regs) AND !$vu[$i = intval($regs[1])]) { + $vu[$i] = true; + if ($s = $ttrans[$i]) + $chaine = ereg_replace($regs[0], $s, $chaine); + } + return $chaine; } - return $chaine; } -// -// Il faut deux fonctions par charset : charset->unicode et unicode->charset -// - -// ISO-8859-1 -function iso_8859_1_to_unicode($chaine) { - while ($i = ord(substr($chaine,$p++))) - if ($i>127) - $s .= "&#$i;"; - else - $s .= chr($i); - return $s; -} -function unicode_to_iso_8859_1($chaine) { - while (ereg('&#([0-9]+);', $chaine, $regs) AND !$vu[$regs[1]]) { - $vu[$regs[1]] = true; - if ($regs[1] < 256) - $chaine = ereg_replace($regs[0], chr($regs[1]), $chaine); - } - return $chaine; -} // UTF-8 function utf_8_to_unicode($source) { @@ -306,8 +312,9 @@ function utf_8_to_unicode($source) { return $encodedString; } + function unicode_to_utf_8($chaine) { - while (ereg('&#([0-9]+);', $chaine, $regs) AND !$vu[$regs[1]]) { + while (ereg('�*([0-9]+);', $chaine, $regs) AND !$vu[$regs[1]]) { $num = $regs[1]; $vu[$num] = true; if($num<128) $s = chr($num); // Ce bloc provient de php.net, auteur Ronen @@ -321,58 +328,6 @@ function unicode_to_utf_8($chaine) { } -// WINDOWS-1251 (CYRILLIQUE) -function load_windows_1251() { - // extrait de la table - // http://www.slav.helsinki.fi/atk/codepages/win1251.html - static $table; - if(is_array($table)) - return $table; - else - return $table = array( - chr(129)=>'Ѓ',chr(131)=>'ѓ',chr(138)=>'Љ',chr(140)=>'Њ',chr(141)=>'Ќ', - chr(142)=>'Ћ',chr(143)=>'Џ',chr(144)=>'ђ',chr(154)=>'љ',chr(156)=>'њ', - chr(157)=>'ќ',chr(158)=>'ћ',chr(159)=>'џ',chr(161)=>'Ў',chr(162)=>'ў', - chr(163)=>'Ј',chr(165)=>'Ґ',chr(168)=>'Ё',chr(170)=>'Є',chr(175)=>'Ї', - chr(178)=>'І',chr(179)=>'і',chr(180)=>'ґ',chr(184)=>'ё',chr(186)=>'є', - chr(188)=>'ј',chr(189)=>'Ѕ',chr(190)=>'ѕ',chr(191)=>'ї',chr(192)=>'А', - chr(193)=>'Б',chr(194)=>'В',chr(195)=>'Г',chr(196)=>'Д',chr(197)=>'Е', - chr(198)=>'Ж',chr(199)=>'З',chr(200)=>'И',chr(201)=>'Й',chr(202)=>'К', - chr(203)=>'Л',chr(204)=>'М',chr(205)=>'Н',chr(206)=>'О',chr(207)=>'П', - chr(208)=>'Р',chr(209)=>'С',chr(210)=>'Т',chr(211)=>'У',chr(212)=>'Ф', - chr(213)=>'Х',chr(214)=>'Ц',chr(215)=>'Ч',chr(216)=>'Ш',chr(217)=>'Щ', - chr(218)=>'Ъ',chr(219)=>'Ы',chr(220)=>'Ь',chr(221)=>'Э',chr(222)=>'Ю', - chr(223)=>'Я',chr(224)=>'а',chr(225)=>'б',chr(226)=>'в',chr(227)=>'г', - chr(228)=>'д',chr(229)=>'е',chr(230)=>'ж',chr(231)=>'з',chr(232)=>'и', - chr(233)=>'й',chr(234)=>'к',chr(235)=>'л',chr(236)=>'м',chr(237)=>'н', - chr(238)=>'о',chr(239)=>'п',chr(240)=>'р',chr(241)=>'с',chr(242)=>'т', - chr(243)=>'у',chr(244)=>'ф',chr(245)=>'х',chr(246)=>'ц',chr(247)=>'ч', - chr(248)=>'ш',chr(249)=>'щ',chr(250)=>'ъ',chr(251)=>'ы',chr(252)=>'ь', - chr(253)=>'э',chr(254)=>'ю',chr(255)=>'я' - ); -} -function windows_1251_to_unicode($chaine) { - $trans = load_windows_1251(); - while ($i = substr($chaine,$p++,1)) - if ($t = $trans[$i]) - $s .= $t; - else - $s .= $i; - return $s; -} -function unicode_to_windows_1251($chaine) { - $trans = load_windows_1251(); - while (list($chr,$uni) = each($trans)) // array_flip - $ttrans[$uni] = $chr; - while (ereg('&#([0-9]+);', $chaine, $regs) AND !$vu[$regs[1]]) { - $vu[$regs[1]] = true; - if ($ttrans[$regs[0]]) - $chaine = ereg_replace($regs[0], $ttrans[$regs[0]], $chaine); - } - return $chaine; -} - - // // Translitteration charset => ascii (pour l'indexation) // @@ -380,58 +335,23 @@ function translitteration ($texte, $charset='AUTO') { if ($charset == 'AUTO') $charset = lire_meta('charset'); - if ($charset == 'iso-8859-1') { - $texte = translit_iso8859_1($texte); - } else if ($charset == 'windows-1251') { - $texte = translit_windows_1251($texte); - } else if ($GLOBALS['flag_iconv'] - AND ($iconv = @iconv(strtoupper($charset), 'ASCII//TRANSLIT', $texte)) - AND !ereg('^\?+$',$iconv)) - $texte = $iconv; - return $texte; -} - -function translit_iso8859_1($texte) { - // Merci a phpDig (Antoine Bajolet) pour la fonction originale - $accents = - /* A */ chr(192).chr(193).chr(194).chr(195).chr(196).chr(197). - /* a */ chr(224).chr(225).chr(226).chr(227).chr(228).chr(229). - /* O */ chr(210).chr(211).chr(212).chr(213).chr(214).chr(216). - /* o */ chr(242).chr(243).chr(244).chr(245).chr(246).chr(248). - /* E */ chr(200).chr(201).chr(202).chr(203). - /* e */ chr(232).chr(233).chr(234).chr(235). - /* Cc */ chr(199).chr(231). - /* I */ chr(204).chr(205).chr(206).chr(207). - /* i */ chr(236).chr(237).chr(238).chr(239). - /* U */ chr(217).chr(218).chr(219).chr(220). - /* u */ chr(249).chr(250).chr(251).chr(252). - /* yNn */ chr(255).chr(209).chr(241); - return strtr($texte, - $accents, - "AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn"); -} - -function translit_windows_1251($texte) { - $code = array ( - 128=>'D',129=>'G',136=>'euro',138=>'LJ',140=>'NJ',141=>'KJ',142=>'Ts',143=>'DZ',144=>'d',149=>'o', - 154=>'lj',156=>'nj',157=>'kj',158=>'ts',159=>'dz',161=>'V',162=>'v',163=>'J',165=>'G',168=>'IO', - 170=>'IE',175=>'YI',178=>'II',179=>'ii',180=>'g',181=>'mu',184=>'io',186=>'ie',188=>'j',189=>'DS', - 190=>'ds',191=>'yi',192=>'A',193=>'B',194=>'V',195=>'G',196=>'D',197=>'E',198=>'ZH',199=>'Z', - 200=>'I',201=>'J',202=>'K',203=>'L',204=>'M',205=>'N',206=>'O',207=>'P',208=>'R',209=>'S',210=>'T', - 211=>'U',212=>'F',213=>'H',214=>'C',215=>'CH',216=>'SH',217=>'SCH',218=>'_',219=>'Y',220=>'_', - 221=>'e',222=>'YU',223=>'YA',224=>'a',225=>'b',226=>'v',227=>'g',228=>'d',229=>'e',230=>'zh', - 231=>'z',232=>'i',233=>'j',234=>'k',235=>'l',236=>'m',237=>'n',238=>'o',239=>'p',240=>'r',241=>'s', - 242=>'t',243=>'u',244=>'f',245=>'h',246=>'c',247=>'ch',248=>'sh',249=>'sch',250=>' ',251=>'y', - 252=>'_',253=>'E',254=>'yu',255=>'ya'); - - for ($i=0; $i<strlen($texte);$i++) { - $d = substr($texte,$i,1); - if ($c = $code[ord($d)]) - $ret .= $c; - else - $ret .= $d; + // 1. passer en unicode + $texte = entites_unicode(filtrer_entites($texte), $charset, true); + + // 2. translitterer + $trans = load_charset('translit'); + while (ereg('�*([0-9]+);', $texte, $regs) AND !$vu[$i = $regs[1]]) { + $vu[$i] = true; + if ($s = $trans[$i]) + $texte = ereg_replace($regs[0], $s, $texte); + // on va tenter de trouver la translitteration ailleurs + // - dans iconv par exemple + else if ($GLOBALS['flag_iconv'] AND ($iconv = @iconv($charset, 'ASCII//TRANSLIT', $texte)) AND !ereg('^\?+$',$iconv)) { + $GLOBALS['CHARSET']['translit'][$i] = $iconv; + $texte = ereg_replace($regs[0], $iconv, $texte); + } } - return $ret; + return $texte; } -?> \ No newline at end of file +?> diff --git a/ecrire/inc_documents.php3 b/ecrire/inc_documents.php3 index f85a9f2d4a..6f39a4d534 100644 --- a/ecrire/inc_documents.php3 +++ b/ecrire/inc_documents.php3 @@ -411,7 +411,7 @@ function afficher_documents_non_inclus($id_article, $type = "article", $flag_mod /// Ajouter nouveau document/image echo debut_cadre_enfonce("doc-24.gif",false,"creer.gif"); - echo "<div style='padding: 2px; background-color: $couleur_claire; text-align: ".$GLOBALS['spip_lang_right']."; color: black;'>"; + echo "<div style='padding: 2px; background-color: $couleur_claire; text-align: ".$GLOBALS['spip_lang_left']."; color: black;'>"; echo bouton_block_invisible("ajouter_document"); if ($type == "rubrique") echo "<b><font size=1>"._T('titre_publier_document')."</font></b>".aide("ins_doc"); else echo "<b><font size=1>"._T('titre_joindre_document')."</font></b>".aide("ins_doc"); diff --git a/ecrire/inc_index.php3 b/ecrire/inc_index.php3 index ef78228b84..f0635b1f71 100644 --- a/ecrire/inc_index.php3 +++ b/ecrire/inc_index.php3 @@ -7,12 +7,12 @@ define("_ECRIRE_INC_INDEX", "1"); function nettoyer_chaine_indexation($texte) { include_ecrire("inc_charsets.php3"); - return strtr(strtolower(translitteration($texte)), '\'"',' '); + return eregi_replace("[^A-Z0-9_-]","",strtolower(translitteration($texte))); } // Merci a Herve Lefebvre pour son apport sur cette fonction function separateurs_indexation() { - return "].,;`:*'\"?!\r\n\t\\/\~(){}[|&@<>$%#". + return "].,`:*'\"?!\r\n\t\\/\~(){}[|@<>$%". chr(187).chr(171).chr(133).chr(145).chr(146).chr(180).chr(147).chr(148); } diff --git a/ecrire/inc_sites.php3 b/ecrire/inc_sites.php3 index cfb633f7ba..70f9beb085 100644 --- a/ecrire/inc_sites.php3 +++ b/ecrire/inc_sites.php3 @@ -88,13 +88,14 @@ function recuperer_page($url) { function transcoder_page($texte) { // Si le backend precise son charset et que celui-ci est connu de SPIP, - // decoder puis rencoder + // decoder puis recoder if (eregi('<\\?xml[[:space:]][^>]*(utf-8)', $texte, $regs)) { $charset_page = $regs[1]; $texte = unicode2charset(entites_unicode($texte, $charset_page)); } - // Si le backend ne precise pas, on considere qu'il est iso-8859-1 - else $texte = unicode2charset(entites_unicode($texte, 'FORCE-iso-8859-1')); + // Si le backend ne precise pas, on considere qu'il est iso-8859-1 : il faut + // alors forcer la conversion + else $texte = unicode2charset(entites_unicode($texte, 'iso-8859-1', true)); return $texte; } diff --git a/ecrire/inc_version.php3 b/ecrire/inc_version.php3 index 544b3e25c4..66a17e121f 100644 --- a/ecrire/inc_version.php3 +++ b/ecrire/inc_version.php3 @@ -245,8 +245,8 @@ function spip_setcookie ($name='', $value='', $expire=0, $path='', $domain='', $ $name = ereg_replace ('^spip', $GLOBALS['cookie_prefix'], $name); // patch safari beta 51-60 - if (!$path AND eregi("Safari", $GLOBALS['HTTP_USER_AGENT'])) - $path = ereg_replace("/[^/]+$", "/", $GLOBALS['REQUEST_URI']); + if (!$path AND eregi("Safari", $GLOBALS['HTTP_USER_AGENT'])) + $path = ereg_replace("/[^/]+$", "/", $GLOBALS['REQUEST_URI']); if ($secure) @setcookie ($name, $value, $expire, $path, $domain, $secure); diff --git a/spip_image.php3 b/spip_image.php3 index 3a5e135841..6f6d46af5c 100644 --- a/spip_image.php3 +++ b/spip_image.php3 @@ -3,7 +3,7 @@ include ("ecrire/inc_version.php3"); include_ecrire("inc_filtres.php3"); -include_ecrire("inc_index.php3"); +include_ecrire("inc_charsets.php3"); include_ecrire("inc_meta.php3"); include_ecrire("inc_admin.php3"); include_local("inc-cache.php3"); @@ -281,8 +281,7 @@ function ajout_doc($orig, $source, $dest, $mode, $id_document, $doc_vignette='', $dest = 'IMG/'; if (creer_repertoire('IMG', $ext)) $dest .= $ext.'/'; - $dest .= ereg_replace("[^.a-zA-Z0-9_=-]+", "_", - nettoyer_chaine_indexation(ereg_replace("\.([^.]+)$", "", supprimer_tags(basename($orig))))); + $dest .= ereg_replace("[^.a-zA-Z0-9_=-]+", "_", translitteration(ereg_replace("\.([^.]+)$", "", supprimer_tags(basename($orig))))); $n = 0; while (file_exists($newFile = $dest.($n++ ? '-'.$n : '').'.'.$ext)); $dest_path = $newFile; -- GitLab