|
php.net | support | documentation | report a bug | advanced search | search howto | statistics | random bug | login |
[2004-04-19 20:46 UTC] BenJin at gmx dot de
Description: ------------ During some doublecheck after Bug #28042 was closed, I discovered some more mistakes in that file. I just checked the UTF-8 tables, don't know if the other charsets are wrong, too. In Bug #28042, We forgot two letters of the greek table, 'upsih' and 'piv', which are spelled with an 'i' as in ice instead of '1'. Also there are some NULLs missing at several points. This causes htmlentities(,,"UTF-8") to convert UTF-8 encoded chars into the wrong or into no HTML-Entities since the mappings are shifted. For example U+202F is mapped to ‰ which should be U+2030. Here is my diff of the php5-cvs/ext/standard/html.c, the same modifications should be made in php-4.3, please double check --- html.c 2004-04-18 02:30:24.000000000 +0200 +++ html.c.fixed 2004-04-19 18:44:47.949012992 +0200 @@ -114,13 +114,13 @@ /* 354 - 375 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 376 */ "Yuml", /* 377 - 401 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 402 */ "fnof" }; @@ -130,7 +130,7 @@ "circ", /* 711 - 731 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 732 */ "tilde", }; @@ -147,9 +147,9 @@ "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", /* 970 - 976 are not mapped */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "thetasym", "ups1h", + "thetasym", "upsih", NULL, NULL, NULL, - "p1v" + "piv" }; static entity_table_t ent_uni_punct[] = { @@ -158,7 +158,7 @@ "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, "lsquo", "rsquo", "sbquo", NULL, "ldquo", 
"rdquo", "bdquo", - "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", + NULL, "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL, NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, @@ -191,7 +191,7 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 8624 (0x21b0) */ - NULL, NULL, NULL, NULL, "crarr", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 8640 (0x21c0) */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -206,9 +206,9 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 8704 (0x2200) */ "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla", - "isin", "notin", "epsis", NULL, "ni", "bepsi", NULL, "prod", + "isin", "notin", "epsis", "ni", NULL, "bepsi", NULL, "prod", /* 8720 (0x2210) */ - "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", NULL, + "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast", "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90", /* 8736 (0x2220) */ "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and", @@ -232,17 +232,19 @@ "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe", /* 8840 - 8852 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, /* 8853 */ "oplus", NULL, "otimes", /* 8856 - 8868 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, /* 8869 */ "perp", /* 8870 - 8901 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, + NULL, /* 8901 */ "sdot", /* 8902 - 8967 */ @@ -252,14 +254,13 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, /* 8968 */ "lceil", "rceil", "lfloor", "rfloor", /* 8969 - 9000 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 9001 */ "lang", "rang", };eck. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 9001 */ "lang", "rang", }; Additionally, I wonder whether it is wise to map those high Unicode characters (most of ent_uni_8592_9002, but not all) to entities that are not part of the HTML standard. Not all browsers may interpret them correctly. It would probably be better to implement a function that maps them to numeric character references such as &#8661; (hexadecimal: &#x21D5;) instead of a named entity for ⇕. (These entity names are in the ISO and SGML standards but not in HTML.) Patches | Pull Requests | History | All | Comments | Changes | Git/SVN commits
|
|||||||||||||||||||||||||||
Copyright © 2001-2025 The PHP Group. All rights reserved. |
Last updated: Sat Oct 25 21:00:01 2025 UTC |
Sorry, please be careful when using the diff; I have to learn to copy and paste correctly )-; The diff ends after the first occurrence of: + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 9001 */ "lang", "rang", }; (that is, without the "eck" and the text that follows it).