|
php.net | support | documentation | report a bug | advanced search | search howto | statistics | random bug | login |
[2001-02-21 22:53 UTC] alan_k at hklc dot com
htmlspecialchars() and htmlentities() often replace the second byte of a Chinese character with an 'htmlized' &xxx; entity - this is annoying :) and makes it very difficult to write PHP code that handles dynamic Chinese content.
Anyway, this patch goes part of the way to solving it. Note: I have not tested it, so testers are needed. I'm on the dev list, so I should be able to follow any comments.
It does produce compile-time errors on the character range checks (I'm guessing that gcc makes the assumption that a char should be < 128?).
I have added a check using setlocale(LC_ALL, NULL). This may not be the correct test, and it may not return the correct information if the locale is set from within PHP - again, untested. Does querying LC_ALL return something useful? Should it use environment variables instead?
Anyway - that's a long enough bug report...
regards
alan
Index: html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.22
diff -u -r1.22 html.c
--- html.c 2000/11/24 16:17:58 1.22
+++ html.c 2001/02/22 03:43:13
@@ -22,7 +22,7 @@
#include "php.h"
#include "reg.h"
#include "html.h"
-
+#include <locale.h>
/* This must be fixed to handle the input string according to LC_CTYPE.
Defaults to ISO-8859-1 for now. */
@@ -52,8 +52,17 @@
PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style)
{
int i, maxlen, len;
- char *new;
-
+
+ char *new, *oldnext, *oldprev;
+#if HAVE_SETLOCALE
+ int checklang=0,ischinese;
+ /* should this check the enviroment value? */
+ char *locale = setlocale(LC_ALL, NULL);
+ if ((!strcmp("zh_TW.Big5", locale)) ||
+ (!strcmp("zh_TW", locale)) ||
+ (!strcmp("zh_CN", locale)) ||
+ (!strcmp("zh_CN.GB2313", locale))) checklang=1;
+#endif
maxlen = 2 * oldlen;
if (maxlen < 128)
maxlen = 128;
@@ -62,34 +71,72 @@
i = oldlen;
while (i--) {
- if (len + 9 > maxlen)
+ if (len + 9 > maxlen)
new = erealloc (new, maxlen += 128);
- if (38 == *old) {
- memcpy (new + len, "&amp;", 5);
- len += 5;
- } else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
- memcpy (new + len, "&quot;", 6);
- len += 6;
- } else if (39 == *old && (quote_style&ENT_QUOTES)) {
- memcpy (new + len, "&#039;", 6);
- len += 6;
- } else if (60 == *old) {
- memcpy (new + len, "&lt;", 4);
- len += 4;
- } else if (62 == *old) {
- memcpy (new + len, "&gt;", 4);
- len += 4;
- } else if (all && 160 <= *old) {
- new [len++] = '&';
- strcpy (new + len, EntTable [*old - 160]);
- len += strlen (EntTable [*old - 160]);
- new [len++] = ';';
+#if HAVE_SETLOCALE
+
+
+ ischinese = 0;
+ if (checklang) {
+ if (i > 1) {
+ oldnext = old+1;
+ if ((*old >= 0xa1) &&
+ (*old <= 0xf9) &&
+ (((*oldnext >= 0x40) &&
+ (*oldnext <= 0x73)) ||
+ ((*oldnext >= 0xa1) &&
+ (*oldnext <= 0xfe)))
+ ) ischinese = 1;
+ }
+ /* check if this is the seconde character in a chinese pair */
+ if ((i != oldlen) && (!ischinese)) {
+ oldprev = old-1;
+ if ((*oldprev >= 0xa1) &&
+ (*oldprev <= 0xf9) &&
+ (((*old >= 0x40) &&
+ (*old <= 0x73)) ||
+ ((*old >= 0xa1) &&
+ (*old <= 0xfe)))
+ ) ischinese = 1;
+ }
+ }
+
+ if (ischinese) {
+ /* it is chinese - ignore it */
+ new [len++] = *old;
} else {
- new [len++] = *old;
- }
- old++;
+#endif
+
+ if (38 == *old) {
+ memcpy (new + len, "&amp;", 5);
+ len += 5;
+ } else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
+ memcpy (new + len, "&quot;", 6);
+ len += 6;
+ } else if (39 == *old && (quote_style&ENT_QUOTES)) {
+ memcpy (new + len, "&#039;", 6);
+ len += 6;
+ } else if (60 == *old) {
+ memcpy (new + len, "&lt;", 4);
+ len += 4;
+ } else if (62 == *old) {
+ memcpy (new + len, "&gt;", 4);
+ len += 4;
+ } else if (all && 160 <= *old) {
+ new [len++] = '&';
+ strcpy (new + len, EntTable [*old - 160]);
+ len += strlen (EntTable [*old - 160]);
+ new [len++] = ';';
+ } else {
+ new [len++] = *old;
+ }
+#if HAVE_SETLOCALE
+
+ }
+#endif
+ old++;
}
- new [len] = '\0';
+ new [len] = '\0';
*newlen = len;
return new;
PatchesPull RequestsHistoryAllCommentsChangesGit/SVN commits
|
|||||||||||||||||||||||||||
Copyright © 2001-2025 The PHP GroupAll rights reserved. |
Last updated: Mon Oct 27 05:00:01 2025 UTC |
OK, patch updated and tested.. points to note: this line in php_escape_html_entities( } else if (this_char > 0xff) { I don't think this will work in the current code as this_char is unsigned short (0-255) - or that is what my C book says :) changed to unsigned long and the code works as expected. this appears to work both by setting and by using the hint field (undocumented - I guess until 4.0.7 release).. setlocale(LC_ALL,"zh_TW"); echo htmlentities("some chinese"); regards alan Index: html.c =================================================================== RCS file: /repository/php4/ext/standard/html.c,v retrieving revision 1.32 diff -u -r1.32 html.c --- html.c 11 Aug 2001 17:03:37 -0000 1.32 +++ html.c 20 Aug 2001 12:32:06 -0000 @@ -35,7 +35,7 @@ Defaults to ISO-8859-1 for now. */ enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, - cs_8859_15, cs_utf_8 }; + cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, cs_big5hkscs }; typedef const char * entity_table_t; /* codepage 1252 is a Windows extension to iso-8859-1. 
*/ @@ -96,6 +96,9 @@ { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, { cs_terminator } }; @@ -105,14 +108,17 @@ } charset_map[] = { { "ISO-8859-1", cs_8859_1 }, { "ISO-8859-15", cs_8859_15 }, - { "utf-8", cs_utf_8 }, + { "utf-8", cs_utf_8 }, { "cp1252", cs_cp1252 }, + { "zh_TW", cs_big5 }, + { "zh_CN", cs_gb2312 }, + { "zh_HK", cs_big5hkscs }, { NULL } }; /* {{{ get_next_char */ -inline static unsigned short get_next_char(enum entity_charset charset, +inline static unsigned long get_next_char(enum entity_charset charset, unsigned char * str, int * newpos, unsigned char * mbseq, @@ -121,7 +127,7 @@ { int pos = *newpos; int mbpos = 0; - unsigned short this_char = str[pos++]; + unsigned long this_char = str[pos++]; mbseq[mbpos++] = (unsigned char)this_char; @@ -205,7 +211,49 @@ mbseq[mbpos++] = (unsigned char)this_char; } } while(more); - } + } else if ((charset == cs_big5) || (charset == cs_gb2312) || (charset == cs_big5hkscs)) { + + unsigned long utf = 0; + int stat = 0; + int more = 1; + + /* unpack double byte encoding into a two chars. 
*/ + + + do { + if ((stat==0) && (this_char < 0xa1) || (this_char > 0xf9)) { + more = 0; + break; + } + else if (stat==0) { + utf = this_char << 16; + stat=1; + } + else if (((this_char >= 0x40) && + (this_char <= 0x73)) || + ((this_char >= 0xa1) && + (this_char <= 0xfe))) { + utf += this_char; + more = 0; + } + else { + /* invalid; bail */ + more = 0; + utf=0; + pos = *newpos; + mbpos = 0; + break; + } + if (more) + { + this_char = str[pos++]; + mbseq[mbpos++] = (unsigned char)this_char; + } + } while(more); + if (utf != 0) + this_char = utf; + + } *newpos = pos; mbseq[mbpos] = '\0'; *mbseqlen = mbpos; @@ -223,24 +271,27 @@ int len; /* Guarantee default behaviour */ - if (charset_hint == NULL) - return cs_8859_1; + /*if (charset_hint == NULL) { + return cs_8859_1; + } + */ - if (strlen(charset_hint) == 0) { + if ((charset_hint == NULL) || strlen(charset_hint) == 0) { + char * localename; /* try to detect the charset for the locale */ #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET) - charset_hint = nl_langinfo(CODESET); + charset_hint = nl_langinfo(CODESET); #endif #if HAVE_LOCALE_H - if (charset_hint == NULL) + localename = setlocale(LC_CTYPE, NULL); + if (localename != NULL) { /* try to figure out the charset from the locale */ - char * localename; + char * dot, * at; /* lang[_territory][.codeset][@modifier] */ localename = setlocale(LC_CTYPE, NULL); - dot = strchr(localename, '.'); if (dot) { dot++; @@ -286,6 +337,7 @@ int i, maxlen, len; char *new; enum entity_charset charset = determine_charset(hint_charset); + maxlen = 2 * oldlen; if (maxlen < 128) @@ -299,12 +351,12 @@ unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence it should be more than enough.. 
*/ - unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen); + unsigned long this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen); int matches_map = 0; if (len + 9 > maxlen) new = erealloc (new, maxlen += 128); - + if (all) { /* look for a match in the maps for this charset */ int j;