|   | php.net | support | documentation | report a bug | advanced search | search howto | statistics | random bug | login | 
| 
  [2001-02-21 22:53 UTC] alan_k at hklc dot com
 htmlspecialchars & htmlentities often replace the second byte of a Chinese character with an 'htmlized' &xxx; entity - this is annoying :) and makes it very difficult to write PHP code that handles dynamic Chinese content.
Anyway, this patch goes part of the way to solving it. Note: I have not tested it, so testers are needed. I'm on the dev list, so I should be able to follow any comments.
It does produce compile-time errors on the character range (I'm guessing that gcc makes the assumption that char should be < 128?)
I have added a check using setlocale(LC_ALL, NULL). This may not be the correct test, and it may not respond with the correct info if the locale is set from PHP - again, untested. Does querying LC_ALL return something useful? Should it use environment variables instead?
Anyway - that's a long enough bug report...
regards
alan
Index: html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.22
diff -u -r1.22 html.c
--- html.c	2000/11/24 16:17:58	1.22
+++ html.c	2001/02/22 03:43:13
@@ -22,7 +22,7 @@
 #include "php.h"
 #include "reg.h"
 #include "html.h"
-
+#include <locale.h>
 /* This must be fixed to handle the input string according to LC_CTYPE.
    Defaults to ISO-8859-1 for now. */
 	
@@ -52,8 +52,17 @@
 PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style)
 {
 	int i, maxlen, len;
-	char *new;
-
+ 
+	char *new, *oldnext, *oldprev;
+#if HAVE_SETLOCALE
+	int checklang=0,ischinese;
+	/* should this check the enviroment value? */
+  	char  *locale = setlocale(LC_ALL, NULL);
+        if ((!strcmp("zh_TW.Big5", locale)) || 
+	    (!strcmp("zh_TW", locale)) ||
+	    (!strcmp("zh_CN", locale)) ||
+	    (!strcmp("zh_CN.GB2313", locale))) checklang=1;
+#endif
 	maxlen = 2 * oldlen;
 	if (maxlen < 128)
 		maxlen = 128;
@@ -62,34 +71,72 @@
 
 	i = oldlen;
 	while (i--) {
-		if (len + 9 > maxlen)
+ 		if (len + 9 > maxlen)
 			new = erealloc (new, maxlen += 128);
-		if (38 == *old) {
-			memcpy (new + len, "&", 5);
-			len += 5;
-		} else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
-			memcpy (new + len, """, 6);
-			len += 6;
-		} else if (39 == *old && (quote_style&ENT_QUOTES)) {
-			memcpy (new + len, "'", 6);
-			len += 6;
-		} else if (60 == *old) {
-			memcpy (new + len, "<", 4);
-			len += 4;
-		} else if (62 == *old) {
-			memcpy (new + len, ">", 4);
-			len += 4;
-		} else if (all && 160 <= *old) {
-			new [len++] = '&';
-			strcpy (new + len, EntTable [*old - 160]);
-			len += strlen (EntTable [*old - 160]);
-			new [len++] = ';';
+#if HAVE_SETLOCALE
+ 	    
+	 
+         	ischinese = 0; 
+		if (checklang) {
+        	       if (i > 1) { 
+        		 oldnext = old+1; 
+        		 if ((*old >= 0xa1) &&
+                	     (*old <= 0xf9) &&
+                	     (((*oldnext >= 0x40) &&
+                	       (*oldnext <= 0x73)) ||
+                	      ((*oldnext >= 0xa1) &&
+                	       (*oldnext <= 0xfe)))  
+                	    ) ischinese = 1;
+        	       }
+        	       /* check if this is the seconde character in a chinese pair */
+        	       if ((i != oldlen) && (!ischinese)) {
+        		 oldprev = old-1;
+        		 if ((*oldprev >= 0xa1) &&  
+                	     (*oldprev <= 0xf9) &&
+                	     (((*old >= 0x40) &&
+                	       (*old <= 0x73)) ||
+                	      ((*old >= 0xa1) &&
+                	       (*old <= 0xfe)))
+                	    ) ischinese = 1;
+        	       }
+                }
+		
+                if (ischinese) { 
+          	        /* it is chinese - ignore it */
+                	new [len++] = *old;
 		} else {
-			new [len++] = *old;
-		}
-		old++;
+#endif
+		
+			if (38 == *old) {
+				memcpy (new + len, "&", 5);
+				len += 5;
+			} else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
+				memcpy (new + len, """, 6);
+				len += 6;
+			} else if (39 == *old && (quote_style&ENT_QUOTES)) {
+				memcpy (new + len, "'", 6);
+				len += 6;
+			} else if (60 == *old) {
+				memcpy (new + len, "<", 4);
+				len += 4;
+			} else if (62 == *old) {
+				memcpy (new + len, ">", 4);
+				len += 4;
+			} else if (all && 160 <= *old) {
+				new [len++] = '&';
+				strcpy (new + len, EntTable [*old - 160]);
+				len += strlen (EntTable [*old - 160]);
+				new [len++] = ';';
+			} else {
+				new [len++] = *old;
+			}
+#if HAVE_SETLOCALE
+               
+                }
+#endif
+                old++;
 	}
-    new [len] = '\0';
+        new [len] = '\0';
 	*newlen = len;
 
 	return new;
PatchesPull RequestsHistoryAllCommentsChangesGit/SVN commits             | |||||||||||||||||||||||||||
|  Copyright © 2001-2025 The PHP Group All rights reserved. | Last updated: Fri Oct 31 13:00:01 2025 UTC | 
OK, patch updated and tested. Points to note: this line in php_escape_html_entities( } else if (this_char > 0xff) { I don't think this will work in the current code as this_char is unsigned short (0-255) - or that is what my C book says :) changed to unsigned long and the code works as expected. This appears to work both by setting the locale and by using the hint field (undocumented - I guess until 4.0.7 release).. setlocale(LC_ALL,"zh_TW"); echo htmlentities("some chinese"); regards alan Index: html.c =================================================================== RCS file: /repository/php4/ext/standard/html.c,v retrieving revision 1.32 diff -u -r1.32 html.c --- html.c 11 Aug 2001 17:03:37 -0000 1.32 +++ html.c 20 Aug 2001 12:32:06 -0000 @@ -35,7 +35,7 @@ Defaults to ISO-8859-1 for now. */ enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, - cs_8859_15, cs_utf_8 }; + cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, cs_big5hkscs }; typedef const char * entity_table_t; /* codepage 1252 is a Windows extension to iso-8859-1. 
*/ @@ -96,6 +96,9 @@ { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, { cs_terminator } }; @@ -105,14 +108,17 @@ } charset_map[] = { { "ISO-8859-1", cs_8859_1 }, { "ISO-8859-15", cs_8859_15 }, - { "utf-8", cs_utf_8 }, + { "utf-8", cs_utf_8 }, { "cp1252", cs_cp1252 }, + { "zh_TW", cs_big5 }, + { "zh_CN", cs_gb2312 }, + { "zh_HK", cs_big5hkscs }, { NULL } }; /* {{{ get_next_char */ -inline static unsigned short get_next_char(enum entity_charset charset, +inline static unsigned long get_next_char(enum entity_charset charset, unsigned char * str, int * newpos, unsigned char * mbseq, @@ -121,7 +127,7 @@ { int pos = *newpos; int mbpos = 0; - unsigned short this_char = str[pos++]; + unsigned long this_char = str[pos++]; mbseq[mbpos++] = (unsigned char)this_char; @@ -205,7 +211,49 @@ mbseq[mbpos++] = (unsigned char)this_char; } } while(more); - } + } else if ((charset == cs_big5) || (charset == cs_gb2312) || (charset == cs_big5hkscs)) { + + unsigned long utf = 0; + int stat = 0; + int more = 1; + + /* unpack double byte encoding into a two chars. 
*/ + + + do { + if ((stat==0) && (this_char < 0xa1) || (this_char > 0xf9)) { + more = 0; + break; + } + else if (stat==0) { + utf = this_char << 16; + stat=1; + } + else if (((this_char >= 0x40) && + (this_char <= 0x73)) || + ((this_char >= 0xa1) && + (this_char <= 0xfe))) { + utf += this_char; + more = 0; + } + else { + /* invalid; bail */ + more = 0; + utf=0; + pos = *newpos; + mbpos = 0; + break; + } + if (more) + { + this_char = str[pos++]; + mbseq[mbpos++] = (unsigned char)this_char; + } + } while(more); + if (utf != 0) + this_char = utf; + + } *newpos = pos; mbseq[mbpos] = '\0'; *mbseqlen = mbpos; @@ -223,24 +271,27 @@ int len; /* Guarantee default behaviour */ - if (charset_hint == NULL) - return cs_8859_1; + /*if (charset_hint == NULL) { + return cs_8859_1; + } + */ - if (strlen(charset_hint) == 0) { + if ((charset_hint == NULL) || strlen(charset_hint) == 0) { + char * localename; /* try to detect the charset for the locale */ #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET) - charset_hint = nl_langinfo(CODESET); + charset_hint = nl_langinfo(CODESET); #endif #if HAVE_LOCALE_H - if (charset_hint == NULL) + localename = setlocale(LC_CTYPE, NULL); + if (localename != NULL) { /* try to figure out the charset from the locale */ - char * localename; + char * dot, * at; /* lang[_territory][.codeset][@modifier] */ localename = setlocale(LC_CTYPE, NULL); - dot = strchr(localename, '.'); if (dot) { dot++; @@ -286,6 +337,7 @@ int i, maxlen, len; char *new; enum entity_charset charset = determine_charset(hint_charset); + maxlen = 2 * oldlen; if (maxlen < 128) @@ -299,12 +351,12 @@ unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence it should be more than enough.. 
*/ - unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen); + unsigned long this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen); int matches_map = 0; if (len + 9 > maxlen) new = erealloc (new, maxlen += 128); - + if (all) { /* look for a match in the maps for this charset */ int j;