php.net |  support |  documentation |  report a bug |  advanced search |  search howto |  statistics |  random bug |  login
Bug #9392 htmlspecialchars & htmlentities do not handle double-byte character sets
Submitted: 2001-02-21 22:53 UTC Modified: 2001-08-23 06:09 UTC
From: alan_k at hklc dot com Assigned:
Status: Closed Package: *Languages/Translation
PHP Version: 4.0 Latest CVS (21/02/2001) OS: Linux
Private report: No CVE-ID: None
 [2001-02-21 22:53 UTC] alan_k at hklc dot com
htmlspecialchars() and htmlentities() often replace the second byte of a Chinese character with an HTML entity (&xxx;). This is annoying :) and makes it very difficult to generate dynamic Chinese content in PHP.

Anyway, this patch goes part of the way to solving it. Note that I have not tested it, so testers are needed. I'm on the dev list, so I should be able to follow any comments.

It does produce compile-time errors on the character range checks (I'm guessing that gcc makes the assumption that char should be < 128?).

I have added a check using setlocale(LC_ALL, NULL). This may not be the correct test, and it may not return the correct information if the locale is set from within PHP - again, untested. Does LC_ALL return something? Should it use environment variables?

Anyway, that's a long enough bug report...

regards

alan

Index: html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.22
diff -u -r1.22 html.c
--- html.c	2000/11/24 16:17:58	1.22
+++ html.c	2001/02/22 03:43:13
@@ -22,7 +22,7 @@
 #include "php.h"
 #include "reg.h"
 #include "html.h"
-
+#include <locale.h>
 /* This must be fixed to handle the input string according to LC_CTYPE.
    Defaults to ISO-8859-1 for now. */
 	
@@ -52,8 +52,17 @@
 PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style)
 {
 	int i, maxlen, len;
-	char *new;
-
+ 
+	char *new, *oldnext, *oldprev;
+#if HAVE_SETLOCALE
+	int checklang=0,ischinese;
+	/* should this check the enviroment value? */
+  	char  *locale = setlocale(LC_ALL, NULL);
+        if ((!strcmp("zh_TW.Big5", locale)) || 
+	    (!strcmp("zh_TW", locale)) ||
+	    (!strcmp("zh_CN", locale)) ||
+	    (!strcmp("zh_CN.GB2313", locale))) checklang=1;
+#endif
 	maxlen = 2 * oldlen;
 	if (maxlen < 128)
 		maxlen = 128;
@@ -62,34 +71,72 @@
 
 	i = oldlen;
 	while (i--) {
-		if (len + 9 > maxlen)
+ 		if (len + 9 > maxlen)
 			new = erealloc (new, maxlen += 128);
-		if (38 == *old) {
-			memcpy (new + len, "&amp;", 5);
-			len += 5;
-		} else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
-			memcpy (new + len, "&quot;", 6);
-			len += 6;
-		} else if (39 == *old && (quote_style&ENT_QUOTES)) {
-			memcpy (new + len, "&#039;", 6);
-			len += 6;
-		} else if (60 == *old) {
-			memcpy (new + len, "&lt;", 4);
-			len += 4;
-		} else if (62 == *old) {
-			memcpy (new + len, "&gt;", 4);
-			len += 4;
-		} else if (all && 160 <= *old) {
-			new [len++] = '&';
-			strcpy (new + len, EntTable [*old - 160]);
-			len += strlen (EntTable [*old - 160]);
-			new [len++] = ';';
+#if HAVE_SETLOCALE
+ 	    
+	 
+         	ischinese = 0; 
+		if (checklang) {
+        	       if (i > 1) { 
+        		 oldnext = old+1; 
+        		 if ((*old >= 0xa1) &&
+                	     (*old <= 0xf9) &&
+                	     (((*oldnext >= 0x40) &&
+                	       (*oldnext <= 0x73)) ||
+                	      ((*oldnext >= 0xa1) &&
+                	       (*oldnext <= 0xfe)))  
+                	    ) ischinese = 1;
+        	       }
+        	       /* check if this is the seconde character in a chinese pair */
+        	       if ((i != oldlen) && (!ischinese)) {
+        		 oldprev = old-1;
+        		 if ((*oldprev >= 0xa1) &&  
+                	     (*oldprev <= 0xf9) &&
+                	     (((*old >= 0x40) &&
+                	       (*old <= 0x73)) ||
+                	      ((*old >= 0xa1) &&
+                	       (*old <= 0xfe)))
+                	    ) ischinese = 1;
+        	       }
+                }
+		
+                if (ischinese) { 
+          	        /* it is chinese - ignore it */
+                	new [len++] = *old;
 		} else {
-			new [len++] = *old;
-		}
-		old++;
+#endif
+		
+			if (38 == *old) {
+				memcpy (new + len, "&amp;", 5);
+				len += 5;
+			} else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
+				memcpy (new + len, "&quot;", 6);
+				len += 6;
+			} else if (39 == *old && (quote_style&ENT_QUOTES)) {
+				memcpy (new + len, "&#039;", 6);
+				len += 6;
+			} else if (60 == *old) {
+				memcpy (new + len, "&lt;", 4);
+				len += 4;
+			} else if (62 == *old) {
+				memcpy (new + len, "&gt;", 4);
+				len += 4;
+			} else if (all && 160 <= *old) {
+				new [len++] = '&';
+				strcpy (new + len, EntTable [*old - 160]);
+				len += strlen (EntTable [*old - 160]);
+				new [len++] = ';';
+			} else {
+				new [len++] = *old;
+			}
+#if HAVE_SETLOCALE
+               
+                }
+#endif
+                old++;
 	}
-    new [len] = '\0';
+        new [len] = '\0';
 	*newlen = len;
 
 	return new;


Patches

Pull Requests

History

AllCommentsChangesGit/SVN commitsRelated reports
 [2001-08-18 22:15 UTC] sniper@php.net
Could you please check the latest CVS snapshot from

http://snaps.php.net/

clip from the cvs log:

"date: 2001/05/28 11:00:06;  author: wez;  state: Exp;  
Added charset awareness to htmlentities() and 
htmlspecialchars(); use an optional third parameter to 
specify the charset; otherwise tries to determine
it from the LC_CTYPE locale setting."

Please test if this is what you wanted.

--Jani


 [2001-08-19 04:30 UTC] wez@php.net
If you could rewrite your patch to fit the new architecture
for htmlentities, I'd be happy to apply it.
It should be an easier patch too.
(we might consider using the mbstring extension for this
stuff, in which case your chinese string patch might
be better off being put in there.)

--Wez.
 [2001-08-19 04:40 UTC] wez@php.net
I should add that as it stands in CVS, htmlentities only knows about iso-8859-1, iso-8859-15 and utf-8.
--Wez.
 [2001-08-20 08:33 UTC] alan_k at hklc dot com
OK, patch updated and tested.
Points to note:
This line in php_escape_html_entities():
} else if (this_char > 0xff)	{
I don't think this will work in the current code, as this_char is an unsigned short (0-255) - or that is what my C book says :)

Changed to unsigned long, and the code works as expected.

This appears to work both by setting the locale and by using the hint field (undocumented, I guess, until the 4.0.7 release):
setlocale(LC_ALL,"zh_TW");
echo htmlentities("some chinese"); 

regards

alan


Index: html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.32
diff -u -r1.32 html.c
--- html.c	11 Aug 2001 17:03:37 -0000	1.32
+++ html.c	20 Aug 2001 12:32:06 -0000
@@ -35,7 +35,7 @@
    Defaults to ISO-8859-1 for now. */
 
 enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252,
-	cs_8859_15, cs_utf_8 };
+	cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, cs_big5hkscs };
 typedef const char * entity_table_t;
 
 /* codepage 1252 is a Windows extension to iso-8859-1. */
@@ -96,6 +96,9 @@
 	{ cs_8859_1, 		0xa0, 0xff, ent_iso_8859_1 },
 	{ cs_8859_15, 		0xa0, 0xff, ent_iso_8859_15 },
 	{ cs_utf_8, 		0xa0, 0xff, ent_iso_8859_1 },
+	{ cs_big5, 		0xa0, 0xff, ent_iso_8859_1 },	
+	{ cs_gb2312, 		0xa0, 0xff, ent_iso_8859_1 },
+	{ cs_big5hkscs, 	0xa0, 0xff, ent_iso_8859_1 },	
 	{ cs_terminator }
 };
 
@@ -105,14 +108,17 @@
 } charset_map[] = {
 	{ "ISO-8859-1", 	cs_8859_1 },
 	{ "ISO-8859-15", 	cs_8859_15 },
-	{ "utf-8", 			cs_utf_8 },
+	{ "utf-8", 		cs_utf_8 },
 	{ "cp1252", 		cs_cp1252 },
+	{ "zh_TW",		cs_big5 },
+	{ "zh_CN",		cs_gb2312 },
+	{ "zh_HK",		cs_big5hkscs },
 	{ NULL }
 };
 
 /* {{{ get_next_char
  */
-inline static unsigned short get_next_char(enum entity_charset charset,
+inline static unsigned long get_next_char(enum entity_charset charset,
 		unsigned char * str,
 		int * newpos,
 		unsigned char * mbseq,
@@ -121,7 +127,7 @@
 {
 	int pos = *newpos;
 	int mbpos = 0;
-	unsigned short this_char = str[pos++];
+	unsigned long this_char = str[pos++];
 	
 	mbseq[mbpos++] = (unsigned char)this_char;
 	
@@ -205,7 +211,49 @@
 				mbseq[mbpos++] = (unsigned char)this_char;
 			}
 		} while(more);
-	}
+	} else if ((charset == cs_big5) || (charset == cs_gb2312) || (charset == cs_big5hkscs)) {
+	
+		unsigned long utf = 0;
+		int stat = 0;
+		int more = 1;
+
+		/* unpack double byte encoding into a two chars. */
+
+		
+		do {
+			if ((stat==0) && (this_char < 0xa1) || (this_char > 0xf9))	{
+				more = 0;
+				break;
+			}
+			else if (stat==0) {
+				 utf = this_char << 16;
+				 stat=1;
+			} 
+			else if  (((this_char >= 0x40) &&
+                                   (this_char <= 0x73)) ||
+                                  ((this_char >= 0xa1) &&
+                                   (this_char <= 0xfe))) {
+				  utf += this_char;
+				  more = 0;
+			}
+			else	{
+				/* invalid; bail */
+				more = 0;
+				utf=0;
+				pos = *newpos;
+				mbpos = 0;
+				break;
+			}
+			if (more)
+			{
+				this_char = str[pos++];
+				mbseq[mbpos++] = (unsigned char)this_char;
+			}
+		} while(more);		
+	        if (utf != 0) 
+		   this_char = utf;
+		
+ 	}
 	*newpos = pos;
 	mbseq[mbpos] = '\0';
 	*mbseqlen = mbpos;
@@ -223,24 +271,27 @@
 	int len;
 
 	/* Guarantee default behaviour */
-	if (charset_hint == NULL)
-		return cs_8859_1;
+	/*if (charset_hint == NULL) {
+		return cs_8859_1;		 
+	}
+	*/	
 
-	if (strlen(charset_hint) == 0)	{
+	if ((charset_hint == NULL) || strlen(charset_hint) == 0)	{
+	        char * localename;
 		/* try to detect the charset for the locale */
 #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
-		charset_hint = nl_langinfo(CODESET);
+	 	charset_hint = nl_langinfo(CODESET);  
 #endif
 #if HAVE_LOCALE_H
-		if (charset_hint == NULL)
+                localename = setlocale(LC_CTYPE, NULL);
+		if (localename != NULL)
 		{
 			/* try to figure out the charset from the locale */
-			char * localename;
+			
 			char * dot, * at;
 
 			/* lang[_territory][.codeset][@modifier] */
 			localename = setlocale(LC_CTYPE, NULL);
-
 			dot = strchr(localename, '.');
 			if (dot)	{
 				dot++;
@@ -286,6 +337,7 @@
 	int i, maxlen, len;
 	char *new;
 	enum entity_charset charset = determine_charset(hint_charset);
+	 
 
 	maxlen = 2 * oldlen;
 	if (maxlen < 128)
@@ -299,12 +351,12 @@
 		unsigned char mbsequence[16];	/* allow up to 15 characters
 													in a multibyte sequence
 													it should be more than enough.. */
-		unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen);
+		unsigned long this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen);
 		int matches_map = 0;
 		
 		if (len + 9 > maxlen)
 			new = erealloc (new, maxlen += 128);
-		
+		 
 		if (all)	{
 			/* look for a match in the maps for this charset */
 			int j;

 [2001-08-23 06:09 UTC] wez@php.net
Now fixed in CVS.
Thanks for the patch.
--Wez.
 
PHP Copyright © 2001-2024 The PHP Group
All rights reserved.
Last updated: Sat Dec 28 23:01:30 2024 UTC