php.net |  support |  documentation |  report a bug |  advanced search |  search howto |  statistics |  random bug |  login
Request #2685 HtmlEntities should handle UTF-8
Submitted: 1999-11-08 12:07 UTC Modified: 2002-10-01 21:48 UTC
From: drew at elysium dot ltd dot uk Assigned:
Status: Closed Package: Feature/Change Request
PHP Version: 4.2.3 OS: Linux RH 6.0
Private report: No CVE-ID: None
 [1999-11-08 12:07 UTC] drew at elysium dot ltd dot uk
I needed a version of HtmlEntities that handles UTF-8 better than the default PHP one: I needed to cater for σ in an XML document and output it as at least Σ.  I have created another function, called Utf8_HtmlEntities, which does exactly this, and have included the diff.

diff -c php-3.0.12.orig/functions/basic_functions.c php-3.0.12/functions/basic_functions.c
*** php-3.0.12.orig/functions/basic_functions.c	Mon Nov  8 16:51:15 1999
--- php-3.0.12/functions/basic_functions.c	Mon Nov  8 16:59:43 1999
***************
*** 167,172 ****
--- 167,173 ----
  	{"getimagesize",		php3_getimagesize,	NULL},
  	{"htmlspecialchars",	php3_htmlspecialchars,	NULL},
  	{"htmlentities",		php3_htmlentities,	NULL},
+ 	{"utf8_htmlentities",		php3_utf8_htmlentities,	NULL},
  	{"md5",					php3_md5,			NULL},
  
  	{"iptcparse",	php3_iptcparse,				NULL},
diff -c php-3.0.12.orig/functions/html.c php-3.0.12/functions/html.c
*** php-3.0.12.orig/functions/html.c	Mon Nov  8 16:51:15 1999
--- php-3.0.12/functions/html.c	Mon Nov  8 16:58:53 1999
***************
*** 60,123 ****
  	"uuml","yacute","thorn","yuml"
  };
  
! PHPAPI char * _php3_htmlentities(char *s, int i, int all)
  {
! 	int len, maxlen;
!     unsigned char *old;
  	char *new;
  
! 	old = (unsigned char *)s;
  
! 	maxlen = 2 * i;
  	if (maxlen < 128)
  		maxlen = 128;
  	new = emalloc (maxlen);
! 	len = 0;
  
! 	while (i--) {
! 		if (len + 9 > maxlen)
! 			new = erealloc (new, maxlen += 128);
! 		if (38 == *old) {
! 			memcpy (new + len, "&amp;", 5);
! 			len += 5;
! 		} else if (34 == *old) {
! 			memcpy (new + len, "&quot;", 6);
! 			len += 6;
! 		} else if (60 == *old) {
! 			memcpy (new + len, "&lt;", 4);
! 			len += 4;
! 		} else if (62 == *old) {
! 			memcpy (new + len, "&gt;", 4);
! 			len += 4;
! 		} else if (all && 160 <= *old) {
! 			new [len++] = '&';
! 			strcpy (new + len, EntTable [*old - 160]);
! 			len += strlen (EntTable [*old - 160]);
! 			new [len++] = ';';
  		} else {
! 			new [len++] = *old;
  		}
- 		old++;
  	}
!     new [len] = '\0';
  	return new;
  }
  
  static void _htmlentities(INTERNAL_FUNCTION_PARAMETERS, int all)
  {
!     pval *arg;
  	char *new;
  	TLS_VARS;
  
!     if (ARG_COUNT(ht) != 1 || getParameters(ht, 1, &arg) == FAILURE) {
  		WRONG_PARAM_COUNT;
!     }
  
!     convert_to_string(arg);
   
  	new = _php3_htmlentities(arg->value.str.val, arg->value.str.len, all);
  
! 	RETVAL_STRINGL(new,strlen(new),0);
  }
  
  /* {{{ proto string htmlspecialchars(string string)
--- 60,210 ----
  	"uuml","yacute","thorn","yuml"
  };
  
! PHPAPI char * _php3_htmlentities(char *s, int len, int all)
  {
! 	int newlen, maxlen;
! 	int pos = len;
! 	unsigned char c;
  	char *new;
  
! 	maxlen = 2 * len;
! 	if (maxlen < 128)
! 		maxlen = 128;
! 	new = emalloc(maxlen);
! 	newlen = 0;
! 
! 	while (pos > 0) {
! 		c = (unsigned char)(*s);
! 		if (newlen + 9 > maxlen)
! 			new = erealloc(new, maxlen += 128);
! 		if (c == 38) {
! 			memcpy(new + newlen, "&amp;", 5);
! 			newlen += 5;
! 		} else if (c == 34) {
! 			memcpy(new + newlen, "&quot;", 6);
! 			newlen += 6;
! 		} else if (c == 60) {
! 			memcpy(new + newlen, "&lt;", 4);
! 			newlen += 4;
! 		} else if (c == 62) {
! 			memcpy(new + newlen, "&gt;", 4);
! 			newlen += 4;
! 		} else if (all && c >= 160) {
! 			new[newlen++] = '&';
! 			strcpy(new + newlen, EntTable[c - 160]);
! 			newlen += strlen(new + newlen);
! 			new[newlen++] = ';';
! 		} else {
! 			new[newlen++] = c;
! 		}
! 		s++;
! 	}
! 	new[newlen] = '\0';
! 	return new;
! }
  
! PHPAPI char * _php3_utf8_htmlentities(char *s, int len, int all)
! {
! 	int newlen, maxlen;
!         int pos = len;
!         unsigned long c;
! 	char *new;
! 
! 	maxlen = 2 * len;
  	if (maxlen < 128)
  		maxlen = 128;
  	new = emalloc (maxlen);
! 	newlen = 0;
! 
! 	while (pos > 0) {
! 		c = (unsigned char)(*s);
! 		/* four bytes encoded, 21 bits */
! 		if (c >= 0xf0 && pos >= 4) {
! 			c = (*(s++) & 7) << 18;
! 			c += (*(s++) & 63) << 12;
! 			c += (*(s++) & 63) << 6;
! 			c += (*(s++) & 63);
! 			pos -= 4;
! 		/* three bytes encoded, 16 bits */
! 		} else if (c >= 0xe0 && c < 0xf0 && pos >= 3) {
! 			c = (*(s++) & 15) << 12;
! 			c += (*(s++) & 63) << 6;
! 			c += (*(s++) & 63);
! 			pos -= 3;
! 		/* two bytes encoded, 11 bits */
! 		} else if (c >= 0xc0 && c < 0xe0 && pos >= 2) {
! 			c = ((unsigned long)*(s++) & 31) << 6;
! 			c += ((unsigned long)*(s++) & 63);
! 			pos -= 2;
! 		} else {
! 			c = (*(s++) & 127);
! 			pos--;
! 		}
  
! 		if (newlen + 11 > maxlen)
! 			new = erealloc(new, maxlen += 128);
! 		if (c == 38) {
! 			memcpy(new + newlen, "&amp;", 5);
! 			newlen += 5;
! 		} else if (c == 34) {
! 			memcpy(new + newlen, "&quot;", 6);
! 			newlen += 6;
! 		} else if (c == 60) {
! 			memcpy (new + newlen, "&lt;", 4);
! 			newlen += 4;
! 		} else if (c == 62) {
! 			memcpy (new + newlen, "&gt;", 4);
! 			newlen += 4;
! 		} else if (all && c >= 160) {
! 			new[newlen++] = '&';
! 			if (c <= 255) {
! 				strcpy(new + newlen, EntTable[c - 160]);
! 			} else {
! 				new[newlen++] = '#';
! 				sprintf(new + newlen, "%lu", c);
! 			}
! 			newlen += strlen(new + newlen);
! 			new[newlen++] = ';';
  		} else {
! 			new[newlen++] = (unsigned char)c;
  		}
  	}
! 	new[newlen] = '\0';
  	return new;
  }
  
  static void _htmlentities(INTERNAL_FUNCTION_PARAMETERS, int all)
  {
! 	pval *arg;
  	char *new;
  	TLS_VARS;
  
! 	if (ARG_COUNT(ht) != 1 || getParameters(ht, 1, &arg) == FAILURE) {
  		WRONG_PARAM_COUNT;
! 	}
  
! 	convert_to_string(arg);
   
  	new = _php3_htmlentities(arg->value.str.val, arg->value.str.len, all);
  
! 	RETVAL_STRINGL(new,strlen(new), 0);
! }
! 
! static void _utf8_htmlentities(INTERNAL_FUNCTION_PARAMETERS, int all)
! {
! 	pval *arg;
! 	char *new;
! 	TLS_VARS;
! 
! 	if (ARG_COUNT(ht) != 1 || getParameters(ht, 1, &arg) == FAILURE) {
! 		WRONG_PARAM_COUNT;
! 	}
! 
! 	convert_to_string(arg);
!  
! 	new = _php3_utf8_htmlentities(arg->value.str.val, arg->value.str.len, all);
! 
! 	RETVAL_STRINGL(new, strlen(new), 0);
  }
  
  /* {{{ proto string htmlspecialchars(string string)
***************
*** 133,140 ****
     Convert all applicable characters to HTML entities */
  void php3_htmlentities(INTERNAL_FUNCTION_PARAMETERS)
  {
! /*      _php3_htmlentities(INTERNAL_FUNCTION_PARAM_PASSTHRU,1);*/
! 	_htmlentities(INTERNAL_FUNCTION_PARAM_PASSTHRU,1);
  }
  /* }}} */
  
--- 220,236 ----
     Convert all applicable characters to HTML entities */
  void php3_htmlentities(INTERNAL_FUNCTION_PARAMETERS)
  {
! /*      _php3_htmlentities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);*/
! 	_htmlentities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
! }
! /* }}} */
! 
! /* {{{ proto string utf8_htmlentities(string string)
!    Convert all applicable UTF-8 characters to HTML entities */
! void php3_utf8_htmlentities(INTERNAL_FUNCTION_PARAMETERS)
! {
! /*      _php3_utf8_htmlentities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);*/
! 	_utf8_htmlentities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
  }
  /* }}} */
  
***************
*** 144,146 ****
--- 240,243 ----
   * c-basic-offset: 4
   * End:
   */
+ 
diff -c php-3.0.12.orig/functions/html.h php-3.0.12/functions/html.h
*** php-3.0.12.orig/functions/html.h	Mon Nov  8 16:51:15 1999
--- php-3.0.12/functions/html.h	Mon Nov  8 16:58:56 1999
***************
*** 34,39 ****
  
  extern void php3_htmlspecialchars(INTERNAL_FUNCTION_PARAMETERS);
  extern void php3_htmlentities(INTERNAL_FUNCTION_PARAMETERS);
! PHPAPI char * _php3_htmlentities(char *s, int i, int all);
  
  #endif /* _HTML_H */
--- 34,41 ----
  
  extern void php3_htmlspecialchars(INTERNAL_FUNCTION_PARAMETERS);
  extern void php3_htmlentities(INTERNAL_FUNCTION_PARAMETERS);
! extern void php3_utf8_htmlentities(INTERNAL_FUNCTION_PARAMETERS);
! PHPAPI char * _php3_htmlentities(char *s, int len, int all);
! PHPAPI char * _php3_utf8_htmlentities(char *s, int len, int all);
  
  #endif /* _HTML_H */

Patches

Pull Requests

History

All |  Comments |  Changes |  Git/SVN commits |  Related reports
 [2002-10-01 10:24 UTC] hholzgra@php.net
changed to feature request and bumped up the version as the request still looks valid ...
 [2002-10-01 21:48 UTC] wez@php.net
IIRC, this is in the 4.2 branch already.
htmlentities($str, ENT_COMPAT, "utf-8");
If not, it's in 4.3 due soon.
 
PHP Copyright © 2001-2025 The PHP Group
All rights reserved.
Last updated: Wed Jan 08 08:01:28 2025 UTC