Re: [PHP-DEV] Counterpart to htmlentities function: unhtmlentities

Brad Fisher Thu, 28 Feb 2002 17:47:29 -0800

Just a newer version of the unhtmlentities function.  This one utilizes
the same tables as htmlentities, and (hopefully) is a little more locale
friendly.  It also supports &#...; (decimal) &#x...; (hex) and &#X...;
(hex) numeric entity formats.


-Brad

Index: ext/standard//basic_functions.c
===================================================================
RCS file: /repository/php4/ext/standard/basic_functions.c,v
retrieving revision 1.446
diff -u -r1.446 basic_functions.c
--- ext/standard//basic_functions.c     28 Feb 2002 16:00:26 -0000      1.446
+++ ext/standard//basic_functions.c     1 Mar 2002 01:30:40 -0000
@@ -274,6 +274,7 @@
        PHP_FE(wordwrap,                                                               
                                                 NULL)
        PHP_FE(htmlspecialchars,                                                       
                                         NULL)
        PHP_FE(htmlentities,                                                           
                                         NULL)
+       PHP_FE(unhtmlentities,                                                         
+                                         NULL)
        PHP_FE(get_html_translation_table,                                             
                                 NULL)
        PHP_NAMED_FE(md5,php_if_md5,                                                   
                                 NULL)
        PHP_NAMED_FE(md5_file,php_if_md5_file,                                         
                         NULL)
Index: ext/standard//html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.40
diff -u -r1.40 html.c
--- ext/standard//html.c        28 Feb 2002 08:26:45 -0000      1.40
+++ ext/standard//html.c        1 Mar 2002 01:30:40 -0000
@@ -123,6 +123,7 @@
        { NULL }
 };
 
+
 /* {{{ get_next_char
  */
 inline static unsigned short get_next_char(enum entity_charset charset,
@@ -319,6 +320,177 @@
 }
 /* }}} */
 
+
+/* {{{ encode_char
+ * Encodes a single character value as a byte sequence in the given charset.
+ *
+ *   charset    - character set to encode for
+ *   this_char  - character value to encode
+ *   mbsequence - output buffer; receives the encoded bytes followed by a NUL.
+ *                At most 3 bytes plus the terminator are written.
+ *   mbseqlen   - receives the number of bytes stored (terminator not counted)
+ *
+ * Returns 1 when this_char is representable in charset, 0 otherwise.  When 0
+ * is returned only the low byte of this_char is stored in mbsequence.
+ *
+ * NOTE: get_next_char and encode_char both carry a character in an unsigned
+ * short (16 bits).  Values that need more than 16 bits (UTF-8 sequences of
+ * four or more bytes, the three-byte JIS X 0212 hojo-kanji form of EUC-JP)
+ * therefore cannot reach this function, and no code is provided for them.
+ * After looking at the mbstring extension, a 32-bit int appears to be a
+ * better container; those encodings can be added once the type is widened.
+ */
+inline static int encode_char(enum entity_charset charset,
+		unsigned short this_char,
+		unsigned char * mbsequence,
+		int * mbseqlen)
+{
+	int mbpos = 0, valid = 1;
+	unsigned char high_byte = this_char >> 8,
+				  low_byte = this_char & 0xff;
+
+	/* Every case stores the lead byte(s) only; the final byte is kept in
+	 * low_byte and appended exactly once after the switch. */
+	switch (charset) {
+		case cs_utf_8:
+			/* Bit packing taken from the mbstring extension.  Values below
+			 * 0x80 are plain ASCII and encode as themselves. */
+			if (this_char >= 0x800) {
+				mbsequence[mbpos++] = (unsigned char) (((this_char >> 12) & 0x0f) | 0xe0);
+				mbsequence[mbpos++] = (unsigned char) (((this_char >> 6) & 0x3f) | 0x80);
+				low_byte = (unsigned char) ((this_char & 0x3f) | 0x80);
+			} else if (this_char >= 0x80) {
+				mbsequence[mbpos++] = (unsigned char) (((this_char >> 6) & 0x1f) | 0xc0);
+				low_byte = (unsigned char) ((this_char & 0x3f) | 0x80);
+			}
+			break;
+
+		case cs_big5:
+		case cs_gb2312:
+		case cs_big5hkscs:
+			if (high_byte) {
+				if (high_byte >= 0xa1 && high_byte <= 0xf9) {
+					if ((low_byte >= 0x40 && low_byte <= 0x73) ||
+						(low_byte >= 0xa1 && low_byte <= 0xfe))
+					{
+						mbsequence[mbpos++] = high_byte;
+					} else {
+						/* low_byte invalid for 2-byte sequence */
+						valid = 0;
+					}
+				} else {
+					/* high_byte invalid for 2-byte sequence */
+					valid = 0;
+				}
+			}
+			break;
+
+		case cs_sjis:
+			if (high_byte) {
+				if ((high_byte >= 0x81 && high_byte <= 0x9f) ||
+					(high_byte >= 0xe0 && high_byte <= 0xef))
+				{
+					if ((low_byte >= 0x40 && low_byte <= 0x7e) ||
+						(low_byte >= 0x80 && low_byte <= 0xfc))
+					{
+						mbsequence[mbpos++] = high_byte;
+					} else {
+						/* low_byte invalid for 2-byte sequence */
+						valid = 0;
+					}
+				} else {
+					/* high_byte invalid for 2-byte sequence */
+					valid = 0;
+				}
+			}
+			break;
+
+		case cs_eucjp:
+			if (high_byte) {
+				if (high_byte >= 0xa1 && high_byte <= 0xfe) {
+					if (low_byte >= 0xa1 && low_byte <= 0xfe) {
+						/* JIS kanji */
+						mbsequence[mbpos++] = high_byte;
+					} else {
+						/* low_byte invalid for 2-byte sequence */
+						valid = 0;
+					}
+				} else if (high_byte == 0x8e) {
+					if (low_byte >= 0xa1 && low_byte <= 0xdf) {
+						/* JIS X 0201 kana */
+						mbsequence[mbpos++] = high_byte;
+					} else {
+						/* low_byte invalid for 2-byte sequence */
+						valid = 0;
+					}
+				} else {
+					/* high_byte invalid for 2-byte sequence */
+					valid = 0;
+				}
+			}
+			break;
+
+		default:
+			/* No multibyte form is produced for any other charset, so only
+			 * the low byte can be emitted; a value with a non-zero high byte
+			 * would be truncated and is reported as invalid instead. */
+			if (high_byte) {
+				valid = 0;
+			}
+			break;
+	}
+
+	mbsequence[mbpos++] = low_byte;
+	mbsequence[mbpos] = '\0';
+	*mbseqlen = mbpos;
+
+	return valid;
+}
+/* }}} */
+
+
+/* {{{ entity_charset determine_charset
 /* {{{ entity_charset determine_charset
  * returns the charset identifier based on current locale or a hint.
  * defaults to iso-8859-1 */
@@ -575,6 +747,241 @@
        }
 }
 /* }}} */
+
+
+/* {{{ php_str_unhtmlentities
+ * Decodes the HTML entities found in str and returns the result as a new
+ * NUL-terminated buffer obtained from emalloc()/erealloc().
+ *
+ *   str          - input string
+ *   resultlen    - in: length of str; when the pointer is NULL or the value
+ *                  is 0 the length is taken with strlen() instead.
+ *                  out (if non-NULL): length of the returned string.
+ *   hint_charset - charset hint handed to determine_charset()
+ *
+ * Recognised forms: &amp; &lt; &gt; &quot;, any name present in the
+ * entity_map tables for the selected charset, and the numeric forms &#N;
+ * (decimal) and &#xH; / &#XH; (hex).  Anything else, including numeric values
+ * that do not fit in 16 bits or that encode_char() rejects, is copied to the
+ * output unchanged.
+ */
+PHPAPI char* php_str_unhtmlentities(char *str, unsigned int *resultlen, char* hint_charset TSRMLS_DC)
+{
+	enum entity_charset charset = determine_charset(hint_charset);
+	char *buf;
+	int buflen, rlen, slen, i, nmaps = 0;
+	struct html_entity_map **mapcache = NULL;
+
+	if (resultlen && *resultlen) {
+		/* resultlen holds the "real" length of the string */
+		slen = *resultlen;
+	} else {
+		/* No length supplied: fall back to strlen(), which stops at the
+		 * first NUL byte */
+		slen = strlen(str);
+	}
+
+	/* Cache pointers to the entity maps of the selected charset so that the
+	 * whole entity_map table is not rescanned for every entity.  This may
+	 * cost a little when the input holds no entities at all. */
+	for (i = 0; entity_map[i].charset != cs_terminator; i++) {
+		if (entity_map[i].charset == charset) {
+			++nmaps;
+		}
+	}
+
+	if (nmaps > 0) {
+		mapcache = malloc(nmaps * sizeof(*mapcache));
+	}
+
+	/* If the allocation failed nmaps stays 0: table lookups are skipped and
+	 * only the built-in and numeric entities are decoded. */
+	nmaps = 0;
+	if (mapcache) {
+		for (i = 0; entity_map[i].charset != cs_terminator; i++) {
+			if (entity_map[i].charset == charset) {
+				mapcache[nmaps++] = (struct html_entity_map *) &entity_map[i];
+			}
+		}
+	}
+
+	/* Scan through the string and find entities to decode */
+	buflen = slen + 128;
+	buf = emalloc(buflen);	/* allocate buffer */
+	rlen = 0;				/* initial result is empty */
+
+	i = 0;
+	while (i < slen) {
+		int mbseqlen;
+		unsigned char mbsequence[16];	/* allow up to 15 bytes in a multibyte sequence */
+		unsigned short this_char;
+		int copy_char = 1;
+
+		this_char = get_next_char(charset, str, &i, mbsequence, &mbseqlen);
+
+		if (this_char == '&') {
+			/* Possible start of an entity; i now indexes the byte after '&'.
+			 * p is advanced to the end of whatever was examined. */
+			int p, len, found_entity = 0, valid;
+
+			p = i;
+			if (str[p] == '#') {
+				char *end;
+				int base = -1;
+
+				/* Numeric entity */
+				++p;
+
+				if ((str[p] == 'x') || (str[p] == 'X')) {
+					/* &#x or &#X - at least one hex digit must follow,
+					 * otherwise strtol() could accept blanks or a sign */
+					++p;
+					if (isxdigit((unsigned char) str[p])) {
+						base = 16;
+					}
+				} else if ((str[p] >= '0') && (str[p] <= '9')) {
+					/* &# followed by a digit - decimal value */
+					base = 10;
+				}
+
+				if (base >= 0) {
+					long value;
+
+					errno = 0;	/* strtol() only sets errno on failure */
+					value = strtol(&str[p], &end, base);
+					len = (int) (end - str) - i;
+
+					/* The value must be terminated by ';' and fit in the
+					 * 16-bit character type, otherwise it would be
+					 * truncated to an unrelated character. */
+					valid = 0;
+					if ((errno != ERANGE) && (*end == ';') &&
+						(value >= 0) && (value <= 0xffff))
+					{
+						this_char = (unsigned short) value;
+						valid = encode_char(charset, this_char, mbsequence, &mbseqlen);
+					}
+
+					if (valid) {
+						/* Skip the digits and the closing ';' */
+						found_entity = 1;
+						i += len + 1;
+					} else {
+						/* Invalid entity, pass the examined text through */
+						p = i + len;
+					}
+				}
+			} else {
+				char entity[20];
+				int j, k;
+
+				/* Collect the name up to ';', giving up on the first
+				 * non-alphanumeric byte or when the name gets too long */
+				len = 0;
+				valid = 1;
+				while (str[p] != ';') {
+					if (!isalnum((unsigned char) str[p]) || (len > (int) sizeof(entity) - 2)) {
+						valid = 0;
+						break;
+					}
+
+					entity[len++] = str[p++];
+				}
+
+				if (valid) {
+					entity[len] = 0;
+
+					/* Entities which have no table */
+					if (!strcmp("amp", entity)) {
+						this_char = '&';
+						found_entity = 1;
+					} else if (!strcmp("lt", entity)) {
+						this_char = '<';
+						found_entity = 1;
+					} else if (!strcmp("gt", entity)) {
+						this_char = '>';
+						found_entity = 1;
+					} else if (!strcmp("quot", entity)) {
+						this_char = '"';
+						found_entity = 1;
+					} else {
+						/* Look the name up in the cached tables.  strcmp()
+						 * is used on the assumption that entity names never
+						 * contain multibyte characters. */
+						for (j = 0; j < nmaps && !found_entity; ++j) {
+							for (k = 0; k < mapcache[j]->endchar - mapcache[j]->basechar + 1; ++k) {
+								if (mapcache[j]->table[k] && !strcmp(mapcache[j]->table[k], entity)) {
+									this_char = mapcache[j]->basechar + k;
+									found_entity = 1;
+									break;
+								}
+							}
+						}
+					}
+
+					if (found_entity) {
+						i = p + 1;
+						/* Return value ignored: the tables are assumed to
+						 * hold only valid values */
+						encode_char(charset, this_char, mbsequence, &mbseqlen);
+					}
+				}
+			}
+
+			if (!found_entity) {
+				/* Unknown or malformed entity: copy the '&' and everything
+				 * examined so far to the result unchanged */
+				len = p - i + 1;
+
+				if (rlen + len + 10 > buflen) {
+					while (rlen + len + 10 > buflen) {
+						buflen += 256;
+					}
+					buf = erealloc(buf, buflen);
+				}
+
+				/* str[i - 1] is the '&' itself */
+				memcpy(buf + rlen, &str[i - 1], len);
+				rlen += len;
+
+				i = p;
+				copy_char = 0;
+			}
+		}
+
+		if (copy_char) {
+			/* Copy the character into the result string */
+			if (rlen + mbseqlen + 10 > buflen) {
+				while (rlen + mbseqlen + 10 > buflen) {
+					buflen += 256;
+				}
+				buf = erealloc(buf, buflen);
+			}
+
+			/* NOTE(review): values up to 0xff are written as one raw byte
+			 * even for multibyte charsets; confirm this matches what
+			 * get_next_char() returns for such input. */
+			if (this_char <= 0xff) {
+				buf[rlen++] = (unsigned char) this_char;
+			} else {
+				memcpy(buf + rlen, mbsequence, mbseqlen);
+				rlen += mbseqlen;
+			}
+		}
+	}
+
+	/* Free the map cache */
+	free(mapcache);
+
+	/* Shrink the buffer to the size of the result and terminate it */
+	buf = erealloc(buf, rlen + 1);
+	buf[rlen] = 0;
+
+	if (resultlen) {
+		*resultlen = rlen;
+	}
+
+	return buf;
+} /* php_str_unhtmlentities() */
+/* }}} */
+
+
+/* {{{ proto string unhtmlentities(string str [, string charset])
+   Translates HTML entities in the given string into the appropriate characters */
+PHP_FUNCTION(unhtmlentities)
+{
+	/*
+	 * Counterpart of htmlentities(): the decoding itself is done by
+	 * php_str_unhtmlentities().  Entities have the form "&data;" where data
+	 * is either an entity name (e.g. &gt; &lt; &quot;) or a '#' followed by
+	 * a decimal value (e.g. &#34;) or by 'x'/'X' and a hex value
+	 * (e.g. &#x22;).
+	 */
+	char *str, *hint_charset = NULL;
+	int str_len, hint_charset_len;
+	unsigned int result_len;
+	char *result;
+
+	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s",
+				&str, &str_len, &hint_charset, &hint_charset_len) == FAILURE)
+	{
+		return;
+	}
+
+	/* php_str_unhtmlentities() takes the input length through an unsigned
+	 * int pointer and stores the result length back through it */
+	result_len = (unsigned int) str_len;
+	result = php_str_unhtmlentities(str, &result_len, hint_charset TSRMLS_CC);
+
+	/* Return the decoded buffer; the final 0 is the macro's "duplicate"
+	 * argument, unchanged from the original call */
+	RETURN_STRINGL(result, result_len, 0);
+} /* PHP_FUNCTION(unhtmlentities) */
+/* }}} */
+
 
 /*
  * Local variables:
Index: ext/standard//html.h
===================================================================
RCS file: /repository/php4/ext/standard/html.h,v
retrieving revision 1.12
diff -u -r1.12 html.h
--- ext/standard//html.h        28 Feb 2002 08:26:45 -0000      1.12
+++ ext/standard//html.h        1 Mar 2002 01:30:40 -0000
@@ -29,6 +29,7 @@
 
 PHP_FUNCTION(htmlspecialchars);
 PHP_FUNCTION(htmlentities);
+PHP_FUNCTION(unhtmlentities);
 PHP_FUNCTION(get_html_translation_table);
 
 PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, 
int all, int quote_style, char * hint_charset);

-- 
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php

Re: [PHP-DEV] Counterpart to htmlentities function: unhtmlentities

Reply via email to