Just a newer version of the unhtmlentities function. This one uses
the same tables as htmlentities and is (hopefully) a little more locale
friendly. It also supports the &#...; (decimal), &#x...; (hex) and &#X...;
(hex) numeric entity formats.
-Brad
Index: ext/standard//basic_functions.c
===================================================================
RCS file: /repository/php4/ext/standard/basic_functions.c,v
retrieving revision 1.446
diff -u -r1.446 basic_functions.c
--- ext/standard//basic_functions.c 28 Feb 2002 16:00:26 -0000 1.446
+++ ext/standard//basic_functions.c 1 Mar 2002 01:30:40 -0000
@@ -274,6 +274,7 @@
PHP_FE(wordwrap,
NULL)
PHP_FE(htmlspecialchars,
NULL)
PHP_FE(htmlentities,
NULL)
+ PHP_FE(unhtmlentities,
+ NULL)
PHP_FE(get_html_translation_table,
NULL)
PHP_NAMED_FE(md5,php_if_md5,
NULL)
PHP_NAMED_FE(md5_file,php_if_md5_file,
NULL)
Index: ext/standard//html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.40
diff -u -r1.40 html.c
--- ext/standard//html.c 28 Feb 2002 08:26:45 -0000 1.40
+++ ext/standard//html.c 1 Mar 2002 01:30:40 -0000
@@ -123,6 +123,7 @@
{ NULL }
};
+
/* {{{ get_next_char
*/
inline static unsigned short get_next_char(enum entity_charset charset,
@@ -319,6 +320,177 @@
}
/* }}} */
+
+/* {{{ encode_char
+ */
+/* Encodes the 16-bit character code this_char as a byte sequence in charset.
+ *
+ * charset     target character set
+ * this_char   character code to encode; for cs_utf_8 it is treated as a
+ *             Unicode code point, for the multibyte East Asian charsets it is
+ *             treated as (lead byte << 8) | trail byte
+ * mbsequence  output buffer; receives at most 3 bytes plus a terminating
+ *             '\0', so it must have room for at least 4 bytes
+ * mbseqlen    receives the number of bytes written, not counting the '\0'
+ *
+ * Returns 1 when this_char can be represented in charset and 0 otherwise.
+ * When 0 is returned the contents of mbsequence must not be used.
+ *
+ * Limitation: this_char (like the return value of get_next_char) is only
+ * 16 bits wide, so 3-byte EUC-JP sequences (JIS X 0212 hojo-kanji) and
+ * Unicode code points above 0xffff cannot be expressed. A 32-bit container,
+ * as used by the mbstring extension, would be needed to support them.
+ */
+inline static int encode_char(enum entity_charset charset,
+		unsigned short this_char,
+		unsigned char * mbsequence,
+		int * mbseqlen)
+{
+	int mbpos = 0, valid = 1;
+	unsigned char high_byte = (unsigned char) (this_char >> 8),
+			low_byte = (unsigned char) (this_char & 0xff);
+
+	/* Each case stores the lead byte(s), if any, and leaves the final byte
+	 * of the sequence in low_byte; it is appended after the switch. */
+	switch (charset) {
+		case cs_utf_8:
+			if (this_char < 0x80) {
+				/* ASCII is encoded as itself in a single byte */
+			} else if (this_char < 0x800) {
+				mbsequence[mbpos++] = ((this_char >> 6) & 0x1f) | 0xc0;
+				low_byte = (this_char & 0x3f) | 0x80;
+			} else if (this_char >= 0xd800 && this_char <= 0xdfff) {
+				/* UTF-16 surrogates are not characters and must not be
+				 * encoded in UTF-8 */
+				valid = 0;
+			} else {
+				mbsequence[mbpos++] = ((this_char >> 12) & 0x0f) | 0xe0;
+				mbsequence[mbpos++] = ((this_char >> 6) & 0x3f) | 0x80;
+				low_byte = (this_char & 0x3f) | 0x80;
+			}
+			break;
+
+		case cs_big5:
+		case cs_gb2312:
+		case cs_big5hkscs:
+			if (high_byte) {
+				/* NOTE(review): the 0x73 upper bound for the trail byte is
+				 * kept from the original; Big5 trail bytes normally run to
+				 * 0x7e - confirm against get_next_char. */
+				if (high_byte >= 0xa1 && high_byte <= 0xf9
+					&& ((low_byte >= 0x40 && low_byte <= 0x73)
+						|| (low_byte >= 0xa1 && low_byte <= 0xfe))) {
+					mbsequence[mbpos++] = high_byte;
+				} else {
+					/* lead or trail byte invalid for a 2-byte sequence */
+					valid = 0;
+				}
+			}
+			break;
+
+		case cs_sjis:
+			if (high_byte) {
+				if (((high_byte >= 0x81 && high_byte <= 0x9f)
+						|| (high_byte >= 0xe0 && high_byte <= 0xef))
+					&& ((low_byte >= 0x40 && low_byte <= 0x7e)
+						|| (low_byte >= 0x80 && low_byte <= 0xfc))) {
+					mbsequence[mbpos++] = high_byte;
+				} else {
+					/* lead or trail byte invalid for a 2-byte sequence */
+					valid = 0;
+				}
+			}
+			break;
+
+		case cs_eucjp:
+			if (high_byte) {
+				if (high_byte >= 0xa1 && high_byte <= 0xfe) {
+					/* JIS X 0208 kanji */
+					if (low_byte >= 0xa1 && low_byte <= 0xfe) {
+						mbsequence[mbpos++] = high_byte;
+					} else {
+						valid = 0;
+					}
+				} else if (high_byte == 0x8e) {
+					/* JIS X 0201 kana */
+					if (low_byte >= 0xa1 && low_byte <= 0xdf) {
+						mbsequence[mbpos++] = high_byte;
+					} else {
+						valid = 0;
+					}
+				} else {
+					/* lead byte invalid for a 2-byte sequence */
+					valid = 0;
+				}
+			}
+			break;
+
+		default:
+			/* Assumes every charset not listed above is single-byte
+			 * (TODO confirm against enum entity_charset): a code that
+			 * does not fit in one byte cannot be represented. */
+			if (high_byte) {
+				valid = 0;
+			}
+			break;
+	} /*switch*/
+
+	mbsequence[mbpos++] = low_byte;
+	mbsequence[mbpos] = '\0';
+	*mbseqlen = mbpos;
+
+	return valid;
+}
+/* }}} */
+
+
+
/* {{{ entity_charset determine_charset
* returns the charset identifier based on current locale or a hint.
* defaults to iso-8859-1 */
@@ -575,6 +747,241 @@
}
}
/* }}} */
+
+
+/* {{{ php_str_unhtmlentities
+ */
+/* Appends len bytes from src to the growable result buffer *buf.
+ * *buflen is the allocated size and *rlen the number of bytes used; both are
+ * updated. The buffer is grown with erealloc when the data (plus one spare
+ * byte for a terminator) would not fit. */
+static void unhtmlentities_append(char **buf, int *buflen, int *rlen, const void *src, int len)
+{
+	if (len <= 0) {
+		return;
+	}
+
+	if (*rlen + len + 1 > *buflen) {
+		/* grow by what is needed plus some slack, never by less than len */
+		*buflen = *rlen + len + 256;
+		*buf = erealloc(*buf, *buflen);
+	}
+
+	memcpy(*buf + *rlen, src, len);
+	*rlen += len;
+}
+
+/* Translates HTML entities in str into the characters they stand for.
+ *
+ * str           input string
+ * resultlen     in/out, may be NULL. On entry a non-zero value is taken as
+ *               the length of str; if it is NULL or zero the length is
+ *               obtained with strlen(). On return it receives the length of
+ *               the result.
+ * hint_charset  passed to determine_charset() to select the character set
+ *
+ * Recognised forms are &amp; &lt; &gt; &quot;, every name found in the
+ * entity_map tables for the selected charset, and the numeric forms
+ * &#NNN; (decimal), &#xHHH; and &#XHHH; (hex). Numeric values are limited to
+ * 0..0xffff because characters are carried in an unsigned short. Anything
+ * that is not a complete, known, representable entity is copied through
+ * unchanged.
+ *
+ * Returns a NUL-terminated buffer allocated with emalloc/erealloc; ownership
+ * passes to the caller.
+ */
+PHPAPI char* php_str_unhtmlentities(char *str, unsigned int *resultlen, char* hint_charset TSRMLS_DC)
+{
+	enum entity_charset charset = determine_charset(hint_charset);
+	char *buf;
+	int buflen, rlen, slen, i;
+
+	if (resultlen && *resultlen) {
+		/* resultlen holds the "real" length of the string */
+		slen = *resultlen;
+	} else {
+		/* Fall back to strlen(); stops at the first NUL byte */
+		slen = (int) strlen(str);
+	} /*if*/
+
+	buflen = slen + 128;
+	buf = emalloc(buflen);	/* allocate buffer */
+	rlen = 0;		/* initial result is empty */
+
+	i = 0;
+	while (i < slen) {
+		int mbseqlen = 0;
+		unsigned char mbsequence[16];	/* allow up to 15 bytes in a multibyte sequence */
+		unsigned short this_char;
+		int start = i;		/* index of the first byte of this character */
+		int p, found_entity = 0;
+
+		/* This function relies on get_next_char advancing i past the
+		 * character it returns. */
+		this_char = get_next_char(charset, str, &i, mbsequence, &mbseqlen);
+
+		/* Defensive: always make progress and never run past the input */
+		if (i <= start) {
+			i = start + 1;
+		}
+		if (i > slen) {
+			i = slen;
+		}
+
+		if (this_char != '&') {
+			/* Ordinary character: copy its original bytes through verbatim
+			 * so multibyte characters are preserved exactly. */
+			unhtmlentities_append(&buf, &buflen, &rlen, str + start, i - start);
+			continue;
+		} /*if*/
+
+		/* Start of a possible entity; p scans the text after the '&' */
+		p = i;
+
+		if (p < slen && str[p] == '#') {
+			/* Numeric entity. Parsed by hand rather than with strtol(),
+			 * which would also accept leading whitespace, a sign and a
+			 * "0x" prefix, and would read beyond slen. */
+			long value = 0;
+			int base = 10, ndigits = 0;
+
+			++p;
+			if (p < slen && (str[p] == 'x' || str[p] == 'X')) {
+				base = 16;
+				++p;
+			} /*if*/
+
+			while (p < slen) {
+				int c = (unsigned char) str[p];
+				int digit;
+
+				if (c >= '0' && c <= '9') {
+					digit = c - '0';
+				} else if (base == 16 && c >= 'a' && c <= 'f') {
+					digit = c - 'a' + 10;
+				} else if (base == 16 && c >= 'A' && c <= 'F') {
+					digit = c - 'A' + 10;
+				} else {
+					break;
+				} /*if*/
+
+				/* Stop accumulating once out of range so value cannot
+				 * overflow; it stays above 0xffff and is rejected below. */
+				if (value <= 0xffff) {
+					value = value * base + digit;
+				}
+				++ndigits;
+				++p;
+			} /*while*/
+
+			if (ndigits > 0 && value <= 0xffff && p < slen && str[p] == ';') {
+				this_char = (unsigned short) value;
+				if (encode_char(charset, this_char, mbsequence, &mbseqlen)) {
+					found_entity = 1;
+				}
+			} /*if*/
+		} else {
+			/* Named entity */
+			char entity[20];
+			int len = 0, valid = 1;
+
+			/* Scan until ';', an invalid character or the end of input */
+			while (p < slen && str[p] != ';') {
+				if (!isalnum((unsigned char) str[p]) || len > (int) sizeof(entity) - 2) {
+					/* Invalid character found or entity name too long */
+					valid = 0;
+					break;
+				} /*if*/
+
+				entity[len++] = str[p++];
+			} /*while*/
+
+			if (p >= slen) {
+				/* Ran off the end without finding the terminating ';' */
+				valid = 0;
+			}
+
+			if (valid && len > 0) {
+				entity[len] = '\0';
+
+				/* Entities that have no table entry */
+				if (!strcmp("amp", entity)) {
+					this_char = '&';
+					found_entity = 1;
+				} else if (!strcmp("lt", entity)) {
+					this_char = '<';
+					found_entity = 1;
+				} else if (!strcmp("gt", entity)) {
+					this_char = '>';
+					found_entity = 1;
+				} else if (!strcmp("quot", entity)) {
+					this_char = '"';
+					found_entity = 1;
+				} else {
+					int j, k;
+
+					/* Search every table that belongs to this charset */
+					for (j = 0; !found_entity && entity_map[j].charset != cs_terminator; ++j) {
+						if (entity_map[j].charset != charset) {
+							continue;
+						}
+
+						for (k = 0; k < entity_map[j].endchar - entity_map[j].basechar + 1; ++k) {
+							/* Table slots may be NULL (no entity for that
+							 * character). */
+							if (entity_map[j].table[k] && !strcmp(entity_map[j].table[k], entity)) {
+								this_char = entity_map[j].basechar + k;
+								found_entity = 1;
+								break;
+							} /*if*/
+						} /*for*/
+					} /*for*/
+				} /*if*/
+
+				if (found_entity) {
+					/* Table values are assumed to be representable, so the
+					 * return value is not checked here. */
+					encode_char(charset, this_char, mbsequence, &mbseqlen);
+				} /*if*/
+			} /*if*/
+		} /*if*/
+
+		if (found_entity) {
+			/* str[p] is the terminating ';' - continue after it */
+			i = p + 1;
+
+			if (this_char < 0x80 || (this_char <= 0xff && charset != cs_utf_8)) {
+				/* Fits in one byte of the target charset */
+				unsigned char byte = (unsigned char) this_char;
+
+				unhtmlentities_append(&buf, &buflen, &rlen, &byte, 1);
+			} else {
+				unhtmlentities_append(&buf, &buflen, &rlen, mbsequence, mbseqlen);
+			} /*if*/
+		} else {
+			/* Not a known or valid entity: pass the text from the '&' up
+			 * to (but not including) str[p] through unchanged and resume
+			 * scanning at p. */
+			unhtmlentities_append(&buf, &buflen, &rlen, str + start, p - start);
+			i = p;
+		} /*if*/
+	} /*while*/
+
+	/* Shrink the buffer to the size of the result and terminate it */
+	buf = erealloc(buf, rlen + 1);
+	buf[rlen] = 0;
+
+	if (resultlen) {
+		*resultlen = rlen;
+	} /*if*/
+
+	return buf;
+} /* php_str_unhtmlentities() */
+/* }}} */
+
+
+/* {{{ proto string unhtmlentities(string str [, string charset])
+   Translates HTML entities in the given string into the appropriate characters. */
+PHP_FUNCTION(unhtmlentities)
+{
+	/*
+	 * Userland entry point: the reverse of htmlentities(). Parses a required
+	 * string and an optional charset hint, hands them to
+	 * php_str_unhtmlentities() and returns the decoded string.
+	 * Entities have the form "&data;" where data is either an entity name
+	 * (e.g. &gt;, &lt;, &quot;) or '#' followed by a decimal or 'x'-prefixed
+	 * hexadecimal character code (e.g. &#34;, &#x26;).
+	 */
+	char *str, *hint_charset = NULL;
+	int str_len, hint_charset_len;
+	unsigned int result_len;
+	char *result;
+
+	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s",
+			&str, &str_len,
+			&hint_charset, &hint_charset_len) == FAILURE)
+	{
+		return;
+	} /*if*/
+
+	/* php_str_unhtmlentities takes the length as unsigned int in/out, so use
+	 * a variable of exactly that type instead of aliasing the int. */
+	result_len = (unsigned int) str_len;
+	result = php_str_unhtmlentities(str, &result_len, hint_charset TSRMLS_CC);
+
+	/* Return the result; the final 0 hands the buffer over without copying */
+	RETURN_STRINGL(result, result_len, 0);
+} /* PHP_FUNCTION(unhtmlentities) */
+/* }}} */
+
/*
* Local variables:
Index: ext/standard//html.h
===================================================================
RCS file: /repository/php4/ext/standard/html.h,v
retrieving revision 1.12
diff -u -r1.12 html.h
--- ext/standard//html.h 28 Feb 2002 08:26:45 -0000 1.12
+++ ext/standard//html.h 1 Mar 2002 01:30:40 -0000
@@ -29,6 +29,7 @@
PHP_FUNCTION(htmlspecialchars);
PHP_FUNCTION(htmlentities);
+PHP_FUNCTION(unhtmlentities);
PHP_FUNCTION(get_html_translation_table);
PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen,
int all, int quote_style, char * hint_charset);
+/* Decodes HTML entities in str; see html.c. Exported, so it needs a prototype here. */
+PHPAPI char *php_str_unhtmlentities(char *str, unsigned int *resultlen, char *hint_charset TSRMLS_DC);
--
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php