Here is a newer version of the unhtmlentities function. It uses the same tables as htmlentities and is (hopefully) a little more locale-friendly. It also supports the &#...; (decimal), &#x...; (hex) and &#X...; (hex) numeric entity formats.
-Brad
Index: ext/standard//basic_functions.c
===================================================================
RCS file: /repository/php4/ext/standard/basic_functions.c,v
retrieving revision 1.446
diff -u -r1.446 basic_functions.c
--- ext/standard//basic_functions.c	28 Feb 2002 16:00:26 -0000	1.446
+++ ext/standard//basic_functions.c	1 Mar 2002 01:30:40 -0000
@@ -274,6 +274,7 @@
 	PHP_FE(wordwrap,	NULL)
 	PHP_FE(htmlspecialchars,	NULL)
 	PHP_FE(htmlentities,	NULL)
+	PHP_FE(unhtmlentities,	NULL)
 	PHP_FE(get_html_translation_table,	NULL)
 	PHP_NAMED_FE(md5,php_if_md5,	NULL)
 	PHP_NAMED_FE(md5_file,php_if_md5_file,	NULL)
Index: ext/standard//html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.40
diff -u -r1.40 html.c
--- ext/standard//html.c	28 Feb 2002 08:26:45 -0000	1.40
+++ ext/standard//html.c	1 Mar 2002 01:30:40 -0000
@@ -123,6 +123,7 @@
 	{ NULL }
 };
 
+
 /* {{{ get_next_char
  */
 inline static unsigned short get_next_char(enum entity_charset charset,
@@ -319,6 +320,119 @@
 }
 /* }}} */
 
+
+/* {{{ encode_char
+ * Encodes the character value this_char as a byte sequence in charset.
+ *
+ * charset    - character set to encode for
+ * this_char  - character value to encode
+ * mbsequence - receives the encoded bytes followed by a NUL byte; at most
+ *              3 bytes plus the NUL are written
+ * mbseqlen   - receives the number of encoded bytes, not counting the NUL
+ *
+ * Returns 1 if this_char passed the checks for charset, 0 otherwise. When 0
+ * is returned the sequence is empty (*mbseqlen == 0).
+ *
+ * NOTE: get_next_char() and encode_char() both carry a character in an
+ * unsigned short. Anything that needs more than 16 bits (UTF-8 sequences of
+ * four or more bytes, the three-byte JIS X 0212 hojo-kanji sequences of
+ * EUC-JP) cannot be represented. After looking at the mbstring extension, a
+ * 32-bit int appears to be a better container.
+ */
+inline static int encode_char(enum entity_charset charset,
+		unsigned short this_char,
+		unsigned char *mbsequence,
+		int *mbseqlen)
+{
+	int mbpos = 0, valid = 1;
+	unsigned char high_byte = this_char >> 8;
+	unsigned char low_byte = this_char & 0xff;
+
+	switch (charset) {
+		case cs_utf_8:
+			if (this_char < 0x80) {
+				/* ASCII is encoded as a single byte */
+				mbsequence[mbpos++] = low_byte;
+			} else if (this_char < 0x800) {
+				mbsequence[mbpos++] = ((this_char >> 6) & 0x1f) | 0xc0;
+				mbsequence[mbpos++] = (this_char & 0x3f) | 0x80;
+			} else if (this_char >= 0xd800 && this_char <= 0xdfff) {
+				/* UTF-16 surrogate halves are not characters */
+				valid = 0;
+			} else {
+				mbsequence[mbpos++] = ((this_char >> 12) & 0x0f) | 0xe0;
+				mbsequence[mbpos++] = ((this_char >> 6) & 0x3f) | 0x80;
+				mbsequence[mbpos++] = (this_char & 0x3f) | 0x80;
+			}
+			break;
+
+		case cs_big5:
+		case cs_gb2312:
+		case cs_big5hkscs:
+			if (high_byte) {
+				/* lead byte 0xa1-0xf9, trail byte 0x40-0x7e or 0xa1-0xfe */
+				if (high_byte >= 0xa1 && high_byte <= 0xf9 &&
+					((low_byte >= 0x40 && low_byte <= 0x7e) ||
+					 (low_byte >= 0xa1 && low_byte <= 0xfe))) {
+					mbsequence[mbpos++] = high_byte;
+				} else {
+					valid = 0;
+				}
+			}
+			mbsequence[mbpos++] = low_byte;
+			break;
+
+		case cs_sjis:
+			if (high_byte) {
+				/* lead byte 0x81-0x9f or 0xe0-0xef, trail byte 0x40-0x7e or 0x80-0xfc */
+				if (((high_byte >= 0x81 && high_byte <= 0x9f) ||
+					 (high_byte >= 0xe0 && high_byte <= 0xef)) &&
+					((low_byte >= 0x40 && low_byte <= 0x7e) ||
+					 (low_byte >= 0x80 && low_byte <= 0xfc))) {
+					mbsequence[mbpos++] = high_byte;
+				} else {
+					valid = 0;
+				}
+			}
+			mbsequence[mbpos++] = low_byte;
+			break;
+
+		case cs_eucjp:
+			if (high_byte) {
+				if (high_byte >= 0xa1 && high_byte <= 0xfe) {
+					/* JIS kanji */
+					valid = (low_byte >= 0xa1 && low_byte <= 0xfe);
+				} else if (high_byte == 0x8e) {
+					/* JIS X 0201 kana */
+					valid = (low_byte >= 0xa1 && low_byte <= 0xdf);
+				} else {
+					valid = 0;
+				}
+				mbsequence[mbpos++] = high_byte;
+			}
+			mbsequence[mbpos++] = low_byte;
+			break;
+
+		default:
+			/* every other charset is treated as single-byte */
+			if (high_byte) {
+				valid = 0;
+			}
+			mbsequence[mbpos++] = low_byte;
+			break;
+	}
+
+	if (!valid) {
+		mbpos = 0;
+	}
+	mbsequence[mbpos] = '\0';
+	*mbseqlen = mbpos;
+
+	return valid;
+}
+/* }}} */
+
+
 /* {{{ entity_charset determine_charset
  * returns the charset identifier based on current locale or a hint.
  * defaults to iso-8859-1 */
@@ -575,6 +689,244 @@
 	}
 }
 /* }}} */
+
+
+/* {{{ unhtml_reserve
+ * Makes sure extra more bytes fit into buf behind the rlen bytes already in
+ * use. Returns the (possibly moved) buffer and updates *buflen.
+ */
+static char *unhtml_reserve(char *buf, int *buflen, int rlen, int extra)
+{
+	if (rlen + extra > *buflen) {
+		*buflen = rlen + extra + 256;
+		buf = erealloc(buf, *buflen);
+	}
+
+	return buf;
+}
+/* }}} */
+
+
+/* {{{ php_str_unhtmlentities
+ * Translates the HTML entities in str into the characters they stand for.
+ *
+ * str          - NUL terminated input string
+ * resultlen    - in: length of str in bytes (NULL or 0: strlen() is used);
+ *                out: length of the returned string, if not NULL
+ * hint_charset - passed to determine_charset() to select the charset
+ *
+ * Handles named entities (&amp; &lt; &gt; &quot; and the names in the entity
+ * maps of the charset) and the numeric forms &#<decimal>; &#x<hex>; &#X<hex>;
+ * up to 0xffff. Unknown or malformed entities are copied through unchanged.
+ *
+ * Returns a NUL terminated emalloc()ed string; the caller has to efree() it.
+ */
+PHPAPI char *php_str_unhtmlentities(char *str, unsigned int *resultlen, char *hint_charset TSRMLS_DC)
+{
+	enum entity_charset charset = determine_charset(hint_charset);
+	struct html_entity_map **mapcache;
+	char *buf;
+	int buflen, rlen = 0, slen, i, nmaps = 0;
+
+	if (resultlen && *resultlen) {
+		/* resultlen has the "real" length of the string */
+		slen = *resultlen;
+	} else {
+		/* Guess the length - stops at the first NUL byte */
+		slen = strlen(str);
+	}
+
+	/* Cache the entity maps of this charset, so that a lookup does not have
+	 * to walk all of entity_map again for every entity found. */
+	for (i = 0; entity_map[i].charset != cs_terminator; i++) {
+		if (entity_map[i].charset == charset) {
+			++nmaps;
+		}
+	}
+
+	/* emalloc() instead of malloc(): no NULL check needed, and the block is
+	 * not leaked if a later emalloc()/erealloc() bails out. Ask for at least
+	 * one slot so that a charset without maps does not allocate 0 bytes. */
+	mapcache = emalloc((nmaps ? nmaps : 1) * sizeof(*mapcache));
+	nmaps = 0;
+
+	for (i = 0; entity_map[i].charset != cs_terminator; i++) {
+		if (entity_map[i].charset == charset) {
+			mapcache[nmaps++] = (struct html_entity_map *) &entity_map[i];
+		}
+	}
+
+	buflen = slen + 128;
+	buf = emalloc(buflen);
+
+	i = 0;
+	while (i < slen) {
+		unsigned char mbsequence[16]; /* up to 15 bytes in a multibyte sequence */
+		int mbseqlen, p, found_entity = 0;
+		unsigned short this_char;
+
+		this_char = get_next_char(charset, str, &i, mbsequence, &mbseqlen);
+
+		if (this_char != '&') {
+			/* Not an entity, copy the character into the result string */
+			buf = unhtml_reserve(buf, &buflen, rlen, mbseqlen + 1);
+
+			if (this_char <= 0xff) {
+				buf[rlen++] = (unsigned char) this_char;
+			} else {
+				memcpy(buf + rlen, mbsequence, mbseqlen);
+				rlen += mbseqlen;
+			}
+
+			continue;
+		}
+
+		/* Start of an entity; i is the offset of the byte after the '&' */
+		p = i;
+
+		if (p < slen && str[p] == '#') {
+			/* Numeric entity: &#<decimal>; &#x<hex>; or &#X<hex>; */
+			long code = 0;
+			int base = 10, ndigits = 0;
+
+			++p;
+			if (p < slen && (str[p] == 'x' || str[p] == 'X')) {
+				base = 16;
+				++p;
+			}
+
+			while (p < slen) {
+				int c = (unsigned char) str[p], digit;
+
+				if (c >= '0' && c <= '9') {
+					digit = c - '0';
+				} else if (base == 16 && c >= 'a' && c <= 'f') {
+					digit = c - 'a' + 10;
+				} else if (base == 16 && c >= 'A' && c <= 'F') {
+					digit = c - 'A' + 10;
+				} else {
+					break;
+				}
+
+				/* Stop accumulating once the value is out of range, so that
+				 * a very long digit string cannot overflow code */
+				if (code <= 0xffff) {
+					code = code * base + digit;
+				}
+
+				++ndigits;
+				++p;
+			}
+
+			/* Needs at least one digit, the closing ';' and a value that
+			 * fits into the unsigned short encode_char() takes */
+			if (ndigits > 0 && p < slen && str[p] == ';' && code <= 0xffff) {
+				found_entity = encode_char(charset, (unsigned short) code, mbsequence, &mbseqlen);
+			}
+		} else {
+			/* Named entity */
+			char entity[20];
+			int len = 0, j, k;
+
+			/* Scan until end of entity, invalid char or entity too long */
+			while (p < slen && isalnum((unsigned char) str[p]) && len < (int) sizeof(entity) - 1) {
+				entity[len++] = str[p++];
+			}
+			entity[len] = '\0';
+
+			if (p < slen && str[p] == ';') {
+				/* Check the entities which have no table */
+				if (!strcmp("amp", entity)) {
+					this_char = '&';
+					found_entity = 1;
+				} else if (!strcmp("lt", entity)) {
+					this_char = '<';
+					found_entity = 1;
+				} else if (!strcmp("gt", entity)) {
+					this_char = '>';
+					found_entity = 1;
+				} else if (!strcmp("quot", entity)) {
+					this_char = '"';
+					found_entity = 1;
+				} else {
+					/* Find match in tables (if it exists). The scan above
+					 * only accepted isalnum() bytes, so strcmp() will do. */
+					for (j = 0; j < nmaps && !found_entity; ++j) {
+						int nchars = mapcache[j]->endchar - mapcache[j]->basechar + 1;
+
+						for (k = 0; k < nchars; ++k) {
+							if (mapcache[j]->table[k] && !strcmp(mapcache[j]->table[k], entity)) {
+								this_char = mapcache[j]->basechar + k;
+								found_entity = 1;
+								break;
+							}
+						}
+					}
+				}
+
+				if (found_entity) {
+					found_entity = encode_char(charset, this_char, mbsequence, &mbseqlen);
+				}
+			}
+		}
+
+		if (found_entity) {
+			/* Emit the decoded character and continue after the ';' */
+			buf = unhtml_reserve(buf, &buflen, rlen, mbseqlen + 1);
+			memcpy(buf + rlen, mbsequence, mbseqlen);
+			rlen += mbseqlen;
+			i = p + 1;
+		} else {
+			/* Not known or invalid: pass the text from the '&' up to, but
+			 * not including, str[p] through and rescan from str[p] */
+			int len = p - i + 1;
+
+			buf = unhtml_reserve(buf, &buflen, rlen, len + 1);
+			memcpy(buf + rlen, &str[i - 1], len);
+			rlen += len;
+			i = p;
+		}
+	}
+
+	efree(mapcache);
+
+	/* Shrink the buffer to the size of the result */
+	buf = erealloc(buf, rlen + 1);
+	buf[rlen] = '\0';
+
+	if (resultlen) {
+		*resultlen = rlen;
+	}
+
+	return buf;
+}
+/* }}} */
+
+
+/* {{{ proto string unhtmlentities(string str [, string charset])
+   Translates HTML entities in the given string into the appropriate characters */
+PHP_FUNCTION(unhtmlentities)
+{
+	/* The reverse of htmlentities(). HTML entities have the form "&data;"
+	 * where data is either the name of an entity (e.g. &gt; &lt; &quot;) or
+	 * a # followed by a decimal or x-prefixed hex value (e.g. &#34; &#x26;). */
+	char *str, *hint_charset = NULL;
+	int str_len, hint_charset_len;
+	unsigned int result_len;
+	char *result;
+
+	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s",
+			&str, &str_len, &hint_charset, &hint_charset_len) == FAILURE) {
+		return;
+	}
+
+	result_len = str_len;
+	result = php_str_unhtmlentities(str, &result_len, hint_charset TSRMLS_CC);
+
+	RETURN_STRINGL(result, result_len, 0);
+}
+/* }}} */
+
 
 /*
  * Local variables:
Index: ext/standard//html.h
===================================================================
RCS file: /repository/php4/ext/standard/html.h,v
retrieving revision 1.12
diff -u -r1.12 html.h
--- ext/standard//html.h	28 Feb 2002 08:26:45 -0000	1.12
+++ ext/standard//html.h	1 Mar 2002 01:30:40 -0000
@@ -29,6 +29,8 @@
 
 PHP_FUNCTION(htmlspecialchars);
 PHP_FUNCTION(htmlentities);
+PHP_FUNCTION(unhtmlentities);
 PHP_FUNCTION(get_html_translation_table);
 
+PHPAPI char *php_str_unhtmlentities(char *str, unsigned int *resultlen, char *hint_charset TSRMLS_DC);
 PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char * hint_charset);
-- PHP Development Mailing List <http://www.php.net/> To unsubscribe, visit: http://www.php.net/unsub.php