Here are the diffs for the unhtmlentities function.  Also included is a test
script for some rudimentary benchmarking.

-Brad

Attachment: unhtmlentities_test.php
Description: application/unknown-content-type-microsoft

--- html.c      Tue Feb 26 22:44:44 2002
+++ html.c      Wed Feb 27 00:42:04 2002
@@ -123,6 +123,39 @@
        { NULL }
 };
 
+
+/* Entities table used by unhtmlentities.
+ *
+ * Maps an entity name (the text between '&' and ';') to the single byte it
+ * decodes to.  The table holds the four markup-significant entities followed
+ * by the named entities for byte values 160 through 255.
+ *
+ * TODO: this should be changed to use the same tables as htmlentities.
+ */
+struct entity {
+        const char      *str;   /* entity name, without '&' and ';' */
+        unsigned char   ch;     /* decoded byte; unsigned because values run up to 255 */
+};
+
+static const struct entity il_EntTable[] =
+{
+        {"quot",34},    {"amp",38},     {"lt",60},      {"gt",62},      {"nbsp",160},
+        {"iexcl",161},  {"cent",162},   {"pound",163},  {"curren",164}, {"yen",165},
+        {"brvbar",166}, {"sect",167},   {"uml",168},    {"copy",169},   {"ordf",170},
+        {"laquo",171},  {"not",172},    {"shy",173},    {"reg",174},    {"macr",175},
+        {"deg",176},    {"plusmn",177}, {"sup2",178},   {"sup3",179},   {"acute",180},
+        {"micro",181},  {"para",182},   {"middot",183}, {"cedil",184},  {"sup1",185},
+        {"ordm",186},   {"raquo",187},  {"frac14",188}, {"frac12",189}, {"frac34",190},
+        {"iquest",191}, {"Agrave",192}, {"Aacute",193}, {"Acirc",194},  {"Atilde",195},
+        {"Auml",196},   {"Aring",197},  {"AElig",198},  {"Ccedil",199}, {"Egrave",200},
+        {"Eacute",201}, {"Ecirc",202},  {"Euml",203},   {"Igrave",204}, {"Iacute",205},
+        {"Icirc",206},  {"Iuml",207},   {"ETH",208},    {"Ntilde",209}, {"Ograve",210},
+        {"Oacute",211}, {"Ocirc",212},  {"Otilde",213}, {"Ouml",214},   {"times",215},
+        {"Oslash",216}, {"Ugrave",217}, {"Uacute",218}, {"Ucirc",219},  {"Uuml",220},
+        {"Yacute",221}, {"THORN",222},  {"szlig",223},  {"agrave",224}, {"aacute",225},
+        {"acirc",226},  {"atilde",227}, {"auml",228},   {"aring",229},  {"aelig",230},
+        {"ccedil",231}, {"egrave",232}, {"eacute",233}, {"ecirc",234},  {"euml",235},
+        {"igrave",236}, {"iacute",237}, {"icirc",238},  {"iuml",239},   {"eth",240},
+        {"ntilde",241}, {"ograve",242}, {"oacute",243}, {"ocirc",244},  {"otilde",245},
+        {"ouml",246},   {"divide",247}, {"oslash",248}, {"ugrave",249}, {"uacute",250},
+        {"ucirc",251},  {"uuml",252},   {"yacute",253}, {"thorn",254},  {"yuml",255}
+};
+
+
+
 /* {{{ get_next_char
  */
 inline static unsigned short get_next_char(enum entity_charset charset,
@@ -575,6 +608,136 @@
        }
 }
 /* }}} */
+
+
+/* Longest distance, counted from the '&', that is searched for the closing ';' */
+#define IL_ENT_MAXSCAN 15
+
+/* {{{ il_decode_entity
+ * Decodes one entity body into a single byte.
+ *
+ * name/len describe the text between '&' and ';' (len must be > 0; the text
+ * does not need to be NUL-terminated).  Two forms are recognized:
+ *   - '#' followed by one to three decimal digits whose value is <= UCHAR_MAX
+ *   - a name listed in il_EntTable (case-sensitive, exact length match)
+ *
+ * Returns 1 and stores the byte in *out on success, 0 otherwise.
+ */
+static int il_decode_entity(const char *name, size_t len, char *out)
+{
+        size_t  i;
+
+        if (name[0] == '#') {
+                unsigned long   value = 0;
+
+                if (len < 2 || len > 4) {
+                        return 0;
+                }
+                /* Parse by hand: strtoul would also accept signs, leading
+                   whitespace and trailing garbage such as "12a". */
+                for (i = 1; i < len; ++i) {
+                        if (name[i] < '0' || name[i] > '9') {
+                                return 0;
+                        }
+                        value = value * 10 + (unsigned long) (name[i] - '0');
+                }
+                if (value > (unsigned long) UCHAR_MAX) {
+                        return 0;
+                }
+                *out = (char) value;
+                return 1;
+        }
+
+        for (i = 0; i < sizeof(il_EntTable) / sizeof(il_EntTable[0]); ++i) {
+                const char *candidate = il_EntTable[i].str;
+
+                if (strlen(candidate) == len && memcmp(candidate, name, len) == 0) {
+                        *out = (char) il_EntTable[i].ch;
+                        return 1;
+                }
+        }
+
+        return 0;
+}
+/* }}} */
+
+
+/* {{{ php_str_unhtmlentities
+ * Translates HTML entities in the given string into the appropriate characters.
+ *
+ * str        Input string.  It is only read, never modified.
+ * resultlen  Optional in/out length.  On entry a non-zero *resultlen is taken
+ *            as the length of str; if the pointer is NULL or the value is 0
+ *            the length is obtained with strlen().  On return, when the
+ *            pointer is non-NULL, it receives the length of the result.
+ *
+ * Returns a buffer obtained from emalloc()/erealloc() holding the decoded,
+ * NUL-terminated string; the caller is responsible for releasing it.
+ * Sequences that are not recognized as entities ("&;", unknown names,
+ * out-of-range numbers, '&' without a ';' within IL_ENT_MAXSCAN characters)
+ * are copied to the result unchanged.
+ */
+PHPAPI char* php_str_unhtmlentities(char *str, unsigned int *resultlen TSRMLS_DC)
+{
+        const char      *p, *end, *amp, *scan;
+        char            *buf,
+                        decoded;
+        size_t          slen,
+                        len,
+                        buflen = 0;
+
+        if (resultlen && *resultlen != 0) {
+                slen = *resultlen;
+        } else {
+                slen = strlen(str);
+        }
+        end = str + slen;
+
+        /* Decoding never lengthens the string: every entity is at least three
+           bytes long and is replaced by one byte.  +1 leaves room for the
+           terminator and avoids a zero-sized allocation for empty input. */
+        buf = emalloc(slen + 1);
+
+        p = str;
+        while (p < end && (amp = memchr(p, '&', (size_t) (end - p))) != NULL) {
+                /* Look for the ';' that closes the entity.  A further '&'
+                   met on the way restarts the candidate (and the window). */
+                scan = amp + 1;
+                while (scan < end && *scan != ';' && (scan - amp) < IL_ENT_MAXSCAN) {
+                        if (*scan == '&') {
+                                amp = scan;
+                        }
+                        ++scan;
+                }
+
+                if (scan == end) {
+                        /* Ran off the end of the string: nothing left to decode */
+                        break;
+                }
+
+                if (*scan != ';') {
+                        /* Window exhausted without a ';'.  Emit what was
+                           scanned and resume AT the stop position, so that an
+                           '&' sitting there is still considered. */
+                        len = (size_t) (scan - p);
+                        memcpy(&buf[buflen], p, len);
+                        buflen += len;
+                        p = scan;
+                        continue;
+                }
+
+                /* Copy the literal text that precedes the '&' */
+                len = (size_t) (amp - p);
+                memcpy(&buf[buflen], p, len);
+                buflen += len;
+
+                /* Translate the entity, or copy "&...;" as-is if unknown */
+                len = (size_t) (scan - amp) - 1;
+                if (len > 0 && il_decode_entity(amp + 1, len, &decoded)) {
+                        buf[buflen] = decoded;
+                        ++buflen;
+                } else {
+                        len += 2;       /* include the '&' and the ';' */
+                        memcpy(&buf[buflen], amp, len);
+                        buflen += len;
+                }
+
+                /* Start checking for the next match */
+                p = scan + 1;
+        }
+
+        /* Copy any remaining portion of the string */
+        len = (size_t) (end - p);
+        memcpy(&buf[buflen], p, len);
+        buflen += len;
+
+        /* Reallocate the buffer to match the size of the result */
+        buf = erealloc(buf, buflen + 1);
+        buf[buflen] = 0;
+
+        if (resultlen) {
+                *resultlen = (unsigned int) buflen;
+        }
+
+        return buf;
+}
+/* }}} */
+
+
+/* {{{ proto string unhtmlentities(string str)
+   Translates HTML entities in the given string into the appropriate characters.
+
+   This function is basically the reverse of the standard PHP function
+   htmlentities, however it DOES NOT currently use the same translation table
+   (or the locale).  HTML entities have the form "&data;" where data is either
+   the name of an entity (ie. &gt;, &lt;, &quot;) or a # symbol followed by a
+   decimal value from 0 to 255 (ie. &#34;, &#38;) */
+PHP_FUNCTION(unhtmlentities)
+{
+        char            *str;
+        int             str_len = 0;    /* filled in by the "s" specifier */
+        unsigned int    result_len;
+        char            *result;
+
+        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &str, &str_len) == FAILURE) {
+                return;
+        }
+
+        /* The length is passed in as the input size and comes back as the
+           size of the decoded string. */
+        result_len = (unsigned int) str_len;
+        result = php_str_unhtmlentities(str, &result_len TSRMLS_CC);
+
+        /* Hand the buffer over without duplicating it (last argument 0) */
+        RETURN_STRINGL(result, result_len, 0);
+}
+/* }}} */
+
 
 /*
  * Local variables:
--- html.h      Tue Feb 26 22:44:44 2002
+++ html.h      Wed Feb 27 00:14:54 2002
@@ -29,6 +29,7 @@
 
 PHP_FUNCTION(htmlspecialchars);
 PHP_FUNCTION(htmlentities);
+PHP_FUNCTION(unhtmlentities);
 PHP_FUNCTION(get_html_translation_table);
 
 PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, 
int all, int quote_style, char * hint_charset);
--- basic_functions.c   Tue Feb 26 22:44:44 2002
+++ basic_functions.c   Wed Feb 27 00:22:04 2002
@@ -274,6 +274,7 @@
        PHP_FE(wordwrap,                                                               
                                                 NULL)
        PHP_FE(htmlspecialchars,                                                       
                                         NULL)
        PHP_FE(htmlentities,                                                           
                                         NULL)
+       PHP_FE(unhtmlentities,                                                         
+                                         NULL)
        PHP_FE(get_html_translation_table,                                             
                                 NULL)
        PHP_NAMED_FE(md5,php_if_md5,                                                   
                                 NULL)
        PHP_NAMED_FE(md5_file,php_if_md5_file,                                         
                         NULL)

-- 
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to