[PHP-DEV] Counterpart to htmlentities function: unhtmlentities

Brad Fisher Tue, 26 Feb 2002 18:40:13 -0800


Just another function I have found useful. PHP has an htmlentities
function, but no unhtmlentities function to go in the other direction (at
least not that I am aware of). So, here you go. I don't think this one
would perform nearly as quickly if it were done using regexps in PHP.
This function also has the added benefit of supporting numeric entities
like &#32;, which I don't believe would be supported by a naive regexp
approach.


So if someone would like to include it, be my guest.

-Brad

--------------------- Start of code ------------------------------

/*
 * One row of the entity translation table: an entity name (without the
 * surrounding '&' and ';') paired with the single byte it decodes to.
 */
struct entity {
        const char      *str;   /* entity name, e.g. "amp"; points at a string literal */
        unsigned char   ch;     /* decoded byte; unsigned so that 160..255 are representable */
};

// Perhaps this could be modified to use the htmlentities translation
table...?
static struct entity il_EntTable[] =
{
        {"quot",34},    {"amp",38},     {"lt",60},      {"gt",62},
{"nbsp",160},
        {"iexcl",161},  {"cent",162},   {"pound",163},  {"curren",164},
{"yen",165},
        {"brvbar",166}, {"sect",167},   {"uml",168},    {"copy",169},
{"ordf",170},
        {"laquo",171},  {"not",172},    {"shy",173},    {"reg",174},
{"macr",175},
        {"deg",176},    {"plusmn",177}, {"sup2",178},   {"sup3",179},
{"acute",180},
        {"micro",181},  {"para",182},   {"middot",183}, {"cedil",184},
{"sup1",185},
        {"ordm",186},   {"raquo",187},  {"frac14",188}, {"frac12",189},
{"frac34",190},
        {"iquest",191}, {"Agrave",192}, {"Aacute",193}, {"Acirc",194},
{"Atilde",195},
        {"Auml",196},   {"Aring",197},  {"AElig",198},  {"Ccedil",199},
{"Egrave",200},
        {"Eacute",201}, {"Ecirc",202},  {"Euml",203},   {"Igrave",204},
{"Iacute",205},
        {"Icirc",206},  {"Iuml",207},   {"ETH",208},    {"Ntilde",209},
{"Ograve",210},
        {"Oacute",211}, {"Ocirc",212},  {"Otilde",213}, {"Ouml",214},
{"times",215},
        {"Oslash",216}, {"Ugrave",217}, {"Uacute",218}, {"Ucirc",219},
{"Uuml",220},
        {"Yacute",221}, {"THORN",222},  {"szlig",223},  {"agrave",224},
{"aacute",225},
        {"acirc",226},  {"atilde",227}, {"auml",228},   {"aring",229},
{"aelig",230},
        {"ccedil",231}, {"egrave",232}, {"eacute",233}, {"ecirc",234},
{"euml",235},
        {"igrave",236}, {"iacute",237}, {"icirc",238},  {"iuml",239},
{"eth",240},
        {"ntilde",241}, {"ograve",242}, {"oacute",243}, {"ocirc",244},
{"otilde",245},
        {"ouml",246},   {"divide",247}, {"oslash",248}, {"ugrave",249},
{"uacute",250},
        {"ucirc",251},  {"uuml",252},   {"yacute",253}, {"thorn",254},
{"yuml",255}
};


/* BF 6/11/01 ([EMAIL PROTECTED]) */

/*
 * Maximum distance, counted from the '&', that is searched for the
 * terminating ';' of an entity.
 */
enum { UNHTML_MAX_ENTITY_SPAN = 15 };

/*
 * Decodes a single entity body, i.e. the text strictly between '&' and ';'.
 *
 *   name     = start of the entity body (not NUL-terminated)
 *   len      = number of bytes in the body
 *   decoded  = receives the decoded byte on success
 *
 * Two forms are recognized:
 *   - "#" followed by one to three decimal digits whose value is at most
 *     UCHAR_MAX (anything else after '#', e.g. "#abc" or "#1x", is rejected
 *     instead of being silently turned into a byte);
 *   - a name that exactly matches an entry of il_EntTable (case-sensitive).
 *
 * Returns 1 when the body was decoded, 0 when it is not a known entity.
 */
static int php_unhtml_decode_entity(const char *name, size_t len, char *decoded)
{
        size_t i;

        if (len == 0) {
                return 0;
        }

        if (name[0] == '#') {
                unsigned int value = 0;

                /* '#' plus 1..3 digits */
                if (len < 2 || len > 4) {
                        return 0;
                }
                for (i = 1; i < len; ++i) {
                        if (name[i] < '0' || name[i] > '9') {
                                return 0;
                        }
                        value = value * 10 + (unsigned int) (name[i] - '0');
                }
                if (value > UCHAR_MAX) {
                        return 0;
                }
                *decoded = (char) value;
                return 1;
        }

        for (i = 0; i < sizeof(il_EntTable) / sizeof(il_EntTable[0]); ++i) {
                const char *candidate = il_EntTable[i].str;

                /* Compare by length so the input never has to be NUL-patched. */
                if (strlen(candidate) == len && memcmp(candidate, name, len) == 0) {
                        *decoded = (char) il_EntTable[i].ch;
                        return 1;
                }
        }

        return 0;
}

/* {{{ php_str_unhtmlentities

  Translates HTML entities in the given string into the appropriate
  characters.  This function is the reverse of the standard PHP function
  htmlentities, however it DOES NOT currently use the same translation
  table (it uses il_EntTable).  HTML entities have the form "&data;" where
  data is either the name of an entity (e.g. &gt;, &lt;, &quot;) or a #
  symbol followed by a decimal value from 0 to 255 (e.g. &#34;, &#38;).
  Anything that is not a recognized entity is copied through unchanged.

  str       = the string to decode; it is only read, never modified.
              NULL is treated as an empty string.
  resultlen = optional in/out length.  On entry, a non-zero *resultlen is
              taken as the number of bytes of str to process; if resultlen
              is NULL or *resultlen is 0, strlen(str) is used.  On return
              (when non-NULL) it receives the length of the result.

  Returns a NUL-terminated buffer obtained from emalloc()/erealloc().  The
  buffer is not released here, so ownership passes to the caller.

*/
PHPAPI char* php_str_unhtmlentities(char *str, unsigned int *resultlen)
{
        const char      *p, *end, *amp, *semi;
        char            *buf, ch;
        size_t          slen, len,
                        buflen = 0;

        if (str == NULL) {
                str = "";
                slen = 0;
        } else if (resultlen != NULL && *resultlen != 0) {
                slen = *resultlen;
        } else {
                slen = strlen(str);
        }

        /*
         * Every output byte is produced from at least one input byte, so the
         * result can never be longer than the input (+1 for the terminator).
         */
        buf = emalloc(slen + 1);

        p = str;
        end = str + slen;
        while (p < end && (amp = memchr(p, '&', (size_t) (end - p))) != NULL) {
                /*
                 * Look for the closing ';'.  A second '&' met on the way
                 * restarts the candidate entity (and the window) there.
                 */
                semi = amp + 1;
                while (semi < end && *semi != ';' &&
                       (semi - amp) < UNHTML_MAX_ENTITY_SPAN) {
                        if (*semi == '&') {
                                amp = semi;
                        }
                        ++semi;
                }
                if (semi == end) {
                        break;          /* End of string: the rest is literal */
                }

                if (*semi != ';') {
                        /*
                         * Window exhausted without a ';': everything before
                         * semi is plain text.  Resume AT semi (not after it)
                         * so that an '&' sitting there can still start an
                         * entity.
                         */
                        len = (size_t) (semi - p);
                        memcpy(buf + buflen, p, len);
                        buflen += len;
                        p = semi;
                        continue;
                }

                /* Copy the literal text preceding the entity */
                len = (size_t) (amp - p);
                memcpy(buf + buflen, p, len);
                buflen += len;

                /* Translate the entity, or copy "&...;" as-is if unknown */
                len = (size_t) (semi - amp) - 1;
                if (php_unhtml_decode_entity(amp + 1, len, &ch)) {
                        buf[buflen++] = ch;
                } else {
                        memcpy(buf + buflen, amp, len + 2);
                        buflen += len + 2;
                }

                /* Start checking for the next match */
                p = semi + 1;
        }

        /* Copy any remaining portion of the string */
        len = (size_t) (end - p);
        memcpy(buf + buflen, p, len);
        buflen += len;

        /* Shrink the buffer to match the size of the result */
        buf = erealloc(buf, buflen + 1);
        buf[buflen] = 0;

        if (resultlen) {
                *resultlen = (unsigned int) buflen;
        }

        return buf;
} /* php_str_unhtmlentities() */
/* }}} */


/* BF 6/11/01 ([EMAIL PROTECTED]) */
/* {{{ proto string unhtmlentities(string str)

  Userland entry point: decodes the HTML entities found in str and returns
  the resulting string.  It is the counterpart of htmlentities, but it DOES
  NOT currently share htmlentities' translation table.  Entities have the
  form "&data;", where data is an entity name (e.g. &gt;, &lt;, &quot;) or
  a # followed by a decimal value from 0 to 255 (e.g. &#34;, &#38;).

  str       = the string to decode

  All of the decoding is delegated to php_str_unhtmlentities().

*/
PHP_FUNCTION(unhtmlentities)
{
        zval    **arg;          /* the single "str" argument */
        char    *decoded;
        uint    decoded_len;

        /* Exactly one argument is accepted */
        if (ZEND_NUM_ARGS() != 1 ||
            zend_get_parameters_ex(1, &arg) == FAILURE)
        {
                ZEND_WRONG_PARAM_COUNT();
        }

        /* Coerce the argument to a string before reading its value/length */
        convert_to_string_ex(arg);

        /* decoded_len carries the input length in and the output length out */
        decoded_len = Z_STRLEN_PP(arg);
        decoded = php_str_unhtmlentities(Z_STRVAL_PP(arg), &decoded_len);

        /*
         * Hand the freshly allocated buffer back as the return value; the
         * trailing 0 is presumably the "do not duplicate" flag, so no copy
         * is made (confirm against the Zend API docs).
         */
        RETURN_STRINGL(decoded, decoded_len, 0);
} /* PHP_FUNCTION(unhtmlentities) */
/* }}} */



-- 
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php

[PHP-DEV] Counterpart to htmlentities function: unhtmlentities

Reply via email to