ID: 9392 User updated by: [EMAIL PROTECTED] Reported By: [EMAIL PROTECTED] Old Status: Feedback Status: Open Bug Type: *Languages/Translation Operating System: Linux PHP Version: 4.0 Latest CVS (21/02/2001) New Comment: OK, patch updated and tested.. points to note: this line in php_escape_html_entities( } else if (this_char > 0xff) { I don't think this will work in the current code as this_char is unsigned short (0-255) - or that is what my C book says :) changed to unsigned long and the code works as expected. this appears to work both by setting the locale and by using the hint field (undocumented - I guess until 4.0.7 release).. setlocale(LC_ALL,"zh_TW"); echo htmlentities("some chinese"); regards alan Index: html.c =================================================================== RCS file: /repository/php4/ext/standard/html.c,v retrieving revision 1.32 diff -u -r1.32 html.c --- html.c 11 Aug 2001 17:03:37 -0000 1.32 +++ html.c 20 Aug 2001 12:32:06 -0000 @@ -35,7 +35,7 @@ Defaults to ISO-8859-1 for now. */ enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, - cs_8859_15, cs_utf_8 }; + cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, cs_big5hkscs }; typedef const char * entity_table_t; /* codepage 1252 is a Windows extension to iso-8859-1. 
*/ @@ -96,6 +96,9 @@ { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, { cs_terminator } }; @@ -105,14 +108,17 @@ } charset_map[] = { { "ISO-8859-1", cs_8859_1 }, { "ISO-8859-15", cs_8859_15 }, - { "utf-8", cs_utf_8 }, + { "utf-8", cs_utf_8 }, { "cp1252", cs_cp1252 }, + { "zh_TW", cs_big5 }, + { "zh_CN", cs_gb2312 }, + { "zh_HK", cs_big5hkscs }, { NULL } }; /* {{{ get_next_char */ -inline static unsigned short get_next_char(enum entity_charset charset, +inline static unsigned long get_next_char(enum entity_charset charset, unsigned char * str, int * newpos, unsigned char * mbseq, @@ -121,7 +127,7 @@ { int pos = *newpos; int mbpos = 0; - unsigned short this_char = str[pos++]; + unsigned long this_char = str[pos++]; mbseq[mbpos++] = (unsigned char)this_char; @@ -205,7 +211,49 @@ mbseq[mbpos++] = (unsigned char)this_char; } } while(more); - } + } else if ((charset == cs_big5) || (charset == cs_gb2312) || (charset == +cs_big5hkscs)) { + + unsigned long utf = 0; + int stat = 0; + int more = 1; + + /* unpack double byte encoding into a two chars. 
*/ + + + do { + if ((stat==0) && (this_char < 0xa1) || (this_char > 0xf9)) + { + more = 0; + break; + } + else if (stat==0) { + utf = this_char << 16; + stat=1; + } + else if (((this_char >= 0x40) && + (this_char <= 0x73)) || + ((this_char >= 0xa1) && + (this_char <= 0xfe))) { + utf += this_char; + more = 0; + } + else { + /* invalid; bail */ + more = 0; + utf=0; + pos = *newpos; + mbpos = 0; + break; + } + if (more) + { + this_char = str[pos++]; + mbseq[mbpos++] = (unsigned char)this_char; + } + } while(more); + if (utf != 0) + this_char = utf; + + } *newpos = pos; mbseq[mbpos] = '\0'; *mbseqlen = mbpos; @@ -223,24 +271,27 @@ int len; /* Guarantee default behaviour */ - if (charset_hint == NULL) - return cs_8859_1; + /*if (charset_hint == NULL) { + return cs_8859_1; + } + */ - if (strlen(charset_hint) == 0) { + if ((charset_hint == NULL) || strlen(charset_hint) == 0) { + char * localename; /* try to detect the charset for the locale */ #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET) - charset_hint = nl_langinfo(CODESET); + charset_hint = nl_langinfo(CODESET); #endif #if HAVE_LOCALE_H - if (charset_hint == NULL) + localename = setlocale(LC_CTYPE, NULL); + if (localename != NULL) { /* try to figure out the charset from the locale */ - char * localename; + char * dot, * at; /* lang[_territory][.codeset][@modifier] */ localename = setlocale(LC_CTYPE, NULL); - dot = strchr(localename, '.'); if (dot) { dot++; @@ -286,6 +337,7 @@ int i, maxlen, len; char *new; enum entity_charset charset = determine_charset(hint_charset); + maxlen = 2 * oldlen; if (maxlen < 128) @@ -299,12 +351,12 @@ unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence it should be more than enough.. 
*/ - unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen); + unsigned long this_char = get_next_char(charset, old, &i, mbsequence, +&mbseqlen); int matches_map = 0; if (len + 9 > maxlen) new = erealloc (new, maxlen += 128); - + if (all) { /* look for a match in the maps for this charset */ int j; Previous Comments: ------------------------------------------------------------------------ [2001-08-19 04:40:53] [EMAIL PROTECTED] I should add that as it stands in CVS, htmlentities only knows about iso-8859-1, iso-8859-15 and utf-8. --Wez. ------------------------------------------------------------------------ [2001-08-19 04:30:06] [EMAIL PROTECTED] If you could rewrite your patch to fit the new architecture for htmlentities, I'd be happy to apply it. It should be an easier patch too. (we might consider using the mbstring extension for this stuff, in which case your chinese string patch might be better off being put in there.) --Wez. ------------------------------------------------------------------------ [2001-08-18 22:15:01] [EMAIL PROTECTED] Could you please check the latest CVS snapshot from http://snaps.php.net/ clip from the cvs log: "date: 2001/05/28 11:00:06; author: wez; state: Exp; Added charset awareness to htmlentities() and htmlspecialchars(); use an optional third parameter to specify the charset; otherwise tries to determine it from the LC_CTYPE locale setting." Please test if this is what you wanted. --Jani ------------------------------------------------------------------------ [2001-02-21 22:53:04] [EMAIL PROTECTED] htmlspecialchars & entities often replace the second byte of a chinese character with a 'htmlized' &xxx; character - this is annoying :) and makes it very difficult to program in PHP doing dynamic chinese stuff. anyway this patch goes part of the way to solving it, - Note I have not tested it, so testers needed.. - I'm on the dev list, so I should be able to follow any comments.. 
It does produce compile time errors on the character range (I'm guessing that gcc makes the assumption that char should be < 128 ?) I have added a check using setlocale("LC_ALL",NULL) - This may not be the correct test - and may not respond with the correct info if set in PHP - again untested. - does LC_ALL return something? should it use env. variables... Anyway - that's a long enough bug report... regards alan Index: html.c =================================================================== RCS file: /repository/php4/ext/standard/html.c,v retrieving revision 1.22 diff -u -r1.22 html.c --- html.c 2000/11/24 16:17:58 1.22 +++ html.c 2001/02/22 03:43:13 @@ -22,7 +22,7 @@ #include "php.h" #include "reg.h" #include "html.h" - +#include <locale.h> /* This must be fixed to handle the input string according to LC_CTYPE. Defaults to ISO-8859-1 for now. */ @@ -52,8 +52,17 @@ PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style) { int i, maxlen, len; - char *new; - + + char *new, *oldnext, *oldprev; +#if HAVE_SETLOCALE + int checklang=0,ischinese; + /* should this check the enviroment value? 
*/ + char *locale = setlocale(LC_ALL, NULL); + if ((!strcmp("zh_TW.Big5", locale)) || + (!strcmp("zh_TW", locale)) || + (!strcmp("zh_CN", locale)) || + (!strcmp("zh_CN.GB2313", locale))) checklang=1; +#endif maxlen = 2 * oldlen; if (maxlen < 128) maxlen = 128; @@ -62,34 +71,72 @@ i = oldlen; while (i--) { - if (len + 9 > maxlen) + if (len + 9 > maxlen) new = erealloc (new, maxlen += 128); - if (38 == *old) { - memcpy (new + len, "&amp;", 5); - len += 5; - } else if (34 == *old && !(quote_style&ENT_NOQUOTES)) { - memcpy (new + len, "&quot;", 6); - len += 6; - } else if (39 == *old && (quote_style&ENT_QUOTES)) { - memcpy (new + len, "&#039;", 6); - len += 6; - } else if (60 == *old) { - memcpy (new + len, "&lt;", 4); - len += 4; - } else if (62 == *old) { - memcpy (new + len, "&gt;", 4); - len += 4; - } else if (all && 160 <= *old) { - new [len++] = '&'; - strcpy (new + len, EntTable [*old - 160]); - len += strlen (EntTable [*old - 160]); - new [len++] = ';'; +#if HAVE_SETLOCALE + + + ischinese = 0; + if (checklang) { + if (i > 1) { + oldnext = old+1; + if ((*old >= 0xa1) && + (*old <= 0xf9) && + (((*oldnext >= 0x40) && + (*oldnext <= 0x73)) || + ((*oldnext >= 0xa1) && + (*oldnext <= 0xfe))) + ) ischinese = 1; + } + /* check if this is the seconde character in a chinese pair */ + if ((i != oldlen) && (!ischinese)) { + oldprev = old-1; + if ((*oldprev >= 0xa1) && + (*oldprev <= 0xf9) && + (((*old >= 0x40) && + (*old <= 0x73)) || + ((*old >= 0xa1) && + (*old <= 0xfe))) + ) ischinese = 1; + } + } + + if (ischinese) { + /* it is chinese - ignore it */ + new [len++] = *old; } else { - new [len++] = *old; - } - old++; +#endif + + if (38 == *old) { + memcpy (new + len, "&amp;", 5); + len += 5; + } else if (34 == *old && !(quote_style&ENT_NOQUOTES)) { + memcpy (new + len, "&quot;", 6); + len += 6; + } else if (39 == *old && (quote_style&ENT_QUOTES)) { + memcpy (new + len, "&#039;", 6); + len += 6; + } else if (60 == *old) { + memcpy (new + len, "&lt;", 4); + len += 4; + } else if (62 == *old) { + memcpy (new + 
len, "&gt;", 4); + len += 4; + } else if (all && 160 <= *old) { + new [len++] = '&'; + strcpy (new + len, EntTable [*old - 160]); + len += strlen (EntTable [*old - 160]); + new [len++] = ';'; + } else { + new [len++] = *old; + } +#if HAVE_SETLOCALE + + } +#endif + old++; } - new [len] = '\0'; + new [len] = '\0'; *newlen = len; return new; ------------------------------------------------------------------------ Edit this bug report at http://bugs.php.net/?id=9392&edit=1 -- PHP Development Mailing List <http://www.php.net/> To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED] To contact the list administrators, e-mail: [EMAIL PROTECTED]