[PHP-DEV] Bug #9392 Updated: htmlspecial chars & htmlentities do not handle double byte charactersets

alan_k Mon, 20 Aug 2001 05:11:26 -0700
ID: 9392
User updated by: [EMAIL PROTECTED]
Reported By: [EMAIL PROTECTED]
Old Status: Feedback
Status: Open
Bug Type: *Languages/Translation
Operating System: Linux
PHP Version: 4.0 Latest CVS (21/02/2001)
New Comment:

OK, patch updated and tested.
Points to note:
this line in php_escape_html_entities(
} else if (this_char > 0xff)    {
I don't think this will work in the current code, as this_char is unsigned short (0-255)
- or that is what my C book says :)

Changed to unsigned long and the code works as expected.

This appears to work both by setting the locale and by using the hint field (undocumented - I
guess until the 4.0.7 release), e.g.:
setlocale(LC_ALL,"zh_TW");
echo htmlentities("some chinese"); 

regards

alan


Index: html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.32
diff -u -r1.32 html.c
--- html.c      11 Aug 2001 17:03:37 -0000      1.32
+++ html.c      20 Aug 2001 12:32:06 -0000
@@ -35,7 +35,7 @@
    Defaults to ISO-8859-1 for now. */
 
 enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252,
-       cs_8859_15, cs_utf_8 };
+       cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, cs_big5hkscs };
 typedef const char * entity_table_t;
 
 /* codepage 1252 is a Windows extension to iso-8859-1. */
@@ -96,6 +96,9 @@
        { cs_8859_1,            0xa0, 0xff, ent_iso_8859_1 },
        { cs_8859_15,           0xa0, 0xff, ent_iso_8859_15 },
        { cs_utf_8,             0xa0, 0xff, ent_iso_8859_1 },
+       { cs_big5,              0xa0, 0xff, ent_iso_8859_1 },   
+       { cs_gb2312,            0xa0, 0xff, ent_iso_8859_1 },
+       { cs_big5hkscs,         0xa0, 0xff, ent_iso_8859_1 },   
        { cs_terminator }
 };
 
@@ -105,14 +108,17 @@
 } charset_map[] = {
        { "ISO-8859-1",         cs_8859_1 },
        { "ISO-8859-15",        cs_8859_15 },
-       { "utf-8",                      cs_utf_8 },
+       { "utf-8",              cs_utf_8 },
        { "cp1252",             cs_cp1252 },
+       { "zh_TW",              cs_big5 },
+       { "zh_CN",              cs_gb2312 },
+       { "zh_HK",              cs_big5hkscs },
        { NULL }
 };
 
 /* {{{ get_next_char
  */
-inline static unsigned short get_next_char(enum entity_charset charset,
+inline static unsigned long get_next_char(enum entity_charset charset,
                unsigned char * str,
                int * newpos,
                unsigned char * mbseq,
@@ -121,7 +127,7 @@
 {
        int pos = *newpos;
        int mbpos = 0;
-       unsigned short this_char = str[pos++];
+       unsigned long this_char = str[pos++];
        
        mbseq[mbpos++] = (unsigned char)this_char;
        
@@ -205,7 +211,49 @@
                                mbseq[mbpos++] = (unsigned char)this_char;
                        }
                } while(more);
-       }
+       } else if ((charset == cs_big5) || (charset == cs_gb2312) || (charset == 
+cs_big5hkscs)) {
+       
+               unsigned long utf = 0;
+               int stat = 0;
+               int more = 1;
+
+               /* unpack double byte encoding into a two chars. */
+
+               
+               do {
+                       if ((stat==0) && (this_char < 0xa1) || (this_char > 0xf9))     
+ {
+                               more = 0;
+                               break;
+                       }
+                       else if (stat==0) {
+                                utf = this_char << 16;
+                                stat=1;
+                       } 
+                       else if  (((this_char >= 0x40) &&
+                                   (this_char <= 0x73)) ||
+                                  ((this_char >= 0xa1) &&
+                                   (this_char <= 0xfe))) {
+                                 utf += this_char;
+                                 more = 0;
+                       }
+                       else    {
+                               /* invalid; bail */
+                               more = 0;
+                               utf=0;
+                               pos = *newpos;
+                               mbpos = 0;
+                               break;
+                       }
+                       if (more)
+                       {
+                               this_char = str[pos++];
+                               mbseq[mbpos++] = (unsigned char)this_char;
+                       }
+               } while(more);          
+               if (utf != 0) 
+                  this_char = utf;
+               
+       }
        *newpos = pos;
        mbseq[mbpos] = '\0';
        *mbseqlen = mbpos;
@@ -223,24 +271,27 @@
        int len;
 
        /* Guarantee default behaviour */
-       if (charset_hint == NULL)
-               return cs_8859_1;
+       /*if (charset_hint == NULL) {
+               return cs_8859_1;                
+       }
+       */      
 
-       if (strlen(charset_hint) == 0)  {
+       if ((charset_hint == NULL) || strlen(charset_hint) == 0)        {
+               char * localename;
                /* try to detect the charset for the locale */
 #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
-               charset_hint = nl_langinfo(CODESET);
+               charset_hint = nl_langinfo(CODESET);  
 #endif
 #if HAVE_LOCALE_H
-               if (charset_hint == NULL)
+                localename = setlocale(LC_CTYPE, NULL);
+               if (localename != NULL)
                {
                        /* try to figure out the charset from the locale */
-                       char * localename;
+                       
                        char * dot, * at;
 
                        /* lang[_territory][.codeset][@modifier] */
                        localename = setlocale(LC_CTYPE, NULL);
-
                        dot = strchr(localename, '.');
                        if (dot)        {
                                dot++;
@@ -286,6 +337,7 @@
        int i, maxlen, len;
        char *new;
        enum entity_charset charset = determine_charset(hint_charset);
+        
 
        maxlen = 2 * oldlen;
        if (maxlen < 128)
@@ -299,12 +351,12 @@
                unsigned char mbsequence[16];   /* allow up to 15 characters
                                                                                       
                 in a multibyte sequence
                                                                                       
                 it should be more than enough.. */
-               unsigned short this_char = get_next_char(charset, old, &i, mbsequence, 
&mbseqlen);
+               unsigned long this_char = get_next_char(charset, old, &i, mbsequence, 
+&mbseqlen);
                int matches_map = 0;
                
                if (len + 9 > maxlen)
                        new = erealloc (new, maxlen += 128);
-               
+                
                if (all)        {
                        /* look for a match in the maps for this charset */
                        int j;


Previous Comments:
------------------------------------------------------------------------

[2001-08-19 04:40:53] [EMAIL PROTECTED]

I should add that as it stands in CVS, htmlentities only knows about iso-8859-1, 
iso-8859-15 and utf-8.
--Wez.

------------------------------------------------------------------------

[2001-08-19 04:30:06] [EMAIL PROTECTED]

If you could rewrite your patch to fit the new architecture
for htmlentities, I'd be happy to apply it.
It should be an easier patch too.
(we might consider using the mbstring extension for this
stuff, in which case your chinese string patch might
be better off being put in there.)

--Wez.

------------------------------------------------------------------------

[2001-08-18 22:15:01] [EMAIL PROTECTED]

Could you please check the latest CVS snapshot from

http://snaps.php.net/

clip from the cvs log:

"date: 2001/05/28 11:00:06;  author: wez;  state: Exp;  
Added charset awareness to htmlentities() and 
htmlspecialchars(); use an optional third parameter to 
specify the charset; otherwise tries to determine
it from the LC_CTYPE locale setting."

Please test if this is what you wanted.

--Jani



------------------------------------------------------------------------

[2001-02-21 22:53:04] [EMAIL PROTECTED]

htmlspecialchars & htmlentities often replace the second byte of a Chinese character with
a 'htmlized' &xxx; entity - this is annoying :) and makes it very difficult to
program in PHP doing dynamic Chinese stuff.

Anyway, this patch goes part of the way to solving it. Note that I have not tested it, so
testers are needed. I'm on the dev list, so I should be able to follow any comments.

It does produce compile-time errors on the character range (I'm guessing that gcc
makes the assumption that char should be < 128?)

I have added a check using setlocale(LC_ALL, NULL) - this may not be the correct test
- and may not respond with the correct info if set in PHP - again untested. Does
LC_ALL return something? Should it use environment variables?

Anyway - that's a long enough bug report...

regards

alan

Index: html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.22
diff -u -r1.22 html.c
--- html.c      2000/11/24 16:17:58     1.22
+++ html.c      2001/02/22 03:43:13
@@ -22,7 +22,7 @@
 #include "php.h"
 #include "reg.h"
 #include "html.h"
-
+#include <locale.h>
 /* This must be fixed to handle the input string according to LC_CTYPE.
    Defaults to ISO-8859-1 for now. */
        
@@ -52,8 +52,17 @@
 PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, 
int all, int quote_style)
 {
        int i, maxlen, len;
-       char *new;
-
+ 
+       char *new, *oldnext, *oldprev;
+#if HAVE_SETLOCALE
+       int checklang=0,ischinese;
+       /* should this check the enviroment value? */
+       char  *locale = setlocale(LC_ALL, NULL);
+        if ((!strcmp("zh_TW.Big5", locale)) || 
+           (!strcmp("zh_TW", locale)) ||
+           (!strcmp("zh_CN", locale)) ||
+           (!strcmp("zh_CN.GB2313", locale))) checklang=1;
+#endif
        maxlen = 2 * oldlen;
        if (maxlen < 128)
                maxlen = 128;
@@ -62,34 +71,72 @@
 
        i = oldlen;
        while (i--) {
-               if (len + 9 > maxlen)
+               if (len + 9 > maxlen)
                        new = erealloc (new, maxlen += 128);
-               if (38 == *old) {
-                       memcpy (new + len, "&amp;", 5);
-                       len += 5;
-               } else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
-                       memcpy (new + len, "&quot;", 6);
-                       len += 6;
-               } else if (39 == *old && (quote_style&ENT_QUOTES)) {
-                       memcpy (new + len, "&#039;", 6);
-                       len += 6;
-               } else if (60 == *old) {
-                       memcpy (new + len, "&lt;", 4);
-                       len += 4;
-               } else if (62 == *old) {
-                       memcpy (new + len, "&gt;", 4);
-                       len += 4;
-               } else if (all && 160 <= *old) {
-                       new [len++] = '&';
-                       strcpy (new + len, EntTable [*old - 160]);
-                       len += strlen (EntTable [*old - 160]);
-                       new [len++] = ';';
+#if HAVE_SETLOCALE
+           
+        
+               ischinese = 0; 
+               if (checklang) {
+                      if (i > 1) { 
+                        oldnext = old+1; 
+                        if ((*old >= 0xa1) &&
+                            (*old <= 0xf9) &&
+                            (((*oldnext >= 0x40) &&
+                              (*oldnext <= 0x73)) ||
+                             ((*oldnext >= 0xa1) &&
+                              (*oldnext <= 0xfe)))  
+                           ) ischinese = 1;
+                      }
+                      /* check if this is the seconde character in a chinese pair */
+                      if ((i != oldlen) && (!ischinese)) {
+                        oldprev = old-1;
+                        if ((*oldprev >= 0xa1) &&  
+                            (*oldprev <= 0xf9) &&
+                            (((*old >= 0x40) &&
+                              (*old <= 0x73)) ||
+                             ((*old >= 0xa1) &&
+                              (*old <= 0xfe)))
+                           ) ischinese = 1;
+                      }
+                }
+               
+                if (ischinese) { 
+                       /* it is chinese - ignore it */
+                       new [len++] = *old;
                } else {
-                       new [len++] = *old;
-               }
-               old++;
+#endif
+               
+                       if (38 == *old) {
+                               memcpy (new + len, "&amp;", 5);
+                               len += 5;
+                       } else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
+                               memcpy (new + len, "&quot;", 6);
+                               len += 6;
+                       } else if (39 == *old && (quote_style&ENT_QUOTES)) {
+                               memcpy (new + len, "&#039;", 6);
+                               len += 6;
+                       } else if (60 == *old) {
+                               memcpy (new + len, "&lt;", 4);
+                               len += 4;
+                       } else if (62 == *old) {
+                               memcpy (new + len, "&gt;", 4);
+                               len += 4;
+                       } else if (all && 160 <= *old) {
+                               new [len++] = '&';
+                               strcpy (new + len, EntTable [*old - 160]);
+                               len += strlen (EntTable [*old - 160]);
+                               new [len++] = ';';
+                       } else {
+                               new [len++] = *old;
+                       }
+#if HAVE_SETLOCALE
+               
+                }
+#endif
+                old++;
        }
-    new [len] = '\0';
+        new [len] = '\0';
        *newlen = len;
 
        return new;



------------------------------------------------------------------------



Edit this bug report at http://bugs.php.net/?id=9392&edit=1


-- 
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]
To contact the list administrators, e-mail: [EMAIL PROTECTED]
[PHP-DEV] Bug #9392 Updated: htmlspecial chars & htmlentities do not handle double byte charactersets

Reply via email to