From:             [EMAIL PROTECTED]
Operating system: Linux 
PHP version:      4.0 Latest CVS (21/02/2001)
PHP Bug Type:     *Languages/Translation
Bug description:  htmlspecialchars & htmlentities do not handle double-byte 
character sets

htmlspecialchars & htmlentities often replace the second byte of a Chinese character with 
an 'htmlized' &xxx; entity - this is annoying :) and makes it very difficult to write 
PHP programs that generate dynamic Chinese content.

Anyway, this patch goes part of the way to solving it. Note that I have not tested it, so 
testers are needed. I'm on the dev list, so I should be able to follow any comments.

It does produce compile-time errors on the character range checks (I'm guessing that gcc 
makes the assumption that a char should be < 128?).

I have added a check using setlocale(LC_ALL, NULL). This may not be the correct test, 
and it may not return the correct info if the locale is set from within PHP - again, untested. 
Does querying LC_ALL return something useful? Should it use environment variables instead?

Anyway - that's a long enough bug report...

regards

alan

Index: html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.22
diff -u -r1.22 html.c
--- html.c      2000/11/24 16:17:58     1.22
+++ html.c      2001/02/22 03:43:13
@@ -22,7 +22,7 @@
 #include "php.h"
 #include "reg.h"
 #include "html.h"
-
+#include <locale.h>
 /* This must be fixed to handle the input string according to LC_CTYPE.
    Defaults to ISO-8859-1 for now. */
        
@@ -52,8 +52,17 @@
 PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, 
int all, int quote_style)
 {
        int i, maxlen, len;
-       char *new;
-
+ 
+       char *new, *oldnext, *oldprev;
+#if HAVE_SETLOCALE
+       int checklang=0,ischinese;
+       /* should this check the enviroment value? */
+       char  *locale = setlocale(LC_ALL, NULL);
+        if ((!strcmp("zh_TW.Big5", locale)) || 
+           (!strcmp("zh_TW", locale)) ||
+           (!strcmp("zh_CN", locale)) ||
+           (!strcmp("zh_CN.GB2313", locale))) checklang=1;
+#endif
        maxlen = 2 * oldlen;
        if (maxlen < 128)
                maxlen = 128;
@@ -62,34 +71,72 @@
 
        i = oldlen;
        while (i--) {
-               if (len + 9 > maxlen)
+               if (len + 9 > maxlen)
                        new = erealloc (new, maxlen += 128);
-               if (38 == *old) {
-                       memcpy (new + len, "&amp;", 5);
-                       len += 5;
-               } else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
-                       memcpy (new + len, "&quot;", 6);
-                       len += 6;
-               } else if (39 == *old && (quote_style&ENT_QUOTES)) {
-                       memcpy (new + len, "&#039;", 6);
-                       len += 6;
-               } else if (60 == *old) {
-                       memcpy (new + len, "&lt;", 4);
-                       len += 4;
-               } else if (62 == *old) {
-                       memcpy (new + len, "&gt;", 4);
-                       len += 4;
-               } else if (all && 160 <= *old) {
-                       new [len++] = '&';
-                       strcpy (new + len, EntTable [*old - 160]);
-                       len += strlen (EntTable [*old - 160]);
-                       new [len++] = ';';
+#if HAVE_SETLOCALE
+           
+        
+               ischinese = 0; 
+               if (checklang) {
+                      if (i > 1) { 
+                        oldnext = old+1; 
+                        if ((*old >= 0xa1) &&
+                            (*old <= 0xf9) &&
+                            (((*oldnext >= 0x40) &&
+                              (*oldnext <= 0x73)) ||
+                             ((*oldnext >= 0xa1) &&
+                              (*oldnext <= 0xfe)))  
+                           ) ischinese = 1;
+                      }
+                      /* check if this is the seconde character in a chinese pair */
+                      if ((i != oldlen) && (!ischinese)) {
+                        oldprev = old-1;
+                        if ((*oldprev >= 0xa1) &&  
+                            (*oldprev <= 0xf9) &&
+                            (((*old >= 0x40) &&
+                              (*old <= 0x73)) ||
+                             ((*old >= 0xa1) &&
+                              (*old <= 0xfe)))
+                           ) ischinese = 1;
+                      }
+                }
+               
+                if (ischinese) { 
+                       /* it is chinese - ignore it */
+                       new [len++] = *old;
                } else {
-                       new [len++] = *old;
-               }
-               old++;
+#endif
+               
+                       if (38 == *old) {
+                               memcpy (new + len, "&amp;", 5);
+                               len += 5;
+                       } else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
+                               memcpy (new + len, "&quot;", 6);
+                               len += 6;
+                       } else if (39 == *old && (quote_style&ENT_QUOTES)) {
+                               memcpy (new + len, "&#039;", 6);
+                               len += 6;
+                       } else if (60 == *old) {
+                               memcpy (new + len, "&lt;", 4);
+                               len += 4;
+                       } else if (62 == *old) {
+                               memcpy (new + len, "&gt;", 4);
+                               len += 4;
+                       } else if (all && 160 <= *old) {
+                               new [len++] = '&';
+                               strcpy (new + len, EntTable [*old - 160]);
+                               len += strlen (EntTable [*old - 160]);
+                               new [len++] = ';';
+                       } else {
+                               new [len++] = *old;
+                       }
+#if HAVE_SETLOCALE
+               
+                }
+#endif
+                old++;
        }
-    new [len] = '\0';
+        new [len] = '\0';
        *newlen = len;
 
        return new;




-- 
Edit Bug report at: http://bugs.php.net/?id=9392&edit=1



-- 
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]
To contact the list administrators, e-mail: [EMAIL PROTECTED]

Reply via email to