From: [EMAIL PROTECTED]
Operating system: Linux
PHP version: 4.0 Latest CVS (21/02/2001)
PHP Bug Type: *Languages/Translation
Bug description: htmlspecialchars & htmlentities do not handle double-byte
character sets
htmlspecialchars() and htmlentities() often replace the second byte of a Chinese
character with an 'htmlized' &xxx; entity. This is annoying :) and makes it very
difficult to write PHP that generates dynamic Chinese content.
Anyway, this patch goes part of the way to solving it. Note: I have not tested it, so
testers are needed. I'm on the dev list, so I should be able to follow any comments.
It does produce compile-time errors on the character range (I'm guessing that gcc
makes the assumption that a char should be < 128?)
I have added a check using setlocale(LC_ALL, NULL). This may not be the correct test,
and it may not return the correct info if the locale is set from PHP - again, untested.
Does querying LC_ALL return something? Should it use environment variables instead?
Anyway - that's a long enough bug report...
regards
alan
Index: html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.22
diff -u -r1.22 html.c
--- html.c 2000/11/24 16:17:58 1.22
+++ html.c 2001/02/22 03:43:13
@@ -22,7 +22,7 @@
#include "php.h"
#include "reg.h"
#include "html.h"
-
+#include <locale.h>
/* This must be fixed to handle the input string according to LC_CTYPE.
Defaults to ISO-8859-1 for now. */
@@ -52,8 +52,17 @@
PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen,
int all, int quote_style)
{
int i, maxlen, len;
- char *new;
-
+
+ char *new, *oldnext, *oldprev;
+#if HAVE_SETLOCALE
+ int checklang=0,ischinese;
+ /* should this check the enviroment value? */
+ char *locale = setlocale(LC_ALL, NULL);
+ if ((!strcmp("zh_TW.Big5", locale)) ||
+ (!strcmp("zh_TW", locale)) ||
+ (!strcmp("zh_CN", locale)) ||
+ (!strcmp("zh_CN.GB2313", locale))) checklang=1;
+#endif
maxlen = 2 * oldlen;
if (maxlen < 128)
maxlen = 128;
@@ -62,34 +71,72 @@
i = oldlen;
while (i--) {
- if (len + 9 > maxlen)
+ if (len + 9 > maxlen)
new = erealloc (new, maxlen += 128);
- if (38 == *old) {
- memcpy (new + len, "&", 5);
- len += 5;
- } else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
- memcpy (new + len, """, 6);
- len += 6;
- } else if (39 == *old && (quote_style&ENT_QUOTES)) {
- memcpy (new + len, "'", 6);
- len += 6;
- } else if (60 == *old) {
- memcpy (new + len, "<", 4);
- len += 4;
- } else if (62 == *old) {
- memcpy (new + len, ">", 4);
- len += 4;
- } else if (all && 160 <= *old) {
- new [len++] = '&';
- strcpy (new + len, EntTable [*old - 160]);
- len += strlen (EntTable [*old - 160]);
- new [len++] = ';';
+#if HAVE_SETLOCALE
+
+
+ ischinese = 0;
+ if (checklang) {
+ if (i > 1) {
+ oldnext = old+1;
+ if ((*old >= 0xa1) &&
+ (*old <= 0xf9) &&
+ (((*oldnext >= 0x40) &&
+ (*oldnext <= 0x73)) ||
+ ((*oldnext >= 0xa1) &&
+ (*oldnext <= 0xfe)))
+ ) ischinese = 1;
+ }
+ /* check if this is the seconde character in a chinese pair */
+ if ((i != oldlen) && (!ischinese)) {
+ oldprev = old-1;
+ if ((*oldprev >= 0xa1) &&
+ (*oldprev <= 0xf9) &&
+ (((*old >= 0x40) &&
+ (*old <= 0x73)) ||
+ ((*old >= 0xa1) &&
+ (*old <= 0xfe)))
+ ) ischinese = 1;
+ }
+ }
+
+ if (ischinese) {
+ /* it is chinese - ignore it */
+ new [len++] = *old;
} else {
- new [len++] = *old;
- }
- old++;
+#endif
+
+ if (38 == *old) {
+ memcpy (new + len, "&", 5);
+ len += 5;
+ } else if (34 == *old && !(quote_style&ENT_NOQUOTES)) {
+ memcpy (new + len, """, 6);
+ len += 6;
+ } else if (39 == *old && (quote_style&ENT_QUOTES)) {
+ memcpy (new + len, "'", 6);
+ len += 6;
+ } else if (60 == *old) {
+ memcpy (new + len, "<", 4);
+ len += 4;
+ } else if (62 == *old) {
+ memcpy (new + len, ">", 4);
+ len += 4;
+ } else if (all && 160 <= *old) {
+ new [len++] = '&';
+ strcpy (new + len, EntTable [*old - 160]);
+ len += strlen (EntTable [*old - 160]);
+ new [len++] = ';';
+ } else {
+ new [len++] = *old;
+ }
+#if HAVE_SETLOCALE
+
+ }
+#endif
+ old++;
}
- new [len] = '\0';
+ new [len] = '\0';
*newlen = len;
return new;
--
Edit Bug report at: http://bugs.php.net/?id=9392&edit=1
--
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]
To contact the list administrators, e-mail: [EMAIL PROTECTED]