moriyoshi               Tue Feb 24 13:18:40 2009 UTC

  Added files:                 
    /php-src/ext/mbstring/tests illformed_utf_sequences.phpt 

  Modified files:              
    /php-src/ext/mbstring/libmbfl/filters       mbfilter_utf32.c 
                                                mbfilter_utf8.c 
    /php-src/ext/mbstring/libmbfl/mbfl  mbfl_consts.h 
  Log:
  - Strictly check UTF-8 and UTF-32 validity.
  
  
http://cvs.php.net/viewvc.cgi/php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c?r1=1.1&r2=1.2&diff_format=u
Index: php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c
diff -u php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c:1.1 
php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c:1.2
--- php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c:1.1   Sat Aug 23 
06:18:36 2003
+++ php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c       Tue Feb 24 
13:18:40 2009
@@ -171,7 +171,9 @@
                        CK((*filter->output_function)(0xfeff, filter->data));
                } else {
                        filter->status &= ~0xff;
-                       CK((*filter->output_function)(n, filter->data));
+                       if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 
0xdfff)) {
+                               CK((*filter->output_function)(n, filter->data));
+                       }
                }
                break;
        }
@@ -201,7 +203,9 @@
        } else {
                filter->status = 0;
                n = (c & 0xff) | filter->cache;
-               CK((*filter->output_function)(n, filter->data));
+               if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
+                       CK((*filter->output_function)(n, filter->data));
+               }
        }
        return c;
 }
@@ -211,7 +215,7 @@
  */
 int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter)
 {
-       if (c >= 0 && c < MBFL_WCSGROUP_UCS4MAX) {
+       if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) {
                CK((*filter->output_function)((c >> 24) & 0xff, filter->data));
                CK((*filter->output_function)((c >> 16) & 0xff, filter->data));
                CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
@@ -247,7 +251,9 @@
        } else {
                filter->status = 0;
                n = ((c & 0xff) << 24) | filter->cache;
-               CK((*filter->output_function)(n, filter->data));
+               if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) {
+                       CK((*filter->output_function)(n, filter->data));
+               }
        }
        return c;
 }
@@ -257,7 +263,7 @@
  */
 int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter)
 {
-       if (c >= 0 && c < MBFL_WCSGROUP_UCS4MAX) {
+       if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) {
                CK((*filter->output_function)(c & 0xff, filter->data));
                CK((*filter->output_function)((c >> 8) & 0xff, filter->data));
                CK((*filter->output_function)((c >> 16) & 0xff, filter->data));
http://cvs.php.net/viewvc.cgi/php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c?r1=1.1&r2=1.2&diff_format=u
Index: php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c
diff -u php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c:1.1 
php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c:1.2
--- php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c:1.1    Sat Aug 23 
06:18:36 2003
+++ php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c        Tue Feb 24 
13:18:40 2009
@@ -106,7 +106,8 @@
                }
                filter->status = 0;
        } else if (c < 0xc0) {
-               switch (filter->status & 0xff) {
+               int status = filter->status & 0xff;
+               switch (status) {
                case 0x10: /* 2byte code 2nd char */
                case 0x21: /* 3byte code 3rd char */
                case 0x32: /* 4byte code 4th char */
@@ -114,7 +115,11 @@
                case 0x54: /* 6byte code 6th char */
                        filter->status = 0;
                        s = filter->cache | (c & 0x3f);
-                       if (s >= 0x80) {
+                       if ((status == 0x10 && s >= 0x80) ||
+                           (status == 0x21 && s >= 0x800 && (s < 0xd800 || s > 
0xdfff)) ||
+                           (status == 0x32 && s >= 0x10000) ||
+                           (status == 0x43 && s >= 0x200000) ||
+                           (status == 0x54 && s >= 0x4000000 && s < 
MBFL_WCSGROUP_UCS4MAX)) {
                                CK((*filter->output_function)(s, filter->data));
                        }
                        break;
http://cvs.php.net/viewvc.cgi/php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h?r1=1.3&r2=1.4&diff_format=u
Index: php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h
diff -u php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h:1.3 
php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h:1.4
--- php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h:1.3 Tue Mar 22 22:22:10 2005
+++ php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h     Tue Feb 24 13:18:40 2009
@@ -47,6 +47,7 @@
 /* wchar plane, special charactor */
 #define MBFL_WCSPLANE_MASK                     0xffff
 #define MBFL_WCSPLANE_UCS2MAX          0x00010000
+#define MBFL_WCSPLANE_UTF32MAX         0x00110000
 #define MBFL_WCSPLANE_SUPMIN           0x00010000
 #define MBFL_WCSPLANE_SUPMAX           0x00200000
 #define MBFL_WCSPLANE_JIS0208          0x70e10000              /* JIS HEX : 
2121h - 7E7Eh */

http://cvs.php.net/viewvc.cgi/php-src/ext/mbstring/tests/illformed_utf_sequences.phpt?view=markup&rev=1.1
Index: php-src/ext/mbstring/tests/illformed_utf_sequences.phpt
+++ php-src/ext/mbstring/tests/illformed_utf_sequences.phpt
--TEST--
Unicode standard conformance test (ill-formed UTF sequences.)
--SKIPIF--
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
--FILE--
<?php
echo "UTF-8 redundancy\n";
var_dump(bin2hex(mb_convert_encoding(b"\x31\x32\x33", "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\x41\x42\x43", "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xc0\xb1\xc0\xb2\xc0\xb3", "UCS-4BE", 
"UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xc1\x81\xc1\x82\xc1\x83", "UCS-4BE", 
"UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 
"UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 
"UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3",
 "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83",
 "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3",
 "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83",
 "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3",
 "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83",
 "UCS-4BE", "UTF-8")));

var_dump(bin2hex(mb_convert_encoding(b"\xc2\xa2\xc2\xa3\xc2\xa5", "UCS-4BE", 
"UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 
"UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5",
 "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5",
 "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5",
 "UCS-4BE", "UTF-8")));

var_dump(bin2hex(mb_convert_encoding(b"\xc1\xbf", "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xc2\x80", "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xdf\xbf", "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xe0\x9f\xff", "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xe0\xa0\x80", "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xef\xbf\xbf", "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xf0\x8f\xbf\xbf", "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xf0\x90\x80\x80", "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xf7\xbf\xbf\xbf", "UCS-4BE", "UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xf8\x87\xbf\xbf\xbf", "UCS-4BE", 
"UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xf8\x88\x80\x80\x80", "UCS-4BE", 
"UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xfb\xbf\xbf\xbf\xbf", "UCS-4BE", 
"UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xfc\x83\xbf\xbf\xbf\xbf", "UCS-4BE", 
"UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xfc\x84\x80\x80\x80\x80", "UCS-4BE", 
"UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xfd\xaf\xbf\xbf\xbf\xbf", "UCS-4BE", 
"UTF-8")));
var_dump(bin2hex(mb_convert_encoding(b"\xfd\xbf\xbf\xbf\xbf\xbf", "UCS-4BE", 
"UTF-8")));

echo "UTF-8 and surrogates area\n";
$out = b'';
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    $out .= mb_convert_encoding(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) 
& 0x3f, 0x80 | $i & 0x3f), "UCS-4BE", "UTF-8");
}
var_dump(bin2hex($out));

echo "UTF-32 code range\n";
var_dump(bin2hex(mb_convert_encoding("\x00\x11\x00\x00", "UCS-4BE", 
"UTF-32BE")));
var_dump(bin2hex(mb_convert_encoding("\x00\x10\xff\xff", "UCS-4BE", 
"UTF-32BE")));
var_dump(bin2hex(mb_convert_encoding("\x00\x00\x11\x00", "UCS-4BE", 
"UTF-32LE")));
var_dump(bin2hex(mb_convert_encoding("\xff\xff\x10\x00", "UCS-4BE", 
"UTF-32LE")));
var_dump(bin2hex(mb_convert_encoding("\x00\x11\x00\x00", "UCS-4BE", "UTF-32")));
var_dump(bin2hex(mb_convert_encoding("\x00\x10\xff\xff", "UCS-4BE", "UTF-32")));
var_dump(bin2hex(mb_convert_encoding("\x00\x00\xfe\xff\x00\x11\x00\x00", 
"UCS-4BE", "UTF-32")));
var_dump(bin2hex(mb_convert_encoding("\x00\x00\xfe\xff\x00\x10\xff\xff", 
"UCS-4BE", "UTF-32")));
var_dump(bin2hex(mb_convert_encoding("\xff\xfe\x00\x00\x00\x00\x11\x00", 
"UCS-4BE", "UTF-32")));
var_dump(bin2hex(mb_convert_encoding("\xff\xfe\x00\x00\xff\xff\x10\x00", 
"UCS-4BE", "UTF-32")));

echo "UTF-32 and surrogates area\n";
$out = b'';
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    $out .= mb_convert_encoding(pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 
8) & 0xff, $i & 0xff), "UCS-4BE", "UTF-32BE");
}
var_dump(bin2hex($out));

$out = b'';
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    $out .= mb_convert_encoding(pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 
16) & 0xff, ($i >> 24) & 0xff), "UCS-4BE", "UTF-32LE");
}
var_dump(bin2hex($out));

$out = b'';
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    $out .= mb_convert_encoding(pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 
8) & 0xff, $i & 0xff), "UCS-4BE", "UTF-32");
}
var_dump(bin2hex($out));

$out = b'';
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    $out .= mb_convert_encoding("\x00\x00\xfe\xff". pack('C4', $i >> 24, ($i >> 
16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), "UCS-4BE", "UTF-32");
}
var_dump(str_replace("0000feff", "", bin2hex($out)));

$out = b'';
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
    $out .= mb_convert_encoding("\xff\xfe\x00\x00". pack('C4', $i & 0xff, ($i 
>> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff), "UCS-4BE", "UTF-32");
}
var_dump(str_replace("0000feff", "", bin2hex($out)));
?>
--EXPECT--
UTF-8 redundancy
unicode(24) "000000310000003200000033"
unicode(24) "000000410000004200000043"
unicode(0) ""
unicode(0) ""
unicode(0) ""
unicode(0) ""
unicode(0) ""
unicode(0) ""
unicode(0) ""
unicode(0) ""
unicode(0) ""
unicode(0) ""
unicode(24) "000000a2000000a3000000a5"
unicode(0) ""
unicode(0) ""
unicode(0) ""
unicode(0) ""
unicode(0) ""
unicode(8) "00000080"
unicode(8) "000007ff"
unicode(0) ""
unicode(8) "00000800"
unicode(8) "0000ffff"
unicode(0) ""
unicode(8) "00010000"
unicode(8) "001fffff"
unicode(0) ""
unicode(8) "00200000"
unicode(8) "03ffffff"
unicode(0) ""
unicode(8) "04000000"
unicode(8) "6fffffff"
unicode(0) ""
UTF-8 and surrogates area
unicode(16) "0000d7ff0000e000"
UTF-32 code range
unicode(0) ""
unicode(8) "0010ffff"
unicode(0) ""
unicode(8) "0010ffff"
unicode(0) ""
unicode(8) "0010ffff"
unicode(8) "0000feff"
unicode(16) "0000feff0010ffff"
unicode(8) "0000feff"
unicode(16) "0000feff0010ffff"
UTF-32 and surrogates area
unicode(16) "0000d7ff0000e000"
unicode(16) "0000d7ff0000e000"
unicode(16) "0000d7ff0000e000"
unicode(16) "0000d7ff0000e000"
unicode(16) "0000d7ff0000e000"

-- 
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to