moriyoshi		Tue Feb 24 13:19:09 2009 UTC

  Added files:                 (Branch: PHP_5_3)
    /php-src/ext/mbstring/tests	illformed_utf_sequences.phpt

  Modified files:
    /php-src/ext/mbstring/libmbfl/filters	mbfilter_utf32.c
                                         	mbfilter_utf8.c
    /php-src/ext/mbstring/libmbfl/mbfl	mbfl_consts.h

  Log:
  - MFH: strictly check UTF-8 and UTF-32 validity

http://cvs.php.net/viewvc.cgi/php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c?r1=1.1&r2=1.1.10.1&diff_format=u Index: php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c diff -u php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c:1.1 php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c:1.1.10.1 --- php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c:1.1 Sat Aug 23 06:18:36 2003 +++ php-src/ext/mbstring/libmbfl/filters/mbfilter_utf32.c Tue Feb 24 13:19:09 2009 @@ -171,7 +171,9 @@ CK((*filter->output_function)(0xfeff, filter->data)); } else { filter->status &= ~0xff; - CK((*filter->output_function)(n, filter->data)); + if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) { + CK((*filter->output_function)(n, filter->data)); + } } break; } @@ -201,7 +203,9 @@ } else { filter->status = 0; n = (c & 0xff) | filter->cache; - CK((*filter->output_function)(n, filter->data)); + if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) { + CK((*filter->output_function)(n, filter->data)); + } } return c; } @@ -211,7 +215,7 @@ */ int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter) { - if (c >= 0 && c < MBFL_WCSGROUP_UCS4MAX) { + if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) { CK((*filter->output_function)((c >> 24) & 0xff, filter->data)); CK((*filter->output_function)((c >> 16) & 0xff, filter->data)); CK((*filter->output_function)((c >> 8) & 0xff, filter->data)); @@ -247,7 +251,9 @@ } else { filter->status = 0; n = ((c & 0xff) << 24) | filter->cache; - CK((*filter->output_function)(n, filter->data)); + if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xd800 || n > 0xdfff)) { + CK((*filter->output_function)(n, filter->data)); + } } return c; } @@ -257,7 +263,7 @@ */ int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter) { - if (c >= 0 && c < MBFL_WCSGROUP_UCS4MAX) { + if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) { CK((*filter->output_function)(c & 0xff, filter->data)); CK((*filter->output_function)((c >> 8) & 0xff, filter->data)); 
CK((*filter->output_function)((c >> 16) & 0xff, filter->data)); http://cvs.php.net/viewvc.cgi/php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c?r1=1.1&r2=1.1.10.1&diff_format=u Index: php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c diff -u php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c:1.1 php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c:1.1.10.1 --- php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c:1.1 Sat Aug 23 06:18:36 2003 +++ php-src/ext/mbstring/libmbfl/filters/mbfilter_utf8.c Tue Feb 24 13:19:09 2009 @@ -106,7 +106,8 @@ } filter->status = 0; } else if (c < 0xc0) { - switch (filter->status & 0xff) { + int status = filter->status & 0xff; + switch (status) { case 0x10: /* 2byte code 2nd char */ case 0x21: /* 3byte code 3rd char */ case 0x32: /* 4byte code 4th char */ @@ -114,7 +115,11 @@ case 0x54: /* 6byte code 6th char */ filter->status = 0; s = filter->cache | (c & 0x3f); - if (s >= 0x80) { + if ((status == 0x10 && s >= 0x80) || + (status == 0x21 && s >= 0x800 && (s < 0xd800 || s > 0xdfff)) || + (status == 0x32 && s >= 0x10000) || + (status == 0x43 && s >= 0x200000) || + (status == 0x54 && s >= 0x4000000 && s < MBFL_WCSGROUP_UCS4MAX)) { CK((*filter->output_function)(s, filter->data)); } break; http://cvs.php.net/viewvc.cgi/php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h?r1=1.3.6.1&r2=1.3.6.2&diff_format=u Index: php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h diff -u php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h:1.3.6.1 php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h:1.3.6.2 --- php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h:1.3.6.1 Sat Jul 5 06:52:04 2008 +++ php-src/ext/mbstring/libmbfl/mbfl/mbfl_consts.h Tue Feb 24 13:19:09 2009 @@ -47,6 +47,7 @@ /* wchar plane, special charactor */ #define MBFL_WCSPLANE_MASK 0xffff #define MBFL_WCSPLANE_UCS2MAX 0x00010000 +#define MBFL_WCSPLANE_UTF32MAX 0x00110000 #define MBFL_WCSPLANE_SUPMIN 0x00010000 #define MBFL_WCSPLANE_SUPMAX 0x00200000 #define MBFL_WCSPLANE_JIS0208 0x70e10000 /* JIS HEX : 
2121h - 7E7Eh */ http://cvs.php.net/viewvc.cgi/php-src/ext/mbstring/tests/illformed_utf_sequences.phpt?view=markup&rev=1.1 Index: php-src/ext/mbstring/tests/illformed_utf_sequences.phpt +++ php-src/ext/mbstring/tests/illformed_utf_sequences.phpt --TEST-- Unicode standard conformance test (ill-formed UTF sequences.) --SKIPIF-- <?php extension_loaded('mbstring') or die('skip mbstring not available'); ?> --FILE-- <?php echo "UTF-8 redundancy\n"; var_dump(bin2hex(mb_convert_encoding(b"\x31\x32\x33", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\x41\x42\x43", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xc0\xb1\xc0\xb2\xc0\xb3", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xc1\x81\xc1\x82\xc1\x83", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xc2\xa2\xc2\xa3\xc2\xa5", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", "UCS-4BE", "UTF-8"))); 
var_dump(bin2hex(mb_convert_encoding(b"\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xc1\xbf", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xc2\x80", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xdf\xbf", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xe0\x9f\xff", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xe0\xa0\x80", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xef\xbf\xbf", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xf0\x8f\xbf\xbf", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xf0\x90\x80\x80", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xf7\xbf\xbf\xbf", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xf8\x87\xbf\xbf\xbf", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xf8\x88\x80\x80\x80", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xfb\xbf\xbf\xbf\xbf", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xfc\x83\xbf\xbf\xbf\xbf", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xfc\x84\x80\x80\x80\x80", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xfd\xaf\xbf\xbf\xbf\xbf", "UCS-4BE", "UTF-8"))); var_dump(bin2hex(mb_convert_encoding(b"\xfd\xbf\xbf\xbf\xbf\xbf", "UCS-4BE", "UTF-8"))); echo "UTF-8 and surrogates area\n"; $out = b''; for ($i = 0xd7ff; $i <= 0xe000; ++$i) { $out .= mb_convert_encoding(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), "UCS-4BE", "UTF-8"); } var_dump(bin2hex($out)); echo "UTF-32 code range\n"; 
var_dump(bin2hex(mb_convert_encoding("\x00\x11\x00\x00", "UCS-4BE", "UTF-32BE"))); var_dump(bin2hex(mb_convert_encoding("\x00\x10\xff\xff", "UCS-4BE", "UTF-32BE"))); var_dump(bin2hex(mb_convert_encoding("\x00\x00\x11\x00", "UCS-4BE", "UTF-32LE"))); var_dump(bin2hex(mb_convert_encoding("\xff\xff\x10\x00", "UCS-4BE", "UTF-32LE"))); var_dump(bin2hex(mb_convert_encoding("\x00\x11\x00\x00", "UCS-4BE", "UTF-32"))); var_dump(bin2hex(mb_convert_encoding("\x00\x10\xff\xff", "UCS-4BE", "UTF-32"))); var_dump(bin2hex(mb_convert_encoding("\x00\x00\xfe\xff\x00\x11\x00\x00", "UCS-4BE", "UTF-32"))); var_dump(bin2hex(mb_convert_encoding("\x00\x00\xfe\xff\x00\x10\xff\xff", "UCS-4BE", "UTF-32"))); var_dump(bin2hex(mb_convert_encoding("\xff\xfe\x00\x00\x00\x00\x11\x00", "UCS-4BE", "UTF-32"))); var_dump(bin2hex(mb_convert_encoding("\xff\xfe\x00\x00\xff\xff\x10\x00", "UCS-4BE", "UTF-32"))); echo "UTF-32 and surrogates area\n"; $out = b''; for ($i = 0xd7ff; $i <= 0xe000; ++$i) { $out .= mb_convert_encoding(pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), "UCS-4BE", "UTF-32BE"); } var_dump(bin2hex($out)); $out = b''; for ($i = 0xd7ff; $i <= 0xe000; ++$i) { $out .= mb_convert_encoding(pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff), "UCS-4BE", "UTF-32LE"); } var_dump(bin2hex($out)); $out = b''; for ($i = 0xd7ff; $i <= 0xe000; ++$i) { $out .= mb_convert_encoding(pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), "UCS-4BE", "UTF-32"); } var_dump(bin2hex($out)); $out = b''; for ($i = 0xd7ff; $i <= 0xe000; ++$i) { $out .= mb_convert_encoding("\x00\x00\xfe\xff". pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), "UCS-4BE", "UTF-32"); } var_dump(str_replace("0000feff", "", bin2hex($out))); $out = b''; for ($i = 0xd7ff; $i <= 0xe000; ++$i) { $out .= mb_convert_encoding("\xff\xfe\x00\x00". 
pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff), "UCS-4BE", "UTF-32"); } var_dump(str_replace("0000feff", "", bin2hex($out))); ?> --EXPECT-- UTF-8 redundancy unicode(24) "000000310000003200000033" unicode(24) "000000410000004200000043" unicode(0) "" unicode(0) "" unicode(0) "" unicode(0) "" unicode(0) "" unicode(0) "" unicode(0) "" unicode(0) "" unicode(0) "" unicode(0) "" unicode(24) "000000a2000000a3000000a5" unicode(0) "" unicode(0) "" unicode(0) "" unicode(0) "" unicode(0) "" unicode(8) "00000080" unicode(8) "000007ff" unicode(0) "" unicode(8) "00000800" unicode(8) "0000ffff" unicode(0) "" unicode(8) "00010000" unicode(8) "001fffff" unicode(0) "" unicode(8) "00200000" unicode(8) "03ffffff" unicode(0) "" unicode(8) "04000000" unicode(8) "6fffffff" unicode(0) "" UTF-8 and surrogates area unicode(16) "0000d7ff0000e000" UTF-32 code range unicode(0) "" unicode(8) "0010ffff" unicode(0) "" unicode(8) "0010ffff" unicode(0) "" unicode(8) "0010ffff" unicode(8) "0000feff" unicode(16) "0000feff0010ffff" unicode(8) "0000feff" unicode(16) "0000feff0010ffff" UTF-32 and surrogates area unicode(16) "0000d7ff0000e000" unicode(16) "0000d7ff0000e000" unicode(16) "0000d7ff0000e000" unicode(16) "0000d7ff0000e000" unicode(16) "0000d7ff0000e000"
--
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php