Commit: acbb84b021a145ee2dde73ec5923990f42fc18fb Author: Bastien Montagne Date: Sat Dec 31 16:06:51 2016 +0100 Branches: master https://developer.blender.org/rBacbb84b021a145ee2dde73ec5923990f42fc18fb
Add BLI_string_utf8 specific test. This test should ensure we correctly detect all invalid utf-8 sequences in a given string. DISCLAIMER: Do not run this with current code - you'll either laugh or cry, nearly *all* checks fail! Based on utf-8 decoder stress-test (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt) by Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0 =================================================================== A tests/gtests/blenlib/BLI_string_utf8_test.cc M tests/gtests/blenlib/CMakeLists.txt =================================================================== diff --git a/tests/gtests/blenlib/BLI_string_utf8_test.cc b/tests/gtests/blenlib/BLI_string_utf8_test.cc new file mode 100644 index 0000000..c0beb92 --- /dev/null +++ b/tests/gtests/blenlib/BLI_string_utf8_test.cc @@ -0,0 +1,304 @@ +/* Apache License, Version 2.0 */ + +#include "testing/testing.h" + +extern "C" { +#include "BLI_utildefines.h" +#include "BLI_string.h" +#include "BLI_string_utf8.h" +} + +/* Note that 'common' utf-8 variants of string functions (like copy, etc.) are tested in BLI_string_test.cc + * However, tests below are specific utf-8 conformance ones, and since they eat quite their share of lines, + * they deserved their own file. */ + +/* -------------------------------------------------------------------- */ +/* stubs */ + +extern "C" { + +int mk_wcwidth(wchar_t ucs); +int mk_wcswidth(const wchar_t *pwcs, size_t n); + +int mk_wcwidth(wchar_t ucs) +{ + return 0; +} + +int mk_wcswidth(const wchar_t *pwcs, size_t n) +{ + return 0; +} + +} + +/* -------------------------------------------------------------------- */ +/* tests */ + +/* Each test is made of a 79 bytes (80 with NULL char) string to test, expected string result after + * stripping invalid utf8 bytes, and a single-byte string encoded with expected number of errors. 
+ * + * Based on utf-8 decoder stress-test (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt) + * by Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0 + */ +const char *utf8_invalid_tests[][3] = { +// 1 Some correct UTF-8 text + {"You should see the Greek word 'kosme': \"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\" |", + "You should see the Greek word 'kosme': \"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\" |", "\x00"}, + +// 2 Boundary condition test cases +// Note that those will pass for us, those are not erroneous unicode code points +// (aside from \x00, which is only valid as string terminator). +// 2.1 First possible sequence of a certain length + {"2.1.1 1 byte (U-00000000): \"\x00\" |", + "2.1.1 1 byte (U-00000000): \"\" |", "\x01"}, + {"2.1.2 2 bytes (U-00000080): \"\xc2\x80\" |", + "2.1.2 2 bytes (U-00000080): \"\xc2\x80\" |", "\x00"}, + {"2.1.3 3 bytes (U-00000800): \"\xe0\xa0\x80\" |", + "2.1.3 3 bytes (U-00000800): \"\xe0\xa0\x80\" |", "\x00"}, + {"2.1.4 4 bytes (U-00010000): \"\xf0\x90\x80\x80\" |", + "2.1.4 4 bytes (U-00010000): \"\xf0\x90\x80\x80\" |", "\x00"}, + {"2.1.5 5 bytes (U-00200000): \"\xf8\x88\x80\x80\x80\" |", + "2.1.5 5 bytes (U-00200000): \"\xf8\x88\x80\x80\x80\" |", "\x00"}, + {"2.1.6 6 bytes (U-04000000): \"\xfc\x84\x80\x80\x80\x80\" |", + "2.1.6 6 bytes (U-04000000): \"\xfc\x84\x80\x80\x80\x80\" |", "\x00"}, +// 2.2 Last possible sequence of a certain length + {"2.2.1 1 byte (U-0000007F): \"\x7f\" |", + "2.2.1 1 byte (U-0000007F): \"\x7f\" |", "\x00"}, + {"2.2.2 2 bytes (U-000007FF): \"\xdf\xbf\" |", + "2.2.2 2 bytes (U-000007FF): \"\xdf\xbf\" |", "\x00"}, + {"2.2.3 3 bytes (U-0000FFFF): \"\xef\xbf\xbf\" |", + "2.2.3 3 bytes (U-0000FFFF): \"\" |", "\x03"}, /* matches one of 5.3 sequences... 
*/ + {"2.2.4 4 bytes (U-001FFFFF): \"\xf7\xbf\xbf\xbf\" |", + "2.2.4 4 bytes (U-001FFFFF): \"\xf7\xbf\xbf\xbf\" |", "\x00"}, + {"2.2.5 5 bytes (U-03FFFFFF): \"\xfb\xbf\xbf\xbf\xbf\" |", + "2.2.5 5 bytes (U-03FFFFFF): \"\xfb\xbf\xbf\xbf\xbf\" |", "\x00"}, + {"2.2.6 6 bytes (U-7FFFFFFF): \"\xfd\xbf\xbf\xbf\xbf\xbf\" |", + "2.2.6 6 bytes (U-7FFFFFFF): \"\xfd\xbf\xbf\xbf\xbf\xbf\" |", "\x00"}, +// 2.3 Other boundary conditions + {"2.3.1 U-0000D7FF = ed 9f bf = \"\xed\x9f\xbf\" |", + "2.3.1 U-0000D7FF = ed 9f bf = \"\xed\x9f\xbf\" |", "\x00"}, + {"2.3.2 U-0000E000 = ee 80 80 = \"\xee\x80\x80\" |", + "2.3.2 U-0000E000 = ee 80 80 = \"\xee\x80\x80\" |", "\x00"}, + {"2.3.3 U-0000FFFD = ef bf bd = \"\xef\xbf\xbd\" |", + "2.3.3 U-0000FFFD = ef bf bd = \"\xef\xbf\xbd\" |", "\x00"}, + {"2.3.4 U-0010FFFF = f4 8f bf bf = \"\xf4\x8f\xbf\xbf\" |", + "2.3.4 U-0010FFFF = f4 8f bf bf = \"\xf4\x8f\xbf\xbf\" |", "\x00"}, + {"2.3.5 U-00110000 = f4 90 80 80 = \"\xf4\x90\x80\x80\" |", + "2.3.5 U-00110000 = f4 90 80 80 = \"\xf4\x90\x80\x80\" |", "\x00"}, + +// 3 Malformed sequences +// 3.1 Unexpected continuation bytes +// Each unexpected continuation byte should be separately signalled as a malformed sequence of its own. 
+ {"3.1.1 First continuation byte 0x80: \"\x80\" |", + "3.1.1 First continuation byte 0x80: \"\" |", "\x01"}, + {"3.1.2 Last continuation byte 0xbf: \"\xbf\" |", + "3.1.2 Last continuation byte 0xbf: \"\" |", "\x01"}, + {"3.1.3 2 continuation bytes: \"\x80\xbf\" |", + "3.1.3 2 continuation bytes: \"\" |", "\x02"}, + {"3.1.4 3 continuation bytes: \"\x80\xbf\x80\" |", + "3.1.4 3 continuation bytes: \"\" |", "\x03"}, + {"3.1.5 4 continuation bytes: \"\x80\xbf\x80\xbf\" |", + "3.1.5 4 continuation bytes: \"\" |", "\x04"}, + {"3.1.6 5 continuation bytes: \"\x80\xbf\x80\xbf\x80\" |", + "3.1.6 5 continuation bytes: \"\" |", "\x05"}, + {"3.1.7 6 continuation bytes: \"\x80\xbf\x80\xbf\x80\xbf\" |", + "3.1.7 6 continuation bytes: \"\" |", "\x06"}, + {"3.1.8 7 continuation bytes: \"\x80\xbf\x80\xbf\x80\xbf\x80\" |", + "3.1.8 7 continuation bytes: \"\" |", "\x07"}, +// 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): | + {"3.1.9 \"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf" + "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\" |", + "3.1.9 \"\" |", "\x40"}, +// 3.2 Lonely start characters +// 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), each followed by a space character: + {"3.2.1 \"\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf " + "\xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \" |", + "3.2.1 \" \" |", "\x20"}, +// 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), each followed by a space character: + {"3.2.2 \"\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \" |", + "3.2.2 \" \" |", "\x10"}, +// 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), each followed by a space character: + {"3.2.3 \"\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \" |", + "3.2.3 \" \" |", 
"\x08"}, +// 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), each followed by a space character: + {"3.2.4 \"\xf8 \xf9 \xfa \xfb \" |", + "3.2.4 \" \" |", "\x04"}, +// 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), each followed by a space character: + {"3.2.4 \"\xfc \xfd \" |", + "3.2.4 \" \" |", "\x02"}, +// 3.3 Sequences with last continuation byte missing +// All bytes of an incomplete sequence should be signalled as a single malformed sequence, +// i.e., you should see only a single replacement character in each of the next 10 tests. +// (Characters as in section 2) + {"3.3.1 2-byte sequence with last byte missing (U+0000): \"\xc0\" |", + "3.3.1 2-byte sequence with last byte missing (U+0000): \"\" |", "\x01"}, + {"3.3.2 3-byte sequence with last byte missing (U+0000): \"\xe0\x80\" |", + @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list [email protected] https://lists.blender.org/mailman/listinfo/bf-blender-cvs
