Hi Sergei, Please review a patch for MDEV-9823.
This is a prerequisite for the current sprint task: MDEV-6353 my_ismbchar() and my_mbcharlen() refactoring. Thanks.
commit 9b6bcea8894474bb0d660a11ac21c1f64d12099f Author: Alexander Barkov <[email protected]> Date: Mon Apr 4 14:03:42 2016 +0400 MDEV-9823 LOAD DATA INFILE silently truncates incomplete byte sequences diff --git a/mysql-test/r/ctype_eucjpms.result b/mysql-test/r/ctype_eucjpms.result index f9cb4f1..8d4d8f6 100644 --- a/mysql-test/r/ctype_eucjpms.result +++ b/mysql-test/r/ctype_eucjpms.result @@ -33913,3 +33913,24 @@ DROP TABLE t1; # # End of 10.1 tests # +# +# End of 10.2 tests +# +# +# MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis +# +CREATE TABLE t1 (a TEXT CHARACTER SET eucjpms); +LOAD DATA INFILE '../../std_data/loaddata/mdev9823.ujis.txt' INTO TABLE t1 CHARACTER SET eucjpms IGNORE 4 LINES; +SELECT HEX(a) FROM t1; +HEX(a) +3F +78787831 +3F3F +78787832 +8FA1A1 +78787833 +3F3F +DROP TABLE t1; +# +# End of 10.2 tests +# diff --git a/mysql-test/r/ctype_ujis.result b/mysql-test/r/ctype_ujis.result index 61541ec..5eb9a3e 100644 --- a/mysql-test/r/ctype_ujis.result +++ b/mysql-test/r/ctype_ujis.result @@ -26218,3 +26218,24 @@ DROP TABLE t1; # # End of 10.1 tests # +# +# End of 10.2 tests +# +# +# MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis +# +CREATE TABLE t1 (a TEXT CHARACTER SET ujis); +LOAD DATA INFILE '../../std_data/loaddata/mdev9823.ujis.txt' INTO TABLE t1 CHARACTER SET ujis IGNORE 4 LINES; +SELECT HEX(a) FROM t1; +HEX(a) +3F +78787831 +3F3F +78787832 +8FA1A1 +78787833 +3F3F +DROP TABLE t1; +# +# End of 10.2 tests +# diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index f52e08a..af85841 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -10426,5 +10426,27 @@ b c DROP TABLE t1; # +# MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis +# +CREATE TABLE t1 (a TEXT CHARACTER SET utf8); +LOAD DATA INFILE '../../std_data/loaddata/mdev9823.utf8mb4.txt' INTO TABLE t1 CHARACTER SET utf8 IGNORE 4 LINES; +Warnings: 
+Warning 1366 Incorrect string value: '\xD0' for column 'a' at row 1 +Warning 1366 Incorrect string value: '\xE1\x80' for column 'a' at row 3 +Warning 1366 Incorrect string value: '\xF0\x9F\x98' for column 'a' at row 5 +Warning 1366 Incorrect string value: '\xF0\x9F\x98\x8E' for column 'a' at row 7 +Warning 1366 Incorrect string value: '\xF0\x9F\x98' for column 'a' at row 8 +SELECT HEX(a) FROM t1; +HEX(a) +3F +78787831 +3F3F +78787832 +3F3F3F +78787833 +3F3F3F3F +3F3F3F +DROP TABLE t1; +# # End of 10.2 tests # diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result index 10d77ae..558aba9 100644 --- a/mysql-test/r/ctype_utf8mb4.result +++ b/mysql-test/r/ctype_utf8mb4.result @@ -3398,3 +3398,30 @@ DROP FUNCTION f1; # # End of 10.1 tests # +# +# End of 10.2 tests +# +# +# MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis +# +CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4); +LOAD DATA INFILE '../../std_data/loaddata/mdev9823.utf8mb4.txt' INTO TABLE t1 CHARACTER SET utf8mb4 IGNORE 4 LINES; +Warnings: +Warning 1366 Incorrect string value: '\xD0' for column 'a' at row 1 +Warning 1366 Incorrect string value: '\xE1\x80' for column 'a' at row 3 +Warning 1366 Incorrect string value: '\xF0\x9F\x98' for column 'a' at row 5 +Warning 1366 Incorrect string value: '\xF0\x9F\x98' for column 'a' at row 8 +SELECT HEX(a) FROM t1; +HEX(a) +3F +78787831 +3F3F +78787832 +3F3F3F +78787833 +F09F988E +3F3F3F +DROP TABLE t1; +# +# End of 10.2 tests +# diff --git a/mysql-test/std_data/loaddata/mdev9823.ujis.txt b/mysql-test/std_data/loaddata/mdev9823.ujis.txt new file mode 100644 index 0000000..5468c99 --- /dev/null +++ b/mysql-test/std_data/loaddata/mdev9823.ujis.txt @@ -0,0 +1,11 @@ +# This file has incomplete UJIS sequences {8F}, {8FA1}, +# has a valid UJIS sequence {8FA1A1}, +# and has no NL at the end: +# {8F} \n xxx1 {8FA1} \n xxx2 {8FA1A1} \n xxx3 \n {8FA1} EOF + +xxx1 +¡ +xxx2 +¡¡ +xxx3 +¡ \ No newline at end of file diff --git 
a/mysql-test/std_data/loaddata/mdev9823.utf8mb4.txt b/mysql-test/std_data/loaddata/mdev9823.utf8mb4.txt new file mode 100644 index 0000000..8773956 --- /dev/null +++ b/mysql-test/std_data/loaddata/mdev9823.utf8mb4.txt @@ -0,0 +1,12 @@ +# This file has incomplete utf8mb4 sequences {D0}, {E180}, {F09F98}, +# has a valid utf8mb4 sequence {F09F988E} +# and has no NL at the end: +# {D0} \n xxx1 {E180} xxx2 \n {F09F98} \n xxx3 {F09F988E} {F09F98} EOF +Ð +xxx1 +á +xxx2 +ð +xxx3 +ð +ð \ No newline at end of file diff --git a/mysql-test/t/ctype_eucjpms.test b/mysql-test/t/ctype_eucjpms.test index d533e38..b5bd92d 100644 --- a/mysql-test/t/ctype_eucjpms.test +++ b/mysql-test/t/ctype_eucjpms.test @@ -566,3 +566,19 @@ DROP TABLE t1; --echo # --echo # End of 10.1 tests --echo # + +--echo # +--echo # End of 10.2 tests +--echo # + +--echo # +--echo # MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis +--echo # +CREATE TABLE t1 (a TEXT CHARACTER SET eucjpms); +LOAD DATA INFILE '../../std_data/loaddata/mdev9823.ujis.txt' INTO TABLE t1 CHARACTER SET eucjpms IGNORE 4 LINES; +SELECT HEX(a) FROM t1; +DROP TABLE t1; + +--echo # +--echo # End of 10.2 tests +--echo # diff --git a/mysql-test/t/ctype_ujis.test b/mysql-test/t/ctype_ujis.test index 3f44458..db85585 100644 --- a/mysql-test/t/ctype_ujis.test +++ b/mysql-test/t/ctype_ujis.test @@ -1396,3 +1396,20 @@ SELECT HEX(a) FROM t1 ORDER BY a;DROP TABLE t1; --echo # --echo # End of 10.1 tests --echo # + + +--echo # +--echo # End of 10.2 tests +--echo # + +--echo # +--echo # MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis +--echo # +CREATE TABLE t1 (a TEXT CHARACTER SET ujis); +LOAD DATA INFILE '../../std_data/loaddata/mdev9823.ujis.txt' INTO TABLE t1 CHARACTER SET ujis IGNORE 4 LINES; +SELECT HEX(a) FROM t1; +DROP TABLE t1; + +--echo # +--echo # End of 10.2 tests +--echo # diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test index f3a9e63..edf66f8 100644 --- 
a/mysql-test/t/ctype_utf8.test +++ b/mysql-test/t/ctype_utf8.test @@ -1967,5 +1967,13 @@ SELECT c1 FROM t1; DROP TABLE t1; --echo # +--echo # MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis +--echo # +CREATE TABLE t1 (a TEXT CHARACTER SET utf8); +LOAD DATA INFILE '../../std_data/loaddata/mdev9823.utf8mb4.txt' INTO TABLE t1 CHARACTER SET utf8 IGNORE 4 LINES; +SELECT HEX(a) FROM t1; +DROP TABLE t1; + +--echo # --echo # End of 10.2 tests --echo # diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test index 2fe9b5e..74e39a8 100644 --- a/mysql-test/t/ctype_utf8mb4.test +++ b/mysql-test/t/ctype_utf8mb4.test @@ -1919,3 +1919,20 @@ DROP FUNCTION f1; --echo # --echo # End of 10.1 tests --echo # + + +--echo # +--echo # End of 10.2 tests +--echo # + +--echo # +--echo # MDEV-9842 LOAD DATA INFILE does not work well with a TEXT column when using sjis +--echo # +CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4); +LOAD DATA INFILE '../../std_data/loaddata/mdev9823.utf8mb4.txt' INTO TABLE t1 CHARACTER SET utf8mb4 IGNORE 4 LINES; +SELECT HEX(a) FROM t1; +DROP TABLE t1; + +--echo # +--echo # End of 10.2 tests +--echo # diff --git a/sql/sql_load.cc b/sql/sql_load.cc index e2d579b..a2cc7e0 100644 --- a/sql/sql_load.cc +++ b/sql/sql_load.cc @@ -119,6 +119,64 @@ class READ_INFO { *to= chr; return false; } + + /** + Read a tail of a multi-byte character. + The first byte is assumed to be already read from the file + and appended to "data". + + @returns true - if EOF happened unexpectedly + @returns false - no EOF happened: found a good multi-byte character, + or a bad byte sequence + + Note: + read_mbtail() returns "false" if an incomplete byte sequence was found. 
+ For example, suppose we have an ujis file with bytes 0x8FA10A, where: + - 0x8FA1 is an incomplete prefix of a 3-byte character + (it should be [8F][A1-FE][A1-FE] to make a full 3-byte character) + - 0x0A is a line delimiter + This file has some broken data, the trailing 0xA1 got lost for some reasons. + + In this example it will work as follows: + - 0x8F is read from the file and put into "data" before the call + for read_mbtail() + - 0xA1 is read from the file and put into "data" by read_mbtail() + - 0x0A is kept in the read queue, so the next read iteration after + the current read_mbtail() call will normally find it and recognize as + a line delimiter + - the current call for read_mbtail() returns "false", + because no EOF happened + */ + bool read_mbtail(String *data) + { + DBUG_ENTER("READ_INFO::read_mbtail"); + int chlen; + if ((chlen= my_charlen(read_charset, data->end() - 1, + data->end())) != 1) + { + for (uint32 length0= data->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); ) + { + int chr= GET; + if (chr == my_b_EOF) + DBUG_RETURN(true); + data->append(chr); + chlen= my_charlen(read_charset, data->ptr() + length0, data->end()); + if (chlen == MY_CS_ILSEQ) + { + /** + It has been an incomplete (but a valid) sequence so far, + but the last byte turned it into a bad byte sequence. + Unget the very last byte. 
+ */ + data->length(data->length() - 1); + PUSH(chr); + break; + } + } + } + DBUG_RETURN(false); + } + public: bool error,line_cuted,found_null,enclosed; uchar *row_start, /* Found row starts here */ @@ -1589,38 +1647,9 @@ int READ_INFO::read_field() return 0; } } -#ifdef USE_MB - if (my_mbcharlen(read_charset, chr) > 1) - { - uint32 length0= data.length(); - int ml= my_mbcharlen(read_charset, chr); - data.append(chr); - - for (int i= 1; i < ml; i++) - { - chr= GET; - if (chr == my_b_EOF) - { - /* - Need to back up the bytes already ready from illformed - multi-byte char - */ - data.length(length0); - goto found_eof; - } - data.append(chr); - } - if (my_ismbchar(read_charset, - (const char *) data.ptr() + length0, - (const char *) data.end())) - continue; - for (int i= 0; i < ml; i++) - PUSH(data.end()[-1 - i]); - data.length(length0); - chr= GET; - } -#endif data.append(chr); + if (use_mb(read_charset) && read_mbtail(&data)) + goto found_eof; } /* ** We come here if buffer is too small. 
Enlarge it and continue diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c index 52494b7..469d3a5 100644 --- a/strings/ctype-eucjpms.c +++ b/strings/ctype-eucjpms.c @@ -199,6 +199,7 @@ static const uchar sort_order_eucjpms[]= #define IS_MB2_KATA(x,y) (iseucjpms_ss2(x) && iskata(y)) #define IS_MB2_CHAR(x,y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y)) #define IS_MB3_CHAR(x,y,z) (iseucjpms_ss3(x) && IS_MB2_JIS(y,z)) +#define IS_MB_PREFIX2(x,y) (iseucjpms_ss3(x) && iseucjpms(y)) #define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic" diff --git a/strings/ctype-mb.ic b/strings/ctype-mb.ic index 6fc4d6e..2df9c9d 100644 --- a/strings/ctype-mb.ic +++ b/strings/ctype-mb.ic @@ -75,7 +75,13 @@ MY_FUNCTION_NAME(charlen)(CHARSET_INFO *cs __attribute__((unused)), #ifdef IS_MB3_CHAR if (b + 3 > e) + { +#ifdef IS_MB_PREFIX2 + if (!IS_MB_PREFIX2(b[0], b[1])) + return MY_CS_ILSEQ; +#endif return MY_CS_TOOSMALLN(3); + } if (IS_MB3_CHAR(b[0], b[1], b[2])) return 3; /* Three-byte character */ #endif diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index 67e6890..b24fdb3 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -198,6 +198,7 @@ static const uchar sort_order_ujis[]= #define IS_MB2_KATA(x,y) (isujis_ss2(x) && iskata(y)) #define IS_MB2_CHAR(x, y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y)) #define IS_MB3_CHAR(x, y, z) (isujis_ss3(x) && IS_MB2_JIS(y,z)) +#define IS_MB_PREFIX2(x,y) (isujis_ss3(x) && isujis(y)) #define DEFINE_ASIAN_ROUTINES #include "ctype-mb.ic"
_______________________________________________ Mailing list: https://launchpad.net/~maria-developers Post to : [email protected] Unsubscribe : https://launchpad.net/~maria-developers More help : https://help.launchpad.net/ListHelp

