Hi Sergei, Please review a patch fixing MDEV-9811 and MDEV-9824.
They are prerequisites for MDEV-6353 (my_ismbchar() and my_mbcharlen() refactoring). Thanks.
commit afe22406280634cb97ce4c0943db24033fb7d3bc Author: Alexander Barkov <[email protected]> Date: Tue Mar 29 15:39:15 2016 +0400 MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified diff --git a/include/m_ctype.h b/include/m_ctype.h index 615ee6a..5eb71b6 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -180,6 +180,10 @@ extern MY_UNI_CTYPE my_uni_ctype[256]; /* A helper macros for "need at least n bytes" */ #define MY_CS_TOOSMALLN(n) (-100-(n)) +#define MY_CS_MBMAXLEN 6 /* Maximum supported mbmaxlen */ +#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL) + + #define MY_SEQ_INTTAIL 1 #define MY_SEQ_SPACES 2 diff --git a/mysql-test/r/ctype_gbk.result b/mysql-test/r/ctype_gbk.result index b577454..e454347 100644 --- a/mysql-test/r/ctype_gbk.result +++ b/mysql-test/r/ctype_gbk.result @@ -5926,3 +5926,24 @@ Warning 1300 Invalid gb2312 character string: '\xA3A' # # End of 10.1 tests # +# +# Start of 10.2 tests +# +# +# MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases +# +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk); +LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@'; +SELECT HEX(a) FROM t1; +HEX(a) +B04061B041 +B042 +DELETE FROM t1; +LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES; +SELECT HEX(a) FROM t1; +HEX(a) +B042 +DROP TABLE t1; +# +# End of 10.2 tests +# diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index 816fe65..f52e08a 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -10401,3 +10401,30 @@ SET @@SQL_MODE=default; # # End of 10.1 tests # +# +# Start of 10.2 tests +# +# +# MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified 
+# +CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8); +LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ÑÑ'; +Warnings: +Warning 1638 Non-ASCII separator arguments are not fully supported +SELECT c1 FROM t1; +c1 +a +b +c +DELETE FROM t1; +LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ÑÑ' IGNORE 1 LINES; +Warnings: +Warning 1638 Non-ASCII separator arguments are not fully supported +SELECT c1 FROM t1; +c1 +b +c +DROP TABLE t1; +# +# End of 10.2 tests +# diff --git a/mysql-test/std_data/loaddata/mdev8711.txt b/mysql-test/std_data/loaddata/mdev8711.txt new file mode 100644 index 0000000..49296a7 --- /dev/null +++ b/mysql-test/std_data/loaddata/mdev8711.txt @@ -0,0 +1 @@ +°@a°A@°B@ \ No newline at end of file diff --git a/mysql-test/std_data/loaddata/mdev9824.txt b/mysql-test/std_data/loaddata/mdev9824.txt new file mode 100644 index 0000000..7050e08 --- /dev/null +++ b/mysql-test/std_data/loaddata/mdev9824.txt @@ -0,0 +1 @@ +aÑÑbÑÑcÑÑ \ No newline at end of file diff --git a/mysql-test/t/ctype_gbk.test b/mysql-test/t/ctype_gbk.test index 07e73cd..ae66dbb 100644 --- a/mysql-test/t/ctype_gbk.test +++ b/mysql-test/t/ctype_gbk.test @@ -435,3 +435,22 @@ SELECT HEX(CONVERT(CAST(0xA341 AS CHAR CHARACTER SET gb2312) USING utf8)); --echo # --echo # End of 10.1 tests --echo # + +--echo # +--echo # Start of 10.2 tests +--echo # + +--echo # +--echo # MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases +--echo # +CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET gbk); +LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@'; +SELECT HEX(a) FROM t1; +DELETE FROM t1; +LOAD DATA INFILE '../../std_data/loaddata/mdev8711.txt' INTO TABLE t1 CHARACTER SET gbk LINES TERMINATED BY '@' IGNORE 1 LINES; +SELECT HEX(a) FROM t1; +DROP TABLE t1; + +--echo # +--echo # End of 10.2 tests +--echo # diff 
--git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test index 85ffed9..f3a9e63 100644 --- a/mysql-test/t/ctype_utf8.test +++ b/mysql-test/t/ctype_utf8.test @@ -1950,3 +1950,22 @@ SET @@SQL_MODE=default; --echo # --echo # End of 10.1 tests --echo # + +--echo # +--echo # Start of 10.2 tests +--echo # + +--echo # +--echo # MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES TERMINATED BY when IGNORE is specified +--echo # +CREATE TABLE t1 (c1 VARCHAR(10) CHARACTER SET utf8); +LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ÑÑ'; +SELECT c1 FROM t1; +DELETE FROM t1; +LOAD DATA INFILE '../../std_data/loaddata/mdev9824.txt' INTO TABLE t1 CHARACTER SET utf8 LINES TERMINATED BY 'ÑÑ' IGNORE 1 LINES; +SELECT c1 FROM t1; +DROP TABLE t1; + +--echo # +--echo # End of 10.2 tests +--echo # diff --git a/sql/sql_load.cc b/sql/sql_load.cc index d43eb88..f6104f1 100644 --- a/sql/sql_load.cc +++ b/sql/sql_load.cc @@ -1707,32 +1707,93 @@ int READ_INFO::next_line() for (;;) { int chr = GET; -#ifdef USE_MB - if (my_mbcharlen(read_charset, chr) > 1) - { - for (uint i=1; - chr != my_b_EOF && i<my_mbcharlen(read_charset, chr); - i++) - chr = GET; - if (chr == escape_char) - continue; - } -#endif - if (chr == my_b_EOF) - { - eof=1; + if (chr == my_b_EOF) + { + eof= true; return 1; } + if (use_mb(read_charset)) + { + char buf[MY_CS_MBMAXLEN]; + buf[0]= chr; + for (uint i= 1; ; buf[i++]= chr) + { + DBUG_ASSERT(i < sizeof(buf)); + int chlen= my_charlen(read_charset, buf, buf + i); + if (chlen == 1) + { + /* + A single byte character was found, + proceed to check escape_char and line_term_char. + */ + DBUG_ASSERT(i == 1); + goto check_single_byte; + } + if (MY_CS_IS_TOOSMALL(chlen)) + { + // buf[] is a prefix of a multi-byte character + chr= GET; + if (chr == my_b_EOF) + { + eof= true; + return 1; + } + continue; // Collect more bytes to buf[]. 
+ } + /* + Either a complete multi-byte sequence, + or a broken byte sequence was found. + Check if the sequence is a prefix of the "LINES TERMINATED BY" + string. + */ + if ((uchar) buf[0] == line_term_char && i <= line_term_length && + !memcmp(buf, line_term_ptr, i)) + { + if (line_term_length == i) + { + /* + We found a "LINES TERMINATED BY" string that consists + of a single multi-byte character. + */ + return 0; + } + /* + Our sequence is a prefix of "LINES TERMINATED BY". + Now check the suffix. Length of the suffix of line_term_ptr + that still needs to be checked is (line_term_length - i). + Note, READ_INFO::terminator() assumes that the leftmost byte of the + argument is already scanned from the file and is checked + (e.g. against line_term_char). So we need to pass one extra byte. + */ + if (terminator(line_term_ptr + i - 1, line_term_length - i + 1)) + return 0; + } + /* + Here we have a good multi-byte character or a broken byte sequence, + and the sequence is not equal to "LINES TERMINATED BY". + No needs to check for escape_char, because: + - multi-byte escape characters in "FIELDS ESCAPED BY" are not + supported and are rejected at parse time. + - broken single-byte sequences are not recognized as escapes, + they are considered to be part of the data and are converted to + question marks. + */ + goto fin; + } + DBUG_ASSERT(0); // Should not get to here + } +check_single_byte: if (chr == escape_char) { - line_cuted=1; + line_cuted= true; if (GET == my_b_EOF) - return 1; + return 1; continue; } if (chr == line_term_char && terminator(line_term_ptr,line_term_length)) return 0; - line_cuted=1; +fin: + line_cuted= true; } }
_______________________________________________ Mailing list: https://launchpad.net/~maria-developers Post to : [email protected] Unsubscribe : https://launchpad.net/~maria-developers More help : https://help.launchpad.net/ListHelp

