Hi, Alexander! Looks ok to push. A couple of sugestions below.
On Mar 29, Alexander Barkov wrote: > commit afe22406280634cb97ce4c0943db24033fb7d3bc > Author: Alexander Barkov <[email protected]> > Date: Tue Mar 29 15:39:15 2016 +0400 > > MDEV-9811 LOAD DATA INFILE does not work well with gbk in some cases > MDEV-9824 LOAD DATA does not work with multi-byte strings in LINES > TERMINATED BY when IGNORE is specified > > diff --git a/include/m_ctype.h b/include/m_ctype.h > index 615ee6a..5eb71b6 100644 > --- a/include/m_ctype.h > +++ b/include/m_ctype.h > @@ -180,6 +180,10 @@ extern MY_UNI_CTYPE my_uni_ctype[256]; > /* A helper macros for "need at least n bytes" */ > #define MY_CS_TOOSMALLN(n) (-100-(n)) > > +#define MY_CS_MBMAXLEN 6 /* Maximum supported mbmaxlen */ I'd add in init_available_charsets(): DBUG_ASSERT(cs[0]->mbmaxlen <= MY_CS_MBMAXLEN); > +#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= > MY_CS_TOOSMALL) > + > + > #define MY_SEQ_INTTAIL 1 > #define MY_SEQ_SPACES 2 > > diff --git a/sql/sql_load.cc b/sql/sql_load.cc > index d43eb88..f6104f1 100644 > --- a/sql/sql_load.cc > +++ b/sql/sql_load.cc > @@ -1707,32 +1707,93 @@ int READ_INFO::next_line() > for (;;) > { > int chr = GET; > -#ifdef USE_MB > - if (my_mbcharlen(read_charset, chr) > 1) > - { > - for (uint i=1; > - chr != my_b_EOF && i<my_mbcharlen(read_charset, chr); > - i++) > - chr = GET; > - if (chr == escape_char) > - continue; > - } > -#endif > if (chr == my_b_EOF) > { > eof= true; > return 1; > } > + if (use_mb(read_charset)) > + { > + char buf[MY_CS_MBMAXLEN]; > + buf[0]= chr; > + for (uint i= 1; ; buf[i++]= chr) > + { > + DBUG_ASSERT(i < sizeof(buf)); > + int chlen= my_charlen(read_charset, buf, buf + i); > + if (chlen == 1) I would've moved it to the first if(): if (use_mb(read_charset) && my_charlen(read_charset, &chr, 1) != 1) > + { > + /* > + A single byte character was found, > + proceed to check escape_char and line_term_char. > + */ > + DBUG_ASSERT(i == 1); > + goto check_single_byte; > + } > + if (MY_CS_IS_TOOSMALL(chlen)) > + { > + // buf[] is a prefix of a multi-byte character > + chr= GET; > + if (chr == my_b_EOF) > + { > + eof= true; > + return 1; > + } > + continue; // Collect more bytes to buf[]. > + } > + /* > + Either a complete multi-byte sequence, > + or a broken byte sequence was found. > + Check if the sequence is a prefix of the "LINES TERMINATED BY" > + string. > + */ > + if ((uchar) buf[0] == line_term_char && i <= line_term_length && > + !memcmp(buf, line_term_ptr, i)) > + { > + if (line_term_length == i) > + { > + /* > + We found a "LINES TERMINATED BY" string that consists > + of a single multi-byte character. > + */ > + return 0; > + } > + /* > + Our sequence is a prefix of "LINES TERMINATED BY". > + Now check the suffix. Length of the suffix of line_term_ptr > + that still needs to be checked is (line_term_length - i). > + Note, READ_INFO::terminator() assumes that the leftmost byte of > the > + argument is already scanned from the file and is checked > + (e.g. against line_term_char). So we need to pass one extra byte. > + */ > + if (terminator(line_term_ptr + i - 1, line_term_length - i + 1)) > + return 0; > + } > + /* > + Here we have a good multi-byte character or a broken byte sequence, > + and the sequence is not equal to "LINES TERMINATED BY". > + No needs to check for escape_char, because: > + - multi-byte escape characters in "FIELDS ESCAPED BY" are not > + supported and are rejected at parse time. > + - broken single-byte sequences are not recognized as escapes, > + they are considered to be part of the data and are converted to > + question marks. > + */ > + goto fin; > + } > + DBUG_ASSERT(0); // Should not get to here > + } > +check_single_byte: > if (chr == escape_char) > { > line_cuted= true; > if (GET == my_b_EOF) > return 1; > continue; > } > if (chr == line_term_char && terminator(line_term_ptr,line_term_length)) > return 0; > +fin: > line_cuted= true; > } > } Regards, Sergei Chief Architect MariaDB and [email protected] _______________________________________________ Mailing list: https://launchpad.net/~maria-developers Post to : [email protected] Unsubscribe : https://launchpad.net/~maria-developers More help : https://help.launchpad.net/ListHelp

