Revert "IMPALA-6389: Make '\0' delimited text files work" This reverts commit c2bdaf8af4cf35d3462595c2a341ed84dcf5d960.
An ASAN issue and potentially other problems have been found; reverting to unbreak the build and tests. Change-Id: If581311033de8c26e33316b19192c4579594f261 Reviewed-on: http://gerrit.cloudera.org:8080/9851 Reviewed-by: Lars Volker <[email protected]> Tested-by: Zach Amsden <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/b78daedf Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/b78daedf Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/b78daedf Branch: refs/heads/master Commit: b78daedf52bdaf5028bae90f157d37b6c5bea2c6 Parents: 2194dfd Author: Zach Amsden <[email protected]> Authored: Thu Mar 29 03:51:41 2018 +0000 Committer: Zach Amsden <[email protected]> Committed: Thu Mar 29 04:59:48 2018 +0000 ---------------------------------------------------------------------- be/src/exec/delimited-text-parser-test.cc | 56 ++++--------------- be/src/exec/delimited-text-parser.cc | 74 ++++++++----------------- be/src/exec/delimited-text-parser.h | 43 +++++--------- be/src/exec/delimited-text-parser.inline.h | 70 +++++++++++------------ be/src/exec/hdfs-sequence-scanner.cc | 2 +- be/src/exec/hdfs-sequence-scanner.h | 3 +- be/src/exec/hdfs-text-scanner.cc | 2 +- be/src/exec/hdfs-text-scanner.h | 3 +- 8 files changed, 84 insertions(+), 169 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/delimited-text-parser-test.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/delimited-text-parser-test.cc b/be/src/exec/delimited-text-parser-test.cc index d8e977d..3156b36 100644 --- a/be/src/exec/delimited-text-parser-test.cc +++ b/be/src/exec/delimited-text-parser-test.cc @@ -24,7 +24,7 @@ namespace impala { -void Validate(TupleDelimitedTextParser* parser, const string& data, +void Validate(DelimitedTextParser* parser, const string& 
data, int expected_offset, char tuple_delim, int expected_num_tuples, int expected_num_fields) { parser->ParserReset(); @@ -72,8 +72,8 @@ TEST(DelimitedTextParser, Basic) { bool is_materialized_col[NUM_COLS]; for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true; - TupleDelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col, - TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM); + DelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col, + TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM); // Note that only complete tuples "count" Validate(&no_escape_parser, "no_delims", -1, TUPLE_DELIM, 0, 0); Validate(&no_escape_parser, "abc||abc", 4, TUPLE_DELIM, 1, 1); @@ -81,9 +81,9 @@ TEST(DelimitedTextParser, Basic) { Validate(&no_escape_parser, "a|bcd", 2, TUPLE_DELIM, 0, 0); // Test with escape char - TupleDelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col, - TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM, - ESCAPE_CHAR); + DelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col, + TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM, + ESCAPE_CHAR); Validate(&escape_parser, "a@|a|bcd", 5, TUPLE_DELIM, 0, 0); Validate(&escape_parser, "a@@|a|bcd", 4, TUPLE_DELIM, 1, 1); Validate(&escape_parser, "a@@@|a|bcd", 7, TUPLE_DELIM, 0, 0); @@ -127,8 +127,8 @@ TEST(DelimitedTextParser, Fields) { bool is_materialized_col[NUM_COLS]; for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true; - TupleDelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col, - TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM); + DelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col, + TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM); Validate(&no_escape_parser, "a,b|c,d|e,f", 4, TUPLE_DELIM, 1, 3); Validate(&no_escape_parser, "b|c,d|e,f", 2, TUPLE_DELIM, 1, 3); @@ -137,9 +137,9 @@ TEST(DelimitedTextParser, Fields) { const string str10("a,\0|c,d|e", 9); Validate(&no_escape_parser, str10, 4, TUPLE_DELIM, 1, 2); - TupleDelimitedTextParser 
escape_parser(NUM_COLS, 0, is_materialized_col, - TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM, - ESCAPE_CHAR); + DelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col, + TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM, + ESCAPE_CHAR); Validate(&escape_parser, "a,b|c,d|e,f", 4, TUPLE_DELIM, 1, 3); Validate(&escape_parser, "a,@|c|e,f", 6, TUPLE_DELIM, 0, 1); @@ -148,20 +148,14 @@ TEST(DelimitedTextParser, Fields) { TEST(DelimitedTextParser, SpecialDelimiters) { const char TUPLE_DELIM = '\n'; // implies '\r' and "\r\n" are also delimiters - const char NUL_DELIM = '\0'; const int NUM_COLS = 1; bool is_materialized_col[NUM_COLS]; for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true; - TupleDelimitedTextParser tuple_delim_parser(NUM_COLS, 0, is_materialized_col, + DelimitedTextParser tuple_delim_parser(NUM_COLS, 0, is_materialized_col, TUPLE_DELIM); - TupleDelimitedTextParser nul_delim_parser(NUM_COLS, 0, is_materialized_col, NUL_DELIM); - - TupleDelimitedTextParser nul_field_parser(2, 0, is_materialized_col, - TUPLE_DELIM, NUL_DELIM); - // Non-SSE case Validate(&tuple_delim_parser, "A\r\nB", 3, TUPLE_DELIM, 0, 0); Validate(&tuple_delim_parser, "A\rB", 2, TUPLE_DELIM, 0, 0); @@ -171,16 +165,6 @@ TEST(DelimitedTextParser, SpecialDelimiters) { Validate(&tuple_delim_parser, "A\rB\nC\r\nD", 2, TUPLE_DELIM, 2, 2); Validate(&tuple_delim_parser, "\r\r\n\n", 1, TUPLE_DELIM, 2, 2); - // NUL tuple delimiter; no field delimiter - const string nul1("\0\0\0", 3); - const string nul2("AAA\0BBB\0", 8); - const string nul3("\n\0\r\0\r\n\0", 7); - const string nul4("\n\0\r\0\r\n", 6); - Validate(&nul_delim_parser, nul1, 1, NUL_DELIM, 2, 2); - Validate(&nul_delim_parser, nul2, 4, NUL_DELIM, 1, 1); - Validate(&nul_delim_parser, nul3, 2, NUL_DELIM, 2, 2); - Validate(&nul_delim_parser, nul4, 2, NUL_DELIM, 1, 1); - // SSE case string data = "\rAAAAAAAAAAAAAAA"; DCHECK_EQ(data.size(), SSEUtil::CHARS_PER_128_BIT_REGISTER); @@ -194,22 +178,6 @@ TEST(DelimitedTextParser, 
SpecialDelimiters) { data = "\r\nAAA\n\r\r\nAAAAAAA"; DCHECK_EQ(data.size(), SSEUtil::CHARS_PER_128_BIT_REGISTER); Validate(&tuple_delim_parser, data, 2, TUPLE_DELIM, 3, 3); - - // NUL SSE case - const string nulsse1("AAAAA\0AAAAAAAAAAA\0AAAAAAAAAAAA\0\0", 32); - const string nulsse2("AAAAA\0AAAAAAAAAAA\0AAAAAAAAAAAA\0A", 32); - const string nulsse3("AAA\0BBBbbbbbbbbbbbbbbbbbbb\0cccc,ddd\0", 36); - const string nulsse4("AAA\0BBBbbbbbbbbbbbbbbbbbbb\0cccc,dddd", 36); - Validate(&nul_delim_parser, nulsse1, 6, NUL_DELIM, 3, 3); - Validate(&nul_delim_parser, nulsse2, 6, NUL_DELIM, 2, 2); - Validate(&nul_delim_parser, nulsse3, 4, NUL_DELIM, 2, 2); - Validate(&nul_delim_parser, nulsse4, 4, NUL_DELIM, 1, 1); - - // NUL Field delimiters - const string field1("\na\0b\0c\n", 7); - const string field2("aaaa\na\0b\0c\naaaaa\0b\na\0b\0c\n", 25); - Validate(&nul_field_parser, field1, 1, TUPLE_DELIM, 1, 2); - Validate(&nul_field_parser, field2, 5, TUPLE_DELIM, 3, 6); } // TODO: expand test for other delimited text parser functions/cases. 
http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/delimited-text-parser.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/delimited-text-parser.cc b/be/src/exec/delimited-text-parser.cc index 7db65fd..18fcde1 100644 --- a/be/src/exec/delimited-text-parser.cc +++ b/be/src/exec/delimited-text-parser.cc @@ -24,8 +24,7 @@ using namespace impala; -template<bool DELIMITED_TUPLES> -DelimitedTextParser<DELIMITED_TUPLES>::DelimitedTextParser( +DelimitedTextParser::DelimitedTextParser( int num_cols, int num_partition_keys, const bool* is_materialized_col, char tuple_delim, char field_delim, char collection_item_delim, char escape_char) : is_materialized_col_(is_materialized_col), @@ -73,7 +72,7 @@ DelimitedTextParser<DELIMITED_TUPLES>::DelimitedTextParser( memset(low_mask_, 0, sizeof(low_mask_)); } - if (DELIMITED_TUPLES) { + if (tuple_delim != '\0') { search_chars[num_delims_++] = tuple_delim_; ++num_tuple_delims_; // Hive will treats \r (^M) as an alternate tuple delimiter, but \r\n is a @@ -83,12 +82,12 @@ DelimitedTextParser<DELIMITED_TUPLES>::DelimitedTextParser( ++num_tuple_delims_; } xmm_tuple_search_ = _mm_loadu_si128(reinterpret_cast<__m128i*>(search_chars)); - if (field_delim_ != tuple_delim_) search_chars[num_delims_++] = field_delim_; - } else { - search_chars[num_delims_++] = field_delim_; } - if (collection_item_delim != '\0') search_chars[num_delims_++] = collection_item_delim_; + if (field_delim != '\0' || collection_item_delim != '\0') { + search_chars[num_delims_++] = field_delim_; + search_chars[num_delims_++] = collection_item_delim_; + } DCHECK_GT(num_delims_, 0); xmm_delim_search_ = _mm_loadu_si128(reinterpret_cast<__m128i*>(search_chars)); @@ -96,30 +95,16 @@ DelimitedTextParser<DELIMITED_TUPLES>::DelimitedTextParser( ParserReset(); } -template -DelimitedTextParser<true>::DelimitedTextParser( - int num_cols, int num_partition_keys, const bool* is_materialized_col, - char 
tuple_delim, char field_delim, char collection_item_delim, char escape_char); - -template -DelimitedTextParser<false>::DelimitedTextParser( - int num_cols, int num_partition_keys, const bool* is_materialized_col, - char tuple_delim, char field_delim, char collection_item_delim, char escape_char); - -template<bool DELIMITED_TUPLES> -void DelimitedTextParser<DELIMITED_TUPLES>::ParserReset() { +void DelimitedTextParser::ParserReset() { current_column_has_escape_ = false; last_char_is_escape_ = false; last_row_delim_offset_ = -1; column_idx_ = num_partition_keys_; } -template void DelimitedTextParser<true>::ParserReset(); - // Parsing raw csv data into FieldLocation descriptors. -template<bool DELIMITED_TUPLES> -Status DelimitedTextParser<DELIMITED_TUPLES>::ParseFieldLocations(int max_tuples, - int64_t remaining_len, char** byte_buffer_ptr, char** row_end_locations, +Status DelimitedTextParser::ParseFieldLocations(int max_tuples, int64_t remaining_len, + char** byte_buffer_ptr, char** row_end_locations, FieldLocation* field_locations, int* num_tuples, int* num_fields, char** next_column_start) { // Start of this batch. @@ -148,10 +133,10 @@ Status DelimitedTextParser<DELIMITED_TUPLES>::ParseFieldLocations(int max_tuples while (remaining_len > 0) { bool new_tuple = false; bool new_col = false; - if (DELIMITED_TUPLES) unfinished_tuple_ = true; + unfinished_tuple_ = true; if (!last_char_is_escape_) { - if (DELIMITED_TUPLES && (**byte_buffer_ptr == tuple_delim_ || + if (tuple_delim_ != '\0' && (**byte_buffer_ptr == tuple_delim_ || (tuple_delim_ == '\n' && **byte_buffer_ptr == '\r'))) { new_tuple = true; new_col = true; @@ -181,7 +166,6 @@ Status DelimitedTextParser<DELIMITED_TUPLES>::ParseFieldLocations(int max_tuples row_end_locations[*num_tuples] = *byte_buffer_ptr; ++(*num_tuples); } - DCHECK(DELIMITED_TUPLES); unfinished_tuple_ = false; last_row_delim_offset_ = **byte_buffer_ptr == '\r' ? 
remaining_len - 1 : -1; if (*num_tuples == max_tuples) { @@ -201,7 +185,7 @@ Status DelimitedTextParser<DELIMITED_TUPLES>::ParseFieldLocations(int max_tuples // For formats that store the length of the row, the row is not delimited: // e.g. Sequence files. - if (!DELIMITED_TUPLES) { + if (tuple_delim_ == '\0') { DCHECK_EQ(remaining_len, 0); RETURN_IF_ERROR(AddColumn<true>(*byte_buffer_ptr - *next_column_start, next_column_start, num_fields, field_locations)); @@ -209,30 +193,18 @@ Status DelimitedTextParser<DELIMITED_TUPLES>::ParseFieldLocations(int max_tuples DCHECK(status.ok()); column_idx_ = num_partition_keys_; ++(*num_tuples); + unfinished_tuple_ = false; } return Status::OK(); } -template -Status DelimitedTextParser<true>::ParseFieldLocations(int max_tuples, - int64_t remaining_len, char** byte_buffer_ptr, char** row_end_locations, - FieldLocation* field_locations, - int* num_tuples, int* num_fields, char** next_column_start); - -template -Status DelimitedTextParser<false>::ParseFieldLocations(int max_tuples, - int64_t remaining_len, char** byte_buffer_ptr, char** row_end_locations, - FieldLocation* field_locations, - int* num_tuples, int* num_fields, char** next_column_start); - -template<bool DELIMITED_TUPLES> -int64_t DelimitedTextParser<DELIMITED_TUPLES>::FindFirstInstance(const char* buffer, - int64_t len) { +// Find the first instance of the tuple delimiter. This will find the start of the first +// full tuple in buffer by looking for the end of the previous tuple. +int64_t DelimitedTextParser::FindFirstInstance(const char* buffer, int64_t len) { int64_t tuple_start = 0; const char* buffer_start = buffer; bool found = false; - DCHECK(DELIMITED_TUPLES); // If the last char in the previous buffer was \r then either return the start of // this buffer or skip a \n at the beginning of the buffer. 
if (last_row_delim_offset_ != -1) { @@ -254,10 +226,13 @@ restart: int tuple_mask = _mm_extract_epi16(xmm_tuple_mask, 0); if (tuple_mask != 0) { found = true; - // Find first set bit (1-based) - int i = ffs(tuple_mask); - tuple_start += i; - buffer += i; + for (int i = 0; i < SSEUtil::CHARS_PER_128_BIT_REGISTER; ++i) { + if ((tuple_mask & SSEUtil::SSE_BITMASK[i]) != 0) { + tuple_start += i + 1; + buffer += i + 1; + break; + } + } break; } tuple_start += SSEUtil::CHARS_PER_128_BIT_REGISTER; @@ -320,6 +295,3 @@ restart: } return tuple_start; } - -template -int64_t DelimitedTextParser<true>::FindFirstInstance(const char* buffer, int64_t len); http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/delimited-text-parser.h ---------------------------------------------------------------------- diff --git a/be/src/exec/delimited-text-parser.h b/be/src/exec/delimited-text-parser.h index 9b89127..b966081 100644 --- a/be/src/exec/delimited-text-parser.h +++ b/be/src/exec/delimited-text-parser.h @@ -25,27 +25,22 @@ namespace impala { -template <bool DELIMITED_TUPLES> class DelimitedTextParser { public: /// The Delimited Text Parser parses text rows that are delimited by specific /// characters: - /// tuple_delim: delimits tuples. Only used if DELIMITED_TUPLES is true. + /// tuple_delim: delimits tuples /// field_delim: delimits fields /// collection_item_delim: delimits collection items /// escape_char: escape delimiters, make them part of the data. - /// - /// If the template parameter DELIMITED_TUPLES is false there is no support - /// for tuple delimiters and we do not need to search for them. Any value - /// may be passed for tuple_delim, as it is ignored. - /// + // /// 'num_cols' is the total number of columns including partition keys. - /// + // /// 'is_materialized_col' should be initialized to an array of length 'num_cols', with /// is_materialized_col[i] = <true if column i should be materialized, false otherwise> /// Owned by caller. 
- /// + // /// The main method is ParseData which fills in a vector of pointers and lengths to the /// fields. It also can handle an escape character which masks a tuple or field /// delimiter that occurs in the data. @@ -96,14 +91,14 @@ class DelimitedTextParser { /// This function is used to parse sequence file records which do not need to /// parse for tuple delimiters. Returns an error status if any column exceeds the /// size limit. See AddColumn() for details. - /// This function is disabled for non-sequence file parsing. - template <bool PROCESS_ESCAPES> + template <bool process_escapes> Status ParseSingleTuple(int64_t len, char* buffer, FieldLocation* field_locations, int* num_fields); /// FindFirstInstance returns the position after the first non-escaped tuple /// delimiter from the starting offset. /// Used to find the start of a tuple if jumping into the middle of a text file. + /// Also used to find the sync marker for Sequenced and RC files. /// If no tuple delimiter is found within the buffer, return -1; int64_t FindFirstInstance(const char* buffer, int64_t len); @@ -124,16 +119,13 @@ class DelimitedTextParser { /// by the number fields added. /// 'field_locations' will be updated with the start and length of the fields. /// Returns an error status if 'len' exceeds the size limit specified in AddColumn(). - template <bool PROCESS_ESCAPES> + template <bool process_escapes> Status FillColumns(int64_t len, char** last_column, int* num_fields, impala::FieldLocation* field_locations); /// Return true if we have not seen a tuple delimiter for the current tuple being /// parsed (i.e., the last byte read was not a tuple delimiter). - bool HasUnfinishedTuple() { - DCHECK(DELIMITED_TUPLES); - return unfinished_tuple_; - } + bool HasUnfinishedTuple() { return unfinished_tuple_; } private: /// Initialize the parser state. @@ -141,7 +133,7 @@ class DelimitedTextParser { /// Helper routine to add a column to the field_locations vector. 
/// Template parameter: - /// PROCESS_ESCAPES -- if true the the column may have escape characters + /// process_escapes -- if true the the column may have escape characters /// and the negative of the len will be stored. /// len: length of the current column. The length of a column must fit in a 32-bit /// signed integer (i.e. <= 2147483647 bytes). If a column is larger than that, @@ -152,29 +144,23 @@ class DelimitedTextParser { /// Output: /// field_locations: updated with start and length of current field. /// Return an error status if 'len' exceeds the size limit specified above. - template <bool PROCESS_ESCAPES> + template <bool process_escapes> Status AddColumn(int64_t len, char** next_column_start, int* num_fields, FieldLocation* field_locations); /// Helper routine to parse delimited text using SSE instructions. /// Identical arguments as ParseFieldLocations. - /// If the template argument, 'PROCESS_ESCAPES' is true, this function will handle + /// If the template argument, 'process_escapes' is true, this function will handle /// escapes, otherwise, it will assume the text is unescaped. By using templates, /// we can special case the un-escaped path for better performance. The unescaped /// path is optimized away by the compiler. Returns an error status if the length /// of any column exceeds the size limit. See AddColumn() for details. - template <bool PROCESS_ESCAPES> + template <bool process_escapes> Status ParseSse(int max_tuples, int64_t* remaining_len, char** byte_buffer_ptr, char** row_end_locations_, FieldLocation* field_locations, int* num_tuples, int* num_fields, char** next_column_start); - bool IsFieldOrCollectionItemDelimiter(char c) { - return (!DELIMITED_TUPLES && c == field_delim_) || - (DELIMITED_TUPLES && field_delim_ != tuple_delim_ && c == field_delim_) || - (collection_item_delim_ != '\0' && c == collection_item_delim_); - } - /// SSE(xmm) register containing the tuple search character(s). 
__m128i xmm_tuple_search_; @@ -228,7 +214,7 @@ class DelimitedTextParser { /// Character delimiting collection items (to become slots). char collection_item_delim_; - /// Character delimiting tuples. Only used if DELIMITED_TUPLES is true. + /// Character delimiting tuples. char tuple_delim_; /// Whether or not the current column has an escape character in it @@ -242,8 +228,5 @@ class DelimitedTextParser { bool unfinished_tuple_; }; -using TupleDelimitedTextParser = DelimitedTextParser<true>; -using SequenceDelimitedTextParser = DelimitedTextParser<false>; - }// namespace impala #endif// IMPALA_EXEC_DELIMITED_TEXT_PARSER_H http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/delimited-text-parser.inline.h ---------------------------------------------------------------------- diff --git a/be/src/exec/delimited-text-parser.inline.h b/be/src/exec/delimited-text-parser.inline.h index 9fe737e..02fa132 100644 --- a/be/src/exec/delimited-text-parser.inline.h +++ b/be/src/exec/delimited-text-parser.inline.h @@ -52,10 +52,9 @@ inline void ProcessEscapeMask(uint16_t escape_mask, bool* last_char_is_escape, *delim_mask &= ~escape_mask; } -template <bool DELIMITED_TUPLES> -template <bool PROCESS_ESCAPES> -inline Status DelimitedTextParser<DELIMITED_TUPLES>::AddColumn(int64_t len, - char** next_column_start, int* num_fields, FieldLocation* field_locations) { +template <bool process_escapes> +inline Status DelimitedTextParser::AddColumn(int64_t len, char** next_column_start, + int* num_fields, FieldLocation* field_locations) { if (UNLIKELY(!BitUtil::IsNonNegative32Bit(len))) { return Status(TErrorCode::TEXT_PARSER_TRUNCATED_COLUMN, len); } @@ -63,27 +62,26 @@ inline Status DelimitedTextParser<DELIMITED_TUPLES>::AddColumn(int64_t len, // Found a column that needs to be parsed, write the start/len to 'field_locations' field_locations[*num_fields].start = *next_column_start; int64_t field_len = len; - if (PROCESS_ESCAPES && current_column_has_escape_) { + if 
(process_escapes && current_column_has_escape_) { field_len = -len; } field_locations[*num_fields].len = static_cast<int32_t>(field_len); ++(*num_fields); } - if (PROCESS_ESCAPES) current_column_has_escape_ = false; + if (process_escapes) current_column_has_escape_ = false; *next_column_start += len + 1; ++column_idx_; return Status::OK(); } -template <bool DELIMITED_TUPLES> -template <bool PROCESS_ESCAPES> -inline Status DelimitedTextParser<DELIMITED_TUPLES>::FillColumns(int64_t len, - char** last_column, int* num_fields, FieldLocation* field_locations) { +template <bool process_escapes> +inline Status DelimitedTextParser::FillColumns(int64_t len, char** last_column, + int* num_fields, FieldLocation* field_locations) { // Fill in any columns missing from the end of the tuple. char* dummy = NULL; if (last_column == NULL) last_column = &dummy; while (column_idx_ < num_cols_) { - RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(len, last_column, + RETURN_IF_ERROR(AddColumn<process_escapes>(len, last_column, num_fields, field_locations)); // The rest of the columns will be null. last_column = &dummy; @@ -105,9 +103,8 @@ inline Status DelimitedTextParser<DELIMITED_TUPLES>::FillColumns(int64_t len, /// Needle = 'abcd000000000000' (we're searching for any a's, b's, c's or d's) /// Haystack = 'asdfghjklhjbdwwc' (the raw string) /// Result = '1010000000011001' -template <bool DELIMITED_TUPLES> -template <bool PROCESS_ESCAPES> -inline Status DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples, +template <bool process_escapes> +inline Status DelimitedTextParser::ParseSse(int max_tuples, int64_t* remaining_len, char** byte_buffer_ptr, char** row_end_locations, FieldLocation* field_locations, int* num_tuples, int* num_fields, char** next_column_start) { @@ -149,7 +146,7 @@ inline Status DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples, uint16_t escape_mask = 0; // If the table does not use escape characters, skip processing for it. 
- if (PROCESS_ESCAPES) { + if (process_escapes) { DCHECK(escape_char_ != '\0'); xmm_escape_mask = SSE4_cmpestrm<SSEUtil::STRCHR_MODE>(xmm_escape_search_, 1, xmm_buffer, SSEUtil::CHARS_PER_128_BIT_REGISTER); @@ -159,10 +156,8 @@ inline Status DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples, char* last_char = *byte_buffer_ptr + 15; bool last_char_is_unescaped_delim = delim_mask >> 15; - if (DELIMITED_TUPLES) { - unfinished_tuple_ = !(last_char_is_unescaped_delim && - (*last_char == tuple_delim_ || (tuple_delim_ == '\n' && *last_char == '\r'))); - } + unfinished_tuple_ = !(last_char_is_unescaped_delim && + (*last_char == tuple_delim_ || (tuple_delim_ == '\n' && *last_char == '\r'))); int last_col_idx = 0; // Process all non-zero bits in the delim_mask from lsb->msb. If a bit @@ -175,7 +170,7 @@ inline Status DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples, // clear current bit delim_mask &= ~(SSEUtil::SSE_BITMASK[n]); - if (PROCESS_ESCAPES) { + if (process_escapes) { // Determine if there was an escape character between [last_col_idx, n] bool escaped = (escape_mask & low_mask_[last_col_idx] & high_mask_[n]) != 0; current_column_has_escape_ |= escaped; @@ -184,14 +179,13 @@ inline Status DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples, char* delim_ptr = *byte_buffer_ptr + n; - if (IsFieldOrCollectionItemDelimiter(*delim_ptr)) { - RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(delim_ptr - *next_column_start, + if (*delim_ptr == field_delim_ || *delim_ptr == collection_item_delim_) { + RETURN_IF_ERROR(AddColumn<process_escapes>(delim_ptr - *next_column_start, next_column_start, num_fields, field_locations)); continue; } - if (DELIMITED_TUPLES && - (*delim_ptr == tuple_delim_ || (tuple_delim_ == '\n' && *delim_ptr == '\r'))) { + if (*delim_ptr == tuple_delim_ || (tuple_delim_ == '\n' && *delim_ptr == '\r')) { if (UNLIKELY( last_row_delim_offset_ == *remaining_len - n && *delim_ptr == '\n')) { // If the row ended in \r\n then move 
the next start past the \n @@ -199,7 +193,7 @@ inline Status DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples, last_row_delim_offset_ = -1; continue; } - RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(delim_ptr - *next_column_start, + RETURN_IF_ERROR(AddColumn<process_escapes>(delim_ptr - *next_column_start, next_column_start, num_fields, field_locations)); Status status = FillColumns<false>(0, NULL, num_fields, field_locations); DCHECK(status.ok()); @@ -210,7 +204,7 @@ inline Status DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples, last_row_delim_offset_ = *delim_ptr == '\r' ? *remaining_len - n - 1 : -1; if (UNLIKELY(*num_tuples == max_tuples)) { (*byte_buffer_ptr) += (n + 1); - if (PROCESS_ESCAPES) last_char_is_escape_ = false; + if (process_escapes) last_char_is_escape_ = false; *remaining_len -= (n + 1); // If the last character we processed was \r then set the offset to 0 // so that we will use it at the beginning of the next batch. @@ -220,7 +214,7 @@ inline Status DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples, } } - if (PROCESS_ESCAPES) { + if (process_escapes) { // Determine if there was an escape character between (last_col_idx, 15) bool unprocessed_escape = escape_mask & low_mask_[last_col_idx] & high_mask_[15]; current_column_has_escape_ |= unprocessed_escape; @@ -233,10 +227,9 @@ inline Status DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples, } /// Simplified version of ParseSSE which does not handle tuple delimiters. 
-template<> -template <bool PROCESS_ESCAPES> -inline Status DelimitedTextParser<false>::ParseSingleTuple(int64_t remaining_len, - char* buffer, FieldLocation* field_locations, int* num_fields) { +template <bool process_escapes> +inline Status DelimitedTextParser::ParseSingleTuple(int64_t remaining_len, char* buffer, + FieldLocation* field_locations, int* num_fields) { char* next_column_start = buffer; __m128i xmm_buffer, xmm_delim_mask, xmm_escape_mask; @@ -253,7 +246,7 @@ inline Status DelimitedTextParser<false>::ParseSingleTuple(int64_t remaining_len uint16_t escape_mask = 0; // If the table does not use escape characters, skip processing for it. - if (PROCESS_ESCAPES) { + if (process_escapes) { DCHECK(escape_char_ != '\0'); xmm_escape_mask = SSE4_cmpestrm<SSEUtil::STRCHR_MODE>(xmm_escape_search_, 1, xmm_buffer, SSEUtil::CHARS_PER_128_BIT_REGISTER); @@ -270,7 +263,7 @@ inline Status DelimitedTextParser<false>::ParseSingleTuple(int64_t remaining_len DCHECK_GE(n, 0); DCHECK_LT(n, 16); - if (PROCESS_ESCAPES) { + if (process_escapes) { // Determine if there was an escape character between [last_col_idx, n] bool escaped = (escape_mask & low_mask_[last_col_idx] & high_mask_[n]) != 0; current_column_has_escape_ |= escaped; @@ -280,11 +273,11 @@ inline Status DelimitedTextParser<false>::ParseSingleTuple(int64_t remaining_len // clear current bit delim_mask &= ~(SSEUtil::SSE_BITMASK[n]); - RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(buffer + n - next_column_start, + RETURN_IF_ERROR(AddColumn<process_escapes>(buffer + n - next_column_start, &next_column_start, num_fields, field_locations)); } - if (PROCESS_ESCAPES) { + if (process_escapes) { // Determine if there was an escape character between (last_col_idx, 15) bool unprocessed_escape = escape_mask & low_mask_[last_col_idx] & high_mask_[15]; current_column_has_escape_ |= unprocessed_escape; @@ -303,8 +296,9 @@ inline Status DelimitedTextParser<false>::ParseSingleTuple(int64_t remaining_len last_char_is_escape_ = false; } 
- if (!last_char_is_escape_ && IsFieldOrCollectionItemDelimiter(*buffer)) { - RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(buffer - next_column_start, + if (!last_char_is_escape_ && + (*buffer == field_delim_ || *buffer == collection_item_delim_)) { + RETURN_IF_ERROR(AddColumn<process_escapes>(buffer - next_column_start, &next_column_start, num_fields, field_locations)); } @@ -314,7 +308,7 @@ inline Status DelimitedTextParser<false>::ParseSingleTuple(int64_t remaining_len // Last column does not have a delimiter after it. Add that column and also // pad with empty cols if the input is ragged. - return FillColumns<PROCESS_ESCAPES>(buffer - next_column_start, + return FillColumns<process_escapes>(buffer - next_column_start, &next_column_start, num_fields, field_locations); } http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/hdfs-sequence-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-sequence-scanner.cc b/be/src/exec/hdfs-sequence-scanner.cc index 8a9151e..346a18a 100644 --- a/be/src/exec/hdfs-sequence-scanner.cc +++ b/be/src/exec/hdfs-sequence-scanner.cc @@ -73,7 +73,7 @@ Status HdfsSequenceScanner::InitNewRange() { text_converter_.reset(new TextConverter(hdfs_partition->escape_char(), scan_node_->hdfs_table()->null_column_value())); - delimited_text_parser_.reset(new SequenceDelimitedTextParser( + delimited_text_parser_.reset(new DelimitedTextParser( scan_node_->hdfs_table()->num_cols(), scan_node_->num_partition_keys(), scan_node_->is_materialized_col(), '\0', hdfs_partition->field_delim(), hdfs_partition->collection_delim(), hdfs_partition->escape_char())); http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/hdfs-sequence-scanner.h ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-sequence-scanner.h b/be/src/exec/hdfs-sequence-scanner.h index 463ffc7..4845edb 100644 --- a/be/src/exec/hdfs-sequence-scanner.h 
+++ b/be/src/exec/hdfs-sequence-scanner.h @@ -153,7 +153,6 @@ namespace impala { -template <bool> class DelimitedTextParser; class HdfsSequenceScanner : public BaseSequenceScanner { @@ -223,7 +222,7 @@ class HdfsSequenceScanner : public BaseSequenceScanner { Status GetRecord(uint8_t** record_ptr, int64_t* record_len) WARN_UNUSED_RESULT; /// Helper class for picking fields and rows from delimited text. - boost::scoped_ptr<DelimitedTextParser<false>> delimited_text_parser_; + boost::scoped_ptr<DelimitedTextParser> delimited_text_parser_; std::vector<FieldLocation> field_locations_; /// Data that is fixed across headers. This struct is shared between scan ranges. http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/hdfs-text-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc index b78115d..253bcc8 100644 --- a/be/src/exec/hdfs-text-scanner.cc +++ b/be/src/exec/hdfs-text-scanner.cc @@ -203,7 +203,7 @@ Status HdfsTextScanner::InitNewRange() { collection_delim = '\0'; } - delimited_text_parser_.reset(new TupleDelimitedTextParser( + delimited_text_parser_.reset(new DelimitedTextParser( scan_node_->hdfs_table()->num_cols(), scan_node_->num_partition_keys(), scan_node_->is_materialized_col(), hdfs_partition->line_delim(), field_delim, collection_delim, hdfs_partition->escape_char())); http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/hdfs-text-scanner.h ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-text-scanner.h b/be/src/exec/hdfs-text-scanner.h index 25886ba..610c612 100644 --- a/be/src/exec/hdfs-text-scanner.h +++ b/be/src/exec/hdfs-text-scanner.h @@ -25,7 +25,6 @@ namespace impala { -template<bool> class DelimitedTextParser; class ScannerContext; struct HdfsFileDesc; @@ -238,7 +237,7 @@ class HdfsTextScanner : public HdfsScanner { int slot_idx_; /// Helper class for 
picking fields and rows from delimited text. - boost::scoped_ptr<DelimitedTextParser<true>> delimited_text_parser_; + boost::scoped_ptr<DelimitedTextParser> delimited_text_parser_; /// Return field locations from the Delimited Text Parser. std::vector<FieldLocation> field_locations_;
