Revert "IMPALA-6389: Make '\0' delimited text files work"

This reverts commit c2bdaf8af4cf35d3462595c2a341ed84dcf5d960.

An ASAN issue and potentially other problems have been found;
reverting to unbreak the build and tests.

Change-Id: If581311033de8c26e33316b19192c4579594f261
Reviewed-on: http://gerrit.cloudera.org:8080/9851
Reviewed-by: Lars Volker <[email protected]>
Tested-by: Zach Amsden <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/b78daedf
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/b78daedf
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/b78daedf

Branch: refs/heads/master
Commit: b78daedf52bdaf5028bae90f157d37b6c5bea2c6
Parents: 2194dfd
Author: Zach Amsden <[email protected]>
Authored: Thu Mar 29 03:51:41 2018 +0000
Committer: Zach Amsden <[email protected]>
Committed: Thu Mar 29 04:59:48 2018 +0000

----------------------------------------------------------------------
 be/src/exec/delimited-text-parser-test.cc  | 56 ++++---------------
 be/src/exec/delimited-text-parser.cc       | 74 ++++++++-----------------
 be/src/exec/delimited-text-parser.h        | 43 +++++---------
 be/src/exec/delimited-text-parser.inline.h | 70 +++++++++++------------
 be/src/exec/hdfs-sequence-scanner.cc       |  2 +-
 be/src/exec/hdfs-sequence-scanner.h        |  3 +-
 be/src/exec/hdfs-text-scanner.cc           |  2 +-
 be/src/exec/hdfs-text-scanner.h            |  3 +-
 8 files changed, 84 insertions(+), 169 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/delimited-text-parser-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/delimited-text-parser-test.cc 
b/be/src/exec/delimited-text-parser-test.cc
index d8e977d..3156b36 100644
--- a/be/src/exec/delimited-text-parser-test.cc
+++ b/be/src/exec/delimited-text-parser-test.cc
@@ -24,7 +24,7 @@
 
 namespace impala {
 
-void Validate(TupleDelimitedTextParser* parser, const string& data,
+void Validate(DelimitedTextParser* parser, const string& data,
     int expected_offset, char tuple_delim, int expected_num_tuples,
     int expected_num_fields) {
   parser->ParserReset();
@@ -72,8 +72,8 @@ TEST(DelimitedTextParser, Basic) {
   bool is_materialized_col[NUM_COLS];
   for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true;
 
-  TupleDelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
-                                            TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM);
+  DelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
+                                       TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM);
   // Note that only complete tuples "count"
   Validate(&no_escape_parser, "no_delims", -1, TUPLE_DELIM, 0, 0);
   Validate(&no_escape_parser, "abc||abc", 4, TUPLE_DELIM, 1, 1);
@@ -81,9 +81,9 @@ TEST(DelimitedTextParser, Basic) {
   Validate(&no_escape_parser, "a|bcd", 2, TUPLE_DELIM, 0, 0);
 
   // Test with escape char
-  TupleDelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
-                                         TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM,
-                                         ESCAPE_CHAR);
+  DelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
+                                    TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM,
+                                    ESCAPE_CHAR);
   Validate(&escape_parser, "a@|a|bcd", 5, TUPLE_DELIM, 0, 0);
   Validate(&escape_parser, "a@@|a|bcd", 4, TUPLE_DELIM, 1, 1);
   Validate(&escape_parser, "a@@@|a|bcd", 7, TUPLE_DELIM, 0, 0);
@@ -127,8 +127,8 @@ TEST(DelimitedTextParser, Fields) {
   bool is_materialized_col[NUM_COLS];
   for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true;
 
-  TupleDelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
-                                            TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM);
+  DelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
+                                       TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM);
 
   Validate(&no_escape_parser, "a,b|c,d|e,f", 4, TUPLE_DELIM, 1, 3);
   Validate(&no_escape_parser, "b|c,d|e,f", 2, TUPLE_DELIM, 1, 3);
@@ -137,9 +137,9 @@ TEST(DelimitedTextParser, Fields) {
   const string str10("a,\0|c,d|e", 9);
   Validate(&no_escape_parser, str10, 4, TUPLE_DELIM, 1, 2);
 
-  TupleDelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
-                                         TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM,
-                                         ESCAPE_CHAR);
+  DelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
+                                    TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM,
+                                    ESCAPE_CHAR);
 
   Validate(&escape_parser, "a,b|c,d|e,f", 4, TUPLE_DELIM, 1, 3);
   Validate(&escape_parser, "a,@|c|e,f", 6, TUPLE_DELIM, 0, 1);
@@ -148,20 +148,14 @@ TEST(DelimitedTextParser, Fields) {
 
 TEST(DelimitedTextParser, SpecialDelimiters) {
   const char TUPLE_DELIM = '\n'; // implies '\r' and "\r\n" are also delimiters
-  const char NUL_DELIM = '\0';
   const int NUM_COLS = 1;
 
   bool is_materialized_col[NUM_COLS];
   for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true;
 
-  TupleDelimitedTextParser tuple_delim_parser(NUM_COLS, 0, is_materialized_col,
+  DelimitedTextParser tuple_delim_parser(NUM_COLS, 0, is_materialized_col,
       TUPLE_DELIM);
 
-  TupleDelimitedTextParser nul_delim_parser(NUM_COLS, 0, is_materialized_col, 
NUL_DELIM);
-
-  TupleDelimitedTextParser nul_field_parser(2, 0, is_materialized_col,
-                                            TUPLE_DELIM, NUL_DELIM);
-
   // Non-SSE case
   Validate(&tuple_delim_parser, "A\r\nB", 3, TUPLE_DELIM, 0, 0);
   Validate(&tuple_delim_parser, "A\rB", 2, TUPLE_DELIM, 0, 0);
@@ -171,16 +165,6 @@ TEST(DelimitedTextParser, SpecialDelimiters) {
   Validate(&tuple_delim_parser, "A\rB\nC\r\nD", 2, TUPLE_DELIM, 2, 2);
   Validate(&tuple_delim_parser, "\r\r\n\n", 1, TUPLE_DELIM, 2, 2);
 
-  // NUL tuple delimiter; no field delimiter
-  const string nul1("\0\0\0", 3);
-  const string nul2("AAA\0BBB\0", 8);
-  const string nul3("\n\0\r\0\r\n\0", 7);
-  const string nul4("\n\0\r\0\r\n", 6);
-  Validate(&nul_delim_parser, nul1, 1, NUL_DELIM, 2, 2);
-  Validate(&nul_delim_parser, nul2, 4, NUL_DELIM, 1, 1);
-  Validate(&nul_delim_parser, nul3, 2, NUL_DELIM, 2, 2);
-  Validate(&nul_delim_parser, nul4, 2, NUL_DELIM, 1, 1);
-
   // SSE case
   string data = "\rAAAAAAAAAAAAAAA";
   DCHECK_EQ(data.size(), SSEUtil::CHARS_PER_128_BIT_REGISTER);
@@ -194,22 +178,6 @@ TEST(DelimitedTextParser, SpecialDelimiters) {
   data = "\r\nAAA\n\r\r\nAAAAAAA";
   DCHECK_EQ(data.size(), SSEUtil::CHARS_PER_128_BIT_REGISTER);
   Validate(&tuple_delim_parser, data, 2, TUPLE_DELIM, 3, 3);
-
-  // NUL SSE case
-  const string nulsse1("AAAAA\0AAAAAAAAAAA\0AAAAAAAAAAAA\0\0", 32);
-  const string nulsse2("AAAAA\0AAAAAAAAAAA\0AAAAAAAAAAAA\0A", 32);
-  const string nulsse3("AAA\0BBBbbbbbbbbbbbbbbbbbbb\0cccc,ddd\0", 36);
-  const string nulsse4("AAA\0BBBbbbbbbbbbbbbbbbbbbb\0cccc,dddd", 36);
-  Validate(&nul_delim_parser, nulsse1, 6, NUL_DELIM, 3, 3);
-  Validate(&nul_delim_parser, nulsse2, 6, NUL_DELIM, 2, 2);
-  Validate(&nul_delim_parser, nulsse3, 4, NUL_DELIM, 2, 2);
-  Validate(&nul_delim_parser, nulsse4, 4, NUL_DELIM, 1, 1);
-
-  // NUL Field delimiters
-  const string field1("\na\0b\0c\n", 7);
-  const string field2("aaaa\na\0b\0c\naaaaa\0b\na\0b\0c\n", 25);
-  Validate(&nul_field_parser, field1, 1, TUPLE_DELIM, 1, 2);
-  Validate(&nul_field_parser, field2, 5, TUPLE_DELIM, 3, 6);
 }
 
 // TODO: expand test for other delimited text parser functions/cases.

http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/delimited-text-parser.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/delimited-text-parser.cc 
b/be/src/exec/delimited-text-parser.cc
index 7db65fd..18fcde1 100644
--- a/be/src/exec/delimited-text-parser.cc
+++ b/be/src/exec/delimited-text-parser.cc
@@ -24,8 +24,7 @@
 
 using namespace impala;
 
-template<bool DELIMITED_TUPLES>
-DelimitedTextParser<DELIMITED_TUPLES>::DelimitedTextParser(
+DelimitedTextParser::DelimitedTextParser(
     int num_cols, int num_partition_keys, const bool* is_materialized_col,
     char tuple_delim, char field_delim, char collection_item_delim, char 
escape_char)
     : is_materialized_col_(is_materialized_col),
@@ -73,7 +72,7 @@ DelimitedTextParser<DELIMITED_TUPLES>::DelimitedTextParser(
     memset(low_mask_, 0, sizeof(low_mask_));
   }
 
-  if (DELIMITED_TUPLES) {
+  if (tuple_delim != '\0') {
     search_chars[num_delims_++] = tuple_delim_;
     ++num_tuple_delims_;
     // Hive will treats \r (^M) as an alternate tuple delimiter, but \r\n is a
@@ -83,12 +82,12 @@ DelimitedTextParser<DELIMITED_TUPLES>::DelimitedTextParser(
       ++num_tuple_delims_;
     }
     xmm_tuple_search_ = 
_mm_loadu_si128(reinterpret_cast<__m128i*>(search_chars));
-    if (field_delim_ != tuple_delim_) search_chars[num_delims_++] = 
field_delim_;
-  } else {
-    search_chars[num_delims_++] = field_delim_;
   }
 
-  if (collection_item_delim != '\0') search_chars[num_delims_++] = 
collection_item_delim_;
+  if (field_delim != '\0' || collection_item_delim != '\0') {
+    search_chars[num_delims_++] = field_delim_;
+    search_chars[num_delims_++] = collection_item_delim_;
+  }
 
   DCHECK_GT(num_delims_, 0);
   xmm_delim_search_ = 
_mm_loadu_si128(reinterpret_cast<__m128i*>(search_chars));
@@ -96,30 +95,16 @@ DelimitedTextParser<DELIMITED_TUPLES>::DelimitedTextParser(
   ParserReset();
 }
 
-template
-DelimitedTextParser<true>::DelimitedTextParser(
-    int num_cols, int num_partition_keys, const bool* is_materialized_col,
-    char tuple_delim, char field_delim, char collection_item_delim, char 
escape_char);
-
-template
-DelimitedTextParser<false>::DelimitedTextParser(
-    int num_cols, int num_partition_keys, const bool* is_materialized_col,
-    char tuple_delim, char field_delim, char collection_item_delim, char 
escape_char);
-
-template<bool DELIMITED_TUPLES>
-void DelimitedTextParser<DELIMITED_TUPLES>::ParserReset() {
+void DelimitedTextParser::ParserReset() {
   current_column_has_escape_ = false;
   last_char_is_escape_ = false;
   last_row_delim_offset_ = -1;
   column_idx_ = num_partition_keys_;
 }
 
-template void DelimitedTextParser<true>::ParserReset();
-
 // Parsing raw csv data into FieldLocation descriptors.
-template<bool DELIMITED_TUPLES>
-Status DelimitedTextParser<DELIMITED_TUPLES>::ParseFieldLocations(int 
max_tuples,
-    int64_t remaining_len, char** byte_buffer_ptr, char** row_end_locations,
+Status DelimitedTextParser::ParseFieldLocations(int max_tuples, int64_t 
remaining_len,
+    char** byte_buffer_ptr, char** row_end_locations,
     FieldLocation* field_locations,
     int* num_tuples, int* num_fields, char** next_column_start) {
   // Start of this batch.
@@ -148,10 +133,10 @@ Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseFieldLocations(int max_tuples
   while (remaining_len > 0) {
     bool new_tuple = false;
     bool new_col = false;
-    if (DELIMITED_TUPLES) unfinished_tuple_ = true;
+    unfinished_tuple_ = true;
 
     if (!last_char_is_escape_) {
-      if (DELIMITED_TUPLES && (**byte_buffer_ptr == tuple_delim_ ||
+      if (tuple_delim_ != '\0' && (**byte_buffer_ptr == tuple_delim_ ||
            (tuple_delim_ == '\n' && **byte_buffer_ptr == '\r'))) {
         new_tuple = true;
         new_col = true;
@@ -181,7 +166,6 @@ Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseFieldLocations(int max_tuples
         row_end_locations[*num_tuples] = *byte_buffer_ptr;
         ++(*num_tuples);
       }
-      DCHECK(DELIMITED_TUPLES);
       unfinished_tuple_ = false;
       last_row_delim_offset_ = **byte_buffer_ptr == '\r' ? remaining_len - 1 : 
-1;
       if (*num_tuples == max_tuples) {
@@ -201,7 +185,7 @@ Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseFieldLocations(int max_tuples
 
   // For formats that store the length of the row, the row is not delimited:
   // e.g. Sequence files.
-  if (!DELIMITED_TUPLES) {
+  if (tuple_delim_ == '\0') {
     DCHECK_EQ(remaining_len, 0);
     RETURN_IF_ERROR(AddColumn<true>(*byte_buffer_ptr - *next_column_start,
         next_column_start, num_fields, field_locations));
@@ -209,30 +193,18 @@ Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseFieldLocations(int max_tuples
     DCHECK(status.ok());
     column_idx_ = num_partition_keys_;
     ++(*num_tuples);
+    unfinished_tuple_ = false;
   }
   return Status::OK();
 }
 
-template
-Status DelimitedTextParser<true>::ParseFieldLocations(int max_tuples,
-    int64_t remaining_len, char** byte_buffer_ptr, char** row_end_locations,
-    FieldLocation* field_locations,
-    int* num_tuples, int* num_fields, char** next_column_start);
-
-template
-Status DelimitedTextParser<false>::ParseFieldLocations(int max_tuples,
-    int64_t remaining_len, char** byte_buffer_ptr, char** row_end_locations,
-    FieldLocation* field_locations,
-    int* num_tuples, int* num_fields, char** next_column_start);
-
-template<bool DELIMITED_TUPLES>
-int64_t DelimitedTextParser<DELIMITED_TUPLES>::FindFirstInstance(const char* 
buffer,
-    int64_t len) {
+// Find the first instance of the tuple delimiter. This will find the start of 
the first
+// full tuple in buffer by looking for the end of the previous tuple.
+int64_t DelimitedTextParser::FindFirstInstance(const char* buffer, int64_t 
len) {
   int64_t tuple_start = 0;
   const char* buffer_start = buffer;
   bool found = false;
 
-  DCHECK(DELIMITED_TUPLES);
   // If the last char in the previous buffer was \r then either return the 
start of
   // this buffer or skip a \n at the beginning of the buffer.
   if (last_row_delim_offset_ != -1) {
@@ -254,10 +226,13 @@ restart:
       int tuple_mask = _mm_extract_epi16(xmm_tuple_mask, 0);
       if (tuple_mask != 0) {
         found = true;
-        // Find first set bit (1-based)
-        int i = ffs(tuple_mask);
-        tuple_start += i;
-        buffer += i;
+        for (int i = 0; i < SSEUtil::CHARS_PER_128_BIT_REGISTER; ++i) {
+          if ((tuple_mask & SSEUtil::SSE_BITMASK[i]) != 0) {
+            tuple_start += i + 1;
+            buffer += i + 1;
+            break;
+          }
+        }
         break;
       }
       tuple_start += SSEUtil::CHARS_PER_128_BIT_REGISTER;
@@ -320,6 +295,3 @@ restart:
   }
   return tuple_start;
 }
-
-template
-int64_t DelimitedTextParser<true>::FindFirstInstance(const char* buffer, 
int64_t len);

http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/delimited-text-parser.h
----------------------------------------------------------------------
diff --git a/be/src/exec/delimited-text-parser.h 
b/be/src/exec/delimited-text-parser.h
index 9b89127..b966081 100644
--- a/be/src/exec/delimited-text-parser.h
+++ b/be/src/exec/delimited-text-parser.h
@@ -25,27 +25,22 @@
 
 namespace impala {
 
-template <bool DELIMITED_TUPLES>
 class DelimitedTextParser {
  public:
 
   /// The Delimited Text Parser parses text rows that are delimited by specific
   /// characters:
-  ///   tuple_delim: delimits tuples.  Only used if DELIMITED_TUPLES is true.
+  ///   tuple_delim: delimits tuples
   ///   field_delim: delimits fields
   ///   collection_item_delim: delimits collection items
   ///   escape_char: escape delimiters, make them part of the data.
-  ///
-  /// If the template parameter DELIMITED_TUPLES is false there is no support
-  /// for tuple delimiters and we do not need to search for them.  Any value
-  /// may be passed for tuple_delim, as it is ignored.
-  ///
+  //
   /// 'num_cols' is the total number of columns including partition keys.
-  ///
+  //
   /// 'is_materialized_col' should be initialized to an array of length 
'num_cols', with
   /// is_materialized_col[i] = <true if column i should be materialized, false 
otherwise>
   /// Owned by caller.
-  ///
+  //
   /// The main method is ParseData which fills in a vector of pointers and 
lengths to the
   /// fields.  It also can handle an escape character which masks a tuple or 
field
   /// delimiter that occurs in the data.
@@ -96,14 +91,14 @@ class DelimitedTextParser {
   /// This function is used to parse sequence file records which do not need to
   /// parse for tuple delimiters. Returns an error status if any column 
exceeds the
   /// size limit. See AddColumn() for details.
-  /// This function is disabled for non-sequence file parsing.
-  template <bool PROCESS_ESCAPES>
+  template <bool process_escapes>
   Status ParseSingleTuple(int64_t len, char* buffer, FieldLocation* 
field_locations,
       int* num_fields);
 
   /// FindFirstInstance returns the position after the first non-escaped tuple
   /// delimiter from the starting offset.
   /// Used to find the start of a tuple if jumping into the middle of a text 
file.
+  /// Also used to find the sync marker for Sequenced and RC files.
   /// If no tuple delimiter is found within the buffer, return -1;
   int64_t FindFirstInstance(const char* buffer, int64_t len);
 
@@ -124,16 +119,13 @@ class DelimitedTextParser {
   /// by the number fields added.
   /// 'field_locations' will be updated with the start and length of the 
fields.
   /// Returns an error status if 'len' exceeds the size limit specified in 
AddColumn().
-  template <bool PROCESS_ESCAPES>
+  template <bool process_escapes>
   Status FillColumns(int64_t len, char** last_column, int* num_fields,
       impala::FieldLocation* field_locations);
 
   /// Return true if we have not seen a tuple delimiter for the current tuple 
being
   /// parsed (i.e., the last byte read was not a tuple delimiter).
-  bool HasUnfinishedTuple() {
-    DCHECK(DELIMITED_TUPLES);
-    return unfinished_tuple_;
-  }
+  bool HasUnfinishedTuple() { return unfinished_tuple_; }
 
  private:
   /// Initialize the parser state.
@@ -141,7 +133,7 @@ class DelimitedTextParser {
 
   /// Helper routine to add a column to the field_locations vector.
   /// Template parameter:
-  ///   PROCESS_ESCAPES -- if true the the column may have escape characters
+  ///   process_escapes -- if true the the column may have escape characters
   ///                      and the negative of the len will be stored.
   ///   len: length of the current column. The length of a column must fit in 
a 32-bit
   ///        signed integer (i.e. <= 2147483647 bytes). If a column is larger 
than that,
@@ -152,29 +144,23 @@ class DelimitedTextParser {
   /// Output:
   ///   field_locations: updated with start and length of current field.
   /// Return an error status if 'len' exceeds the size limit specified above.
-  template <bool PROCESS_ESCAPES>
+  template <bool process_escapes>
   Status AddColumn(int64_t len, char** next_column_start, int* num_fields,
       FieldLocation* field_locations);
 
   /// Helper routine to parse delimited text using SSE instructions.
   /// Identical arguments as ParseFieldLocations.
-  /// If the template argument, 'PROCESS_ESCAPES' is true, this function will 
handle
+  /// If the template argument, 'process_escapes' is true, this function will 
handle
   /// escapes, otherwise, it will assume the text is unescaped.  By using 
templates,
   /// we can special case the un-escaped path for better performance.  The 
unescaped
   /// path is optimized away by the compiler. Returns an error status if the 
length
   /// of any column exceeds the size limit. See AddColumn() for details.
-  template <bool PROCESS_ESCAPES>
+  template <bool process_escapes>
   Status ParseSse(int max_tuples, int64_t* remaining_len,
       char** byte_buffer_ptr, char** row_end_locations_,
       FieldLocation* field_locations,
       int* num_tuples, int* num_fields, char** next_column_start);
 
-  bool IsFieldOrCollectionItemDelimiter(char c) {
-    return (!DELIMITED_TUPLES && c == field_delim_) ||
-      (DELIMITED_TUPLES && field_delim_ != tuple_delim_ && c == field_delim_) 
||
-      (collection_item_delim_ != '\0' && c == collection_item_delim_);
-  }
-
   /// SSE(xmm) register containing the tuple search character(s).
   __m128i xmm_tuple_search_;
 
@@ -228,7 +214,7 @@ class DelimitedTextParser {
   /// Character delimiting collection items (to become slots).
   char collection_item_delim_;
 
-  /// Character delimiting tuples.  Only used if DELIMITED_TUPLES is true.
+  /// Character delimiting tuples.
   char tuple_delim_;
 
   /// Whether or not the current column has an escape character in it
@@ -242,8 +228,5 @@ class DelimitedTextParser {
   bool unfinished_tuple_;
 };
 
-using TupleDelimitedTextParser = DelimitedTextParser<true>;
-using SequenceDelimitedTextParser = DelimitedTextParser<false>;
-
 }// namespace impala
 #endif// IMPALA_EXEC_DELIMITED_TEXT_PARSER_H

http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/delimited-text-parser.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/delimited-text-parser.inline.h 
b/be/src/exec/delimited-text-parser.inline.h
index 9fe737e..02fa132 100644
--- a/be/src/exec/delimited-text-parser.inline.h
+++ b/be/src/exec/delimited-text-parser.inline.h
@@ -52,10 +52,9 @@ inline void ProcessEscapeMask(uint16_t escape_mask, bool* 
last_char_is_escape,
   *delim_mask &= ~escape_mask;
 }
 
-template <bool DELIMITED_TUPLES>
-template <bool PROCESS_ESCAPES>
-inline Status DelimitedTextParser<DELIMITED_TUPLES>::AddColumn(int64_t len,
-    char** next_column_start, int* num_fields, FieldLocation* field_locations) 
{
+template <bool process_escapes>
+inline Status DelimitedTextParser::AddColumn(int64_t len, char** 
next_column_start,
+    int* num_fields, FieldLocation* field_locations) {
   if (UNLIKELY(!BitUtil::IsNonNegative32Bit(len))) {
     return Status(TErrorCode::TEXT_PARSER_TRUNCATED_COLUMN, len);
   }
@@ -63,27 +62,26 @@ inline Status 
DelimitedTextParser<DELIMITED_TUPLES>::AddColumn(int64_t len,
     // Found a column that needs to be parsed, write the start/len to 
'field_locations'
     field_locations[*num_fields].start = *next_column_start;
     int64_t field_len = len;
-    if (PROCESS_ESCAPES && current_column_has_escape_) {
+    if (process_escapes && current_column_has_escape_) {
       field_len = -len;
     }
     field_locations[*num_fields].len = static_cast<int32_t>(field_len);
     ++(*num_fields);
   }
-  if (PROCESS_ESCAPES) current_column_has_escape_ = false;
+  if (process_escapes) current_column_has_escape_ = false;
   *next_column_start += len + 1;
   ++column_idx_;
   return Status::OK();
 }
 
-template <bool DELIMITED_TUPLES>
-template <bool PROCESS_ESCAPES>
-inline Status DelimitedTextParser<DELIMITED_TUPLES>::FillColumns(int64_t len,
-    char** last_column, int* num_fields, FieldLocation* field_locations) {
+template <bool process_escapes>
+inline Status DelimitedTextParser::FillColumns(int64_t len, char** last_column,
+    int* num_fields, FieldLocation* field_locations) {
   // Fill in any columns missing from the end of the tuple.
   char* dummy = NULL;
   if (last_column == NULL) last_column = &dummy;
   while (column_idx_ < num_cols_) {
-    RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(len, last_column,
+    RETURN_IF_ERROR(AddColumn<process_escapes>(len, last_column,
         num_fields, field_locations));
     // The rest of the columns will be null.
     last_column = &dummy;
@@ -105,9 +103,8 @@ inline Status 
DelimitedTextParser<DELIMITED_TUPLES>::FillColumns(int64_t len,
 ///  Needle   = 'abcd000000000000' (we're searching for any a's, b's, c's or 
d's)
 ///  Haystack = 'asdfghjklhjbdwwc' (the raw string)
 ///  Result   = '1010000000011001'
-template <bool DELIMITED_TUPLES>
-template <bool PROCESS_ESCAPES>
-inline Status DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples,
+template <bool process_escapes>
+inline Status DelimitedTextParser::ParseSse(int max_tuples,
     int64_t* remaining_len, char** byte_buffer_ptr,
     char** row_end_locations, FieldLocation* field_locations,
     int* num_tuples, int* num_fields, char** next_column_start) {
@@ -149,7 +146,7 @@ inline Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples,
 
     uint16_t escape_mask = 0;
     // If the table does not use escape characters, skip processing for it.
-    if (PROCESS_ESCAPES) {
+    if (process_escapes) {
       DCHECK(escape_char_ != '\0');
       xmm_escape_mask = 
SSE4_cmpestrm<SSEUtil::STRCHR_MODE>(xmm_escape_search_, 1,
           xmm_buffer, SSEUtil::CHARS_PER_128_BIT_REGISTER);
@@ -159,10 +156,8 @@ inline Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples,
 
     char* last_char = *byte_buffer_ptr + 15;
     bool last_char_is_unescaped_delim = delim_mask >> 15;
-    if (DELIMITED_TUPLES) {
-      unfinished_tuple_ = !(last_char_is_unescaped_delim &&
-          (*last_char == tuple_delim_ || (tuple_delim_ == '\n' && *last_char 
== '\r')));
-    }
+    unfinished_tuple_ = !(last_char_is_unescaped_delim &&
+        (*last_char == tuple_delim_ || (tuple_delim_ == '\n' && *last_char == 
'\r')));
 
     int last_col_idx = 0;
     // Process all non-zero bits in the delim_mask from lsb->msb.  If a bit
@@ -175,7 +170,7 @@ inline Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples,
       // clear current bit
       delim_mask &= ~(SSEUtil::SSE_BITMASK[n]);
 
-      if (PROCESS_ESCAPES) {
+      if (process_escapes) {
         // Determine if there was an escape character between [last_col_idx, n]
         bool escaped = (escape_mask & low_mask_[last_col_idx] & high_mask_[n]) 
!= 0;
         current_column_has_escape_ |= escaped;
@@ -184,14 +179,13 @@ inline Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples,
 
       char* delim_ptr = *byte_buffer_ptr + n;
 
-      if (IsFieldOrCollectionItemDelimiter(*delim_ptr)) {
-        RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(delim_ptr - 
*next_column_start,
+      if (*delim_ptr == field_delim_ || *delim_ptr == collection_item_delim_) {
+        RETURN_IF_ERROR(AddColumn<process_escapes>(delim_ptr - 
*next_column_start,
             next_column_start, num_fields, field_locations));
         continue;
       }
 
-      if (DELIMITED_TUPLES &&
-          (*delim_ptr == tuple_delim_ || (tuple_delim_ == '\n' && *delim_ptr 
== '\r'))) {
+      if (*delim_ptr == tuple_delim_ || (tuple_delim_ == '\n' && *delim_ptr == 
'\r')) {
         if (UNLIKELY(
                 last_row_delim_offset_ == *remaining_len - n && *delim_ptr == 
'\n')) {
           // If the row ended in \r\n then move the next start past the \n
@@ -199,7 +193,7 @@ inline Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples,
           last_row_delim_offset_ = -1;
           continue;
         }
-        RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(delim_ptr - 
*next_column_start,
+        RETURN_IF_ERROR(AddColumn<process_escapes>(delim_ptr - 
*next_column_start,
             next_column_start, num_fields, field_locations));
         Status status = FillColumns<false>(0, NULL, num_fields, 
field_locations);
         DCHECK(status.ok());
@@ -210,7 +204,7 @@ inline Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples,
         last_row_delim_offset_ = *delim_ptr == '\r' ? *remaining_len - n - 1 : 
-1;
         if (UNLIKELY(*num_tuples == max_tuples)) {
           (*byte_buffer_ptr) += (n + 1);
-          if (PROCESS_ESCAPES) last_char_is_escape_ = false;
+          if (process_escapes) last_char_is_escape_ = false;
           *remaining_len -= (n + 1);
           // If the last character we processed was \r then set the offset to 0
           // so that we will use it at the beginning of the next batch.
@@ -220,7 +214,7 @@ inline Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples,
       }
     }
 
-    if (PROCESS_ESCAPES) {
+    if (process_escapes) {
       // Determine if there was an escape character between (last_col_idx, 15)
       bool unprocessed_escape = escape_mask & low_mask_[last_col_idx] & 
high_mask_[15];
       current_column_has_escape_ |= unprocessed_escape;
@@ -233,10 +227,9 @@ inline Status 
DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples,
 }
 
 /// Simplified version of ParseSSE which does not handle tuple delimiters.
-template<>
-template <bool PROCESS_ESCAPES>
-inline Status DelimitedTextParser<false>::ParseSingleTuple(int64_t 
remaining_len,
-    char* buffer, FieldLocation* field_locations, int* num_fields) {
+template <bool process_escapes>
+inline Status DelimitedTextParser::ParseSingleTuple(int64_t remaining_len, 
char* buffer,
+    FieldLocation* field_locations, int* num_fields) {
   char* next_column_start = buffer;
   __m128i xmm_buffer, xmm_delim_mask, xmm_escape_mask;
 
@@ -253,7 +246,7 @@ inline Status 
DelimitedTextParser<false>::ParseSingleTuple(int64_t remaining_len
 
       uint16_t escape_mask = 0;
       // If the table does not use escape characters, skip processing for it.
-      if (PROCESS_ESCAPES) {
+      if (process_escapes) {
         DCHECK(escape_char_ != '\0');
         xmm_escape_mask = 
SSE4_cmpestrm<SSEUtil::STRCHR_MODE>(xmm_escape_search_, 1,
             xmm_buffer, SSEUtil::CHARS_PER_128_BIT_REGISTER);
@@ -270,7 +263,7 @@ inline Status 
DelimitedTextParser<false>::ParseSingleTuple(int64_t remaining_len
         DCHECK_GE(n, 0);
         DCHECK_LT(n, 16);
 
-        if (PROCESS_ESCAPES) {
+        if (process_escapes) {
           // Determine if there was an escape character between [last_col_idx, 
n]
           bool escaped = (escape_mask & low_mask_[last_col_idx] & 
high_mask_[n]) != 0;
           current_column_has_escape_ |= escaped;
@@ -280,11 +273,11 @@ inline Status 
DelimitedTextParser<false>::ParseSingleTuple(int64_t remaining_len
         // clear current bit
         delim_mask &= ~(SSEUtil::SSE_BITMASK[n]);
 
-        RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(buffer + n - 
next_column_start,
+        RETURN_IF_ERROR(AddColumn<process_escapes>(buffer + n - 
next_column_start,
             &next_column_start, num_fields, field_locations));
       }
 
-      if (PROCESS_ESCAPES) {
+      if (process_escapes) {
         // Determine if there was an escape character between (last_col_idx, 
15)
         bool unprocessed_escape = escape_mask & low_mask_[last_col_idx] & 
high_mask_[15];
         current_column_has_escape_ |= unprocessed_escape;
@@ -303,8 +296,9 @@ inline Status 
DelimitedTextParser<false>::ParseSingleTuple(int64_t remaining_len
       last_char_is_escape_ = false;
     }
 
-    if (!last_char_is_escape_ && IsFieldOrCollectionItemDelimiter(*buffer)) {
-      RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(buffer - next_column_start,
+    if (!last_char_is_escape_ &&
+          (*buffer == field_delim_ || *buffer == collection_item_delim_)) {
+      RETURN_IF_ERROR(AddColumn<process_escapes>(buffer - next_column_start,
           &next_column_start, num_fields, field_locations));
     }
 
@@ -314,7 +308,7 @@ inline Status 
DelimitedTextParser<false>::ParseSingleTuple(int64_t remaining_len
 
   // Last column does not have a delimiter after it.  Add that column and also
   // pad with empty cols if the input is ragged.
-  return FillColumns<PROCESS_ESCAPES>(buffer - next_column_start,
+  return FillColumns<process_escapes>(buffer - next_column_start,
       &next_column_start, num_fields, field_locations);
 }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/hdfs-sequence-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-sequence-scanner.cc 
b/be/src/exec/hdfs-sequence-scanner.cc
index 8a9151e..346a18a 100644
--- a/be/src/exec/hdfs-sequence-scanner.cc
+++ b/be/src/exec/hdfs-sequence-scanner.cc
@@ -73,7 +73,7 @@ Status HdfsSequenceScanner::InitNewRange() {
   text_converter_.reset(new TextConverter(hdfs_partition->escape_char(),
       scan_node_->hdfs_table()->null_column_value()));
 
-  delimited_text_parser_.reset(new SequenceDelimitedTextParser(
+  delimited_text_parser_.reset(new DelimitedTextParser(
       scan_node_->hdfs_table()->num_cols(), scan_node_->num_partition_keys(),
       scan_node_->is_materialized_col(), '\0', hdfs_partition->field_delim(),
       hdfs_partition->collection_delim(), hdfs_partition->escape_char()));

http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/hdfs-sequence-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-sequence-scanner.h 
b/be/src/exec/hdfs-sequence-scanner.h
index 463ffc7..4845edb 100644
--- a/be/src/exec/hdfs-sequence-scanner.h
+++ b/be/src/exec/hdfs-sequence-scanner.h
@@ -153,7 +153,6 @@
 
 namespace impala {
 
-template <bool>
 class DelimitedTextParser;
 
 class HdfsSequenceScanner : public BaseSequenceScanner {
@@ -223,7 +222,7 @@ class HdfsSequenceScanner : public BaseSequenceScanner {
   Status GetRecord(uint8_t** record_ptr, int64_t* record_len) 
WARN_UNUSED_RESULT;
 
   /// Helper class for picking fields and rows from delimited text.
-  boost::scoped_ptr<DelimitedTextParser<false>> delimited_text_parser_;
+  boost::scoped_ptr<DelimitedTextParser> delimited_text_parser_;
   std::vector<FieldLocation> field_locations_;
 
   /// Data that is fixed across headers.  This struct is shared between scan 
ranges.

http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/hdfs-text-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc
index b78115d..253bcc8 100644
--- a/be/src/exec/hdfs-text-scanner.cc
+++ b/be/src/exec/hdfs-text-scanner.cc
@@ -203,7 +203,7 @@ Status HdfsTextScanner::InitNewRange() {
     collection_delim = '\0';
   }
 
-  delimited_text_parser_.reset(new TupleDelimitedTextParser(
+  delimited_text_parser_.reset(new DelimitedTextParser(
       scan_node_->hdfs_table()->num_cols(), scan_node_->num_partition_keys(),
       scan_node_->is_materialized_col(), hdfs_partition->line_delim(),
       field_delim, collection_delim, hdfs_partition->escape_char()));

http://git-wip-us.apache.org/repos/asf/impala/blob/b78daedf/be/src/exec/hdfs-text-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-text-scanner.h b/be/src/exec/hdfs-text-scanner.h
index 25886ba..610c612 100644
--- a/be/src/exec/hdfs-text-scanner.h
+++ b/be/src/exec/hdfs-text-scanner.h
@@ -25,7 +25,6 @@
 
 namespace impala {
 
-template<bool>
 class DelimitedTextParser;
 class ScannerContext;
 struct HdfsFileDesc;
@@ -238,7 +237,7 @@ class HdfsTextScanner : public HdfsScanner {
   int slot_idx_;
 
   /// Helper class for picking fields and rows from delimited text.
-  boost::scoped_ptr<DelimitedTextParser<true>> delimited_text_parser_;
+  boost::scoped_ptr<DelimitedTextParser> delimited_text_parser_;
 
   /// Return field locations from the Delimited Text Parser.
   std::vector<FieldLocation> field_locations_;

Reply via email to