IMPALA-6389: Make '\0' delimited text files work

Initially I didn't want to fully implement this, as the metadata
for these tables can't even be fully stored in Postgres; however
after digging into some older documentation, it appears that the
ASCII NUL character actually has been used as a field separator
in various vendors CSV implementation.

Therefore, this patch attempts to make things as non-broken as
possible and allows \0 as a field or tuple delimiter.  Collection
column delimiters are not allowed to be \0, as they genuinly may
not exist and we don't want to force special escaping on an
arbitrary character.  Note that the field delimiter must be distinct
from the tuple delimiter when they both exist; if it is not, the
effect will be that there is no field delimiter (this is actually
possible with single column tables).

Testing: Created a zero delimited table as described in the JIRA,
using MySQL backed Hive metastore; ran select * from tab_separated
on the table, updated the unit test.  Additionally, build ASAN
and ran the unit test.

Change-Id: I2190c57681f29f34ee1eb393e30dfdda5839098c
Reviewed-on: http://gerrit.cloudera.org:8080/9857
Tested-by: Impala Public Jenkins
Reviewed-by: Zach Amsden <zams...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/380e17aa
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/380e17aa
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/380e17aa

Branch: refs/heads/master
Commit: 380e17aa3cf678d4502245f12d8a77f58f4b8996
Parents: 8e5f923
Author: Zach Amsden <zams...@cloudera.com>
Authored: Wed Mar 7 01:44:43 2018 +0000
Committer: Zach Amsden <zams...@cloudera.com>
Committed: Thu Apr 5 18:42:03 2018 +0000

----------------------------------------------------------------------
 be/src/exec/delimited-text-parser-test.cc  | 61 +++++++++++++++-----
 be/src/exec/delimited-text-parser.cc       | 74 +++++++++++++++++--------
 be/src/exec/delimited-text-parser.h        | 43 +++++++++-----
 be/src/exec/delimited-text-parser.inline.h | 70 ++++++++++++-----------
 be/src/exec/hdfs-sequence-scanner.cc       |  2 +-
 be/src/exec/hdfs-sequence-scanner.h        |  3 +-
 be/src/exec/hdfs-text-scanner.cc           |  2 +-
 be/src/exec/hdfs-text-scanner.h            |  3 +-
 8 files changed, 172 insertions(+), 86 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/380e17aa/be/src/exec/delimited-text-parser-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/delimited-text-parser-test.cc 
b/be/src/exec/delimited-text-parser-test.cc
index 3156b36..76d9444 100644
--- a/be/src/exec/delimited-text-parser-test.cc
+++ b/be/src/exec/delimited-text-parser-test.cc
@@ -24,7 +24,7 @@
 
 namespace impala {
 
-void Validate(DelimitedTextParser* parser, const string& data,
+void Validate(TupleDelimitedTextParser* parser, const string& data,
     int expected_offset, char tuple_delim, int expected_num_tuples,
     int expected_num_fields) {
   parser->ParserReset();
@@ -72,8 +72,8 @@ TEST(DelimitedTextParser, Basic) {
   bool is_materialized_col[NUM_COLS];
   for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true;
 
-  DelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
-                                       TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM);
+  TupleDelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
+                                            TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM);
   // Note that only complete tuples "count"
   Validate(&no_escape_parser, "no_delims", -1, TUPLE_DELIM, 0, 0);
   Validate(&no_escape_parser, "abc||abc", 4, TUPLE_DELIM, 1, 1);
@@ -81,9 +81,9 @@ TEST(DelimitedTextParser, Basic) {
   Validate(&no_escape_parser, "a|bcd", 2, TUPLE_DELIM, 0, 0);
 
   // Test with escape char
-  DelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
-                                    TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM,
-                                    ESCAPE_CHAR);
+  TupleDelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
+                                         TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM,
+                                         ESCAPE_CHAR);
   Validate(&escape_parser, "a@|a|bcd", 5, TUPLE_DELIM, 0, 0);
   Validate(&escape_parser, "a@@|a|bcd", 4, TUPLE_DELIM, 1, 1);
   Validate(&escape_parser, "a@@@|a|bcd", 7, TUPLE_DELIM, 0, 0);
@@ -127,8 +127,8 @@ TEST(DelimitedTextParser, Fields) {
   bool is_materialized_col[NUM_COLS];
   for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true;
 
-  DelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
-                                       TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM);
+  TupleDelimitedTextParser no_escape_parser(NUM_COLS, 0, is_materialized_col,
+                                            TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM);
 
   Validate(&no_escape_parser, "a,b|c,d|e,f", 4, TUPLE_DELIM, 1, 3);
   Validate(&no_escape_parser, "b|c,d|e,f", 2, TUPLE_DELIM, 1, 3);
@@ -137,9 +137,9 @@ TEST(DelimitedTextParser, Fields) {
   const string str10("a,\0|c,d|e", 9);
   Validate(&no_escape_parser, str10, 4, TUPLE_DELIM, 1, 2);
 
-  DelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
-                                    TUPLE_DELIM, FIELD_DELIM, COLLECTION_DELIM,
-                                    ESCAPE_CHAR);
+  TupleDelimitedTextParser escape_parser(NUM_COLS, 0, is_materialized_col,
+                                         TUPLE_DELIM, FIELD_DELIM, 
COLLECTION_DELIM,
+                                         ESCAPE_CHAR);
 
   Validate(&escape_parser, "a,b|c,d|e,f", 4, TUPLE_DELIM, 1, 3);
   Validate(&escape_parser, "a,@|c|e,f", 6, TUPLE_DELIM, 0, 1);
@@ -148,14 +148,21 @@ TEST(DelimitedTextParser, Fields) {
 
 TEST(DelimitedTextParser, SpecialDelimiters) {
   const char TUPLE_DELIM = '\n'; // implies '\r' and "\r\n" are also delimiters
+  const char NUL_DELIM = '\0';
   const int NUM_COLS = 1;
+  const int MAX_COLS = 2;
 
-  bool is_materialized_col[NUM_COLS];
-  for (int i = 0; i < NUM_COLS; ++i) is_materialized_col[i] = true;
+  bool is_materialized_col[MAX_COLS];
+  for (int i = 0; i < MAX_COLS; ++i) is_materialized_col[i] = true;
 
-  DelimitedTextParser tuple_delim_parser(NUM_COLS, 0, is_materialized_col,
+  TupleDelimitedTextParser tuple_delim_parser(NUM_COLS, 0, is_materialized_col,
       TUPLE_DELIM);
 
+  TupleDelimitedTextParser nul_tuple_parser(NUM_COLS, 0, is_materialized_col, 
NUL_DELIM);
+
+  TupleDelimitedTextParser nul_field_parser(MAX_COLS, 0, is_materialized_col,
+                                            TUPLE_DELIM, NUL_DELIM);
+
   // Non-SSE case
   Validate(&tuple_delim_parser, "A\r\nB", 3, TUPLE_DELIM, 0, 0);
   Validate(&tuple_delim_parser, "A\rB", 2, TUPLE_DELIM, 0, 0);
@@ -165,6 +172,16 @@ TEST(DelimitedTextParser, SpecialDelimiters) {
   Validate(&tuple_delim_parser, "A\rB\nC\r\nD", 2, TUPLE_DELIM, 2, 2);
   Validate(&tuple_delim_parser, "\r\r\n\n", 1, TUPLE_DELIM, 2, 2);
 
+  // NUL tuple delimiter; no field delimiter
+  const string nul1("\0\0\0", 3);
+  const string nul2("AAA\0BBB\0", 8);
+  const string nul3("\n\0\r\0\r\n\0", 7);
+  const string nul4("\n\0\r\0\r\n", 6);
+  Validate(&nul_tuple_parser, nul1, 1, NUL_DELIM, 2, 2);
+  Validate(&nul_tuple_parser, nul2, 4, NUL_DELIM, 1, 1);
+  Validate(&nul_tuple_parser, nul3, 2, NUL_DELIM, 2, 2);
+  Validate(&nul_tuple_parser, nul4, 2, NUL_DELIM, 1, 1);
+
   // SSE case
   string data = "\rAAAAAAAAAAAAAAA";
   DCHECK_EQ(data.size(), SSEUtil::CHARS_PER_128_BIT_REGISTER);
@@ -178,6 +195,22 @@ TEST(DelimitedTextParser, SpecialDelimiters) {
   data = "\r\nAAA\n\r\r\nAAAAAAA";
   DCHECK_EQ(data.size(), SSEUtil::CHARS_PER_128_BIT_REGISTER);
   Validate(&tuple_delim_parser, data, 2, TUPLE_DELIM, 3, 3);
+
+  // NUL SSE case
+  const string nulsse1("AAAAA\0AAAAAAAAAAA\0AAAAAAAAAAAA\0\0", 32);
+  const string nulsse2("AAAAA\0AAAAAAAAAAA\0AAAAAAAAAAAA\0A", 32);
+  const string nulsse3("AAA\0BBBbbbbbbbbbbbbbbbbbbb\0cccc,ddd\0", 36);
+  const string nulsse4("AAA\0BBBbbbbbbbbbbbbbbbbbbb\0cccc,dddd", 36);
+  Validate(&nul_tuple_parser, nulsse1, 6, NUL_DELIM, 3, 3);
+  Validate(&nul_tuple_parser, nulsse2, 6, NUL_DELIM, 2, 2);
+  Validate(&nul_tuple_parser, nulsse3, 4, NUL_DELIM, 2, 2);
+  Validate(&nul_tuple_parser, nulsse4, 4, NUL_DELIM, 1, 1);
+
+  // NUL Field delimiters
+  const string field1("\na\0b\0c\n", 7);
+  const string field2("aaaa\na\0b\0c\naaaaa\0b\na\0b\0c\n", 25);
+  Validate(&nul_field_parser, field1, 1, TUPLE_DELIM, 1, 2);
+  Validate(&nul_field_parser, field2, 5, TUPLE_DELIM, 3, 6);
 }
 
 // TODO: expand test for other delimited text parser functions/cases.

http://git-wip-us.apache.org/repos/asf/impala/blob/380e17aa/be/src/exec/delimited-text-parser.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/delimited-text-parser.cc 
b/be/src/exec/delimited-text-parser.cc
index 18fcde1..7db65fd 100644
--- a/be/src/exec/delimited-text-parser.cc
+++ b/be/src/exec/delimited-text-parser.cc
@@ -24,7 +24,8 @@
 
 using namespace impala;
 
-DelimitedTextParser::DelimitedTextParser(
+template<bool DELIMITED_TUPLES>
+DelimitedTextParser<DELIMITED_TUPLES>::DelimitedTextParser(
     int num_cols, int num_partition_keys, const bool* is_materialized_col,
     char tuple_delim, char field_delim, char collection_item_delim, char 
escape_char)
     : is_materialized_col_(is_materialized_col),
@@ -72,7 +73,7 @@ DelimitedTextParser::DelimitedTextParser(
     memset(low_mask_, 0, sizeof(low_mask_));
   }
 
-  if (tuple_delim != '\0') {
+  if (DELIMITED_TUPLES) {
     search_chars[num_delims_++] = tuple_delim_;
     ++num_tuple_delims_;
     // Hive will treats \r (^M) as an alternate tuple delimiter, but \r\n is a
@@ -82,29 +83,43 @@ DelimitedTextParser::DelimitedTextParser(
       ++num_tuple_delims_;
     }
     xmm_tuple_search_ = 
_mm_loadu_si128(reinterpret_cast<__m128i*>(search_chars));
-  }
-
-  if (field_delim != '\0' || collection_item_delim != '\0') {
+    if (field_delim_ != tuple_delim_) search_chars[num_delims_++] = 
field_delim_;
+  } else {
     search_chars[num_delims_++] = field_delim_;
-    search_chars[num_delims_++] = collection_item_delim_;
   }
 
+  if (collection_item_delim != '\0') search_chars[num_delims_++] = 
collection_item_delim_;
+
   DCHECK_GT(num_delims_, 0);
   xmm_delim_search_ = 
_mm_loadu_si128(reinterpret_cast<__m128i*>(search_chars));
 
   ParserReset();
 }
 
-void DelimitedTextParser::ParserReset() {
+template
+DelimitedTextParser<true>::DelimitedTextParser(
+    int num_cols, int num_partition_keys, const bool* is_materialized_col,
+    char tuple_delim, char field_delim, char collection_item_delim, char 
escape_char);
+
+template
+DelimitedTextParser<false>::DelimitedTextParser(
+    int num_cols, int num_partition_keys, const bool* is_materialized_col,
+    char tuple_delim, char field_delim, char collection_item_delim, char 
escape_char);
+
+template<bool DELIMITED_TUPLES>
+void DelimitedTextParser<DELIMITED_TUPLES>::ParserReset() {
   current_column_has_escape_ = false;
   last_char_is_escape_ = false;
   last_row_delim_offset_ = -1;
   column_idx_ = num_partition_keys_;
 }
 
+template void DelimitedTextParser<true>::ParserReset();
+
 // Parsing raw csv data into FieldLocation descriptors.
-Status DelimitedTextParser::ParseFieldLocations(int max_tuples, int64_t 
remaining_len,
-    char** byte_buffer_ptr, char** row_end_locations,
+template<bool DELIMITED_TUPLES>
+Status DelimitedTextParser<DELIMITED_TUPLES>::ParseFieldLocations(int 
max_tuples,
+    int64_t remaining_len, char** byte_buffer_ptr, char** row_end_locations,
     FieldLocation* field_locations,
     int* num_tuples, int* num_fields, char** next_column_start) {
   // Start of this batch.
@@ -133,10 +148,10 @@ Status DelimitedTextParser::ParseFieldLocations(int 
max_tuples, int64_t remainin
   while (remaining_len > 0) {
     bool new_tuple = false;
     bool new_col = false;
-    unfinished_tuple_ = true;
+    if (DELIMITED_TUPLES) unfinished_tuple_ = true;
 
     if (!last_char_is_escape_) {
-      if (tuple_delim_ != '\0' && (**byte_buffer_ptr == tuple_delim_ ||
+      if (DELIMITED_TUPLES && (**byte_buffer_ptr == tuple_delim_ ||
            (tuple_delim_ == '\n' && **byte_buffer_ptr == '\r'))) {
         new_tuple = true;
         new_col = true;
@@ -166,6 +181,7 @@ Status DelimitedTextParser::ParseFieldLocations(int 
max_tuples, int64_t remainin
         row_end_locations[*num_tuples] = *byte_buffer_ptr;
         ++(*num_tuples);
       }
+      DCHECK(DELIMITED_TUPLES);
       unfinished_tuple_ = false;
       last_row_delim_offset_ = **byte_buffer_ptr == '\r' ? remaining_len - 1 : 
-1;
       if (*num_tuples == max_tuples) {
@@ -185,7 +201,7 @@ Status DelimitedTextParser::ParseFieldLocations(int 
max_tuples, int64_t remainin
 
   // For formats that store the length of the row, the row is not delimited:
   // e.g. Sequence files.
-  if (tuple_delim_ == '\0') {
+  if (!DELIMITED_TUPLES) {
     DCHECK_EQ(remaining_len, 0);
     RETURN_IF_ERROR(AddColumn<true>(*byte_buffer_ptr - *next_column_start,
         next_column_start, num_fields, field_locations));
@@ -193,18 +209,30 @@ Status DelimitedTextParser::ParseFieldLocations(int 
max_tuples, int64_t remainin
     DCHECK(status.ok());
     column_idx_ = num_partition_keys_;
     ++(*num_tuples);
-    unfinished_tuple_ = false;
   }
   return Status::OK();
 }
 
-// Find the first instance of the tuple delimiter. This will find the start of 
the first
-// full tuple in buffer by looking for the end of the previous tuple.
-int64_t DelimitedTextParser::FindFirstInstance(const char* buffer, int64_t 
len) {
+template
+Status DelimitedTextParser<true>::ParseFieldLocations(int max_tuples,
+    int64_t remaining_len, char** byte_buffer_ptr, char** row_end_locations,
+    FieldLocation* field_locations,
+    int* num_tuples, int* num_fields, char** next_column_start);
+
+template
+Status DelimitedTextParser<false>::ParseFieldLocations(int max_tuples,
+    int64_t remaining_len, char** byte_buffer_ptr, char** row_end_locations,
+    FieldLocation* field_locations,
+    int* num_tuples, int* num_fields, char** next_column_start);
+
+template<bool DELIMITED_TUPLES>
+int64_t DelimitedTextParser<DELIMITED_TUPLES>::FindFirstInstance(const char* 
buffer,
+    int64_t len) {
   int64_t tuple_start = 0;
   const char* buffer_start = buffer;
   bool found = false;
 
+  DCHECK(DELIMITED_TUPLES);
   // If the last char in the previous buffer was \r then either return the 
start of
   // this buffer or skip a \n at the beginning of the buffer.
   if (last_row_delim_offset_ != -1) {
@@ -226,13 +254,10 @@ restart:
       int tuple_mask = _mm_extract_epi16(xmm_tuple_mask, 0);
       if (tuple_mask != 0) {
         found = true;
-        for (int i = 0; i < SSEUtil::CHARS_PER_128_BIT_REGISTER; ++i) {
-          if ((tuple_mask & SSEUtil::SSE_BITMASK[i]) != 0) {
-            tuple_start += i + 1;
-            buffer += i + 1;
-            break;
-          }
-        }
+        // Find first set bit (1-based)
+        int i = ffs(tuple_mask);
+        tuple_start += i;
+        buffer += i;
         break;
       }
       tuple_start += SSEUtil::CHARS_PER_128_BIT_REGISTER;
@@ -295,3 +320,6 @@ restart:
   }
   return tuple_start;
 }
+
+template
+int64_t DelimitedTextParser<true>::FindFirstInstance(const char* buffer, 
int64_t len);

http://git-wip-us.apache.org/repos/asf/impala/blob/380e17aa/be/src/exec/delimited-text-parser.h
----------------------------------------------------------------------
diff --git a/be/src/exec/delimited-text-parser.h 
b/be/src/exec/delimited-text-parser.h
index b966081..9b89127 100644
--- a/be/src/exec/delimited-text-parser.h
+++ b/be/src/exec/delimited-text-parser.h
@@ -25,22 +25,27 @@
 
 namespace impala {
 
+template <bool DELIMITED_TUPLES>
 class DelimitedTextParser {
  public:
 
   /// The Delimited Text Parser parses text rows that are delimited by specific
   /// characters:
-  ///   tuple_delim: delimits tuples
+  ///   tuple_delim: delimits tuples.  Only used if DELIMITED_TUPLES is true.
   ///   field_delim: delimits fields
   ///   collection_item_delim: delimits collection items
   ///   escape_char: escape delimiters, make them part of the data.
-  //
+  ///
+  /// If the template parameter DELIMITED_TUPLES is false there is no support
+  /// for tuple delimiters and we do not need to search for them.  Any value
+  /// may be passed for tuple_delim, as it is ignored.
+  ///
   /// 'num_cols' is the total number of columns including partition keys.
-  //
+  ///
   /// 'is_materialized_col' should be initialized to an array of length 
'num_cols', with
   /// is_materialized_col[i] = <true if column i should be materialized, false 
otherwise>
   /// Owned by caller.
-  //
+  ///
   /// The main method is ParseData which fills in a vector of pointers and 
lengths to the
   /// fields.  It also can handle an escape character which masks a tuple or 
field
   /// delimiter that occurs in the data.
@@ -91,14 +96,14 @@ class DelimitedTextParser {
   /// This function is used to parse sequence file records which do not need to
   /// parse for tuple delimiters. Returns an error status if any column 
exceeds the
   /// size limit. See AddColumn() for details.
-  template <bool process_escapes>
+  /// This function is disabled for non-sequence file parsing.
+  template <bool PROCESS_ESCAPES>
   Status ParseSingleTuple(int64_t len, char* buffer, FieldLocation* 
field_locations,
       int* num_fields);
 
   /// FindFirstInstance returns the position after the first non-escaped tuple
   /// delimiter from the starting offset.
   /// Used to find the start of a tuple if jumping into the middle of a text 
file.
-  /// Also used to find the sync marker for Sequenced and RC files.
   /// If no tuple delimiter is found within the buffer, return -1;
   int64_t FindFirstInstance(const char* buffer, int64_t len);
 
@@ -119,13 +124,16 @@ class DelimitedTextParser {
   /// by the number fields added.
   /// 'field_locations' will be updated with the start and length of the 
fields.
   /// Returns an error status if 'len' exceeds the size limit specified in 
AddColumn().
-  template <bool process_escapes>
+  template <bool PROCESS_ESCAPES>
   Status FillColumns(int64_t len, char** last_column, int* num_fields,
       impala::FieldLocation* field_locations);
 
   /// Return true if we have not seen a tuple delimiter for the current tuple 
being
   /// parsed (i.e., the last byte read was not a tuple delimiter).
-  bool HasUnfinishedTuple() { return unfinished_tuple_; }
+  bool HasUnfinishedTuple() {
+    DCHECK(DELIMITED_TUPLES);
+    return unfinished_tuple_;
+  }
 
  private:
   /// Initialize the parser state.
@@ -133,7 +141,7 @@ class DelimitedTextParser {
 
   /// Helper routine to add a column to the field_locations vector.
   /// Template parameter:
-  ///   process_escapes -- if true the the column may have escape characters
+  ///   PROCESS_ESCAPES -- if true the the column may have escape characters
   ///                      and the negative of the len will be stored.
   ///   len: length of the current column. The length of a column must fit in 
a 32-bit
   ///        signed integer (i.e. <= 2147483647 bytes). If a column is larger 
than that,
@@ -144,23 +152,29 @@ class DelimitedTextParser {
   /// Output:
   ///   field_locations: updated with start and length of current field.
   /// Return an error status if 'len' exceeds the size limit specified above.
-  template <bool process_escapes>
+  template <bool PROCESS_ESCAPES>
   Status AddColumn(int64_t len, char** next_column_start, int* num_fields,
       FieldLocation* field_locations);
 
   /// Helper routine to parse delimited text using SSE instructions.
   /// Identical arguments as ParseFieldLocations.
-  /// If the template argument, 'process_escapes' is true, this function will 
handle
+  /// If the template argument, 'PROCESS_ESCAPES' is true, this function will 
handle
   /// escapes, otherwise, it will assume the text is unescaped.  By using 
templates,
   /// we can special case the un-escaped path for better performance.  The 
unescaped
   /// path is optimized away by the compiler. Returns an error status if the 
length
   /// of any column exceeds the size limit. See AddColumn() for details.
-  template <bool process_escapes>
+  template <bool PROCESS_ESCAPES>
   Status ParseSse(int max_tuples, int64_t* remaining_len,
       char** byte_buffer_ptr, char** row_end_locations_,
       FieldLocation* field_locations,
       int* num_tuples, int* num_fields, char** next_column_start);
 
+  bool IsFieldOrCollectionItemDelimiter(char c) {
+    return (!DELIMITED_TUPLES && c == field_delim_) ||
+      (DELIMITED_TUPLES && field_delim_ != tuple_delim_ && c == field_delim_) 
||
+      (collection_item_delim_ != '\0' && c == collection_item_delim_);
+  }
+
   /// SSE(xmm) register containing the tuple search character(s).
   __m128i xmm_tuple_search_;
 
@@ -214,7 +228,7 @@ class DelimitedTextParser {
   /// Character delimiting collection items (to become slots).
   char collection_item_delim_;
 
-  /// Character delimiting tuples.
+  /// Character delimiting tuples.  Only used if DELIMITED_TUPLES is true.
   char tuple_delim_;
 
   /// Whether or not the current column has an escape character in it
@@ -228,5 +242,8 @@ class DelimitedTextParser {
   bool unfinished_tuple_;
 };
 
+using TupleDelimitedTextParser = DelimitedTextParser<true>;
+using SequenceDelimitedTextParser = DelimitedTextParser<false>;
+
 }// namespace impala
 #endif// IMPALA_EXEC_DELIMITED_TEXT_PARSER_H

http://git-wip-us.apache.org/repos/asf/impala/blob/380e17aa/be/src/exec/delimited-text-parser.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/delimited-text-parser.inline.h 
b/be/src/exec/delimited-text-parser.inline.h
index 02fa132..9fe737e 100644
--- a/be/src/exec/delimited-text-parser.inline.h
+++ b/be/src/exec/delimited-text-parser.inline.h
@@ -52,9 +52,10 @@ inline void ProcessEscapeMask(uint16_t escape_mask, bool* 
last_char_is_escape,
   *delim_mask &= ~escape_mask;
 }
 
-template <bool process_escapes>
-inline Status DelimitedTextParser::AddColumn(int64_t len, char** 
next_column_start,
-    int* num_fields, FieldLocation* field_locations) {
+template <bool DELIMITED_TUPLES>
+template <bool PROCESS_ESCAPES>
+inline Status DelimitedTextParser<DELIMITED_TUPLES>::AddColumn(int64_t len,
+    char** next_column_start, int* num_fields, FieldLocation* field_locations) 
{
   if (UNLIKELY(!BitUtil::IsNonNegative32Bit(len))) {
     return Status(TErrorCode::TEXT_PARSER_TRUNCATED_COLUMN, len);
   }
@@ -62,26 +63,27 @@ inline Status DelimitedTextParser::AddColumn(int64_t len, 
char** next_column_sta
     // Found a column that needs to be parsed, write the start/len to 
'field_locations'
     field_locations[*num_fields].start = *next_column_start;
     int64_t field_len = len;
-    if (process_escapes && current_column_has_escape_) {
+    if (PROCESS_ESCAPES && current_column_has_escape_) {
       field_len = -len;
     }
     field_locations[*num_fields].len = static_cast<int32_t>(field_len);
     ++(*num_fields);
   }
-  if (process_escapes) current_column_has_escape_ = false;
+  if (PROCESS_ESCAPES) current_column_has_escape_ = false;
   *next_column_start += len + 1;
   ++column_idx_;
   return Status::OK();
 }
 
-template <bool process_escapes>
-inline Status DelimitedTextParser::FillColumns(int64_t len, char** last_column,
-    int* num_fields, FieldLocation* field_locations) {
+template <bool DELIMITED_TUPLES>
+template <bool PROCESS_ESCAPES>
+inline Status DelimitedTextParser<DELIMITED_TUPLES>::FillColumns(int64_t len,
+    char** last_column, int* num_fields, FieldLocation* field_locations) {
   // Fill in any columns missing from the end of the tuple.
   char* dummy = NULL;
   if (last_column == NULL) last_column = &dummy;
   while (column_idx_ < num_cols_) {
-    RETURN_IF_ERROR(AddColumn<process_escapes>(len, last_column,
+    RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(len, last_column,
         num_fields, field_locations));
     // The rest of the columns will be null.
     last_column = &dummy;
@@ -103,8 +105,9 @@ inline Status DelimitedTextParser::FillColumns(int64_t len, 
char** last_column,
 ///  Needle   = 'abcd000000000000' (we're searching for any a's, b's, c's or 
d's)
 ///  Haystack = 'asdfghjklhjbdwwc' (the raw string)
 ///  Result   = '1010000000011001'
-template <bool process_escapes>
-inline Status DelimitedTextParser::ParseSse(int max_tuples,
+template <bool DELIMITED_TUPLES>
+template <bool PROCESS_ESCAPES>
+inline Status DelimitedTextParser<DELIMITED_TUPLES>::ParseSse(int max_tuples,
     int64_t* remaining_len, char** byte_buffer_ptr,
     char** row_end_locations, FieldLocation* field_locations,
     int* num_tuples, int* num_fields, char** next_column_start) {
@@ -146,7 +149,7 @@ inline Status DelimitedTextParser::ParseSse(int max_tuples,
 
     uint16_t escape_mask = 0;
     // If the table does not use escape characters, skip processing for it.
-    if (process_escapes) {
+    if (PROCESS_ESCAPES) {
       DCHECK(escape_char_ != '\0');
       xmm_escape_mask = 
SSE4_cmpestrm<SSEUtil::STRCHR_MODE>(xmm_escape_search_, 1,
           xmm_buffer, SSEUtil::CHARS_PER_128_BIT_REGISTER);
@@ -156,8 +159,10 @@ inline Status DelimitedTextParser::ParseSse(int max_tuples,
 
     char* last_char = *byte_buffer_ptr + 15;
     bool last_char_is_unescaped_delim = delim_mask >> 15;
-    unfinished_tuple_ = !(last_char_is_unescaped_delim &&
-        (*last_char == tuple_delim_ || (tuple_delim_ == '\n' && *last_char == 
'\r')));
+    if (DELIMITED_TUPLES) {
+      unfinished_tuple_ = !(last_char_is_unescaped_delim &&
+          (*last_char == tuple_delim_ || (tuple_delim_ == '\n' && *last_char 
== '\r')));
+    }
 
     int last_col_idx = 0;
     // Process all non-zero bits in the delim_mask from lsb->msb.  If a bit
@@ -170,7 +175,7 @@ inline Status DelimitedTextParser::ParseSse(int max_tuples,
       // clear current bit
       delim_mask &= ~(SSEUtil::SSE_BITMASK[n]);
 
-      if (process_escapes) {
+      if (PROCESS_ESCAPES) {
         // Determine if there was an escape character between [last_col_idx, n]
         bool escaped = (escape_mask & low_mask_[last_col_idx] & high_mask_[n]) 
!= 0;
         current_column_has_escape_ |= escaped;
@@ -179,13 +184,14 @@ inline Status DelimitedTextParser::ParseSse(int 
max_tuples,
 
       char* delim_ptr = *byte_buffer_ptr + n;
 
-      if (*delim_ptr == field_delim_ || *delim_ptr == collection_item_delim_) {
-        RETURN_IF_ERROR(AddColumn<process_escapes>(delim_ptr - 
*next_column_start,
+      if (IsFieldOrCollectionItemDelimiter(*delim_ptr)) {
+        RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(delim_ptr - 
*next_column_start,
             next_column_start, num_fields, field_locations));
         continue;
       }
 
-      if (*delim_ptr == tuple_delim_ || (tuple_delim_ == '\n' && *delim_ptr == 
'\r')) {
+      if (DELIMITED_TUPLES &&
+          (*delim_ptr == tuple_delim_ || (tuple_delim_ == '\n' && *delim_ptr 
== '\r'))) {
         if (UNLIKELY(
                 last_row_delim_offset_ == *remaining_len - n && *delim_ptr == 
'\n')) {
           // If the row ended in \r\n then move the next start past the \n
@@ -193,7 +199,7 @@ inline Status DelimitedTextParser::ParseSse(int max_tuples,
           last_row_delim_offset_ = -1;
           continue;
         }
-        RETURN_IF_ERROR(AddColumn<process_escapes>(delim_ptr - 
*next_column_start,
+        RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(delim_ptr - 
*next_column_start,
             next_column_start, num_fields, field_locations));
         Status status = FillColumns<false>(0, NULL, num_fields, 
field_locations);
         DCHECK(status.ok());
@@ -204,7 +210,7 @@ inline Status DelimitedTextParser::ParseSse(int max_tuples,
         last_row_delim_offset_ = *delim_ptr == '\r' ? *remaining_len - n - 1 : 
-1;
         if (UNLIKELY(*num_tuples == max_tuples)) {
           (*byte_buffer_ptr) += (n + 1);
-          if (process_escapes) last_char_is_escape_ = false;
+          if (PROCESS_ESCAPES) last_char_is_escape_ = false;
           *remaining_len -= (n + 1);
           // If the last character we processed was \r then set the offset to 0
           // so that we will use it at the beginning of the next batch.
@@ -214,7 +220,7 @@ inline Status DelimitedTextParser::ParseSse(int max_tuples,
       }
     }
 
-    if (process_escapes) {
+    if (PROCESS_ESCAPES) {
       // Determine if there was an escape character between (last_col_idx, 15)
       bool unprocessed_escape = escape_mask & low_mask_[last_col_idx] & 
high_mask_[15];
       current_column_has_escape_ |= unprocessed_escape;
@@ -227,9 +233,10 @@ inline Status DelimitedTextParser::ParseSse(int max_tuples,
 }
 
 /// Simplified version of ParseSSE which does not handle tuple delimiters.
-template <bool process_escapes>
-inline Status DelimitedTextParser::ParseSingleTuple(int64_t remaining_len, 
char* buffer,
-    FieldLocation* field_locations, int* num_fields) {
+template<>
+template <bool PROCESS_ESCAPES>
+inline Status DelimitedTextParser<false>::ParseSingleTuple(int64_t 
remaining_len,
+    char* buffer, FieldLocation* field_locations, int* num_fields) {
   char* next_column_start = buffer;
   __m128i xmm_buffer, xmm_delim_mask, xmm_escape_mask;
 
@@ -246,7 +253,7 @@ inline Status DelimitedTextParser::ParseSingleTuple(int64_t 
remaining_len, char*
 
       uint16_t escape_mask = 0;
       // If the table does not use escape characters, skip processing for it.
-      if (process_escapes) {
+      if (PROCESS_ESCAPES) {
         DCHECK(escape_char_ != '\0');
         xmm_escape_mask = 
SSE4_cmpestrm<SSEUtil::STRCHR_MODE>(xmm_escape_search_, 1,
             xmm_buffer, SSEUtil::CHARS_PER_128_BIT_REGISTER);
@@ -263,7 +270,7 @@ inline Status DelimitedTextParser::ParseSingleTuple(int64_t 
remaining_len, char*
         DCHECK_GE(n, 0);
         DCHECK_LT(n, 16);
 
-        if (process_escapes) {
+        if (PROCESS_ESCAPES) {
           // Determine if there was an escape character between [last_col_idx, 
n]
           bool escaped = (escape_mask & low_mask_[last_col_idx] & 
high_mask_[n]) != 0;
           current_column_has_escape_ |= escaped;
@@ -273,11 +280,11 @@ inline Status 
DelimitedTextParser::ParseSingleTuple(int64_t remaining_len, char*
         // clear current bit
         delim_mask &= ~(SSEUtil::SSE_BITMASK[n]);
 
-        RETURN_IF_ERROR(AddColumn<process_escapes>(buffer + n - 
next_column_start,
+        RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(buffer + n - 
next_column_start,
             &next_column_start, num_fields, field_locations));
       }
 
-      if (process_escapes) {
+      if (PROCESS_ESCAPES) {
         // Determine if there was an escape character between (last_col_idx, 
15)
         bool unprocessed_escape = escape_mask & low_mask_[last_col_idx] & 
high_mask_[15];
         current_column_has_escape_ |= unprocessed_escape;
@@ -296,9 +303,8 @@ inline Status DelimitedTextParser::ParseSingleTuple(int64_t 
remaining_len, char*
       last_char_is_escape_ = false;
     }
 
-    if (!last_char_is_escape_ &&
-          (*buffer == field_delim_ || *buffer == collection_item_delim_)) {
-      RETURN_IF_ERROR(AddColumn<process_escapes>(buffer - next_column_start,
+    if (!last_char_is_escape_ && IsFieldOrCollectionItemDelimiter(*buffer)) {
+      RETURN_IF_ERROR(AddColumn<PROCESS_ESCAPES>(buffer - next_column_start,
           &next_column_start, num_fields, field_locations));
     }
 
@@ -308,7 +314,7 @@ inline Status DelimitedTextParser::ParseSingleTuple(int64_t 
remaining_len, char*
 
   // Last column does not have a delimiter after it.  Add that column and also
   // pad with empty cols if the input is ragged.
-  return FillColumns<process_escapes>(buffer - next_column_start,
+  return FillColumns<PROCESS_ESCAPES>(buffer - next_column_start,
       &next_column_start, num_fields, field_locations);
 }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/380e17aa/be/src/exec/hdfs-sequence-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-sequence-scanner.cc 
b/be/src/exec/hdfs-sequence-scanner.cc
index 346a18a..8a9151e 100644
--- a/be/src/exec/hdfs-sequence-scanner.cc
+++ b/be/src/exec/hdfs-sequence-scanner.cc
@@ -73,7 +73,7 @@ Status HdfsSequenceScanner::InitNewRange() {
   text_converter_.reset(new TextConverter(hdfs_partition->escape_char(),
       scan_node_->hdfs_table()->null_column_value()));
 
-  delimited_text_parser_.reset(new DelimitedTextParser(
+  delimited_text_parser_.reset(new SequenceDelimitedTextParser(
       scan_node_->hdfs_table()->num_cols(), scan_node_->num_partition_keys(),
       scan_node_->is_materialized_col(), '\0', hdfs_partition->field_delim(),
       hdfs_partition->collection_delim(), hdfs_partition->escape_char()));

http://git-wip-us.apache.org/repos/asf/impala/blob/380e17aa/be/src/exec/hdfs-sequence-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-sequence-scanner.h 
b/be/src/exec/hdfs-sequence-scanner.h
index 4845edb..463ffc7 100644
--- a/be/src/exec/hdfs-sequence-scanner.h
+++ b/be/src/exec/hdfs-sequence-scanner.h
@@ -153,6 +153,7 @@
 
 namespace impala {
 
+template <bool>
 class DelimitedTextParser;
 
 class HdfsSequenceScanner : public BaseSequenceScanner {
@@ -222,7 +223,7 @@ class HdfsSequenceScanner : public BaseSequenceScanner {
   Status GetRecord(uint8_t** record_ptr, int64_t* record_len) 
WARN_UNUSED_RESULT;
 
   /// Helper class for picking fields and rows from delimited text.
-  boost::scoped_ptr<DelimitedTextParser> delimited_text_parser_;
+  boost::scoped_ptr<DelimitedTextParser<false>> delimited_text_parser_;
   std::vector<FieldLocation> field_locations_;
 
   /// Data that is fixed across headers.  This struct is shared between scan 
ranges.

http://git-wip-us.apache.org/repos/asf/impala/blob/380e17aa/be/src/exec/hdfs-text-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc
index 253bcc8..b78115d 100644
--- a/be/src/exec/hdfs-text-scanner.cc
+++ b/be/src/exec/hdfs-text-scanner.cc
@@ -203,7 +203,7 @@ Status HdfsTextScanner::InitNewRange() {
     collection_delim = '\0';
   }
 
-  delimited_text_parser_.reset(new DelimitedTextParser(
+  delimited_text_parser_.reset(new TupleDelimitedTextParser(
       scan_node_->hdfs_table()->num_cols(), scan_node_->num_partition_keys(),
       scan_node_->is_materialized_col(), hdfs_partition->line_delim(),
       field_delim, collection_delim, hdfs_partition->escape_char()));

http://git-wip-us.apache.org/repos/asf/impala/blob/380e17aa/be/src/exec/hdfs-text-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-text-scanner.h b/be/src/exec/hdfs-text-scanner.h
index 610c612..25886ba 100644
--- a/be/src/exec/hdfs-text-scanner.h
+++ b/be/src/exec/hdfs-text-scanner.h
@@ -25,6 +25,7 @@
 
 namespace impala {
 
+template<bool>
 class DelimitedTextParser;
 class ScannerContext;
 struct HdfsFileDesc;
@@ -237,7 +238,7 @@ class HdfsTextScanner : public HdfsScanner {
   int slot_idx_;
 
   /// Helper class for picking fields and rows from delimited text.
-  boost::scoped_ptr<DelimitedTextParser> delimited_text_parser_;
+  boost::scoped_ptr<DelimitedTextParser<true>> delimited_text_parser_;
 
   /// Return field locations from the Delimited Text Parser.
   std::vector<FieldLocation> field_locations_;

Reply via email to