Repository: incubator-impala
Updated Branches:
  refs/heads/master 2fb14d1b8 -> 025fd3bd7


IMPALA-3646: Handle corrupt RLE literal or repeat counts of 0.

Adds handling and testing for a specific Parquet data corruption
scenario with plain dictionary encoded values.

The problematic scenario is when the repeat or literal count of
the RLE-encoded dictionary indexes is decoded as 0 - an invalid value.

There are several other cases of data corruption that are not yet
handled gracefully. This patch only handles one specific case.

Change-Id: Ibf406c82cdded37966f09c81e4cc1446d2b60d63
Reviewed-on: http://gerrit.cloudera.org:8080/3299
Reviewed-by: Alex Behm <[email protected]>
Tested-by: Alex Behm <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/025fd3bd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/025fd3bd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/025fd3bd

Branch: refs/heads/master
Commit: 025fd3bd7f3b7f4bc3dcb6b21e3598124b19b577
Parents: 86ff18e
Author: Alex Behm <[email protected]>
Authored: Thu Jun 2 00:03:20 2016 -0700
Committer: Tim Armstrong <[email protected]>
Committed: Tue Jun 7 17:29:59 2016 -0700

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-scanner.cc             |   5 ++--
 be/src/util/rle-encoding.h                      |   3 ++-
 be/src/util/rle-test.cc                         |  23 ++++++++++++++++++
 testdata/data/README                            |  16 +++++++++++++
 testdata/data/bad_rle_literal_count.parquet     | Bin 0 -> 255 bytes
 testdata/data/bad_rle_repeat_count.parquet      | Bin 0 -> 234 bytes
 .../parquet-corrupt-rle-counts-abort.test       |   8 +++++++
 .../QueryTest/parquet-corrupt-rle-counts.test   |   9 +++++++
 tests/query_test/test_scanners.py               |  24 +++++++++++++++++++
 9 files changed, 85 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc 
b/be/src/exec/hdfs-parquet-scanner.cc
index 4bb7bac..1713af6 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -866,7 +866,7 @@ class HdfsParquetScanner::ScalarColumnReader :
         if (def_level >= max_def_level()) {
           bool continue_execution =
               ReadSlot<IS_DICT_ENCODED>(tuple->GetSlot(tuple_offset_), pool);
-          if (UNLIKELY(!continue_execution)) break;
+          if (UNLIKELY(!continue_execution)) return false;
         } else {
           tuple->SetNull(null_indicator_offset_);
         }
@@ -1622,7 +1622,8 @@ bool HdfsParquetScanner::LevelDecoder::FillCache(int 
batch_size,
       }
       literal_count_ -= num_literals_to_set;
 
-      if (num_values == batch_size || !NextCounts<int16_t>()) break;
+      if (num_values == batch_size) break;
+      if (UNLIKELY(!NextCounts<int16_t>())) return false;
       if (repeat_count_ > 0 && current_value_ > max_level_) return false;
     }
   } else {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/be/src/util/rle-encoding.h
----------------------------------------------------------------------
diff --git a/be/src/util/rle-encoding.h b/be/src/util/rle-encoding.h
index c6686e3..030767d 100644
--- a/be/src/util/rle-encoding.h
+++ b/be/src/util/rle-encoding.h
@@ -278,11 +278,12 @@ bool RleDecoder::NextCounts() {
   bool is_literal = indicator_value & 1;
   if (is_literal) {
     literal_count_ = (indicator_value >> 1) * 8;
+    if (UNLIKELY(literal_count_ == 0)) return false;
   } else {
     repeat_count_ = indicator_value >> 1;
     bool result = bit_reader_.GetAligned<T>(
         BitUtil::Ceil(bit_width_, 8), reinterpret_cast<T*>(&current_value_));
-    DCHECK(result);
+    if (UNLIKELY(!result || repeat_count_ == 0)) return false;
   }
   return true;
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/be/src/util/rle-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/rle-test.cc b/be/src/util/rle-test.cc
index 8073068..691dd45 100644
--- a/be/src/util/rle-test.cc
+++ b/be/src/util/rle-test.cc
@@ -419,6 +419,29 @@ TEST(BitRle, Overflow) {
   }
 }
 
+// Tests handling of a specific data corruption scenario where
+// the literal or repeat count is decoded as 0 (which is invalid).
+TEST(Rle, ZeroLiteralOrRepeatCount) {
+  const int len = 1024;
+  uint8_t buffer[len];
+  RleDecoder decoder(buffer, len, 0);
+  uint64_t val;
+
+  // Test the RLE repeated values path.
+  memset(buffer, 0, len);
+  for (int i = 0; i < 10; ++i) {
+    bool result = decoder.Get(&val);
+    EXPECT_FALSE(result);
+  }
+
+  // Test the RLE literal values path
+  memset(buffer, 1, len);
+  for (int i = 0; i < 10; ++i) {
+    bool result = decoder.Get(&val);
+    EXPECT_FALSE(result);
+  }
+}
+
 }
 
 int main(int argc, char **argv) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/testdata/data/README
----------------------------------------------------------------------
diff --git a/testdata/data/README b/testdata/data/README
index f8cabda..53993c8 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -5,6 +5,22 @@ Contains 3 single-column rows:
 "is"
 "fun"
 
+bad_rle_literal_count.parquet:
+Generated by hacking Impala's Parquet writer.
+Contains a single bigint column 'c' with the values 1, 3, 7 stored
+in a single data chunk as dictionary plain. The RLE encoded dictionary
+indexes are all literals (and not repeated), but the literal count
+is incorrectly 0 in the file to test that such data corruption is
+proprly handled.
+
+bad_rle_repeat_count.parquet:
+Generated by hacking Impala's Parquet writer.
+Contains a single bigint column 'c' with the value 7 repeated 7 times
+stored in a single data chunk as dictionary plain. The RLE encoded dictionary
+indexes are a single repeated run (and not literals), but the repeat count
+is incorrectly 0 in the file to test that such data corruption is proprly
+handled.
+
 repeated_values.parquet:
 Generated with parquet-mr 1.2.5
 Contains 3 single-column rows:

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/testdata/data/bad_rle_literal_count.parquet
----------------------------------------------------------------------
diff --git a/testdata/data/bad_rle_literal_count.parquet 
b/testdata/data/bad_rle_literal_count.parquet
new file mode 100644
index 0000000..33d21ac
Binary files /dev/null and b/testdata/data/bad_rle_literal_count.parquet differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/testdata/data/bad_rle_repeat_count.parquet
----------------------------------------------------------------------
diff --git a/testdata/data/bad_rle_repeat_count.parquet 
b/testdata/data/bad_rle_repeat_count.parquet
new file mode 100644
index 0000000..ad468ac
Binary files /dev/null and b/testdata/data/bad_rle_repeat_count.parquet differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts-abort.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts-abort.test
 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts-abort.test
new file mode 100644
index 0000000..e0789ab
--- /dev/null
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts-abort.test
@@ -0,0 +1,8 @@
+====
+---- QUERY
+select * from bad_rle_counts
+---- TYPES
+bigint
+---- CATCH
+Failed to decode dictionary-encoded value.
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts.test
 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts.test
new file mode 100644
index 0000000..c086278
--- /dev/null
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts.test
@@ -0,0 +1,9 @@
+====
+---- QUERY
+select * from bad_rle_counts
+---- TYPES
+bigint
+---- RESULTS
+---- ERRORS
+Failed to decode dictionary-encoded value. file: hdfs://regex:.$
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py 
b/tests/query_test/test_scanners.py
index 216d3df..035b64b 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -224,6 +224,30 @@ class TestParquet(ImpalaTestSuite):
     vector.get_value('exec_option')['abort_on_error'] = 1
     self.run_test_case('QueryTest/parquet-abort-on-error', vector)
 
+  def test_corrupt_rle_counts(self, vector, unique_database):
+    """IMPALA-3646: Tests that a certain type of file corruption for plain
+    dictionary encoded values is gracefully handled. Cases tested:
+    - incorrect literal count of 0 for the RLE encoded dictionary indexes
+    - incorrect repeat count of 0 for the RLE encoded dictionary indexes
+    """
+    # Create test table and copy the corrupt files into it.
+    self.client.execute(
+        "create table %s.bad_rle_counts (c bigint) stored as parquet" % 
unique_database)
+    bad_rle_counts_tbl_loc =\
+        get_fs_path("/test-warehouse/%s.db/%s" % (unique_database, 
"bad_rle_counts"))
+    check_call(['hdfs', 'dfs', '-copyFromLocal',
+        os.environ['IMPALA_HOME'] + 
"/testdata/data/bad_rle_literal_count.parquet",
+        bad_rle_counts_tbl_loc])
+    check_call(['hdfs', 'dfs', '-copyFromLocal',
+        os.environ['IMPALA_HOME'] + 
"/testdata/data/bad_rle_repeat_count.parquet",
+        bad_rle_counts_tbl_loc])
+    # Querying the corrupted files should not DCHECK or crash.
+    vector.get_value('exec_option')['abort_on_error'] = 0
+    self.run_test_case('QueryTest/parquet-corrupt-rle-counts', vector, 
unique_database)
+    vector.get_value('exec_option')['abort_on_error'] = 1
+    self.run_test_case('QueryTest/parquet-corrupt-rle-counts-abort',
+                       vector, unique_database)
+
   @SkipIfS3.hdfs_block_size
   @SkipIfIsilon.hdfs_block_size
   @SkipIfLocal.multiple_impalad

Reply via email to