Repository: incubator-impala Updated Branches: refs/heads/master 2fb14d1b8 -> 025fd3bd7
IMPALA-3646: Handle corrupt RLE literal or repeat counts of 0. Adds handling and testing for a specific Parquet data corruption scenario with plain dictionary encoded values. The problematic scenario is when the repeat or literal count of the RLE-encoded dictionary indexes is decoded as 0 - an invalid value. There are several other cases of data corruption that are not yet handled gracefully. This patch only handles one specific case. Change-Id: Ibf406c82cdded37966f09c81e4cc1446d2b60d63 Reviewed-on: http://gerrit.cloudera.org:8080/3299 Reviewed-by: Alex Behm <[email protected]> Tested-by: Alex Behm <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/025fd3bd Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/025fd3bd Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/025fd3bd Branch: refs/heads/master Commit: 025fd3bd7f3b7f4bc3dcb6b21e3598124b19b577 Parents: 86ff18e Author: Alex Behm <[email protected]> Authored: Thu Jun 2 00:03:20 2016 -0700 Committer: Tim Armstrong <[email protected]> Committed: Tue Jun 7 17:29:59 2016 -0700 ---------------------------------------------------------------------- be/src/exec/hdfs-parquet-scanner.cc | 5 ++-- be/src/util/rle-encoding.h | 3 ++- be/src/util/rle-test.cc | 23 ++++++++++++++++++ testdata/data/README | 16 +++++++++++++ testdata/data/bad_rle_literal_count.parquet | Bin 0 -> 255 bytes testdata/data/bad_rle_repeat_count.parquet | Bin 0 -> 234 bytes .../parquet-corrupt-rle-counts-abort.test | 8 +++++++ .../QueryTest/parquet-corrupt-rle-counts.test | 9 +++++++ tests/query_test/test_scanners.py | 24 +++++++++++++++++++ 9 files changed, 85 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/be/src/exec/hdfs-parquet-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc index 4bb7bac..1713af6 100644 --- a/be/src/exec/hdfs-parquet-scanner.cc +++ b/be/src/exec/hdfs-parquet-scanner.cc @@ -866,7 +866,7 @@ class HdfsParquetScanner::ScalarColumnReader : if (def_level >= max_def_level()) { bool continue_execution = ReadSlot<IS_DICT_ENCODED>(tuple->GetSlot(tuple_offset_), pool); - if (UNLIKELY(!continue_execution)) break; + if (UNLIKELY(!continue_execution)) return false; } else { tuple->SetNull(null_indicator_offset_); } @@ -1622,7 +1622,8 @@ bool HdfsParquetScanner::LevelDecoder::FillCache(int batch_size, } literal_count_ -= num_literals_to_set; - if (num_values == batch_size || !NextCounts<int16_t>()) break; + if (num_values == batch_size) break; + if (UNLIKELY(!NextCounts<int16_t>())) return false; if (repeat_count_ > 0 && current_value_ > max_level_) return false; } } else { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/be/src/util/rle-encoding.h ---------------------------------------------------------------------- diff --git a/be/src/util/rle-encoding.h b/be/src/util/rle-encoding.h index c6686e3..030767d 100644 --- a/be/src/util/rle-encoding.h +++ b/be/src/util/rle-encoding.h @@ -278,11 +278,12 @@ bool RleDecoder::NextCounts() { bool is_literal = indicator_value & 1; if (is_literal) { literal_count_ = (indicator_value >> 1) * 8; + if (UNLIKELY(literal_count_ == 0)) return false; } else { repeat_count_ = indicator_value >> 1; bool result = bit_reader_.GetAligned<T>( BitUtil::Ceil(bit_width_, 8), reinterpret_cast<T*>(¤t_value_)); - DCHECK(result); + if (UNLIKELY(!result || repeat_count_ == 0)) return false; } return true; } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/be/src/util/rle-test.cc ---------------------------------------------------------------------- diff --git a/be/src/util/rle-test.cc b/be/src/util/rle-test.cc index 8073068..691dd45 100644 --- a/be/src/util/rle-test.cc +++ b/be/src/util/rle-test.cc @@ -419,6 +419,29 @@ TEST(BitRle, Overflow) { } } +// Tests handling of a specific data corruption scenario where +// the literal or repeat count is decoded as 0 (which is invalid). +TEST(Rle, ZeroLiteralOrRepeatCount) { + const int len = 1024; + uint8_t buffer[len]; + RleDecoder decoder(buffer, len, 0); + uint64_t val; + + // Test the RLE repeated values path. + memset(buffer, 0, len); + for (int i = 0; i < 10; ++i) { + bool result = decoder.Get(&val); + EXPECT_FALSE(result); + } + + // Test the RLE literal values path + memset(buffer, 1, len); + for (int i = 0; i < 10; ++i) { + bool result = decoder.Get(&val); + EXPECT_FALSE(result); + } +} + } int main(int argc, char **argv) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/testdata/data/README ---------------------------------------------------------------------- diff --git a/testdata/data/README b/testdata/data/README index f8cabda..53993c8 100644 --- a/testdata/data/README +++ b/testdata/data/README @@ -5,6 +5,22 @@ Contains 3 single-column rows: "is" "fun" +bad_rle_literal_count.parquet: +Generated by hacking Impala's Parquet writer. +Contains a single bigint column 'c' with the values 1, 3, 7 stored +in a single data chunk as dictionary plain. The RLE encoded dictionary +indexes are all literals (and not repeated), but the literal count +is incorrectly 0 in the file to test that such data corruption is +proprly handled. + +bad_rle_repeat_count.parquet: +Generated by hacking Impala's Parquet writer. +Contains a single bigint column 'c' with the value 7 repeated 7 times +stored in a single data chunk as dictionary plain. The RLE encoded dictionary +indexes are a single repeated run (and not literals), but the repeat count +is incorrectly 0 in the file to test that such data corruption is proprly +handled. + repeated_values.parquet: Generated with parquet-mr 1.2.5 Contains 3 single-column rows: http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/testdata/data/bad_rle_literal_count.parquet ---------------------------------------------------------------------- diff --git a/testdata/data/bad_rle_literal_count.parquet b/testdata/data/bad_rle_literal_count.parquet new file mode 100644 index 0000000..33d21ac Binary files /dev/null and b/testdata/data/bad_rle_literal_count.parquet differ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/testdata/data/bad_rle_repeat_count.parquet ---------------------------------------------------------------------- diff --git a/testdata/data/bad_rle_repeat_count.parquet b/testdata/data/bad_rle_repeat_count.parquet new file mode 100644 index 0000000..ad468ac Binary files /dev/null and b/testdata/data/bad_rle_repeat_count.parquet differ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts-abort.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts-abort.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts-abort.test new file mode 100644 index 0000000..e0789ab --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts-abort.test @@ -0,0 +1,8 @@ +==== +---- QUERY +select * from bad_rle_counts +---- TYPES +bigint +---- CATCH +Failed to decode dictionary-encoded value. +==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts.test new file mode 100644 index 0000000..c086278 --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-corrupt-rle-counts.test @@ -0,0 +1,9 @@ +==== +---- QUERY +select * from bad_rle_counts +---- TYPES +bigint +---- RESULTS +---- ERRORS +Failed to decode dictionary-encoded value. file: hdfs://regex:.$ +==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/025fd3bd/tests/query_test/test_scanners.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 216d3df..035b64b 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -224,6 +224,30 @@ class TestParquet(ImpalaTestSuite): vector.get_value('exec_option')['abort_on_error'] = 1 self.run_test_case('QueryTest/parquet-abort-on-error', vector) + def test_corrupt_rle_counts(self, vector, unique_database): + """IMPALA-3646: Tests that a certain type of file corruption for plain + dictionary encoded values is gracefully handled. Cases tested: + - incorrect literal count of 0 for the RLE encoded dictionary indexes + - incorrect repeat count of 0 for the RLE encoded dictionary indexes + """ + # Create test table and copy the corrupt files into it. + self.client.execute( + "create table %s.bad_rle_counts (c bigint) stored as parquet" % unique_database) + bad_rle_counts_tbl_loc =\ + get_fs_path("/test-warehouse/%s.db/%s" % (unique_database, "bad_rle_counts")) + check_call(['hdfs', 'dfs', '-copyFromLocal', + os.environ['IMPALA_HOME'] + "/testdata/data/bad_rle_literal_count.parquet", + bad_rle_counts_tbl_loc]) + check_call(['hdfs', 'dfs', '-copyFromLocal', + os.environ['IMPALA_HOME'] + "/testdata/data/bad_rle_repeat_count.parquet", + bad_rle_counts_tbl_loc]) + # Querying the corrupted files should not DCHECK or crash. + vector.get_value('exec_option')['abort_on_error'] = 0 + self.run_test_case('QueryTest/parquet-corrupt-rle-counts', vector, unique_database) + vector.get_value('exec_option')['abort_on_error'] = 1 + self.run_test_case('QueryTest/parquet-corrupt-rle-counts-abort', + vector, unique_database) + @SkipIfS3.hdfs_block_size @SkipIfIsilon.hdfs_block_size @SkipIfLocal.multiple_impalad
