Repository: impala Updated Branches: refs/heads/master f8b406222 -> 6cc76d720
IMPALA-6353: Fix crash in snappy decompressor SnappyDecompressor::MaxOutputLen assumes the input pointer to be non-null. It's not true when the parquet file is corrupted and the compressed_page_size field in a page header is 0. This patch handles this error instead of failing a DCHECK. Testing: A bad parquet file with 0 compressed_page_size is added. It crashes impala without this patch. Change-Id: I0d42937aab92a74f8e104d2f7fcd64dc24f6a500 Reviewed-on: http://gerrit.cloudera.org:8080/8977 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/6cc76d72 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/6cc76d72 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/6cc76d72 Branch: refs/heads/master Commit: 6cc76d72016b8d5672676e8e8a979b0807803ad9 Parents: f8b4062 Author: Tianyi Wang <[email protected]> Authored: Tue Jan 9 13:03:45 2018 -0800 Committer: Impala Public Jenkins <[email protected]> Committed: Wed Jan 17 04:18:24 2018 +0000 ---------------------------------------------------------------------- be/src/util/decompress.cc | 1 + testdata/data/README | 5 +++++ testdata/data/bad_compressed_dict_page_size.parquet | Bin 0 -> 293 bytes .../parquet-bad-compressed-dict-page-size.test | 7 +++++++ tests/query_test/test_scanners.py | 13 +++++++++++++ 5 files changed, 26 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/6cc76d72/be/src/util/decompress.cc ---------------------------------------------------------------------- diff --git a/be/src/util/decompress.cc b/be/src/util/decompress.cc index f3466b9..d92dfe2 100644 --- a/be/src/util/decompress.cc +++ b/be/src/util/decompress.cc @@ -524,6 +524,7 @@ SnappyDecompressor::SnappyDecompressor(MemPool* mem_pool, bool reuse_buffer) } int64_t SnappyDecompressor::MaxOutputLen(int64_t 
input_len, const uint8_t* input) { + if (input_len <= 0) return -1; DCHECK(input != nullptr); size_t result; if (!snappy::GetUncompressedLength(reinterpret_cast<const char*>(input), http://git-wip-us.apache.org/repos/asf/impala/blob/6cc76d72/testdata/data/README ---------------------------------------------------------------------- diff --git a/testdata/data/README b/testdata/data/README index 25aa09b..2b92a0a 100644 --- a/testdata/data/README +++ b/testdata/data/README @@ -5,6 +5,11 @@ Contains 3 single-column rows: "is" "fun" +bad_compressed_dict_page_size.parquet: +Generated by hacking Impala's Parquet writer. +Contains a single string column 'col' with one row ("a"). The compressed_page_size field +in dict page header is modified to 0 to test if it is correctly handled. + bad_rle_literal_count.parquet: Generated by hacking Impala's Parquet writer. Contains a single bigint column 'c' with the values 1, 3, 7 stored http://git-wip-us.apache.org/repos/asf/impala/blob/6cc76d72/testdata/data/bad_compressed_dict_page_size.parquet ---------------------------------------------------------------------- diff --git a/testdata/data/bad_compressed_dict_page_size.parquet b/testdata/data/bad_compressed_dict_page_size.parquet new file mode 100644 index 0000000..a5b5ed9 Binary files /dev/null and b/testdata/data/bad_compressed_dict_page_size.parquet differ http://git-wip-us.apache.org/repos/asf/impala/blob/6cc76d72/testdata/workloads/functional-query/queries/QueryTest/parquet-bad-compressed-dict-page-size.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-bad-compressed-dict-page-size.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-bad-compressed-dict-page-size.test new file mode 100644 index 0000000..4b9a317 --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-bad-compressed-dict-page-size.test @@ -0,0 +1,7 @@ +==== +---- QUERY +# 
Parquet file with invalid (0) compressed_page_size in a dict page. +select * from bad_compressed_dict_page_size; +---- CATCH +Snappy: GetUncompressedLength failed +==== \ No newline at end of file http://git-wip-us.apache.org/repos/asf/impala/blob/6cc76d72/tests/query_test/test_scanners.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index fe0577a..9b252da 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -388,6 +388,19 @@ class TestParquet(ImpalaTestSuite): self.run_test_case('QueryTest/parquet-corrupt-rle-counts-abort', vector, unique_database) + def test_bad_compressed_page_size(self, vector, unique_database): + """IMPALA-6353: Tests that a parquet dict page with 0 compressed_page_size is + gracefully handled. """ + self.client.execute( + "create table %s.bad_compressed_dict_page_size (col string) stored as parquet" + % unique_database) + tbl_loc = get_fs_path("/test-warehouse/%s.db/%s" % (unique_database, + "bad_compressed_dict_page_size")) + check_call(['hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] + + "/testdata/data/bad_compressed_dict_page_size.parquet", tbl_loc]) + self.run_test_case('QueryTest/parquet-bad-compressed-dict-page-size', vector, + unique_database) + def test_bitpacked_def_levels(self, vector, unique_database): """Test that Impala can read a Parquet file with the deprecated bit-packed def level encoding."""
