IMPALA-3943: Do not throw scan errors for empty Parquet files. For Parquet files with no row groups but with num_rows=0 in the file footer, the Parquet scanner returns an error indicating that the file is invalid. This behavior is a regression from previous Impala versions, which used to accept such files.
This patch restores the previous behavior and adds tests. Change-Id: I50ac3df6ff24bc5c384ef22e0f804a5132adb62e Reviewed-on: http://gerrit.cloudera.org:8080/4693 Reviewed-by: Alex Behm <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/0449b5be Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/0449b5be Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/0449b5be Branch: refs/heads/master Commit: 0449b5beaba89b02e8bc7fe133b4dc5fbe33fe81 Parents: b28baa4 Author: Alex Behm <[email protected]> Authored: Tue Oct 11 18:49:38 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Wed Oct 12 09:22:57 2016 +0000 ---------------------------------------------------------------------- be/src/exec/hdfs-parquet-scanner.cc | 6 ++++- testdata/data/README | 8 ++++++ testdata/data/zero_rows_one_row_group.parquet | Bin 0 -> 236 bytes testdata/data/zero_rows_zero_row_groups.parquet | Bin 0 -> 199 bytes .../queries/QueryTest/parquet-zero-rows.test | 27 +++++++++++++++++++ tests/query_test/test_scanners.py | 25 +++++++++++++++++ 6 files changed, 65 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/be/src/exec/hdfs-parquet-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc index 7d9adb4..7782e8a 100644 --- a/be/src/exec/hdfs-parquet-scanner.cc +++ b/be/src/exec/hdfs-parquet-scanner.cc @@ -410,7 +410,7 @@ Status HdfsParquetScanner::NextRowGroup() { ++row_group_idx_; if (row_group_idx_ >= file_metadata_.row_groups.size()) break; const parquet::RowGroup& row_group = file_metadata_.row_groups[row_group_idx_]; - if (row_group.num_rows == 0) continue; + if (row_group.num_rows == 
0 || file_metadata_.num_rows == 0) continue; const DiskIoMgr::ScanRange* split_range = static_cast<ScanRangeMetadata*>( metadata_range_->meta_data())->original_split; @@ -895,6 +895,10 @@ Status HdfsParquetScanner::ProcessFooter() { } RETURN_IF_ERROR(ParquetMetadataUtils::ValidateFileVersion(file_metadata_, filename())); + + // IMPALA-3943: Do not throw an error for empty files for backwards compatibility. + if (file_metadata_.num_rows == 0) return Status::OK(); + // Parse out the created by application version string if (file_metadata_.__isset.created_by) { file_version_ = ParquetFileVersion(file_metadata_.created_by); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/testdata/data/README ---------------------------------------------------------------------- diff --git a/testdata/data/README b/testdata/data/README index 3a0d5ec..fce8014 100644 --- a/testdata/data/README +++ b/testdata/data/README @@ -21,6 +21,14 @@ indexes are a single repeated run (and not literals), but the repeat count is incorrectly 0 in the file to test that such data corruption is proprly handled. +zero_rows_zero_row_groups.parquet: +Generated by hacking Impala's Parquet writer. +The file metadata indicates zero rows and no row groups. + +zero_rows_one_row_group.parquet: +Generated by hacking Impala's Parquet writer. +The file metadata indicates zero rows but one row group. 
+ repeated_values.parquet: Generated with parquet-mr 1.2.5 Contains 3 single-column rows: http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/testdata/data/zero_rows_one_row_group.parquet ---------------------------------------------------------------------- diff --git a/testdata/data/zero_rows_one_row_group.parquet b/testdata/data/zero_rows_one_row_group.parquet new file mode 100644 index 0000000..3404a7c Binary files /dev/null and b/testdata/data/zero_rows_one_row_group.parquet differ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/testdata/data/zero_rows_zero_row_groups.parquet ---------------------------------------------------------------------- diff --git a/testdata/data/zero_rows_zero_row_groups.parquet b/testdata/data/zero_rows_zero_row_groups.parquet new file mode 100644 index 0000000..9e132e3 Binary files /dev/null and b/testdata/data/zero_rows_zero_row_groups.parquet differ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test new file mode 100644 index 0000000..e7de245 --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test @@ -0,0 +1,27 @@ +==== +---- QUERY +select * from zero_rows_zero_row_groups +---- TYPES +int +---- RESULTS +==== +---- QUERY +select count(*) from zero_rows_zero_row_groups +---- TYPES +bigint +---- RESULTS +0 +==== +---- QUERY +select * from zero_rows_one_row_group +---- TYPES +int +---- RESULTS +==== +---- QUERY +select count(*) from zero_rows_one_row_group +---- TYPES +bigint +---- RESULTS +0 +==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/tests/query_test/test_scanners.py 
---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index ba50949..6dce0af 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -245,6 +245,31 @@ class TestParquet(ImpalaTestSuite): vector.get_value('exec_option')['abort_on_error'] = 1 self.run_test_case('QueryTest/parquet-abort-on-error', vector) + def test_zero_rows(self, vector, unique_database): + """IMPALA-3943: Tests that scanning files with num_rows=0 in the file footer + succeeds without errors.""" + # Create test table with a file that has 0 rows and 0 row groups. + self.client.execute("create table %s.zero_rows_zero_row_groups (c int) " + "stored as parquet" % unique_database) + zero_rows_zero_row_groups_loc = get_fs_path( + "/test-warehouse/%s.db/%s" % (unique_database, "zero_rows_zero_row_groups")) + check_call(['hdfs', 'dfs', '-copyFromLocal', + os.environ['IMPALA_HOME'] + "/testdata/data/zero_rows_zero_row_groups.parquet", + zero_rows_zero_row_groups_loc]) + # Create test table with a file that has 0 rows and 1 row group. + self.client.execute("create table %s.zero_rows_one_row_group (c int) " + "stored as parquet" % unique_database) + zero_rows_one_row_group_loc = get_fs_path( + "/test-warehouse/%s.db/%s" % (unique_database, "zero_rows_one_row_group")) + check_call(['hdfs', 'dfs', '-copyFromLocal', + os.environ['IMPALA_HOME'] + "/testdata/data/zero_rows_one_row_group.parquet", + zero_rows_one_row_group_loc]) + + vector.get_value('exec_option')['abort_on_error'] = 0 + self.run_test_case('QueryTest/parquet-zero-rows', vector, unique_database) + vector.get_value('exec_option')['abort_on_error'] = 1 + self.run_test_case('QueryTest/parquet-zero-rows', vector, unique_database) + def test_corrupt_rle_counts(self, vector, unique_database): """IMPALA-3646: Tests that a certain type of file corruption for plain dictionary encoded values is gracefully handled. Cases tested:
