IMPALA-3943: Do not throw scan errors for empty Parquet files.

For Parquet files with no row groups and num_rows=0 in the file
footer, the Parquet scanner returns an error indicating that the
file is invalid. This behavior is a regression from previous
Impala versions, which accepted such files.

This patch restores the previous behavior and adds tests.

Change-Id: I50ac3df6ff24bc5c384ef22e0f804a5132adb62e
Reviewed-on: http://gerrit.cloudera.org:8080/4693
Reviewed-by: Alex Behm <alex.b...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/0449b5be
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/0449b5be
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/0449b5be

Branch: refs/heads/master
Commit: 0449b5beaba89b02e8bc7fe133b4dc5fbe33fe81
Parents: b28baa4
Author: Alex Behm <alex.b...@cloudera.com>
Authored: Tue Oct 11 18:49:38 2016 -0700
Committer: Internal Jenkins <cloudera-hud...@gerrit.cloudera.org>
Committed: Wed Oct 12 09:22:57 2016 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-scanner.cc             |   6 ++++-
 testdata/data/README                            |   8 ++++++
 testdata/data/zero_rows_one_row_group.parquet   | Bin 0 -> 236 bytes
 testdata/data/zero_rows_zero_row_groups.parquet | Bin 0 -> 199 bytes
 .../queries/QueryTest/parquet-zero-rows.test    |  27 +++++++++++++++++++
 tests/query_test/test_scanners.py               |  25 +++++++++++++++++
 6 files changed, 65 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index 7d9adb4..7782e8a 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -410,7 +410,7 @@ Status HdfsParquetScanner::NextRowGroup() {
     ++row_group_idx_;
     if (row_group_idx_ >= file_metadata_.row_groups.size()) break;
     const parquet::RowGroup& row_group = file_metadata_.row_groups[row_group_idx_];
-    if (row_group.num_rows == 0) continue;
+    if (row_group.num_rows == 0 || file_metadata_.num_rows == 0) continue;
 
     const DiskIoMgr::ScanRange* split_range = static_cast<ScanRangeMetadata*>(
         metadata_range_->meta_data())->original_split;
@@ -895,6 +895,10 @@ Status HdfsParquetScanner::ProcessFooter() {
   }
 
   RETURN_IF_ERROR(ParquetMetadataUtils::ValidateFileVersion(file_metadata_, filename()));
+
+  // IMPALA-3943: Do not throw an error for empty files for backwards compatibility.
+  if (file_metadata_.num_rows == 0) return Status::OK();
+
   // Parse out the created by application version string
   if (file_metadata_.__isset.created_by) {
     file_version_ = ParquetFileVersion(file_metadata_.created_by);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/testdata/data/README
----------------------------------------------------------------------
diff --git a/testdata/data/README b/testdata/data/README
index 3a0d5ec..fce8014 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -21,6 +21,14 @@ indexes are a single repeated run (and not literals), but the repeat count
 is incorrectly 0 in the file to test that such data corruption is proprly
 handled.
 
+zero_rows_zero_row_groups.parquet:
+Generated by hacking Impala's Parquet writer.
+The file metadata indicates zero rows and no row groups.
+
+zero_rows_one_row_group.parquet:
+Generated by hacking Impala's Parquet writer.
+The file metadata indicates zero rows but one row group.
+
 repeated_values.parquet:
 Generated with parquet-mr 1.2.5
 Contains 3 single-column rows:

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/testdata/data/zero_rows_one_row_group.parquet
----------------------------------------------------------------------
diff --git a/testdata/data/zero_rows_one_row_group.parquet b/testdata/data/zero_rows_one_row_group.parquet
new file mode 100644
index 0000000..3404a7c
Binary files /dev/null and b/testdata/data/zero_rows_one_row_group.parquet differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/testdata/data/zero_rows_zero_row_groups.parquet
----------------------------------------------------------------------
diff --git a/testdata/data/zero_rows_zero_row_groups.parquet b/testdata/data/zero_rows_zero_row_groups.parquet
new file mode 100644
index 0000000..9e132e3
Binary files /dev/null and b/testdata/data/zero_rows_zero_row_groups.parquet differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test
new file mode 100644
index 0000000..e7de245
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test
@@ -0,0 +1,27 @@
+====
+---- QUERY
+select * from zero_rows_zero_row_groups
+---- TYPES
+int
+---- RESULTS
+====
+---- QUERY
+select count(*) from zero_rows_zero_row_groups
+---- TYPES
+bigint
+---- RESULTS
+0
+====
+---- QUERY
+select * from zero_rows_one_row_group
+---- TYPES
+int
+---- RESULTS
+====
+---- QUERY
+select count(*) from zero_rows_one_row_group
+---- TYPES
+bigint
+---- RESULTS
+0
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0449b5be/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index ba50949..6dce0af 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -245,6 +245,31 @@ class TestParquet(ImpalaTestSuite):
     vector.get_value('exec_option')['abort_on_error'] = 1
     self.run_test_case('QueryTest/parquet-abort-on-error', vector)
 
+  def test_zero_rows(self, vector, unique_database):
+    """IMPALA-3943: Tests that scanning files with num_rows=0 in the file footer
+    succeeds without errors."""
+    # Create test table with a file that has 0 rows and 0 row groups.
+    self.client.execute("create table %s.zero_rows_zero_row_groups (c int) "
+        "stored as parquet" % unique_database)
+    zero_rows_zero_row_groups_loc = get_fs_path(
+        "/test-warehouse/%s.db/%s" % (unique_database, "zero_rows_zero_row_groups"))
+    check_call(['hdfs', 'dfs', '-copyFromLocal',
+        os.environ['IMPALA_HOME'] + "/testdata/data/zero_rows_zero_row_groups.parquet",
+        zero_rows_zero_row_groups_loc])
+    # Create test table with a file that has 0 rows and 1 row group.
+    self.client.execute("create table %s.zero_rows_one_row_group (c int) "
+        "stored as parquet" % unique_database)
+    zero_rows_one_row_group_loc = get_fs_path(
+        "/test-warehouse/%s.db/%s" % (unique_database, "zero_rows_one_row_group"))
+    check_call(['hdfs', 'dfs', '-copyFromLocal',
+        os.environ['IMPALA_HOME'] + "/testdata/data/zero_rows_one_row_group.parquet",
+        zero_rows_one_row_group_loc])
+
+    vector.get_value('exec_option')['abort_on_error'] = 0
+    self.run_test_case('QueryTest/parquet-zero-rows', vector, unique_database)
+    vector.get_value('exec_option')['abort_on_error'] = 1
+    self.run_test_case('QueryTest/parquet-zero-rows', vector, unique_database)
+
   def test_corrupt_rle_counts(self, vector, unique_database):
     """IMPALA-3646: Tests that a certain type of file corruption for plain
     dictionary encoded values is gracefully handled. Cases tested:

Reply via email to