IMPALA-5021: Fix count(*) remaining rows overflow in Parquet.

Zero-slot scans of Parquet files that have num_rows > MAX_INT32
in the footer metadata used to run forever due to an overflow when
calculating the remaining number of rows to process.

Testing:
- Added a regression test using a file with num_rows = 2*MAX_INT32.
- Locally ran test_scanners.py which succeeded.
- Private core/hdfs run succeeded.

Change-Id: Ib9f8a6b83f8f621451d5977423ef81a6e4b124bd
Reviewed-on: http://gerrit.cloudera.org:8080/6286
Reviewed-by: Alex Behm <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/d3cc23e5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/d3cc23e5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/d3cc23e5

Branch: refs/heads/master
Commit: d3cc23e569eb44a3b9b823bd99da783b8356c6d4
Parents: 082b3ef
Author: Alex Behm <[email protected]>
Authored: Mon Mar 6 23:50:59 2017 -0800
Committer: Impala Public Jenkins <[email protected]>
Committed: Wed Mar 8 02:00:30 2017 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-scanner.cc |   4 ++--
 testdata/data/README                |   5 +++++
 testdata/data/huge_num_rows.parquet | Bin 0 -> 314 bytes
 tests/query_test/test_scanners.py   |  15 +++++++++++++++
 4 files changed, 22 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d3cc23e5/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index 4955949..bad6b33 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -427,8 +427,8 @@ Status HdfsParquetScanner::GetNextInternal(RowBatch* row_batch) {
     }
     assemble_rows_timer_.Start();
     DCHECK_LE(row_group_rows_read_, file_metadata_.num_rows);
-    int rows_remaining = file_metadata_.num_rows - row_group_rows_read_;
-    int max_tuples = min(row_batch->capacity(), rows_remaining);
+    int64_t rows_remaining = file_metadata_.num_rows - row_group_rows_read_;
+    int max_tuples = min<int64_t>(row_batch->capacity(), rows_remaining);
     TupleRow* current_row = row_batch->GetRow(row_batch->AddRow());
     int num_to_commit = WriteTemplateTuples(current_row, max_tuples);
     Status status = CommitRows(row_batch, num_to_commit);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d3cc23e5/testdata/data/README
----------------------------------------------------------------------
diff --git a/testdata/data/README b/testdata/data/README
index 3eb4ba0..465d80b 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -29,6 +29,11 @@ zero_rows_one_row_group.parquet:
 Generated by hacking Impala's Parquet writer.
 The file metadata indicates zero rows but one row group.
 
+huge_num_rows.parquet
+Generated by hacking Impala's Parquet writer.
+The file metadata indicates 2 * MAX_INT32 rows.
+The single row group also has the same number of rows in the metadata.
+
 repeated_values.parquet:
 Generated with parquet-mr 1.2.5
 Contains 3 single-column rows:

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d3cc23e5/testdata/data/huge_num_rows.parquet
----------------------------------------------------------------------
diff --git a/testdata/data/huge_num_rows.parquet b/testdata/data/huge_num_rows.parquet
new file mode 100644
index 0000000..a2d64d7
Binary files /dev/null and b/testdata/data/huge_num_rows.parquet differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d3cc23e5/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index ac94335..cb24923 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -288,6 +288,21 @@ class TestParquet(ImpalaTestSuite):
     vector.get_value('exec_option')['abort_on_error'] = 1
     self.run_test_case('QueryTest/parquet-zero-rows', vector, unique_database)
 
+  def test_huge_num_rows(self, vector, unique_database):
+    """IMPALA-5021: Tests that a zero-slot scan on a file with a huge num_rows in the
+    footer succeeds without errors."""
+    self.client.execute("create table %s.huge_num_rows (i int) stored as parquet"
+      % unique_database)
+    huge_num_rows_loc = get_fs_path(
+        "/test-warehouse/%s.db/%s" % (unique_database, "huge_num_rows"))
+    check_call(['hdfs', 'dfs', '-copyFromLocal',
+        os.environ['IMPALA_HOME'] + "/testdata/data/huge_num_rows.parquet",
+        huge_num_rows_loc])
+    result = self.client.execute("select count(*) from %s.huge_num_rows"
+      % unique_database)
+    assert len(result.data) == 1
+    assert "4294967294" in result.data
+
   def test_corrupt_rle_counts(self, vector, unique_database):
     """IMPALA-3646: Tests that a certain type of file corruption for plain
     dictionary encoded values is gracefully handled. Cases tested:

Reply via email to