IMPALA-5021: Fix count(*) remaining rows overflow in Parquet.

Zero-slot scans of Parquet files that have num_rows > MAX_INT32 in the
footer metadata used to run forever due to an overflow when calculating
the remaining number of rows to process.
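The arithmetic behind the hang, as a minimal standalone sketch (illustrative
only, not Impala code; the variable names mirror the patch below): storing the
64-bit difference in a 32-bit int wraps 2 * MAX_INT32 around to -2 on typical
two's-complement platforms, so no rows are ever committed.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    int main() {
      const int64_t num_rows = 2LL * INT32_MAX;  // 4294967294, as in the test file
      int64_t rows_read = 0;                     // nothing consumed yet
      const int batch_capacity = 1024;           // assumed per-batch row limit

      // Buggy version: the int64 difference is truncated to 32 bits.
      // 4294967294 == 0xFFFFFFFE, which reads back as -2, so min() picks -2,
      // zero rows get committed, and the scan never advances.
      int rows_remaining_32 = static_cast<int>(num_rows - rows_read);
      std::cout << rows_remaining_32 << "\n";  // prints -2

      // Fixed version: keep the difference in 64 bits and take the min in
      // 64 bits; the result is at most batch_capacity, so the narrowing
      // assignment back to int is safe.
      int64_t rows_remaining = num_rows - rows_read;
      int max_tuples = std::min<int64_t>(batch_capacity, rows_remaining);
      std::cout << max_tuples << "\n";  // prints 1024
      return 0;
    }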
Testing:
- Added a regression test using a file with num_rows = 2*MAX_INT32.
- Locally ran test_scanners.py, which succeeded.
- Private core/hdfs run succeeded.

Change-Id: Ib9f8a6b83f8f621451d5977423ef81a6e4b124bd
Reviewed-on: http://gerrit.cloudera.org:8080/6286
Reviewed-by: Alex Behm <[email protected]>
Tested-by: Impala Public Jenkins

Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/d3cc23e5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/d3cc23e5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/d3cc23e5

Branch: refs/heads/master
Commit: d3cc23e569eb44a3b9b823bd99da783b8356c6d4
Parents: 082b3ef
Author: Alex Behm <[email protected]>
Authored: Mon Mar 6 23:50:59 2017 -0800
Committer: Impala Public Jenkins <[email protected]>
Committed: Wed Mar 8 02:00:30 2017 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-scanner.cc |  4 ++--
 testdata/data/README                |  5 +++++
 testdata/data/huge_num_rows.parquet | Bin 0 -> 314 bytes
 tests/query_test/test_scanners.py   | 15 +++++++++++++++
 4 files changed, 22 insertions(+), 2 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d3cc23e5/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index 4955949..bad6b33 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -427,8 +427,8 @@ Status HdfsParquetScanner::GetNextInternal(RowBatch* row_batch) {
     }
     assemble_rows_timer_.Start();
     DCHECK_LE(row_group_rows_read_, file_metadata_.num_rows);
-    int rows_remaining = file_metadata_.num_rows - row_group_rows_read_;
-    int max_tuples = min(row_batch->capacity(), rows_remaining);
+    int64_t rows_remaining = file_metadata_.num_rows - row_group_rows_read_;
+    int max_tuples = min<int64_t>(row_batch->capacity(), rows_remaining);
     TupleRow* current_row = row_batch->GetRow(row_batch->AddRow());
     int num_to_commit = WriteTemplateTuples(current_row, max_tuples);
     Status status = CommitRows(row_batch, num_to_commit);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d3cc23e5/testdata/data/README
----------------------------------------------------------------------
diff --git a/testdata/data/README b/testdata/data/README
index 3eb4ba0..465d80b 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -29,6 +29,11 @@ zero_rows_one_row_group.parquet:
 Generated by hacking Impala's Parquet writer.
 The file metadata indicates zero rows but one row group.
 
+huge_num_rows.parquet
+Generated by hacking Impala's Parquet writer.
+The file metadata indicates 2 * MAX_INT32 rows.
+The single row group also has the same number of rows in the metadata.
+
 repeated_values.parquet:
 Generated with parquet-mr 1.2.5
 Contains 3 single-column rows:

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d3cc23e5/testdata/data/huge_num_rows.parquet
----------------------------------------------------------------------
diff --git a/testdata/data/huge_num_rows.parquet b/testdata/data/huge_num_rows.parquet
new file mode 100644
index 0000000..a2d64d7
Binary files /dev/null and b/testdata/data/huge_num_rows.parquet differ

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d3cc23e5/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index ac94335..cb24923 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -288,6 +288,21 @@ class TestParquet(ImpalaTestSuite):
     vector.get_value('exec_option')['abort_on_error'] = 1
     self.run_test_case('QueryTest/parquet-zero-rows', vector, unique_database)
 
+  def test_huge_num_rows(self, vector, unique_database):
+    """IMPALA-5021: Tests that a zero-slot scan on a file with a huge num_rows in the
+    footer succeeds without errors."""
+    self.client.execute("create table %s.huge_num_rows (i int) stored as parquet"
+        % unique_database)
+    huge_num_rows_loc = get_fs_path(
+        "/test-warehouse/%s.db/%s" % (unique_database, "huge_num_rows"))
+    check_call(['hdfs', 'dfs', '-copyFromLocal',
+        os.environ['IMPALA_HOME'] + "/testdata/data/huge_num_rows.parquet",
+        huge_num_rows_loc])
+    result = self.client.execute("select count(*) from %s.huge_num_rows"
+        % unique_database)
+    assert len(result.data) == 1
+    assert "4294967294" in result.data
+
   def test_corrupt_rle_counts(self, vector, unique_database):
     """IMPALA-3646: Tests that a certain type of file corruption for plain
     dictionary encoded values is gracefully handled. Cases tested:
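To tie the fix back to the symptom, here is a hedged sketch (assumed batch
semantics, not the actual scanner API) of the zero-slot count(*) loop that the
new test exercises. Each pass stands in for one GetNextInternal() call; the
scan ends only when the committed row count reaches the footer's num_rows,
which is why a non-positive max_tuples stalled it indefinitely before the fix.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      const int64_t num_rows = 4294967294LL;  // footer value in huge_num_rows.parquet
      const int batch_capacity = 1024;        // assumed per-batch row limit
      int64_t rows_read = 0;

      // Each pass commits up to one batch of empty template tuples, the way
      // the zero-slot scan does. With the old 32-bit rows_remaining, max_tuples
      // could come out negative, rows_read would stay at 0, and this loop
      // would spin forever.
      while (rows_read < num_rows) {
        int64_t rows_remaining = num_rows - rows_read;
        int max_tuples = std::min<int64_t>(batch_capacity, rows_remaining);
        rows_read += max_tuples;
      }
      assert(rows_read == num_rows);  // count(*) reports 4294967294
      return 0;
    }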
