IMPALA-3943: Address post-merge comments. Adds code comments and issues a warning for Parquet files with num_rows=0 but at least one non-empty row group.
Change-Id: I72ccf00191afddb8583ac961f1eaf11e5eb28791 Reviewed-on: http://gerrit.cloudera.org:8080/4696 Reviewed-by: Alex Behm <alex.b...@cloudera.com> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2a04b0e2 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2a04b0e2 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2a04b0e2 Branch: refs/heads/master Commit: 2a04b0e21a763e3d1f4269e7a8c3027ff0cab934 Parents: 47b8aa3 Author: Alex Behm <alex.b...@cloudera.com> Authored: Wed Oct 12 13:31:54 2016 -0700 Committer: Internal Jenkins <cloudera-hud...@gerrit.cloudera.org> Committed: Fri Oct 14 05:41:22 2016 +0000 ---------------------------------------------------------------------- be/src/exec/hdfs-parquet-scanner.cc | 24 ++++++++++++++++++-- common/thrift/generate_error_codes.py | 3 +++ .../queries/QueryTest/parquet-zero-rows.test | 4 ++++ 3 files changed, 29 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2a04b0e2/be/src/exec/hdfs-parquet-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc index 7782e8a..86dbbf8 100644 --- a/be/src/exec/hdfs-parquet-scanner.cc +++ b/be/src/exec/hdfs-parquet-scanner.cc @@ -410,7 +410,10 @@ Status HdfsParquetScanner::NextRowGroup() { ++row_group_idx_; if (row_group_idx_ >= file_metadata_.row_groups.size()) break; const parquet::RowGroup& row_group = file_metadata_.row_groups[row_group_idx_]; - if (row_group.num_rows == 0 || file_metadata_.num_rows == 0) continue; + // Also check 'file_metadata_.num_rows' to make sure 'select count(*)' and 'select *' + // behave consistently for corrupt files that have 'file_metadata_.num_rows == 0' + // but some data in row groups. + if (row_group.num_rows == 0|| file_metadata_.num_rows == 0) continue; const DiskIoMgr::ScanRange* split_range = static_cast<ScanRangeMetadata*>( metadata_range_->meta_data())->original_split; @@ -897,7 +900,24 @@ Status HdfsParquetScanner::ProcessFooter() { RETURN_IF_ERROR(ParquetMetadataUtils::ValidateFileVersion(file_metadata_, filename())); // IMPALA-3943: Do not throw an error for empty files for backwards compatibility. - if (file_metadata_.num_rows == 0) return Status::OK(); + if (file_metadata_.num_rows == 0) { + // Warn if the num_rows is inconsistent with the row group metadata. + if (!file_metadata_.row_groups.empty()) { + bool has_non_empty_row_group = false; + for (const parquet::RowGroup& row_group : file_metadata_.row_groups) { + if (row_group.num_rows > 0) { + has_non_empty_row_group = true; + break; + } + } + // Warn if there is at least one non-empty row group. + if (has_non_empty_row_group) { + ErrorMsg msg(TErrorCode::PARQUET_ZERO_ROWS_IN_NON_EMPTY_FILE, filename()); + state_->LogError(msg); + } + } + return Status::OK(); + } // Parse out the created by application version string if (file_metadata_.__isset.created_by) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2a04b0e2/common/thrift/generate_error_codes.py ---------------------------------------------------------------------- diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py index 216d1c1..131947a 100755 --- a/common/thrift/generate_error_codes.py +++ b/common/thrift/generate_error_codes.py @@ -286,6 +286,9 @@ error_codes = ( "while spilling data to disk."), ("BUFFER_ALLOCATION_FAILED", 92, "Unexpected error allocating $0 byte buffer."), + + ("PARQUET_ZERO_ROWS_IN_NON_EMPTY_FILE", 93, "File '$0' is corrupt: metadata indicates " + "a zero row count but there is at least one non-empty row group."), ) import sys http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2a04b0e2/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test index e7de245..0dbf3e7 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test @@ -17,6 +17,8 @@ select * from zero_rows_one_row_group ---- TYPES int ---- RESULTS +---- ERRORS +File '__HDFS_FILENAME__' is corrupt: metadata indicates a zero row count but there is at least one non-empty row group. ==== ---- QUERY select count(*) from zero_rows_one_row_group @@ -24,4 +26,6 @@ select count(*) from zero_rows_one_row_group bigint ---- RESULTS 0 +---- ERRORS +File '__HDFS_FILENAME__' is corrupt: metadata indicates a zero row count but there is at least one non-empty row group. ====