IMPALA-3943: Address post-merge comments.

Adds code comments and issues a warning for Parquet files
with num_rows=0 but at least one non-empty row group.

Change-Id: I72ccf00191afddb8583ac961f1eaf11e5eb28791
Reviewed-on: http://gerrit.cloudera.org:8080/4696
Reviewed-by: Alex Behm <alex.b...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2a04b0e2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2a04b0e2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2a04b0e2

Branch: refs/heads/hadoop-next
Commit: 2a04b0e21a763e3d1f4269e7a8c3027ff0cab934
Parents: 47b8aa3
Author: Alex Behm <alex.b...@cloudera.com>
Authored: Wed Oct 12 13:31:54 2016 -0700
Committer: Internal Jenkins <cloudera-hud...@gerrit.cloudera.org>
Committed: Fri Oct 14 05:41:22 2016 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-scanner.cc             | 24 ++++++++++++++++++--
 common/thrift/generate_error_codes.py           |  3 +++
 .../queries/QueryTest/parquet-zero-rows.test    |  4 ++++
 3 files changed, 29 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2a04b0e2/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc 
b/be/src/exec/hdfs-parquet-scanner.cc
index 7782e8a..86dbbf8 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -410,7 +410,10 @@ Status HdfsParquetScanner::NextRowGroup() {
     ++row_group_idx_;
     if (row_group_idx_ >= file_metadata_.row_groups.size()) break;
     const parquet::RowGroup& row_group = 
file_metadata_.row_groups[row_group_idx_];
-    if (row_group.num_rows == 0 || file_metadata_.num_rows == 0) continue;
+    // Also check 'file_metadata_.num_rows' to make sure 'select count(*)' and 
'select *'
+    // behave consistently for corrupt files that have 
'file_metadata_.num_rows == 0'
+    // but some data in row groups.
+    if (row_group.num_rows == 0|| file_metadata_.num_rows == 0) continue;
 
     const DiskIoMgr::ScanRange* split_range = static_cast<ScanRangeMetadata*>(
         metadata_range_->meta_data())->original_split;
@@ -897,7 +900,24 @@ Status HdfsParquetScanner::ProcessFooter() {
   RETURN_IF_ERROR(ParquetMetadataUtils::ValidateFileVersion(file_metadata_, 
filename()));
 
   // IMPALA-3943: Do not throw an error for empty files for backwards 
compatibility.
-  if (file_metadata_.num_rows == 0) return Status::OK();
+  if (file_metadata_.num_rows == 0) {
+    // Warn if the num_rows is inconsistent with the row group metadata.
+    if (!file_metadata_.row_groups.empty()) {
+      bool has_non_empty_row_group = false;
+      for (const parquet::RowGroup& row_group : file_metadata_.row_groups) {
+        if (row_group.num_rows > 0) {
+          has_non_empty_row_group = true;
+          break;
+        }
+      }
+      // Warn if there is at least one non-empty row group.
+      if (has_non_empty_row_group) {
+        ErrorMsg msg(TErrorCode::PARQUET_ZERO_ROWS_IN_NON_EMPTY_FILE, 
filename());
+        state_->LogError(msg);
+      }
+    }
+    return Status::OK();
+  }
 
   // Parse out the created by application version string
   if (file_metadata_.__isset.created_by) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2a04b0e2/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py 
b/common/thrift/generate_error_codes.py
index 216d1c1..131947a 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -286,6 +286,9 @@ error_codes = (
    "while spilling data to disk."),
 
   ("BUFFER_ALLOCATION_FAILED", 92, "Unexpected error allocating $0 byte 
buffer."),
+
+  ("PARQUET_ZERO_ROWS_IN_NON_EMPTY_FILE", 93, "File '$0' is corrupt: metadata 
indicates "
+   "a zero row count but there is at least one non-empty row group."),
 )
 
 import sys

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2a04b0e2/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test
index e7de245..0dbf3e7 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test
@@ -17,6 +17,8 @@ select * from zero_rows_one_row_group
 ---- TYPES
 int
 ---- RESULTS
+---- ERRORS
+File '__HDFS_FILENAME__' is corrupt: metadata indicates a zero row count but 
there is at least one non-empty row group.
 ====
 ---- QUERY
 select count(*) from zero_rows_one_row_group
@@ -24,4 +26,6 @@ select count(*) from zero_rows_one_row_group
 bigint
 ---- RESULTS
 0
+---- ERRORS
+File '__HDFS_FILENAME__' is corrupt: metadata indicates a zero row count but 
there is at least one non-empty row group.
 ====

Reply via email to