IMPALA-3845: Split up hdfs-parquet-scanner.cc into more files/components. This patch refactors hdfs-parquet-scanner.cc into several files. The new responsibilities of each file/component are roughly as follows:
hdfs-parquet-scanner.h/cc - Creates column readers and uses them to materialize row batches. - Evaluates runtime filters and conjuncts, populates row batch queue. parquet-metadata-utils.h/cc - Contains utilities for validating Parquet file metadata. - Parses the schema of a Parquet file into our internal schema representation. - Resolves SchemaPaths (e.g. from a table descriptor) against the internal representation of the Parquet file schema. parquet-column-readers.h/cc - Contains the per-column data reading, parsing and value materialization logic. Testing: A private core/hdfs run passed. Change-Id: I4c5fd46f9c1a0ff2a4c30ea5a712fbae17c68f92 Reviewed-on: http://gerrit.cloudera.org:8080/3596 Tested-by: Internal Jenkins Reviewed-by: Alex Behm <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/6ee15fad Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/6ee15fad Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/6ee15fad Branch: refs/heads/master Commit: 6ee15fadedcac9d41f8ad660caf8d4a60267df8e Parents: baf8fe2 Author: Alex Behm <[email protected]> Authored: Tue May 17 10:46:36 2016 -0700 Committer: Taras Bobrovytsky <[email protected]> Committed: Fri Jul 15 18:27:05 2016 +0000 ---------------------------------------------------------------------- be/src/exec/CMakeLists.txt | 2 + be/src/exec/base-sequence-scanner.cc | 2 +- be/src/exec/hdfs-parquet-scanner.cc | 2316 +----------------------- be/src/exec/hdfs-parquet-scanner.h | 221 +-- be/src/exec/hdfs-rcfile-scanner.cc | 2 +- be/src/exec/hdfs-scanner.cc | 20 - be/src/exec/hdfs-scanner.h | 11 - be/src/exec/hdfs-text-scanner.cc | 3 +- be/src/exec/parquet-column-readers.cc | 1093 +++++++++++ be/src/exec/parquet-column-readers.h | 500 +++++ be/src/exec/parquet-metadata-utils.cc | 647 +++++++ be/src/exec/parquet-metadata-utils.h | 202 +++ be/src/exec/parquet-scratch-tuple-batch.h | 72 
+ be/src/exec/parquet-version-test.cc | 7 +- be/src/exprs/expr-value.h | 2 +- be/src/runtime/runtime-state.cc | 14 + be/src/runtime/runtime-state.h | 6 + be/src/util/debug-util.cc | 8 + be/src/util/debug-util.h | 18 + 19 files changed, 2684 insertions(+), 2462 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6ee15fad/be/src/exec/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/be/src/exec/CMakeLists.txt b/be/src/exec/CMakeLists.txt index 876fc7e..7cf4267 100644 --- a/be/src/exec/CMakeLists.txt +++ b/be/src/exec/CMakeLists.txt @@ -62,6 +62,8 @@ add_library(Exec hbase-table-scanner.cc incr-stats-util.cc nested-loop-join-node.cc + parquet-column-readers.cc + parquet-metadata-utils.cc partitioned-aggregation-node.cc partitioned-aggregation-node-ir.cc partitioned-hash-join-node.cc http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6ee15fad/be/src/exec/base-sequence-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/base-sequence-scanner.cc b/be/src/exec/base-sequence-scanner.cc index dc7a983..268fdae 100644 --- a/be/src/exec/base-sequence-scanner.cc +++ b/be/src/exec/base-sequence-scanner.cc @@ -124,7 +124,7 @@ Status BaseSequenceScanner::ProcessSplit() { header_ = state_->obj_pool()->Add(AllocateFileHeader()); Status status = ReadFileHeader(); if (!status.ok()) { - RETURN_IF_ERROR(LogOrReturnError(status.msg())); + RETURN_IF_ERROR(state_->LogOrReturnError(status.msg())); // We need to complete the ranges for this file. CloseFileRanges(stream_->filename()); return Status::OK();
