IMPALA-5840: Don't write page-level statistics in Parquet files. Page-level statistics in Parquet files are expected to be deprecated in favor of page indexes (PARQUET-922). This change disables writing statistics to pages. Impala is currently the only project writing them. Neither Impala nor other projects make use of them right now, and by no longer writing them we prevent others from depending on soon-to-be-deprecated fields.
Change-Id: I1b05131320370171d76e93a46b04880a7f9b6d84 Reviewed-on: http://gerrit.cloudera.org:8080/7817 Reviewed-by: Lars Volker <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/1faf89f0 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/1faf89f0 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/1faf89f0 Branch: refs/heads/branch-2.10.0 Commit: 1faf89f047e7d78c3a1f3b518269a3ae21a4ddea Parents: 73cb9b8 Author: Lars Volker <[email protected]> Authored: Thu Aug 24 14:59:22 2017 -0700 Committer: Tim Armstrong <[email protected]> Committed: Wed Aug 30 14:54:49 2017 -0700 ---------------------------------------------------------------------- be/src/exec/hdfs-parquet-table-writer.cc | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1faf89f0/be/src/exec/hdfs-parquet-table-writer.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-parquet-table-writer.cc b/be/src/exec/hdfs-parquet-table-writer.cc index 237dd83..4bbadb4 100644 --- a/be/src/exec/hdfs-parquet-table-writer.cc +++ b/be/src/exec/hdfs-parquet-table-writer.cc @@ -385,7 +385,8 @@ class HdfsParquetTableWriter::ColumnWriter : // Temporary string value to hold CHAR(N) StringValue temp_; - // Tracks statistics per page. + // Tracks statistics per page. These are not written out currently but are merged into + // the row group stats. TODO(IMPALA-5841): Write these to the page index. scoped_ptr<ColumnStats<T>> page_stats_; // Tracks statistics per row group. This gets reset when starting a new row group. @@ -453,7 +454,8 @@ class HdfsParquetTableWriter::BoolColumnWriter : // Used to encode bools as single bit values. This is reused across pages. 
BitWriter* bool_values_; - // Tracks statistics per page. + // Tracks statistics per page. These are not written out currently but are merged into + // the row group stats. TODO(IMPALA-5841): Write these to the page index. ColumnStats<bool> page_stats_; // Tracks statistics per row group. This gets reset when starting a new file. @@ -695,15 +697,9 @@ Status HdfsParquetTableWriter::BaseColumnWriter::FinalizeCurrentPage() { max_compressed_size - header.compressed_page_size); } - // Build page statistics and add them to the header. - DCHECK(page_stats_base_ != nullptr); - if (page_stats_base_->BytesNeeded() <= MAX_COLUMN_STATS_SIZE) { - page_stats_base_->EncodeToThrift(&header.data_page_header.statistics); - header.data_page_header.__isset.statistics = true; - } - // Update row group statistics from page statistics. DCHECK(row_group_stats_base_ != nullptr); + DCHECK(page_stats_base_ != nullptr); row_group_stats_base_->Merge(*page_stats_base_); // Add the size of the data page header
