IMPALA-5840: Don't write page-level statistics in Parquet files. Page-level statistics in Parquet files are expected to be deprecated in favor of page indexes (PARQUET-922). This change disables writing statistics to pages. Impala is currently the only project writing them. Neither Impala nor other projects make use of these right now, and by not writing them anymore we prevent others from depending on soon-to-be-deprecated fields.
Change-Id: I1b05131320370171d76e93a46b04880a7f9b6d84 Reviewed-on: http://gerrit.cloudera.org:8080/7817 Reviewed-by: Lars Volker <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/8149d0e5 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/8149d0e5 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/8149d0e5 Branch: refs/heads/master Commit: 8149d0e5778525cb4988953377685946f31d70a2 Parents: f6c38ac Author: Lars Volker <[email protected]> Authored: Thu Aug 24 14:59:22 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Tue Aug 29 05:22:51 2017 +0000 ---------------------------------------------------------------------- be/src/exec/hdfs-parquet-table-writer.cc | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8149d0e5/be/src/exec/hdfs-parquet-table-writer.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-parquet-table-writer.cc b/be/src/exec/hdfs-parquet-table-writer.cc index 237dd83..4bbadb4 100644 --- a/be/src/exec/hdfs-parquet-table-writer.cc +++ b/be/src/exec/hdfs-parquet-table-writer.cc @@ -385,7 +385,8 @@ class HdfsParquetTableWriter::ColumnWriter : // Temporary string value to hold CHAR(N) StringValue temp_; - // Tracks statistics per page. + // Tracks statistics per page. These are not written out currently but are merged into + // the row group stats. TODO(IMPALA-5841): Write these to the page index. scoped_ptr<ColumnStats<T>> page_stats_; // Tracks statistics per row group. This gets reset when starting a new row group. @@ -453,7 +454,8 @@ class HdfsParquetTableWriter::BoolColumnWriter : // Used to encode bools as single bit values. This is reused across pages. 
BitWriter* bool_values_; - // Tracks statistics per page. + // Tracks statistics per page. These are not written out currently but are merged into + // the row group stats. TODO(IMPALA-5841): Write these to the page index. ColumnStats<bool> page_stats_; // Tracks statistics per row group. This gets reset when starting a new file. @@ -695,15 +697,9 @@ Status HdfsParquetTableWriter::BaseColumnWriter::FinalizeCurrentPage() { max_compressed_size - header.compressed_page_size); } - // Build page statistics and add them to the header. - DCHECK(page_stats_base_ != nullptr); - if (page_stats_base_->BytesNeeded() <= MAX_COLUMN_STATS_SIZE) { - page_stats_base_->EncodeToThrift(&header.data_page_header.statistics); - header.data_page_header.__isset.statistics = true; - } - // Update row group statistics from page statistics. DCHECK(row_group_stats_base_ != nullptr); + DCHECK(page_stats_base_ != nullptr); row_group_stats_base_->Merge(*page_stats_base_); // Add the size of the data page header
