This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 6959b4b08 Only increment metrics for data pages (#4285)
6959b4b08 is described below

commit 6959b4b08a78dd924d0044c64ac3b3a9b9fd3d2e
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Fri May 26 13:05:30 2023 +0100

    Only increment metrics for data pages (#4285)
---
 parquet/src/column/writer/mod.rs | 6 +++---
 parquet/src/file/writer.rs       | 6 +-----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 137893092..3fcfe6c19 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -915,11 +915,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
     fn update_metrics_for_page(&mut self, page_spec: PageWriteSpec) {
         self.column_metrics.total_uncompressed_size += page_spec.uncompressed_size as u64;
         self.column_metrics.total_compressed_size += page_spec.compressed_size as u64;
-        self.column_metrics.total_num_values += page_spec.num_values as u64;
         self.column_metrics.total_bytes_written += page_spec.bytes_written;
 
         match page_spec.page_type {
             PageType::DATA_PAGE | PageType::DATA_PAGE_V2 => {
+                self.column_metrics.total_num_values += page_spec.num_values as u64;
                 if self.column_metrics.data_page_offset.is_none() {
                     self.column_metrics.data_page_offset = Some(page_spec.offset);
                 }
@@ -1512,7 +1512,7 @@ mod tests {
             metadata.encodings(),
             &vec![Encoding::PLAIN, Encoding::RLE, Encoding::RLE_DICTIONARY]
         );
-        assert_eq!(metadata.num_values(), 8); // dictionary + value indexes
+        assert_eq!(metadata.num_values(), 4);
         assert_eq!(metadata.compressed_size(), 20);
         assert_eq!(metadata.uncompressed_size(), 20);
         assert_eq!(metadata.data_page_offset(), 0);
@@ -1639,7 +1639,7 @@ mod tests {
             metadata.encodings(),
             &vec![Encoding::PLAIN, Encoding::RLE, Encoding::RLE_DICTIONARY]
         );
-        assert_eq!(metadata.num_values(), 8); // dictionary + value indexes
+        assert_eq!(metadata.num_values(), 4);
         assert_eq!(metadata.compressed_size(), 20);
         assert_eq!(metadata.uncompressed_size(), 20);
         assert_eq!(metadata.data_page_offset(), 0);
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 4b1c4bad9..c1c8db955 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -26,7 +26,6 @@ use std::io::{BufWriter, IoSlice, Read};
 use std::{io::Write, sync::Arc};
 use thrift::protocol::{TCompactOutputProtocol, TSerializable};
 
-use crate::basic::PageType;
 use crate::column::writer::{
     get_typed_column_writer_mut, ColumnCloseResult, ColumnWriterImpl,
 };
@@ -778,10 +777,7 @@ impl<'a, W: Write> PageWriter for SerializedPageWriter<'a, W> {
         spec.compressed_size = compressed_size + header_size;
         spec.offset = start_pos;
         spec.bytes_written = self.sink.bytes_written() as u64 - start_pos;
-        // Number of values is incremented for data pages only
-        if page_type == PageType::DATA_PAGE || page_type == PageType::DATA_PAGE_V2 {
-            spec.num_values = num_values;
-        }
+        spec.num_values = num_values;
 
         Ok(spec)
     }

Reply via email to