This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 6959b4b08 Only increment metrics for data pages (#4285)
6959b4b08 is described below
commit 6959b4b08a78dd924d0044c64ac3b3a9b9fd3d2e
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Fri May 26 13:05:30 2023 +0100
Only increment metrics for data pages (#4285)
---
parquet/src/column/writer/mod.rs | 6 +++---
parquet/src/file/writer.rs | 6 +-----
2 files changed, 4 insertions(+), 8 deletions(-)
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 137893092..3fcfe6c19 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -915,11 +915,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
fn update_metrics_for_page(&mut self, page_spec: PageWriteSpec) {
self.column_metrics.total_uncompressed_size +=
page_spec.uncompressed_size as u64;
self.column_metrics.total_compressed_size += page_spec.compressed_size as u64;
- self.column_metrics.total_num_values += page_spec.num_values as u64;
self.column_metrics.total_bytes_written += page_spec.bytes_written;
match page_spec.page_type {
PageType::DATA_PAGE | PageType::DATA_PAGE_V2 => {
+ self.column_metrics.total_num_values += page_spec.num_values as u64;
if self.column_metrics.data_page_offset.is_none() {
self.column_metrics.data_page_offset =
Some(page_spec.offset);
}
@@ -1512,7 +1512,7 @@ mod tests {
metadata.encodings(),
&vec![Encoding::PLAIN, Encoding::RLE, Encoding::RLE_DICTIONARY]
);
- assert_eq!(metadata.num_values(), 8); // dictionary + value indexes
+ assert_eq!(metadata.num_values(), 4);
assert_eq!(metadata.compressed_size(), 20);
assert_eq!(metadata.uncompressed_size(), 20);
assert_eq!(metadata.data_page_offset(), 0);
@@ -1639,7 +1639,7 @@ mod tests {
metadata.encodings(),
&vec![Encoding::PLAIN, Encoding::RLE, Encoding::RLE_DICTIONARY]
);
- assert_eq!(metadata.num_values(), 8); // dictionary + value indexes
+ assert_eq!(metadata.num_values(), 4);
assert_eq!(metadata.compressed_size(), 20);
assert_eq!(metadata.uncompressed_size(), 20);
assert_eq!(metadata.data_page_offset(), 0);
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 4b1c4bad9..c1c8db955 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -26,7 +26,6 @@ use std::io::{BufWriter, IoSlice, Read};
use std::{io::Write, sync::Arc};
use thrift::protocol::{TCompactOutputProtocol, TSerializable};
-use crate::basic::PageType;
use crate::column::writer::{
get_typed_column_writer_mut, ColumnCloseResult, ColumnWriterImpl,
};
@@ -778,10 +777,7 @@ impl<'a, W: Write> PageWriter for SerializedPageWriter<'a, W> {
spec.compressed_size = compressed_size + header_size;
spec.offset = start_pos;
spec.bytes_written = self.sink.bytes_written() as u64 - start_pos;
- // Number of values is incremented for data pages only
- if page_type == PageType::DATA_PAGE || page_type == PageType::DATA_PAGE_V2 {
- spec.num_values = num_values;
- }
+ spec.num_values = num_values;
Ok(spec)
}