This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 06a0157700 add buffered data_pages to parquet column writer total bytes estimation (#6862)
06a0157700 is described below
commit 06a015770098a569b67855dfaa18bdfa7c18ff92
Author: Onur Satici <[email protected]>
AuthorDate: Wed Dec 11 13:23:05 2024 +0000
add buffered data_pages to parquet column writer total bytes estimation (#6862)
* add buffered data_pages to parquet column writer memory size estimation
* move to get estimated total bytes
---
parquet/src/column/writer/mod.rs | 26 +++++++++++++++++++++++++-
1 file changed, 25 insertions(+), 1 deletion(-)
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 8d0be5f9f8..16de0ba789 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -574,7 +574,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E>
{
/// anticipated encoded size.
#[cfg(feature = "arrow")]
pub(crate) fn get_estimated_total_bytes(&self) -> u64 {
- self.column_metrics.total_bytes_written
+ self.data_pages
+ .iter()
+ .map(|page| page.data().len() as u64)
+ .sum::<u64>()
+ + self.column_metrics.total_bytes_written
+ self.encoder.estimated_data_page_size() as u64
+ self.encoder.estimated_dict_page_size().unwrap_or_default() as u64
}
@@ -3422,6 +3426,26 @@ mod tests {
assert!(stats.max_bytes_opt().is_none());
}
+ #[test]
+ #[cfg(feature = "arrow")]
+ fn test_column_writer_get_estimated_total_bytes() {
+ let page_writer = get_test_page_writer();
+ let props = Default::default();
+ let mut writer = get_test_column_writer::<Int32Type>(page_writer, 0, 0, props);
+ assert_eq!(writer.get_estimated_total_bytes(), 0);
+
+ writer.write_batch(&[1, 2, 3, 4], None, None).unwrap();
+ writer.add_data_page().unwrap();
+ let size_with_one_page = writer.get_estimated_total_bytes();
+ assert_eq!(size_with_one_page, 20);
+
+ writer.write_batch(&[5, 6, 7, 8], None, None).unwrap();
+ writer.add_data_page().unwrap();
+ let size_with_two_pages = writer.get_estimated_total_bytes();
+ // different pages have different compressed lengths
+ assert_eq!(size_with_two_pages, 20 + 21);
+ }
+
fn write_multiple_pages<T: DataType>(
column_descr: &Arc<ColumnDescriptor>,
pages: &[&[Option<T::T>]],