This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 06a0157700 add buffered data_pages to parquet column writer total bytes estimation (#6862)
06a0157700 is described below

commit 06a015770098a569b67855dfaa18bdfa7c18ff92
Author: Onur Satici <[email protected]>
AuthorDate: Wed Dec 11 13:23:05 2024 +0000

    add buffered data_pages to parquet column writer total bytes estimation (#6862)
    
    * add buffered data_pages to parquet column writer memory size estimation
    
    * move to get estimated total bytes
---
 parquet/src/column/writer/mod.rs | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 8d0be5f9f8..16de0ba789 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -574,7 +574,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
     /// anticipated encoded size.
     #[cfg(feature = "arrow")]
     pub(crate) fn get_estimated_total_bytes(&self) -> u64 {
-        self.column_metrics.total_bytes_written
+        self.data_pages
+            .iter()
+            .map(|page| page.data().len() as u64)
+            .sum::<u64>()
+            + self.column_metrics.total_bytes_written
             + self.encoder.estimated_data_page_size() as u64
+            + self.encoder.estimated_dict_page_size().unwrap_or_default() as u64
     }
@@ -3422,6 +3426,26 @@ mod tests {
         assert!(stats.max_bytes_opt().is_none());
     }
 
+    #[test]
+    #[cfg(feature = "arrow")]
+    fn test_column_writer_get_estimated_total_bytes() {
+        let page_writer = get_test_page_writer();
+        let props = Default::default();
+        let mut writer = get_test_column_writer::<Int32Type>(page_writer, 0, 0, props);
+        assert_eq!(writer.get_estimated_total_bytes(), 0);
+
+        writer.write_batch(&[1, 2, 3, 4], None, None).unwrap();
+        writer.add_data_page().unwrap();
+        let size_with_one_page = writer.get_estimated_total_bytes();
+        assert_eq!(size_with_one_page, 20);
+
+        writer.write_batch(&[5, 6, 7, 8], None, None).unwrap();
+        writer.add_data_page().unwrap();
+        let size_with_two_pages = writer.get_estimated_total_bytes();
+        // different pages have different compressed lengths
+        assert_eq!(size_with_two_pages, 20 + 21);
+    }
+
     fn write_multiple_pages<T: DataType>(
         column_descr: &Arc<ColumnDescriptor>,
         pages: &[&[Option<T::T>]],

Reply via email to