This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 6ab853d38f Do not write `ColumnIndex` for null columns when not writing page statistics (#6011)
6ab853d38f is described below

commit 6ab853d38fc31789e43d10d0d27d3a553439e487
Author: Ed Seidl <[email protected]>
AuthorDate: Tue Jul 16 15:33:15 2024 -0700

    Do not write `ColumnIndex` for null columns when not writing page statistics (#6011)
    
    * disable column_index_builder if no page stats are collected
    
    * add test
    
    * no need to clone descr
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 parquet/src/column/writer/mod.rs | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 8594e59714..fdc24890e1 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -260,6 +260,12 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
         // Used for level information
         encodings.insert(Encoding::RLE);
 
+        // Disable column_index_builder if not collecting page statistics.
+        let mut column_index_builder = ColumnIndexBuilder::new();
+        if statistics_enabled != EnabledStatistics::Page {
+            column_index_builder.to_invalid()
+        }
+
         Self {
             descr,
             props,
@@ -289,7 +295,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
                 num_column_nulls: 0,
                 column_distinct_count: None,
             },
-            column_index_builder: ColumnIndexBuilder::new(),
+            column_index_builder,
             offset_index_builder: OffsetIndexBuilder::new(),
             encodings,
             data_page_boundary_ascending: true,
@@ -3020,6 +3026,30 @@ mod tests {
         assert!(incremented.is_none())
     }
 
+    #[test]
+    fn test_no_column_index_when_stats_disabled() {
+        // https://github.com/apache/arrow-rs/issues/6010
+        // Test that column index is not created/written for all-nulls column when page
+        // statistics are disabled.
+        let descr = Arc::new(get_test_column_descr::<Int32Type>(1, 0));
+        let props = Arc::new(
+            WriterProperties::builder()
+                .set_statistics_enabled(EnabledStatistics::None)
+                .build(),
+        );
+        let column_writer = get_column_writer(descr, props, get_test_page_writer());
+        let mut writer = get_typed_column_writer::<Int32Type>(column_writer);
+
+        let data = Vec::new();
+        let def_levels = vec![0; 10];
+        writer.write_batch(&data, Some(&def_levels), None).unwrap();
+        writer.flush_data_pages().unwrap();
+
+        let column_close_result = writer.close().unwrap();
+        assert!(column_close_result.offset_index.is_some());
+        assert!(column_close_result.column_index.is_none());
+    }
+
     #[test]
     fn test_boundary_order() -> Result<()> {
         let descr = Arc::new(get_test_column_descr::<Int32Type>(1, 0));

Reply via email to