This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 6ab853d38f Do not write `ColumnIndex` for null columns when not
writing page statistics (#6011)
6ab853d38f is described below
commit 6ab853d38fc31789e43d10d0d27d3a553439e487
Author: Ed Seidl <[email protected]>
AuthorDate: Tue Jul 16 15:33:15 2024 -0700
Do not write `ColumnIndex` for null columns when not writing page
statistics (#6011)
* disable column_index_builder if no page stats are collected
* add test
* no need to clone descr
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet/src/column/writer/mod.rs | 32 +++++++++++++++++++++++++++++++-
1 file changed, 31 insertions(+), 1 deletion(-)
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 8594e59714..fdc24890e1 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -260,6 +260,12 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E>
{
// Used for level information
encodings.insert(Encoding::RLE);
+ // Disable column_index_builder if not collecting page statistics.
+ let mut column_index_builder = ColumnIndexBuilder::new();
+ if statistics_enabled != EnabledStatistics::Page {
+ column_index_builder.to_invalid()
+ }
+
Self {
descr,
props,
@@ -289,7 +295,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
num_column_nulls: 0,
column_distinct_count: None,
},
- column_index_builder: ColumnIndexBuilder::new(),
+ column_index_builder,
offset_index_builder: OffsetIndexBuilder::new(),
encodings,
data_page_boundary_ascending: true,
@@ -3020,6 +3026,30 @@ mod tests {
assert!(incremented.is_none())
}
+ #[test]
+ fn test_no_column_index_when_stats_disabled() {
+ // https://github.com/apache/arrow-rs/issues/6010
+ // Test that column index is not created/written for all-nulls column when page
+ // statistics are disabled.
+ let descr = Arc::new(get_test_column_descr::<Int32Type>(1, 0));
+ let props = Arc::new(
+ WriterProperties::builder()
+ .set_statistics_enabled(EnabledStatistics::None)
+ .build(),
+ );
+ let column_writer = get_column_writer(descr, props, get_test_page_writer());
+ let mut writer = get_typed_column_writer::<Int32Type>(column_writer);
+
+ let data = Vec::new();
+ let def_levels = vec![0; 10];
+ writer.write_batch(&data, Some(&def_levels), None).unwrap();
+ writer.flush_data_pages().unwrap();
+
+ let column_close_result = writer.close().unwrap();
+ assert!(column_close_result.offset_index.is_some());
+ assert!(column_close_result.column_index.is_none());
+ }
+
#[test]
fn test_boundary_order() -> Result<()> {
let descr = Arc::new(get_test_column_descr::<Int32Type>(1, 0));