alamb commented on code in PR #7555: URL: https://github.com/apache/arrow-rs/pull/7555#discussion_r2118165341
########## parquet/src/arrow/arrow_writer/mod.rs: ########## @@ -3766,4 +3769,67 @@ mod tests { .unwrap(); assert_eq!(batches.len(), 0); } + + #[test] + fn test_page_stats_truncation() { + let string_field = Field::new("a", DataType::Utf8, false); + let binary_field = Field::new("b", DataType::Binary, false); + let schema = Schema::new(vec![string_field, binary_field]); + + let raw_string_values = vec!["Blart Versenwald III"]; + let raw_binary_values = [b"Blart Versenwald III".to_vec()]; + let raw_binary_value_refs = raw_binary_values + .iter() + .map(|x| x.as_slice()) + .collect::<Vec<_>>(); + + let string_values = StringArray::from(raw_string_values.clone()); + let binary_values = BinaryArray::from(raw_binary_value_refs); + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(string_values), Arc::new(binary_values)], + ) + .unwrap(); + + let props = WriterProperties::builder() + .set_statistics_truncate_length(Some(2)) + .set_dictionary_enabled(false) + .set_encoding(Encoding::PLAIN) + .set_compression(crate::basic::Compression::UNCOMPRESSED) + .build(); + + let mut file = roundtrip_opts(&batch, props); + + // read file and decode page headers Review Comment: It took me a while to realize that there are statistics embedded in the page headers for which there is no corresponding Rust API. I think a comment like this would help: ```suggestion // read file and decode page headers // Note: use the thrift API as there is no Rust API to access the statistics in the page headers ``` ########## parquet/src/column/writer/mod.rs: ########## @@ -949,6 +949,58 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .unwrap_or_else(|| (data.to_vec(), false)) } + // Truncate the min and max values in a page or column chunk Statistics Review Comment: Pedantic suggestion: ```suggestion /// Truncate the min and max values that will be written to a data page /// header or column chunk Statistics ``` -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org