rluvaton commented on code in PR #9700:
URL: https://github.com/apache/arrow-rs/pull/9700#discussion_r3116956801
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -2572,6 +2574,91 @@ mod tests {
);
}
+ #[test]
+ fn arrow_writer_dictionary_fallback_on_unfavorable_compression() {
+ let schema = Arc::new(Schema::new(vec![Field::new("col",
DataType::Utf8, false)]));
+
+ let mut builder = StringBuilder::with_capacity(100, 329 * 10_000);
+
+ // Generate an array of 10 unique 10 character strings.
+ // This results in a dictionary encoding larger than the plain encoded data,
+ // which should trigger a fallback to PLAIN encoding.
+ for i in 0..10 {
+ let value = i
+ .to_string()
+ .repeat(10)
+ .chars()
+ .take(10)
+ .collect::<String>();
+
+ builder.append_value(value);
+ }
+
+ let array = Arc::new(builder.finish());
+
+ let batch = RecordBatch::try_new(schema, vec![array.clone()]).unwrap();
+
+ let file = tempfile::tempfile().unwrap();
+
+ // Set dictionary fallback to trigger fallback to PLAIN encoding on unfavorable compression
+ let props = WriterProperties::builder()
+ .set_dictionary_fallback(DictionaryFallback::OnUnfavorableAfter(1))
+ .set_data_page_row_count_limit(2)
+ .set_write_batch_size(1)
+ .build();
+
+ let mut writer =
+ ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(),
Some(props))
+ .expect("Unable to write file");
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+
+ let options = ReadOptionsBuilder::new()
+ .with_encoding_stats_as_mask(false)
+ .build();
+ let reader =
+ SerializedFileReader::new_with_options(file.try_clone().unwrap(),
options).unwrap();
+
+ let column = reader.metadata().row_group(0).columns();
+
+ assert_eq!(column.len(), 1);
+
+ // check page encoding stats, should be one dict page, one dict encoded page, and 5
+ // plain encoded pages
+ let stats = column[0].page_encoding_stats().unwrap();
+ println!("pes: {stats:?}");
+ assert!(
+ stats
+ .iter()
+ .any(|s| s.page_type == PageType::DICTIONARY_PAGE)
+ );
Review Comment:
```suggestion
assert!(
stats
.iter()
.any(|s| s.page_type == PageType::DICTIONARY_PAGE),
"stats are {stats:?}"
);
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]