alamb commented on code in PR #10142:
URL: https://github.com/apache/arrow-rs/pull/10142#discussion_r3438558241
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -2094,6 +2105,82 @@ mod tests {
assert_eq!(read_values, values);
}
+ /// The dictionary page is routed through the [`PageStore`] like any other
+ /// page rather than held resident in memory, so a dictionary column chunk's
+ /// *entire* serialized size — dictionary page included — passes through the
+ /// store.
+ #[test]
+ fn dictionary_page_is_routed_through_the_store() {
+ /// A store that sums the bytes handed to `put`.
+ #[derive(Debug, Default)]
+ struct SizeRecordingPageStore {
+ blobs: Vec<Bytes>,
+ bytes_put: Arc<std::sync::atomic::AtomicUsize>,
+ }
+ impl PageStore for SizeRecordingPageStore {
+ fn put(&mut self, value: Bytes) -> Result<PageKey> {
+ self.bytes_put
+ .fetch_add(value.len(),
std::sync::atomic::Ordering::Relaxed);
+ let key = PageKey::new(self.blobs.len() as u64);
+ self.blobs.push(value);
+ Ok(key)
+ }
+ fn take(&mut self, key: PageKey) -> Result<Bytes> {
+ Ok(std::mem::take(&mut self.blobs[key.get() as usize]))
+ }
+ }
+ #[derive(Debug)]
+ struct Factory {
+ bytes_put: Arc<std::sync::atomic::AtomicUsize>,
+ }
+ impl PageStoreFactory for Factory {
+ fn create(&self, _args: &PageStoreArgs<'_>) -> Result<Box<dyn
PageStore>> {
+ Ok(Box::new(SizeRecordingPageStore {
+ bytes_put: self.bytes_put.clone(),
+ ..Default::default()
+ }))
+ }
+ }
+
+ let schema = Arc::new(Schema::new(vec![Field::new("s", DataType::Utf8,
false)]));
+ // Low cardinality keeps the column dictionary-encoded with a real,
+ // non-empty dictionary page.
+ let values: Vec<&str> = (0..2048)
+ .map(|i| ["alpha", "beta", "gamma", "delta"][i % 4])
+ .collect();
+ let batch = RecordBatch::try_new(schema.clone(),
vec![Arc::new(StringArray::from(values))])
+ .unwrap();
+
+ let bytes_put = Arc::new(std::sync::atomic::AtomicUsize::new(0));
+ let opts =
ArrowWriterOptions::new().with_page_store_factory(Arc::new(Factory {
+ bytes_put: bytes_put.clone(),
+ }));
+
+ // A single batch / single column means exactly one row group and one
+ // store instance, so the bytes it saw map to one column chunk.
+ let mut buffer = Vec::new();
+ let mut writer =
+ ArrowWriter::try_new_with_options(&mut buffer, schema.clone(),
opts).unwrap();
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+
+ let reader = SerializedFileReader::new(Bytes::from(buffer)).unwrap();
+ let column = reader.metadata().row_group(0).column(0);
+ assert!(
+ column.dictionary_page_offset().is_some(),
+ "expected the column to be dictionary-encoded"
+ );
+
+ // The bytes the store was handed must account for the whole chunk,
Review Comment:
👍
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]