This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 8d6cd7627c Ensure page encoding statistics are written to Parquet file (#7643)
8d6cd7627c is described below
commit 8d6cd7627ce364b0e1b82ce6e26d1fc969e0dd53
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Jun 11 21:18:34 2025 -0700
Ensure page encoding statistics are written to Parquet file (#7643)
---
parquet/src/arrow/arrow_writer/mod.rs | 45 +++++++++++++++++++++++++++++++++++
parquet/src/arrow/mod.rs | 2 +-
parquet/src/file/writer.rs | 3 +++
3 files changed, 49 insertions(+), 1 deletion(-)
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 147c553443..5dc59d790d 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1328,6 +1328,7 @@ mod tests {
use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
use crate::arrow::ARROW_SCHEMA_META_KEY;
+ use crate::file::page_encoding_stats::PageEncodingStats;
use crate::format::PageHeader;
use crate::thrift::TCompactSliceInputProtocol;
use arrow::datatypes::ToByteSlice;
@@ -3835,4 +3836,48 @@ mod tests {
assert_eq!(stats.max_value.unwrap(), "Bm".as_bytes());
assert_eq!(stats.min_value.unwrap(), "Bl".as_bytes());
}
+
+ #[test]
+ fn test_page_encoding_statistics_roundtrip() {
+ let batch_schema = Schema::new(vec![Field::new(
+ "int32",
+ arrow_schema::DataType::Int32,
+ false,
+ )]);
+
+ let batch = RecordBatch::try_new(
+ Arc::new(batch_schema.clone()),
+ vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _],
+ )
+ .unwrap();
+
+ let mut file: File = tempfile::tempfile().unwrap();
+ let mut writer = ArrowWriter::try_new(&mut file, Arc::new(batch_schema), None).unwrap();
+ writer.write(&batch).unwrap();
+ let file_metadata = writer.close().unwrap();
+
+ assert_eq!(file_metadata.row_groups.len(), 1);
+ assert_eq!(file_metadata.row_groups[0].columns.len(), 1);
+ let chunk_meta = file_metadata.row_groups[0].columns[0]
+ .meta_data
+ .as_ref()
+ .expect("column metadata missing");
+ assert!(chunk_meta.encoding_stats.is_some());
+ let chunk_page_stats = chunk_meta.encoding_stats.as_ref().unwrap();
+
+ // check that the read metadata is also correct
+ let options = ReadOptionsBuilder::new().with_page_index().build();
+ let reader = SerializedFileReader::new_with_options(file, options).unwrap();
+
+ let rowgroup = reader.get_row_group(0).expect("row group missing");
+ assert_eq!(rowgroup.num_columns(), 1);
+ let column = rowgroup.metadata().column(0);
+ assert!(column.page_encoding_stats().is_some());
+ let file_page_stats = column.page_encoding_stats().unwrap();
+ let chunk_stats: Vec<PageEncodingStats> = chunk_page_stats
+ .iter()
+ .map(|x| crate::file::page_encoding_stats::try_from_thrift(x).unwrap())
+ .collect();
+ assert_eq!(&chunk_stats, file_page_stats);
+ }
}
diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index e15a8d9b02..e33d6a05a7 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -481,7 +481,7 @@ mod test {
.unwrap();
assert_eq!(
err.to_string(),
- "EOF: Parquet file too small. Page index range 82..115 overlaps with file metadata 0..341"
+ "EOF: Parquet file too small. Page index range 82..115 overlaps with file metadata 0..357"
);
}
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 0298d8a51d..0589d09330 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -689,6 +689,9 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> {
if let Some(statistics) = metadata.statistics() {
builder = builder.set_statistics(statistics.clone())
}
+ if let Some(page_encoding_stats) = metadata.page_encoding_stats() {
+ builder = builder.set_page_encoding_stats(page_encoding_stats.clone())
+ }
builder = self.set_column_crypto_metadata(builder, &metadata);
close.metadata = builder.build()?;