This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 8d6cd7627c Ensure page encoding statistics are written to Parquet file (#7643)
8d6cd7627c is described below

commit 8d6cd7627ce364b0e1b82ce6e26d1fc969e0dd53
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Jun 11 21:18:34 2025 -0700

    Ensure page encoding statistics are written to Parquet file (#7643)
---
 parquet/src/arrow/arrow_writer/mod.rs | 45 +++++++++++++++++++++++++++++++++++
 parquet/src/arrow/mod.rs              |  2 +-
 parquet/src/file/writer.rs            |  3 +++
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 147c553443..5dc59d790d 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1328,6 +1328,7 @@ mod tests {
 
     use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
     use crate::arrow::ARROW_SCHEMA_META_KEY;
+    use crate::file::page_encoding_stats::PageEncodingStats;
     use crate::format::PageHeader;
     use crate::thrift::TCompactSliceInputProtocol;
     use arrow::datatypes::ToByteSlice;
@@ -3835,4 +3836,48 @@ mod tests {
         assert_eq!(stats.max_value.unwrap(), "Bm".as_bytes());
         assert_eq!(stats.min_value.unwrap(), "Bl".as_bytes());
     }
+
+    #[test]
+    fn test_page_encoding_statistics_roundtrip() {
+        let batch_schema = Schema::new(vec![Field::new(
+            "int32",
+            arrow_schema::DataType::Int32,
+            false,
+        )]);
+
+        let batch = RecordBatch::try_new(
+            Arc::new(batch_schema.clone()),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _],
+        )
+        .unwrap();
+
+        let mut file: File = tempfile::tempfile().unwrap();
+        let mut writer = ArrowWriter::try_new(&mut file, Arc::new(batch_schema), None).unwrap();
+        writer.write(&batch).unwrap();
+        let file_metadata = writer.close().unwrap();
+
+        assert_eq!(file_metadata.row_groups.len(), 1);
+        assert_eq!(file_metadata.row_groups[0].columns.len(), 1);
+        let chunk_meta = file_metadata.row_groups[0].columns[0]
+            .meta_data
+            .as_ref()
+            .expect("column metadata missing");
+        assert!(chunk_meta.encoding_stats.is_some());
+        let chunk_page_stats = chunk_meta.encoding_stats.as_ref().unwrap();
+
+        // check that the read metadata is also correct
+        let options = ReadOptionsBuilder::new().with_page_index().build();
+        let reader = SerializedFileReader::new_with_options(file, options).unwrap();
+
+        let rowgroup = reader.get_row_group(0).expect("row group missing");
+        assert_eq!(rowgroup.num_columns(), 1);
+        let column = rowgroup.metadata().column(0);
+        assert!(column.page_encoding_stats().is_some());
+        let file_page_stats = column.page_encoding_stats().unwrap();
+        let chunk_stats: Vec<PageEncodingStats> = chunk_page_stats
+            .iter()
+            .map(|x| crate::file::page_encoding_stats::try_from_thrift(x).unwrap())
+            .collect();
+        assert_eq!(&chunk_stats, file_page_stats);
+    }
 }
diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index e15a8d9b02..e33d6a05a7 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -481,7 +481,7 @@ mod test {
             .unwrap();
         assert_eq!(
             err.to_string(),
-            "EOF: Parquet file too small. Page index range 82..115 overlaps with file metadata 0..341"
+            "EOF: Parquet file too small. Page index range 82..115 overlaps with file metadata 0..357"
         );
     }
 
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 0298d8a51d..0589d09330 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -689,6 +689,9 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> {
         if let Some(statistics) = metadata.statistics() {
             builder = builder.set_statistics(statistics.clone())
         }
+        if let Some(page_encoding_stats) = metadata.page_encoding_stats() {
+            builder = builder.set_page_encoding_stats(page_encoding_stats.clone())
+        }
         builder = self.set_column_crypto_metadata(builder, &metadata);
         close.metadata = builder.build()?;
 

Reply via email to