This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new c025c48f28 [Parquet]: GH-563: Make `path_in_schema` optional (#9678)
c025c48f28 is described below

commit c025c48f284f1c1829cd1c469ac0232d0ec9c79f
Author: Ed Seidl <[email protected]>
AuthorDate: Thu May 7 10:17:59 2026 -0700

    [Parquet]: GH-563: Make `path_in_schema` optional (#9678)
    
    # Which issue does this PR close?
    
    none
    
    # Rationale for this change
    This is a proof of concept implementation for
    https://github.com/apache/parquet-format/issues/563
    
    # What changes are included in this PR?
    
    Since version 57.0.0, this crate has been tolerant of a missing
    `path_in_schema`. This PR adds options to cease writing the field as
    well. The option defaults to continuing to write the field.
    
    See related discussion on parquet mailing list:
    https://lists.apache.org/thread/czm2bk45wwtkhhpqxqvmx9dk5wkwk1kt
    
    # Are these changes tested?
    
    Yes
    
    # Are there any user-facing changes?
    
    No, this only adds an optional behavior change that defaults to no
    change
    
    # Related PRs
    - https://github.com/apache/parquet-format/issues/563
    - https://github.com/apache/parquet-format/pull/564
    - https://github.com/apache/parquet-java/pull/3470
---
 parquet/benches/metadata.rs             | 20 ++++++++++---
 parquet/src/arrow/arrow_writer/mod.rs   | 38 ++++++++++++++++++++++++
 parquet/src/bin/parquet-rewrite.rs      |  7 +++++
 parquet/src/file/metadata/thrift/mod.rs | 17 ++++++++---
 parquet/src/file/metadata/writer.rs     | 21 ++++++++++++-
 parquet/src/file/properties.rs          | 52 +++++++++++++++++++++++++++++++++
 parquet/src/file/writer.rs              |  2 ++
 parquet/src/parquet_thrift.rs           | 19 +++++++++++-
 8 files changed, 166 insertions(+), 10 deletions(-)

diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs
index c9a6cf3b76..750e33a2d3 100644
--- a/parquet/benches/metadata.rs
+++ b/parquet/benches/metadata.rs
@@ -41,7 +41,7 @@ use parquet::file::serialized_reader::ReadOptionsBuilder;
 const NUM_COLUMNS: usize = 10_000;
 const NUM_ROW_GROUPS: usize = 10;
 
-fn encoded_meta(is_nullable: bool, has_lists: bool) -> Vec<u8> {
+fn encoded_meta(is_nullable: bool, has_lists: bool, write_path_in_schema: 
bool) -> Vec<u8> {
     let mut rng = seedable_rng();
 
     let mut column_desc_ptrs: Vec<ColumnDescPtr> = 
Vec::with_capacity(NUM_COLUMNS);
@@ -143,7 +143,11 @@ fn encoded_meta(is_nullable: bool, has_lists: bool) -> 
Vec<u8> {
     let mut buffer = Vec::with_capacity(1024);
     {
         let buf = TrackedWrite::new(&mut buffer);
-        let writer = ParquetMetaDataWriter::new_with_tracked(buf, &metadata);
+        let mut writer = ParquetMetaDataWriter::new_with_tracked(buf, 
&metadata);
+        // use defaults unless `write_path_in_schema` is false
+        if !write_path_in_schema {
+            writer = writer.with_write_path_in_schema(write_path_in_schema);
+        }
         writer.finish().unwrap();
     }
 
@@ -233,7 +237,7 @@ fn criterion_benchmark(c: &mut Criterion) {
         })
     });
 
-    let buf: Bytes = black_box(encoded_meta(false, false)).into();
+    let buf: Bytes = black_box(encoded_meta(false, false, true)).into();
     let options = 
ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
     c.bench_function("decode parquet metadata (wide)", |b| {
         b.iter(|| {
@@ -275,7 +279,15 @@ fn criterion_benchmark(c: &mut Criterion) {
         })
     });
 
-    let buf: Bytes = black_box(encoded_meta(true, true)).into();
+    let buf: Bytes = black_box(encoded_meta(false, false, false)).into();
+    let options = 
ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
+    c.bench_function("decode parquet metadata no path_in_schema (wide)", |b| {
+        b.iter(|| {
+            ParquetMetaDataReader::decode_metadata_with_options(&buf, 
Some(&options)).unwrap();
+        })
+    });
+
+    let buf: Bytes = black_box(encoded_meta(true, true, true)).into();
     c.bench_function("decode parquet metadata w/ size stats (wide)", |b| {
         b.iter(|| {
             ParquetMetaDataReader::decode_metadata(&buf).unwrap();
diff --git a/parquet/src/arrow/arrow_writer/mod.rs 
b/parquet/src/arrow/arrow_writer/mod.rs
index b93e174401..d1f1a3cffe 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -4596,6 +4596,44 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_arrow_writer_skip_path_in_schema() {
+        let batch_schema = Schema::new(vec![Field::new("int32", 
DataType::Int32, false)]);
+        let file_schema = Arc::new(batch_schema.clone());
+
+        let batch = RecordBatch::try_new(
+            Arc::new(batch_schema),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _],
+        )
+        .unwrap();
+
+        // default options should still write path_in_schema
+        let skip_options = ArrowWriterOptions::new();
+
+        let mut buf = Vec::with_capacity(1024);
+        let mut writer =
+            ArrowWriter::try_new_with_options(&mut buf, file_schema.clone(), 
skip_options).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+
+        // override to not write path_in_schema
+        let skip_options = ArrowWriterOptions::new().with_properties(
+            WriterProperties::builder()
+                .set_write_path_in_schema(false)
+                .build(),
+        );
+
+        let mut buf2 = Vec::with_capacity(1024);
+        let mut writer =
+            ArrowWriter::try_new_with_options(&mut buf2, file_schema.clone(), 
skip_options)
+                .unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+
+        // buf2 should be a bit smaller due to lack of path_in_schema
+        assert!(buf.len() > buf2.len());
+    }
+
     #[test]
     fn mismatched_schemas() {
         let batch_schema = Schema::new(vec![Field::new("count", 
DataType::Int32, false)]);
diff --git a/parquet/src/bin/parquet-rewrite.rs 
b/parquet/src/bin/parquet-rewrite.rs
index 5bdd432be5..7637f26d77 100644
--- a/parquet/src/bin/parquet-rewrite.rs
+++ b/parquet/src/bin/parquet-rewrite.rs
@@ -279,6 +279,10 @@ struct Args {
     #[clap(long)]
     write_page_header_statistics: Option<bool>,
 
+    /// Write path_in_schema to the column metadata.
+    #[clap(long)]
+    write_path_in_schema: Option<bool>,
+
     /// Sets whether bloom filter is enabled for all columns.
     #[clap(long)]
     bloom_filter_enabled: Option<bool>,
@@ -406,6 +410,9 @@ fn main() {
     if let Some(value) = args.coerce_types {
         writer_properties_builder = 
writer_properties_builder.set_coerce_types(value);
     }
+    if let Some(value) = args.write_path_in_schema {
+        writer_properties_builder = 
writer_properties_builder.set_write_path_in_schema(value);
+    }
     if let Some(value) = args.write_batch_size {
         writer_properties_builder = 
writer_properties_builder.set_write_batch_size(value);
     }
diff --git a/parquet/src/file/metadata/thrift/mod.rs 
b/parquet/src/file/metadata/thrift/mod.rs
index ad8f6312c2..8c1147ff2c 100644
--- a/parquet/src/file/metadata/thrift/mod.rs
+++ b/parquet/src/file/metadata/thrift/mod.rs
@@ -1333,10 +1333,15 @@ pub(super) fn serialize_column_meta_data<W: Write>(
         .encodings()
         .collect::<Vec<_>>()
         .write_thrift_field(w, 2, 1)?;
-    let path = column_chunk.column_descr.path().parts();
-    let path: Vec<&str> = path.iter().map(|v| v.as_str()).collect();
-    path.write_thrift_field(w, 3, 2)?;
-    column_chunk.compression.write_thrift_field(w, 4, 3)?;
+    if w.write_path_in_schema() {
+        let path = column_chunk.column_descr.path().parts();
+        let path: Vec<&str> = path.iter().map(|v| v.as_str()).collect();
+        path.write_thrift_field(w, 3, 2)?;
+        column_chunk.compression.write_thrift_field(w, 4, 3)?;
+    } else {
+        column_chunk.compression.write_thrift_field(w, 4, 2)?;
+    }
+
     column_chunk.num_values.write_thrift_field(w, 5, 4)?;
     column_chunk
         .total_uncompressed_size
@@ -1406,6 +1411,8 @@ pub(super) fn serialize_column_meta_data<W: Write>(
 pub(super) struct FileMeta<'a> {
     pub(super) file_metadata: &'a crate::file::metadata::FileMetaData,
     pub(super) row_groups: &'a Vec<RowGroupMetaData>,
+    // If true, then write the `path_in_schema` field in the ColumnMetaData 
struct.
+    pub(super) write_path_in_schema: bool,
 }
 
 // struct FileMetaData {
@@ -1425,6 +1432,8 @@ impl<'a> WriteThrift for FileMeta<'a> {
     // needed for last_field_id w/o encryption
     #[allow(unused_assignments)]
     fn write_thrift<W: Write>(&self, writer: &mut 
ThriftCompactOutputProtocol<W>) -> Result<()> {
+        writer.set_write_path_in_schema(self.write_path_in_schema);
+
         self.file_metadata
             .version
             .write_thrift_field(writer, 1, 0)?;
diff --git a/parquet/src/file/metadata/writer.rs 
b/parquet/src/file/metadata/writer.rs
index 275b4ff28e..cd5d617f93 100644
--- a/parquet/src/file/metadata/writer.rs
+++ b/parquet/src/file/metadata/writer.rs
@@ -62,6 +62,7 @@ pub(crate) struct ThriftMetadataWriter<'a, W: Write> {
     created_by: Option<String>,
     object_writer: MetadataObjectWriter,
     writer_version: i32,
+    write_path_in_schema: bool,
 }
 
 impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
@@ -259,6 +260,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
         let file_meta = FileMeta {
             file_metadata: &file_metadata,
             row_groups: &row_groups,
+            write_path_in_schema: self.write_path_in_schema,
         };
 
         // Write file metadata
@@ -293,6 +295,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
         row_groups: Vec<RowGroupMetaData>,
         created_by: Option<String>,
         writer_version: i32,
+        write_path_in_schema: bool,
     ) -> Self {
         Self {
             buf,
@@ -304,6 +307,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
             created_by,
             object_writer: Default::default(),
             writer_version,
+            write_path_in_schema,
         }
     }
 
@@ -415,6 +419,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
 pub struct ParquetMetaDataWriter<'a, W: Write> {
     buf: TrackedWrite<W>,
     metadata: &'a ParquetMetaData,
+    write_path_in_schema: bool,
 }
 
 impl<'a, W: Write> ParquetMetaDataWriter<'a, W> {
@@ -436,7 +441,20 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> {
     ///
     /// See example on the struct level documentation
     pub fn new_with_tracked(buf: TrackedWrite<W>, metadata: &'a 
ParquetMetaData) -> Self {
-        Self { buf, metadata }
+        Self {
+            buf,
+            metadata,
+            write_path_in_schema: true,
+        }
+    }
+
+    /// Set whether or not to write the `path_in_schema` field in the Thrift 
`ColumnMetaData`
+    /// struct.
+    pub fn with_write_path_in_schema(self, val: bool) -> Self {
+        Self {
+            write_path_in_schema: val,
+            ..self
+        }
     }
 
     /// Write the metadata to the buffer
@@ -460,6 +478,7 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> {
             row_groups,
             created_by,
             file_metadata.version(),
+            self.write_path_in_schema,
         );
 
         if let Some(column_indexes) = column_indexes {
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 197c5d5c72..1ab8b5c710 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -69,6 +69,8 @@ pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
 pub const DEFAULT_COERCE_TYPES: bool = false;
 /// Default value for 
[`WriterProperties::data_page_v2_compression_ratio_threshold`]
 pub const DEFAULT_DATA_PAGE_V2_COMPRESSION_RATIO_THRESHOLD: f64 = 1.0;
+/// Default value for [`WriterProperties::write_path_in_schema`]
+pub const DEFAULT_WRITE_PATH_IN_SCHEMA: bool = true;
 /// Default minimum chunk size for content-defined chunking: 256 KiB.
 pub const DEFAULT_CDC_MIN_CHUNK_SIZE: usize = 256 * 1024;
 /// Default maximum chunk size for content-defined chunking: 1024 KiB.
@@ -252,6 +254,7 @@ pub struct WriterProperties {
     statistics_truncate_length: Option<usize>,
     coerce_types: bool,
     content_defined_chunking: Option<CdcOptions>,
+    write_path_in_schema: bool,
     #[cfg(feature = "encryption")]
     pub(crate) file_encryption_properties: 
Option<Arc<FileEncryptionProperties>>,
 }
@@ -437,6 +440,14 @@ impl WriterProperties {
         self.coerce_types
     }
 
+    /// Returns `true` if the `path_in_schema` field of the `ColumnMetaData` 
Thrift struct
+    /// should be written.
+    ///
+    /// For more details see 
[`WriterPropertiesBuilder::set_write_path_in_schema`]
+    pub fn write_path_in_schema(&self) -> bool {
+        self.write_path_in_schema
+    }
+
     /// EXPERIMENTAL: Returns content-defined chunking options, or `None` if 
CDC is disabled.
     ///
     /// For more details see 
[`WriterPropertiesBuilder::set_content_defined_chunking`]
@@ -592,6 +603,7 @@ pub struct WriterPropertiesBuilder {
     statistics_truncate_length: Option<usize>,
     coerce_types: bool,
     content_defined_chunking: Option<CdcOptions>,
+    write_path_in_schema: bool,
     #[cfg(feature = "encryption")]
     file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
 }
@@ -616,6 +628,7 @@ impl Default for WriterPropertiesBuilder {
             statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
             coerce_types: DEFAULT_COERCE_TYPES,
             content_defined_chunking: None,
+            write_path_in_schema: DEFAULT_WRITE_PATH_IN_SCHEMA,
             #[cfg(feature = "encryption")]
             file_encryption_properties: None,
         }
@@ -670,6 +683,7 @@ impl WriterPropertiesBuilder {
             statistics_truncate_length: self.statistics_truncate_length,
             coerce_types: self.coerce_types,
             content_defined_chunking: self.content_defined_chunking,
+            write_path_in_schema: self.write_path_in_schema,
             #[cfg(feature = "encryption")]
             file_encryption_properties: self.file_encryption_properties,
         }
@@ -885,6 +899,43 @@ impl WriterPropertiesBuilder {
         self
     }
 
+    /// EXPERIMENTAL: Should the writer emit the `path_in_schema` element of 
the
+    /// `ColumnMetaData` Thrift struct. Defaults to `true` via 
[`DEFAULT_WRITE_PATH_IN_SCHEMA`].
+    ///
+    /// Because `path_in_schema` is a field on the `ColumnMetaData`, it is 
repeated
+    /// `num_columns * num_rowgroups` times. Compounding this is any level of 
nesting or
+    /// repetition in the schema. For instance, a top-level list column named 
`foo` will have
+    /// a `path_in_schema` of `["foo", "list", "element"]`. A list-of-struct 
is even worse,
+    /// because the necessary list wrapping is repeated for each element of 
the struct. A
+    /// file with a deeply nested schema and many row groups can have a large 
percentage of the
+    /// footer taken up by this field. For example, a file of 38 row groups 
with a schema containing
+    /// several lists of structs containing lists had 36% of the footer taken 
up by `path_in_schema`.
+    /// Removing this redundant information can greatly speed up footer 
parsing, which is particularly
+    /// important in scenarios where one does not wish to read the entire file 
(e.g. point
+    /// lookups).
+    ///
+    /// <div class="warning">
+    ///
+    /// **WARNING:**
+    /// Setting this to `false` will break compatibility with Parquet readers 
that
+    /// still expect this field to be present. Virtually all Parquet readers 
(parquet-java,
+    /// Spark, arrow-cpp, pyarrow, pandas to name a few), with the exception
+    /// of the one in this crate, expect this field to be present, and will 
terminate execution
+    /// if it is not. This will continue to be the case unless/until the 
Parquet format
+    /// specification is explicitly changed to allow this field to be missing. 
As a consquence,
+    /// users should only set this to `false` if they have verified that any 
reader(s) they plan
+    /// to use can tolerate the absence of this field.
+    ///
+    /// For more context, see [GH-563].
+    ///
+    /// </div>
+    ///
+    /// [GH-563]: https://github.com/apache/parquet-format/issues/563
+    pub fn set_write_path_in_schema(mut self, write_path_in_schema: bool) -> 
Self {
+        self.write_path_in_schema = write_path_in_schema;
+        self
+    }
+
     /// EXPERIMENTAL: Sets content-defined chunking options, or disables CDC 
with `None`.
     ///
     /// When enabled, data page boundaries are determined by a rolling hash of 
the
@@ -1253,6 +1304,7 @@ impl From<WriterProperties> for WriterPropertiesBuilder {
             statistics_truncate_length: props.statistics_truncate_length,
             coerce_types: props.coerce_types,
             content_defined_chunking: props.content_defined_chunking,
+            write_path_in_schema: props.write_path_in_schema,
             #[cfg(feature = "encryption")]
             file_encryption_properties: props.file_encryption_properties,
         }
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 7d69904451..beb83b71ce 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -345,12 +345,14 @@ impl<W: Write + Send> SerializedFileWriter<W> {
         let column_indexes = std::mem::take(&mut self.column_indexes);
         let offset_indexes = std::mem::take(&mut self.offset_indexes);
 
+        let write_path_in_schema = self.props.write_path_in_schema();
         let mut encoder = ThriftMetadataWriter::new(
             &mut self.buf,
             &self.descr,
             row_groups,
             Some(self.props.created_by().to_string()),
             self.props.writer_version().as_num(),
+            write_path_in_schema,
         );
 
         #[cfg(feature = "encryption")]
diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index a13baa0924..e621f4c498 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -726,12 +726,29 @@ where
 /// [compact output]: 
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md
 pub(crate) struct ThriftCompactOutputProtocol<W: Write> {
     writer: W,
+    write_path_in_schema: bool,
 }
 
 impl<W: Write> ThriftCompactOutputProtocol<W> {
     /// Create a new `ThriftCompactOutputProtocol` wrapping the byte sink 
`writer`.
     pub(crate) fn new(writer: W) -> Self {
-        Self { writer }
+        Self {
+            writer,
+            write_path_in_schema: true,
+        }
+    }
+
+    // TODO(ets): at some point there should probably be a properties object
+    // to control aspects of thrift output. But since this is the only option 
to date
+    // I'm choosing a simpler API.
+    /// Control the writing of the `path_in_schema` element of the 
`ColumnMetaData`
+    pub(crate) fn set_write_path_in_schema(&mut self, val: bool) {
+        self.write_path_in_schema = val;
+    }
+
+    /// Indicate whether or not to emit `path_in_schema`.
+    pub(crate) fn write_path_in_schema(&self) -> bool {
+        self.write_path_in_schema
     }
 
     /// Write a single byte to the output stream.

Reply via email to