alamb commented on code in PR #8763:
URL: https://github.com/apache/arrow-rs/pull/8763#discussion_r2500690572


##########
parquet/src/file/metadata/options.rs:
##########
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Options used to control metadata parsing
+
+use crate::schema::types::SchemaDescPtr;
+
+/// Options that can be set to control what parts of the Parquet file footer
+/// metadata will be decoded and made present in the [`ParquetMetaData`] 
returned
+/// by [`ParquetMetaDataReader`] and [`ParquetMetaDataPushDecoder`].
+///
+/// [`ParquetMetaData`]: crate::file::metadata::ParquetMetaData
+/// [`ParquetMetaDataReader`]: crate::file::metadata::ParquetMetaDataReader
+/// [`ParquetMetaDataPushDecoder`]: 
crate::file::metadata::ParquetMetaDataPushDecoder
+#[derive(Default, Debug, Clone)]
+pub struct ParquetMetaDataOptions {
+    schema_descr: Option<SchemaDescPtr>,
+}
+
+impl ParquetMetaDataOptions {
+    /// Return a new default [`ParquetMetaDataOptions`].
+    pub fn new() -> Self {
+        Default::default()
+    }
+
+    /// Returns an optional [`SchemaDescPtr`] to use when decoding. If this is 
not `None` then
+    /// the schema in the footer will be skipped.
+    pub fn schema(&self) -> Option<&SchemaDescPtr> {
+        self.schema_descr.as_ref()
+    }
+
+    /// Provide a schema to use when decoding the metadata.
+    pub fn set_schema(&mut self, val: SchemaDescPtr) {
+        self.schema_descr = Some(val);
+    }
+
+    /// Provide a schema to use when decoding the metadata. Returns `Self` for 
chaining.
+    pub fn with_schema(mut self, val: SchemaDescPtr) -> Self {
+        self.schema_descr = Some(val);
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use bytes::Bytes;
+
+    use crate::{
+        DecodeResult,
+        file::metadata::{ParquetMetaDataOptions, ParquetMetaDataPushDecoder},
+        util::test_common::file_util::get_test_file,
+    };
+    use std::{io::Read, sync::Arc};
+
+    #[test]
+    fn test_provide_schema() {
+        let mut buf: Vec<u8> = Vec::new();
+        get_test_file("alltypes_plain.parquet")
+            .read_to_end(&mut buf)
+            .unwrap();
+
+        let footer = Bytes::from(buf);

Review Comment:
   Technically speaking, this is the entire file, not just the footer, right? 
It might be clearer if the variable were named something different.
   
   ```suggestion
           let data = Bytes::from(buf);
   ```



##########
parquet/src/file/metadata/reader.rs:
##########
@@ -795,7 +810,24 @@ impl ParquetMetaDataReader {
     ///
     /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata
     pub fn decode_metadata(buf: &[u8]) -> Result<ParquetMetaData> {
-        decode_metadata(buf)
+        decode_metadata(buf, None)

Review Comment:
   I wonder if we should start directing people to the push metadata decoder 
(the metadata reader is getting pretty complicated...)



##########
parquet/src/file/serialized_reader.rs:
##########
@@ -2697,6 +2708,21 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_reuse_schema() {
+        let file = get_test_file("alltypes_plain.parquet");
+        let file_reader = 
SerializedFileReader::new(file.try_clone().unwrap()).unwrap();
+        let schema = file_reader.metadata().file_metadata().schema_descr_ptr();
+        let expected = file_reader.metadata;
+
+        let options = ReadOptionsBuilder::new()
+            .with_parquet_schema(schema)
+            .build();
+        let file_reader = SerializedFileReader::new_with_options(file, 
options).unwrap();
+
+        assert_eq!(expected.as_ref(), file_reader.metadata.as_ref());

Review Comment:
   I think this should also verify that the schema pointers are the same
   
   ```diff
   diff --git a/parquet/src/file/serialized_reader.rs 
b/parquet/src/file/serialized_reader.rs
   index ad34b15ab5..941267b452 100644
   --- a/parquet/src/file/serialized_reader.rs
   +++ b/parquet/src/file/serialized_reader.rs
   @@ -2716,11 +2716,16 @@ mod tests {
            let expected = file_reader.metadata;
   
            let options = ReadOptionsBuilder::new()
   -            .with_parquet_schema(schema)
   +            .with_parquet_schema(Arc::clone(&schema))
                .build();
            let file_reader = SerializedFileReader::new_with_options(file, 
options).unwrap();
   
            assert_eq!(expected.as_ref(), file_reader.metadata.as_ref());
   +        // Should have used the same schema instance
   +        assert!(Arc::ptr_eq(
   +            &schema,
   +            &file_reader.metadata.file_metadata().schema_descr_ptr()
   +        ));
        }
   ```



##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -387,6 +389,8 @@ pub struct ArrowReaderOptions {
     supplied_schema: Option<SchemaRef>,
     /// Policy for reading offset and column indexes.
     pub(crate) page_index_policy: PageIndexPolicy,
+    /// Options to control reading of Parquet metadata

Review Comment:
   I reviewed the `ArrowReaderOptions` and `ArrowReaderMetadata` structures and 
their use, and I agree this is the appropriate structure to add metadata 
parsing to.
   
   Do you think it eventually makes sense to move the other fields from 
`ArrowReaderOptions` to `ParquetMetaDataOptions`? (e.g. `supplied_schema`)



##########
parquet/src/file/metadata/options.rs:
##########
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Options used to control metadata parsing
+
+use crate::schema::types::SchemaDescPtr;
+
+/// Options that can be set to control what parts of the Parquet file footer
+/// metadata will be decoded and made present in the [`ParquetMetaData`] 
returned
+/// by [`ParquetMetaDataReader`] and [`ParquetMetaDataPushDecoder`].
+///
+/// [`ParquetMetaData`]: crate::file::metadata::ParquetMetaData
+/// [`ParquetMetaDataReader`]: crate::file::metadata::ParquetMetaDataReader
+/// [`ParquetMetaDataPushDecoder`]: 
crate::file::metadata::ParquetMetaDataPushDecoder
+#[derive(Default, Debug, Clone)]
+pub struct ParquetMetaDataOptions {
+    schema_descr: Option<SchemaDescPtr>,
+}
+
+impl ParquetMetaDataOptions {
+    /// Return a new default [`ParquetMetaDataOptions`].
+    pub fn new() -> Self {
+        Default::default()
+    }
+
+    /// Returns an optional [`SchemaDescPtr`] to use when decoding. If this is 
not `None` then
+    /// the schema in the footer will be skipped.
+    pub fn schema(&self) -> Option<&SchemaDescPtr> {
+        self.schema_descr.as_ref()
+    }
+
+    /// Provide a schema to use when decoding the metadata.
+    pub fn set_schema(&mut self, val: SchemaDescPtr) {
+        self.schema_descr = Some(val);
+    }
+
+    /// Provide a schema to use when decoding the metadata. Returns `Self` for 
chaining.
+    pub fn with_schema(mut self, val: SchemaDescPtr) -> Self {
+        self.schema_descr = Some(val);
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use bytes::Bytes;
+
+    use crate::{
+        DecodeResult,
+        file::metadata::{ParquetMetaDataOptions, ParquetMetaDataPushDecoder},
+        util::test_common::file_util::get_test_file,
+    };
+    use std::{io::Read, sync::Arc};
+
+    #[test]
+    fn test_provide_schema() {
+        let mut buf: Vec<u8> = Vec::new();
+        get_test_file("alltypes_plain.parquet")
+            .read_to_end(&mut buf)
+            .unwrap();
+
+        let footer = Bytes::from(buf);
+        let mut decoder = ParquetMetaDataPushDecoder::try_new(footer.len() as 
u64).unwrap();
+        decoder
+            .push_range(0..footer.len() as u64, footer.clone())
+            .unwrap();
+
+        let expected = match decoder.try_decode().unwrap() {
+            DecodeResult::Data(m) => m,
+            _ => panic!("could not parse metadata"),
+        };
+
+        let mut options = ParquetMetaDataOptions::new();
+        options.set_schema(expected.file_metadata().schema_descr_ptr());
+        let options = Arc::new(options);
+
+        let mut decoder = ParquetMetaDataPushDecoder::try_new(footer.len() as 
u64)
+            .unwrap()
+            .with_metadata_options(Some(options));
+        decoder.push_range(0..footer.len() as u64, footer).unwrap();
+        let metadata = match decoder.try_decode().unwrap() {
+            DecodeResult::Data(m) => m,
+            _ => panic!("could not parse metadata"),
+        };
+
+        assert_eq!(expected, metadata);

Review Comment:
   I think another important thing to test here is that the schemas are actually 
the same instance (not just equal). I think you can do this with something like
   
   ```diff
   diff --git a/parquet/src/file/metadata/options.rs 
b/parquet/src/file/metadata/options.rs
   index 6f13134df5..6fa6cc1f06 100644
   --- a/parquet/src/file/metadata/options.rs
   +++ b/parquet/src/file/metadata/options.rs
   @@ -84,8 +84,10 @@ mod tests {
                _ => panic!("could not parse metadata"),
            };
   
   +        let expected_schema = expected.file_metadata().schema_descr_ptr();
   +
            let mut options = ParquetMetaDataOptions::new();
   -        options.set_schema(expected.file_metadata().schema_descr_ptr());
   +        options.set_schema(expected_schema);
            let options = Arc::new(options);
   
            let mut decoder = ParquetMetaDataPushDecoder::try_new(footer.len() 
as u64)
   @@ -98,5 +100,10 @@ mod tests {
            };
   
            assert_eq!(expected, metadata);
   +        // the schema pointers should be the same
   +        assert!(Arc::ptr_eq(
   +            &expected.file_metadata().schema_descr_ptr(),
   +            &metadata.file_metadata().schema_descr_ptr()
   +        ));
        }
    }
   ```



##########
parquet/src/file/metadata/thrift/mod.rs:
##########
@@ -711,10 +744,15 @@ pub(crate) fn parquet_metadata_from_bytes(buf: &[u8]) -> 
Result<ParquetMetaData>
                 version = Some(i32::read_thrift(&mut prot)?);
             }
             2 => {
-                // read schema and convert to SchemaDescriptor for use when 
reading row groups
-                let val = read_thrift_vec::<SchemaElement, 
ThriftSliceInputProtocol>(&mut prot)?;
-                let val = parquet_schema_from_array(val)?;
-                schema_descr = Some(Arc::new(SchemaDescriptor::new(val)));
+                if schema_descr.is_some() {

Review Comment:
   ```suggestion
                   // If schema was passed in, skip parsing it
                   if schema_descr.is_some() {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to