tustvold commented on code in PR #4676:
URL: https://github.com/apache/arrow-rs/pull/4676#discussion_r1290368232


##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -234,48 +221,187 @@ impl ArrowReaderOptions {
     }
 }
 
+/// The clone-able metadata necessary to construct a [`ArrowReaderBuilder`]
+///
+/// This allows loading the metadata for a file once and then using this to 
construct
+/// multiple separate readers, for example, to distribute readers across 
multiple threads
+#[derive(Debug, Clone)]
+pub struct ArrowReaderMetadata {
+    pub(crate) metadata: Arc<ParquetMetaData>,
+
+    pub(crate) schema: SchemaRef,
+
+    pub(crate) fields: Option<ParquetField>,
+}
+
+impl ArrowReaderMetadata {
+    pub(crate) fn try_new(
+        metadata: Arc<ParquetMetaData>,
+        options: ArrowReaderOptions,
+    ) -> Result<Self> {
+        let kv_metadata = match options.skip_arrow_metadata {
+            true => None,
+            false => metadata.file_metadata().key_value_metadata(),
+        };
+
+        let (schema, fields) = parquet_to_arrow_schema_and_fields(
+            metadata.file_metadata().schema_descr(),
+            ProjectionMask::all(),
+            kv_metadata,
+        )?;
+
+        Ok(Self {
+            metadata,
+            schema: Arc::new(schema),
+            fields,
+        })
+    }
+
+    /// Returns a reference to the [`ParquetMetaData`] for this parquet file
+    pub fn metadata(&self) -> &Arc<ParquetMetaData> {
+        &self.metadata
+    }
+
+    /// Returns the parquet [`SchemaDescriptor`] for this parquet file
+    pub fn parquet_schema(&self) -> &SchemaDescriptor {
+        self.metadata.file_metadata().schema_descr()
+    }
+
+    /// Returns the arrow [`SchemaRef`] for this parquet file
+    pub fn schema(&self) -> &SchemaRef {
+        &self.schema
+    }
+}
+
 #[doc(hidden)]
 /// A newtype used within [`ReaderOptionsBuilder`] to distinguish sync readers 
from async
-pub struct SyncReader<T: ChunkReader>(SerializedFileReader<T>);
+pub struct SyncReader<T: ChunkReader>(T);
 
 /// A synchronous builder used to construct [`ParquetRecordBatchReader`] for a 
file
 ///
 /// For an async API see 
[`crate::arrow::async_reader::ParquetRecordBatchStreamBuilder`]
 pub type ParquetRecordBatchReaderBuilder<T> = 
ArrowReaderBuilder<SyncReader<T>>;
 
-impl<T: ChunkReader + 'static> ArrowReaderBuilder<SyncReader<T>> {
+impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
     /// Create a new [`ParquetRecordBatchReaderBuilder`]
+    ///
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use bytes::Bytes;
+    /// # use arrow_array::{Int32Array, RecordBatch};
+    /// # use arrow_schema::{DataType, Field, Schema};
+    /// # use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, 
ParquetRecordBatchReaderBuilder};
+    /// # use parquet::arrow::ArrowWriter;
+    /// # let mut file: Vec<u8> = Vec::with_capacity(1024);
+    /// # let schema = Arc::new(Schema::new(vec![Field::new("i32", 
DataType::Int32, false)]));
+    /// # let mut writer = ArrowWriter::try_new(&mut file, schema.clone(), 
None).unwrap();
+    /// # let batch = RecordBatch::try_new(schema, 
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))]).unwrap();
+    /// # writer.write(&batch).unwrap();
+    /// # writer.close().unwrap();
+    /// # let file = Bytes::from(file);
+    /// #
+    /// let mut builder = 
ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+    ///
+    /// // Inspect metadata
+    /// assert_eq!(builder.metadata().num_row_groups(), 1);
+    ///
+    /// // Construct reader
+    /// let mut reader: ParquetRecordBatchReader = 
builder.with_row_groups(vec![0]).build().unwrap();
+    ///
+    /// // Read data
+    /// let _batch = reader.next().unwrap().unwrap();
+    /// ```
     pub fn try_new(reader: T) -> Result<Self> {
         Self::try_new_with_options(reader, Default::default())
     }
 
     /// Create a new [`ParquetRecordBatchReaderBuilder`] with 
[`ArrowReaderOptions`]
     pub fn try_new_with_options(reader: T, options: ArrowReaderOptions) -> 
Result<Self> {
-        let reader = match options.page_index {
-            true => {
-                let read_options = 
ReadOptionsBuilder::new().with_page_index().build();
-                SerializedFileReader::new_with_options(reader, read_options)?
-            }
-            false => SerializedFileReader::new(reader)?,
-        };
+        let metadata = Self::load_metadata(&reader, options)?;
+        Ok(Self::new_with_metadata(reader, metadata))
+    }
+
+    /// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`]
+    ///
+    /// See [`Self::new_with_metadata`] for how this can be used
+    pub fn load_metadata(

Review Comment:
   > to me if it was on ParquetRecordBatchReaderBuilder
   
   Do you mean ArrowReaderMetadata?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to