tustvold commented on code in PR #2435:
URL: https://github.com/apache/arrow-rs/pull/2435#discussion_r945751276


##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -48,9 +50,127 @@ pub(crate) use filter::{ArrowPredicate, ArrowPredicateFn, 
RowFilter};
 #[allow(unused_imports)]
 pub(crate) use selection::{RowSelection, RowSelector};
 
+/// A generic builder for constructing sync or async arrow parquet readers. 
This is not intended
+/// to be used directly, instead you should use the specialization for the 
type of reader
+/// you wish to use
+///
+/// * For a synchronous API - [`ParquetRecordBatchReaderBuilder`]
+/// * For an asynchronous API - [`ParquetRecordBatchStreamBuilder`]
+///
+/// [`ParquetRecordBatchStreamBuilder`]: 
[crate::arrow::async_reader::ParquetRecordBatchStreamBuilder]
+pub struct ArrowReaderBuilder<T> {
+    pub(crate) input: T,
+
+    pub(crate) metadata: Arc<ParquetMetaData>,
+
+    pub(crate) schema: SchemaRef,
+
+    pub(crate) batch_size: usize,
+
+    pub(crate) row_groups: Option<Vec<usize>>,
+
+    pub(crate) projection: ProjectionMask,
+
+    pub(crate) filter: Option<RowFilter>,
+
+    pub(crate) selection: Option<RowSelection>,
+}
+
+impl<T> ArrowReaderBuilder<T> {
+    pub(crate) fn new_builder(
+        input: T,
+        metadata: Arc<ParquetMetaData>,
+        options: ArrowReaderOptions,
+    ) -> Result<Self> {
+        let kv_metadata = match options.skip_arrow_metadata {
+            true => None,
+            false => metadata.file_metadata().key_value_metadata(),
+        };
+
+        let schema = Arc::new(parquet_to_arrow_schema(
+            metadata.file_metadata().schema_descr(),
+            kv_metadata,
+        )?);
+
+        Ok(Self {
+            input,
+            metadata,
+            schema,
+            batch_size: 1024,
+            row_groups: None,
+            projection: ProjectionMask::all(),
+            filter: None,
+            selection: None,
+        })
+    }
+
+    /// Returns a reference to the [`ParquetMetaData`] for this parquet file
+    pub fn metadata(&self) -> &Arc<ParquetMetaData> {
+        &self.metadata
+    }
+
+    /// Returns the parquet [`SchemaDescriptor`] for this parquet file
+    pub fn parquet_schema(&self) -> &SchemaDescriptor {
+        self.metadata.file_metadata().schema_descr()
+    }
+
+    /// Returns the arrow [`SchemaRef`] for this parquet file
+    pub fn schema(&self) -> &SchemaRef {
+        &self.schema
+    }
+
+    /// Set the size of [`RecordBatch`] to produce
+    pub fn with_batch_size(self, batch_size: usize) -> Self {
+        Self { batch_size, ..self }
+    }
+
+    /// Only read data from the provided row group indexes
+    pub fn with_row_groups(self, row_groups: Vec<usize>) -> Self {
+        Self {
+            row_groups: Some(row_groups),
+            ..self
+        }
+    }
+
+    /// Only read data from the provided column indexes
+    pub fn with_projection(self, mask: ProjectionMask) -> Self {
+        Self {
+            projection: mask,
+            ..self
+        }
+    }
+
+    /// Provide a [`RowSelection] to filter out rows, and avoid fetching their
+    /// data into memory
+    ///
+    /// Row group filtering is applied prior to this, and rows from skipped
+    /// row groups should not be included in the [`RowSelection`]
+    ///
+    /// TODO: Make public once stable (#1792)
+    #[allow(unused)]
+    pub(crate) fn with_row_selection(self, selection: RowSelection) -> Self {
+        Self {
+            selection: Some(selection),
+            ..self
+        }
+    }

Review Comment:
   They are related but different, in particular `with_row_selection` exists to 
allow you to specify a row selection before reading any data, e.g. based on 
information in the PageIndex



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to