thinkharderdev commented on code in PR #2435:
URL: https://github.com/apache/arrow-rs/pull/2435#discussion_r945747604


##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -48,9 +50,127 @@ pub(crate) use filter::{ArrowPredicate, ArrowPredicateFn, 
RowFilter};
 #[allow(unused_imports)]
 pub(crate) use selection::{RowSelection, RowSelector};
 
+/// A generic builder for constructing sync or async arrow parquet readers. 
This is not intended
+/// to be used directly, instead you should use the specialization for the 
type of reader
+/// you wish to use
+///
+/// * For a synchronous API - [`ParquetRecordBatchReaderBuilder`]
+/// * For an asynchronous API - [`ParquetRecordBatchStreamBuilder`]
+///
+/// [`ParquetRecordBatchStreamBuilder`]: 
[crate::arrow::async_reader::ParquetRecordBatchStreamBuilder]
+pub struct ArrowReaderBuilder<T> {
+    pub(crate) input: T,
+
+    pub(crate) metadata: Arc<ParquetMetaData>,
+
+    pub(crate) schema: SchemaRef,
+
+    pub(crate) batch_size: usize,
+
+    pub(crate) row_groups: Option<Vec<usize>>,
+
+    pub(crate) projection: ProjectionMask,
+
+    pub(crate) filter: Option<RowFilter>,
+
+    pub(crate) selection: Option<RowSelection>,
+}
+
+impl<T> ArrowReaderBuilder<T> {
+    pub(crate) fn new_builder(
+        input: T,
+        metadata: Arc<ParquetMetaData>,
+        options: ArrowReaderOptions,
+    ) -> Result<Self> {
+        let kv_metadata = match options.skip_arrow_metadata {
+            true => None,
+            false => metadata.file_metadata().key_value_metadata(),
+        };
+
+        let schema = Arc::new(parquet_to_arrow_schema(
+            metadata.file_metadata().schema_descr(),
+            kv_metadata,
+        )?);
+
+        Ok(Self {
+            input,
+            metadata,
+            schema,
+            batch_size: 1024,
+            row_groups: None,
+            projection: ProjectionMask::all(),
+            filter: None,
+            selection: None,
+        })
+    }
+
+    /// Returns a reference to the [`ParquetMetaData`] for this parquet file
+    pub fn metadata(&self) -> &Arc<ParquetMetaData> {
+        &self.metadata
+    }
+
+    /// Returns the parquet [`SchemaDescriptor`] for this parquet file
+    pub fn parquet_schema(&self) -> &SchemaDescriptor {
+        self.metadata.file_metadata().schema_descr()
+    }
+
+    /// Returns the arrow [`SchemaRef`] for this parquet file
+    pub fn schema(&self) -> &SchemaRef {
+        &self.schema
+    }
+
+    /// Set the size of [`RecordBatch`] to produce
+    pub fn with_batch_size(self, batch_size: usize) -> Self {
+        Self { batch_size, ..self }
+    }
+
+    /// Only read data from the provided row group indexes
+    pub fn with_row_groups(self, row_groups: Vec<usize>) -> Self {
+        Self {
+            row_groups: Some(row_groups),
+            ..self
+        }
+    }
+
+    /// Only read data from the provided column indexes
+    pub fn with_projection(self, mask: ProjectionMask) -> Self {
+        Self {
+            projection: mask,
+            ..self
+        }
+    }
+
+    /// Provide a [`RowSelection`] to filter out rows, and avoid fetching their
+    /// data into memory
+    ///
+    /// Row group filtering is applied prior to this, and rows from skipped
+    /// row groups should not be included in the [`RowSelection`]
+    ///
+    /// TODO: Make public once stable (#1792)
+    #[allow(unused)]
+    pub(crate) fn with_row_selection(self, selection: RowSelection) -> Self {
+        Self {
+            selection: Some(selection),
+            ..self
+        }
+    }

Review Comment:
   Hmm, would it make sense to collapse `with_row_selection` and 
`with_row_filter` into a single method? The API is a bit confusing with both, 
and you could always just define a `RowSelection` as an `ArrowPredicate`. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to