This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 51bcadbcd13 Better document parquet pushdown (#5491)
51bcadbcd13 is described below
commit 51bcadbcd13f0775d40f153263cb02a3a5b57056
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Mon Mar 11 15:39:50 2024 +1300
Better document parquet pushdown (#5491)
---
parquet/src/arrow/arrow_reader/filter.rs | 4 ++++
parquet/src/arrow/arrow_reader/mod.rs | 12 ++++++++++++
2 files changed, 16 insertions(+)
diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs
index a80255f413e..4686e151272 100644
--- a/parquet/src/arrow/arrow_reader/filter.rs
+++ b/parquet/src/arrow/arrow_reader/filter.rs
@@ -96,6 +96,10 @@ where
/// leaves 99% of the rows, it may be better to not filter the data from parquet and
/// apply the filter after the RecordBatch has been fully decoded.
///
+/// Additionally, even if a predicate eliminates a moderate number of rows, it may still be faster
+/// to filter the data after the RecordBatch has been fully decoded, if the eliminated rows are
+/// not contiguous.
+///
/// [`RowSelection`]: crate::arrow::arrow_reader::RowSelection
pub struct RowFilter {
/// A list of [`ArrowPredicate`]
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index a34ce77f277..83d6f6f553f 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -141,6 +141,9 @@ impl<T> ArrowReaderBuilder<T> {
/// An example use case of this would be applying a selection determined by
/// evaluating predicates against the [`Index`]
///
+ /// It is recommended to enable reading the page index if using this functionality, to allow
+ /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`]
+ ///
/// [`Index`]: crate::file::page_index::index::Index
pub fn with_row_selection(self, selection: RowSelection) -> Self {
Self {
@@ -152,6 +155,9 @@ impl<T> ArrowReaderBuilder<T> {
/// Provide a [`RowFilter`] to skip decoding rows
///
/// Row filters are applied after row group selection and row selection
+ ///
+ /// It is recommended to enable reading the page index if using this functionality, to allow
+ /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`].
pub fn with_row_filter(self, filter: RowFilter) -> Self {
Self {
filter: Some(filter),
@@ -163,6 +169,9 @@ impl<T> ArrowReaderBuilder<T> {
///
/// The limit will be applied after any [`Self::with_row_selection`] and [`Self::with_row_filter`]
/// allowing it to limit the final set of rows decoded after any pushed down predicates
+ ///
+ /// It is recommended to enable reading the page index if using this functionality, to allow
+ /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`]
pub fn with_limit(self, limit: usize) -> Self {
Self {
limit: Some(limit),
@@ -174,6 +183,9 @@ impl<T> ArrowReaderBuilder<T> {
///
/// The offset will be applied after any [`Self::with_row_selection`] and [`Self::with_row_filter`]
/// allowing it to skip rows after any pushed down predicates
+ ///
+ /// It is recommended to enable reading the page index if using this functionality, to allow
+ /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`]
pub fn with_offset(self, offset: usize) -> Self {
Self {
offset: Some(offset),