This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new f73dbc30b Minor: improve `RowFilter` and `ArrowPredicate` docs (#6301)
f73dbc30b is described below
commit f73dbc30bb80a87d5b57c1cdb0232d15f0a5a75a
Author: Andrew Lamb <[email protected]>
AuthorDate: Sun Aug 25 07:06:37 2024 -0400
Minor: improve `RowFilter` and `ArrowPredicate` docs (#6301)
* Minor: improve `RowFilter` and `ArrowPredicate` docs
* tweak
---
parquet/src/arrow/arrow_reader/filter.rs | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs
index 4686e1512..2e22f7e01 100644
--- a/parquet/src/arrow/arrow_reader/filter.rs
+++ b/parquet/src/arrow/arrow_reader/filter.rs
@@ -20,6 +20,8 @@ use arrow_array::{BooleanArray, RecordBatch};
use arrow_schema::ArrowError;
/// A predicate operating on [`RecordBatch`]
+///
+/// See [`RowFilter`] for more information on the use of this trait.
pub trait ArrowPredicate: Send + 'static {
/// Returns the [`ProjectionMask`] that describes the columns required
/// to evaluate this predicate. All projected columns will be provided in the `batch`
@@ -29,7 +31,7 @@ pub trait ArrowPredicate: Send + 'static {
/// Evaluate this predicate for the given [`RecordBatch`] containing the columns
/// identified by [`Self::projection`]
///
- /// Must return a [`BooleanArray`] that has the same length as the input
+ /// Must return a [`BooleanArray`] that has the same length as the input
/// `batch` where each row indicates whether the row should be returned:
/// * `true`:the row should be returned
/// * `false` or `null`: the row should not be returned
@@ -68,12 +70,17 @@ where
}
}
-/// A [`RowFilter`] allows pushing down a filter predicate to skip IO and decode
+/// Filter applied *during* the parquet read process
+///
+/// [`RowFilter`] applies predicates in order, after decoding only the columns
+/// required. As predicates eliminate rows, fewer rows from subsequent columns
+/// may be required, thus potentially reducing IO and decode.
///
-/// This consists of a list of [`ArrowPredicate`] where only the rows that satisfy all
-/// of the predicates will be returned. Any [`RowSelection`] will be applied prior
+/// A `RowFilter` consists of a list of [`ArrowPredicate`]s. Only the rows for which
+/// all the predicates evaluate to `true` will be returned.
+/// Any [`RowSelection`] provided to the reader will be applied prior
/// to the first predicate, and each predicate in turn will then be used to compute
-/// a more refined [`RowSelection`] to use when evaluating the subsequent predicates.
+/// a more refined [`RowSelection`] used when evaluating the subsequent predicates.
///
/// Once all predicates have been evaluated, the final [`RowSelection`] is applied
/// to the top-level [`ProjectionMask`] to produce the final output [`RecordBatch`].