(arrow-rs) branch master updated: Better document parquet pushdown (#5491)

tustvold Sun, 10 Mar 2024 19:40:00 -0700

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/master by this push:
     new 51bcadbcd13 Better document parquet pushdown (#5491)
51bcadbcd13 is described below

commit 51bcadbcd13f0775d40f153263cb02a3a5b57056
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Mon Mar 11 15:39:50 2024 +1300

    Better document parquet pushdown (#5491)
---
 parquet/src/arrow/arrow_reader/filter.rs |  4 ++++
 parquet/src/arrow/arrow_reader/mod.rs    | 12 ++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs
index a80255f413e..4686e151272 100644
--- a/parquet/src/arrow/arrow_reader/filter.rs
+++ b/parquet/src/arrow/arrow_reader/filter.rs
@@ -96,6 +96,10 @@ where
 /// leaves 99% of the rows, it may be better to not filter the data from parquet and
 /// apply the filter after the RecordBatch has been fully decoded.
 ///
+/// Additionally, even if a predicate eliminates a moderate number of rows, it may still be faster
+/// to filter the data after the RecordBatch has been fully decoded, if the eliminated rows are
+/// not contiguous.
+///
 /// [`RowSelection`]: crate::arrow::arrow_reader::RowSelection
 pub struct RowFilter {
     /// A list of [`ArrowPredicate`]
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index a34ce77f277..83d6f6f553f 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -141,6 +141,9 @@ impl<T> ArrowReaderBuilder<T> {
     /// An example use case of this would be applying a selection determined by
     /// evaluating predicates against the [`Index`]
     ///
+    /// It is recommended to enable reading the page index if using this functionality, to allow
+    /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`].
+    ///
     /// [`Index`]: crate::file::page_index::index::Index
     pub fn with_row_selection(self, selection: RowSelection) -> Self {
         Self {
@@ -152,6 +155,9 @@ impl<T> ArrowReaderBuilder<T> {
     /// Provide a [`RowFilter`] to skip decoding rows
     ///
     /// Row filters are applied after row group selection and row selection
+    ///
+    /// It is recommended to enable reading the page index if using this functionality, to allow
+    /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`].
     pub fn with_row_filter(self, filter: RowFilter) -> Self {
         Self {
             filter: Some(filter),
@@ -163,6 +169,9 @@ impl<T> ArrowReaderBuilder<T> {
     ///
     /// The limit will be applied after any [`Self::with_row_selection`] and [`Self::with_row_filter`]
     /// allowing it to limit the final set of rows decoded after any pushed down predicates
+    ///
+    /// It is recommended to enable reading the page index if using this functionality, to allow
+    /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`].
     pub fn with_limit(self, limit: usize) -> Self {
         Self {
             limit: Some(limit),
@@ -174,6 +183,9 @@ impl<T> ArrowReaderBuilder<T> {
     ///
     /// The offset will be applied after any [`Self::with_row_selection`] and [`Self::with_row_filter`]
     /// allowing it to skip rows after any pushed down predicates
+    ///
+    /// It is recommended to enable reading the page index if using this functionality, to allow
+    /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`].
     pub fn with_offset(self, offset: usize) -> Self {
         Self {
             offset: Some(offset),

(arrow-rs) branch master updated: Better document parquet pushdown (#5491)

Reply via email to