This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 308ebc51a4 Improve docs for `TableProvider::supports_filters_pushdown` 
and remove deprecated function (#9923)
308ebc51a4 is described below

commit 308ebc51a4cfcc3924d13de56212646e644fdf3a
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Apr 5 07:13:55 2024 -0400

    Improve docs for `TableProvider::supports_filters_pushdown` and remove 
deprecated function (#9923)
    
    * Minor: Improve documentation for FilterPushdown and remove deprecated 
method
    
    * Remove deprecated method
    
    * Improve docs
    
    * more
    
    * Apply suggestions from code review
    
    Co-authored-by: Phillip LeBlanc <[email protected]>
    
    ---------
    
    Co-authored-by: Phillip LeBlanc <[email protected]>
---
 datafusion/core/src/dataframe/mod.rs               |  8 ++--
 datafusion/core/src/datasource/cte_worktable.rs    | 11 ++++--
 datafusion/core/src/datasource/listing/table.rs    | 44 ++++++++++++----------
 datafusion/core/src/datasource/provider.rs         | 43 ++++++++++++---------
 datafusion/core/src/datasource/view.rs             |  9 ++---
 .../provider_filter_pushdown.rs                    |  7 +++-
 datafusion/expr/src/table_source.rs                | 29 +++++++++-----
 7 files changed, 89 insertions(+), 62 deletions(-)

diff --git a/datafusion/core/src/dataframe/mod.rs 
b/datafusion/core/src/dataframe/mod.rs
index 1db4f8ede6..683cb809a5 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -1430,12 +1430,12 @@ impl TableProvider for DataFrameTableProvider {
         Some(&self.plan)
     }
 
-    fn supports_filter_pushdown(
+    fn supports_filters_pushdown(
         &self,
-        _filter: &Expr,
-    ) -> Result<TableProviderFilterPushDown> {
+        filters: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
         // A filter is added on the DataFrame when given
-        Ok(TableProviderFilterPushDown::Exact)
+        Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
     }
 
     fn schema(&self) -> SchemaRef {
diff --git a/datafusion/core/src/datasource/cte_worktable.rs 
b/datafusion/core/src/datasource/cte_worktable.rs
index 71075839b9..f8fd94d4d3 100644
--- a/datafusion/core/src/datasource/cte_worktable.rs
+++ b/datafusion/core/src/datasource/cte_worktable.rs
@@ -89,11 +89,14 @@ impl TableProvider for CteWorkTable {
         )))
     }
 
-    fn supports_filter_pushdown(
+    fn supports_filters_pushdown(
         &self,
-        _filter: &Expr,
-    ) -> Result<TableProviderFilterPushDown> {
+        filters: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
         // TODO: should we support filter pushdown?
-        Ok(TableProviderFilterPushDown::Unsupported)
+        Ok(vec![
+            TableProviderFilterPushDown::Unsupported;
+            filters.len()
+        ])
     }
 }
diff --git a/datafusion/core/src/datasource/listing/table.rs 
b/datafusion/core/src/datasource/listing/table.rs
index c1e337b5c4..5ef7b6241b 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -685,26 +685,32 @@ impl TableProvider for ListingTable {
             .await
     }
 
-    fn supports_filter_pushdown(
+    fn supports_filters_pushdown(
         &self,
-        filter: &Expr,
-    ) -> Result<TableProviderFilterPushDown> {
-        if expr_applicable_for_cols(
-            &self
-                .options
-                .table_partition_cols
-                .iter()
-                .map(|x| x.0.clone())
-                .collect::<Vec<_>>(),
-            filter,
-        ) {
-            // if filter can be handled by partiton pruning, it is exact
-            Ok(TableProviderFilterPushDown::Exact)
-        } else {
-            // otherwise, we still might be able to handle the filter with file
-            // level mechanisms such as Parquet row group pruning.
-            Ok(TableProviderFilterPushDown::Inexact)
-        }
+        filters: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
+        let support: Vec<_> = filters
+            .iter()
+            .map(|filter| {
+                if expr_applicable_for_cols(
+                    &self
+                        .options
+                        .table_partition_cols
+                        .iter()
+                        .map(|x| x.0.clone())
+                        .collect::<Vec<_>>(),
+                    filter,
+                ) {
+                    // if filter can be handled by partition pruning, it is 
exact
+                    TableProviderFilterPushDown::Exact
+                } else {
+                    // otherwise, we still might be able to handle the filter 
with file
+                    // level mechanisms such as Parquet row group pruning.
+                    TableProviderFilterPushDown::Inexact
+                }
+            })
+            .collect();
+        Ok(support)
     }
 
     fn get_table_definition(&self) -> Option<&str> {
diff --git a/datafusion/core/src/datasource/provider.rs 
b/datafusion/core/src/datasource/provider.rs
index f2e3e907e5..9aac072ed4 100644
--- a/datafusion/core/src/datasource/provider.rs
+++ b/datafusion/core/src/datasource/provider.rs
@@ -96,6 +96,10 @@ pub trait TableProvider: Sync + Send {
     /// which *all* of the `Expr`s evaluate to `true` must be returned (aka the
     /// expressions are `AND`ed together).
     ///
+    /// To enable filter pushdown you must override
+    /// [`Self::supports_filters_pushdown`] as the default implementation does
+    /// not and `filters` will be empty.
+    ///
     /// DataFusion pushes filtering into the scans whenever possible
     /// ("Filter Pushdown"), and depending on the format and the
     /// implementation of the format, evaluating the predicate during the scan
@@ -154,28 +158,31 @@ pub trait TableProvider: Sync + Send {
         limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>>;
 
-    /// Tests whether the table provider can make use of a filter expression
-    /// to optimise data retrieval.
-    #[deprecated(since = "20.0.0", note = "use supports_filters_pushdown 
instead")]
-    fn supports_filter_pushdown(
-        &self,
-        _filter: &Expr,
-    ) -> Result<TableProviderFilterPushDown> {
-        Ok(TableProviderFilterPushDown::Unsupported)
-    }
-
-    /// Tests whether the table provider can make use of any or all filter 
expressions
-    /// to optimise data retrieval.
-    /// Note:  the returned vector much have the same size as the filters 
argument.
-    #[allow(deprecated)]
+    /// Specify if DataFusion should provide filter expressions to the
+    /// TableProvider to apply *during* the scan.
+    ///
+    /// The return value must have one element for each filter expression 
passed
+    /// in. The value of each element indicates if the TableProvider can apply
+    /// that particular filter during the scan.
+    ///
+    /// Some TableProviders can evaluate filters more efficiently than the
+    /// `Filter` operator in DataFusion, for example by using an index.
+    ///
+    /// By default, returns [`Unsupported`] for all filters, meaning no filters
+    /// will be provided to [`Self::scan`]. If the TableProvider can implement
+    /// filter pushdown, it should return either [`Exact`] or [`Inexact`].
+    ///
+    /// [`Unsupported`]: TableProviderFilterPushDown::Unsupported
+    /// [`Exact`]: TableProviderFilterPushDown::Exact
+    /// [`Inexact`]: TableProviderFilterPushDown::Inexact
     fn supports_filters_pushdown(
         &self,
         filters: &[&Expr],
     ) -> Result<Vec<TableProviderFilterPushDown>> {
-        filters
-            .iter()
-            .map(|f| self.supports_filter_pushdown(f))
-            .collect()
+        Ok(vec![
+            TableProviderFilterPushDown::Unsupported;
+            filters.len()
+        ])
     }
 
     /// Get statistics for this table, if available
diff --git a/datafusion/core/src/datasource/view.rs 
b/datafusion/core/src/datasource/view.rs
index d1b7dad152..31e812332c 100644
--- a/datafusion/core/src/datasource/view.rs
+++ b/datafusion/core/src/datasource/view.rs
@@ -93,13 +93,12 @@ impl TableProvider for ViewTable {
     fn get_table_definition(&self) -> Option<&str> {
         self.definition.as_deref()
     }
-
-    fn supports_filter_pushdown(
+    fn supports_filters_pushdown(
         &self,
-        _filter: &Expr,
-    ) -> Result<TableProviderFilterPushDown> {
+        filters: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
         // A filter is added on the View when given
-        Ok(TableProviderFilterPushDown::Exact)
+        Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
     }
 
     async fn scan(
diff --git 
a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs 
b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
index bc6d85a74a..2ae41391f4 100644
--- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
+++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
@@ -214,8 +214,11 @@ impl TableProvider for CustomProvider {
         }
     }
 
-    fn supports_filter_pushdown(&self, _: &Expr) -> 
Result<TableProviderFilterPushDown> {
-        Ok(TableProviderFilterPushDown::Exact)
+    fn supports_filters_pushdown(
+        &self,
+        filters: &[&Expr],
+    ) -> Result<Vec<TableProviderFilterPushDown>> {
+        Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
     }
 }
 
diff --git a/datafusion/expr/src/table_source.rs 
b/datafusion/expr/src/table_source.rs
index 565f48c1c5..f662f4d9f7 100644
--- a/datafusion/expr/src/table_source.rs
+++ b/datafusion/expr/src/table_source.rs
@@ -24,20 +24,29 @@ use datafusion_common::{Constraints, Result};
 
 use std::any::Any;
 
-/// Indicates whether and how a filter expression can be handled by a
-/// TableProvider for table scans.
+/// Indicates how a filter expression is handled by
+/// [`TableProvider::scan`].
+///
+/// Filter expressions are boolean expressions used to reduce the number of
+/// rows that are read from a table. Only rows that evaluate to `true` ("pass
+/// the filter") are returned. Rows that evaluate to `false` or `NULL` are
+/// omitted.
+///
+/// [`TableProvider::scan`]: 
https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html#tymethod.scan
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum TableProviderFilterPushDown {
-    /// The expression cannot be used by the provider.
+    /// The filter cannot be used by the provider and will not be pushed down.
     Unsupported,
-    /// The expression can be used to reduce the data retrieved,
-    /// but the provider cannot guarantee it will omit all tuples that
-    /// may be filtered. In this case, DataFusion will apply an additional
-    /// `Filter` operation after the scan to ensure all rows are filtered 
correctly.
+    /// The filter can be used, but the provider might still return some tuples
+    /// that do not pass the filter.
+    ///
+    /// In this case, DataFusion applies an additional `Filter` operation
+    /// after the scan to ensure all rows are filtered correctly.
     Inexact,
-    /// The provider **guarantees** that it will omit **all** tuples that are
-    /// filtered by the filter expression. This is the fastest option, if 
available
-    /// as DataFusion will not apply additional filtering.
+    /// The provider **guarantees** that it will omit **only** tuples which
+    /// pass the filter.
+    ///
+    /// In this case, DataFusion will not apply additional filtering.
     Exact,
 }
 

Reply via email to