This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 308ebc51a4 Improve docs for `TableProvider::supports_filters_pushdown`
and remove deprecated function (#9923)
308ebc51a4 is described below
commit 308ebc51a4cfcc3924d13de56212646e644fdf3a
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Apr 5 07:13:55 2024 -0400
Improve docs for `TableProvider::supports_filters_pushdown` and remove
deprecated function (#9923)
* Minor: Improve documentation for FilterPushdown and remove deprecated
method
* Remove deprecated method
* Improve docs
* more
* Apply suggestions from code review
Co-authored-by: Phillip LeBlanc <[email protected]>
---------
Co-authored-by: Phillip LeBlanc <[email protected]>
---
datafusion/core/src/dataframe/mod.rs | 8 ++--
datafusion/core/src/datasource/cte_worktable.rs | 11 ++++--
datafusion/core/src/datasource/listing/table.rs | 44 ++++++++++++----------
datafusion/core/src/datasource/provider.rs | 43 ++++++++++++---------
datafusion/core/src/datasource/view.rs | 9 ++---
.../provider_filter_pushdown.rs | 7 +++-
datafusion/expr/src/table_source.rs | 29 +++++++++-----
7 files changed, 89 insertions(+), 62 deletions(-)
diff --git a/datafusion/core/src/dataframe/mod.rs
b/datafusion/core/src/dataframe/mod.rs
index 1db4f8ede6..683cb809a5 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -1430,12 +1430,12 @@ impl TableProvider for DataFrameTableProvider {
Some(&self.plan)
}
- fn supports_filter_pushdown(
+ fn supports_filters_pushdown(
&self,
- _filter: &Expr,
- ) -> Result<TableProviderFilterPushDown> {
+ filters: &[&Expr],
+ ) -> Result<Vec<TableProviderFilterPushDown>> {
// A filter is added on the DataFrame when given
- Ok(TableProviderFilterPushDown::Exact)
+ Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
}
fn schema(&self) -> SchemaRef {
diff --git a/datafusion/core/src/datasource/cte_worktable.rs
b/datafusion/core/src/datasource/cte_worktable.rs
index 71075839b9..f8fd94d4d3 100644
--- a/datafusion/core/src/datasource/cte_worktable.rs
+++ b/datafusion/core/src/datasource/cte_worktable.rs
@@ -89,11 +89,14 @@ impl TableProvider for CteWorkTable {
)))
}
- fn supports_filter_pushdown(
+ fn supports_filters_pushdown(
&self,
- _filter: &Expr,
- ) -> Result<TableProviderFilterPushDown> {
+ filters: &[&Expr],
+ ) -> Result<Vec<TableProviderFilterPushDown>> {
// TODO: should we support filter pushdown?
- Ok(TableProviderFilterPushDown::Unsupported)
+ Ok(vec![
+ TableProviderFilterPushDown::Unsupported;
+ filters.len()
+ ])
}
}
diff --git a/datafusion/core/src/datasource/listing/table.rs
b/datafusion/core/src/datasource/listing/table.rs
index c1e337b5c4..5ef7b6241b 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -685,26 +685,32 @@ impl TableProvider for ListingTable {
.await
}
- fn supports_filter_pushdown(
+ fn supports_filters_pushdown(
&self,
- filter: &Expr,
- ) -> Result<TableProviderFilterPushDown> {
- if expr_applicable_for_cols(
- &self
- .options
- .table_partition_cols
- .iter()
- .map(|x| x.0.clone())
- .collect::<Vec<_>>(),
- filter,
- ) {
- // if filter can be handled by partiton pruning, it is exact
- Ok(TableProviderFilterPushDown::Exact)
- } else {
- // otherwise, we still might be able to handle the filter with file
- // level mechanisms such as Parquet row group pruning.
- Ok(TableProviderFilterPushDown::Inexact)
- }
+ filters: &[&Expr],
+ ) -> Result<Vec<TableProviderFilterPushDown>> {
+ let support: Vec<_> = filters
+ .iter()
+ .map(|filter| {
+ if expr_applicable_for_cols(
+ &self
+ .options
+ .table_partition_cols
+ .iter()
+ .map(|x| x.0.clone())
+ .collect::<Vec<_>>(),
+ filter,
+ ) {
+ // if filter can be handled by partition pruning, it is
exact
+ TableProviderFilterPushDown::Exact
+ } else {
+ // otherwise, we still might be able to handle the filter
with file
+ // level mechanisms such as Parquet row group pruning.
+ TableProviderFilterPushDown::Inexact
+ }
+ })
+ .collect();
+ Ok(support)
}
fn get_table_definition(&self) -> Option<&str> {
diff --git a/datafusion/core/src/datasource/provider.rs
b/datafusion/core/src/datasource/provider.rs
index f2e3e907e5..9aac072ed4 100644
--- a/datafusion/core/src/datasource/provider.rs
+++ b/datafusion/core/src/datasource/provider.rs
@@ -96,6 +96,10 @@ pub trait TableProvider: Sync + Send {
/// which *all* of the `Expr`s evaluate to `true` must be returned (aka the
/// expressions are `AND`ed together).
///
+ /// To enable filter pushdown you must override
+ /// [`Self::supports_filters_pushdown`] as the default implementation does
+ /// not and `filters` will be empty.
+ ///
/// DataFusion pushes filtering into the scans whenever possible
/// ("Filter Pushdown"), and depending on the format and the
/// implementation of the format, evaluating the predicate during the scan
@@ -154,28 +158,31 @@ pub trait TableProvider: Sync + Send {
limit: Option<usize>,
) -> Result<Arc<dyn ExecutionPlan>>;
- /// Tests whether the table provider can make use of a filter expression
- /// to optimise data retrieval.
- #[deprecated(since = "20.0.0", note = "use supports_filters_pushdown
instead")]
- fn supports_filter_pushdown(
- &self,
- _filter: &Expr,
- ) -> Result<TableProviderFilterPushDown> {
- Ok(TableProviderFilterPushDown::Unsupported)
- }
-
- /// Tests whether the table provider can make use of any or all filter
expressions
- /// to optimise data retrieval.
- /// Note: the returned vector much have the same size as the filters
argument.
- #[allow(deprecated)]
+ /// Specify if DataFusion should provide filter expressions to the
+ /// TableProvider to apply *during* the scan.
+ ///
+ /// The return value must have one element for each filter expression
passed
+ /// in. The value of each element indicates if the TableProvider can apply
+ /// that particular filter during the scan.
+ ///
+ /// Some TableProviders can evaluate filters more efficiently than the
+ /// `Filter` operator in DataFusion, for example by using an index.
+ ///
+ /// By default, returns [`Unsupported`] for all filters, meaning no filters
+ /// will be provided to [`Self::scan`]. If the TableProvider can implement
+ /// filter pushdown, it should return either [`Exact`] or [`Inexact`].
+ ///
+ /// [`Unsupported`]: TableProviderFilterPushDown::Unsupported
+ /// [`Exact`]: TableProviderFilterPushDown::Exact
+ /// [`Inexact`]: TableProviderFilterPushDown::Inexact
fn supports_filters_pushdown(
&self,
filters: &[&Expr],
) -> Result<Vec<TableProviderFilterPushDown>> {
- filters
- .iter()
- .map(|f| self.supports_filter_pushdown(f))
- .collect()
+ Ok(vec![
+ TableProviderFilterPushDown::Unsupported;
+ filters.len()
+ ])
}
/// Get statistics for this table, if available
diff --git a/datafusion/core/src/datasource/view.rs
b/datafusion/core/src/datasource/view.rs
index d1b7dad152..31e812332c 100644
--- a/datafusion/core/src/datasource/view.rs
+++ b/datafusion/core/src/datasource/view.rs
@@ -93,13 +93,12 @@ impl TableProvider for ViewTable {
fn get_table_definition(&self) -> Option<&str> {
self.definition.as_deref()
}
-
- fn supports_filter_pushdown(
+ fn supports_filters_pushdown(
&self,
- _filter: &Expr,
- ) -> Result<TableProviderFilterPushDown> {
+ filters: &[&Expr],
+ ) -> Result<Vec<TableProviderFilterPushDown>> {
// A filter is added on the View when given
- Ok(TableProviderFilterPushDown::Exact)
+ Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
}
async fn scan(
diff --git
a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
index bc6d85a74a..2ae41391f4 100644
--- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
+++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
@@ -214,8 +214,11 @@ impl TableProvider for CustomProvider {
}
}
- fn supports_filter_pushdown(&self, _: &Expr) ->
Result<TableProviderFilterPushDown> {
- Ok(TableProviderFilterPushDown::Exact)
+ fn supports_filters_pushdown(
+ &self,
+ filters: &[&Expr],
+ ) -> Result<Vec<TableProviderFilterPushDown>> {
+ Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
}
}
diff --git a/datafusion/expr/src/table_source.rs
b/datafusion/expr/src/table_source.rs
index 565f48c1c5..f662f4d9f7 100644
--- a/datafusion/expr/src/table_source.rs
+++ b/datafusion/expr/src/table_source.rs
@@ -24,20 +24,29 @@ use datafusion_common::{Constraints, Result};
use std::any::Any;
-/// Indicates whether and how a filter expression can be handled by a
-/// TableProvider for table scans.
+/// Indicates how a filter expression is handled by
+/// [`TableProvider::scan`].
+///
+/// Filter expressions are boolean expressions used to reduce the number of
+/// rows that are read from a table. Only rows that evaluate to `true` ("pass
+/// the filter") are returned. Rows that evaluate to `false` or `NULL` are
+/// omitted.
+///
+/// [`TableProvider::scan`]:
https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html#tymethod.scan
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TableProviderFilterPushDown {
- /// The expression cannot be used by the provider.
+ /// The filter cannot be used by the provider and will not be pushed down.
Unsupported,
- /// The expression can be used to reduce the data retrieved,
- /// but the provider cannot guarantee it will omit all tuples that
- /// may be filtered. In this case, DataFusion will apply an additional
- /// `Filter` operation after the scan to ensure all rows are filtered
correctly.
+ /// The filter can be used, but the provider might still return some tuples
+ /// that do not pass the filter.
+ ///
+ /// In this case, DataFusion applies an additional `Filter` operation
+ /// after the scan to ensure all rows are filtered correctly.
Inexact,
- /// The provider **guarantees** that it will omit **all** tuples that are
- /// filtered by the filter expression. This is the fastest option, if
available
- /// as DataFusion will not apply additional filtering.
+ /// The provider **guarantees** that it will omit **only** tuples which
+ /// pass the filter.
+ ///
+ /// In this case, DataFusion will not apply additional filtering.
Exact,
}