This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new fb37105824 Minor: Improve TableProvider document, and add ascii art 
(#7759)
fb37105824 is described below

commit fb371058244c00325ec8f779623ce223b9b50b86
Author: Andrew Lamb <[email protected]>
AuthorDate: Sat Oct 7 15:51:44 2023 -0400

    Minor: Improve TableProvider document, and add ascii art (#7759)
    
    * Improve TableProvider document, and add ascii art
    
    * Update datafusion/core/src/datasource/provider.rs
---
 datafusion/core/src/datasource/provider.rs | 81 ++++++++++++++++++++++++++----
 1 file changed, 72 insertions(+), 9 deletions(-)

diff --git a/datafusion/core/src/datasource/provider.rs 
b/datafusion/core/src/datasource/provider.rs
index 5ebcc45b57..af99c12d39 100644
--- a/datafusion/core/src/datasource/provider.rs
+++ b/datafusion/core/src/datasource/provider.rs
@@ -54,24 +54,87 @@ pub trait TableProvider: Sync + Send {
         None
     }
 
-    /// Get the Logical Plan of this table, if available.
+    /// Get the [`LogicalPlan`] of this table, if available
     fn get_logical_plan(&self) -> Option<&LogicalPlan> {
         None
     }
 
-    /// Create an ExecutionPlan that will scan the table.
-    /// The table provider will be usually responsible of grouping
-    /// the source data into partitions that can be efficiently
-    /// parallelized or distributed.
+    /// Create an [`ExecutionPlan`] for scanning the table with optionally
+    /// specified `projection`, `filter` and `limit`, described below.
+    ///
+    /// The `ExecutionPlan` is responsible scanning the datasource's
+    /// partitions in a streaming, parallelized fashion.
+    ///
+    /// # Projection
+    ///
+    /// If specified, only a subset of columns should be returned, in the order
+    /// specified. The projection is a set of indexes of the fields in
+    /// [`Self::schema`].
+    ///
+    /// DataFusion provides the projection to scan only the columns actually
+    /// used in the query to improve performance, an optimization  called
+    /// "Projection Pushdown". Some datasources, such as Parquet, can use this
+    /// information to go significantly faster when only a subset of columns is
+    /// required.
+    ///
+    /// # Filters
+    ///
+    /// A list of boolean filter [`Expr`]s to evaluate *during* the scan, in 
the
+    /// manner specified by [`Self::supports_filters_pushdown`]. Only rows for
+    /// which *all* of the `Expr`s evaluate to `true` must be returned (aka the
+    /// expressions are `AND`ed together).
+    ///
+    /// DataFusion pushes filtering into the scans whenever possible
+    /// ("Projection Pushdown"), and depending on the format and the
+    /// implementation of the format, evaluating the predicate during the scan
+    /// can increase performance significantly.
+    ///
+    /// ## Note: Some columns may appear *only* in Filters
+    ///
+    /// In certain cases, a query may only use a certain column in a Filter 
that
+    /// has been completely pushed down to the scan. In this case, the
+    /// projection will not contain all the columns found in the filter
+    /// expressions.
+    ///
+    /// For example, given the query `SELECT t.a FROM t WHERE t.b > 5`,
+    ///
+    /// ```text
+    /// ┌────────────────────┐
+    /// │  Projection(t.a)   │
+    /// └────────────────────┘
+    ///            ▲
+    ///            │
+    ///            │
+    /// ┌────────────────────┐     Filter     ┌────────────────────┐   
Projection    ┌────────────────────┐
+    /// │  Filter(t.b > 5)   │────Pushdown──▶ │  Projection(t.a)   │ 
───Pushdown───▶ │  Projection(t.a)   │
+    /// └────────────────────┘                └────────────────────┘           
      └────────────────────┘
+    ///            ▲                                     ▲                     
                 ▲
+    ///            │                                     │                     
                 │
+    ///            │                                     │                     
      ┌────────────────────┐
+    /// ┌────────────────────┐                ┌────────────────────┐           
      │        Scan        │
+    /// │        Scan        │                │        Scan        │           
      │  filter=(t.b > 5)  │
+    /// └────────────────────┘                │  filter=(t.b > 5)  │           
      │  projection=(t.a)  │
+    ///                                       └────────────────────┘           
      └────────────────────┘
+    ///
+    /// Initial Plan                  If `TableProviderFilterPushDown`         
  Projection pushdown notes that
+    ///                               returns true, filter pushdown            
  the scan only needs t.a
+    ///                               pushes the filter into the scan
+    ///                                                                        
  BUT internally evaluating the
+    ///                                                                        
  predicate still requires t.b
+    /// ```
+    ///
+    /// # Limit
+    ///
+    /// If `limit` is specified,  must only produce *at least* this many rows,
+    /// (though it may return more).  Like Projection Pushdown and Filter
+    /// Pushdown, DataFusion pushes `LIMIT`s  as far down in the plan as
+    /// possible, called "Limit Pushdown" as some sources can use this
+    /// information to improve their performance.
     async fn scan(
         &self,
         state: &SessionState,
         projection: Option<&Vec<usize>>,
         filters: &[Expr],
-        // limit can be used to reduce the amount scanned
-        // from the datasource as a performance optimization.
-        // If set, it contains the amount of rows needed by the `LogicalPlan`,
-        // The datasource should return *at least* this number of rows if 
available.
         limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>>;
 

Reply via email to