This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 57f26756d3 Minor: improve `join` / `join_on` docs (#7813)
57f26756d3 is described below

commit 57f26756d3de0938e520bcdb8c8bd25722858b2b
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Oct 18 17:46:51 2023 -0400

    Minor: improve `join` / `join_on` docs (#7813)
    
    * Minor: improve join_on docs
    
    * Update datafusion/expr/src/logical_plan/builder.rs
    
    Co-authored-by: jakevin <[email protected]>
    
    ---------
    
    Co-authored-by: jakevin <[email protected]>
---
 datafusion/core/src/dataframe.rs            | 33 +++++++++++++++++-----
 datafusion/expr/src/logical_plan/builder.rs | 44 +++++++++++++++++++++--------
 2 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs
index 79b8fcd519..2e192c2a78 100644
--- a/datafusion/core/src/dataframe.rs
+++ b/datafusion/core/src/dataframe.rs
@@ -582,12 +582,21 @@ impl DataFrame {
         Ok(DataFrame::new(self.session_state, plan))
     }
 
-    /// Join this DataFrame with another DataFrame using the specified columns 
as join keys.
+    /// Join this `DataFrame` with another `DataFrame` using explicitly 
specified
+    /// columns and an optional filter expression.
     ///
-    /// Filter expression expected to contain non-equality predicates that can 
not be pushed
-    /// down to any of join inputs.
-    /// In case of outer join, filter applied to only matched rows.
+    /// See [`join_on`](Self::join_on) for a more concise way to specify the
+    /// join condition. Since DataFusion will automatically identify and
+    /// optimize equality predicates there is no performance difference between
+    /// this function and `join_on`
     ///
+    /// `left_cols` and `right_cols` are used to form "equijoin" predicates 
(see
+    /// example below), which are then combined with the optional `filter`
+    /// expression.
+    ///
+    /// Note that in case of outer join, the `filter` is applied to only 
matched rows.
+    ///
+    /// # Example
     /// ```
     /// # use datafusion::prelude::*;
     /// # use datafusion::error::Result;
@@ -600,11 +609,14 @@ impl DataFrame {
     ///     col("a").alias("a2"),
     ///     col("b").alias("b2"),
     ///     col("c").alias("c2")])?;
+    /// // Perform the equivalent of `left INNER JOIN right ON (a = a2 AND b = 
b2)`
+    /// // finding all pairs of rows from `left` and `right` where `a = a2` 
and `b = b2`.
     /// let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", 
"b2"], None)?;
     /// let batches = join.collect().await?;
     /// # Ok(())
     /// # }
     /// ```
+    ///
     pub fn join(
         self,
         right: DataFrame,
@@ -624,10 +636,13 @@ impl DataFrame {
         Ok(DataFrame::new(self.session_state, plan))
     }
 
-    /// Join this DataFrame with another DataFrame using the specified 
expressions.
+    /// Join this `DataFrame` with another `DataFrame` using the specified
+    /// expressions.
     ///
-    /// Simply a thin wrapper over [`join`](Self::join) where the join keys 
are not provided,
-    /// and the provided expressions are AND'ed together to form the filter 
expression.
+    /// Note that DataFusion automatically optimizes joins, including
+    /// identifying and optimizing equality predicates.
+    ///
+    /// # Example
     ///
     /// ```
     /// # use datafusion::prelude::*;
@@ -646,6 +661,10 @@ impl DataFrame {
     ///         col("b").alias("b2"),
     ///         col("c").alias("c2"),
     ///     ])?;
+    ///
+    /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b 
!= b2)`
+    /// // finding all pairs of rows from `left` and `right` where
+    /// // where `a != a2` and `b != b2`.
     /// let join_on = left.join_on(
     ///     right,
     ///     JoinType::Inner,
diff --git a/datafusion/expr/src/logical_plan/builder.rs 
b/datafusion/expr/src/logical_plan/builder.rs
index 770f39be47..cd50dbe79c 100644
--- a/datafusion/expr/src/logical_plan/builder.rs
+++ b/datafusion/expr/src/logical_plan/builder.rs
@@ -611,11 +611,19 @@ impl LogicalPlanBuilder {
         })))
     }
 
-    /// Apply a join with on constraint.
+    /// Apply a join to `right` using explicitly specified columns and an
+    /// optional filter expression.
     ///
-    /// Filter expression expected to contain non-equality predicates that can 
not be pushed
-    /// down to any of join inputs.
-    /// In case of outer join, filter applied to only matched rows.
+    /// See [`join_on`](Self::join_on) for a more concise way to specify the
+    /// join condition. Since DataFusion will automatically identify and
+    /// optimize equality predicates there is no performance difference between
+    /// this function and `join_on`
+    ///
+    /// `left_cols` and `right_cols` are used to form "equijoin" predicates 
(see
+    /// example below), which are then combined with the optional `filter`
+    /// expression.
+    ///
+    /// Note that in case of outer join, the `filter` is applied to only 
matched rows.
     pub fn join(
         self,
         right: LogicalPlan,
@@ -626,11 +634,12 @@ impl LogicalPlanBuilder {
         self.join_detailed(right, join_type, join_keys, filter, false)
     }
 
-    /// Apply a join with on constraint.
+    /// Apply a join with using the specified expressions.
+    ///
+    /// Note that DataFusion automatically optimizes joins, including
+    /// identifying and optimizing equality predicates.
     ///
-    /// The `ExtractEquijoinPredicate` optimizer pass has the ability to split 
join predicates into
-    /// equijoin predicates and (other) filter predicates. Therefore, if you 
prefer not to manually split the
-    /// join predicates, it is recommended to use the `join_on` method instead 
of the `join` method.
+    /// # Example
     ///
     /// ```
     /// # use datafusion_expr::{Expr, col, LogicalPlanBuilder,
@@ -650,8 +659,15 @@ impl LogicalPlanBuilder {
     ///
     /// let right_plan = LogicalPlanBuilder::scan("right", right_table, 
None)?.build()?;
     ///
-    /// let exprs = vec![col("left.a").eq(col("right.a")), 
col("left.b").not_eq(col("right.b"))];
+    /// // Form the expression `(left.a != right.a)` AND `(left.b != right.b)`
+    /// let exprs = vec![
+    ///     col("left.a").eq(col("right.a")),
+    ///     col("left.b").not_eq(col("right.b"))
+    ///  ];
     ///
+    /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b 
!= b2)`
+    /// // finding all pairs of rows from `left` and `right` where
+    /// // where `a = a2` and `b != b2`.
     /// let plan = LogicalPlanBuilder::scan("left", left_table, None)?
     ///     .join_on(right_plan, JoinType::Inner, exprs)?
     ///     .build()?;
@@ -688,8 +704,14 @@ impl LogicalPlanBuilder {
         )
     }
 
-    /// Apply a join with on constraint and specified null equality
-    /// If null_equals_null is true then null == null, else null != null
+    /// Apply a join with on constraint and specified null equality.
+    ///
+    /// The behavior is the same as [`join`](Self::join) except that it allows
+    /// specifying the null equality behavior.
+    ///
+    /// If `null_equals_null=true`, rows where both join keys are `null` will 
be
+    /// emitted. Otherwise rows where either or both join keys are `null` will 
be
+    /// omitted.
     pub fn join_detailed(
         self,
         right: LogicalPlan,

Reply via email to