This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 57f26756d3 Minor: improve `join` / `join_on` docs (#7813)
57f26756d3 is described below
commit 57f26756d3de0938e520bcdb8c8bd25722858b2b
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Oct 18 17:46:51 2023 -0400
Minor: improve `join` / `join_on` docs (#7813)
* Minor: improve join_on docs
* Update datafusion/expr/src/logical_plan/builder.rs
Co-authored-by: jakevin <[email protected]>
---------
Co-authored-by: jakevin <[email protected]>
---
datafusion/core/src/dataframe.rs | 33 +++++++++++++++++-----
datafusion/expr/src/logical_plan/builder.rs | 44 +++++++++++++++++++++--------
2 files changed, 59 insertions(+), 18 deletions(-)
diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs
index 79b8fcd519..2e192c2a78 100644
--- a/datafusion/core/src/dataframe.rs
+++ b/datafusion/core/src/dataframe.rs
@@ -582,12 +582,21 @@ impl DataFrame {
Ok(DataFrame::new(self.session_state, plan))
}
- /// Join this DataFrame with another DataFrame using the specified columns
as join keys.
+ /// Join this `DataFrame` with another `DataFrame` using explicitly
specified
+ /// columns and an optional filter expression.
///
- /// Filter expression expected to contain non-equality predicates that can
not be pushed
- /// down to any of join inputs.
- /// In case of outer join, filter applied to only matched rows.
+ /// See [`join_on`](Self::join_on) for a more concise way to specify the
+ /// join condition. Since DataFusion will automatically identify and
+ /// optimize equality predicates there is no performance difference between
+ /// this function and `join_on`
///
+ /// `left_cols` and `right_cols` are used to form "equijoin" predicates
(see
+ /// example below), which are then combined with the optional `filter`
+ /// expression.
+ ///
+ /// Note that in case of outer join, the `filter` is applied to only
matched rows.
+ ///
+ /// # Example
/// ```
/// # use datafusion::prelude::*;
/// # use datafusion::error::Result;
@@ -600,11 +609,14 @@ impl DataFrame {
/// col("a").alias("a2"),
/// col("b").alias("b2"),
/// col("c").alias("c2")])?;
+ /// // Perform the equivalent of `left INNER JOIN right ON (a = a2 AND b =
b2)`
+ /// // finding all pairs of rows from `left` and `right` where `a = a2`
and `b = b2`.
/// let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2",
"b2"], None)?;
/// let batches = join.collect().await?;
/// # Ok(())
/// # }
/// ```
+ ///
pub fn join(
self,
right: DataFrame,
@@ -624,10 +636,13 @@ impl DataFrame {
Ok(DataFrame::new(self.session_state, plan))
}
- /// Join this DataFrame with another DataFrame using the specified
expressions.
+ /// Join this `DataFrame` with another `DataFrame` using the specified
+ /// expressions.
///
- /// Simply a thin wrapper over [`join`](Self::join) where the join keys
are not provided,
- /// and the provided expressions are AND'ed together to form the filter
expression.
+ /// Note that DataFusion automatically optimizes joins, including
+ /// identifying and optimizing equality predicates.
+ ///
+ /// # Example
///
/// ```
/// # use datafusion::prelude::*;
@@ -646,6 +661,10 @@ impl DataFrame {
/// col("b").alias("b2"),
/// col("c").alias("c2"),
/// ])?;
+ ///
+ /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b
!= b2)`
+ /// // finding all pairs of rows from `left` and `right` where
+ /// // where `a != a2` and `b != b2`.
/// let join_on = left.join_on(
/// right,
/// JoinType::Inner,
diff --git a/datafusion/expr/src/logical_plan/builder.rs
b/datafusion/expr/src/logical_plan/builder.rs
index 770f39be47..cd50dbe79c 100644
--- a/datafusion/expr/src/logical_plan/builder.rs
+++ b/datafusion/expr/src/logical_plan/builder.rs
@@ -611,11 +611,19 @@ impl LogicalPlanBuilder {
})))
}
- /// Apply a join with on constraint.
+ /// Apply a join to `right` using explicitly specified columns and an
+ /// optional filter expression.
///
- /// Filter expression expected to contain non-equality predicates that can
not be pushed
- /// down to any of join inputs.
- /// In case of outer join, filter applied to only matched rows.
+ /// See [`join_on`](Self::join_on) for a more concise way to specify the
+ /// join condition. Since DataFusion will automatically identify and
+ /// optimize equality predicates there is no performance difference between
+ /// this function and `join_on`
+ ///
+ /// `left_cols` and `right_cols` are used to form "equijoin" predicates
(see
+ /// example below), which are then combined with the optional `filter`
+ /// expression.
+ ///
+ /// Note that in case of outer join, the `filter` is applied to only
matched rows.
pub fn join(
self,
right: LogicalPlan,
@@ -626,11 +634,12 @@ impl LogicalPlanBuilder {
self.join_detailed(right, join_type, join_keys, filter, false)
}
- /// Apply a join with on constraint.
+ /// Apply a join with using the specified expressions.
+ ///
+ /// Note that DataFusion automatically optimizes joins, including
+ /// identifying and optimizing equality predicates.
///
- /// The `ExtractEquijoinPredicate` optimizer pass has the ability to split
join predicates into
- /// equijoin predicates and (other) filter predicates. Therefore, if you
prefer not to manually split the
- /// join predicates, it is recommended to use the `join_on` method instead
of the `join` method.
+ /// # Example
///
/// ```
/// # use datafusion_expr::{Expr, col, LogicalPlanBuilder,
@@ -650,8 +659,15 @@ impl LogicalPlanBuilder {
///
/// let right_plan = LogicalPlanBuilder::scan("right", right_table,
None)?.build()?;
///
- /// let exprs = vec![col("left.a").eq(col("right.a")),
col("left.b").not_eq(col("right.b"))];
+ /// // Form the expression `(left.a != right.a)` AND `(left.b != right.b)`
+ /// let exprs = vec![
+ /// col("left.a").eq(col("right.a")),
+ /// col("left.b").not_eq(col("right.b"))
+ /// ];
///
+ /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b
!= b2)`
+ /// // finding all pairs of rows from `left` and `right` where
+ /// // where `a = a2` and `b != b2`.
/// let plan = LogicalPlanBuilder::scan("left", left_table, None)?
/// .join_on(right_plan, JoinType::Inner, exprs)?
/// .build()?;
@@ -688,8 +704,14 @@ impl LogicalPlanBuilder {
)
}
- /// Apply a join with on constraint and specified null equality
- /// If null_equals_null is true then null == null, else null != null
+ /// Apply a join with on constraint and specified null equality.
+ ///
+ /// The behavior is the same as [`join`](Self::join) except that it allows
+ /// specifying the null equality behavior.
+ ///
+ /// If `null_equals_null=true`, rows where both join keys are `null` will
be
+ /// emitted. Otherwise rows where either or both join keys are `null` will
be
+ /// omitted.
pub fn join_detailed(
self,
right: LogicalPlan,