alamb commented on code in PR #7612:
URL: https://github.com/apache/arrow-datafusion/pull/7612#discussion_r1349506258
##########
datafusion/substrait/src/logical_plan/consumer.rs:
##########
@@ -341,65 +393,32 @@ pub async fn from_substrait_rel(
// The join condition expression needs full input schema and not
the output schema from join since we lose columns from
// certain join types such as semi and anti joins
let in_join_schema = left.schema().join(right.schema())?;
- // Parse post join filter if exists
- let join_filter = match &join.post_join_filter {
- Some(filter) => {
- let parsed_filter =
- from_substrait_rex(filter, &in_join_schema,
extensions).await?;
- Some(parsed_filter.as_ref().clone())
- }
- None => None,
- };
+
// If join expression exists, parse the `on` condition expression,
build join and return
- // Otherwise, build join with koin filter, without join keys
+ // Otherwise, build join with only the filter, without join keys
match &join.expression.as_ref() {
Some(expr) => {
let on =
from_substrait_rex(expr, &in_join_schema,
extensions).await?;
- let predicates = split_conjunction(&on);
- // TODO: collect only one null_eq_null
- let join_exprs: Vec<(Column, Column, bool)> = predicates
- .iter()
- .map(|p| match p {
- Expr::BinaryExpr(BinaryExpr { left, op, right })
=> {
- match (left.as_ref(), right.as_ref()) {
- (Expr::Column(l), Expr::Column(r)) =>
match op {
- Operator::Eq => Ok((l.clone(),
r.clone(), false)),
- Operator::IsNotDistinctFrom => {
- Ok((l.clone(), r.clone(), true))
- }
- _ => plan_err!("invalid join condition
op"),
- },
- _ => plan_err!("invalid join condition
expression"),
- }
- }
- _ => plan_err!(
- "Non-binary expression is not supported in
join condition"
- ),
- })
- .collect::<Result<Vec<_>>>()?;
- let (left_cols, right_cols, null_eq_nulls): (Vec<_>,
Vec<_>, Vec<_>) =
- itertools::multiunzip(join_exprs);
+ // The join expression can contain both equal and
non-equal ops.
Review Comment:
Given that the
[`ExtractEquijoinPredicate`](https://github.com/apache/arrow-datafusion/blob/093b775adc3593e9e5cb7343e28406ed458551ad/datafusion/optimizer/src/extract_equijoin_predicate.rs#L31-L41)
optimizer pass already splits up join predicates into equijoin predicates and
"other" predicates, I wonder if simply create the `LogicalPlan::Join` using
`join.expression` (and let the subsequent optimizer pass sort it out)?
Something like
```rust
left.join(
right.build()?,
join_type,
(vec![], vec![]),
on, // <-- use the filter directly here, let optimizer pass extract the
equijoin columns
nulls_equal_nulls,
)?
```
It makes me realize when looking at the API for
[`LogicalPlanBuilder::join`](https://docs.rs/datafusion/latest/datafusion/logical_expr/struct.LogicalPlanBuilder.html#method.join)
that the API is super confusing. It would be nice to improve that API to make
it clear that a `join` can just take a single `Expr` and DataFusion will sort
out figuring out the join columns, etc.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]