westonpace commented on code in PR #20597:
URL: https://github.com/apache/datafusion/pull/20597#discussion_r3266753866


##########
datafusion/substrait/src/logical_plan/producer/expr/scalar_function.rs:
##########
@@ -15,33 +15,41 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::producer::{SubstraitProducer, 
to_substrait_literal_expr};
+use crate::logical_plan::producer::{
+    SubstraitProducer, to_substrait_literal_expr, to_substrait_type,
+};
+use datafusion::arrow::datatypes::Field;
 use datafusion::common::{DFSchemaRef, ScalarValue, not_impl_err};
-use datafusion::logical_expr::{Between, BinaryExpr, Expr, Like, Operator, 
expr};
+use datafusion::logical_expr::{
+    Between, BinaryExpr, Expr, ExprSchemable, Like, Operator, expr,
+};
 use substrait::proto::expression::{RexType, ScalarFunction};
 use substrait::proto::function_argument::ArgType;
-use substrait::proto::{Expression, FunctionArgument};
+use substrait::proto::{Expression, FunctionArgument, Type};
 
 pub fn from_scalar_function(
     producer: &mut impl SubstraitProducer,
     fun: &expr::ScalarFunction,
     schema: &DFSchemaRef,
 ) -> datafusion::common::Result<Expression> {
-    from_function(producer, fun.name(), &fun.args, schema)
+    let (_, output_field) = 
Expr::ScalarFunction(fun.clone()).to_field(schema)?;
+    from_function(producer, fun.name(), &fun.args, &output_field, schema)
 }
 
 pub fn from_higher_order_function(
     producer: &mut impl SubstraitProducer,
     fun: &expr::HigherOrderFunction,
     schema: &DFSchemaRef,
 ) -> datafusion::common::Result<Expression> {
-    from_function(producer, fun.name(), &fun.args, schema)
+    let (_, output_field) = 
Expr::HigherOrderFunction(fun.clone()).to_field(schema)?;
+    from_function(producer, fun.name(), &fun.args, &output_field, schema)
 }
 
 fn from_function(
     producer: &mut impl SubstraitProducer,
     name: &str,
     args: &[Expr],
+    output_field: &Field,

Review Comment:
   Minor nit: although all the call-sites are able to come up with a `Field` 
easily enough all you really need here is a `DataType` so it might be nice to 
change this to `DataType` in case some future caller doesn't have a name handy.



##########
datafusion/substrait/src/logical_plan/producer/rel/join.rs:
##########
@@ -15,59 +15,38 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::logical_plan::producer::{SubstraitProducer, 
make_binary_op_scalar_func};
-use datafusion::common::{
-    DFSchemaRef, JoinConstraint, JoinType, NullEquality, not_impl_err,
-};
+use crate::logical_plan::producer::SubstraitProducer;
+use datafusion::common::{JoinConstraint, JoinType, NullEquality, not_impl_err};
+use datafusion::logical_expr::utils::conjunction;
 use datafusion::logical_expr::{Expr, Join, Operator};
+use datafusion::prelude::binary_expr;
 use std::sync::Arc;
 use substrait::proto::rel::RelType;
-use substrait::proto::{Expression, JoinRel, Rel, join_rel};
+use substrait::proto::{JoinRel, Rel, join_rel};
 
 pub fn from_join(

Review Comment:
   Can you add a unit test for regression purposes ensuring that the join 
expression has an output type?



##########
datafusion/substrait/src/logical_plan/producer/expr/scalar_function.rs:
##########
@@ -264,57 +297,21 @@ pub fn from_between(
         low,
         high,
     } = between;
-    if *negated {
-        // `expr NOT BETWEEN low AND high` can be translated into (expr < low 
OR high < expr)
-        let substrait_expr = producer.handle_expr(expr.as_ref(), schema)?;
-        let substrait_low = producer.handle_expr(low.as_ref(), schema)?;
-        let substrait_high = producer.handle_expr(high.as_ref(), schema)?;
-
-        let l_expr = make_binary_op_scalar_func(
-            producer,
-            &substrait_expr,
-            &substrait_low,
-            Operator::Lt,
-        );
-        let r_expr = make_binary_op_scalar_func(
-            producer,
-            &substrait_high,
-            &substrait_expr,
-            Operator::Lt,
-        );
 
-        Ok(make_binary_op_scalar_func(
-            producer,
-            &l_expr,
-            &r_expr,
-            Operator::Or,
-        ))
+    let expr = if *negated {
+        // `expr NOT BETWEEN low AND high` can be translated into (expr < low 
OR high < expr)
+        Expr::or(
+            Expr::lt(*expr.clone(), *low.clone()),
+            Expr::lt(*high.clone(), *expr.clone()),
+        )

Review Comment:
   This change seems unrelated to the problem at hand.  While I think it 
probably is correct that you can switch from BETWEEN to comparison operators I 
also think this is more of a plan rewrite than is normally done during 
Substrait conversion.  Users might be surprised if they round-trip through 
Substrait and end up with a different plan than they started with.



##########
datafusion/substrait/src/logical_plan/producer/rel/join.rs:
##########
@@ -76,33 +55,28 @@ pub fn from_join(
             left: Some(left),
             right: Some(right),
             r#type: join_type as i32,
-            expression: join_expr,
+            expression: join_expression,
             post_join_filter: None,
             advanced_extension: None,
         }))),
     }))
 }
 
 fn to_substrait_join_expr(
-    producer: &mut impl SubstraitProducer,
-    join_conditions: &Vec<(Expr, Expr)>,
-    eq_op: Operator,
-    join_schema: &DFSchemaRef,
-) -> datafusion::common::Result<Option<Expression>> {
-    // Only support AND conjunction for each binary expression in join 
conditions
-    let mut exprs: Vec<Expression> = vec![];
-    for (left, right) in join_conditions {
-        let l = producer.handle_expr(left, join_schema)?;
-        let r = producer.handle_expr(right, join_schema)?;
-        // AND with existing expression
-        exprs.push(make_binary_op_scalar_func(producer, &l, &r, eq_op));
-    }
-
-    let join_expr: Option<Expression> =
-        exprs.into_iter().reduce(|acc: Expression, e: Expression| {
-            make_binary_op_scalar_func(producer, &acc, &e, Operator::And)
-        });
-    Ok(join_expr)
+    join_on: Vec<(Expr, Expr)>,
+    null_equality: NullEquality,
+    join_filter: Option<Expr>,
+) -> Option<Expr> {
+    // Combine join on and filter conditions into a single Boolean expression 
(#7611)
+    let eq_op = match null_equality {
+        NullEquality::NullEqualsNothing => Operator::Eq,
+        NullEquality::NullEqualsNull => Operator::IsNotDistinctFrom,
+    };
+    let all_conditions = join_on
+        .into_iter()
+        .map(|(left, right)| binary_expr(left, eq_op, right))
+        .chain(join_filter);

Review Comment:
   This is clever and much more compact than the old impl, well done.  I'm not 
100% sure of the correctness but I assume the existing unit tests would catch 
it otherwise.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to