alamb commented on code in PR #13260: URL: https://github.com/apache/datafusion/pull/13260#discussion_r1830873028
########## datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs: ########## @@ -2987,41 +3051,41 @@ mod tests { }) } - fn like(expr: Expr, pattern: &str) -> Expr { + fn like(expr: Expr, pattern: impl Into<Expr>) -> Expr { Review Comment: I think we could use Expr::like https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.Expr.html#method.like and similar here instead of these functions This is likely left over from when the Expr API was less expressive ########## datafusion/sqllogictest/test_files/string/string_view.slt: ########## @@ -396,8 +396,9 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: test.column1_utf8view LIKE Utf8View("foo") AS like, test.column1_utf8view ILIKE Utf8View("foo") AS ilike -02)--TableScan: test projection=[column1_utf8view] +01)Projection: __common_expr_1 AS like, __common_expr_1 AS ilike Review Comment: Nice! Though I think this test's meaning is now changed it is supposed to be verifying cast exprs for like Perhaps you can change the patterns to `like '%foo%'` ########## datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs: ########## @@ -1470,34 +1471,67 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { }) => Transformed::yes(simplify_regex_expr(left, op, right)?), // Rules for Like - Expr::Like(Like { - expr, - pattern, - negated, - escape_char: _, - case_insensitive: _, - }) if matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%" - ) || matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if pattern_str == "%" - ) || matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if pattern_str == "%" - ) => - { - // exp LIKE '%' is - // - when exp is not NULL, it's true - // - when exp is NULL, it's NULL - // exp NOT LIKE '%' is - // - when exp is not NULL, it's false - // - when exp is NULL, it's NULL - Transformed::yes(Expr::Case(Case { - expr: Some(Box::new(Expr::IsNotNull(expr))), - when_then_expr: vec![(Box::new(lit(true)), Box::new(lit(!negated)))], - else_expr: None, - })) + Expr::Like(like) => { + match as_string_scalar(&like.pattern) { + Some((data_type, pattern_str)) => { + match pattern_str { + None => return Ok(Transformed::yes(lit_bool_null())), + Some(pattern_str) if pattern_str == "%" => { + // exp LIKE '%' is + // - when exp is not NULL, it's true + // - when exp is NULL, it's NULL + // exp NOT LIKE '%' is + // - when exp is not NULL, it's false + // - when exp is NULL, it's NULL + let result_for_non_null = lit(!like.negated); + Transformed::yes(if !info.nullable(&like.expr)? { + result_for_non_null + } else { + Expr::Case(Case { + expr: Some(Box::new(Expr::IsNotNull(like.expr))), + when_then_expr: vec![( + Box::new(lit(true)), + Box::new(result_for_non_null), + )], + else_expr: None, + }) + }) + } + Some(pattern_str) Review Comment: I am not sure about this rule (see questions below) -- maybe you could add some comments explanation on what it is supposed to do. ########## datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs: ########## @@ -3633,32 +3697,123 @@ mod tests { #[test] fn test_like_and_ilke() { - // LIKE '%' - let expr = like(col("c1"), "%"); + let null = lit(ScalarValue::Utf8(None)); + + // expr [NOT] [I]LIKE NULL + let expr = like(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_like(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = ilike(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_ilike(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + // expr [NOT] [I]LIKE '%' + let expr = like(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); + + let expr = not_like(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); + + let expr = ilike(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); + + let expr = not_ilike(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); + + // expr [NOT] [I]LIKE '%%' Review Comment: Also, since the code seems to handle more than two `%`, could you add a test for patterns like - `col LIKE '17%%' // 17%` - `col LIKE '%%five%% // %five%` - `col LIKE '%%%%five%%%% // %%five%%` ########## datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs: ########## @@ -3633,32 +3697,123 @@ mod tests { #[test] fn test_like_and_ilke() { - // LIKE '%' - let expr = like(col("c1"), "%"); + let null = lit(ScalarValue::Utf8(None)); + + // expr [NOT] [I]LIKE NULL + let expr = like(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_like(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = ilike(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_ilike(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + // expr [NOT] [I]LIKE '%' + let expr = like(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); + + let expr = not_like(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); + + let expr = ilike(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); + + let expr = not_ilike(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); + + // expr [NOT] [I]LIKE '%%' Review Comment: I don't understand this simplification I thought `expr LIKE '%%'` means the same as `expr = '%'` (match the actual literal character `%` ) Specifically that `%%` is how to escape the wildcard to not be a wildcard This test looks like it has applied the rewrite needed for `expr LIKE '%'` which matches any non null input 🤔 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org