This is an automated email from the ASF dual-hosted git repository.

goldmedal pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 6612d7ce0d Fix incorrect `... LIKE '%'` simplification with NULLs 
(#13259)
6612d7ce0d is described below

commit 6612d7ce0dc71c31b268863477e0aa25664c8505
Author: Piotr Findeisen <[email protected]>
AuthorDate: Wed Nov 6 03:48:45 2024 +0100

    Fix incorrect `... LIKE '%'` simplification with NULLs (#13259)
    
    * Fix incorrect `... LIKE '%'` simplification
    
    `expr LIKE '%'` was previously simplified to `true`, but the expression
    returns `NULL` when `expr` is null.
    The conversion was conditional on `!is_null(expr)` which means "is not
    always true, i.e. is not a null literal".
    
    This commit adds correct simplification logic. It additionally expands
    the rule coverage to include string view (Utf8View) and large string
    (LargeUtf8). This allows writing shared test cases even despite
    `utf8_view LIKE '%'` returning incorrect results at execution time
    (tracked by https://github.com/apache/datafusion/issues/12637). I.e. the
    simplification masks the bug for cases where pattern is statically
    known.
    
    * fixup! Fix incorrect `... LIKE '%'` simplification
    
    * fix tests (re review comments)
---
 .../src/simplify_expressions/expr_simplifier.rs    | 67 ++++++++++++++++------
 .../test_files/string/string_literal.slt           |  8 +++
 .../test_files/string/string_query.slt.part        | 50 ++++++++++++++++
 3 files changed, 109 insertions(+), 16 deletions(-)

diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs 
b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 40be1f8539..e0df6a3a68 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -1476,13 +1476,28 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for 
Simplifier<'a, S> {
                 negated,
                 escape_char: _,
                 case_insensitive: _,
-            }) if !is_null(&expr)
-                && matches!(
-                    pattern.as_ref(),
-                    Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if 
pattern_str == "%"
-                ) =>
+            }) if matches!(
+                pattern.as_ref(),
+                Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if 
pattern_str == "%"
+            ) || matches!(
+                pattern.as_ref(),
+                Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if 
pattern_str == "%"
+            ) || matches!(
+                pattern.as_ref(),
+                Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if 
pattern_str == "%"
+            ) =>
             {
-                Transformed::yes(lit(!negated))
+                // exp LIKE '%' is
+                //   - when exp is not NULL, it's true
+                //   - when exp is NULL, it's NULL
+                // exp NOT LIKE '%' is
+                //   - when exp is not NULL, it's false
+                //   - when exp is NULL, it's NULL
+                Transformed::yes(Expr::Case(Case {
+                    expr: Some(Box::new(Expr::IsNotNull(expr))),
+                    when_then_expr: vec![(Box::new(lit(true)), 
Box::new(lit(!negated)))],
+                    else_expr: None,
+                }))
             }
 
             // a is not null/unknown --> true (if a is not nullable)
@@ -2777,10 +2792,22 @@ mod tests {
         assert_no_change(regex_match(col("c1"), lit("f_o")));
 
         // empty cases
-        assert_change(regex_match(col("c1"), lit("")), lit(true));
-        assert_change(regex_not_match(col("c1"), lit("")), lit(false));
-        assert_change(regex_imatch(col("c1"), lit("")), lit(true));
-        assert_change(regex_not_imatch(col("c1"), lit("")), lit(false));
+        assert_change(
+            regex_match(col("c1"), lit("")),
+            if_not_null(col("c1"), true),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("")),
+            if_not_null(col("c1"), false),
+        );
+        assert_change(
+            regex_imatch(col("c1"), lit("")),
+            if_not_null(col("c1"), true),
+        );
+        assert_change(
+            regex_not_imatch(col("c1"), lit("")),
+            if_not_null(col("c1"), false),
+        );
 
         // single character
         assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), 
"%x%"));
@@ -3606,20 +3633,20 @@ mod tests {
 
     #[test]
     fn test_like_and_ilke() {
-        // test non-null values
+        // LIKE '%'
         let expr = like(col("c1"), "%");
-        assert_eq!(simplify(expr), lit(true));
+        assert_eq!(simplify(expr), if_not_null(col("c1"), true));
 
         let expr = not_like(col("c1"), "%");
-        assert_eq!(simplify(expr), lit(false));
+        assert_eq!(simplify(expr), if_not_null(col("c1"), false));
 
         let expr = ilike(col("c1"), "%");
-        assert_eq!(simplify(expr), lit(true));
+        assert_eq!(simplify(expr), if_not_null(col("c1"), true));
 
         let expr = not_ilike(col("c1"), "%");
-        assert_eq!(simplify(expr), lit(false));
+        assert_eq!(simplify(expr), if_not_null(col("c1"), false));
 
-        // test null values
+        // null_constant LIKE '%'
         let null = lit(ScalarValue::Utf8(None));
         let expr = like(null.clone(), "%");
         assert_eq!(simplify(expr), lit_bool_null());
@@ -4043,4 +4070,12 @@ mod tests {
             );
         }
     }
+
+    fn if_not_null(expr: Expr, then: bool) -> Expr {
+        Expr::Case(Case {
+            expr: Some(expr.is_not_null().into()),
+            when_then_expr: vec![(lit(true).into(), lit(then).into())],
+            else_expr: None,
+        })
+    }
 }
diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt 
b/datafusion/sqllogictest/test_files/string/string_literal.slt
index 80bd7fc59c..6d12d83773 100644
--- a/datafusion/sqllogictest/test_files/string/string_literal.slt
+++ b/datafusion/sqllogictest/test_files/string/string_literal.slt
@@ -821,3 +821,11 @@ query TT
 select '   ', '|'
 ----
     |
+
+query BBB
+SELECT
+    NULL LIKE '%',
+    '' LIKE '%',
+    'a' LIKE '%'
+----
+NULL true true
diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part 
b/datafusion/sqllogictest/test_files/string/string_query.slt.part
index c4975b5b8c..4960a02540 100644
--- a/datafusion/sqllogictest/test_files/string/string_query.slt.part
+++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part
@@ -873,6 +873,56 @@ NULL NULL NULL NULL NULL
 #Raphael datafusionДатаФусион false false false false
 #NULL NULL NULL NULL NULL NULL
 
+# TODO (https://github.com/apache/datafusion/issues/12637) uncomment 
additional test projections
+query TTBB
+SELECT ascii_1, unicode_1,
+      ascii_1 LIKE '%' AS ascii_1_like_percent,
+      unicode_1 LIKE '%' AS unicode_1_like_percent
+      -- ascii_1 LIKE '%%' AS ascii_1_like_percent_percent, -- TODO enable 
after fixing https://github.com/apache/datafusion/issues/12637
+      -- unicode_1 LIKE '%%' AS unicode_1_like_percent_percent -- TODO enable 
after fixing https://github.com/apache/datafusion/issues/12637
+FROM test_basic_operator
+----
+Andrew datafusion📊🔥 true true
+Xiangpeng datafusion数据融合 true true
+Raphael datafusionДатаФусион true true
+under_score un iść core true true
+percent pan Tadeusz ma iść w kąt true true
+(empty) (empty) true true
+NULL NULL NULL NULL
+NULL NULL NULL NULL
+
+# TODO (https://github.com/apache/datafusion/issues/12637) uncomment 
additional test projections
+query TTBB
+SELECT ascii_1, unicode_1,
+      ascii_1 NOT LIKE '%' AS ascii_1_not_like_percent,
+      unicode_1 NOT LIKE '%' AS unicode_1_not_like_percent
+      -- ascii_1 NOT LIKE '%%' AS ascii_1_not_like_percent_percent, -- TODO 
enable after fixing https://github.com/apache/datafusion/issues/12637
+      -- unicode_1 NOT LIKE '%%' AS unicode_1_not_like_percent_percent -- TODO 
enable after fixing https://github.com/apache/datafusion/issues/12637
+FROM test_basic_operator
+----
+Andrew datafusion📊🔥 false false
+Xiangpeng datafusion数据融合 false false
+Raphael datafusionДатаФусион false false
+under_score un iść core false false
+percent pan Tadeusz ma iść w kąt false false
+(empty) (empty) false false
+NULL NULL NULL NULL
+NULL NULL NULL NULL
+
+query T
+SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 LIKE '%'
+----
+Andrew
+Xiangpeng
+Raphael
+under_score
+percent
+(empty)
+
+query T
+SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 NOT LIKE '%'
+----
+
 # Test pattern without wildcard characters
 query TTBBBB
 select ascii_1, unicode_1,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to