This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 8b412deaca fix: Optimize `!~ '.*'` case to `col IS NULL AND Boolean(NULL)` instead of `Eq ""` (#20702)
8b412deaca is described below

commit 8b412deaca33ec79f4982019379f3c8e538ab650
Author: Peter Nguyen <[email protected]>
AuthorDate: Thu Mar 12 05:53:34 2026 -0700

    fix: Optimize `!~ '.*'` case to `col IS NULL AND Boolean(NULL)` instead of `Eq ""` (#20702)
    
    ## Which issue does this PR close?
    
    <!--
    We generally require a GitHub issue to be filed for all bug fixes and
    enhancements and this helps us generate change logs for our releases.
    You can link an issue to this PR using the GitHub syntax. For example
    `Closes #123` indicates that this PR will close issue #123.
    -->
    
    - Closes #20701
    
    ## Rationale for this change
    
    <!--
    Why are you proposing this change? If this is already explained clearly
    in the issue then this section is not needed.
    Explaining clearly why changes are proposed helps reviewers understand
    your changes and offer better suggestions for fixes.
    -->
    
    
    ## What changes are included in this PR?
    
    <!--
    There is no need to duplicate the description in the issue here but it
    is sometimes worth providing a summary of the individual changes in this
    PR.
    -->
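    A pre-existing optimization rule for the `!~ '.*'` (regexp not match)
    case rewrote the predicate to `Eq ""`, which returned rows containing
    empty strings as part of the result. This is incorrect: `.*` matches
    every non-NULL string, including the empty string, so `!~ '.*'` can
    never be true. It also doesn't match the output produced without the
    optimization rule.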
    
    Instead, this PR rewrites the plan to simply `col IS NULL AND
    Boolean(NULL)` or, in other words, "NULL if col is NULL else false."
    
    I've confirmed this behavior matches the result of running queries
    manually with the optimization rule turned off.
    
    ## Are these changes tested?
    
    <!--
    We typically require tests for all PRs in order to:
    1. Prevent the code from being accidentally broken by subsequent changes
    2. Serve as another way to document the expected behavior of the code
    
    If tests are not included in your PR, please explain why (for example,
    are they covered by existing tests)?
    -->
    Fixed the expected output in existing tests and added new tests for NULLs.
    
    ## Are there any user-facing changes?
    
    <!--
    If there are user-facing changes then we may require documentation to be
    updated before approving the PR.
    -->
    
    <!--
    If there are any breaking changes to public APIs, please add the `api
    change` label.
    -->
    
    Yes, a minor bug fix. When querying `s !~ '.*'`, empty strings will no
    longer be included in the result, which is consistent with the behavior
    without the optimization rule.
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 .../optimizer/src/simplify_expressions/regex.rs    | 13 +++--
 .../src/simplify_expressions/simplify_exprs.rs     | 55 ++++++++++++++++++----
 .../sqllogictest/test_files/simplify_expr.slt      | 19 +++++++-
 3 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs 
b/datafusion/optimizer/src/simplify_expressions/regex.rs
index 6c2492d054..b341c328e9 100644
--- a/datafusion/optimizer/src/simplify_expressions/regex.rs
+++ b/datafusion/optimizer/src/simplify_expressions/regex.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use datafusion_common::tree_node::Transformed;
-use datafusion_common::{DataFusionError, Result};
+use datafusion_common::{DataFusionError, Result, ScalarValue};
 use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit};
 use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};
 
@@ -39,7 +39,7 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*";
 /// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'`
 /// - combinations (alternatives) of the above, will be concatenated with `OR` 
or `AND`
 /// - `EQ .*` to NotNull
-/// - `NE .*` means IS EMPTY
+/// - `NE .*` to col IS NULL AND Boolean(NULL) (false for any string, or NULL 
if col is NULL)
 ///
 /// Dev note: unit tests of this function are in `expr_simplifier.rs`, case 
`test_simplify_regex`.
 pub fn simplify_regex_expr(
@@ -68,12 +68,11 @@ pub fn simplify_regex_expr(
     // Handle the special case for ".*" pattern
     if pattern == ANY_CHAR_REGEX_PATTERN {
         let new_expr = if mode.not {
-            // not empty
-            let empty_lit = Box::new(string_scalar.to_expr(""));
+            let null_bool = lit(ScalarValue::Boolean(None));
             Expr::BinaryExpr(BinaryExpr {
-                left,
-                op: Operator::Eq,
-                right: empty_lit,
+                left: Box::new(left.is_null()),
+                op: Operator::And,
+                right: Box::new(null_bool),
             })
         } else {
             // not null
diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs 
b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
index f7f1000150..2114c5ef3d 100644
--- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
+++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
@@ -155,6 +155,7 @@ mod tests {
     use arrow::datatypes::{DataType, Field, Schema};
     use chrono::{DateTime, Utc};
 
+    use datafusion_common::ScalarValue;
     use datafusion_expr::logical_plan::builder::table_scan_with_filters;
     use datafusion_expr::logical_plan::table_scan;
     use datafusion_expr::*;
@@ -870,7 +871,7 @@ mod tests {
         ]);
         let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
 
-        // Test `= ".*"` transforms to true (except for empty strings)
+        // Test `~ ".*"` transforms to true for any non-NULL string
         let plan = LogicalPlanBuilder::from(table_scan.clone())
             .filter(binary_expr(col("a"), Operator::RegexMatch, lit(".*")))?
             .build()?;
@@ -883,22 +884,22 @@ mod tests {
         "
         )?;
 
-        // Test `!= ".*"` transforms to checking if the column is empty
+        // Test `!~ ".*"` preserves NULL semantics while remaining false for 
non-NULL strings
         let plan = LogicalPlanBuilder::from(table_scan.clone())
             .filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))?
             .build()?;
 
         assert_optimized_plan_equal!(
             plan,
-            @ r#"
-        Filter: test.a = Utf8("")
+            @ r"
+        Filter: test.a IS NULL AND Boolean(NULL)
           TableScan: test
-        "#
+        "
         )?;
 
         // Test case-insensitive versions
 
-        // Test `=~ ".*"` (case-insensitive) transforms to true (except for 
empty strings)
+        // Test `~* ".*"` transforms to true for any non-NULL string
         let plan = LogicalPlanBuilder::from(table_scan.clone())
             .filter(binary_expr(col("b"), Operator::RegexIMatch, lit(".*")))?
             .build()?;
@@ -911,17 +912,51 @@ mod tests {
         "
         )?;
 
-        // Test `!~ ".*"` (case-insensitive) transforms to checking if the 
column is empty
+        // Test NULL `!~ ".*"` transforms to Boolean(NULL)
+        let plan = LogicalPlanBuilder::from(table_scan.clone())
+            .filter(binary_expr(
+                lit(ScalarValue::Utf8(None)),
+                Operator::RegexNotMatch,
+                lit(".*"),
+            ))?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @ r"
+        Filter: Boolean(NULL)
+          TableScan: test
+        "
+        )?;
+
+        // Test `!~* ".*"` preserves NULL semantics while remaining false for 
non-NULL strings
         let plan = LogicalPlanBuilder::from(table_scan.clone())
             .filter(binary_expr(col("a"), Operator::RegexNotIMatch, 
lit(".*")))?
             .build()?;
 
         assert_optimized_plan_equal!(
             plan,
-            @ r#"
-        Filter: test.a = Utf8("")
+            @ r"
+        Filter: test.a IS NULL AND Boolean(NULL)
           TableScan: test
-        "#
+        "
+        )?;
+
+        // Test NULL `!~* ".*"` transforms to Boolean(NULL)
+        let plan = LogicalPlanBuilder::from(table_scan.clone())
+            .filter(binary_expr(
+                lit(ScalarValue::Utf8(None)),
+                Operator::RegexNotIMatch,
+                lit(".*"),
+            ))?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @ r"
+        Filter: Boolean(NULL)
+          TableScan: test
+        "
         )
     }
 
diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt 
b/datafusion/sqllogictest/test_files/simplify_expr.slt
index f8c219e052..58ec7a1b26 100644
--- a/datafusion/sqllogictest/test_files/simplify_expr.slt
+++ b/datafusion/sqllogictest/test_files/simplify_expr.slt
@@ -44,12 +44,27 @@ query TT
 explain select b from t where b !~ '.*'
 ----
 logical_plan
-01)Filter: t.b = Utf8View("")
+01)Filter: t.b IS NULL AND Boolean(NULL)
 02)--TableScan: t projection=[b]
 physical_plan
-01)FilterExec: b@0 =
+01)FilterExec: b@0 IS NULL AND NULL
 02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
+query TB
+WITH vals(id, col) AS (
+    VALUES
+      (1, 'foo'::text),
+      (2, ''::text),
+      (3, NULL::text)
+)
+SELECT col, col !~ '.*'
+FROM vals
+ORDER BY id
+----
+foo false
+(empty) false
+NULL NULL
+
 query T
 select b from t where b ~ '.*'
 ----


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to