This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 8e73844fb3 #17838 Rewrite `regexp_like` calls as `~` and `*~` operator 
expressions when possible (#17839)
8e73844fb3 is described below

commit 8e73844fb31b41df99c89348cf9b373520dc0f1f
Author: Pepijn Van Eeckhoudt <[email protected]>
AuthorDate: Mon Oct 6 16:04:14 2025 +0200

    #17838 Rewrite `regexp_like` calls as `~` and `*~` operator expressions 
when possible (#17839)
    
    * #17838 Add simplify implementation for regexp_like that rewrites as 
operator expressions when possible
    
    * Avoid cloning Exprs
    
    * Tweak code comments
    
    * Add some more sqllogictests
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/functions/src/regex/regexplike.rs       | 72 +++++++++++++++++++++-
 .../sqllogictest/test_files/regexp/regexp_like.slt | 60 ++++++++++++++++++
 .../sqllogictest/test_files/string/string_view.slt |  2 +-
 3 files changed, 131 insertions(+), 3 deletions(-)

diff --git a/datafusion/functions/src/regex/regexplike.rs 
b/datafusion/functions/src/regex/regexplike.rs
index 0554844d11..d75eb9141c 100644
--- a/datafusion/functions/src/regex/regexplike.rs
+++ b/datafusion/functions/src/regex/regexplike.rs
@@ -27,11 +27,14 @@ use datafusion_common::{
     ScalarValue,
 };
 use datafusion_expr::{
-    Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, 
TypeSignature,
-    TypeSignatureClass, Volatility,
+    binary_expr, cast, Coercion, ColumnarValue, Documentation, Expr, 
ScalarUDFImpl,
+    Signature, TypeSignature, TypeSignatureClass, Volatility,
 };
 use datafusion_macros::user_doc;
 
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
+use datafusion_expr_common::operator::Operator;
+use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
 use std::any::Any;
 use std::sync::Arc;
 
@@ -153,11 +156,76 @@ impl ScalarUDFImpl for RegexpLikeFunc {
         }
     }
 
+    fn simplify(
+        &self,
+        mut args: Vec<Expr>,
+        info: &dyn SimplifyInfo,
+    ) -> Result<ExprSimplifyResult> {
+        // Try to simplify regexp_like usage to one of the builtin operators 
since those have
+        // optimized code paths for the case where the regular expression 
pattern is a scalar.
+        // Additionally, the expression simplification optimization pass will 
attempt to further
+        // simplify regular expression patterns used in operator expressions.
+        let Some(op) = derive_operator(&args) else {
+            return Ok(ExprSimplifyResult::Original(args));
+        };
+
+        let string_type = info.get_data_type(&args[0])?;
+        let regexp_type = info.get_data_type(&args[1])?;
+        let binary_type_coercer = BinaryTypeCoercer::new(&string_type, &op, 
&regexp_type);
+        let Ok((coerced_string_type, coerced_regexp_type)) =
+            binary_type_coercer.get_input_types()
+        else {
+            return Ok(ExprSimplifyResult::Original(args));
+        };
+
+        // regexp_like(str, regexp [, flags])
+        let regexp = args.swap_remove(1);
+        let string = args.swap_remove(0);
+
+        Ok(ExprSimplifyResult::Simplified(binary_expr(
+            if string_type != coerced_string_type {
+                cast(string, coerced_string_type)
+            } else {
+                string
+            },
+            op,
+            if regexp_type != coerced_regexp_type {
+                cast(regexp, coerced_regexp_type)
+            } else {
+                regexp
+            },
+        )))
+    }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
 }
 
+fn derive_operator(args: &[Expr]) -> Option<Operator> {
+    match args.len() {
+        // regexp_like(str, regexp, flags)
+        3 => {
+            match &args[2] {
+                Expr::Literal(ScalarValue::Utf8(Some(flags)), _) => {
+                    match flags.as_str() {
+                        "i" => Some(Operator::RegexIMatch),
+                        "" => Some(Operator::RegexMatch),
+                        // Any flags besides 'i' have no operator equivalent
+                        _ => None,
+                    }
+                }
+                // `flags` is not a literal, so we can't derive the correct 
operator statically
+                _ => None,
+            }
+        }
+        // regexp_like(str, regexp)
+        2 => Some(Operator::RegexMatch),
+        // Should never happen, but just in case
+        _ => None,
+    }
+}
+
 /// Tests a string using a regular expression returning true if at
 /// least one match, false otherwise.
 ///
diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_like.slt 
b/datafusion/sqllogictest/test_files/regexp/regexp_like.slt
index 223ef22b98..dd42511ead 100644
--- a/datafusion/sqllogictest/test_files/regexp/regexp_like.slt
+++ b/datafusion/sqllogictest/test_files/regexp/regexp_like.slt
@@ -277,3 +277,63 @@ drop table strings
 
 statement ok
 drop table dict_table
+
+# Ensure that regexp_like is rewritten to use the (more optimized) regex 
operators
+statement ok
+create table regexp_test as values
+  ('foobar', 'i'),
+  ('Foo', 'i'),
+  ('bar', 'mi') ;
+
+# Expressions that can be rewritten to use the ~ operator (which is more 
optimized)
+# (expect the plans to use the ~ / ~* operators, not the REGEXP_LIKE function)
+query TT
+explain select
+  regexp_like(column1, 'fo.*'),
+  regexp_like(column1, 'fo.*', 'i'),
+from regexp_test;
+----
+logical_plan
+01)Projection: regexp_test.column1 ~ Utf8("fo.*") AS 
regexp_like(regexp_test.column1,Utf8("fo.*")), regexp_test.column1 ~* 
Utf8("fo.*") AS regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i"))
+02)--TableScan: regexp_test projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[column1@0 ~ fo.* as 
regexp_like(regexp_test.column1,Utf8("fo.*")), column1@0 ~* fo.* as 
regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i"))]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query BB
+select
+  regexp_like(column1, 'fo.*'),
+  regexp_like(column1, 'fo.*', 'i'),
+from regexp_test;
+----
+true true
+false true
+false false
+
+# Expressions that can not be rewritten to use the ~ / ~* operators
+# (expect the plans to use the REGEXP_LIKE function)
+query TT
+explain select
+  regexp_like(column1, 'f.*r', 'mi'), -- args
+  regexp_like(column1, 'f.*r', column2) -- non scalar flags
+from regexp_test;
+----
+logical_plan
+01)Projection: regexp_like(regexp_test.column1, Utf8("f.*r"), Utf8("mi")), 
regexp_like(regexp_test.column1, Utf8("f.*r"), regexp_test.column2)
+02)--TableScan: regexp_test projection=[column1, column2]
+physical_plan
+01)ProjectionExec: expr=[regexp_like(column1@0, f.*r, mi) as 
regexp_like(regexp_test.column1,Utf8("f.*r"),Utf8("mi")), 
regexp_like(column1@0, f.*r, column2@1) as 
regexp_like(regexp_test.column1,Utf8("f.*r"),regexp_test.column2)]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query BB
+select
+  regexp_like(column1, 'f.*r', 'mi'), -- args
+  regexp_like(column1, 'f.*r', column2) -- non scalar flags
+from regexp_test;
+----
+true true
+false false
+false false
+
+statement ok
+drop table if exists dict_table;
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt 
b/datafusion/sqllogictest/test_files/string/string_view.slt
index 7d10a0615d..fb67daa0b8 100644
--- a/datafusion/sqllogictest/test_files/string/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -784,7 +784,7 @@ EXPLAIN SELECT
 FROM test;
 ----
 logical_plan
-01)Projection: regexp_like(test.column1_utf8view, 
Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k
+01)Projection: test.column1_utf8view ~ 
Utf8View("^https?://(?:www\.)?([^/]+)/.*$") AS k
 02)--TableScan: test projection=[column1_utf8view]
 
 ## Ensure no casts for REGEXP_MATCH


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to