This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 8e73844fb3 #17838 Rewrite `regexp_like` calls as `~` and `*~` operator
expressions when possible (#17839)
8e73844fb3 is described below
commit 8e73844fb31b41df99c89348cf9b373520dc0f1f
Author: Pepijn Van Eeckhoudt <[email protected]>
AuthorDate: Mon Oct 6 16:04:14 2025 +0200
#17838 Rewrite `regexp_like` calls as `~` and `*~` operator expressions
when possible (#17839)
* #17838 Add simplify implementation for regexp_like that rewrites as
operator expressions when possible
* Avoid cloning Exprs
* Tweak code comments
* Add some more sqllogictests
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/functions/src/regex/regexplike.rs | 72 +++++++++++++++++++++-
.../sqllogictest/test_files/regexp/regexp_like.slt | 60 ++++++++++++++++++
.../sqllogictest/test_files/string/string_view.slt | 2 +-
3 files changed, 131 insertions(+), 3 deletions(-)
diff --git a/datafusion/functions/src/regex/regexplike.rs
b/datafusion/functions/src/regex/regexplike.rs
index 0554844d11..d75eb9141c 100644
--- a/datafusion/functions/src/regex/regexplike.rs
+++ b/datafusion/functions/src/regex/regexplike.rs
@@ -27,11 +27,14 @@ use datafusion_common::{
ScalarValue,
};
use datafusion_expr::{
- Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature,
TypeSignature,
- TypeSignatureClass, Volatility,
+ binary_expr, cast, Coercion, ColumnarValue, Documentation, Expr,
ScalarUDFImpl,
+ Signature, TypeSignature, TypeSignatureClass, Volatility,
};
use datafusion_macros::user_doc;
+use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
+use datafusion_expr_common::operator::Operator;
+use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
use std::any::Any;
use std::sync::Arc;
@@ -153,11 +156,76 @@ impl ScalarUDFImpl for RegexpLikeFunc {
}
}
+ fn simplify(
+ &self,
+ mut args: Vec<Expr>,
+ info: &dyn SimplifyInfo,
+ ) -> Result<ExprSimplifyResult> {
+ // Try to simplify regexp_like usage to one of the builtin operators
since those have
+ // optimized code paths for the case where the regular expression
pattern is a scalar.
+ // Additionally, the expression simplification optimization pass will
attempt to further
+ // simplify regular expression patterns used in operator expressions.
+ let Some(op) = derive_operator(&args) else {
+ return Ok(ExprSimplifyResult::Original(args));
+ };
+
+ let string_type = info.get_data_type(&args[0])?;
+ let regexp_type = info.get_data_type(&args[1])?;
+ let binary_type_coercer = BinaryTypeCoercer::new(&string_type, &op,
®exp_type);
+ let Ok((coerced_string_type, coerced_regexp_type)) =
+ binary_type_coercer.get_input_types()
+ else {
+ return Ok(ExprSimplifyResult::Original(args));
+ };
+
+ // regexp_like(str, regexp [, flags])
+ let regexp = args.swap_remove(1);
+ let string = args.swap_remove(0);
+
+ Ok(ExprSimplifyResult::Simplified(binary_expr(
+ if string_type != coerced_string_type {
+ cast(string, coerced_string_type)
+ } else {
+ string
+ },
+ op,
+ if regexp_type != coerced_regexp_type {
+ cast(regexp, coerced_regexp_type)
+ } else {
+ regexp
+ },
+ )))
+ }
+
fn documentation(&self) -> Option<&Documentation> {
self.doc()
}
}
+fn derive_operator(args: &[Expr]) -> Option<Operator> {
+ match args.len() {
+ // regexp_like(str, regexp, flags)
+ 3 => {
+ match &args[2] {
+ Expr::Literal(ScalarValue::Utf8(Some(flags)), _) => {
+ match flags.as_str() {
+ "i" => Some(Operator::RegexIMatch),
+ "" => Some(Operator::RegexMatch),
+ // Any flags besides 'i' have no operator equivalent
+ _ => None,
+ }
+ }
+ // `flags` is not a literal, so we can't derive the correct
operator statically
+ _ => None,
+ }
+ }
+ // regexp_like(str, regexp)
+ 2 => Some(Operator::RegexMatch),
+ // Should never happen, but just in case
+ _ => None,
+ }
+}
+
/// Tests a string using a regular expression returning true if at
/// least one match, false otherwise.
///
diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_like.slt
b/datafusion/sqllogictest/test_files/regexp/regexp_like.slt
index 223ef22b98..dd42511ead 100644
--- a/datafusion/sqllogictest/test_files/regexp/regexp_like.slt
+++ b/datafusion/sqllogictest/test_files/regexp/regexp_like.slt
@@ -277,3 +277,63 @@ drop table strings
statement ok
drop table dict_table
+
+# Ensure that regexp_like is rewritten to use the (more optimized) regex
operators
+statement ok
+create table regexp_test as values
+ ('foobar', 'i'),
+ ('Foo', 'i'),
+ ('bar', 'mi') ;
+
+# Expressions that can be rewritten to use the ~ operator (which is more
optimized)
+# (expect the plans to use the ~ / ~* operators, not the REGEXP_LIKE function)
+query TT
+explain select
+ regexp_like(column1, 'fo.*'),
+ regexp_like(column1, 'fo.*', 'i'),
+from regexp_test;
+----
+logical_plan
+01)Projection: regexp_test.column1 ~ Utf8("fo.*") AS
regexp_like(regexp_test.column1,Utf8("fo.*")), regexp_test.column1 ~*
Utf8("fo.*") AS regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i"))
+02)--TableScan: regexp_test projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[column1@0 ~ fo.* as
regexp_like(regexp_test.column1,Utf8("fo.*")), column1@0 ~* fo.* as
regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i"))]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query BB
+select
+ regexp_like(column1, 'fo.*'),
+ regexp_like(column1, 'fo.*', 'i'),
+from regexp_test;
+----
+true true
+false true
+false false
+
+# Expressions that can not be rewritten to use the ~ / ~* operators
+# (expect the plans to use the REGEXP_LIKE function)
+query TT
+explain select
+ regexp_like(column1, 'f.*r', 'mi'), -- args
+ regexp_like(column1, 'f.*r', column2) -- non scalar flags
+from regexp_test;
+----
+logical_plan
+01)Projection: regexp_like(regexp_test.column1, Utf8("f.*r"), Utf8("mi")),
regexp_like(regexp_test.column1, Utf8("f.*r"), regexp_test.column2)
+02)--TableScan: regexp_test projection=[column1, column2]
+physical_plan
+01)ProjectionExec: expr=[regexp_like(column1@0, f.*r, mi) as
regexp_like(regexp_test.column1,Utf8("f.*r"),Utf8("mi")),
regexp_like(column1@0, f.*r, column2@1) as
regexp_like(regexp_test.column1,Utf8("f.*r"),regexp_test.column2)]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
+
+query BB
+select
+ regexp_like(column1, 'f.*r', 'mi'), -- args
+ regexp_like(column1, 'f.*r', column2) -- non scalar flags
+from regexp_test;
+----
+true true
+false false
+false false
+
+statement ok
+drop table if exists dict_table;
diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt
b/datafusion/sqllogictest/test_files/string/string_view.slt
index 7d10a0615d..fb67daa0b8 100644
--- a/datafusion/sqllogictest/test_files/string/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -784,7 +784,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
-01)Projection: regexp_like(test.column1_utf8view,
Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k
+01)Projection: test.column1_utf8view ~
Utf8View("^https?://(?:www\.)?([^/]+)/.*$") AS k
02)--TableScan: test projection=[column1_utf8view]
## Ensure no casts for REGEXP_MATCH
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]