This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 3e3e9b5545 feat: add pattern for simplifying exprs like `str ~ '^foo$'` (#6369)
3e3e9b5545 is described below
commit 3e3e9b5545988620b9f403d323f3e72dcad571c3
Author: Christopher M. Wolff <[email protected]>
AuthorDate: Wed May 17 12:36:29 2023 -0700
feat: add pattern for simplifying exprs like `str ~ '^foo$'` (#6369)
* feat: add pattern for simplifying exprs like `str ~ '^foo$'`
* test: add additional tests
---
.../src/simplify_expressions/expr_simplifier.rs | 34 +++++++++++++
.../optimizer/src/simplify_expressions/regex.rs | 59 +++++++++++++++++++++-
2 files changed, 91 insertions(+), 2 deletions(-)
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 699e92a208..75f50aa3c5 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -2434,6 +2434,27 @@ mod tests {
// single word
assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"),
"%foo%"));
+ // regular expressions that match an exact literal
+ assert_change(regex_match(col("c1"), lit("^$")),
col("c1").eq(lit("")));
+ assert_change(
+ regex_not_match(col("c1"), lit("^$")),
+ col("c1").not_eq(lit("")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("^foo$")),
+ col("c1").eq(lit("foo")),
+ );
+ assert_change(
+ regex_not_match(col("c1"), lit("^foo$")),
+ col("c1").not_eq(lit("foo")),
+ );
+ assert_no_change(regex_match(col("c1"), lit("^foo|bar$")));
+ assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$")));
+ assert_no_change(regex_match(col("c1"), lit("^")));
+ assert_no_change(regex_match(col("c1"), lit("$")));
+ assert_no_change(regex_match(col("c1"), lit("$^")));
+ assert_no_change(regex_match(col("c1"), lit("$foo^")));
+
// OR-chain
assert_change(
regex_match(col("c1"), lit("foo|bar|baz")),
@@ -2453,6 +2474,19 @@ mod tests {
.and(not_like(col("c1"), "%bar%"))
.and(not_like(col("c1"), "%baz%")),
);
+ // both anchored expressions (translated to equality) and unanchored
+ assert_change(
+ regex_match(col("c1"), lit("foo|^x$|baz")),
+ like(col("c1"), "%foo%")
+ .or(col("c1").eq(lit("x")))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_not_match(col("c1"), lit("foo|^bar$|baz")),
+ not_like(col("c1"), "%foo%")
+ .and(col("c1").not_eq(lit("bar")))
+ .and(not_like(col("c1"), "%baz%")),
+ );
// Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION)
assert_no_change(regex_match(col("c1"),
lit("foo|bar|baz|blarg|bozo|etc")));
}
diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs
index a7ae14542d..35f6dcaef0 100644
--- a/datafusion/optimizer/src/simplify_expressions/regex.rs
+++ b/datafusion/optimizer/src/simplify_expressions/regex.rs
@@ -16,8 +16,8 @@
// under the License.
use datafusion_common::{DataFusionError, Result, ScalarValue};
-use datafusion_expr::{BinaryExpr, Expr, Like, Operator};
-use regex_syntax::hir::{Hir, HirKind, Literal};
+use datafusion_expr::{lit, BinaryExpr, Expr, Like, Operator};
+use regex_syntax::hir::{Hir, HirKind, Literal, Look};
/// Maximum number of regex alternations (`foo|bar|...`) that will be expanded
/// into multiple `LIKE` expressions.
const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
@@ -95,6 +95,15 @@ impl OperatorMode {
Expr::Like(like)
}
}
+
+    /// Builds the binary comparison of `left` against `right`: `NotEq` when
+    /// this mode's `not` flag is set, `Eq` otherwise.
+    fn expr_matches_literal(&self, left: Box<Expr>, right: Box<Expr>) -> Expr {
+        // Negated mode: compare with `!=`.
+        if self.not {
+            return Expr::BinaryExpr(BinaryExpr {
+                left,
+                op: Operator::NotEq,
+                right,
+            });
+        }
+
+        // Plain mode: compare with `=`.
+        Expr::BinaryExpr(BinaryExpr {
+            left,
+            op: Operator::Eq,
+            right,
+        })
+    }
}
fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {
@@ -130,6 +139,46 @@ fn is_safe_for_like(c: char) -> bool {
(c != '%') && (c != '_')
}
+/// Reports whether the elements of a `Concat` pattern have one of these shapes:
+/// - `[Look::Start, Look::End]`
+/// - `[Look::Start, Literal(_), Look::End]`
+///
+/// Any other length or element kind yields `false`.
+fn is_anchored_literal(v: &[Hir]) -> bool {
+    // Split the slice into its anchors and the optional element between them;
+    // anything that is not two or three elements long is rejected outright.
+    let (head, body, tail) = match v {
+        [head, tail] => (head, None, tail),
+        [head, body, tail] => (head, Some(body), tail),
+        _ => return false,
+    };
+
+    // Both ends must be the start / end look-around assertions.
+    let starts_anchored = matches!(head.kind(), HirKind::Look(Look::Start));
+    let ends_anchored = matches!(tail.kind(), HirKind::Look(Look::End));
+    if !(starts_anchored && ends_anchored) {
+        return false;
+    }
+
+    // When there is an element between the anchors it must be a literal.
+    match body {
+        Some(inner) => matches!(inner.kind(), HirKind::Literal(_)),
+        None => true,
+    }
+}
+
+/// Produces the string literal expression for a pattern that already passed
+/// [`is_anchored_literal`].
+///
+/// A two-element slice yields the empty-string literal. A three-element slice
+/// yields the literal built from its middle element, or `None` if that element
+/// is not a `Literal` or `str_from_literal` returns `None`. Any other length
+/// yields `None`.
+fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
+    match v {
+        // Two elements: nothing between the anchors, so the empty string.
+        [_, _] => Some(lit("")),
+        // Three elements: the literal sits between the anchors.
+        [_, middle, _] => match middle.kind() {
+            HirKind::Literal(literal) => str_from_literal(literal).map(lit),
+            _ => None,
+        },
+        _ => None,
+    }
+}
+
fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
println!("Considering hir kind: mode {mode:?} hir: {hir:?}");
match hir.kind() {
@@ -140,6 +189,12 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir:
&Hir) -> Option<Expr> {
let s = str_from_literal(l)?;
return Some(mode.expr(Box::new(left.clone()), format!("%{s}%")));
}
+ HirKind::Concat(inner) if is_anchored_literal(inner) => {
+ let right = anchored_literal_to_expr(inner)?;
+ return Some(
+ mode.expr_matches_literal(Box::new(left.clone()),
Box::new(right)),
+ );
+ }
HirKind::Concat(inner) => {
if let Some(pattern) = collect_concat_to_like_string(inner) {
return Some(mode.expr(Box::new(left.clone()), pattern));