This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 3e3e9b5545 feat: add pattern for simplifying exprs like `str ~ '^foo$'` (#6369)
3e3e9b5545 is described below

commit 3e3e9b5545988620b9f403d323f3e72dcad571c3
Author: Christopher M. Wolff <[email protected]>
AuthorDate: Wed May 17 12:36:29 2023 -0700

    feat: add pattern for simplifying exprs like `str ~ '^foo$'` (#6369)
    
    * feat: add pattern for simplifying exprs like `str ~ '^foo$'`
    
    * test: add additional tests
---
 .../src/simplify_expressions/expr_simplifier.rs    | 34 +++++++++++++
 .../optimizer/src/simplify_expressions/regex.rs    | 59 +++++++++++++++++++++-
 2 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 699e92a208..75f50aa3c5 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -2434,6 +2434,27 @@ mod tests {
         // single word
         assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%"));
 
+        // regular expressions that match an exact literal
+        assert_change(regex_match(col("c1"), lit("^$")), col("c1").eq(lit("")));
+        assert_change(
+            regex_not_match(col("c1"), lit("^$")),
+            col("c1").not_eq(lit("")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("^foo$")),
+            col("c1").eq(lit("foo")),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("^foo$")),
+            col("c1").not_eq(lit("foo")),
+        );
+        assert_no_change(regex_match(col("c1"), lit("^foo|bar$")));
+        assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$")));
+        assert_no_change(regex_match(col("c1"), lit("^")));
+        assert_no_change(regex_match(col("c1"), lit("$")));
+        assert_no_change(regex_match(col("c1"), lit("$^")));
+        assert_no_change(regex_match(col("c1"), lit("$foo^")));
+
         // OR-chain
         assert_change(
             regex_match(col("c1"), lit("foo|bar|baz")),
@@ -2453,6 +2474,19 @@ mod tests {
                 .and(not_like(col("c1"), "%bar%"))
                 .and(not_like(col("c1"), "%baz%")),
         );
+        // both anchored expressions (translated to equality) and unanchored
+        assert_change(
+            regex_match(col("c1"), lit("foo|^x$|baz")),
+            like(col("c1"), "%foo%")
+                .or(col("c1").eq(lit("x")))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("foo|^bar$|baz")),
+            not_like(col("c1"), "%foo%")
+                .and(col("c1").not_eq(lit("bar")))
+                .and(not_like(col("c1"), "%baz%")),
+        );
         // Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION)
         assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc")));
     }
diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs
index a7ae14542d..35f6dcaef0 100644
--- a/datafusion/optimizer/src/simplify_expressions/regex.rs
+++ b/datafusion/optimizer/src/simplify_expressions/regex.rs
@@ -16,8 +16,8 @@
 // under the License.
 
 use datafusion_common::{DataFusionError, Result, ScalarValue};
-use datafusion_expr::{BinaryExpr, Expr, Like, Operator};
-use regex_syntax::hir::{Hir, HirKind, Literal};
+use datafusion_expr::{lit, BinaryExpr, Expr, Like, Operator};
+use regex_syntax::hir::{Hir, HirKind, Literal, Look};
 
 /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions.
 const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
@@ -95,6 +95,15 @@ impl OperatorMode {
             Expr::Like(like)
         }
     }
+
+    fn expr_matches_literal(&self, left: Box<Expr>, right: Box<Expr>) -> Expr {
+        let op = if self.not {
+            Operator::NotEq
+        } else {
+            Operator::Eq
+        };
+        Expr::BinaryExpr(BinaryExpr { left, op, right })
+    }
 }
 
 fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {
@@ -130,6 +139,46 @@ fn is_safe_for_like(c: char) -> bool {
     (c != '%') && (c != '_')
 }
 
+/// returns true if the elements in a `Concat` pattern are:
+/// - `[Look::Start, Look::End]`
+/// - `[Look::Start, Literal(_), Look::End]`
+fn is_anchored_literal(v: &[Hir]) -> bool {
+    match v.len() {
+        2..=3 => (),
+        _ => return false,
+    };
+
+    let first_last = (
+        v.first().expect("length checked"),
+        v.last().expect("length checked"),
+    );
+    if !matches!(first_last,
+    (s, e) if s.kind() == &HirKind::Look(Look::Start)
+        && e.kind() == &HirKind::Look(Look::End)
+         )
+    {
+        return false;
+    }
+
+    v.iter()
+        .skip(1)
+        .take(v.len() - 2)
+        .all(|h| matches!(h.kind(), HirKind::Literal(_)))
+}
+
+/// extracts a string literal expression assuming that [`is_anchored_literal`]
+/// returned true.
+fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
+    match v.len() {
+        2 => Some(lit("")),
+        3 => {
+            let HirKind::Literal(l) = v[1].kind() else { return None };
+            str_from_literal(l).map(lit)
+        }
+        _ => None,
+    }
+}
+
 fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
     println!("Considering hir kind: mode {mode:?} hir: {hir:?}");
     match hir.kind() {
@@ -140,6 +189,12 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
             let s = str_from_literal(l)?;
             return Some(mode.expr(Box::new(left.clone()), format!("%{s}%")));
         }
+        HirKind::Concat(inner) if is_anchored_literal(inner) => {
+            let right = anchored_literal_to_expr(inner)?;
+            return Some(
+                mode.expr_matches_literal(Box::new(left.clone()), Box::new(right)),
+            );
+        }
         HirKind::Concat(inner) => {
             if let Some(pattern) = collect_concat_to_like_string(inner) {
                 return Some(mode.expr(Box::new(left.clone()), pattern));

Reply via email to