This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 3ab72627f2 Support simplifying expressions such as `~ ^(ba_r|foo)$` , 
where the string includes underline (#7186)
3ab72627f2 is described below

commit 3ab72627f2bcc6b493f5a320a7dd9f0d261f61b7
Author: Ruixiang Tan <[email protected]>
AuthorDate: Fri Aug 4 01:52:54 2023 +0800

    Support simplifying expressions such as `~ ^(ba_r|foo)$` , where the string 
includes underline (#7186)
    
    * Support simplifying expressions like ~ ^(ba_r)$
    
    * rename fn name
---
 .../src/simplify_expressions/expr_simplifier.rs    | 33 ++++++++++++++++++++++
 .../optimizer/src/simplify_expressions/regex.rs    | 16 ++++++++---
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 5562e10f69..b7e8612d53 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -2499,10 +2499,43 @@ mod tests {
             col("c1")
                 .in_list(vec![lit("foo"), lit("bar"), lit("baz"), lit("qux")], false),
         );
+        assert_change(
+            regex_match(col("c1"), lit("^(fo_o)$")),
+            col("c1").eq(lit("fo_o")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("^(fo_o)$")),
+            col("c1").eq(lit("fo_o")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("^(fo_o|ba_r)$")),
+            col("c1").eq(lit("fo_o")).or(col("c1").eq(lit("ba_r"))),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("^(fo_o|ba_r)$")),
+            col("c1")
+                .not_eq(lit("fo_o"))
+                .and(col("c1").not_eq(lit("ba_r"))),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("^(fo_o|ba_r|ba_z)$")),
+            ((col("c1").eq(lit("fo_o"))).or(col("c1").eq(lit("ba_r"))))
+                .or(col("c1").eq(lit("ba_z"))),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("^(fo_o|ba_r|baz|qu_x)$")),
+            col("c1").in_list(
+                vec![lit("fo_o"), lit("ba_r"), lit("baz"), lit("qu_x")],
+                false,
+            ),
+        );
 
         // regular expressions that mismatch captured literals
         assert_no_change(regex_match(col("c1"), lit("(foo|bar)")));
         assert_no_change(regex_match(col("c1"), lit("(foo|bar)*")));
+        assert_no_change(regex_match(col("c1"), lit("(fo_o|b_ar)")));
+        assert_no_change(regex_match(col("c1"), lit("(foo|ba_r)*")));
+        assert_no_change(regex_match(col("c1"), lit("(fo_o|ba_r)*")));
         assert_no_change(regex_match(col("c1"), lit("^(foo|bar)*")));
         assert_no_change(regex_match(col("c1"), lit("^foo|bar$")));
         assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$")));
diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs
index 27fcfc5dbf..5094623b82 100644
--- a/datafusion/optimizer/src/simplify_expressions/regex.rs
+++ b/datafusion/optimizer/src/simplify_expressions/regex.rs
@@ -108,7 +108,7 @@ fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {
 
     for sub in parts {
         if let HirKind::Literal(l) = sub.kind() {
-            s.push_str(str_from_literal(l)?);
+            s.push_str(like_str_from_literal(l)?);
         } else {
             return None;
         }
@@ -120,7 +120,7 @@ fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {
 
 /// returns a str represented by `Literal` if it contains a valid utf8
 /// sequence and is safe for like (has no '%' and '_')
-fn str_from_literal(l: &Literal) -> Option<&str> {
+fn like_str_from_literal(l: &Literal) -> Option<&str> {
     // if not utf8, no good
     let s = std::str::from_utf8(&l.0).ok()?;
 
@@ -131,6 +131,14 @@ fn str_from_literal(l: &Literal) -> Option<&str> {
     }
 }
 
+/// returns a str represented by `Literal` if it contains a valid utf8 sequence
+fn str_from_literal(l: &Literal) -> Option<&str> {
+    // if not utf8, no good
+    let s = std::str::from_utf8(&l.0).ok()?;
+
+    Some(s)
+}
+
 fn is_safe_for_like(c: char) -> bool {
     (c != '%') && (c != '_')
 }
@@ -196,7 +204,7 @@ fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
         2 => Some(lit("")),
         3 => {
             let HirKind::Literal(l) = v[1].kind() else { return None };
-            str_from_literal(l).map(lit)
+            like_str_from_literal(l).map(lit)
         }
         _ => None,
     }
@@ -242,7 +250,7 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
             return Some(mode.expr(Box::new(left.clone()), "%".to_owned()));
         }
         HirKind::Literal(l) => {
-            let s = str_from_literal(l)?;
+            let s = like_str_from_literal(l)?;
             return Some(mode.expr(Box::new(left.clone()), format!("%{s}%")));
         }
         HirKind::Concat(inner) if is_anchored_literal(inner) => {

Reply via email to