This is an automated email from the ASF dual-hosted git repository.

wayne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 8fda4a6163 feat(optimizer): handle partial anchored regex cases and 
improve doc (#10977)
8fda4a6163 is described below

commit 8fda4a6163970cd4ee02cc97468b2a1f67012ae2
Author: Ruihang Xia <[email protected]>
AuthorDate: Wed Jun 19 10:08:12 2024 +0800

    feat(optimizer): handle partial anchored regex cases and improve doc 
(#10977)
    
    Signed-off-by: Ruihang Xia <[email protected]>
---
 .../src/simplify_expressions/expr_simplifier.rs    | 12 +++-
 .../optimizer/src/simplify_expressions/regex.rs    | 68 ++++++++++++++++++----
 2 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs 
b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 024cb74403..f2c80e4a72 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -2730,11 +2730,10 @@ mod tests {
         // unsupported cases
         assert_no_change(regex_match(col("c1"), lit("foo.*")));
         assert_no_change(regex_match(col("c1"), lit("(foo)")));
-        assert_no_change(regex_match(col("c1"), lit("^foo")));
-        assert_no_change(regex_match(col("c1"), lit("foo$")));
         assert_no_change(regex_match(col("c1"), lit("%")));
         assert_no_change(regex_match(col("c1"), lit("_")));
         assert_no_change(regex_match(col("c1"), lit("f%o")));
+        assert_no_change(regex_match(col("c1"), lit("^f%o")));
         assert_no_change(regex_match(col("c1"), lit("f_o")));
 
         // empty cases
@@ -2827,13 +2826,20 @@ mod tests {
         assert_no_change(regex_match(col("c1"), lit("(foo|ba_r)*")));
         assert_no_change(regex_match(col("c1"), lit("(fo_o|ba_r)*")));
         assert_no_change(regex_match(col("c1"), lit("^(foo|bar)*")));
-        assert_no_change(regex_match(col("c1"), lit("^foo|bar$")));
         assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$")));
         assert_no_change(regex_match(col("c1"), lit("^")));
         assert_no_change(regex_match(col("c1"), lit("$")));
         assert_no_change(regex_match(col("c1"), lit("$^")));
         assert_no_change(regex_match(col("c1"), lit("$foo^")));
 
+        // regular expressions that match a partial literal
+        assert_change(regex_match(col("c1"), lit("^foo")), like(col("c1"), 
"foo%"));
+        assert_change(regex_match(col("c1"), lit("foo$")), like(col("c1"), 
"%foo"));
+        assert_change(
+            regex_match(col("c1"), lit("^foo|bar$")),
+            like(col("c1"), "foo%").or(like(col("c1"), "%bar")),
+        );
+
         // OR-chain
         assert_change(
             regex_match(col("c1"), lit("foo|bar|baz")),
diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs 
b/datafusion/optimizer/src/simplify_expressions/regex.rs
index 175b70f2b1..9a78298b10 100644
--- a/datafusion/optimizer/src/simplify_expressions/regex.rs
+++ b/datafusion/optimizer/src/simplify_expressions/regex.rs
@@ -22,6 +22,19 @@ use regex_syntax::hir::{Capture, Hir, HirKind, Literal, 
Look};
 /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded 
into multiple `LIKE` expressions.
 const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
 
+/// Tries to convert a regexp expression to a `LIKE` or `Eq`/`NotEq` 
expression.
+///
+/// This function also validates the regex pattern. And will return error if 
the
+/// pattern is invalid.
+///
+/// Typical cases this function can simplify:
+/// - empty regex pattern to `LIKE '%'`
+/// - literal regex patterns to `LIKE '%foo%'`
+/// - full anchored regex patterns (e.g. `^foo$`) to `= 'foo'`
+/// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'`
+/// - combinations (alternatives) of the above, will be concatenated with `OR` 
or `AND`
+///
+/// Dev note: unit tests of this function are in `expr_simplifier.rs`, case 
`test_simplify_regex`.
 pub fn simplify_regex_expr(
     left: Box<Expr>,
     op: Operator,
@@ -53,13 +66,15 @@ pub fn simplify_regex_expr(
         }
     }
 
-    // leave untouched if optimization didn't work
+    // Leave untouched if optimization didn't work
     Ok(Expr::BinaryExpr(BinaryExpr { left, op, right }))
 }
 
 #[derive(Debug)]
 struct OperatorMode {
+    /// Negative match.
     not: bool,
+    /// Ignore case (`true` for case-insensitive).
     i: bool,
 }
 
@@ -80,6 +95,7 @@ impl OperatorMode {
         Self { not, i }
     }
 
+    /// Creates an [`LIKE`](Expr::Like) from the given `LIKE` pattern.
     fn expr(&self, expr: Box<Expr>, pattern: String) -> Expr {
         let like = Like {
             negated: self.not,
@@ -92,6 +108,7 @@ impl OperatorMode {
         Expr::Like(like)
     }
 
+    /// Creates an [`Expr::BinaryExpr`] of "`left` = `right`" or "`left` != 
`right`".
     fn expr_matches_literal(&self, left: Box<Expr>, right: Box<Expr>) -> Expr {
         let op = if self.not {
             Operator::NotEq
@@ -118,7 +135,7 @@ fn collect_concat_to_like_string(parts: &[Hir]) -> 
Option<String> {
     Some(s)
 }
 
-/// returns a str represented by `Literal` if it contains a valid utf8
+/// Returns a str represented by `Literal` if it contains a valid utf8
 /// sequence and is safe for like (has no '%' and '_')
 fn like_str_from_literal(l: &Literal) -> Option<&str> {
     // if not utf8, no good
@@ -131,7 +148,7 @@ fn like_str_from_literal(l: &Literal) -> Option<&str> {
     }
 }
 
-/// returns a str represented by `Literal` if it contains a valid utf8
+/// Returns a str represented by `Literal` if it contains a valid utf8
 fn str_from_literal(l: &Literal) -> Option<&str> {
     // if not utf8, no good
     let s = std::str::from_utf8(&l.0).ok()?;
@@ -143,7 +160,7 @@ fn is_safe_for_like(c: char) -> bool {
     (c != '%') && (c != '_')
 }
 
-/// returns true if the elements in a `Concat` pattern are:
+/// Returns true if the elements in a `Concat` pattern are:
 /// - `[Look::Start, Look::End]`
 /// - `[Look::Start, Literal(_), Look::End]`
 fn is_anchored_literal(v: &[Hir]) -> bool {
@@ -157,10 +174,9 @@ fn is_anchored_literal(v: &[Hir]) -> bool {
         v.last().expect("length checked"),
     );
     if !matches!(first_last,
-    (s, e) if s.kind() == &HirKind::Look(Look::Start)
+        (s, e) if s.kind() == &HirKind::Look(Look::Start)
         && e.kind() == &HirKind::Look(Look::End)
-         )
-    {
+    ) {
         return false;
     }
 
@@ -170,7 +186,7 @@ fn is_anchored_literal(v: &[Hir]) -> bool {
         .all(|h| matches!(h.kind(), HirKind::Literal(_)))
 }
 
-/// returns true if the elements in a `Concat` pattern are:
+/// Returns true if the elements in a `Concat` pattern are:
 /// - `[Look::Start, Capture(Alternation(Literals...)), Look::End]`
 fn is_anchored_capture(v: &[Hir]) -> bool {
     if v.len() != 3
@@ -197,7 +213,33 @@ fn is_anchored_capture(v: &[Hir]) -> bool {
     true
 }
 
-/// extracts a string literal expression assuming that [`is_anchored_literal`]
+/// Returns the `LIKE` pattern if the `Concat` pattern is partial anchored:
+/// - `[Look::Start, Literal(_)]`
+/// - `[Literal(_), Look::End]`
+/// Full anchored patterns are handled by [`anchored_literal_to_expr`].
+fn partial_anchored_literal_to_like(v: &[Hir]) -> Option<String> {
+    if v.len() != 2 {
+        return None;
+    }
+
+    let (lit, match_begin) = match (&v[0].kind(), &v[1].kind()) {
+        (HirKind::Look(Look::Start), HirKind::Literal(l)) => {
+            (like_str_from_literal(l)?, true)
+        }
+        (HirKind::Literal(l), HirKind::Look(Look::End)) => {
+            (like_str_from_literal(l)?, false)
+        }
+        _ => return None,
+    };
+
+    if match_begin {
+        Some(format!("{}%", lit))
+    } else {
+        Some(format!("%{}", lit))
+    }
+}
+
+/// Extracts a string literal expression assuming that [`is_anchored_literal`]
 /// returned true.
 fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
     match v.len() {
@@ -246,6 +288,7 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> 
Option<Vec<Expr>> {
     None
 }
 
+/// Tries to lower (transform) a simple regex pattern to a LIKE expression.
 fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
     match hir.kind() {
         HirKind::Empty => {
@@ -265,7 +308,9 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: 
&Hir) -> Option<Expr> {
                 .map(|right| left.clone().in_list(right, mode.not));
         }
         HirKind::Concat(inner) => {
-            if let Some(pattern) = collect_concat_to_like_string(inner) {
+            if let Some(pattern) = partial_anchored_literal_to_like(inner)
+                .or(collect_concat_to_like_string(inner))
+            {
                 return Some(mode.expr(Box::new(left.clone()), pattern));
             }
         }
@@ -274,6 +319,9 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: 
&Hir) -> Option<Expr> {
     None
 }
 
+/// Calls [`lower_simple`] for each alternative and combine the results with 
`or` or `and`
+/// based on [`OperatorMode`]. Any fail attempt to lower an alternative will 
makes this
+/// function to return `None`.
 fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option<Expr> {
     let mut accu: Option<Expr> = None;
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to