alamb commented on code in PR #4646:
URL: https://github.com/apache/arrow-datafusion/pull/4646#discussion_r1051392954


##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
         assert_eq!(simplify(expr), expected)
     }
 
+    #[test]
+    fn test_simplify_regex() {
+        // malformed regex
+        assert_contains!(
+            try_simplify(regex_match(col("c1"), lit("foo{")))
+                .unwrap_err()
+                .to_string(),
+            "regex parse error"
+        );
+
+        // unsupported cases
+        assert_no_change(regex_match(col("c1"), lit("foo.*")));
+        assert_no_change(regex_match(col("c1"), lit("(foo)")));
+        assert_no_change(regex_match(col("c1"), lit("^foo")));
+        assert_no_change(regex_match(col("c1"), lit("foo$")));
+        assert_no_change(regex_match(col("c1"), lit("%")));
+        assert_no_change(regex_match(col("c1"), lit("_")));
+        assert_no_change(regex_match(col("c1"), lit("f%o")));
+        assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+        // empty cases
+        assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+        assert_change(
+            regex_not_match(col("c1"), lit("")),
+            not_like(col("c1"), "%"),
+        );
+        assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+        assert_change(
+            regex_not_imatch(col("c1"), lit("")),
+            not_ilike(col("c1"), "%"),
+        );
+
+        // single character
+        assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), 
"%x%"));
+
+        // single word
+        assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), 
"%foo%"));
+
+        // OR-chain
+        assert_change(
+            regex_match(col("c1"), lit("foo|bar|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%bar%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo|x|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%x%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo||baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("foo|bar|baz")),
+            not_like(col("c1"), "%foo%")
+                .and(not_like(col("c1"), "%bar%"))
+                .and(not_like(col("c1"), "%baz%")),
+        );
+    }
+
+    #[track_caller]
+    fn assert_no_change(expr: Expr) {
+        let optimized = simplify(expr.clone());
+        assert_eq!(expr, optimized);
+    }
+
+    #[track_caller]
+    fn assert_change(expr: Expr, expected: Expr) {
+        let optimized = simplify(expr);
+        assert_eq!(optimized, expected);
+    }
+
+    fn regex_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn like(expr: Expr, pattern: &str) -> Expr {
+        Expr::Like(Like {
+            negated: false,
+            expr: Box::new(expr),
+            pattern: 
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),

Review Comment:
   ```suggestion
               pattern: Box::new(lit(pattern)),
   ```



##########
datafusion/optimizer/Cargo.toml:
##########
@@ -45,6 +45,7 @@ datafusion-expr = { path = "../expr", version = "15.0.0" }
 datafusion-physical-expr = { path = "../physical-expr", version = "15.0.0" }
 hashbrown = { version = "0.13", features = ["raw"] }
 log = "^0.4"
+regex-syntax = "0.6.28"

Review Comment:
   This is technically not a new dependency (as datafusion-physical-expr 
already depends on `regex` which depends on `regex-syntax`). However, regex is 
an optional dependency whereas this is not optional.
   
   I think it would be better to move this dependency into 
`datafusion-physical-expr` so regex can still be optional. I will try to do so 
over the next few days



##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
         assert_eq!(simplify(expr), expected)
     }
 
+    #[test]
+    fn test_simplify_regex() {
+        // malformed regex
+        assert_contains!(
+            try_simplify(regex_match(col("c1"), lit("foo{")))
+                .unwrap_err()
+                .to_string(),
+            "regex parse error"
+        );
+
+        // unsupported cases
+        assert_no_change(regex_match(col("c1"), lit("foo.*")));
+        assert_no_change(regex_match(col("c1"), lit("(foo)")));
+        assert_no_change(regex_match(col("c1"), lit("^foo")));
+        assert_no_change(regex_match(col("c1"), lit("foo$")));
+        assert_no_change(regex_match(col("c1"), lit("%")));
+        assert_no_change(regex_match(col("c1"), lit("_")));
+        assert_no_change(regex_match(col("c1"), lit("f%o")));
+        assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+        // empty cases
+        assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+        assert_change(
+            regex_not_match(col("c1"), lit("")),
+            not_like(col("c1"), "%"),
+        );
+        assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+        assert_change(
+            regex_not_imatch(col("c1"), lit("")),
+            not_ilike(col("c1"), "%"),
+        );
+
+        // single character
+        assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), 
"%x%"));
+
+        // single word
+        assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), 
"%foo%"));
+
+        // OR-chain
+        assert_change(
+            regex_match(col("c1"), lit("foo|bar|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%bar%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo|x|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%x%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo||baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("foo|bar|baz")),
+            not_like(col("c1"), "%foo%")
+                .and(not_like(col("c1"), "%bar%"))
+                .and(not_like(col("c1"), "%baz%")),
+        );
+    }
+
+    #[track_caller]
+    fn assert_no_change(expr: Expr) {
+        let optimized = simplify(expr.clone());
+        assert_eq!(expr, optimized);
+    }
+
+    #[track_caller]
+    fn assert_change(expr: Expr, expected: Expr) {
+        let optimized = simplify(expr);
+        assert_eq!(optimized, expected);
+    }
+
+    fn regex_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn like(expr: Expr, pattern: &str) -> Expr {

Review Comment:
   I think there are already functions like this : 
https://docs.rs/datafusion/15.0.0/datafusion/prelude/enum.Expr.html#method.like



##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
         assert_eq!(simplify(expr), expected)
     }
 
+    #[test]
+    fn test_simplify_regex() {
+        // malformed regex
+        assert_contains!(
+            try_simplify(regex_match(col("c1"), lit("foo{")))
+                .unwrap_err()
+                .to_string(),
+            "regex parse error"
+        );
+
+        // unsupported cases
+        assert_no_change(regex_match(col("c1"), lit("foo.*")));
+        assert_no_change(regex_match(col("c1"), lit("(foo)")));
+        assert_no_change(regex_match(col("c1"), lit("^foo")));
+        assert_no_change(regex_match(col("c1"), lit("foo$")));
+        assert_no_change(regex_match(col("c1"), lit("%")));
+        assert_no_change(regex_match(col("c1"), lit("_")));
+        assert_no_change(regex_match(col("c1"), lit("f%o")));
+        assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+        // empty cases
+        assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+        assert_change(
+            regex_not_match(col("c1"), lit("")),
+            not_like(col("c1"), "%"),
+        );
+        assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+        assert_change(
+            regex_not_imatch(col("c1"), lit("")),
+            not_ilike(col("c1"), "%"),
+        );
+
+        // single character
+        assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), 
"%x%"));
+
+        // single word
+        assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), 
"%foo%"));
+
+        // OR-chain
+        assert_change(
+            regex_match(col("c1"), lit("foo|bar|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%bar%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo|x|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%x%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo||baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("foo|bar|baz")),
+            not_like(col("c1"), "%foo%")
+                .and(not_like(col("c1"), "%bar%"))
+                .and(not_like(col("c1"), "%baz%")),
+        );
+    }
+
+    #[track_caller]
+    fn assert_no_change(expr: Expr) {
+        let optimized = simplify(expr.clone());
+        assert_eq!(expr, optimized);
+    }
+
+    #[track_caller]
+    fn assert_change(expr: Expr, expected: Expr) {
+        let optimized = simplify(expr);
+        assert_eq!(optimized, expected);
+    }
+
+    fn regex_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn like(expr: Expr, pattern: &str) -> Expr {
+        Expr::Like(Like {
+            negated: false,
+            expr: Box::new(expr),
+            pattern: 
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
+            escape_char: None,
+        })
+    }
+
+    fn not_like(expr: Expr, pattern: &str) -> Expr {
+        Expr::Like(Like {
+            negated: true,
+            expr: Box::new(expr),
+            pattern: 
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
+            escape_char: None,
+        })
+    }
+
+    fn ilike(expr: Expr, pattern: &str) -> Expr {
+        Expr::ILike(Like {
+            negated: false,
+            expr: Box::new(expr),
+            pattern: 
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),

Review Comment:
   ```suggestion
               pattern: Box::new(lit(pattern)),
   ```



##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
         assert_eq!(simplify(expr), expected)
     }
 
+    #[test]
+    fn test_simplify_regex() {
+        // malformed regex
+        assert_contains!(
+            try_simplify(regex_match(col("c1"), lit("foo{")))
+                .unwrap_err()
+                .to_string(),
+            "regex parse error"
+        );
+
+        // unsupported cases
+        assert_no_change(regex_match(col("c1"), lit("foo.*")));
+        assert_no_change(regex_match(col("c1"), lit("(foo)")));
+        assert_no_change(regex_match(col("c1"), lit("^foo")));
+        assert_no_change(regex_match(col("c1"), lit("foo$")));
+        assert_no_change(regex_match(col("c1"), lit("%")));
+        assert_no_change(regex_match(col("c1"), lit("_")));
+        assert_no_change(regex_match(col("c1"), lit("f%o")));
+        assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+        // empty cases
+        assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+        assert_change(
+            regex_not_match(col("c1"), lit("")),
+            not_like(col("c1"), "%"),
+        );
+        assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+        assert_change(
+            regex_not_imatch(col("c1"), lit("")),
+            not_ilike(col("c1"), "%"),
+        );
+
+        // single character
+        assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), 
"%x%"));
+
+        // single word
+        assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), 
"%foo%"));
+
+        // OR-chain
+        assert_change(
+            regex_match(col("c1"), lit("foo|bar|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%bar%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo|x|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%x%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo||baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("foo|bar|baz")),
+            not_like(col("c1"), "%foo%")
+                .and(not_like(col("c1"), "%bar%"))
+                .and(not_like(col("c1"), "%baz%")),
+        );
+    }
+
+    #[track_caller]
+    fn assert_no_change(expr: Expr) {
+        let optimized = simplify(expr.clone());
+        assert_eq!(expr, optimized);
+    }
+
+    #[track_caller]
+    fn assert_change(expr: Expr, expected: Expr) {
+        let optimized = simplify(expr);
+        assert_eq!(optimized, expected);
+    }
+
+    fn regex_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn like(expr: Expr, pattern: &str) -> Expr {
+        Expr::Like(Like {
+            negated: false,
+            expr: Box::new(expr),
+            pattern: 
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
+            escape_char: None,
+        })
+    }
+
+    fn not_like(expr: Expr, pattern: &str) -> Expr {
+        Expr::Like(Like {
+            negated: true,
+            expr: Box::new(expr),
+            pattern: 
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),

Review Comment:
   ```suggestion
               pattern: Box::new(lit(pattern)),
   ```



##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
         assert_eq!(simplify(expr), expected)
     }
 
+    #[test]
+    fn test_simplify_regex() {
+        // malformed regex
+        assert_contains!(
+            try_simplify(regex_match(col("c1"), lit("foo{")))
+                .unwrap_err()
+                .to_string(),
+            "regex parse error"
+        );
+
+        // unsupported cases
+        assert_no_change(regex_match(col("c1"), lit("foo.*")));
+        assert_no_change(regex_match(col("c1"), lit("(foo)")));
+        assert_no_change(regex_match(col("c1"), lit("^foo")));
+        assert_no_change(regex_match(col("c1"), lit("foo$")));
+        assert_no_change(regex_match(col("c1"), lit("%")));
+        assert_no_change(regex_match(col("c1"), lit("_")));
+        assert_no_change(regex_match(col("c1"), lit("f%o")));
+        assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+        // empty cases
+        assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+        assert_change(
+            regex_not_match(col("c1"), lit("")),
+            not_like(col("c1"), "%"),
+        );
+        assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+        assert_change(
+            regex_not_imatch(col("c1"), lit("")),
+            not_ilike(col("c1"), "%"),
+        );
+
+        // single character
+        assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), 
"%x%"));
+
+        // single word
+        assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), 
"%foo%"));
+
+        // OR-chain
+        assert_change(
+            regex_match(col("c1"), lit("foo|bar|baz")),

Review Comment:
   I think it might be better to convert this to an `IN <list>` which has a 
bunch of optimizations  
https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/expressions/in_list.rs
   
   I remember a discussion about `IN <list>` vs `OR` chains -- @tustvold do you 
have any opinion on which is better in this case?



##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
         assert_eq!(simplify(expr), expected)
     }
 
+    #[test]
+    fn test_simplify_regex() {
+        // malformed regex
+        assert_contains!(
+            try_simplify(regex_match(col("c1"), lit("foo{")))
+                .unwrap_err()
+                .to_string(),
+            "regex parse error"
+        );
+
+        // unsupported cases
+        assert_no_change(regex_match(col("c1"), lit("foo.*")));
+        assert_no_change(regex_match(col("c1"), lit("(foo)")));
+        assert_no_change(regex_match(col("c1"), lit("^foo")));
+        assert_no_change(regex_match(col("c1"), lit("foo$")));
+        assert_no_change(regex_match(col("c1"), lit("%")));
+        assert_no_change(regex_match(col("c1"), lit("_")));
+        assert_no_change(regex_match(col("c1"), lit("f%o")));
+        assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+        // empty cases
+        assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+        assert_change(
+            regex_not_match(col("c1"), lit("")),
+            not_like(col("c1"), "%"),
+        );
+        assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+        assert_change(
+            regex_not_imatch(col("c1"), lit("")),
+            not_ilike(col("c1"), "%"),
+        );
+
+        // single character
+        assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), 
"%x%"));
+
+        // single word
+        assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), 
"%foo%"));
+
+        // OR-chain
+        assert_change(
+            regex_match(col("c1"), lit("foo|bar|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%bar%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo|x|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%x%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo||baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("foo|bar|baz")),
+            not_like(col("c1"), "%foo%")
+                .and(not_like(col("c1"), "%bar%"))
+                .and(not_like(col("c1"), "%baz%")),
+        );
+    }
+
+    #[track_caller]

Review Comment:
   TIL 
https://rustc-dev-guide.rust-lang.org/backend/implicit-caller-location.html 👍 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to