alamb commented on code in PR #4646:
URL: https://github.com/apache/arrow-datafusion/pull/4646#discussion_r1058604794


##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
         assert_eq!(simplify(expr), expected)
     }
 
+    #[test]
+    fn test_simplify_regex() {
+        // malformed regex
+        assert_contains!(
+            try_simplify(regex_match(col("c1"), lit("foo{")))
+                .unwrap_err()
+                .to_string(),
+            "regex parse error"
+        );
+
+        // unsupported cases
+        assert_no_change(regex_match(col("c1"), lit("foo.*")));
+        assert_no_change(regex_match(col("c1"), lit("(foo)")));
+        assert_no_change(regex_match(col("c1"), lit("^foo")));
+        assert_no_change(regex_match(col("c1"), lit("foo$")));
+        assert_no_change(regex_match(col("c1"), lit("%")));
+        assert_no_change(regex_match(col("c1"), lit("_")));
+        assert_no_change(regex_match(col("c1"), lit("f%o")));
+        assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+        // empty cases
+        assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+        assert_change(
+            regex_not_match(col("c1"), lit("")),
+            not_like(col("c1"), "%"),
+        );
+        assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+        assert_change(
+            regex_not_imatch(col("c1"), lit("")),
+            not_ilike(col("c1"), "%"),
+        );
+
+        // single character
+        assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), 
"%x%"));
+
+        // single word
+        assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), 
"%foo%"));
+
+        // OR-chain
+        assert_change(
+            regex_match(col("c1"), lit("foo|bar|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%bar%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo|x|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%x%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo||baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("foo|bar|baz")),
+            not_like(col("c1"), "%foo%")
+                .and(not_like(col("c1"), "%bar%"))
+                .and(not_like(col("c1"), "%baz%")),
+        );
+    }
+
+    #[track_caller]
+    fn assert_no_change(expr: Expr) {
+        let optimized = simplify(expr.clone());
+        assert_eq!(expr, optimized);
+    }
+
+    #[track_caller]
+    fn assert_change(expr: Expr, expected: Expr) {
+        let optimized = simplify(expr);
+        assert_eq!(optimized, expected);
+    }
+
+    fn regex_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn like(expr: Expr, pattern: &str) -> Expr {

Review Comment:
   🤔  but it turns out that those make `Expr::Like` rather than 
`BinaryOperator::Like` 🤔 -- I am not sure why there is duplicate



##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
         assert_eq!(simplify(expr), expected)
     }
 
+    #[test]
+    fn test_simplify_regex() {
+        // malformed regex
+        assert_contains!(
+            try_simplify(regex_match(col("c1"), lit("foo{")))
+                .unwrap_err()
+                .to_string(),
+            "regex parse error"
+        );
+
+        // unsupported cases
+        assert_no_change(regex_match(col("c1"), lit("foo.*")));
+        assert_no_change(regex_match(col("c1"), lit("(foo)")));
+        assert_no_change(regex_match(col("c1"), lit("^foo")));
+        assert_no_change(regex_match(col("c1"), lit("foo$")));
+        assert_no_change(regex_match(col("c1"), lit("%")));
+        assert_no_change(regex_match(col("c1"), lit("_")));
+        assert_no_change(regex_match(col("c1"), lit("f%o")));
+        assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+        // empty cases
+        assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+        assert_change(
+            regex_not_match(col("c1"), lit("")),
+            not_like(col("c1"), "%"),
+        );
+        assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+        assert_change(
+            regex_not_imatch(col("c1"), lit("")),
+            not_ilike(col("c1"), "%"),
+        );
+
+        // single character
+        assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), 
"%x%"));
+
+        // single word
+        assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), 
"%foo%"));
+
+        // OR-chain
+        assert_change(
+            regex_match(col("c1"), lit("foo|bar|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%bar%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo|x|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%x%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo||baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("foo|bar|baz")),
+            not_like(col("c1"), "%foo%")
+                .and(not_like(col("c1"), "%bar%"))
+                .and(not_like(col("c1"), "%baz%")),
+        );
+    }
+
+    #[track_caller]
+    fn assert_no_change(expr: Expr) {
+        let optimized = simplify(expr.clone());
+        assert_eq!(expr, optimized);
+    }
+
+    #[track_caller]
+    fn assert_change(expr: Expr, expected: Expr) {
+        let optimized = simplify(expr);
+        assert_eq!(optimized, expected);
+    }
+
+    fn regex_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn like(expr: Expr, pattern: &str) -> Expr {
+        Expr::Like(Like {
+            negated: false,
+            expr: Box::new(expr),
+            pattern: 
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
+            escape_char: None,
+        })
+    }
+
+    fn not_like(expr: Expr, pattern: &str) -> Expr {
+        Expr::Like(Like {
+            negated: true,
+            expr: Box::new(expr),
+            pattern: 
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
+            escape_char: None,
+        })
+    }
+
+    fn ilike(expr: Expr, pattern: &str) -> Expr {
+        Expr::ILike(Like {
+            negated: false,
+            expr: Box::new(expr),
+            pattern: 
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),

Review Comment:
   in 197e1cf86



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to