alamb commented on code in PR #4646:
URL: https://github.com/apache/arrow-datafusion/pull/4646#discussion_r1058604794
##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
assert_eq!(simplify(expr), expected)
}
+ #[test]
+ fn test_simplify_regex() {
+ // malformed regex
+ assert_contains!(
+ try_simplify(regex_match(col("c1"), lit("foo{")))
+ .unwrap_err()
+ .to_string(),
+ "regex parse error"
+ );
+
+ // unsupported cases
+ assert_no_change(regex_match(col("c1"), lit("foo.*")));
+ assert_no_change(regex_match(col("c1"), lit("(foo)")));
+ assert_no_change(regex_match(col("c1"), lit("^foo")));
+ assert_no_change(regex_match(col("c1"), lit("foo$")));
+ assert_no_change(regex_match(col("c1"), lit("%")));
+ assert_no_change(regex_match(col("c1"), lit("_")));
+ assert_no_change(regex_match(col("c1"), lit("f%o")));
+ assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+ // empty cases
+ assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+ assert_change(
+ regex_not_match(col("c1"), lit("")),
+ not_like(col("c1"), "%"),
+ );
+ assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+ assert_change(
+ regex_not_imatch(col("c1"), lit("")),
+ not_ilike(col("c1"), "%"),
+ );
+
+ // single character
+ assert_change(regex_match(col("c1"), lit("x")), like(col("c1"),
"%x%"));
+
+ // single word
+ assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"),
"%foo%"));
+
+ // OR-chain
+ assert_change(
+ regex_match(col("c1"), lit("foo|bar|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%bar%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo|x|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%x%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo||baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_not_match(col("c1"), lit("foo|bar|baz")),
+ not_like(col("c1"), "%foo%")
+ .and(not_like(col("c1"), "%bar%"))
+ .and(not_like(col("c1"), "%baz%")),
+ );
+ }
+
+ #[track_caller]
+ fn assert_no_change(expr: Expr) {
+ let optimized = simplify(expr.clone());
+ assert_eq!(expr, optimized);
+ }
+
+ #[track_caller]
+ fn assert_change(expr: Expr, expected: Expr) {
+ let optimized = simplify(expr);
+ assert_eq!(optimized, expected);
+ }
+
+ fn regex_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn like(expr: Expr, pattern: &str) -> Expr {
Review Comment:
🤔 but it turns out that those make `Expr::Like` rather than
`BinaryOperator::Like` 🤔 -- I am not sure why there is duplicate
##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
assert_eq!(simplify(expr), expected)
}
+ #[test]
+ fn test_simplify_regex() {
+ // malformed regex
+ assert_contains!(
+ try_simplify(regex_match(col("c1"), lit("foo{")))
+ .unwrap_err()
+ .to_string(),
+ "regex parse error"
+ );
+
+ // unsupported cases
+ assert_no_change(regex_match(col("c1"), lit("foo.*")));
+ assert_no_change(regex_match(col("c1"), lit("(foo)")));
+ assert_no_change(regex_match(col("c1"), lit("^foo")));
+ assert_no_change(regex_match(col("c1"), lit("foo$")));
+ assert_no_change(regex_match(col("c1"), lit("%")));
+ assert_no_change(regex_match(col("c1"), lit("_")));
+ assert_no_change(regex_match(col("c1"), lit("f%o")));
+ assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+ // empty cases
+ assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+ assert_change(
+ regex_not_match(col("c1"), lit("")),
+ not_like(col("c1"), "%"),
+ );
+ assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+ assert_change(
+ regex_not_imatch(col("c1"), lit("")),
+ not_ilike(col("c1"), "%"),
+ );
+
+ // single character
+ assert_change(regex_match(col("c1"), lit("x")), like(col("c1"),
"%x%"));
+
+ // single word
+ assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"),
"%foo%"));
+
+ // OR-chain
+ assert_change(
+ regex_match(col("c1"), lit("foo|bar|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%bar%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo|x|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%x%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo||baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_not_match(col("c1"), lit("foo|bar|baz")),
+ not_like(col("c1"), "%foo%")
+ .and(not_like(col("c1"), "%bar%"))
+ .and(not_like(col("c1"), "%baz%")),
+ );
+ }
+
+ #[track_caller]
+ fn assert_no_change(expr: Expr) {
+ let optimized = simplify(expr.clone());
+ assert_eq!(expr, optimized);
+ }
+
+ #[track_caller]
+ fn assert_change(expr: Expr, expected: Expr) {
+ let optimized = simplify(expr);
+ assert_eq!(optimized, expected);
+ }
+
+ fn regex_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn like(expr: Expr, pattern: &str) -> Expr {
+ Expr::Like(Like {
+ negated: false,
+ expr: Box::new(expr),
+ pattern:
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
+ escape_char: None,
+ })
+ }
+
+ fn not_like(expr: Expr, pattern: &str) -> Expr {
+ Expr::Like(Like {
+ negated: true,
+ expr: Box::new(expr),
+ pattern:
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
+ escape_char: None,
+ })
+ }
+
+ fn ilike(expr: Expr, pattern: &str) -> Expr {
+ Expr::ILike(Like {
+ negated: false,
+ expr: Box::new(expr),
+ pattern:
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
Review Comment:
in 197e1cf86
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]