alamb commented on code in PR #4646:
URL: https://github.com/apache/arrow-datafusion/pull/4646#discussion_r1051392954
##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
assert_eq!(simplify(expr), expected)
}
+ #[test]
+ fn test_simplify_regex() {
+ // malformed regex
+ assert_contains!(
+ try_simplify(regex_match(col("c1"), lit("foo{")))
+ .unwrap_err()
+ .to_string(),
+ "regex parse error"
+ );
+
+ // unsupported cases
+ assert_no_change(regex_match(col("c1"), lit("foo.*")));
+ assert_no_change(regex_match(col("c1"), lit("(foo)")));
+ assert_no_change(regex_match(col("c1"), lit("^foo")));
+ assert_no_change(regex_match(col("c1"), lit("foo$")));
+ assert_no_change(regex_match(col("c1"), lit("%")));
+ assert_no_change(regex_match(col("c1"), lit("_")));
+ assert_no_change(regex_match(col("c1"), lit("f%o")));
+ assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+ // empty cases
+ assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+ assert_change(
+ regex_not_match(col("c1"), lit("")),
+ not_like(col("c1"), "%"),
+ );
+ assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+ assert_change(
+ regex_not_imatch(col("c1"), lit("")),
+ not_ilike(col("c1"), "%"),
+ );
+
+ // single character
+ assert_change(regex_match(col("c1"), lit("x")), like(col("c1"),
"%x%"));
+
+ // single word
+ assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"),
"%foo%"));
+
+ // OR-chain
+ assert_change(
+ regex_match(col("c1"), lit("foo|bar|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%bar%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo|x|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%x%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo||baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_not_match(col("c1"), lit("foo|bar|baz")),
+ not_like(col("c1"), "%foo%")
+ .and(not_like(col("c1"), "%bar%"))
+ .and(not_like(col("c1"), "%baz%")),
+ );
+ }
+
+ #[track_caller]
+ fn assert_no_change(expr: Expr) {
+ let optimized = simplify(expr.clone());
+ assert_eq!(expr, optimized);
+ }
+
+ #[track_caller]
+ fn assert_change(expr: Expr, expected: Expr) {
+ let optimized = simplify(expr);
+ assert_eq!(optimized, expected);
+ }
+
+ fn regex_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn like(expr: Expr, pattern: &str) -> Expr {
+ Expr::Like(Like {
+ negated: false,
+ expr: Box::new(expr),
+ pattern:
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
Review Comment:
```suggestion
pattern: Box::new(lit(pattern)),
```
##########
datafusion/optimizer/Cargo.toml:
##########
@@ -45,6 +45,7 @@ datafusion-expr = { path = "../expr", version = "15.0.0" }
datafusion-physical-expr = { path = "../physical-expr", version = "15.0.0" }
hashbrown = { version = "0.13", features = ["raw"] }
log = "^0.4"
+regex-syntax = "0.6.28"
Review Comment:
This is technically not a new dependency (as datafusion-physical-expr
already depends on `regex` which depends on `regex-syntax`). However, regex is
an optional dependency whereas this is not optional.
I think it would be better to move this dependency into
`datafusion-physical-expr` so regex can still be optional. I will try to do so
over the next few days
##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
assert_eq!(simplify(expr), expected)
}
+ #[test]
+ fn test_simplify_regex() {
+ // malformed regex
+ assert_contains!(
+ try_simplify(regex_match(col("c1"), lit("foo{")))
+ .unwrap_err()
+ .to_string(),
+ "regex parse error"
+ );
+
+ // unsupported cases
+ assert_no_change(regex_match(col("c1"), lit("foo.*")));
+ assert_no_change(regex_match(col("c1"), lit("(foo)")));
+ assert_no_change(regex_match(col("c1"), lit("^foo")));
+ assert_no_change(regex_match(col("c1"), lit("foo$")));
+ assert_no_change(regex_match(col("c1"), lit("%")));
+ assert_no_change(regex_match(col("c1"), lit("_")));
+ assert_no_change(regex_match(col("c1"), lit("f%o")));
+ assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+ // empty cases
+ assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+ assert_change(
+ regex_not_match(col("c1"), lit("")),
+ not_like(col("c1"), "%"),
+ );
+ assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+ assert_change(
+ regex_not_imatch(col("c1"), lit("")),
+ not_ilike(col("c1"), "%"),
+ );
+
+ // single character
+ assert_change(regex_match(col("c1"), lit("x")), like(col("c1"),
"%x%"));
+
+ // single word
+ assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"),
"%foo%"));
+
+ // OR-chain
+ assert_change(
+ regex_match(col("c1"), lit("foo|bar|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%bar%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo|x|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%x%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo||baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_not_match(col("c1"), lit("foo|bar|baz")),
+ not_like(col("c1"), "%foo%")
+ .and(not_like(col("c1"), "%bar%"))
+ .and(not_like(col("c1"), "%baz%")),
+ );
+ }
+
+ #[track_caller]
+ fn assert_no_change(expr: Expr) {
+ let optimized = simplify(expr.clone());
+ assert_eq!(expr, optimized);
+ }
+
+ #[track_caller]
+ fn assert_change(expr: Expr, expected: Expr) {
+ let optimized = simplify(expr);
+ assert_eq!(optimized, expected);
+ }
+
+ fn regex_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn like(expr: Expr, pattern: &str) -> Expr {
Review Comment:
I think there are already functions like this :
https://docs.rs/datafusion/15.0.0/datafusion/prelude/enum.Expr.html#method.like
##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
assert_eq!(simplify(expr), expected)
}
+ #[test]
+ fn test_simplify_regex() {
+ // malformed regex
+ assert_contains!(
+ try_simplify(regex_match(col("c1"), lit("foo{")))
+ .unwrap_err()
+ .to_string(),
+ "regex parse error"
+ );
+
+ // unsupported cases
+ assert_no_change(regex_match(col("c1"), lit("foo.*")));
+ assert_no_change(regex_match(col("c1"), lit("(foo)")));
+ assert_no_change(regex_match(col("c1"), lit("^foo")));
+ assert_no_change(regex_match(col("c1"), lit("foo$")));
+ assert_no_change(regex_match(col("c1"), lit("%")));
+ assert_no_change(regex_match(col("c1"), lit("_")));
+ assert_no_change(regex_match(col("c1"), lit("f%o")));
+ assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+ // empty cases
+ assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+ assert_change(
+ regex_not_match(col("c1"), lit("")),
+ not_like(col("c1"), "%"),
+ );
+ assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+ assert_change(
+ regex_not_imatch(col("c1"), lit("")),
+ not_ilike(col("c1"), "%"),
+ );
+
+ // single character
+ assert_change(regex_match(col("c1"), lit("x")), like(col("c1"),
"%x%"));
+
+ // single word
+ assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"),
"%foo%"));
+
+ // OR-chain
+ assert_change(
+ regex_match(col("c1"), lit("foo|bar|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%bar%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo|x|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%x%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo||baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_not_match(col("c1"), lit("foo|bar|baz")),
+ not_like(col("c1"), "%foo%")
+ .and(not_like(col("c1"), "%bar%"))
+ .and(not_like(col("c1"), "%baz%")),
+ );
+ }
+
+ #[track_caller]
+ fn assert_no_change(expr: Expr) {
+ let optimized = simplify(expr.clone());
+ assert_eq!(expr, optimized);
+ }
+
+ #[track_caller]
+ fn assert_change(expr: Expr, expected: Expr) {
+ let optimized = simplify(expr);
+ assert_eq!(optimized, expected);
+ }
+
+ fn regex_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn like(expr: Expr, pattern: &str) -> Expr {
+ Expr::Like(Like {
+ negated: false,
+ expr: Box::new(expr),
+ pattern:
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
+ escape_char: None,
+ })
+ }
+
+ fn not_like(expr: Expr, pattern: &str) -> Expr {
+ Expr::Like(Like {
+ negated: true,
+ expr: Box::new(expr),
+ pattern:
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
+ escape_char: None,
+ })
+ }
+
+ fn ilike(expr: Expr, pattern: &str) -> Expr {
+ Expr::ILike(Like {
+ negated: false,
+ expr: Box::new(expr),
+ pattern:
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
Review Comment:
```suggestion
pattern: Box::new(lit(pattern)),
```
##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
assert_eq!(simplify(expr), expected)
}
+ #[test]
+ fn test_simplify_regex() {
+ // malformed regex
+ assert_contains!(
+ try_simplify(regex_match(col("c1"), lit("foo{")))
+ .unwrap_err()
+ .to_string(),
+ "regex parse error"
+ );
+
+ // unsupported cases
+ assert_no_change(regex_match(col("c1"), lit("foo.*")));
+ assert_no_change(regex_match(col("c1"), lit("(foo)")));
+ assert_no_change(regex_match(col("c1"), lit("^foo")));
+ assert_no_change(regex_match(col("c1"), lit("foo$")));
+ assert_no_change(regex_match(col("c1"), lit("%")));
+ assert_no_change(regex_match(col("c1"), lit("_")));
+ assert_no_change(regex_match(col("c1"), lit("f%o")));
+ assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+ // empty cases
+ assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+ assert_change(
+ regex_not_match(col("c1"), lit("")),
+ not_like(col("c1"), "%"),
+ );
+ assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+ assert_change(
+ regex_not_imatch(col("c1"), lit("")),
+ not_ilike(col("c1"), "%"),
+ );
+
+ // single character
+ assert_change(regex_match(col("c1"), lit("x")), like(col("c1"),
"%x%"));
+
+ // single word
+ assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"),
"%foo%"));
+
+ // OR-chain
+ assert_change(
+ regex_match(col("c1"), lit("foo|bar|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%bar%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo|x|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%x%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo||baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_not_match(col("c1"), lit("foo|bar|baz")),
+ not_like(col("c1"), "%foo%")
+ .and(not_like(col("c1"), "%bar%"))
+ .and(not_like(col("c1"), "%baz%")),
+ );
+ }
+
+ #[track_caller]
+ fn assert_no_change(expr: Expr) {
+ let optimized = simplify(expr.clone());
+ assert_eq!(expr, optimized);
+ }
+
+ #[track_caller]
+ fn assert_change(expr: Expr, expected: Expr) {
+ let optimized = simplify(expr);
+ assert_eq!(optimized, expected);
+ }
+
+ fn regex_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn like(expr: Expr, pattern: &str) -> Expr {
+ Expr::Like(Like {
+ negated: false,
+ expr: Box::new(expr),
+ pattern:
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
+ escape_char: None,
+ })
+ }
+
+ fn not_like(expr: Expr, pattern: &str) -> Expr {
+ Expr::Like(Like {
+ negated: true,
+ expr: Box::new(expr),
+ pattern:
Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern.to_owned())))),
Review Comment:
```suggestion
pattern: Box::new(lit(pattern)),
```
##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
assert_eq!(simplify(expr), expected)
}
+ #[test]
+ fn test_simplify_regex() {
+ // malformed regex
+ assert_contains!(
+ try_simplify(regex_match(col("c1"), lit("foo{")))
+ .unwrap_err()
+ .to_string(),
+ "regex parse error"
+ );
+
+ // unsupported cases
+ assert_no_change(regex_match(col("c1"), lit("foo.*")));
+ assert_no_change(regex_match(col("c1"), lit("(foo)")));
+ assert_no_change(regex_match(col("c1"), lit("^foo")));
+ assert_no_change(regex_match(col("c1"), lit("foo$")));
+ assert_no_change(regex_match(col("c1"), lit("%")));
+ assert_no_change(regex_match(col("c1"), lit("_")));
+ assert_no_change(regex_match(col("c1"), lit("f%o")));
+ assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+ // empty cases
+ assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+ assert_change(
+ regex_not_match(col("c1"), lit("")),
+ not_like(col("c1"), "%"),
+ );
+ assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+ assert_change(
+ regex_not_imatch(col("c1"), lit("")),
+ not_ilike(col("c1"), "%"),
+ );
+
+ // single character
+ assert_change(regex_match(col("c1"), lit("x")), like(col("c1"),
"%x%"));
+
+ // single word
+ assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"),
"%foo%"));
+
+ // OR-chain
+ assert_change(
+ regex_match(col("c1"), lit("foo|bar|baz")),
Review Comment:
I think it might be better to convert this to an `IN <list>` which has a
bunch of optimizations
https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/expressions/in_list.rs
I remember a discussion about `IN <list>` vs `OR` chains -- @tustvold do you
have any opinion on which is better in this case?
##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1576,17 +1589,166 @@ mod tests {
assert_eq!(simplify(expr), expected)
}
+ #[test]
+ fn test_simplify_regex() {
+ // malformed regex
+ assert_contains!(
+ try_simplify(regex_match(col("c1"), lit("foo{")))
+ .unwrap_err()
+ .to_string(),
+ "regex parse error"
+ );
+
+ // unsupported cases
+ assert_no_change(regex_match(col("c1"), lit("foo.*")));
+ assert_no_change(regex_match(col("c1"), lit("(foo)")));
+ assert_no_change(regex_match(col("c1"), lit("^foo")));
+ assert_no_change(regex_match(col("c1"), lit("foo$")));
+ assert_no_change(regex_match(col("c1"), lit("%")));
+ assert_no_change(regex_match(col("c1"), lit("_")));
+ assert_no_change(regex_match(col("c1"), lit("f%o")));
+ assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+ // empty cases
+ assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+ assert_change(
+ regex_not_match(col("c1"), lit("")),
+ not_like(col("c1"), "%"),
+ );
+ assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+ assert_change(
+ regex_not_imatch(col("c1"), lit("")),
+ not_ilike(col("c1"), "%"),
+ );
+
+ // single character
+ assert_change(regex_match(col("c1"), lit("x")), like(col("c1"),
"%x%"));
+
+ // single word
+ assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"),
"%foo%"));
+
+ // OR-chain
+ assert_change(
+ regex_match(col("c1"), lit("foo|bar|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%bar%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo|x|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%x%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo||baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_not_match(col("c1"), lit("foo|bar|baz")),
+ not_like(col("c1"), "%foo%")
+ .and(not_like(col("c1"), "%bar%"))
+ .and(not_like(col("c1"), "%baz%")),
+ );
+ }
+
+ #[track_caller]
Review Comment:
TIL
https://rustc-dev-guide.rust-lang.org/backend/implicit-caller-location.html 👍
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]