This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 3ab72627f2 Support simplifying expressions such as `~ ^(ba_r|foo)$`,
where the string includes an underscore (#7186)
3ab72627f2 is described below
commit 3ab72627f2bcc6b493f5a320a7dd9f0d261f61b7
Author: Ruixiang Tan <[email protected]>
AuthorDate: Fri Aug 4 01:52:54 2023 +0800
Support simplifying expressions such as `~ ^(ba_r|foo)$`, where the string
includes an underscore (#7186)
* Support simplifying expressions like ~ ^(ba_r)$
* rename fn name
---
.../src/simplify_expressions/expr_simplifier.rs | 33 ++++++++++++++++++++++
.../optimizer/src/simplify_expressions/regex.rs | 16 ++++++++---
2 files changed, 45 insertions(+), 4 deletions(-)
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 5562e10f69..b7e8612d53 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -2499,10 +2499,43 @@ mod tests {
col("c1")
.in_list(vec![lit("foo"), lit("bar"), lit("baz"), lit("qux")],
false),
);
+ assert_change(
+ regex_match(col("c1"), lit("^(fo_o)$")),
+ col("c1").eq(lit("fo_o")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("^(fo_o)$")),
+ col("c1").eq(lit("fo_o")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("^(fo_o|ba_r)$")),
+ col("c1").eq(lit("fo_o")).or(col("c1").eq(lit("ba_r"))),
+ );
+ assert_change(
+ regex_not_match(col("c1"), lit("^(fo_o|ba_r)$")),
+ col("c1")
+ .not_eq(lit("fo_o"))
+ .and(col("c1").not_eq(lit("ba_r"))),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("^(fo_o|ba_r|ba_z)$")),
+ ((col("c1").eq(lit("fo_o"))).or(col("c1").eq(lit("ba_r"))))
+ .or(col("c1").eq(lit("ba_z"))),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("^(fo_o|ba_r|baz|qu_x)$")),
+ col("c1").in_list(
+ vec![lit("fo_o"), lit("ba_r"), lit("baz"), lit("qu_x")],
+ false,
+ ),
+ );
// regular expressions that mismatch captured literals
assert_no_change(regex_match(col("c1"), lit("(foo|bar)")));
assert_no_change(regex_match(col("c1"), lit("(foo|bar)*")));
+ assert_no_change(regex_match(col("c1"), lit("(fo_o|b_ar)")));
+ assert_no_change(regex_match(col("c1"), lit("(foo|ba_r)*")));
+ assert_no_change(regex_match(col("c1"), lit("(fo_o|ba_r)*")));
assert_no_change(regex_match(col("c1"), lit("^(foo|bar)*")));
assert_no_change(regex_match(col("c1"), lit("^foo|bar$")));
assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$")));
diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs
b/datafusion/optimizer/src/simplify_expressions/regex.rs
index 27fcfc5dbf..5094623b82 100644
--- a/datafusion/optimizer/src/simplify_expressions/regex.rs
+++ b/datafusion/optimizer/src/simplify_expressions/regex.rs
@@ -108,7 +108,7 @@ fn collect_concat_to_like_string(parts: &[Hir]) ->
Option<String> {
for sub in parts {
if let HirKind::Literal(l) = sub.kind() {
- s.push_str(str_from_literal(l)?);
+ s.push_str(like_str_from_literal(l)?);
} else {
return None;
}
@@ -120,7 +120,7 @@ fn collect_concat_to_like_string(parts: &[Hir]) ->
Option<String> {
/// returns a str represented by `Literal` if it contains a valid utf8
/// sequence and is safe for like (has no '%' and '_')
-fn str_from_literal(l: &Literal) -> Option<&str> {
+fn like_str_from_literal(l: &Literal) -> Option<&str> {
// if not utf8, no good
let s = std::str::from_utf8(&l.0).ok()?;
@@ -131,6 +131,14 @@ fn str_from_literal(l: &Literal) -> Option<&str> {
}
}
+/// returns a str represented by `Literal` if it contains a valid utf8
+fn str_from_literal(l: &Literal) -> Option<&str> {
+ // if not utf8, no good
+ let s = std::str::from_utf8(&l.0).ok()?;
+
+ Some(s)
+}
+
fn is_safe_for_like(c: char) -> bool {
(c != '%') && (c != '_')
}
@@ -196,7 +204,7 @@ fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
2 => Some(lit("")),
3 => {
let HirKind::Literal(l) = v[1].kind() else { return None };
- str_from_literal(l).map(lit)
+ like_str_from_literal(l).map(lit)
}
_ => None,
}
@@ -242,7 +250,7 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir:
&Hir) -> Option<Expr> {
return Some(mode.expr(Box::new(left.clone()), "%".to_owned()));
}
HirKind::Literal(l) => {
- let s = str_from_literal(l)?;
+ let s = like_str_from_literal(l)?;
return Some(mode.expr(Box::new(left.clone()), format!("%{s}%")));
}
HirKind::Concat(inner) if is_anchored_literal(inner) => {