This is an automated email from the ASF dual-hosted git repository.

yjshen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new edeb88533e chore(deps): update regex and regex-syntax requirement from 0.6.28 to 0.7.1 (#6095)
edeb88533e is described below

commit edeb88533e2f79049a7f34cb68c176ece5da90e4
Author: Andrew Lamb <[email protected]>
AuthorDate: Sun Apr 23 10:07:56 2023 -0400

    chore(deps): update regex and regex-syntax requirement from 0.6.28 to 0.7.1 (#6095)
    
    * Update regex and regex-syntax dependencies
    
    * chore(deps): update regex-syntax requirement from 0.6.28 to 0.7.1
    
    * fix up
    
    * clippy
---
 datafusion-cli/Cargo.lock                          |  6 ++---
 datafusion/optimizer/Cargo.toml                    |  2 +-
 .../optimizer/src/simplify_expressions/regex.rs    | 27 ++++++++++++++++------
 datafusion/physical-expr/Cargo.toml                |  2 +-
 4 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 7b1190a91a..556a296274 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -389,9 +389,9 @@ dependencies = [
 
 [[package]]
 name = "bumpalo"
-version = "3.12.0"
+version = "3.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
+checksum = "9b1ce199063694f33ffb7dd4e0ee620741495c32833cde5aa08f02a0bf96f0c8"
 
 [[package]]
 name = "byteorder"
@@ -811,7 +811,7 @@ dependencies = [
  "hashbrown 0.13.2",
  "itertools",
  "log",
- "regex-syntax 0.6.29",
+ "regex-syntax 0.7.1",
 ]
 
 [[package]]
diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml
index d84dbc5c2d..4959291c42 100644
--- a/datafusion/optimizer/Cargo.toml
+++ b/datafusion/optimizer/Cargo.toml
@@ -49,7 +49,7 @@ datafusion-physical-expr = { path = "../physical-expr", version = "23.0.0", defa
 hashbrown = { version = "0.13", features = ["raw"] }
 itertools = "0.10"
 log = "^0.4"
-regex-syntax = "0.6.28"
+regex-syntax = "0.7.1"
 
 [dev-dependencies]
 ctor = "0.2.0"
diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs
index 13d170fd88..a7ae14542d 100644
--- a/datafusion/optimizer/src/simplify_expressions/regex.rs
+++ b/datafusion/optimizer/src/simplify_expressions/regex.rs
@@ -58,6 +58,7 @@ pub fn simplify_regex_expr(
     Ok(Expr::BinaryExpr(BinaryExpr { left, op, right }))
 }
 
+#[derive(Debug)]
 struct OperatorMode {
     not: bool,
     i: bool,
@@ -101,11 +102,8 @@ fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {
     s.push('%');
 
     for sub in parts {
-        if let HirKind::Literal(Literal::Unicode(c)) = sub.kind() {
-            if !is_safe_for_like(*c) {
-                return None;
-            }
-            s.push(*c);
+        if let HirKind::Literal(l) = sub.kind() {
+            s.push_str(str_from_literal(l)?);
         } else {
             return None;
         }
@@ -115,17 +113,32 @@ fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {
     Some(s)
 }
 
+/// returns a str represented by `Literal` if it contains a valid utf8
+/// sequence and is safe for like (has no '%' and '_')
+fn str_from_literal(l: &Literal) -> Option<&str> {
+    // if not utf8, no good
+    let s = std::str::from_utf8(&l.0).ok()?;
+
+    if s.chars().all(is_safe_for_like) {
+        Some(s)
+    } else {
+        None
+    }
+}
+
 fn is_safe_for_like(c: char) -> bool {
     (c != '%') && (c != '_')
 }
 
 fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
+    println!("Considering hir kind: mode {mode:?} hir: {hir:?}");
     match hir.kind() {
         HirKind::Empty => {
             return Some(mode.expr(Box::new(left.clone()), "%".to_owned()));
         }
-        HirKind::Literal(Literal::Unicode(c)) if is_safe_for_like(*c) => {
-            return Some(mode.expr(Box::new(left.clone()), format!("%{c}%")));
+        HirKind::Literal(l) => {
+            let s = str_from_literal(l)?;
+            return Some(mode.expr(Box::new(left.clone()), format!("%{s}%")));
         }
         HirKind::Concat(inner) => {
             if let Some(pattern) = collect_concat_to_like_string(inner) {
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index 3cc85071ba..df19299e7e 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -63,7 +63,7 @@ md-5 = { version = "^0.10.0", optional = true }
 paste = "^1.0"
 petgraph = "0.6.2"
 rand = "0.8"
-regex = { version = "^1.4.3", optional = true }
+regex = { version = "1.8", optional = true }
 sha2 = { version = "^0.10.1", optional = true }
 unicode-segmentation = { version = "^1.7.1", optional = true }
 uuid = { version = "^1.2", features = ["v4"] }

Reply via email to