This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new bf9ce475df Improve `LIKE` performance for "contains" style queries 
(#6128)
bf9ce475df is described below

commit bf9ce475df82d362631099d491d3454d64d50217
Author: Samuel Colvin <[email protected]>
AuthorDate: Mon Jul 29 22:08:30 2024 +0100

    Improve `LIKE` performance for "contains" style queries (#6128)
    
    * improve "contains" performance
    
    * add tests
    
    * cargo fmt :disappointed:
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-string/Cargo.toml       |  2 +-
 arrow-string/src/like.rs      |  8 ++++++--
 arrow-string/src/predicate.rs | 35 +++++++++++++++++++++++++++++------
 3 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml
index bdfa681113..0757067dc8 100644
--- a/arrow-string/Cargo.toml
+++ b/arrow-string/Cargo.toml
@@ -42,4 +42,4 @@ arrow-select = { workspace = true }
 regex = { version = "1.7.0", default-features = false, features = ["std", 
"unicode", "perf"] }
 regex-syntax = { version = "0.8.0", default-features = false, features = 
["unicode"] }
 num = { version = "0.4", default-features = false, features = ["std"] }
-memchr = "2.7.1"
+memchr = "2.7.4"
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index 49831092ff..d23040b592 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -239,7 +239,7 @@ fn op_scalar<'a, T: StringArrayType<'a>>(
     let r = match op {
         Op::Like(neg) => Predicate::like(r)?.evaluate_array(l, neg),
         Op::ILike(neg) => Predicate::ilike(r, l.is_ascii())?.evaluate_array(l, 
neg),
-        Op::Contains => Predicate::Contains(r).evaluate_array(l, false),
+        Op::Contains => Predicate::contains(r).evaluate_array(l, false),
         Op::StartsWith => Predicate::StartsWith(r).evaluate_array(l, false),
         Op::EndsWith => Predicate::EndsWith(r).evaluate_array(l, false),
     };
@@ -273,12 +273,16 @@ fn op_binary<'a>(
     match op {
         Op::Like(neg) => binary_predicate(l, r, neg, Predicate::like),
         Op::ILike(neg) => binary_predicate(l, r, neg, |s| Predicate::ilike(s, 
false)),
-        Op::Contains => Ok(l.zip(r).map(|(l, r)| 
Some(l?.contains(r?))).collect()),
+        Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(str_contains(l?, 
r?))).collect()),
         Op::StartsWith => Ok(l.zip(r).map(|(l, r)| 
Some(l?.starts_with(r?))).collect()),
         Op::EndsWith => Ok(l.zip(r).map(|(l, r)| 
Some(l?.ends_with(r?))).collect()),
     }
 }
 
+fn str_contains(haystack: &str, needle: &str) -> bool {
+    memchr::memmem::find(haystack.as_bytes(), needle.as_bytes()).is_some()
+}
+
 fn binary_predicate<'a>(
     l: impl Iterator<Item = Option<&'a str>>,
     r: impl Iterator<Item = Option<&'a str>>,
diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
index c7ccffb3ad..0769da3fa0 100644
--- a/arrow-string/src/predicate.rs
+++ b/arrow-string/src/predicate.rs
@@ -18,12 +18,13 @@
 use arrow_array::{ArrayAccessor, BooleanArray};
 use arrow_schema::ArrowError;
 use memchr::memchr2;
+use memchr::memmem::Finder;
 use regex::{Regex, RegexBuilder};
 
 /// A string based predicate
 pub enum Predicate<'a> {
     Eq(&'a str),
-    Contains(&'a str),
+    Contains(Finder<'a>),
     StartsWith(&'a str),
     EndsWith(&'a str),
 
@@ -54,12 +55,16 @@ impl<'a> Predicate<'a> {
             && !pattern.ends_with("\\%")
             && !contains_like_pattern(&pattern[1..pattern.len() - 1])
         {
-            Ok(Self::Contains(&pattern[1..pattern.len() - 1]))
+            Ok(Self::contains(&pattern[1..pattern.len() - 1]))
         } else {
             Ok(Self::Regex(regex_like(pattern, false)?))
         }
     }
 
+    pub fn contains(needle: &'a str) -> Self {
+        Self::Contains(Finder::new(needle.as_bytes()))
+    }
+
     /// Create a predicate for the given ilike pattern
     pub fn ilike(pattern: &'a str, is_ascii: bool) -> Result<Self, ArrowError> 
{
         if is_ascii && pattern.is_ascii() {
@@ -82,7 +87,7 @@ impl<'a> Predicate<'a> {
         match self {
             Predicate::Eq(v) => *v == haystack,
             Predicate::IEqAscii(v) => haystack.eq_ignore_ascii_case(v),
-            Predicate::Contains(v) => haystack.contains(v),
+            Predicate::Contains(finder) => 
finder.find(haystack.as_bytes()).is_some(),
             Predicate::StartsWith(v) => haystack.starts_with(v),
             Predicate::IStartsWithAscii(v) => 
starts_with_ignore_ascii_case(haystack, v),
             Predicate::EndsWith(v) => haystack.ends_with(v),
@@ -106,9 +111,9 @@ impl<'a> Predicate<'a> {
             Predicate::IEqAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
                 haystack.eq_ignore_ascii_case(v) != negate
             }),
-            Predicate::Contains(v) => {
-                BooleanArray::from_unary(array, |haystack| 
haystack.contains(v) != negate)
-            }
+            Predicate::Contains(finder) => BooleanArray::from_unary(array, 
|haystack| {
+                finder.find(haystack.as_bytes()).is_some() != negate
+            }),
             Predicate::StartsWith(v) => {
                 BooleanArray::from_unary(array, |haystack| 
haystack.starts_with(v) != negate)
             }
@@ -258,4 +263,22 @@ mod tests {
         let r = regex_like(a_eq, false).unwrap();
         assert_eq!(r.to_string(), expected);
     }
+    #[test]
+    fn test_contains() {
+        assert!(Predicate::contains("hay").evaluate("haystack"));
+        assert!(Predicate::contains("haystack").evaluate("haystack"));
+        assert!(Predicate::contains("h").evaluate("haystack"));
+        assert!(Predicate::contains("k").evaluate("haystack"));
+        assert!(Predicate::contains("stack").evaluate("haystack"));
+        assert!(Predicate::contains("sta").evaluate("haystack"));
+        assert!(Predicate::contains("stack").evaluate("hay£stack"));
+        assert!(Predicate::contains("y£s").evaluate("hay£stack"));
+        assert!(Predicate::contains("£").evaluate("hay£stack"));
+        assert!(Predicate::contains("a").evaluate("a"));
+        // not matching
+        assert!(!Predicate::contains("hy").evaluate("haystack"));
+        assert!(!Predicate::contains("stackx").evaluate("haystack"));
+        assert!(!Predicate::contains("x").evaluate("haystack"));
+        assert!(!Predicate::contains("haystack 
haystack").evaluate("haystack"));
+    }
 }

Reply via email to