This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 0d96f1eb36 Improve like kernel by ~2% (#5390)
0d96f1eb36 is described below

commit 0d96f1eb36d3e4fed44dc0b94abfc04e493ca86f
Author: Vrishabh <[email protected]>
AuthorDate: Tue Feb 13 21:48:31 2024 +0530

    Improve like kernel by ~2% (#5390)
    
    * Rework like to use memchr
    
    * Fix clippy
    
    * Rename a function
    
    * Incorporate review comment
---
 arrow-string/Cargo.toml       |  1 +
 arrow-string/src/predicate.rs | 19 ++++++++++++-------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml
index 1ae7af8bdf..bdfa681113 100644
--- a/arrow-string/Cargo.toml
+++ b/arrow-string/Cargo.toml
@@ -42,3 +42,4 @@ arrow-select = { workspace = true }
 regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] }
 regex-syntax = { version = "0.8.0", default-features = false, features = ["unicode"] }
 num = { version = "0.4", default-features = false, features = ["std"] }
+memchr = "2.7.1"
diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
index fe288f9de8..54ecf42e36 100644
--- a/arrow-string/src/predicate.rs
+++ b/arrow-string/src/predicate.rs
@@ -17,6 +17,7 @@
 
 use arrow_array::{BooleanArray, GenericStringArray, OffsetSizeTrait};
 use arrow_schema::ArrowError;
+use memchr::memchr2;
 use regex::{Regex, RegexBuilder};
 
 /// A string based predicate
@@ -39,19 +40,19 @@ pub enum Predicate<'a> {
 impl<'a> Predicate<'a> {
     /// Create a predicate for the given like pattern
     pub fn like(pattern: &'a str) -> Result<Self, ArrowError> {
-        if !pattern.contains(is_like_pattern) {
+        if !contains_like_pattern(pattern) {
             Ok(Self::Eq(pattern))
         } else if pattern.ends_with('%')
             && !pattern.ends_with("\\%")
-            && !pattern[..pattern.len() - 1].contains(is_like_pattern)
+            && !contains_like_pattern(&pattern[..pattern.len() - 1])
         {
             Ok(Self::StartsWith(&pattern[..pattern.len() - 1]))
-        } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) {
+        } else if pattern.starts_with('%') && !contains_like_pattern(&pattern[1..]) {
             Ok(Self::EndsWith(&pattern[1..]))
         } else if pattern.starts_with('%')
             && pattern.ends_with('%')
             && !pattern.ends_with("\\%")
-            && !pattern[1..pattern.len() - 1].contains(is_like_pattern)
+            && !contains_like_pattern(&pattern[1..pattern.len() - 1])
         {
             Ok(Self::Contains(&pattern[1..pattern.len() - 1]))
         } else {
@@ -62,14 +63,14 @@ impl<'a> Predicate<'a> {
     /// Create a predicate for the given ilike pattern
     pub fn ilike(pattern: &'a str, is_ascii: bool) -> Result<Self, ArrowError> {
         if is_ascii && pattern.is_ascii() {
-            if !pattern.contains(is_like_pattern) {
+            if !contains_like_pattern(pattern) {
                 return Ok(Self::IEqAscii(pattern));
             } else if pattern.ends_with('%')
                 && !pattern.ends_with("\\%")
-                && !pattern[..pattern.len() - 1].contains(is_like_pattern)
+                && !contains_like_pattern(&pattern[..pattern.len() - 1])
             {
                 return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1]));
-            } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) {
+            } else if pattern.starts_with('%') && !contains_like_pattern(&pattern[1..]) {
                 return Ok(Self::IEndsWithAscii(&pattern[1..]));
             }
         }
@@ -188,6 +189,10 @@ fn is_like_pattern(c: char) -> bool {
     c == '%' || c == '_'
 }
 
+fn contains_like_pattern(pattern: &str) -> bool {
+    memchr2(b'%', b'_', pattern.as_bytes()).is_some()
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

Reply via email to