This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 0d96f1eb36 Improve like kernel by ~2% (#5390)
0d96f1eb36 is described below
commit 0d96f1eb36d3e4fed44dc0b94abfc04e493ca86f
Author: Vrishabh <[email protected]>
AuthorDate: Tue Feb 13 21:48:31 2024 +0530
Improve like kernel by ~2% (#5390)
* Rework like to use memchr
* Fix clippy
* Rename a function
* Incorporate review comment
---
arrow-string/Cargo.toml | 1 +
arrow-string/src/predicate.rs | 19 ++++++++++++-------
2 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml
index 1ae7af8bdf..bdfa681113 100644
--- a/arrow-string/Cargo.toml
+++ b/arrow-string/Cargo.toml
@@ -42,3 +42,4 @@ arrow-select = { workspace = true }
regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] }
regex-syntax = { version = "0.8.0", default-features = false, features = ["unicode"] }
num = { version = "0.4", default-features = false, features = ["std"] }
+memchr = "2.7.1"
diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
index fe288f9de8..54ecf42e36 100644
--- a/arrow-string/src/predicate.rs
+++ b/arrow-string/src/predicate.rs
@@ -17,6 +17,7 @@
use arrow_array::{BooleanArray, GenericStringArray, OffsetSizeTrait};
use arrow_schema::ArrowError;
+use memchr::memchr2;
use regex::{Regex, RegexBuilder};
/// A string based predicate
@@ -39,19 +40,19 @@ pub enum Predicate<'a> {
impl<'a> Predicate<'a> {
/// Create a predicate for the given like pattern
pub fn like(pattern: &'a str) -> Result<Self, ArrowError> {
- if !pattern.contains(is_like_pattern) {
+ if !contains_like_pattern(pattern) {
Ok(Self::Eq(pattern))
} else if pattern.ends_with('%')
&& !pattern.ends_with("\\%")
- && !pattern[..pattern.len() - 1].contains(is_like_pattern)
+ && !contains_like_pattern(&pattern[..pattern.len() - 1])
{
Ok(Self::StartsWith(&pattern[..pattern.len() - 1]))
- } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) {
+ } else if pattern.starts_with('%') && !contains_like_pattern(&pattern[1..]) {
Ok(Self::EndsWith(&pattern[1..]))
} else if pattern.starts_with('%')
&& pattern.ends_with('%')
&& !pattern.ends_with("\\%")
- && !pattern[1..pattern.len() - 1].contains(is_like_pattern)
+ && !contains_like_pattern(&pattern[1..pattern.len() - 1])
{
Ok(Self::Contains(&pattern[1..pattern.len() - 1]))
} else {
@@ -62,14 +63,14 @@ impl<'a> Predicate<'a> {
/// Create a predicate for the given ilike pattern
pub fn ilike(pattern: &'a str, is_ascii: bool) -> Result<Self, ArrowError> {
if is_ascii && pattern.is_ascii() {
- if !pattern.contains(is_like_pattern) {
+ if !contains_like_pattern(pattern) {
return Ok(Self::IEqAscii(pattern));
} else if pattern.ends_with('%')
&& !pattern.ends_with("\\%")
- && !pattern[..pattern.len() - 1].contains(is_like_pattern)
+ && !contains_like_pattern(&pattern[..pattern.len() - 1])
{
return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1]));
- } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) {
+ } else if pattern.starts_with('%') && !contains_like_pattern(&pattern[1..]) {
return Ok(Self::IEndsWithAscii(&pattern[1..]));
}
}
@@ -188,6 +189,10 @@ fn is_like_pattern(c: char) -> bool {
c == '%' || c == '_'
}
+fn contains_like_pattern(pattern: &str) -> bool {
+ memchr2(b'%', b'_', pattern.as_bytes()).is_some()
+}
+
#[cfg(test)]
mod tests {
use super::*;