This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 855666d9e Specialize Prefix/Suffix Match for `Like/ILike` between 
Array and Scalar for StringViewArray (#6231)
855666d9e is described below

commit 855666d9e9283c1ef11648762fe92c7c188b68f1
Author: Xin Li <[email protected]>
AuthorDate: Sun Aug 25 06:24:21 2024 -0700

    Specialize Prefix/Suffix Match for `Like/ILike` between Array and Scalar 
for StringViewArray (#6231)
    
    * v2 impl
    
    * Add bench
    
    * fix clippy
    
    * fix endswith
    
    * Finalize the prefix_v2 implementation
    
    * stop reverse string for ends_with
    
    * Fix comments
    
    * fix bad comment
    
    * Correct equals sematics
---
 arrow-array/src/array/byte_view_array.rs |  64 ++++++++++++++++++
 arrow-string/src/like.rs                 |  21 ++++++
 arrow-string/src/predicate.rs            | 108 ++++++++++++++++++++++++++-----
 3 files changed, 176 insertions(+), 17 deletions(-)

diff --git a/arrow-array/src/array/byte_view_array.rs 
b/arrow-array/src/array/byte_view_array.rs
index a155b6ab2..3f5a952c2 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -24,6 +24,7 @@ use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, 
OffsetSizeTrait, S
 use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
 use arrow_schema::{ArrowError, DataType};
+use core::str;
 use num::ToPrimitive;
 use std::any::Any;
 use std::fmt::Debug;
@@ -301,6 +302,69 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         ArrayIter::new(self)
     }
 
+    /// Returns an iterator over the bytes of this array.
+    pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
+        self.views.iter().map(move |v| {
+            let len = *v as u32;
+            if len <= 12 {
+                unsafe { Self::inline_value(v, len as usize) }
+            } else {
+                let view = ByteView::from(*v);
+                let data = &self.buffers[view.buffer_index as usize];
+                let offset = view.offset as usize;
+                unsafe { data.get_unchecked(offset..offset + len as usize) }
+            }
+        })
+    }
+
+    /// Returns an iterator over the prefix bytes of this array with respect 
to the prefix length.
+    /// If the prefix length is larger than the string length, it will return 
the empty slice.
+    pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = 
&[u8]> {
+        self.views().into_iter().map(move |v| {
+            let len = (*v as u32) as usize;
+
+            if len < prefix_len {
+                return &[] as &[u8];
+            }
+
+            if prefix_len <= 4 || len <= 12 {
+                unsafe { StringViewArray::inline_value(v, prefix_len) }
+            } else {
+                let view = ByteView::from(*v);
+                let data = unsafe {
+                    self.data_buffers()
+                        .get_unchecked(view.buffer_index as usize)
+                };
+                let offset = view.offset as usize;
+                unsafe { data.get_unchecked(offset..offset + prefix_len) }
+            }
+        })
+    }
+
+    /// Returns an iterator over the suffix bytes of this array with respect 
to the suffix length.
+    /// If the suffix length is larger than the string length, it will return 
the empty slice.
+    pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = 
&[u8]> {
+        self.views().into_iter().map(move |v| {
+            let len = (*v as u32) as usize;
+
+            if len < suffix_len {
+                return &[] as &[u8];
+            }
+
+            if len <= 12 {
+                unsafe { &StringViewArray::inline_value(v, len)[len - 
suffix_len..] }
+            } else {
+                let view = ByteView::from(*v);
+                let data = unsafe {
+                    self.data_buffers()
+                        .get_unchecked(view.buffer_index as usize)
+                };
+                let offset = view.offset as usize;
+                unsafe { data.get_unchecked(offset + len - suffix_len..offset 
+ len) }
+            }
+        })
+    }
+
     /// Returns a zero-copy slice of this array with the indicated offset and 
length.
     pub fn slice(&self, offset: usize, length: usize) -> Self {
         Self {
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index e878e2418..4626be136 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -989,6 +989,27 @@ mod tests {
         vec![false, true, true, false, false, false, false, true, true, true, 
true]
     );
 
+    // 😈 is four bytes long.
+    test_utf8_scalar!(
+        test_uff8_array_like_multibyte,
+        vec![
+            "sdlkdfFooßsdfs",
+            "sdlkdfFooSSdggs",
+            "sdlkdfFoosssdsd",
+            "FooS",
+            "Foos",
+            "ffooSS",
+            "ffooß",
+            "😃sadlksffofsSsh😈klF",
+            "😱slgffoesSsh😈klF",
+            "FFKoSS",
+            "longer than 12 bytes FFKoSS",
+        ],
+        "%Ssh😈klF",
+        like,
+        vec![false, false, false, false, false, false, false, true, true, 
false, false]
+    );
+
     test_utf8_scalar!(
         test_utf8_array_ilike_scalar_one,
         vec![
diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
index ec0c48278..f559088e6 100644
--- a/arrow-string/src/predicate.rs
+++ b/arrow-string/src/predicate.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow_array::{ArrayAccessor, BooleanArray};
+use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray};
 use arrow_schema::ArrowError;
 use memchr::memchr2;
 use memchr::memmem::Finder;
@@ -114,21 +114,92 @@ impl<'a> Predicate<'a> {
             Predicate::IEqAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
                 haystack.eq_ignore_ascii_case(v) != negate
             }),
-            Predicate::Contains(finder) => BooleanArray::from_unary(array, 
|haystack| {
-                finder.find(haystack.as_bytes()).is_some() != negate
-            }),
-            Predicate::StartsWith(v) => BooleanArray::from_unary(array, 
|haystack| {
-                starts_with(haystack, v, equals_kernel) != negate
-            }),
-            Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
-                starts_with(haystack, v, equals_ignore_ascii_case_kernel) != 
negate
-            }),
-            Predicate::EndsWith(v) => BooleanArray::from_unary(array, 
|haystack| {
-                ends_with(haystack, v, equals_kernel) != negate
-            }),
-            Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
-                ends_with(haystack, v, equals_ignore_ascii_case_kernel) != 
negate
-            }),
+            Predicate::Contains(finder) => {
+                if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .bytes_iter()
+                            .map(|haystack| finder.find(haystack).is_some() != 
negate)
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {
+                        finder.find(haystack.as_bytes()).is_some() != negate
+                    })
+                }
+            }
+            Predicate::StartsWith(v) => {
+                if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .prefix_bytes_iter(v.len())
+                            .map(|haystack| {
+                                equals_bytes(haystack, v.as_bytes(), 
equals_kernel) != negate
+                            })
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {
+                        starts_with(haystack, v, equals_kernel) != negate
+                    })
+                }
+            }
+            Predicate::IStartsWithAscii(v) => {
+                if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .prefix_bytes_iter(v.len())
+                            .map(|haystack| {
+                                equals_bytes(
+                                    haystack,
+                                    v.as_bytes(),
+                                    equals_ignore_ascii_case_kernel,
+                                ) != negate
+                            })
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {
+                        starts_with(haystack, v, 
equals_ignore_ascii_case_kernel) != negate
+                    })
+                }
+            }
+            Predicate::EndsWith(v) => {
+                if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .suffix_bytes_iter(v.len())
+                            .map(|haystack| {
+                                equals_bytes(haystack, v.as_bytes(), 
equals_kernel) != negate
+                            })
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {
+                        ends_with(haystack, v, equals_kernel) != negate
+                    })
+                }
+            }
+            Predicate::IEndsWithAscii(v) => {
+                if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .suffix_bytes_iter(v.len())
+                            .map(|haystack| {
+                                equals_bytes(
+                                    haystack,
+                                    v.as_bytes(),
+                                    equals_ignore_ascii_case_kernel,
+                                ) != negate
+                            })
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {
+                        ends_with(haystack, v, 
equals_ignore_ascii_case_kernel) != negate
+                    })
+                }
+            }
             Predicate::Regex(v) => {
                 BooleanArray::from_unary(array, |haystack| 
v.is_match(haystack) != negate)
             }
@@ -136,6 +207,10 @@ impl<'a> Predicate<'a> {
     }
 }
 
+fn equals_bytes(lhs: &[u8], rhs: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> 
bool) -> bool {
+    lhs.len() == rhs.len() && zip(lhs, rhs).all(byte_eq_kernel)
+}
+
 /// This is faster than `str::starts_with` for small strings.
 /// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
 fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, 
&u8)) -> bool) -> bool {
@@ -145,7 +220,6 @@ fn starts_with(haystack: &str, needle: &str, 
byte_eq_kernel: impl Fn((&u8, &u8))
         zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel)
     }
 }
-
 /// This is faster than `str::ends_with` for small strings.
 /// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
 fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) 
-> bool) -> bool {

Reply via email to