ILike` between Array and Scalar for StringViewArray [arrow-rs]

via GitHub Wed, 21 Aug 2024 09:49:52 -0700


alamb commented on code in PR #6231:
URL: https://github.com/apache/arrow-rs/pull/6231#discussion_r1725401057



##########
arrow-string/src/predicate.rs:
##########
@@ -114,28 +114,103 @@ impl<'a> Predicate<'a> {
             Predicate::IEqAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
                 haystack.eq_ignore_ascii_case(v) != negate
             }),
-            Predicate::Contains(finder) => BooleanArray::from_unary(array, 
|haystack| {
-                finder.find(haystack.as_bytes()).is_some() != negate
-            }),
-            Predicate::StartsWith(v) => BooleanArray::from_unary(array, 
|haystack| {
-                starts_with(haystack, v, equals_kernel) != negate
-            }),
-            Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
-                starts_with(haystack, v, equals_ignore_ascii_case_kernel) != 
negate
-            }),
-            Predicate::EndsWith(v) => BooleanArray::from_unary(array, 
|haystack| {
-                ends_with(haystack, v, equals_kernel) != negate
-            }),
-            Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
-                ends_with(haystack, v, equals_ignore_ascii_case_kernel) != 
negate
-            }),
+            Predicate::Contains(finder) => {
+                if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .bytes_iter()
+                            .map(|haystack| finder.find(haystack).is_some() != 
negate)
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {

Review Comment:
   looking more carefully at `BooleanArray::from_unary` it will use the 
`ArrayAccessor` impl for StringViewArray 
   
   
https://github.com/apache/arrow-rs/blob/7eb3c8323071245d77296f09971536ede4684cbc/arrow-array/src/array/byte_view_array.rs#L547-L557
   
   It isn't clear to me that how calling `bytes_iter()` would make this faster 
as the code for `value_unchecked` is the same as butes_iter



##########
arrow-string/src/predicate.rs:
##########
@@ -114,28 +114,103 @@ impl<'a> Predicate<'a> {
             Predicate::IEqAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
                 haystack.eq_ignore_ascii_case(v) != negate
             }),
-            Predicate::Contains(finder) => BooleanArray::from_unary(array, 
|haystack| {
-                finder.find(haystack.as_bytes()).is_some() != negate
-            }),
-            Predicate::StartsWith(v) => BooleanArray::from_unary(array, 
|haystack| {
-                starts_with(haystack, v, equals_kernel) != negate
-            }),
-            Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
-                starts_with(haystack, v, equals_ignore_ascii_case_kernel) != 
negate
-            }),
-            Predicate::EndsWith(v) => BooleanArray::from_unary(array, 
|haystack| {
-                ends_with(haystack, v, equals_kernel) != negate
-            }),
-            Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
-                ends_with(haystack, v, equals_ignore_ascii_case_kernel) != 
negate
-            }),
+            Predicate::Contains(finder) => {
+                if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .bytes_iter()
+                            .map(|haystack| finder.find(haystack).is_some() != 
negate)
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {

Review Comment:
   I think the test should be we'll try benchmarking and see if this improves 
things



##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -294,6 +295,69 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         ArrayIter::new(self)
     }
 
+    /// Returns an iterator over the bytes of this array.
+    pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
+        self.views.iter().map(move |v| {
+            let len = *v as u32;
+            if len <= 12 {
+                unsafe { Self::inline_value(v, len as usize) }
+            } else {
+                let view = ByteView::from(*v);
+                let data = &self.buffers[view.buffer_index as usize];
+                let offset = view.offset as usize;
+                unsafe { data.get_unchecked(offset..offset + len as usize) }
+            }
+        })
+    }
+
+    /// Returns an iterator over the prefix bytes of this array with respect 
to the prefix length.
+    /// If the prefix length is larger than the string length, it will return 
the empty slice.
+    pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = 
&[u8]> {
+        self.views().into_iter().map(move |v| {
+            let len = (*v as u32) as usize;
+
+            if len < prefix_len {
+                return &[] as &[u8];

Review Comment:
   Makes sense -- thank you for the explanation. Let's keep exploring this 
method for now



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Specialize Prefix/Suffix Match for `Like/ILike` between Array and Scalar for StringViewArray [arrow-rs]

Reply via email to