alamb commented on code in PR #6231:
URL: https://github.com/apache/arrow-rs/pull/6231#discussion_r1725401057
##########
arrow-string/src/predicate.rs:
##########
@@ -114,28 +114,103 @@ impl<'a> Predicate<'a> {
Predicate::IEqAscii(v) => BooleanArray::from_unary(array,
|haystack| {
haystack.eq_ignore_ascii_case(v) != negate
}),
- Predicate::Contains(finder) => BooleanArray::from_unary(array,
|haystack| {
- finder.find(haystack.as_bytes()).is_some() != negate
- }),
- Predicate::StartsWith(v) => BooleanArray::from_unary(array,
|haystack| {
- starts_with(haystack, v, equals_kernel) != negate
- }),
- Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array,
|haystack| {
- starts_with(haystack, v, equals_ignore_ascii_case_kernel) !=
negate
- }),
- Predicate::EndsWith(v) => BooleanArray::from_unary(array,
|haystack| {
- ends_with(haystack, v, equals_kernel) != negate
- }),
- Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array,
|haystack| {
- ends_with(haystack, v, equals_ignore_ascii_case_kernel) !=
negate
- }),
+ Predicate::Contains(finder) => {
+ if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ BooleanArray::from(
+ string_view_array
+ .bytes_iter()
+ .map(|haystack| finder.find(haystack).is_some() !=
negate)
+ .collect::<Vec<_>>(),
+ )
+ } else {
+ BooleanArray::from_unary(array, |haystack| {
Review Comment:
looking more carefully at `BooleanArray::from_unary` it will use the
`ArrayAccessor` impl for StringViewArray
https://github.com/apache/arrow-rs/blob/7eb3c8323071245d77296f09971536ede4684cbc/arrow-array/src/array/byte_view_array.rs#L547-L557
It isn't clear to me that how calling `bytes_iter()` would make this faster
as the code for `value_unchecked` is the same as butes_iter
##########
arrow-string/src/predicate.rs:
##########
@@ -114,28 +114,103 @@ impl<'a> Predicate<'a> {
Predicate::IEqAscii(v) => BooleanArray::from_unary(array,
|haystack| {
haystack.eq_ignore_ascii_case(v) != negate
}),
- Predicate::Contains(finder) => BooleanArray::from_unary(array,
|haystack| {
- finder.find(haystack.as_bytes()).is_some() != negate
- }),
- Predicate::StartsWith(v) => BooleanArray::from_unary(array,
|haystack| {
- starts_with(haystack, v, equals_kernel) != negate
- }),
- Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array,
|haystack| {
- starts_with(haystack, v, equals_ignore_ascii_case_kernel) !=
negate
- }),
- Predicate::EndsWith(v) => BooleanArray::from_unary(array,
|haystack| {
- ends_with(haystack, v, equals_kernel) != negate
- }),
- Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array,
|haystack| {
- ends_with(haystack, v, equals_ignore_ascii_case_kernel) !=
negate
- }),
+ Predicate::Contains(finder) => {
+ if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ BooleanArray::from(
+ string_view_array
+ .bytes_iter()
+ .map(|haystack| finder.find(haystack).is_some() !=
negate)
+ .collect::<Vec<_>>(),
+ )
+ } else {
+ BooleanArray::from_unary(array, |haystack| {
Review Comment:
I think the test should be we'll try benchmarking and see if this improves
things
##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -294,6 +295,69 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
ArrayIter::new(self)
}
+ /// Returns an iterator over the bytes of this array.
+ pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
+ self.views.iter().map(move |v| {
+ let len = *v as u32;
+ if len <= 12 {
+ unsafe { Self::inline_value(v, len as usize) }
+ } else {
+ let view = ByteView::from(*v);
+ let data = &self.buffers[view.buffer_index as usize];
+ let offset = view.offset as usize;
+ unsafe { data.get_unchecked(offset..offset + len as usize) }
+ }
+ })
+ }
+
+ /// Returns an iterator over the prefix bytes of this array with respect
to the prefix length.
+ /// If the prefix length is larger than the string length, it will return
the empty slice.
+ pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item =
&[u8]> {
+ self.views().into_iter().map(move |v| {
+ let len = (*v as u32) as usize;
+
+ if len < prefix_len {
+ return &[] as &[u8];
Review Comment:
Makes sense -- thank you for the explanation. Let's keep exploring this
method for now
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]