alamb commented on code in PR #6231:
URL: https://github.com/apache/arrow-rs/pull/6231#discussion_r1723595894
##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -294,6 +295,69 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
ArrayIter::new(self)
}
+ /// Returns an iterator over the bytes of this array.
+ pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
+ self.views.iter().map(move |v| {
+ let len = *v as u32;
+ if len <= 12 {
+ unsafe { Self::inline_value(v, len as usize) }
+ } else {
+ let view = ByteView::from(*v);
+ let data = &self.buffers[view.buffer_index as usize];
+ let offset = view.offset as usize;
+ unsafe { data.get_unchecked(offset..offset + len as usize) }
+ }
+ })
+ }
+
+ /// Returns an iterator over the prefix bytes of this array with respect
to the prefix length.
+ /// If the prefix length is larger than the string length, it will return
the empty slice.
+ pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item =
&[u8]> {
+ self.views().into_iter().map(move |v| {
+ let len = (*v as u32) as usize;
+
+ if len < prefix_len {
+ return &[] as &[u8];
Review Comment:
as you mentioned above, having to return an empty slice just for the
function to immediate check it again might be another potential performance
improvement
What do you think about making this more general and take a function? Maybe
something like the following (untested)
```rust
/// Applies function `f` to the first `prefix_len` bytes for all views
/// if the view length is less tha prefix_len func is invoked with
None(T)
pub fn prefix_bytes_iter<F, T>(&self, prefix_len: usize, func: F) ->
impl Iterator<Item = T>
where
F: FnMut(Option<&[u8]>) -> T
{
...
}
```
I am not sure this is a good idea but figured maybe it would be more
general. But maybe not...
##########
arrow-string/src/like.rs:
##########
@@ -989,6 +989,27 @@ mod tests {
vec![false, true, true, false, false, false, false, true, true, true,
true]
);
+ // 😈 is four bytes long.
+ test_utf8_scalar!(
+ test_uff8_array_like_multibyte,
+ vec![
Review Comment:
🤔 it occurs to me we should also be testing with `Option`s as well (aka the
test data should have nulls)
##########
arrow-string/src/predicate.rs:
##########
@@ -114,38 +114,117 @@ impl<'a> Predicate<'a> {
Predicate::IEqAscii(v) => BooleanArray::from_unary(array,
|haystack| {
haystack.eq_ignore_ascii_case(v) != negate
}),
- Predicate::Contains(finder) => BooleanArray::from_unary(array,
|haystack| {
- finder.find(haystack.as_bytes()).is_some() != negate
- }),
- Predicate::StartsWith(v) => BooleanArray::from_unary(array,
|haystack| {
- starts_with(haystack, v, equals_kernel) != negate
- }),
- Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array,
|haystack| {
- starts_with(haystack, v, equals_ignore_ascii_case_kernel) !=
negate
- }),
- Predicate::EndsWith(v) => BooleanArray::from_unary(array,
|haystack| {
- ends_with(haystack, v, equals_kernel) != negate
- }),
- Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array,
|haystack| {
- ends_with(haystack, v, equals_ignore_ascii_case_kernel) !=
negate
- }),
+ Predicate::Contains(finder) => {
+ if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ BooleanArray::from(
+ string_view_array
+ .bytes_iter()
+ .map(|haystack| finder.find(haystack).is_some() !=
negate)
+ .collect::<Vec<_>>(),
+ )
+ } else {
+ BooleanArray::from_unary(array, |haystack| {
+ finder.find(haystack.as_bytes()).is_some() != negate
+ })
+ }
+ }
+ Predicate::StartsWith(v) => {
+ if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ BooleanArray::from(
+ string_view_array
+ .prefix_bytes_iter(v.len())
+ .map(|haystack| {
+ starts_with_bytes(haystack, v.as_bytes(),
equals_kernel) != negate
+ })
+ .collect::<Vec<_>>(),
+ )
+ } else {
+ BooleanArray::from_unary(array, |haystack| {
+ starts_with(haystack, v, equals_kernel) != negate
+ })
+ }
+ }
+ Predicate::IStartsWithAscii(v) => {
+ if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ BooleanArray::from(
+ string_view_array
+ .prefix_bytes_iter(v.len())
+ .map(|haystack| {
+ starts_with_bytes(
+ haystack,
+ v.as_bytes(),
+ equals_ignore_ascii_case_kernel,
+ ) != negate
+ })
+ .collect::<Vec<_>>(),
+ )
+ } else {
+ BooleanArray::from_unary(array, |haystack| {
+ starts_with(haystack, v,
equals_ignore_ascii_case_kernel) != negate
+ })
+ }
+ }
+ Predicate::EndsWith(v) => {
+ if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ BooleanArray::from(
+ string_view_array
+ .suffix_bytes_iter(v.len())
+ .map(|haystack| {
+ starts_with_bytes(haystack, v.as_bytes(),
equals_kernel) != negate
Review Comment:
I found it strange that the suffix check is using a `starts_with_bytes` -- I
think it should be checking exactly equal
I think this is correct however, because the suffix will will always be
either `v.len()` bytes or `""`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]