This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 855666d9e Specialize Prefix/Suffix Match for `Like/ILike` between
Array and Scalar for StringViewArray (#6231)
855666d9e is described below
commit 855666d9e9283c1ef11648762fe92c7c188b68f1
Author: Xin Li <[email protected]>
AuthorDate: Sun Aug 25 06:24:21 2024 -0700
Specialize Prefix/Suffix Match for `Like/ILike` between Array and Scalar
for StringViewArray (#6231)
* v2 impl
* Add bench
* fix clippy
* fix endswith
* Finalize the prefix_v2 implementation
* stop reverse string for ends_with
* Fix comments
* fix bad comment
* Correct equals sematics
---
arrow-array/src/array/byte_view_array.rs | 64 ++++++++++++++++++
arrow-string/src/like.rs | 21 ++++++
arrow-string/src/predicate.rs | 108 ++++++++++++++++++++++++++-----
3 files changed, 176 insertions(+), 17 deletions(-)
diff --git a/arrow-array/src/array/byte_view_array.rs
b/arrow-array/src/array/byte_view_array.rs
index a155b6ab2..3f5a952c2 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -24,6 +24,7 @@ use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray,
OffsetSizeTrait, S
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
use arrow_schema::{ArrowError, DataType};
+use core::str;
use num::ToPrimitive;
use std::any::Any;
use std::fmt::Debug;
@@ -301,6 +302,69 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
ArrayIter::new(self)
}
+ /// Returns an iterator over the bytes of this array.
+ pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
+ self.views.iter().map(move |v| {
+ let len = *v as u32;
+ if len <= 12 {
+ unsafe { Self::inline_value(v, len as usize) }
+ } else {
+ let view = ByteView::from(*v);
+ let data = &self.buffers[view.buffer_index as usize];
+ let offset = view.offset as usize;
+ unsafe { data.get_unchecked(offset..offset + len as usize) }
+ }
+ })
+ }
+
+ /// Returns an iterator over the prefix bytes of this array with respect
to the prefix length.
+ /// If the prefix length is larger than the string length, it will return
the empty slice.
+ pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item =
&[u8]> {
+ self.views().into_iter().map(move |v| {
+ let len = (*v as u32) as usize;
+
+ if len < prefix_len {
+ return &[] as &[u8];
+ }
+
+ if prefix_len <= 4 || len <= 12 {
+ unsafe { StringViewArray::inline_value(v, prefix_len) }
+ } else {
+ let view = ByteView::from(*v);
+ let data = unsafe {
+ self.data_buffers()
+ .get_unchecked(view.buffer_index as usize)
+ };
+ let offset = view.offset as usize;
+ unsafe { data.get_unchecked(offset..offset + prefix_len) }
+ }
+ })
+ }
+
+ /// Returns an iterator over the suffix bytes of this array with respect
to the suffix length.
+ /// If the suffix length is larger than the string length, it will return
the empty slice.
+ pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item =
&[u8]> {
+ self.views().into_iter().map(move |v| {
+ let len = (*v as u32) as usize;
+
+ if len < suffix_len {
+ return &[] as &[u8];
+ }
+
+ if len <= 12 {
+ unsafe { &StringViewArray::inline_value(v, len)[len -
suffix_len..] }
+ } else {
+ let view = ByteView::from(*v);
+ let data = unsafe {
+ self.data_buffers()
+ .get_unchecked(view.buffer_index as usize)
+ };
+ let offset = view.offset as usize;
+ unsafe { data.get_unchecked(offset + len - suffix_len..offset
+ len) }
+ }
+ })
+ }
+
/// Returns a zero-copy slice of this array with the indicated offset and
length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index e878e2418..4626be136 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -989,6 +989,27 @@ mod tests {
vec![false, true, true, false, false, false, false, true, true, true,
true]
);
+ // 😈 is four bytes long.
+ test_utf8_scalar!(
+ test_uff8_array_like_multibyte,
+ vec![
+ "sdlkdfFooßsdfs",
+ "sdlkdfFooSSdggs",
+ "sdlkdfFoosssdsd",
+ "FooS",
+ "Foos",
+ "ffooSS",
+ "ffooß",
+ "😃sadlksffofsSsh😈klF",
+ "😱slgffoesSsh😈klF",
+ "FFKoSS",
+ "longer than 12 bytes FFKoSS",
+ ],
+ "%Ssh😈klF",
+ like,
+ vec![false, false, false, false, false, false, false, true, true,
false, false]
+ );
+
test_utf8_scalar!(
test_utf8_array_ilike_scalar_one,
vec![
diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
index ec0c48278..f559088e6 100644
--- a/arrow-string/src/predicate.rs
+++ b/arrow-string/src/predicate.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-use arrow_array::{ArrayAccessor, BooleanArray};
+use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray};
use arrow_schema::ArrowError;
use memchr::memchr2;
use memchr::memmem::Finder;
@@ -114,21 +114,92 @@ impl<'a> Predicate<'a> {
Predicate::IEqAscii(v) => BooleanArray::from_unary(array,
|haystack| {
haystack.eq_ignore_ascii_case(v) != negate
}),
- Predicate::Contains(finder) => BooleanArray::from_unary(array,
|haystack| {
- finder.find(haystack.as_bytes()).is_some() != negate
- }),
- Predicate::StartsWith(v) => BooleanArray::from_unary(array,
|haystack| {
- starts_with(haystack, v, equals_kernel) != negate
- }),
- Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array,
|haystack| {
- starts_with(haystack, v, equals_ignore_ascii_case_kernel) !=
negate
- }),
- Predicate::EndsWith(v) => BooleanArray::from_unary(array,
|haystack| {
- ends_with(haystack, v, equals_kernel) != negate
- }),
- Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array,
|haystack| {
- ends_with(haystack, v, equals_ignore_ascii_case_kernel) !=
negate
- }),
+ Predicate::Contains(finder) => {
+ if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ BooleanArray::from(
+ string_view_array
+ .bytes_iter()
+ .map(|haystack| finder.find(haystack).is_some() !=
negate)
+ .collect::<Vec<_>>(),
+ )
+ } else {
+ BooleanArray::from_unary(array, |haystack| {
+ finder.find(haystack.as_bytes()).is_some() != negate
+ })
+ }
+ }
+ Predicate::StartsWith(v) => {
+ if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ BooleanArray::from(
+ string_view_array
+ .prefix_bytes_iter(v.len())
+ .map(|haystack| {
+ equals_bytes(haystack, v.as_bytes(),
equals_kernel) != negate
+ })
+ .collect::<Vec<_>>(),
+ )
+ } else {
+ BooleanArray::from_unary(array, |haystack| {
+ starts_with(haystack, v, equals_kernel) != negate
+ })
+ }
+ }
+ Predicate::IStartsWithAscii(v) => {
+ if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ BooleanArray::from(
+ string_view_array
+ .prefix_bytes_iter(v.len())
+ .map(|haystack| {
+ equals_bytes(
+ haystack,
+ v.as_bytes(),
+ equals_ignore_ascii_case_kernel,
+ ) != negate
+ })
+ .collect::<Vec<_>>(),
+ )
+ } else {
+ BooleanArray::from_unary(array, |haystack| {
+ starts_with(haystack, v,
equals_ignore_ascii_case_kernel) != negate
+ })
+ }
+ }
+ Predicate::EndsWith(v) => {
+ if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ BooleanArray::from(
+ string_view_array
+ .suffix_bytes_iter(v.len())
+ .map(|haystack| {
+ equals_bytes(haystack, v.as_bytes(),
equals_kernel) != negate
+ })
+ .collect::<Vec<_>>(),
+ )
+ } else {
+ BooleanArray::from_unary(array, |haystack| {
+ ends_with(haystack, v, equals_kernel) != negate
+ })
+ }
+ }
+ Predicate::IEndsWithAscii(v) => {
+ if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ BooleanArray::from(
+ string_view_array
+ .suffix_bytes_iter(v.len())
+ .map(|haystack| {
+ equals_bytes(
+ haystack,
+ v.as_bytes(),
+ equals_ignore_ascii_case_kernel,
+ ) != negate
+ })
+ .collect::<Vec<_>>(),
+ )
+ } else {
+ BooleanArray::from_unary(array, |haystack| {
+ ends_with(haystack, v,
equals_ignore_ascii_case_kernel) != negate
+ })
+ }
+ }
Predicate::Regex(v) => {
BooleanArray::from_unary(array, |haystack|
v.is_match(haystack) != negate)
}
@@ -136,6 +207,10 @@ impl<'a> Predicate<'a> {
}
}
+fn equals_bytes(lhs: &[u8], rhs: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) ->
bool) -> bool {
+ lhs.len() == rhs.len() && zip(lhs, rhs).all(byte_eq_kernel)
+}
+
/// This is faster than `str::starts_with` for small strings.
/// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8,
&u8)) -> bool) -> bool {
@@ -145,7 +220,6 @@ fn starts_with(haystack: &str, needle: &str,
byte_eq_kernel: impl Fn((&u8, &u8))
zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel)
}
}
-
/// This is faster than `str::ends_with` for small strings.
/// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8))
-> bool) -> bool {