alamb commented on code in PR #6231:
URL: https://github.com/apache/arrow-rs/pull/6231#discussion_r1715950657
##########
arrow-string/src/like.rs:
##########
@@ -155,9 +156,27 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) ->
Result<BooleanArray, Arr
///
/// This trait helps to abstract over the different types of string arrays
/// so that we don't need to duplicate the implementation for each type.
-trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
+pub(crate) trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
Review Comment:
I think this trait would help us write optimized string function
implementations in DataFusion -- maybe we can (as a follow-on PR) consider
making it public (possibly renamed to something like `StringArrayAccessor`, as
`StringArrayType` may be confusing).
Do you have any thoughts, @XiangpengHao or @tustvold?
##########
arrow-string/src/like.rs:
##########
@@ -155,9 +156,27 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) ->
Result<BooleanArray, Arr
///
/// This trait helps to abstract over the different types of string arrays
/// so that we don't need to duplicate the implementation for each type.
-trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
+pub(crate) trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
fn is_ascii(&self) -> bool;
fn iter(&self) -> ArrayIter<Self>;
+
+ /// Returns whether the array is optimized for prefix search
Review Comment:
One thing about this API and its use below is that it will only ever be able to
use the first 4 bytes of the stringview prefix (and this won't ever be used if
I am looking for a string that is larger, `starts_with('google')` for example).
##########
arrow-string/src/predicate.rs:
##########
@@ -103,35 +104,80 @@ impl<'a> Predicate<'a> {
///
/// If `negate` is true the result of the predicate will be negated
#[inline(never)]
- pub fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray
- where
- T: ArrayAccessor<Item = &'i str>,
- {
+ pub fn evaluate_array<'i, T: StringArrayType<'i>>(
+ &self,
+ array: T,
+ negate: bool,
+ ) -> BooleanArray {
match self {
- Predicate::Eq(v) => BooleanArray::from_unary(array, |haystack| {
- (haystack.len() == v.len() && haystack == *v) != negate
- }),
- Predicate::IEqAscii(v) => BooleanArray::from_unary(array,
|haystack| {
- haystack.eq_ignore_ascii_case(v) != negate
- }),
- Predicate::Contains(finder) => BooleanArray::from_unary(array,
|haystack| {
- finder.find(haystack.as_bytes()).is_some() != negate
- }),
- Predicate::StartsWith(v) => BooleanArray::from_unary(array,
|haystack| {
- starts_with(haystack, v, equals_kernel) != negate
- }),
- Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array,
|haystack| {
- starts_with(haystack, v, equals_ignore_ascii_case_kernel) !=
negate
- }),
- Predicate::EndsWith(v) => BooleanArray::from_unary(array,
|haystack| {
- ends_with(haystack, v, equals_kernel) != negate
- }),
- Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array,
|haystack| {
- ends_with(haystack, v, equals_ignore_ascii_case_kernel) !=
negate
- }),
+ Predicate::Eq(v) => {
+ return BooleanArray::from_unary(array, |haystack| {
+ (haystack.len() == v.len() && haystack == *v) != negate
+ });
+ }
+ Predicate::IEqAscii(v) => {
+ return BooleanArray::from_unary(array, |haystack| {
+ haystack.eq_ignore_ascii_case(v) != negate
+ })
+ }
+ Predicate::Contains(finder) => {
+ return BooleanArray::from_unary(array, |haystack| {
+ finder.find(haystack.as_bytes()).is_some() != negate
+ })
+ }
+ Predicate::EndsWith(v) => {
+ return BooleanArray::from_unary(array, |haystack| {
+ ends_with(haystack, v, equals_kernel) != negate
+ })
+ }
+ Predicate::IEndsWithAscii(v) => {
+ return BooleanArray::from_unary(array, |haystack| {
+ ends_with(haystack, v, equals_ignore_ascii_case_kernel) !=
negate
+ })
+ }
Predicate::Regex(v) => {
- BooleanArray::from_unary(array, |haystack|
v.is_match(haystack) != negate)
+ return BooleanArray::from_unary(array, |haystack|
v.is_match(haystack) != negate);
+ }
+ _ => { // should be handled by the optimized path
+ }
+ }
+
+ let (prefix_optimized, prefix_len) = array.prefix_optimized();
Review Comment:
Since this is a pretty special optimization for
StringViewArray/BinaryViewArray, it might be simpler / faster to make an
explicit special case for them, maybe?
Something like:
```rust
if let Some(string_view_array) = array.as_string_view_opt() {
// do special case comparison here
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]