alamb commented on code in PR #6231:
URL: https://github.com/apache/arrow-rs/pull/6231#discussion_r1715950657


##########
arrow-string/src/like.rs:
##########
@@ -155,9 +156,27 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> 
Result<BooleanArray, Arr
 ///
 /// This trait helps to abstract over the different types of string arrays
 /// so that we don't need to duplicate the implementation for each type.
-trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
+pub(crate) trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {

Review Comment:
   I think this trait would help us write optimized string function 
implementations in DataFusion -- maybe we can (as a follow-on PR) consider 
making it public (possibly renamed to something like `StringArrayAccessor`, as 
`StringArrayType` may be confusing).
   
   Do you have any thoughts @XiangpengHao  or @tustvold ?



##########
arrow-string/src/like.rs:
##########
@@ -155,9 +156,27 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> 
Result<BooleanArray, Arr
 ///
 /// This trait helps to abstract over the different types of string arrays
 /// so that we don't need to duplicate the implementation for each type.
-trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
+pub(crate) trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
     fn is_ascii(&self) -> bool;
     fn iter(&self) -> ArrayIter<Self>;
+
+    /// Returns whether the array is optimized for prefix search

Review Comment:
   One thing about this API and its use below is that it will only ever be able 
to use the first 4 bytes of the stringview prefix (and this won't ever be used 
if I am looking for a string that is longer, `starts_with('google')` for example).
   
   



##########
arrow-string/src/predicate.rs:
##########
@@ -103,35 +104,80 @@ impl<'a> Predicate<'a> {
     ///
     /// If `negate` is true the result of the predicate will be negated
     #[inline(never)]
-    pub fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray
-    where
-        T: ArrayAccessor<Item = &'i str>,
-    {
+    pub fn evaluate_array<'i, T: StringArrayType<'i>>(
+        &self,
+        array: T,
+        negate: bool,
+    ) -> BooleanArray {
         match self {
-            Predicate::Eq(v) => BooleanArray::from_unary(array, |haystack| {
-                (haystack.len() == v.len() && haystack == *v) != negate
-            }),
-            Predicate::IEqAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
-                haystack.eq_ignore_ascii_case(v) != negate
-            }),
-            Predicate::Contains(finder) => BooleanArray::from_unary(array, 
|haystack| {
-                finder.find(haystack.as_bytes()).is_some() != negate
-            }),
-            Predicate::StartsWith(v) => BooleanArray::from_unary(array, 
|haystack| {
-                starts_with(haystack, v, equals_kernel) != negate
-            }),
-            Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
-                starts_with(haystack, v, equals_ignore_ascii_case_kernel) != 
negate
-            }),
-            Predicate::EndsWith(v) => BooleanArray::from_unary(array, 
|haystack| {
-                ends_with(haystack, v, equals_kernel) != negate
-            }),
-            Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
-                ends_with(haystack, v, equals_ignore_ascii_case_kernel) != 
negate
-            }),
+            Predicate::Eq(v) => {
+                return BooleanArray::from_unary(array, |haystack| {
+                    (haystack.len() == v.len() && haystack == *v) != negate
+                });
+            }
+            Predicate::IEqAscii(v) => {
+                return BooleanArray::from_unary(array, |haystack| {
+                    haystack.eq_ignore_ascii_case(v) != negate
+                })
+            }
+            Predicate::Contains(finder) => {
+                return BooleanArray::from_unary(array, |haystack| {
+                    finder.find(haystack.as_bytes()).is_some() != negate
+                })
+            }
+            Predicate::EndsWith(v) => {
+                return BooleanArray::from_unary(array, |haystack| {
+                    ends_with(haystack, v, equals_kernel) != negate
+                })
+            }
+            Predicate::IEndsWithAscii(v) => {
+                return BooleanArray::from_unary(array, |haystack| {
+                    ends_with(haystack, v, equals_ignore_ascii_case_kernel) != 
negate
+                })
+            }
             Predicate::Regex(v) => {
-                BooleanArray::from_unary(array, |haystack| 
v.is_match(haystack) != negate)
+                return BooleanArray::from_unary(array, |haystack| 
v.is_match(haystack) != negate);
+            }
+            _ => { // should be handled by the optimized path
+            }
+        }
+
+        let (prefix_optimized, prefix_len) = array.prefix_optimized();

Review Comment:
   Since this is a pretty special optimization for 
StringViewArray/BinaryViewArray, it might be simpler / faster to make an 
explicit special case for them, maybe?
   
   Something like:
   ```rust
   if let Some(string_view_array) = array.as_string_view_opt() {
     // do special case comparison here
   }
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to