This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 66bada54cf5 Implement like/ilike etc for StringViewArray (#5931)
66bada54cf5 is described below

commit 66bada54cf55703fa11bd4bf195f47fb9df714c9
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Mon Jun 24 14:02:45 2024 -0700

    Implement like/ilike etc for StringViewArray (#5931)
    
    * like for string view array
    
    * fix bug
    
    * update doc
    
    * update tests
---
 arrow-array/src/array/byte_view_array.rs |  14 +
 arrow-string/src/like.rs                 | 646 +++++++++++++++++++------------
 arrow-string/src/predicate.rs            |  11 +-
 3 files changed, 417 insertions(+), 254 deletions(-)

diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs
index f31bc1c785b..dc4cbe6834c 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -569,6 +569,20 @@ impl StringViewArray {
     pub fn to_binary_view(self) -> BinaryViewArray {
         unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
     }
+
+    /// Returns true if all data within this array is ASCII
+    pub fn is_ascii(&self) -> bool {
+        // Alternative (but incorrect): directly check the underlying buffers
+        // (1) Our string view might be sparse, i.e., a subset of the buffers,
+        //      so even if the buffer is not ascii, we can still be ascii.
+        // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
+        // This means that this operation is quite expensive, shall we cache the result?
+        //  i.e. track `is_ascii` in the builder.
+        self.iter().all(|v| match v {
+            Some(v) => v.is_ascii(),
+            None => true,
+        })
+    }
 }
 
 impl From<Vec<&str>> for StringViewArray {
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index 6f6dfe03133..49831092ffc 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -20,6 +20,7 @@ use arrow_array::cast::AsArray;
 use arrow_array::*;
 use arrow_schema::*;
 use arrow_select::take::take;
+use iterator::ArrayIter;
 use std::sync::Arc;
 
 #[derive(Debug)]
@@ -126,24 +127,66 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray, Arr
     let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r);
 
     match (l.data_type(), r.data_type()) {
-        (Utf8, Utf8) => apply::<i32>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v),
+        (Utf8, Utf8) => {
+            apply::<&GenericStringArray<i32>>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v)
+        }
         (LargeUtf8, LargeUtf8) => {
-            apply::<i64>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v)
+            apply::<&GenericStringArray<i64>>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v)
         }
+        (Utf8View, Utf8View) => apply::<&StringViewArray>(
+            op,
+            l.as_string_view(),
+            l_s,
+            l_v,
+            r.as_string_view(),
+            r_s,
+            r_v,
+        ),
         (l_t, r_t) => Err(ArrowError::InvalidArgumentError(format!(
             "Invalid string operation: {l_t} {op} {r_t}"
         ))),
     }
 }
 
-fn apply<O: OffsetSizeTrait>(
+/// A trait for Arrow String Arrays, currently three types are supported:
+/// - `StringArray`
+/// - `LargeStringArray`
+/// - `StringViewArray`
+///
+/// This trait helps to abstract over the different types of string arrays
+/// so that we don't need to duplicate the implementation for each type.
+trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
+    fn is_ascii(&self) -> bool;
+    fn iter(&self) -> ArrayIter<Self>;
+}
+
+impl<'a, O: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<O> {
+    fn is_ascii(&self) -> bool {
+        GenericStringArray::<O>::is_ascii(self)
+    }
+
+    fn iter(&self) -> ArrayIter<Self> {
+        GenericStringArray::<O>::iter(self)
+    }
+}
+impl<'a> StringArrayType<'a> for &'a StringViewArray {
+    fn is_ascii(&self) -> bool {
+        StringViewArray::is_ascii(self)
+    }
+
+    fn iter(&self) -> ArrayIter<Self> {
+        StringViewArray::iter(self)
+    }
+}
+
+fn apply<'a, T: StringArrayType<'a> + 'a>(
     op: Op,
-    l: &GenericStringArray<O>,
+    l: T,
     l_s: bool,
-    l_v: Option<&dyn AnyDictionaryArray>,
-    r: &GenericStringArray<O>,
+    l_v: Option<&'a dyn AnyDictionaryArray>,
+    r: T,
     r_s: bool,
-    r_v: Option<&dyn AnyDictionaryArray>,
+    r_v: Option<&'a dyn AnyDictionaryArray>,
 ) -> Result<BooleanArray, ArrowError> {
     let l_len = l_v.map(|l| l.len()).unwrap_or(l.len());
     if r_s {
@@ -155,7 +198,7 @@ fn apply<O: OffsetSizeTrait>(
         if r.is_null(idx) {
             return Ok(BooleanArray::new_null(l_len));
         }
-        op_scalar(op, l, l_v, r.value(idx))
+        op_scalar::<T>(op, l, l_v, r.value(idx))
     } else {
         match (l_s, l_v, r_v) {
             (true, None, None) => {
@@ -187,9 +230,9 @@ fn apply<O: OffsetSizeTrait>(
 }
 
 #[inline(never)]
-fn op_scalar<O: OffsetSizeTrait>(
+fn op_scalar<'a, T: StringArrayType<'a>>(
     op: Op,
-    l: &GenericStringArray<O>,
+    l: T,
     l_v: Option<&dyn AnyDictionaryArray>,
     r: &str,
 ) -> Result<BooleanArray, ArrowError> {
@@ -207,8 +250,8 @@ fn op_scalar<O: OffsetSizeTrait>(
     })
 }
 
-fn vectored_iter<'a, O: OffsetSizeTrait>(
-    a: &'a GenericStringArray<O>,
+fn vectored_iter<'a, T: StringArrayType<'a> + 'a>(
+    a: T,
     a_v: &'a dyn AnyDictionaryArray,
 ) -> impl Iterator<Item = Option<&'a str>> + 'a {
     let nulls = a_v.nulls();
@@ -373,24 +416,33 @@ mod tests {
     use super::*;
     use arrow_array::types::Int8Type;
 
+    /// Applying `op(left, right)`, both sides are arrays
+    /// The macro tests four types of array implementations:
+    /// - `StringArray`
+    /// - `LargeStringArray`
+    /// - `StringViewArray`
+    /// - `DictionaryArray`
     macro_rules! test_utf8 {
         ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
             #[test]
             fn $test_name() {
                 let expected = BooleanArray::from($expected);
+
                 let left = StringArray::from($left);
                 let right = StringArray::from($right);
                 let res = $op(&left, &right).unwrap();
                 assert_eq!(res, expected);
-            }
-        };
-    }
 
-    macro_rules! test_dict_utf8 {
-        ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
-            #[test]
-            fn $test_name() {
-                let expected = BooleanArray::from($expected);
+                let left = LargeStringArray::from($left);
+                let right = LargeStringArray::from($right);
+                let res = $op(&left, &right).unwrap();
+                assert_eq!(res, expected);
+
+                let left = StringViewArray::from($left);
+                let right = StringViewArray::from($right);
+                let res = $op(&left, &right).unwrap();
+                assert_eq!(res, expected);
+
                 let left: DictionaryArray<Int8Type> = $left.into_iter().collect();
                 let right: DictionaryArray<Int8Type> = $right.into_iter().collect();
                 let res = $op(&left, &right).unwrap();
@@ -399,6 +451,12 @@ mod tests {
         };
     }
 
+    /// Applying `op(left, right)`, left side is array, right side is scalar
+    /// The macro tests four types of array implementations:
+    /// - `StringArray`
+    /// - `LargeStringArray`
+    /// - `StringViewArray`
+    /// - `DictionaryArray`
     macro_rules! test_utf8_scalar {
         ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
             #[test]
@@ -406,351 +464,420 @@ mod tests {
                 let expected = BooleanArray::from($expected);
 
                 let left = StringArray::from($left);
-                let res = $op(&left, $right).unwrap();
+                let right = StringArray::from_iter_values([$right]);
+                let res = $op(&left, &Scalar::new(&right)).unwrap();
                 assert_eq!(res, expected);
 
                 let left = LargeStringArray::from($left);
-                let res = $op(&left, $right).unwrap();
+                let right = LargeStringArray::from_iter_values([$right]);
+                let res = $op(&left, &Scalar::new(&right)).unwrap();
+                assert_eq!(res, expected);
+
+                let left = StringViewArray::from($left);
+                let right = StringViewArray::from_iter_values([$right]);
+                let res = $op(&left, &Scalar::new(&right)).unwrap();
+                assert_eq!(res, expected);
+
+                let left: DictionaryArray<Int8Type> = $left.into_iter().collect();
+                let right: DictionaryArray<Int8Type> = [$right].into_iter().collect();
+                let res = $op(&left, &Scalar::new(&right)).unwrap();
                 assert_eq!(res, expected);
             }
         };
-        ($test_name:ident, $test_name_dyn:ident, $left:expr, $right:expr, $op:expr, $op_dyn:expr, $expected:expr) => {
-            test_utf8_scalar!($test_name, $left, $right, $op, $expected);
-            test_utf8_scalar!($test_name_dyn, $left, $right, $op_dyn, $expected);
-        };
     }
 
     test_utf8!(
         test_utf8_array_like,
-        vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"],
-        vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"],
-        like_utf8,
-        vec![true, true, true, false, false, true, false, false]
-    );
-
-    test_dict_utf8!(
-        test_utf8_array_like_dict,
-        vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"],
+        vec![
+            "arrow",
+            "arrow_long_string_more than 12 bytes",
+            "arrow",
+            "arrow",
+            "arrow",
+            "arrows",
+            "arrow",
+            "arrow"
+        ],
         vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"],
-        like_dyn,
+        like,
         vec![true, true, true, false, false, true, false, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_like_scalar_escape_testing,
-        test_utf8_array_like_scalar_dyn_escape_testing,
-        vec!["varchar(255)", "int(255)", "varchar", "int"],
+        vec![
+            "varchar(255)",
+            "int(255)longer than 12 bytes",
+            "varchar",
+            "int"
+        ],
         "%(%)%",
-        like_utf8_scalar,
-        like_utf8_scalar_dyn,
+        like,
         vec![true, true, false, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_like_scalar_escape_regex,
-        test_utf8_array_like_scalar_dyn_escape_regex,
         vec![".*", "a", "*"],
         ".*",
-        like_utf8_scalar,
-        like_utf8_scalar_dyn,
+        like,
         vec![true, false, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_like_scalar_escape_regex_dot,
-        test_utf8_array_like_scalar_dyn_escape_regex_dot,
         vec![".", "a", "*"],
         ".",
-        like_utf8_scalar,
-        like_utf8_scalar_dyn,
+        like,
         vec![true, false, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_like_scalar,
-        test_utf8_array_like_scalar_dyn,
-        vec!["arrow", "parquet", "datafusion", "flight"],
+        vec![
+            "arrow",
+            "parquet",
+            "datafusion",
+            "flight",
+            "long string arrow test 12 bytes"
+        ],
         "%ar%",
-        like_utf8_scalar,
-        like_utf8_scalar_dyn,
-        vec![true, true, false, false]
+        like,
+        vec![true, true, false, false, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_like_scalar_start,
-        test_utf8_array_like_scalar_dyn_start,
-        vec!["arrow", "parrow", "arrows", "arr"],
+        vec![
+            "arrow",
+            "parrow",
+            "arrows",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "arrow%",
-        like_utf8_scalar,
-        like_utf8_scalar_dyn,
-        vec![true, false, true, false]
+        like,
+        vec![true, false, true, false, true]
     );
 
     // Replicates `test_utf8_array_like_scalar_start` `test_utf8_array_like_scalar_dyn_start` to
     // demonstrate that `SQL STARTSWITH` works as expected.
     test_utf8_scalar!(
         test_utf8_array_starts_with_scalar_start,
-        test_utf8_array_starts_with_scalar_dyn_start,
-        vec!["arrow", "parrow", "arrows", "arr"],
+        vec![
+            "arrow",
+            "parrow",
+            "arrows",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "arrow",
-        starts_with_utf8_scalar,
-        starts_with_utf8_scalar_dyn,
-        vec![true, false, true, false]
+        starts_with,
+        vec![true, false, true, false, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_like_scalar_end,
-        test_utf8_array_like_scalar_dyn_end,
-        vec!["arrow", "parrow", "arrows", "arr"],
+        vec![
+            "arrow",
+            "parrow",
+            "arrows",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "%arrow",
-        like_utf8_scalar,
-        like_utf8_scalar_dyn,
-        vec![true, true, false, false]
+        like,
+        vec![true, true, false, false, false]
     );
 
     // Replicates `test_utf8_array_like_scalar_end` `test_utf8_array_like_scalar_dyn_end` to
     // demonstrate that `SQL ENDSWITH` works as expected.
     test_utf8_scalar!(
         test_utf8_array_ends_with_scalar_end,
-        test_utf8_array_ends_with_scalar_dyn_end,
-        vec!["arrow", "parrow", "arrows", "arr"],
+        vec![
+            "arrow",
+            "parrow",
+            "arrows",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "arrow",
-        ends_with_utf8_scalar,
-        ends_with_utf8_scalar_dyn,
-        vec![true, true, false, false]
+        ends_with,
+        vec![true, true, false, false, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_like_scalar_equals,
-        test_utf8_array_like_scalar_dyn_equals,
-        vec!["arrow", "parrow", "arrows", "arr"],
+        vec![
+            "arrow",
+            "parrow",
+            "arrows",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "arrow",
-        like_utf8_scalar,
-        like_utf8_scalar_dyn,
-        vec![true, false, false, false]
+        like,
+        vec![true, false, false, false, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_like_scalar_one,
-        test_utf8_array_like_scalar_dyn_one,
-        vec!["arrow", "arrows", "parrow", "arr"],
+        vec![
+            "arrow",
+            "arrows",
+            "parrow",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "arrow_",
-        like_utf8_scalar,
-        like_utf8_scalar_dyn,
-        vec![false, true, false, false]
+        like,
+        vec![false, true, false, false, false]
     );
 
     test_utf8_scalar!(
         test_utf8_scalar_like_escape,
-        test_utf8_scalar_like_dyn_escape,
-        vec!["a%", "a\\x"],
+        vec!["a%", "a\\x", "arrow long string longer than 12 bytes"],
         "a\\%",
-        like_utf8_scalar,
-        like_utf8_scalar_dyn,
-        vec![true, false]
+        like,
+        vec![true, false, false]
     );
 
     test_utf8_scalar!(
         test_utf8_scalar_like_escape_contains,
-        test_utf8_scalar_like_dyn_escape_contains,
-        vec!["ba%", "ba\\x"],
+        vec!["ba%", "ba\\x", "arrow long string longer than 12 bytes"],
         "%a\\%",
-        like_utf8_scalar,
-        like_utf8_scalar_dyn,
-        vec![true, false]
+        like,
+        vec![true, false, false]
     );
 
     test_utf8!(
         test_utf8_scalar_ilike_regex,
         vec!["%%%"],
         vec![r"\%_\%"],
-        ilike_utf8,
-        vec![true]
-    );
-
-    test_dict_utf8!(
-        test_utf8_scalar_ilike_regex_dict,
-        vec!["%%%"],
-        vec![r"\%_\%"],
-        ilike_dyn,
+        ilike,
         vec![true]
     );
 
     test_utf8!(
         test_utf8_array_nlike,
-        vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
-        vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
-        nlike_utf8,
-        vec![false, false, false, true, true, false, true]
-    );
-
-    test_dict_utf8!(
-        test_utf8_array_nlike_dict,
-        vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
+        vec![
+            "arrow",
+            "arrow",
+            "arrow long string longer than 12 bytes",
+            "arrow",
+            "arrow",
+            "arrows",
+            "arrow"
+        ],
         vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
-        nlike_dyn,
+        nlike,
         vec![false, false, false, true, true, false, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nlike_escape_testing,
-        test_utf8_array_nlike_escape_dyn_testing_dyn,
-        vec!["varchar(255)", "int(255)", "varchar", "int"],
+        vec![
+            "varchar(255)",
+            "int(255) arrow long string longer than 12 bytes",
+            "varchar",
+            "int"
+        ],
         "%(%)%",
-        nlike_utf8_scalar,
-        nlike_utf8_scalar_dyn,
+        nlike,
         vec![false, false, true, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nlike_scalar_escape_regex,
-        test_utf8_array_nlike_scalar_dyn_escape_regex,
         vec![".*", "a", "*"],
         ".*",
-        nlike_utf8_scalar,
-        nlike_utf8_scalar_dyn,
+        nlike,
         vec![false, true, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nlike_scalar_escape_regex_dot,
-        test_utf8_array_nlike_scalar_dyn_escape_regex_dot,
         vec![".", "a", "*"],
         ".",
-        nlike_utf8_scalar,
-        nlike_utf8_scalar_dyn,
+        nlike,
         vec![false, true, true]
     );
     test_utf8_scalar!(
         test_utf8_array_nlike_scalar,
-        test_utf8_array_nlike_scalar_dyn,
-        vec!["arrow", "parquet", "datafusion", "flight"],
+        vec![
+            "arrow",
+            "parquet",
+            "datafusion",
+            "flight",
+            "arrow long string longer than 12 bytes"
+        ],
         "%ar%",
-        nlike_utf8_scalar,
-        nlike_utf8_scalar_dyn,
-        vec![false, false, true, true]
+        nlike,
+        vec![false, false, true, true, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nlike_scalar_start,
-        test_utf8_array_nlike_scalar_dyn_start,
-        vec!["arrow", "parrow", "arrows", "arr"],
+        vec![
+            "arrow",
+            "parrow",
+            "arrows",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "arrow%",
-        nlike_utf8_scalar,
-        nlike_utf8_scalar_dyn,
-        vec![false, true, false, true]
+        nlike,
+        vec![false, true, false, true, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nlike_scalar_end,
-        test_utf8_array_nlike_scalar_dyn_end,
-        vec!["arrow", "parrow", "arrows", "arr"],
+        vec![
+            "arrow",
+            "parrow",
+            "arrows",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "%arrow",
-        nlike_utf8_scalar,
-        nlike_utf8_scalar_dyn,
-        vec![false, false, true, true]
+        nlike,
+        vec![false, false, true, true, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nlike_scalar_equals,
-        test_utf8_array_nlike_scalar_dyn_equals,
-        vec!["arrow", "parrow", "arrows", "arr"],
+        vec![
+            "arrow",
+            "parrow",
+            "arrows",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "arrow",
-        nlike_utf8_scalar,
-        nlike_utf8_scalar_dyn,
-        vec![false, true, true, true]
+        nlike,
+        vec![false, true, true, true, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nlike_scalar_one,
-        test_utf8_array_nlike_scalar_dyn_one,
-        vec!["arrow", "arrows", "parrow", "arr"],
+        vec![
+            "arrow",
+            "arrows",
+            "parrow",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "arrow_",
-        nlike_utf8_scalar,
-        nlike_utf8_scalar_dyn,
-        vec![true, false, true, true]
+        nlike,
+        vec![true, false, true, true, true]
     );
 
     test_utf8!(
         test_utf8_array_ilike,
-        vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
-        vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
-        ilike_utf8,
-        vec![true, true, true, false, false, true, false]
-    );
-
-    test_dict_utf8!(
-        test_utf8_array_ilike_dict,
-        vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+        vec![
+            "arrow",
+            "arrow",
+            "ARROW long string longer than 12 bytes",
+            "arrow",
+            "ARROW",
+            "ARROWS",
+            "arROw"
+        ],
         vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
-        ilike_dyn,
+        ilike,
         vec![true, true, true, false, false, true, false]
     );
 
     test_utf8_scalar!(
         ilike_utf8_scalar_escape_testing,
-        ilike_utf8_scalar_escape_dyn_testing,
-        vec!["varchar(255)", "int(255)", "varchar", "int"],
+        vec![
+            "varchar(255)",
+            "int(255) long string longer than 12 bytes",
+            "varchar",
+            "int"
+        ],
         "%(%)%",
-        ilike_utf8_scalar,
-        ilike_utf8_scalar_dyn,
+        ilike,
         vec![true, true, false, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_ilike_scalar,
-        test_utf8_array_ilike_dyn_scalar,
-        vec!["arrow", "parquet", "datafusion", "flight"],
+        vec![
+            "arrow",
+            "parquet",
+            "datafusion",
+            "flight",
+            "arrow long string longer than 12 bytes"
+        ],
         "%AR%",
-        ilike_utf8_scalar,
-        ilike_utf8_scalar_dyn,
-        vec![true, true, false, false]
+        ilike,
+        vec![true, true, false, false, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_ilike_scalar_start,
-        test_utf8_array_ilike_scalar_dyn_start,
-        vec!["arrow", "parrow", "arrows", "ARR"],
+        vec![
+            "arrow",
+            "parrow",
+            "arrows",
+            "ARR",
+            "arrow long string longer than 12 bytes"
+        ],
         "aRRow%",
-        ilike_utf8_scalar,
-        ilike_utf8_scalar_dyn,
-        vec![true, false, true, false]
+        ilike,
+        vec![true, false, true, false, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_ilike_scalar_end,
-        test_utf8_array_ilike_scalar_dyn_end,
-        vec!["ArroW", "parrow", "ARRowS", "arr"],
+        vec![
+            "ArroW",
+            "parrow",
+            "ARRowS",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "%arrow",
-        ilike_utf8_scalar,
-        ilike_utf8_scalar_dyn,
-        vec![true, true, false, false]
+        ilike,
+        vec![true, true, false, false, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_ilike_scalar_equals,
-        test_utf8_array_ilike_scalar_dyn_equals,
-        vec!["arrow", "parrow", "arrows", "arr"],
+        vec![
+            "arrow",
+            "parrow",
+            "arrows",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "Arrow",
-        ilike_utf8_scalar,
-        ilike_utf8_scalar_dyn,
-        vec![true, false, false, false]
+        ilike,
+        vec![true, false, false, false, false]
     );
 
     // We only implement loose matching
     test_utf8_scalar!(
         test_utf8_array_ilike_unicode,
-        test_utf8_array_ilike_unicode_dyn,
-        vec!["FFkoß", "FFkoSS", "FFkoss", "FFkoS", "FFkos", "ffkoSS", "ffkoß", "FFKoSS"],
+        vec![
+            "FFkoß",
+            "FFkoSS",
+            "FFkoss",
+            "FFkoS",
+            "FFkos",
+            "ffkoSS",
+            "ffkoß",
+            "FFKoSS",
+            "longer than 12 bytes FFKoSS"
+        ],
         "FFkoSS",
-        ilike_utf8_scalar,
-        ilike_utf8_scalar_dyn,
-        vec![false, true, true, false, false, false, false, true]
+        ilike,
+        vec![false, true, true, false, false, false, false, true, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_ilike_unicode_starts,
-        test_utf8_array_ilike_unicode_start_dyn,
         vec![
             "FFkoßsdlkdf",
             "FFkoSSsdlkdf",
@@ -761,16 +888,15 @@ mod tests {
             "ffkoß",
             "FfkosSsdfd",
             "FFKoSS",
+            "longer than 12 bytes FFKoSS",
         ],
         "FFkoSS%",
-        ilike_utf8_scalar,
-        ilike_utf8_scalar_dyn,
-        vec![false, true, true, false, false, false, false, true, true]
+        ilike,
+        vec![false, true, true, false, false, false, false, true, true, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_ilike_unicode_ends,
-        test_utf8_array_ilike_unicode_ends_dyn,
         vec![
             "sdlkdfFFkoß",
             "sdlkdfFFkoSS",
@@ -781,16 +907,15 @@ mod tests {
             "ffkoß",
             "h😃klFfkosS",
             "FFKoSS",
+            "longer than 12 bytes FFKoSS",
         ],
         "%FFkoSS",
-        ilike_utf8_scalar,
-        ilike_utf8_scalar_dyn,
-        vec![false, true, true, false, false, false, false, true, true]
+        ilike,
+        vec![false, true, true, false, false, false, false, true, true, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_ilike_unicode_contains,
-        test_utf8_array_ilike_unicode_contains_dyn,
         vec![
             "sdlkdfFkoßsdfs",
             "sdlkdfFkoSSdggs",
@@ -802,11 +927,11 @@ mod tests {
             "😃sadlksffkosSsh😃klF",
             "😱slgffkosSsh😃klF",
             "FFKoSS",
+            "longer than 12 bytes FFKoSS",
         ],
         "%FFkoSS%",
-        ilike_utf8_scalar,
-        ilike_utf8_scalar_dyn,
-        vec![false, true, true, false, false, false, false, true, true, true]
+        ilike,
+        vec![false, true, true, false, false, false, false, true, true, true, true]
     );
 
     // Replicates `test_utf8_array_ilike_unicode_contains` and
@@ -816,7 +941,6 @@ mod tests {
     // NOTE: 5 of the values were changed because the original used a case insensitive `ilike`.
     test_utf8_scalar!(
         test_utf8_array_contains_unicode_contains,
-        test_utf8_array_contains_unicode_contains_dyn,
         vec![
             "sdlkdfFkoßsdfs",
             "sdlkdFFkoSSdggs", // Original was case insensitive 
"sdlkdfFkoSSdggs"
@@ -828,16 +952,15 @@ mod tests {
             "😃sadlksFFkoSSsh😃klF", // Original was case insensitive "😃sadlksffkosSsh😃klF"
             "😱slgFFkoSSsh😃klF",    // Original was case insensitive "😱slgffkosSsh😃klF"
             "FFkoSS",                // "FFKoSS"
+            "longer than 12 bytes FFKoSS",
         ],
         "FFkoSS",
-        contains_utf8_scalar,
-        contains_utf8_scalar_dyn,
-        vec![false, true, true, false, false, false, false, true, true, true]
+        contains,
+        vec![false, true, true, false, false, false, false, true, true, true, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_ilike_unicode_complex,
-        test_utf8_array_ilike_unicode_complex_dyn,
         vec![
             "sdlkdfFooßsdfs",
             "sdlkdfFooSSdggs",
@@ -849,97 +972,124 @@ mod tests {
             "😃sadlksffofsSsh😃klF",
             "😱slgffoesSsh😃klF",
             "FFKoSS",
+            "longer than 12 bytes FFKoSS",
         ],
         "%FF__SS%",
-        ilike_utf8_scalar,
-        ilike_utf8_scalar_dyn,
-        vec![false, true, true, false, false, false, false, true, true, true]
+        ilike,
+        vec![false, true, true, false, false, false, false, true, true, true, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_ilike_scalar_one,
-        test_utf8_array_ilike_scalar_dyn_one,
-        vec!["arrow", "arrows", "parrow", "arr"],
+        vec![
+            "arrow",
+            "arrows",
+            "parrow",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "arrow_",
-        ilike_utf8_scalar,
-        ilike_utf8_scalar_dyn,
-        vec![false, true, false, false]
+        ilike,
+        vec![false, true, false, false, false]
     );
 
     test_utf8!(
         test_utf8_array_nilike,
-        vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
-        vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
-        nilike_utf8,
-        vec![false, false, false, true, true, false, true]
-    );
-
-    test_dict_utf8!(
-        test_utf8_array_nilike_dict,
-        vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+        vec![
+            "arrow",
+            "arrow",
+            "ARROW longer than 12 bytes string",
+            "arrow",
+            "ARROW",
+            "ARROWS",
+            "arROw"
+        ],
         vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
-        nilike_dyn,
+        nilike,
         vec![false, false, false, true, true, false, true]
     );
 
     test_utf8_scalar!(
         nilike_utf8_scalar_escape_testing,
-        nilike_utf8_scalar_escape_dyn_testing,
-        vec!["varchar(255)", "int(255)", "varchar", "int"],
+        vec![
+            "varchar(255)",
+            "int(255) longer than 12 bytes string",
+            "varchar",
+            "int"
+        ],
         "%(%)%",
-        nilike_utf8_scalar,
-        nilike_utf8_scalar_dyn,
+        nilike,
         vec![false, false, true, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nilike_scalar,
-        test_utf8_array_nilike_dyn_scalar,
-        vec!["arrow", "parquet", "datafusion", "flight"],
+        vec![
+            "arrow",
+            "parquet",
+            "datafusion",
+            "flight",
+            "arrow long string longer than 12 bytes"
+        ],
         "%AR%",
-        nilike_utf8_scalar,
-        nilike_utf8_scalar_dyn,
-        vec![false, false, true, true]
+        nilike,
+        vec![false, false, true, true, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nilike_scalar_start,
-        test_utf8_array_nilike_scalar_dyn_start,
-        vec!["arrow", "parrow", "arrows", "ARR"],
+        vec![
+            "arrow",
+            "parrow",
+            "arrows",
+            "ARR",
+            "arrow long string longer than 12 bytes"
+        ],
         "aRRow%",
-        nilike_utf8_scalar,
-        nilike_utf8_scalar_dyn,
-        vec![false, true, false, true]
+        nilike,
+        vec![false, true, false, true, false]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nilike_scalar_end,
-        test_utf8_array_nilike_scalar_dyn_end,
-        vec!["ArroW", "parrow", "ARRowS", "arr"],
+        vec![
+            "ArroW",
+            "parrow",
+            "ARRowS",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "%arrow",
-        nilike_utf8_scalar,
-        nilike_utf8_scalar_dyn,
-        vec![false, false, true, true]
+        nilike,
+        vec![false, false, true, true, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nilike_scalar_equals,
-        test_utf8_array_nilike_scalar_dyn_equals,
-        vec!["arRow", "parrow", "arrows", "arr"],
+        vec![
+            "arRow",
+            "parrow",
+            "arrows",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "Arrow",
-        nilike_utf8_scalar,
-        nilike_utf8_scalar_dyn,
-        vec![false, true, true, true]
+        nilike,
+        vec![false, true, true, true, true]
     );
 
     test_utf8_scalar!(
         test_utf8_array_nilike_scalar_one,
-        test_utf8_array_nilike_scalar_dyn_one,
-        vec!["arrow", "arrows", "parrow", "arr"],
+        vec![
+            "arrow",
+            "arrows",
+            "parrow",
+            "arr",
+            "arrow long string longer than 12 bytes"
+        ],
         "arrow_",
-        nilike_utf8_scalar,
-        nilike_utf8_scalar_dyn,
-        vec![true, false, true, true]
+        nilike,
+        vec![true, false, true, true, true]
     );
 
     #[test]
diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
index 54ecf42e368..01e3710a6d0 100644
--- a/arrow-string/src/predicate.rs
+++ b/arrow-string/src/predicate.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow_array::{BooleanArray, GenericStringArray, OffsetSizeTrait};
+use arrow_array::{ArrayAccessor, BooleanArray};
 use arrow_schema::ArrowError;
 use memchr::memchr2;
 use regex::{Regex, RegexBuilder};
@@ -95,11 +95,10 @@ impl<'a> Predicate<'a> {
     ///
     /// If `negate` is true the result of the predicate will be negated
     #[inline(never)]
-    pub fn evaluate_array<O: OffsetSizeTrait>(
-        &self,
-        array: &GenericStringArray<O>,
-        negate: bool,
-    ) -> BooleanArray {
+    pub fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray
+    where
+        T: ArrayAccessor<Item = &'i str>,
+    {
         match self {
             Predicate::Eq(v) => BooleanArray::from_unary(array, |haystack| {
                 (haystack.len() == v.len() && haystack == *v) != negate

Reply via email to