This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 66bada54cf5 Implement like/ilike etc for StringViewArray (#5931)
66bada54cf5 is described below
commit 66bada54cf55703fa11bd4bf195f47fb9df714c9
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Mon Jun 24 14:02:45 2024 -0700
Implement like/ilike etc for StringViewArray (#5931)
* like for string view array
* fix bug
* update doc
* update tests
---
arrow-array/src/array/byte_view_array.rs | 14 +
arrow-string/src/like.rs | 646 +++++++++++++++++++------------
arrow-string/src/predicate.rs | 11 +-
3 files changed, 417 insertions(+), 254 deletions(-)
diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs
index f31bc1c785b..dc4cbe6834c 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -569,6 +569,20 @@ impl StringViewArray {
pub fn to_binary_view(self) -> BinaryViewArray {
unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers,
self.nulls) }
}
+
+ /// Returns true if all data within this array is ASCII
+ pub fn is_ascii(&self) -> bool {
+ // Alternative (but incorrect): directly check the underlying buffers.
+ // (1) Our string view might be sparse, i.e., it may reference only a subset of the buffers,
+ // so even if a buffer is not ASCII, the array itself can still be ASCII.
+ // (2) It is quite difficult to know the range of each buffer (unlike StringArray).
+ // This means that this operation is quite expensive. Should we cache the result,
+ // e.g. by tracking `is_ascii` in the builder?
+ self.iter().all(|v| match v {
+ Some(v) => v.is_ascii(),
+ None => true,
+ })
+ }
}
impl From<Vec<&str>> for StringViewArray {
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index 6f6dfe03133..49831092ffc 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -20,6 +20,7 @@ use arrow_array::cast::AsArray;
use arrow_array::*;
use arrow_schema::*;
use arrow_select::take::take;
+use iterator::ArrayIter;
use std::sync::Arc;
#[derive(Debug)]
@@ -126,24 +127,66 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) ->
Result<BooleanArray, Arr
let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r);
match (l.data_type(), r.data_type()) {
- (Utf8, Utf8) => apply::<i32>(op, l.as_string(), l_s, l_v,
r.as_string(), r_s, r_v),
+ (Utf8, Utf8) => {
+ apply::<&GenericStringArray<i32>>(op, l.as_string(), l_s, l_v,
r.as_string(), r_s, r_v)
+ }
(LargeUtf8, LargeUtf8) => {
- apply::<i64>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v)
+ apply::<&GenericStringArray<i64>>(op, l.as_string(), l_s, l_v,
r.as_string(), r_s, r_v)
}
+ (Utf8View, Utf8View) => apply::<&StringViewArray>(
+ op,
+ l.as_string_view(),
+ l_s,
+ l_v,
+ r.as_string_view(),
+ r_s,
+ r_v,
+ ),
(l_t, r_t) => Err(ArrowError::InvalidArgumentError(format!(
"Invalid string operation: {l_t} {op} {r_t}"
))),
}
}
-fn apply<O: OffsetSizeTrait>(
+/// A trait for Arrow String Arrays, currently three types are supported:
+/// - `StringArray`
+/// - `LargeStringArray`
+/// - `StringViewArray`
+///
+/// This trait helps to abstract over the different types of string arrays
+/// so that we don't need to duplicate the implementation for each type.
+trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
+ fn is_ascii(&self) -> bool;
+ fn iter(&self) -> ArrayIter<Self>;
+}
+
+impl<'a, O: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<O>
{
+ fn is_ascii(&self) -> bool {
+ GenericStringArray::<O>::is_ascii(self)
+ }
+
+ fn iter(&self) -> ArrayIter<Self> {
+ GenericStringArray::<O>::iter(self)
+ }
+}
+impl<'a> StringArrayType<'a> for &'a StringViewArray {
+ fn is_ascii(&self) -> bool {
+ StringViewArray::is_ascii(self)
+ }
+
+ fn iter(&self) -> ArrayIter<Self> {
+ StringViewArray::iter(self)
+ }
+}
+
+fn apply<'a, T: StringArrayType<'a> + 'a>(
op: Op,
- l: &GenericStringArray<O>,
+ l: T,
l_s: bool,
- l_v: Option<&dyn AnyDictionaryArray>,
- r: &GenericStringArray<O>,
+ l_v: Option<&'a dyn AnyDictionaryArray>,
+ r: T,
r_s: bool,
- r_v: Option<&dyn AnyDictionaryArray>,
+ r_v: Option<&'a dyn AnyDictionaryArray>,
) -> Result<BooleanArray, ArrowError> {
let l_len = l_v.map(|l| l.len()).unwrap_or(l.len());
if r_s {
@@ -155,7 +198,7 @@ fn apply<O: OffsetSizeTrait>(
if r.is_null(idx) {
return Ok(BooleanArray::new_null(l_len));
}
- op_scalar(op, l, l_v, r.value(idx))
+ op_scalar::<T>(op, l, l_v, r.value(idx))
} else {
match (l_s, l_v, r_v) {
(true, None, None) => {
@@ -187,9 +230,9 @@ fn apply<O: OffsetSizeTrait>(
}
#[inline(never)]
-fn op_scalar<O: OffsetSizeTrait>(
+fn op_scalar<'a, T: StringArrayType<'a>>(
op: Op,
- l: &GenericStringArray<O>,
+ l: T,
l_v: Option<&dyn AnyDictionaryArray>,
r: &str,
) -> Result<BooleanArray, ArrowError> {
@@ -207,8 +250,8 @@ fn op_scalar<O: OffsetSizeTrait>(
})
}
-fn vectored_iter<'a, O: OffsetSizeTrait>(
- a: &'a GenericStringArray<O>,
+fn vectored_iter<'a, T: StringArrayType<'a> + 'a>(
+ a: T,
a_v: &'a dyn AnyDictionaryArray,
) -> impl Iterator<Item = Option<&'a str>> + 'a {
let nulls = a_v.nulls();
@@ -373,24 +416,33 @@ mod tests {
use super::*;
use arrow_array::types::Int8Type;
+ /// Applies `op(left, right)` where both sides are arrays.
+ /// The macro tests four types of array implementations:
+ /// - `StringArray`
+ /// - `LargeStringArray`
+ /// - `StringViewArray`
+ /// - `DictionaryArray`
macro_rules! test_utf8 {
($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr)
=> {
#[test]
fn $test_name() {
let expected = BooleanArray::from($expected);
+
let left = StringArray::from($left);
let right = StringArray::from($right);
let res = $op(&left, &right).unwrap();
assert_eq!(res, expected);
- }
- };
- }
- macro_rules! test_dict_utf8 {
- ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr)
=> {
- #[test]
- fn $test_name() {
- let expected = BooleanArray::from($expected);
+ let left = LargeStringArray::from($left);
+ let right = LargeStringArray::from($right);
+ let res = $op(&left, &right).unwrap();
+ assert_eq!(res, expected);
+
+ let left = StringViewArray::from($left);
+ let right = StringViewArray::from($right);
+ let res = $op(&left, &right).unwrap();
+ assert_eq!(res, expected);
+
let left: DictionaryArray<Int8Type> =
$left.into_iter().collect();
let right: DictionaryArray<Int8Type> =
$right.into_iter().collect();
let res = $op(&left, &right).unwrap();
@@ -399,6 +451,12 @@ mod tests {
};
}
+ /// Applies `op(left, right)` where the left side is an array and the right side is a scalar.
+ /// The macro tests four types of array implementations:
+ /// - `StringArray`
+ /// - `LargeStringArray`
+ /// - `StringViewArray`
+ /// - `DictionaryArray`
macro_rules! test_utf8_scalar {
($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr)
=> {
#[test]
@@ -406,351 +464,420 @@ mod tests {
let expected = BooleanArray::from($expected);
let left = StringArray::from($left);
- let res = $op(&left, $right).unwrap();
+ let right = StringArray::from_iter_values([$right]);
+ let res = $op(&left, &Scalar::new(&right)).unwrap();
assert_eq!(res, expected);
let left = LargeStringArray::from($left);
- let res = $op(&left, $right).unwrap();
+ let right = LargeStringArray::from_iter_values([$right]);
+ let res = $op(&left, &Scalar::new(&right)).unwrap();
+ assert_eq!(res, expected);
+
+ let left = StringViewArray::from($left);
+ let right = StringViewArray::from_iter_values([$right]);
+ let res = $op(&left, &Scalar::new(&right)).unwrap();
+ assert_eq!(res, expected);
+
+ let left: DictionaryArray<Int8Type> =
$left.into_iter().collect();
+ let right: DictionaryArray<Int8Type> =
[$right].into_iter().collect();
+ let res = $op(&left, &Scalar::new(&right)).unwrap();
assert_eq!(res, expected);
}
};
- ($test_name:ident, $test_name_dyn:ident, $left:expr, $right:expr,
$op:expr, $op_dyn:expr, $expected:expr) => {
- test_utf8_scalar!($test_name, $left, $right, $op, $expected);
- test_utf8_scalar!($test_name_dyn, $left, $right, $op_dyn,
$expected);
- };
}
test_utf8!(
test_utf8_array_like,
- vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow",
"arrow"],
- vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"],
- like_utf8,
- vec![true, true, true, false, false, true, false, false]
- );
-
- test_dict_utf8!(
- test_utf8_array_like_dict,
- vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow",
"arrow"],
+ vec![
+ "arrow",
+ "arrow_long_string_more than 12 bytes",
+ "arrow",
+ "arrow",
+ "arrow",
+ "arrows",
+ "arrow",
+ "arrow"
+ ],
vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"],
- like_dyn,
+ like,
vec![true, true, true, false, false, true, false, false]
);
test_utf8_scalar!(
test_utf8_array_like_scalar_escape_testing,
- test_utf8_array_like_scalar_dyn_escape_testing,
- vec!["varchar(255)", "int(255)", "varchar", "int"],
+ vec![
+ "varchar(255)",
+ "int(255)longer than 12 bytes",
+ "varchar",
+ "int"
+ ],
"%(%)%",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
+ like,
vec![true, true, false, false]
);
test_utf8_scalar!(
test_utf8_array_like_scalar_escape_regex,
- test_utf8_array_like_scalar_dyn_escape_regex,
vec![".*", "a", "*"],
".*",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
+ like,
vec![true, false, false]
);
test_utf8_scalar!(
test_utf8_array_like_scalar_escape_regex_dot,
- test_utf8_array_like_scalar_dyn_escape_regex_dot,
vec![".", "a", "*"],
".",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
+ like,
vec![true, false, false]
);
test_utf8_scalar!(
test_utf8_array_like_scalar,
- test_utf8_array_like_scalar_dyn,
- vec!["arrow", "parquet", "datafusion", "flight"],
+ vec![
+ "arrow",
+ "parquet",
+ "datafusion",
+ "flight",
+ "long string arrow test 12 bytes"
+ ],
"%ar%",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, true, false, false]
+ like,
+ vec![true, true, false, false, true]
);
test_utf8_scalar!(
test_utf8_array_like_scalar_start,
- test_utf8_array_like_scalar_dyn_start,
- vec!["arrow", "parrow", "arrows", "arr"],
+ vec![
+ "arrow",
+ "parrow",
+ "arrows",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"arrow%",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, false, true, false]
+ like,
+ vec![true, false, true, false, true]
);
// Replicates `test_utf8_array_like_scalar_start`
`test_utf8_array_like_scalar_dyn_start` to
// demonstrate that `SQL STARTSWITH` works as expected.
test_utf8_scalar!(
test_utf8_array_starts_with_scalar_start,
- test_utf8_array_starts_with_scalar_dyn_start,
- vec!["arrow", "parrow", "arrows", "arr"],
+ vec![
+ "arrow",
+ "parrow",
+ "arrows",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"arrow",
- starts_with_utf8_scalar,
- starts_with_utf8_scalar_dyn,
- vec![true, false, true, false]
+ starts_with,
+ vec![true, false, true, false, true]
);
test_utf8_scalar!(
test_utf8_array_like_scalar_end,
- test_utf8_array_like_scalar_dyn_end,
- vec!["arrow", "parrow", "arrows", "arr"],
+ vec![
+ "arrow",
+ "parrow",
+ "arrows",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"%arrow",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, true, false, false]
+ like,
+ vec![true, true, false, false, false]
);
// Replicates `test_utf8_array_like_scalar_end`
`test_utf8_array_like_scalar_dyn_end` to
// demonstrate that `SQL ENDSWITH` works as expected.
test_utf8_scalar!(
test_utf8_array_ends_with_scalar_end,
- test_utf8_array_ends_with_scalar_dyn_end,
- vec!["arrow", "parrow", "arrows", "arr"],
+ vec![
+ "arrow",
+ "parrow",
+ "arrows",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"arrow",
- ends_with_utf8_scalar,
- ends_with_utf8_scalar_dyn,
- vec![true, true, false, false]
+ ends_with,
+ vec![true, true, false, false, false]
);
test_utf8_scalar!(
test_utf8_array_like_scalar_equals,
- test_utf8_array_like_scalar_dyn_equals,
- vec!["arrow", "parrow", "arrows", "arr"],
+ vec![
+ "arrow",
+ "parrow",
+ "arrows",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"arrow",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, false, false, false]
+ like,
+ vec![true, false, false, false, false]
);
test_utf8_scalar!(
test_utf8_array_like_scalar_one,
- test_utf8_array_like_scalar_dyn_one,
- vec!["arrow", "arrows", "parrow", "arr"],
+ vec![
+ "arrow",
+ "arrows",
+ "parrow",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"arrow_",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![false, true, false, false]
+ like,
+ vec![false, true, false, false, false]
);
test_utf8_scalar!(
test_utf8_scalar_like_escape,
- test_utf8_scalar_like_dyn_escape,
- vec!["a%", "a\\x"],
+ vec!["a%", "a\\x", "arrow long string longer than 12 bytes"],
"a\\%",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, false]
+ like,
+ vec![true, false, false]
);
test_utf8_scalar!(
test_utf8_scalar_like_escape_contains,
- test_utf8_scalar_like_dyn_escape_contains,
- vec!["ba%", "ba\\x"],
+ vec!["ba%", "ba\\x", "arrow long string longer than 12 bytes"],
"%a\\%",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, false]
+ like,
+ vec![true, false, false]
);
test_utf8!(
test_utf8_scalar_ilike_regex,
vec!["%%%"],
vec![r"\%_\%"],
- ilike_utf8,
- vec![true]
- );
-
- test_dict_utf8!(
- test_utf8_scalar_ilike_regex_dict,
- vec!["%%%"],
- vec![r"\%_\%"],
- ilike_dyn,
+ ilike,
vec![true]
);
test_utf8!(
test_utf8_array_nlike,
- vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
- vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
- nlike_utf8,
- vec![false, false, false, true, true, false, true]
- );
-
- test_dict_utf8!(
- test_utf8_array_nlike_dict,
- vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
+ vec![
+ "arrow",
+ "arrow",
+ "arrow long string longer than 12 bytes",
+ "arrow",
+ "arrow",
+ "arrows",
+ "arrow"
+ ],
vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
- nlike_dyn,
+ nlike,
vec![false, false, false, true, true, false, true]
);
test_utf8_scalar!(
test_utf8_array_nlike_escape_testing,
- test_utf8_array_nlike_escape_dyn_testing_dyn,
- vec!["varchar(255)", "int(255)", "varchar", "int"],
+ vec![
+ "varchar(255)",
+ "int(255) arrow long string longer than 12 bytes",
+ "varchar",
+ "int"
+ ],
"%(%)%",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
+ nlike,
vec![false, false, true, true]
);
test_utf8_scalar!(
test_utf8_array_nlike_scalar_escape_regex,
- test_utf8_array_nlike_scalar_dyn_escape_regex,
vec![".*", "a", "*"],
".*",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
+ nlike,
vec![false, true, true]
);
test_utf8_scalar!(
test_utf8_array_nlike_scalar_escape_regex_dot,
- test_utf8_array_nlike_scalar_dyn_escape_regex_dot,
vec![".", "a", "*"],
".",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
+ nlike,
vec![false, true, true]
);
test_utf8_scalar!(
test_utf8_array_nlike_scalar,
- test_utf8_array_nlike_scalar_dyn,
- vec!["arrow", "parquet", "datafusion", "flight"],
+ vec![
+ "arrow",
+ "parquet",
+ "datafusion",
+ "flight",
+ "arrow long string longer than 12 bytes"
+ ],
"%ar%",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![false, false, true, true]
+ nlike,
+ vec![false, false, true, true, false]
);
test_utf8_scalar!(
test_utf8_array_nlike_scalar_start,
- test_utf8_array_nlike_scalar_dyn_start,
- vec!["arrow", "parrow", "arrows", "arr"],
+ vec![
+ "arrow",
+ "parrow",
+ "arrows",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"arrow%",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![false, true, false, true]
+ nlike,
+ vec![false, true, false, true, false]
);
test_utf8_scalar!(
test_utf8_array_nlike_scalar_end,
- test_utf8_array_nlike_scalar_dyn_end,
- vec!["arrow", "parrow", "arrows", "arr"],
+ vec![
+ "arrow",
+ "parrow",
+ "arrows",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"%arrow",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![false, false, true, true]
+ nlike,
+ vec![false, false, true, true, true]
);
test_utf8_scalar!(
test_utf8_array_nlike_scalar_equals,
- test_utf8_array_nlike_scalar_dyn_equals,
- vec!["arrow", "parrow", "arrows", "arr"],
+ vec![
+ "arrow",
+ "parrow",
+ "arrows",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"arrow",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![false, true, true, true]
+ nlike,
+ vec![false, true, true, true, true]
);
test_utf8_scalar!(
test_utf8_array_nlike_scalar_one,
- test_utf8_array_nlike_scalar_dyn_one,
- vec!["arrow", "arrows", "parrow", "arr"],
+ vec![
+ "arrow",
+ "arrows",
+ "parrow",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"arrow_",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![true, false, true, true]
+ nlike,
+ vec![true, false, true, true, true]
);
test_utf8!(
test_utf8_array_ilike,
- vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
- vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
- ilike_utf8,
- vec![true, true, true, false, false, true, false]
- );
-
- test_dict_utf8!(
- test_utf8_array_ilike_dict,
- vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+ vec![
+ "arrow",
+ "arrow",
+ "ARROW long string longer than 12 bytes",
+ "arrow",
+ "ARROW",
+ "ARROWS",
+ "arROw"
+ ],
vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
- ilike_dyn,
+ ilike,
vec![true, true, true, false, false, true, false]
);
test_utf8_scalar!(
ilike_utf8_scalar_escape_testing,
- ilike_utf8_scalar_escape_dyn_testing,
- vec!["varchar(255)", "int(255)", "varchar", "int"],
+ vec![
+ "varchar(255)",
+ "int(255) long string longer than 12 bytes",
+ "varchar",
+ "int"
+ ],
"%(%)%",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
+ ilike,
vec![true, true, false, false]
);
test_utf8_scalar!(
test_utf8_array_ilike_scalar,
- test_utf8_array_ilike_dyn_scalar,
- vec!["arrow", "parquet", "datafusion", "flight"],
+ vec![
+ "arrow",
+ "parquet",
+ "datafusion",
+ "flight",
+ "arrow long string longer than 12 bytes"
+ ],
"%AR%",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![true, true, false, false]
+ ilike,
+ vec![true, true, false, false, true]
);
test_utf8_scalar!(
test_utf8_array_ilike_scalar_start,
- test_utf8_array_ilike_scalar_dyn_start,
- vec!["arrow", "parrow", "arrows", "ARR"],
+ vec![
+ "arrow",
+ "parrow",
+ "arrows",
+ "ARR",
+ "arrow long string longer than 12 bytes"
+ ],
"aRRow%",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![true, false, true, false]
+ ilike,
+ vec![true, false, true, false, true]
);
test_utf8_scalar!(
test_utf8_array_ilike_scalar_end,
- test_utf8_array_ilike_scalar_dyn_end,
- vec!["ArroW", "parrow", "ARRowS", "arr"],
+ vec![
+ "ArroW",
+ "parrow",
+ "ARRowS",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"%arrow",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![true, true, false, false]
+ ilike,
+ vec![true, true, false, false, false]
);
test_utf8_scalar!(
test_utf8_array_ilike_scalar_equals,
- test_utf8_array_ilike_scalar_dyn_equals,
- vec!["arrow", "parrow", "arrows", "arr"],
+ vec![
+ "arrow",
+ "parrow",
+ "arrows",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"Arrow",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![true, false, false, false]
+ ilike,
+ vec![true, false, false, false, false]
);
// We only implement loose matching
test_utf8_scalar!(
test_utf8_array_ilike_unicode,
- test_utf8_array_ilike_unicode_dyn,
- vec!["FFkoß", "FFkoSS", "FFkoss", "FFkoS", "FFkos", "ffkoSS", "ffkoß",
"FFKoSS"],
+ vec![
+ "FFkoß",
+ "FFkoSS",
+ "FFkoss",
+ "FFkoS",
+ "FFkos",
+ "ffkoSS",
+ "ffkoß",
+ "FFKoSS",
+ "longer than 12 bytes FFKoSS"
+ ],
"FFkoSS",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![false, true, true, false, false, false, false, true]
+ ilike,
+ vec![false, true, true, false, false, false, false, true, false]
);
test_utf8_scalar!(
test_utf8_array_ilike_unicode_starts,
- test_utf8_array_ilike_unicode_start_dyn,
vec![
"FFkoßsdlkdf",
"FFkoSSsdlkdf",
@@ -761,16 +888,15 @@ mod tests {
"ffkoß",
"FfkosSsdfd",
"FFKoSS",
+ "longer than 12 bytes FFKoSS",
],
"FFkoSS%",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![false, true, true, false, false, false, false, true, true]
+ ilike,
+ vec![false, true, true, false, false, false, false, true, true, false]
);
test_utf8_scalar!(
test_utf8_array_ilike_unicode_ends,
- test_utf8_array_ilike_unicode_ends_dyn,
vec![
"sdlkdfFFkoß",
"sdlkdfFFkoSS",
@@ -781,16 +907,15 @@ mod tests {
"ffkoß",
"h😃klFfkosS",
"FFKoSS",
+ "longer than 12 bytes FFKoSS",
],
"%FFkoSS",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![false, true, true, false, false, false, false, true, true]
+ ilike,
+ vec![false, true, true, false, false, false, false, true, true, true]
);
test_utf8_scalar!(
test_utf8_array_ilike_unicode_contains,
- test_utf8_array_ilike_unicode_contains_dyn,
vec![
"sdlkdfFkoßsdfs",
"sdlkdfFkoSSdggs",
@@ -802,11 +927,11 @@ mod tests {
"😃sadlksffkosSsh😃klF",
"😱slgffkosSsh😃klF",
"FFKoSS",
+ "longer than 12 bytes FFKoSS",
],
"%FFkoSS%",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![false, true, true, false, false, false, false, true, true, true]
+ ilike,
+ vec![false, true, true, false, false, false, false, true, true, true,
true]
);
// Replicates `test_utf8_array_ilike_unicode_contains` and
@@ -816,7 +941,6 @@ mod tests {
// NOTE: 5 of the values were changed because the original used a case
insensitive `ilike`.
test_utf8_scalar!(
test_utf8_array_contains_unicode_contains,
- test_utf8_array_contains_unicode_contains_dyn,
vec![
"sdlkdfFkoßsdfs",
"sdlkdFFkoSSdggs", // Original was case insensitive
"sdlkdfFkoSSdggs"
@@ -828,16 +952,15 @@ mod tests {
"😃sadlksFFkoSSsh😃klF", // Original was case insensitive
"😃sadlksffkosSsh😃klF"
"😱slgFFkoSSsh😃klF", // Original was case insensitive
"😱slgffkosSsh😃klF"
"FFkoSS", // "FFKoSS"
+ "longer than 12 bytes FFKoSS",
],
"FFkoSS",
- contains_utf8_scalar,
- contains_utf8_scalar_dyn,
- vec![false, true, true, false, false, false, false, true, true, true]
+ contains,
+ vec![false, true, true, false, false, false, false, true, true, true,
false]
);
test_utf8_scalar!(
test_utf8_array_ilike_unicode_complex,
- test_utf8_array_ilike_unicode_complex_dyn,
vec![
"sdlkdfFooßsdfs",
"sdlkdfFooSSdggs",
@@ -849,97 +972,124 @@ mod tests {
"😃sadlksffofsSsh😃klF",
"😱slgffoesSsh😃klF",
"FFKoSS",
+ "longer than 12 bytes FFKoSS",
],
"%FF__SS%",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![false, true, true, false, false, false, false, true, true, true]
+ ilike,
+ vec![false, true, true, false, false, false, false, true, true, true,
true]
);
test_utf8_scalar!(
test_utf8_array_ilike_scalar_one,
- test_utf8_array_ilike_scalar_dyn_one,
- vec!["arrow", "arrows", "parrow", "arr"],
+ vec![
+ "arrow",
+ "arrows",
+ "parrow",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"arrow_",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![false, true, false, false]
+ ilike,
+ vec![false, true, false, false, false]
);
test_utf8!(
test_utf8_array_nilike,
- vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
- vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
- nilike_utf8,
- vec![false, false, false, true, true, false, true]
- );
-
- test_dict_utf8!(
- test_utf8_array_nilike_dict,
- vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+ vec![
+ "arrow",
+ "arrow",
+ "ARROW longer than 12 bytes string",
+ "arrow",
+ "ARROW",
+ "ARROWS",
+ "arROw"
+ ],
vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
- nilike_dyn,
+ nilike,
vec![false, false, false, true, true, false, true]
);
test_utf8_scalar!(
nilike_utf8_scalar_escape_testing,
- nilike_utf8_scalar_escape_dyn_testing,
- vec!["varchar(255)", "int(255)", "varchar", "int"],
+ vec![
+ "varchar(255)",
+ "int(255) longer than 12 bytes string",
+ "varchar",
+ "int"
+ ],
"%(%)%",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
+ nilike,
vec![false, false, true, true]
);
test_utf8_scalar!(
test_utf8_array_nilike_scalar,
- test_utf8_array_nilike_dyn_scalar,
- vec!["arrow", "parquet", "datafusion", "flight"],
+ vec![
+ "arrow",
+ "parquet",
+ "datafusion",
+ "flight",
+ "arrow long string longer than 12 bytes"
+ ],
"%AR%",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
- vec![false, false, true, true]
+ nilike,
+ vec![false, false, true, true, false]
);
test_utf8_scalar!(
test_utf8_array_nilike_scalar_start,
- test_utf8_array_nilike_scalar_dyn_start,
- vec!["arrow", "parrow", "arrows", "ARR"],
+ vec![
+ "arrow",
+ "parrow",
+ "arrows",
+ "ARR",
+ "arrow long string longer than 12 bytes"
+ ],
"aRRow%",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
- vec![false, true, false, true]
+ nilike,
+ vec![false, true, false, true, false]
);
test_utf8_scalar!(
test_utf8_array_nilike_scalar_end,
- test_utf8_array_nilike_scalar_dyn_end,
- vec!["ArroW", "parrow", "ARRowS", "arr"],
+ vec![
+ "ArroW",
+ "parrow",
+ "ARRowS",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"%arrow",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
- vec![false, false, true, true]
+ nilike,
+ vec![false, false, true, true, true]
);
test_utf8_scalar!(
test_utf8_array_nilike_scalar_equals,
- test_utf8_array_nilike_scalar_dyn_equals,
- vec!["arRow", "parrow", "arrows", "arr"],
+ vec![
+ "arRow",
+ "parrow",
+ "arrows",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"Arrow",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
- vec![false, true, true, true]
+ nilike,
+ vec![false, true, true, true, true]
);
test_utf8_scalar!(
test_utf8_array_nilike_scalar_one,
- test_utf8_array_nilike_scalar_dyn_one,
- vec!["arrow", "arrows", "parrow", "arr"],
+ vec![
+ "arrow",
+ "arrows",
+ "parrow",
+ "arr",
+ "arrow long string longer than 12 bytes"
+ ],
"arrow_",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
- vec![true, false, true, true]
+ nilike,
+ vec![true, false, true, true, true]
);
#[test]
diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
index 54ecf42e368..01e3710a6d0 100644
--- a/arrow-string/src/predicate.rs
+++ b/arrow-string/src/predicate.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-use arrow_array::{BooleanArray, GenericStringArray, OffsetSizeTrait};
+use arrow_array::{ArrayAccessor, BooleanArray};
use arrow_schema::ArrowError;
use memchr::memchr2;
use regex::{Regex, RegexBuilder};
@@ -95,11 +95,10 @@ impl<'a> Predicate<'a> {
///
/// If `negate` is true the result of the predicate will be negated
#[inline(never)]
- pub fn evaluate_array<O: OffsetSizeTrait>(
- &self,
- array: &GenericStringArray<O>,
- negate: bool,
- ) -> BooleanArray {
+ pub fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray
+ where
+ T: ArrayAccessor<Item = &'i str>,
+ {
match self {
Predicate::Eq(v) => BooleanArray::from_unary(array, |haystack| {
(haystack.len() == v.len() && haystack == *v) != negate