tustvold commented on code in PR #4732:
URL: https://github.com/apache/arrow-rs/pull/4732#discussion_r1305713433
##########
arrow-string/src/like.rs:
##########
@@ -15,719 +15,350 @@
// specific language governing permissions and limitations
// under the License.
-use arrow_array::builder::BooleanBufferBuilder;
-use arrow_array::cast::*;
+use crate::predicate::Predicate;
+use arrow_array::cast::AsArray;
use arrow_array::*;
-use arrow_buffer::NullBuffer;
-use arrow_data::ArrayDataBuilder;
use arrow_schema::*;
use arrow_select::take::take;
-use regex::Regex;
-use std::collections::HashMap;
-
-/// Helper function to perform boolean lambda function on values from two
array accessors, this
-/// version does not attempt to use SIMD.
-///
-/// Duplicated from `arrow_ord::comparison`
-fn compare_op<T: ArrayAccessor, S: ArrayAccessor, F>(
- left: T,
- right: S,
- op: F,
-) -> Result<BooleanArray, ArrowError>
-where
- F: Fn(T::Item, S::Item) -> bool,
-{
- if left.len() != right.len() {
- return Err(ArrowError::ComputeError(
- "Cannot perform comparison operation on arrays of different length"
- .to_string(),
- ));
- }
-
- Ok(BooleanArray::from_binary(left, right, op))
-}
-
-/// Helper function to perform boolean lambda function on values from array
accessor, this
-/// version does not attempt to use SIMD.
-///
-/// Duplicated from `arrow_ord::comparison`
-fn compare_op_scalar<T: ArrayAccessor, F>(
- left: T,
- op: F,
-) -> Result<BooleanArray, ArrowError>
-where
- F: Fn(T::Item) -> bool,
-{
- Ok(BooleanArray::from_unary(left, op))
+use std::sync::Arc;
+
+#[derive(Debug)]
+enum Op {
+ Like(bool),
+ ILike(bool),
+ Contains,
+ StartsWith,
+ EndsWith,
}
-macro_rules! dyn_function {
- ($sql:tt, $fn_name:tt, $fn_utf8:tt, $fn_dict:tt) => {
-#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")]
-/// [`LargeStringArray`], or [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn $fn_name(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray,
ArrowError> {
- match (left.data_type(), right.data_type()) {
- (DataType::Utf8, DataType::Utf8) => {
- let left = left.as_string::<i32>();
- let right = right.as_string::<i32>();
- $fn_utf8(left, right)
- }
- (DataType::LargeUtf8, DataType::LargeUtf8) => {
- let left = left.as_string::<i64>();
- let right = right.as_string::<i64>();
- $fn_utf8(left, right)
+impl std::fmt::Display for Op {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ Op::Like(false) => write!(f, "LIKE"),
+ Op::Like(true) => write!(f, "NLIKE"),
+ Op::ILike(false) => write!(f, "ILIKE"),
+ Op::ILike(true) => write!(f, "NILIKE"),
+ Op::Contains => write!(f, "CONTAINS"),
+ Op::StartsWith => write!(f, "STARTS_WITH"),
+ Op::EndsWith => write!(f, "ENDS_WITH"),
}
- #[cfg(feature = "dyn_cmp_dict")]
- (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
- downcast_dictionary_array!(
- left => {
- let right = as_dictionary_array(right);
- $fn_dict(left, right)
- }
- t => Err(ArrowError::ComputeError(format!(
- "Should be DictionaryArray but got: {}", t
- )))
- )
- }
- _ => {
- Err(ArrowError::ComputeError(format!(
- "{} only supports Utf8, LargeUtf8 or DictionaryArray (with
feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values",
- stringify!($fn_name)
- )))
- }
- }
-}
-
}
}
-dyn_function!("left LIKE right", like_dyn, like_utf8, like_dict);
-dyn_function!("left NOT LIKE right", nlike_dyn, nlike_utf8, nlike_dict);
-dyn_function!("left ILIKE right", ilike_dyn, ilike_utf8, ilike_dict);
-dyn_function!("left NOT ILIKE right", nilike_dyn, nilike_utf8, nilike_dict);
-dyn_function!(
- "STARTSWITH(left, right)",
- starts_with_dyn,
- starts_with_utf8,
- starts_with_dict
-);
-dyn_function!(
- "ENDSWITH(left, right)",
- ends_with_dyn,
- ends_with_utf8,
- ends_with_dict
-);
-dyn_function!(
- "CONTAINS(left, right)",
- contains_dyn,
- contains_utf8,
- contains_dict
-);
-macro_rules! scalar_dyn_function {
- ($sql:tt, $fn_name:tt, $fn_scalar:tt) => {
-#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")]
-/// [`LargeStringArray`], or [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn $fn_name(
- left: &dyn Array,
- right: &str,
-) -> Result<BooleanArray, ArrowError> {
- match left.data_type() {
- DataType::Utf8 => {
- let left = left.as_string::<i32>();
- $fn_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = left.as_string::<i64>();
- $fn_scalar(left, right)
- }
- DataType::Dictionary(_, _) => {
- downcast_dictionary_array!(
- left => {
- let dict_comparison = $fn_name(left.values().as_ref(),
right)?;
- // TODO: Use take_boolean (#2967)
- let array = take(&dict_comparison, left.keys(), None)?;
- Ok(BooleanArray::from(array.to_data()))
- }
- t => Err(ArrowError::ComputeError(format!(
- "Should be DictionaryArray but got: {}", t
- )))
- )
- }
- _ => {
- Err(ArrowError::ComputeError(format!(
- "{} only supports Utf8, LargeUtf8 or DictionaryArray with Utf8
or LargeUtf8 values",
- stringify!($fn_name)
- )))
- }
- }
+/// Perform SQL `left LIKE right`
+pub fn like(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray,
ArrowError> {
+ like_op(Op::Like(false), left, right)
}
- }
-}
-scalar_dyn_function!("left LIKE right", like_utf8_scalar_dyn, like_scalar);
-scalar_dyn_function!("left NOT LIKE right", nlike_utf8_scalar_dyn,
nlike_scalar);
-scalar_dyn_function!("left ILIKE right", ilike_utf8_scalar_dyn, ilike_scalar);
-scalar_dyn_function!(
- "left NOT ILIKE right",
- nilike_utf8_scalar_dyn,
- nilike_scalar
-);
-scalar_dyn_function!(
- "STARTSWITH(left, right)",
- starts_with_utf8_scalar_dyn,
- starts_with_scalar
-);
-scalar_dyn_function!(
- "ENDSWITH(left, right)",
- ends_with_utf8_scalar_dyn,
- ends_with_scalar
-);
-scalar_dyn_function!(
- "CONTAINS(left, right)",
- contains_utf8_scalar_dyn,
- contains_scalar
-);
-
-macro_rules! dict_function {
- ($sql:tt, $fn_name:tt, $fn_impl:tt) => {
-
-#[doc = concat!("Perform SQL `", $sql ,"` operation on [`DictionaryArray`]
with values")]
-/// [`StringArray`]/[`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-#[cfg(feature = "dyn_cmp_dict")]
-fn $fn_name<K: arrow_array::types::ArrowDictionaryKeyType>(
- left: &DictionaryArray<K>,
- right: &DictionaryArray<K>,
-) -> Result<BooleanArray, ArrowError> {
- match (left.value_type(), right.value_type()) {
- (DataType::Utf8, DataType::Utf8) => {
- let left =
left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- let right =
right.downcast_dict::<GenericStringArray<i32>>().unwrap();
- $fn_impl(left, right)
- }
- (DataType::LargeUtf8, DataType::LargeUtf8) => {
- let left =
left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- let right =
right.downcast_dict::<GenericStringArray<i64>>().unwrap();
-
- $fn_impl(left, right)
- }
- _ => Err(ArrowError::ComputeError(format!(
- "{} only supports DictionaryArray with Utf8 or LargeUtf8 values",
- stringify!($fn_name)
- ))),
- }
-}
- }
+/// Perform SQL `left ILIKE right`
+pub fn ilike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray,
ArrowError> {
+ like_op(Op::ILike(false), left, right)
}
-dict_function!("left LIKE right", like_dict, like);
-dict_function!("left NOT LIKE right", nlike_dict, nlike);
-dict_function!("left ILIKE right", ilike_dict, ilike);
-dict_function!("left NOT ILIKE right", nilike_dict, nilike);
-dict_function!("STARTSWITH(left, right)", starts_with_dict, starts_with);
-dict_function!("ENDSWITH(left, right)", ends_with_dict, ends_with);
-dict_function!("CONTAINS(left, right)", contains_dict, contains);
-
-/// Perform SQL `left LIKE right` operation on [`StringArray`] /
[`LargeStringArray`].
-///
-/// There are two wildcards supported with the LIKE operator:
-///
-/// 1. `%` - The percent sign represents zero, one, or multiple characters
-/// 2. `_` - The underscore represents a single character
-///
-/// For example:
-/// ```
-/// use arrow_array::{StringArray, BooleanArray};
-/// use arrow_string::like::like_utf8;
-///
-/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]);
-/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]);
-///
-/// let result = like_utf8(&strings, &patterns).unwrap();
-/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true]));
-/// ```
-pub fn like_utf8<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &GenericStringArray<OffsetSize>,
-) -> Result<BooleanArray, ArrowError> {
- like(left, right)
+/// Perform SQL `left NOT LIKE right`
+pub fn nlike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray,
ArrowError> {
+ like_op(Op::Like(true), left, right)
}
-#[inline]
-fn like<'a, S: ArrayAccessor<Item = &'a str>>(
- left: S,
- right: S,
-) -> Result<BooleanArray, ArrowError> {
- regex_like(left, right, false, |re_pattern| {
- Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from LIKE pattern: {e}"
- ))
- })
- })
+/// Perform SQL `left NOT ILIKE right`
+pub fn nilike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray,
ArrowError> {
+ like_op(Op::ILike(true), left, right)
}
-#[inline]
-fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor<Item = &'a str>>(
- left: L,
- right: &str,
- op: F,
+/// Perform SQL `STARTSWITH(left, right)`
+pub fn starts_with(
+ left: &dyn Datum,
+ right: &dyn Datum,
) -> Result<BooleanArray, ArrowError> {
- if !right.contains(is_like_pattern) {
- // fast path, can use equals
- Ok(BooleanArray::from_unary(left, |item| op(item == right)))
- } else if right.ends_with('%')
- && !right.ends_with("\\%")
- && !right[..right.len() - 1].contains(is_like_pattern)
- {
- // fast path, can use starts_with
- let starts_with = &right[..right.len() - 1];
-
- Ok(BooleanArray::from_unary(left, |item| {
- op(item.starts_with(starts_with))
- }))
- } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
- // fast path, can use ends_with
- let ends_with = &right[1..];
-
- Ok(BooleanArray::from_unary(left, |item| {
- op(item.ends_with(ends_with))
- }))
- } else if right.starts_with('%')
- && right.ends_with('%')
- && !right.ends_with("\\%")
- && !right[1..right.len() - 1].contains(is_like_pattern)
- {
- let contains = &right[1..right.len() - 1];
-
- Ok(BooleanArray::from_unary(left, |item| {
- op(item.contains(contains))
- }))
- } else {
- let re_pattern = replace_like_wildcards(right)?;
- let re = Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from LIKE pattern: {e}"
- ))
- })?;
-
- Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item))))
- }
+ like_op(Op::StartsWith, left, right)
}
-#[inline]
-fn like_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
- left: L,
- right: &str,
+/// Perform SQL `ENDSWITH(left, right)`
+pub fn ends_with(
+ left: &dyn Datum,
+ right: &dyn Datum,
) -> Result<BooleanArray, ArrowError> {
- like_scalar_op(left, right, |x| x)
+ like_op(Op::EndsWith, left, right)
}
-/// Perform SQL `left LIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &str,
-) -> Result<BooleanArray, ArrowError> {
- like_scalar(left, right)
+/// Perform SQL `CONTAINS(left, right)`
+pub fn contains(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray,
ArrowError> {
+ like_op(Op::Contains, left, right)
}
-/// Transforms a like `pattern` to a regex compatible pattern. To achieve
that, it does:
-///
-/// 1. Replace like wildcards for regex expressions as the pattern will be
evaluated using regex match: `%` => `.*` and `_` => `.`
-/// 2. Escape regex meta characters to match them and not be evaluated as
regex special chars. For example: `.` => `\\.`
-/// 3. Replace escaped like wildcards removing the escape characters to be
able to match it as a regex. For example: `\\%` => `%`
-fn replace_like_wildcards(pattern: &str) -> Result<String, ArrowError> {
- let mut result = String::new();
- let pattern = String::from(pattern);
- let mut chars_iter = pattern.chars().peekable();
- while let Some(c) = chars_iter.next() {
- if c == '\\' {
- let next = chars_iter.peek();
- match next {
- Some(next) if is_like_pattern(*next) => {
- result.push(*next);
- // Skipping the next char as it is already appended
- chars_iter.next();
- }
- _ => {
- result.push('\\');
- result.push('\\');
- }
- }
- } else if regex_syntax::is_meta_character(c) {
- result.push('\\');
- result.push(c);
- } else if c == '%' {
- result.push_str(".*");
- } else if c == '_' {
- result.push('.');
- } else {
- result.push(c);
- }
+fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray,
ArrowError> {
+ use arrow_schema::DataType::*;
+ let (l, l_s) = lhs.get();
+ let (r, r_s) = rhs.get();
+
+ if l.len() != r.len() && !l_s && !r_s {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Cannot compare arrays of different lengths, got {} vs {}",
+ l.len(),
+ r.len()
+ )));
}
- Ok(result)
-}
-
-/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn nlike_utf8<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &GenericStringArray<OffsetSize>,
-) -> Result<BooleanArray, ArrowError> {
- nlike(left, right)
-}
-
-#[inline]
-fn nlike<'a, S: ArrayAccessor<Item = &'a str>>(
- left: S,
- right: S,
-) -> Result<BooleanArray, ArrowError> {
- regex_like(left, right, true, |re_pattern| {
- Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from LIKE pattern: {e}"
- ))
- })
- })
-}
-#[inline]
-fn nlike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
- left: L,
- right: &str,
-) -> Result<BooleanArray, ArrowError> {
- like_scalar_op(left, right, |x| !x)
-}
-
-/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &str,
-) -> Result<BooleanArray, ArrowError> {
- nlike_scalar(left, right)
-}
+ let l_v = l.as_any_dictionary_opt();
+ let l = l_v.map(|x| x.values().as_ref()).unwrap_or(l);
-/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`].
-///
-/// Case insensitive version of [`like_utf8`]
-///
-/// Note: this only implements loose matching as defined by the Unicode
standard. For example,
-/// the `ff` ligature is not equivalent to `FF` and `ß` is not equivalent to
`SS`
-pub fn ilike_utf8<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &GenericStringArray<OffsetSize>,
-) -> Result<BooleanArray, ArrowError> {
- ilike(left, right)
-}
-
-#[inline]
-fn ilike<'a, S: ArrayAccessor<Item = &'a str>>(
- left: S,
- right: S,
-) -> Result<BooleanArray, ArrowError> {
- regex_like(left, right, false, |re_pattern| {
- Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from ILIKE pattern: {e}"
- ))
- })
- })
-}
+ let r_v = r.as_any_dictionary_opt();
+ let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r);
-#[inline]
-fn ilike_scalar_op<O: OffsetSizeTrait, F: Fn(bool) -> bool>(
- left: &GenericStringArray<O>,
- right: &str,
- op: F,
-) -> Result<BooleanArray, ArrowError> {
- // If not ASCII faster to use case insensitive regex than using
to_uppercase
- if right.is_ascii() && left.is_ascii() {
- if !right.contains(is_like_pattern) {
- return Ok(BooleanArray::from_unary(left, |item| {
- op(item.eq_ignore_ascii_case(right))
- }));
- } else if right.ends_with('%')
- && !right.ends_with("\\%")
- && !right[..right.len() - 1].contains(is_like_pattern)
- {
- // fast path, can use starts_with
- let start_str = &right[..right.len() - 1];
- return Ok(BooleanArray::from_unary(left, |item| {
- let end = item.len().min(start_str.len());
- let result = item.is_char_boundary(end)
- && start_str.eq_ignore_ascii_case(&item[..end]);
- op(result)
- }));
- } else if right.starts_with('%') &&
!right[1..].contains(is_like_pattern) {
- // fast path, can use ends_with
- let ends_str = &right[1..];
- return Ok(BooleanArray::from_unary(left, |item| {
- let start = item.len().saturating_sub(ends_str.len());
- let result = item.is_char_boundary(start)
- && ends_str.eq_ignore_ascii_case(&item[start..]);
- op(result)
- }));
+ match (l.data_type(), r.data_type()) {
+ (Utf8, Utf8) => {
+ apply::<i32>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v)
+ }
+ (LargeUtf8, LargeUtf8) => {
+ apply::<i64>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v)
}
+ (l_t, r_t) => Err(ArrowError::InvalidArgumentError(format!(
+ "Invalid string operation: {l_t} {op} {r_t}"
+ ))),
}
-
- let re_pattern = replace_like_wildcards(right)?;
- let re = Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| {
- ArrowError::ComputeError(format!("Unable to build regex from ILIKE
pattern: {e}"))
- })?;
-
- Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item))))
}
-#[inline]
-fn ilike_scalar<O: OffsetSizeTrait>(
- left: &GenericStringArray<O>,
- right: &str,
+fn apply<O: OffsetSizeTrait>(
+ op: Op,
+ l: &GenericStringArray<O>,
+ l_s: bool,
+ l_v: Option<&dyn AnyDictionaryArray>,
+ r: &GenericStringArray<O>,
+ r_s: bool,
+ r_v: Option<&dyn AnyDictionaryArray>,
) -> Result<BooleanArray, ArrowError> {
- ilike_scalar_op(left, right, |x| x)
-}
-
-/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`ilike_utf8`] for more details.
-pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &str,
-) -> Result<BooleanArray, ArrowError> {
- ilike_scalar(left, right)
+ let l_len = l_v.map(|l| l.len()).unwrap_or(l.len());
+ if r_s {
+ let scalar = match r_v {
+ Some(dict) => match dict.nulls().filter(|n| n.null_count() != 0) {
+ Some(_) => return Ok(BooleanArray::new_null(l_len)),
+ None => {
+ let idx = dict.normalized_keys()[0];
+ if r.is_null(idx) {
+ return Ok(BooleanArray::new_null(l_len));
+ }
+ r.value(idx)
+ }
+ },
+ None => r.value(0),
+ };
+ op_scalar(op, l, l_v, scalar)
+ } else {
Review Comment:
Thank you, it took much bashing my head against my keyboard :laughing:
The performance is likely not spectacular, but I'm also not sure how
important it is to optimise the performance of non-scalar like kernels
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]