alamb commented on code in PR #4732: URL: https://github.com/apache/arrow-rs/pull/4732#discussion_r1305675141
########## arrow-string/src/predicate.rs: ########## @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{BooleanArray, GenericStringArray, OffsetSizeTrait}; +use arrow_schema::ArrowError; +use regex::{Regex, RegexBuilder}; + +/// A string based predicate +pub enum Predicate<'a> { + Eq(&'a str), + IEqAscii(&'a str), + Contains(&'a str), + StartsWith(&'a str), + IStartsWithAscii(&'a str), + EndsWith(&'a str), + IEndsWithAscii(&'a str), + Regex(Regex), +} + +impl<'a> Predicate<'a> { + /// Create a predicate for the given like pattern + pub fn like(pattern: &'a str) -> Result<Self, ArrowError> { + if !pattern.contains(is_like_pattern) { + Ok(Self::Eq(pattern)) + } else if pattern.ends_with('%') + && !pattern.ends_with("\\%") + && !pattern[..pattern.len() - 1].contains(is_like_pattern) + { + Ok(Self::StartsWith(&pattern[..pattern.len() - 1])) + } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) { + Ok(Self::EndsWith(&pattern[1..])) + } else if pattern.starts_with('%') + && pattern.ends_with('%') + && !pattern.ends_with("\\%") + && !pattern[1..pattern.len() - 1].contains(is_like_pattern) + { + Ok(Self::Contains(&pattern[1..pattern.len() - 1])) + } else { + Ok(Self::Regex(regex_like(pattern, false)?)) + } + } + + /// Create a predicate for the given ilike pattern + pub fn ilike(pattern: &'a str, is_ascii: bool) -> Result<Self, ArrowError> { + if is_ascii && pattern.is_ascii() { + if !pattern.contains(is_like_pattern) { + return Ok(Self::IEqAscii(pattern)); + } else if pattern.ends_with('%') + && !pattern.ends_with("\\%") + && !pattern[..pattern.len() - 1].contains(is_like_pattern) + { + return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1])); + } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) + { + return Ok(Self::IEndsWithAscii(&pattern[1..])); + } + } + Ok(Self::Regex(regex_like(pattern, true)?)) + } + + /// Evaluate this predicate against the given haystack + pub fn evaluate(&self, haystack: &str) -> bool { + match self { Review Comment: this is pretty neat to mostly use the built in Rust code 👍 ########## arrow-string/src/like.rs: ########## @@ -15,719 +15,350 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::builder::BooleanBufferBuilder; -use arrow_array::cast::*; +use crate::predicate::Predicate; +use arrow_array::cast::AsArray; use arrow_array::*; -use arrow_buffer::NullBuffer; -use arrow_data::ArrayDataBuilder; use arrow_schema::*; use arrow_select::take::take; -use regex::Regex; -use std::collections::HashMap; - -/// Helper function to perform boolean lambda function on values from two array accessors, this -/// version does not attempt to use SIMD. -/// -/// Duplicated from `arrow_ord::comparison` -fn compare_op<T: ArrayAccessor, S: ArrayAccessor, F>( - left: T, - right: S, - op: F, -) -> Result<BooleanArray, ArrowError> -where - F: Fn(T::Item, S::Item) -> bool, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - Ok(BooleanArray::from_binary(left, right, op)) -} - -/// Helper function to perform boolean lambda function on values from array accessor, this -/// version does not attempt to use SIMD. -/// -/// Duplicated from `arrow_ord::comparison` -fn compare_op_scalar<T: ArrayAccessor, F>( - left: T, - op: F, -) -> Result<BooleanArray, ArrowError> -where - F: Fn(T::Item) -> bool, -{ - Ok(BooleanArray::from_unary(left, op)) +use std::sync::Arc; + +#[derive(Debug)] +enum Op { + Like(bool), + ILike(bool), + Contains, + StartsWith, + EndsWith, } -macro_rules! dyn_function { - ($sql:tt, $fn_name:tt, $fn_utf8:tt, $fn_dict:tt) => { -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn $fn_name(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray, ArrowError> { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.as_string::<i32>(); - let right = right.as_string::<i32>(); - $fn_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.as_string::<i64>(); - let right = right.as_string::<i64>(); - $fn_utf8(left, right) +impl std::fmt::Display for Op { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Op::Like(false) => write!(f, "LIKE"), + Op::Like(true) => write!(f, "NLIKE"), + Op::ILike(false) => write!(f, "ILIKE"), + Op::ILike(true) => write!(f, "NILIKE"), + Op::Contains => write!(f, "CONTAINS"), + Op::StartsWith => write!(f, "STARTS_WITH"), + Op::EndsWith => write!(f, "ENDS_WITH"), } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - $fn_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError(format!( - "{} only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))) - } - } -} - } } -dyn_function!("left LIKE right", like_dyn, like_utf8, like_dict); -dyn_function!("left NOT LIKE right", nlike_dyn, nlike_utf8, nlike_dict); -dyn_function!("left ILIKE right", ilike_dyn, ilike_utf8, ilike_dict); -dyn_function!("left NOT ILIKE right", nilike_dyn, nilike_utf8, nilike_dict); -dyn_function!( - "STARTSWITH(left, right)", - starts_with_dyn, - starts_with_utf8, - starts_with_dict -); -dyn_function!( - "ENDSWITH(left, right)", - ends_with_dyn, - ends_with_utf8, - ends_with_dict -); -dyn_function!( - "CONTAINS(left, right)", - contains_dyn, - contains_utf8, - contains_dict -); -macro_rules! scalar_dyn_function { - ($sql:tt, $fn_name:tt, $fn_scalar:tt) => { -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn $fn_name( - left: &dyn Array, - right: &str, -) -> Result<BooleanArray, ArrowError> { - match left.data_type() { - DataType::Utf8 => { - let left = left.as_string::<i32>(); - $fn_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.as_string::<i64>(); - $fn_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - let dict_comparison = $fn_name(left.values().as_ref(), right)?; - // TODO: Use take_boolean (#2967) - let array = take(&dict_comparison, left.keys(), None)?; - Ok(BooleanArray::from(array.to_data())) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError(format!( - "{} only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))) - } - } +/// Perform SQL `left LIKE right` +pub fn like(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::Like(false), left, right) } - } -} -scalar_dyn_function!("left LIKE right", like_utf8_scalar_dyn, like_scalar); -scalar_dyn_function!("left NOT LIKE right", nlike_utf8_scalar_dyn, nlike_scalar); -scalar_dyn_function!("left ILIKE right", ilike_utf8_scalar_dyn, ilike_scalar); -scalar_dyn_function!( - "left NOT ILIKE right", - nilike_utf8_scalar_dyn, - nilike_scalar -); -scalar_dyn_function!( - "STARTSWITH(left, right)", - starts_with_utf8_scalar_dyn, - starts_with_scalar -); -scalar_dyn_function!( - "ENDSWITH(left, right)", - ends_with_utf8_scalar_dyn, - ends_with_scalar -); -scalar_dyn_function!( - "CONTAINS(left, right)", - contains_utf8_scalar_dyn, - contains_scalar -); - -macro_rules! dict_function { - ($sql:tt, $fn_name:tt, $fn_impl:tt) => { - -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`DictionaryArray`] with values")] -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -#[cfg(feature = "dyn_cmp_dict")] -fn $fn_name<K: arrow_array::types::ArrowDictionaryKeyType>( - left: &DictionaryArray<K>, - right: &DictionaryArray<K>, -) -> Result<BooleanArray, ArrowError> { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap(); - let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap(); - $fn_impl(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap(); - let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap(); - - $fn_impl(left, right) - } - _ => Err(ArrowError::ComputeError(format!( - "{} only supports DictionaryArray with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))), - } -} - } +/// Perform SQL `left ILIKE right` +pub fn ilike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::ILike(false), left, right) } -dict_function!("left LIKE right", like_dict, like); -dict_function!("left NOT LIKE right", nlike_dict, nlike); -dict_function!("left ILIKE right", ilike_dict, ilike); -dict_function!("left NOT ILIKE right", nilike_dict, nilike); -dict_function!("STARTSWITH(left, right)", starts_with_dict, starts_with); -dict_function!("ENDSWITH(left, right)", ends_with_dict, ends_with); -dict_function!("CONTAINS(left, right)", contains_dict, contains); - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. Review Comment: This documentation and example seems valuable -- could you please port it over to `Predicate::like`? ########## arrow-string/src/predicate.rs: ########## @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{BooleanArray, GenericStringArray, OffsetSizeTrait}; +use arrow_schema::ArrowError; +use regex::{Regex, RegexBuilder}; + +/// A string based predicate +pub enum Predicate<'a> { + Eq(&'a str), + IEqAscii(&'a str), + Contains(&'a str), + StartsWith(&'a str), + IStartsWithAscii(&'a str), + EndsWith(&'a str), + IEndsWithAscii(&'a str), + Regex(Regex), +} + +impl<'a> Predicate<'a> { + /// Create a predicate for the given like pattern + pub fn like(pattern: &'a str) -> Result<Self, ArrowError> { + if !pattern.contains(is_like_pattern) { + Ok(Self::Eq(pattern)) + } else if pattern.ends_with('%') + && !pattern.ends_with("\\%") + && !pattern[..pattern.len() - 1].contains(is_like_pattern) + { + Ok(Self::StartsWith(&pattern[..pattern.len() - 1])) + } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) { + Ok(Self::EndsWith(&pattern[1..])) + } else if pattern.starts_with('%') + && pattern.ends_with('%') + && !pattern.ends_with("\\%") + && !pattern[1..pattern.len() - 1].contains(is_like_pattern) + { + Ok(Self::Contains(&pattern[1..pattern.len() - 1])) + } else { + Ok(Self::Regex(regex_like(pattern, false)?)) + } + } + + /// Create a predicate for the given ilike pattern + pub fn ilike(pattern: &'a str, is_ascii: bool) -> Result<Self, ArrowError> { + if is_ascii && pattern.is_ascii() { + if !pattern.contains(is_like_pattern) { + return Ok(Self::IEqAscii(pattern)); + } else if pattern.ends_with('%') + && !pattern.ends_with("\\%") + && !pattern[..pattern.len() - 1].contains(is_like_pattern) + { + return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1])); + } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) + { + return Ok(Self::IEndsWithAscii(&pattern[1..])); + } + } + Ok(Self::Regex(regex_like(pattern, true)?)) + } + + /// Evaluate this predicate against the given haystack + pub fn evaluate(&self, haystack: &str) -> bool { + match self { Review Comment: this is pretty neat to mostly use the built in Rust code 👍 ########## arrow-string/src/like.rs: ########## @@ -15,719 +15,350 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::builder::BooleanBufferBuilder; -use arrow_array::cast::*; +use crate::predicate::Predicate; +use arrow_array::cast::AsArray; use arrow_array::*; -use arrow_buffer::NullBuffer; -use arrow_data::ArrayDataBuilder; use arrow_schema::*; use arrow_select::take::take; -use regex::Regex; -use std::collections::HashMap; - -/// Helper function to perform boolean lambda function on values from two array accessors, this -/// version does not attempt to use SIMD. -/// -/// Duplicated from `arrow_ord::comparison` -fn compare_op<T: ArrayAccessor, S: ArrayAccessor, F>( - left: T, - right: S, - op: F, -) -> Result<BooleanArray, ArrowError> -where - F: Fn(T::Item, S::Item) -> bool, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - Ok(BooleanArray::from_binary(left, right, op)) -} - -/// Helper function to perform boolean lambda function on values from array accessor, this -/// version does not attempt to use SIMD. -/// -/// Duplicated from `arrow_ord::comparison` -fn compare_op_scalar<T: ArrayAccessor, F>( - left: T, - op: F, -) -> Result<BooleanArray, ArrowError> -where - F: Fn(T::Item) -> bool, -{ - Ok(BooleanArray::from_unary(left, op)) +use std::sync::Arc; + +#[derive(Debug)] +enum Op { + Like(bool), + ILike(bool), + Contains, + StartsWith, + EndsWith, } -macro_rules! dyn_function { - ($sql:tt, $fn_name:tt, $fn_utf8:tt, $fn_dict:tt) => { -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn $fn_name(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray, ArrowError> { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.as_string::<i32>(); - let right = right.as_string::<i32>(); - $fn_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.as_string::<i64>(); - let right = right.as_string::<i64>(); - $fn_utf8(left, right) +impl std::fmt::Display for Op { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Op::Like(false) => write!(f, "LIKE"), + Op::Like(true) => write!(f, "NLIKE"), + Op::ILike(false) => write!(f, "ILIKE"), + Op::ILike(true) => write!(f, "NILIKE"), + Op::Contains => write!(f, "CONTAINS"), + Op::StartsWith => write!(f, "STARTS_WITH"), + Op::EndsWith => write!(f, "ENDS_WITH"), } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - $fn_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError(format!( - "{} only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))) - } - } -} - } } -dyn_function!("left LIKE right", like_dyn, like_utf8, like_dict); -dyn_function!("left NOT LIKE right", nlike_dyn, nlike_utf8, nlike_dict); -dyn_function!("left ILIKE right", ilike_dyn, ilike_utf8, ilike_dict); -dyn_function!("left NOT ILIKE right", nilike_dyn, nilike_utf8, nilike_dict); -dyn_function!( - "STARTSWITH(left, right)", - starts_with_dyn, - starts_with_utf8, - starts_with_dict -); -dyn_function!( - "ENDSWITH(left, right)", - ends_with_dyn, - ends_with_utf8, - ends_with_dict -); -dyn_function!( - "CONTAINS(left, right)", - contains_dyn, - contains_utf8, - contains_dict -); -macro_rules! scalar_dyn_function { - ($sql:tt, $fn_name:tt, $fn_scalar:tt) => { -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn $fn_name( - left: &dyn Array, - right: &str, -) -> Result<BooleanArray, ArrowError> { - match left.data_type() { - DataType::Utf8 => { - let left = left.as_string::<i32>(); - $fn_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.as_string::<i64>(); - $fn_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - let dict_comparison = $fn_name(left.values().as_ref(), right)?; - // TODO: Use take_boolean (#2967) - let array = take(&dict_comparison, left.keys(), None)?; - Ok(BooleanArray::from(array.to_data())) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError(format!( - "{} only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))) - } - } +/// Perform SQL `left LIKE right` +pub fn like(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::Like(false), left, right) } - } -} -scalar_dyn_function!("left LIKE right", like_utf8_scalar_dyn, like_scalar); -scalar_dyn_function!("left NOT LIKE right", nlike_utf8_scalar_dyn, nlike_scalar); -scalar_dyn_function!("left ILIKE right", ilike_utf8_scalar_dyn, ilike_scalar); -scalar_dyn_function!( - "left NOT ILIKE right", - nilike_utf8_scalar_dyn, - nilike_scalar -); -scalar_dyn_function!( - "STARTSWITH(left, right)", - starts_with_utf8_scalar_dyn, - starts_with_scalar -); -scalar_dyn_function!( - "ENDSWITH(left, right)", - ends_with_utf8_scalar_dyn, - ends_with_scalar -); -scalar_dyn_function!( - "CONTAINS(left, right)", - contains_utf8_scalar_dyn, - contains_scalar -); - -macro_rules! dict_function { - ($sql:tt, $fn_name:tt, $fn_impl:tt) => { - -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`DictionaryArray`] with values")] -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -#[cfg(feature = "dyn_cmp_dict")] -fn $fn_name<K: arrow_array::types::ArrowDictionaryKeyType>( - left: &DictionaryArray<K>, - right: &DictionaryArray<K>, -) -> Result<BooleanArray, ArrowError> { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap(); - let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap(); - $fn_impl(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap(); - let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap(); - - $fn_impl(left, right) - } - _ => Err(ArrowError::ComputeError(format!( - "{} only supports DictionaryArray with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))), - } -} - } +/// Perform SQL `left ILIKE right` +pub fn ilike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::ILike(false), left, right) } -dict_function!("left LIKE right", like_dict, like); -dict_function!("left NOT LIKE right", nlike_dict, nlike); -dict_function!("left ILIKE right", ilike_dict, ilike); -dict_function!("left NOT ILIKE right", nilike_dict, nilike); -dict_function!("STARTSWITH(left, right)", starts_with_dict, starts_with); -dict_function!("ENDSWITH(left, right)", ends_with_dict, ends_with); -dict_function!("CONTAINS(left, right)", contains_dict, contains); - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. -/// -/// There are two wildcards supported with the LIKE operator: -/// -/// 1. `%` - The percent sign represents zero, one, or multiple characters -/// 2. `_` - The underscore represents a single character -/// -/// For example: -/// ``` -/// use arrow_array::{StringArray, BooleanArray}; -/// use arrow_string::like::like_utf8; -/// -/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); -/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]); -/// -/// let result = like_utf8(&strings, &patterns).unwrap(); -/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); -/// ``` -pub fn like_utf8<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &GenericStringArray<OffsetSize>, -) -> Result<BooleanArray, ArrowError> { - like(left, right) +/// Perform SQL `left NOT LIKE right` +pub fn nlike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::Like(true), left, right) } -#[inline] -fn like<'a, S: ArrayAccessor<Item = &'a str>>( - left: S, - right: S, -) -> Result<BooleanArray, ArrowError> { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - }) - }) +/// Perform SQL `left NOT ILIKE right` +pub fn nilike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::ILike(true), left, right) } -#[inline] -fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor<Item = &'a str>>( - left: L, - right: &str, - op: F, +/// Perform SQL `STARTSWITH(left, right)` +pub fn starts_with( + left: &dyn Datum, + right: &dyn Datum, ) -> Result<BooleanArray, ArrowError> { - if !right.contains(is_like_pattern) { - // fast path, can use equals - Ok(BooleanArray::from_unary(left, |item| op(item == right))) - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let starts_with = &right[..right.len() - 1]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.starts_with(starts_with)) - })) - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_with = &right[1..]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.ends_with(ends_with)) - })) - } else if right.starts_with('%') - && right.ends_with('%') - && !right.ends_with("\\%") - && !right[1..right.len() - 1].contains(is_like_pattern) - { - let contains = &right[1..right.len() - 1]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.contains(contains)) - })) - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - })?; - - Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) - } + like_op(Op::StartsWith, left, right) } -#[inline] -fn like_scalar<'a, L: ArrayAccessor<Item = &'a str>>( - left: L, - right: &str, +/// Perform SQL `ENDSWITH(left, right)` +pub fn ends_with( + left: &dyn Datum, + right: &dyn Datum, ) -> Result<BooleanArray, ArrowError> { - like_scalar_op(left, right, |x| x) + like_op(Op::EndsWith, left, right) } -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &str, -) -> Result<BooleanArray, ArrowError> { - like_scalar(left, right) +/// Perform SQL `CONTAINS(left, right)` +pub fn contains(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::Contains, left, right) } -/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: -/// -/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` -/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.` -/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%` -fn replace_like_wildcards(pattern: &str) -> Result<String, ArrowError> { - let mut result = String::new(); - let pattern = String::from(pattern); - let mut chars_iter = pattern.chars().peekable(); - while let Some(c) = chars_iter.next() { - if c == '\\' { - let next = chars_iter.peek(); - match next { - Some(next) if is_like_pattern(*next) => { - result.push(*next); - // Skipping the next char as it is already appended - chars_iter.next(); - } - _ => { - result.push('\\'); - result.push('\\'); - } - } - } else if regex_syntax::is_meta_character(c) { - result.push('\\'); - result.push(c); - } else if c == '%' { - result.push_str(".*"); - } else if c == '_' { - result.push('.'); - } else { - result.push(c); - } +fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray, ArrowError> { + use arrow_schema::DataType::*; + let (l, l_s) = lhs.get(); + let (r, r_s) = rhs.get(); + + if l.len() != r.len() && !l_s && !r_s { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot compare arrays of different lengths, got {} vs {}", + l.len(), + r.len() + ))); } - Ok(result) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &GenericStringArray<OffsetSize>, -) -> Result<BooleanArray, ArrowError> { - nlike(left, right) -} - -#[inline] -fn nlike<'a, S: ArrayAccessor<Item = &'a str>>( - left: S, - right: S, -) -> Result<BooleanArray, ArrowError> { - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - }) - }) -} -#[inline] -fn nlike_scalar<'a, L: ArrayAccessor<Item = &'a str>>( - left: L, - right: &str, -) -> Result<BooleanArray, ArrowError> { - like_scalar_op(left, right, |x| !x) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &str, -) -> Result<BooleanArray, ArrowError> { - nlike_scalar(left, right) -} + let l_v = l.as_any_dictionary_opt(); + let l = l_v.map(|x| x.values().as_ref()).unwrap_or(l); -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// Case insensitive version of [`like_utf8`] -/// -/// Note: this only implements loose matching as defined by the Unicode standard. For example, -/// the `ff` ligature is not equivalent to `FF` and `ß` is not equivalent to `SS` -pub fn ilike_utf8<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &GenericStringArray<OffsetSize>, -) -> Result<BooleanArray, ArrowError> { - ilike(left, right) -} - -#[inline] -fn ilike<'a, S: ArrayAccessor<Item = &'a str>>( - left: S, - right: S, -) -> Result<BooleanArray, ArrowError> { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {e}" - )) - }) - }) -} + let r_v = r.as_any_dictionary_opt(); + let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r); -#[inline] -fn ilike_scalar_op<O: OffsetSizeTrait, F: Fn(bool) -> bool>( - left: &GenericStringArray<O>, - right: &str, - op: F, -) -> Result<BooleanArray, ArrowError> { - // If not ASCII faster to use case insensitive regex than using to_uppercase - if right.is_ascii() && left.is_ascii() { - if !right.contains(is_like_pattern) { - return Ok(BooleanArray::from_unary(left, |item| { - op(item.eq_ignore_ascii_case(right)) - })); - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let start_str = &right[..right.len() - 1]; - return Ok(BooleanArray::from_unary(left, |item| { - let end = item.len().min(start_str.len()); - let result = item.is_char_boundary(end) - && start_str.eq_ignore_ascii_case(&item[..end]); - op(result) - })); - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_str = &right[1..]; - return Ok(BooleanArray::from_unary(left, |item| { - let start = item.len().saturating_sub(ends_str.len()); - let result = item.is_char_boundary(start) - && ends_str.eq_ignore_ascii_case(&item[start..]); - op(result) - })); + match (l.data_type(), r.data_type()) { + (Utf8, Utf8) => { + apply::<i32>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) + } + (LargeUtf8, LargeUtf8) => { + apply::<i64>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) } + (l_t, r_t) => Err(ArrowError::InvalidArgumentError(format!( + "Invalid string operation: {l_t} {op} {r_t}" + ))), } - - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!("Unable to build regex from ILIKE pattern: {e}")) - })?; - - Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) } -#[inline] -fn ilike_scalar<O: OffsetSizeTrait>( - left: &GenericStringArray<O>, - right: &str, +fn apply<O: OffsetSizeTrait>( + op: Op, + l: &GenericStringArray<O>, + l_s: bool, + l_v: Option<&dyn AnyDictionaryArray>, + r: &GenericStringArray<O>, + r_s: bool, + r_v: Option<&dyn AnyDictionaryArray>, ) -> Result<BooleanArray, ArrowError> { - ilike_scalar_op(left, right, |x| x) -} - -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`ilike_utf8`] for more details. -pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &str, -) -> Result<BooleanArray, ArrowError> { - ilike_scalar(left, right) + let l_len = l_v.map(|l| l.len()).unwrap_or(l.len()); + if r_s { + let scalar = match r_v { + Some(dict) => match dict.nulls().filter(|n| n.null_count() != 0) { + Some(_) => return Ok(BooleanArray::new_null(l_len)), + None => { + let idx = dict.normalized_keys()[0]; + if r.is_null(idx) { + return Ok(BooleanArray::new_null(l_len)); + } + r.value(idx) + } + }, + None => r.value(0), + }; + op_scalar(op, l, l_v, scalar) + } else { Review Comment: This is quite a nice formulation ❤️ ########## arrow-string/src/like.rs: ########## @@ -936,34 +567,6 @@ mod tests { vec![true] ); - #[test] - fn test_replace_like_wildcards() { Review Comment: I think these tests should be carried over as tests for `fn regex_like(pattern: &str, case_insensitive: bool) -> Result<Regex, ArrowError> { ` ########## arrow-string/src/predicate.rs: ########## @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{BooleanArray, GenericStringArray, OffsetSizeTrait}; +use arrow_schema::ArrowError; +use regex::{Regex, RegexBuilder}; + +/// A string based predicate +pub enum Predicate<'a> { + Eq(&'a str), + IEqAscii(&'a str), Review Comment: maybe it is obvious to others that `IEqAscii` means "case insensitive ascii equal" but it might help to explicitly explain what each enum here is ########## arrow-string/src/like.rs: ########## @@ -15,719 +15,350 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::builder::BooleanBufferBuilder; -use arrow_array::cast::*; +use crate::predicate::Predicate; +use arrow_array::cast::AsArray; use arrow_array::*; -use arrow_buffer::NullBuffer; -use arrow_data::ArrayDataBuilder; use arrow_schema::*; use arrow_select::take::take; -use regex::Regex; -use std::collections::HashMap; - -/// Helper function to perform boolean lambda function on values from two array accessors, this -/// version does not attempt to use SIMD. -/// -/// Duplicated from `arrow_ord::comparison` -fn compare_op<T: ArrayAccessor, S: ArrayAccessor, F>( - left: T, - right: S, - op: F, -) -> Result<BooleanArray, ArrowError> -where - F: Fn(T::Item, S::Item) -> bool, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - Ok(BooleanArray::from_binary(left, right, op)) -} - -/// Helper function to perform boolean lambda function on values from array accessor, this -/// version does not attempt to use SIMD. -/// -/// Duplicated from `arrow_ord::comparison` -fn compare_op_scalar<T: ArrayAccessor, F>( - left: T, - op: F, -) -> Result<BooleanArray, ArrowError> -where - F: Fn(T::Item) -> bool, -{ - Ok(BooleanArray::from_unary(left, op)) +use std::sync::Arc; + +#[derive(Debug)] +enum Op { + Like(bool), + ILike(bool), + Contains, + StartsWith, + EndsWith, } -macro_rules! dyn_function { - ($sql:tt, $fn_name:tt, $fn_utf8:tt, $fn_dict:tt) => { -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn $fn_name(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray, ArrowError> { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.as_string::<i32>(); - let right = right.as_string::<i32>(); - $fn_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.as_string::<i64>(); - let right = right.as_string::<i64>(); - $fn_utf8(left, right) +impl std::fmt::Display for Op { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Op::Like(false) => write!(f, "LIKE"), + Op::Like(true) => write!(f, "NLIKE"), + Op::ILike(false) => write!(f, "ILIKE"), + Op::ILike(true) => write!(f, "NILIKE"), + Op::Contains => write!(f, "CONTAINS"), + Op::StartsWith => write!(f, "STARTS_WITH"), + Op::EndsWith => write!(f, "ENDS_WITH"), } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - $fn_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError(format!( - "{} only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))) - } - } -} - } } -dyn_function!("left LIKE right", like_dyn, like_utf8, like_dict); -dyn_function!("left NOT LIKE right", nlike_dyn, nlike_utf8, nlike_dict); -dyn_function!("left ILIKE right", ilike_dyn, ilike_utf8, ilike_dict); -dyn_function!("left NOT ILIKE right", nilike_dyn, nilike_utf8, nilike_dict); -dyn_function!( - "STARTSWITH(left, right)", - starts_with_dyn, - starts_with_utf8, - starts_with_dict -); -dyn_function!( - "ENDSWITH(left, right)", - ends_with_dyn, - ends_with_utf8, - ends_with_dict -); -dyn_function!( - "CONTAINS(left, right)", - contains_dyn, - contains_utf8, - contains_dict -); -macro_rules! scalar_dyn_function { - ($sql:tt, $fn_name:tt, $fn_scalar:tt) => { -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn $fn_name( - left: &dyn Array, - right: &str, -) -> Result<BooleanArray, ArrowError> { - match left.data_type() { - DataType::Utf8 => { - let left = left.as_string::<i32>(); - $fn_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.as_string::<i64>(); - $fn_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - let dict_comparison = $fn_name(left.values().as_ref(), right)?; - // TODO: Use take_boolean (#2967) - let array = take(&dict_comparison, left.keys(), None)?; - Ok(BooleanArray::from(array.to_data())) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError(format!( - "{} only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))) - } - } +/// Perform SQL `left LIKE right` +pub fn like(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::Like(false), left, right) } - } -} -scalar_dyn_function!("left LIKE right", like_utf8_scalar_dyn, like_scalar); -scalar_dyn_function!("left NOT LIKE right", nlike_utf8_scalar_dyn, nlike_scalar); -scalar_dyn_function!("left ILIKE right", ilike_utf8_scalar_dyn, ilike_scalar); -scalar_dyn_function!( - "left NOT ILIKE right", - nilike_utf8_scalar_dyn, - nilike_scalar -); -scalar_dyn_function!( - "STARTSWITH(left, right)", - starts_with_utf8_scalar_dyn, - starts_with_scalar -); -scalar_dyn_function!( - "ENDSWITH(left, right)", - ends_with_utf8_scalar_dyn, - ends_with_scalar -); -scalar_dyn_function!( - "CONTAINS(left, right)", - contains_utf8_scalar_dyn, - contains_scalar -); - -macro_rules! dict_function { - ($sql:tt, $fn_name:tt, $fn_impl:tt) => { - -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`DictionaryArray`] with values")] -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -#[cfg(feature = "dyn_cmp_dict")] -fn $fn_name<K: arrow_array::types::ArrowDictionaryKeyType>( - left: &DictionaryArray<K>, - right: &DictionaryArray<K>, -) -> Result<BooleanArray, ArrowError> { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap(); - let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap(); - $fn_impl(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap(); - let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap(); - - $fn_impl(left, right) - } - _ => Err(ArrowError::ComputeError(format!( - "{} only supports DictionaryArray with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))), - } -} - } +/// Perform SQL `left ILIKE right` +pub fn ilike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::ILike(false), left, right) } -dict_function!("left LIKE right", like_dict, like); -dict_function!("left NOT LIKE right", nlike_dict, nlike); -dict_function!("left ILIKE right", ilike_dict, ilike); -dict_function!("left NOT ILIKE right", nilike_dict, nilike); -dict_function!("STARTSWITH(left, right)", starts_with_dict, starts_with); -dict_function!("ENDSWITH(left, right)", ends_with_dict, ends_with); -dict_function!("CONTAINS(left, right)", contains_dict, contains); - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. -/// -/// There are two wildcards supported with the LIKE operator: -/// -/// 1. `%` - The percent sign represents zero, one, or multiple characters -/// 2. `_` - The underscore represents a single character -/// -/// For example: -/// ``` -/// use arrow_array::{StringArray, BooleanArray}; -/// use arrow_string::like::like_utf8; -/// -/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); -/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]); -/// -/// let result = like_utf8(&strings, &patterns).unwrap(); -/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); -/// ``` -pub fn like_utf8<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &GenericStringArray<OffsetSize>, -) -> Result<BooleanArray, ArrowError> { - like(left, right) +/// Perform SQL `left NOT LIKE right` +pub fn nlike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::Like(true), left, right) } -#[inline] -fn like<'a, S: ArrayAccessor<Item = &'a str>>( - left: S, - right: S, -) -> Result<BooleanArray, ArrowError> { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - }) - }) +/// Perform SQL `left NOT ILIKE right` +pub fn nilike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::ILike(true), left, right) } -#[inline] -fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor<Item = &'a str>>( - left: L, - right: &str, - op: F, +/// Perform SQL `STARTSWITH(left, right)` +pub fn starts_with( + left: &dyn Datum, + right: &dyn Datum, ) -> Result<BooleanArray, ArrowError> { - if !right.contains(is_like_pattern) { - // fast path, can use equals - Ok(BooleanArray::from_unary(left, |item| op(item == right))) - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let starts_with = &right[..right.len() - 1]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.starts_with(starts_with)) - })) - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_with = &right[1..]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.ends_with(ends_with)) - })) - } else if right.starts_with('%') - && right.ends_with('%') - && !right.ends_with("\\%") - && !right[1..right.len() - 1].contains(is_like_pattern) - { - let contains = &right[1..right.len() - 1]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.contains(contains)) - })) - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - })?; - - Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) - } + like_op(Op::StartsWith, left, right) } -#[inline] -fn like_scalar<'a, L: ArrayAccessor<Item = &'a str>>( - left: L, - right: &str, +/// Perform SQL `ENDSWITH(left, right)` +pub fn ends_with( + left: &dyn Datum, + right: &dyn Datum, ) -> Result<BooleanArray, ArrowError> { - like_scalar_op(left, right, |x| x) + like_op(Op::EndsWith, left, right) } -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &str, -) -> Result<BooleanArray, ArrowError> { - like_scalar(left, right) +/// Perform SQL `CONTAINS(left, right)` +pub fn contains(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::Contains, left, right) } -/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: -/// -/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` -/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.` -/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%` -fn replace_like_wildcards(pattern: &str) -> Result<String, ArrowError> { - let mut result = String::new(); - let pattern = String::from(pattern); - let mut chars_iter = pattern.chars().peekable(); - while let Some(c) = chars_iter.next() { - if c == '\\' { - let next = chars_iter.peek(); - match next { - Some(next) if is_like_pattern(*next) => { - result.push(*next); - // Skipping the next char as it is already appended - chars_iter.next(); - } - _ => { - result.push('\\'); - result.push('\\'); - } - } - } else if regex_syntax::is_meta_character(c) { - result.push('\\'); - result.push(c); - } else if c == '%' { - result.push_str(".*"); - } else if c == '_' { - result.push('.'); - } else { - result.push(c); - } +fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray, ArrowError> { + use arrow_schema::DataType::*; + let (l, l_s) = lhs.get(); + let (r, r_s) = rhs.get(); + + if l.len() != r.len() && !l_s && !r_s { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot compare arrays of different lengths, got {} vs {}", + l.len(), + r.len() + ))); } - Ok(result) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &GenericStringArray<OffsetSize>, -) -> Result<BooleanArray, ArrowError> { - nlike(left, right) -} - -#[inline] -fn nlike<'a, S: ArrayAccessor<Item = &'a str>>( - left: S, - right: S, -) -> Result<BooleanArray, ArrowError> { - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - }) - }) -} -#[inline] -fn nlike_scalar<'a, L: ArrayAccessor<Item = &'a str>>( - left: L, - right: &str, -) -> Result<BooleanArray, ArrowError> { - like_scalar_op(left, right, |x| !x) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &str, -) -> Result<BooleanArray, ArrowError> { - nlike_scalar(left, right) -} + let l_v = l.as_any_dictionary_opt(); + let l = l_v.map(|x| x.values().as_ref()).unwrap_or(l); -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// Case insensitive version of [`like_utf8`] -/// -/// Note: this only implements loose matching as defined by the Unicode standard. For example, Review Comment: this detail, likewise would be nice to transfer over ########## arrow-string/src/like.rs: ########## @@ -15,719 +15,350 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::builder::BooleanBufferBuilder; -use arrow_array::cast::*; +use crate::predicate::Predicate; +use arrow_array::cast::AsArray; use arrow_array::*; -use arrow_buffer::NullBuffer; -use arrow_data::ArrayDataBuilder; use arrow_schema::*; use arrow_select::take::take; -use regex::Regex; -use std::collections::HashMap; - -/// Helper function to perform boolean lambda function on values from two array accessors, this -/// version does not attempt to use SIMD. -/// -/// Duplicated from `arrow_ord::comparison` -fn compare_op<T: ArrayAccessor, S: ArrayAccessor, F>( - left: T, - right: S, - op: F, -) -> Result<BooleanArray, ArrowError> -where - F: Fn(T::Item, S::Item) -> bool, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - Ok(BooleanArray::from_binary(left, right, op)) -} - -/// Helper function to perform boolean lambda function on values from array accessor, this -/// version does not attempt to use SIMD. -/// -/// Duplicated from `arrow_ord::comparison` -fn compare_op_scalar<T: ArrayAccessor, F>( - left: T, - op: F, -) -> Result<BooleanArray, ArrowError> -where - F: Fn(T::Item) -> bool, -{ - Ok(BooleanArray::from_unary(left, op)) +use std::sync::Arc; + +#[derive(Debug)] +enum Op { + Like(bool), + ILike(bool), + Contains, + StartsWith, + EndsWith, } -macro_rules! dyn_function { - ($sql:tt, $fn_name:tt, $fn_utf8:tt, $fn_dict:tt) => { -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn $fn_name(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray, ArrowError> { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.as_string::<i32>(); - let right = right.as_string::<i32>(); - $fn_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.as_string::<i64>(); - let right = right.as_string::<i64>(); - $fn_utf8(left, right) +impl std::fmt::Display for Op { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Op::Like(false) => write!(f, "LIKE"), + Op::Like(true) => write!(f, "NLIKE"), + Op::ILike(false) => write!(f, "ILIKE"), + Op::ILike(true) => write!(f, "NILIKE"), + Op::Contains => write!(f, "CONTAINS"), + Op::StartsWith => write!(f, "STARTS_WITH"), + Op::EndsWith => write!(f, "ENDS_WITH"), } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - $fn_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError(format!( - "{} only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))) - } - } -} - } } -dyn_function!("left LIKE right", like_dyn, like_utf8, like_dict); -dyn_function!("left NOT LIKE right", nlike_dyn, nlike_utf8, nlike_dict); -dyn_function!("left ILIKE right", ilike_dyn, ilike_utf8, ilike_dict); -dyn_function!("left NOT ILIKE right", nilike_dyn, nilike_utf8, nilike_dict); -dyn_function!( - "STARTSWITH(left, right)", - starts_with_dyn, - starts_with_utf8, - starts_with_dict -); -dyn_function!( - "ENDSWITH(left, right)", - ends_with_dyn, - ends_with_utf8, - ends_with_dict -); -dyn_function!( - "CONTAINS(left, right)", - contains_dyn, - contains_utf8, - contains_dict -); -macro_rules! scalar_dyn_function { - ($sql:tt, $fn_name:tt, $fn_scalar:tt) => { -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn $fn_name( - left: &dyn Array, - right: &str, -) -> Result<BooleanArray, ArrowError> { - match left.data_type() { - DataType::Utf8 => { - let left = left.as_string::<i32>(); - $fn_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.as_string::<i64>(); - $fn_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - let dict_comparison = $fn_name(left.values().as_ref(), right)?; - // TODO: Use take_boolean (#2967) - let array = take(&dict_comparison, left.keys(), None)?; - Ok(BooleanArray::from(array.to_data())) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError(format!( - "{} only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))) - } - } +/// Perform SQL `left LIKE right` +pub fn like(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::Like(false), left, right) } - } -} -scalar_dyn_function!("left LIKE right", like_utf8_scalar_dyn, like_scalar); -scalar_dyn_function!("left NOT LIKE right", nlike_utf8_scalar_dyn, nlike_scalar); -scalar_dyn_function!("left ILIKE right", ilike_utf8_scalar_dyn, ilike_scalar); -scalar_dyn_function!( - "left NOT ILIKE right", - nilike_utf8_scalar_dyn, - nilike_scalar -); -scalar_dyn_function!( - "STARTSWITH(left, right)", - starts_with_utf8_scalar_dyn, - starts_with_scalar -); -scalar_dyn_function!( - "ENDSWITH(left, right)", - ends_with_utf8_scalar_dyn, - ends_with_scalar -); -scalar_dyn_function!( - "CONTAINS(left, right)", - contains_utf8_scalar_dyn, - contains_scalar -); - -macro_rules! dict_function { - ($sql:tt, $fn_name:tt, $fn_impl:tt) => { - -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`DictionaryArray`] with values")] -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -#[cfg(feature = "dyn_cmp_dict")] -fn $fn_name<K: arrow_array::types::ArrowDictionaryKeyType>( - left: &DictionaryArray<K>, - right: &DictionaryArray<K>, -) -> Result<BooleanArray, ArrowError> { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap(); - let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap(); - $fn_impl(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap(); - let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap(); - - $fn_impl(left, right) - } - _ => Err(ArrowError::ComputeError(format!( - "{} only supports DictionaryArray with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))), - } -} - } +/// Perform SQL `left ILIKE right` +pub fn ilike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::ILike(false), left, right) } -dict_function!("left LIKE right", like_dict, like); -dict_function!("left NOT LIKE right", nlike_dict, nlike); -dict_function!("left ILIKE right", ilike_dict, ilike); -dict_function!("left NOT ILIKE right", nilike_dict, nilike); -dict_function!("STARTSWITH(left, right)", starts_with_dict, starts_with); -dict_function!("ENDSWITH(left, right)", ends_with_dict, ends_with); -dict_function!("CONTAINS(left, right)", contains_dict, contains); - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. -/// -/// There are two wildcards supported with the LIKE operator: -/// -/// 1. `%` - The percent sign represents zero, one, or multiple characters -/// 2. `_` - The underscore represents a single character -/// -/// For example: -/// ``` -/// use arrow_array::{StringArray, BooleanArray}; -/// use arrow_string::like::like_utf8; -/// -/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); -/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]); -/// -/// let result = like_utf8(&strings, &patterns).unwrap(); -/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); -/// ``` -pub fn like_utf8<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &GenericStringArray<OffsetSize>, -) -> Result<BooleanArray, ArrowError> { - like(left, right) +/// Perform SQL `left NOT LIKE right` +pub fn nlike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::Like(true), left, right) } -#[inline] -fn like<'a, S: ArrayAccessor<Item = &'a str>>( - left: S, - right: S, -) -> Result<BooleanArray, ArrowError> { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - }) - }) +/// Perform SQL `left NOT ILIKE right` +pub fn nilike(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::ILike(true), left, right) } -#[inline] -fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor<Item = &'a str>>( - left: L, - right: &str, - op: F, +/// Perform SQL `STARTSWITH(left, right)` +pub fn starts_with( + left: &dyn Datum, + right: &dyn Datum, ) -> Result<BooleanArray, ArrowError> { - if !right.contains(is_like_pattern) { - // fast path, can use equals - Ok(BooleanArray::from_unary(left, |item| op(item == right))) - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let starts_with = &right[..right.len() - 1]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.starts_with(starts_with)) - })) - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_with = &right[1..]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.ends_with(ends_with)) - })) - } else if right.starts_with('%') - && right.ends_with('%') - && !right.ends_with("\\%") - && !right[1..right.len() - 1].contains(is_like_pattern) - { - let contains = &right[1..right.len() - 1]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.contains(contains)) - })) - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - })?; - - Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) - } + like_op(Op::StartsWith, left, right) } -#[inline] -fn like_scalar<'a, L: ArrayAccessor<Item = &'a str>>( - left: L, - right: &str, +/// Perform SQL `ENDSWITH(left, right)` +pub fn ends_with( + left: &dyn Datum, + right: &dyn Datum, ) -> Result<BooleanArray, ArrowError> { - like_scalar_op(left, right, |x| x) + like_op(Op::EndsWith, left, right) } -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &str, -) -> Result<BooleanArray, ArrowError> { - like_scalar(left, right) +/// Perform SQL `CONTAINS(left, right)` +pub fn contains(left: &dyn Datum, right: &dyn Datum) -> Result<BooleanArray, ArrowError> { + like_op(Op::Contains, left, right) } -/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: -/// -/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` -/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.` -/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%` -fn replace_like_wildcards(pattern: &str) -> Result<String, ArrowError> { - let mut result = String::new(); - let pattern = String::from(pattern); - let mut chars_iter = pattern.chars().peekable(); - while let Some(c) = chars_iter.next() { - if c == '\\' { - let next = chars_iter.peek(); - match next { - Some(next) if is_like_pattern(*next) => { - result.push(*next); - // Skipping the next char as it is already appended - chars_iter.next(); - } - _ => { - result.push('\\'); - result.push('\\'); - } - } - } else if regex_syntax::is_meta_character(c) { - result.push('\\'); - result.push(c); - } else if c == '%' { - result.push_str(".*"); - } else if c == '_' { - result.push('.'); - } else { - result.push(c); - } +fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray, ArrowError> { + use arrow_schema::DataType::*; + let (l, l_s) = lhs.get(); + let (r, r_s) = rhs.get(); + + if l.len() != r.len() && !l_s && !r_s { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot compare arrays of different lengths, got {} vs {}", + l.len(), + r.len() + ))); } - Ok(result) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &GenericStringArray<OffsetSize>, -) -> Result<BooleanArray, ArrowError> { - nlike(left, right) -} - -#[inline] -fn nlike<'a, S: ArrayAccessor<Item = &'a str>>( - left: S, - right: S, -) -> Result<BooleanArray, ArrowError> { - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - }) - }) -} -#[inline] -fn nlike_scalar<'a, L: ArrayAccessor<Item = &'a str>>( - left: L, - right: &str, -) -> Result<BooleanArray, ArrowError> { - like_scalar_op(left, right, |x| !x) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &str, -) -> Result<BooleanArray, ArrowError> { - nlike_scalar(left, right) -} + let l_v = l.as_any_dictionary_opt(); + let l = l_v.map(|x| x.values().as_ref()).unwrap_or(l); -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// Case insensitive version of [`like_utf8`] -/// -/// Note: this only implements loose matching as defined by the Unicode standard. For example, -/// the `ff` ligature is not equivalent to `FF` and `ß` is not equivalent to `SS` -pub fn ilike_utf8<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &GenericStringArray<OffsetSize>, -) -> Result<BooleanArray, ArrowError> { - ilike(left, right) -} - -#[inline] -fn ilike<'a, S: ArrayAccessor<Item = &'a str>>( - left: S, - right: S, -) -> Result<BooleanArray, ArrowError> { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {e}" - )) - }) - }) -} + let r_v = r.as_any_dictionary_opt(); + let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r); -#[inline] -fn ilike_scalar_op<O: OffsetSizeTrait, F: Fn(bool) -> bool>( - left: &GenericStringArray<O>, - right: &str, - op: F, -) -> Result<BooleanArray, ArrowError> { - // If not ASCII faster to use case insensitive regex than using to_uppercase - if right.is_ascii() && left.is_ascii() { - if !right.contains(is_like_pattern) { - return Ok(BooleanArray::from_unary(left, |item| { - op(item.eq_ignore_ascii_case(right)) - })); - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let start_str = &right[..right.len() - 1]; - return Ok(BooleanArray::from_unary(left, |item| { - let end = item.len().min(start_str.len()); - let result = item.is_char_boundary(end) - && start_str.eq_ignore_ascii_case(&item[..end]); - op(result) - })); - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_str = &right[1..]; - return Ok(BooleanArray::from_unary(left, |item| { - let start = item.len().saturating_sub(ends_str.len()); - let result = item.is_char_boundary(start) - && ends_str.eq_ignore_ascii_case(&item[start..]); - op(result) - })); + match (l.data_type(), r.data_type()) { + (Utf8, Utf8) => { + apply::<i32>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) + } + (LargeUtf8, LargeUtf8) => { + apply::<i64>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) } + (l_t, r_t) => Err(ArrowError::InvalidArgumentError(format!( + "Invalid string operation: {l_t} {op} {r_t}" + ))), } - - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!("Unable to build regex from ILIKE pattern: {e}")) - })?; - - Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) } -#[inline] -fn ilike_scalar<O: OffsetSizeTrait>( - left: &GenericStringArray<O>, - right: &str, +fn apply<O: OffsetSizeTrait>( + op: Op, + l: &GenericStringArray<O>, + l_s: bool, + l_v: Option<&dyn AnyDictionaryArray>, + r: &GenericStringArray<O>, + r_s: bool, + r_v: Option<&dyn AnyDictionaryArray>, ) -> Result<BooleanArray, ArrowError> { - ilike_scalar_op(left, right, |x| x) -} - -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`ilike_utf8`] for more details. -pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>( - left: &GenericStringArray<OffsetSize>, - right: &str, -) -> Result<BooleanArray, ArrowError> { - ilike_scalar(left, right) + let l_len = l_v.map(|l| l.len()).unwrap_or(l.len()); + if r_s { + let scalar = match r_v { + Some(dict) => match dict.nulls().filter(|n| n.null_count() != 0) { + Some(_) => return Ok(BooleanArray::new_null(l_len)), + None => { + let idx = dict.normalized_keys()[0]; + if r.is_null(idx) { + return Ok(BooleanArray::new_null(l_len)); + } + r.value(idx) + } + }, + None => r.value(0), + }; + op_scalar(op, l, l_v, scalar) + } else { Review Comment: This is quite a nice formulation ❤️ -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
