This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch cherry_pick_ee1d1644 in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit 1859aa0a381224e1c794b0ce8575b3eba293f472 Author: Jordan Deitch <jwdei...@users.noreply.github.com> AuthorDate: Tue Nov 9 08:56:18 2021 -0500 add ilike comparitor (#874) * add ilike comparitor * add ilike comparitor Co-authored-by: Jordan Deitch <jdei...@digitalocean.com> --- arrow/benches/comparison_kernels.rs | 25 +++ arrow/src/compute/kernels/comparison.rs | 270 +++++++++++++++++++++++--------- 2 files changed, 217 insertions(+), 78 deletions(-) diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index bfee9b9..94ff7df 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -119,6 +119,11 @@ fn bench_nlike_utf8_scalar(arr_a: &StringArray, value_b: &str) { .unwrap(); } +fn bench_ilike_utf8_scalar(arr_a: &StringArray, value_b: &str) { + ilike_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)) + .unwrap(); +} + fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) { regexp_is_match_utf8_scalar( criterion::black_box(arr_a), @@ -205,6 +210,26 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_nlike_utf8_scalar(&arr_string, "%xx_xx%xxx")) }); + c.bench_function("ilike_utf8 scalar equals", |b| { + b.iter(|| bench_ilike_utf8_scalar(&arr_string, "xxXX")) + }); + + c.bench_function("ilike_utf8 scalar contains", |b| { + b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xxXX%")) + }); + + c.bench_function("ilike_utf8 scalar ends with", |b| { + b.iter(|| bench_ilike_utf8_scalar(&arr_string, "xXXx%")) + }); + + c.bench_function("ilike_utf8 scalar starts with", |b| { + b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%XXXx")) + }); + + c.bench_function("ilike_utf8 scalar complex", |b| { + b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xx_xX%xXX")) + }); + c.bench_function("egexp_matches_utf8 scalar starts with", |b| { b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "^xx")) }); diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 9d49e89..3b65f33 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -228,28 +228,23 @@ where compare_op_scalar_primitive!(left, right, op) } -/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. -/// -/// There are two wildcards supported with the LIKE operator: -/// -/// 1. `%` - The percent sign represents zero, one, or multiple characters -/// 2. `_` - The underscore represents a single character -/// -/// For example: -/// ``` -/// use arrow::array::{StringArray, BooleanArray}; -/// use arrow::compute::like_utf8; -/// -/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); -/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A."]); +fn is_like_pattern(c: char) -> bool { + c == '%' || c == '_' +} + +/// Evaluate regex `op(left)` matching `right` on [`StringArray`] / [`LargeStringArray`] /// -/// let result = like_utf8(&strings, &patterns).unwrap(); -/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); -/// ``` -pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>( +/// If `negate_regex` is true, the regex expression will be negated. (for example, with `not like`) +fn regex_like<OffsetSize, F>( left: &GenericStringArray<OffsetSize>, right: &GenericStringArray<OffsetSize>, -) -> Result<BooleanArray> { + negate_regex: bool, + op: F, +) -> Result<BooleanArray> +where + OffsetSize: StringOffsetSizeTrait, + F: Fn(&str) -> Result<Regex>, +{ let mut map = HashMap::new(); if left.len() != right.len() { return Err(ArrowError::ComputeError( @@ -269,17 +264,16 @@ pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>( regex } else { let re_pattern = pat.replace("%", ".*").replace("_", "."); - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - })?; + let re = op(&re_pattern)?; map.insert(pat, re); map.get(pat).unwrap() }; - result.append(re.is_match(haystack)); + result.append(if negate_regex { + !re.is_match(haystack) + } else { + re.is_match(haystack) + }); } let data = unsafe { @@ -296,8 +290,36 @@ pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>( Ok(BooleanArray::from(data)) } -fn is_like_pattern(c: char) -> bool { - c == '%' || c == '_' +/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. +/// +/// There are two wildcards supported with the LIKE operator: +/// +/// 1. `%` - The percent sign represents zero, one, or multiple characters +/// 2. `_` - The underscore represents a single character +/// +/// For example: +/// ``` +/// use arrow::array::{StringArray, BooleanArray}; +/// use arrow::compute::like_utf8; +/// +/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); +/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A."]); +/// +/// let result = like_utf8(&strings, &patterns).unwrap(); +/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); +/// ``` +pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>( + left: &GenericStringArray<OffsetSize>, + right: &GenericStringArray<OffsetSize>, +) -> Result<BooleanArray> { + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) } /// Perform SQL `left LIKE right` operation on [`StringArray`] / @@ -376,36 +398,55 @@ pub fn nlike_utf8<OffsetSize: StringOffsetSizeTrait>( left: &GenericStringArray<OffsetSize>, right: &GenericStringArray<OffsetSize>, ) -> Result<BooleanArray> { - let mut map = HashMap::new(); - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let null_bit_buffer = - combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) +} +/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nlike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>( + left: &GenericStringArray<OffsetSize>, + right: &str, +) -> Result<BooleanArray> { + let null_bit_buffer = left.data().null_buffer().cloned(); let mut result = BooleanBufferBuilder::new(left.len()); - for i in 0..left.len() { - let haystack = left.value(i); - let pat = right.value(i); - let re = if let Some(ref regex) = map.get(pat) { - regex - } else { - let re_pattern = pat.replace("%", ".*").replace("_", "."); - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - })?; - map.insert(pat, re); - map.get(pat).unwrap() - }; - result.append(!re.is_match(haystack)); + if !right.contains(is_like_pattern) { + // fast path, can use equals + for i in 0..left.len() { + result.append(left.value(i) != right); + } + } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use ends_with + for i in 0..left.len() { + result.append(!left.value(i).starts_with(&right[..right.len() - 1])); + } + } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { + // fast path, can use starts_with + for i in 0..left.len() { + result.append(!left.value(i).ends_with(&right[1..])); + } + } else { + let re_pattern = right.replace("%", ".*").replace("_", "."); + let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + })?; + for i in 0..left.len() { + let haystack = left.value(i); + result.append(!re.is_match(haystack)); + } } let data = unsafe { @@ -422,11 +463,29 @@ pub fn nlike_utf8<OffsetSize: StringOffsetSizeTrait>( Ok(BooleanArray::from(data)) } -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / +/// Perform SQL `left ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn ilike_utf8<OffsetSize: StringOffsetSizeTrait>( + left: &GenericStringArray<OffsetSize>, + right: &GenericStringArray<OffsetSize>, +) -> Result<BooleanArray> { + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) +} + +/// Perform SQL `left ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// /// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>( +pub fn ilike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>( left: &GenericStringArray<OffsetSize>, right: &str, ) -> Result<BooleanArray> { @@ -436,30 +495,38 @@ pub fn nlike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>( if !right.contains(is_like_pattern) { // fast path, can use equals for i in 0..left.len() { - result.append(left.value(i) != right); + result.append(left.value(i) == right); } } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use ends_with for i in 0..left.len() { - result.append(!left.value(i).starts_with(&right[..right.len() - 1])); + result.append( + left.value(i) + .to_uppercase() + .starts_with(&right[..right.len() - 1].to_uppercase()), + ); } } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { // fast path, can use starts_with for i in 0..left.len() { - result.append(!left.value(i).ends_with(&right[1..])); + result.append( + left.value(i) + .to_uppercase() + .ends_with(&right[1..].to_uppercase()), + ); } } else { let re_pattern = right.replace("%", ".*").replace("_", "."); - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", + "Unable to build regex from ILIKE pattern: {}", e )) })?; for i in 0..left.len() { let haystack = left.value(i); - result.append(!re.is_match(haystack)); + result.append(re.is_match(haystack)); } } @@ -2128,21 +2195,6 @@ mod tests { ); test_utf8!( - test_utf8_array_nlike, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], - nlike_utf8, - vec![false, false, false, true, true, false, true] - ); - test_utf8_scalar!( - test_utf8_array_nlike_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], - "%ar%", - nlike_utf8_scalar, - vec![false, false, true, true] - ); - - test_utf8!( test_utf8_array_eq, vec!["arrow", "arrow", "arrow", "arrow"], vec!["arrow", "parquet", "datafusion", "flight"], @@ -2157,6 +2209,21 @@ mod tests { vec![true, false, false, false] ); + test_utf8!( + test_utf8_array_nlike, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], + vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], + nlike_utf8, + vec![false, false, false, true, true, false, true] + ); + test_utf8_scalar!( + test_utf8_array_nlike_scalar, + vec!["arrow", "parquet", "datafusion", "flight"], + "%ar%", + nlike_utf8_scalar, + vec![false, false, true, true] + ); + test_utf8_scalar!( test_utf8_array_nlike_scalar_start, vec!["arrow", "parrow", "arrows", "arr"], @@ -2190,6 +2257,53 @@ mod tests { ); test_utf8!( + test_utf8_array_ilike, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], + ilike_utf8, + vec![true, true, true, false, false, true, false] + ); + test_utf8_scalar!( + test_utf8_array_ilike_scalar, + vec!["arrow", "parquet", "datafusion", "flight"], + "%AR%", + ilike_utf8_scalar, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_start, + vec!["arrow", "parrow", "arrows", "ARR"], + "aRRow%", + ilike_utf8_scalar, + vec![true, false, true, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_end, + vec!["ArroW", "parrow", "ARRowS", "arr"], + "%arrow", + ilike_utf8_scalar, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_equals, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow", + ilike_utf8_scalar, + vec![true, false, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + ilike_utf8_scalar, + vec![false, true, false, false] + ); + + test_utf8!( test_utf8_array_neq, vec!["arrow", "arrow", "arrow", "arrow"], vec!["arrow", "parquet", "datafusion", "flight"],