This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new f78d2e657 Fix escaped like wildcards in `like_utf8` / `nlike_utf8`
kernels (#2258)
f78d2e657 is described below
commit f78d2e6572c5b60d00a15084875d90d3e31c9b74
Author: Daniel Martinez Maqueda
<[email protected]>
AuthorDate: Wed Aug 3 18:22:45 2022 +0200
Fix escaped like wildcards in `like_utf8` / `nlike_utf8` kernels (#2258)
* Fix escaped like wildcards
Added a new function that replaces the like wildcards '%' and '_' for
the regex counterparts before executing them. It also takes into account
that the wildcards can be escaped, in that case, it does remove the
escape characters and leaves the wildcards so that they are matched
against the raw character.
This is implemented iterating over all the characters of the pattern to
figure out when it needs to be transformed or not.
* Rewrite logic with peek after PR feedback
* Simplifly logic
* Add documentation and refactor string creation in tests
* Add small fix and cargo fmt
---
arrow/Cargo.toml | 1 +
arrow/src/compute/kernels/comparison.rs | 109 +++++++++++++++++++++++++++++---
2 files changed, 100 insertions(+), 10 deletions(-)
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index d0a7c73ae..dcecdb674 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -49,6 +49,7 @@ half = { version = "2.0", default-features = false }
hashbrown = { version = "0.12", default-features = false }
csv_crate = { version = "1.1", default-features = false, optional = true,
package="csv" }
regex = { version = "1.5.6", default-features = false, features = ["std",
"unicode"] }
+regex-syntax = { version = "0.6.27", default-features = false, features =
["unicode"] }
lazy_static = { version = "1.4", default-features = false }
packed_simd = { version = "0.3", default-features = false, optional = true,
package = "packed_simd_2" }
chrono = { version = "0.4", default-features = false, features = ["clock"] }
diff --git a/arrow/src/compute/kernels/comparison.rs
b/arrow/src/compute/kernels/comparison.rs
index 7733ce67a..e4187ef87 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -35,7 +35,7 @@ use crate::datatypes::{
};
use crate::error::{ArrowError, Result};
use crate::util::bit_util;
-use regex::{escape, Regex};
+use regex::Regex;
use std::collections::HashMap;
/// Helper function to perform boolean lambda function on values from two
array accessors, this
@@ -169,7 +169,7 @@ where
let re = if let Some(ref regex) = map.get(pat) {
regex
} else {
- let re_pattern = escape(pat).replace('%', ".*").replace('_', ".");
+ let re_pattern = replace_like_wildcards(pat)?;
let re = op(&re_pattern)?;
map.insert(pat, re);
map.get(pat).unwrap()
@@ -248,7 +248,9 @@ pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
bit_util::set_bit(bool_slice, i);
}
}
- } else if right.ends_with('%') && !right[..right.len() -
1].contains(is_like_pattern)
+ } else if right.ends_with('%')
+ && !right.ends_with("\\%")
+ && !right[..right.len() - 1].contains(is_like_pattern)
{
// fast path, can use starts_with
let starts_with = &right[..right.len() - 1];
@@ -266,7 +268,7 @@ pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
}
}
} else {
- let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
+ let re_pattern = replace_like_wildcards(right)?;
let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
ArrowError::ComputeError(format!(
"Unable to build regex from LIKE pattern: {}",
@@ -296,6 +298,43 @@ pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
Ok(BooleanArray::from(data))
}
+/// Transforms a like `pattern` to a regex compatible pattern. To achieve
that, it does:
+///
+/// 1. Replace like wildcards for regex expressions as the pattern will be
evaluated using regex match: `%` => `.*` and `_` => `.`
+/// 2. Escape regex meta characters to match them and not be evaluated as
regex special chars. For example: `.` => `\\.`
+/// 3. Replace escaped like wildcards removing the escape characters to be
able to match it as a regex. For example: `\\%` => `%`
+fn replace_like_wildcards(pattern: &str) -> Result<String> {
+ let mut result = String::new();
+ let pattern = String::from(pattern);
+ let mut chars_iter = pattern.chars().peekable();
+ while let Some(c) = chars_iter.next() {
+ if c == '\\' {
+ let next = chars_iter.peek();
+ match next {
+ Some(next) if is_like_pattern(*next) => {
+ result.push(*next);
+ // Skipping the next char as it is already appended
+ chars_iter.next();
+ }
+ _ => {
+ result.push('\\');
+ result.push('\\');
+ }
+ }
+ } else if regex_syntax::is_meta_character(c) {
+ result.push('\\');
+ result.push(c);
+ } else if c == '%' {
+ result.push_str(".*");
+ } else if c == '_' {
+ result.push('.');
+ } else {
+ result.push(c);
+ }
+ }
+ Ok(result)
+}
+
/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`].
///
@@ -330,7 +369,9 @@ pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
for i in 0..left.len() {
result.append(left.value(i) != right);
}
- } else if right.ends_with('%') && !right[..right.len() -
1].contains(is_like_pattern)
+ } else if right.ends_with('%')
+ && !right.ends_with("\\%")
+ && !right[..right.len() - 1].contains(is_like_pattern)
{
// fast path, can use ends_with
for i in 0..left.len() {
@@ -342,7 +383,7 @@ pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
result.append(!left.value(i).ends_with(&right[1..]));
}
} else {
- let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
+ let re_pattern = replace_like_wildcards(right)?;
let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
ArrowError::ComputeError(format!(
"Unable to build regex from LIKE pattern: {}",
@@ -403,7 +444,9 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
for i in 0..left.len() {
result.append(left.value(i) == right);
}
- } else if right.ends_with('%') && !right[..right.len() -
1].contains(is_like_pattern)
+ } else if right.ends_with('%')
+ && !right.ends_with("\\%")
+ && !right[..right.len() - 1].contains(is_like_pattern)
{
// fast path, can use ends_with
for i in 0..left.len() {
@@ -423,7 +466,7 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
);
}
} else {
- let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
+ let re_pattern = replace_like_wildcards(right)?;
let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
ArrowError::ComputeError(format!(
"Unable to build regex from ILIKE pattern: {}",
@@ -484,7 +527,9 @@ pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
for i in 0..left.len() {
result.append(left.value(i) != right);
}
- } else if right.ends_with('%') && !right[..right.len() -
1].contains(is_like_pattern)
+ } else if right.ends_with('%')
+ && !right.ends_with("\\%")
+ && !right[..right.len() - 1].contains(is_like_pattern)
{
// fast path, can use ends_with
for i in 0..left.len() {
@@ -506,7 +551,7 @@ pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
);
}
} else {
- let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
+ let re_pattern = replace_like_wildcards(right)?;
let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
ArrowError::ComputeError(format!(
"Unable to build regex from ILIKE pattern: {}",
@@ -3740,6 +3785,50 @@ mod tests {
vec![false, true, false, false]
);
+ test_utf8_scalar!(
+ test_utf8_scalar_like_escape,
+ vec!["a%", "a\\x"],
+ "a\\%",
+ like_utf8_scalar,
+ vec![true, false]
+ );
+
+ test_utf8!(
+ test_utf8_scalar_ilike_regex,
+ vec!["%%%"],
+ vec![r#"\%_\%"#],
+ ilike_utf8,
+ vec![true]
+ );
+
+ #[test]
+ fn test_replace_like_wildcards() {
+ let a_eq = "_%";
+ let expected = "..*";
+ assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
+ }
+
+ #[test]
+ fn test_replace_like_wildcards_leave_like_meta_chars() {
+ let a_eq = "\\%\\_";
+ let expected = "%_";
+ assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
+ }
+
+ #[test]
+ fn test_replace_like_wildcards_with_multiple_escape_chars() {
+ let a_eq = "\\\\%";
+ let expected = "\\\\%";
+ assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
+ }
+
+ #[test]
+ fn test_replace_like_wildcards_escape_regex_meta_char() {
+ let a_eq = ".";
+ let expected = "\\.";
+ assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
+ }
+
test_utf8!(
test_utf8_array_eq,
vec!["arrow", "arrow", "arrow", "arrow"],