This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 5ad621fd97 Fix LIKE with escapes (#6703)
5ad621fd97 is described below
commit 5ad621fd97032796f3da2d02948e65513de8c891
Author: Piotr Findeisen <[email protected]>
AuthorDate: Sat Nov 9 08:43:46 2024 +0100
Fix LIKE with escapes (#6703)
* Fix LIKE with escapes
Fix LIKE processing for patterns containing escapes
- the starts_with / ends_with optimization did not correctly check for
escapes when checking whether the rest of the pattern is literal
- the pattern-to-regexp compiler incorrectly processed \ followed by a
character other than % or _. In PostgreSQL, the pattern '\x' matches a
single 'x'.
There are two tests:
- like_escape_many was generated using PostgreSQL with the code attached
below for verification
- like_escape consists of hand-picked test cases that are more interesting.
The lower cardinality of the hand-picked test cases allows for exercising
all scalar/array vs scalar/array combinations.
The script below isn't the simplest possible, because an attempt was made to
generate more test cases by adding padding; hence e.g.
is_like_without_dangling_escape. Since it is attached for reference, it
should be attached as-is.
```python
import psycopg2
data = r"""
\
\\
\\\
\\\\
a
\a
\\a
%
\%
\\%
%%
\%%
\\%%
_
\_
\\_
__
\__
\\__
abc
a_c
a\bc
a\_c
%abc
\%abc
a\\_c%
""".split('\n')
data = list(dict.fromkeys(data))
conn = psycopg2.connect(host='localhost', port=5432, user='postgres',
password='mysecretpassword')
conn.set_session(autocommit=True)
cursor = conn.cursor()
for r in data:
try:
# PostgreSQL verifies dangling escape only sometimes
cursor.execute(f"SELECT %s LIKE %s", (r, r))
is_like, = cursor.fetchone()
has_dandling_escape = False
pg_pattern = r
except Exception as e:
if 'LIKE pattern must not end with escape character' not in str(e):
raise e
has_dandling_escape = True
pg_pattern = r + '\\'
for l in data:
# print()
# print(' '.join(str(v) for v in (l, r, has_dandling_escape, postgres_pattern)))
cursor.execute(f"SELECT %s LIKE %s", (l, pg_pattern))
is_like, = cursor.fetchone()
assert type(is_like) is bool
if not is_like and has_dandling_escape:
pattern_without_escaped_dandling_escape = pg_pattern[:-2]
cursor.execute(f"SELECT %s LIKE %s", (l,
pattern_without_escaped_dandling_escape))
is_like_without_dangling_escape, = cursor.fetchone()
assert type(is_like_without_dangling_escape) is bool
else:
is_like_without_dangling_escape = False
assert '"' not in l
assert '"' not in r
print('(r"%s", r"%s", %s),' % (
l, r,
str(is_like).lower(),
# str(has_dandling_escape).lower(),
# str(is_like_without_dangling_escape).lower(),
))
```
* Compact tests for regex_like
Reduce test code boilerplate and make it easier to see what are the test
cases.
* Add more test cases for regex_like
---
arrow-string/src/like.rs | 1058 +++++++++++++++++++++++++++++++++++++++++
arrow-string/src/predicate.rs | 95 ++--
2 files changed, 1094 insertions(+), 59 deletions(-)
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index 1b04b4eb56..0a5aa77dbb 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -429,6 +429,7 @@ legacy_kernels!(
mod tests {
use super::*;
use arrow_array::types::Int8Type;
+ use std::iter::zip;
/// Applying `op(left, right)`, both sides are arrays
/// The macro tests four types of array implementations:
@@ -1864,4 +1865,1061 @@ mod tests {
assert!(r.is_null(0));
}
}
+
+ #[test]
+ fn like_escape() {
+ // (value, pattern, expected)
+ let test_cases = vec![
+ // Empty pattern
+ (r"", r"", true),
+ (r"\", r"", false),
+ // Sole (dangling) escape (some engines consider this invalid
pattern)
+ (r"", r"\", false),
+ (r"\", r"\", true),
+ (r"\\", r"\", false),
+ (r"a", r"\", false),
+ (r"\a", r"\", false),
+ (r"\\a", r"\", false),
+ // Sole escape
+ (r"", r"\\", false),
+ (r"\", r"\\", true),
+ (r"\\", r"\\", false),
+ (r"a", r"\\", false),
+ (r"\a", r"\\", false),
+ (r"\\a", r"\\", false),
+ // Sole escape and dangling escape
+ (r"", r"\\\", false),
+ (r"\", r"\\\", false),
+ (r"\\", r"\\\", true),
+ (r"\\\", r"\\\", false),
+ (r"\\\\", r"\\\", false),
+ (r"a", r"\\\", false),
+ (r"\a", r"\\\", false),
+ (r"\\a", r"\\\", false),
+ // Sole two escapes
+ (r"", r"\\\\", false),
+ (r"\", r"\\\\", false),
+ (r"\\", r"\\\\", true),
+ (r"\\\", r"\\\\", false),
+ (r"\\\\", r"\\\\", false),
+ (r"\\\\\", r"\\\\", false),
+ (r"a", r"\\\\", false),
+ (r"\a", r"\\\\", false),
+ (r"\\a", r"\\\\", false),
+ // Escaped non-wildcard
+ (r"", r"\a", false),
+ (r"\", r"\a", false),
+ (r"\\", r"\a", false),
+ (r"a", r"\a", true),
+ (r"\a", r"\a", false),
+ (r"\\a", r"\a", false),
+ // Escaped _ wildcard
+ (r"", r"\_", false),
+ (r"\", r"\_", false),
+ (r"\\", r"\_", false),
+ (r"a", r"\_", false),
+ (r"_", r"\_", true),
+ (r"%", r"\_", false),
+ (r"\a", r"\_", false),
+ (r"\\a", r"\_", false),
+ (r"\_", r"\_", false),
+ (r"\\_", r"\_", false),
+ // Escaped % wildcard
+ (r"", r"\%", false),
+ (r"\", r"\%", false),
+ (r"\\", r"\%", false),
+ (r"a", r"\%", false),
+ (r"_", r"\%", false),
+ (r"%", r"\%", true),
+ (r"\a", r"\%", false),
+ (r"\\a", r"\%", false),
+ (r"\%", r"\%", false),
+ (r"\\%", r"\%", false),
+ // Escape and non-wildcard
+ (r"", r"\\a", false),
+ (r"\", r"\\a", false),
+ (r"\\", r"\\a", false),
+ (r"a", r"\\a", false),
+ (r"\a", r"\\a", true),
+ (r"\\a", r"\\a", false),
+ (r"\\\a", r"\\a", false),
+ // Escape and _ wildcard
+ (r"", r"\\_", false),
+ (r"\", r"\\_", false),
+ (r"\\", r"\\_", true),
+ (r"a", r"\\_", false),
+ (r"_", r"\\_", false),
+ (r"%", r"\\_", false),
+ (r"\a", r"\\_", true),
+ (r"\\a", r"\\_", false),
+ (r"\_", r"\\_", true),
+ (r"\\_", r"\\_", false),
+ (r"\\\_", r"\\_", false),
+ // Escape and % wildcard
+ (r"", r"\\%", false),
+ (r"\", r"\\%", true),
+ (r"\\", r"\\%", true),
+ (r"a", r"\\%", false),
+ (r"ab", r"\\%", false),
+ (r"a%", r"\\%", false),
+ (r"_", r"\\%", false),
+ (r"%", r"\\%", false),
+ (r"\a", r"\\%", true),
+ (r"\\a", r"\\%", true),
+ (r"\%", r"\\%", true),
+ (r"\\%", r"\\%", true),
+ (r"\\\%", r"\\%", true),
+ // %... pattern with dangling wildcard
+ (r"\", r"%\", true),
+ (r"\\", r"%\", true),
+ (r"%\", r"%\", true),
+ (r"%\\", r"%\", true),
+ (r"abc\", r"%\", true),
+ (r"abc", r"%\", false),
+ // %... pattern with wildcard
+ (r"\", r"%\\", true),
+ (r"\\", r"%\\", true),
+ (r"%\\", r"%\\", true),
+ (r"%\\\", r"%\\", true),
+ (r"abc\", r"%\\", true),
+ (r"abc", r"%\\", false),
+ // %... pattern including escaped non-wildcard
+ (r"ac", r"%a\c", true),
+ (r"xyzac", r"%a\c", true),
+ (r"abc", r"%a\c", false),
+ (r"a\c", r"%a\c", false),
+ (r"%a\c", r"%a\c", false),
+ // %... pattern including escape
+ (r"\", r"%a\\c", false),
+ (r"\\", r"%a\\c", false),
+ (r"ac", r"%a\\c", false),
+ (r"a\c", r"%a\\c", true),
+ (r"a\\c", r"%a\\c", false),
+ (r"abc", r"%a\\c", false),
+ (r"xyza\c", r"%a\\c", true),
+ (r"xyza\\c", r"%a\\c", false),
+ (r"%a\\c", r"%a\\c", false),
+ // ...% pattern with wildcard
+ (r"\", r"\\%", true),
+ (r"\\", r"\\%", true),
+ (r"\\%", r"\\%", true),
+ (r"\\\%", r"\\%", true),
+ (r"\abc", r"\\%", true),
+ (r"a", r"\\%", false),
+ (r"abc", r"\\%", false),
+ // ...% pattern including escaped non-wildcard
+ (r"ac", r"a\c%", true),
+ (r"acxyz", r"a\c%", true),
+ (r"abc", r"a\c%", false),
+ (r"a\c", r"a\c%", false),
+ (r"a\c%", r"a\c%", false),
+ (r"a\\c%", r"a\c%", false),
+ // ...% pattern including escape
+ (r"ac", r"a\\c%", false),
+ (r"a\c", r"a\\c%", true),
+ (r"a\cxyz", r"a\\c%", true),
+ (r"a\\c", r"a\\c%", false),
+ (r"a\\cxyz", r"a\\c%", false),
+ (r"abc", r"a\\c%", false),
+ (r"abcxyz", r"a\\c%", false),
+ (r"a\\c%", r"a\\c%", false),
+ // %...% pattern including escaped non-wildcard
+ (r"ac", r"%a\c%", true),
+ (r"xyzacxyz", r"%a\c%", true),
+ (r"abc", r"%a\c%", false),
+ (r"a\c", r"%a\c%", false),
+ (r"xyza\cxyz", r"%a\c%", false),
+ (r"%a\c%", r"%a\c%", false),
+ (r"%a\\c%", r"%a\c%", false),
+ // %...% pattern including escape
+ (r"ac", r"%a\\c%", false),
+ (r"a\c", r"%a\\c%", true),
+ (r"xyza\cxyz", r"%a\\c%", true),
+ (r"a\\c", r"%a\\c%", false),
+ (r"xyza\\cxyz", r"%a\\c%", false),
+ (r"abc", r"%a\\c%", false),
+ (r"xyzabcxyz", r"%a\\c%", false),
+ (r"%a\\c%", r"%a\\c%", false),
+ // Odd (7) backslashes and % wildcard
+ (r"\\%", r"\\\\\\\%", false),
+ (r"\\\", r"\\\\\\\%", false),
+ (r"\\\%", r"\\\\\\\%", true),
+ (r"\\\\", r"\\\\\\\%", false),
+ (r"\\\\%", r"\\\\\\\%", false),
+ (r"\\\\\\\%", r"\\\\\\\%", false),
+ // Odd (7) backslashes and _ wildcard
+ (r"\\\", r"\\\\\\\_", false),
+ (r"\\\\", r"\\\\\\\_", false),
+ (r"\\\_", r"\\\\\\\_", true),
+ (r"\\\\", r"\\\\\\\_", false),
+ (r"\\\a", r"\\\\\\\_", false),
+ (r"\\\\_", r"\\\\\\\_", false),
+ (r"\\\\\\\_", r"\\\\\\\_", false),
+ // Even (8) backslashes and % wildcard
+ (r"\\\", r"\\\\\\\\%", false),
+ (r"\\\\", r"\\\\\\\\%", true),
+ (r"\\\\\", r"\\\\\\\\%", true),
+ (r"\\\\xyz", r"\\\\\\\\%", true),
+ (r"\\\\\\\\%", r"\\\\\\\\%", true),
+ // Even (8) backslashes and _ wildcard
+ (r"\\\", r"\\\\\\\\_", false),
+ (r"\\\\", r"\\\\\\\\_", false),
+ (r"\\\\\", r"\\\\\\\\_", true),
+ (r"\\\\a", r"\\\\\\\\_", true),
+ (r"\\\\\a", r"\\\\\\\\_", false),
+ (r"\\\\ab", r"\\\\\\\\_", false),
+ (r"\\\\\\\\_", r"\\\\\\\\_", false),
+ ];
+
+ for (value, pattern, expected) in test_cases {
+ let unexpected = BooleanArray::from(vec![!expected]);
+ let expected = BooleanArray::from(vec![expected]);
+
+ for string_type in [DataType::Utf8, DataType::LargeUtf8,
DataType::Utf8View] {
+ for ((value_datum, value_type), (pattern_datum, pattern_type))
in zip(
+ make_datums(value, &string_type),
+ make_datums(pattern, &string_type),
+ ) {
+ let value_datum = value_datum.as_ref();
+ let pattern_datum = pattern_datum.as_ref();
+ assert_eq!(
+ like(value_datum, pattern_datum).unwrap(),
+ expected,
+ "{value_type:?} «{value}» like {pattern_type:?}
«{pattern}»"
+ );
+ assert_eq!(
+ ilike(value_datum, pattern_datum).unwrap(),
+ expected,
+ "{value_type:?} «{value}» ilike {pattern_type:?}
«{pattern}»"
+ );
+ assert_eq!(
+ nlike(value_datum, pattern_datum).unwrap(),
+ unexpected,
+ "{value_type:?} «{value}» nlike {pattern_type:?}
«{pattern}»"
+ );
+ assert_eq!(
+ nilike(value_datum, pattern_datum).unwrap(),
+ unexpected,
+ "{value_type:?} «{value}» nilike {pattern_type:?}
«{pattern}»"
+ );
+ }
+ }
+ }
+ }
+
+ #[test]
+ fn like_escape_many() {
+ // (value, pattern, expected)
+ let test_cases = vec![
+ (r"", r"", true),
+ (r"\", r"", false),
+ (r"\\", r"", false),
+ (r"\\\", r"", false),
+ (r"\\\\", r"", false),
+ (r"a", r"", false),
+ (r"\a", r"", false),
+ (r"\\a", r"", false),
+ (r"%", r"", false),
+ (r"\%", r"", false),
+ (r"\\%", r"", false),
+ (r"%%", r"", false),
+ (r"\%%", r"", false),
+ (r"\\%%", r"", false),
+ (r"_", r"", false),
+ (r"\_", r"", false),
+ (r"\\_", r"", false),
+ (r"__", r"", false),
+ (r"\__", r"", false),
+ (r"\\__", r"", false),
+ (r"abc", r"", false),
+ (r"a_c", r"", false),
+ (r"a\bc", r"", false),
+ (r"a\_c", r"", false),
+ (r"%abc", r"", false),
+ (r"\%abc", r"", false),
+ (r"a\\_c%", r"", false),
+ (r"", r"\", false),
+ (r"\", r"\", true),
+ (r"\\", r"\", false),
+ (r"\\\", r"\", false),
+ (r"\\\\", r"\", false),
+ (r"a", r"\", false),
+ (r"\a", r"\", false),
+ (r"\\a", r"\", false),
+ (r"%", r"\", false),
+ (r"\%", r"\", false),
+ (r"\\%", r"\", false),
+ (r"%%", r"\", false),
+ (r"\%%", r"\", false),
+ (r"\\%%", r"\", false),
+ (r"_", r"\", false),
+ (r"\_", r"\", false),
+ (r"\\_", r"\", false),
+ (r"__", r"\", false),
+ (r"\__", r"\", false),
+ (r"\\__", r"\", false),
+ (r"abc", r"\", false),
+ (r"a_c", r"\", false),
+ (r"a\bc", r"\", false),
+ (r"a\_c", r"\", false),
+ (r"%abc", r"\", false),
+ (r"\%abc", r"\", false),
+ (r"a\\_c%", r"\", false),
+ (r"", r"\\", false),
+ (r"\", r"\\", true),
+ (r"\\", r"\\", false),
+ (r"\\\", r"\\", false),
+ (r"\\\\", r"\\", false),
+ (r"a", r"\\", false),
+ (r"\a", r"\\", false),
+ (r"\\a", r"\\", false),
+ (r"%", r"\\", false),
+ (r"\%", r"\\", false),
+ (r"\\%", r"\\", false),
+ (r"%%", r"\\", false),
+ (r"\%%", r"\\", false),
+ (r"\\%%", r"\\", false),
+ (r"_", r"\\", false),
+ (r"\_", r"\\", false),
+ (r"\\_", r"\\", false),
+ (r"__", r"\\", false),
+ (r"\__", r"\\", false),
+ (r"\\__", r"\\", false),
+ (r"abc", r"\\", false),
+ (r"a_c", r"\\", false),
+ (r"a\bc", r"\\", false),
+ (r"a\_c", r"\\", false),
+ (r"%abc", r"\\", false),
+ (r"\%abc", r"\\", false),
+ (r"a\\_c%", r"\\", false),
+ (r"", r"\\\", false),
+ (r"\", r"\\\", false),
+ (r"\\", r"\\\", true),
+ (r"\\\", r"\\\", false),
+ (r"\\\\", r"\\\", false),
+ (r"a", r"\\\", false),
+ (r"\a", r"\\\", false),
+ (r"\\a", r"\\\", false),
+ (r"%", r"\\\", false),
+ (r"\%", r"\\\", false),
+ (r"\\%", r"\\\", false),
+ (r"%%", r"\\\", false),
+ (r"\%%", r"\\\", false),
+ (r"\\%%", r"\\\", false),
+ (r"_", r"\\\", false),
+ (r"\_", r"\\\", false),
+ (r"\\_", r"\\\", false),
+ (r"__", r"\\\", false),
+ (r"\__", r"\\\", false),
+ (r"\\__", r"\\\", false),
+ (r"abc", r"\\\", false),
+ (r"a_c", r"\\\", false),
+ (r"a\bc", r"\\\", false),
+ (r"a\_c", r"\\\", false),
+ (r"%abc", r"\\\", false),
+ (r"\%abc", r"\\\", false),
+ (r"a\\_c%", r"\\\", false),
+ (r"", r"\\\\", false),
+ (r"\", r"\\\\", false),
+ (r"\\", r"\\\\", true),
+ (r"\\\", r"\\\\", false),
+ (r"\\\\", r"\\\\", false),
+ (r"a", r"\\\\", false),
+ (r"\a", r"\\\\", false),
+ (r"\\a", r"\\\\", false),
+ (r"%", r"\\\\", false),
+ (r"\%", r"\\\\", false),
+ (r"\\%", r"\\\\", false),
+ (r"%%", r"\\\\", false),
+ (r"\%%", r"\\\\", false),
+ (r"\\%%", r"\\\\", false),
+ (r"_", r"\\\\", false),
+ (r"\_", r"\\\\", false),
+ (r"\\_", r"\\\\", false),
+ (r"__", r"\\\\", false),
+ (r"\__", r"\\\\", false),
+ (r"\\__", r"\\\\", false),
+ (r"abc", r"\\\\", false),
+ (r"a_c", r"\\\\", false),
+ (r"a\bc", r"\\\\", false),
+ (r"a\_c", r"\\\\", false),
+ (r"%abc", r"\\\\", false),
+ (r"\%abc", r"\\\\", false),
+ (r"a\\_c%", r"\\\\", false),
+ (r"", r"a", false),
+ (r"\", r"a", false),
+ (r"\\", r"a", false),
+ (r"\\\", r"a", false),
+ (r"\\\\", r"a", false),
+ (r"a", r"a", true),
+ (r"\a", r"a", false),
+ (r"\\a", r"a", false),
+ (r"%", r"a", false),
+ (r"\%", r"a", false),
+ (r"\\%", r"a", false),
+ (r"%%", r"a", false),
+ (r"\%%", r"a", false),
+ (r"\\%%", r"a", false),
+ (r"_", r"a", false),
+ (r"\_", r"a", false),
+ (r"\\_", r"a", false),
+ (r"__", r"a", false),
+ (r"\__", r"a", false),
+ (r"\\__", r"a", false),
+ (r"abc", r"a", false),
+ (r"a_c", r"a", false),
+ (r"a\bc", r"a", false),
+ (r"a\_c", r"a", false),
+ (r"%abc", r"a", false),
+ (r"\%abc", r"a", false),
+ (r"a\\_c%", r"a", false),
+ (r"", r"\a", false),
+ (r"\", r"\a", false),
+ (r"\\", r"\a", false),
+ (r"\\\", r"\a", false),
+ (r"\\\\", r"\a", false),
+ (r"a", r"\a", true),
+ (r"\a", r"\a", false),
+ (r"\\a", r"\a", false),
+ (r"%", r"\a", false),
+ (r"\%", r"\a", false),
+ (r"\\%", r"\a", false),
+ (r"%%", r"\a", false),
+ (r"\%%", r"\a", false),
+ (r"\\%%", r"\a", false),
+ (r"_", r"\a", false),
+ (r"\_", r"\a", false),
+ (r"\\_", r"\a", false),
+ (r"__", r"\a", false),
+ (r"\__", r"\a", false),
+ (r"\\__", r"\a", false),
+ (r"abc", r"\a", false),
+ (r"a_c", r"\a", false),
+ (r"a\bc", r"\a", false),
+ (r"a\_c", r"\a", false),
+ (r"%abc", r"\a", false),
+ (r"\%abc", r"\a", false),
+ (r"a\\_c%", r"\a", false),
+ (r"", r"\\a", false),
+ (r"\", r"\\a", false),
+ (r"\\", r"\\a", false),
+ (r"\\\", r"\\a", false),
+ (r"\\\\", r"\\a", false),
+ (r"a", r"\\a", false),
+ (r"\a", r"\\a", true),
+ (r"\\a", r"\\a", false),
+ (r"%", r"\\a", false),
+ (r"\%", r"\\a", false),
+ (r"\\%", r"\\a", false),
+ (r"%%", r"\\a", false),
+ (r"\%%", r"\\a", false),
+ (r"\\%%", r"\\a", false),
+ (r"_", r"\\a", false),
+ (r"\_", r"\\a", false),
+ (r"\\_", r"\\a", false),
+ (r"__", r"\\a", false),
+ (r"\__", r"\\a", false),
+ (r"\\__", r"\\a", false),
+ (r"abc", r"\\a", false),
+ (r"a_c", r"\\a", false),
+ (r"a\bc", r"\\a", false),
+ (r"a\_c", r"\\a", false),
+ (r"%abc", r"\\a", false),
+ (r"\%abc", r"\\a", false),
+ (r"a\\_c%", r"\\a", false),
+ (r"", r"%", true),
+ (r"\", r"%", true),
+ (r"\\", r"%", true),
+ (r"\\\", r"%", true),
+ (r"\\\\", r"%", true),
+ (r"a", r"%", true),
+ (r"\a", r"%", true),
+ (r"\\a", r"%", true),
+ (r"%", r"%", true),
+ (r"\%", r"%", true),
+ (r"\\%", r"%", true),
+ (r"%%", r"%", true),
+ (r"\%%", r"%", true),
+ (r"\\%%", r"%", true),
+ (r"_", r"%", true),
+ (r"\_", r"%", true),
+ (r"\\_", r"%", true),
+ (r"__", r"%", true),
+ (r"\__", r"%", true),
+ (r"\\__", r"%", true),
+ (r"abc", r"%", true),
+ (r"a_c", r"%", true),
+ (r"a\bc", r"%", true),
+ (r"a\_c", r"%", true),
+ (r"%abc", r"%", true),
+ (r"\%abc", r"%", true),
+ (r"a\\_c%", r"%", true),
+ (r"", r"\%", false),
+ (r"\", r"\%", false),
+ (r"\\", r"\%", false),
+ (r"\\\", r"\%", false),
+ (r"\\\\", r"\%", false),
+ (r"a", r"\%", false),
+ (r"\a", r"\%", false),
+ (r"\\a", r"\%", false),
+ (r"%", r"\%", true),
+ (r"\%", r"\%", false),
+ (r"\\%", r"\%", false),
+ (r"%%", r"\%", false),
+ (r"\%%", r"\%", false),
+ (r"\\%%", r"\%", false),
+ (r"_", r"\%", false),
+ (r"\_", r"\%", false),
+ (r"\\_", r"\%", false),
+ (r"__", r"\%", false),
+ (r"\__", r"\%", false),
+ (r"\\__", r"\%", false),
+ (r"abc", r"\%", false),
+ (r"a_c", r"\%", false),
+ (r"a\bc", r"\%", false),
+ (r"a\_c", r"\%", false),
+ (r"%abc", r"\%", false),
+ (r"\%abc", r"\%", false),
+ (r"a\\_c%", r"\%", false),
+ (r"", r"\\%", false),
+ (r"\", r"\\%", true),
+ (r"\\", r"\\%", true),
+ (r"\\\", r"\\%", true),
+ (r"\\\\", r"\\%", true),
+ (r"a", r"\\%", false),
+ (r"\a", r"\\%", true),
+ (r"\\a", r"\\%", true),
+ (r"%", r"\\%", false),
+ (r"\%", r"\\%", true),
+ (r"\\%", r"\\%", true),
+ (r"%%", r"\\%", false),
+ (r"\%%", r"\\%", true),
+ (r"\\%%", r"\\%", true),
+ (r"_", r"\\%", false),
+ (r"\_", r"\\%", true),
+ (r"\\_", r"\\%", true),
+ (r"__", r"\\%", false),
+ (r"\__", r"\\%", true),
+ (r"\\__", r"\\%", true),
+ (r"abc", r"\\%", false),
+ (r"a_c", r"\\%", false),
+ (r"a\bc", r"\\%", false),
+ (r"a\_c", r"\\%", false),
+ (r"%abc", r"\\%", false),
+ (r"\%abc", r"\\%", true),
+ (r"a\\_c%", r"\\%", false),
+ (r"", r"%%", true),
+ (r"\", r"%%", true),
+ (r"\\", r"%%", true),
+ (r"\\\", r"%%", true),
+ (r"\\\\", r"%%", true),
+ (r"a", r"%%", true),
+ (r"\a", r"%%", true),
+ (r"\\a", r"%%", true),
+ (r"%", r"%%", true),
+ (r"\%", r"%%", true),
+ (r"\\%", r"%%", true),
+ (r"%%", r"%%", true),
+ (r"\%%", r"%%", true),
+ (r"\\%%", r"%%", true),
+ (r"_", r"%%", true),
+ (r"\_", r"%%", true),
+ (r"\\_", r"%%", true),
+ (r"__", r"%%", true),
+ (r"\__", r"%%", true),
+ (r"\\__", r"%%", true),
+ (r"abc", r"%%", true),
+ (r"a_c", r"%%", true),
+ (r"a\bc", r"%%", true),
+ (r"a\_c", r"%%", true),
+ (r"%abc", r"%%", true),
+ (r"\%abc", r"%%", true),
+ (r"a\\_c%", r"%%", true),
+ (r"", r"\%%", false),
+ (r"\", r"\%%", false),
+ (r"\\", r"\%%", false),
+ (r"\\\", r"\%%", false),
+ (r"\\\\", r"\%%", false),
+ (r"a", r"\%%", false),
+ (r"\a", r"\%%", false),
+ (r"\\a", r"\%%", false),
+ (r"%", r"\%%", true),
+ (r"\%", r"\%%", false),
+ (r"\\%", r"\%%", false),
+ (r"%%", r"\%%", true),
+ (r"\%%", r"\%%", false),
+ (r"\\%%", r"\%%", false),
+ (r"_", r"\%%", false),
+ (r"\_", r"\%%", false),
+ (r"\\_", r"\%%", false),
+ (r"__", r"\%%", false),
+ (r"\__", r"\%%", false),
+ (r"\\__", r"\%%", false),
+ (r"abc", r"\%%", false),
+ (r"a_c", r"\%%", false),
+ (r"a\bc", r"\%%", false),
+ (r"a\_c", r"\%%", false),
+ (r"%abc", r"\%%", true),
+ (r"\%abc", r"\%%", false),
+ (r"a\\_c%", r"\%%", false),
+ (r"", r"\\%%", false),
+ (r"\", r"\\%%", true),
+ (r"\\", r"\\%%", true),
+ (r"\\\", r"\\%%", true),
+ (r"\\\\", r"\\%%", true),
+ (r"a", r"\\%%", false),
+ (r"\a", r"\\%%", true),
+ (r"\\a", r"\\%%", true),
+ (r"%", r"\\%%", false),
+ (r"\%", r"\\%%", true),
+ (r"\\%", r"\\%%", true),
+ (r"%%", r"\\%%", false),
+ (r"\%%", r"\\%%", true),
+ (r"\\%%", r"\\%%", true),
+ (r"_", r"\\%%", false),
+ (r"\_", r"\\%%", true),
+ (r"\\_", r"\\%%", true),
+ (r"__", r"\\%%", false),
+ (r"\__", r"\\%%", true),
+ (r"\\__", r"\\%%", true),
+ (r"abc", r"\\%%", false),
+ (r"a_c", r"\\%%", false),
+ (r"a\bc", r"\\%%", false),
+ (r"a\_c", r"\\%%", false),
+ (r"%abc", r"\\%%", false),
+ (r"\%abc", r"\\%%", true),
+ (r"a\\_c%", r"\\%%", false),
+ (r"", r"_", false),
+ (r"\", r"_", true),
+ (r"\\", r"_", false),
+ (r"\\\", r"_", false),
+ (r"\\\\", r"_", false),
+ (r"a", r"_", true),
+ (r"\a", r"_", false),
+ (r"\\a", r"_", false),
+ (r"%", r"_", true),
+ (r"\%", r"_", false),
+ (r"\\%", r"_", false),
+ (r"%%", r"_", false),
+ (r"\%%", r"_", false),
+ (r"\\%%", r"_", false),
+ (r"_", r"_", true),
+ (r"\_", r"_", false),
+ (r"\\_", r"_", false),
+ (r"__", r"_", false),
+ (r"\__", r"_", false),
+ (r"\\__", r"_", false),
+ (r"abc", r"_", false),
+ (r"a_c", r"_", false),
+ (r"a\bc", r"_", false),
+ (r"a\_c", r"_", false),
+ (r"%abc", r"_", false),
+ (r"\%abc", r"_", false),
+ (r"a\\_c%", r"_", false),
+ (r"", r"\_", false),
+ (r"\", r"\_", false),
+ (r"\\", r"\_", false),
+ (r"\\\", r"\_", false),
+ (r"\\\\", r"\_", false),
+ (r"a", r"\_", false),
+ (r"\a", r"\_", false),
+ (r"\\a", r"\_", false),
+ (r"%", r"\_", false),
+ (r"\%", r"\_", false),
+ (r"\\%", r"\_", false),
+ (r"%%", r"\_", false),
+ (r"\%%", r"\_", false),
+ (r"\\%%", r"\_", false),
+ (r"_", r"\_", true),
+ (r"\_", r"\_", false),
+ (r"\\_", r"\_", false),
+ (r"__", r"\_", false),
+ (r"\__", r"\_", false),
+ (r"\\__", r"\_", false),
+ (r"abc", r"\_", false),
+ (r"a_c", r"\_", false),
+ (r"a\bc", r"\_", false),
+ (r"a\_c", r"\_", false),
+ (r"%abc", r"\_", false),
+ (r"\%abc", r"\_", false),
+ (r"a\\_c%", r"\_", false),
+ (r"", r"\\_", false),
+ (r"\", r"\\_", false),
+ (r"\\", r"\\_", true),
+ (r"\\\", r"\\_", false),
+ (r"\\\\", r"\\_", false),
+ (r"a", r"\\_", false),
+ (r"\a", r"\\_", true),
+ (r"\\a", r"\\_", false),
+ (r"%", r"\\_", false),
+ (r"\%", r"\\_", true),
+ (r"\\%", r"\\_", false),
+ (r"%%", r"\\_", false),
+ (r"\%%", r"\\_", false),
+ (r"\\%%", r"\\_", false),
+ (r"_", r"\\_", false),
+ (r"\_", r"\\_", true),
+ (r"\\_", r"\\_", false),
+ (r"__", r"\\_", false),
+ (r"\__", r"\\_", false),
+ (r"\\__", r"\\_", false),
+ (r"abc", r"\\_", false),
+ (r"a_c", r"\\_", false),
+ (r"a\bc", r"\\_", false),
+ (r"a\_c", r"\\_", false),
+ (r"%abc", r"\\_", false),
+ (r"\%abc", r"\\_", false),
+ (r"a\\_c%", r"\\_", false),
+ (r"", r"__", false),
+ (r"\", r"__", false),
+ (r"\\", r"__", true),
+ (r"\\\", r"__", false),
+ (r"\\\\", r"__", false),
+ (r"a", r"__", false),
+ (r"\a", r"__", true),
+ (r"\\a", r"__", false),
+ (r"%", r"__", false),
+ (r"\%", r"__", true),
+ (r"\\%", r"__", false),
+ (r"%%", r"__", true),
+ (r"\%%", r"__", false),
+ (r"\\%%", r"__", false),
+ (r"_", r"__", false),
+ (r"\_", r"__", true),
+ (r"\\_", r"__", false),
+ (r"__", r"__", true),
+ (r"\__", r"__", false),
+ (r"\\__", r"__", false),
+ (r"abc", r"__", false),
+ (r"a_c", r"__", false),
+ (r"a\bc", r"__", false),
+ (r"a\_c", r"__", false),
+ (r"%abc", r"__", false),
+ (r"\%abc", r"__", false),
+ (r"a\\_c%", r"__", false),
+ (r"", r"\__", false),
+ (r"\", r"\__", false),
+ (r"\\", r"\__", false),
+ (r"\\\", r"\__", false),
+ (r"\\\\", r"\__", false),
+ (r"a", r"\__", false),
+ (r"\a", r"\__", false),
+ (r"\\a", r"\__", false),
+ (r"%", r"\__", false),
+ (r"\%", r"\__", false),
+ (r"\\%", r"\__", false),
+ (r"%%", r"\__", false),
+ (r"\%%", r"\__", false),
+ (r"\\%%", r"\__", false),
+ (r"_", r"\__", false),
+ (r"\_", r"\__", false),
+ (r"\\_", r"\__", false),
+ (r"__", r"\__", true),
+ (r"\__", r"\__", false),
+ (r"\\__", r"\__", false),
+ (r"abc", r"\__", false),
+ (r"a_c", r"\__", false),
+ (r"a\bc", r"\__", false),
+ (r"a\_c", r"\__", false),
+ (r"%abc", r"\__", false),
+ (r"\%abc", r"\__", false),
+ (r"a\\_c%", r"\__", false),
+ (r"", r"\\__", false),
+ (r"\", r"\\__", false),
+ (r"\\", r"\\__", false),
+ (r"\\\", r"\\__", true),
+ (r"\\\\", r"\\__", false),
+ (r"a", r"\\__", false),
+ (r"\a", r"\\__", false),
+ (r"\\a", r"\\__", true),
+ (r"%", r"\\__", false),
+ (r"\%", r"\\__", false),
+ (r"\\%", r"\\__", true),
+ (r"%%", r"\\__", false),
+ (r"\%%", r"\\__", true),
+ (r"\\%%", r"\\__", false),
+ (r"_", r"\\__", false),
+ (r"\_", r"\\__", false),
+ (r"\\_", r"\\__", true),
+ (r"__", r"\\__", false),
+ (r"\__", r"\\__", true),
+ (r"\\__", r"\\__", false),
+ (r"abc", r"\\__", false),
+ (r"a_c", r"\\__", false),
+ (r"a\bc", r"\\__", false),
+ (r"a\_c", r"\\__", false),
+ (r"%abc", r"\\__", false),
+ (r"\%abc", r"\\__", false),
+ (r"a\\_c%", r"\\__", false),
+ (r"", r"abc", false),
+ (r"\", r"abc", false),
+ (r"\\", r"abc", false),
+ (r"\\\", r"abc", false),
+ (r"\\\\", r"abc", false),
+ (r"a", r"abc", false),
+ (r"\a", r"abc", false),
+ (r"\\a", r"abc", false),
+ (r"%", r"abc", false),
+ (r"\%", r"abc", false),
+ (r"\\%", r"abc", false),
+ (r"%%", r"abc", false),
+ (r"\%%", r"abc", false),
+ (r"\\%%", r"abc", false),
+ (r"_", r"abc", false),
+ (r"\_", r"abc", false),
+ (r"\\_", r"abc", false),
+ (r"__", r"abc", false),
+ (r"\__", r"abc", false),
+ (r"\\__", r"abc", false),
+ (r"abc", r"abc", true),
+ (r"a_c", r"abc", false),
+ (r"a\bc", r"abc", false),
+ (r"a\_c", r"abc", false),
+ (r"%abc", r"abc", false),
+ (r"\%abc", r"abc", false),
+ (r"a\\_c%", r"abc", false),
+ (r"", r"a_c", false),
+ (r"\", r"a_c", false),
+ (r"\\", r"a_c", false),
+ (r"\\\", r"a_c", false),
+ (r"\\\\", r"a_c", false),
+ (r"a", r"a_c", false),
+ (r"\a", r"a_c", false),
+ (r"\\a", r"a_c", false),
+ (r"%", r"a_c", false),
+ (r"\%", r"a_c", false),
+ (r"\\%", r"a_c", false),
+ (r"%%", r"a_c", false),
+ (r"\%%", r"a_c", false),
+ (r"\\%%", r"a_c", false),
+ (r"_", r"a_c", false),
+ (r"\_", r"a_c", false),
+ (r"\\_", r"a_c", false),
+ (r"__", r"a_c", false),
+ (r"\__", r"a_c", false),
+ (r"\\__", r"a_c", false),
+ (r"abc", r"a_c", true),
+ (r"a_c", r"a_c", true),
+ (r"a\bc", r"a_c", false),
+ (r"a\_c", r"a_c", false),
+ (r"%abc", r"a_c", false),
+ (r"\%abc", r"a_c", false),
+ (r"a\\_c%", r"a_c", false),
+ (r"", r"a\bc", false),
+ (r"\", r"a\bc", false),
+ (r"\\", r"a\bc", false),
+ (r"\\\", r"a\bc", false),
+ (r"\\\\", r"a\bc", false),
+ (r"a", r"a\bc", false),
+ (r"\a", r"a\bc", false),
+ (r"\\a", r"a\bc", false),
+ (r"%", r"a\bc", false),
+ (r"\%", r"a\bc", false),
+ (r"\\%", r"a\bc", false),
+ (r"%%", r"a\bc", false),
+ (r"\%%", r"a\bc", false),
+ (r"\\%%", r"a\bc", false),
+ (r"_", r"a\bc", false),
+ (r"\_", r"a\bc", false),
+ (r"\\_", r"a\bc", false),
+ (r"__", r"a\bc", false),
+ (r"\__", r"a\bc", false),
+ (r"\\__", r"a\bc", false),
+ (r"abc", r"a\bc", true),
+ (r"a_c", r"a\bc", false),
+ (r"a\bc", r"a\bc", false),
+ (r"a\_c", r"a\bc", false),
+ (r"%abc", r"a\bc", false),
+ (r"\%abc", r"a\bc", false),
+ (r"a\\_c%", r"a\bc", false),
+ (r"", r"a\_c", false),
+ (r"\", r"a\_c", false),
+ (r"\\", r"a\_c", false),
+ (r"\\\", r"a\_c", false),
+ (r"\\\\", r"a\_c", false),
+ (r"a", r"a\_c", false),
+ (r"\a", r"a\_c", false),
+ (r"\\a", r"a\_c", false),
+ (r"%", r"a\_c", false),
+ (r"\%", r"a\_c", false),
+ (r"\\%", r"a\_c", false),
+ (r"%%", r"a\_c", false),
+ (r"\%%", r"a\_c", false),
+ (r"\\%%", r"a\_c", false),
+ (r"_", r"a\_c", false),
+ (r"\_", r"a\_c", false),
+ (r"\\_", r"a\_c", false),
+ (r"__", r"a\_c", false),
+ (r"\__", r"a\_c", false),
+ (r"\\__", r"a\_c", false),
+ (r"abc", r"a\_c", false),
+ (r"a_c", r"a\_c", true),
+ (r"a\bc", r"a\_c", false),
+ (r"a\_c", r"a\_c", false),
+ (r"%abc", r"a\_c", false),
+ (r"\%abc", r"a\_c", false),
+ (r"a\\_c%", r"a\_c", false),
+ (r"", r"%abc", false),
+ (r"\", r"%abc", false),
+ (r"\\", r"%abc", false),
+ (r"\\\", r"%abc", false),
+ (r"\\\\", r"%abc", false),
+ (r"a", r"%abc", false),
+ (r"\a", r"%abc", false),
+ (r"\\a", r"%abc", false),
+ (r"%", r"%abc", false),
+ (r"\%", r"%abc", false),
+ (r"\\%", r"%abc", false),
+ (r"%%", r"%abc", false),
+ (r"\%%", r"%abc", false),
+ (r"\\%%", r"%abc", false),
+ (r"_", r"%abc", false),
+ (r"\_", r"%abc", false),
+ (r"\\_", r"%abc", false),
+ (r"__", r"%abc", false),
+ (r"\__", r"%abc", false),
+ (r"\\__", r"%abc", false),
+ (r"abc", r"%abc", true),
+ (r"a_c", r"%abc", false),
+ (r"a\bc", r"%abc", false),
+ (r"a\_c", r"%abc", false),
+ (r"%abc", r"%abc", true),
+ (r"\%abc", r"%abc", true),
+ (r"a\\_c%", r"%abc", false),
+ (r"", r"\%abc", false),
+ (r"\", r"\%abc", false),
+ (r"\\", r"\%abc", false),
+ (r"\\\", r"\%abc", false),
+ (r"\\\\", r"\%abc", false),
+ (r"a", r"\%abc", false),
+ (r"\a", r"\%abc", false),
+ (r"\\a", r"\%abc", false),
+ (r"%", r"\%abc", false),
+ (r"\%", r"\%abc", false),
+ (r"\\%", r"\%abc", false),
+ (r"%%", r"\%abc", false),
+ (r"\%%", r"\%abc", false),
+ (r"\\%%", r"\%abc", false),
+ (r"_", r"\%abc", false),
+ (r"\_", r"\%abc", false),
+ (r"\\_", r"\%abc", false),
+ (r"__", r"\%abc", false),
+ (r"\__", r"\%abc", false),
+ (r"\\__", r"\%abc", false),
+ (r"abc", r"\%abc", false),
+ (r"a_c", r"\%abc", false),
+ (r"a\bc", r"\%abc", false),
+ (r"a\_c", r"\%abc", false),
+ (r"%abc", r"\%abc", true),
+ (r"\%abc", r"\%abc", false),
+ (r"a\\_c%", r"\%abc", false),
+ (r"", r"a\\_c%", false),
+ (r"\", r"a\\_c%", false),
+ (r"\\", r"a\\_c%", false),
+ (r"\\\", r"a\\_c%", false),
+ (r"\\\\", r"a\\_c%", false),
+ (r"a", r"a\\_c%", false),
+ (r"\a", r"a\\_c%", false),
+ (r"\\a", r"a\\_c%", false),
+ (r"%", r"a\\_c%", false),
+ (r"\%", r"a\\_c%", false),
+ (r"\\%", r"a\\_c%", false),
+ (r"%%", r"a\\_c%", false),
+ (r"\%%", r"a\\_c%", false),
+ (r"\\%%", r"a\\_c%", false),
+ (r"_", r"a\\_c%", false),
+ (r"\_", r"a\\_c%", false),
+ (r"\\_", r"a\\_c%", false),
+ (r"__", r"a\\_c%", false),
+ (r"\__", r"a\\_c%", false),
+ (r"\\__", r"a\\_c%", false),
+ (r"abc", r"a\\_c%", false),
+ (r"a_c", r"a\\_c%", false),
+ (r"a\bc", r"a\\_c%", true),
+ (r"a\_c", r"a\\_c%", true),
+ (r"%abc", r"a\\_c%", false),
+ (r"\%abc", r"a\\_c%", false),
+ (r"a\\_c%", r"a\\_c%", false),
+ ];
+
+ let values = test_cases
+ .iter()
+ .map(|(value, _, _)| *value)
+ .collect::<Vec<_>>();
+ let patterns = test_cases
+ .iter()
+ .map(|(_, pattern, _)| *pattern)
+ .collect::<Vec<_>>();
+ let expected = BooleanArray::from(
+ test_cases
+ .iter()
+ .map(|(_, _, expected)| *expected)
+ .collect::<Vec<_>>(),
+ );
+ let unexpected = BooleanArray::from(
+ test_cases
+ .iter()
+ .map(|(_, _, expected)| !*expected)
+ .collect::<Vec<_>>(),
+ );
+
+ for string_type in [DataType::Utf8, DataType::LargeUtf8,
DataType::Utf8View] {
+ let values = make_array(values.iter(), &string_type);
+ let patterns = make_array(patterns.iter(), &string_type);
+ let (values, patterns) = (values.as_ref(), patterns.as_ref());
+
+ assert_eq!(like(&values, &patterns).unwrap(), expected,);
+ assert_eq!(ilike(&values, &patterns).unwrap(), expected,);
+ assert_eq!(nlike(&values, &patterns).unwrap(), unexpected,);
+ assert_eq!(nilike(&values, &patterns).unwrap(), unexpected,);
+ }
+ }
+
+ fn make_datums(
+ value: impl AsRef<str>,
+ data_type: &DataType,
+ ) -> Vec<(Box<dyn Datum>, DatumType)> {
+ match data_type {
+ DataType::Utf8 => {
+ let array = StringArray::from_iter_values([value]);
+ vec![
+ (Box::new(array.clone()), DatumType::Array),
+ (Box::new(Scalar::new(array)), DatumType::Scalar),
+ ]
+ }
+ DataType::LargeUtf8 => {
+ let array = LargeStringArray::from_iter_values([value]);
+ vec![
+ (Box::new(array.clone()), DatumType::Array),
+ (Box::new(Scalar::new(array)), DatumType::Scalar),
+ ]
+ }
+ DataType::Utf8View => {
+ let array = StringViewArray::from_iter_values([value]);
+ vec![
+ (Box::new(array.clone()), DatumType::Array),
+ (Box::new(Scalar::new(array)), DatumType::Scalar),
+ ]
+ }
+ _ => unimplemented!(),
+ }
+ }
+
+ fn make_array(
+ values: impl IntoIterator<Item: AsRef<str>>,
+ data_type: &DataType,
+ ) -> Box<dyn Array> {
+ match data_type {
+ DataType::Utf8 => Box::new(StringArray::from_iter_values(values)),
+ DataType::LargeUtf8 =>
Box::new(LargeStringArray::from_iter_values(values)),
+ DataType::Utf8View =>
Box::new(StringViewArray::from_iter_values(values)),
+ _ => unimplemented!(),
+ }
+ }
+
+ #[derive(Debug)]
+ enum DatumType {
+ Array,
+ Scalar,
+ }
}
diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
index ae2493692d..8bbfe65bab 100644
--- a/arrow-string/src/predicate.rs
+++ b/arrow-string/src/predicate.rs
@@ -18,7 +18,7 @@
use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray};
use arrow_buffer::BooleanBuffer;
use arrow_schema::ArrowError;
-use memchr::memchr2;
+use memchr::memchr3;
use memchr::memmem::Finder;
use regex::{Regex, RegexBuilder};
use std::iter::zip;
@@ -45,16 +45,12 @@ impl<'a> Predicate<'a> {
pub fn like(pattern: &'a str) -> Result<Self, ArrowError> {
if !contains_like_pattern(pattern) {
Ok(Self::Eq(pattern))
- } else if pattern.ends_with('%')
- && !pattern.ends_with("\\%")
- && !contains_like_pattern(&pattern[..pattern.len() - 1])
- {
+ } else if pattern.ends_with('%') &&
!contains_like_pattern(&pattern[..pattern.len() - 1]) {
Ok(Self::StartsWith(&pattern[..pattern.len() - 1]))
} else if pattern.starts_with('%') &&
!contains_like_pattern(&pattern[1..]) {
Ok(Self::EndsWith(&pattern[1..]))
} else if pattern.starts_with('%')
&& pattern.ends_with('%')
- && !pattern.ends_with("\\%")
&& !contains_like_pattern(&pattern[1..pattern.len() - 1])
{
Ok(Self::contains(&pattern[1..pattern.len() - 1]))
@@ -262,12 +258,16 @@ fn regex_like(pattern: &str, case_insensitive: bool) ->
Result<Regex, ArrowError
match c {
'\\' => {
match chars_iter.peek() {
- Some(next) if is_like_pattern(*next) => {
- result.push(*next);
+ Some(&next) => {
+ if regex_syntax::is_meta_character(next) {
+ result.push('\\');
+ }
+ result.push(next);
// Skipping the next char as it is already appended
chars_iter.next();
}
- _ => {
+ None => {
+ // Trailing backslash in the pattern. E.g. PostgreSQL
and Trino treat it as an error, but e.g. Snowflake treats it as a literal
backslash
result.push('\\');
result.push('\\');
}
@@ -301,12 +301,8 @@ fn regex_like(pattern: &str, case_insensitive: bool) ->
Result<Regex, ArrowError
})
}
-fn is_like_pattern(c: char) -> bool {
- c == '%' || c == '_'
-}
-
fn contains_like_pattern(pattern: &str) -> bool {
- memchr2(b'%', b'_', pattern.as_bytes()).is_some()
+ memchr3(b'%', b'_', b'\\', pattern.as_bytes()).is_some()
}
#[cfg(test)]
@@ -314,51 +310,32 @@ mod tests {
use super::*;
#[test]
- fn test_replace_start_end_percent() {
- let a_eq = "%foobar%";
- let expected = "foobar";
- let r = regex_like(a_eq, false).unwrap();
- assert_eq!(r.to_string(), expected);
- }
-
- #[test]
- fn test_replace_middle_percent() {
- let a_eq = "foo%bar";
- let expected = "^foo.*bar$";
- let r = regex_like(a_eq, false).unwrap();
- assert_eq!(r.to_string(), expected);
- }
-
- #[test]
- fn test_replace_underscore() {
- let a_eq = "foo_bar";
- let expected = "^foo.bar$";
- let r = regex_like(a_eq, false).unwrap();
- assert_eq!(r.to_string(), expected);
- }
-
- #[test]
- fn test_replace_like_wildcards_leave_like_meta_chars() {
- let a_eq = "\\%\\_";
- let expected = "^%_$";
- let r = regex_like(a_eq, false).unwrap();
- assert_eq!(r.to_string(), expected);
- }
-
- #[test]
- fn test_replace_like_wildcards_with_multiple_escape_chars() {
- let a_eq = "\\\\%";
- let expected = "^\\\\%$";
- let r = regex_like(a_eq, false).unwrap();
- assert_eq!(r.to_string(), expected);
- }
-
- #[test]
- fn test_replace_like_wildcards_escape_regex_meta_char() {
- let a_eq = ".";
- let expected = "^\\.$";
- let r = regex_like(a_eq, false).unwrap();
- assert_eq!(r.to_string(), expected);
+ fn test_regex_like() {
+ let test_cases = [
+ // %..%
+ (r"%foobar%", r"foobar"),
+ // ..%..
+ (r"foo%bar", r"^foo.*bar$"),
+ // .._..
+ (r"foo_bar", r"^foo.bar$"),
+ // escaped wildcards
+ (r"\%\_", r"^%_$"),
+ // escaped non-wildcard
+ (r"\a", r"^a$"),
+ // escaped escape and wildcard
+ (r"\\%", r"^\\"),
+ // escaped escape and non-wildcard
+ (r"\\a", r"^\\a$"),
+ // regex meta character
+ (r".", r"^\.$"),
+ (r"$", r"^\$$"),
+ (r"\\", r"^\\$"),
+ ];
+
+ for (like_pattern, expected_regexp) in test_cases {
+ let r = regex_like(like_pattern, false).unwrap();
+ assert_eq!(r.to_string(), expected_regexp);
+ }
}
#[test]