This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 12ca3f07d8 Fix string view LIKE checks with NULL values (#6662)
12ca3f07d8 is described below
commit 12ca3f07d8e486275aa8871ca2c57a470460e11c
Author: Piotr Findeisen <[email protected]>
AuthorDate: Fri Nov 8 17:59:01 2024 +0100
Fix string view LIKE checks with NULL values (#6662)
* Fix string view LIKE checks with NULL values
* Add TODO comments to operations yet to be fixed
* Use from_unary for contains with string view
from_unary takes care of nulls
* Fix performance
* fixup! Fix performance
---
arrow-string/src/like.rs | 119 +++++++++++++++++++++++++++++++++++++++++-
arrow-string/src/predicate.rs | 34 ++++++------
2 files changed, 133 insertions(+), 20 deletions(-)
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index 5c76d5c810..fee3b792a9 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -1708,7 +1708,93 @@ mod tests {
}
#[test]
- fn like_scalar_null() {
+ fn string_null_like_pattern() {
+ // Different patterns have different execution code paths
+ for pattern in &[
+ "", // can execute as equality check
+ "_", // can execute as length check
+ "%", // can execute as starts_with("") or non-null check
+ "a%", // can execute as starts_with("a")
+ "%a", // can execute as ends_with("")
+ "a%b", // can execute as starts_with("a") && ends_with("b")
+ "%a%", // can_execute as contains("a")
+ "%a%b_c_d%e", // can_execute as regular expression
+ ] {
+ let a = Scalar::new(StringArray::new_null(1));
+ let b = StringArray::new_scalar(pattern);
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1, "With pattern {pattern}");
+ assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+ assert!(r.is_null(0), "With pattern {pattern}");
+
+ let a = Scalar::new(StringArray::new_null(1));
+ let b = StringArray::from_iter_values([pattern]);
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1, "With pattern {pattern}");
+ assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+ assert!(r.is_null(0), "With pattern {pattern}");
+
+ let a = StringArray::new_null(1);
+ let b = StringArray::from_iter_values([pattern]);
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1, "With pattern {pattern}");
+ assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+ assert!(r.is_null(0), "With pattern {pattern}");
+
+ let a = StringArray::new_null(1);
+ let b = StringArray::new_scalar(pattern);
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1, "With pattern {pattern}");
+ assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+ assert!(r.is_null(0), "With pattern {pattern}");
+ }
+ }
+
+ #[test]
+ fn string_view_null_like_pattern() {
+ // Different patterns have different execution code paths
+ for pattern in &[
+ "", // can execute as equality check
+ "_", // can execute as length check
+ "%", // can execute as starts_with("") or non-null check
+ "a%", // can execute as starts_with("a")
+ "%a", // can execute as ends_with("")
+ "a%b", // can execute as starts_with("a") && ends_with("b")
+ "%a%", // can_execute as contains("a")
+ "%a%b_c_d%e", // can_execute as regular expression
+ ] {
+ let a = Scalar::new(StringViewArray::new_null(1));
+ let b = StringViewArray::new_scalar(pattern);
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1, "With pattern {pattern}");
+ assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+ assert!(r.is_null(0), "With pattern {pattern}");
+
+ let a = Scalar::new(StringViewArray::new_null(1));
+ let b = StringViewArray::from_iter_values([pattern]);
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1, "With pattern {pattern}");
+ assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+ assert!(r.is_null(0), "With pattern {pattern}");
+
+ let a = StringViewArray::new_null(1);
+ let b = StringViewArray::from_iter_values([pattern]);
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1, "With pattern {pattern}");
+ assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+ assert!(r.is_null(0), "With pattern {pattern}");
+
+ let a = StringViewArray::new_null(1);
+ let b = StringViewArray::new_scalar(pattern);
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1, "With pattern {pattern}");
+ assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+ assert!(r.is_null(0), "With pattern {pattern}");
+ }
+ }
+
+ #[test]
+ fn string_like_scalar_null() {
let a = StringArray::new_scalar("a");
let b = Scalar::new(StringArray::new_null(1));
let r = like(&a, &b).unwrap();
@@ -1737,4 +1823,35 @@ mod tests {
assert_eq!(r.null_count(), 1);
assert!(r.is_null(0));
}
+
+ #[test]
+ fn string_view_like_scalar_null() {
+ let a = StringViewArray::new_scalar("a");
+ let b = Scalar::new(StringViewArray::new_null(1));
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1);
+ assert_eq!(r.null_count(), 1);
+ assert!(r.is_null(0));
+
+ let a = StringViewArray::from_iter_values(["a"]);
+ let b = Scalar::new(StringViewArray::new_null(1));
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1);
+ assert_eq!(r.null_count(), 1);
+ assert!(r.is_null(0));
+
+ let a = StringViewArray::from_iter_values(["a"]);
+ let b = StringViewArray::new_null(1);
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1);
+ assert_eq!(r.null_count(), 1);
+ assert!(r.is_null(0));
+
+ let a = StringViewArray::new_scalar("a");
+ let b = StringViewArray::new_null(1);
+ let r = like(&a, &b).unwrap();
+ assert_eq!(r.len(), 1);
+ assert_eq!(r.null_count(), 1);
+ assert!(r.is_null(0));
+ }
}
diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
index 408d9d45cc..1d141e6b0b 100644
--- a/arrow-string/src/predicate.rs
+++ b/arrow-string/src/predicate.rs
@@ -15,7 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray};
+use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray};
+use arrow_buffer::BooleanBuffer;
use arrow_schema::ArrowError;
use memchr::memchr2;
use memchr::memmem::Finder;
@@ -114,30 +115,21 @@ impl<'a> Predicate<'a> {
Predicate::IEqAscii(v) => BooleanArray::from_unary(array,
|haystack| {
haystack.eq_ignore_ascii_case(v) != negate
}),
- Predicate::Contains(finder) => {
- if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
- BooleanArray::from(
- string_view_array
- .bytes_iter()
- .map(|haystack| finder.find(haystack).is_some() !=
negate)
- .collect::<Vec<_>>(),
- )
- } else {
- BooleanArray::from_unary(array, |haystack| {
- finder.find(haystack.as_bytes()).is_some() != negate
- })
- }
- }
+ Predicate::Contains(finder) => BooleanArray::from_unary(array,
|haystack| {
+ finder.find(haystack.as_bytes()).is_some() != negate
+ }),
Predicate::StartsWith(v) => {
if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
- BooleanArray::from(
+ let nulls = string_view_array.logical_nulls();
+ let values = BooleanBuffer::from(
string_view_array
.prefix_bytes_iter(v.len())
.map(|haystack| {
equals_bytes(haystack, v.as_bytes(),
equals_kernel) != negate
})
.collect::<Vec<_>>(),
- )
+ );
+ BooleanArray::new(values, nulls)
} else {
BooleanArray::from_unary(array, |haystack| {
starts_with(haystack, v, equals_kernel) != negate
@@ -146,6 +138,7 @@ impl<'a> Predicate<'a> {
}
Predicate::IStartsWithAscii(v) => {
if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ // TODO respect null buffer
BooleanArray::from(
string_view_array
.prefix_bytes_iter(v.len())
@@ -166,14 +159,16 @@ impl<'a> Predicate<'a> {
}
Predicate::EndsWith(v) => {
if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
- BooleanArray::from(
+ let nulls = string_view_array.logical_nulls();
+ let values = BooleanBuffer::from(
string_view_array
.suffix_bytes_iter(v.len())
.map(|haystack| {
equals_bytes(haystack, v.as_bytes(),
equals_kernel) != negate
})
.collect::<Vec<_>>(),
- )
+ );
+ BooleanArray::new(values, nulls)
} else {
BooleanArray::from_unary(array, |haystack| {
ends_with(haystack, v, equals_kernel) != negate
@@ -182,6 +177,7 @@ impl<'a> Predicate<'a> {
}
Predicate::IEndsWithAscii(v) => {
if let Some(string_view_array) =
array.as_any().downcast_ref::<StringViewArray>() {
+ // TODO respect null buffer
BooleanArray::from(
string_view_array
.suffix_bytes_iter(v.len())