This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 12ca3f07d8 Fix string view LIKE checks with NULL values (#6662)
12ca3f07d8 is described below

commit 12ca3f07d8e486275aa8871ca2c57a470460e11c
Author: Piotr Findeisen <[email protected]>
AuthorDate: Fri Nov 8 17:59:01 2024 +0100

    Fix string view LIKE checks with NULL values (#6662)
    
    * Fix string view LIKE checks with NULL values
    
    * Add TODO comments to operations yet to be fixed
    
    * Use from_unary for contains with string view
    
    from_unary takes care of nulls
    
    * Fix performance
    
    * fixup! Fix performance
---
 arrow-string/src/like.rs      | 119 +++++++++++++++++++++++++++++++++++++++++-
 arrow-string/src/predicate.rs |  34 ++++++------
 2 files changed, 133 insertions(+), 20 deletions(-)

diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index 5c76d5c810..fee3b792a9 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -1708,7 +1708,93 @@ mod tests {
     }
 
     #[test]
-    fn like_scalar_null() {
+    fn string_null_like_pattern() {
+        // Different patterns have different execution code paths
+        for pattern in &[
+            "",           // can execute as equality check
+            "_",          // can execute as length check
+            "%",          // can execute as starts_with("") or non-null check
+            "a%",         // can execute as starts_with("a")
+            "%a",         // can execute as ends_with("")
+            "a%b",        // can execute as starts_with("a") && ends_with("b")
+            "%a%",        // can_execute as contains("a")
+            "%a%b_c_d%e", // can_execute as regular expression
+        ] {
+            let a = Scalar::new(StringArray::new_null(1));
+            let b = StringArray::new_scalar(pattern);
+            let r = like(&a, &b).unwrap();
+            assert_eq!(r.len(), 1, "With pattern {pattern}");
+            assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+            assert!(r.is_null(0), "With pattern {pattern}");
+
+            let a = Scalar::new(StringArray::new_null(1));
+            let b = StringArray::from_iter_values([pattern]);
+            let r = like(&a, &b).unwrap();
+            assert_eq!(r.len(), 1, "With pattern {pattern}");
+            assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+            assert!(r.is_null(0), "With pattern {pattern}");
+
+            let a = StringArray::new_null(1);
+            let b = StringArray::from_iter_values([pattern]);
+            let r = like(&a, &b).unwrap();
+            assert_eq!(r.len(), 1, "With pattern {pattern}");
+            assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+            assert!(r.is_null(0), "With pattern {pattern}");
+
+            let a = StringArray::new_null(1);
+            let b = StringArray::new_scalar(pattern);
+            let r = like(&a, &b).unwrap();
+            assert_eq!(r.len(), 1, "With pattern {pattern}");
+            assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+            assert!(r.is_null(0), "With pattern {pattern}");
+        }
+    }
+
+    #[test]
+    fn string_view_null_like_pattern() {
+        // Different patterns have different execution code paths
+        for pattern in &[
+            "",           // can execute as equality check
+            "_",          // can execute as length check
+            "%",          // can execute as starts_with("") or non-null check
+            "a%",         // can execute as starts_with("a")
+            "%a",         // can execute as ends_with("")
+            "a%b",        // can execute as starts_with("a") && ends_with("b")
+            "%a%",        // can_execute as contains("a")
+            "%a%b_c_d%e", // can_execute as regular expression
+        ] {
+            let a = Scalar::new(StringViewArray::new_null(1));
+            let b = StringViewArray::new_scalar(pattern);
+            let r = like(&a, &b).unwrap();
+            assert_eq!(r.len(), 1, "With pattern {pattern}");
+            assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+            assert!(r.is_null(0), "With pattern {pattern}");
+
+            let a = Scalar::new(StringViewArray::new_null(1));
+            let b = StringViewArray::from_iter_values([pattern]);
+            let r = like(&a, &b).unwrap();
+            assert_eq!(r.len(), 1, "With pattern {pattern}");
+            assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+            assert!(r.is_null(0), "With pattern {pattern}");
+
+            let a = StringViewArray::new_null(1);
+            let b = StringViewArray::from_iter_values([pattern]);
+            let r = like(&a, &b).unwrap();
+            assert_eq!(r.len(), 1, "With pattern {pattern}");
+            assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+            assert!(r.is_null(0), "With pattern {pattern}");
+
+            let a = StringViewArray::new_null(1);
+            let b = StringViewArray::new_scalar(pattern);
+            let r = like(&a, &b).unwrap();
+            assert_eq!(r.len(), 1, "With pattern {pattern}");
+            assert_eq!(r.null_count(), 1, "With pattern {pattern}");
+            assert!(r.is_null(0), "With pattern {pattern}");
+        }
+    }
+
+    #[test]
+    fn string_like_scalar_null() {
         let a = StringArray::new_scalar("a");
         let b = Scalar::new(StringArray::new_null(1));
         let r = like(&a, &b).unwrap();
@@ -1737,4 +1823,35 @@ mod tests {
         assert_eq!(r.null_count(), 1);
         assert!(r.is_null(0));
     }
+
+    #[test]
+    fn string_view_like_scalar_null() {
+        let a = StringViewArray::new_scalar("a");
+        let b = Scalar::new(StringViewArray::new_null(1));
+        let r = like(&a, &b).unwrap();
+        assert_eq!(r.len(), 1);
+        assert_eq!(r.null_count(), 1);
+        assert!(r.is_null(0));
+
+        let a = StringViewArray::from_iter_values(["a"]);
+        let b = Scalar::new(StringViewArray::new_null(1));
+        let r = like(&a, &b).unwrap();
+        assert_eq!(r.len(), 1);
+        assert_eq!(r.null_count(), 1);
+        assert!(r.is_null(0));
+
+        let a = StringViewArray::from_iter_values(["a"]);
+        let b = StringViewArray::new_null(1);
+        let r = like(&a, &b).unwrap();
+        assert_eq!(r.len(), 1);
+        assert_eq!(r.null_count(), 1);
+        assert!(r.is_null(0));
+
+        let a = StringViewArray::new_scalar("a");
+        let b = StringViewArray::new_null(1);
+        let r = like(&a, &b).unwrap();
+        assert_eq!(r.len(), 1);
+        assert_eq!(r.null_count(), 1);
+        assert!(r.is_null(0));
+    }
 }
diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
index 408d9d45cc..1d141e6b0b 100644
--- a/arrow-string/src/predicate.rs
+++ b/arrow-string/src/predicate.rs
@@ -15,7 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray};
+use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray};
+use arrow_buffer::BooleanBuffer;
 use arrow_schema::ArrowError;
 use memchr::memchr2;
 use memchr::memmem::Finder;
@@ -114,30 +115,21 @@ impl<'a> Predicate<'a> {
             Predicate::IEqAscii(v) => BooleanArray::from_unary(array, 
|haystack| {
                 haystack.eq_ignore_ascii_case(v) != negate
             }),
-            Predicate::Contains(finder) => {
-                if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
-                    BooleanArray::from(
-                        string_view_array
-                            .bytes_iter()
-                            .map(|haystack| finder.find(haystack).is_some() != 
negate)
-                            .collect::<Vec<_>>(),
-                    )
-                } else {
-                    BooleanArray::from_unary(array, |haystack| {
-                        finder.find(haystack.as_bytes()).is_some() != negate
-                    })
-                }
-            }
+            Predicate::Contains(finder) => BooleanArray::from_unary(array, 
|haystack| {
+                finder.find(haystack.as_bytes()).is_some() != negate
+            }),
             Predicate::StartsWith(v) => {
                 if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
-                    BooleanArray::from(
+                    let nulls = string_view_array.logical_nulls();
+                    let values = BooleanBuffer::from(
                         string_view_array
                             .prefix_bytes_iter(v.len())
                             .map(|haystack| {
                                 equals_bytes(haystack, v.as_bytes(), 
equals_kernel) != negate
                             })
                             .collect::<Vec<_>>(),
-                    )
+                    );
+                    BooleanArray::new(values, nulls)
                 } else {
                     BooleanArray::from_unary(array, |haystack| {
                         starts_with(haystack, v, equals_kernel) != negate
@@ -146,6 +138,7 @@ impl<'a> Predicate<'a> {
             }
             Predicate::IStartsWithAscii(v) => {
                 if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
+                    // TODO respect null buffer
                     BooleanArray::from(
                         string_view_array
                             .prefix_bytes_iter(v.len())
@@ -166,14 +159,16 @@ impl<'a> Predicate<'a> {
             }
             Predicate::EndsWith(v) => {
                 if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
-                    BooleanArray::from(
+                    let nulls = string_view_array.logical_nulls();
+                    let values = BooleanBuffer::from(
                         string_view_array
                             .suffix_bytes_iter(v.len())
                             .map(|haystack| {
                                 equals_bytes(haystack, v.as_bytes(), 
equals_kernel) != negate
                             })
                             .collect::<Vec<_>>(),
-                    )
+                    );
+                    BooleanArray::new(values, nulls)
                 } else {
                     BooleanArray::from_unary(array, |haystack| {
                         ends_with(haystack, v, equals_kernel) != negate
@@ -182,6 +177,7 @@ impl<'a> Predicate<'a> {
             }
             Predicate::IEndsWithAscii(v) => {
                 if let Some(string_view_array) = 
array.as_any().downcast_ref::<StringViewArray>() {
+                    // TODO respect null buffer
                     BooleanArray::from(
                         string_view_array
                             .suffix_bytes_iter(v.len())

Reply via email to