This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new d5c361b63 Optimize `regexp_replace` when the input is a sparse array 
(#3804)
d5c361b63 is described below

commit d5c361b63c31b57d0052c501a94c1b1f7847b402
Author: Batuhan Taskaya <[email protected]>
AuthorDate: Wed Oct 12 23:17:36 2022 +0300

    Optimize `regexp_replace` when the input is a sparse array (#3804)
    
    * Optimize `regexp_replace` when the input is a sparse array (by reusing 
null buffers)
    
    * Add a test regarding the slicing behavior
---
 datafusion/physical-expr/src/regex_expressions.rs | 105 ++++++++++++++++++++--
 1 file changed, 97 insertions(+), 8 deletions(-)

diff --git a/datafusion/physical-expr/src/regex_expressions.rs 
b/datafusion/physical-expr/src/regex_expressions.rs
index d7edc3400..b76bb2c45 100644
--- a/datafusion/physical-expr/src/regex_expressions.rs
+++ b/datafusion/physical-expr/src/regex_expressions.rs
@@ -22,7 +22,8 @@
 //! Regex expressions
 
 use arrow::array::{
-    new_null_array, Array, ArrayRef, GenericStringArray, OffsetSizeTrait,
+    new_null_array, Array, ArrayData, ArrayRef, BufferBuilder, 
GenericStringArray,
+    OffsetSizeTrait,
 };
 use arrow::compute;
 use datafusion_common::{DataFusionError, Result};
@@ -254,13 +255,38 @@ fn _regexp_replace_static_pattern_replace<T: 
OffsetSizeTrait>(
     // with rust ones.
     let replacement = regex_replace_posix_groups(replacement);
 
-    let result = string_array
-        .iter()
-        .map(|string| {
-            string.map(|string| re.replacen(string, limit, 
replacement.as_str()))
-        })
-        .collect::<GenericStringArray<T>>();
-    Ok(Arc::new(result) as ArrayRef)
+    // We are going to create the underlying string buffer from its parts
+    // to be able to re-use the existing null buffer for sparse arrays.
+    let mut vals = BufferBuilder::<u8>::new({
+        let offsets = string_array.value_offsets();
+        (offsets[string_array.len()] - offsets[0])
+            .to_usize()
+            .unwrap()
+    });
+    let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
+    new_offsets.append(T::zero());
+
+    string_array.iter().for_each(|val| {
+        if let Some(val) = val {
+            let result = re.replacen(val, limit, replacement.as_str());
+            vals.append_slice(result.as_bytes());
+        }
+        new_offsets.append(T::from_usize(vals.len()).unwrap());
+    });
+
+    let data = ArrayData::try_new(
+        GenericStringArray::<T>::DATA_TYPE,
+        string_array.len(),
+        string_array
+            .data_ref()
+            .null_buffer()
+            .map(|b| b.bit_slice(string_array.offset(), string_array.len())),
+        0,
+        vec![new_offsets.finish(), vals.finish()],
+        vec![],
+    )?;
+    let result_array = GenericStringArray::<T>::from(data);
+    Ok(Arc::new(result_array) as ArrayRef)
 }
 
 /// Determine which implementation of the regexp_replace to use based
@@ -513,4 +539,67 @@ mod tests {
             }
         }
     }
+
+    #[test]
+    fn test_static_pattern_regexp_replace_with_null_buffers() {
+        let values = StringArray::from(vec![
+            Some("a"),
+            None,
+            Some("b"),
+            None,
+            Some("a"),
+            None,
+            None,
+            Some("c"),
+        ]);
+        let patterns = StringArray::from(vec!["a"; 1]);
+        let replacements = StringArray::from(vec!["foo"; 1]);
+        let expected = StringArray::from(vec![
+            Some("foo"),
+            None,
+            Some("b"),
+            None,
+            Some("foo"),
+            None,
+            None,
+            Some("c"),
+        ]);
+
+        let re = _regexp_replace_static_pattern_replace::<i32>(&[
+            Arc::new(values),
+            Arc::new(patterns),
+            Arc::new(replacements),
+        ])
+        .unwrap();
+
+        assert_eq!(re.as_ref(), &expected);
+        assert_eq!(re.null_count(), 4);
+    }
+
+    #[test]
+    fn test_static_pattern_regexp_replace_with_sliced_null_buffer() {
+        let values = StringArray::from(vec![
+            Some("a"),
+            None,
+            Some("b"),
+            None,
+            Some("a"),
+            None,
+            None,
+            Some("c"),
+        ]);
+        let values = values.slice(2, 5);
+        let patterns = StringArray::from(vec!["a"; 1]);
+        let replacements = StringArray::from(vec!["foo"; 1]);
+        let expected = StringArray::from(vec![Some("b"), None, Some("foo"), 
None, None]);
+
+        let re = _regexp_replace_static_pattern_replace::<i32>(&[
+            Arc::new(values),
+            Arc::new(patterns),
+            Arc::new(replacements),
+        ])
+        .unwrap();
+        assert_eq!(re.as_ref(), &expected);
+        assert_eq!(re.null_count(), 3);
+    }
 }

Reply via email to