This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch string-view2
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/string-view2 by this push:
     new 34d42bccb8 Initial support for regex_replace on `StringViewArray` (#11556)
34d42bccb8 is described below

commit 34d42bccb89ff96e9d4fbd876bdc52a254bd1e14
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Mon Jul 22 08:23:57 2024 -0400

    Initial support for regex_replace on `StringViewArray` (#11556)
    
    * initial support for string view regex
    
    * update tests
---
 datafusion/functions/src/regex/regexpreplace.rs | 217 +++++++++++++++++-------
 1 file changed, 151 insertions(+), 66 deletions(-)

diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs
index 378b6ced07..9f4d0374e1 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -17,11 +17,14 @@
 
 //! Regx expressions
 use arrow::array::new_null_array;
+use arrow::array::ArrayAccessor;
 use arrow::array::ArrayDataBuilder;
 use arrow::array::BufferBuilder;
 use arrow::array::GenericStringArray;
+use arrow::array::StringViewBuilder;
 use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
 use arrow::datatypes::DataType;
+use datafusion_common::cast::as_string_view_array;
 use datafusion_common::exec_err;
 use datafusion_common::plan_err;
 use datafusion_common::ScalarValue;
@@ -54,6 +57,7 @@ impl RegexpReplaceFunc {
             signature: Signature::one_of(
                 vec![
                     Exact(vec![Utf8, Utf8, Utf8]),
+                    Exact(vec![Utf8View, Utf8, Utf8]),
                     Exact(vec![Utf8, Utf8, Utf8, Utf8]),
                 ],
                 Volatility::Immutable,
@@ -80,6 +84,7 @@ impl ScalarUDFImpl for RegexpReplaceFunc {
         Ok(match &arg_types[0] {
             LargeUtf8 | LargeBinary => LargeUtf8,
             Utf8 | Binary => Utf8,
+            Utf8View | BinaryView => Utf8View,
             Null => Null,
             Dictionary(_, t) => match **t {
                 LargeUtf8 | LargeBinary => LargeUtf8,
@@ -118,15 +123,18 @@ impl ScalarUDFImpl for RegexpReplaceFunc {
         }
     }
 }
+
 fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ArrayRef> {
     match args[0].data_type() {
         DataType::Utf8 => specialize_regexp_replace::<i32>(args),
         DataType::LargeUtf8 => specialize_regexp_replace::<i64>(args),
+        DataType::Utf8View => specialize_regexp_replace::<i32>(args),
         other => {
             internal_err!("Unsupported data type {other:?} for function regexp_replace")
         }
     }
 }
+
 /// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
 /// used by regexp_replace
 fn regex_replace_posix_groups(replacement: &str) -> String {
@@ -280,8 +288,8 @@ pub fn regexp_replace<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef>
     }
 }
 
-fn _regexp_replace_early_abort<T: OffsetSizeTrait>(
-    input_array: &GenericStringArray<T>,
+fn _regexp_replace_early_abort<T: ArrayAccessor>(
+    input_array: T,
     sz: usize,
 ) -> Result<ArrayRef> {
     // Mimicking the existing behavior of regexp_replace, if any of the scalar arguments
@@ -290,13 +298,14 @@ fn _regexp_replace_early_abort<T: OffsetSizeTrait>(
     // Also acts like an early abort mechanism when the input array is empty.
     Ok(new_null_array(input_array.data_type(), sz))
 }
+
 /// Get the first argument from the given string array.
 ///
 /// Note: If the array is empty or the first argument is null,
 /// then calls the given early abort function.
 macro_rules! fetch_string_arg {
     ($ARG:expr, $NAME:expr, $T:ident, $EARLY_ABORT:ident, $ARRAY_SIZE:expr) => {{
-        let array = as_generic_string_array::<T>($ARG)?;
+        let array = as_generic_string_array::<$T>($ARG)?;
         if array.len() == 0 || array.is_null(0) {
             return $EARLY_ABORT(array, $ARRAY_SIZE);
         } else {
@@ -313,25 +322,24 @@ macro_rules! fetch_string_arg {
 fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
     args: &[ArrayRef],
 ) -> Result<ArrayRef> {
-    let string_array = as_generic_string_array::<T>(&args[0])?;
-    let array_size = string_array.len();
+    let array_size = args[0].len();
     let pattern = fetch_string_arg!(
         &args[1],
         "pattern",
-        T,
+        i32,
         _regexp_replace_early_abort,
         array_size
     );
     let replacement = fetch_string_arg!(
         &args[2],
         "replacement",
-        T,
+        i32,
         _regexp_replace_early_abort,
         array_size
     );
     let flags = match args.len() {
         3 => None,
-        4 => Some(fetch_string_arg!(&args[3], "flags", T, _regexp_replace_early_abort, array_size)),
+        4 => Some(fetch_string_arg!(&args[3], "flags", i32, _regexp_replace_early_abort, array_size)),
         other => {
             return exec_err!(
                 "regexp_replace was called with {other} arguments. It requires at least 3 and at most 4."
@@ -358,32 +366,61 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
     // with rust ones.
     let replacement = regex_replace_posix_groups(replacement);
 
-    // We are going to create the underlying string buffer from its parts
-    // to be able to re-use the existing null buffer for sparse arrays.
-    let mut vals = BufferBuilder::<u8>::new({
-        let offsets = string_array.value_offsets();
-        (offsets[string_array.len()] - offsets[0])
-            .to_usize()
-            .expect("Failed to convert usize")
-    });
-    let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
-    new_offsets.append(T::zero());
-
-    string_array.iter().for_each(|val| {
-        if let Some(val) = val {
-            let result = re.replacen(val, limit, replacement.as_str());
-            vals.append_slice(result.as_bytes());
+    let string_array_type = args[0].data_type();
+    match string_array_type {
+        DataType::Utf8 | DataType::LargeUtf8 => {
+            let string_array = as_generic_string_array::<T>(&args[0])?;
+
+            // We are going to create the underlying string buffer from its parts
+            // to be able to re-use the existing null buffer for sparse arrays.
+            let mut vals = BufferBuilder::<u8>::new({
+                let offsets = string_array.value_offsets();
+                (offsets[string_array.len()] - offsets[0])
+                    .to_usize()
+                    .unwrap()
+            });
+            let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
+            new_offsets.append(T::zero());
+
+            string_array.iter().for_each(|val| {
+                if let Some(val) = val {
+                    let result = re.replacen(val, limit, replacement.as_str());
+                    vals.append_slice(result.as_bytes());
+                }
+                new_offsets.append(T::from_usize(vals.len()).unwrap());
+            });
+
+            let data = ArrayDataBuilder::new(GenericStringArray::<T>::DATA_TYPE)
+                .len(string_array.len())
+                .nulls(string_array.nulls().cloned())
+                .buffers(vec![new_offsets.finish(), vals.finish()])
+                .build()?;
+            let result_array = GenericStringArray::<T>::from(data);
+            Ok(Arc::new(result_array) as ArrayRef)
         }
-        new_offsets.append(T::from_usize(vals.len()).unwrap());
-    });
-
-    let data = ArrayDataBuilder::new(GenericStringArray::<T>::DATA_TYPE)
-        .len(string_array.len())
-        .nulls(string_array.nulls().cloned())
-        .buffers(vec![new_offsets.finish(), vals.finish()])
-        .build()?;
-    let result_array = GenericStringArray::<T>::from(data);
-    Ok(Arc::new(result_array) as ArrayRef)
+        DataType::Utf8View => {
+            let string_view_array = as_string_view_array(&args[0])?;
+
+            let mut builder = StringViewBuilder::with_capacity(string_view_array.len())
+                .with_block_size(1024 * 1024 * 2);
+
+            for val in string_view_array.iter() {
+                if let Some(val) = val {
+                    let result = re.replacen(val, limit, replacement.as_str());
+                    builder.append_value(result);
+                } else {
+                    builder.append_null();
+                }
+            }
+
+            let result = builder.finish();
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        _ => unreachable!(
+            "Invalid data type for regexp_replace: {}",
+            string_array_type
+        ),
+    }
 }
 
 /// Determine which implementation of the regexp_replace to use based
@@ -469,43 +506,91 @@ mod tests {
 
     use super::*;
 
-    #[test]
-    fn test_static_pattern_regexp_replace() {
-        let values = StringArray::from(vec!["abc"; 5]);
-        let patterns = StringArray::from(vec!["b"; 5]);
-        let replacements = StringArray::from(vec!["foo"; 5]);
-        let expected = StringArray::from(vec!["afooc"; 5]);
-
-        let re = _regexp_replace_static_pattern_replace::<i32>(&[
-            Arc::new(values),
-            Arc::new(patterns),
-            Arc::new(replacements),
-        ])
-        .unwrap();
-
-        assert_eq!(re.as_ref(), &expected);
+    macro_rules! static_pattern_regexp_replace {
+        ($name:ident, $T:ty, $O:ty) => {
+            #[test]
+            fn $name() {
+                let values = vec!["abc", "acd", "abcd1234567890123", "123456789012abc"];
+                let patterns = vec!["b"; 4];
+                let replacement = vec!["foo"; 4];
+                let expected =
+                    vec!["afooc", "acd", "afoocd1234567890123", "123456789012afooc"];
+
+                let values = <$T>::from(values);
+                let patterns = StringArray::from(patterns);
+                let replacements = StringArray::from(replacement);
+                let expected = <$T>::from(expected);
+
+                let re = _regexp_replace_static_pattern_replace::<$O>(&[
+                    Arc::new(values),
+                    Arc::new(patterns),
+                    Arc::new(replacements),
+                ])
+                .unwrap();
+
+                assert_eq!(re.as_ref(), &expected);
+            }
+        };
     }
 
-    #[test]
-    fn test_static_pattern_regexp_replace_with_flags() {
-        let values = StringArray::from(vec!["abc", "ABC", "aBc", "AbC", "aBC"]);
-        let patterns = StringArray::from(vec!["b"; 5]);
-        let replacements = StringArray::from(vec!["foo"; 5]);
-        let flags = StringArray::from(vec!["i"; 5]);
-        let expected =
-            StringArray::from(vec!["afooc", "AfooC", "afooc", "AfooC", "afooC"]);
-
-        let re = _regexp_replace_static_pattern_replace::<i32>(&[
-            Arc::new(values),
-            Arc::new(patterns),
-            Arc::new(replacements),
-            Arc::new(flags),
-        ])
-        .unwrap();
-
-        assert_eq!(re.as_ref(), &expected);
+    static_pattern_regexp_replace!(string_array, StringArray, i32);
+    static_pattern_regexp_replace!(string_view_array, StringViewArray, i32);
+    static_pattern_regexp_replace!(large_string_array, LargeStringArray, i64);
+
+    macro_rules! static_pattern_regexp_replace_with_flags {
+        ($name:ident, $T:ty, $O: ty) => {
+            #[test]
+            fn $name() {
+                let values = vec![
+                    "abc",
+                    "aBc",
+                    "acd",
+                    "abcd1234567890123",
+                    "aBcd1234567890123",
+                    "123456789012abc",
+                    "123456789012aBc",
+                ];
+                let expected = vec![
+                    "afooc",
+                    "afooc",
+                    "acd",
+                    "afoocd1234567890123",
+                    "afoocd1234567890123",
+                    "123456789012afooc",
+                    "123456789012afooc",
+                ];
+
+                let values = <$T>::from(values);
+                let patterns = StringArray::from(vec!["b"; 7]);
+                let replacements = StringArray::from(vec!["foo"; 7]);
+                let flags = StringArray::from(vec!["i"; 5]);
+                let expected = <$T>::from(expected);
+
+                let re = _regexp_replace_static_pattern_replace::<$O>(&[
+                    Arc::new(values),
+                    Arc::new(patterns),
+                    Arc::new(replacements),
+                    Arc::new(flags),
+                ])
+                .unwrap();
+
+                assert_eq!(re.as_ref(), &expected);
+            }
+        };
     }
 
+    static_pattern_regexp_replace_with_flags!(string_array_with_flags, StringArray, i32);
+    static_pattern_regexp_replace_with_flags!(
+        string_view_array_with_flags,
+        StringViewArray,
+        i32
+    );
+    static_pattern_regexp_replace_with_flags!(
+        large_string_array_with_flags,
+        LargeStringArray,
+        i64
+    );
+
     #[test]
     fn test_static_pattern_regexp_replace_early_abort() {
         let values = StringArray::from(vec!["abc"; 5]);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to