This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new d5c361b63 Optimize `regexp_replace` when the input is a sparse array
(#3804)
d5c361b63 is described below
commit d5c361b63c31b57d0052c501a94c1b1f7847b402
Author: Batuhan Taskaya <[email protected]>
AuthorDate: Wed Oct 12 23:17:36 2022 +0300
Optimize `regexp_replace` when the input is a sparse array (#3804)
* Optimize `regexp_replace` when the input is a sparse array (by reusing
null buffers)
* Add a test regarding the slicing behavior
---
datafusion/physical-expr/src/regex_expressions.rs | 105 ++++++++++++++++++++--
1 file changed, 97 insertions(+), 8 deletions(-)
diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs
index d7edc3400..b76bb2c45 100644
--- a/datafusion/physical-expr/src/regex_expressions.rs
+++ b/datafusion/physical-expr/src/regex_expressions.rs
@@ -22,7 +22,8 @@
//! Regex expressions
use arrow::array::{
- new_null_array, Array, ArrayRef, GenericStringArray, OffsetSizeTrait,
+ new_null_array, Array, ArrayData, ArrayRef, BufferBuilder, GenericStringArray,
+ OffsetSizeTrait,
};
use arrow::compute;
use datafusion_common::{DataFusionError, Result};
@@ -254,13 +255,38 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
// with rust ones.
let replacement = regex_replace_posix_groups(replacement);
- let result = string_array
- .iter()
- .map(|string| {
- string.map(|string| re.replacen(string, limit, replacement.as_str()))
- })
- .collect::<GenericStringArray<T>>();
- Ok(Arc::new(result) as ArrayRef)
+ // We are going to create the underlying string buffer from its parts
+ // to be able to re-use the existing null buffer for sparse arrays.
+ let mut vals = BufferBuilder::<u8>::new({
+ let offsets = string_array.value_offsets();
+ (offsets[string_array.len()] - offsets[0])
+ .to_usize()
+ .unwrap()
+ });
+ let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
+ new_offsets.append(T::zero());
+
+ string_array.iter().for_each(|val| {
+ if let Some(val) = val {
+ let result = re.replacen(val, limit, replacement.as_str());
+ vals.append_slice(result.as_bytes());
+ }
+ new_offsets.append(T::from_usize(vals.len()).unwrap());
+ });
+
+ let data = ArrayData::try_new(
+ GenericStringArray::<T>::DATA_TYPE,
+ string_array.len(),
+ string_array
+ .data_ref()
+ .null_buffer()
+ .map(|b| b.bit_slice(string_array.offset(), string_array.len())),
+ 0,
+ vec![new_offsets.finish(), vals.finish()],
+ vec![],
+ )?;
+ let result_array = GenericStringArray::<T>::from(data);
+ Ok(Arc::new(result_array) as ArrayRef)
}
/// Determine which implementation of the regexp_replace to use based
@@ -513,4 +539,67 @@ mod tests {
}
}
}
+
+ #[test]
+ fn test_static_pattern_regexp_replace_with_null_buffers() {
+ let values = StringArray::from(vec![
+ Some("a"),
+ None,
+ Some("b"),
+ None,
+ Some("a"),
+ None,
+ None,
+ Some("c"),
+ ]);
+ let patterns = StringArray::from(vec!["a"; 1]);
+ let replacements = StringArray::from(vec!["foo"; 1]);
+ let expected = StringArray::from(vec![
+ Some("foo"),
+ None,
+ Some("b"),
+ None,
+ Some("foo"),
+ None,
+ None,
+ Some("c"),
+ ]);
+
+ let re = _regexp_replace_static_pattern_replace::<i32>(&[
+ Arc::new(values),
+ Arc::new(patterns),
+ Arc::new(replacements),
+ ])
+ .unwrap();
+
+ assert_eq!(re.as_ref(), &expected);
+ assert_eq!(re.null_count(), 4);
+ }
+
+ #[test]
+ fn test_static_pattern_regexp_replace_with_sliced_null_buffer() {
+ let values = StringArray::from(vec![
+ Some("a"),
+ None,
+ Some("b"),
+ None,
+ Some("a"),
+ None,
+ None,
+ Some("c"),
+ ]);
+ let values = values.slice(2, 5);
+ let patterns = StringArray::from(vec!["a"; 1]);
+ let replacements = StringArray::from(vec!["foo"; 1]);
+ let expected = StringArray::from(vec![Some("b"), None, Some("foo"), None, None]);
+
+ let re = _regexp_replace_static_pattern_replace::<i32>(&[
+ Arc::new(values),
+ Arc::new(patterns),
+ Arc::new(replacements),
+ ])
+ .unwrap();
+ assert_eq!(re.as_ref(), &expected);
+ assert_eq!(re.null_count(), 3);
+ }
}