This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 72af0ffdf0 Improve regexp_match performance by avoiding cloning Regex
(#8631)
72af0ffdf0 is described below
commit 72af0ffdf00247e5383adcdbe3dada7ca85d9172
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Sun Dec 24 00:17:05 2023 -0800
Improve regexp_match performance by avoiding cloning Regex (#8631)
* Improve regexp_match performance by avoiding cloning Regex
* Update datafusion/physical-expr/src/regex_expressions.rs
Co-authored-by: Andrew Lamb <[email protected]>
* Remove clone of Regex in regexp_replace
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/physical-expr/src/regex_expressions.rs | 96 ++++++++++++++++++++---
1 file changed, 87 insertions(+), 9 deletions(-)
diff --git a/datafusion/physical-expr/src/regex_expressions.rs
b/datafusion/physical-expr/src/regex_expressions.rs
index 7bafed072b..b778fd86c2 100644
--- a/datafusion/physical-expr/src/regex_expressions.rs
+++ b/datafusion/physical-expr/src/regex_expressions.rs
@@ -25,7 +25,8 @@ use arrow::array::{
new_null_array, Array, ArrayDataBuilder, ArrayRef, BufferBuilder,
GenericStringArray,
OffsetSizeTrait,
};
-use arrow::compute;
+use arrow_array::builder::{GenericStringBuilder, ListBuilder};
+use arrow_schema::ArrowError;
use datafusion_common::{arrow_datafusion_err, plan_err};
use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
@@ -58,7 +59,7 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
2 => {
let values = as_generic_string_array::<T>(&args[0])?;
let regex = as_generic_string_array::<T>(&args[1])?;
- compute::regexp_match(values, regex, None).map_err(|e|
arrow_datafusion_err!(e))
+ _regexp_match(values, regex, None).map_err(|e|
arrow_datafusion_err!(e))
}
3 => {
let values = as_generic_string_array::<T>(&args[0])?;
@@ -69,7 +70,7 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
Some(f) if f.iter().any(|s| s == Some("g")) => {
plan_err!("regexp_match() does not support the \"global\"
option")
},
- _ => compute::regexp_match(values, regex, flags).map_err(|e|
arrow_datafusion_err!(e)),
+ _ => _regexp_match(values, regex, flags).map_err(|e|
arrow_datafusion_err!(e)),
}
}
other => internal_err!(
@@ -78,6 +79,83 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef])
-> Result<ArrayRef> {
}
}
+/// TODO: Remove this once it is included in a new arrow-rs release.
+/// <https://github.com/apache/arrow-rs/pull/5235>
+fn _regexp_match<OffsetSize: OffsetSizeTrait>(
+ array: &GenericStringArray<OffsetSize>,
+ regex_array: &GenericStringArray<OffsetSize>,
+ flags_array: Option<&GenericStringArray<OffsetSize>>,
+) -> std::result::Result<ArrayRef, ArrowError> {
+ let mut patterns: std::collections::HashMap<String, Regex> =
+ std::collections::HashMap::new();
+ let builder: GenericStringBuilder<OffsetSize> =
+ GenericStringBuilder::with_capacity(0, 0);
+ let mut list_builder = ListBuilder::new(builder);
+
+ let complete_pattern = match flags_array {
+ Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
+ |(pattern, flags)| {
+ pattern.map(|pattern| match flags {
+ Some(value) => format!("(?{value}){pattern}"),
+ None => pattern.to_string(),
+ })
+ },
+ )) as Box<dyn Iterator<Item = Option<String>>>,
+ None => Box::new(
+ regex_array
+ .iter()
+ .map(|pattern| pattern.map(|pattern| pattern.to_string())),
+ ),
+ };
+
+ array
+ .iter()
+ .zip(complete_pattern)
+ .map(|(value, pattern)| {
+ match (value, pattern) {
+ // Required for Postgres compatibility:
+ // SELECT regexp_match('foobarbequebaz', ''); = {""}
+ (Some(_), Some(pattern)) if pattern == *"" => {
+ list_builder.values().append_value("");
+ list_builder.append(true);
+ }
+ (Some(value), Some(pattern)) => {
+ let existing_pattern = patterns.get(&pattern);
+ let re = match existing_pattern {
+ Some(re) => re,
+ None => {
+ let re = Regex::new(pattern.as_str()).map_err(|e| {
+ ArrowError::ComputeError(format!(
+ "Regular expression did not compile: {e:?}"
+ ))
+ })?;
+ patterns.insert(pattern.clone(), re);
+ patterns.get(&pattern).unwrap()
+ }
+ };
+ match re.captures(value) {
+ Some(caps) => {
+ let mut iter = caps.iter();
+ if caps.len() > 1 {
+ iter.next();
+ }
+ for m in iter.flatten() {
+ list_builder.values().append_value(m.as_str());
+ }
+
+ list_builder.append(true);
+ }
+ None => list_builder.append(false),
+ }
+ }
+ _ => list_builder.append(false),
+ }
+ Ok(())
+ })
+ .collect::<std::result::Result<Vec<()>, ArrowError>>()?;
+ Ok(Arc::new(list_builder.finish()))
+}
+
/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
/// used by regexp_replace
fn regex_replace_posix_groups(replacement: &str) -> String {
@@ -116,12 +194,12 @@ pub fn regexp_replace<T: OffsetSizeTrait>(args:
&[ArrayRef]) -> Result<ArrayRef>
// if patterns hashmap already has regexp then use it,
else create and return
let re = match patterns.get(pattern) {
- Some(re) => Ok(re.clone()),
+ Some(re) => Ok(re),
None => {
match Regex::new(pattern) {
Ok(re) => {
- patterns.insert(pattern.to_string(),
re.clone());
- Ok(re)
+ patterns.insert(pattern.to_string(), re);
+ Ok(patterns.get(pattern).unwrap())
},
Err(err) =>
Err(DataFusionError::External(Box::new(err))),
}
@@ -162,12 +240,12 @@ pub fn regexp_replace<T: OffsetSizeTrait>(args:
&[ArrayRef]) -> Result<ArrayRef>
// if patterns hashmap already has regexp then use it,
else create and return
let re = match patterns.get(&pattern) {
- Some(re) => Ok(re.clone()),
+ Some(re) => Ok(re),
None => {
match Regex::new(pattern.as_str()) {
Ok(re) => {
- patterns.insert(pattern, re.clone());
- Ok(re)
+ patterns.insert(pattern.clone(), re);
+ Ok(patterns.get(&pattern).unwrap())
},
Err(err) =>
Err(DataFusionError::External(Box::new(err))),
}