Dandandan commented on code in PR #3614:
URL: https://github.com/apache/arrow-datafusion/pull/3614#discussion_r980254141
##########
datafusion/physical-expr/src/regex_expressions.rs:
##########
@@ -178,10 +197,125 @@ pub fn regexp_replace<T: OffsetSizeTrait>(args:
&[ArrayRef]) -> Result<ArrayRef>
}
}
+fn _regexp_replace_early_abort<T: OffsetSizeTrait>(
+ input_array: &GenericStringArray<T>,
+) -> Result<ArrayRef> {
+ // Mimicing the existing behavior of regexp_replace, if any of the scalar
arguments
+ // are actuall null, then the result will be an array of the same size but
with nulls.
+ Ok(new_null_array(input_array.data_type(), input_array.len()))
+}
+
+fn _regexp_replace_static_pattern<T: OffsetSizeTrait>(
+ args: &[ArrayRef],
+) -> Result<ArrayRef> {
+ // Special cased regex_replace implementation for the scenerio where
+ // both the pattern itself and the flags are scalars. This means we can
+ // skip regex caching system and basically hold a single Regex object
+ // for the replace operation.
+
+ let string_array = downcast_string_array_arg!(args[0], "string", T);
+ let pattern = fetch_string_arg!(args[1], "pattern", T,
_regexp_replace_early_abort);
+ let replacement_array = downcast_string_array_arg!(args[2], "replacement",
T);
+ let flags = match args.len() {
+ 3 => None,
+ 4 => Some(fetch_string_arg!(args[3], "flags", T,
_regexp_replace_early_abort)),
+ other => {
+ return Err(DataFusionError::Internal(format!(
+ "regexp_replace was called with {} arguments. It requires at
least 3 and at most 4.",
+ other
+ )))
+ }
+ };
+
+ // Embed the flag (if it exists) into the pattern
+ let (pattern, replace_all) = match flags {
+ Some("g") => (pattern.to_string(), true),
+ Some(flags) => (
+ format!("(?{}){}", flags.to_string().replace('g', ""), pattern),
+ flags.contains('g'),
+ ),
+ None => (pattern.to_string(), false),
+ };
+
+ let re = Regex::new(&pattern)
+ .map_err(|err| DataFusionError::Execution(err.to_string()))?;
+
+ let result = string_array
+ .iter()
+ .zip(replacement_array.iter())
Review Comment:
I guess the case with both a scalar replacement + scalar pattern (as in the
example query) might be the most common - so maybe best to implement this case
in this PR?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]