alamb commented on code in PR #21379:
URL: https://github.com/apache/datafusion/pull/21379#discussion_r3053829623
##########
datafusion/functions/src/regex/regexpreplace.rs:
##########
@@ -199,6 +201,22 @@ fn regex_replace_posix_groups(replacement: &str) -> String
{
.into_owned()
}
+/// For anchored patterns like `^...(capture)....*$` where the replacement
+/// is `\1`, build a shorter regex (stripping trailing `.*$`) and use
+/// `captures_read` with `CaptureLocations` for direct extraction — no
+/// `expand()`, no `String` allocation.
Review Comment:
is it worth pointing out that this is for ClickBench?
```suggestion
/// `expand()`, no `String` allocation.
/// This pattern appears in ClickBench Q28: which uses a regexp like
/// `^https?://(?:www\.)?([^/]+)/.*$`
```
##########
datafusion/functions/src/regex/regexpreplace.rs:
##########
@@ -494,12 +544,39 @@ fn _regexp_replace_static_pattern_replace<T:
OffsetSizeTrait>(
let mut builder =
StringViewBuilder::with_capacity(string_view_array.len());
- for val in string_view_array.iter() {
- if let Some(val) = val {
- let result = re.replacen(val, limit, replacement.as_str());
- builder.append_value(result);
- } else {
- builder.append_null();
+ if let Some(ref short_re) = short_re {
Review Comment:
This code is not covered by the tests. I will push a commit to add more
coverage
```shell
cargo llvm-cov --html test --test sqllogictests -- regexp
```
<img width="1230" height="670" alt="Image"
src="https://github.com/user-attachments/assets/d7a4a76f-dbd6-4e85-9793-f2faadc78a05"
/>
##########
datafusion/functions/src/regex/regexpreplace.rs:
##########
@@ -473,13 +499,37 @@ fn _regexp_replace_static_pattern_replace<T:
OffsetSizeTrait>(
let mut new_offsets = BufferBuilder::<T>::new(string_array.len() +
1);
new_offsets.append(T::zero());
- string_array.iter().for_each(|val| {
- if let Some(val) = val {
- let result = re.replacen(val, limit, replacement.as_str());
- vals.append_slice(result.as_bytes());
- }
- new_offsets.append(T::from_usize(vals.len()).unwrap());
- });
+ if let Some(ref short_re) = short_re {
Review Comment:
I verified this code is covered by the slt tests .
```shell
cargo llvm-cov --html test --test sqllogictests -- regexp
```
<img width="1112" height="802" alt="Image"
src="https://github.com/user-attachments/assets/c2e78e72-f3a5-44a0-8a41-4c3760b615a8"
/>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]