This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new b8cebdde2a Fix incorrect regex pattern in regex_replace_posix_groups
(#19827)
b8cebdde2a is described below
commit b8cebdde2a5b2fc4ecb219b838292a1fcbfb5bc9
Author: Ganesh Patil <[email protected]>
AuthorDate: Wed Feb 25 01:56:06 2026 +0530
Fix incorrect regex pattern in regex_replace_posix_groups (#19827)
The `regex_replace_posix_groups` method was using the pattern `(\d*)` to
match
POSIX capture group references like `\1`. However, `*` matches zero or
more
digits, which caused a lone backslash `\` to incorrectly become `${}`.
Changed to `(\d+)` which requires at least one digit, fixing the issue.
Added unit tests to validate correct behavior.
- Fixes #19766
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/functions/src/regex/regexpreplace.rs | 48 +++++++++++++++++++++++--
1 file changed, 45 insertions(+), 3 deletions(-)
diff --git a/datafusion/functions/src/regex/regexpreplace.rs
b/datafusion/functions/src/regex/regexpreplace.rs
index b5ab46f0ec..68e324e21c 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -189,13 +189,19 @@ fn regexp_replace_func(args: &[ColumnarValue]) ->
Result<ArrayRef> {
}
}
-/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
+/// replace POSIX capture groups (like \1 or \\1) with Rust Regex group (like
${1})
/// used by regexp_replace
+/// Handles both single backslash (\1) and double backslash (\\1) which can
occur
+/// when SQL strings with escaped backslashes are passed through
+///
+/// Note: \0 is converted to ${0}, which in Rust's regex replacement syntax
+/// substitutes the entire match. This is consistent with POSIX behavior where
+/// \0 (or &) refers to the entire matched string.
fn regex_replace_posix_groups(replacement: &str) -> String {
static CAPTURE_GROUPS_RE_LOCK: LazyLock<Regex> =
- LazyLock::new(|| Regex::new(r"(\\)(\d*)").unwrap());
+ LazyLock::new(|| Regex::new(r"\\{1,2}(\d+)").unwrap());
CAPTURE_GROUPS_RE_LOCK
- .replace_all(replacement, "$${$2}")
+ .replace_all(replacement, "$${$1}")
.into_owned()
}
@@ -659,6 +665,42 @@ mod tests {
use super::*;
+ #[test]
+ fn test_regex_replace_posix_groups() {
+ // Test that \1, \2, etc. are replaced with ${1}, ${2}, etc.
+ assert_eq!(regex_replace_posix_groups(r"\1"), "${1}");
+ assert_eq!(regex_replace_posix_groups(r"\12"), "${12}");
+ assert_eq!(regex_replace_posix_groups(r"X\1Y"), "X${1}Y");
+ assert_eq!(regex_replace_posix_groups(r"\1\2"), "${1}${2}");
+
+ // Test double backslash (from SQL escaped strings like '\\1')
+ assert_eq!(regex_replace_posix_groups(r"\\1"), "${1}");
+ assert_eq!(regex_replace_posix_groups(r"X\\1Y"), "X${1}Y");
+ assert_eq!(regex_replace_posix_groups(r"\\1\\2"), "${1}${2}");
+
+ // Test 3 or 4 backslashes before digits to document expected behavior
+ assert_eq!(regex_replace_posix_groups(r"\\\1"), r"\${1}");
+ assert_eq!(regex_replace_posix_groups(r"\\\\1"), r"\\${1}");
+ assert_eq!(regex_replace_posix_groups(r"\\\1\\\\2"), r"\${1}\\${2}");
+
+ // Test that a lone backslash is NOT replaced (requires at least one
digit)
+ assert_eq!(regex_replace_posix_groups(r"\"), r"\");
+ assert_eq!(regex_replace_posix_groups(r"foo\bar"), r"foo\bar");
+
+ // Test that backslash followed by non-digit is preserved
+ assert_eq!(regex_replace_posix_groups(r"\n"), r"\n");
+ assert_eq!(regex_replace_posix_groups(r"\t"), r"\t");
+
+ // Test \0 behavior: \0 is converted to ${0}, which in Rust's regex
+ // replacement syntax substitutes the entire match. This is consistent
+ // with POSIX behavior where \0 (or &) refers to the entire matched
string.
+ assert_eq!(regex_replace_posix_groups(r"\0"), "${0}");
+ assert_eq!(
+ regex_replace_posix_groups(r"prefix\0suffix"),
+ "prefix${0}suffix"
+ );
+ }
+
macro_rules! static_pattern_regexp_replace {
($name:ident, $T:ty, $O:ty) => {
#[test]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]