This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new b8cebdde2a Fix incorrect regex pattern in regex_replace_posix_groups 
(#19827)
b8cebdde2a is described below

commit b8cebdde2a5b2fc4ecb219b838292a1fcbfb5bc9
Author: Ganesh Patil <[email protected]>
AuthorDate: Wed Feb 25 01:56:06 2026 +0530

    Fix incorrect regex pattern in regex_replace_posix_groups (#19827)
    
    The `regex_replace_posix_groups` method was using the pattern `(\d*)` to
    match
    POSIX capture group references like `\1`. However, `*` matches zero or
    more
    digits, which caused a lone backslash `\` to incorrectly become `${}`.
    
    Changed to `(\d+)` which requires at least one digit, fixing the issue.
    
    Added unit tests to validate correct behavior.
    
    - Fixes #19766
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/functions/src/regex/regexpreplace.rs | 48 +++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/datafusion/functions/src/regex/regexpreplace.rs 
b/datafusion/functions/src/regex/regexpreplace.rs
index b5ab46f0ec..68e324e21c 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -189,13 +189,19 @@ fn regexp_replace_func(args: &[ColumnarValue]) -> 
Result<ArrayRef> {
     }
 }
 
-/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
+/// replace POSIX capture groups (like \1 or \\1) with Rust Regex group (like 
${1})
 /// used by regexp_replace
+/// Handles both single backslash (\1) and double backslash (\\1) which can 
occur
+/// when SQL strings with escaped backslashes are passed through
+///
+/// Note: \0 is converted to ${0}, which in Rust's regex replacement syntax
+/// substitutes the entire match. This is consistent with POSIX behavior where
+/// \0 (or &) refers to the entire matched string.
 fn regex_replace_posix_groups(replacement: &str) -> String {
     static CAPTURE_GROUPS_RE_LOCK: LazyLock<Regex> =
-        LazyLock::new(|| Regex::new(r"(\\)(\d*)").unwrap());
+        LazyLock::new(|| Regex::new(r"\\{1,2}(\d+)").unwrap());
     CAPTURE_GROUPS_RE_LOCK
-        .replace_all(replacement, "$${$2}")
+        .replace_all(replacement, "$${$1}")
         .into_owned()
 }
 
@@ -659,6 +665,42 @@ mod tests {
 
     use super::*;
 
+    #[test]
+    fn test_regex_replace_posix_groups() {
+        // Test that \1, \2, etc. are replaced with ${1}, ${2}, etc.
+        assert_eq!(regex_replace_posix_groups(r"\1"), "${1}");
+        assert_eq!(regex_replace_posix_groups(r"\12"), "${12}");
+        assert_eq!(regex_replace_posix_groups(r"X\1Y"), "X${1}Y");
+        assert_eq!(regex_replace_posix_groups(r"\1\2"), "${1}${2}");
+
+        // Test double backslash (from SQL escaped strings like '\\1')
+        assert_eq!(regex_replace_posix_groups(r"\\1"), "${1}");
+        assert_eq!(regex_replace_posix_groups(r"X\\1Y"), "X${1}Y");
+        assert_eq!(regex_replace_posix_groups(r"\\1\\2"), "${1}${2}");
+
+        // Test 3 or 4 backslashes before digits to document expected behavior
+        assert_eq!(regex_replace_posix_groups(r"\\\1"), r"\${1}");
+        assert_eq!(regex_replace_posix_groups(r"\\\\1"), r"\\${1}");
+        assert_eq!(regex_replace_posix_groups(r"\\\1\\\\2"), r"\${1}\\${2}");
+
+        // Test that a lone backslash is NOT replaced (requires at least one 
digit)
+        assert_eq!(regex_replace_posix_groups(r"\"), r"\");
+        assert_eq!(regex_replace_posix_groups(r"foo\bar"), r"foo\bar");
+
+        // Test that backslash followed by non-digit is preserved
+        assert_eq!(regex_replace_posix_groups(r"\n"), r"\n");
+        assert_eq!(regex_replace_posix_groups(r"\t"), r"\t");
+
+        // Test \0 behavior: \0 is converted to ${0}, which in Rust's regex
+        // replacement syntax substitutes the entire match. This is consistent
+        // with POSIX behavior where \0 (or &) refers to the entire matched 
string.
+        assert_eq!(regex_replace_posix_groups(r"\0"), "${0}");
+        assert_eq!(
+            regex_replace_posix_groups(r"prefix\0suffix"),
+            "prefix${0}suffix"
+        );
+    }
+
     macro_rules! static_pattern_regexp_replace {
         ($name:ident, $T:ty, $O:ty) => {
             #[test]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to