neilconway commented on code in PR #22046:
URL: https://github.com/apache/datafusion/pull/22046#discussion_r3210792908


##########
datafusion/functions/src/core/overlay.rs:
##########
@@ -112,140 +112,170 @@ impl ScalarUDFImpl for OverlayFunc {
     }
 }
 
+/// Converts a 0-based character index into a byte index suitable for UTF-8
+/// slicing.
+fn byte_index_for_char(string: &str, char_idx: usize, is_ascii: bool) -> usize 
{
+    if is_ascii {
+        char_idx.min(string.len())
+    } else {
+        string
+            .char_indices()
+            .nth(char_idx)
+            .map_or(string.len(), |(byte_idx, _)| byte_idx)
+    }
+}
+
+/// Builds the OVERLAY result for a single (non-null) row.
+///
+/// `start_pos` is a 1-based character position; `replace_len` is the number
+/// of characters of `string` to replace with `characters`.
+fn overlay_one(
+    string: &str,
+    characters: &str,
+    start_pos: i64,
+    replace_len: i64,
+) -> String {
+    debug_assert!(start_pos >= 1);
+
+    let is_ascii = string.is_ascii();
+    let string_char_len = if is_ascii {
+        string.len() as i64
+    } else {
+        string.chars().count() as i64
+    };
+
+    // Convert SQL's 1-based character position into 0-based character indexes.
+    // `start_char_idx` is the first replaced character; `end_char_idx` is the
+    // first character after the replaced span.
+    //
+    // No upper-bound check on `start_char_idx`: when it exceeds 
`string_char_len`
+    // we want the whole string as the prefix (PostgreSQL-compatible "insert 
past
+    // end" semantics).
+    let start_char_idx = start_pos - 1;
+    let end_char_idx = start_char_idx.saturating_add(replace_len);
+
+    let prefix_char_idx = 
usize::try_from(start_char_idx).unwrap_or(usize::MAX);
+    let prefix_end_byte = byte_index_for_char(string, prefix_char_idx, 
is_ascii);
+
+    let mut res = String::with_capacity(string.len() + characters.len());
+    res.push_str(&string[..prefix_end_byte]);
+    res.push_str(characters);
+
+    if end_char_idx < string_char_len {
+        let suffix_char_idx = 
usize::try_from(end_char_idx.max(0)).unwrap_or(usize::MAX);
+        let suffix_start_byte = byte_index_for_char(string, suffix_char_idx, 
is_ascii);
+        res.push_str(&string[suffix_start_byte..]);
+    }
+    res
+}
+
 macro_rules! process_overlay {
-    // For the three-argument case
-    ($string_array:expr, $characters_array:expr, $pos_num:expr) => {{
+    // Three argument case
+    ($string_array:expr, $characters_array:expr, $pos_array:expr) => {{
         $string_array
-        .iter()
-        .zip($characters_array.iter())
-        .zip($pos_num.iter())
-        .map(|((string, characters), start_pos)| {
-            match (string, characters, start_pos) {
-                (Some(string), Some(characters), Some(start_pos)) => {
-                    let string_len = string.chars().count();
-                    let characters_len = characters.chars().count();
-                    let replace_len = characters_len as i64;
-                    let mut res =
-                        String::with_capacity(string_len.max(characters_len));
-
-                    //as sql replace index start from 1 while string index 
start from 0
-                    if start_pos > 1 && start_pos - 1 < string_len as i64 {
-                        let start = (start_pos - 1) as usize;
-                        res.push_str(&string[..start]);
+            .iter()
+            .zip($characters_array.iter())
+            .zip($pos_array.iter())
+            .map(|((string, characters), start_pos)| {
+                match (string, characters, start_pos) {
+                    (Some(string), Some(characters), Some(start_pos)) => {
+                        if start_pos < 1 {
+                            return exec_err!("negative substring length not 
allowed");
+                        }
+                        let replace_len = characters.chars().count() as i64;
+                        Ok(Some(overlay_one(
+                            string,
+                            characters,
+                            start_pos,
+                            replace_len,
+                        )))
                     }
-                    res.push_str(characters);
-                    // if start + replace_len - 1 >= string_length, just to 
string end
-                    if start_pos + replace_len - 1 < string_len as i64 {
-                        let end = (start_pos + replace_len - 1) as usize;
-                        res.push_str(&string[end..]);
-                    }
-                    Ok(Some(res))
+                    _ => Ok(None),
                 }
-                _ => Ok(None),
-            }
-        })
-        .collect::<Result<GenericStringArray<T>>>()
+            })
+            .collect::<Result<GenericStringArray<T>>>()
     }};
 
-    // For the four-argument case
-    ($string_array:expr, $characters_array:expr, $pos_num:expr, $len_num:expr) 
=> {{
+    // Four argument case
+    ($string_array:expr, $characters_array:expr, $pos_array:expr, 
$len_array:expr) => {{
         $string_array
-        .iter()
-        .zip($characters_array.iter())
-        .zip($pos_num.iter())
-        .zip($len_num.iter())
-        .map(|(((string, characters), start_pos), len)| {
-            match (string, characters, start_pos, len) {
-                (Some(string), Some(characters), Some(start_pos), Some(len)) 
=> {
-                    let string_len = string.chars().count();
-                    let characters_len = characters.chars().count();
-                    let replace_len = len.min(string_len as i64);
-                    let mut res =
-                        String::with_capacity(string_len.max(characters_len));
-
-                    //as sql replace index start from 1 while string index 
start from 0
-                    if start_pos > 1 && start_pos - 1 < string_len as i64 {
-                        let start = (start_pos - 1) as usize;
-                        res.push_str(&string[..start]);
-                    }
-                    res.push_str(characters);
-                    // if start + replace_len - 1 >= string_length, just to 
string end
-                    if start_pos + replace_len - 1 < string_len as i64 {
-                        let end = (start_pos + replace_len - 1) as usize;
-                        res.push_str(&string[end..]);
+            .iter()
+            .zip($characters_array.iter())
+            .zip($pos_array.iter())
+            .zip($len_array.iter())
+            .map(|(((string, characters), start_pos), len)| {
+                match (string, characters, start_pos, len) {
+                    (Some(string), Some(characters), Some(start_pos), 
Some(len)) => {
+                        if start_pos < 1 {
+                            return exec_err!("negative substring length not 
allowed");
+                        }
+                        let string_char_len = string.chars().count() as i64;

Review Comment:
   Good call! I was going to defer optimization work until a subsequent PR, but 
this is a straightforward cleanup. On closer inspection, we can just remove the 
clamp here, because `overlay_one` can handle length past the end of the string.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to