alamb commented on code in PR #19572:
URL: https://github.com/apache/datafusion/pull/19572#discussion_r2656382915


##########
datafusion/functions/src/unicode/strpos.rs:
##########
@@ -215,14 +215,32 @@ where
                         )
                     }
                 } else {
-                    // The `find` method returns the byte index of the 
substring.
-                    // We count the number of chars up to that byte index.
-                    T::Native::from_usize(
-                        string
-                            .find(substring)
-                            .map(|x| string[..x].chars().count() + 1)
-                            .unwrap_or(0),
-                    )
+                    // For non-ASCII, use a single-pass search that tracks both
+                    // byte position and character position simultaneously
+                    if substring.is_empty() {
+                        return T::Native::from_usize(1);
+                    }
+
+                    let substring_bytes = substring.as_bytes();
+                    let string_bytes = string.as_bytes();
+
+                    if substring_bytes.len() > string_bytes.len() {
+                        return T::Native::from_usize(0);
+                    }
+
+                    // Single pass: find substring while counting characters
+                    let mut char_pos = 0;
+                    for (byte_idx, _) in string.char_indices() {
+                        char_pos += 1;
+                        if byte_idx + substring_bytes.len() <= 
string_bytes.len()
+                            && &string_bytes[byte_idx..byte_idx + 
substring_bytes.len()]

Review Comment:
   You could potentially use `string_bytes.get_unchecked` here, if it makes any 
measurable difference, since the condition immediately above already validates the bounds.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to