neilconway commented on code in PR #22029:
URL: https://github.com/apache/datafusion/pull/22029#discussion_r3211160908


##########
datafusion/functions/src/string/replace.rs:
##########
@@ -230,71 +226,77 @@ fn replace<T: OffsetSizeTrait>(args: &[ArrayRef]) -> 
Result<ArrayRef> {
             let string = unsafe { string_array.value_unchecked(i) };
             let from = unsafe { from_array.value_unchecked(i) };
             let to = unsafe { to_array.value_unchecked(i) };
-            buffer.clear();
-            replace_into_string(&mut buffer, string, from, to);
-            builder.append_value(&buffer);
+            apply_replace(&mut builder, string, from, to);
         }
     } else {
         for i in 0..len {
             // SAFETY: i < len, and no input has a null buffer.
             let string = unsafe { string_array.value_unchecked(i) };
             let from = unsafe { from_array.value_unchecked(i) };
             let to = unsafe { to_array.value_unchecked(i) };
-            buffer.clear();
-            replace_into_string(&mut buffer, string, from, to);
-            builder.append_value(&buffer);
+            apply_replace(&mut builder, string, from, to);
         }
     }
 
     Ok(Arc::new(builder.finish(nulls)?) as ArrayRef)
 }
 
-/// Helper function to perform string replacement into a reusable String buffer
 #[inline]
-fn replace_into_string(buffer: &mut String, string: &str, from: &str, to: 
&str) {
+fn apply_replace<B: BulkNullStringArrayBuilder>(
+    builder: &mut B,
+    string: &str,
+    from: &str,
+    to: &str,
+) {
     if from.is_empty() {
-        // When from is empty, insert 'to' at the beginning, between each 
character, and at the end
-        // This matches the behavior of str::replace()
-        buffer.push_str(to);
-        for ch in string.chars() {
-            buffer.push(ch);
-            buffer.push_str(to);
-        }
+        // Empty `from`: insert `to` before each character and at both ends
+        builder.append_with(|w| {
+            w.write_str(to);
+            for ch in string.chars() {
+                w.write_char(ch);

Review Comment:
   Really interesting! Thanks for raising this.
   
   I didn't quite follow your comment about the per-write tradeoff: I don't 
think there's anything fundamental about `append_with` that should be slower 
for the many-small-writes case (and we still are able to avoid a final 
`memcpy`). However, I can reproduce the significant slowdown you suggested for 
the "empty `from`" case. I dug into why, and it seems that repeatedly extending 
an Arrow `MutableBuffer` is relatively slow: `StringWriter::write_str()` does 
`MutableBuffer::extend_from_slice(&[byte])`, which results in a libc `memcpy` 
for every call, which is obviously slower than per-char `String::push(c)`, as 
the original code does. I don't think 
`MutableBuffer::extend_from_slice(&[byte])` is inherently slow, but there was 
enough helper functions / abstractions here that LLVM didn't inline, which lead 
to the per-call `memcpy`.
   
   A few options:
   
   1. Ignore it, because `replace` with empty-from is a corner-case. But it's 
unfortunate to leave `append_with` with a performance footgun like this.
   2. Fix it by reverting to a `mut String` buffer for the empty-from case. 
Fixes this specific workload but doesn't fix the underlyling issue in 
`StringWriter`.
   3. Optimize `StringWriter` for small-string writes.
   
   I've implemented #3. A combination of special-casing the string length and 
marking functions as `#[inline(always)]` appears to convince LLVM to vectorize 
this code path, which is a nice win (30-40% faster than `main` for the 
empty-from). All the inlining might in theory cause code block for other 
callers but it doesn't appear to regress any other `replace` benchmarks, at 
least.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to