This is an automated email from the ASF dual-hosted git repository.

dmollitor pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/avro.git


The following commit(s) were added to refs/heads/main by this push:
     new 515edcd0f AVRO-4074: Optimization for Serializing ASCII Strings (#3198)
515edcd0f is described below

commit 515edcd0f341f67eacb47f9997e18cf0f2f4d5f9
Author: belugabehr <[email protected]>
AuthorDate: Mon Oct 7 07:16:39 2024 -0400

    AVRO-4074: Optimization for Serializing ASCII Strings (#3198)
---
 .../java/org/apache/avro/io/BinaryEncoder.java     | 40 ++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java 
b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java
index aacb83b88..f8f9802ed 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java
@@ -37,6 +37,9 @@ import org.apache.avro.util.Utf8;
  */
 public abstract class BinaryEncoder extends Encoder {
 
+  // Buffer used for writing ASCII strings
+  private final byte[] stringBuffer = new byte[128];
+
   @Override
   public void writeNull() throws IOException {
   }
@@ -48,10 +51,47 @@ public abstract class BinaryEncoder extends Encoder {
 
   @Override
   public void writeString(String string) throws IOException {
+    /* empty string short-circuit */
     if (string.isEmpty()) {
       writeZero();
       return;
     }
+
+    /*
+     * Assume the String is ASCII. If the ASCII String fits into the existing
+     * buffer, copy the characters into the buffer and write it to the 
underlying
+     * Encoder. If the String is too long, or ends up not being ASCII, then
+     * fall-back to the default JDK mechanism for handling String to byte 
array.
+     */
+    final int stringLength = string.length();
+    if (stringLength <= stringBuffer.length) {
+      boolean onlyAscii = true;
+      for (int i = 0; onlyAscii && (i < stringLength); i++) {
+        /*
+         * The char data type is a single 16-bit Unicode character (UTF-16). 
ASCII, is a
+         * 7-bit character encoding. Therefore, if the value is larger than 
127, it
+         * cannot be ASCII. If it is ASCII, it is safe to trim to byte.
+         */
+        final char c = string.charAt(i);
+        if (c >= 0x80) {
+          onlyAscii = false;
+        } else {
+          stringBuffer[i] = (byte) c;
+        }
+      }
+      if (onlyAscii) {
+        writeInt(stringLength);
+        writeFixed(stringBuffer, 0, stringLength);
+        return;
+      }
+    }
+
+    /*
+     * The standard JDK way of turning Strings into byte arrays. Handles UTF-16
+     * case. However, for ASCII this has the overhead of instantiating a new 
byte
+     * array (which pollutes the heap), and then copying the underlying bytes 
into
+     * the array,
+     */
     byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
     writeInt(bytes.length);
     writeFixed(bytes, 0, bytes.length);

Reply via email to