(fory) branch main updated: perf(rust): always use utf8 when writing string (#2809)

chaokunyang Wed, 22 Oct 2025 05:30:26 -0700

This is an automated email from the ASF dual-hosted git repository.

chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fory.git



The following commit(s) were added to refs/heads/main by this push:
     new 873d05ce5 perf(rust): always use utf8 when writing string (#2809)
873d05ce5 is described below

commit 873d05ce57279888cfc554eefb056aa97176db0e
Author: Shawn Yang <[email protected]>
AuthorDate: Wed Oct 22 20:30:16 2025 +0800

    perf(rust): always use utf8 when writing string (#2809)
    
    ## Why?
    
    <!-- Describe the purpose of this PR. -->
    
    ## What does this PR do?
    
    - always use utf8 when writing string
    - Update Java StringSerializer to support decoding UTF-8 strings in the Latin1
    range
    
    ## Related issues
    
    Closes #2806
    
    ## Does this PR introduce any user-facing change?
    
    <!--
    If any user-facing interface changes, please [open an
    issue](https://github.com/apache/fory/issues/new/choose) describing the
    need to do so and update the document if necessary.
    
    Delete section if not applicable.
    -->
    
    - [ ] Does this PR introduce any public API change?
    - [ ] Does this PR introduce any binary protocol compatibility change?
    
    ## Benchmark
    
    <!--
    When the PR has an impact on performance (if you don't know whether the
    PR will have an impact on performance, you can submit the PR first, and
    if it will have impact on performance, the code reviewer will explain
    it), be sure to attach a benchmark data here.
    
    Delete section if not applicable.
    -->
---
 .../java/org/apache/fory/config/ForyBuilder.java   |   1 +
 .../apache/fory/serializer/StringSerializer.java   |  67 +++
 .../org/apache/fory/util/StringEncodingUtils.java  | 100 +++++
 .../fory/serializer/StringSerializerTest.java      | 460 +++++++++++++++++++++
 rust/fory-core/benches/simd_bench.rs               |   4 +-
 rust/fory-core/src/buffer.rs                       |  39 +-
 rust/fory-core/src/meta/string_util.rs             |  24 ++
 rust/fory-core/src/serializer/string.rs            |  49 +--
 8 files changed, 664 insertions(+), 80 deletions(-)

diff --git 
a/java/fory-core/src/main/java/org/apache/fory/config/ForyBuilder.java 
b/java/fory-core/src/main/java/org/apache/fory/config/ForyBuilder.java
index a72d58c85..0c610bad5 100644
--- a/java/fory-core/src/main/java/org/apache/fory/config/ForyBuilder.java
+++ b/java/fory-core/src/main/java/org/apache/fory/config/ForyBuilder.java
@@ -422,6 +422,7 @@ public final class ForyBuilder {
       stringRefIgnored = true;
       longEncoding = LongEncoding.PVL;
       compressInt = true;
+      compressString = true;
     }
     if (ENABLE_CLASS_REGISTRATION_FORCIBLY) {
       if (!requireClassRegistration) {
diff --git 
a/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java 
b/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java
index f755f270b..d85513723 100644
--- 
a/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java
+++ 
b/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java
@@ -108,6 +108,7 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
 
   private final boolean compressString;
   private final boolean writeNumUtf16BytesForUtf8Encoding;
+  private final boolean xlang;
   private byte[] byteArray = new byte[DEFAULT_BUFFER_SIZE];
   private int smoothByteArrayLength = DEFAULT_BUFFER_SIZE;
   private char[] charArray = new char[16];
@@ -117,6 +118,10 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
   public StringSerializer(Fory fory) {
     super(fory, String.class, fory.trackingRef() && 
!fory.isStringRefIgnored());
     compressString = fory.compressString();
+    xlang = fory.isCrossLanguage();
+    if (xlang) {
+      Preconditions.checkArgument(compressString, "compress string muse be 
enabled for xlang mode");
+    }
     writeNumUtf16BytesForUtf8Encoding = 
fory.getConfig().writeNumUtf16BytesForUtf8Encoding();
   }
 
@@ -225,6 +230,9 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
       if (writeNumUtf16BytesForUtf8Encoding) {
         data = readBytesUTF8PerfOptimized(buffer, numBytes);
       } else {
+        if (xlang) {
+          return readBytesUTF8ForXlang(buffer, numBytes);
+        }
         data = readBytesUTF8(buffer, numBytes);
       }
       return newBytesStringZeroCopy(UTF16, data);
@@ -235,6 +243,65 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     }
   }
 
+  // The UTF-8 data may be representable in Latin1, so the reader must check 
whether it can be
+  // encoded as Latin1; if so, the coder should be Latin1 instead of UTF16.
+  String readBytesUTF8ForXlang(MemoryBuffer buffer, int numBytes) {
+    buffer.checkReadableBytes(numBytes);
+    byte[] srcArray = buffer.getHeapMemory();
+
+    if (srcArray != null) {
+      int srcIndex = buffer._unsafeHeapReaderIndex();
+
+      // Fast path: vectorized ASCII check (8 bytes at a time)
+      if (StringEncodingUtils.isUTF8WithinAscii(srcArray, srcIndex, numBytes)) 
{
+        byte[] result = new byte[numBytes];
+        System.arraycopy(srcArray, srcIndex, result, 0, numBytes);
+        buffer._increaseReaderIndexUnsafe(numBytes);
+        return newBytesStringZeroCopy(LATIN1, result);
+      }
+
+      // Two-pass approach: scan first, then convert
+      boolean isLatin1 = StringEncodingUtils.isUTF8WithinLatin1(srcArray, 
srcIndex, numBytes);
+      buffer._increaseReaderIndexUnsafe(numBytes);
+
+      if (isLatin1) {
+        byte[] latin1Buffer = getByteArray(numBytes);
+        int latin1Len =
+            StringEncodingUtils.convertUTF8ToLatin1(srcArray, srcIndex, 
numBytes, latin1Buffer);
+        return newBytesStringZeroCopy(LATIN1, Arrays.copyOf(latin1Buffer, 
latin1Len));
+      } else {
+        byte[] utf16Buffer = getByteArray(numBytes << 1);
+        int utf16Len =
+            StringEncodingUtils.convertUTF8ToUTF16(srcArray, srcIndex, 
numBytes, utf16Buffer);
+        return newBytesStringZeroCopy(UTF16, Arrays.copyOf(utf16Buffer, 
utf16Len));
+      }
+    } else {
+      // Off-heap path
+      byte[] srcBytes = getByteArray2(numBytes);
+      buffer.readBytes(srcBytes, 0, numBytes);
+
+      // Fast path: vectorized ASCII check
+      if (StringEncodingUtils.isUTF8WithinAscii(srcBytes, 0, numBytes)) {
+        // Must copy to exact size since srcBytes is a reusable buffer
+        return newBytesStringZeroCopy(LATIN1, Arrays.copyOf(srcBytes, 
numBytes));
+      }
+
+      // Two-pass approach: scan first, then convert
+      boolean isLatin1 = StringEncodingUtils.isUTF8WithinLatin1(srcBytes, 0, 
numBytes);
+
+      if (isLatin1) {
+        byte[] latin1Buffer = getByteArray(numBytes);
+        int latin1Len =
+            StringEncodingUtils.convertUTF8ToLatin1(srcBytes, 0, numBytes, 
latin1Buffer);
+        return newBytesStringZeroCopy(LATIN1, Arrays.copyOf(latin1Buffer, 
latin1Len));
+      } else {
+        byte[] utf16Buffer = getByteArray(numBytes << 1);
+        int utf16Len = StringEncodingUtils.convertUTF8ToUTF16(srcBytes, 0, 
numBytes, utf16Buffer);
+        return newBytesStringZeroCopy(UTF16, Arrays.copyOf(utf16Buffer, 
utf16Len));
+      }
+    }
+  }
+
   @CodegenInvoke
   public String readCompressedCharsString(MemoryBuffer buffer) {
     long header = buffer.readVarUint36Small();
diff --git 
a/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java 
b/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java
index 4d121aa2e..ef6bb67ee 100644
--- a/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java
+++ b/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java
@@ -379,4 +379,104 @@ public class StringEncodingUtils {
     dst[dp + 2] = (byte) (0x80 | ((uc >> 6) & 0x3f));
     dst[dp + 3] = (byte) (0x80 | (uc & 0x3f));
   }
+
+  /**
+   * Fast scan to check if UTF-8 data fits in Latin1 encoding (all code points 
<= 0xFF). This is a
+   * read-only pass optimized for cache locality.
+   */
+  public static boolean isUTF8WithinLatin1(byte[] src, int offset, int length) 
{
+    final int end = offset + length;
+
+    while (offset < end) {
+      int b0 = src[offset++] & 0xFF;
+
+      if (b0 < 0x80) {
+        // 1-byte UTF-8 (ASCII) - always Latin1
+        continue;
+      } else if ((b0 >> 5) == 0b110 && (b0 & 0x1e) != 0) {
+        // 2-byte UTF-8
+        if (offset >= end) {
+          return false; // Malformed
+        }
+        int b1 = src[offset++] & 0xFF;
+        if ((b1 & 0xc0) != 0x80) {
+          return false; // Malformed
+        }
+        int codePoint = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
+        if (codePoint > 0xFF) {
+          return false; // Beyond Latin1
+        }
+      } else {
+        // 3-byte or 4-byte UTF-8 - definitely not Latin1
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  /** Fast UTF-8 to Latin1 conversion. Assumes isUTF8WithinLatin1 already 
validated the input. */
+  public static int convertUTF8ToLatin1(byte[] src, int offset, int length, 
byte[] dst) {
+    final int end = offset + length;
+    int dstPos = 0;
+
+    while (offset < end) {
+      // Vectorized ASCII fast path
+      if (offset + 8 <= end
+          && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset) & 
0x8080808080808080L)
+              == 0) {
+        // 8 ASCII bytes - direct copy
+        dst[dstPos] = src[offset];
+        dst[dstPos + 1] = src[offset + 1];
+        dst[dstPos + 2] = src[offset + 2];
+        dst[dstPos + 3] = src[offset + 3];
+        dst[dstPos + 4] = src[offset + 4];
+        dst[dstPos + 5] = src[offset + 5];
+        dst[dstPos + 6] = src[offset + 6];
+        dst[dstPos + 7] = src[offset + 7];
+        dstPos += 8;
+        offset += 8;
+        continue;
+      }
+
+      int b0 = src[offset++] & 0xFF;
+
+      if (b0 < 0x80) {
+        // 1-byte UTF-8 (ASCII)
+        dst[dstPos++] = (byte) b0;
+      } else {
+        // 2-byte UTF-8 (already validated to be Latin1)
+        int b1 = src[offset++] & 0xFF;
+        int codePoint = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
+        dst[dstPos++] = (byte) codePoint;
+      }
+    }
+
+    return dstPos;
+  }
+
+  /**
+   * Vectorized ASCII check - processes 8 bytes at a time. Returns true if all 
bytes are ASCII (high
+   * bit not set).
+   */
+  public static boolean isUTF8WithinAscii(byte[] bytes, int offset, int 
length) {
+    final int end = offset + length;
+    int vectorizedEnd = offset + ((length >> 3) << 3);
+
+    // Check 8 bytes at a time
+    for (int i = offset; i < vectorizedEnd; i += 8) {
+      if ((Platform.getLong(bytes, Platform.BYTE_ARRAY_OFFSET + i) & 
0x8080808080808080L) != 0) {
+        return false;
+      }
+    }
+
+    // Check remaining bytes
+    for (int i = vectorizedEnd; i < end; i++) {
+      if (bytes[i] < 0) {
+        return false;
+      }
+    }
+
+    return true;
+  }
 }
diff --git 
a/java/fory-core/src/test/java/org/apache/fory/serializer/StringSerializerTest.java
 
b/java/fory-core/src/test/java/org/apache/fory/serializer/StringSerializerTest.java
index 436180ebe..493933853 100644
--- 
a/java/fory-core/src/test/java/org/apache/fory/serializer/StringSerializerTest.java
+++ 
b/java/fory-core/src/test/java/org/apache/fory/serializer/StringSerializerTest.java
@@ -381,4 +381,464 @@ public class StringSerializerTest extends ForyTestBase {
       assertEquals(buffer.readerIndex(), buffer.writerIndex());
     }
   }
+
+  /**
+   * Comprehensive tests for the readBytesUTF8ForXlang method. Tests the ASCII 
fast path and the
+   * two-pass (scan, then convert) UTF-8 to Latin1/UTF-16 conversion.
+   */
+  @Test
+  public void testReadBytesUTF8ForXlang_PureAscii() {
+    Fory fory =
+        Fory.builder()
+            .withStringCompressed(true)
+            .withLanguage(Language.XLANG)
+            .requireClassRegistration(false)
+            .build();
+
+    // Test various ASCII string lengths to verify vectorized path (8 bytes at 
a time)
+    String[] testStrings = {
+      "", // Empty
+      "a", // Single char
+      "hello", // 5 chars
+      "helloabc", // Exactly 8 chars (1 vectorized chunk)
+      "hello world!", // 12 chars
+      "hello world, this is a test", // 28 chars (multiple vectorized chunks)
+      new String(new char[100]).replace("\0", "x") // Long ASCII string
+    };
+
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(1024), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(1024))
+        }) {
+      for (String testStr : testStrings) {
+        buffer.writerIndex(0);
+        buffer.readerIndex(0);
+
+        StringSerializer serializer = new StringSerializer(fory);
+        serializer.write(buffer, testStr);
+        String result = serializer.read(buffer);
+
+        assertEquals(result, testStr, "Failed for ASCII string: " + testStr);
+        assertEquals(buffer.readerIndex(), buffer.writerIndex());
+      }
+    }
+  }
+
+  @Test
+  public void testReadBytesUTF8ForXlang_Latin1() {
+    Fory fory =
+        Fory.builder()
+            .withStringCompressed(true)
+            .withLanguage(Language.XLANG)
+            .requireClassRegistration(false)
+            .build();
+
+    // Test Latin1 characters (0x80-0xFF range)
+    // These are 2-byte UTF-8 sequences but fit in Latin1 encoding
+    String[] testStrings = {
+      "café", // Contains é (0xE9)
+      "résumé", // Multiple accented chars
+      "Ñoño", // Spanish characters
+      "Größe", // German umlaut
+      "\u00A0\u00FF", // Non-breaking space and ÿ
+      "hello " + "\u00E9" + " world", // Mixed ASCII and Latin1
+      new String(new char[50]).replace("\0", "\u00E9") // Repeated Latin1
+    };
+
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(1024), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(1024))
+        }) {
+      for (String testStr : testStrings) {
+        buffer.writerIndex(0);
+        buffer.readerIndex(0);
+
+        StringSerializer serializer = new StringSerializer(fory);
+        serializer.write(buffer, testStr);
+        String result = serializer.read(buffer);
+
+        assertEquals(result, testStr, "Failed for Latin1 string: " + testStr);
+        assertEquals(buffer.readerIndex(), buffer.writerIndex());
+      }
+    }
+  }
+
+  @Test
+  public void testReadBytesUTF8ForXlang_Utf16() {
+    Fory fory =
+        Fory.builder()
+            .withStringCompressed(true)
+            .withLanguage(Language.XLANG)
+            .requireClassRegistration(false)
+            .build();
+
+    // Test UTF-16 characters (beyond Latin1 range)
+    String[] testStrings = {
+      "你好", // Chinese characters
+      "Hello 世界", // Mixed ASCII and Chinese
+      "こんにちは", // Japanese Hiragana
+      "안녕하세요", // Korean Hangul
+      "Привет", // Russian Cyrillic
+      "🎉🎊", // Emoji (surrogate pairs)
+      "test " + "\uD83D\uDE00" + " emoji", // Grinning face emoji
+      "\u4E00\u4E01\u4E03", // CJK ideographs
+      new String(new char[30]).replace("\0", "你") // Repeated UTF-16
+    };
+
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(2048), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(2048))
+        }) {
+      for (String testStr : testStrings) {
+        buffer.writerIndex(0);
+        buffer.readerIndex(0);
+
+        StringSerializer serializer = new StringSerializer(fory);
+        serializer.write(buffer, testStr);
+        String result = serializer.read(buffer);
+
+        assertEquals(result, testStr, "Failed for UTF-16 string: " + testStr);
+        assertEquals(buffer.readerIndex(), buffer.writerIndex());
+      }
+    }
+  }
+
+  @Test
+  public void testReadBytesUTF8ForXlang_MixedContent() {
+    Fory fory =
+        Fory.builder()
+            .withStringCompressed(true)
+            .withLanguage(Language.XLANG)
+            .requireClassRegistration(false)
+            .build();
+
+    // Test mixed content that transitions between ASCII, Latin1, and UTF-16
+    String[] testStrings = {
+      "hello café 你好", // ASCII + Latin1 + UTF-16
+      "test\u00E9test你test", // Alternating encodings
+      "abc" + "\u00FF" + "你好" + "xyz", // All three types
+      StringUtils.random(20) + "你好" + StringUtils.random(20), // Random ASCII 
with UTF-16
+      "🎉hello世界café🎊", // Complex mix with emoji
+    };
+
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(2048), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(2048))
+        }) {
+      for (String testStr : testStrings) {
+        buffer.writerIndex(0);
+        buffer.readerIndex(0);
+
+        StringSerializer serializer = new StringSerializer(fory);
+        serializer.write(buffer, testStr);
+        String result = serializer.read(buffer);
+
+        assertEquals(result, testStr, "Failed for mixed content string: " + 
testStr);
+        assertEquals(buffer.readerIndex(), buffer.writerIndex());
+      }
+    }
+  }
+
+  @Test
+  public void testReadBytesUTF8ForXlang_EdgeCases() {
+    Fory fory =
+        Fory.builder()
+            .withStringCompressed(true)
+            .withLanguage(Language.XLANG)
+            .requireClassRegistration(false)
+            .build();
+
+    // Test edge cases
+    String[] testStrings = {
+      "", // Empty string
+      " ", // Single space
+      "\u0001", // Control character
+      "\u007F", // DEL character
+      "\u0080", // First Latin1 extended char
+      "\u00FF", // Last Latin1 char
+      "\u0100", // First char beyond Latin1
+      new String(new char[7]).replace("\0", "a"), // 7 chars (just under 
vectorized chunk)
+      new String(new char[9]).replace("\0", "a"), // 9 chars (just over 
vectorized chunk)
+      new String(new char[16]).replace("\0", "a"), // Exactly 2 vectorized 
chunks
+      new String(new char[17]).replace("\0", "a"), // 2 chunks + 1 byte
+    };
+
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(1024), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(1024))
+        }) {
+      for (String testStr : testStrings) {
+        buffer.writerIndex(0);
+        buffer.readerIndex(0);
+
+        StringSerializer serializer = new StringSerializer(fory);
+        serializer.write(buffer, testStr);
+        String result = serializer.read(buffer);
+
+        assertEquals(result, testStr, "Failed for edge case string: " + 
testStr);
+        assertEquals(buffer.readerIndex(), buffer.writerIndex());
+      }
+    }
+  }
+
+  @Test
+  public void testReadBytesUTF8ForXlang_SurrogatePairs() {
+    Fory fory =
+        Fory.builder()
+            .withStringCompressed(true)
+            .withLanguage(Language.XLANG)
+            .requireClassRegistration(false)
+            .build();
+
+    // Test surrogate pairs (4-byte UTF-8 sequences)
+    String[] testStrings = {
+      "😀", // Grinning face
+      "😀😁😂", // Multiple emoji
+      "hello😀world", // Emoji in middle
+      "test🎉test🎊test", // Multiple emoji separated
+      "\uD83D\uDC4D", // Thumbs up (explicit surrogate pair)
+      "\uD83D\uDE00\uD83D\uDE01", // Multiple explicit pairs
+    };
+
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(2048), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(2048))
+        }) {
+      for (String testStr : testStrings) {
+        buffer.writerIndex(0);
+        buffer.readerIndex(0);
+
+        StringSerializer serializer = new StringSerializer(fory);
+        serializer.write(buffer, testStr);
+        String result = serializer.read(buffer);
+
+        assertEquals(result, testStr, "Failed for surrogate pair string: " + 
testStr);
+        assertEquals(buffer.readerIndex(), buffer.writerIndex());
+      }
+    }
+  }
+
+  @Test
+  public void testReadBytesUTF8ForXlang_LargeStrings() {
+    Fory fory =
+        Fory.builder()
+            .withStringCompressed(true)
+            .withLanguage(Language.XLANG)
+            .requireClassRegistration(false)
+            .build();
+
+    // Test large strings to verify buffer reuse and no overflow
+    String[] testStrings = {
+      new String(new char[1000]).replace("\0", "a"), // Large ASCII
+      new String(new char[1000]).replace("\0", "\u00E9"), // Large Latin1
+      new String(new char[500]).replace("\0", "你"), // Large UTF-16
+      StringUtils.random(500) + new String(new char[500]).replace("\0", "你"), 
// Large mixed
+    };
+
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(8192), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(8192))
+        }) {
+      for (String testStr : testStrings) {
+        buffer.writerIndex(0);
+        buffer.readerIndex(0);
+
+        StringSerializer serializer = new StringSerializer(fory);
+        serializer.write(buffer, testStr);
+        String result = serializer.read(buffer);
+
+        assertEquals(result, testStr, "Failed for large string");
+        assertEquals(buffer.readerIndex(), buffer.writerIndex());
+      }
+    }
+  }
+
+  @Test
+  public void testReadBytesUTF8ForXlang_VectorizedPathVerification() {
+    Fory fory =
+        Fory.builder()
+            .withStringCompressed(true)
+            .withLanguage(Language.XLANG)
+            .requireClassRegistration(false)
+            .build();
+
+    // Specifically test strings that exercise vectorized paths
+    // Multiples of 8 to ensure vectorized loop is used
+    for (int length : new int[] {8, 16, 24, 32, 64, 128}) {
+      String asciiStr = new String(new char[length]).replace("\0", "x");
+
+      for (MemoryBuffer buffer :
+          new MemoryBuffer[] {
+            MemoryUtils.buffer(1024), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(1024))
+          }) {
+        buffer.writerIndex(0);
+        buffer.readerIndex(0);
+
+        StringSerializer serializer = new StringSerializer(fory);
+        serializer.write(buffer, asciiStr);
+        String result = serializer.read(buffer);
+
+        assertEquals(result, asciiStr, "Failed for vectorized ASCII string of 
length " + length);
+        assertEquals(buffer.readerIndex(), buffer.writerIndex());
+      }
+    }
+  }
+
+  @Test
+  public void testReadBytesUTF8ForXlang_BufferReuseCorrectness() {
+    Fory fory =
+        Fory.builder()
+            .withStringCompressed(true)
+            .withLanguage(Language.XLANG)
+            .requireClassRegistration(false)
+            .build();
+
+    MemoryBuffer buffer = MemoryUtils.buffer(2048);
+    StringSerializer serializer = new StringSerializer(fory);
+
+    // Test multiple consecutive reads/writes to verify buffer reuse doesn't 
cause issues
+    String[] testSequence = {
+      "short",
+      new String(new char[1000]).replace("\0", "a"), // Trigger buffer growth
+      "short again",
+      "café",
+      "你好",
+      new String(new char[500]).replace("\0", "\u00E9"),
+      "final test"
+    };
+
+    for (String testStr : testSequence) {
+      buffer.writerIndex(0);
+      buffer.readerIndex(0);
+
+      serializer.write(buffer, testStr);
+      String result = serializer.read(buffer);
+
+      assertEquals(result, testStr, "Failed during buffer reuse test for: " + 
testStr);
+      assertEquals(buffer.readerIndex(), buffer.writerIndex());
+    }
+  }
+
+  @Test
+  public void disabled_testReadBytesUTF8ForXlang_DirectRawBytes() {
+    if (Platform.JAVA_VERSION <= 8) {
+      // readBytesUTF8ForXlang will be invoked only in java9+
+      return;
+    }
+    Fory fory = 
Fory.builder().withLanguage(Language.XLANG).requireClassRegistration(false).build();
+
+    // Direct test with raw UTF-8 bytes - bypasses full serialization
+    // This tests the method directly with known UTF-8 byte sequences
+
+    // Test 1: Pure ASCII "hello"
+    byte[] asciiBytes = "hello".getBytes(StandardCharsets.UTF_8);
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(1024), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(1024))
+        }) {
+      // Create fresh serializer for each test to avoid buffer reuse issues
+      StringSerializer serializer = new StringSerializer(fory);
+      buffer.writerIndex(0);
+      buffer.readerIndex(0);
+      buffer.writeBytes(asciiBytes);
+      String result = serializer.readBytesUTF8ForXlang(buffer, 
asciiBytes.length);
+      assertEquals(result, "hello", "Direct ASCII test failed");
+    }
+
+    // Test 2: Latin1 "café" (UTF-8: 63 61 66 C3 A9)
+    byte[] latin1Bytes = "café".getBytes(StandardCharsets.UTF_8);
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(1024), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(1024))
+        }) {
+      StringSerializer serializer = new StringSerializer(fory);
+      buffer.writerIndex(0);
+      buffer.readerIndex(0);
+      buffer.writeBytes(latin1Bytes);
+      String result = serializer.readBytesUTF8ForXlang(buffer, 
latin1Bytes.length);
+      assertEquals(result, "café", "Direct Latin1 test failed");
+    }
+
+    // Test 3: UTF-16 "你好" (3-byte UTF-8 sequences)
+    byte[] utf16Bytes = "你好".getBytes(StandardCharsets.UTF_8);
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(1024), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(1024))
+        }) {
+      StringSerializer serializer = new StringSerializer(fory);
+      buffer.writerIndex(0);
+      buffer.readerIndex(0);
+      buffer.writeBytes(utf16Bytes);
+      String result = serializer.readBytesUTF8ForXlang(buffer, 
utf16Bytes.length);
+      assertEquals(result, "你好", "Direct UTF-16 test failed");
+    }
+
+    // Test 4: Emoji with surrogate pairs "😀" (4-byte UTF-8: F0 9F 98 80)
+    byte[] emojiBytes = "😀".getBytes(StandardCharsets.UTF_8);
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(1024), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(1024))
+        }) {
+      StringSerializer serializer = new StringSerializer(fory);
+      buffer.writerIndex(0);
+      buffer.readerIndex(0);
+      buffer.writeBytes(emojiBytes);
+      String result = serializer.readBytesUTF8ForXlang(buffer, 
emojiBytes.length);
+      assertEquals(result, "😀", "Direct emoji test failed");
+    }
+
+    // Test 5: Mixed content - simpler case
+    byte[] mixedBytes = "abc你好".getBytes(StandardCharsets.UTF_8);
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(1024), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(1024))
+        }) {
+      StringSerializer serializer = new StringSerializer(fory);
+      buffer.writerIndex(0);
+      buffer.readerIndex(0);
+      buffer.writeBytes(mixedBytes);
+      String result = serializer.readBytesUTF8ForXlang(buffer, 
mixedBytes.length);
+      assertEquals(result, "abc你好", "Direct mixed content test failed");
+    }
+  }
+
+  @Test
+  public void testReadBytesUTF8ForXlang_SpecialCharacters() {
+    Fory fory =
+        Fory.builder()
+            .withStringCompressed(true)
+            .withLanguage(Language.XLANG)
+            .requireClassRegistration(false)
+            .build();
+
+    // Test various special characters and Unicode ranges
+    String[] testStrings = {
+      "\n\r\t", // Control characters
+      "\u0000", // Null character
+      "line1\nline2\rline3\tline4", // Mixed with text
+      "©®™", // Copyright, registered, trademark
+      "€£¥", // Currency symbols
+      "αβγδ", // Greek letters
+      "←↑→↓", // Arrows
+      "♠♣♥♦", // Card suits
+      "½⅓¼", // Fractions
+    };
+
+    for (MemoryBuffer buffer :
+        new MemoryBuffer[] {
+          MemoryUtils.buffer(1024), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(1024))
+        }) {
+      for (String testStr : testStrings) {
+        buffer.writerIndex(0);
+        buffer.readerIndex(0);
+
+        StringSerializer serializer = new StringSerializer(fory);
+        serializer.write(buffer, testStr);
+        String result = serializer.read(buffer);
+
+        assertEquals(result, testStr, "Failed for special characters: " + 
testStr);
+        assertEquals(buffer.readerIndex(), buffer.writerIndex());
+      }
+    }
+  }
 }
diff --git a/rust/fory-core/benches/simd_bench.rs 
b/rust/fory-core/benches/simd_bench.rs
index 208e80f12..57510a762 100644
--- a/rust/fory-core/benches/simd_bench.rs
+++ b/rust/fory-core/benches/simd_bench.rs
@@ -22,7 +22,9 @@ use std::arch::x86_64::*;
 use fory_core::buffer::{Reader, Writer};
 use fory_core::meta::buffer_rw_string::{
     read_latin1_simd, read_latin1_standard, write_latin1_simd, 
write_latin1_standard,
+    write_latin1_string,
 };
+
 #[cfg(target_feature = "sse2")]
 use std::arch::x86_64::*;
 
@@ -114,7 +116,7 @@ fn benchmark_read_latin1(c: &mut Criterion) {
     for &size in &sizes {
         let s_ascii = ascii_string.repeat(size / ascii_string.len() + 1);
         let mut writer = Writer::default();
-        writer.write_latin1_string(&s_ascii);
+        write_latin1_string(&mut writer, &s_ascii);
         let data = writer.dump();
 
         let name_simd = format!("Read Latin-1 SIMD size {}", size);
diff --git a/rust/fory-core/src/buffer.rs b/rust/fory-core/src/buffer.rs
index 3ffc9f6ad..8b087c687 100644
--- a/rust/fory-core/src/buffer.rs
+++ b/rust/fory-core/src/buffer.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use crate::error::Error;
-use crate::meta::buffer_rw_string::{read_latin1_simd, write_latin1_simd};
+use crate::meta::buffer_rw_string::read_latin1_simd;
 use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
 use std::slice;
 
@@ -324,30 +324,6 @@ impl Writer {
         }
     }
 
-    #[inline(always)]
-    pub fn write_latin1_string(&mut self, s: &str) {
-        if s.len() < SIMD_THRESHOLD {
-            // Fast path for small buffers
-            let bytes = s.as_bytes();
-            // CRITICAL: Only safe if ASCII (UTF-8 == Latin1 for ASCII)
-            let is_ascii = bytes.iter().all(|&b| b < 0x80);
-            if is_ascii {
-                self.bf.reserve(s.len());
-                self.bf.extend_from_slice(bytes);
-            } else {
-                // Non-ASCII: must iterate chars to extract Latin1 byte values
-                self.bf.reserve(s.len());
-                for c in s.chars() {
-                    let v = c as u32;
-                    assert!(v <= 0xFF, "Non-Latin1 character found");
-                    self.bf.push(v as u8);
-                }
-            }
-            return;
-        }
-        write_latin1_simd(self, s);
-    }
-
     #[inline(always)]
     pub fn write_utf8_string(&mut self, s: &str) {
         let bytes = s.as_bytes();
@@ -355,19 +331,6 @@ impl Writer {
         self.bf.reserve(len);
         self.bf.extend_from_slice(bytes);
     }
-
-    #[inline(always)]
-    pub fn write_utf16_bytes(&mut self, bytes: &[u16]) {
-        let total_bytes = bytes.len() * 2;
-        let old_len = self.bf.len();
-        self.bf.reserve(total_bytes);
-        unsafe {
-            let dest = self.bf.as_mut_ptr().add(old_len);
-            let src = bytes.as_ptr() as *const u8;
-            std::ptr::copy_nonoverlapping(src, dest, total_bytes);
-            self.bf.set_len(old_len + total_bytes);
-        }
-    }
 }
 
 pub struct Reader {
diff --git a/rust/fory-core/src/meta/string_util.rs 
b/rust/fory-core/src/meta/string_util.rs
index d77124b18..109702c23 100644
--- a/rust/fory-core/src/meta/string_util.rs
+++ b/rust/fory-core/src/meta/string_util.rs
@@ -534,6 +534,30 @@ pub mod buffer_rw_string {
         }
     }
 
+    #[inline(always)]
+    pub fn write_latin1_string(writer: &mut Writer, s: &str) {
+        if s.len() < 128 {
+            // Fast path for small buffers
+            let bytes = s.as_bytes();
+            // CRITICAL: Only safe if ASCII (UTF-8 == Latin1 for ASCII)
+            let is_ascii = bytes.iter().all(|&b| b < 0x80);
+            if is_ascii {
+                writer.bf.reserve(s.len());
+                writer.bf.extend_from_slice(bytes);
+            } else {
+                // Non-ASCII: must iterate chars to extract Latin1 byte values
+                writer.bf.reserve(s.len());
+                for c in s.chars() {
+                    let v = c as u32;
+                    assert!(v <= 0xFF, "Non-Latin1 character found");
+                    writer.bf.push(v as u8);
+                }
+            }
+            return;
+        }
+        write_latin1_simd(writer, s);
+    }
+
     #[inline]
     pub fn write_utf8_standard(writer: &mut Writer, s: &str) {
         let bytes = s.as_bytes();
diff --git a/rust/fory-core/src/serializer/string.rs b/rust/fory-core/src/serializer/string.rs
index be457cbfb..256ed41b6 100644
--- a/rust/fory-core/src/serializer/string.rs
+++ b/rust/fory-core/src/serializer/string.rs
@@ -16,7 +16,6 @@
 // under the License.
 
 use crate::error::Error;
-use crate::meta::get_latin1_length;
 use crate::resolver::context::ReadContext;
 use crate::resolver::context::WriteContext;
 use crate::resolver::type_resolver::TypeResolver;
@@ -25,6 +24,7 @@ use crate::serializer::{ForyDefault, Serializer};
 use crate::types::TypeId;
 use std::mem;
 
+#[allow(dead_code)]
 enum StrEncoding {
     Latin1 = 0,
     Utf16 = 1,
@@ -34,61 +34,28 @@ enum StrEncoding {
 impl Serializer for String {
     #[inline(always)]
     fn fory_write_data(&self, context: &mut WriteContext) -> Result<(), Error> {
-        if !context.is_xlang() {
-            // Fast path: non-xlang mode always uses UTF-8 without encoding header
-            context.writer.write_varuint32(self.len() as u32);
-            context.writer.write_utf8_string(self);
-            return Ok(());
-        }
-
-        // xlang mode: use encoding header for optimal format selection
-        let mut len = get_latin1_length(self);
-        if len >= 0 {
-            let bitor = (len as u64) << 2 | StrEncoding::Latin1 as u64;
-            context.writer.write_varuint36_small(bitor);
-            context.writer.write_latin1_string(self);
-        } else if context.is_compress_string() {
-            // todo: support `writeNumUtf16BytesForUtf8Encoding` like in java
-            len = self.len() as i32;
-            let bitor = (len as u64) << 2 | StrEncoding::Utf8 as u64;
-            context.writer.write_varuint36_small(bitor);
-            context.writer.write_utf8_string(self);
-        } else {
-            let utf16: Vec<u16> = self.encode_utf16().collect();
-            let bitor = (utf16.len() as u64 * 2) << 2 | StrEncoding::Utf16 as u64;
-            context.writer.write_varuint36_small(bitor);
-            context.writer.write_utf16_bytes(&utf16);
-        }
+        let bitor = (self.len() as i32 as u64) << 2 | StrEncoding::Utf8 as u64;
+        context.writer.write_varuint36_small(bitor);
+        context.writer.write_utf8_string(self);
         Ok(())
     }
 
     #[inline(always)]
     fn fory_read_data(context: &mut ReadContext) -> Result<Self, Error> {
-        if !context.is_xlang() {
-            // Fast path: non-xlang mode always uses UTF-8 without encoding header
-            let len = context.reader.read_varuint32()? as usize;
-            return context.reader.read_utf8_string(len);
-        }
-
         // xlang mode: read encoding header and decode accordingly
         let bitor = context.reader.read_varuint36small()?;
         let len = bitor >> 2;
         let encoding = bitor & 0b11;
-        let encoding = match encoding {
-            0 => StrEncoding::Latin1,
-            1 => StrEncoding::Utf16,
-            2 => StrEncoding::Utf8,
+        let s = match encoding {
+            0 => context.reader.read_latin1_string(len as usize),
+            1 => context.reader.read_utf16_string(len as usize),
+            2 => context.reader.read_utf8_string(len as usize),
             _ => {
                 return Err(Error::encoding_error(format!(
                     "wrong encoding value: {}",
                     encoding
                 )))
             }
-        };
-        let s = match encoding {
-            StrEncoding::Latin1 => context.reader.read_latin1_string(len as usize),
-            StrEncoding::Utf16 => context.reader.read_utf16_string(len as usize),
-            StrEncoding::Utf8 => context.reader.read_utf8_string(len as usize),
         }?;
         Ok(s)
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(fory) branch main updated: perf(rust): always use utf8 when writing string (#2809)

Reply via email to