This is an automated email from the ASF dual-hosted git repository.

pandalee pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fury.git


The following commit(s) were added to refs/heads/main by this push:
     new b952bf1e feat(java): make 4 bytes utf16 size header optional for utf8 
encoding (#2010)
b952bf1e is described below

commit b952bf1e067bdd0821a9c3c88a8f04924c486186
Author: Shawn Yang <[email protected]>
AuthorDate: Sun Jan 19 23:13:51 2025 +0800

    feat(java): make 4 bytes utf16 size header optional for utf8 encoding 
(#2010)
    
    ## What does this PR do?
    
    Currently, when Fury serializes a utf8 string in Java, it writes the
    number of utf16 bytes first, so that deserialization can save one copy.
    But C++ and Golang do not need this information. This PR makes the
    4-byte utf16 size header optional for utf8 encoding, so that xlang
    serialization can use the standard Fury string serialization spec and
    align with other languages.
    
    For performance reasons, this PR introduces
    `writeNumUtf16BytesForUtf8Encoding`, which can preserve the current
    behaviour.
    
    ## Related issues
    
    #1890
    
    ## Does this PR introduce any user-facing change?
    
    <!--
    If any user-facing interface changes, please [open an
    issue](https://github.com/apache/fury/issues/new/choose) describing the
    need to do so and update the document if necessary.
    -->
    
    - [ ] Does this PR introduce any public API change?
    - [ ] Does this PR introduce any binary protocol compatibility change?
    
    ## Benchmark
    
    This PR introduces an extra copy for deserialization when the utf16
    size header is omitted, since the size of the utf16 data can't be known
    before the utf8 string is decoded.
---
 .../main/java/org/apache/fury/config/Config.java   |   8 +
 .../java/org/apache/fury/config/FuryBuilder.java   |  15 ++
 .../apache/fury/serializer/StringSerializer.java   | 212 +++++++++++++++++++--
 .../test/java/org/apache/fury/FuryTestBase.java    |  10 +
 .../fury/serializer/StringSerializerTest.java      |  67 +++++--
 5 files changed, 275 insertions(+), 37 deletions(-)

diff --git a/java/fury-core/src/main/java/org/apache/fury/config/Config.java 
b/java/fury-core/src/main/java/org/apache/fury/config/Config.java
index bc6afe87..62e45bc0 100644
--- a/java/fury-core/src/main/java/org/apache/fury/config/Config.java
+++ b/java/fury-core/src/main/java/org/apache/fury/config/Config.java
@@ -46,6 +46,7 @@ public class Config implements Serializable {
   private final boolean checkJdkClassSerializable;
   private final Class<? extends Serializer> defaultJDKStreamSerializerType;
   private final boolean compressString;
+  private final boolean writeNumUtf16BytesForUtf8Encoding;
   private final boolean compressInt;
   private final boolean compressLong;
   private final LongEncoding longEncoding;
@@ -72,6 +73,7 @@ public class Config implements Serializable {
     timeRefIgnored = !trackingRef || builder.timeRefIgnored;
     copyRef = builder.copyRef;
     compressString = builder.compressString;
+    writeNumUtf16BytesForUtf8Encoding = 
builder.writeNumUtf16BytesForUtf8Encoding;
     compressInt = builder.compressInt;
     longEncoding = builder.longEncoding;
     compressLong = longEncoding != LongEncoding.LE_RAW_BYTES;
@@ -176,6 +178,10 @@ public class Config implements Serializable {
     return compressString;
   }
 
+  public boolean writeNumUtf16BytesForUtf8Encoding() {
+    return writeNumUtf16BytesForUtf8Encoding;
+  }
+
   public boolean compressInt() {
     return compressInt;
   }
@@ -287,6 +293,7 @@ public class Config implements Serializable {
         && checkClassVersion == config.checkClassVersion
         && checkJdkClassSerializable == config.checkJdkClassSerializable
         && compressString == config.compressString
+        && writeNumUtf16BytesForUtf8Encoding == 
config.writeNumUtf16BytesForUtf8Encoding
         && compressInt == config.compressInt
         && compressLong == config.compressLong
         && bufferSizeLimitBytes == config.bufferSizeLimitBytes
@@ -321,6 +328,7 @@ public class Config implements Serializable {
         checkJdkClassSerializable,
         defaultJDKStreamSerializerType,
         compressString,
+        writeNumUtf16BytesForUtf8Encoding,
         compressInt,
         compressLong,
         longEncoding,
diff --git 
a/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java 
b/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java
index e139e09f..3fe45415 100644
--- a/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java
+++ b/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java
@@ -69,6 +69,7 @@ public final class FuryBuilder {
   boolean compressInt = true;
   public LongEncoding longEncoding = LongEncoding.SLI;
   boolean compressString = false;
+  Boolean writeNumUtf16BytesForUtf8Encoding;
   CompatibleMode compatibleMode = CompatibleMode.SCHEMA_CONSISTENT;
   boolean checkJdkClassSerializable = true;
   Class<? extends Serializer> defaultJDKStreamSerializerType = 
ObjectStreamSerializer.class;
@@ -185,6 +186,17 @@ public final class FuryBuilder {
     return this;
   }
 
+  /**
+   * Whether write num_bytes of utf16 for utf8 encoding. With this option 
enabled, fury will write
+   * the num_bytes of utf16 before write utf8 encoded data, so that the 
deserialization can create
+   * the appropriate utf16 array for store the data, thus save one copy.
+   */
+  public FuryBuilder withWriteNumUtf16BytesForUtf8Encoding(
+      boolean writeNumUtf16BytesForUtf8Encoding) {
+    this.writeNumUtf16BytesForUtf8Encoding = writeNumUtf16BytesForUtf8Encoding;
+    return this;
+  }
+
   /**
    * Sets the limit for Fury's internal buffer. If the buffer size exceeds 
this limit, it will be
    * reset to this limit after every serialization and deserialization.
@@ -379,6 +391,9 @@ public final class FuryBuilder {
           ObjectStreamSerializer.class,
           Serializer.class);
     }
+    if (writeNumUtf16BytesForUtf8Encoding == null) {
+      writeNumUtf16BytesForUtf8Encoding = language == Language.JAVA;
+    }
     if (compatibleMode == CompatibleMode.COMPATIBLE) {
       checkClassVersion = false;
       if (deserializeNonexistentClass == null) {
diff --git 
a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java 
b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
index 22f3eeca..50546aa9 100644
--- 
a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
+++ 
b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
@@ -30,6 +30,7 @@ import java.lang.invoke.MethodHandles;
 import java.lang.invoke.MethodType;
 import java.lang.reflect.Field;
 import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
 import java.util.function.BiFunction;
 import java.util.function.Function;
 import org.apache.fury.Fury;
@@ -106,12 +107,17 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
   }
 
   private final boolean compressString;
+  private final boolean writeNumUtf16BytesForUtf8Encoding;
   private byte[] byteArray = new byte[DEFAULT_BUFFER_SIZE];
   private int smoothByteArrayLength = DEFAULT_BUFFER_SIZE;
+  private char[] charArray = new char[16];
+  private int smoothCharArrayLength = DEFAULT_BUFFER_SIZE;
+  private byte[] byteArray2 = new byte[16];
 
   public StringSerializer(Fury fury) {
     super(fury, String.class, fury.trackingRef() && 
!fury.isStringRefIgnored());
     compressString = fury.compressString();
+    writeNumUtf16BytesForUtf8Encoding = 
fury.getConfig().writeNumUtf16BytesForUtf8Encoding();
   }
 
   @Override
@@ -215,7 +221,13 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     byte coder = (byte) (header & 0b11);
     int numBytes = (int) (header >>> 2);
     if (coder == UTF8) {
-      return newBytesStringZeroCopy(UTF16, readBytesUTF8(buffer, numBytes));
+      byte[] data;
+      if (writeNumUtf16BytesForUtf8Encoding) {
+        data = readBytesUTF8PerfOptimized(buffer, numBytes);
+      } else {
+        data = readBytesUTF8(buffer, numBytes);
+      }
+      return newBytesStringZeroCopy(UTF16, data);
     } else if (coder == LATIN1 || coder == UTF16) {
       return newBytesStringZeroCopy(coder, readBytesUnCompressedUTF16(buffer, 
numBytes));
     } else {
@@ -232,7 +244,9 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     if (coder == LATIN1) {
       chars = readCharsLatin1(buffer, numBytes);
     } else if (coder == UTF8) {
-      chars = readCharsUTF8(buffer, numBytes);
+      return writeNumUtf16BytesForUtf8Encoding
+          ? readCharsUTF8PerfOptimized(buffer, numBytes)
+          : readCharsUTF8(buffer, numBytes);
     } else if (coder == UTF16) {
       chars = readCharsUTF16(buffer, numBytes);
     } else {
@@ -284,7 +298,11 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     if (coder == LATIN1 || bestCoder(bytes) == UTF16) {
       writeBytesString(buffer, coder, bytes);
     } else {
-      writeBytesUTF8(buffer, bytes);
+      if (writeNumUtf16BytesForUtf8Encoding) {
+        writeBytesUTF8PerfOptimized(buffer, bytes);
+      } else {
+        writeBytesUTF8(buffer, bytes);
+      }
     }
   }
 
@@ -295,7 +313,11 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     if (coder == LATIN1) {
       writeCharsLatin1(buffer, chars, chars.length);
     } else if (coder == UTF8) {
-      writeCharsUTF8(buffer, chars);
+      if (writeNumUtf16BytesForUtf8Encoding) {
+        writeCharsUTF8PerfOptimized(buffer, chars);
+      } else {
+        writeCharsUTF8(buffer, chars);
+      }
     } else {
       writeCharsUTF16(buffer, chars, chars.length);
     }
@@ -365,24 +387,39 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
   }
 
   public byte[] readBytesUTF8(MemoryBuffer buffer, int numBytes) {
+    byte[] tmpArray = getByteArray(numBytes << 1);
+    buffer.checkReadableBytes(numBytes);
+    int utf16NumBytes;
+    byte[] srcArray = buffer.getHeapMemory();
+    if (srcArray != null) {
+      int srcIndex = buffer._unsafeHeapReaderIndex();
+      utf16NumBytes =
+          StringEncodingUtils.convertUTF8ToUTF16(srcArray, srcIndex, numBytes, 
tmpArray);
+      buffer._increaseReaderIndexUnsafe(numBytes);
+    } else {
+      byte[] byteArray2 = getByteArray2(numBytes);
+      buffer.readBytes(byteArray2, 0, numBytes);
+      utf16NumBytes = StringEncodingUtils.convertUTF8ToUTF16(byteArray2, 0, 
numBytes, tmpArray);
+    }
+    return Arrays.copyOf(tmpArray, utf16NumBytes);
+  }
+
+  private byte[] readBytesUTF8PerfOptimized(MemoryBuffer buffer, int numBytes) 
{
     int udf8Bytes = buffer.readInt32();
     byte[] bytes = new byte[numBytes];
+    // noinspection Duplicates
     buffer.checkReadableBytes(udf8Bytes);
     byte[] srcArray = buffer.getHeapMemory();
     if (srcArray != null) {
       int srcIndex = buffer._unsafeHeapReaderIndex();
       int readLen = StringEncodingUtils.convertUTF8ToUTF16(srcArray, srcIndex, 
udf8Bytes, bytes);
-      if (readLen != numBytes) {
-        throw new RuntimeException("Decode UTF8 to UTF16 failed");
-      }
+      assert readLen == numBytes : "Decode UTF8 to UTF16 failed";
       buffer._increaseReaderIndexUnsafe(udf8Bytes);
     } else {
       byte[] tmpArray = getByteArray(udf8Bytes);
       buffer.readBytes(tmpArray, 0, udf8Bytes);
       int readLen = StringEncodingUtils.convertUTF8ToUTF16(tmpArray, 0, 
udf8Bytes, bytes);
-      if (readLen != numBytes) {
-        throw new RuntimeException("Decode UTF8 to UTF16 failed");
-      }
+      assert readLen == numBytes : "Decode UTF8 to UTF16 failed";
     }
     return bytes;
   }
@@ -436,28 +473,42 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     return chars;
   }
 
-  public char[] readCharsUTF8(MemoryBuffer buffer, int numBytes) {
+  public String readCharsUTF8(MemoryBuffer buffer, int numBytes) {
+    char[] chars = getCharArray(numBytes);
+    int charsLen;
+    buffer.checkReadableBytes(numBytes);
+    byte[] srcArray = buffer.getHeapMemory();
+    if (srcArray != null) {
+      int srcIndex = buffer._unsafeHeapReaderIndex();
+      charsLen = StringEncodingUtils.convertUTF8ToUTF16(srcArray, srcIndex, 
numBytes, chars);
+      buffer._increaseReaderIndexUnsafe(numBytes);
+    } else {
+      byte[] tmpArray = getByteArray(numBytes);
+      buffer.readBytes(tmpArray, 0, numBytes);
+      charsLen = StringEncodingUtils.convertUTF8ToUTF16(tmpArray, 0, numBytes, 
chars);
+    }
+    return new String(chars, 0, charsLen);
+  }
+
+  public String readCharsUTF8PerfOptimized(MemoryBuffer buffer, int numBytes) {
     int udf16Chars = numBytes >> 1;
     int udf8Bytes = buffer.readInt32();
     char[] chars = new char[udf16Chars];
+    // noinspection Duplicates
     buffer.checkReadableBytes(udf8Bytes);
     byte[] srcArray = buffer.getHeapMemory();
     if (srcArray != null) {
       int srcIndex = buffer._unsafeHeapReaderIndex();
       int readLen = StringEncodingUtils.convertUTF8ToUTF16(srcArray, srcIndex, 
udf8Bytes, chars);
-      if (readLen != udf16Chars) {
-        throw new RuntimeException("Decode UTF8 to UTF16 failed");
-      }
+      assert readLen == udf16Chars : "Decode UTF8 to UTF16 failed";
       buffer._increaseReaderIndexUnsafe(udf8Bytes);
     } else {
       byte[] tmpArray = getByteArray(udf8Bytes);
       buffer.readBytes(tmpArray, 0, udf8Bytes);
       int readLen = StringEncodingUtils.convertUTF8ToUTF16(tmpArray, 0, 
udf8Bytes, chars);
-      if (readLen != udf16Chars) {
-        throw new RuntimeException("Decode UTF8 to UTF16 failed");
-      }
+      assert readLen == udf16Chars : "Decode UTF8 to UTF16 failed";
     }
-    return chars;
+    return newCharsStringZeroCopy(chars);
   }
 
   public void writeCharsLatin1(MemoryBuffer buffer, char[] chars, int 
numBytes) {
@@ -515,8 +566,51 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
   }
 
   public void writeCharsUTF8(MemoryBuffer buffer, char[] chars) {
+    int estimateMaxBytes = chars.length * 3;
+    // num bytes of utf8 should be smaller than utf16, otherwise we should
+    // utf16 instead.
+    // We can't use length in header since we don't know num chars in go/c++
+    int approxNumBytes = (int) (chars.length * 1.5) + 1;
+    int writerIndex = buffer.writerIndex();
+    // 9 for max bytes of header
+    buffer.ensure(writerIndex + 9 + estimateMaxBytes);
+    byte[] targetArray = buffer.getHeapMemory();
+    if (targetArray != null) {
+      // noinspection Duplicates
+      int targetIndex = buffer._unsafeHeapWriterIndex();
+      // keep this index in case actual num utf8 bytes need different bytes 
for header
+      int headerPos = targetIndex;
+      int arrIndex = targetIndex;
+      long header = ((long) approxNumBytes << 2) | UTF8;
+      int headerBytesWritten = LittleEndian.putVarUint36Small(targetArray, 
arrIndex, header);
+      arrIndex += headerBytesWritten;
+      writerIndex += headerBytesWritten;
+      // noinspection Duplicates
+      targetIndex = StringEncodingUtils.convertUTF16ToUTF8(chars, targetArray, 
arrIndex);
+      byte stashedByte = targetArray[arrIndex];
+      int written = targetIndex - arrIndex;
+      header = ((long) written << 2) | UTF8;
+      int diff =
+          LittleEndian.putVarUint36Small(targetArray, headerPos, header) - 
headerBytesWritten;
+      if (diff != 0) {
+        handleWriteCharsUTF8UnalignedHeaderBytes(targetArray, arrIndex, diff, 
written, stashedByte);
+      }
+      buffer._unsafeWriterIndex(writerIndex + written + diff);
+    } else {
+      // noinspection Duplicates
+      final byte[] tmpArray = getByteArray(estimateMaxBytes);
+      int written = StringEncodingUtils.convertUTF16ToUTF8(chars, tmpArray, 0);
+      long header = ((long) written << 2) | UTF8;
+      writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+      buffer.put(writerIndex, tmpArray, 0, written);
+      buffer._unsafeWriterIndex(writerIndex + written);
+    }
+  }
+
+  public void writeCharsUTF8PerfOptimized(MemoryBuffer buffer, char[] chars) {
     int estimateMaxBytes = chars.length * 3;
     int numBytes = MathUtils.doubleExact(chars.length);
+    // noinspection Duplicates
     int writerIndex = buffer.writerIndex();
     long header = ((long) numBytes << 2) | UTF8;
     buffer.ensure(writerIndex + 9 + estimateMaxBytes);
@@ -541,7 +635,55 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     }
   }
 
-  public void writeBytesUTF8(MemoryBuffer buffer, byte[] bytes) {
+  private void handleWriteCharsUTF8UnalignedHeaderBytes(
+      byte[] targetArray, int arrIndex, int diff, int written, byte stashed) {
+    if (diff == 1) {
+      System.arraycopy(targetArray, arrIndex + 1, targetArray, arrIndex + 2, 
written - 1);
+      targetArray[arrIndex + 1] = stashed;
+    } else {
+      System.arraycopy(targetArray, arrIndex, targetArray, arrIndex - 1, 
written);
+    }
+  }
+
+  private void writeBytesUTF8(MemoryBuffer buffer, byte[] bytes) {
+    int numBytes = bytes.length;
+    int estimateMaxBytes = bytes.length / 2 * 3;
+    int writerIndex = buffer.writerIndex();
+    buffer.ensure(writerIndex + 9 + estimateMaxBytes);
+    byte[] targetArray = buffer.getHeapMemory();
+    if (targetArray != null) {
+      // noinspection Duplicates
+      int targetIndex = buffer._unsafeHeapWriterIndex();
+      // keep this index in case actual num utf8 bytes need different bytes 
for header
+      int headerPos = targetIndex;
+      int arrIndex = targetIndex;
+      long header = ((long) numBytes << 2) | UTF8;
+      int headerBytesWritten = LittleEndian.putVarUint36Small(targetArray, 
arrIndex, header);
+      arrIndex += headerBytesWritten;
+      writerIndex += arrIndex - targetIndex;
+      // noinspection Duplicates
+      targetIndex = StringEncodingUtils.convertUTF16ToUTF8(bytes, targetArray, 
arrIndex);
+      byte stashedByte = targetArray[arrIndex];
+      int written = targetIndex - arrIndex;
+      header = ((long) written << 2) | UTF8;
+      int diff =
+          LittleEndian.putVarUint36Small(targetArray, headerPos, header) - 
headerBytesWritten;
+      if (diff != 0) {
+        handleWriteCharsUTF8UnalignedHeaderBytes(targetArray, arrIndex, diff, 
written, stashedByte);
+      }
+      buffer._unsafeWriterIndex(writerIndex + written + diff);
+    } else {
+      // noinspection Duplicates
+      final byte[] tmpArray = getByteArray(estimateMaxBytes);
+      int written = StringEncodingUtils.convertUTF16ToUTF8(bytes, tmpArray, 0);
+      long header = ((long) written << 2) | UTF8;
+      writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+      buffer.put(writerIndex, tmpArray, 0, written);
+      buffer._unsafeWriterIndex(writerIndex + written);
+    }
+  }
+
+  private void writeBytesUTF8PerfOptimized(MemoryBuffer buffer, byte[] bytes) {
     int numBytes = bytes.length;
     int estimateMaxBytes = bytes.length / 2 * 3;
     int writerIndex = buffer.writerIndex();
@@ -815,6 +957,22 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     }
   }
 
+  private char[] getCharArray(int numElements) {
+    char[] charArray = this.charArray;
+    if (charArray.length < numElements) {
+      charArray = new char[numElements];
+      this.charArray = charArray;
+    }
+    if (charArray.length > DEFAULT_BUFFER_SIZE) {
+      smoothCharArrayLength =
+          Math.max(((int) (smoothCharArrayLength * 0.9 + numElements * 0.1)), 
DEFAULT_BUFFER_SIZE);
+      if (smoothByteArrayLength <= DEFAULT_BUFFER_SIZE) {
+        this.charArray = new char[DEFAULT_BUFFER_SIZE];
+      }
+    }
+    return charArray;
+  }
+
   private byte[] getByteArray(int numElements) {
     byte[] byteArray = this.byteArray;
     if (byteArray.length < numElements) {
@@ -830,4 +988,20 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     }
     return byteArray;
   }
+
+  private byte[] getByteArray2(int numElements) {
+    byte[] byteArray2 = this.byteArray2;
+    if (byteArray2.length < numElements) {
+      byteArray2 = new byte[numElements];
+      this.byteArray = byteArray2;
+    }
+    if (byteArray2.length > DEFAULT_BUFFER_SIZE) {
+      smoothByteArrayLength =
+          Math.max(((int) (smoothByteArrayLength * 0.9 + numElements * 0.1)), 
DEFAULT_BUFFER_SIZE);
+      if (smoothByteArrayLength <= DEFAULT_BUFFER_SIZE) {
+        this.byteArray2 = new byte[DEFAULT_BUFFER_SIZE];
+      }
+    }
+    return byteArray2;
+  }
 }
diff --git a/java/fury-core/src/test/java/org/apache/fury/FuryTestBase.java 
b/java/fury-core/src/test/java/org/apache/fury/FuryTestBase.java
index a33c5b95..c4d7e838 100644
--- a/java/fury-core/src/test/java/org/apache/fury/FuryTestBase.java
+++ b/java/fury-core/src/test/java/org/apache/fury/FuryTestBase.java
@@ -128,6 +128,16 @@ public abstract class FuryTestBase {
     return new Object[][] {{false}, {true}};
   }
 
+  @DataProvider
+  public static Object[][] oneBoolOption() {
+    return new Object[][] {{false}, {true}};
+  }
+
+  @DataProvider
+  public static Object[][] twoBoolOptions() {
+    return new Object[][] {{false, false}, {true, false}, {false, true}, 
{true, true}};
+  }
+
   @DataProvider
   public static Object[][] compressNumberAndCodeGen() {
     return new Object[][] {{false, false}, {true, false}, {false, true}, 
{true, true}};
diff --git 
a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java
 
b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java
index 123f3e54..761cbd03 100644
--- 
a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java
+++ 
b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java
@@ -171,29 +171,46 @@ public class StringSerializerTest extends FuryTestBase {
   }
 
   /** Test for <a href="https://github.com/apache/fury/issues/1984";>#1984</a> 
*/
-  @Test
-  public void testJavaCompressedString() {
+  @Test(dataProvider = "oneBoolOption")
+  public void testJavaCompressedString(boolean b) {
     Fury fury =
         Fury.builder()
             .withStringCompressed(true)
+            .withWriteNumUtf16BytesForUtf8Encoding(b)
             .withLanguage(Language.JAVA)
             .requireClassRegistration(false)
             .build();
-
     Simple a =
         new Simple(
             "STG@ON DEMAND Solutions@GeoComputing Switch/ Hub@Digi 
Edgeport/216 – 16 port Serial Hub");
+    serDeCheck(fury, a);
+  }
 
-    byte[] bytes = fury.serialize(a);
-
-    Simple b = (Simple) fury.deserialize(bytes);
-    assertEquals(a, b);
+  @Test
+  public void testCompressedStringEstimatedWrongSize() {
+    Fury fury =
+        Fury.builder()
+            .withStringCompressed(true)
+            .withWriteNumUtf16BytesForUtf8Encoding(false)
+            .withLanguage(Language.JAVA)
+            .requireClassRegistration(false)
+            .build();
+    // estimated 41 bytes, header needs 2 byte.
+    // encoded utf8 is 31 bytes, took 1 byte for header.
+    serDeCheck(fury, StringUtils.random(25, 47) + "你好");
+    // estimated 31 bytes, header needs 1 byte.
+    // encoded utf8 is 32 bytes, took 2 byte for header.
+    serDeCheck(fury, "hello, world. 你好,世界。");
   }
 
-  @Test(dataProvider = "stringCompress")
-  public void testJavaString(boolean stringCompress) {
+  @Test(dataProvider = "twoBoolOptions")
+  public void testJavaString(boolean stringCompress, boolean 
writeNumUtf16BytesForUtf8Encoding) {
     Fury fury =
-        
Fury.builder().withStringCompressed(stringCompress).requireClassRegistration(false).build();
+        Fury.builder()
+            .withStringCompressed(stringCompress)
+            
.withWriteNumUtf16BytesForUtf8Encoding(writeNumUtf16BytesForUtf8Encoding)
+            .requireClassRegistration(false)
+            .build();
     MemoryBuffer buffer = MemoryUtils.buffer(32);
     StringSerializer serializer = new StringSerializer(fury);
 
@@ -211,10 +228,15 @@ public class StringSerializerTest extends FuryTestBase {
         new String[] {"你好, Fury" + StringUtils.random(64), "你好, Fury" + 
StringUtils.random(64)});
   }
 
-  @Test(dataProvider = "stringCompress")
-  public void testJavaStringOffHeap(boolean stringCompress) {
+  @Test(dataProvider = "twoBoolOptions")
+  public void testJavaStringOffHeap(
+      boolean stringCompress, boolean writeNumUtf16BytesForUtf8Encoding) {
     Fury fury =
-        
Fury.builder().withStringCompressed(stringCompress).requireClassRegistration(false).build();
+        Fury.builder()
+            .withStringCompressed(stringCompress)
+            
.withWriteNumUtf16BytesForUtf8Encoding(writeNumUtf16BytesForUtf8Encoding)
+            .requireClassRegistration(false)
+            .build();
     MemoryBuffer buffer = MemoryUtils.wrap(ByteBuffer.allocateDirect(1024));
     Object o1 = "你好, Fury" + StringUtils.random(64);
     Object o2 =
@@ -331,9 +353,14 @@ public class StringSerializerTest extends FuryTestBase {
     }
   }
 
-  @Test
-  public void testReadUtf8String() {
-    Fury fury = 
Fury.builder().withStringCompressed(true).requireClassRegistration(false).build();
+  @Test(dataProvider = "oneBoolOption")
+  public void testReadUtf8String(boolean writeNumUtf16BytesForUtf8Encoding) {
+    Fury fury =
+        Fury.builder()
+            .withStringCompressed(true)
+            
.withWriteNumUtf16BytesForUtf8Encoding(writeNumUtf16BytesForUtf8Encoding)
+            .requireClassRegistration(false)
+            .build();
     for (MemoryBuffer buffer :
         new MemoryBuffer[] {
           MemoryUtils.buffer(32), 
MemoryUtils.wrap(ByteBuffer.allocateDirect(2048))
@@ -343,8 +370,12 @@ public class StringSerializerTest extends FuryTestBase {
       assertEquals(serializer.read(buffer), "abc你好");
       byte[] bytes = "abc你好".getBytes(StandardCharsets.UTF_8);
       byte UTF8 = 2;
-      buffer.writeVarUint64(((long) "abc你好".length() << 1) << 2 | UTF8);
-      buffer.writeInt32(bytes.length);
+      if (writeNumUtf16BytesForUtf8Encoding) {
+        buffer.writeVarUint64(((long) "abc你好".length() << 1) << 2 | UTF8);
+        buffer.writeInt32(bytes.length);
+      } else {
+        buffer.writeVarUint64((((long) bytes.length) << 2 | UTF8));
+      }
       buffer.writeBytes(bytes);
       assertEquals(serializer.read(buffer), "abc你好");
       assertEquals(buffer.readerIndex(), buffer.writerIndex());


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to