This is an automated email from the ASF dual-hosted git repository.

chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fory.git


The following commit(s) were added to refs/heads/main by this push:
     new ccc1c633f fix(java): fix openj9 sliced string serde (#3160)
ccc1c633f is described below

commit ccc1c633f0428c7f0ff4170347c102d3b52e0569
Author: Shawn Yang <[email protected]>
AuthorDate: Mon Jan 19 17:55:28 2026 +0800

    fix(java): fix openj9 sliced string serde (#3160)
    
    ## Why?
    
    
    
    ## What does this PR do?
    
    
    
    ## Related issues
    Closes #2079
    
    ## Does this PR introduce any user-facing change?
    
    
    
    - [ ] Does this PR introduce any public API change?
    - [ ] Does this PR introduce any binary protocol compatibility change?
    
    ## Benchmark
---
 .../apache/fory/serializer/SlicedStringUtil.java   | 295 +++++++++++++++++++++
 .../apache/fory/serializer/StringSerializer.java   |  87 +++++-
 .../org/apache/fory/util/StringEncodingUtils.java  |  42 +++
 3 files changed, 412 insertions(+), 12 deletions(-)

diff --git 
a/java/fory-core/src/main/java/org/apache/fory/serializer/SlicedStringUtil.java 
b/java/fory-core/src/main/java/org/apache/fory/serializer/SlicedStringUtil.java
new file mode 100644
index 000000000..f134de9ae
--- /dev/null
+++ 
b/java/fory-core/src/main/java/org/apache/fory/serializer/SlicedStringUtil.java
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.fory.serializer;
+
+import org.apache.fory.memory.LittleEndian;
+import org.apache.fory.memory.MemoryBuffer;
+import org.apache.fory.memory.Platform;
+import org.apache.fory.util.MathUtils;
+import org.apache.fory.util.StringEncodingUtils;
+import org.apache.fory.util.StringUtils;
+
+final class SlicedStringUtil {
+  // Encoding tags OR-ed into the low two bits of the var-uint string header; the writers below
+  // shift the length left by 2 before adding one of these tags.
+  private static final byte LATIN1 = 0;
+  private static final byte UTF16 = 1;
+  private static final byte UTF8 = 2;
+
+  // Static utility holder; never instantiated.
+  private SlicedStringUtil() {}
+
+  /**
+   * Writes the slice {@code chars[offset, offset + count)} as a Latin-1 string.
+   *
+   * <p>The output is a var-uint header holding {@code (count << 2) | LATIN1} followed by one byte
+   * per char. Every char is narrowed with a plain {@code (byte)} cast, so the result is only
+   * meaningful when no char in the slice exceeds {@code 0xFF} (presumably verified by the caller
+   * through {@link #isLatin} or {@link #bestCoder}).
+   *
+   * @param serializer supplies the scratch byte array used when the buffer has no heap array
+   * @param buffer destination buffer; its writer index is advanced past the written bytes
+   * @param chars backing char array of the string
+   * @param offset index of the first char of the slice
+   * @param count number of chars in the slice
+   */
+  static void writeCharsLatin1WithOffset(
+      StringSerializer serializer, MemoryBuffer buffer, char[] chars, int offset, int count) {
+    final int startIndex = buffer.writerIndex();
+    final long header = ((long) count << 2) | LATIN1;
+    // Reserve 5 bytes for the header plus one byte per char before touching the memory.
+    buffer.ensure(startIndex + 5 + count);
+    final byte[] heap = buffer.getHeapMemory();
+    final int headerSize;
+    if (heap == null) {
+      // No heap array behind the buffer: narrow into a scratch array, then copy it in one call.
+      headerSize = buffer._unsafePutVarUint36Small(startIndex, header);
+      final byte[] scratch = serializer.getByteArray(count);
+      for (int i = 0; i < count; i++) {
+        scratch[i] = (byte) chars[offset + i];
+      }
+      buffer.put(startIndex + headerSize, scratch, 0, count);
+    } else {
+      final int heapStart = buffer._unsafeHeapWriterIndex();
+      headerSize = LittleEndian.putVarUint36Small(heap, heapStart, header);
+      final int payloadStart = heapStart + headerSize;
+      for (int i = 0; i < count; i++) {
+        heap[payloadStart + i] = (byte) chars[offset + i];
+      }
+    }
+    buffer._unsafeWriterIndex(startIndex + headerSize + count);
+  }
+
+  /**
+   * Writes the slice {@code chars[offset, offset + count)} as a UTF-16 string.
+   *
+   * <p>The output is a var-uint header holding {@code (numBytes << 2) | UTF16}, where {@code
+   * numBytes} is twice {@code count}, followed by two bytes per char with the low byte first.
+   *
+   * @param serializer supplies the scratch byte array used when the buffer has no heap array
+   * @param buffer destination buffer; its writer index is advanced past the written bytes
+   * @param chars backing char array of the string
+   * @param offset index of the first char of the slice
+   * @param count number of chars in the slice
+   */
+  static void writeCharsUTF16WithOffset(
+      StringSerializer serializer, MemoryBuffer buffer, char[] chars, int offset, int count) {
+    final int numBytes = MathUtils.doubleExact(count);
+    final int startIndex = buffer.writerIndex();
+    final long header = ((long) numBytes << 2) | UTF16;
+    // Reserve 5 bytes for the header plus the two-bytes-per-char payload.
+    buffer.ensure(startIndex + 5 + numBytes);
+    final byte[] heap = buffer.getHeapMemory();
+    final int endIndex;
+    if (heap == null) {
+      final int payloadIndex = startIndex + buffer._unsafePutVarUint36Small(startIndex, header);
+      endIndex =
+          Platform.IS_LITTLE_ENDIAN
+              ? offHeapWriteCharsUTF16WithOffset(
+                  serializer, buffer, chars, offset, payloadIndex, numBytes)
+              : offHeapWriteCharsUTF16BEWithOffset(
+                  serializer, buffer, chars, offset, payloadIndex, numBytes);
+    } else {
+      final int heapStart = buffer._unsafeHeapWriterIndex();
+      final int headerSize = LittleEndian.putVarUint36Small(heap, heapStart, header);
+      final int payloadStart = heapStart + headerSize;
+      if (Platform.IS_LITTLE_ENDIAN) {
+        // FIXME JDK11 utf16 string uses little-endian order.
+        // On a little-endian host the char memory can be copied verbatim.
+        Platform.UNSAFE.copyMemory(
+            chars,
+            Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1),
+            heap,
+            Platform.BYTE_ARRAY_OFFSET + payloadStart,
+            numBytes);
+      } else {
+        writeCharsUTF16BEToHeap(chars, offset, payloadStart, numBytes, heap);
+      }
+      endIndex = startIndex + headerSize + numBytes;
+    }
+    buffer._unsafeWriterIndex(endIndex);
+  }
+
+  /**
+   * Writes the slice {@code chars[offset, offset + count)} as a UTF-8 string whose var-uint header
+   * holds {@code (utf8ByteCount << 2) | UTF8}.
+   *
+   * <p>On the heap path the header is first written with a guessed size so the payload can be
+   * encoded directly into the buffer; the header is then rewritten with the real size and the
+   * payload is shifted by one byte if the header length changed.
+   *
+   * @param serializer supplies the scratch byte array used when the buffer has no heap array
+   * @param buffer destination buffer; its writer index is advanced past the written bytes
+   * @param chars backing char array of the string
+   * @param offset index of the first char of the slice
+   * @param count number of chars in the slice
+   * @throws ArithmeticException if the worst-case output size does not fit in an {@code int}
+   */
+  static void writeCharsUTF8WithOffset(
+      StringSerializer serializer, MemoryBuffer buffer, char[] chars, int offset, int count) {
+    // Worst case is 3 UTF-8 bytes per UTF-16 char. Fail fast rather than letting the product wrap
+    // around and reserve too little space.
+    final int estimateMaxBytes = Math.multiplyExact(count, 3);
+    int writerIndex = buffer.writerIndex();
+    buffer.ensure(Math.addExact(writerIndex + 9, estimateMaxBytes));
+    final byte[] targetArray = buffer.getHeapMemory();
+    if (targetArray != null) {
+      final int headerPos = buffer._unsafeHeapWriterIndex();
+      // Guess the payload size so that a header can be laid down before encoding.
+      final int approxNumBytes = (int) (count * 1.5) + 1;
+      final int headerBytesWritten =
+          LittleEndian.putVarUint36Small(
+              targetArray, headerPos, ((long) approxNumBytes << 2) | UTF8);
+      final int arrIndex = headerPos + headerBytesWritten;
+      writerIndex += headerBytesWritten;
+      final int written =
+          StringEncodingUtils.convertUTF16ToUTF8(chars, offset, count, targetArray, arrIndex)
+              - arrIndex;
+      // A longer rewritten header would clobber the first payload byte, so keep a copy of it.
+      final byte stashedByte = targetArray[arrIndex];
+      final int diff =
+          LittleEndian.putVarUint36Small(targetArray, headerPos, ((long) written << 2) | UTF8)
+              - headerBytesWritten;
+      if (diff != 0) {
+        handleWriteCharsUTF8UnalignedHeaderBytes(targetArray, arrIndex, diff, written, stashedByte);
+      }
+      buffer._unsafeWriterIndex(writerIndex + written + diff);
+    } else {
+      // Encode into a scratch array first, so the exact size is known before the header is written.
+      final byte[] tmpArray = serializer.getByteArray(estimateMaxBytes);
+      final int written = StringEncodingUtils.convertUTF16ToUTF8(chars, offset, count, tmpArray, 0);
+      final long header = ((long) written << 2) | UTF8;
+      writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+      buffer.put(writerIndex, tmpArray, 0, written);
+      buffer._unsafeWriterIndex(writerIndex + written);
+    }
+  }
+
+  /**
+   * Writes the slice {@code chars[offset, offset + count)} as a UTF-8 string using a layout that
+   * never needs its header rewritten.
+   *
+   * <p>The var-uint header holds {@code (utf16ByteCount << 2) | UTF8}, where {@code
+   * utf16ByteCount} is twice {@code count} and therefore known up front. It is followed by a
+   * 4-byte int carrying the actual number of UTF-8 bytes, then the UTF-8 payload.
+   *
+   * @param serializer supplies the scratch byte array used when the buffer has no heap array
+   * @param buffer destination buffer; its writer index is advanced past the written bytes
+   * @param chars backing char array of the string
+   * @param offset index of the first char of the slice
+   * @param count number of chars in the slice
+   * @throws ArithmeticException if the worst-case output size does not fit in an {@code int}
+   */
+  static void writeCharsUTF8PerfOptimizedWithOffset(
+      StringSerializer serializer, MemoryBuffer buffer, char[] chars, int offset, int count) {
+    // Worst case is 3 UTF-8 bytes per UTF-16 char. Fail fast rather than letting the product wrap
+    // around and reserve too little space.
+    final int estimateMaxBytes = Math.multiplyExact(count, 3);
+    final int numBytes = MathUtils.doubleExact(count);
+    int writerIndex = buffer.writerIndex();
+    final long header = ((long) numBytes << 2) | UTF8;
+    buffer.ensure(Math.addExact(writerIndex + 9, estimateMaxBytes));
+    final byte[] targetArray = buffer.getHeapMemory();
+    if (targetArray != null) {
+      final int heapStart = buffer._unsafeHeapWriterIndex();
+      final int headerSize = LittleEndian.putVarUint36Small(targetArray, heapStart, header);
+      writerIndex += headerSize;
+      // Skip the 4-byte length slot; it is filled in once the encoded size is known.
+      final int payloadStart = heapStart + headerSize + 4;
+      final int payloadEnd =
+          StringEncodingUtils.convertUTF16ToUTF8(chars, offset, count, targetArray, payloadStart);
+      final int written = payloadEnd - payloadStart;
+      buffer._unsafePutInt32(writerIndex, written);
+      buffer._unsafeWriterIndex(writerIndex + 4 + written);
+    } else {
+      final byte[] tmpArray = serializer.getByteArray(estimateMaxBytes);
+      final int written = StringEncodingUtils.convertUTF16ToUTF8(chars, offset, count, tmpArray, 0);
+      writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+      buffer._unsafePutInt32(writerIndex, written);
+      writerIndex += 4;
+      buffer.put(writerIndex, tmpArray, 0, written);
+      buffer._unsafeWriterIndex(writerIndex + written);
+    }
+  }
+
+  /**
+   * Tells whether every char in {@code chars[offset, offset + count)} is at most {@code 0xFF}.
+   *
+   * <p>The largest multiple-of-four prefix of the slice is tested four chars at a time by loading
+   * them as one {@code long} and masking with {@code StringUtils.MULTI_CHARS_NON_LATIN_MASK}; the
+   * remaining chars (at most three) are compared one by one.
+   *
+   * @param chars backing char array of the string
+   * @param offset index of the first char of the slice
+   * @param count number of chars in the slice
+   * @return {@code true} when no char in the slice exceeds {@code 0xFF}
+   */
+  static boolean isLatin(char[] chars, int offset, int count) {
+    final int wordEnd = offset + (count & ~3);
+    for (int i = offset; i < wordEnd; i += 4) {
+      final long fourChars = Platform.getLong(chars, Platform.CHAR_ARRAY_OFFSET + ((long) i << 1));
+      if ((fourChars & StringUtils.MULTI_CHARS_NON_LATIN_MASK) != 0) {
+        return false;
+      }
+    }
+    final int end = offset + count;
+    for (int i = wordEnd; i < end; i++) {
+      if (chars[i] > 0xFF) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Chooses the encoding tag for the slice {@code chars[offset, offset + count)} by sampling at
+   * most its first 64 chars.
+   *
+   * <p>While sampling it counts how many chars are below {@code 0x80} and how many are at most
+   * {@code 0xFF}. The decision is then:
+   *
+   * <ul>
+   *   <li>{@code LATIN1} if the whole slice fits in Latin-1 (when the sample is shorter than the
+   *       slice this is confirmed with {@link #isLatin});
+   *   <li>otherwise {@code UTF8} if at least half of the sampled chars are below {@code 0x80};
+   *   <li>otherwise {@code UTF16}.
+   * </ul>
+   *
+   * @param chars backing char array of the string
+   * @param offset index of the first char of the slice
+   * @param count number of chars in the slice
+   * @return one of {@code LATIN1}, {@code UTF8} or {@code UTF16}
+   */
+  static byte bestCoder(char[] chars, int offset, int count) {
+    final int sampleNum = Math.min(64, count);
+    // Largest multiple of 4 not exceeding the sample size; that prefix is scanned 4 chars a time.
+    final int vectorizedChars = (sampleNum >> 2) << 2;
+    final long sliceBase = Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1);
+    int asciiCount = 0;
+    int latin1Count = 0;
+    for (int scanned = 0; scanned < vectorizedChars; scanned += 4) {
+      final int wordStart = offset + scanned;
+      final long multiChars = Platform.getLong(chars, sliceBase + ((long) scanned << 1));
+      if ((multiChars & StringUtils.MULTI_CHARS_NON_ASCII_MASK) == 0) {
+        // The whole group counts as ASCII, and therefore as Latin-1 too.
+        asciiCount += 4;
+        latin1Count += 4;
+        continue;
+      }
+      if ((multiChars & StringUtils.MULTI_CHARS_NON_LATIN_MASK) == 0) {
+        // The whole group counts as Latin-1; only the ASCII chars still need to be tallied.
+        latin1Count += 4;
+        for (int i = wordStart; i < wordStart + 4; i++) {
+          if (chars[i] < 0x80) {
+            asciiCount++;
+          }
+        }
+        continue;
+      }
+      // Mixed group: classify each char on its own.
+      for (int i = wordStart; i < wordStart + 4; i++) {
+        final char c = chars[i];
+        if (c < 0x80) {
+          asciiCount++;
+          latin1Count++;
+        } else if (c <= 0xFF) {
+          latin1Count++;
+        }
+      }
+    }
+    // Tail of the sample that did not fill a group of four.
+    for (int i = vectorizedChars; i < sampleNum; i++) {
+      final char c = chars[offset + i];
+      if (c < 0x80) {
+        asciiCount++;
+        latin1Count++;
+      } else if (c <= 0xFF) {
+        latin1Count++;
+      }
+    }
+    if (latin1Count == count) {
+      return LATIN1;
+    }
+    if (latin1Count == sampleNum && isLatin(chars, offset, count)) {
+      return LATIN1;
+    }
+    return asciiCount >= sampleNum * 0.5 ? UTF8 : UTF16;
+  }
+
+  /**
+   * Moves an already-encoded payload by one byte after its header was rewritten with a different
+   * length.
+   *
+   * @param targetArray array holding the header and the payload
+   * @param arrIndex index at which the payload was originally written
+   * @param diff new header length minus old header length; {@code 1} shifts the payload right,
+   *     any other value shifts it left by one
+   * @param written number of payload bytes
+   * @param stashed copy of the payload byte at {@code arrIndex}, taken before the header rewrite
+   */
+  private static void handleWriteCharsUTF8UnalignedHeaderBytes(
+      byte[] targetArray, int arrIndex, int diff, int written, byte stashed) {
+    if (diff != 1) {
+      // Header shrank: close the one-byte gap by sliding the payload left.
+      System.arraycopy(targetArray, arrIndex, targetArray, arrIndex - 1, written);
+      return;
+    }
+    // Header grew by one byte: slide everything after the first payload byte right by one, then
+    // put the saved first byte back, now one position later.
+    System.arraycopy(targetArray, arrIndex + 1, targetArray, arrIndex + 2, written - 1);
+    targetArray[arrIndex + 1] = stashed;
+  }
+
+  /**
+   * Copies chars starting at {@code chars[offset]} into {@code targetArray} starting at {@code
+   * arrIndex}, two bytes per char with the low byte first, until {@code numBytes} bytes are
+   * written. The caller uses this when {@code Platform.IS_LITTLE_ENDIAN} is false.
+   */
+  private static void writeCharsUTF16BEToHeap(
+      char[] chars, int offset, int arrIndex, int numBytes, byte[] targetArray) {
+    for (int byteIndex = 0; byteIndex < numBytes; byteIndex += 2) {
+      final char c = chars[offset + (byteIndex >> 1)];
+      final int dst = arrIndex + byteIndex;
+      targetArray[dst] = (byte) c;
+      targetArray[dst + 1] = (byte) (c >>> 8);
+    }
+  }
+
+  /**
+   * Writes {@code numBytes} bytes of raw char memory, starting at {@code chars[offset]}, into a
+   * buffer that has no heap array. The bytes are staged in the serializer's scratch array first.
+   * The caller uses this when {@code Platform.IS_LITTLE_ENDIAN} is true.
+   *
+   * @return {@code writerIndex + numBytes}, the position just past the written payload
+   */
+  private static int offHeapWriteCharsUTF16WithOffset(
+      StringSerializer serializer,
+      MemoryBuffer buffer,
+      char[] chars,
+      int offset,
+      int writerIndex,
+      int numBytes) {
+    final byte[] scratch = serializer.getByteArray(numBytes);
+    final long sourceOffset = Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1);
+    Platform.UNSAFE.copyMemory(chars, sourceOffset, scratch, Platform.BYTE_ARRAY_OFFSET, numBytes);
+    buffer.put(writerIndex, scratch, 0, numBytes);
+    return writerIndex + numBytes;
+  }
+
+  /**
+   * Writes chars starting at {@code chars[offset]} into a buffer that has no heap array, two
+   * bytes per char with the low byte first, for a total of {@code numBytes} bytes. The bytes are
+   * staged in the serializer's scratch array first. The caller uses this when {@code
+   * Platform.IS_LITTLE_ENDIAN} is false.
+   *
+   * @return {@code writerIndex + numBytes}, the position just past the written payload
+   */
+  private static int offHeapWriteCharsUTF16BEWithOffset(
+      StringSerializer serializer,
+      MemoryBuffer buffer,
+      char[] chars,
+      int offset,
+      int writerIndex,
+      int numBytes) {
+    final byte[] scratch = serializer.getByteArray(numBytes);
+    for (int byteIndex = 0; byteIndex < numBytes; byteIndex += 2) {
+      final char c = chars[offset + (byteIndex >> 1)];
+      scratch[byteIndex] = (byte) c;
+      scratch[byteIndex + 1] = (byte) (c >>> 8);
+    }
+    buffer.put(writerIndex, scratch, 0, numBytes);
+    return writerIndex + numBytes;
+  }
+}
diff --git 
a/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java 
b/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java
index 2ed61be61..ec26ddd77 100644
--- 
a/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java
+++ 
b/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java
@@ -69,6 +69,9 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
 
   // Make offset compatible with graalvm native image.
   private static final long STRING_VALUE_FIELD_OFFSET;
+  private static final boolean STRING_HAS_COUNT_OFFSET;
+  private static final long STRING_COUNT_FIELD_OFFSET;
+  private static final long STRING_OFFSET_FIELD_OFFSET;
 
   private static class Offset {
     // Make offset compatible with graalvm native image.
@@ -97,13 +100,22 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     } catch (NoSuchFieldException e) {
       throw new RuntimeException(e);
     }
-    // String length field for android.
-    Preconditions.checkArgument(
-        ReflectionUtils.getFieldNullable(String.class, "count") == null,
-        "Current jdk not supported");
-    Preconditions.checkArgument(
-        ReflectionUtils.getFieldNullable(String.class, "offset") == null,
-        "Current jdk not supported");
+    Field countField = ReflectionUtils.getFieldNullable(String.class, "count");
+    Field offsetField = ReflectionUtils.getFieldNullable(String.class, 
"offset");
+    if (countField != null || offsetField != null) {
+      Preconditions.checkArgument(
+          countField != null && offsetField != null, "Current jdk not 
supported");
+      Preconditions.checkArgument(
+          countField.getType() == int.class && offsetField.getType() == 
int.class,
+          "Current jdk not supported");
+      STRING_HAS_COUNT_OFFSET = true;
+      STRING_COUNT_FIELD_OFFSET = Platform.objectFieldOffset(countField);
+      STRING_OFFSET_FIELD_OFFSET = Platform.objectFieldOffset(offsetField);
+    } else {
+      STRING_HAS_COUNT_OFFSET = false;
+      STRING_COUNT_FIELD_OFFSET = -1;
+      STRING_OFFSET_FIELD_OFFSET = -1;
+    }
   }
 
   private final boolean compressString;
@@ -160,10 +172,18 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
       if (!STRING_VALUE_FIELD_IS_CHARS) {
         throw new UnsupportedOperationException();
       }
-      if (compressString) {
-        return new Invoke(strSerializer, "writeCompressedCharsString", buffer, 
str);
+      if (STRING_HAS_COUNT_OFFSET) {
+        if (compressString) {
+          return new Invoke(strSerializer, 
"writeCompressedCharsStringWithOffset", buffer, str);
+        } else {
+          return new Invoke(strSerializer, "writeCharsStringWithOffset", 
buffer, str);
+        }
       } else {
-        return new Invoke(strSerializer, "writeCharsString", buffer, str);
+        if (compressString) {
+          return new Invoke(strSerializer, "writeCompressedCharsString", 
buffer, str);
+        } else {
+          return new Invoke(strSerializer, "writeCharsString", buffer, str);
+        }
       }
     }
   }
@@ -344,7 +364,19 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
         writeBytesString(buffer, value);
       }
     } else {
-      assert STRING_VALUE_FIELD_IS_CHARS;
+      writeJava8String(buffer, value);
+    }
+  }
+
+  private void writeJava8String(MemoryBuffer buffer, String value) {
+    assert STRING_VALUE_FIELD_IS_CHARS;
+    if (STRING_HAS_COUNT_OFFSET) {
+      if (compressString) {
+        writeCompressedCharsStringWithOffset(buffer, value);
+      } else {
+        writeCharsStringWithOffset(buffer, value);
+      }
+    } else {
       if (compressString) {
         writeCompressedCharsString(buffer, value);
       } else {
@@ -403,6 +435,25 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     }
   }
 
+  /**
+   * Writes a {@code char[]}-backed string that also carries {@code offset} and {@code count}
+   * fields, choosing the encoding with {@link SlicedStringUtil#bestCoder}. Only the slice
+   * described by those two fields is written, not the whole backing array.
+   */
+  @CodegenInvoke
+  public void writeCompressedCharsStringWithOffset(MemoryBuffer buffer, String value) {
+    final char[] backing = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET);
+    final int sliceOffset = Platform.getInt(value, STRING_OFFSET_FIELD_OFFSET);
+    final int sliceCount = Platform.getInt(value, STRING_COUNT_FIELD_OFFSET);
+    final byte coder = SlicedStringUtil.bestCoder(backing, sliceOffset, sliceCount);
+    if (coder == LATIN1) {
+      SlicedStringUtil.writeCharsLatin1WithOffset(this, buffer, backing, sliceOffset, sliceCount);
+      return;
+    }
+    if (coder != UTF8) {
+      SlicedStringUtil.writeCharsUTF16WithOffset(this, buffer, backing, sliceOffset, sliceCount);
+      return;
+    }
+    // UTF-8: the configuration flag selects which of the two header layouts is written.
+    if (writeNumUtf16BytesForUtf8Encoding) {
+      SlicedStringUtil.writeCharsUTF8PerfOptimizedWithOffset(
+          this, buffer, backing, sliceOffset, sliceCount);
+    } else {
+      SlicedStringUtil.writeCharsUTF8WithOffset(this, buffer, backing, sliceOffset, sliceCount);
+    }
+  }
+
   @CodegenInvoke
   public static void writeBytesString(MemoryBuffer buffer, String value) {
     byte[] bytes = (byte[]) Platform.getObject(value, 
STRING_VALUE_FIELD_OFFSET);
@@ -450,6 +501,18 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     }
   }
 
+  /**
+   * Writes a {@code char[]}-backed string that also carries {@code offset} and {@code count}
+   * fields, without UTF-8 compression: Latin-1 when every char of the slice fits, UTF-16
+   * otherwise. Only the slice described by those two fields is written.
+   */
+  @CodegenInvoke
+  public void writeCharsStringWithOffset(MemoryBuffer buffer, String value) {
+    final char[] backing = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET);
+    final int sliceOffset = Platform.getInt(value, STRING_OFFSET_FIELD_OFFSET);
+    final int sliceCount = Platform.getInt(value, STRING_COUNT_FIELD_OFFSET);
+    if (!SlicedStringUtil.isLatin(backing, sliceOffset, sliceCount)) {
+      SlicedStringUtil.writeCharsUTF16WithOffset(this, buffer, backing, sliceOffset, sliceCount);
+      return;
+    }
+    SlicedStringUtil.writeCharsLatin1WithOffset(this, buffer, backing, sliceOffset, sliceCount);
+  }
+
   public char[] readCharsLatin1(MemoryBuffer buffer, int numBytes) {
     buffer.checkReadableBytes(numBytes);
     byte[] srcArray = buffer.getHeapMemory();
@@ -1118,7 +1181,7 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     return charArray;
   }
 
-  private byte[] getByteArray(int numElements) {
+  byte[] getByteArray(int numElements) {
     byte[] byteArray = this.byteArray;
     if (byteArray.length < numElements) {
       byteArray = new byte[numElements];
diff --git 
a/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java 
b/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java
index ef6bb67ee..5988762ba 100644
--- a/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java
+++ b/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java
@@ -65,6 +65,48 @@ public class StringEncodingUtils {
     return dp;
   }
 
+  /**
+   * Converts the UTF-16 slice {@code src[offset, offset + len)} to UTF-8, writing the result into
+   * {@code dst} starting at index {@code dp}.
+   *
+   * <p>Groups of four chars are loaded as one {@code long}; when the group passes the {@code
+   * MULTI_CHARS_NON_ASCII_MASK} test, each char is copied as a single byte. All other chars are
+   * encoded one at a time.
+   *
+   * @param src source chars
+   * @param offset index of the first char to convert
+   * @param len number of chars to convert
+   * @param dst destination array; the caller must provide enough room (3 bytes per char covers
+   *     every branch below)
+   * @param dp index in {@code dst} at which writing starts
+   * @return the index in {@code dst} just past the last byte written
+   * @throws RuntimeException if a surrogate char is the last char of the slice
+   */
+  public static int convertUTF16ToUTF8(char[] src, int offset, int len, byte[] dst, int dp) {
+    final int end = offset + len;
+    int charOffset = offset;
+    // Raw byte offset of src[charOffset]. Kept as a long so that doubling a large char offset
+    // cannot wrap around the way 32-bit arithmetic would.
+    long arrayOffset = Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1);
+    while (charOffset < end) {
+      // "end - charOffset" cannot overflow here, unlike "charOffset + 4".
+      if (end - charOffset >= 4
+          && (Platform.getLong(src, arrayOffset) & MULTI_CHARS_NON_ASCII_MASK) == 0) {
+        dst[dp] = (byte) src[charOffset];
+        dst[dp + 1] = (byte) src[charOffset + 1];
+        dst[dp + 2] = (byte) src[charOffset + 2];
+        dst[dp + 3] = (byte) src[charOffset + 3];
+        dp += 4;
+        charOffset += 4;
+        arrayOffset += 8;
+      } else {
+        char c = src[charOffset++];
+        arrayOffset += 2;
+        if (c < 0x80) {
+          // One-byte form.
+          dst[dp++] = (byte) c;
+        } else if (c < 0x800) {
+          // Two-byte form.
+          dst[dp] = (byte) (0xc0 | (c >> 6));
+          dst[dp + 1] = (byte) (0x80 | (c & 0x3f));
+          dp += 2;
+        } else if (c >= '\uD800' && c <= Character.MAX_LOW_SURROGATE) {
+          // Surrogate: the following char is consumed as well and 4 output bytes are accounted
+          // for; the encoding itself is delegated to utf8ToChar2.
+          if (charOffset >= end) {
+            throw new RuntimeException("malformed input off : " + charOffset);
+          }
+          utf8ToChar2(src, charOffset, c, dst, dp);
+          dp += 4;
+          charOffset++;
+          arrayOffset += 2;
+        } else {
+          // Three-byte form.
+          dst[dp] = (byte) (0xe0 | ((c >> 12)));
+          dst[dp + 1] = (byte) (0x80 | ((c >> 6) & 0x3f));
+          dst[dp + 2] = (byte) (0x80 | (c & 0x3f));
+          dp += 3;
+        }
+      }
+    }
+    return dp;
+  }
+
   /** A fast convert algorithm to convert an utf16 byte array into an utf8 
byte array. */
   public static int convertUTF16ToUTF8(byte[] src, byte[] dst, int dp) {
     int numBytes = src.length;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to