This is an automated email from the ASF dual-hosted git repository.
chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fory.git
The following commit(s) were added to refs/heads/main by this push:
new ccc1c633f fix(java): fix openj9 sliced string serde (#3160)
ccc1c633f is described below
commit ccc1c633f0428c7f0ff4170347c102d3b52e0569
Author: Shawn Yang <[email protected]>
AuthorDate: Mon Jan 19 17:55:28 2026 +0800
fix(java): fix openj9 sliced string serde (#3160)
## Why?
## What does this PR do?
## Related issues
Closes #2079
## Does this PR introduce any user-facing change?
- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?
## Benchmark
---
.../apache/fory/serializer/SlicedStringUtil.java | 295 +++++++++++++++++++++
.../apache/fory/serializer/StringSerializer.java | 87 +++++-
.../org/apache/fory/util/StringEncodingUtils.java | 42 +++
3 files changed, 412 insertions(+), 12 deletions(-)
diff --git a/java/fory-core/src/main/java/org/apache/fory/serializer/SlicedStringUtil.java b/java/fory-core/src/main/java/org/apache/fory/serializer/SlicedStringUtil.java
new file mode 100644
index 000000000..f134de9ae
--- /dev/null
+++ b/java/fory-core/src/main/java/org/apache/fory/serializer/SlicedStringUtil.java
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.fory.serializer;
+
+import org.apache.fory.memory.LittleEndian;
+import org.apache.fory.memory.MemoryBuffer;
+import org.apache.fory.memory.Platform;
+import org.apache.fory.util.MathUtils;
+import org.apache.fory.util.StringEncodingUtils;
+import org.apache.fory.util.StringUtils;
+
+final class SlicedStringUtil {
+ private static final byte LATIN1 = 0; // coder tag OR-ed into the low two bits of the length header (one byte per char payload)
+ private static final byte UTF16 = 1; // coder tag for payloads written as two bytes per char
+ private static final byte UTF8 = 2; // coder tag for payloads written as UTF-8
+
+ private SlicedStringUtil() {} // static helpers only; never instantiated
+
+ static void writeCharsLatin1WithOffset( // Writes chars[offset, offset + count) as a LATIN1-tagged header followed by one byte per char.
+ StringSerializer serializer, MemoryBuffer buffer, char[] chars, int
offset, int count) {
+ int writerIndex = buffer.writerIndex();
+ long header = ((long) count << 2) | LATIN1; // payload byte count in the upper bits, coder tag in the low two bits
+ buffer.ensure(writerIndex + 5 + count); // reserve 5 bytes of header room plus the payload
+ byte[] targetArray = buffer.getHeapMemory();
+ if (targetArray != null) { // a backing byte[] is available: write into it directly
+ final int targetIndex = buffer._unsafeHeapWriterIndex();
+ int arrIndex = targetIndex;
+ arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex,
header);
+ writerIndex += arrIndex - targetIndex; // advance by the header length returned by putVarUint36Small
+ for (int i = 0; i < count; i++) {
+ targetArray[arrIndex + i] = (byte) chars[offset + i]; // narrowing cast keeps only the low 8 bits of the char
+ }
+ } else { // no backing byte[]: stage the bytes in the array from serializer.getByteArray, then bulk-copy
+ writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+ final byte[] tmpArray = serializer.getByteArray(count);
+ for (int i = 0; i < count; i++) {
+ tmpArray[i] = (byte) chars[offset + i];
+ }
+ buffer.put(writerIndex, tmpArray, 0, count);
+ }
+ writerIndex += count;
+ buffer._unsafeWriterIndex(writerIndex); // publish the writer position past header and payload
+ }
+
+ static void writeCharsUTF16WithOffset( // Writes chars[offset, offset + count) as a UTF16-tagged header followed by two bytes per char.
+ StringSerializer serializer, MemoryBuffer buffer, char[] chars, int
offset, int count) {
+ int numBytes = MathUtils.doubleExact(count); // presumably count * 2 with an overflow check (MathUtils not shown) - TODO confirm
+ int writerIndex = buffer.writerIndex();
+ long header = ((long) numBytes << 2) | UTF16; // payload byte count in the upper bits, coder tag in the low two bits
+ buffer.ensure(writerIndex + 5 + numBytes); // reserve 5 bytes of header room plus the payload
+ final byte[] targetArray = buffer.getHeapMemory();
+ if (targetArray != null) { // a backing byte[] is available: write into it directly
+ final int targetIndex = buffer._unsafeHeapWriterIndex();
+ int arrIndex = targetIndex;
+ arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex,
header);
+ writerIndex += arrIndex - targetIndex + numBytes; // header length plus payload length in one step
+ if (Platform.IS_LITTLE_ENDIAN) {
+ // FIXME JDK11 utf16 string uses little-endian order.
+ Platform.UNSAFE.copyMemory(
+ chars,
+ Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1), // address of chars[offset]: two bytes per char past the array base
+ targetArray,
+ Platform.BYTE_ARRAY_OFFSET + arrIndex,
+ numBytes);
+ } else { // big-endian host: the helper emits each char low byte first
+ writeCharsUTF16BEToHeap(chars, offset, arrIndex, numBytes,
targetArray);
+ }
+ } else { // no backing byte[]: the helpers stage the bytes in a temporary array and return the advanced index
+ writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+ if (Platform.IS_LITTLE_ENDIAN) {
+ writerIndex =
+ offHeapWriteCharsUTF16WithOffset(
+ serializer, buffer, chars, offset, writerIndex, numBytes);
+ } else {
+ writerIndex =
+ offHeapWriteCharsUTF16BEWithOffset(
+ serializer, buffer, chars, offset, writerIndex, numBytes);
+ }
+ }
+ buffer._unsafeWriterIndex(writerIndex); // publish the writer position past header and payload
+ }
+
+ static void writeCharsUTF8WithOffset( // Writes chars[offset, offset + count) as a UTF8-tagged header followed by the UTF-8 encoding of the slice.
+ StringSerializer serializer, MemoryBuffer buffer, char[] chars, int
offset, int count) {
+ int estimateMaxBytes = count * 3; // reservation bound: three bytes per char
+ int approxNumBytes = (int) (count * 1.5) + 1; // guess used only for the provisional header written before encoding
+ int writerIndex = buffer.writerIndex();
+ buffer.ensure(writerIndex + 9 + estimateMaxBytes); // reserve 9 bytes of header room plus the payload bound
+ byte[] targetArray = buffer.getHeapMemory();
+ if (targetArray != null) { // a backing byte[] is available: encode straight into it
+ int targetIndex = buffer._unsafeHeapWriterIndex();
+ int headerPos = targetIndex; // remembered so the header can be rewritten once the real length is known
+ int arrIndex = targetIndex;
+ long header = ((long) approxNumBytes << 2) | UTF8;
+ int headerBytesWritten = LittleEndian.putVarUint36Small(targetArray,
arrIndex, header);
+ arrIndex += headerBytesWritten; // arrIndex now marks where the payload starts
+ writerIndex += headerBytesWritten;
+ targetIndex =
+ StringEncodingUtils.convertUTF16ToUTF8(chars, offset, count,
targetArray, arrIndex);
+ byte stashedByte = targetArray[arrIndex]; // first payload byte, saved because a longer rewritten header overwrites it
+ int written = targetIndex - arrIndex; // actual number of UTF-8 bytes produced
+ header = ((long) written << 2) | UTF8;
+ int diff = // change in header length between the provisional and the final header
+ LittleEndian.putVarUint36Small(targetArray, headerPos, header) -
headerBytesWritten;
+ if (diff != 0) { // header length changed: the payload must be shifted to sit right after the new header
+ handleWriteCharsUTF8UnalignedHeaderBytes(targetArray, arrIndex, diff,
written, stashedByte);
+ }
+ buffer._unsafeWriterIndex(writerIndex + written + diff);
+ } else { // no backing byte[]: encode into a temporary array first, so the exact length is known before the header is written
+ final byte[] tmpArray = serializer.getByteArray(estimateMaxBytes);
+ int written = StringEncodingUtils.convertUTF16ToUTF8(chars, offset,
count, tmpArray, 0);
+ long header = ((long) written << 2) | UTF8;
+ writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+ buffer.put(writerIndex, tmpArray, 0, written);
+ buffer._unsafeWriterIndex(writerIndex + written);
+ }
+ }
+
+ static void writeCharsUTF8PerfOptimizedWithOffset( // Writes a UTF8-tagged header carrying doubleExact(count), then an int32 UTF-8 byte length, then the UTF-8 bytes.
+ StringSerializer serializer, MemoryBuffer buffer, char[] chars, int
offset, int count) {
+ int estimateMaxBytes = count * 3; // reservation bound: three bytes per char
+ int numBytes = MathUtils.doubleExact(count); // presumably the UTF-16 byte size, count * 2 (MathUtils not shown) - TODO confirm
+ int writerIndex = buffer.writerIndex();
+ long header = ((long) numBytes << 2) | UTF8; // known before encoding, so the header is never rewritten here
+ buffer.ensure(writerIndex + 9 + estimateMaxBytes); // reserve 9 bytes for header and length field plus the payload bound
+ byte[] targetArray = buffer.getHeapMemory();
+ if (targetArray != null) { // a backing byte[] is available: encode straight into it
+ int targetIndex = buffer._unsafeHeapWriterIndex();
+ int arrIndex = targetIndex;
+ arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex,
header);
+ writerIndex += arrIndex - targetIndex; // writerIndex now points at the 4-byte length slot
+ targetIndex =
+ StringEncodingUtils.convertUTF16ToUTF8(chars, offset, count,
targetArray, arrIndex + 4); // skip the 4-byte length slot; it is filled in after encoding
+ int written = targetIndex - arrIndex - 4; // actual number of UTF-8 bytes produced
+ buffer._unsafePutInt32(writerIndex, written);
+ buffer._unsafeWriterIndex(writerIndex + 4 + written);
+ } else { // no backing byte[]: encode into a temporary array, then write header, length and bytes in order
+ final byte[] tmpArray = serializer.getByteArray(estimateMaxBytes);
+ int written = StringEncodingUtils.convertUTF16ToUTF8(chars, offset,
count, tmpArray, 0);
+ writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+ buffer._unsafePutInt32(writerIndex, written);
+ writerIndex += 4;
+ buffer.put(writerIndex, tmpArray, 0, written);
+ buffer._unsafeWriterIndex(writerIndex + written);
+ }
+ }
+
+ static boolean isLatin(char[] chars, int offset, int count) { // Scans chars[offset, offset + count): four chars at a time against a mask, then the remaining tail for any char above 0xFF.
+ int end = offset + count;
+ int vectorizedChars = count & ~3; // count rounded down to a multiple of 4
+ int vectorEnd = offset + vectorizedChars;
+ long byteOffset = Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1); // address of chars[offset]: two bytes per char past the array base
+ long endOffset = Platform.CHAR_ARRAY_OFFSET + ((long) vectorEnd << 1);
+ for (long off = byteOffset; off < endOffset; off += 8) { // 8 bytes per step = 4 chars
+ long multiChars = Platform.getLong(chars, off);
+ if ((multiChars & StringUtils.MULTI_CHARS_NON_LATIN_MASK) != 0) { // presumably flags any of the 4 chars above 0xFF (mask defined in StringUtils, not shown)
+ return false;
+ }
+ }
+ for (int i = vectorEnd; i < end; i++) { // remaining 0-3 chars that did not fill a group of four
+ if (chars[i] > 0xFF) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ static byte bestCoder(char[] chars, int offset, int count) { // Picks LATIN1, UTF8 or UTF16 for chars[offset, offset + count) from counts taken over a leading sample.
+ int sampleNum = Math.min(64, count); // only the first 64 chars at most are counted
+ int vectorizedLen = sampleNum >> 2; // number of whole 4-char groups in the sample
+ int vectorizedChars = vectorizedLen << 2; // sample chars covered by those groups
+ long byteOffset = Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1); // address of chars[offset]: two bytes per char past the array base
+ long endOffset = byteOffset + ((long) vectorizedChars << 1);
+ int asciiCount = 0;
+ int latin1Count = 0;
+ int charOffset = offset; // char index kept in step with the byte offset of the current group
+ for (long off = byteOffset; off < endOffset; off += 8, charOffset += 4) {
+ long multiChars = Platform.getLong(chars, off); // 8 bytes = 4 chars read at once
+ if ((multiChars & StringUtils.MULTI_CHARS_NON_ASCII_MASK) == 0) { // presumably: all 4 chars below 0x80 (mask defined in StringUtils, not shown)
+ latin1Count += 4;
+ asciiCount += 4;
+ } else if ((multiChars & StringUtils.MULTI_CHARS_NON_LATIN_MASK) == 0) { // presumably: all 4 chars at most 0xFF, so only ASCII needs per-char counting
+ latin1Count += 4;
+ for (int i = 0; i < 4; ++i) {
+ if (chars[charOffset + i] < 0x80) {
+ asciiCount++;
+ }
+ }
+ } else { // mixed group: classify each of the 4 chars individually
+ for (int i = 0; i < 4; ++i) {
+ char c = chars[charOffset + i];
+ if (c < 0x80) {
+ latin1Count++;
+ asciiCount++;
+ } else if (c <= 0xFF) {
+ latin1Count++;
+ }
+ }
+ }
+ }
+
+ for (int i = vectorizedChars; i < sampleNum; i++) { // sample chars left over after the 4-char groups
+ char c = chars[offset + i];
+ if (c < 0x80) {
+ latin1Count++;
+ asciiCount++;
+ } else if (c <= 0xFF) {
+ latin1Count++;
+ }
+ }
+
+ if (latin1Count == count || (latin1Count == sampleNum && isLatin(chars,
offset, count))) { // every char counted as Latin-1, or the sample was all Latin-1 and a full scan confirms it
+ return LATIN1;
+ } else if (asciiCount >= sampleNum * 0.5) { // at least half of the sample is below 0x80
+ return UTF8;
+ } else {
+ return UTF16;
+ }
+ }
+
+ private static void handleWriteCharsUTF8UnalignedHeaderBytes( // Shifts an already-encoded payload by one byte after its header was rewritten with a different length.
+ byte[] targetArray, int arrIndex, int diff, int written, byte stashed) {
+ if (diff == 1) { // header grew by one byte: move payload bytes 1..written-1 right, then restore the saved first byte
+ System.arraycopy(targetArray, arrIndex + 1, targetArray, arrIndex + 2,
written - 1);
+ targetArray[arrIndex + 1] = stashed;
+ } else { // any other diff: move the whole payload left by one byte (presumably only reached with diff == -1 - TODO confirm)
+ System.arraycopy(targetArray, arrIndex, targetArray, arrIndex - 1,
written);
+ }
+ }
+
+ private static void writeCharsUTF16BEToHeap( // Copies numBytes / 2 chars starting at chars[offset] into targetArray at arrIndex, low byte first.
+ char[] chars, int offset, int arrIndex, int numBytes, byte[]
targetArray) {
+ int charIndex = offset;
+ for (int i = arrIndex, end = i + numBytes; i < end; i += 2) { // two output bytes per char
+ char c = chars[charIndex++];
+ targetArray[i] = (byte) c; // low 8 bits
+ targetArray[i + 1] = (byte) (c >>> 8); // high 8 bits
+ }
+ }
+
+ private static int offHeapWriteCharsUTF16WithOffset( // Raw-copies numBytes from chars[offset] into a temporary array, puts them at writerIndex, returns the advanced index.
+ StringSerializer serializer,
+ MemoryBuffer buffer,
+ char[] chars,
+ int offset,
+ int writerIndex,
+ int numBytes) {
+ byte[] tmpArray = serializer.getByteArray(numBytes);
+ Platform.UNSAFE.copyMemory(
+ chars,
+ Platform.CHAR_ARRAY_OFFSET + ((long) offset << 1), // address of chars[offset]: two bytes per char past the array base
+ tmpArray,
+ Platform.BYTE_ARRAY_OFFSET,
+ numBytes);
+ buffer.put(writerIndex, tmpArray, 0, numBytes);
+ writerIndex += numBytes;
+ return writerIndex; // caller stores this as the new writer position
+ }
+
+ private static int offHeapWriteCharsUTF16BEWithOffset( // Splits each char low byte first into a temporary array, puts numBytes at writerIndex, returns the advanced index.
+ StringSerializer serializer,
+ MemoryBuffer buffer,
+ char[] chars,
+ int offset,
+ int writerIndex,
+ int numBytes) {
+ byte[] tmpArray = serializer.getByteArray(numBytes);
+ int charIndex = offset;
+ for (int i = 0; i < numBytes; i += 2) { // two output bytes per char
+ char c = chars[charIndex++];
+ tmpArray[i] = (byte) c; // low 8 bits
+ tmpArray[i + 1] = (byte) (c >>> 8); // high 8 bits
+ }
+ buffer.put(writerIndex, tmpArray, 0, numBytes);
+ writerIndex += numBytes;
+ return writerIndex; // caller stores this as the new writer position
+ }
+}
diff --git a/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java b/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java
index 2ed61be61..ec26ddd77 100644
--- a/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java
+++ b/java/fory-core/src/main/java/org/apache/fory/serializer/StringSerializer.java
@@ -69,6 +69,9 @@ public final class StringSerializer extends ImmutableSerializer<String> {
// Make offset compatible with graalvm native image.
private static final long STRING_VALUE_FIELD_OFFSET;
+ private static final boolean STRING_HAS_COUNT_OFFSET;
+ private static final long STRING_COUNT_FIELD_OFFSET;
+ private static final long STRING_OFFSET_FIELD_OFFSET;
private static class Offset {
// Make offset compatible with graalvm native image.
@@ -97,13 +100,22 @@ public final class StringSerializer extends ImmutableSerializer<String> {
} catch (NoSuchFieldException e) {
throw new RuntimeException(e);
}
- // String length field for android.
- Preconditions.checkArgument(
- ReflectionUtils.getFieldNullable(String.class, "count") == null,
- "Current jdk not supported");
- Preconditions.checkArgument(
- ReflectionUtils.getFieldNullable(String.class, "offset") == null,
- "Current jdk not supported");
+ Field countField = ReflectionUtils.getFieldNullable(String.class, "count");
+ Field offsetField = ReflectionUtils.getFieldNullable(String.class,
"offset");
+ if (countField != null || offsetField != null) {
+ Preconditions.checkArgument(
+ countField != null && offsetField != null, "Current jdk not
supported");
+ Preconditions.checkArgument(
+ countField.getType() == int.class && offsetField.getType() ==
int.class,
+ "Current jdk not supported");
+ STRING_HAS_COUNT_OFFSET = true;
+ STRING_COUNT_FIELD_OFFSET = Platform.objectFieldOffset(countField);
+ STRING_OFFSET_FIELD_OFFSET = Platform.objectFieldOffset(offsetField);
+ } else {
+ STRING_HAS_COUNT_OFFSET = false;
+ STRING_COUNT_FIELD_OFFSET = -1;
+ STRING_OFFSET_FIELD_OFFSET = -1;
+ }
}
private final boolean compressString;
@@ -160,10 +172,18 @@ public final class StringSerializer extends ImmutableSerializer<String> {
if (!STRING_VALUE_FIELD_IS_CHARS) {
throw new UnsupportedOperationException();
}
- if (compressString) {
- return new Invoke(strSerializer, "writeCompressedCharsString", buffer,
str);
+ if (STRING_HAS_COUNT_OFFSET) {
+ if (compressString) {
+ return new Invoke(strSerializer,
"writeCompressedCharsStringWithOffset", buffer, str);
+ } else {
+ return new Invoke(strSerializer, "writeCharsStringWithOffset",
buffer, str);
+ }
} else {
- return new Invoke(strSerializer, "writeCharsString", buffer, str);
+ if (compressString) {
+ return new Invoke(strSerializer, "writeCompressedCharsString",
buffer, str);
+ } else {
+ return new Invoke(strSerializer, "writeCharsString", buffer, str);
+ }
}
}
}
@@ -344,7 +364,19 @@ public final class StringSerializer extends ImmutableSerializer<String> {
writeBytesString(buffer, value);
}
} else {
- assert STRING_VALUE_FIELD_IS_CHARS;
+ writeJava8String(buffer, value);
+ }
+ }
+
+ private void writeJava8String(MemoryBuffer buffer, String value) {
+ assert STRING_VALUE_FIELD_IS_CHARS;
+ if (STRING_HAS_COUNT_OFFSET) {
+ if (compressString) {
+ writeCompressedCharsStringWithOffset(buffer, value);
+ } else {
+ writeCharsStringWithOffset(buffer, value);
+ }
+ } else {
if (compressString) {
writeCompressedCharsString(buffer, value);
} else {
@@ -403,6 +435,25 @@ public final class StringSerializer extends ImmutableSerializer<String> {
}
}
+ @CodegenInvoke
+ public void writeCompressedCharsStringWithOffset(MemoryBuffer buffer, String
value) { // Writes the [offset, offset + count) slice of value's backing char[] using the coder picked by SlicedStringUtil.bestCoder.
+ final char[] chars = (char[]) Platform.getObject(value,
STRING_VALUE_FIELD_OFFSET); // backing array read by raw field offset
+ final int offset = Platform.getInt(value, STRING_OFFSET_FIELD_OFFSET); // String's int "offset" field
+ final int count = Platform.getInt(value, STRING_COUNT_FIELD_OFFSET); // String's int "count" field
+ final byte coder = SlicedStringUtil.bestCoder(chars, offset, count);
+ if (coder == LATIN1) { // NOTE(review): LATIN1/UTF8 here resolve in this class, not to SlicedStringUtil's private tags - presumably equal values; confirm
+ SlicedStringUtil.writeCharsLatin1WithOffset(this, buffer, chars, offset,
count);
+ } else if (coder == UTF8) {
+ if (writeNumUtf16BytesForUtf8Encoding) { // layout with the doubled char count in the header plus an explicit int32 UTF-8 length
+ SlicedStringUtil.writeCharsUTF8PerfOptimizedWithOffset(this, buffer,
chars, offset, count);
+ } else {
+ SlicedStringUtil.writeCharsUTF8WithOffset(this, buffer, chars, offset,
count);
+ }
+ } else { // any other coder value is written as UTF-16
+ SlicedStringUtil.writeCharsUTF16WithOffset(this, buffer, chars, offset,
count);
+ }
+ }
+
@CodegenInvoke
public static void writeBytesString(MemoryBuffer buffer, String value) {
byte[] bytes = (byte[]) Platform.getObject(value,
STRING_VALUE_FIELD_OFFSET);
@@ -450,6 +501,18 @@ public final class StringSerializer extends ImmutableSerializer<String> {
}
}
+ @CodegenInvoke
+ public void writeCharsStringWithOffset(MemoryBuffer buffer, String value) { // Writes the [offset, offset + count) slice of value's backing char[] as Latin-1 when isLatin accepts it, otherwise as UTF-16.
+ final char[] chars = (char[]) Platform.getObject(value,
STRING_VALUE_FIELD_OFFSET); // backing array read by raw field offset
+ final int offset = Platform.getInt(value, STRING_OFFSET_FIELD_OFFSET); // String's int "offset" field
+ final int count = Platform.getInt(value, STRING_COUNT_FIELD_OFFSET); // String's int "count" field
+ if (SlicedStringUtil.isLatin(chars, offset, count)) { // full scan of the slice, not a sample
+ SlicedStringUtil.writeCharsLatin1WithOffset(this, buffer, chars, offset,
count);
+ } else {
+ SlicedStringUtil.writeCharsUTF16WithOffset(this, buffer, chars, offset,
count);
+ }
+ }
+
public char[] readCharsLatin1(MemoryBuffer buffer, int numBytes) {
buffer.checkReadableBytes(numBytes);
byte[] srcArray = buffer.getHeapMemory();
@@ -1118,7 +1181,7 @@ public final class StringSerializer extends ImmutableSerializer<String> {
return charArray;
}
- private byte[] getByteArray(int numElements) {
+ byte[] getByteArray(int numElements) {
byte[] byteArray = this.byteArray;
if (byteArray.length < numElements) {
byteArray = new byte[numElements];
diff --git a/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java b/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java
index ef6bb67ee..5988762ba 100644
--- a/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java
+++ b/java/fory-core/src/main/java/org/apache/fory/util/StringEncodingUtils.java
@@ -65,6 +65,48 @@ public class StringEncodingUtils {
return dp;
}
+ /** Converts the UTF-16 slice src[offset, offset + len) into UTF-8 bytes written to dst starting at dp; returns the dst index just past the last byte written. Throws RuntimeException when a char in the surrogate range is the last char of the slice. */
+ public static int convertUTF16ToUTF8(char[] src, int offset, int len, byte[]
dst, int dp) {
+ int end = offset + len;
+ for (int charOffset = offset, arrayOffset = Platform.CHAR_ARRAY_OFFSET +
(offset << 1); // arrayOffset tracks the raw byte address of src[charOffset]
+ charOffset < end; ) {
+ if (charOffset + 4 <= end
+ && (Platform.getLong(src, arrayOffset) & MULTI_CHARS_NON_ASCII_MASK)
== 0) { // fast path: 4 chars remain and the 8-byte read passes the mask (presumably all below 0x80 - mask not shown)
+ dst[dp] = (byte) src[charOffset];
+ dst[dp + 1] = (byte) src[charOffset + 1];
+ dst[dp + 2] = (byte) src[charOffset + 2];
+ dst[dp + 3] = (byte) src[charOffset + 3];
+ dp += 4;
+ charOffset += 4;
+ arrayOffset += 8;
+ } else { // slow path: encode one char (or one surrogate pair) at a time
+ char c = src[charOffset++];
+ arrayOffset += 2;
+ if (c < 0x80) { // one-byte form
+ dst[dp++] = (byte) c;
+ } else if (c < 0x800) { // two-byte form
+ dst[dp] = (byte) (0xc0 | (c >> 6));
+ dst[dp + 1] = (byte) (0x80 | (c & 0x3f));
+ dp += 2;
+ } else if (c >= '\uD800' && c <= Character.MAX_LOW_SURROGATE) { // surrogate range: needs the following char as well
+ if (charOffset >= end) { // no following char inside the slice
+ throw new RuntimeException("malformed input off : " + charOffset);
+ }
+ utf8ToChar2(src, charOffset, c, dst, dp); // helper not shown; presumably encodes the pair as 4 bytes at dst[dp..dp+3] - TODO confirm
+ dp += 4;
+ charOffset++; // consume the second char of the pair
+ arrayOffset += 2;
+ } else { // three-byte form
+ dst[dp] = (byte) (0xe0 | ((c >> 12)));
+ dst[dp + 1] = (byte) (0x80 | ((c >> 6) & 0x3f));
+ dst[dp + 2] = (byte) (0x80 | (c & 0x3f));
+ dp += 3;
+ }
+ }
+ }
+ return dp;
+ }
+
/** A fast convert algorithm to convert an utf16 byte array into an utf8
byte array. */
public static int convertUTF16ToUTF8(byte[] src, byte[] dst, int dp) {
int numBytes = src.length;
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@fory.apache.org
For additional commands, e-mail: commits-help@fory.apache.org