This is an automated email from the ASF dual-hosted git repository.
chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fury.git
The following commit(s) were added to refs/heads/main by this push:
new 09abde8a feat(java): Refactor String serialization and deserialization
(#1890)
09abde8a is described below
commit 09abde8afe115f0691062998ad37fef5721ff14b
Author: HuangXingBo <[email protected]>
AuthorDate: Tue Oct 22 00:31:53 2024 +0800
feat(java): Refactor String serialization and deserialization (#1890)
## What does this PR do?
<!-- Describe the purpose of this PR. -->
## Related issues
Closes #1868
Closes #1754
## Does this PR introduce any user-facing change?
<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/fury/issues/new/choose) describing why the
change is needed, and update the documentation if necessary.
-->
- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?
## Benchmark
<!--
When the PR has an impact on performance, be sure to attach benchmark data
here. (If you don't know whether the PR will have an impact on performance,
you can submit the PR first; if it does have an impact, the code reviewer
will explain it.)
-->
---------
Co-authored-by: chaokunyang <[email protected]>
---
.../java/org/apache/fury/config/FuryBuilder.java | 2 +-
.../java/org/apache/fury/memory/MemoryBuffer.java | 2 +-
.../org/apache/fury/serializer/Serializers.java | 2 +-
.../apache/fury/serializer/StringSerializer.java | 577 +++++++++++++++------
.../org/apache/fury/util/StringEncodingUtils.java | 381 ++++++++++++++
.../java/org/apache/fury/util/StringUtils.java | 12 +-
.../org/apache/fury/builder/JITContextTest.java | 6 +
.../fury/serializer/StringSerializerTest.java | 5 +-
.../apache/fury/util/StringEncodingUtilsTest.java | 59 +++
9 files changed, 868 insertions(+), 178 deletions(-)
diff --git
a/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java
b/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java
index c27a6124..7e26d722 100644
--- a/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java
+++ b/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java
@@ -68,7 +68,7 @@ public final class FuryBuilder {
ClassLoader classLoader;
boolean compressInt = true;
public LongEncoding longEncoding = LongEncoding.SLI;
- boolean compressString = true;
+ boolean compressString = false;
CompatibleMode compatibleMode = CompatibleMode.SCHEMA_CONSISTENT;
boolean checkJdkClassSerializable = true;
Class<? extends Serializer> defaultJDKStreamSerializerType =
ObjectStreamSerializer.class;
diff --git
a/java/fury-core/src/main/java/org/apache/fury/memory/MemoryBuffer.java
b/java/fury-core/src/main/java/org/apache/fury/memory/MemoryBuffer.java
index 79d6f2b7..87b56e6e 100644
--- a/java/fury-core/src/main/java/org/apache/fury/memory/MemoryBuffer.java
+++ b/java/fury-core/src/main/java/org/apache/fury/memory/MemoryBuffer.java
@@ -471,7 +471,7 @@ public final class MemoryBuffer {
}
// CHECKSTYLE.OFF:MethodName
- private void _unsafePutInt32(int index, int value) {
+ public void _unsafePutInt32(int index, int value) {
// CHECKSTYLE.ON:MethodName
if (!LITTLE_ENDIAN) {
value = Integer.reverseBytes(value);
diff --git
a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java
b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java
index 36870aa6..61e7574e 100644
--- a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java
+++ b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java
@@ -265,7 +265,7 @@ public class Serializers {
} else {
char[] v = (char[]) GET_VALUE.apply(value);
if (StringUtils.isLatin(v)) {
- stringSerializer.writeCharsLatin(buffer, v, value.length());
+ stringSerializer.writeCharsLatin1(buffer, v, value.length());
} else {
stringSerializer.writeCharsUTF16(buffer, v, value.length());
}
diff --git
a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
index b0b67abc..a1161138 100644
---
a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
+++
b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
@@ -19,8 +19,8 @@
package org.apache.fury.serializer;
-import static org.apache.fury.type.TypeUtils.PRIMITIVE_CHAR_ARRAY_TYPE;
import static org.apache.fury.type.TypeUtils.STRING_TYPE;
+import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_LATIN_MASK;
import java.lang.invoke.CallSite;
import java.lang.invoke.LambdaMetafactory;
@@ -43,6 +43,7 @@ import org.apache.fury.reflect.ReflectionUtils;
import org.apache.fury.type.Type;
import org.apache.fury.util.MathUtils;
import org.apache.fury.util.Preconditions;
+import org.apache.fury.util.StringEncodingUtils;
import org.apache.fury.util.StringUtils;
import org.apache.fury.util.unsafe._JDKAccess;
@@ -149,15 +150,19 @@ public final class StringSerializer extends
ImmutableSerializer<String> {
public Expression writeStringExpr(Expression strSerializer, Expression
buffer, Expression str) {
if (isJava) {
if (STRING_VALUE_FIELD_IS_BYTES) {
- return new StaticInvoke(StringSerializer.class, "writeBytesString",
buffer, str);
+ if (compressString) {
+ return new Invoke(strSerializer, "writeCompressedBytesString",
buffer, str);
+ } else {
+ return new StaticInvoke(StringSerializer.class, "writeBytesString",
buffer, str);
+ }
} else {
if (!STRING_VALUE_FIELD_IS_CHARS) {
throw new UnsupportedOperationException();
}
if (compressString) {
- return new Invoke(strSerializer, "writeCharsStringCompressed",
buffer, str);
+ return new Invoke(strSerializer, "writeCompressedCharsString",
buffer, str);
} else {
- return new Invoke(strSerializer, "writeCharsStringUncompressed",
buffer, str);
+ return new Invoke(strSerializer, "writeCharsString", buffer, str);
}
}
} else {
@@ -165,23 +170,6 @@ public final class StringSerializer extends
ImmutableSerializer<String> {
}
}
- // Invoked by jit
- public void writeCharsStringCompressed(MemoryBuffer buffer, String value) {
- final char[] chars = (char[]) Platform.getObject(value,
STRING_VALUE_FIELD_OFFSET);
- if (StringUtils.isLatin(chars)) {
- writeCharsLatin(buffer, chars, chars.length);
- } else {
- writeCharsUTF16(buffer, chars, chars.length);
- }
- }
-
- // Invoked by jit
- public void writeCharsStringUncompressed(MemoryBuffer buffer, String value) {
- int numBytes = MathUtils.doubleExact(value.length());
- final char[] chars = (char[]) Platform.getObject(value,
STRING_VALUE_FIELD_OFFSET);
- buffer.writePrimitiveArrayWithSize(chars, Platform.CHAR_ARRAY_OFFSET,
numBytes);
- }
-
public String readString(MemoryBuffer buffer) {
if (isJava) {
return readJavaString(buffer);
@@ -201,9 +189,7 @@ public final class StringSerializer extends
ImmutableSerializer<String> {
if (compressString) {
return new Invoke(strSerializer, "readCompressedCharsString",
STRING_TYPE, buffer);
} else {
- Expression chars = new Invoke(buffer, "readCharsAndSize",
PRIMITIVE_CHAR_ARRAY_TYPE);
- return new StaticInvoke(
- StringSerializer.class, "newCharsStringZeroCopy", STRING_TYPE,
chars);
+ return new Invoke(strSerializer, "readCharsString", STRING_TYPE,
buffer);
}
}
} else {
@@ -216,17 +202,7 @@ public final class StringSerializer extends
ImmutableSerializer<String> {
long header = buffer.readVarUint36Small();
byte coder = (byte) (header & 0b11);
int numBytes = (int) (header >>> 2);
- buffer.checkReadableBytes(numBytes);
- byte[] bytes;
- byte[] heapMemory = buffer.getHeapMemory();
- if (heapMemory != null) {
- final int arrIndex = buffer._unsafeHeapReaderIndex();
- buffer.increaseReaderIndex(numBytes);
- bytes = new byte[numBytes];
- System.arraycopy(heapMemory, arrIndex, bytes, 0, numBytes);
- } else {
- bytes = buffer.readBytes(numBytes);
- }
+ byte[] bytes = readBytesUnCompressedUTF16(buffer, numBytes);
if (coder != UTF8) {
return newBytesStringZeroCopy(coder, bytes);
} else {
@@ -235,80 +211,130 @@ public final class StringSerializer extends
ImmutableSerializer<String> {
}
@CodegenInvoke
- public String readCompressedCharsString(MemoryBuffer buffer) {
+ public String readCharsString(MemoryBuffer buffer) {
long header = buffer.readVarUint36Small();
byte coder = (byte) (header & 0b11);
int numBytes = (int) (header >>> 2);
+ char[] chars;
if (coder == LATIN1) {
- return newCharsStringZeroCopy(readLatinChars(buffer, numBytes));
+ chars = readCharsLatin1(buffer, numBytes);
} else if (coder == UTF16) {
- return newCharsStringZeroCopy(readUTF16Chars(buffer, numBytes));
+ chars = readCharsUTF16(buffer, numBytes);
} else {
- return readUtf8(buffer, coder, numBytes);
+ throw new RuntimeException("Unknown coder type " + coder);
}
+ return newCharsStringZeroCopy(chars);
}
- private String readUtf8(MemoryBuffer buffer, byte coder, int numBytes) {
- Preconditions.checkArgument(coder == UTF8, UTF8);
- byte[] bytes = buffer.readBytes(numBytes);
- return new String(bytes, 0, numBytes, StandardCharsets.UTF_8);
+ @CodegenInvoke
+ public String readCompressedBytesString(MemoryBuffer buffer) {
+ long header = buffer.readVarUint36Small();
+ byte coder = (byte) (header & 0b11);
+ int numBytes = (int) (header >>> 2);
+ if (coder == UTF8) {
+ return newBytesStringZeroCopy(UTF16, readBytesUTF8(buffer, numBytes));
+ } else if (coder == LATIN1 || coder == UTF16) {
+ return newBytesStringZeroCopy(coder, readBytesUnCompressedUTF16(buffer,
numBytes));
+ } else {
+ throw new RuntimeException("Unknown coder type " + coder);
+ }
}
- private byte[] getByteArray(int numElements) {
- byte[] byteArray = this.byteArray;
- if (byteArray.length < numElements) {
- byteArray = new byte[numElements];
- this.byteArray = byteArray;
- }
- if (byteArray.length > DEFAULT_BUFFER_SIZE) {
- smoothByteArrayLength =
- Math.max(((int) (smoothByteArrayLength * 0.9 + numElements * 0.1)),
DEFAULT_BUFFER_SIZE);
- if (smoothByteArrayLength <= DEFAULT_BUFFER_SIZE) {
- this.byteArray = new byte[DEFAULT_BUFFER_SIZE];
- }
+ @CodegenInvoke
+ public String readCompressedCharsString(MemoryBuffer buffer) {
+ long header = buffer.readVarUint36Small();
+ byte coder = (byte) (header & 0b11);
+ int numBytes = (int) (header >>> 2);
+ char[] chars;
+ if (coder == LATIN1) {
+ chars = readCharsLatin1(buffer, numBytes);
+ } else if (coder == UTF8) {
+ chars = readCharsUTF8(buffer, numBytes);
+ } else if (coder == UTF16) {
+ chars = readCharsUTF16(buffer, numBytes);
+ } else {
+ throw new RuntimeException("Unknown coder type " + coder);
}
- return byteArray;
+ return newCharsStringZeroCopy(chars);
}
// Invoked by fury JIT
public void writeJavaString(MemoryBuffer buffer, String value) {
if (STRING_VALUE_FIELD_IS_BYTES) {
- writeBytesString(buffer, value);
+ if (compressString) {
+ writeCompressedBytesString(buffer, value);
+ } else {
+ writeBytesString(buffer, value);
+ }
} else {
assert STRING_VALUE_FIELD_IS_CHARS;
- final char[] chars = (char[]) Platform.getObject(value,
STRING_VALUE_FIELD_OFFSET);
if (compressString) {
- if (StringUtils.isLatin(chars)) {
- writeCharsLatin(buffer, chars, chars.length);
- } else {
- writeCharsUTF16(buffer, chars, chars.length);
- }
+ writeCompressedCharsString(buffer, value);
} else {
- int numBytes = MathUtils.doubleExact(value.length());
- buffer.writePrimitiveArrayWithSize(chars, Platform.CHAR_ARRAY_OFFSET,
numBytes);
+ writeCharsString(buffer, value);
}
}
}
+ @CodegenInvoke
+ public void writeUTF8String(MemoryBuffer buffer, String value) {
+ byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
+ buffer.writeVarUint32(bytes.length);
+ buffer.writeBytes(bytes);
+ }
+
// Invoked by fury JIT
public String readJavaString(MemoryBuffer buffer) {
if (STRING_VALUE_FIELD_IS_BYTES) {
- return readBytesString(buffer);
+ if (compressString) {
+ return readCompressedBytesString(buffer);
+ } else {
+ return readBytesString(buffer);
+ }
} else {
assert STRING_VALUE_FIELD_IS_CHARS;
if (compressString) {
return readCompressedCharsString(buffer);
} else {
- return newCharsStringZeroCopy(buffer.readCharsAndSize());
+ return readCharsString(buffer);
}
}
}
+ @CodegenInvoke
+ public void writeCompressedBytesString(MemoryBuffer buffer, String value) {
+ final byte[] bytes = (byte[]) Platform.getObject(value,
STRING_VALUE_FIELD_OFFSET);
+ final byte coder = Platform.getByte(value,
Offset.STRING_CODER_FIELD_OFFSET);
+ if (coder == LATIN1 || bestCoder(bytes) == UTF16) {
+ writeBytesString(buffer, coder, bytes);
+ } else {
+ writeBytesUTF8(buffer, bytes);
+ }
+ }
+
+ @CodegenInvoke
+ public void writeCompressedCharsString(MemoryBuffer buffer, String value) {
+ final char[] chars = (char[]) Platform.getObject(value,
STRING_VALUE_FIELD_OFFSET);
+ final byte coder = bestCoder(chars);
+ if (coder == LATIN1) {
+ writeCharsLatin1(buffer, chars, chars.length);
+ } else if (coder == UTF8) {
+ writeCharsUTF8(buffer, chars);
+ } else {
+ writeCharsUTF16(buffer, chars, chars.length);
+ }
+ }
+
+ @CodegenInvoke
public static void writeBytesString(MemoryBuffer buffer, String value) {
byte[] bytes = (byte[]) Platform.getObject(value,
STRING_VALUE_FIELD_OFFSET);
+ byte coder = Platform.getByte(value, Offset.STRING_CODER_FIELD_OFFSET);
+ writeBytesString(buffer, coder, bytes);
+ }
+
+ public static void writeBytesString(MemoryBuffer buffer, byte coder, byte[]
bytes) {
int bytesLen = bytes.length;
- long header =
- ((long) bytesLen << 2) | Platform.getByte(value,
Offset.STRING_CODER_FIELD_OFFSET);
+ long header = ((long) bytesLen << 2) | coder;
int writerIndex = buffer.writerIndex();
// The `ensure` ensure next operations are safe without bound checks,
// and inner heap buffer doesn't change.
@@ -332,112 +358,94 @@ public final class StringSerializer extends
ImmutableSerializer<String> {
buffer._unsafeWriterIndex(writerIndex);
}
- public void writeCharsLatin(MemoryBuffer buffer, char[] chars, final int
strLen) {
- int writerIndex = buffer.writerIndex();
- // The `ensure` ensure next operations are safe without bound checks,
- // and inner heap buffer doesn't change.
- buffer.ensure(writerIndex + 9 + strLen);
- long header = ((long) strLen << 2) | LATIN1;
- final byte[] targetArray = buffer.getHeapMemory();
- if (targetArray != null) {
- int arrIndex = buffer._unsafeHeapWriterIndex();
- int written = LittleEndian.putVarUint36Small(targetArray, arrIndex,
header);
- arrIndex += written;
- writerIndex += written + strLen;
- for (int i = 0; i < strLen; i++) {
- targetArray[arrIndex + i] = (byte) chars[i];
- }
- buffer._unsafeWriterIndex(writerIndex);
+ @CodegenInvoke
+ public void writeCharsString(MemoryBuffer buffer, String value) {
+ final char[] chars = (char[]) Platform.getObject(value,
STRING_VALUE_FIELD_OFFSET);
+ if (StringUtils.isLatin(chars)) {
+ writeCharsLatin1(buffer, chars, chars.length);
} else {
- writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
- final byte[] tmpArray = getByteArray(strLen);
- // Write to heap memory then copy is 60% faster than unsafe write to
direct memory.
- for (int i = 0; i < strLen; i++) {
- tmpArray[i] = (byte) chars[i];
- }
- buffer.put(writerIndex, tmpArray, 0, strLen);
- writerIndex += strLen;
- buffer._unsafeWriterIndex(writerIndex);
+ writeCharsUTF16(buffer, chars, chars.length);
}
}
- public void writeCharsUTF16(MemoryBuffer buffer, char[] chars, int strLen) {
- int numBytes = MathUtils.doubleExact(strLen);
- long header = ((long) numBytes << 2) | UTF16;
- // The `ensure` ensure next operations are safe without bound checks,
- // and inner heap buffer doesn't change.
- int writerIndex = buffer.writerIndex();
- buffer.ensure(writerIndex + 9 + numBytes);
- byte[] targetArray = buffer.getHeapMemory();
+ @CodegenInvoke
+ public String readUTF8String(MemoryBuffer buffer) {
+ int numBytes = buffer.readVarUint32Small14();
+ buffer.checkReadableBytes(numBytes);
+ final byte[] targetArray = buffer.getHeapMemory();
if (targetArray != null) {
- int arrIndex = buffer._unsafeHeapWriterIndex();
- int written = LittleEndian.putVarUint36Small(targetArray, arrIndex,
header);
- arrIndex += written;
- writerIndex += written + numBytes;
- if (Platform.IS_LITTLE_ENDIAN) {
- // FIXME JDK11 utf16 string uses little-endian order.
- Platform.UNSAFE.copyMemory(
- chars,
- Platform.CHAR_ARRAY_OFFSET,
- targetArray,
- Platform.BYTE_ARRAY_OFFSET + arrIndex,
- numBytes);
- } else {
- heapWriteCharsUTF16BE(chars, arrIndex, numBytes, targetArray);
- }
+ String str =
+ new String(
+ targetArray, buffer._unsafeHeapReaderIndex(), numBytes,
StandardCharsets.UTF_8);
+ buffer.increaseReaderIndex(numBytes);
+ return str;
} else {
- writerIndex = offHeapWriteCharsUTF16(buffer, chars, writerIndex, header,
numBytes);
- }
- buffer._unsafeWriterIndex(writerIndex);
- }
-
- private static void heapWriteCharsUTF16BE(
- char[] chars, int arrIndex, int numBytes, byte[] targetArray) {
- // Write to heap memory then copy is 250% faster than unsafe write to
direct memory.
- int charIndex = 0;
- for (int i = arrIndex, end = i + numBytes; i < end; i += 2) {
- char c = chars[charIndex++];
- targetArray[i] = (byte) (c >> StringUTF16.HI_BYTE_SHIFT);
- targetArray[i + 1] = (byte) (c >> StringUTF16.LO_BYTE_SHIFT);
- }
- }
-
- private int offHeapWriteCharsUTF16(
- MemoryBuffer buffer, char[] chars, int writerIndex, long header, int
numBytes) {
- writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
- byte[] tmpArray = getByteArray(numBytes);
- int charIndex = 0;
- for (int i = 0; i < numBytes; i += 2) {
- char c = chars[charIndex++];
- tmpArray[i] = (byte) (c >> StringUTF16.HI_BYTE_SHIFT);
- tmpArray[i + 1] = (byte) (c >> StringUTF16.LO_BYTE_SHIFT);
+ final byte[] tmpArray = getByteArray(numBytes);
+ buffer.readBytes(tmpArray, 0, numBytes);
+ return new String(tmpArray, 0, numBytes, StandardCharsets.UTF_8);
}
- buffer.put(writerIndex, tmpArray, 0, numBytes);
- writerIndex += numBytes;
- return writerIndex;
}
- private char[] readLatinChars(MemoryBuffer buffer, int numBytes) {
- char[] chars = new char[numBytes];
+ public char[] readCharsLatin1(MemoryBuffer buffer, int numBytes) {
+ // int utf8AsciiBytes = buffer.readInt32();
buffer.checkReadableBytes(numBytes);
- byte[] targetArray = buffer.getHeapMemory();
- if (targetArray != null) {
+ byte[] srcArray = buffer.getHeapMemory();
+ char[] chars = new char[numBytes];
+ if (srcArray != null) {
int srcIndex = buffer._unsafeHeapReaderIndex();
for (int i = 0; i < numBytes; i++) {
- chars[i] = (char) (targetArray[srcIndex++] & 0xff);
+ chars[i] = (char) (srcArray[srcIndex++] & 0xff);
}
buffer._increaseReaderIndexUnsafe(numBytes);
} else {
- byte[] byteArray = getByteArray(numBytes);
- buffer.readBytes(byteArray, 0, numBytes);
+ byte[] tmpArray = getByteArray(numBytes);
+ buffer.readBytes(tmpArray, 0, numBytes);
for (int i = 0; i < numBytes; i++) {
- chars[i] = (char) (byteArray[i] & 0xff);
+ chars[i] = (char) (tmpArray[i] & 0xff);
}
}
return chars;
}
- private char[] readUTF16Chars(MemoryBuffer buffer, int numBytes) {
+ public byte[] readBytesUTF8(MemoryBuffer buffer, int numBytes) {
+ int udf8Bytes = buffer.readInt32();
+ byte[] bytes = new byte[numBytes];
+ buffer.checkReadableBytes(udf8Bytes);
+ byte[] srcArray = buffer.getHeapMemory();
+ if (srcArray != null) {
+ int srcIndex = buffer._unsafeHeapReaderIndex();
+ int readLen = StringEncodingUtils.convertUTF8ToUTF16(srcArray, srcIndex,
udf8Bytes, bytes);
+ if (readLen != numBytes) {
+ throw new RuntimeException("Decode UTF8 to UTF16 failed");
+ }
+ buffer._increaseReaderIndexUnsafe(udf8Bytes);
+ } else {
+ byte[] tmpArray = getByteArray(udf8Bytes);
+ buffer.readBytes(tmpArray, 0, udf8Bytes);
+ int readLen = StringEncodingUtils.convertUTF8ToUTF16(tmpArray, 0,
udf8Bytes, bytes);
+ if (readLen != numBytes) {
+ throw new RuntimeException("Decode UTF8 to UTF16 failed");
+ }
+ }
+ return bytes;
+ }
+
+ public byte[] readBytesUnCompressedUTF16(MemoryBuffer buffer, int numBytes) {
+ buffer.checkReadableBytes(numBytes);
+ byte[] bytes;
+ byte[] heapMemory = buffer.getHeapMemory();
+ if (heapMemory != null) {
+ final int arrIndex = buffer._unsafeHeapReaderIndex();
+ buffer.increaseReaderIndex(numBytes);
+ bytes = new byte[numBytes];
+ System.arraycopy(heapMemory, arrIndex, bytes, 0, numBytes);
+ } else {
+ bytes = buffer.readBytes(numBytes);
+ }
+ return bytes;
+ }
+
+ public char[] readCharsUTF16(MemoryBuffer buffer, int numBytes) {
char[] chars = new char[numBytes >> 1];
if (Platform.IS_LITTLE_ENDIAN) {
// FIXME JDK11 utf16 string uses little-endian order.
@@ -471,6 +479,138 @@ public final class StringSerializer extends
ImmutableSerializer<String> {
return chars;
}
+ public char[] readCharsUTF8(MemoryBuffer buffer, int numBytes) {
+ int udf16Chars = numBytes >> 1;
+ int udf8Bytes = buffer.readInt32();
+ char[] chars = new char[udf16Chars];
+ buffer.checkReadableBytes(udf8Bytes);
+ byte[] srcArray = buffer.getHeapMemory();
+ if (srcArray != null) {
+ int srcIndex = buffer._unsafeHeapReaderIndex();
+ int readLen = StringEncodingUtils.convertUTF8ToUTF16(srcArray, srcIndex,
udf8Bytes, chars);
+ if (readLen != udf16Chars) {
+ throw new RuntimeException("Decode UTF8 to UTF16 failed");
+ }
+ buffer._increaseReaderIndexUnsafe(udf8Bytes);
+ } else {
+ byte[] tmpArray = getByteArray(udf8Bytes);
+ buffer.readBytes(tmpArray, 0, udf8Bytes);
+ int readLen = StringEncodingUtils.convertUTF8ToUTF16(tmpArray, 0,
udf8Bytes, chars);
+ if (readLen != udf16Chars) {
+ throw new RuntimeException("Decode UTF8 to UTF16 failed");
+ }
+ }
+ return chars;
+ }
+
+ public void writeCharsLatin1(MemoryBuffer buffer, char[] chars, int
numBytes) {
+ int writerIndex = buffer.writerIndex();
+ long header = ((long) numBytes << 2) | LATIN1;
+ buffer.ensure(writerIndex + 5 + numBytes);
+ byte[] targetArray = buffer.getHeapMemory();
+ if (targetArray != null) {
+ final int targetIndex = buffer._unsafeHeapWriterIndex();
+ int arrIndex = targetIndex;
+ arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex,
header);
+ writerIndex += arrIndex - targetIndex;
+ for (int i = 0; i < numBytes; i++) {
+ targetArray[arrIndex + i] = (byte) chars[i];
+ }
+ } else {
+ writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+ final byte[] tmpArray = getByteArray(numBytes);
+ for (int i = 0; i < numBytes; i++) {
+ tmpArray[i] = (byte) chars[i];
+ }
+ buffer.put(writerIndex, tmpArray, 0, numBytes);
+ }
+ writerIndex += numBytes;
+ buffer._unsafeWriterIndex(writerIndex);
+ }
+
+ public void writeCharsUTF16(MemoryBuffer buffer, char[] chars, int numChars)
{
+ int numBytes = MathUtils.doubleExact(numChars);
+ int writerIndex = buffer.writerIndex();
+ long header = ((long) numBytes << 2) | UTF16;
+ buffer.ensure(writerIndex + 5 + numBytes);
+ final byte[] targetArray = buffer.getHeapMemory();
+ if (targetArray != null) {
+ final int targetIndex = buffer._unsafeHeapWriterIndex();
+ int arrIndex = targetIndex;
+ arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex,
header);
+ writerIndex += arrIndex - targetIndex + numBytes;
+ if (Platform.IS_LITTLE_ENDIAN) {
+ // FIXME JDK11 utf16 string uses little-endian order.
+ Platform.UNSAFE.copyMemory(
+ chars,
+ Platform.CHAR_ARRAY_OFFSET,
+ targetArray,
+ Platform.BYTE_ARRAY_OFFSET + arrIndex,
+ numBytes);
+ } else {
+ heapWriteCharsUTF16BE(chars, arrIndex, numBytes, targetArray);
+ }
+ } else {
+ writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+ writerIndex = offHeapWriteCharsUTF16(buffer, chars, writerIndex,
numBytes);
+ }
+ buffer._unsafeWriterIndex(writerIndex);
+ }
+
+ public void writeCharsUTF8(MemoryBuffer buffer, char[] chars) {
+ int estimateMaxBytes = chars.length * 3;
+ int numBytes = MathUtils.doubleExact(chars.length);
+ int writerIndex = buffer.writerIndex();
+ long header = ((long) numBytes << 2) | UTF8;
+ buffer.ensure(writerIndex + 9 + estimateMaxBytes);
+ byte[] targetArray = buffer.getHeapMemory();
+ if (targetArray != null) {
+ int targetIndex = buffer._unsafeHeapWriterIndex();
+ int arrIndex = targetIndex;
+ arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex,
header);
+ writerIndex += arrIndex - targetIndex;
+ targetIndex = StringEncodingUtils.convertUTF16ToUTF8(chars, targetArray,
arrIndex + 4);
+ int written = targetIndex - arrIndex - 4;
+ buffer._unsafePutInt32(writerIndex, written);
+ buffer._unsafeWriterIndex(writerIndex + 4 + written);
+ } else {
+ final byte[] tmpArray = getByteArray(estimateMaxBytes);
+ int written = StringEncodingUtils.convertUTF16ToUTF8(chars, tmpArray, 0);
+ writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+ buffer._unsafePutInt32(writerIndex, written);
+ writerIndex += 4;
+ buffer.put(writerIndex, tmpArray, 0, written);
+ buffer._unsafeWriterIndex(writerIndex + written);
+ }
+ }
+
+ public void writeBytesUTF8(MemoryBuffer buffer, byte[] bytes) {
+ int numBytes = bytes.length;
+ int estimateMaxBytes = bytes.length / 2 * 3;
+ int writerIndex = buffer.writerIndex();
+ long header = ((long) numBytes << 2) | UTF8;
+ buffer.ensure(writerIndex + 9 + estimateMaxBytes);
+ byte[] targetArray = buffer.getHeapMemory();
+ if (targetArray != null) {
+ int targetIndex = buffer._unsafeHeapWriterIndex();
+ int arrIndex = targetIndex;
+ arrIndex += LittleEndian.putVarUint36Small(targetArray, arrIndex,
header);
+ writerIndex += arrIndex - targetIndex;
+ targetIndex = StringEncodingUtils.convertUTF16ToUTF8(bytes, targetArray,
arrIndex + 4);
+ int written = targetIndex - arrIndex - 4;
+ buffer._unsafePutInt32(writerIndex, written);
+ buffer._unsafeWriterIndex(writerIndex + 4 + written);
+ } else {
+ final byte[] tmpArray = getByteArray(estimateMaxBytes);
+ int written = StringEncodingUtils.convertUTF16ToUTF8(bytes, tmpArray, 0);
+ writerIndex += buffer._unsafePutVarUint36Small(writerIndex, header);
+ buffer._unsafePutInt32(writerIndex, written);
+ writerIndex += 4;
+ buffer.put(writerIndex, tmpArray, 0, written);
+ buffer._unsafeWriterIndex(writerIndex + written);
+ }
+ }
+
private static final MethodHandles.Lookup STRING_LOOK_UP =
_JDKAccess._trustedLookup(String.class);
private static final BiFunction<char[], Boolean, String>
CHARS_STRING_ZERO_COPY_CTR =
@@ -603,26 +743,121 @@ public final class StringSerializer extends
ImmutableSerializer<String> {
}
}
- public void writeUTF8String(MemoryBuffer buffer, String value) {
- byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
- buffer.writeVarUint32(bytes.length);
- buffer.writeBytes(bytes);
+ private static void heapWriteCharsUTF16BE(
+ char[] chars, int arrIndex, int numBytes, byte[] targetArray) {
+ // Write to heap memory then copy is 250% faster than unsafe write to
direct memory.
+ int charIndex = 0;
+ for (int i = arrIndex, end = i + numBytes; i < end; i += 2) {
+ char c = chars[charIndex++];
+ targetArray[i] = (byte) (c >> StringUTF16.HI_BYTE_SHIFT);
+ targetArray[i + 1] = (byte) (c >> StringUTF16.LO_BYTE_SHIFT);
+ }
}
- public String readUTF8String(MemoryBuffer buffer) {
- int numBytes = buffer.readVarUint32Small14();
- buffer.checkReadableBytes(numBytes);
- final byte[] targetArray = buffer.getHeapMemory();
- if (targetArray != null) {
- String str =
- new String(
- targetArray, buffer._unsafeHeapReaderIndex(), numBytes,
StandardCharsets.UTF_8);
- buffer.increaseReaderIndex(numBytes);
- return str;
+ private int offHeapWriteCharsUTF16(
+ MemoryBuffer buffer, char[] chars, int writerIndex, int numBytes) {
+ byte[] tmpArray = getByteArray(numBytes);
+ int charIndex = 0;
+ for (int i = 0; i < numBytes; i += 2) {
+ char c = chars[charIndex++];
+ tmpArray[i] = (byte) (c >> StringUTF16.HI_BYTE_SHIFT);
+ tmpArray[i + 1] = (byte) (c >> StringUTF16.LO_BYTE_SHIFT);
+ }
+ buffer.put(writerIndex, tmpArray, 0, numBytes);
+ writerIndex += numBytes;
+ return writerIndex;
+ }
+
+ private static byte bestCoder(char[] chars) {
+ int numChars = chars.length;
+ // sample 64 chars
+ int sampleNum = Math.min(64, numChars);
+ int vectorizedLen = sampleNum >> 2;
+ int vectorizedChars = vectorizedLen << 2;
+ int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1);
+ int count = 0;
+ for (int offset = Platform.CHAR_ARRAY_OFFSET, charOffset = 0;
+ offset < endOffset;
+ offset += 8, charOffset += 4) {
+ long multiChars = Platform.getLong(chars, offset);
+ if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) {
+ count += 4;
+ } else {
+ for (int i = 0; i < 4; ++i) {
+ if (chars[charOffset + i] < 0x80) {
+ count++;
+ }
+ }
+ }
+ }
+
+ for (int i = vectorizedChars; i < sampleNum; i++) {
+ if (chars[i] < 0x80) {
+ count++;
+ }
+ }
+
+ // ascii number > 50%, choose UTF-8
+ if (count >= sampleNum * 0.5) {
+ if (count == numChars || (count == sampleNum &&
StringUtils.isLatin(chars, sampleNum))) {
+ return LATIN1;
+ }
+ return UTF8;
} else {
- final byte[] tmpArray = getByteArray(numBytes);
- buffer.readBytes(tmpArray, 0, numBytes);
- return new String(tmpArray, 0, numBytes, StandardCharsets.UTF_8);
+ return UTF16;
+ }
+ }
+
+ private static byte bestCoder(byte[] bytes) {
+ int numBytes = bytes.length;
+ // sample 64 chars
+ int sampleNum = Math.min(64 << 1, numBytes);
+ int vectorizedLen = sampleNum >> 3;
+ int vectorizedBytes = vectorizedLen << 3;
+ int endOffset = Platform.BYTE_ARRAY_OFFSET + vectorizedBytes;
+ int count = 0;
+ for (int offset = Platform.BYTE_ARRAY_OFFSET, bytesOffset = 0;
+ offset < endOffset;
+ offset += 8, bytesOffset += 8) {
+ long multiChars = Platform.getLong(bytes, offset);
+ if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) {
+ count += 4;
+ } else {
+ for (int i = Platform.IS_LITTLE_ENDIAN ? 1 : 0; i < 8; i += 2) {
+ if (bytes[bytesOffset + i] == 0) {
+ count++;
+ }
+ }
+ }
}
+ for (int i = Platform.IS_LITTLE_ENDIAN ? vectorizedBytes + 1 :
vectorizedBytes;
+ i < sampleNum;
+ ++i) {
+ if (bytes[i] == 0) {
+ count++;
+ }
+ }
+ // ascii number > 50%, choose UTF-8
+ if (count >= sampleNum * 0.5) {
+ return UTF8;
+ } else {
+ return UTF16;
+ }
+ }
+
+ private byte[] getByteArray(int numElements) {
+ byte[] byteArray = this.byteArray;
+ if (byteArray.length < numElements) {
+ byteArray = new byte[numElements];
+ this.byteArray = byteArray;
+ }
+ if (byteArray.length > DEFAULT_BUFFER_SIZE) {
+ smoothByteArrayLength =
+ Math.max(((int) (smoothByteArrayLength * 0.9 + numElements * 0.1)),
DEFAULT_BUFFER_SIZE);
+ if (smoothByteArrayLength <= DEFAULT_BUFFER_SIZE) {
+ this.byteArray = new byte[DEFAULT_BUFFER_SIZE];
+ }
+ }
+ return byteArray;
}
}
diff --git
a/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java
b/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java
new file mode 100644
index 00000000..d90b5412
--- /dev/null
+++ b/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java
@@ -0,0 +1,381 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.fury.util;
+
+import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_LATIN_MASK;
+
+import org.apache.fury.memory.Platform;
+
+/** String Encoding Utils. */
+public class StringEncodingUtils {
+
+ /**
+  * Encodes a UTF-16 char array as UTF-8 into {@code dst}, starting at index {@code dp}.
+  *
+  * <p>Runs of four ASCII chars are detected with one 8-byte read and copied byte for byte; all
+  * other chars go through the scalar 1/2/3/4-byte encoder. No bounds checks are performed on
+  * {@code dst}; it must be large enough for the encoded output.
+  *
+  * @param src UTF-16 code units to encode
+  * @param dst destination for the UTF-8 bytes
+  * @param dp index in {@code dst} at which writing starts
+  * @return the index in {@code dst} just past the last byte written
+  * @throws RuntimeException if {@code src} contains an unpaired surrogate
+  */
+ public static int convertUTF16ToUTF8(char[] src, byte[] dst, int dp) {
+   // Selects bits 7..15 of each 16-bit lane: the masked word is zero only when all four chars are
+   // below 0x80. A Latin-1 test is not sufficient here, because U+0080..U+00FF need two UTF-8
+   // bytes and must not be copied as a single byte. The mask is identical in every lane, so it
+   // does not depend on the order in which the four chars land in the long.
+   final long nonAsciiMask = 0xFF80FF80FF80FF80L;
+   int numChars = src.length;
+   for (int charOffset = 0; charOffset < numChars; ) {
+     if (charOffset + 4 <= numChars
+         && (Platform.getLong(src, Platform.CHAR_ARRAY_OFFSET + charOffset * 2L) & nonAsciiMask)
+             == 0) {
+       // ascii only
+       dst[dp] = (byte) src[charOffset];
+       dst[dp + 1] = (byte) src[charOffset + 1];
+       dst[dp + 2] = (byte) src[charOffset + 2];
+       dst[dp + 3] = (byte) src[charOffset + 3];
+       dp += 4;
+       charOffset += 4;
+     } else {
+       char c = src[charOffset++];
+       if (c < 0x80) {
+         // 1 byte, 7 bits
+         dst[dp++] = (byte) c;
+       } else if (c < 0x800) {
+         // 2 bytes, 11 bits
+         dst[dp] = (byte) (0xc0 | (c >> 6));
+         dst[dp + 1] = (byte) (0x80 | (c & 0x3f));
+         dp += 2;
+       } else if (c >= '\uD800' && c <= Character.MAX_LOW_SURROGATE) {
+         // Surrogate: combine with the following char into one 4-byte sequence.
+         utf8ToChar2(src, charOffset, c, dst, dp);
+         dp += 4;
+         charOffset++;
+       } else {
+         // 3 bytes, 16 bits
+         dst[dp] = (byte) (0xe0 | ((c >> 12)));
+         dst[dp + 1] = (byte) (0x80 | ((c >> 6) & 0x3f));
+         dst[dp + 2] = (byte) (0x80 | (c & 0x3f));
+         dp += 3;
+       }
+     }
+   }
+   return dp;
+ }
+
+ /**
+  * Encodes UTF-16 code units stored in a byte array as UTF-8 into {@code dst}, starting at index
+  * {@code dp}.
+  *
+  * <p>Code units are read two bytes at a time with {@code Platform.getChar}. Runs of four ASCII
+  * chars are detected with one 8-byte read and copied directly. No bounds checks are performed
+  * on {@code dst}; it must be large enough for the encoded output.
+  *
+  * @param src UTF-16 code units, two bytes per char
+  * @param dst destination for the UTF-8 bytes
+  * @param dp index in {@code dst} at which writing starts
+  * @return the index in {@code dst} just past the last byte written
+  * @throws RuntimeException if {@code src} contains an unpaired surrogate
+  */
+ public static int convertUTF16ToUTF8(byte[] src, byte[] dst, int dp) {
+   // Selects bits 7..15 of each 16-bit lane: the masked word is zero only when all four chars are
+   // below 0x80. A Latin-1 test is not sufficient here, because U+0080..U+00FF need two UTF-8
+   // bytes and must not be copied as a single byte.
+   final long nonAsciiMask = 0xFF80FF80FF80FF80L;
+   int numBytes = src.length;
+   for (int offset = 0; offset < numBytes; ) {
+     if (offset + 8 <= numBytes
+         && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset) & nonAsciiMask) == 0) {
+       // ascii only: copy the low byte of each of the four chars
+       if (Platform.IS_LITTLE_ENDIAN) {
+         dst[dp] = src[offset];
+         dst[dp + 1] = src[offset + 2];
+         dst[dp + 2] = src[offset + 4];
+         dst[dp + 3] = src[offset + 6];
+       } else {
+         dst[dp] = src[offset + 1];
+         dst[dp + 1] = src[offset + 3];
+         dst[dp + 2] = src[offset + 5];
+         dst[dp + 3] = src[offset + 7];
+       }
+       dp += 4;
+       offset += 8;
+     } else {
+       char c = Platform.getChar(src, Platform.BYTE_ARRAY_OFFSET + offset);
+       offset += 2;
+
+       if (c < 0x80) {
+         // 1 byte, 7 bits
+         dst[dp++] = (byte) c;
+       } else if (c < 0x800) {
+         // 2 bytes, 11 bits
+         dst[dp] = (byte) (0xc0 | (c >> 6));
+         dst[dp + 1] = (byte) (0x80 | (c & 0x3f));
+         dp += 2;
+       } else if (c >= '\uD800' && c <= Character.MAX_LOW_SURROGATE) {
+         // Surrogate: combine with the following char into one 4-byte sequence.
+         utf8ToChar2(src, offset, c, numBytes, dst, dp);
+         dp += 4;
+         offset += 2;
+       } else {
+         // 3 bytes, 16 bits
+         dst[dp] = (byte) (0xe0 | ((c >> 12)));
+         dst[dp + 1] = (byte) (0x80 | ((c >> 6) & 0x3f));
+         dst[dp + 2] = (byte) (0x80 | (c & 0x3f));
+         dp += 3;
+       }
+     }
+   }
+   return dp;
+ }
+
+ /**
+ * A fast convert algorithm to convert an utf8 encoded byte array into an
utf16 encoded byte
+ * array.
+ */
+ public static int convertUTF8ToUTF16(byte[] src, int offset, int len, byte[]
dst) {
+ final int end = offset + len;
+ int dp = 0;
+
+ while (offset < end) {
+ if (offset + 8 <= end
+ && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset) &
0x8080808080808080L)
+ == 0) {
+ // ascii only
+ if (Platform.IS_LITTLE_ENDIAN) {
+ dst[dp] = src[offset];
+ dst[dp + 2] = src[offset + 1];
+ dst[dp + 4] = src[offset + 2];
+ dst[dp + 6] = src[offset + 3];
+ dst[dp + 8] = src[offset + 4];
+ dst[dp + 10] = src[offset + 5];
+ dst[dp + 12] = src[offset + 6];
+ dst[dp + 14] = src[offset + 7];
+ } else {
+ dst[dp + 1] = src[offset];
+ dst[dp + 3] = src[offset + 1];
+ dst[dp + 5] = src[offset + 2];
+ dst[dp + 7] = src[offset + 3];
+ dst[dp + 9] = src[offset + 4];
+ dst[dp + 11] = src[offset + 5];
+ dst[dp + 13] = src[offset + 6];
+ dst[dp + 15] = src[offset + 7];
+ }
+ dp += 16;
+ offset += 8;
+ } else {
+ int b0 = src[offset++];
+ if (b0 >= 0) {
+ // 1 byte, 7 bits: 0xxxxxxx
+ dst[dp] = (byte) b0;
+ dst[dp + 1] = 0;
+ dp += 2;
+ } else if ((b0 >> 5) == -2 && (b0 & 0x1e) != 0) {
+ // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
+ if (offset >= end) {
+ return -1;
+ }
+ int b1 = src[offset++];
+ if ((b1 & 0xc0) != 0x80) { // isNotContinuation(b2)
+ return -1;
+ } else {
+ char c = (char) (((b0 << 6) ^ b1) ^ (((byte) 0xC0 << 6) ^ ((byte)
0x80)));
+ dst[dp] = (byte) c;
+ dst[dp + 1] = (byte) (c >> 8);
+ dp += 2;
+ }
+ } else if ((b0 >> 4) == -2) {
+ // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
+ if (offset + 1 >= end) {
+ return -1;
+ }
+ int b1 = src[offset];
+ int b2 = src[offset + 1];
+ offset += 2;
+ if ((b0 == (byte) 0xe0 && (b1 & 0xe0) == 0x80) //
+ || (b1 & 0xc0) != 0x80 //
+ || (b2 & 0xc0) != 0x80) { // isMalformed3(b0, b1, b2)
+ return -1;
+ } else {
+ char c =
+ (char)
+ ((b0 << 12)
+ ^ (b1 << 6)
+ ^ (b2 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^
((byte) 0x80))));
+ boolean isSurrogate = c >= '\uD800' && c <
(Character.MAX_LOW_SURROGATE + 1);
+ if (isSurrogate) {
+ return -1;
+ } else {
+ dst[dp] = (byte) c;
+ dst[dp + 1] = (byte) (c >> 8);
+ dp += 2;
+ }
+ }
+ } else if ((b0 >> 3) == -2) {
+ // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ if (offset + 2 >= end) {
+ return -1;
+ }
+ int b2 = src[offset];
+ int b3 = src[offset + 1];
+ int b4 = src[offset + 2];
+ offset += 3;
+ int uc =
+ ((b0 << 18)
+ ^ (b2 << 12)
+ ^ (b3 << 6)
+ ^ (b4
+ ^ (((byte) 0xF0 << 18)
+ ^ ((byte) 0x80 << 12)
+ ^ ((byte) 0x80 << 6)
+ ^ ((byte) 0x80))));
+ if (((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) !=
0x80) // isMalformed4
+ ||
+ // shortest form check
+ !(uc >= 0x010000 && uc < 0X10FFFF + 1) //
!Character.isSupplementaryCodePoint(uc)
+ ) {
+ return -1;
+ } else {
+ char c = (char) ((uc >>> 10) + ('\uD800' - (0x010000 >>> 10)));
+ dst[dp] = (byte) c;
+ dst[dp + 1] = (byte) (c >> 8);
+ dp += 2;
+
+ c = (char) ((uc & 0x3ff) + Character.MIN_LOW_SURROGATE);
+ dst[dp] = (byte) c;
+ dst[dp + 1] = (byte) (c >> 8);
+ dp += 2;
+ }
+ } else {
+ return -1;
+ }
+ }
+ }
+ return dp;
+ }
+
+ /**
+ * A fast convert algorithm to convert an utf8 encoded byte array into utf16
encoded char array.
+ */
+ public static int convertUTF8ToUTF16(byte[] src, int offset, int len, char[]
dst) {
+ int end = offset + len;
+ int dp = 0;
+ while (offset < end) {
+ if (offset + 8 <= end
+ && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset) &
0x8080808080808080L)
+ == 0) {
+ // ascii only
+ dst[dp] = (char) src[offset];
+ dst[dp + 1] = (char) src[offset + 1];
+ dst[dp + 2] = (char) src[offset + 2];
+ dst[dp + 3] = (char) src[offset + 3];
+ dst[dp + 4] = (char) src[offset + 4];
+ dst[dp + 5] = (char) src[offset + 5];
+ dst[dp + 6] = (char) src[offset + 6];
+ dst[dp + 7] = (char) src[offset + 7];
+ dp += 8;
+ offset += 8;
+ } else {
+ int b1 = src[offset++];
+ if (b1 >= 0) {
+ // 1 byte, 7 bits: 0xxxxxxx
+ dst[dp++] = (char) b1;
+ } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
+ // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
+ if (offset >= end) {
+ return -1;
+ }
+ int b2 = src[offset++];
+ if ((b2 & 0xc0) != 0x80) { // isNotContinuation(b2)
+ return -1;
+ } else {
+ dst[dp++] = (char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^
((byte) 0x80)));
+ }
+ } else if ((b1 >> 4) == -2) {
+ // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
+ if (offset + 1 >= end) {
+ return -1;
+ }
+
+ int b2 = src[offset];
+ int b3 = src[offset + 1];
+ offset += 2;
+ if ((b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80) //
+ || (b2 & 0xc0) != 0x80 //
+ || (b3 & 0xc0) != 0x80) { // isMalformed3(b1, b2, b3)
+ return -1;
+ } else {
+ char c =
+ (char)
+ ((b1 << 12)
+ ^ (b2 << 6)
+ ^ (b3 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^
((byte) 0x80))));
+ boolean isSurrogate = c >= '\uD800' && c <
(Character.MAX_LOW_SURROGATE + 1);
+ if (isSurrogate) {
+ return -1;
+ } else {
+ dst[dp++] = c;
+ }
+ }
+ } else if ((b1 >> 3) == -2) {
+ // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ if (offset + 2 >= end) {
+ return -1;
+ }
+ int b2 = src[offset];
+ int b3 = src[offset + 1];
+ int b4 = src[offset + 2];
+ offset += 3;
+ int uc =
+ ((b1 << 18)
+ ^ (b2 << 12)
+ ^ (b3 << 6)
+ ^ (b4
+ ^ (((byte) 0xF0 << 18)
+ ^ ((byte) 0x80 << 12)
+ ^ ((byte) 0x80 << 6)
+ ^ ((byte) 0x80))));
+ if (((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) !=
0x80) // isMalformed4
+ ||
+ // shortest form check
+ !(uc >= 0x010000 && uc < 0X10FFFF + 1) //
!Character.isSupplementaryCodePoint(uc)
+ ) {
+ return -1;
+ } else {
+ dst[dp] =
+ (char)
+ ((uc >>> 10) + ('\uD800' - (0x010000 >>> 10))); //
Character.highSurrogate(uc);
+ dst[dp + 1] =
+ (char) ((uc & 0x3ff) + Character.MIN_LOW_SURROGATE); //
Character.lowSurrogate(uc);
+ dp += 2;
+ }
+ } else {
+ return -1;
+ }
+ }
+ }
+ return dp;
+ }
+
+ /**
+  * Encodes the surrogate pair formed by {@code c} and {@code src[charOffset]} as one 4-byte UTF-8
+  * sequence written to {@code dst[dp..dp + 3]}.
+  *
+  * @param src char array holding the second half of the pair
+  * @param charOffset index in {@code src} of the char that follows {@code c}
+  * @param c the first char of the pair; must be a high surrogate
+  * @param dst destination for the four UTF-8 bytes
+  * @param dp index in {@code dst} of the first byte to write
+  * @throws RuntimeException if {@code c} is not a high surrogate, no char follows it, or the
+  *     following char is not a low surrogate
+  */
+ private static void utf8ToChar2(char[] src, int charOffset, char c, byte[] dst, int dp) {
+   boolean noPartner = c > Character.MAX_HIGH_SURROGATE || charOffset == src.length;
+   if (noPartner || !Character.isLowSurrogate(src[charOffset])) {
+     throw new RuntimeException("malformed input off : " + charOffset);
+   }
+
+   int codePoint = Character.toCodePoint(c, src[charOffset]);
+   dst[dp] = (byte) (0xf0 | (codePoint >> 18));
+   dst[dp + 1] = (byte) (0x80 | ((codePoint >> 12) & 0x3f));
+   dst[dp + 2] = (byte) (0x80 | ((codePoint >> 6) & 0x3f));
+   dst[dp + 3] = (byte) (0x80 | (codePoint & 0x3f));
+ }
+
+ /**
+  * Encodes the surrogate pair formed by {@code c} and the char stored at {@code src[offset]},
+  * {@code src[offset + 1]} as one 4-byte UTF-8 sequence written to {@code dst[dp..dp + 3]}.
+  *
+  * @param src byte array holding UTF-16 code units, two bytes per char
+  * @param offset byte index in {@code src} of the char that follows {@code c}
+  * @param c the first char of the pair; must be a high surrogate
+  * @param numBytes number of valid bytes in {@code src}
+  * @param dst destination for the four UTF-8 bytes
+  * @param dp index in {@code dst} of the first byte to write
+  * @throws RuntimeException if {@code c} is not a high surrogate, fewer than two bytes remain at
+  *     {@code offset}, or the following char is not a low surrogate
+  */
+ private static void utf8ToChar2(
+     byte[] src, int offset, char c, int numBytes, byte[] dst, int dp) {
+   char d;
+   // A full char (two bytes) must be available; with only one byte left the unchecked read
+   // below would run past the end of the array.
+   if (c > Character.MAX_HIGH_SURROGATE
+       || numBytes - offset < 2
+       || (d = Platform.getChar(src, Platform.BYTE_ARRAY_OFFSET + offset))
+           < Character.MIN_LOW_SURROGATE
+       || d > Character.MAX_LOW_SURROGATE) {
+     throw new RuntimeException("malformed input off : " + offset);
+   }
+
+   // Same arithmetic as Character.toCodePoint(c, d).
+   int uc = ((c << 10) + d) + (0x010000 - ('\uD800' << 10) - Character.MIN_LOW_SURROGATE);
+   dst[dp] = (byte) (0xf0 | ((uc >> 18)));
+   dst[dp + 1] = (byte) (0x80 | ((uc >> 12) & 0x3f));
+   dst[dp + 2] = (byte) (0x80 | ((uc >> 6) & 0x3f));
+   dst[dp + 3] = (byte) (0x80 | (uc & 0x3f));
+ }
+}
diff --git a/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
b/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
index cc892bef..99ea8b96 100644
--- a/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
+++ b/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
@@ -26,7 +26,7 @@ import org.apache.fury.memory.Platform;
public class StringUtils {
// A long mask used to clear all-higher bits of char in a super-word way.
- private static final long MULTI_CHARS_NON_LATIN_MASK;
+ public static final long MULTI_CHARS_NON_LATIN_MASK;
private static final char[] BASE16_CHARS2 = {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e',
'f'
@@ -267,12 +267,20 @@ public class StringUtils {
}
public static boolean isLatin(char[] chars) {
+ return isLatin(chars, 0);
+ }
+
+ public static boolean isLatin(char[] chars, int start) {
+ if (start > chars.length) {
+ return false;
+ }
+ int byteOffset = start << 1;
int numChars = chars.length;
int vectorizedLen = numChars >> 2;
int vectorizedChars = vectorizedLen << 2;
int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1);
boolean isLatin = true;
- for (int offset = Platform.CHAR_ARRAY_OFFSET; offset < endOffset; offset
+= 8) {
+ for (int offset = Platform.CHAR_ARRAY_OFFSET + byteOffset; offset <
endOffset; offset += 8) {
// check 4 chars in a vectorized way, 4 times faster than scalar check
loop.
// See benchmark in CompressStringSuite.latinSuperWordCheck.
long multiChars = Platform.getLong(chars, offset);
diff --git
a/java/fury-core/src/test/java/org/apache/fury/builder/JITContextTest.java
b/java/fury-core/src/test/java/org/apache/fury/builder/JITContextTest.java
index 86143fe9..e375a609 100644
--- a/java/fury-core/src/test/java/org/apache/fury/builder/JITContextTest.java
+++ b/java/fury-core/src/test/java/org/apache/fury/builder/JITContextTest.java
@@ -154,6 +154,12 @@ public class JITContextTest extends FuryTestBase {
LOG.warn("Wait async compilation finish for {}", cls);
}
}
+ while (fury.getJITContext().hasJITResult(PkgAccessLevel.class)) {
+ Thread.sleep(10); // allow serializer be switched to generated version
+ }
+ while (fury.getJITContext().hasJITResult(PrivateAccessLevel.class)) {
+ Thread.sleep(10); // allow serializer be switched to generated version
+ }
Serializer<TestAccessLevel> serializer =
fury.getClassResolver().getSerializer(TestAccessLevel.class);
assertTrue(ReflectionUtils.getObjectFieldValue(serializer, "serializer")
instanceof Generated);
diff --git
a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java
b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java
index fc891ac5..15c46c57 100644
---
a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java
+++
b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java
@@ -303,7 +303,7 @@ public class StringSerializerTest extends FuryTestBase {
@Test
public void testReadUtf8String() {
- Fury fury = getJavaFury();
+ Fury fury =
Fury.builder().withStringCompressed(true).requireClassRegistration(false).build();
for (MemoryBuffer buffer :
new MemoryBuffer[] {
MemoryUtils.buffer(32),
MemoryUtils.wrap(ByteBuffer.allocateDirect(2048))
@@ -313,7 +313,8 @@ public class StringSerializerTest extends FuryTestBase {
assertEquals(serializer.read(buffer), "abc你好");
byte[] bytes = "abc你好".getBytes(StandardCharsets.UTF_8);
byte UTF8 = 2;
- buffer.writeVarUint64(((long) bytes.length) << 2 | UTF8);
+ buffer.writeVarUint64(((long) "abc你好".length() << 1) << 2 | UTF8);
+ buffer.writeInt32(bytes.length);
buffer.writeBytes(bytes);
assertEquals(serializer.read(buffer), "abc你好");
assertEquals(buffer.readerIndex(), buffer.writerIndex());
diff --git
a/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java
b/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java
new file mode 100644
index 00000000..0f5e5ed5
--- /dev/null
+++
b/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.fury.util;
+
+import static org.testng.Assert.assertEquals;
+
+import java.nio.charset.StandardCharsets;
+import org.apache.fury.FuryTestBase;
+import org.testng.annotations.Test;
+
+/** Round-trip checks for the UTF-8 / UTF-16 converters in {@link StringEncodingUtils}. */
+public class StringEncodingUtilsTest extends FuryTestBase {
+  /** Decodes the same UTF-8 input into a char array and into a UTF-16 byte array. */
+  @Test
+  public void testUTF8ToUTF16() {
+    final String expected = "你好, Fury";
+    final byte[] encoded = expected.getBytes(StandardCharsets.UTF_8);
+
+    // char[] destination
+    char[] decodedChars = new char[encoded.length * 2];
+    int charCount =
+        StringEncodingUtils.convertUTF8ToUTF16(encoded, 0, encoded.length, decodedChars);
+    assertEquals(new String(decodedChars, 0, charCount), expected);
+
+    // byte[] destination, interpreted as little-endian UTF-16
+    byte[] decodedBytes = new byte[encoded.length * 4];
+    int byteCount =
+        StringEncodingUtils.convertUTF8ToUTF16(encoded, 0, encoded.length, decodedBytes);
+    assertEquals(new String(decodedBytes, 0, byteCount, StandardCharsets.UTF_16LE), expected);
+  }
+
+  /** Encodes the same text from a char array and from a UTF-16 byte array. */
+  @Test
+  public void testUTF16ToUTF8() {
+    final String expected = "你好, Fury";
+    final byte[] encoded = new byte[expected.length() * 3];
+
+    // char[] source
+    char[] sourceChars = expected.toCharArray();
+    int written = StringEncodingUtils.convertUTF16ToUTF8(sourceChars, encoded, 0);
+    assertEquals(new String(encoded, 0, written, StandardCharsets.UTF_8), expected);
+
+    // byte[] source holding little-endian UTF-16
+    byte[] sourceBytes = expected.getBytes(StandardCharsets.UTF_16LE);
+    written = StringEncodingUtils.convertUTF16ToUTF8(sourceBytes, encoded, 0);
+    assertEquals(new String(encoded, 0, written, StandardCharsets.UTF_8), expected);
+  }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]