This is an automated email from the ASF dual-hosted git repository.

chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fury.git


The following commit(s) were added to refs/heads/main by this push:
     new e0874817 fix(java): Fix incorrect results of utf16 to utf8 conversion 
for latin1 but not ascii characters (#1914)
e0874817 is described below

commit e08748177a1217faf8f9e84957ad7bef67817857
Author: HuangXingBo <[email protected]>
AuthorDate: Sun Oct 27 23:49:28 2024 +0800

    fix(java): Fix incorrect results of utf16 to utf8 conversion for latin1 but 
not ascii characters (#1914)
    
    <!--
    **Thanks for contributing to Fury.**
    
    **If this is your first time opening a PR on Fury, you can refer to
    [CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).**
    
    Contribution Checklist
    
    - The **Apache Fury (incubating)** community has restrictions on the
    naming of PR titles. You can also find instructions in
    [CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).
    
    - Fury has a strong focus on performance. If the PR you submit will have
    an impact on performance, please benchmark it first and provide the
    benchmark result here.
    -->
    
    ## What does this PR do?
    
    <!-- Describe the purpose of this PR. -->
    
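    Fix incorrect results of UTF-16 to UTF-8 conversion for characters that
    are Latin-1 but not ASCII (code points 0x80-0xFF). The vectorized
    "ascii only" fast path tested four chars against the non-Latin-1 mask, so
    such characters were copied as single bytes instead of being encoded as
    two-byte UTF-8 sequences; it now uses a dedicated non-ASCII mask.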
    
    ## Related issues
    
    <!--
    Is there any related issue? Please attach here.
    
    - #xxxx0
    - #xxxx1
    - #xxxx2
    -->
    
    ## Does this PR introduce any user-facing change?
    
    <!--
    If any user-facing interface changes, please [open an
    issue](https://github.com/apache/fury/issues/new/choose) describing the
    need to do so and update the document if necessary.
    -->
    
    - [ ] Does this PR introduce any public API change?
    - [ ] Does this PR introduce any binary protocol compatibility change?
    
    ## Benchmark
    
    <!--
    When the PR has an impact on performance (if you don't know whether the
    PR will have an impact on performance, you can submit the PR first, and
    the code reviewer will tell you if it does), be sure to attach benchmark
    data here.
    -->
---
 .../apache/fury/serializer/StringSerializer.java   | 59 +++++++++++++---------
 .../org/apache/fury/util/StringEncodingUtils.java  | 13 ++---
 .../java/org/apache/fury/util/StringUtils.java     |  4 ++
 .../apache/fury/util/StringEncodingUtilsTest.java  |  4 +-
 4 files changed, 49 insertions(+), 31 deletions(-)

diff --git 
a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java 
b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
index a1161138..a3379660 100644
--- 
a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
+++ 
b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
@@ -20,6 +20,7 @@
 package org.apache.fury.serializer;
 
 import static org.apache.fury.type.TypeUtils.STRING_TYPE;
+import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_ASCII_MASK;
 import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_LATIN_MASK;
 
 import java.lang.invoke.CallSite;
@@ -387,7 +388,6 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
   }
 
   public char[] readCharsLatin1(MemoryBuffer buffer, int numBytes) {
-    //    int utf8AsciiBytes = buffer.readInt32();
     buffer.checkReadableBytes(numBytes);
     byte[] srcArray = buffer.getHeapMemory();
     char[] chars = new char[numBytes];
@@ -775,17 +775,29 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     int vectorizedLen = sampleNum >> 2;
     int vectorizedChars = vectorizedLen << 2;
     int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1);
-    int count = 0;
+    int asciiCount = 0;
+    int latin1Count = 0;
     for (int offset = Platform.CHAR_ARRAY_OFFSET, charOffset = 0;
         offset < endOffset;
         offset += 8, charOffset += 4) {
       long multiChars = Platform.getLong(chars, offset);
-      if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) {
-        count += 4;
+      if ((multiChars & MULTI_CHARS_NON_ASCII_MASK) == 0) {
+        latin1Count += 4;
+        asciiCount += 4;
+      } else if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) {
+        latin1Count += 4;
+        for (int i = 0; i < 4; ++i) {
+          if (chars[charOffset + i] < 0x80) {
+            asciiCount++;
+          }
+        }
       } else {
         for (int i = 0; i < 4; ++i) {
           if (chars[charOffset + i] < 0x80) {
-            count++;
+            latin1Count++;
+            asciiCount++;
+          } else if (chars[charOffset + i] <= 0xFF) {
+            latin1Count++;
           }
         }
       }
@@ -793,15 +805,18 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
 
     for (int i = vectorizedChars; i < sampleNum; i++) {
       if (chars[i] < 0x80) {
-        count++;
+        latin1Count++;
+        asciiCount++;
+      } else if (chars[i] <= 0xFF) {
+        latin1Count++;
       }
     }
 
-    // ascii number > 50%, choose UTF-8
-    if (count >= sampleNum * 0.5) {
-      if (count == numChars || (count == sampleNum && 
StringUtils.isLatin(chars, sampleNum))) {
-        return LATIN1;
-      }
+    if (latin1Count == numChars
+        || (latin1Count == sampleNum && StringUtils.isLatin(chars, 
sampleNum))) {
+      return LATIN1;
+    } else if (asciiCount >= sampleNum * 0.5) {
+      // ascii number > 50%, choose UTF-8
       return UTF8;
     } else {
       return UTF16;
@@ -815,30 +830,28 @@ public final class StringSerializer extends 
ImmutableSerializer<String> {
     int vectorizedLen = sampleNum >> 3;
     int vectorizedBytes = vectorizedLen << 3;
     int endOffset = Platform.BYTE_ARRAY_OFFSET + vectorizedBytes;
-    int count = 0;
+    int asciiCount = 0;
     for (int offset = Platform.BYTE_ARRAY_OFFSET, bytesOffset = 0;
         offset < endOffset;
         offset += 8, bytesOffset += 8) {
       long multiChars = Platform.getLong(bytes, offset);
-      if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) {
-        count += 4;
+      if ((multiChars & MULTI_CHARS_NON_ASCII_MASK) == 0) {
+        asciiCount += 4;
       } else {
-        for (int i = Platform.IS_LITTLE_ENDIAN ? 1 : 0; i < 8; i += 2) {
-          if (bytes[bytesOffset + i] == 0) {
-            count++;
+        for (int i = 0; i < 8; i += 2) {
+          if (Platform.getChar(bytes, offset + i) < 0x80) {
+            asciiCount++;
           }
         }
       }
     }
-    for (int i = Platform.IS_LITTLE_ENDIAN ? vectorizedBytes + 1 : 
vectorizedBytes;
-        i < sampleNum;
-        ++i) {
-      if (bytes[i] == 0) {
-        count++;
+    for (int i = vectorizedBytes; vectorizedBytes < sampleNum; vectorizedBytes 
+= 2) {
+      if (Platform.getChar(bytes, Platform.BYTE_ARRAY_OFFSET + i) < 0x80) {
+        asciiCount++;
       }
     }
     // ascii number > 50%, choose UTF-8
-    if (count >= sampleNum * 0.5) {
+    if (asciiCount >= sampleNum * 0.5) {
       return UTF8;
     } else {
       return UTF16;
diff --git 
a/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java 
b/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java
index d90b5412..a33d7bd5 100644
--- a/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java
+++ b/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java
@@ -19,7 +19,7 @@
 
 package org.apache.fury.util;
 
-import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_LATIN_MASK;
+import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_ASCII_MASK;
 
 import org.apache.fury.memory.Platform;
 
@@ -29,11 +29,9 @@ public class StringEncodingUtils {
   /** A fast convert algorithm to convert an utf16 char array into an utf8 
byte array. */
   public static int convertUTF16ToUTF8(char[] src, byte[] dst, int dp) {
     int numChars = src.length;
-    for (int charOffset = 0; charOffset < numChars; ) {
+    for (int charOffset = 0, arrayOffset = Platform.CHAR_ARRAY_OFFSET; 
charOffset < numChars; ) {
       if (charOffset + 4 <= numChars
-          && (Platform.getLong(src, Platform.CHAR_ARRAY_OFFSET + charOffset * 
2L)
-                  & MULTI_CHARS_NON_LATIN_MASK)
-              == 0) {
+          && (Platform.getLong(src, arrayOffset) & MULTI_CHARS_NON_ASCII_MASK) 
== 0) {
         // ascii only
         dst[dp] = (byte) src[charOffset];
         dst[dp + 1] = (byte) src[charOffset + 1];
@@ -41,8 +39,10 @@ public class StringEncodingUtils {
         dst[dp + 3] = (byte) src[charOffset + 3];
         dp += 4;
         charOffset += 4;
+        arrayOffset += 8;
       } else {
         char c = src[charOffset++];
+        arrayOffset += 2;
         if (c < 0x80) {
           dst[dp++] = (byte) c;
         } else if (c < 0x800) {
@@ -53,6 +53,7 @@ public class StringEncodingUtils {
           utf8ToChar2(src, charOffset, c, dst, dp);
           dp += 4;
           charOffset++;
+          arrayOffset += 2;
         } else {
           dst[dp] = (byte) (0xe0 | ((c >> 12)));
           dst[dp + 1] = (byte) (0x80 | ((c >> 6) & 0x3f));
@@ -70,7 +71,7 @@ public class StringEncodingUtils {
     for (int offset = 0; offset < numBytes; ) {
       if (offset + 8 <= numBytes
           && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset)
-                  & MULTI_CHARS_NON_LATIN_MASK)
+                  & MULTI_CHARS_NON_ASCII_MASK)
               == 0) {
         // ascii only
         if (Platform.IS_LITTLE_ENDIAN) {
diff --git a/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java 
b/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
index 99ea8b96..f9a7a16f 100644
--- a/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
+++ b/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
@@ -28,6 +28,8 @@ public class StringUtils {
   // A long mask used to clear all-higher bits of char in a super-word way.
   public static final long MULTI_CHARS_NON_LATIN_MASK;
 
+  public static final long MULTI_CHARS_NON_ASCII_MASK;
+
   private static final char[] BASE16_CHARS2 = {
     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 
'f'
   };
@@ -37,10 +39,12 @@ public class StringUtils {
       // latin chars will be 0xXX,0x00;0xXX,0x00 in byte order;
       // Using 0x00,0xff(0xff00) to clear latin bits.
       MULTI_CHARS_NON_LATIN_MASK = 0xff00ff00ff00ff00L;
+      MULTI_CHARS_NON_ASCII_MASK = 0xff80ff80ff80ff80L;
     } else {
       // latin chars will be 0x00,0xXX;0x00,0xXX in byte order;
       // Using 0x00,0xff(0x00ff) to clear latin bits.
       MULTI_CHARS_NON_LATIN_MASK = 0x00ff00ff00ff00ffL;
+      MULTI_CHARS_NON_ASCII_MASK = 0x80ff80ff80ff80ffL;
     }
   }
 
diff --git 
a/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java
 
b/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java
index 0f5e5ed5..fc783deb 100644
--- 
a/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java
+++ 
b/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java
@@ -28,7 +28,7 @@ import org.testng.annotations.Test;
 public class StringEncodingUtilsTest extends FuryTestBase {
   @Test
   public void testUTF8ToUTF16() {
-    String input = "你好, Fury";
+    String input = "jbmbmner8 jhk hj \n \t üäßß@µ你好";
     byte[] utf8 = input.getBytes(StandardCharsets.UTF_8);
     char[] utf16Chars = new char[utf8.length * 2];
     int readLen = StringEncodingUtils.convertUTF8ToUTF16(utf8, 0, utf8.length, 
utf16Chars);
@@ -43,7 +43,7 @@ public class StringEncodingUtilsTest extends FuryTestBase {
 
   @Test
   public void testUTF16ToUTF8() {
-    String input = "你好, Fury";
+    String input = "jbmbmner8 jhk hj \n \t üäßß@µ你好";
     char[] utf16 = new char[input.length()];
     byte[] utf8 = new byte[input.length() * 3];
     input.getChars(0, input.length(), utf16, 0);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to