This is an automated email from the ASF dual-hosted git repository.

suiliangliang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-fury.git


The following commit(s) were added to refs/heads/main by this push:
     new b904639d fix(go/java): Enhance ASCII check in meta string encoding 
(#1631)
b904639d is described below

commit b904639db6892e49ca6f58167488d72b11ac03bc
Author: Jason Mok <[email protected]>
AuthorDate: Tue May 14 06:32:34 2024 -0500

    fix(go/java): Enhance ASCII check in meta string encoding (#1631)
    
    <!--
    **Thanks for contributing to Fury.**
    
    **If this is your first time opening a PR on fury, you can refer to
    
[CONTRIBUTING.md](https://github.com/apache/incubator-fury/blob/main/CONTRIBUTING.md).**
    
    Contribution Checklist
    
    - The **Apache Fury (incubating)** community has restrictions on the
    naming of pr titles. You can also find instructions in
    
[CONTRIBUTING.md](https://github.com/apache/incubator-fury/blob/main/CONTRIBUTING.md).
    
    - Fury has a strong focus on performance. If the PR you submit will have
    an impact on performance, please benchmark it first and provide the
    benchmark result here.
    -->
    
    ## What does this PR do?
    
    <!-- Describe the purpose of this PR. -->
    This PR enhances the current ASCII check (before meta string encoding) I
    implemented in #1620 to return a UTF-8 encoded `MetaString` early if the
    input is non-ASCII. This improves efficiency and saves time on
    `computeEncoding` and `encode`. Unit tests are also added.
    
    
    
    
    ## Related issues
    https://github.com/apache/incubator-fury/issues/1619
    
    ## Does this PR introduce any user-facing change?
    
    <!--
    If any user-facing interface changes, please [open an
    issue](https://github.com/apache/incubator-fury/issues/new/choose)
    describing the need to do so and update the document if necessary.
    -->
    
    - [ ] Does this PR introduce any public API change?
    - [ ] Does this PR introduce any binary protocol compatibility change?
    
    
    ## Benchmark
    
    <!--
    When the PR has an impact on performance (if you don't know whether the
    PR will have an impact on performance, you can submit the PR first, and
    if it will have impact on performance, the code reviewer will explain
    it), be sure to attach a benchmark data here.
    -->
    
    ---------
    
    Signed-off-by: Jason Mok <[email protected]>
---
 go/fury/meta/meta_string_encoder.go                | 21 +++++++++++-----
 go/fury/meta/meta_string_test.go                   | 27 +++++++++++++++++++-
 .../org/apache/fury/meta/MetaStringEncoder.java    |  8 ++++++
 .../java/org/apache/fury/meta/MetaStringTest.java  | 29 ++++++++++++++++++++++
 4 files changed, 78 insertions(+), 7 deletions(-)

diff --git a/go/fury/meta/meta_string_encoder.go 
b/go/fury/meta/meta_string_encoder.go
index 6d3b89bf..419e41e2 100644
--- a/go/fury/meta/meta_string_encoder.go
+++ b/go/fury/meta/meta_string_encoder.go
@@ -36,6 +36,15 @@ func NewEncoder(specialCh1 byte, specialCh2 byte) *Encoder {
 
 // Encode the input string to MetaString using adaptive encoding
 func (e *Encoder) Encode(input string) (MetaString, error) {
+       if !isASCII(input) {
+               return MetaString{
+                       inputString:  input,
+                       encoding:     UTF_8,
+                       specialChar1: e.specialChar1,
+                       specialChar2: e.specialChar2,
+                       encodedBytes: []byte(input),
+               }, nil
+       }
        encoding := e.ComputeEncoding(input)
        return e.EncodeWithEncoding(input, encoding)
 }
@@ -43,7 +52,7 @@ func (e *Encoder) Encode(input string) (MetaString, error) {
 // EncodeWithEncoding Encodes the input string to MetaString using specified 
encoding.
 func (e *Encoder) EncodeWithEncoding(input string, encoding Encoding) 
(MetaString, error) {
        if encoding != UTF_8 && !isASCII(input) {
-           return MetaString{}, errors.New("non-ASCII characters in meta 
string are not allowed")
+               return MetaString{}, errors.New("non-ASCII characters in meta 
string are not allowed")
        }
        if len(input) > 32767 {
                return MetaString{}, errors.New("long meta string than 32767 is 
not allowed")
@@ -171,12 +180,12 @@ func (e *Encoder) ComputeEncoding(input string) Encoding {
 }
 
 func isASCII(input string) bool {
-    for _, r := range input {
-        if r > 127 {
+       for _, r := range input {
+               if r > 127 {
                        return false
-        }
-    }
-    return true
+               }
+       }
+       return true
 }
 
 type stringStatistics struct {
diff --git a/go/fury/meta/meta_string_test.go b/go/fury/meta/meta_string_test.go
index 0fb89f6d..6560578d 100644
--- a/go/fury/meta/meta_string_test.go
+++ b/go/fury/meta/meta_string_test.go
@@ -18,8 +18,9 @@
 package meta
 
 import (
-       "github.com/stretchr/testify/require"
        "testing"
+
+       "github.com/stretchr/testify/require"
 )
 
 func TestEncodeAndDecodeMetaString(t *testing.T) {
@@ -80,3 +81,27 @@ func calcTotalBytes(src string, bitsPerChar int, encoding 
Encoding) int {
        }
        return (ret + 7) / 8
 }
+
+func TestAsciiEncoding(t *testing.T) {
+       encoder := NewEncoder('.', '_')
+
+       data, err := encoder.Encode("asciiOnly")
+       require.NoError(t, err)
+       require.NotEqual(t, UTF_8, data.GetEncoding(), "Encoding should not be 
UTF-8 for ASCII strings")
+}
+
+func TestNonAsciiEncoding(t *testing.T) {
+       encoder := NewEncoder('.', '_')
+
+       data, err := encoder.Encode("こんにちは") // Non-ASCII String
+       require.NoError(t, err)
+       require.Equal(t, UTF_8, data.GetEncoding(), "Encoding should be UTF-8 
for non-ASCII strings")
+}
+
+func TestEncodeWithEncodingNonAscii(t *testing.T) {
+       encoder := NewEncoder('.', '_')
+
+       _, err := encoder.EncodeWithEncoding("こんにちは", LOWER_SPECIAL)
+       require.Error(t, err, "Expected error for non-ASCII characters in 
non-UTF-8 encoding")
+       require.Equal(t, "non-ASCII characters in meta string are not allowed", 
err.Error())
+}
diff --git 
a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java 
b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java
index 619a441c..b6a0a58b 100644
--- a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java
+++ b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java
@@ -57,6 +57,14 @@ public class MetaStringEncoder {
     if (input.isEmpty()) {
       return new MetaString(input, Encoding.UTF_8, specialChar1, specialChar2, 
new byte[0]);
     }
+    if (!StringSerializer.isLatin(input.toCharArray())) {
+      return new MetaString(
+          input,
+          Encoding.UTF_8,
+          specialChar1,
+          specialChar2,
+          input.getBytes(StandardCharsets.UTF_8));
+    }
     Encoding encoding = computeEncoding(input, encodings);
     return encode(input, encoding);
   }
diff --git 
a/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java 
b/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java
index f85d5e15..10bfe37a 100644
--- a/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java
+++ b/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java
@@ -214,4 +214,33 @@ public class MetaStringTest {
     String decoded = decoder.decode(metaString.getBytes(), 
metaString.getEncoding());
     assertEquals(decoded, "");
   }
+
+  @Test
+  public void testAsciiEncoding() {
+    MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
+    String testString = "asciiOnly";
+    MetaString encodedMetaString = encoder.encode(testString);
+    assertNotSame(encodedMetaString.getEncoding(), MetaString.Encoding.UTF_8);
+    assertEquals(encodedMetaString.getEncoding(), 
MetaString.Encoding.ALL_TO_LOWER_SPECIAL);
+  }
+
+  @Test
+  public void testNonAsciiEncoding() {
+    MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
+    String testString = "こんにちは"; // Non-ASCII string
+    MetaString encodedMetaString = encoder.encode(testString);
+    assertEquals(encodedMetaString.getEncoding(), MetaString.Encoding.UTF_8);
+  }
+
+  @Test
+  public void testNonAsciiEncodingAndNonUTF8() {
+    MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
+    String nonAsciiString = "こんにちは"; // Non-ASCII string
+
+    try {
+      encoder.encode(nonAsciiString, MetaString.Encoding.LOWER_SPECIAL);
+    } catch (IllegalArgumentException e) {
+      assertEquals(e.getMessage(), "Non-ASCII characters in meta string are 
not allowed");
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to