This is an automated email from the ASF dual-hosted git repository.
suiliangliang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-fury.git
The following commit(s) were added to refs/heads/main by this push:
new b904639d fix(go/java): Enhance ASCII check in meta string encoding
(#1631)
b904639d is described below
commit b904639db6892e49ca6f58167488d72b11ac03bc
Author: Jason Mok <[email protected]>
AuthorDate: Tue May 14 06:32:34 2024 -0500
fix(go/java): Enhance ASCII check in meta string encoding (#1631)
<!--
**Thanks for contributing to Fury.**
**If this is your first time opening a PR on fury, you can refer to
[CONTRIBUTING.md](https://github.com/apache/incubator-fury/blob/main/CONTRIBUTING.md).**
Contribution Checklist
- The **Apache Fury (incubating)** community has restrictions on the
naming of pr titles. You can also find instructions in
[CONTRIBUTING.md](https://github.com/apache/incubator-fury/blob/main/CONTRIBUTING.md).
- Fury has a strong focus on performance. If the PR you submit will have
an impact on performance, please benchmark it first and provide the
benchmark result here.
-->
## What does this PR do?
<!-- Describe the purpose of this PR. -->
This PR enhances the current ASCII check (before meta string encoding) I
implemented in #1620 to return a UTF-8 encoded `MetaString` early if the
input is non-ASCII. This improves efficiency and saves time on
`computeEncoding` and `encode`. Unit tests are also added.
## Related issues
https://github.com/apache/incubator-fury/issues/1619
## Does this PR introduce any user-facing change?
<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/incubator-fury/issues/new/choose)
describing the need to do so and update the document if necessary.
-->
- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?
## Benchmark
<!--
When the PR has an impact on performance (if you don't know whether the
PR will have an impact on performance, you can submit the PR first, and
if it will have impact on performance, the code reviewer will explain
it), be sure to attach a benchmark data here.
-->
---------
Signed-off-by: Jason Mok <[email protected]>
---
go/fury/meta/meta_string_encoder.go | 21 +++++++++++-----
go/fury/meta/meta_string_test.go | 27 +++++++++++++++++++-
.../org/apache/fury/meta/MetaStringEncoder.java | 8 ++++++
.../java/org/apache/fury/meta/MetaStringTest.java | 29 ++++++++++++++++++++++
4 files changed, 78 insertions(+), 7 deletions(-)
diff --git a/go/fury/meta/meta_string_encoder.go
b/go/fury/meta/meta_string_encoder.go
index 6d3b89bf..419e41e2 100644
--- a/go/fury/meta/meta_string_encoder.go
+++ b/go/fury/meta/meta_string_encoder.go
@@ -36,6 +36,15 @@ func NewEncoder(specialCh1 byte, specialCh2 byte) *Encoder {
// Encode the input string to MetaString using adaptive encoding
func (e *Encoder) Encode(input string) (MetaString, error) {
+ if !isASCII(input) {
+ return MetaString{
+ inputString: input,
+ encoding: UTF_8,
+ specialChar1: e.specialChar1,
+ specialChar2: e.specialChar2,
+ encodedBytes: []byte(input),
+ }, nil
+ }
encoding := e.ComputeEncoding(input)
return e.EncodeWithEncoding(input, encoding)
}
@@ -43,7 +52,7 @@ func (e *Encoder) Encode(input string) (MetaString, error) {
// EncodeWithEncoding Encodes the input string to MetaString using specified
encoding.
func (e *Encoder) EncodeWithEncoding(input string, encoding Encoding)
(MetaString, error) {
if encoding != UTF_8 && !isASCII(input) {
- return MetaString{}, errors.New("non-ASCII characters in meta
string are not allowed")
+ return MetaString{}, errors.New("non-ASCII characters in meta
string are not allowed")
}
if len(input) > 32767 {
return MetaString{}, errors.New("long meta string than 32767 is
not allowed")
@@ -171,12 +180,12 @@ func (e *Encoder) ComputeEncoding(input string) Encoding {
}
func isASCII(input string) bool {
- for _, r := range input {
- if r > 127 {
+ for _, r := range input {
+ if r > 127 {
return false
- }
- }
- return true
+ }
+ }
+ return true
}
type stringStatistics struct {
diff --git a/go/fury/meta/meta_string_test.go b/go/fury/meta/meta_string_test.go
index 0fb89f6d..6560578d 100644
--- a/go/fury/meta/meta_string_test.go
+++ b/go/fury/meta/meta_string_test.go
@@ -18,8 +18,9 @@
package meta
import (
- "github.com/stretchr/testify/require"
"testing"
+
+ "github.com/stretchr/testify/require"
)
func TestEncodeAndDecodeMetaString(t *testing.T) {
@@ -80,3 +81,27 @@ func calcTotalBytes(src string, bitsPerChar int, encoding
Encoding) int {
}
return (ret + 7) / 8
}
+
+func TestAsciiEncoding(t *testing.T) {
+ encoder := NewEncoder('.', '_')
+
+ data, err := encoder.Encode("asciiOnly")
+ require.NoError(t, err)
+ require.NotEqual(t, UTF_8, data.GetEncoding(), "Encoding should not be
UTF-8 for ASCII strings")
+}
+
+func TestNonAsciiEncoding(t *testing.T) {
+ encoder := NewEncoder('.', '_')
+
+ data, err := encoder.Encode("こんにちは") // Non-ASCII String
+ require.NoError(t, err)
+ require.Equal(t, UTF_8, data.GetEncoding(), "Encoding should be UTF-8
for non-ASCII strings")
+}
+
+func TestEncodeWithEncodingNonAscii(t *testing.T) {
+ encoder := NewEncoder('.', '_')
+
+ _, err := encoder.EncodeWithEncoding("こんにちは", LOWER_SPECIAL)
+ require.Error(t, err, "Expected error for non-ASCII characters in
non-UTF-8 encoding")
+ require.Equal(t, "non-ASCII characters in meta string are not allowed",
err.Error())
+}
diff --git
a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java
b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java
index 619a441c..b6a0a58b 100644
--- a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java
+++ b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java
@@ -57,6 +57,14 @@ public class MetaStringEncoder {
if (input.isEmpty()) {
return new MetaString(input, Encoding.UTF_8, specialChar1, specialChar2,
new byte[0]);
}
+ if (!StringSerializer.isLatin(input.toCharArray())) {
+ return new MetaString(
+ input,
+ Encoding.UTF_8,
+ specialChar1,
+ specialChar2,
+ input.getBytes(StandardCharsets.UTF_8));
+ }
Encoding encoding = computeEncoding(input, encodings);
return encode(input, encoding);
}
diff --git
a/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java
b/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java
index f85d5e15..10bfe37a 100644
--- a/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java
+++ b/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java
@@ -214,4 +214,33 @@ public class MetaStringTest {
String decoded = decoder.decode(metaString.getBytes(),
metaString.getEncoding());
assertEquals(decoded, "");
}
+
+ @Test
+ public void testAsciiEncoding() {
+ MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
+ String testString = "asciiOnly";
+ MetaString encodedMetaString = encoder.encode(testString);
+ assertNotSame(encodedMetaString.getEncoding(), MetaString.Encoding.UTF_8);
+ assertEquals(encodedMetaString.getEncoding(),
MetaString.Encoding.ALL_TO_LOWER_SPECIAL);
+ }
+
+ @Test
+ public void testNonAsciiEncoding() {
+ MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
+ String testString = "こんにちは"; // Non-ASCII string
+ MetaString encodedMetaString = encoder.encode(testString);
+ assertEquals(encodedMetaString.getEncoding(), MetaString.Encoding.UTF_8);
+ }
+
+ @Test
+ public void testNonAsciiEncodingAndNonUTF8() {
+ MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
+ String nonAsciiString = "こんにちは"; // Non-ASCII string
+
+ try {
+ encoder.encode(nonAsciiString, MetaString.Encoding.LOWER_SPECIAL);
+ } catch (IllegalArgumentException e) {
+ assertEquals(e.getMessage(), "Non-ASCII characters in meta string are
not allowed");
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]