This is an automated email from the ASF dual-hosted git repository.
vivekrai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/gobblin.git
The following commit(s) were added to refs/heads/master by this push:
new 3617c8f7d7 Revert "[GOBBLIN-2223] Optimise writing of serialised Work Unit to File syste…" (#4138)
3617c8f7d7 is described below
commit 3617c8f7d75ed1f8ad55842b983f77f2113ae393
Author: thisisArjit <[email protected]>
AuthorDate: Thu Sep 4 10:06:01 2025 +0530
Revert "[GOBBLIN-2223] Optimise writing of serialised Work Unit to File syste…" (#4138)
This reverts commit 88555c24cc99a3de07e8fabae7475b6878560e34.
---
.../gobblin/compat/hadoop/TextSerializer.java | 22 ++++++++--------------
.../apache/gobblin/compat/TextSerializerTest.java | 20 --------------------
2 files changed, 8 insertions(+), 34 deletions(-)
diff --git a/gobblin-api/src/main/java/org/apache/gobblin/compat/hadoop/TextSerializer.java b/gobblin-api/src/main/java/org/apache/gobblin/compat/hadoop/TextSerializer.java
index 14ecce1486..a0939d68b9 100644
--- a/gobblin-api/src/main/java/org/apache/gobblin/compat/hadoop/TextSerializer.java
+++ b/gobblin-api/src/main/java/org/apache/gobblin/compat/hadoop/TextSerializer.java
@@ -19,6 +19,7 @@ package org.apache.gobblin.compat.hadoop;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
/**
@@ -30,27 +31,20 @@ public class TextSerializer {
* Serialize a String using the same logic as a Hadoop Text object
*/
public static void writeStringAsText(DataOutput stream, String str) throws IOException {
- // TODO: Use writeChars instead of writeBytes to support unicode
- for (int i = 0; i < str.length(); i++) {
- if (str.charAt(i) > 0x7F) {
- throw new IllegalArgumentException("Non-ASCII character detected.");
- }
- }
- writeVLong(stream, str.length());
- stream.writeBytes(str);
+ byte[] utf8Encoded = str.getBytes(StandardCharsets.UTF_8);
+ writeVLong(stream, utf8Encoded.length);
+ stream.write(utf8Encoded);
}
/**
* Deserialize a Hadoop Text object into a String
*/
public static String readTextAsString(DataInput in) throws IOException {
- int bufLen = (int) readVLong(in);
- StringBuilder sb = new StringBuilder();
+ int bufLen = (int)readVLong(in);
+ byte[] buf = new byte[bufLen];
+ in.readFully(buf);
- for (int i = 0; i < bufLen; i++) {
- sb.append((char) in.readByte());
- }
- return sb.toString();
+ return new String(buf, StandardCharsets.UTF_8);
}
/**
diff --git a/gobblin-api/src/test/java/org/apache/gobblin/compat/TextSerializerTest.java b/gobblin-api/src/test/java/org/apache/gobblin/compat/TextSerializerTest.java
index 30906c9222..04ba79f9b6 100644
--- a/gobblin-api/src/test/java/org/apache/gobblin/compat/TextSerializerTest.java
+++ b/gobblin-api/src/test/java/org/apache/gobblin/compat/TextSerializerTest.java
@@ -33,26 +33,6 @@ import org.apache.gobblin.compat.hadoop.TextSerializer;
public class TextSerializerTest {
private static final String[] textsToSerialize = new String[]{"abracadabra", Strings.repeat("longString", 128000)};
- private static final String[] serializationErrorText = new String[]{".߸´ˇ", Strings.repeat("ˀ.¸¯.", 128000)};
-
- @Test
- public void testSerializeError() throws IOException {
- // Use our serializer, verify Hadoop deserializer can read it back
- for (String textToSerialize : serializationErrorText) {
- ByteArrayOutputStream bOs = new ByteArrayOutputStream();
- DataOutputStream dataOutputStream = new DataOutputStream(bOs);
-
- try {
- TextSerializer.writeStringAsText(dataOutputStream, textToSerialize);
- Assert.fail("Expected IOException not thrown");
- } catch (Exception e) {
- Assert.assertTrue(e instanceof IllegalArgumentException);
- // Expected exception
- } finally {
- dataOutputStream.close();
- }
- }
- }
@Test
public void testSerialize()