This is an automated email from the ASF dual-hosted git repository.
lakshsingla pushed a commit to branch 30.0.0
in repository https://gitbox.apache.org/repos/asf/druid.git
The following commit(s) were added to refs/heads/30.0.0 by this push:
new 9b1c2bbe4b4 JSONFlattenerMaker: Speed up charsetFix. (#16212) (#16400)
9b1c2bbe4b4 is described below
commit 9b1c2bbe4b4243915f3b2edc2270e3b4468cd54a
Author: Adarsh Sanjeev <[email protected]>
AuthorDate: Tue May 7 15:08:07 2024 +0530
JSONFlattenerMaker: Speed up charsetFix. (#16212) (#16400)
JSON parsing has this function "charsetFix" that fixes up strings
so they can round-trip through UTF-8 encoding without loss of
fidelity. It was originally introduced to fix a bug where strings
could be sorted, encoded, then decoded, and the resulting decoded
strings could end up no longer in sorted order (due to character
swaps during the encode operation).
The code has been in place for some time, and only applies to JSON.
I am not sure if it needs to apply to other formats; it's certainly
more difficult to get broken strings from other formats. It's easy
in JSON because you can write a JSON string like "foo\uD900".
At any rate, this patch does not revisit whether charsetFix should
be applied to all formats. It merely optimizes it for the JSON case.
The function works by using CharsetEncoder.canEncode, which is
a relatively slow method (just as expensive as actually encoding).
This patch adds a short-circuit to skip canEncode if all chars in
a string are in the basic multilingual plane (i.e. if no chars are
surrogates).
Co-authored-by: Gian Merlino <[email protected]>
---
.../util/common/parsers/JSONFlattenerMaker.java | 44 +++++++++++++++++++---
.../common/parsers/JSONFlattenerMakerTest.java | 15 ++++++++
2 files changed, 54 insertions(+), 5 deletions(-)
diff --git
a/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java
b/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java
index 5df98199080..84490b19fce 100644
---
a/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java
+++
b/processing/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java
@@ -203,19 +203,53 @@ public class JSONFlattenerMaker implements
ObjectFlatteners.FlattenerMaker<JsonN
return null;
}
+ /**
+ * Fix up a string so it can round-trip through UTF-8 encoding without loss
of fidelity. This comes up when a string
+ * has surrogates (e.g. \uD900) that appear in invalid positions, such as
alone rather than as part of a
+ * surrogate pair.
+ *
+ * This operation is useful because when a string cannot round-trip
properly, it can cause it to sort differently
+ * relative to other strings after an encode/decode round-trip. This causes
incorrect ordering in cases where strings
+ * are sorted, then encoded, then decoded again.
+ *
+ * @param s string, or null
+ * @param enc UTF-8 encoder
+ *
+ * @return the original string, or a fixed version that can round-trip
properly
+ */
@Nullable
- private static String charsetFix(String s, CharsetEncoder enc)
+ static String charsetFix(@Nullable String s, CharsetEncoder enc)
{
- if (s != null && !enc.canEncode(s)) {
- // Some whacky characters are in this string (e.g. \uD900). These are
problematic because they are decodeable
- // by new String(...) but will not encode into the same character. This
dance here will replace these
- // characters with something more sane.
+ if (s != null && !isBmp(s) && !enc.canEncode(s)) {
+ // Note: the check isBmp isn't necessary for correct behavior, but it
improves performance in the common case
+ // where all characters are in BMP. It short-circuits "enc.canEncode",
which is a slow operation. The short
+ // circuit is valid because if every char in a string is in BMP, it is
definitely encodable as UTF-8.
+
+ // This dance here will replace the original string with one that can be
round-tripped.
return StringUtils.fromUtf8(StringUtils.toUtf8(s));
} else {
return s;
}
}
+ /**
+ * Returns whether every character in a string is in BMP (basic multilingual
plane).
+ */
+ private static boolean isBmp(String s)
+ {
+ for (int i = 0; i < s.length(); i++) {
+ final char c = s.charAt(i);
+
+ // All 16-bit code units are either valid code points, or surrogates. So
if this char is not a surrogate,
+ // it must represent a code point in BMP.
+ if (Character.isSurrogate(c)) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
private static boolean isFlatList(JsonNode list)
{
for (JsonNode obj : list) {
diff --git
a/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java
b/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java
index 5583081db10..b471c24b40d 100644
---
a/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java
+++
b/processing/src/test/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMakerTest.java
@@ -31,6 +31,8 @@ import org.junit.Assert;
import org.junit.Test;
import java.math.BigInteger;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
@@ -199,4 +201,17 @@ public class JSONFlattenerMakerTest
ImmutableSet.copyOf(FLATTENER_MAKER_NESTED.discoverRootFields(node))
);
}
+
+ @Test
+ public void testCharsetFix()
+ {
+ final CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder();
+ Assert.assertEquals("hello", JSONFlattenerMaker.charsetFix("hello",
encoder));
+ Assert.assertEquals("Apache® Druid",
JSONFlattenerMaker.charsetFix("Apache® Druid", encoder));
+ Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uD900",
encoder));
+ Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uD83D",
encoder));
+ Assert.assertEquals("hello?", JSONFlattenerMaker.charsetFix("hello\uDCAF",
encoder));
+ Assert.assertEquals("hello💯",
JSONFlattenerMaker.charsetFix("hello\uD83D\uDCAF", encoder));
+ Assert.assertEquals("héllö", JSONFlattenerMaker.charsetFix("héllö",
encoder));
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]