This is an automated email from the ASF dual-hosted git repository.
gian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git
The following commit(s) were added to refs/heads/master by this push:
new 78d0b0abce Add string comparison methods to StringUtils, fix
dictionary comparisons. (#13364)
78d0b0abce is described below
commit 78d0b0abce2b27b88a53790684d81530a2318fc9
Author: Gian Merlino <[email protected]>
AuthorDate: Wed Nov 16 07:15:00 2022 -0800
Add string comparison methods to StringUtils, fix dictionary comparisons.
(#13364)
* Add string comparison methods to StringUtils, fix dictionary comparisons.
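There are various places in Druid code where we assume that String.compareTo
is consistent with Unicode code-point ordering. Sadly this is not the case:
String.compareTo compares UTF-16 code units, so supplementary characters
(encoded as surrogate pairs in the range 0xD800-0xDFFF) sort before the
characters U+E000 through U+FFFF, even though their code points are higher.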
To help deal with this, this patch introduces the following helpers:
1) compareUnicode: Compares two Strings in Unicode code-point order.
2) compareUtf8: Compares two UTF-8 byte arrays in Unicode code-point order.
Equivalent to comparison as unsigned bytes.
3) compareUtf8UsingJavaStringOrdering: Compares two UTF-8 byte arrays, or
ByteBuffers, in a manner consistent with String.compareTo.
There is no helper for comparing two Strings in a manner consistent
with String.compareTo, because for that we can use compareTo directly.
The patch also fixes an inconsistency between the String and UTF-8
dictionary GenericIndexed flavors of string-typed columns: they were
formerly using incompatible comparators.
* Adjust test.
* FrontCodedIndexed updates.
* Add test.
* Fix comments.
---
.../druid/benchmark/BoundFilterBenchmark.java | 2 +-
...tionaryEncodedStringIndexSupplierBenchmark.java | 2 +-
.../DimensionPredicateFilterBenchmark.java | 2 +-
.../benchmark/FrontCodedIndexedBenchmark.java | 2 +-
.../apache/druid/benchmark/InFilterBenchmark.java | 2 +-
.../druid/benchmark/LikeFilterBenchmark.java | 2 +-
.../druid/java/util/common/ByteBufferUtils.java | 52 ++------
.../apache/druid/java/util/common/StringUtils.java | 142 +++++++++++++++++++++
.../java/util/common/ByteBufferUtilsTest.java | 59 +++++++--
.../druid/java/util/common/StringUtilsTest.java | 96 ++++++++++++++
.../apache/druid/frame/write/FrameWriterUtils.java | 2 +-
.../org/apache/druid/query/filter/InDimFilter.java | 2 +-
.../druid/query/ordering/StringComparators.java | 8 +-
.../java/org/apache/druid/segment/IndexIO.java | 2 +-
.../segment/column/IndexedUtf8ValueSetIndex.java | 2 +-
.../druid/segment/column/Utf8ValueSetIndex.java | 2 +-
.../druid/segment/data/FrontCodedIndexed.java | 20 +--
.../segment/data/FrontCodedIndexedWriter.java | 24 ++--
.../apache/druid/segment/data/GenericIndexed.java | 13 +-
.../segment/nested/NestedDataColumnSupplier.java | 4 +-
.../serde/DictionaryEncodedColumnPartSerde.java | 2 +-
.../druid/segment/data/FrontCodedIndexedTest.java | 4 +-
.../segment/filter/ExtractionDimFilterTest.java | 2 +-
.../filter/PredicateValueMatcherFactoryTest.java | 4 +-
.../druid/segment/filter/ValueMatchersTest.java | 6 +-
.../NestedFieldLiteralColumnIndexSupplierTest.java | 2 +-
.../DictionaryEncodedStringIndexSupplierTest.java | 4 +-
27 files changed, 359 insertions(+), 105 deletions(-)
diff --git
a/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java
b/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java
index cdb3cf2f7c..819c528198 100644
---
a/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java
+++
b/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java
@@ -179,7 +179,7 @@ public class BoundFilterBenchmark
final GenericIndexed<ByteBuffer> dictionaryUtf8 =
GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i ->
ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
);
selector = new MockColumnIndexSelector(
bitmapFactory,
diff --git
a/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java
b/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java
index 282b25e198..1806f28ad1 100644
---
a/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java
+++
b/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java
@@ -101,7 +101,7 @@ public class DictionaryEncodedStringIndexSupplierBenchmark
final GenericIndexed<ByteBuffer> dictionaryUtf8 =
GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i ->
ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps =
GenericIndexed.fromIterable(
() -> IntStream.range(0, dictionarySize)
diff --git
a/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java
b/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java
index f01b09a79f..34a35b559a 100644
---
a/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java
+++
b/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java
@@ -130,7 +130,7 @@ public class DimensionPredicateFilterBenchmark
final GenericIndexed<ByteBuffer> dictionaryUtf8 =
GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i ->
ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps =
GenericIndexed.fromIterable(
FluentIterable.from(ints)
diff --git
a/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java
b/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java
index 2dba1ba5c0..3065663065 100644
---
a/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java
+++
b/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java
@@ -174,7 +174,7 @@ public class FrontCodedIndexedBenchmark
genericIndexed = GenericIndexed.read(
byteBufferGeneric,
- GenericIndexed.BYTE_BUFFER_STRATEGY,
+ GenericIndexed.UTF8_STRATEGY,
SmooshedFileMapper.load(smooshDirFrontCoded)
);
frontCodedIndexed = FrontCodedIndexed.read(
diff --git
a/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java
b/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java
index 09c3253af0..0a97367493 100644
--- a/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java
+++ b/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java
@@ -93,7 +93,7 @@ public class InFilterBenchmark
final GenericIndexed<ByteBuffer> dictionaryUtf8 =
GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i ->
ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps =
GenericIndexed.fromIterable(
() -> IntStream.range(0, dictionarySize)
diff --git
a/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java
b/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java
index 1369d69787..bb85422792 100644
---
a/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java
+++
b/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java
@@ -130,7 +130,7 @@ public class LikeFilterBenchmark
final GenericIndexed<ByteBuffer> dictionaryUtf8 =
GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i ->
ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps =
GenericIndexed.fromIterable(
FluentIterable.from(ints)
diff --git
a/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java
b/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java
index 8209049b84..fb67c32383 100644
--- a/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java
+++ b/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java
@@ -47,7 +47,7 @@ public class ByteBufferUtils
// null if unmap is supported
private static final RuntimeException UNMAP_NOT_SUPPORTED_EXCEPTION;
- private static final Comparator<ByteBuffer> COMPARATOR_UNSIGNED = new
UnsignedByteBufferComparator();
+ private static final Comparator<ByteBuffer> COMPARATOR_UTF8 = new
Utf8ByteBufferComparator();
static {
Object unmap = null;
@@ -214,40 +214,12 @@ public class ByteBufferUtils
}
/**
- * Compares two ByteBuffer ranges using unsigned byte ordering.
+ * Compares two ByteBuffers from their positions to their limits using
ordering consistent with
+ * {@link String#compareTo(String)}. Null buffers are accepted, and are
ordered earlier than any nonnull buffer.
*
- * Different from {@link ByteBuffer#compareTo}, which uses signed ordering.
+ * Different from {@link ByteBuffer#compareTo}, which uses signed-bytes
ordering.
*/
- public static int compareByteBuffers(
- final ByteBuffer buf1,
- final int position1,
- final int length1,
- final ByteBuffer buf2,
- final int position2,
- final int length2
- )
- {
- final int commonLength = Math.min(length1, length2);
-
- for (int i = 0; i < commonLength; i++) {
- final byte byte1 = buf1.get(position1 + i);
- final byte byte2 = buf2.get(position2 + i);
- final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison
- if (cmp != 0) {
- return cmp;
- }
- }
-
- return Integer.compare(length1, length2);
- }
-
- /**
- * Compares two ByteBuffers from their positions to their limits using
unsigned byte ordering. Accepts null
- * buffers, which are ordered earlier than any nonnull buffer.
- *
- * Different from {@link ByteBuffer#compareTo}, which uses signed ordering.
- */
- public static int compareByteBuffers(
+ public static int compareUtf8ByteBuffers(
@Nullable final ByteBuffer buf1,
@Nullable final ByteBuffer buf2
)
@@ -260,7 +232,7 @@ public class ByteBufferUtils
return 1;
}
- return ByteBufferUtils.compareByteBuffers(
+ return StringUtils.compareUtf8UsingJavaStringOrdering(
buf1,
buf1.position(),
buf1.remaining(),
@@ -271,20 +243,20 @@ public class ByteBufferUtils
}
/**
- * Comparator that compares two {@link ByteBuffer} using unsigned ordering.
Null buffers are accepted, and
- * are ordered earlier than any nonnull buffer.
+ * Comparator that compares two {@link ByteBuffer} using ordering consistent
with {@link String#compareTo(String)}.
+ * Null buffers are accepted, and are ordered earlier than any nonnull
buffer.
*/
- public static Comparator<ByteBuffer> unsignedComparator()
+ public static Comparator<ByteBuffer> utf8Comparator()
{
- return COMPARATOR_UNSIGNED;
+ return COMPARATOR_UTF8;
}
- private static class UnsignedByteBufferComparator implements
Comparator<ByteBuffer>
+ private static class Utf8ByteBufferComparator implements
Comparator<ByteBuffer>
{
@Override
public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2)
{
- return ByteBufferUtils.compareByteBuffers(o1, o2);
+ return compareUtf8ByteBuffers(o1, o2);
}
}
}
diff --git
a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java
b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java
index bd17f42c40..41078961a0 100644
--- a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java
+++ b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java
@@ -77,6 +77,148 @@ public class StringUtils
return string == null ? EMPTY_BYTES : toUtf8(string);
}
+ /**
+ * Compares two Java Strings in Unicode code-point order.
+ *
+ * Order is consistent with {@link #compareUtf8(byte[], byte[])}, but is not
consistent with
+ * {@link String#compareTo(String)}.
+ */
+ public static int compareUnicode(final String a, final String b)
+ {
+ final int commonLength = Math.min(a.length(), b.length());
+
+ for (int i = 0; i < commonLength; i++) {
+ int char1 = a.charAt(i) & 0xFFFF; // Unsigned
+ int char2 = b.charAt(i) & 0xFFFF; // Unsigned
+
+ if (char1 != char2 && char1 >= 0xd800 && char2 >= 0xd800) {
+ // Fixup logic for code units at or above the surrogate range, based
on logic described at
+ // https://www.icu-project.org/docs/papers/utf16_code_point_order.html.
+ //
+ // If both code units are at or above the surrogate range (>= 0xd800)
then adjust non-surrogates (legitimate
+ // single-code-unit characters) to be below the surrogate range, so
they compare earlier than surrogates.
+
+ if (!Character.isSurrogate((char) char1)) {
+ char1 -= 0x2800;
+ }
+
+ if (!Character.isSurrogate((char) char2)) {
+ char2 -= 0x2800;
+ }
+ }
+
+ final int cmp = char1 - char2;
+ if (cmp != 0) {
+ return cmp;
+ }
+ }
+
+ return Integer.compare(a.length(), b.length());
+ }
+
+ /**
+ * Compares two UTF-8 byte strings in Unicode code-point order.
+ *
+ * Equivalent to a comparison of the two byte arrays as if they were
unsigned bytes.
+ *
+ * Order is consistent with {@link #compareUnicode(String, String)}, but is
not consistent with
+ * {@link String#compareTo(String)}. For an ordering consistent with {@link
String#compareTo(String)}, use
+ * {@link #compareUtf8UsingJavaStringOrdering(byte[], byte[])} instead.
+ */
+ public static int compareUtf8(final byte[] a, final byte[] b)
+ {
+ final int commonLength = Math.min(a.length, b.length);
+
+ for (int i = 0; i < commonLength; i++) {
+ final byte byte1 = a[i];
+ final byte byte2 = b[i];
+ final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison
+ if (cmp != 0) {
+ return cmp;
+ }
+ }
+
+ return Integer.compare(a.length, b.length);
+ }
+
+ /**
+ * Compares two UTF-8 byte strings in UTF-16 code-unit order.
+ *
+ * Order is consistent with {@link String#compareTo(String)}, but is not
consistent with
+ * {@link #compareUnicode(String, String)} or {@link #compareUtf8(byte[],
byte[])}.
+ */
+ public static int compareUtf8UsingJavaStringOrdering(final byte[] a, final
byte[] b)
+ {
+ final int commonLength = Math.min(a.length, b.length);
+
+ for (int i = 0; i < commonLength; i++) {
+ final int cmp = compareUtf8UsingJavaStringOrdering(a[i], b[i]);
+ if (cmp != 0) {
+ return cmp;
+ }
+ }
+
+ return Integer.compare(a.length, b.length);
+ }
+
+ /**
+ * Compares two UTF-8 byte strings in UTF-16 code-unit order.
+ *
+ * Order is consistent with {@link String#compareTo(String)}, but is not
consistent with
+ * {@link #compareUnicode(String, String)} or {@link #compareUtf8(byte[],
byte[])}.
+ */
+ public static int compareUtf8UsingJavaStringOrdering(
+ final ByteBuffer buf1,
+ final int position1,
+ final int length1,
+ final ByteBuffer buf2,
+ final int position2,
+ final int length2
+ )
+ {
+ final int commonLength = Math.min(length1, length2);
+
+ for (int i = 0; i < commonLength; i++) {
+ final int cmp = compareUtf8UsingJavaStringOrdering(buf1.get(position1 +
i), buf2.get(position2 + i));
+ if (cmp != 0) {
+ return cmp;
+ }
+ }
+
+ return Integer.compare(length1, length2);
+ }
+
+ /**
+ * Compares two bytes from UTF-8 strings in such a way that the entire byte
arrays are compared in UTF-16
+ * code-unit order.
+ *
+ * Compatible with {@link #compareUtf8UsingJavaStringOrdering(byte[],
byte[])} and
+ * {@link #compareUtf8UsingJavaStringOrdering(ByteBuffer, int, int,
ByteBuffer, int, int)}.
+ */
+ public static int compareUtf8UsingJavaStringOrdering(byte byte1, byte byte2)
+ {
+ // Treat as unsigned bytes.
+ int ubyte1 = byte1 & 0xFF;
+ int ubyte2 = byte2 & 0xFF;
+
+ if (ubyte1 != ubyte2 && ubyte1 >= 0xEE && ubyte2 >= 0xEE) {
+ // Fixup logic for lead bytes for U+E000 ... U+FFFF, based on logic
described at
+ // https://www.icu-project.org/docs/papers/utf16_code_point_order.html.
+ //
+ // Move possible lead bytes for this range (0xEE and 0xEF) above all
other bytes, so they compare later.
+
+ if (ubyte1 == 0xEE || ubyte1 == 0xEF) {
+ ubyte1 += 0xFF;
+ }
+
+ if (ubyte2 == 0xEE || ubyte2 == 0xEF) {
+ ubyte2 += 0xFF;
+ }
+ }
+
+ return ubyte1 - ubyte2;
+ }
+
public static String fromUtf8(final byte[] bytes)
{
try {
diff --git
a/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java
b/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java
index f5acb6e030..c32f29bfdd 100644
---
a/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java
+++
b/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java
@@ -19,6 +19,7 @@
package org.apache.druid.java.util.common;
+import com.google.common.collect.ImmutableList;
import com.google.common.io.Files;
import org.apache.druid.collections.ResourceHolder;
import org.hamcrest.MatcherAssert;
@@ -36,9 +37,28 @@ import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.util.Arrays;
import java.util.Comparator;
+import java.util.List;
public class ByteBufferUtilsTest
{
+ private static final List<String> COMPARE_TEST_STRINGS = ImmutableList.of(
+ "(請參見已被刪除版本)",
+ "請參見已被刪除版本",
+ "שָׁלוֹם",
+ "+{{[[Template:別名重定向|別名重定向]]}}",
+ "\uD83D\uDC4D\uD83D\uDC4D\uD83D\uDC4D",
+ "\uD83D\uDCA9",
+ "",
+ "f",
+ "fo",
+ "\uD83D\uDE42",
+ "\uD83E\uDEE5",
+ "\uD83E\uDD20",
+ "quick",
+ "brown",
+ "fox"
+ );
+
@Rule
public TemporaryFolder temporaryFolder = new TemporaryFolder();
@@ -82,9 +102,9 @@ public class ByteBufferUtilsTest
@Test
@SuppressWarnings("EqualsWithItself")
- public void testUnsignedComparator()
+ public void testUtf8Comparator()
{
- final Comparator<ByteBuffer> comparator =
ByteBufferUtils.unsignedComparator();
+ final Comparator<ByteBuffer> comparator = ByteBufferUtils.utf8Comparator();
// Tests involving null
MatcherAssert.assertThat(comparator.compare(null, null),
Matchers.equalTo(0));
@@ -112,18 +132,33 @@ public class ByteBufferUtilsTest
Matchers.greaterThan(0)
);
- // Tests involving the full range of bytes
- for (byte i = Byte.MIN_VALUE; i < Byte.MAX_VALUE; i++) {
- for (byte j = Byte.MIN_VALUE; j < Byte.MAX_VALUE; j++) {
- final int cmp = Integer.compare(Byte.toUnsignedInt(i),
Byte.toUnsignedInt(j));
+ for (final String string1 : COMPARE_TEST_STRINGS) {
+ for (final String string2 : COMPARE_TEST_STRINGS) {
+ final byte[] utf8Bytes1 = StringUtils.toUtf8(string1);
+ final byte[] utf8Bytes2 = StringUtils.toUtf8(string2);
+ final ByteBuffer utf8ByteBuffer1 =
ByteBuffer.allocate(utf8Bytes1.length + 2);
+ final ByteBuffer utf8ByteBuffer2 =
ByteBuffer.allocate(utf8Bytes2.length + 2);
+ utf8ByteBuffer1.position(1);
+ utf8ByteBuffer1.put(utf8Bytes1, 0,
utf8Bytes1.length).position(utf8Bytes1.length);
+ utf8ByteBuffer1.position(1).limit(1 + utf8Bytes1.length);
+ utf8ByteBuffer2.position(1);
+ utf8ByteBuffer2.put(utf8Bytes2, 0,
utf8Bytes2.length).position(utf8Bytes2.length);
+ utf8ByteBuffer2.position(1).limit(1 + utf8Bytes2.length);
+
+ final int compareByteBufferUtilsUtf8 =
ByteBufferUtils.utf8Comparator().compare(
+ utf8ByteBuffer1,
+ utf8ByteBuffer2
+ );
- MatcherAssert.assertThat(
- StringUtils.format("comparison of %s to %s",
Byte.toUnsignedInt(i), Byte.toUnsignedInt(j)),
- comparator.compare(
- ByteBuffer.wrap(new byte[]{i}),
- ByteBuffer.wrap(new byte[]{j})
+ Assert.assertEquals(
+ StringUtils.format(
+ "compareByteBufferUtilsUtf8(byte[]) (actual) "
+ + "matches compareJavaString (expected) for [%s] vs [%s]",
+ string1,
+ string2
),
- cmp < 0 ? Matchers.lessThan(0) : cmp > 0 ? Matchers.greaterThan(0)
: Matchers.equalTo(0)
+ (int) Math.signum(string1.compareTo(string2)),
+ (int) Math.signum(compareByteBufferUtilsUtf8)
);
}
}
diff --git
a/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java
b/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java
index 754e7237c7..3f2d5713c2 100644
--- a/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java
+++ b/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java
@@ -19,6 +19,7 @@
package org.apache.druid.java.util.common;
+import com.google.common.collect.ImmutableList;
import org.apache.druid.collections.ResourceHolder;
import org.junit.Assert;
import org.junit.Rule;
@@ -28,12 +29,31 @@ import org.junit.rules.ExpectedException;
import java.io.UnsupportedEncodingException;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
+import java.util.List;
/**
*
*/
public class StringUtilsTest
{
+ private static final List<String> COMPARE_TEST_STRINGS = ImmutableList.of(
+ "(請參見已被刪除版本)",
+ "請參見已被刪除版本",
+ "שָׁלוֹם",
+ "+{{[[Template:別名重定向|別名重定向]]}}",
+ "\uD83D\uDC4D\uD83D\uDC4D\uD83D\uDC4D",
+ "\uD83D\uDCA9",
+ "",
+ "f",
+ "fo",
+ "\uD83D\uDE42",
+ "\uD83E\uDEE5",
+ "\uD83E\uDD20",
+ "quick",
+ "brown",
+ "fox"
+ );
+
@Rule
public ExpectedException expectedException = ExpectedException.none();
@@ -290,4 +310,80 @@ public class StringUtilsTest
Assert.assertEquals("smile ", StringUtils.fastLooseChop("smile 🙂 for the
camera", 6));
Assert.assertEquals("smile", StringUtils.fastLooseChop("smile 🙂 for the
camera", 5));
}
+
+ @Test
+ public void testUnicodeStringCompare()
+ {
+ for (final String string1 : COMPARE_TEST_STRINGS) {
+ for (final String string2 : COMPARE_TEST_STRINGS) {
+ final int compareUnicode = StringUtils.compareUnicode(string1,
string2);
+ final int compareUtf8 = StringUtils.compareUtf8(
+ StringUtils.toUtf8(string1),
+ StringUtils.toUtf8(string2)
+ );
+
+ Assert.assertEquals(
+ StringUtils.format(
+ "compareUnicode (actual) matches compareUtf8 (expected) for
[%s] vs [%s]",
+ string1,
+ string2
+ ),
+ (int) Math.signum(compareUtf8),
+ (int) Math.signum(compareUnicode)
+ );
+ }
+ }
+ }
+
+ @Test
+ public void testJavaStringCompare()
+ {
+ for (final String string1 : COMPARE_TEST_STRINGS) {
+ for (final String string2 : COMPARE_TEST_STRINGS) {
+ final int compareJavaString = string1.compareTo(string2);
+
+ final byte[] utf8Bytes1 = StringUtils.toUtf8(string1);
+ final byte[] utf8Bytes2 = StringUtils.toUtf8(string2);
+ final int compareByteArrayUtf8UsingJavaStringOrdering =
+ StringUtils.compareUtf8UsingJavaStringOrdering(utf8Bytes1,
utf8Bytes2);
+
+ final ByteBuffer utf8ByteBuffer1 =
ByteBuffer.allocate(utf8Bytes1.length + 2);
+ final ByteBuffer utf8ByteBuffer2 =
ByteBuffer.allocate(utf8Bytes2.length + 2);
+ utf8ByteBuffer1.position(1);
+ utf8ByteBuffer1.put(utf8Bytes1, 0,
utf8Bytes1.length).position(utf8Bytes1.length);
+ utf8ByteBuffer2.position(1);
+ utf8ByteBuffer2.put(utf8Bytes2, 0,
utf8Bytes2.length).position(utf8Bytes2.length);
+ final int compareByteBufferUtf8UsingJavaStringOrdering =
StringUtils.compareUtf8UsingJavaStringOrdering(
+ utf8ByteBuffer1,
+ 1,
+ utf8Bytes1.length,
+ utf8ByteBuffer2,
+ 1,
+ utf8Bytes2.length
+ );
+
+ Assert.assertEquals(
+ StringUtils.format(
+ "compareUtf8UsingJavaStringOrdering(byte[]) (actual) "
+ + "matches compareJavaString (expected) for [%s] vs [%s]",
+ string1,
+ string2
+ ),
+ (int) Math.signum(compareJavaString),
+ (int) Math.signum(compareByteArrayUtf8UsingJavaStringOrdering)
+ );
+
+ Assert.assertEquals(
+ StringUtils.format(
+ "compareByteBufferUtf8UsingJavaStringOrdering(ByteBuffer)
(actual) "
+ + "matches compareJavaString (expected) for [%s] vs [%s]",
+ string1,
+ string2
+ ),
+ (int) Math.signum(compareJavaString),
+ (int) Math.signum(compareByteBufferUtf8UsingJavaStringOrdering)
+ );
+ }
+ }
+ }
}
diff --git
a/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java
b/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java
index 961e99a3f0..ac6d9bfe65 100644
---
a/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java
+++
b/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java
@@ -205,7 +205,7 @@ public class FrameWriterUtils
/**
* Copies "len" bytes from {@code src.position()} to "dstPosition" in
"memory". Does not update the position of src.
*
- * @throws InvalidNullByteException "allowNullBytes" is true and a null byte
is encountered
+ * @throws InvalidNullByteException if "allowNullBytes" is false and a null
byte is encountered
*/
public static void copyByteBufferToMemory(
final ByteBuffer src,
diff --git
a/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java
b/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java
index 2dec044cf1..afddb0e42a 100644
--- a/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java
+++ b/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java
@@ -674,7 +674,7 @@ public class InDimFilter extends
AbstractOptimizableDimFilter implements Filter
public SortedSet<ByteBuffer> toUtf8()
{
- final TreeSet<ByteBuffer> valuesUtf8 = new
TreeSet<>(ByteBufferUtils.unsignedComparator());
+ final TreeSet<ByteBuffer> valuesUtf8 = new
TreeSet<>(ByteBufferUtils.utf8Comparator());
for (final String value : values) {
if (value == null) {
diff --git
a/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java
b/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java
index 58e228ad2c..4fdcc5c6f3 100644
---
a/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java
+++
b/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java
@@ -47,9 +47,15 @@ public class StringComparators
public static final int STRLEN_CACHE_ID = 0x04;
public static final int VERSION_CACHE_ID = 0x05;
+ /**
+ * Comparison using the natural comparator of {@link String}.
+ *
+ * Note that this is not equivalent to comparing UTF-8 byte arrays; see
javadocs for
+ * {@link
org.apache.druid.java.util.common.StringUtils#compareUnicode(String, String)}
and
+ * {@link
org.apache.druid.java.util.common.StringUtils#compareUtf8UsingJavaStringOrdering(byte[],
byte[])}.
+ */
public static class LexicographicComparator extends StringComparator
{
- // Equivalent to comparing UTF-8 encoded strings as byte arrays.
private static final Ordering<String> ORDERING =
Ordering.from(String::compareTo).nullsFirst();
@Override
diff --git a/processing/src/main/java/org/apache/druid/segment/IndexIO.java
b/processing/src/main/java/org/apache/druid/segment/IndexIO.java
index 9698ebdc2b..9b74f71768 100644
--- a/processing/src/main/java/org/apache/druid/segment/IndexIO.java
+++ b/processing/src/main/java/org/apache/druid/segment/IndexIO.java
@@ -379,7 +379,7 @@ public class IndexIO
// Duplicate the first buffer since we are reading the dictionary
twice.
dimValueLookups.put(dimension,
GenericIndexed.read(dimBuffer.duplicate(), GenericIndexed.STRING_STRATEGY));
- dimValueUtf8Lookups.put(dimension, GenericIndexed.read(dimBuffer,
GenericIndexed.BYTE_BUFFER_STRATEGY));
+ dimValueUtf8Lookups.put(dimension, GenericIndexed.read(dimBuffer,
GenericIndexed.UTF8_STRATEGY));
dimColumns.put(dimension,
VSizeColumnarMultiInts.readFromByteBuffer(dimBuffer));
}
diff --git
a/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java
b/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java
index 5680a1400f..c568e78d9b 100644
---
a/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java
+++
b/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java
@@ -47,7 +47,7 @@ public final class IndexedUtf8ValueSetIndex<TDictionary
extends Indexed<ByteBuff
// sorted merge instead of binary-search based algorithm.
private static final double SORTED_MERGE_RATIO_THRESHOLD = 0.12D;
private static final int SIZE_WORTH_CHECKING_MIN = 8;
- private static final Comparator<ByteBuffer> COMPARATOR =
ByteBufferUtils.unsignedComparator();
+ private static final Comparator<ByteBuffer> COMPARATOR =
ByteBufferUtils.utf8Comparator();
private final BitmapFactory bitmapFactory;
private final TDictionary dictionary;
diff --git
a/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java
b/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java
index ef0d08ee0a..6598e36f20 100644
---
a/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java
+++
b/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java
@@ -29,7 +29,7 @@ public interface Utf8ValueSetIndex
/**
* Get an {@link Iterable} of {@link ImmutableBitmap} corresponding to the
specified set of values (if they are
* contained in the underlying column). The set must be sorted using
- * {@link
org.apache.druid.java.util.common.ByteBufferUtils#unsignedComparator()}.
+ * {@link
org.apache.druid.java.util.common.ByteBufferUtils#utf8Comparator()}.
*/
BitmapColumnIndex forSortedValuesUtf8(SortedSet<ByteBuffer> valuesUtf8);
}
diff --git
a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java
b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java
index d2d6c28d34..2596f7ec2b 100644
---
a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java
+++
b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java
@@ -23,6 +23,7 @@ import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.ISE;
+import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
import javax.annotation.Nullable;
@@ -340,10 +341,13 @@ public final class FrontCodedIndexed implements
Indexed<ByteBuffer>
/**
- * Performs an unsigned byte comparison of the first value in a bucket with
the specified value. Note that this method
+ * Performs byte-by-byte comparison of the first value in a bucket with the
specified value. Note that this method
* MUST be prepared before calling, as it expects the length of the first
value to have already been read externally,
* and the buffer position to be at the start of the first bucket value. The
final buffer position will be the
- * 'shared prefix length' of the first value in the bucket and the value to
compare
+ * 'shared prefix length' of the first value in the bucket and the value to
compare.
+ *
+ * Bytes are compared using {@link
StringUtils#compareUtf8UsingJavaStringOrdering(byte, byte)}. Therefore, when the
+ * values are UTF-8 encoded strings, the ordering is compatible with {@link
String#compareTo(String)}.
*/
private static int compareBucketFirstValue(ByteBuffer bucketBuffer, int
length, ByteBuffer value)
{
@@ -355,7 +359,7 @@ public final class FrontCodedIndexed implements
Indexed<ByteBuffer>
int sharedPrefix;
int comparison = 0;
for (sharedPrefix = 0; sharedPrefix < commonLength; sharedPrefix++) {
- comparison = unsignedByteCompare(bucketBuffer.get(),
value.get(sharedPrefix));
+ comparison =
StringUtils.compareUtf8UsingJavaStringOrdering(bucketBuffer.get(),
value.get(sharedPrefix));
if (comparison != 0) {
bucketBuffer.position(startOffset + sharedPrefix);
break;
@@ -403,7 +407,10 @@ public final class FrontCodedIndexed implements
Indexed<ByteBuffer>
final int common = Math.min(fragmentLength, value.remaining() -
prefixLength);
int fragmentComparison = 0;
for (int i = 0; i < common; i++) {
- fragmentComparison =
unsignedByteCompare(buffer.get(buffer.position() + i), value.get(prefixLength +
i));
+ fragmentComparison = StringUtils.compareUtf8UsingJavaStringOrdering(
+ buffer.get(buffer.position() + i),
+ value.get(prefixLength + i)
+ );
if (fragmentComparison != 0) {
break;
}
@@ -502,9 +509,4 @@ public final class FrontCodedIndexed implements
Indexed<ByteBuffer>
}
return bucketBuffers;
}
-
- public static int unsignedByteCompare(byte b1, byte b2)
- {
- return (b1 & 0xFF) - (b2 & 0xFF);
- }
}
diff --git
a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java
b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java
index b6120d6c12..bcbe47db62 100644
---
a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java
+++
b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java
@@ -44,8 +44,8 @@ import java.nio.channels.WritableByteChannel;
* the bucket is written entirely, and remaining values are stored as pairs of an integer which indicates how much
* of the first byte array of the bucket to use as a prefix, followed by the remaining value bytes after the prefix.
*
- * This is valid to use for any values which can be compared byte by byte with unsigned comparison. Otherwise, this
- * is not the collection for you.
+ * This writer is designed for use with UTF-8 encoded strings that are written in an order compatible with
+ * {@link String#compareTo(String)}.
*
* @see FrontCodedIndexed for additional details.
*/
@@ -99,7 +99,7 @@ public class FrontCodedIndexedWriter implements
DictionaryWriter<byte[]>
@Override
public void write(@Nullable byte[] value) throws IOException
{
- if (prevObject != null && unsignedCompare(prevObject, value) >= 0) {
+ if (prevObject != null &&
compareNullableUtf8UsingJavaStringOrdering(prevObject, value) >= 0) {
throw new ISE(
"Values must be sorted and unique. Element [%s] with value [%s] is
before or equivalent to [%s]",
numWritten,
@@ -283,7 +283,7 @@ public class FrontCodedIndexedWriter implements
DictionaryWriter<byte[]>
// all other values must be partitioned into a prefix length and
suffix bytes
int prefixLength = 0;
for (; prefixLength < first.length; prefixLength++) {
- final int cmp =
FrontCodedIndexed.unsignedByteCompare(first[prefixLength], next[prefixLength]);
+ final int cmp =
StringUtils.compareUtf8UsingJavaStringOrdering(first[prefixLength],
next[prefixLength]);
if (cmp != 0) {
break;
}
@@ -325,7 +325,11 @@ public class FrontCodedIndexedWriter implements
DictionaryWriter<byte[]>
return buffer.position() - pos;
}
- public static int unsignedCompare(
+ /**
+ * Same as {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])}, but accepts nulls. Nulls are
+ * sorted first.
+ */
+ private static int compareNullableUtf8UsingJavaStringOrdering(
@Nullable final byte[] b1,
@Nullable final byte[] b2
)
@@ -337,15 +341,7 @@ public class FrontCodedIndexedWriter implements
DictionaryWriter<byte[]>
if (b2 == null) {
return 1;
}
- final int commonLength = Math.min(b1.length, b2.length);
-
- for (int i = 0; i < commonLength; i++) {
- final int cmp = FrontCodedIndexed.unsignedByteCompare(b1[i], b2[i]);
- if (cmp != 0) {
- return cmp;
- }
- }
- return Integer.compare(b1.length, b2.length);
+ return StringUtils.compareUtf8UsingJavaStringOrdering(b1, b2);
}
}
diff --git
a/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java
b/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java
index ff1c570b03..62f50b0dc6 100644
--- a/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java
+++ b/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java
@@ -100,13 +100,16 @@ public class GenericIndexed<T> implements
CloseableIndexed<T>, Serializer
private static final SerializerUtils SERIALIZER_UTILS = new
SerializerUtils();
/**
- * An ObjectStrategy that returns a big-endian ByteBuffer pointing to the original data.
+ * An ObjectStrategy that returns a big-endian ByteBuffer pointing to original data.
*
* The returned ByteBuffer is a fresh read-only instance, so it is OK for callers to modify its position, limit, etc.
* However, it does point to the original data, so callers must take care not to use it if the original data may
* have been freed.
+ *
+ * The compare method of this instance uses {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])}
+ * so that behavior is consistent with {@link #STRING_STRATEGY}.
*/
- public static final ObjectStrategy<ByteBuffer> BYTE_BUFFER_STRATEGY = new
ObjectStrategy<ByteBuffer>()
+ public static final ObjectStrategy<ByteBuffer> UTF8_STRATEGY = new
ObjectStrategy<ByteBuffer>()
{
@Override
public Class<ByteBuffer> getClazz()
@@ -140,7 +143,7 @@ public class GenericIndexed<T> implements
CloseableIndexed<T>, Serializer
@Override
public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2)
{
- return ByteBufferUtils.unsignedComparator().compare(o1, o2);
+ return ByteBufferUtils.utf8Comparator().compare(o1, o2);
}
};
@@ -541,7 +544,7 @@ public class GenericIndexed<T> implements
CloseableIndexed<T>, Serializer
}
//noinspection ObjectEquality
- final boolean isByteBufferStrategy = strategy == BYTE_BUFFER_STRATEGY;
+ final boolean isByteBufferStrategy = strategy == UTF8_STRATEGY;
int minIndex = 0;
int maxIndex = size - 1;
@@ -553,7 +556,7 @@ public class GenericIndexed<T> implements
CloseableIndexed<T>, Serializer
if (isByteBufferStrategy) {
// Specialization avoids ByteBuffer allocation in
strategy.fromByteBuffer.
ByteBuffer currValue = getByteBuffer(currIndex);
- comparison = ByteBufferUtils.compareByteBuffers(currValue,
(ByteBuffer) value);
+ comparison = ByteBufferUtils.compareUtf8ByteBuffers(currValue,
(ByteBuffer) value);
} else {
T currValue = get(currIndex);
comparison = strategy.compare(currValue, value);
diff --git
a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java
b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java
index 8594cab8ca..39d8cf081b 100644
---
a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java
+++
b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java
@@ -107,7 +107,7 @@ public class NestedDataColumnSupplier implements
Supplier<ComplexColumn>
// this cannot happen naturally right now since generic indexed is
written in the 'legacy' format, but
// this provides backwards compatibility should we switch at some
point in the future to always
// writing dictionaryVersion
- dictionary = GenericIndexed.read(stringDictionaryBuffer,
GenericIndexed.BYTE_BUFFER_STRATEGY, mapper);
+ dictionary = GenericIndexed.read(stringDictionaryBuffer,
GenericIndexed.UTF8_STRATEGY, mapper);
frontCodedDictionarySupplier = null;
} else {
throw new ISE("impossible, unknown encoding strategy id: %s",
encodingId);
@@ -117,7 +117,7 @@ public class NestedDataColumnSupplier implements
Supplier<ComplexColumn>
// as dictionaryVersion is actually also the GenericIndexed version,
so we reset start position so the
// GenericIndexed version can be correctly read
stringDictionaryBuffer.position(dictionaryStartPosition);
- dictionary = GenericIndexed.read(stringDictionaryBuffer,
GenericIndexed.BYTE_BUFFER_STRATEGY, mapper);
+ dictionary = GenericIndexed.read(stringDictionaryBuffer,
GenericIndexed.UTF8_STRATEGY, mapper);
frontCodedDictionarySupplier = null;
}
final ByteBuffer longDictionaryBuffer = loadInternalFile(
diff --git
a/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java
b/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java
index ef130f5428..b1eef3307b 100644
---
a/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java
+++
b/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java
@@ -353,7 +353,7 @@ public class DictionaryEncodedColumnPartSerde implements
ColumnPartSerde
final GenericIndexed<ByteBuffer> rDictionaryUtf8 = GenericIndexed.read(
buffer,
- GenericIndexed.BYTE_BUFFER_STRATEGY,
+ GenericIndexed.UTF8_STRATEGY,
builder.getFileMapper()
);
diff --git
a/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java
b/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java
index f1bd478c95..c9e5613536 100644
---
a/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java
+++
b/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java
@@ -241,7 +241,9 @@ public class FrontCodedIndexedTest extends
InitializedNullHandlingTest
public void testFrontCodedIndexedUnicodes() throws IOException
{
ByteBuffer buffer = ByteBuffer.allocate(1 << 12).order(order);
- List<String> theList = ImmutableList.of("Győ-Moson-Sopron", "Győr");
+
+ // "\uD83D\uDCA9" and "(請參見已被刪除版本)" are a regression test for https://github.com/apache/druid/pull/13364
+ List<String> theList = ImmutableList.of("Győ-Moson-Sopron", "Győr",
"\uD83D\uDCA9", "(請參見已被刪除版本)");
fillBuffer(buffer, theList, 4);
buffer.position(0);
diff --git
a/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java
b/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java
index 268a4a2c97..eaf161036d 100644
---
a/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java
+++
b/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java
@@ -121,7 +121,7 @@ public class ExtractionDimFilterTest extends
InitializedNullHandlingTest
GenericIndexed.fromIterable(Collections.singletonList("foo1"),
GenericIndexed.STRING_STRATEGY),
GenericIndexed.fromIterable(
Collections.singletonList(ByteBuffer.wrap(StringUtils.toUtf8("foo1"))),
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
),
GenericIndexed.fromIterable(Collections.singletonList(foo1BitMap),
serdeFactory.getObjectStrategy()),
null
diff --git
a/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java
b/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java
index e10d64241f..f7525e3f9c 100644
---
a/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java
+++
b/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java
@@ -75,7 +75,7 @@ public class PredicateValueMatcherFactoryTest extends
InitializedNullHandlingTes
ByteBuffer.wrap(StringUtils.toUtf8("v2")),
ByteBuffer.wrap(StringUtils.toUtf8("v3"))
),
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
),
null,
() ->
VSizeColumnarMultiInts.fromIterable(ImmutableList.of(VSizeColumnarInts.fromArray(new
int[]{1}))),
@@ -98,7 +98,7 @@ public class PredicateValueMatcherFactoryTest extends
InitializedNullHandlingTes
ByteBuffer.wrap(StringUtils.toUtf8("v2")),
ByteBuffer.wrap(StringUtils.toUtf8("v3"))
),
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
),
null,
() ->
VSizeColumnarMultiInts.fromIterable(ImmutableList.of(VSizeColumnarInts.fromArray(new
int[]{1}))),
diff --git
a/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java
b/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java
index cb5459226b..98631d3dc9 100644
---
a/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java
+++
b/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java
@@ -49,7 +49,7 @@ public class ValueMatchersTest extends
InitializedNullHandlingTest
GenericIndexed.fromIterable(ImmutableList.of("value"),
GenericIndexed.STRING_STRATEGY),
GenericIndexed.fromIterable(
ImmutableList.of(ByteBuffer.wrap(StringUtils.toUtf8("value"))),
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
),
() -> VSizeColumnarInts.fromArray(new int[]{0}),
null,
@@ -62,7 +62,7 @@ public class ValueMatchersTest extends
InitializedNullHandlingTest
ByteBuffer.wrap(StringUtils.toUtf8("value")),
ByteBuffer.wrap(StringUtils.toUtf8("value2"))
),
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
),
() -> VSizeColumnarInts.fromArray(new int[]{0, 0, 1, 0, 1}),
null,
@@ -72,7 +72,7 @@ public class ValueMatchersTest extends
InitializedNullHandlingTest
GenericIndexed.fromIterable(ImmutableList.of("value"),
GenericIndexed.STRING_STRATEGY),
GenericIndexed.fromIterable(
ImmutableList.of(ByteBuffer.wrap(StringUtils.toUtf8("value"))),
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
),
null,
() -> VSizeColumnarMultiInts.fromIterable(
diff --git
a/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java
b/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java
index 6d66464eee..15b40b729f 100644
---
a/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java
+++
b/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java
@@ -127,7 +127,7 @@ public class NestedFieldLiteralColumnIndexSupplierTest
extends InitializedNullHa
doubleWriter.write(9.9);
writeToBuffer(doubleBuffer, doubleWriter);
- GenericIndexed<ByteBuffer> strings = GenericIndexed.read(stringBuffer,
GenericIndexed.BYTE_BUFFER_STRATEGY);
+ GenericIndexed<ByteBuffer> strings = GenericIndexed.read(stringBuffer,
GenericIndexed.UTF8_STRATEGY);
globalStrings = () -> strings.singleThreaded();
globalLongs = FixedIndexed.read(longBuffer, TypeStrategies.LONG,
ByteOrder.nativeOrder(), Long.BYTES);
globalDoubles = FixedIndexed.read(doubleBuffer, TypeStrategies.DOUBLE,
ByteOrder.nativeOrder(), Double.BYTES);
diff --git
a/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java
b/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java
index 36d5ba76a0..a9ab64a378 100644
---
a/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java
+++
b/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java
@@ -115,7 +115,7 @@ public class DictionaryEncodedStringIndexSupplierTest
extends InitializedNullHan
GenericIndexedWriter<ByteBuffer> byteBufferWriter = new
GenericIndexedWriter<>(
new OnHeapMemorySegmentWriteOutMedium(),
"byteBuffers",
- GenericIndexed.BYTE_BUFFER_STRATEGY
+ GenericIndexed.UTF8_STRATEGY
);
stringWriter.open();
@@ -167,7 +167,7 @@ public class DictionaryEncodedStringIndexSupplierTest
extends InitializedNullHan
return new DictionaryEncodedStringIndexSupplier(
roaringFactory.getBitmapFactory(),
GenericIndexed.read(stringBuffer, GenericIndexed.STRING_STRATEGY),
- GenericIndexed.read(byteBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY),
+ GenericIndexed.read(byteBuffer, GenericIndexed.UTF8_STRATEGY),
bitmaps,
null
);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]