This is an automated email from the ASF dual-hosted git repository.

gian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git


The following commit(s) were added to refs/heads/master by this push:
     new 78d0b0abce Add string comparison methods to StringUtils, fix dictionary comparisons. (#13364)
78d0b0abce is described below

commit 78d0b0abce2b27b88a53790684d81530a2318fc9
Author: Gian Merlino <[email protected]>
AuthorDate: Wed Nov 16 07:15:00 2022 -0800

    Add string comparison methods to StringUtils, fix dictionary comparisons. (#13364)
    
    * Add string comparison methods to StringUtils, fix dictionary comparisons.
    
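    There are various places in Druid code where we assume that String.compareTo
    is consistent with Unicode code-point ordering. Sadly this is not the case:
    String.compareTo compares UTF-16 code units, so supplementary characters
    (encoded as surrogate pairs, 0xD800-0xDFFF) sort before U+E000-U+FFFF even
    though their code points are higher.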
    
    To help deal with this, this patch introduces the following helpers:
    
    1) compareUnicode: Compares two Strings in Unicode code-point order.
    2) compareUtf8: Compares two UTF-8 byte arrays in Unicode code-point order.
       Equivalent to comparison as unsigned bytes.
    3) compareUtf8UsingJavaStringOrdering: Compares two UTF-8 byte arrays, or
       ByteBuffers, in a manner consistent with String.compareTo.
    
    There is no helper for comparing two Strings in a manner consistent
    with String.compareTo, because for that we can use compareTo directly.
    
    The patch also fixes an inconsistency between the String and UTF-8
    dictionary GenericIndexed flavors of string-typed columns: they were
    formerly using incompatible comparators.
    
    * Adjust test.
    
    * FrontCodedIndexed updates.
    
    * Add test.
    
    * Fix comments.
---
 .../druid/benchmark/BoundFilterBenchmark.java      |   2 +-
 ...tionaryEncodedStringIndexSupplierBenchmark.java |   2 +-
 .../DimensionPredicateFilterBenchmark.java         |   2 +-
 .../benchmark/FrontCodedIndexedBenchmark.java      |   2 +-
 .../apache/druid/benchmark/InFilterBenchmark.java  |   2 +-
 .../druid/benchmark/LikeFilterBenchmark.java       |   2 +-
 .../druid/java/util/common/ByteBufferUtils.java    |  52 ++------
 .../apache/druid/java/util/common/StringUtils.java | 142 +++++++++++++++++++++
 .../java/util/common/ByteBufferUtilsTest.java      |  59 +++++++--
 .../druid/java/util/common/StringUtilsTest.java    |  96 ++++++++++++++
 .../apache/druid/frame/write/FrameWriterUtils.java |   2 +-
 .../org/apache/druid/query/filter/InDimFilter.java |   2 +-
 .../druid/query/ordering/StringComparators.java    |   8 +-
 .../java/org/apache/druid/segment/IndexIO.java     |   2 +-
 .../segment/column/IndexedUtf8ValueSetIndex.java   |   2 +-
 .../druid/segment/column/Utf8ValueSetIndex.java    |   2 +-
 .../druid/segment/data/FrontCodedIndexed.java      |  20 +--
 .../segment/data/FrontCodedIndexedWriter.java      |  24 ++--
 .../apache/druid/segment/data/GenericIndexed.java  |  13 +-
 .../segment/nested/NestedDataColumnSupplier.java   |   4 +-
 .../serde/DictionaryEncodedColumnPartSerde.java    |   2 +-
 .../druid/segment/data/FrontCodedIndexedTest.java  |   4 +-
 .../segment/filter/ExtractionDimFilterTest.java    |   2 +-
 .../filter/PredicateValueMatcherFactoryTest.java   |   4 +-
 .../druid/segment/filter/ValueMatchersTest.java    |   6 +-
 .../NestedFieldLiteralColumnIndexSupplierTest.java |   2 +-
 .../DictionaryEncodedStringIndexSupplierTest.java  |   4 +-
 27 files changed, 359 insertions(+), 105 deletions(-)

diff --git 
a/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java 
b/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java
index cdb3cf2f7c..819c528198 100644
--- 
a/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java
+++ 
b/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java
@@ -179,7 +179,7 @@ public class BoundFilterBenchmark
     final GenericIndexed<ByteBuffer> dictionaryUtf8 = 
GenericIndexed.fromIterable(
         FluentIterable.from(ints)
                       .transform(i -> 
ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
-        GenericIndexed.BYTE_BUFFER_STRATEGY
+        GenericIndexed.UTF8_STRATEGY
     );
     selector = new MockColumnIndexSelector(
         bitmapFactory,
diff --git 
a/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java
 
b/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java
index 282b25e198..1806f28ad1 100644
--- 
a/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java
+++ 
b/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java
@@ -101,7 +101,7 @@ public class DictionaryEncodedStringIndexSupplierBenchmark
       final GenericIndexed<ByteBuffer> dictionaryUtf8 = 
GenericIndexed.fromIterable(
           FluentIterable.from(ints)
                         .transform(i -> 
ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
-          GenericIndexed.BYTE_BUFFER_STRATEGY
+          GenericIndexed.UTF8_STRATEGY
       );
       final GenericIndexed<ImmutableBitmap> bitmaps = 
GenericIndexed.fromIterable(
           () -> IntStream.range(0, dictionarySize)
diff --git 
a/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java
 
b/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java
index f01b09a79f..34a35b559a 100644
--- 
a/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java
+++ 
b/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java
@@ -130,7 +130,7 @@ public class DimensionPredicateFilterBenchmark
     final GenericIndexed<ByteBuffer> dictionaryUtf8 = 
GenericIndexed.fromIterable(
         FluentIterable.from(ints)
                       .transform(i -> 
ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
-        GenericIndexed.BYTE_BUFFER_STRATEGY
+        GenericIndexed.UTF8_STRATEGY
     );
     final GenericIndexed<ImmutableBitmap> bitmaps = 
GenericIndexed.fromIterable(
         FluentIterable.from(ints)
diff --git 
a/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java
 
b/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java
index 2dba1ba5c0..3065663065 100644
--- 
a/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java
+++ 
b/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java
@@ -174,7 +174,7 @@ public class FrontCodedIndexedBenchmark
 
     genericIndexed = GenericIndexed.read(
         byteBufferGeneric,
-        GenericIndexed.BYTE_BUFFER_STRATEGY,
+        GenericIndexed.UTF8_STRATEGY,
         SmooshedFileMapper.load(smooshDirFrontCoded)
     );
     frontCodedIndexed = FrontCodedIndexed.read(
diff --git 
a/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java 
b/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java
index 09c3253af0..0a97367493 100644
--- a/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java
+++ b/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java
@@ -93,7 +93,7 @@ public class InFilterBenchmark
     final GenericIndexed<ByteBuffer> dictionaryUtf8 = 
GenericIndexed.fromIterable(
         FluentIterable.from(ints)
                       .transform(i -> 
ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
-        GenericIndexed.BYTE_BUFFER_STRATEGY
+        GenericIndexed.UTF8_STRATEGY
     );
     final GenericIndexed<ImmutableBitmap> bitmaps = 
GenericIndexed.fromIterable(
         () -> IntStream.range(0, dictionarySize)
diff --git 
a/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java 
b/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java
index 1369d69787..bb85422792 100644
--- 
a/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java
+++ 
b/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java
@@ -130,7 +130,7 @@ public class LikeFilterBenchmark
     final GenericIndexed<ByteBuffer> dictionaryUtf8 = 
GenericIndexed.fromIterable(
         FluentIterable.from(ints)
                       .transform(i -> 
ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
-        GenericIndexed.BYTE_BUFFER_STRATEGY
+        GenericIndexed.UTF8_STRATEGY
     );
     final GenericIndexed<ImmutableBitmap> bitmaps = 
GenericIndexed.fromIterable(
         FluentIterable.from(ints)
diff --git 
a/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java 
b/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java
index 8209049b84..fb67c32383 100644
--- a/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java
+++ b/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java
@@ -47,7 +47,7 @@ public class ByteBufferUtils
   // null if unmap is supported
   private static final RuntimeException UNMAP_NOT_SUPPORTED_EXCEPTION;
 
-  private static final Comparator<ByteBuffer> COMPARATOR_UNSIGNED = new 
UnsignedByteBufferComparator();
+  private static final Comparator<ByteBuffer> COMPARATOR_UTF8 = new 
Utf8ByteBufferComparator();
 
   static {
     Object unmap = null;
@@ -214,40 +214,12 @@ public class ByteBufferUtils
   }
 
   /**
-   * Compares two ByteBuffer ranges using unsigned byte ordering.
+   * Compares two ByteBuffers from their positions to their limits using 
ordering consistent with
+   * {@link String#compareTo(String)}. Null buffers are accepted, and are 
ordered earlier than any nonnull buffer.
    *
-   * Different from {@link ByteBuffer#compareTo}, which uses signed ordering.
+   * Different from {@link ByteBuffer#compareTo}, which uses signed-bytes 
ordering.
    */
-  public static int compareByteBuffers(
-      final ByteBuffer buf1,
-      final int position1,
-      final int length1,
-      final ByteBuffer buf2,
-      final int position2,
-      final int length2
-  )
-  {
-    final int commonLength = Math.min(length1, length2);
-
-    for (int i = 0; i < commonLength; i++) {
-      final byte byte1 = buf1.get(position1 + i);
-      final byte byte2 = buf2.get(position2 + i);
-      final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison
-      if (cmp != 0) {
-        return cmp;
-      }
-    }
-
-    return Integer.compare(length1, length2);
-  }
-
-  /**
-   * Compares two ByteBuffers from their positions to their limits using 
unsigned byte ordering. Accepts null
-   * buffers, which are ordered earlier than any nonnull buffer.
-   *
-   * Different from {@link ByteBuffer#compareTo}, which uses signed ordering.
-   */
-  public static int compareByteBuffers(
+  public static int compareUtf8ByteBuffers(
       @Nullable final ByteBuffer buf1,
       @Nullable final ByteBuffer buf2
   )
@@ -260,7 +232,7 @@ public class ByteBufferUtils
       return 1;
     }
 
-    return ByteBufferUtils.compareByteBuffers(
+    return StringUtils.compareUtf8UsingJavaStringOrdering(
         buf1,
         buf1.position(),
         buf1.remaining(),
@@ -271,20 +243,20 @@ public class ByteBufferUtils
   }
 
   /**
-   * Comparator that compares two {@link ByteBuffer} using unsigned ordering. 
Null buffers are accepted, and
-   * are ordered earlier than any nonnull buffer.
+   * Comparator that compares two {@link ByteBuffer} using ordering consistent 
with {@link String#compareTo(String)}.
+   * Null buffers are accepted, and are ordered earlier than any nonnull 
buffer.
    */
-  public static Comparator<ByteBuffer> unsignedComparator()
+  public static Comparator<ByteBuffer> utf8Comparator()
   {
-    return COMPARATOR_UNSIGNED;
+    return COMPARATOR_UTF8;
   }
 
-  private static class UnsignedByteBufferComparator implements 
Comparator<ByteBuffer>
+  private static class Utf8ByteBufferComparator implements 
Comparator<ByteBuffer>
   {
     @Override
     public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2)
     {
-      return ByteBufferUtils.compareByteBuffers(o1, o2);
+      return compareUtf8ByteBuffers(o1, o2);
     }
   }
 }
diff --git 
a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java 
b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java
index bd17f42c40..41078961a0 100644
--- a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java
+++ b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java
@@ -77,6 +77,148 @@ public class StringUtils
     return string == null ? EMPTY_BYTES : toUtf8(string);
   }
 
+  /**
+   * Compares two Java Strings in Unicode code-point order.
+   *
+   * Order is consistent with {@link #compareUtf8(byte[], byte[])}, but is not 
consistent with
+   * {@link String#compareTo(String)}.
+   */
+  public static int compareUnicode(final String a, final String b)
+  {
+    final int commonLength = Math.min(a.length(), b.length());
+
+    for (int i = 0; i < commonLength; i++) {
+      int char1 = a.charAt(i) & 0xFFFF; // Unsigned
+      int char2 = b.charAt(i) & 0xFFFF; // Unsigned
+
+      if (char1 != char2 && char1 >= 0xd800 && char2 >= 0xd800) {
+        // Fixup logic for code units at or above the surrogate range, based 
on logic described at
+        // https://www.icu-project.org/docs/papers/utf16_code_point_order.html.
+        //
+        // If both code units are at or above the surrogate range (>= 0xd800) 
then adjust non-surrogates (legitimate
+        // single-code-unit characters) to be below the surrogate range, so 
they compare earlier than surrogates.
+
+        if (!Character.isSurrogate((char) char1)) {
+          char1 -= 0x2800;
+        }
+
+        if (!Character.isSurrogate((char) char2)) {
+          char2 -= 0x2800;
+        }
+      }
+
+      final int cmp = char1 - char2;
+      if (cmp != 0) {
+        return cmp;
+      }
+    }
+
+    return Integer.compare(a.length(), b.length());
+  }
+
+  /**
+   * Compares two UTF-8 byte strings in Unicode code-point order.
+   *
+   * Equivalent to a comparison of the two byte arrays as if they were 
unsigned bytes.
+   *
+   * Order is consistent with {@link #compareUnicode(String, String)}, but is 
not consistent with
+   * {@link String#compareTo(String)}. For an ordering consistent with {@link 
String#compareTo(String)}, use
+   * {@link #compareUtf8UsingJavaStringOrdering(byte[], byte[])} instead.
+   */
+  public static int compareUtf8(final byte[] a, final byte[] b)
+  {
+    final int commonLength = Math.min(a.length, b.length);
+
+    for (int i = 0; i < commonLength; i++) {
+      final byte byte1 = a[i];
+      final byte byte2 = b[i];
+      final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison
+      if (cmp != 0) {
+        return cmp;
+      }
+    }
+
+    return Integer.compare(a.length, b.length);
+  }
+
+  /**
+   * Compares two UTF-8 byte strings in UTF-16 code-unit order.
+   *
+   * Order is consistent with {@link String#compareTo(String)}, but is not 
consistent with
+   * {@link #compareUnicode(String, String)} or {@link #compareUtf8(byte[], 
byte[])}.
+   */
+  public static int compareUtf8UsingJavaStringOrdering(final byte[] a, final 
byte[] b)
+  {
+    final int commonLength = Math.min(a.length, b.length);
+
+    for (int i = 0; i < commonLength; i++) {
+      final int cmp = compareUtf8UsingJavaStringOrdering(a[i], b[i]);
+      if (cmp != 0) {
+        return cmp;
+      }
+    }
+
+    return Integer.compare(a.length, b.length);
+  }
+
+  /**
+   * Compares two UTF-8 byte strings in UTF-16 code-unit order.
+   *
+   * Order is consistent with {@link String#compareTo(String)}, but is not 
consistent with
+   * {@link #compareUnicode(String, String)} or {@link #compareUtf8(byte[], 
byte[])}.
+   */
+  public static int compareUtf8UsingJavaStringOrdering(
+      final ByteBuffer buf1,
+      final int position1,
+      final int length1,
+      final ByteBuffer buf2,
+      final int position2,
+      final int length2
+  )
+  {
+    final int commonLength = Math.min(length1, length2);
+
+    for (int i = 0; i < commonLength; i++) {
+      final int cmp = compareUtf8UsingJavaStringOrdering(buf1.get(position1 + 
i), buf2.get(position2 + i));
+      if (cmp != 0) {
+        return cmp;
+      }
+    }
+
+    return Integer.compare(length1, length2);
+  }
+
+  /**
+   * Compares two bytes from UTF-8 strings in such a way that the entire byte 
arrays are compared in UTF-16
+   * code-unit order.
+   *
+   * Compatible with {@link #compareUtf8UsingJavaStringOrdering(byte[], 
byte[])} and
+   * {@link #compareUtf8UsingJavaStringOrdering(ByteBuffer, int, int, 
ByteBuffer, int, int)}.
+   */
+  public static int compareUtf8UsingJavaStringOrdering(byte byte1, byte byte2)
+  {
+    // Treat as unsigned bytes.
+    int ubyte1 = byte1 & 0xFF;
+    int ubyte2 = byte2 & 0xFF;
+
+    if (ubyte1 != ubyte2 && ubyte1 >= 0xEE && ubyte2 >= 0xEE) {
+      // Fixup logic for lead bytes for U+E000 ... U+FFFF, based on logic 
described at
+      // https://www.icu-project.org/docs/papers/utf16_code_point_order.html.
+      //
+      // Move possible lead bytes for this range (0xEE and 0xEF) above all 
other bytes, so they compare later.
+
+      if (ubyte1 == 0xEE || ubyte1 == 0xEF) {
+        ubyte1 += 0xFF;
+      }
+
+      if (ubyte2 == 0xEE || ubyte2 == 0xEF) {
+        ubyte2 += 0xFF;
+      }
+    }
+
+    return ubyte1 - ubyte2;
+  }
+
   public static String fromUtf8(final byte[] bytes)
   {
     try {
diff --git 
a/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java 
b/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java
index f5acb6e030..c32f29bfdd 100644
--- 
a/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java
+++ 
b/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java
@@ -19,6 +19,7 @@
 
 package org.apache.druid.java.util.common;
 
+import com.google.common.collect.ImmutableList;
 import com.google.common.io.Files;
 import org.apache.druid.collections.ResourceHolder;
 import org.hamcrest.MatcherAssert;
@@ -36,9 +37,28 @@ import java.nio.ByteBuffer;
 import java.nio.MappedByteBuffer;
 import java.util.Arrays;
 import java.util.Comparator;
+import java.util.List;
 
 public class ByteBufferUtilsTest
 {
+  private static final List<String> COMPARE_TEST_STRINGS = ImmutableList.of(
+      "(請參見已被刪除版本)",
+      "請參見已被刪除版本",
+      "שָׁלוֹם",
+      "+{{[[Template:別名重定向|別名重定向]]}}",
+      "\uD83D\uDC4D\uD83D\uDC4D\uD83D\uDC4D",
+      "\uD83D\uDCA9",
+      "",
+      "f",
+      "fo",
+      "\uD83D\uDE42",
+      "\uD83E\uDEE5",
+      "\uD83E\uDD20",
+      "quick",
+      "brown",
+      "fox"
+  );
+
   @Rule
   public TemporaryFolder temporaryFolder = new TemporaryFolder();
 
@@ -82,9 +102,9 @@ public class ByteBufferUtilsTest
 
   @Test
   @SuppressWarnings("EqualsWithItself")
-  public void testUnsignedComparator()
+  public void testUtf8Comparator()
   {
-    final Comparator<ByteBuffer> comparator = 
ByteBufferUtils.unsignedComparator();
+    final Comparator<ByteBuffer> comparator = ByteBufferUtils.utf8Comparator();
 
     // Tests involving null
     MatcherAssert.assertThat(comparator.compare(null, null), 
Matchers.equalTo(0));
@@ -112,18 +132,33 @@ public class ByteBufferUtilsTest
         Matchers.greaterThan(0)
     );
 
-    // Tests involving the full range of bytes
-    for (byte i = Byte.MIN_VALUE; i < Byte.MAX_VALUE; i++) {
-      for (byte j = Byte.MIN_VALUE; j < Byte.MAX_VALUE; j++) {
-        final int cmp = Integer.compare(Byte.toUnsignedInt(i), 
Byte.toUnsignedInt(j));
+    for (final String string1 : COMPARE_TEST_STRINGS) {
+      for (final String string2 : COMPARE_TEST_STRINGS) {
+        final byte[] utf8Bytes1 = StringUtils.toUtf8(string1);
+        final byte[] utf8Bytes2 = StringUtils.toUtf8(string2);
+        final ByteBuffer utf8ByteBuffer1 = 
ByteBuffer.allocate(utf8Bytes1.length + 2);
+        final ByteBuffer utf8ByteBuffer2 = 
ByteBuffer.allocate(utf8Bytes2.length + 2);
+        utf8ByteBuffer1.position(1);
+        utf8ByteBuffer1.put(utf8Bytes1, 0, 
utf8Bytes1.length).position(utf8Bytes1.length);
+        utf8ByteBuffer1.position(1).limit(1 + utf8Bytes1.length);
+        utf8ByteBuffer2.position(1);
+        utf8ByteBuffer2.put(utf8Bytes2, 0, 
utf8Bytes2.length).position(utf8Bytes2.length);
+        utf8ByteBuffer2.position(1).limit(1 + utf8Bytes2.length);
+
+        final int compareByteBufferUtilsUtf8 = 
ByteBufferUtils.utf8Comparator().compare(
+            utf8ByteBuffer1,
+            utf8ByteBuffer2
+        );
 
-        MatcherAssert.assertThat(
-            StringUtils.format("comparison of %s to %s", 
Byte.toUnsignedInt(i), Byte.toUnsignedInt(j)),
-            comparator.compare(
-                ByteBuffer.wrap(new byte[]{i}),
-                ByteBuffer.wrap(new byte[]{j})
+        Assert.assertEquals(
+            StringUtils.format(
+                "compareByteBufferUtilsUtf8(byte[]) (actual) "
+                + "matches compareJavaString (expected) for [%s] vs [%s]",
+                string1,
+                string2
             ),
-            cmp < 0 ? Matchers.lessThan(0) : cmp > 0 ? Matchers.greaterThan(0) 
: Matchers.equalTo(0)
+            (int) Math.signum(string1.compareTo(string2)),
+            (int) Math.signum(compareByteBufferUtilsUtf8)
         );
       }
     }
diff --git 
a/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java 
b/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java
index 754e7237c7..3f2d5713c2 100644
--- a/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java
+++ b/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java
@@ -19,6 +19,7 @@
 
 package org.apache.druid.java.util.common;
 
+import com.google.common.collect.ImmutableList;
 import org.apache.druid.collections.ResourceHolder;
 import org.junit.Assert;
 import org.junit.Rule;
@@ -28,12 +29,31 @@ import org.junit.rules.ExpectedException;
 import java.io.UnsupportedEncodingException;
 import java.nio.BufferUnderflowException;
 import java.nio.ByteBuffer;
+import java.util.List;
 
 /**
  *
  */
 public class StringUtilsTest
 {
+  private static final List<String> COMPARE_TEST_STRINGS = ImmutableList.of(
+      "(請參見已被刪除版本)",
+      "請參見已被刪除版本",
+      "שָׁלוֹם",
+      "+{{[[Template:別名重定向|別名重定向]]}}",
+      "\uD83D\uDC4D\uD83D\uDC4D\uD83D\uDC4D",
+      "\uD83D\uDCA9",
+      "",
+      "f",
+      "fo",
+      "\uD83D\uDE42",
+      "\uD83E\uDEE5",
+      "\uD83E\uDD20",
+      "quick",
+      "brown",
+      "fox"
+  );
+
   @Rule
   public ExpectedException expectedException = ExpectedException.none();
 
@@ -290,4 +310,80 @@ public class StringUtilsTest
     Assert.assertEquals("smile ", StringUtils.fastLooseChop("smile 🙂 for the 
camera", 6));
     Assert.assertEquals("smile", StringUtils.fastLooseChop("smile 🙂 for the 
camera", 5));
   }
+
+  @Test
+  public void testUnicodeStringCompare()
+  {
+    for (final String string1 : COMPARE_TEST_STRINGS) {
+      for (final String string2 : COMPARE_TEST_STRINGS) {
+        final int compareUnicode = StringUtils.compareUnicode(string1, 
string2);
+        final int compareUtf8 = StringUtils.compareUtf8(
+            StringUtils.toUtf8(string1),
+            StringUtils.toUtf8(string2)
+        );
+
+        Assert.assertEquals(
+            StringUtils.format(
+                "compareUnicode (actual) matches compareUtf8 (expected) for 
[%s] vs [%s]",
+                string1,
+                string2
+            ),
+            (int) Math.signum(compareUtf8),
+            (int) Math.signum(compareUnicode)
+        );
+      }
+    }
+  }
+
+  @Test
+  public void testJavaStringCompare()
+  {
+    for (final String string1 : COMPARE_TEST_STRINGS) {
+      for (final String string2 : COMPARE_TEST_STRINGS) {
+        final int compareJavaString = string1.compareTo(string2);
+
+        final byte[] utf8Bytes1 = StringUtils.toUtf8(string1);
+        final byte[] utf8Bytes2 = StringUtils.toUtf8(string2);
+        final int compareByteArrayUtf8UsingJavaStringOrdering =
+            StringUtils.compareUtf8UsingJavaStringOrdering(utf8Bytes1, 
utf8Bytes2);
+
+        final ByteBuffer utf8ByteBuffer1 = 
ByteBuffer.allocate(utf8Bytes1.length + 2);
+        final ByteBuffer utf8ByteBuffer2 = 
ByteBuffer.allocate(utf8Bytes2.length + 2);
+        utf8ByteBuffer1.position(1);
+        utf8ByteBuffer1.put(utf8Bytes1, 0, 
utf8Bytes1.length).position(utf8Bytes1.length);
+        utf8ByteBuffer2.position(1);
+        utf8ByteBuffer2.put(utf8Bytes2, 0, 
utf8Bytes2.length).position(utf8Bytes2.length);
+        final int compareByteBufferUtf8UsingJavaStringOrdering = 
StringUtils.compareUtf8UsingJavaStringOrdering(
+            utf8ByteBuffer1,
+            1,
+            utf8Bytes1.length,
+            utf8ByteBuffer2,
+            1,
+            utf8Bytes2.length
+        );
+
+        Assert.assertEquals(
+            StringUtils.format(
+                "compareUtf8UsingJavaStringOrdering(byte[]) (actual) "
+                + "matches compareJavaString (expected) for [%s] vs [%s]",
+                string1,
+                string2
+            ),
+            (int) Math.signum(compareJavaString),
+            (int) Math.signum(compareByteArrayUtf8UsingJavaStringOrdering)
+        );
+
+        Assert.assertEquals(
+            StringUtils.format(
+                "compareByteBufferUtf8UsingJavaStringOrdering(ByteBuffer) 
(actual) "
+                + "matches compareJavaString (expected) for [%s] vs [%s]",
+                string1,
+                string2
+            ),
+            (int) Math.signum(compareJavaString),
+            (int) Math.signum(compareByteBufferUtf8UsingJavaStringOrdering)
+        );
+      }
+    }
+  }
 }
diff --git 
a/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java 
b/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java
index 961e99a3f0..ac6d9bfe65 100644
--- 
a/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java
+++ 
b/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java
@@ -205,7 +205,7 @@ public class FrameWriterUtils
   /**
    * Copies "len" bytes from {@code src.position()} to "dstPosition" in 
"memory". Does not update the position of src.
    *
-   * @throws InvalidNullByteException "allowNullBytes" is true and a null byte 
is encountered
+   * @throws InvalidNullByteException if "allowNullBytes" is false and a null 
byte is encountered
    */
   public static void copyByteBufferToMemory(
       final ByteBuffer src,
diff --git 
a/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java 
b/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java
index 2dec044cf1..afddb0e42a 100644
--- a/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java
+++ b/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java
@@ -674,7 +674,7 @@ public class InDimFilter extends 
AbstractOptimizableDimFilter implements Filter
 
     public SortedSet<ByteBuffer> toUtf8()
     {
-      final TreeSet<ByteBuffer> valuesUtf8 = new 
TreeSet<>(ByteBufferUtils.unsignedComparator());
+      final TreeSet<ByteBuffer> valuesUtf8 = new 
TreeSet<>(ByteBufferUtils.utf8Comparator());
 
       for (final String value : values) {
         if (value == null) {
diff --git 
a/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java
 
b/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java
index 58e228ad2c..4fdcc5c6f3 100644
--- 
a/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java
+++ 
b/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java
@@ -47,9 +47,15 @@ public class StringComparators
   public static final int STRLEN_CACHE_ID = 0x04;
   public static final int VERSION_CACHE_ID = 0x05;
 
+  /**
+   * Comparison using the natural comparator of {@link String}.
+   *
+   * Note that this is not equivalent to comparing UTF-8 byte arrays; see 
javadocs for
+   * {@link 
org.apache.druid.java.util.common.StringUtils#compareUnicode(String, String)} 
and
+   * {@link 
org.apache.druid.java.util.common.StringUtils#compareUtf8UsingJavaStringOrdering(byte[],
 byte[])}.
+   */
   public static class LexicographicComparator extends StringComparator
   {
-    // Equivalent to comparing UTF-8 encoded strings as byte arrays.
     private static final Ordering<String> ORDERING = 
Ordering.from(String::compareTo).nullsFirst();
 
     @Override
diff --git a/processing/src/main/java/org/apache/druid/segment/IndexIO.java 
b/processing/src/main/java/org/apache/druid/segment/IndexIO.java
index 9698ebdc2b..9b74f71768 100644
--- a/processing/src/main/java/org/apache/druid/segment/IndexIO.java
+++ b/processing/src/main/java/org/apache/druid/segment/IndexIO.java
@@ -379,7 +379,7 @@ public class IndexIO
 
         // Duplicate the first buffer since we are reading the dictionary 
twice.
         dimValueLookups.put(dimension, 
GenericIndexed.read(dimBuffer.duplicate(), GenericIndexed.STRING_STRATEGY));
-        dimValueUtf8Lookups.put(dimension, GenericIndexed.read(dimBuffer, 
GenericIndexed.BYTE_BUFFER_STRATEGY));
+        dimValueUtf8Lookups.put(dimension, GenericIndexed.read(dimBuffer, 
GenericIndexed.UTF8_STRATEGY));
         dimColumns.put(dimension, 
VSizeColumnarMultiInts.readFromByteBuffer(dimBuffer));
       }
 
diff --git 
a/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java
 
b/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java
index 5680a1400f..c568e78d9b 100644
--- 
a/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java
+++ 
b/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java
@@ -47,7 +47,7 @@ public final class IndexedUtf8ValueSetIndex<TDictionary 
extends Indexed<ByteBuff
   // sorted merge instead of binary-search based algorithm.
   private static final double SORTED_MERGE_RATIO_THRESHOLD = 0.12D;
   private static final int SIZE_WORTH_CHECKING_MIN = 8;
-  private static final Comparator<ByteBuffer> COMPARATOR = 
ByteBufferUtils.unsignedComparator();
+  private static final Comparator<ByteBuffer> COMPARATOR = 
ByteBufferUtils.utf8Comparator();
 
   private final BitmapFactory bitmapFactory;
   private final TDictionary dictionary;
diff --git 
a/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java
 
b/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java
index ef0d08ee0a..6598e36f20 100644
--- 
a/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java
+++ 
b/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java
@@ -29,7 +29,7 @@ public interface Utf8ValueSetIndex
   /**
    * Get an {@link Iterable} of {@link ImmutableBitmap} corresponding to the 
specified set of values (if they are
    * contained in the underlying column). The set must be sorted using
-   * {@link 
org.apache.druid.java.util.common.ByteBufferUtils#unsignedComparator()}.
+   * {@link 
org.apache.druid.java.util.common.ByteBufferUtils#utf8Comparator()}.
    */
   BitmapColumnIndex forSortedValuesUtf8(SortedSet<ByteBuffer> valuesUtf8);
 }
diff --git 
a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java 
b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java
index d2d6c28d34..2596f7ec2b 100644
--- 
a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java
+++ 
b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java
@@ -23,6 +23,7 @@ import com.google.common.base.Preconditions;
 import com.google.common.base.Supplier;
 import org.apache.druid.common.config.NullHandling;
 import org.apache.druid.java.util.common.ISE;
+import org.apache.druid.java.util.common.StringUtils;
 import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
 
 import javax.annotation.Nullable;
@@ -340,10 +341,13 @@ public final class FrontCodedIndexed implements 
Indexed<ByteBuffer>
 
 
   /**
-   * Performs an unsigned byte comparison of the first value in a bucket with 
the specified value. Note that this method
+   * Performs byte-by-byte comparison of the first value in a bucket with the 
specified value. Note that this method
    * MUST be prepared before calling, as it expects the length of the first 
value to have already been read externally,
    * and the buffer position to be at the start of the first bucket value. The 
final buffer position will be the
-   * 'shared prefix length' of the first value in the bucket and the value to 
compare
+   * 'shared prefix length' of the first value in the bucket and the value to 
compare.
+   *
+   * Bytes are compared using {@link 
StringUtils#compareUtf8UsingJavaStringOrdering(byte, byte)}. Therefore, when the
+   * values are UTF-8 encoded strings, the ordering is compatible with {@link 
String#compareTo(String)}.
    */
   private static int compareBucketFirstValue(ByteBuffer bucketBuffer, int 
length, ByteBuffer value)
   {
@@ -355,7 +359,7 @@ public final class FrontCodedIndexed implements 
Indexed<ByteBuffer>
     int sharedPrefix;
     int comparison = 0;
     for (sharedPrefix = 0; sharedPrefix < commonLength; sharedPrefix++) {
-      comparison = unsignedByteCompare(bucketBuffer.get(), 
value.get(sharedPrefix));
+      comparison = 
StringUtils.compareUtf8UsingJavaStringOrdering(bucketBuffer.get(), 
value.get(sharedPrefix));
       if (comparison != 0) {
         bucketBuffer.position(startOffset + sharedPrefix);
         break;
@@ -403,7 +407,10 @@ public final class FrontCodedIndexed implements 
Indexed<ByteBuffer>
         final int common = Math.min(fragmentLength, value.remaining() - 
prefixLength);
         int fragmentComparison = 0;
         for (int i = 0; i < common; i++) {
-          fragmentComparison = 
unsignedByteCompare(buffer.get(buffer.position() + i), value.get(prefixLength + 
i));
+          fragmentComparison = StringUtils.compareUtf8UsingJavaStringOrdering(
+              buffer.get(buffer.position() + i),
+              value.get(prefixLength + i)
+          );
           if (fragmentComparison != 0) {
             break;
           }
@@ -502,9 +509,4 @@ public final class FrontCodedIndexed implements 
Indexed<ByteBuffer>
     }
     return bucketBuffers;
   }
-
-  public static int unsignedByteCompare(byte b1, byte b2)
-  {
-    return (b1 & 0xFF) - (b2 & 0xFF);
-  }
 }
diff --git 
a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java
 
b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java
index b6120d6c12..bcbe47db62 100644
--- 
a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java
+++ 
b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java
@@ -44,8 +44,8 @@ import java.nio.channels.WritableByteChannel;
  * the bucket is written entirely, and remaining values are stored as pairs of 
an integer which indicates how much
  * of the first byte array of the bucket to use as a prefix, followed by the 
remaining value bytes after the prefix.
  *
- * This is valid to use for any values which can be compared byte by byte with 
unsigned comparison. Otherwise, this
- * is not the collection for you.
+ * This writer is designed for use with UTF-8 encoded strings that are written 
in an order compatible with
+ * {@link String#compareTo(String)}.
  *
  * @see FrontCodedIndexed for additional details.
  */
@@ -99,7 +99,7 @@ public class FrontCodedIndexedWriter implements 
DictionaryWriter<byte[]>
   @Override
   public void write(@Nullable byte[] value) throws IOException
   {
-    if (prevObject != null && unsignedCompare(prevObject, value) >= 0) {
+    if (prevObject != null && 
compareNullableUtf8UsingJavaStringOrdering(prevObject, value) >= 0) {
       throw new ISE(
           "Values must be sorted and unique. Element [%s] with value [%s] is 
before or equivalent to [%s]",
           numWritten,
@@ -283,7 +283,7 @@ public class FrontCodedIndexedWriter implements 
DictionaryWriter<byte[]>
         // all other values must be partitioned into a prefix length and 
suffix bytes
         int prefixLength = 0;
         for (; prefixLength < first.length; prefixLength++) {
-          final int cmp = 
FrontCodedIndexed.unsignedByteCompare(first[prefixLength], next[prefixLength]);
+          final int cmp = 
StringUtils.compareUtf8UsingJavaStringOrdering(first[prefixLength], 
next[prefixLength]);
           if (cmp != 0) {
             break;
           }
@@ -325,7 +325,11 @@ public class FrontCodedIndexedWriter implements 
DictionaryWriter<byte[]>
     return buffer.position() - pos;
   }
 
-  public static int unsignedCompare(
+  /**
+   * Same as {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte[], 
byte[])}, but accepts nulls. Nulls are
+   * sorted first.
+   */
+  private static int compareNullableUtf8UsingJavaStringOrdering(
       @Nullable final byte[] b1,
       @Nullable final byte[] b2
   )
@@ -337,15 +341,7 @@ public class FrontCodedIndexedWriter implements 
DictionaryWriter<byte[]>
     if (b2 == null) {
       return 1;
     }
-    final int commonLength = Math.min(b1.length, b2.length);
-
-    for (int i = 0; i < commonLength; i++) {
-      final int cmp = FrontCodedIndexed.unsignedByteCompare(b1[i], b2[i]);
-      if (cmp != 0) {
-        return cmp;
-      }
-    }
 
-    return Integer.compare(b1.length, b2.length);
+    return StringUtils.compareUtf8UsingJavaStringOrdering(b1, b2);
   }
 }
diff --git 
a/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java 
b/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java
index ff1c570b03..62f50b0dc6 100644
--- a/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java
+++ b/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java
@@ -100,13 +100,16 @@ public class GenericIndexed<T> implements 
CloseableIndexed<T>, Serializer
   private static final SerializerUtils SERIALIZER_UTILS = new 
SerializerUtils();
 
   /**
-   * An ObjectStrategy that returns a big-endian ByteBuffer pointing to the 
original data.
+   * An ObjectStrategy that returns a big-endian ByteBuffer pointing to 
original data.
    *
    * The returned ByteBuffer is a fresh read-only instance, so it is OK for 
callers to modify its position, limit, etc.
    * However, it does point to the original data, so callers must take care 
not to use it if the original data may
    * have been freed.
+   *
+   * The compare method of this instance uses {@link 
StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])}
+   * so that behavior is consistent with {@link #STRING_STRATEGY}.
    */
-  public static final ObjectStrategy<ByteBuffer> BYTE_BUFFER_STRATEGY = new 
ObjectStrategy<ByteBuffer>()
+  public static final ObjectStrategy<ByteBuffer> UTF8_STRATEGY = new 
ObjectStrategy<ByteBuffer>()
   {
     @Override
     public Class<ByteBuffer> getClazz()
@@ -140,7 +143,7 @@ public class GenericIndexed<T> implements 
CloseableIndexed<T>, Serializer
     @Override
     public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2)
     {
-      return ByteBufferUtils.unsignedComparator().compare(o1, o2);
+      return ByteBufferUtils.utf8Comparator().compare(o1, o2);
     }
   };
 
@@ -541,7 +544,7 @@ public class GenericIndexed<T> implements 
CloseableIndexed<T>, Serializer
       }
 
       //noinspection ObjectEquality
-      final boolean isByteBufferStrategy = strategy == BYTE_BUFFER_STRATEGY;
+      final boolean isByteBufferStrategy = strategy == UTF8_STRATEGY;
 
       int minIndex = 0;
       int maxIndex = size - 1;
@@ -553,7 +556,7 @@ public class GenericIndexed<T> implements 
CloseableIndexed<T>, Serializer
         if (isByteBufferStrategy) {
           // Specialization avoids ByteBuffer allocation in 
strategy.fromByteBuffer.
           ByteBuffer currValue = getByteBuffer(currIndex);
-          comparison = ByteBufferUtils.compareByteBuffers(currValue, 
(ByteBuffer) value);
+          comparison = ByteBufferUtils.compareUtf8ByteBuffers(currValue, 
(ByteBuffer) value);
         } else {
           T currValue = get(currIndex);
           comparison = strategy.compare(currValue, value);
diff --git 
a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java
 
b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java
index 8594cab8ca..39d8cf081b 100644
--- 
a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java
+++ 
b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java
@@ -107,7 +107,7 @@ public class NestedDataColumnSupplier implements 
Supplier<ComplexColumn>
             // this cannot happen naturally right now since generic indexed is 
written in the 'legacy' format, but
             // this provides backwards compatibility should we switch at some 
point in the future to always
             // writing dictionaryVersion
-            dictionary = GenericIndexed.read(stringDictionaryBuffer, 
GenericIndexed.BYTE_BUFFER_STRATEGY, mapper);
+            dictionary = GenericIndexed.read(stringDictionaryBuffer, 
GenericIndexed.UTF8_STRATEGY, mapper);
             frontCodedDictionarySupplier = null;
           } else {
             throw new ISE("impossible, unknown encoding strategy id: %s", 
encodingId);
@@ -117,7 +117,7 @@ public class NestedDataColumnSupplier implements 
Supplier<ComplexColumn>
           // as dictionaryVersion is actually also the GenericIndexed version, 
so we reset start position so the
           // GenericIndexed version can be correctly read
           stringDictionaryBuffer.position(dictionaryStartPosition);
-          dictionary = GenericIndexed.read(stringDictionaryBuffer, 
GenericIndexed.BYTE_BUFFER_STRATEGY, mapper);
+          dictionary = GenericIndexed.read(stringDictionaryBuffer, 
GenericIndexed.UTF8_STRATEGY, mapper);
           frontCodedDictionarySupplier = null;
         }
         final ByteBuffer longDictionaryBuffer = loadInternalFile(
diff --git 
a/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java
 
b/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java
index ef130f5428..b1eef3307b 100644
--- 
a/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java
+++ 
b/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java
@@ -353,7 +353,7 @@ public class DictionaryEncodedColumnPartSerde implements 
ColumnPartSerde
 
         final GenericIndexed<ByteBuffer> rDictionaryUtf8 = GenericIndexed.read(
             buffer,
-            GenericIndexed.BYTE_BUFFER_STRATEGY,
+            GenericIndexed.UTF8_STRATEGY,
             builder.getFileMapper()
         );
 
diff --git 
a/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java
 
b/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java
index f1bd478c95..c9e5613536 100644
--- 
a/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java
+++ 
b/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java
@@ -241,7 +241,9 @@ public class FrontCodedIndexedTest extends 
InitializedNullHandlingTest
   public void testFrontCodedIndexedUnicodes() throws IOException
   {
     ByteBuffer buffer = ByteBuffer.allocate(1 << 12).order(order);
-    List<String> theList = ImmutableList.of("Győ-Moson-Sopron", "Győr");
+
+    // "\uD83D\uDCA9" and "(請參見已被刪除版本)" are a regression test for 
https://github.com/apache/druid/pull/13364
+    List<String> theList = ImmutableList.of("Győ-Moson-Sopron", "Győr", 
"\uD83D\uDCA9", "(請參見已被刪除版本)");
     fillBuffer(buffer, theList, 4);
 
     buffer.position(0);
diff --git 
a/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java
 
b/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java
index 268a4a2c97..eaf161036d 100644
--- 
a/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java
+++ 
b/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java
@@ -121,7 +121,7 @@ public class ExtractionDimFilterTest extends 
InitializedNullHandlingTest
             GenericIndexed.fromIterable(Collections.singletonList("foo1"), 
GenericIndexed.STRING_STRATEGY),
             GenericIndexed.fromIterable(
                 
Collections.singletonList(ByteBuffer.wrap(StringUtils.toUtf8("foo1"))),
-                GenericIndexed.BYTE_BUFFER_STRATEGY
+                GenericIndexed.UTF8_STRATEGY
             ),
             GenericIndexed.fromIterable(Collections.singletonList(foo1BitMap), 
serdeFactory.getObjectStrategy()),
             null
diff --git 
a/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java
 
b/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java
index e10d64241f..f7525e3f9c 100644
--- 
a/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java
+++ 
b/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java
@@ -75,7 +75,7 @@ public class PredicateValueMatcherFactoryTest extends 
InitializedNullHandlingTes
                 ByteBuffer.wrap(StringUtils.toUtf8("v2")),
                 ByteBuffer.wrap(StringUtils.toUtf8("v3"))
             ),
-            GenericIndexed.BYTE_BUFFER_STRATEGY
+            GenericIndexed.UTF8_STRATEGY
         ),
         null,
         () -> 
VSizeColumnarMultiInts.fromIterable(ImmutableList.of(VSizeColumnarInts.fromArray(new
 int[]{1}))),
@@ -98,7 +98,7 @@ public class PredicateValueMatcherFactoryTest extends 
InitializedNullHandlingTes
                 ByteBuffer.wrap(StringUtils.toUtf8("v2")),
                 ByteBuffer.wrap(StringUtils.toUtf8("v3"))
             ),
-            GenericIndexed.BYTE_BUFFER_STRATEGY
+            GenericIndexed.UTF8_STRATEGY
         ),
         null,
         () -> 
VSizeColumnarMultiInts.fromIterable(ImmutableList.of(VSizeColumnarInts.fromArray(new
 int[]{1}))),
diff --git 
a/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java
 
b/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java
index cb5459226b..98631d3dc9 100644
--- 
a/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java
+++ 
b/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java
@@ -49,7 +49,7 @@ public class ValueMatchersTest extends 
InitializedNullHandlingTest
         GenericIndexed.fromIterable(ImmutableList.of("value"), 
GenericIndexed.STRING_STRATEGY),
         GenericIndexed.fromIterable(
             ImmutableList.of(ByteBuffer.wrap(StringUtils.toUtf8("value"))),
-            GenericIndexed.BYTE_BUFFER_STRATEGY
+            GenericIndexed.UTF8_STRATEGY
         ),
         () -> VSizeColumnarInts.fromArray(new int[]{0}),
         null,
@@ -62,7 +62,7 @@ public class ValueMatchersTest extends 
InitializedNullHandlingTest
                 ByteBuffer.wrap(StringUtils.toUtf8("value")),
                 ByteBuffer.wrap(StringUtils.toUtf8("value2"))
             ),
-            GenericIndexed.BYTE_BUFFER_STRATEGY
+            GenericIndexed.UTF8_STRATEGY
         ),
         () -> VSizeColumnarInts.fromArray(new int[]{0, 0, 1, 0, 1}),
         null,
@@ -72,7 +72,7 @@ public class ValueMatchersTest extends 
InitializedNullHandlingTest
         GenericIndexed.fromIterable(ImmutableList.of("value"), 
GenericIndexed.STRING_STRATEGY),
         GenericIndexed.fromIterable(
             ImmutableList.of(ByteBuffer.wrap(StringUtils.toUtf8("value"))),
-            GenericIndexed.BYTE_BUFFER_STRATEGY
+            GenericIndexed.UTF8_STRATEGY
         ),
         null,
         () -> VSizeColumnarMultiInts.fromIterable(
diff --git 
a/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java
 
b/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java
index 6d66464eee..15b40b729f 100644
--- 
a/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java
+++ 
b/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java
@@ -127,7 +127,7 @@ public class NestedFieldLiteralColumnIndexSupplierTest 
extends InitializedNullHa
     doubleWriter.write(9.9);
     writeToBuffer(doubleBuffer, doubleWriter);
 
-    GenericIndexed<ByteBuffer> strings = GenericIndexed.read(stringBuffer, 
GenericIndexed.BYTE_BUFFER_STRATEGY);
+    GenericIndexed<ByteBuffer> strings = GenericIndexed.read(stringBuffer, 
GenericIndexed.UTF8_STRATEGY);
     globalStrings = () -> strings.singleThreaded();
     globalLongs = FixedIndexed.read(longBuffer, TypeStrategies.LONG, 
ByteOrder.nativeOrder(), Long.BYTES);
     globalDoubles = FixedIndexed.read(doubleBuffer, TypeStrategies.DOUBLE, 
ByteOrder.nativeOrder(), Double.BYTES);
diff --git 
a/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java
 
b/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java
index 36d5ba76a0..a9ab64a378 100644
--- 
a/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java
+++ 
b/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java
@@ -115,7 +115,7 @@ public class DictionaryEncodedStringIndexSupplierTest 
extends InitializedNullHan
     GenericIndexedWriter<ByteBuffer> byteBufferWriter = new 
GenericIndexedWriter<>(
         new OnHeapMemorySegmentWriteOutMedium(),
         "byteBuffers",
-        GenericIndexed.BYTE_BUFFER_STRATEGY
+        GenericIndexed.UTF8_STRATEGY
     );
 
     stringWriter.open();
@@ -167,7 +167,7 @@ public class DictionaryEncodedStringIndexSupplierTest 
extends InitializedNullHan
     return new DictionaryEncodedStringIndexSupplier(
         roaringFactory.getBitmapFactory(),
         GenericIndexed.read(stringBuffer, GenericIndexed.STRING_STRATEGY),
-        GenericIndexed.read(byteBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY),
+        GenericIndexed.read(byteBuffer, GenericIndexed.UTF8_STRATEGY),
         bitmaps,
         null
     );


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@druid.apache.org
For additional commands, e-mail: commits-help@druid.apache.org

Reply via email to