This is an automated email from the ASF dual-hosted git repository.
jtao pushed a commit to branch hotfix-unicode
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/hotfix-unicode by this push:
new d3dbd5829f9 Enforce UTF8 when decoding byte[] to string in ValueReader (#16608)
d3dbd5829f9 is described below
commit d3dbd5829f98a419c9f896d06b28d88ecc1b4626
Author: Jiapeng Tao <[email protected]>
AuthorDate: Fri Aug 15 14:04:21 2025 -0700
Enforce UTF8 when decoding byte[] to string in ValueReader (#16608)
---
.../pinot/segment/local/io/util/ValueReader.java | 3 +-
.../FixedByteValueReaderWriterTest.java | 36 ++++++++++++++++++++++
2 files changed, 38 insertions(+), 1 deletion(-)
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
index 9aa9382e319..acf3d4fea2a 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
@@ -20,6 +20,7 @@ package org.apache.pinot.segment.local.io.util;
import java.io.Closeable;
import java.math.BigDecimal;
+import java.nio.charset.StandardCharsets;
import org.apache.pinot.spi.utils.BigDecimalUtils;
import org.apache.pinot.spi.utils.hash.MurmurHashFunctions;
@@ -63,7 +64,7 @@ public interface ValueReader extends Closeable {
*/
default String getUnpaddedString(int index, int numBytesPerValue, byte[] buffer) {
int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
- return new String(buffer, 0, length);
+ return new String(buffer, 0, length, StandardCharsets.UTF_8);
}
/**
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java
index 65b09df5644..5e47acf8bca 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java
@@ -74,4 +74,40 @@ public class FixedByteValueReaderWriterTest implements PinotBuffersAfterMethodCh
}
}
}
+
+ @Test(dataProvider = "params")
+ public void testFixedByteValueReaderWriterNonAscii(int maxStringLength, int configuredMaxLength, ByteOrder byteOrder)
+ throws IOException {
+ byte[] bytes = new byte[configuredMaxLength];
+ // Use a multi-byte UTF-8 character (é = 0xC3 0xA9)
+ byte[] nonAsciiChar = "é".getBytes(StandardCharsets.UTF_8);
+
+ try (PinotDataBuffer buffer = PinotDataBuffer.allocateDirect(configuredMaxLength * 1000L, byteOrder,
+ "testFixedByteValueReaderWriterNonAscii")) {
+ FixedByteValueReaderWriter readerWriter = new FixedByteValueReaderWriter(buffer);
+ List<String> inputs = new ArrayList<>(1000);
+
+ for (int i = 0; i < 1000; i++) {
+ // number of *characters* to write
+ int charCount = ThreadLocalRandom.current().nextInt(maxStringLength);
+ int byteCount = charCount * nonAsciiChar.length;
+ if (byteCount > configuredMaxLength) {
+ byteCount = configuredMaxLength - (configuredMaxLength % nonAsciiChar.length); // fit whole chars
+ charCount = byteCount / nonAsciiChar.length;
+ }
+
+ Arrays.fill(bytes, (byte) 0);
+ for (int pos = 0; pos < byteCount; pos += nonAsciiChar.length) {
+ System.arraycopy(nonAsciiChar, 0, bytes, pos, nonAsciiChar.length);
+ }
+
+ readerWriter.writeBytes(i, configuredMaxLength, bytes);
+ inputs.add("é".repeat(charCount));
+ }
+
+ for (int i = 0; i < 1000; i++) {
+ assertEquals(readerWriter.getUnpaddedString(i, configuredMaxLength, bytes), inputs.get(i));
+ }
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]