This is an automated email from the ASF dual-hosted git repository.

yashmayya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 24138821e2c Fix dictionary backward compatibility issue for empty 
string columns (#18447)
24138821e2c is described below

commit 24138821e2c872b0d0384aa578b5a2b9a56a1ecc
Author: Yash Mayya <[email protected]>
AuthorDate: Fri May 8 17:25:56 2026 -0700

    Fix dictionary backward compatibility issue for empty string columns 
(#18447)
---
 .../index/readers/ImmutableDictionaryTest.java     | 48 ++++++++++++++++++++++
 .../spi/index/metadata/ColumnMetadataImpl.java     |  6 ++-
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTest.java
index db4a4fd6a20..5df10389da3 100644
--- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTest.java
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readers/ImmutableDictionaryTest.java
@@ -30,14 +30,18 @@ import java.util.HashSet;
 import java.util.Random;
 import java.util.Set;
 import java.util.TreeSet;
+import org.apache.commons.configuration2.PropertiesConfiguration;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang3.RandomStringUtils;
 import org.apache.pinot.segment.local.PinotBuffersAfterMethodCheckRule;
 import 
org.apache.pinot.segment.local.segment.creator.impl.SegmentDictionaryCreator;
 import org.apache.pinot.segment.spi.V1Constants;
+import org.apache.pinot.segment.spi.V1Constants.MetadataKeys.Column;
+import org.apache.pinot.segment.spi.index.metadata.ColumnMetadataImpl;
 import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
 import org.apache.pinot.spi.data.DimensionFieldSpec;
 import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.apache.pinot.spi.data.FieldSpec.FieldType;
 import org.apache.pinot.spi.data.MetricFieldSpec;
 import org.apache.pinot.spi.utils.BigDecimalUtils;
 import org.apache.pinot.spi.utils.ByteArray;
@@ -466,6 +470,50 @@ public class ImmutableDictionaryTest implements 
PinotBuffersAfterMethodCheckRule
     }
   }
 
+  /**
+   * Regression test for old segments (pre-1.6.0) that have a STRING column 
with all-empty values:
+   * the segment lacks LENGTH_OF_LONGEST_ELEMENT (introduced in 1.6.0) and has 
DICTIONARY_ELEMENT_SIZE=0
+   * (the longest entry length is zero when every entry is empty). The 
metadata loader must canonicalize
+   * the longest-element length to 0 here; otherwise it leaves the field at 
the UNAVAILABLE sentinel
+   * (-1), which propagates into StringDictionary.getBuffer() as `new 
byte[-1]` and fails query
+   * execution with NegativeArraySizeException.
+   */
+  @Test
+  public void testStringDictionaryReadOnPre16OldSegmentMetadata()
+      throws Exception {
+    String columnName = "emptyStringCol";
+    // Build a real var-length string dictionary on disk holding a single 
empty value.
+    try (SegmentDictionaryCreator dictCreator = new SegmentDictionaryCreator(
+        new DimensionFieldSpec(columnName, DataType.STRING, true), TEMP_DIR, 
true)) {
+      dictCreator.build(new String[]{""});
+      assertEquals(dictCreator.getNumBytesPerEntry(), 0);
+    }
+
+    // Simulate pre-1.6.0 metadata: HAS_DICTIONARY=true, 
DICTIONARY_ELEMENT_SIZE=0, no LENGTH_OF_LONGEST_ELEMENT.
+    PropertiesConfiguration config = new PropertiesConfiguration();
+    config.setProperty(Column.getKeyFor(columnName, Column.COLUMN_NAME), 
columnName);
+    config.setProperty(Column.getKeyFor(columnName, Column.COLUMN_TYPE), 
FieldType.DIMENSION.name());
+    config.setProperty(Column.getKeyFor(columnName, Column.DATA_TYPE), 
DataType.STRING.name());
+    config.setProperty(Column.getKeyFor(columnName, Column.IS_SINGLE_VALUED), 
true);
+    config.setProperty(Column.getKeyFor(columnName, Column.CARDINALITY), 1);
+    config.setProperty(Column.getKeyFor(columnName, Column.HAS_DICTIONARY), 
true);
+    config.setProperty(Column.getKeyFor(columnName, 
Column.DICTIONARY_ELEMENT_SIZE), 0);
+    // LENGTH_OF_LONGEST_ELEMENT intentionally NOT set.
+
+    ColumnMetadataImpl metadata = 
ColumnMetadataImpl.fromPropertiesConfiguration(config, 1, columnName);
+
+    // Reproduce the production code path: DictionaryIndexType passes 
metadata.getLengthOfLongestElement()
+    // as numBytesPerValue. Without the canonicalization fix this is -1, and 
StringDictionary.readStringValues
+    // throws NegativeArraySizeException at 
BaseImmutableDictionary.getBuffer().
+    try (PinotDataBuffer buffer = PinotDataBuffer.mapReadOnlyBigEndianFile(
+        new File(TEMP_DIR, columnName + V1Constants.Dict.FILE_EXTENSION));
+        StringDictionary dict = new StringDictionary(buffer, 1, 
metadata.getLengthOfLongestElement())) {
+      String[] outValues = new String[1];
+      dict.readStringValues(new int[]{0}, 1, outValues);
+      assertEquals(outValues[0], "");
+    }
+  }
+
   @AfterClass
   public void tearDown() {
     FileUtils.deleteQuietly(TEMP_DIR);
diff --git 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
index 8ce7d53f820..45c423e66c5 100644
--- 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
+++ 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/metadata/ColumnMetadataImpl.java
@@ -571,7 +571,11 @@ public class ColumnMetadataImpl implements ColumnMetadata {
         _lengthOfShortestElement = size;
         _lengthOfLongestElement = size;
       } else {
-        if (_lengthOfLongestElement < 0 && _dictionaryElementSize > 0) {
+        // Pre-1.6.0 segments don't write LENGTH_OF_LONGEST_ELEMENT; fall back 
to DICTIONARY_ELEMENT_SIZE,
+        // which has been written for dictionary-encoded columns since well 
before 1.6.0 (including the
+        // zero-length case where every entry is an empty string). Leaving the 
field at the UNAVAILABLE
+        // sentinel would propagate as `numBytesPerValue = -1` into 
BaseImmutableDictionary.getBuffer().
+        if (_lengthOfLongestElement < 0 && _hasDictionary) {
           _lengthOfLongestElement = _dictionaryElementSize;
         }
       }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to