This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
new 0ecd799dc Allow reading dictionary encoded boolean (#3370)
0ecd799dc is described below
commit 0ecd799dc01768ec0816a1ab38507b805a788f6a
Author: Fokko Driesprong <[email protected]>
AuthorDate: Fri Dec 19 16:47:10 2025 +0100
Allow reading dictionary encoded boolean (#3370)
* Allow reading dictionary encoded boolean
I've observed some Parquet files in the wild that contain dictionary
encoded boolean values, which is also wild.
I don't think we want to allow producing this, but I think it would be
good to allow reading this. We don't judge.
* Thanks Gang
---
.../java/org/apache/parquet/column/Encoding.java | 3 +
.../values/dictionary/PlainValuesDictionary.java | 43 ++++++++++++++
.../column/values/dictionary/TestDictionary.java | 69 ++++++++++++++++++++++
3 files changed, 115 insertions(+)
diff --git
a/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java
b/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java
index cadf8f2e0..874c99fde 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java
@@ -40,6 +40,7 @@ import
org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArra
import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader;
import org.apache.parquet.column.values.dictionary.DictionaryValuesReader;
import
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBinaryDictionary;
+import
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBooleanDictionary;
import
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainDoubleDictionary;
import
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainFloatDictionary;
import
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainIntegerDictionary;
@@ -102,6 +103,8 @@ public enum Encoding {
return new PlainIntegerDictionary(dictionaryPage);
case FLOAT:
return new PlainFloatDictionary(dictionaryPage);
+ case BOOLEAN:
+ return new PlainBooleanDictionary(dictionaryPage);
default:
throw new ParquetDecodingException(
"Dictionary encoding not supported for type: " +
descriptor.getType());
diff --git
a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java
b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java
index 436bddd3c..468c7d110 100644
---
a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java
+++
b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java
@@ -28,6 +28,7 @@ import org.apache.parquet.Preconditions;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.column.page.DictionaryPage;
+import org.apache.parquet.column.values.plain.BooleanPlainValuesReader;
import
org.apache.parquet.column.values.plain.PlainValuesReader.DoublePlainValuesReader;
import
org.apache.parquet.column.values.plain.PlainValuesReader.FloatPlainValuesReader;
import
org.apache.parquet.column.values.plain.PlainValuesReader.IntegerPlainValuesReader;
@@ -300,4 +301,46 @@ public abstract class PlainValuesDictionary extends
Dictionary {
return floatDictionaryContent.length - 1;
}
}
+
+ /**
+ * a simple implementation of dictionary for plain encoded boolean values
+ */
+ public static class PlainBooleanDictionary extends PlainValuesDictionary {
+
+ private final boolean[] boolDictionaryContent;
+
+ /**
+ * @param dictionaryPage a dictionary page of encoded boolean values
+ * @throws IOException if there is an exception while decoding the dictionary page
+ */
+ public PlainBooleanDictionary(DictionaryPage dictionaryPage) throws
IOException {
+ super(dictionaryPage);
+ ByteBufferInputStream in = dictionaryPage.getBytes().toInputStream();
+ boolDictionaryContent = new boolean[dictionaryPage.getDictionarySize()];
+ BooleanPlainValuesReader boolReader = new BooleanPlainValuesReader();
+ boolReader.initFromPage(dictionaryPage.getDictionarySize(), in);
+ for (int i = 0; i < boolDictionaryContent.length; i++) {
+ boolDictionaryContent[i] = boolReader.readBoolean();
+ }
+ }
+
+ @Override
+ public boolean decodeToBoolean(int id) {
+ return boolDictionaryContent[id];
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder("PlainBooleanDictionary {\n");
+ for (int i = 0; i < boolDictionaryContent.length; i++) {
+ sb.append(i).append(" =>
").append(boolDictionaryContent[i]).append("\n");
+ }
+ return sb.append("}").toString();
+ }
+
+ @Override
+ public int getMaxId() {
+ return boolDictionaryContent.length - 1;
+ }
+ }
}
diff --git
a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
index 6f7116bc3..a91f807e7 100644
---
a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
+++
b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
@@ -25,6 +25,8 @@ import static
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.nio.ByteBuffer;
@@ -44,6 +46,7 @@ import
org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainD
import
org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter;
import
org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter;
import
org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter;
+import
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBooleanDictionary;
import org.apache.parquet.column.values.fallback.FallbackValuesWriter;
import org.apache.parquet.column.values.plain.BinaryPlainValuesReader;
import org.apache.parquet.column.values.plain.PlainValuesReader;
@@ -678,6 +681,72 @@ public class TestDictionary {
}
}
+ @Test
+ public void testBooleanDictionary() throws IOException {
+ // Create a dictionary page with boolean values (false, true)
+ // Bit-packed: bit 0 = false (0), bit 1 = true (1) => byte = 0b00000010 = 0x02
+ BytesInput bytes = BytesInput.from(new byte[] {0x02});
+ DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN);
+
+ PlainBooleanDictionary dictionary = new
PlainBooleanDictionary(dictionaryPage);
+
+ // Verify dictionary decoding
+ assertFalse(dictionary.decodeToBoolean(0));
+ assertTrue(dictionary.decodeToBoolean(1));
+ assertEquals(1, dictionary.getMaxId());
+ }
+
+ @Test
+ public void testBooleanDictionarySingleValue() throws IOException {
+ // Test dictionary with only true value
+ // Bit-packed: bit 0 = true (1) => byte = 0b00000001 = 0x01
+ BytesInput bytesTrue = BytesInput.from(new byte[] {0x01});
+ DictionaryPage dictionaryPageTrue = new DictionaryPage(bytesTrue, 1,
PLAIN);
+
+ PlainBooleanDictionary dictionaryTrue = new
PlainBooleanDictionary(dictionaryPageTrue);
+
+ assertTrue(dictionaryTrue.decodeToBoolean(0));
+ assertEquals(0, dictionaryTrue.getMaxId());
+
+ // Test dictionary with only false value
+ // Bit-packed: bit 0 = false (0) => byte = 0b00000000 = 0x00
+ BytesInput bytesFalse = BytesInput.from(new byte[] {0x00});
+ DictionaryPage dictionaryPageFalse = new DictionaryPage(bytesFalse, 1,
PLAIN);
+
+ PlainBooleanDictionary dictionaryFalse = new
PlainBooleanDictionary(dictionaryPageFalse);
+
+ assertFalse(dictionaryFalse.decodeToBoolean(0));
+ assertEquals(0, dictionaryFalse.getMaxId());
+ }
+
+ @Test
+ public void testBooleanDictionaryToString() throws IOException {
+ // Bit-packed: bit 0 = false (0), bit 1 = true (1) => byte = 0b00000010 = 0x02
+ BytesInput bytes = BytesInput.from(new byte[] {0x02});
+ DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN);
+
+ PlainBooleanDictionary dictionary = new
PlainBooleanDictionary(dictionaryPage);
+
+ String str = dictionary.toString();
+ Assert.assertTrue(str.contains("PlainBooleanDictionary"));
+ Assert.assertTrue(str.contains("0 => false"));
+ Assert.assertTrue(str.contains("1 => true"));
+ }
+
+ @Test
+ public void testBooleanDictionaryWithDictionaryEncoding() throws IOException
{
+ // Test with PLAIN_DICTIONARY encoding (both PLAIN and PLAIN_DICTIONARY should work)
+ // Bit-packed: bit 0 = true (1), bit 1 = false (0) => byte = 0b00000001 = 0x01
+ BytesInput bytes = BytesInput.from(new byte[] {0x01});
+ DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2,
PLAIN_DICTIONARY);
+
+ PlainBooleanDictionary dictionary = new
PlainBooleanDictionary(dictionaryPage);
+
+ assertEquals(true, dictionary.decodeToBoolean(0));
+ assertEquals(false, dictionary.decodeToBoolean(1));
+ assertEquals(1, dictionary.getMaxId());
+ }
+
private DictionaryValuesReader initDicReader(ValuesWriter cw,
PrimitiveTypeName type) throws IOException {
final DictionaryPage dictionaryPage = cw.toDictPageAndClose().copy();
final ColumnDescriptor descriptor = new ColumnDescriptor(new String[]
{"foo"}, type, 0, 0);