This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git


The following commit(s) were added to refs/heads/master by this push:
     new 0ecd799dc Allow reading dictionary encoded boolean (#3370)
0ecd799dc is described below

commit 0ecd799dc01768ec0816a1ab38507b805a788f6a
Author: Fokko Driesprong <[email protected]>
AuthorDate: Fri Dec 19 16:47:10 2025 +0100

    Allow reading dictionary encoded boolean (#3370)
    
    * Allow reading dictionary encoded boolean
    
    I've observed some Parquet files in the wild that contain dictionary
    encoded boolean values, which is also wild.
    
    I don't think we want to allow producing this, but I think it would be
    good to allow reading this. We don't judge.
    
    * Thanks Gang
---
 .../java/org/apache/parquet/column/Encoding.java   |  3 +
 .../values/dictionary/PlainValuesDictionary.java   | 43 ++++++++++++++
 .../column/values/dictionary/TestDictionary.java   | 69 ++++++++++++++++++++++
 3 files changed, 115 insertions(+)

diff --git 
a/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java 
b/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java
index cadf8f2e0..874c99fde 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java
@@ -40,6 +40,7 @@ import 
org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArra
 import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader;
 import org.apache.parquet.column.values.dictionary.DictionaryValuesReader;
 import 
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBinaryDictionary;
+import 
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBooleanDictionary;
 import 
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainDoubleDictionary;
 import 
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainFloatDictionary;
 import 
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainIntegerDictionary;
@@ -102,6 +103,8 @@ public enum Encoding {
           return new PlainIntegerDictionary(dictionaryPage);
         case FLOAT:
           return new PlainFloatDictionary(dictionaryPage);
+        case BOOLEAN:
+          return new PlainBooleanDictionary(dictionaryPage);
         default:
           throw new ParquetDecodingException(
               "Dictionary encoding not supported for type: " + 
descriptor.getType());
diff --git 
a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java
 
b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java
index 436bddd3c..468c7d110 100644
--- 
a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java
+++ 
b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java
@@ -28,6 +28,7 @@ import org.apache.parquet.Preconditions;
 import org.apache.parquet.bytes.ByteBufferInputStream;
 import org.apache.parquet.column.Dictionary;
 import org.apache.parquet.column.page.DictionaryPage;
+import org.apache.parquet.column.values.plain.BooleanPlainValuesReader;
 import 
org.apache.parquet.column.values.plain.PlainValuesReader.DoublePlainValuesReader;
 import 
org.apache.parquet.column.values.plain.PlainValuesReader.FloatPlainValuesReader;
 import 
org.apache.parquet.column.values.plain.PlainValuesReader.IntegerPlainValuesReader;
@@ -300,4 +301,46 @@ public abstract class PlainValuesDictionary extends 
Dictionary {
       return floatDictionaryContent.length - 1;
     }
   }
+
+  /**
+   * a simple implementation of dictionary for plain encoded boolean values
+   */
+  public static class PlainBooleanDictionary extends PlainValuesDictionary {
+
+    private final boolean[] boolDictionaryContent;
+
+    /**
+     * @param dictionaryPage a dictionary page of encoded boolean values
+     * @throws IOException if there is an exception while decoding the 
dictionary page
+     */
+    public PlainBooleanDictionary(DictionaryPage dictionaryPage) throws 
IOException {
+      super(dictionaryPage);
+      ByteBufferInputStream in = dictionaryPage.getBytes().toInputStream();
+      boolDictionaryContent = new boolean[dictionaryPage.getDictionarySize()];
+      BooleanPlainValuesReader boolReader = new BooleanPlainValuesReader();
+      boolReader.initFromPage(dictionaryPage.getDictionarySize(), in);
+      for (int i = 0; i < boolDictionaryContent.length; i++) {
+        boolDictionaryContent[i] = boolReader.readBoolean();
+      }
+    }
+
+    @Override
+    public boolean decodeToBoolean(int id) {
+      return boolDictionaryContent[id];
+    }
+
+    @Override
+    public String toString() {
+      StringBuilder sb = new StringBuilder("PlainBooleanDictionary {\n");
+      for (int i = 0; i < boolDictionaryContent.length; i++) {
+        sb.append(i).append(" => 
").append(boolDictionaryContent[i]).append("\n");
+      }
+      return sb.append("}").toString();
+    }
+
+    @Override
+    public int getMaxId() {
+      return boolDictionaryContent.length - 1;
+    }
+  }
 }
diff --git 
a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
 
b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
index 6f7116bc3..a91f807e7 100644
--- 
a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
+++ 
b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java
@@ -25,6 +25,8 @@ import static 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
 import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
 import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
@@ -44,6 +46,7 @@ import 
org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainD
 import 
org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter;
 import 
org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter;
 import 
org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter;
+import 
org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBooleanDictionary;
 import org.apache.parquet.column.values.fallback.FallbackValuesWriter;
 import org.apache.parquet.column.values.plain.BinaryPlainValuesReader;
 import org.apache.parquet.column.values.plain.PlainValuesReader;
@@ -678,6 +681,72 @@ public class TestDictionary {
     }
   }
 
+  @Test
+  public void testBooleanDictionary() throws IOException {
+    // Create a dictionary page with boolean values (false, true)
+    // Bit-packed: bit 0 = false (0), bit 1 = true (1) => byte = 0b00000010 = 
0x02
+    BytesInput bytes = BytesInput.from(new byte[] {0x02});
+    DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN);
+
+    PlainBooleanDictionary dictionary = new 
PlainBooleanDictionary(dictionaryPage);
+
+    // Verify dictionary decoding
+    assertFalse(dictionary.decodeToBoolean(0));
+    assertTrue(dictionary.decodeToBoolean(1));
+    assertEquals(1, dictionary.getMaxId());
+  }
+
+  @Test
+  public void testBooleanDictionarySingleValue() throws IOException {
+    // Test dictionary with only true value
+    // Bit-packed: bit 0 = true (1) => byte = 0b00000001 = 0x01
+    BytesInput bytesTrue = BytesInput.from(new byte[] {0x01});
+    DictionaryPage dictionaryPageTrue = new DictionaryPage(bytesTrue, 1, 
PLAIN);
+
+    PlainBooleanDictionary dictionaryTrue = new 
PlainBooleanDictionary(dictionaryPageTrue);
+
+    assertTrue(dictionaryTrue.decodeToBoolean(0));
+    assertEquals(0, dictionaryTrue.getMaxId());
+
+    // Test dictionary with only false value
+    // Bit-packed: bit 0 = false (0) => byte = 0b00000000 = 0x00
+    BytesInput bytesFalse = BytesInput.from(new byte[] {0x00});
+    DictionaryPage dictionaryPageFalse = new DictionaryPage(bytesFalse, 1, 
PLAIN);
+
+    PlainBooleanDictionary dictionaryFalse = new 
PlainBooleanDictionary(dictionaryPageFalse);
+
+    assertFalse(dictionaryFalse.decodeToBoolean(0));
+    assertEquals(0, dictionaryFalse.getMaxId());
+  }
+
+  @Test
+  public void testBooleanDictionaryToString() throws IOException {
+    // Bit-packed: bit 0 = false (0), bit 1 = true (1) => byte = 0b00000010 = 
0x02
+    BytesInput bytes = BytesInput.from(new byte[] {0x02});
+    DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN);
+
+    PlainBooleanDictionary dictionary = new 
PlainBooleanDictionary(dictionaryPage);
+
+    String str = dictionary.toString();
+    Assert.assertTrue(str.contains("PlainBooleanDictionary"));
+    Assert.assertTrue(str.contains("0 => false"));
+    Assert.assertTrue(str.contains("1 => true"));
+  }
+
+  @Test
+  public void testBooleanDictionaryWithDictionaryEncoding() throws IOException 
{
+    // Test with PLAIN_DICTIONARY encoding (both PLAIN and PLAIN_DICTIONARY 
should work)
+    // Bit-packed: bit 0 = true (1), bit 1 = false (0) => byte = 0b00000001 = 
0x01
+    BytesInput bytes = BytesInput.from(new byte[] {0x01});
+    DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, 
PLAIN_DICTIONARY);
+
+    PlainBooleanDictionary dictionary = new 
PlainBooleanDictionary(dictionaryPage);
+
+    assertEquals(true, dictionary.decodeToBoolean(0));
+    assertEquals(false, dictionary.decodeToBoolean(1));
+    assertEquals(1, dictionary.getMaxId());
+  }
+
   private DictionaryValuesReader initDicReader(ValuesWriter cw, 
PrimitiveTypeName type) throws IOException {
     final DictionaryPage dictionaryPage = cw.toDictPageAndClose().copy();
     final ColumnDescriptor descriptor = new ColumnDescriptor(new String[] 
{"foo"}, type, 0, 0);

Reply via email to