This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git


The following commit(s) were added to refs/heads/master by this push:
     new 5187eb193 GH-3263: Add DictionaryPage.decode to allow dictionary reuse in the ColumnReaderBase ctor (#3264)
5187eb193 is described below

commit 5187eb1933356bbec11ff82a2c928b0fb92ce82e
Author: pyckle <[email protected]>
AuthorDate: Fri Aug 15 17:15:45 2025 +0300

    GH-3263: Add DictionaryPage.decode to allow dictionary reuse in the ColumnReaderBase ctor (#3264)
---
 .../parquet/column/impl/ColumnReaderBase.java      | 14 +----
 .../apache/parquet/column/page/DictionaryPage.java | 14 +++++
 .../parquet/column/impl/TestColumnReaderImpl.java  | 70 ++++++++++++++++++----
 3 files changed, 76 insertions(+), 22 deletions(-)

diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java
index 87418409f..2b3e47116 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java
@@ -453,17 +453,9 @@ abstract class ColumnReaderBase implements ColumnReader {
     this.writerVersion = writerVersion;
     this.maxDefinitionLevel = path.getMaxDefinitionLevel();
     DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
-    if (dictionaryPage != null) {
-      try {
-        this.dictionary = dictionaryPage.getEncoding().initDictionary(path, dictionaryPage);
-        if (converter.hasDictionarySupport()) {
-          converter.setDictionary(dictionary);
-        }
-      } catch (IOException e) {
-        throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
-      }
-    } else {
-      this.dictionary = null;
+    this.dictionary = dictionaryPage == null ? null : dictionaryPage.decode(path);
+    if (dictionary != null && converter.hasDictionarySupport()) {
+      converter.setDictionary(dictionary);
     }
     this.totalValueCount = pageReader.getTotalValueCount();
     if (totalValueCount <= 0) {
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/page/DictionaryPage.java b/parquet-column/src/main/java/org/apache/parquet/column/page/DictionaryPage.java
index e32c0e0ce..26cc031ac 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/page/DictionaryPage.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/page/DictionaryPage.java
@@ -21,7 +21,10 @@ package org.apache.parquet.column.page;
 import java.io.IOException;
 import java.util.Objects;
 import org.apache.parquet.bytes.BytesInput;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.Dictionary;
 import org.apache.parquet.column.Encoding;
+import org.apache.parquet.io.ParquetDecodingException;
 
 /**
  * Data for a dictionary page
@@ -74,6 +77,17 @@ public class DictionaryPage extends Page {
     return new DictionaryPage(BytesInput.copy(bytes), getUncompressedSize(), dictionarySize, encoding);
   }
 
+  /**
+   * @return the decoded dictionary
+   */
+  public Dictionary decode(ColumnDescriptor path) {
+    try {
+      return getEncoding().initDictionary(path, this);
+    } catch (IOException e) {
+      throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
+    }
+  }
+
   @Override
   public String toString() {
     return "Page [bytes.size=" + bytes.size() + ", entryCount=" + dictionarySize + ", uncompressedSize="
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java b/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java
index ac6818952..63b9e957f 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java
@@ -24,17 +24,21 @@ import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_
 import java.util.List;
 import org.apache.parquet.Version;
 import org.apache.parquet.VersionParser;
+import org.apache.parquet.bytes.BytesInput;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.column.ColumnReader;
+import org.apache.parquet.column.Dictionary;
 import org.apache.parquet.column.ParquetProperties;
 import org.apache.parquet.column.page.DataPage;
 import org.apache.parquet.column.page.DataPageV2;
+import org.apache.parquet.column.page.DictionaryPage;
 import org.apache.parquet.column.page.mem.MemPageReader;
 import org.apache.parquet.column.page.mem.MemPageWriter;
 import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.io.api.PrimitiveConverter;
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.MessageTypeParser;
+import org.junit.Assert;
 import org.junit.Test;
 
 public class TestColumnReaderImpl {
@@ -53,8 +57,28 @@ public class TestColumnReaderImpl {
 
   @Test
   public void test() throws Exception {
+    ColumnDescriptor col = requiredBinaryColumn();
+    MemPageWriter pageWriter = writeBinaryDictColumn(col);
+    List<DataPage> pages = pageWriter.getPages();
+    int valueCount = 0;
+    int rowCount = 0;
+    for (DataPage dataPage : pages) {
+      valueCount += dataPage.getValueCount();
+      rowCount += ((DataPageV2) dataPage).getRowCount();
+    }
+    assertEquals(rows, rowCount);
+    assertEquals(rows, valueCount);
+    MemPageReader pageReader = toReader(pageWriter);
+    validateExpectedValuesAndCount(col, pageReader);
+  }
+
+  private static ColumnDescriptor requiredBinaryColumn() {
     MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
     ColumnDescriptor col = schema.getColumns().get(0);
+    return col;
+  }
+
+  private MemPageWriter writeBinaryDictColumn(ColumnDescriptor col) {
     MemPageWriter pageWriter = new MemPageWriter();
     ColumnWriterV2 columnWriterV2 = new ColumnWriterV2(
         col,
@@ -72,16 +96,15 @@ public class TestColumnReaderImpl {
     }
     columnWriterV2.writePage();
     columnWriterV2.finalizeColumnChunk();
-    List<DataPage> pages = pageWriter.getPages();
-    int valueCount = 0;
-    int rowCount = 0;
-    for (DataPage dataPage : pages) {
-      valueCount += dataPage.getValueCount();
-      rowCount += ((DataPageV2) dataPage).getRowCount();
-    }
-    assertEquals(rows, rowCount);
-    assertEquals(rows, valueCount);
-    MemPageReader pageReader = new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
+    return pageWriter;
+  }
+
+  private MemPageReader toReader(MemPageWriter pageWriter) {
+    return new MemPageReader(rows, pageWriter.getPages().iterator(), pageWriter.getDictionaryPage());
+  }
+
+  private void validateExpectedValuesAndCount(ColumnDescriptor col, MemPageReader pageReader)
+      throws VersionParser.VersionParseException {
     ValidatingConverter converter = new ValidatingConverter();
     ColumnReader columnReader =
         new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
@@ -124,7 +147,7 @@ public class TestColumnReaderImpl {
     }
     assertEquals(rows, rowCount);
     assertEquals(rows, valueCount);
-    MemPageReader pageReader = new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
+    MemPageReader pageReader = toReader(pageWriter);
     ValidatingConverter converter = new ValidatingConverter();
     ColumnReader columnReader =
         new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
@@ -135,4 +158,29 @@ public class TestColumnReaderImpl {
     }
     assertEquals(0, converter.count);
   }
+
+  @Test
+  public void testDeduplicatedDecodedDictionary() throws Exception {
+    ColumnDescriptor col = requiredBinaryColumn();
+    MemPageWriter pageWriter = writeBinaryDictColumn(col);
+
+    DictionaryPage dictionaryPage = pageWriter.getDictionaryPage();
+    Assert.assertNotNull("Expected a dictionary", dictionaryPage);
+
+    Dictionary dict = dictionaryPage.decode(col);
+
+    // construct a page reader from a dictionary page that lacks bytes but stores the decoded data.
+    MemPageReader pageReader = new MemPageReader(
+        rows,
+        pageWriter.getPages().iterator(),
+        new DictionaryPage(
+            BytesInput.empty(), dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding()) {
+          @Override
+          public Dictionary decode(ColumnDescriptor path) {
+            return dict;
+          }
+        });
+
+    validateExpectedValuesAndCount(col, pageReader);
+  }
 }

Reply via email to