This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
new 5187eb193 GH-3263: Add DictionaryPage.decode to allow dictionary reuse
in the ColumnReaderBase ctor (#3264)
5187eb193 is described below
commit 5187eb1933356bbec11ff82a2c928b0fb92ce82e
Author: pyckle <[email protected]>
AuthorDate: Fri Aug 15 17:15:45 2025 +0300
GH-3263: Add DictionaryPage.decode to allow dictionary reuse in the
ColumnReaderBase ctor (#3264)
---
.../parquet/column/impl/ColumnReaderBase.java | 14 +----
.../apache/parquet/column/page/DictionaryPage.java | 14 +++++
.../parquet/column/impl/TestColumnReaderImpl.java | 70 ++++++++++++++++++----
3 files changed, 76 insertions(+), 22 deletions(-)
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java
index 87418409f..2b3e47116 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnReaderBase.java
@@ -453,17 +453,9 @@ abstract class ColumnReaderBase implements ColumnReader {
this.writerVersion = writerVersion;
this.maxDefinitionLevel = path.getMaxDefinitionLevel();
DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
- if (dictionaryPage != null) {
- try {
- this.dictionary = dictionaryPage.getEncoding().initDictionary(path, dictionaryPage);
- if (converter.hasDictionarySupport()) {
- converter.setDictionary(dictionary);
- }
- } catch (IOException e) {
- throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
- }
- } else {
- this.dictionary = null;
+ this.dictionary = dictionaryPage == null ? null : dictionaryPage.decode(path);
+ if (dictionary != null && converter.hasDictionarySupport()) {
+ converter.setDictionary(dictionary);
}
this.totalValueCount = pageReader.getTotalValueCount();
if (totalValueCount <= 0) {
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/page/DictionaryPage.java b/parquet-column/src/main/java/org/apache/parquet/column/page/DictionaryPage.java
index e32c0e0ce..26cc031ac 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/page/DictionaryPage.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/page/DictionaryPage.java
@@ -21,7 +21,10 @@ package org.apache.parquet.column.page;
import java.io.IOException;
import java.util.Objects;
import org.apache.parquet.bytes.BytesInput;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.Dictionary;
import org.apache.parquet.column.Encoding;
+import org.apache.parquet.io.ParquetDecodingException;
/**
* Data for a dictionary page
@@ -74,6 +77,17 @@ public class DictionaryPage extends Page {
return new DictionaryPage(BytesInput.copy(bytes), getUncompressedSize(), dictionarySize, encoding);
}
+ /**
+ * @return the decoded dictionary
+ */
+ public Dictionary decode(ColumnDescriptor path) {
+ try {
+ return getEncoding().initDictionary(path, this);
+ } catch (IOException e) {
+ throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
+ }
+ }
+
@Override
public String toString() {
return "Page [bytes.size=" + bytes.size() + ", entryCount=" + dictionarySize + ", uncompressedSize="
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java b/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java
index ac6818952..63b9e957f 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/impl/TestColumnReaderImpl.java
@@ -24,17 +24,21 @@ import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_
import java.util.List;
import org.apache.parquet.Version;
import org.apache.parquet.VersionParser;
+import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
+import org.apache.parquet.column.Dictionary;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPageV2;
+import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.mem.MemPageReader;
import org.apache.parquet.column.page.mem.MemPageWriter;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
+import org.junit.Assert;
import org.junit.Test;
public class TestColumnReaderImpl {
@@ -53,8 +57,28 @@ public class TestColumnReaderImpl {
@Test
public void test() throws Exception {
+ ColumnDescriptor col = requiredBinaryColumn();
+ MemPageWriter pageWriter = writeBinaryDictColumn(col);
+ List<DataPage> pages = pageWriter.getPages();
+ int valueCount = 0;
+ int rowCount = 0;
+ for (DataPage dataPage : pages) {
+ valueCount += dataPage.getValueCount();
+ rowCount += ((DataPageV2) dataPage).getRowCount();
+ }
+ assertEquals(rows, rowCount);
+ assertEquals(rows, valueCount);
+ MemPageReader pageReader = toReader(pageWriter);
+ validateExpectedValuesAndCount(col, pageReader);
+ }
+
+ private static ColumnDescriptor requiredBinaryColumn() {
MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
ColumnDescriptor col = schema.getColumns().get(0);
+ return col;
+ }
+
+ private MemPageWriter writeBinaryDictColumn(ColumnDescriptor col) {
MemPageWriter pageWriter = new MemPageWriter();
ColumnWriterV2 columnWriterV2 = new ColumnWriterV2(
col,
@@ -72,16 +96,15 @@ public class TestColumnReaderImpl {
}
columnWriterV2.writePage();
columnWriterV2.finalizeColumnChunk();
- List<DataPage> pages = pageWriter.getPages();
- int valueCount = 0;
- int rowCount = 0;
- for (DataPage dataPage : pages) {
- valueCount += dataPage.getValueCount();
- rowCount += ((DataPageV2) dataPage).getRowCount();
- }
- assertEquals(rows, rowCount);
- assertEquals(rows, valueCount);
- MemPageReader pageReader = new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
+ return pageWriter;
+ }
+
+ private MemPageReader toReader(MemPageWriter pageWriter) {
+ return new MemPageReader(rows, pageWriter.getPages().iterator(), pageWriter.getDictionaryPage());
+ }
+
+ private void validateExpectedValuesAndCount(ColumnDescriptor col, MemPageReader pageReader)
+ throws VersionParser.VersionParseException {
ValidatingConverter converter = new ValidatingConverter();
ColumnReader columnReader =
new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
@@ -124,7 +147,7 @@ public class TestColumnReaderImpl {
}
assertEquals(rows, rowCount);
assertEquals(rows, valueCount);
- MemPageReader pageReader = new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
+ MemPageReader pageReader = toReader(pageWriter);
ValidatingConverter converter = new ValidatingConverter();
ColumnReader columnReader =
new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
@@ -135,4 +158,29 @@ public class TestColumnReaderImpl {
}
assertEquals(0, converter.count);
}
+
+ @Test
+ public void testDeduplicatedDecodedDictionary() throws Exception {
+ ColumnDescriptor col = requiredBinaryColumn();
+ MemPageWriter pageWriter = writeBinaryDictColumn(col);
+
+ DictionaryPage dictionaryPage = pageWriter.getDictionaryPage();
+ Assert.assertNotNull("Expected a dictionary", dictionaryPage);
+
+ Dictionary dict = dictionaryPage.decode(col);
+
+ // construct a page reader from a dictionary page that lacks bytes but stores the decoded data.
+ MemPageReader pageReader = new MemPageReader(
+ rows,
+ pageWriter.getPages().iterator(),
+ new DictionaryPage(
+ BytesInput.empty(), dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding()) {
+ @Override
+ public Dictionary decode(ColumnDescriptor path) {
+ return dict;
+ }
+ });
+
+ validateExpectedValuesAndCount(col, pageReader);
+ }
}