This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch parquet-1.15.x
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/parquet-1.15.x by this push:
new a19e98512 GH-3133: Fix SizeStatistics to handle omitted histogram (#3135)
a19e98512 is described below
commit a19e98512ccdb349f8d71ffbd8f64019b09e0958
Author: Gang Wu <[email protected]>
AuthorDate: Tue Mar 4 22:39:39 2025 +0800
GH-3133: Fix SizeStatistics to handle omitted histogram (#3135)
---
.../parquet/column/statistics/SizeStatistics.java | 6 ++++--
.../parquet/column/statistics/TestSizeStatistics.java | 17 +++++++++++++++++
.../format/converter/ParquetMetadataConverter.java | 10 ++++++++--
parquet-plugins/parquet-encoding-vector/pom.xml | 2 +-
parquet-plugins/parquet-plugins-benchmarks/pom.xml | 2 +-
5 files changed, 31 insertions(+), 6 deletions(-)
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
index 97a49be65..0dbb20e66 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
@@ -136,8 +136,10 @@ public class SizeStatistics {
List<Long> definitionLevelHistogram) {
this.type = type;
this.unencodedByteArrayDataBytes = unencodedByteArrayDataBytes;
- this.repetitionLevelHistogram = repetitionLevelHistogram;
- this.definitionLevelHistogram = definitionLevelHistogram;
+ this.repetitionLevelHistogram =
+ repetitionLevelHistogram == null ? Collections.emptyList() : repetitionLevelHistogram;
+ this.definitionLevelHistogram =
+ definitionLevelHistogram == null ? Collections.emptyList() : definitionLevelHistogram;
}
/**
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
index 6c166b0e7..6e2b68167 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
@@ -19,6 +19,7 @@
package org.apache.parquet.column.statistics;
import java.util.Arrays;
+import java.util.Collections;
import java.util.Optional;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.LogicalTypeAnnotation;
@@ -124,4 +125,20 @@ public class TestSizeStatistics {
Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getRepetitionLevelHistogram());
Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getDefinitionLevelHistogram());
}
+
+ @Test
+ public void testOmittedHistogram() {
+ PrimitiveType type = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(LogicalTypeAnnotation.stringType())
+ .named("a");
+ SizeStatistics statistics = new SizeStatistics(type, 1024L, null, null);
+ Assert.assertEquals(Optional.of(1024L), statistics.getUnencodedByteArrayDataBytes());
+ Assert.assertEquals(Collections.emptyList(), statistics.getRepetitionLevelHistogram());
+ Assert.assertEquals(Collections.emptyList(), statistics.getDefinitionLevelHistogram());
+
+ SizeStatistics copy = statistics.copy();
+ Assert.assertEquals(Optional.of(1024L), copy.getUnencodedByteArrayDataBytes());
+ Assert.assertEquals(Collections.emptyList(), copy.getRepetitionLevelHistogram());
+ Assert.assertEquals(Collections.emptyList(), copy.getDefinitionLevelHistogram());
+ }
}
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index d1c6b01c9..e72f2c33a 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -2382,8 +2382,14 @@ public class ParquetMetadataConverter {
formatStats.setUnencoded_byte_array_data_bytes(
stats.getUnencodedByteArrayDataBytes().get());
}
- formatStats.setRepetition_level_histogram(stats.getRepetitionLevelHistogram());
- formatStats.setDefinition_level_histogram(stats.getDefinitionLevelHistogram());
+ List<Long> repLevelHistogram = stats.getRepetitionLevelHistogram();
+ if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) {
+ formatStats.setRepetition_level_histogram(repLevelHistogram);
+ }
+ List<Long> defLevelHistogram = stats.getDefinitionLevelHistogram();
+ if (defLevelHistogram != null && !defLevelHistogram.isEmpty()) {
+ formatStats.setDefinition_level_histogram(defLevelHistogram);
+ }
return formatStats;
}
}
diff --git a/parquet-plugins/parquet-encoding-vector/pom.xml b/parquet-plugins/parquet-encoding-vector/pom.xml
index 4b79efdeb..390ac8800 100644
--- a/parquet-plugins/parquet-encoding-vector/pom.xml
+++ b/parquet-plugins/parquet-encoding-vector/pom.xml
@@ -22,7 +22,7 @@
<parent>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
- <version>1.15.0-SNAPSHOT</version>
+ <version>1.15.1-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
diff --git a/parquet-plugins/parquet-plugins-benchmarks/pom.xml b/parquet-plugins/parquet-plugins-benchmarks/pom.xml
index 19e500bdc..99f779c8f 100644
--- a/parquet-plugins/parquet-plugins-benchmarks/pom.xml
+++ b/parquet-plugins/parquet-plugins-benchmarks/pom.xml
@@ -22,7 +22,7 @@
<parent>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet</artifactId>
- <version>1.15.0-SNAPSHOT</version>
+ <version>1.15.1-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>