This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch parquet-1.15.x
in repository https://gitbox.apache.org/repos/asf/parquet-java.git


The following commit(s) were added to refs/heads/parquet-1.15.x by this push:
     new a19e98512 GH-3133: Fix SizeStatistics to handle omitted histogram (#3135)
a19e98512 is described below

commit a19e98512ccdb349f8d71ffbd8f64019b09e0958
Author: Gang Wu <[email protected]>
AuthorDate: Tue Mar 4 22:39:39 2025 +0800

    GH-3133: Fix SizeStatistics to handle omitted histogram (#3135)
---
 .../parquet/column/statistics/SizeStatistics.java       |  6 ++++--
 .../parquet/column/statistics/TestSizeStatistics.java   | 17 +++++++++++++++++
 .../format/converter/ParquetMetadataConverter.java      | 10 ++++++++--
 parquet-plugins/parquet-encoding-vector/pom.xml         |  2 +-
 parquet-plugins/parquet-plugins-benchmarks/pom.xml      |  2 +-
 5 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
index 97a49be65..0dbb20e66 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
@@ -136,8 +136,10 @@ public class SizeStatistics {
       List<Long> definitionLevelHistogram) {
     this.type = type;
     this.unencodedByteArrayDataBytes = unencodedByteArrayDataBytes;
-    this.repetitionLevelHistogram = repetitionLevelHistogram;
-    this.definitionLevelHistogram = definitionLevelHistogram;
+    this.repetitionLevelHistogram =
+        repetitionLevelHistogram == null ? Collections.emptyList() : repetitionLevelHistogram;
+    this.definitionLevelHistogram =
+        definitionLevelHistogram == null ? Collections.emptyList() : definitionLevelHistogram;
   }
 
   /**
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
index 6c166b0e7..6e2b68167 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
@@ -19,6 +19,7 @@
 package org.apache.parquet.column.statistics;
 
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Optional;
 import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.schema.LogicalTypeAnnotation;
@@ -124,4 +125,20 @@ public class TestSizeStatistics {
     Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getRepetitionLevelHistogram());
     Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getDefinitionLevelHistogram());
   }
+
+  @Test
+  public void testOmittedHistogram() {
+    PrimitiveType type = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY)
+        .as(LogicalTypeAnnotation.stringType())
+        .named("a");
+    SizeStatistics statistics = new SizeStatistics(type, 1024L, null, null);
+    Assert.assertEquals(Optional.of(1024L), statistics.getUnencodedByteArrayDataBytes());
+    Assert.assertEquals(Collections.emptyList(), statistics.getRepetitionLevelHistogram());
+    Assert.assertEquals(Collections.emptyList(), statistics.getDefinitionLevelHistogram());
+
+    SizeStatistics copy = statistics.copy();
+    Assert.assertEquals(Optional.of(1024L), copy.getUnencodedByteArrayDataBytes());
+    Assert.assertEquals(Collections.emptyList(), copy.getRepetitionLevelHistogram());
+    Assert.assertEquals(Collections.emptyList(), copy.getDefinitionLevelHistogram());
+  }
 }
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index d1c6b01c9..e72f2c33a 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -2382,8 +2382,14 @@ public class ParquetMetadataConverter {
       formatStats.setUnencoded_byte_array_data_bytes(
           stats.getUnencodedByteArrayDataBytes().get());
     }
-    formatStats.setRepetition_level_histogram(stats.getRepetitionLevelHistogram());
-    formatStats.setDefinition_level_histogram(stats.getDefinitionLevelHistogram());
+    List<Long> repLevelHistogram = stats.getRepetitionLevelHistogram();
+    if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) {
+      formatStats.setRepetition_level_histogram(repLevelHistogram);
+    }
+    List<Long> defLevelHistogram = stats.getDefinitionLevelHistogram();
+    if (defLevelHistogram != null && !defLevelHistogram.isEmpty()) {
+      formatStats.setDefinition_level_histogram(defLevelHistogram);
+    }
     return formatStats;
   }
 }
diff --git a/parquet-plugins/parquet-encoding-vector/pom.xml b/parquet-plugins/parquet-encoding-vector/pom.xml
index 4b79efdeb..390ac8800 100644
--- a/parquet-plugins/parquet-encoding-vector/pom.xml
+++ b/parquet-plugins/parquet-encoding-vector/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.parquet</groupId>
     <artifactId>parquet</artifactId>
-    <version>1.15.0-SNAPSHOT</version>
+    <version>1.15.1-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 
diff --git a/parquet-plugins/parquet-plugins-benchmarks/pom.xml b/parquet-plugins/parquet-plugins-benchmarks/pom.xml
index 19e500bdc..99f779c8f 100644
--- a/parquet-plugins/parquet-plugins-benchmarks/pom.xml
+++ b/parquet-plugins/parquet-plugins-benchmarks/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>org.apache.parquet</groupId>
     <artifactId>parquet</artifactId>
-    <version>1.15.0-SNAPSHOT</version>
+    <version>1.15.1-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>
 

Reply via email to