This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git


The following commit(s) were added to refs/heads/master by this push:
     new d5f86d7c0 GH-3133: Fix SizeStatistics to handle omitted histogram (#3134)
d5f86d7c0 is described below

commit d5f86d7c0e9894510e8af6dfd37444843e6d1bc4
Author: Gang Wu <[email protected]>
AuthorDate: Tue Jan 21 16:18:19 2025 +0800

    GH-3133: Fix SizeStatistics to handle omitted histogram (#3134)
---
 .../apache/parquet/column/statistics/SizeStatistics.java |  6 ++++--
 .../parquet/column/statistics/TestSizeStatistics.java    | 16 ++++++++++++++++
 .../format/converter/ParquetMetadataConverter.java       | 10 ++++++++--
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
index d267d3f6d..fe1826c19 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
@@ -148,8 +148,10 @@ public class SizeStatistics {
       List<Long> definitionLevelHistogram) {
     this.type = type;
     this.unencodedByteArrayDataBytes = unencodedByteArrayDataBytes;
-    this.repetitionLevelHistogram = repetitionLevelHistogram;
-    this.definitionLevelHistogram = definitionLevelHistogram;
+    this.repetitionLevelHistogram =
+        repetitionLevelHistogram == null ? Collections.emptyList() : repetitionLevelHistogram;
+    this.definitionLevelHistogram =
+        definitionLevelHistogram == null ? Collections.emptyList() : definitionLevelHistogram;
   }
 
   /**
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
index 32bdf54f5..813298c2b 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
@@ -125,4 +125,20 @@ public class TestSizeStatistics {
     Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getRepetitionLevelHistogram());
     Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getDefinitionLevelHistogram());
   }
+
+  @Test
+  public void testOmittedHistogram() {
+    PrimitiveType type = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY)
+        .as(LogicalTypeAnnotation.stringType())
+        .named("a");
+    SizeStatistics statistics = new SizeStatistics(type, 1024L, null, null);
+    Assert.assertEquals(Optional.of(1024L), statistics.getUnencodedByteArrayDataBytes());
+    Assert.assertEquals(Collections.emptyList(), statistics.getRepetitionLevelHistogram());
+    Assert.assertEquals(Collections.emptyList(), statistics.getDefinitionLevelHistogram());
+
+    SizeStatistics copy = statistics.copy();
+    Assert.assertEquals(Optional.of(1024L), copy.getUnencodedByteArrayDataBytes());
+    Assert.assertEquals(Collections.emptyList(), copy.getRepetitionLevelHistogram());
+    Assert.assertEquals(Collections.emptyList(), copy.getDefinitionLevelHistogram());
+  }
 }
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index d1c6b01c9..e72f2c33a 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -2382,8 +2382,14 @@ public class ParquetMetadataConverter {
       formatStats.setUnencoded_byte_array_data_bytes(
           stats.getUnencodedByteArrayDataBytes().get());
     }
-    formatStats.setRepetition_level_histogram(stats.getRepetitionLevelHistogram());
-    formatStats.setDefinition_level_histogram(stats.getDefinitionLevelHistogram());
+    List<Long> repLevelHistogram = stats.getRepetitionLevelHistogram();
+    if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) {
+      formatStats.setRepetition_level_histogram(repLevelHistogram);
+    }
+    List<Long> defLevelHistogram = stats.getDefinitionLevelHistogram();
+    if (defLevelHistogram != null && !defLevelHistogram.isEmpty()) {
+      formatStats.setDefinition_level_histogram(defLevelHistogram);
+    }
     return formatStats;
   }
 }

Reply via email to