This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
new d5f86d7c0 GH-3133: Fix SizeStatistics to handle omitted histogram (#3134)
d5f86d7c0 is described below
commit d5f86d7c0e9894510e8af6dfd37444843e6d1bc4
Author: Gang Wu <[email protected]>
AuthorDate: Tue Jan 21 16:18:19 2025 +0800
GH-3133: Fix SizeStatistics to handle omitted histogram (#3134)
---
.../apache/parquet/column/statistics/SizeStatistics.java | 6 ++++--
.../parquet/column/statistics/TestSizeStatistics.java | 16 ++++++++++++++++
.../format/converter/ParquetMetadataConverter.java | 10 ++++++++--
3 files changed, 28 insertions(+), 4 deletions(-)
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
index d267d3f6d..fe1826c19 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
@@ -148,8 +148,10 @@ public class SizeStatistics {
List<Long> definitionLevelHistogram) {
this.type = type;
this.unencodedByteArrayDataBytes = unencodedByteArrayDataBytes;
- this.repetitionLevelHistogram = repetitionLevelHistogram;
- this.definitionLevelHistogram = definitionLevelHistogram;
+ this.repetitionLevelHistogram =
+ repetitionLevelHistogram == null ? Collections.emptyList() :
repetitionLevelHistogram;
+ this.definitionLevelHistogram =
+ definitionLevelHistogram == null ? Collections.emptyList() :
definitionLevelHistogram;
}
/**
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
index 32bdf54f5..813298c2b 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
@@ -125,4 +125,20 @@ public class TestSizeStatistics {
Assert.assertEquals(Arrays.asList(1L, 1L, 1L),
copy.getRepetitionLevelHistogram());
Assert.assertEquals(Arrays.asList(1L, 1L, 1L),
copy.getDefinitionLevelHistogram());
}
+
+ @Test
+ public void testOmittedHistogram() {
+ PrimitiveType type = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(LogicalTypeAnnotation.stringType())
+ .named("a");
+ SizeStatistics statistics = new SizeStatistics(type, 1024L, null, null);
+ Assert.assertEquals(Optional.of(1024L),
statistics.getUnencodedByteArrayDataBytes());
+ Assert.assertEquals(Collections.emptyList(),
statistics.getRepetitionLevelHistogram());
+ Assert.assertEquals(Collections.emptyList(),
statistics.getDefinitionLevelHistogram());
+
+ SizeStatistics copy = statistics.copy();
+ Assert.assertEquals(Optional.of(1024L),
copy.getUnencodedByteArrayDataBytes());
+ Assert.assertEquals(Collections.emptyList(),
copy.getRepetitionLevelHistogram());
+ Assert.assertEquals(Collections.emptyList(),
copy.getDefinitionLevelHistogram());
+ }
}
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index d1c6b01c9..e72f2c33a 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -2382,8 +2382,14 @@ public class ParquetMetadataConverter {
formatStats.setUnencoded_byte_array_data_bytes(
stats.getUnencodedByteArrayDataBytes().get());
}
-
formatStats.setRepetition_level_histogram(stats.getRepetitionLevelHistogram());
-
formatStats.setDefinition_level_histogram(stats.getDefinitionLevelHistogram());
+ List<Long> repLevelHistogram = stats.getRepetitionLevelHistogram();
+ if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) {
+ formatStats.setRepetition_level_histogram(repLevelHistogram);
+ }
+ List<Long> defLevelHistogram = stats.getDefinitionLevelHistogram();
+ if (defLevelHistogram != null && !defLevelHistogram.isEmpty()) {
+ formatStats.setDefinition_level_histogram(defLevelHistogram);
+ }
return formatStats;
}
}