This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git


The following commit(s) were added to refs/heads/master by this push:
     new ec537c7da GH-3123: Omit level histogram for some max levels (#3124)
ec537c7da is described below

commit ec537c7da416ce974bbf87a8ce545ba4de69773b
Author: Gang Wu <[email protected]>
AuthorDate: Tue Jan 21 12:51:18 2025 +0800

    GH-3123: Omit level histogram for some max levels (#3124)
---
 .../parquet/column/statistics/SizeStatistics.java  | 51 ++++++++++++-----
 .../column/statistics/TestSizeStatistics.java      |  3 +-
 .../statistics/TestSizeStatisticsRoundTrip.java    | 66 ++++++++++++++++++++--
 3 files changed, 102 insertions(+), 18 deletions(-)

diff --git 
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
 
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
index 97a49be65..d267d3f6d 100644
--- 
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
+++ 
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
@@ -67,8 +67,16 @@ public class SizeStatistics {
     private Builder(PrimitiveType type, int maxRepetitionLevel, int 
maxDefinitionLevel) {
       this.type = type;
       this.unencodedByteArrayDataBytes = 0L;
-      repetitionLevelHistogram = new long[maxRepetitionLevel + 1];
-      definitionLevelHistogram = new long[maxDefinitionLevel + 1];
+      if (maxRepetitionLevel > 0) {
+        repetitionLevelHistogram = new long[maxRepetitionLevel + 1];
+      } else {
+        repetitionLevelHistogram = new long[0]; // omitted
+      }
+      if (maxDefinitionLevel > 1) {
+        definitionLevelHistogram = new long[maxDefinitionLevel + 1];
+      } else {
+        definitionLevelHistogram = new long[0]; // omitted
+      }
     }
 
     /**
@@ -79,8 +87,12 @@ public class SizeStatistics {
      * @param definitionLevel definition level of the value
      */
     public void add(int repetitionLevel, int definitionLevel) {
-      repetitionLevelHistogram[repetitionLevel]++;
-      definitionLevelHistogram[definitionLevel]++;
+      if (repetitionLevelHistogram.length > 0) {
+        repetitionLevelHistogram[repetitionLevel]++;
+      }
+      if (definitionLevelHistogram.length > 0) {
+        definitionLevelHistogram[definitionLevel]++;
+      }
     }
 
     /**
@@ -160,16 +172,29 @@ public class SizeStatistics {
 
     Preconditions.checkArgument(type.equals(other.type), "Cannot merge 
SizeStatistics of different types");
     unencodedByteArrayDataBytes = Math.addExact(unencodedByteArrayDataBytes, 
other.unencodedByteArrayDataBytes);
-    for (int i = 0; i < repetitionLevelHistogram.size(); i++) {
-      repetitionLevelHistogram.set(
-          i, Math.addExact(repetitionLevelHistogram.get(i), 
other.repetitionLevelHistogram.get(i)));
+
+    if (other.repetitionLevelHistogram.isEmpty()) {
+      repetitionLevelHistogram.clear();
+    } else {
+      Preconditions.checkArgument(
+          repetitionLevelHistogram.size() == 
other.repetitionLevelHistogram.size(),
+          "Cannot merge SizeStatistics with different repetition level 
histogram size");
+      for (int i = 0; i < repetitionLevelHistogram.size(); i++) {
+        repetitionLevelHistogram.set(
+            i, Math.addExact(repetitionLevelHistogram.get(i), 
other.repetitionLevelHistogram.get(i)));
+      }
     }
-    for (int i = 0; i < definitionLevelHistogram.size(); i++) {
-      definitionLevelHistogram.set(
-          i,
-          Math.addExact(
-              definitionLevelHistogram.get(i),
-              other.getDefinitionLevelHistogram().get(i)));
+
+    if (other.definitionLevelHistogram.isEmpty()) {
+      definitionLevelHistogram.clear();
+    } else {
+      Preconditions.checkArgument(
+          definitionLevelHistogram.size() == 
other.definitionLevelHistogram.size(),
+          "Cannot merge SizeStatistics with different definition level 
histogram size");
+      for (int i = 0; i < definitionLevelHistogram.size(); i++) {
+        definitionLevelHistogram.set(
+            i, Math.addExact(definitionLevelHistogram.get(i), 
other.definitionLevelHistogram.get(i)));
+      }
     }
   }
 
diff --git 
a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
 
b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
index 6c166b0e7..32bdf54f5 100644
--- 
a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
+++ 
b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
@@ -19,6 +19,7 @@
 package org.apache.parquet.column.statistics;
 
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Optional;
 import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.schema.LogicalTypeAnnotation;
@@ -67,7 +68,7 @@ public class TestSizeStatistics {
     SizeStatistics statistics = builder.build();
     Assert.assertEquals(Optional.empty(), 
statistics.getUnencodedByteArrayDataBytes());
     Assert.assertEquals(Arrays.asList(2L, 4L), 
statistics.getRepetitionLevelHistogram());
-    Assert.assertEquals(Arrays.asList(3L, 3L), 
statistics.getDefinitionLevelHistogram());
+    Assert.assertEquals(Collections.emptyList(), 
statistics.getDefinitionLevelHistogram());
   }
 
   @Test
diff --git 
a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestSizeStatisticsRoundTrip.java
 
b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestSizeStatisticsRoundTrip.java
index 59e4aff2d..026e13a3b 100644
--- 
a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestSizeStatisticsRoundTrip.java
+++ 
b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestSizeStatisticsRoundTrip.java
@@ -21,6 +21,7 @@ package org.apache.parquet.statistics;
 import java.io.File;
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Optional;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
@@ -82,12 +83,12 @@ public class TestSizeStatisticsRoundTrip {
 
       SizeStatistics sizeStatistics = column.getSizeStatistics();
       Assert.assertEquals(Optional.of(4L), 
sizeStatistics.getUnencodedByteArrayDataBytes());
-      Assert.assertEquals(Arrays.asList(4L), 
sizeStatistics.getRepetitionLevelHistogram());
-      Assert.assertEquals(Arrays.asList(0L, 4L), 
sizeStatistics.getDefinitionLevelHistogram());
+      Assert.assertEquals(Collections.emptyList(), 
sizeStatistics.getRepetitionLevelHistogram());
+      Assert.assertEquals(Collections.emptyList(), 
sizeStatistics.getDefinitionLevelHistogram());
 
       ColumnIndex columnIndex = reader.readColumnIndex(column);
-      Assert.assertEquals(Arrays.asList(2L, 2L), 
columnIndex.getRepetitionLevelHistogram());
-      Assert.assertEquals(Arrays.asList(0L, 2L, 0L, 2L), 
columnIndex.getDefinitionLevelHistogram());
+      Assert.assertEquals(Collections.emptyList(), 
columnIndex.getRepetitionLevelHistogram());
+      Assert.assertEquals(Collections.emptyList(), 
columnIndex.getDefinitionLevelHistogram());
 
       OffsetIndex offsetIndex = reader.readOffsetIndex(column);
       Assert.assertEquals(2, offsetIndex.getPageCount());
@@ -96,6 +97,63 @@ public class TestSizeStatisticsRoundTrip {
     }
   }
 
+  @Test
+  public void testNestedRepeatedOptionalColumnSizeStatistics() throws 
IOException {
+    MessageType schema = Types.buildMessage()
+        .optionalGroup()
+        .repeatedGroup()
+        .optional(PrimitiveType.PrimitiveTypeName.BINARY)
+        .as(LogicalTypeAnnotation.stringType())
+        .named("value")
+        .named("list")
+        .named("outer")
+        .named("msg");
+
+    Configuration conf = new Configuration();
+    GroupWriteSupport.setSchema(schema, conf);
+
+    GroupFactory factory = new SimpleGroupFactory(schema);
+    Path path = newTempPath();
+    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+        .withPageRowCountLimit(2)
+        .withMinRowCountForPageSizeCheck(1)
+        .withConf(conf)
+        .build()) {
+      // Create groups with different nesting patterns
+      Group g1 = factory.newGroup();
+      Group outer1 = g1.addGroup("outer");
+      Group list1 = outer1.addGroup("list");
+      list1.append("value", "a");
+      Group list2 = outer1.addGroup("list");
+      list2.append("value", "b");
+      writer.write(g1);
+
+      Group g2 = factory.newGroup();
+      Group outer2 = g2.addGroup("outer");
+      Group list3 = outer2.addGroup("list");
+      list3.append("value", "c");
+      writer.write(g2);
+    }
+
+    try (ParquetFileReader reader = 
ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
+      ParquetMetadata footer = reader.getFooter();
+      ColumnChunkMetaData column = 
footer.getBlocks().get(0).getColumns().get(0);
+
+      SizeStatistics sizeStatistics = column.getSizeStatistics();
+      Assert.assertEquals(Optional.of(3L), 
sizeStatistics.getUnencodedByteArrayDataBytes());
+      Assert.assertEquals(Arrays.asList(2L, 1L), 
sizeStatistics.getRepetitionLevelHistogram());
+      Assert.assertEquals(Arrays.asList(0L, 0L, 0L, 3L), 
sizeStatistics.getDefinitionLevelHistogram());
+
+      ColumnIndex columnIndex = reader.readColumnIndex(column);
+      Assert.assertEquals(Arrays.asList(2L, 1L), 
columnIndex.getRepetitionLevelHistogram());
+      Assert.assertEquals(Arrays.asList(0L, 0L, 0L, 3L), 
columnIndex.getDefinitionLevelHistogram());
+
+      OffsetIndex offsetIndex = reader.readOffsetIndex(column);
+      Assert.assertEquals(1, offsetIndex.getPageCount());
+      Assert.assertEquals(Optional.of(3L), 
offsetIndex.getUnencodedByteArrayDataBytes(0));
+    }
+  }
+
   private Path newTempPath() throws IOException {
     File file = temp.newFile();
     Preconditions.checkArgument(file.delete(), "Could not remove temp file");

Reply via email to