This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
new ec537c7da GH-3123: Omit level histogram for some max levels (#3124)
ec537c7da is described below
commit ec537c7da416ce974bbf87a8ce545ba4de69773b
Author: Gang Wu <[email protected]>
AuthorDate: Tue Jan 21 12:51:18 2025 +0800
GH-3123: Omit level histogram for some max levels (#3124)
---
.../parquet/column/statistics/SizeStatistics.java | 51 ++++++++++++-----
.../column/statistics/TestSizeStatistics.java | 3 +-
.../statistics/TestSizeStatisticsRoundTrip.java | 66 ++++++++++++++++++++--
3 files changed, 102 insertions(+), 18 deletions(-)
diff --git
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
index 97a49be65..d267d3f6d 100644
---
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
+++
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java
@@ -67,8 +67,16 @@ public class SizeStatistics {
private Builder(PrimitiveType type, int maxRepetitionLevel, int
maxDefinitionLevel) {
this.type = type;
this.unencodedByteArrayDataBytes = 0L;
- repetitionLevelHistogram = new long[maxRepetitionLevel + 1];
- definitionLevelHistogram = new long[maxDefinitionLevel + 1];
+ if (maxRepetitionLevel > 0) {
+ repetitionLevelHistogram = new long[maxRepetitionLevel + 1];
+ } else {
+ repetitionLevelHistogram = new long[0]; // omitted
+ }
+ if (maxDefinitionLevel > 1) {
+ definitionLevelHistogram = new long[maxDefinitionLevel + 1];
+ } else {
+ definitionLevelHistogram = new long[0]; // omitted
+ }
}
/**
@@ -79,8 +87,12 @@ public class SizeStatistics {
* @param definitionLevel definition level of the value
*/
public void add(int repetitionLevel, int definitionLevel) {
- repetitionLevelHistogram[repetitionLevel]++;
- definitionLevelHistogram[definitionLevel]++;
+ if (repetitionLevelHistogram.length > 0) {
+ repetitionLevelHistogram[repetitionLevel]++;
+ }
+ if (definitionLevelHistogram.length > 0) {
+ definitionLevelHistogram[definitionLevel]++;
+ }
}
/**
@@ -160,16 +172,29 @@ public class SizeStatistics {
Preconditions.checkArgument(type.equals(other.type), "Cannot merge
SizeStatistics of different types");
unencodedByteArrayDataBytes = Math.addExact(unencodedByteArrayDataBytes,
other.unencodedByteArrayDataBytes);
- for (int i = 0; i < repetitionLevelHistogram.size(); i++) {
- repetitionLevelHistogram.set(
- i, Math.addExact(repetitionLevelHistogram.get(i),
other.repetitionLevelHistogram.get(i)));
+
+ if (other.repetitionLevelHistogram.isEmpty()) {
+ repetitionLevelHistogram.clear();
+ } else {
+ Preconditions.checkArgument(
+ repetitionLevelHistogram.size() ==
other.repetitionLevelHistogram.size(),
+ "Cannot merge SizeStatistics with different repetition level
histogram size");
+ for (int i = 0; i < repetitionLevelHistogram.size(); i++) {
+ repetitionLevelHistogram.set(
+ i, Math.addExact(repetitionLevelHistogram.get(i),
other.repetitionLevelHistogram.get(i)));
+ }
}
- for (int i = 0; i < definitionLevelHistogram.size(); i++) {
- definitionLevelHistogram.set(
- i,
- Math.addExact(
- definitionLevelHistogram.get(i),
- other.getDefinitionLevelHistogram().get(i)));
+
+ if (other.definitionLevelHistogram.isEmpty()) {
+ definitionLevelHistogram.clear();
+ } else {
+ Preconditions.checkArgument(
+ definitionLevelHistogram.size() ==
other.definitionLevelHistogram.size(),
+ "Cannot merge SizeStatistics with different definition level
histogram size");
+ for (int i = 0; i < definitionLevelHistogram.size(); i++) {
+ definitionLevelHistogram.set(
+ i, Math.addExact(definitionLevelHistogram.get(i),
other.definitionLevelHistogram.get(i)));
+ }
}
}
diff --git
a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
index 6c166b0e7..32bdf54f5 100644
---
a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
+++
b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
@@ -19,6 +19,7 @@
package org.apache.parquet.column.statistics;
import java.util.Arrays;
+import java.util.Collections;
import java.util.Optional;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.LogicalTypeAnnotation;
@@ -67,7 +68,7 @@ public class TestSizeStatistics {
SizeStatistics statistics = builder.build();
Assert.assertEquals(Optional.empty(),
statistics.getUnencodedByteArrayDataBytes());
Assert.assertEquals(Arrays.asList(2L, 4L),
statistics.getRepetitionLevelHistogram());
- Assert.assertEquals(Arrays.asList(3L, 3L),
statistics.getDefinitionLevelHistogram());
+ Assert.assertEquals(Collections.emptyList(),
statistics.getDefinitionLevelHistogram());
}
@Test
diff --git
a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestSizeStatisticsRoundTrip.java
b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestSizeStatisticsRoundTrip.java
index 59e4aff2d..026e13a3b 100644
---
a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestSizeStatisticsRoundTrip.java
+++
b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestSizeStatisticsRoundTrip.java
@@ -21,6 +21,7 @@ package org.apache.parquet.statistics;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
+import java.util.Collections;
import java.util.Optional;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
@@ -82,12 +83,12 @@ public class TestSizeStatisticsRoundTrip {
SizeStatistics sizeStatistics = column.getSizeStatistics();
Assert.assertEquals(Optional.of(4L),
sizeStatistics.getUnencodedByteArrayDataBytes());
- Assert.assertEquals(Arrays.asList(4L),
sizeStatistics.getRepetitionLevelHistogram());
- Assert.assertEquals(Arrays.asList(0L, 4L),
sizeStatistics.getDefinitionLevelHistogram());
+ Assert.assertEquals(Collections.emptyList(),
sizeStatistics.getRepetitionLevelHistogram());
+ Assert.assertEquals(Collections.emptyList(),
sizeStatistics.getDefinitionLevelHistogram());
ColumnIndex columnIndex = reader.readColumnIndex(column);
- Assert.assertEquals(Arrays.asList(2L, 2L),
columnIndex.getRepetitionLevelHistogram());
- Assert.assertEquals(Arrays.asList(0L, 2L, 0L, 2L),
columnIndex.getDefinitionLevelHistogram());
+ Assert.assertEquals(Collections.emptyList(),
columnIndex.getRepetitionLevelHistogram());
+ Assert.assertEquals(Collections.emptyList(),
columnIndex.getDefinitionLevelHistogram());
OffsetIndex offsetIndex = reader.readOffsetIndex(column);
Assert.assertEquals(2, offsetIndex.getPageCount());
@@ -96,6 +97,63 @@ public class TestSizeStatisticsRoundTrip {
}
}
+ @Test
+ public void testNestedRepeatedOptionalColumnSizeStatistics() throws
IOException {
+ MessageType schema = Types.buildMessage()
+ .optionalGroup()
+ .repeatedGroup()
+ .optional(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(LogicalTypeAnnotation.stringType())
+ .named("value")
+ .named("list")
+ .named("outer")
+ .named("msg");
+
+ Configuration conf = new Configuration();
+ GroupWriteSupport.setSchema(schema, conf);
+
+ GroupFactory factory = new SimpleGroupFactory(schema);
+ Path path = newTempPath();
+ try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+ .withPageRowCountLimit(2)
+ .withMinRowCountForPageSizeCheck(1)
+ .withConf(conf)
+ .build()) {
+ // Create groups with different nesting patterns
+ Group g1 = factory.newGroup();
+ Group outer1 = g1.addGroup("outer");
+ Group list1 = outer1.addGroup("list");
+ list1.append("value", "a");
+ Group list2 = outer1.addGroup("list");
+ list2.append("value", "b");
+ writer.write(g1);
+
+ Group g2 = factory.newGroup();
+ Group outer2 = g2.addGroup("outer");
+ Group list3 = outer2.addGroup("list");
+ list3.append("value", "c");
+ writer.write(g2);
+ }
+
+ try (ParquetFileReader reader =
ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
+ ParquetMetadata footer = reader.getFooter();
+ ColumnChunkMetaData column =
footer.getBlocks().get(0).getColumns().get(0);
+
+ SizeStatistics sizeStatistics = column.getSizeStatistics();
+ Assert.assertEquals(Optional.of(3L),
sizeStatistics.getUnencodedByteArrayDataBytes());
+ Assert.assertEquals(Arrays.asList(2L, 1L),
sizeStatistics.getRepetitionLevelHistogram());
+ Assert.assertEquals(Arrays.asList(0L, 0L, 0L, 3L),
sizeStatistics.getDefinitionLevelHistogram());
+
+ ColumnIndex columnIndex = reader.readColumnIndex(column);
+      Assert.assertEquals(Arrays.asList(2L, 1L),
columnIndex.getRepetitionLevelHistogram());
+      Assert.assertEquals(Arrays.asList(0L, 0L, 0L, 3L),
columnIndex.getDefinitionLevelHistogram());
+
+ OffsetIndex offsetIndex = reader.readOffsetIndex(column);
+ Assert.assertEquals(1, offsetIndex.getPageCount());
+ Assert.assertEquals(Optional.of(3L),
offsetIndex.getUnencodedByteArrayDataBytes(0));
+ }
+ }
+
private Path newTempPath() throws IOException {
File file = temp.newFile();
Preconditions.checkArgument(file.delete(), "Could not remove temp file");