This is an automated email from the ASF dual-hosted git repository.
wgtmac pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
new 63aebcc0c GH-3574: Statistics.toParquetStatistics always set
null_count (#3575)
63aebcc0c is described below
commit 63aebcc0cf3684ba6c9dc2b7243c6007d3a1269f
Author: Mahdi Dibaiee <[email protected]>
AuthorDate: Tue Jun 16 15:42:11 2026 +0100
GH-3574: Statistics.toParquetStatistics always set null_count (#3575)
---
.../apache/parquet/format/converter/ParquetMetadataConverter.java | 4 +++-
.../parquet/format/converter/TestParquetMetadataConverter.java | 7 ++++---
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 3597898c3..89e39c3b6 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -800,12 +800,14 @@ public class ParquetMetadataConverter {
public static Statistics toParquetStatistics(
org.apache.parquet.column.statistics.Statistics stats, int
truncateLength) {
Statistics formatStats = new Statistics();
+ if (!stats.isEmpty()) {
+ formatStats.setNull_count(stats.getNumNulls());
+ }
// Don't write stats larger than the max size rather than truncating. The
// rationale is that some engines may use the minimum value in the page as
// the true minimum for aggregations and there is no way to mark that a
// value has been truncated and is a lower bound and not in the page.
if (!stats.isEmpty() && withinLimit(stats, truncateLength)) {
- formatStats.setNull_count(stats.getNumNulls());
if (stats.hasNonNullValue()) {
byte[] min;
byte[] max;
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
index e4851d8ec..7972a75ba 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
@@ -807,7 +807,7 @@ public class TestParquetMetadataConverter {
}
Assert.assertEquals("Num nulls should match", 3004,
formatStats.getNull_count());
- // convert to empty stats because the values are too large
+ // min/max are not written because the values are too large, but null
count is always written
stats.setMinMaxFromBytes(max, max);
formatStats = helper.toParquetStatistics(stats);
@@ -816,7 +816,7 @@ public class TestParquetMetadataConverter {
Assert.assertFalse("Max should not be set", formatStats.isSetMax());
Assert.assertFalse("Min_value should not be set",
formatStats.isSetMin_value());
Assert.assertFalse("Max_value should not be set",
formatStats.isSetMax_value());
- Assert.assertFalse("Num nulls should not be set",
formatStats.isSetNull_count());
+ Assert.assertEquals("Num nulls should match", 3004,
formatStats.getNull_count());
Statistics roundTripStats =
ParquetMetadataConverter.fromParquetStatisticsInternal(
Version.FULL_VERSION,
@@ -824,7 +824,8 @@ public class TestParquetMetadataConverter {
new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, ""),
ParquetMetadataConverter.SortOrder.SIGNED);
- Assert.assertTrue(roundTripStats.isEmpty());
+ Assert.assertFalse("Round-trip stats should not be empty (null count is
set)", roundTripStats.isEmpty());
+ Assert.assertEquals("Round-trip null count should match", 3004,
roundTripStats.getNumNulls());
}
@Test