This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
new 6e2f7bb09 GH-3406: fix Float16 statistics handling for NaN and zero
values (#3407)
6e2f7bb09 is described below
commit 6e2f7bb098e37ccef3a0055b5d4daec97cba54a9
Author: Zehua Zou <[email protected]>
AuthorDate: Wed Mar 4 12:50:08 2026 +0800
GH-3406: fix Float16 statistics handling for NaN and zero values (#3407)
---
.../parquet/column/statistics/Statistics.java | 12 ++---
.../columnindex/BinaryColumnIndexBuilder.java | 40 +++++++++++++++-
.../java/org/apache/parquet/schema/Float16.java | 7 +++
.../statistics/TestFloat16ReadWriteRoundTrip.java | 54 +++++++++++++++-------
.../parquet/statistics/TestFloat16Statistics.java | 4 +-
5 files changed, 89 insertions(+), 28 deletions(-)
diff --git
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
index 206ddadad..bee987773 100644
---
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
+++
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
@@ -142,10 +142,6 @@ public abstract class Statistics<T extends Comparable<T>> {
// Builder for FLOAT16 type to handle special cases of min/max values like
NaN, -0.0, and 0.0
private static class Float16Builder extends Builder {
- private static final Binary POSITIVE_ZERO_LITTLE_ENDIAN =
Binary.fromConstantByteArray(new byte[] {0x00, 0x00});
- private static final Binary NEGATIVE_ZERO_LITTLE_ENDIAN =
- Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80});
-
public Float16Builder(PrimitiveType type) {
super(type);
assert type.getPrimitiveTypeName() ==
PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
@@ -162,15 +158,17 @@ public abstract class Statistics<T extends Comparable<T>>
{
short max = bMax.get2BytesLittleEndian();
// Drop min/max values in case of NaN as the sorting order of values
is undefined for this case
if (Float16.isNaN(min) || Float16.isNaN(max)) {
- stats.setMinMax(POSITIVE_ZERO_LITTLE_ENDIAN,
NEGATIVE_ZERO_LITTLE_ENDIAN);
+ stats.setMinMax(Float16.POSITIVE_ZERO_LITTLE_ENDIAN,
Float16.POSITIVE_ZERO_LITTLE_ENDIAN);
((Statistics<?>) stats).hasNonNullValue = false;
} else {
// Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values
would be skipped
if (min == (short) 0x0000) {
- stats.setMinMax(NEGATIVE_ZERO_LITTLE_ENDIAN, bMax);
+ bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN;
+ stats.setMinMax(bMin, bMax);
}
if (max == (short) 0x8000) {
- stats.setMinMax(bMin, POSITIVE_ZERO_LITTLE_ENDIAN);
+ bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN;
+ stats.setMinMax(bMin, bMax);
}
}
}
diff --git
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java
index 1c546b516..24de97d01 100644
---
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java
+++
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java
@@ -23,6 +23,8 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.parquet.filter2.predicate.Statistics;
import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.Float16;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.PrimitiveComparator;
import org.apache.parquet.schema.PrimitiveType;
@@ -82,6 +84,8 @@ class BinaryColumnIndexBuilder extends ColumnIndexBuilder {
private final List<Binary> maxValues = new ArrayList<>();
private final BinaryTruncator truncator;
private final int truncateLength;
+ private final boolean isFloat16;
+ private boolean invalid;
private static Binary convert(ByteBuffer buffer) {
return Binary.fromReusedByteBuffer(buffer);
@@ -94,6 +98,7 @@ class BinaryColumnIndexBuilder extends ColumnIndexBuilder {
BinaryColumnIndexBuilder(PrimitiveType type, int truncateLength) {
truncator = BinaryTruncator.getTruncator(type);
this.truncateLength = truncateLength;
+ this.isFloat16 = type.getLogicalTypeAnnotation() instanceof
LogicalTypeAnnotation.Float16LogicalTypeAnnotation;
}
@Override
@@ -104,12 +109,43 @@ class BinaryColumnIndexBuilder extends ColumnIndexBuilder
{
@Override
void addMinMax(Object min, Object max) {
- minValues.add(min == null ? null : truncator.truncateMin((Binary) min,
truncateLength));
- maxValues.add(max == null ? null : truncator.truncateMax((Binary) max,
truncateLength));
+ Binary bMin = (Binary) min;
+ Binary bMax = (Binary) max;
+
+ if (isFloat16 && bMin != null && bMax != null) {
+ if (bMin.length() !=
LogicalTypeAnnotation.Float16LogicalTypeAnnotation.BYTES
+ || bMax.length() !=
LogicalTypeAnnotation.Float16LogicalTypeAnnotation.BYTES) {
+ // Should not happen for Float16
+ invalid = true;
+ } else {
+ short sMin = bMin.get2BytesLittleEndian();
+ short sMax = bMax.get2BytesLittleEndian();
+
+ if (Float16.isNaN(sMin) || Float16.isNaN(sMax)) {
+ invalid = true;
+ }
+
+ // Sorting order is undefined for -0.0 so let min = -0.0 and max =
+0.0 to
+ // ensure that no 0.0 values are skipped
+ // +0.0 is 0x0000, -0.0 is 0x8000 (little endian: 00 00, 00 80)
+ if (sMin == (short) 0x0000) {
+ bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN;
+ }
+ if (sMax == (short) 0x8000) {
+ bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN;
+ }
+ }
+ }
+
+ minValues.add(bMin == null ? null : truncator.truncateMin(bMin,
truncateLength));
+ maxValues.add(bMax == null ? null : truncator.truncateMax(bMax,
truncateLength));
}
@Override
ColumnIndexBase<Binary> createColumnIndex(PrimitiveType type) {
+ if (invalid) {
+ return null;
+ }
BinaryColumnIndex columnIndex = new BinaryColumnIndex(type);
columnIndex.minValues = minValues.toArray(new Binary[0]);
columnIndex.maxValues = maxValues.toArray(new Binary[0]);
diff --git
a/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java
b/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java
index 6fe0e3d4c..fc146a889 100644
--- a/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java
+++ b/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java
@@ -46,6 +46,13 @@ import org.apache.parquet.io.api.Binary;
* Ref:
https://android.googlesource.com/platform/libcore/+/master/luni/src/main/java/libcore/util/FP16.java
*/
public class Float16 {
+ // Positive zero of type half-precision float.
+ public static final Binary POSITIVE_ZERO_LITTLE_ENDIAN =
+ Binary.fromConstantByteArray(new byte[] {0x00, 0x00}, 0, 2);
+ // Negative zero of type half-precision float.
+ public static final Binary NEGATIVE_ZERO_LITTLE_ENDIAN =
+ Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80}, 0, 2);
+
// Positive infinity of type half-precision float.
private static final short POSITIVE_INFINITY = (short) 0x7c00;
// A Not-a-Number representation of a half-precision float.
diff --git
a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java
b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java
index 8251ab212..b391823d4 100644
---
a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java
+++
b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java
@@ -21,6 +21,7 @@ package org.apache.parquet.statistics;
import static org.apache.parquet.schema.LogicalTypeAnnotation.float16Type;
import static
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
import java.io.File;
import java.io.IOException;
@@ -122,21 +123,11 @@ public class TestFloat16ReadWriteRoundTrip {
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})
}; // Infinity
- private Binary[] valuesAllPositiveZeroMinMax = {
- Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+ private Binary[] valuesAllZeroMinMax = {
+ Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00})
}; // +0
- private Binary[] valuesAllNegativeZeroMinMax = {
- Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
- Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80})
- }; // -0
-
- private Binary[] valuesWithNaNMinMax = {
- Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), //
-2.0
- Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7e})
- }; // NaN
-
@Test
public void testFloat16ColumnIndex() throws IOException {
List<Binary[]> testValues = List.of(
@@ -144,15 +135,13 @@ public class TestFloat16ReadWriteRoundTrip {
valuesInDescendingOrder,
valuesUndefinedOrder,
valuesAllPositiveZero,
- valuesAllNegativeZero,
- valuesWithNaN);
+ valuesAllNegativeZero);
List<Binary[]> expectedValues = List.of(
valuesInAscendingOrderMinMax,
valuesInDescendingOrderMinMax,
valuesUndefinedOrderMinMax,
- valuesAllPositiveZeroMinMax,
- valuesAllNegativeZeroMinMax,
- valuesWithNaNMinMax);
+ valuesAllZeroMinMax,
+ valuesAllZeroMinMax);
for (int i = 0; i < testValues.size(); i++) {
MessageType schema = Types.buildMessage()
@@ -187,6 +176,37 @@ public class TestFloat16ReadWriteRoundTrip {
}
}
+ @Test
+ public void testFloat16NanColumnIndex() throws IOException {
+ MessageType schema = Types.buildMessage()
+ .required(FIXED_LEN_BYTE_ARRAY)
+ .as(float16Type())
+ .length(2)
+ .named("col_float16")
+ .named("msg");
+
+ Configuration conf = new Configuration();
+ GroupWriteSupport.setSchema(schema, conf);
+ GroupFactory factory = new SimpleGroupFactory(schema);
+ Path path = newTempPath();
+ try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+ .withConf(conf)
+ .withDictionaryEncoding(false)
+ .build()) {
+
+ for (Binary value : valuesWithNaN) {
+ writer.write(factory.newGroup().append("col_float16", value));
+ }
+ }
+
+ try (ParquetFileReader reader =
ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
+ ColumnChunkMetaData column =
+ reader.getFooter().getBlocks().get(0).getColumns().get(0);
+ ColumnIndex index = reader.readColumnIndex(column);
+ assertNull(index);
+ }
+ }
+
private Path newTempPath() throws IOException {
File file = temp.newFile();
Preconditions.checkArgument(file.delete(), "Could not remove temp file");
diff --git
a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java
b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java
index 8617ea260..38d362d5f 100644
---
a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java
+++
b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java
@@ -135,8 +135,8 @@ public class TestFloat16Statistics {
// Float16Builder: Drop min/max values in case of NaN as the sorting order
of values is undefined
private Binary[] valuesWithNaNStatsMinMax = {
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
- Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80})
- }; // -0
+ Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00})
+ }; // +0
@Test
public void testFloat16StatisticsMultipleCases() throws IOException {