This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git


The following commit(s) were added to refs/heads/master by this push:
     new 6e2f7bb09 GH-3406: fix Float16 statistics handling for NaN and zero values (#3407)
6e2f7bb09 is described below

commit 6e2f7bb098e37ccef3a0055b5d4daec97cba54a9
Author: Zehua Zou <[email protected]>
AuthorDate: Wed Mar 4 12:50:08 2026 +0800

    GH-3406: fix Float16 statistics handling for NaN and zero values (#3407)
---
 .../parquet/column/statistics/Statistics.java      | 12 ++---
 .../columnindex/BinaryColumnIndexBuilder.java      | 40 +++++++++++++++-
 .../java/org/apache/parquet/schema/Float16.java    |  7 +++
 .../statistics/TestFloat16ReadWriteRoundTrip.java  | 54 +++++++++++++++-------
 .../parquet/statistics/TestFloat16Statistics.java  |  4 +-
 5 files changed, 89 insertions(+), 28 deletions(-)

diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
index 206ddadad..bee987773 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
@@ -142,10 +142,6 @@ public abstract class Statistics<T extends Comparable<T>> {
 
   // Builder for FLOAT16 type to handle special cases of min/max values like NaN, -0.0, and 0.0
   private static class Float16Builder extends Builder {
-    private static final Binary POSITIVE_ZERO_LITTLE_ENDIAN = Binary.fromConstantByteArray(new byte[] {0x00, 0x00});
-    private static final Binary NEGATIVE_ZERO_LITTLE_ENDIAN =
-        Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80});
-
     public Float16Builder(PrimitiveType type) {
       super(type);
       assert type.getPrimitiveTypeName() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
@@ -162,15 +158,17 @@ public abstract class Statistics<T extends Comparable<T>> {
         short max = bMax.get2BytesLittleEndian();
         // Drop min/max values in case of NaN as the sorting order of values is undefined for this case
         if (Float16.isNaN(min) || Float16.isNaN(max)) {
-          stats.setMinMax(POSITIVE_ZERO_LITTLE_ENDIAN, NEGATIVE_ZERO_LITTLE_ENDIAN);
+          stats.setMinMax(Float16.POSITIVE_ZERO_LITTLE_ENDIAN, Float16.POSITIVE_ZERO_LITTLE_ENDIAN);
           ((Statistics<?>) stats).hasNonNullValue = false;
         } else {
           // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
           if (min == (short) 0x0000) {
-            stats.setMinMax(NEGATIVE_ZERO_LITTLE_ENDIAN, bMax);
+            bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN;
+            stats.setMinMax(bMin, bMax);
           }
           if (max == (short) 0x8000) {
-            stats.setMinMax(bMin, POSITIVE_ZERO_LITTLE_ENDIAN);
+            bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN;
+            stats.setMinMax(bMin, bMax);
           }
         }
       }
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java
index 1c546b516..24de97d01 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java
@@ -23,6 +23,8 @@ import java.util.ArrayList;
 import java.util.List;
 import org.apache.parquet.filter2.predicate.Statistics;
 import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.Float16;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
 import org.apache.parquet.schema.PrimitiveComparator;
 import org.apache.parquet.schema.PrimitiveType;
 
@@ -82,6 +84,8 @@ class BinaryColumnIndexBuilder extends ColumnIndexBuilder {
   private final List<Binary> maxValues = new ArrayList<>();
   private final BinaryTruncator truncator;
   private final int truncateLength;
+  private final boolean isFloat16;
+  private boolean invalid;
 
   private static Binary convert(ByteBuffer buffer) {
     return Binary.fromReusedByteBuffer(buffer);
@@ -94,6 +98,7 @@ class BinaryColumnIndexBuilder extends ColumnIndexBuilder {
   BinaryColumnIndexBuilder(PrimitiveType type, int truncateLength) {
     truncator = BinaryTruncator.getTruncator(type);
     this.truncateLength = truncateLength;
+    this.isFloat16 = type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation;
   }
 
   @Override
@@ -104,12 +109,43 @@ class BinaryColumnIndexBuilder extends ColumnIndexBuilder {
 
   @Override
   void addMinMax(Object min, Object max) {
-    minValues.add(min == null ? null : truncator.truncateMin((Binary) min, truncateLength));
-    maxValues.add(max == null ? null : truncator.truncateMax((Binary) max, truncateLength));
+    Binary bMin = (Binary) min;
+    Binary bMax = (Binary) max;
+
+    if (isFloat16 && bMin != null && bMax != null) {
+      if (bMin.length() != LogicalTypeAnnotation.Float16LogicalTypeAnnotation.BYTES
+          || bMax.length() != LogicalTypeAnnotation.Float16LogicalTypeAnnotation.BYTES) {
+        // Should not happen for Float16
+        invalid = true;
+      } else {
+        short sMin = bMin.get2BytesLittleEndian();
+        short sMax = bMax.get2BytesLittleEndian();
+
+        if (Float16.isNaN(sMin) || Float16.isNaN(sMax)) {
+          invalid = true;
+        }
+
+        // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to
+        // ensure that no 0.0 values are skipped
+        // +0.0 is 0x0000, -0.0 is 0x8000 (little endian: 00 00, 00 80)
+        if (sMin == (short) 0x0000) {
+          bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN;
+        }
+        if (sMax == (short) 0x8000) {
+          bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN;
+        }
+      }
+    }
+
+    minValues.add(bMin == null ? null : truncator.truncateMin(bMin, truncateLength));
+    maxValues.add(bMax == null ? null : truncator.truncateMax(bMax, truncateLength));
   }
 
   @Override
   ColumnIndexBase<Binary> createColumnIndex(PrimitiveType type) {
+    if (invalid) {
+      return null;
+    }
     BinaryColumnIndex columnIndex = new BinaryColumnIndex(type);
     columnIndex.minValues = minValues.toArray(new Binary[0]);
     columnIndex.maxValues = maxValues.toArray(new Binary[0]);
diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java b/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java
index 6fe0e3d4c..fc146a889 100644
--- a/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java
+++ b/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java
@@ -46,6 +46,13 @@ import org.apache.parquet.io.api.Binary;
  * Ref: https://android.googlesource.com/platform/libcore/+/master/luni/src/main/java/libcore/util/FP16.java
  */
 public class Float16 {
+  // Positive zero of type half-precision float.
+  public static final Binary POSITIVE_ZERO_LITTLE_ENDIAN =
+      Binary.fromConstantByteArray(new byte[] {0x00, 0x00}, 0, 2);
+  // Negative zero of type half-precision float.
+  public static final Binary NEGATIVE_ZERO_LITTLE_ENDIAN =
+      Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80}, 0, 2);
+
   // Positive infinity of type half-precision float.
   private static final short POSITIVE_INFINITY = (short) 0x7c00;
   // A Not-a-Number representation of a half-precision float.
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java
index 8251ab212..b391823d4 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16ReadWriteRoundTrip.java
@@ -21,6 +21,7 @@ package org.apache.parquet.statistics;
 import static org.apache.parquet.schema.LogicalTypeAnnotation.float16Type;
 import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
 
 import java.io.File;
 import java.io.IOException;
@@ -122,21 +123,11 @@ public class TestFloat16ReadWriteRoundTrip {
     Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})
   }; // Infinity
 
-  private Binary[] valuesAllPositiveZeroMinMax = {
-    Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
+  private Binary[] valuesAllZeroMinMax = {
+    Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
     Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00})
   }; // +0
 
-  private Binary[] valuesAllNegativeZeroMinMax = {
-    Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
-    Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80})
-  }; // -0
-
-  private Binary[] valuesWithNaNMinMax = {
-    Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), // -2.0
-    Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7e})
-  }; // NaN
-
   @Test
   public void testFloat16ColumnIndex() throws IOException {
     List<Binary[]> testValues = List.of(
@@ -144,15 +135,13 @@ public class TestFloat16ReadWriteRoundTrip {
         valuesInDescendingOrder,
         valuesUndefinedOrder,
         valuesAllPositiveZero,
-        valuesAllNegativeZero,
-        valuesWithNaN);
+        valuesAllNegativeZero);
     List<Binary[]> expectedValues = List.of(
         valuesInAscendingOrderMinMax,
         valuesInDescendingOrderMinMax,
         valuesUndefinedOrderMinMax,
-        valuesAllPositiveZeroMinMax,
-        valuesAllNegativeZeroMinMax,
-        valuesWithNaNMinMax);
+        valuesAllZeroMinMax,
+        valuesAllZeroMinMax);
 
     for (int i = 0; i < testValues.size(); i++) {
       MessageType schema = Types.buildMessage()
@@ -187,6 +176,37 @@ public class TestFloat16ReadWriteRoundTrip {
     }
   }
 
+  @Test
+  public void testFloat16NanColumnIndex() throws IOException {
+    MessageType schema = Types.buildMessage()
+        .required(FIXED_LEN_BYTE_ARRAY)
+        .as(float16Type())
+        .length(2)
+        .named("col_float16")
+        .named("msg");
+
+    Configuration conf = new Configuration();
+    GroupWriteSupport.setSchema(schema, conf);
+    GroupFactory factory = new SimpleGroupFactory(schema);
+    Path path = newTempPath();
+    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
+        .withConf(conf)
+        .withDictionaryEncoding(false)
+        .build()) {
+
+      for (Binary value : valuesWithNaN) {
+        writer.write(factory.newGroup().append("col_float16", value));
+      }
+    }
+
+    try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
+      ColumnChunkMetaData column =
+          reader.getFooter().getBlocks().get(0).getColumns().get(0);
+      ColumnIndex index = reader.readColumnIndex(column);
+      assertNull(index);
+    }
+  }
+
   private Path newTempPath() throws IOException {
     File file = temp.newFile();
     Preconditions.checkArgument(file.delete(), "Could not remove temp file");
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java
index 8617ea260..38d362d5f 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestFloat16Statistics.java
@@ -135,8 +135,8 @@ public class TestFloat16Statistics {
   // Float16Builder: Drop min/max values in case of NaN as the sorting order of values is undefined
   private Binary[] valuesWithNaNStatsMinMax = {
     Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
-    Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80})
-  }; // -0
+    Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00})
+  }; // +0
 
   @Test
   public void testFloat16StatisticsMultipleCases() throws IOException {

Reply via email to