[ 
https://issues.apache.org/jira/browse/PARQUET-1386?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16589852#comment-16589852
 ] 

ASF GitHub Bot commented on PARQUET-1386:
-----------------------------------------

gszadovszky closed pull request #515: PARQUET-1386: Fix issues of NaN and +-0.0 
in case of float/double column indexes
URL: https://github.com/apache/parquet-mr/pull/515
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git 
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
 
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
index 9633b6100..b28fddee4 100644
--- 
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
+++ 
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java
@@ -557,6 +557,10 @@ public ColumnIndex build() {
       return null;
     }
     ColumnIndexBase<?> columnIndex = createColumnIndex(type);
+    if (columnIndex == null) {
+      // Might happen if the specialized builder discovers invalid min/max 
values
+      return null;
+    }
     columnIndex.nullPages = nullPages.toBooleanArray();
     // Null counts is optional so keep it null if the builder has no values
     if (!nullCounts.isEmpty()) {
diff --git 
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
 
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
index 24be02c2e..074d02573 100644
--- 
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
+++ 
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/DoubleColumnIndexBuilder.java
@@ -84,6 +84,7 @@ int compareValueToMax(int arrayIndex) {
 
   private final DoubleList minValues = new DoubleArrayList();
   private final DoubleList maxValues = new DoubleArrayList();
+  private boolean invalid;
 
   private static double convert(ByteBuffer buffer) {
     return buffer.order(LITTLE_ENDIAN).getDouble(0);
@@ -101,12 +102,30 @@ void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) {
 
   @Override
   void addMinMax(Object min, Object max) {
-    minValues.add((double) min);
-    maxValues.add((double) max);
+    double dMin = (double) min;
+    double dMax = (double) max;
+    if (Double.isNaN(dMin) || Double.isNaN(dMax)) {
+      // Invalidate this column index in case of NaN as the sorting order of 
values is undefined for this case
+      invalid = true;
+    }
+
+    // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to 
ensure that no 0.0 values are skipped
+    if (Double.compare(dMin, +0.0) == 0) {
+      dMin = -0.0;
+    }
+    if (Double.compare(dMax, -0.0) == 0) {
+      dMax = +0.0;
+    }
+
+    minValues.add(dMin);
+    maxValues.add(dMax);
   }
 
   @Override
   ColumnIndexBase<Double> createColumnIndex(PrimitiveType type) {
+    if (invalid) {
+      return null;
+    }
     DoubleColumnIndex columnIndex = new DoubleColumnIndex(type);
     columnIndex.minValues = minValues.toDoubleArray();
     columnIndex.maxValues = maxValues.toDoubleArray();
diff --git 
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
 
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
index 4452746c4..cbcdf949d 100644
--- 
a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
+++ 
b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/FloatColumnIndexBuilder.java
@@ -84,6 +84,7 @@ int compareValueToMax(int arrayIndex) {
 
   private final FloatList minValues = new FloatArrayList();
   private final FloatList maxValues = new FloatArrayList();
+  private boolean invalid;
 
   private static float convert(ByteBuffer buffer) {
     return buffer.order(LITTLE_ENDIAN).getFloat(0);
@@ -101,12 +102,30 @@ void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) {
 
   @Override
   void addMinMax(Object min, Object max) {
-    minValues.add((float) min);
-    maxValues.add((float) max);
+    float fMin = (float) min;
+    float fMax = (float) max;
+    if (Float.isNaN(fMin) || Float.isNaN(fMax)) {
+      // Invalidate this column index in case of NaN as the sorting order of 
values is undefined for this case
+      invalid = true;
+    }
+
+    // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to 
ensure that no 0.0 values are skipped
+    if (Float.compare(fMin, +0.0f) == 0) {
+      fMin = -0.0f;
+    }
+    if (Float.compare(fMax, -0.0f) == 0) {
+      fMax = +0.0f;
+    }
+
+    minValues.add(fMin);
+    maxValues.add(fMax);
   }
 
   @Override
   ColumnIndexBase<Float> createColumnIndex(PrimitiveType type) {
+    if (invalid) {
+      return null;
+    }
     FloatColumnIndex columnIndex = new FloatColumnIndex(type);
     columnIndex.minValues = minValues.toFloatArray();
     columnIndex.maxValues = maxValues.toFloatArray();
diff --git 
a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
 
b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
index e4655b265..5a3947c98 100644
--- 
a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
+++ 
b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
@@ -48,6 +48,7 @@
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
 import java.math.BigDecimal;
@@ -810,6 +811,25 @@ public void testBuildDouble() {
     assertCorrectFiltering(columnIndex, invert(userDefined(col, 
DoubleIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8);
   }
 
+  @Test
+  public void testBuildDoubleZeroNaN() {
+    PrimitiveType type = Types.required(DOUBLE).named("test_double");
+    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, 
Integer.MAX_VALUE);
+    StatsBuilder sb = new StatsBuilder();
+    builder.add(sb.stats(type, -1.0, -0.0));
+    builder.add(sb.stats(type, 0.0, 1.0));
+    builder.add(sb.stats(type, 1.0, 100.0));
+    ColumnIndex columnIndex = builder.build();
+    assertCorrectValues(columnIndex.getMinValues(), -1.0, -0.0, 1.0);
+    assertCorrectValues(columnIndex.getMaxValues(), 0.0, 1.0, 100.0);
+
+    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
+    builder.add(sb.stats(type, -1.0, -0.0));
+    builder.add(sb.stats(type, 0.0, Double.NaN));
+    builder.add(sb.stats(type, 1.0, 100.0));
+    assertNull(builder.build());
+  }
+
   @Test
   public void testStaticBuildDouble() {
     ColumnIndex columnIndex = ColumnIndexBuilder.build(
@@ -921,6 +941,25 @@ public void testBuildFloat() {
     assertCorrectFiltering(columnIndex, invert(userDefined(col, 
FloatIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8);
   }
 
+  @Test
+  public void testBuildFloatZeroNaN() {
+    PrimitiveType type = Types.required(FLOAT).named("test_float");
+    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, 
Integer.MAX_VALUE);
+    StatsBuilder sb = new StatsBuilder();
+    builder.add(sb.stats(type, -1.0f, -0.0f));
+    builder.add(sb.stats(type, 0.0f, 1.0f));
+    builder.add(sb.stats(type, 1.0f, 100.0f));
+    ColumnIndex columnIndex = builder.build();
+    assertCorrectValues(columnIndex.getMinValues(), -1.0f, -0.0f, 1.0f);
+    assertCorrectValues(columnIndex.getMaxValues(), 0.0f, 1.0f, 100.0f);
+
+    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
+    builder.add(sb.stats(type, -1.0f, -0.0f));
+    builder.add(sb.stats(type, 0.0f, Float.NaN));
+    builder.add(sb.stats(type, 1.0f, 100.0f));
+    assertNull(builder.build());
+  }
+
   @Test
   public void testStaticBuildFloat() {
     ColumnIndex columnIndex = ColumnIndexBuilder.build(
@@ -1391,7 +1430,7 @@ private static void assertCorrectValues(List<ByteBuffer> 
values, Double... expec
         assertFalse("The byte buffer should be empty for null pages", 
value.hasRemaining());
       } else {
         assertEquals("The byte buffer should be 8 bytes long for double", 8, 
value.remaining());
-        assertEquals("Invalid value for page " + i, 
expectedValue.doubleValue(), value.getDouble(0), 0.0);
+        assertTrue("Invalid value for page " + i, 
Double.compare(expectedValue.doubleValue(), value.getDouble(0)) == 0);
       }
     }
   }
@@ -1405,7 +1444,7 @@ private static void assertCorrectValues(List<ByteBuffer> 
values, Float... expect
         assertFalse("The byte buffer should be empty for null pages", 
value.hasRemaining());
       } else {
         assertEquals("The byte buffer should be 4 bytes long for double", 4, 
value.remaining());
-        assertEquals("Invalid value for page " + i, 
expectedValue.floatValue(), value.getFloat(0), 0.0f);
+        assertTrue("Invalid value for page " + i, 
Float.compare(expectedValue.floatValue(), value.getFloat(0)) == 0);
       }
     }
   }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> Fix issues of NaN and +-0.0 in case of float/double column indexes
> ------------------------------------------------------------------
>
>                 Key: PARQUET-1386
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1386
>             Project: Parquet
>          Issue Type: Sub-task
>            Reporter: Gabor Szadovszky
>            Assignee: Gabor Szadovszky
>            Priority: Major
>              Labels: pull-request-available
>
> Workaround the float/double column indexes just like we did for statistics in 
> PARQUET-1246.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to