[
https://issues.apache.org/jira/browse/PARQUET-1246?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16452129#comment-16452129
]
ASF GitHub Bot commented on PARQUET-1246:
-----------------------------------------
zivanfi closed pull request #468: PARQUET-1246: Ignore float/double statistics
in case of NaN
URL: https://github.com/apache/parquet-mr/pull/468
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
index 26c14c135..ab07a0fec 100644
---
a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
+++
b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
@@ -69,6 +69,72 @@ public Builder withNumNulls(long numNulls) {
}
}
+ // Builder for FLOAT type to handle special cases of min/max values like
NaN, -0.0, and 0.0
+ private static class FloatBuilder extends Builder {
+ public FloatBuilder(PrimitiveTypeName type) {
+ super(type);
+ assert type == PrimitiveTypeName.FLOAT;
+ }
+
+ @Override
+ public Statistics<?> build() {
+ FloatStatistics stats = (FloatStatistics) super.build();
+ if (stats.hasNonNullValue()) {
+ Float min = stats.genericGetMin();
+ Float max = stats.genericGetMax();
+ // Drop min/max values in case of NaN as the sorting order of values
is undefined for this case
+ if (min.isNaN() || max.isNaN()) {
+ stats.setMinMax(0.0f, 0.0f);
+ ((Statistics<?>) stats).hasNonNullValue = false;
+ } else {
+ // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values
would be skipped
+ if (Float.compare(min, 0.0f) == 0) {
+ min = -0.0f;
+ stats.setMinMax(min, max);
+ }
+ if (Float.compare(max, -0.0f) == 0) {
+ max = 0.0f;
+ stats.setMinMax(min, max);
+ }
+ }
+ }
+ return stats;
+ }
+ }
+
+ // Builder for DOUBLE type to handle special cases of min/max values like
NaN, -0.0, and 0.0
+ private static class DoubleBuilder extends Builder {
+ public DoubleBuilder(PrimitiveTypeName type) {
+ super(type);
+ assert type == PrimitiveTypeName.DOUBLE;
+ }
+
+ @Override
+ public Statistics<?> build() {
+ DoubleStatistics stats = (DoubleStatistics) super.build();
+ if (stats.hasNonNullValue()) {
+ Double min = stats.genericGetMin();
+ Double max = stats.genericGetMax();
+ // Drop min/max values in case of NaN as the sorting order of values
is undefined for this case
+ if (min.isNaN() || max.isNaN()) {
+ stats.setMinMax(0.0, 0.0);
+ ((Statistics<?>) stats).hasNonNullValue = false;
+ } else {
+ // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values
would be skipped
+ if (Double.compare(min, 0.0) == 0) {
+ min = -0.0;
+ stats.setMinMax(min, max);
+ }
+ if (Double.compare(max, -0.0) == 0) {
+ max = 0.0;
+ stats.setMinMax(min, max);
+ }
+ }
+ }
+ return stats;
+ }
+ }
+
private boolean hasNonNullValue;
private long num_nulls;
@@ -112,8 +178,15 @@ public static Statistics
getStatsBasedOnType(PrimitiveTypeName type) {
* type of the column
* @return builder to create new statistics object
*/
- public static Builder getBuilder(PrimitiveTypeName type) {
- return new Builder(type);
+ public static Builder getBuilderForReading(PrimitiveTypeName type) {
+ switch (type) {
+ case FLOAT:
+ return new FloatBuilder(type);
+ case DOUBLE:
+ return new DoubleBuilder(type);
+ default:
+ return new Builder(type);
+ }
}
/**
@@ -221,7 +294,7 @@ public void mergeStatistics(Statistics stats) {
* Abstract method to set min and max values from byte arrays.
* @param minBytes byte array to set the min value to
* @param maxBytes byte array to set the max value to
- * @deprecated will be removed in 2.0.0. Use {@link
#getBuilder(PrimitiveType)} instead.
+ * @deprecated will be removed in 2.0.0. Use {@link
#getBuilderForReading(PrimitiveType)} instead.
*/
@Deprecated
abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);
@@ -283,7 +356,7 @@ public long getNumNulls() {
*
* @param nulls
* null count to set the count to
- * @deprecated will be removed in 2.0.0. Use {@link
#getBuilder(PrimitiveType)} instead.
+ * @deprecated will be removed in 2.0.0. Use {@link
#getBuilderForReading(PrimitiveType)} instead.
*/
@Deprecated
public void setNumNulls(long nulls) {
diff --git
a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
index cf4bf59af..7d5278e20 100644
---
a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
+++
b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
@@ -18,13 +18,25 @@
*/
package org.apache.parquet.column.statistics;
+import static java.lang.Double.doubleToLongBits;
+import static java.lang.Float.floatToIntBits;
+import static org.apache.parquet.bytes.BytesUtils.intToBytes;
+import static org.apache.parquet.bytes.BytesUtils.longToBytes;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
+import static
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
+import static
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;
import static org.junit.Assert.*;
import java.nio.ByteBuffer;
import org.junit.Test;
-
import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
public class TestStatistics {
private int[] integerArray;
@@ -585,4 +597,142 @@ private void testMergingStringStats() {
assertEquals(stats.getMax(), Binary.fromString("zzzz"));
assertEquals(stats.getMin(), Binary.fromString(""));
}
+
+ @Test
+ public void testBuilder() {
+ testBuilder(BOOLEAN, false, new byte[] { 0 }, true, new byte[] { 1 });
+ testBuilder(INT32, -42, intToBytes(-42), 42, intToBytes(42));
+ testBuilder(INT64, -42l, longToBytes(-42), 42l, longToBytes(42));
+ testBuilder(FLOAT, -42.0f, intToBytes(floatToIntBits(-42.0f)), 42.0f,
+ intToBytes(floatToIntBits(42.0f)));
+ testBuilder(DOUBLE, -42.0, longToBytes(doubleToLongBits(-42.0)), 42.0,
+ longToBytes(Double.doubleToLongBits(42.0f)));
+
+ byte[] min = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
+ byte[] max = { 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 };
+ testBuilder(INT96, Binary.fromConstantByteArray(min), min,
+ Binary.fromConstantByteArray(max), max);
+ testBuilder(FIXED_LEN_BYTE_ARRAY, Binary.fromConstantByteArray(min),
+ min,
+ Binary.fromConstantByteArray(max), max);
+ testBuilder(BINARY, Binary.fromConstantByteArray(min), min,
+ Binary.fromConstantByteArray(max), max);
+ }
+
+ private void testBuilder(PrimitiveTypeName type, Object min, byte[]
minBytes, Object max, byte[] maxBytes) {
+ Statistics.Builder builder = Statistics.getBuilderForReading(type);
+ Statistics<?> stats = builder.build();
+ assertTrue(stats.isEmpty());
+ assertFalse(stats.isNumNullsSet());
+ assertFalse(stats.hasNonNullValue());
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withNumNulls(0).withMin(minBytes).build();
+ assertFalse(stats.isEmpty());
+ assertTrue(stats.isNumNullsSet());
+ assertFalse(stats.hasNonNullValue());
+ assertEquals(0, stats.getNumNulls());
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withNumNulls(11).withMax(maxBytes).build();
+ assertFalse(stats.isEmpty());
+ assertTrue(stats.isNumNullsSet());
+ assertFalse(stats.hasNonNullValue());
+ assertEquals(11, stats.getNumNulls());
+
+ builder = Statistics.getBuilderForReading(type);
+ stats =
builder.withNumNulls(42).withMin(minBytes).withMax(maxBytes).build();
+ assertFalse(stats.isEmpty());
+ assertTrue(stats.isNumNullsSet());
+ assertTrue(stats.hasNonNullValue());
+ assertEquals(42, stats.getNumNulls());
+ assertEquals(min, stats.genericGetMin());
+ assertEquals(max, stats.genericGetMax());
+ }
+
+ @Test
+ public void testSpecBuilderForFloat() {
+ PrimitiveTypeName type = FLOAT;
+ Statistics.Builder builder = Statistics.getBuilderForReading(type);
+ Statistics<?> stats =
builder.withMin(intToBytes(floatToIntBits(Float.NaN)))
+ .withMax(intToBytes(floatToIntBits(42.0f))).withNumNulls(0).build();
+ assertTrue(stats.isNumNullsSet());
+ assertEquals(0, stats.getNumNulls());
+ assertFalse(stats.hasNonNullValue());
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withMin(intToBytes(floatToIntBits(-42.0f)))
+
.withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(11).build();
+ assertTrue(stats.isNumNullsSet());
+ assertEquals(11, stats.getNumNulls());
+ assertFalse(stats.hasNonNullValue());
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withMin(intToBytes(floatToIntBits(Float.NaN)))
+
.withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(42).build();
+ assertTrue(stats.isNumNullsSet());
+ assertEquals(42, stats.getNumNulls());
+ assertFalse(stats.hasNonNullValue());
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withMin(intToBytes(floatToIntBits(0.0f)))
+ .withMax(intToBytes(floatToIntBits(42.0f))).build();
+ assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin()));
+ assertEquals(0, Float.compare(42.0f, (Float) stats.genericGetMax()));
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withMin(intToBytes(floatToIntBits(-42.0f)))
+ .withMax(intToBytes(floatToIntBits(-0.0f))).build();
+ assertEquals(0, Float.compare(-42.0f, (Float) stats.genericGetMin()));
+ assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax()));
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withMin(intToBytes(floatToIntBits(0.0f)))
+ .withMax(intToBytes(floatToIntBits(-0.0f))).build();
+ assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin()));
+ assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax()));
+ }
+
+ @Test
+ public void testSpecBuilderForDouble() {
+ PrimitiveTypeName type = DOUBLE;
+ Statistics.Builder builder = Statistics.getBuilderForReading(type);
+ Statistics<?> stats =
builder.withMin(longToBytes(doubleToLongBits(Double.NaN)))
+ .withMax(longToBytes(doubleToLongBits(42.0))).withNumNulls(0).build();
+ assertTrue(stats.isNumNullsSet());
+ assertEquals(0, stats.getNumNulls());
+ assertFalse(stats.hasNonNullValue());
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withMin(longToBytes(doubleToLongBits(-42.0)))
+
.withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(11).build();
+ assertTrue(stats.isNumNullsSet());
+ assertEquals(11, stats.getNumNulls());
+ assertFalse(stats.hasNonNullValue());
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withMin(longToBytes(doubleToLongBits(Double.NaN)))
+
.withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(42).build();
+ assertTrue(stats.isNumNullsSet());
+ assertEquals(42, stats.getNumNulls());
+ assertFalse(stats.hasNonNullValue());
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withMin(longToBytes(doubleToLongBits(0.0)))
+ .withMax(longToBytes(doubleToLongBits(42.0))).build();
+ assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin()));
+ assertEquals(0, Double.compare(42.0, (Double) stats.genericGetMax()));
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withMin(longToBytes(doubleToLongBits(-42.0)))
+ .withMax(longToBytes(doubleToLongBits(-0.0))).build();
+ assertEquals(0, Double.compare(-42.0, (Double) stats.genericGetMin()));
+ assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax()));
+
+ builder = Statistics.getBuilderForReading(type);
+ stats = builder.withMin(longToBytes(doubleToLongBits(0.0)))
+ .withMax(longToBytes(doubleToLongBits(-0.0))).build();
+ assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin()));
+ assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax()));
+ }
}
diff --git
a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 9df566046..9800dab31 100644
---
a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++
b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -338,7 +338,7 @@ public static Statistics toParquetStatistics(
(String createdBy, Statistics statistics, PrimitiveTypeName type,
SortOrder typeSortOrder) {
// create stats object based on the column type
org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
- org.apache.parquet.column.statistics.Statistics.getBuilder(type);
+
org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type);
// If there was no statistics written to the footer, create an empty
Statistics object and return
// NOTE: See docs in CorruptStatistics for explanation of why this check
is needed
diff --git
a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
index a0551a452..cfc429aab 100644
---
a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
+++
b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
@@ -90,10 +90,10 @@ private static ColumnChunkMetaData
getDoubleColumnMeta(org.apache.parquet.column
private static final IntStatistics intStats = new IntStatistics();
private static final IntStatistics nullIntStats = new IntStatistics();
private static final org.apache.parquet.column.statistics.Statistics<?>
emptyIntStats = org.apache.parquet.column.statistics.Statistics
- .getBuilder(PrimitiveTypeName.INT32).build();
+ .getBuilderForReading(PrimitiveTypeName.INT32).build();
private static final DoubleStatistics doubleStats = new DoubleStatistics();
private static final org.apache.parquet.column.statistics.Statistics<?>
missingMinMaxDoubleStats = org.apache.parquet.column.statistics.Statistics
- .getBuilder(PrimitiveTypeName.DOUBLE).withNumNulls(100).build();
+
.getBuilderForReading(PrimitiveTypeName.DOUBLE).withNumNulls(100).build();
static {
intStats.setMinMax(10, 100);
diff --git
a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
index a83247f03..80363848d 100644
---
a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
+++
b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
@@ -90,7 +90,7 @@ public void test() throws Exception {
int v = 3;
BytesInput definitionLevels = BytesInput.fromInt(d);
BytesInput repetitionLevels = BytesInput.fromInt(r);
- Statistics<?> statistics =
Statistics.getBuilder(PrimitiveTypeName.BINARY).build();
+ Statistics<?> statistics =
Statistics.getBuilderForReading(PrimitiveTypeName.BINARY).build();
BytesInput data = BytesInput.fromInt(v);
int rowCount = 5;
int nullCount = 1;
diff --git
a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
index 24307b6f3..0bbdf637e 100644
---
a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
+++
b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
@@ -94,7 +94,7 @@
private static final CompressionCodecName CODEC =
CompressionCodecName.UNCOMPRESSED;
private static final org.apache.parquet.column.statistics.Statistics<?>
EMPTY_STATS = org.apache.parquet.column.statistics.Statistics
- .getBuilder(PrimitiveTypeName.BINARY).build();
+ .getBuilderForReading(PrimitiveTypeName.BINARY).build();
private String writeSchema;
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> Ignore float/double statistics in case of NaN
> ---------------------------------------------
>
> Key: PARQUET-1246
> URL: https://issues.apache.org/jira/browse/PARQUET-1246
> Project: Parquet
> Issue Type: Bug
> Affects Versions: 1.8.1
> Reporter: Gabor Szadovszky
> Assignee: Gabor Szadovszky
> Priority: Major
> Fix For: 1.10.0
>
>
> The sorting order of the floating point values are not properly specified,
> therefore NaN values can cause skipping valid values when filtering. See
> PARQUET-1222 for more info.
> This issue is for ignoring statistics for float/double if it contains NaN to
> prevent data loss at the read path when filtering.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)