[ https://issues.apache.org/jira/browse/PARQUET-1246?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16398633#comment-16398633 ]
ASF GitHub Bot commented on PARQUET-1246: ----------------------------------------- zivanfi commented on a change in pull request #461: PARQUET-1246: Ignore float/double statistics in case of NaN URL: https://github.com/apache/parquet-mr/pull/461#discussion_r174466575 ########## File path: parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java ########## @@ -637,4 +648,106 @@ private void testMergingStringStats() { assertEquals(stats.getMax(), Binary.fromString("zzzz")); assertEquals(stats.getMin(), Binary.fromString("")); } + + @Test + public void testBuilder() { + testBuilder(Types.required(BOOLEAN).named("test_boolean"), false, new byte[] { 0 }, true, new byte[] { 1 }); + testBuilder(Types.required(INT32).named("test_int32"), -42, intToBytes(-42), 42, intToBytes(42)); + testBuilder(Types.required(INT64).named("test_int64"), -42l, longToBytes(-42), 42l, longToBytes(42)); + testBuilder(Types.required(FLOAT).named("test_float"), -42.0f, intToBytes(floatToIntBits(-42.0f)), 42.0f, + intToBytes(floatToIntBits(42.0f))); + testBuilder(Types.required(DOUBLE).named("test_double"), -42.0, longToBytes(doubleToLongBits(-42.0)), 42.0, + longToBytes(Double.doubleToLongBits(42.0f))); + + byte[] min = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }; + byte[] max = { 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }; + testBuilder(Types.required(INT96).named("test_int96"), Binary.fromConstantByteArray(min), min, + Binary.fromConstantByteArray(max), max); + testBuilder(Types.required(FIXED_LEN_BYTE_ARRAY).length(12).named("test_fixed"), Binary.fromConstantByteArray(min), + min, + Binary.fromConstantByteArray(max), max); + testBuilder(Types.required(BINARY).named("test_binary"), Binary.fromConstantByteArray(min), min, + Binary.fromConstantByteArray(max), max); + } + + private void testBuilder(PrimitiveType type, Object min, byte[] minBytes, Object max, byte[] maxBytes) { + Statistics.Builder builder = Statistics.getBuilderForReading(type); + Statistics<?> 
stats = builder.build(); + assertTrue(stats.isEmpty()); + assertFalse(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withNumNulls(0).withMin(minBytes).build(); + assertFalse(stats.isEmpty()); + assertTrue(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + assertEquals(0, stats.getNumNulls()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withNumNulls(11).withMax(maxBytes).build(); + assertFalse(stats.isEmpty()); + assertTrue(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + assertEquals(11, stats.getNumNulls()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withNumNulls(42).withMin(minBytes).withMax(maxBytes).build(); + assertFalse(stats.isEmpty()); + assertTrue(stats.isNumNullsSet()); + assertTrue(stats.hasNonNullValue()); + assertEquals(42, stats.getNumNulls()); + assertEquals(min, stats.genericGetMin()); + assertEquals(max, stats.genericGetMax()); + } + + @Test Review comment: Please add a test case for -0 and +0. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Ignore float/double statistics in case of NaN > --------------------------------------------- > > Key: PARQUET-1246 > URL: https://issues.apache.org/jira/browse/PARQUET-1246 > Project: Parquet > Issue Type: Bug > Affects Versions: 1.8.1 > Reporter: Gabor Szadovszky > Assignee: Gabor Szadovszky > Priority: Major > Fix For: 1.10.0 > > > The sorting order of floating point values is not properly specified, > therefore NaN values can cause valid values to be skipped when filtering. See > PARQUET-1222 for more info. 
> This issue is for ignoring float/double statistics if they contain NaN, to > prevent data loss on the read path when filtering. -- This message was sent by Atlassian JIRA (v7.6.3#76005)