This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-1.6 in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.6 by this push: new 1e9dfdd ORC-629: PPD: Floating point NaN is not transitive across comparisons (Panagiotis Garefalakis, via Gopal V) 1e9dfdd is described below commit 1e9dfdd4684d0dced4d2bc61fdce15a28289ed61 Author: Panos Garefalakis <pga...@cloudera.com> AuthorDate: Thu May 14 10:40:23 2020 -0700 ORC-629: PPD: Floating point NaN is not transitive across comparisons (Panagiotis Garefalakis, via Gopal V) Signed-off-by: Gopal V <gop...@apache.org> (cherry picked from commit 5dd3b29a2f4a97aa716b301c657c1853cc27e6c2) Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- .../java/org/apache/orc/impl/RecordReaderImpl.java | 8 +++ .../src/test/org/apache/orc/TestVectorOrcFile.java | 57 ++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java index f5a65ba..5851c75 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java @@ -495,6 +495,14 @@ public class RecordReaderImpl implements RecordReader { " include ORC-517. Writer version: {}", predicate.getColumnName(), writerVersion); return TruthValue.YES_NO_NULL; + } else if (category == TypeDescription.Category.DOUBLE) { + DoubleColumnStatistics dstas = (DoubleColumnStatistics) cs; + if (!Double.isFinite(dstas.getMinimum()) || !Double.isFinite(dstas.getMaximum()) + || !Double.isFinite(dstas.getSum())) { + LOG.debug("Not using predication pushdown on {} because stats contain NaN values", + predicate.getColumnName()); + return dstas.hasNull() ? TruthValue.YES_NO_NULL : TruthValue.YES_NO; + } } return evaluatePredicateRange(predicate, range, BloomFilterIO.deserialize(kind, encoding, writerVersion, type.getCategory(), diff --git a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java index df7a621..bc2ff94 100644 --- a/java/core/src/test/org/apache/orc/TestVectorOrcFile.java +++ b/java/core/src/test/org/apache/orc/TestVectorOrcFile.java @@ -4005,6 +4005,63 @@ public class TestVectorOrcFile { } @Test + public void testPredicatePushdownWithNan() throws Exception { + TypeDescription schema = TypeDescription.createStruct() + .addField("double1", TypeDescription.createDouble()); + + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .setSchema(schema) + .stripeSize(400000L) + .compress(CompressionKind.NONE) + .bufferSize(500) + .rowIndexStride(1000) + .version(fileFormat)); + VectorizedRowBatch batch = schema.createRowBatch(); + batch.ensureSize(3500); + batch.size = 3500; + batch.cols[0].noNulls = true; + + DoubleColumnVector dbcol = ((DoubleColumnVector) batch.cols[0]); + + // first row NaN (resulting to both min and max columnStats of stride to be NaN) + // NaN in the middle of a stride causes Sum of last stride to be NaN + dbcol.vector[0] = Double.NaN; + for (int i=1; i < 3500; ++i) { + dbcol.vector[i] = i == 3200 ? Double.NaN : i; + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); + assertEquals(3500, reader.getNumberOfRows()); + + // Only the first stride matches the predicate, just need to make sure NaN stats are ignored + SearchArgument sarg = SearchArgumentFactory.newBuilder() + .startAnd() + .lessThan("double1", PredicateLeaf.Type.FLOAT, 100d) + .end() + .build(); + + RecordReader rows = reader.rows(reader.options() + .range(0L, Long.MAX_VALUE) + .searchArgument(sarg, new String[]{"double1"})); + batch = reader.getSchema().createRowBatch(3500); + + rows.nextBatch(batch); + // First stride should be read as NaN min/max are ignored + assertEquals(1000, batch.size); + + rows.nextBatch(batch); + // Last stride should be read as NaN sum is ignored + assertEquals(500, batch.size); + + rows.nextBatch(batch); + assertEquals(0, batch.size); + } + + @Test public void testColumnEncryption() throws Exception { Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11); final int ROWS = 1000;