This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new b9eaf05115 [core] Support IsNaN predicate pushdown with Parquet (#7875)
b9eaf05115 is described below
commit b9eaf051153357257e3634b50e4a7976bc023f5d
Author: Arnav Balyan <[email protected]>
AuthorDate: Mon May 18 11:42:19 2026 +0530
[core] Support IsNaN predicate pushdown with Parquet (#7875)
- Paimon has no IsNaN predicate today; add one, with both Paimon-side
evaluation and Parquet pushdown.
- Add IsNaN as a new LeafUnaryFunction, which can evaluate rows and
be pushed down to Parquet for double and float columns.
- ParquetFilters pushes IsNaN down to Parquet via new UserDefinedPredicate
implementations (IsNaNDoublePredicate and IsNaNFloatPredicate).
---
.../apache/paimon/predicate/FunctionVisitor.java | 4 ++
.../java/org/apache/paimon/predicate/IsNaN.java | 68 ++++++++++++++++++++++
.../apache/paimon/predicate/PredicateBuilder.java | 8 +++
.../org/apache/paimon/predicate/PredicateTest.java | 28 +++++++++
.../parquet/filter2/predicate/ParquetFilters.java | 57 ++++++++++++++++++
.../paimon/format/parquet/ParquetFiltersTest.java | 32 ++++++++++
6 files changed, 197 insertions(+)
diff --git
a/paimon-common/src/main/java/org/apache/paimon/predicate/FunctionVisitor.java
b/paimon-common/src/main/java/org/apache/paimon/predicate/FunctionVisitor.java
index 5aa4ca1373..f7040dae06 100644
---
a/paimon-common/src/main/java/org/apache/paimon/predicate/FunctionVisitor.java
+++
b/paimon-common/src/main/java/org/apache/paimon/predicate/FunctionVisitor.java
@@ -54,6 +54,10 @@ public interface FunctionVisitor<T> extends
PredicateVisitor<T> {
T visitIsNull(FieldRef fieldRef);
+ default T visitIsNaN(FieldRef fieldRef) {
+ throw new UnsupportedOperationException();
+ }
+
// ----------------- Binary functions ------------------------
T visitStartsWith(FieldRef fieldRef, Object literal);
diff --git a/paimon-common/src/main/java/org/apache/paimon/predicate/IsNaN.java
b/paimon-common/src/main/java/org/apache/paimon/predicate/IsNaN.java
new file mode 100644
index 0000000000..42d3a40832
--- /dev/null
+++ b/paimon-common/src/main/java/org/apache/paimon/predicate/IsNaN.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.predicate;
+
+import org.apache.paimon.types.DataType;
+
+import
org.apache.paimon.shade.jackson2.com.fasterxml.jackson.annotation.JsonCreator;
+
+import java.util.List;
+import java.util.Optional;
+
+/** A LeafUnaryFunction to evaluate field is nan for float and double columns.
*/
+public class IsNaN extends LeafUnaryFunction {
+
+ public static final String NAME = "IS_NAN";
+
+ public static final IsNaN INSTANCE = new IsNaN();
+
+ @JsonCreator
+ private IsNaN() {}
+
+ @Override
+ public boolean test(DataType type, Object field) {
+ if (field instanceof Float) {
+ return Float.isNaN((Float) field);
+ }
+ if (field instanceof Double) {
+ return Double.isNaN((Double) field);
+ }
+ return false;
+ }
+
+ @Override
+ public boolean test(DataType type, long rowCount, Object min, Object max,
Long nullCount) {
+ return true;
+ }
+
+ @Override
+ public Optional<LeafFunction> negate() {
+ return Optional.empty();
+ }
+
+ @Override
+ public <T> T visit(FunctionVisitor<T> visitor, FieldRef fieldRef,
List<Object> literals) {
+ return visitor.visitIsNaN(fieldRef);
+ }
+
+ @Override
+ public String toJson() {
+ return NAME;
+ }
+}
diff --git
a/paimon-common/src/main/java/org/apache/paimon/predicate/PredicateBuilder.java
b/paimon-common/src/main/java/org/apache/paimon/predicate/PredicateBuilder.java
index c4343475b4..05acce1729 100644
---
a/paimon-common/src/main/java/org/apache/paimon/predicate/PredicateBuilder.java
+++
b/paimon-common/src/main/java/org/apache/paimon/predicate/PredicateBuilder.java
@@ -138,6 +138,14 @@ public class PredicateBuilder {
return leaf(IsNotNull.INSTANCE, transform);
}
+ public Predicate isNaN(int idx) {
+ return leaf(IsNaN.INSTANCE, idx);
+ }
+
+ public Predicate isNaN(Transform transform) {
+ return leaf(IsNaN.INSTANCE, transform);
+ }
+
public Predicate startsWith(int idx, Object patternLiteral) {
return leaf(StartsWith.INSTANCE, idx, patternLiteral);
}
diff --git
a/paimon-common/src/test/java/org/apache/paimon/predicate/PredicateTest.java
b/paimon-common/src/test/java/org/apache/paimon/predicate/PredicateTest.java
index 5bece36654..0e67372d72 100644
--- a/paimon-common/src/test/java/org/apache/paimon/predicate/PredicateTest.java
+++ b/paimon-common/src/test/java/org/apache/paimon/predicate/PredicateTest.java
@@ -23,6 +23,8 @@ import org.apache.paimon.data.GenericRow;
import org.apache.paimon.format.SimpleColStats;
import org.apache.paimon.types.CharType;
import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.DoubleType;
+import org.apache.paimon.types.FloatType;
import org.apache.paimon.types.IntType;
import org.apache.paimon.types.RowType;
import org.apache.paimon.types.VarCharType;
@@ -295,6 +297,32 @@ public class PredicateTest {
assertThat(predicate.negate().orElse(null)).isEqualTo(builder.isNull(0));
}
+ @Test
+ public void testIsNaNDouble() {
+ PredicateBuilder builder = new PredicateBuilder(RowType.of(new
DoubleType()));
+ Predicate predicate = builder.isNaN(0);
+
+ assertThat(predicate.test(GenericRow.of(Double.NaN))).isEqualTo(true);
+ assertThat(predicate.test(GenericRow.of(1.5))).isEqualTo(false);
+
assertThat(predicate.test(GenericRow.of(Double.POSITIVE_INFINITY))).isEqualTo(false);
+ assertThat(predicate.test(GenericRow.of((Object)
null))).isEqualTo(false);
+
+ assertThat(test(predicate, 3, new SimpleColStats[] {new
SimpleColStats(0.0, 1.0, 0L)}))
+ .isEqualTo(true);
+
+ assertThat(predicate.negate()).isEmpty();
+ }
+
+ @Test
+ public void testIsNaNFloat() {
+ PredicateBuilder builder = new PredicateBuilder(RowType.of(new
FloatType()));
+ Predicate predicate = builder.isNaN(0);
+
+ assertThat(predicate.test(GenericRow.of(Float.NaN))).isEqualTo(true);
+ assertThat(predicate.test(GenericRow.of(1.5f))).isEqualTo(false);
+ assertThat(predicate.test(GenericRow.of((Object)
null))).isEqualTo(false);
+ }
+
@Test
public void testIn() {
PredicateBuilder builder = new PredicateBuilder(RowType.of(new
IntType()));
diff --git
a/paimon-format/src/main/java/org/apache/parquet/filter2/predicate/ParquetFilters.java
b/paimon-format/src/main/java/org/apache/parquet/filter2/predicate/ParquetFilters.java
index dacd12f492..29feeb1b5e 100644
---
a/paimon-format/src/main/java/org/apache/parquet/filter2/predicate/ParquetFilters.java
+++
b/paimon-format/src/main/java/org/apache/parquet/filter2/predicate/ParquetFilters.java
@@ -52,8 +52,11 @@ import org.apache.paimon.types.VariantType;
import org.apache.paimon.types.VectorType;
import org.apache.parquet.filter2.compat.FilterCompat;
+import org.apache.parquet.filter2.predicate.Operators.DoubleColumn;
+import org.apache.parquet.filter2.predicate.Operators.FloatColumn;
import org.apache.parquet.io.api.Binary;
+import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@@ -97,6 +100,18 @@ public class ParquetFilters {
return new Operators.Eq<>(toParquetColumn(fieldRef), null);
}
+ @Override
+ public FilterPredicate visitIsNaN(FieldRef fieldRef) {
+ Operators.Column<?> column = toParquetColumn(fieldRef);
+ if (column instanceof DoubleColumn) {
+ return FilterApi.userDefined((DoubleColumn) column, new
IsNaNDoublePredicate());
+ }
+ if (column instanceof FloatColumn) {
+ return FilterApi.userDefined((FloatColumn) column, new
IsNaNFloatPredicate());
+ }
+ throw new UnsupportedOperationException();
+ }
+
@Override
public FilterPredicate visitLessThan(FieldRef fieldRef, Object
literal) {
return new Operators.Lt(
@@ -441,4 +456,46 @@ public class ParquetFilters {
throw new UnsupportedOperationException();
}
}
+
+ /** user defined predicate that keeps double rows where the value is nan.
*/
+ public static class IsNaNDoublePredicate extends
UserDefinedPredicate<Double>
+ implements Serializable {
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public boolean keep(Double value) {
+ return value != null && Double.isNaN(value);
+ }
+
+ @Override
+ public boolean canDrop(Statistics<Double> statistics) {
+ return false;
+ }
+
+ @Override
+ public boolean inverseCanDrop(Statistics<Double> statistics) {
+ return false;
+ }
+ }
+
+ /** user defined predicate that keeps float rows where the value is nan. */
+ public static class IsNaNFloatPredicate extends UserDefinedPredicate<Float>
+ implements Serializable {
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public boolean keep(Float value) {
+ return value != null && Float.isNaN(value);
+ }
+
+ @Override
+ public boolean canDrop(Statistics<Float> statistics) {
+ return false;
+ }
+
+ @Override
+ public boolean inverseCanDrop(Statistics<Float> statistics) {
+ return false;
+ }
+ }
}
diff --git
a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFiltersTest.java
b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFiltersTest.java
index 18fe1ef28c..4fdd1e3927 100644
---
a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFiltersTest.java
+++
b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFiltersTest.java
@@ -198,6 +198,38 @@ class ParquetFiltersTest {
true);
}
+ @Test
+ public void testIsNaNDouble() {
+ PredicateBuilder builder =
+ new PredicateBuilder(
+ new RowType(
+ Collections.singletonList(
+ new DataField(0, "d1", new
DoubleType()))));
+
+ FilterCompat.Filter filter =
+
ParquetFilters.convert(Collections.singletonList(builder.isNaN(0)));
+ FilterPredicateCompat compat = (FilterPredicateCompat) filter;
+ assertThat(compat.getFilterPredicate().toString())
+ .contains(
+ "userdefinedbyinstance(d1,
org.apache.parquet.filter2.predicate.ParquetFilters$IsNaNDoublePredicate");
+ }
+
+ @Test
+ public void testIsNaNFloat() {
+ PredicateBuilder builder =
+ new PredicateBuilder(
+ new RowType(
+ Collections.singletonList(
+ new DataField(0, "f1", new
FloatType()))));
+
+ FilterCompat.Filter filter =
+
ParquetFilters.convert(Collections.singletonList(builder.isNaN(0)));
+ FilterPredicateCompat compat = (FilterPredicateCompat) filter;
+ assertThat(compat.getFilterPredicate().toString())
+ .contains(
+ "userdefinedbyinstance(f1,
org.apache.parquet.filter2.predicate.ParquetFilters$IsNaNFloatPredicate");
+ }
+
@Test
public void testInFilterFloat() {
PredicateBuilder builder =