This is an automated email from the ASF dual-hosted git repository.

zuston pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/auron.git


The following commit(s) were added to refs/heads/master by this push:
     new 027cffa5 [AURON #1646] `isNan` semantics are aligned with Spark (#1647)
027cffa5 is described below

commit 027cffa52b7a2ce44dd4fbafb360448a7733a1df
Author: Shreyesh <[email protected]>
AuthorDate: Thu Dec 4 19:43:48 2025 -0800

    [AURON #1646] `isNan` semantics are aligned with Spark (#1647)
    
    # Which issue does this PR close?
    
    Closes #1646
    
    # Rationale for this change
    
    `isnan` currently uses `math::isnan`, which propagates nulls and can
    mismatch Spark semantics. In Spark, `isnan(null)` must return false, not
    null. Additionally, computed NaN values such as `log(-3)` may not be
    handled consistently during a Parquet round-trip.
    
    # What changes are included in this PR?
    
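    Adds a Spark-compatible native `spark_isnan` function, registered as the
    `Spark_IsNaN` extension function, together with a `nulls_to_false` helper
    for boolean arrays. Spark's `IsNaN` expression is now converted to
    `Spark_IsNaN` instead of the built-in `IsNaN` scalar function.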
    
    # Are there any user-facing changes?
    
    # How was this patch tested?
    
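    - Rust unit tests for `nulls_to_false` and `spark_isnan`
    - Re-enabled the previously ignored `IsNaN` test in `AuronFunctionSuite`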
    
    
    ---------
    
    Co-authored-by: sarangat_LinkedIn <[email protected]>
---
 .../datafusion-ext-commons/src/arrow/boolean.rs    |  59 ++++++++
 .../datafusion-ext-commons/src/arrow/mod.rs        |   1 +
 native-engine/datafusion-ext-functions/src/lib.rs  |   4 +-
 .../datafusion-ext-functions/src/spark_isnan.rs    | 150 +++++++++++++++++++++
 .../org.apache.auron/AuronFunctionSuite.scala      |  42 +++---
 .../apache/spark/sql/auron/NativeConverters.scala  |   2 +-
 6 files changed, 237 insertions(+), 21 deletions(-)

diff --git a/native-engine/datafusion-ext-commons/src/arrow/boolean.rs b/native-engine/datafusion-ext-commons/src/arrow/boolean.rs
new file mode 100644
index 00000000..d1407016
--- /dev/null
+++ b/native-engine/datafusion-ext-commons/src/arrow/boolean.rs
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use arrow::array::{Array, BooleanArray};
+
+/// Returns a BooleanArray where nulls are converted to `false` and the result
+/// has no null bitmap (all values are valid).
+#[inline]
+pub fn nulls_to_false(is_boolean: &BooleanArray) -> BooleanArray {
+    match is_boolean.nulls() {
+        Some(nulls) => {
+            let is_not_null = nulls.inner();
+            BooleanArray::new(is_boolean.values() & is_not_null, None)
+        }
+        None => is_boolean.clone(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow::array::{Array, BooleanArray};
+
+    use super::nulls_to_false;
+
+    #[test]
+    fn converts_nulls_to_false() {
+        let input = BooleanArray::from(vec![Some(true), None, Some(false)]);
+        let output = nulls_to_false(&input);
+
+        assert!(output.nulls().is_none());
+
+        let got: Vec<Option<bool>> = output.iter().collect();
+        let expected = vec![Some(true), Some(false), Some(false)];
+        assert_eq!(got, expected);
+    }
+
+    #[test]
+    fn preserves_when_no_nulls() {
+        let input = BooleanArray::from(vec![Some(false), Some(true)]);
+        let output = nulls_to_false(&input);
+
+        assert!(output.nulls().is_none());
+        let got: Vec<Option<bool>> = output.iter().collect();
+        let expected = vec![Some(false), Some(true)];
+        assert_eq!(got, expected);
+    }
+}
diff --git a/native-engine/datafusion-ext-commons/src/arrow/mod.rs b/native-engine/datafusion-ext-commons/src/arrow/mod.rs
index afa9df90..b4e8c180 100644
--- a/native-engine/datafusion-ext-commons/src/arrow/mod.rs
+++ b/native-engine/datafusion-ext-commons/src/arrow/mod.rs
@@ -14,6 +14,7 @@
 // limitations under the License.
 
 pub mod array_size;
+pub mod boolean;
 pub mod cast;
 pub mod coalesce;
 pub mod eq_comparator;
diff --git a/native-engine/datafusion-ext-functions/src/lib.rs b/native-engine/datafusion-ext-functions/src/lib.rs
index b99e406d..cad5198d 100644
--- a/native-engine/datafusion-ext-functions/src/lib.rs
+++ b/native-engine/datafusion-ext-functions/src/lib.rs
@@ -24,6 +24,7 @@ mod spark_crypto;
 mod spark_dates;
 pub mod spark_get_json_object;
 mod spark_hash;
+mod spark_isnan;
 mod spark_make_array;
 mod spark_make_decimal;
 mod spark_normalize_nan_and_zero;
@@ -75,6 +76,7 @@ pub fn create_auron_ext_function(name: &str) -> 
Result<ScalarFunctionImplementat
         "Spark_NormalizeNanAndZero" => {
             
Arc::new(spark_normalize_nan_and_zero::spark_normalize_nan_and_zero)
         }
-        _ => df_unimplemented_err!("auron ext function not implemented: 
{name}")?,
+        "Spark_IsNaN" => Arc::new(spark_isnan::spark_isnan),
+        _ => df_unimplemented_err!("spark ext function not implemented: 
{name}")?,
     })
 }
diff --git a/native-engine/datafusion-ext-functions/src/spark_isnan.rs b/native-engine/datafusion-ext-functions/src/spark_isnan.rs
new file mode 100644
index 00000000..3de97b5c
--- /dev/null
+++ b/native-engine/datafusion-ext-functions/src/spark_isnan.rs
@@ -0,0 +1,150 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use arrow::{
+    array::{Array, BooleanArray, Float32Array, Float64Array},
+    datatypes::DataType,
+};
+use datafusion::{
+    common::{Result, ScalarValue},
+    logical_expr::ColumnarValue,
+};
+use datafusion_ext_commons::arrow::boolean::nulls_to_false;
+
+pub fn spark_isnan(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    let value = &args[0];
+    match value {
+        ColumnarValue::Array(array) => match array.data_type() {
+            DataType::Float64 => {
+                let array = 
array.as_any().downcast_ref::<Float64Array>().unwrap();
+                let is_nan = BooleanArray::from_unary(array, |x| x.is_nan());
+                let cleaned = nulls_to_false(&is_nan);
+                Ok(ColumnarValue::Array(Arc::new(cleaned)))
+            }
+            DataType::Float32 => {
+                let array = 
array.as_any().downcast_ref::<Float32Array>().unwrap();
+                let is_nan = BooleanArray::from_unary(array, |x| x.is_nan());
+                let cleaned = nulls_to_false(&is_nan);
+                Ok(ColumnarValue::Array(Arc::new(cleaned)))
+            }
+            _other => {
+                // For non-float arrays, Spark's isnan is effectively false.
+                let len = array.len();
+                let out = 
ScalarValue::Boolean(Some(false)).to_array_of_size(len)?;
+                Ok(ColumnarValue::Array(out))
+            }
+        },
+        ColumnarValue::Scalar(sv) => 
Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(
+            match sv {
+                ScalarValue::Float64(a) => a.map(|x| 
x.is_nan()).unwrap_or(false),
+                ScalarValue::Float32(a) => a.map(|x| 
x.is_nan()).unwrap_or(false),
+                _ => false,
+            },
+        )))),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::{error::Error, sync::Arc};
+
+    use arrow::array::{ArrayRef, BooleanArray, Float32Array, Float64Array};
+    use datafusion::{common::ScalarValue, logical_expr::ColumnarValue};
+
+    use crate::spark_isnan::spark_isnan;
+
+    #[test]
+    fn test_isnan_array_f64() -> Result<(), Box<dyn Error>> {
+        let input_data = vec![
+            Some(12345678.0),
+            Some(f64::NAN),
+            Some(-0.0),
+            None,
+            Some(f64::INFINITY),
+            Some(f64::NEG_INFINITY),
+        ];
+        let input_columnar_value = 
ColumnarValue::Array(Arc::new(Float64Array::from(input_data)));
+
+        let result = spark_isnan(&vec![input_columnar_value])?.into_array(6)?;
+
+        let expected_data = vec![
+            Some(false),
+            Some(true),
+            Some(false),
+            Some(false), // null returns false in Spark
+            Some(false),
+            Some(false),
+        ];
+        let expected: ArrayRef = Arc::new(BooleanArray::from(expected_data));
+        assert_eq!(&result, &expected);
+        Ok(())
+    }
+
+    #[test]
+    fn test_isnan_array_f32() -> Result<(), Box<dyn Error>> {
+        let input_data = vec![
+            Some(12345678.0f32),
+            Some(f32::NAN),
+            Some(-0.0f32),
+            None,
+            Some(f32::INFINITY),
+            Some(f32::NEG_INFINITY),
+        ];
+        let input_columnar_value = 
ColumnarValue::Array(Arc::new(Float32Array::from(input_data)));
+
+        let result = spark_isnan(&vec![input_columnar_value])?.into_array(6)?;
+
+        let expected_data = vec![
+            Some(false),
+            Some(true),
+            Some(false),
+            Some(false), // null returns false in Spark
+            Some(false),
+            Some(false),
+        ];
+        let expected: ArrayRef = Arc::new(BooleanArray::from(expected_data));
+        assert_eq!(&result, &expected);
+        Ok(())
+    }
+
+    #[test]
+    fn test_isnan_scalar_f64_nan() -> Result<(), Box<dyn Error>> {
+        let input_columnar_value = 
ColumnarValue::Scalar(ScalarValue::Float64(Some(f64::NAN)));
+        let result = spark_isnan(&vec![input_columnar_value])?.into_array(1)?;
+        let expected: ArrayRef = 
Arc::new(BooleanArray::from(vec![Some(true)]));
+        assert_eq!(&result, &expected);
+        Ok(())
+    }
+
+    #[test]
+    fn test_isnan_scalar_f64_null() -> Result<(), Box<dyn Error>> {
+        let input_columnar_value = 
ColumnarValue::Scalar(ScalarValue::Float64(None));
+        let result = spark_isnan(&vec![input_columnar_value])?.into_array(1)?;
+        let expected: ArrayRef = 
Arc::new(BooleanArray::from(vec![Some(false)]));
+        assert_eq!(&result, &expected);
+        Ok(())
+    }
+
+    #[test]
+    fn test_isnan_scalar_f32_null() -> Result<(), Box<dyn Error>> {
+        let input_columnar_value = 
ColumnarValue::Scalar(ScalarValue::Float32(None));
+        let result = spark_isnan(&vec![input_columnar_value])?.into_array(1)?;
+        let expected: ArrayRef = 
Arc::new(BooleanArray::from(vec![Some(false)]));
+        assert_eq!(&result, &expected);
+        Ok(())
+    }
+}
diff --git a/spark-extension-shims-spark/src/test/scala/org.apache.auron/AuronFunctionSuite.scala b/spark-extension-shims-spark/src/test/scala/org.apache.auron/AuronFunctionSuite.scala
index 07725e80..78bddbba 100644
--- a/spark-extension-shims-spark/src/test/scala/org.apache.auron/AuronFunctionSuite.scala
+++ b/spark-extension-shims-spark/src/test/scala/org.apache.auron/AuronFunctionSuite.scala
@@ -381,27 +381,31 @@ class AuronFunctionSuite extends AuronQueryTest with 
BaseAuronSQLSuite {
     }
   }
 
-  ignore("DISABLED: isNaN native semantics mismatch (null -> false)") {
-    /* TODO: enable once Spark-compatible isNaN lands 
https://github.com/apache/auron/issues/1646 */
-
-    test("test function IsNaN") {
-      withTable("t1") {
-        sql(
-          "create table test_is_nan using parquet as select cast('NaN' as 
double) as c1, cast('NaN' as float) as c2, log(-3) as c3, cast(null as double) 
as c4, 5.5f as c5")
-        val functions =
-          """
-            |select
-            |    isnan(c1),
-            |    isnan(c2),
-            |    isnan(c3),
-            |    isnan(c4),
-            |    isnan(c5)
-            |from
-            |    test_is_nan
+  test("test function IsNaN") {
+    withTable("t1") {
+      sql("""
+          |create table test_is_nan using parquet as select
+          |  cast('NaN' as double) as c1,
+          |  cast('NaN' as float) as c2,
+          |  cast(null as double) as c3,
+          |  cast(null as double) as c4,
+          |  cast(5.5 as float) as c5,
+          |  cast(null as float) as c6
+          |""".stripMargin)
+      val functions =
+        """
+          |select
+          |    isnan(c1),
+          |    isnan(c2),
+          |    isnan(c3),
+          |    isnan(c4),
+          |    isnan(c5),
+          |    isnan(c6)
+          |from
+          |    test_is_nan
         """.stripMargin
 
-        checkSparkAnswerAndOperator(functions)
-      }
+      checkSparkAnswerAndOperator(functions)
     }
   }
 
diff --git a/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeConverters.scala b/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeConverters.scala
index 11cdcc4b..65d6040d 100644
--- a/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeConverters.scala
+++ b/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeConverters.scala
@@ -844,7 +844,7 @@ object NativeConverters extends Logging {
         buildScalarFunction(pb.ScalarFunction.Factorial, e.children, 
e.dataType)
       case e: Hex => buildScalarFunction(pb.ScalarFunction.Hex, e.children, 
e.dataType)
       case e: IsNaN =>
-        buildScalarFunction(pb.ScalarFunction.IsNaN, e.children, e.dataType)
+        buildExtScalarFunction("Spark_IsNaN", e.children, e.dataType)
       case e: Round =>
         e.scale match {
           case Literal(n: Int, _) =>

Reply via email to