This is an automated email from the ASF dual-hosted git repository. agrove pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push: new cdfdc2146 chore: Update documentation and ignore Spark SQL tests for known issue with count distinct on NaN in aggregate (#1847) cdfdc2146 is described below commit cdfdc2146f879bf9246361687ec4d0d25eb2a87e Author: Andy Grove <agr...@apache.org> AuthorDate: Fri Jun 6 12:59:30 2025 -0600 chore: Update documentation and ignore Spark SQL tests for known issue with count distinct on NaN in aggregate (#1847) --- dev/diffs/3.4.3.diff | 12 +++++++++++- dev/diffs/3.5.4.diff | 12 +++++++++++- dev/diffs/3.5.5.diff | 12 +++++++++++- dev/diffs/4.0.0-preview1.diff | 12 +++++++++++- docs/source/user-guide/compatibility.md | 9 +++------ docs/templates/compatibility-template.md | 9 +++------ 6 files changed, 50 insertions(+), 16 deletions(-) diff --git a/dev/diffs/3.4.3.diff b/dev/diffs/3.4.3.diff index 049671c2e..bf44e3d7a 100644 --- a/dev/diffs/3.4.3.diff +++ b/dev/diffs/3.4.3.diff @@ -247,7 +247,7 @@ index cf40e944c09..bdd5be4f462 100644 test("A cached table preserves the partitioning and ordering of its cached SparkPlan") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala -index 1cc09c3d7fc..f031fa45c33 100644 +index 1cc09c3d7fc..b85b53a9688 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -27,7 +27,7 @@ import org.apache.spark.SparkException @@ -268,6 +268,16 @@ index 1cc09c3d7fc..f031fa45c33 100644 } assert(exchangePlans.length == 1) } +@@ -1100,7 +1100,8 @@ class DataFrameAggregateSuite extends QueryTest + } + } + +- test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") { ++ test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate", ++ IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) { + withTempView("view") { + val nan1 = java.lang.Float.intBitsToFloat(0x7f800001) + val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index 56e9520fdab..917932336df 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala diff --git a/dev/diffs/3.5.4.diff b/dev/diffs/3.5.4.diff index c594fbbe4..8b7f05280 100644 --- a/dev/diffs/3.5.4.diff +++ b/dev/diffs/3.5.4.diff @@ -226,7 +226,7 @@ index 9815cb816c9..95b5f9992b0 100644 test("A cached table preserves the partitioning and ordering of its cached SparkPlan") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala -index 5a8681aed97..da9d25e2eb4 100644 +index 5a8681aed97..db69fde723a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Expand @@ -247,6 +247,16 @@ index 5a8681aed97..da9d25e2eb4 100644 } assert(exchangePlans.length == 1) } +@@ -1255,7 +1255,8 @@ class DataFrameAggregateSuite extends QueryTest + } + } + +- test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") { ++ test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate", ++ IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) { + withTempView("view") { + val nan1 = java.lang.Float.intBitsToFloat(0x7f800001) + val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index 56e9520fdab..917932336df 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala diff --git a/dev/diffs/3.5.5.diff b/dev/diffs/3.5.5.diff index 310aa6881..26a51f6c4 100644 --- a/dev/diffs/3.5.5.diff +++ b/dev/diffs/3.5.5.diff @@ -226,7 +226,7 @@ index 9815cb816c9..95b5f9992b0 100644 test("A cached table preserves the partitioning and ordering of its cached SparkPlan") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala -index 5a8681aed97..da9d25e2eb4 100644 +index 5a8681aed97..db69fde723a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Expand @@ -247,6 +247,16 @@ index 5a8681aed97..da9d25e2eb4 100644 } assert(exchangePlans.length == 1) } +@@ -1255,7 +1255,8 @@ class DataFrameAggregateSuite extends QueryTest + } + } + +- test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") { ++ test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate", ++ IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) { + withTempView("view") { + val nan1 = java.lang.Float.intBitsToFloat(0x7f800001) + val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index 56e9520fdab..917932336df 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala diff --git a/dev/diffs/4.0.0-preview1.diff b/dev/diffs/4.0.0-preview1.diff index c014660cf..99cbd0325 100644 --- a/dev/diffs/4.0.0-preview1.diff +++ b/dev/diffs/4.0.0-preview1.diff @@ -268,7 +268,7 @@ index d023fb82185..0f4f03bda6c 100644 withTempView("t0", "t1", "t2") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala -index 620ee430cab..9d383a4bff9 100644 +index 620ee430cab..f5df9218fc1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.util.AUTO_GENERATED_ALIAS @@ -289,6 +289,16 @@ index 620ee430cab..9d383a4bff9 100644 } assert(exchangePlans.length == 1) } +@@ -1275,7 +1275,8 @@ class DataFrameAggregateSuite extends QueryTest + } + } + +- test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") { ++ test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate", ++ IgnoreComet("TODO: https://github.com/apache/datafusion-comet/issues/1824")) { + withTempView("view") { + val nan1 = java.lang.Float.intBitsToFloat(0x7f800001) + val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index f6fd6b501d7..11870c85d82 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala diff --git a/docs/source/user-guide/compatibility.md b/docs/source/user-guide/compatibility.md index 212dc0dc3..39cd3a058 100644 --- a/docs/source/user-guide/compatibility.md +++ b/docs/source/user-guide/compatibility.md @@ -29,12 +29,6 @@ Comet aims to provide consistent results with the version of Apache Spark that i This guide offers information about areas of functionality where there are known differences. -# Compatibility Guide - -Comet aims to provide consistent results with the version of Apache Spark that is being used. - -This guide offers information about areas of functionality where there are known differences. - ## Parquet Scans Comet currently has three distinct implementations of the Parquet scan operator. The configuration property @@ -89,6 +83,9 @@ because they are handled well in Spark (e.g., `SQLOrderingUtil.compareFloats`). functions of arrow-rs used by DataFusion do not normalize NaN and zero (e.g., [arrow::compute::kernels::cmp::eq](https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.eq.html#)). So Comet will add additional normalization expression of NaN and zero for comparison. +There is a known bug with using count(distinct) within aggregate queries, where each NaN value will be counted +separately [#1824](https://github.com/apache/datafusion-comet/issues/1824). + ## Incompatible Expressions Some Comet native expressions are not 100% compatible with Spark and are disabled by default. These expressions diff --git a/docs/templates/compatibility-template.md b/docs/templates/compatibility-template.md index 9f9f9911b..e304d933f 100644 --- a/docs/templates/compatibility-template.md +++ b/docs/templates/compatibility-template.md @@ -29,12 +29,6 @@ Comet aims to provide consistent results with the version of Apache Spark that i This guide offers information about areas of functionality where there are known differences. -# Compatibility Guide - -Comet aims to provide consistent results with the version of Apache Spark that is being used. - -This guide offers information about areas of functionality where there are known differences. - ## Parquet Scans Comet currently has three distinct implementations of the Parquet scan operator. The configuration property @@ -89,6 +83,9 @@ because they are handled well in Spark (e.g., `SQLOrderingUtil.compareFloats`). functions of arrow-rs used by DataFusion do not normalize NaN and zero (e.g., [arrow::compute::kernels::cmp::eq](https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.eq.html#)). So Comet will add additional normalization expression of NaN and zero for comparison. +There is a known bug with using count(distinct) within aggregate queries, where each NaN value will be counted +separately [#1824](https://github.com/apache/datafusion-comet/issues/1824). + ## Incompatible Expressions Some Comet native expressions are not 100% compatible with Spark and are disabled by default. These expressions --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org