(spark) branch branch-3.5 updated: Revert "[SPARK-45599][CORE] Use object equality in OpenHashSet"

dongjoon Tue, 27 Feb 2024 08:39:36 -0800

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.5 by this push:
     new cbf25fb633f4 Revert "[SPARK-45599][CORE] Use object equality in 
OpenHashSet"
cbf25fb633f4 is described below

commit cbf25fb633f4bf2f83a6f6e39aafaa80bf47e160
Author: Dongjoon Hyun <dh...@apple.com>
AuthorDate: Tue Feb 27 08:38:54 2024 -0800

    Revert "[SPARK-45599][CORE] Use object equality in OpenHashSet"
    
    This reverts commit 588a55d010fefda7a63cde3b616ac38728fe4cfe.
---
 .../apache/spark/util/collection/OpenHashSet.scala | 16 ++-------
 .../spark/util/collection/OpenHashMapSuite.scala   | 30 -----------------
 .../spark/util/collection/OpenHashSetSuite.scala   | 39 ----------------------
 .../sql-tests/analyzer-results/ansi/array.sql.out  | 14 --------
 .../analyzer-results/ansi/literals.sql.out         |  7 ----
 .../sql-tests/analyzer-results/array.sql.out       | 14 --------
 .../sql-tests/analyzer-results/group-by.sql.out    | 19 -----------
 .../sql-tests/analyzer-results/literals.sql.out    |  7 ----
 .../src/test/resources/sql-tests/inputs/array.sql  |  4 ---
 .../test/resources/sql-tests/inputs/group-by.sql   | 15 ---------
 .../test/resources/sql-tests/inputs/literals.sql   |  3 --
 .../resources/sql-tests/results/ansi/array.sql.out | 16 ---------
 .../sql-tests/results/ansi/literals.sql.out        |  8 -----
 .../test/resources/sql-tests/results/array.sql.out | 16 ---------
 .../resources/sql-tests/results/group-by.sql.out   | 22 ------------
 .../resources/sql-tests/results/literals.sql.out   |  8 -----
 .../apache/spark/sql/DataFrameAggregateSuite.scala | 33 ------------------
 17 files changed, 3 insertions(+), 268 deletions(-)

diff --git 
a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala 
b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
index 435cf1a03cbc..6815e47a198d 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
@@ -126,17 +126,6 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) 
T: ClassTag](
     this
   }
 
-  /**
-   * Check if a key exists at the provided position using object equality 
rather than
-   * cooperative equality. Otherwise, hash sets will mishandle values for 
which `==`
-   * and `equals` return different results, like 0.0/-0.0 and NaN/NaN.
-   *
-   * See: https://issues.apache.org/jira/browse/SPARK-45599
-   */
-  @annotation.nowarn("cat=other-non-cooperative-equals")
-  private def keyExistsAtPos(k: T, pos: Int) =
-    _data(pos) equals k
-
   /**
    * Add an element to the set. This one differs from add in that it doesn't 
trigger rehashing.
    * The caller is responsible for calling rehashIfNeeded.
@@ -157,7 +146,8 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: 
ClassTag](
         _bitset.set(pos)
         _size += 1
         return pos | NONEXISTENCE_MASK
-      } else if (keyExistsAtPos(k, pos)) {
+      } else if (_data(pos) == k) {
+        // Found an existing key.
         return pos
       } else {
         // quadratic probing with values increase by 1, 2, 3, ...
@@ -191,7 +181,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: 
ClassTag](
     while (true) {
       if (!_bitset.get(pos)) {
         return INVALID_POS
-      } else if (keyExistsAtPos(k, pos)) {
+      } else if (k == _data(pos)) {
         return pos
       } else {
         // quadratic probing with values increase by 1, 2, 3, ...
diff --git 
a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala 
b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
index f7b026ab565f..1af99e9017c9 100644
--- 
a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
+++ 
b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala
@@ -249,34 +249,4 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers 
{
     map(null) = null
     assert(map.get(null) === Some(null))
   }
-
-  test("SPARK-45599: 0.0 and -0.0 should count distinctly; NaNs should count 
together") {
-    // Exactly these elements provided in roughly this order trigger a 
condition where lookups of
-    // 0.0 and -0.0 in the bitset happen to collide, causing their counts to 
be merged incorrectly
-    // and inconsistently if `==` is used to check for key equality.
-    val spark45599Repro = Seq(
-      Double.NaN,
-      2.0,
-      168.0,
-      Double.NaN,
-      Double.NaN,
-      -0.0,
-      153.0,
-      0.0
-    )
-
-    val map1 = new OpenHashMap[Double, Int]()
-    spark45599Repro.foreach(map1.changeValue(_, 1, {_ + 1}))
-    assert(map1(0.0) == 1)
-    assert(map1(-0.0) == 1)
-    assert(map1(Double.NaN) == 3)
-
-    val map2 = new OpenHashMap[Double, Int]()
-    // Simply changing the order in which the elements are added to the map 
should not change the
-    // counts for 0.0 and -0.0.
-    spark45599Repro.reverse.foreach(map2.changeValue(_, 1, {_ + 1}))
-    assert(map2(0.0) == 1)
-    assert(map2(-0.0) == 1)
-    assert(map2(Double.NaN) == 3)
-  }
 }
diff --git 
a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala 
b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
index 0bc8aa067f57..89a308556d5d 100644
--- 
a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
+++ 
b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
@@ -269,43 +269,4 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers 
{
       assert(pos1 == pos2)
     }
   }
-
-  test("SPARK-45599: 0.0 and -0.0 are equal but not the same") {
-    // Therefore, 0.0 and -0.0 should get separate entries in the hash set.
-    //
-    // Exactly these elements provided in roughly this order will trigger the 
following scenario:
-    // When probing the bitset in `getPos(-0.0)`, the loop will happen upon 
the entry for 0.0.
-    // In the old logic pre-SPARK-45599, the loop will find that the bit is 
set and, because
-    // -0.0 == 0.0, it will think that's the position of -0.0. But in reality 
this is the position
-    // of 0.0. So -0.0 and 0.0 will be stored at different positions, but 
`getPos()` will return
-    // the same position for them. This can cause users of OpenHashSet, like 
OpenHashMap, to
-    // return the wrong value for a key based on whether or not this bitset 
lookup collision
-    // happens.
-    val spark45599Repro = Seq(
-      Double.NaN,
-      2.0,
-      168.0,
-      Double.NaN,
-      Double.NaN,
-      -0.0,
-      153.0,
-      0.0
-    )
-    val set = new OpenHashSet[Double]()
-    spark45599Repro.foreach(set.add)
-    assert(set.size == 6)
-    val zeroPos = set.getPos(0.0)
-    val negZeroPos = set.getPos(-0.0)
-    assert(zeroPos != negZeroPos)
-  }
-
-  test("SPARK-45599: NaN and NaN are the same but not equal") {
-    // Any mathematical comparison to NaN will return false, but when we place 
it in
-    // a hash set we want the lookup to work like a "normal" value.
-    val set = new OpenHashSet[Double]()
-    set.add(Double.NaN)
-    set.add(Double.NaN)
-    assert(set.contains(Double.NaN))
-    assert(set.size == 1)
-  }
 }
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out
index 3b196ea93e40..6fc308157933 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out
@@ -747,17 +747,3 @@ select array_prepend(array(CAST(NULL AS String)), 
CAST(NULL AS String))
 -- !query analysis
 Project [array_prepend(array(cast(null as string)), cast(null as string)) AS 
array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING))#x]
 +- OneRowRelation
-
-
--- !query
-select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, 
DOUBLE("NaN")))
--- !query analysis
-Project [array_union(array(cast(0.0 as double), cast(0.0 as double), cast(NaN 
as double)), array(cast(0.0 as double), cast(0.0 as double), cast(NaN as 
double))) AS array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN))#x]
-+- OneRowRelation
-
-
--- !query
-select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN")))
--- !query analysis
-Project [array_distinct(array(cast(0.0 as double), cast(0.0 as double), 
cast(0.0 as double), cast(NaN as double), cast(NaN as double))) AS 
array_distinct(array(0.0, 0.0, 0.0, NaN, NaN))#x]
-+- OneRowRelation
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out
index 001dd4d64487..53c7327c5871 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out
@@ -692,10 +692,3 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
     "fragment" : "-x'2379ACFe'"
   } ]
 }
-
-
--- !query
-select -0, -0.0
--- !query analysis
-Project [0 AS 0#x, 0.0 AS 0.0#x]
-+- OneRowRelation
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out
index ca6c89bfadc3..e0585b77cb6b 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out
@@ -747,17 +747,3 @@ select array_prepend(array(CAST(NULL AS String)), 
CAST(NULL AS String))
 -- !query analysis
 Project [array_prepend(array(cast(null as string)), cast(null as string)) AS 
array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING))#x]
 +- OneRowRelation
-
-
--- !query
-select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, 
DOUBLE("NaN")))
--- !query analysis
-Project [array_union(array(cast(0.0 as double), cast(0.0 as double), cast(NaN 
as double)), array(cast(0.0 as double), cast(0.0 as double), cast(NaN as 
double))) AS array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN))#x]
-+- OneRowRelation
-
-
--- !query
-select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN")))
--- !query analysis
-Project [array_distinct(array(cast(0.0 as double), cast(0.0 as double), 
cast(0.0 as double), cast(NaN as double), cast(NaN as double))) AS 
array_distinct(array(0.0, 0.0, 0.0, NaN, NaN))#x]
-+- OneRowRelation
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out
index 93c463575dc1..202ceee18046 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out
@@ -1196,22 +1196,3 @@ Aggregate [c#x], [(c#x * 2) AS d#x]
          +- Project [if ((a#x < 0)) 0 else a#x AS b#x]
             +- SubqueryAlias t1
                +- LocalRelation [a#x]
-
-
--- !query
-SELECT col1, count(*) AS cnt
-FROM VALUES
-  (0.0),
-  (-0.0),
-  (double('NaN')),
-  (double('NaN')),
-  (double('Infinity')),
-  (double('Infinity')),
-  (-double('Infinity')),
-  (-double('Infinity'))
-GROUP BY col1
-ORDER BY col1
--- !query analysis
-Sort [col1#x ASC NULLS FIRST], true
-+- Aggregate [col1#x], [col1#x, count(1) AS cnt#xL]
-   +- LocalRelation [col1#x]
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out
index 001dd4d64487..53c7327c5871 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out
@@ -692,10 +692,3 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
     "fragment" : "-x'2379ACFe'"
   } ]
 }
-
-
--- !query
-select -0, -0.0
--- !query analysis
-Project [0 AS 0#x, 0.0 AS 0.0#x]
-+- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/inputs/array.sql 
b/sql/core/src/test/resources/sql-tests/inputs/array.sql
index 865dc8bac4ea..52a0906ea739 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/array.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/array.sql
@@ -177,7 +177,3 @@ select array_prepend(CAST(null AS ARRAY<String>), CAST(null 
as String));
 select array_prepend(array(), 1);
 select array_prepend(CAST(array() AS ARRAY<String>), CAST(NULL AS String));
 select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String));
-
--- SPARK-45599: Confirm 0.0, -0.0, and NaN are handled appropriately.
-select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, 
DOUBLE("NaN")));
-select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN")));
diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql 
b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
index ce1b422de319..c35cdb0de271 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
@@ -264,18 +264,3 @@ FROM (
          GROUP BY b
      ) t3
 GROUP BY c;
-
--- SPARK-45599: Check that "weird" doubles group and sort as desired.
-SELECT col1, count(*) AS cnt
-FROM VALUES
-  (0.0),
-  (-0.0),
-  (double('NaN')),
-  (double('NaN')),
-  (double('Infinity')),
-  (double('Infinity')),
-  (-double('Infinity')),
-  (-double('Infinity'))
-GROUP BY col1
-ORDER BY col1
-;
diff --git a/sql/core/src/test/resources/sql-tests/inputs/literals.sql 
b/sql/core/src/test/resources/sql-tests/inputs/literals.sql
index e1e4a370bffd..9f0eefc16a8c 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/literals.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/literals.sql
@@ -118,6 +118,3 @@ select +X'1';
 select -date '1999-01-01';
 select -timestamp '1999-01-01';
 select -x'2379ACFe';
-
--- normalize -0 and -0.0
-select -0, -0.0;
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out 
b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out
index 6a07d659e39b..49e18411ffa3 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out
@@ -907,19 +907,3 @@ select array_prepend(array(CAST(NULL AS String)), 
CAST(NULL AS String))
 struct<array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS 
STRING)):array<string>>
 -- !query output
 [null,null]
-
-
--- !query
-select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, 
DOUBLE("NaN")))
--- !query schema
-struct<array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN)):array<double>>
--- !query output
-[0.0,NaN]
-
-
--- !query
-select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN")))
--- !query schema
-struct<array_distinct(array(0.0, 0.0, 0.0, NaN, NaN)):array<double>>
--- !query output
-[0.0,NaN]
diff --git 
a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out 
b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out
index 452580e4f3c3..85bcc2713ff5 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out
@@ -770,11 +770,3 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
     "fragment" : "-x'2379ACFe'"
   } ]
 }
-
-
--- !query
-select -0, -0.0
--- !query schema
-struct<0:int,0.0:decimal(1,1)>
--- !query output
-0      0.0
diff --git a/sql/core/src/test/resources/sql-tests/results/array.sql.out 
b/sql/core/src/test/resources/sql-tests/results/array.sql.out
index d33fc62f0d9a..e568f5fa7796 100644
--- a/sql/core/src/test/resources/sql-tests/results/array.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/array.sql.out
@@ -788,19 +788,3 @@ select array_prepend(array(CAST(NULL AS String)), 
CAST(NULL AS String))
 struct<array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS 
STRING)):array<string>>
 -- !query output
 [null,null]
-
-
--- !query
-select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, 
DOUBLE("NaN")))
--- !query schema
-struct<array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN)):array<double>>
--- !query output
-[0.0,NaN]
-
-
--- !query
-select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN")))
--- !query schema
-struct<array_distinct(array(0.0, 0.0, 0.0, NaN, NaN)):array<double>>
--- !query output
-[0.0,NaN]
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out 
b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
index 548917ef79b2..db79646fe435 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
@@ -1121,25 +1121,3 @@ struct<d:int>
 -- !query output
 0
 2
-
-
--- !query
-SELECT col1, count(*) AS cnt
-FROM VALUES
-  (0.0),
-  (-0.0),
-  (double('NaN')),
-  (double('NaN')),
-  (double('Infinity')),
-  (double('Infinity')),
-  (-double('Infinity')),
-  (-double('Infinity'))
-GROUP BY col1
-ORDER BY col1
--- !query schema
-struct<col1:double,cnt:bigint>
--- !query output
--Infinity      2
-0.0    2
-Infinity       2
-NaN    2
diff --git a/sql/core/src/test/resources/sql-tests/results/literals.sql.out 
b/sql/core/src/test/resources/sql-tests/results/literals.sql.out
index 452580e4f3c3..85bcc2713ff5 100644
--- a/sql/core/src/test/resources/sql-tests/results/literals.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/literals.sql.out
@@ -770,11 +770,3 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
     "fragment" : "-x'2379ACFe'"
   } ]
 }
-
-
--- !query
-select -0, -0.0
--- !query schema
-struct<0:int,0.0:decimal(1,1)>
--- !query output
-0      0.0
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index 1ba3f6c84d0a..631fcd8c0d87 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -1068,39 +1068,6 @@ class DataFrameAggregateSuite extends QueryTest
     )
   }
 
-  test("SPARK-45599: Neither 0.0 nor -0.0 should be dropped when computing 
percentile") {
-    // To reproduce the bug described in SPARK-45599, we need exactly these 
rows in roughly
-    // this order in a DataFrame with exactly 1 partition.
-    // scalastyle:off line.size.limit
-    // See: 
https://issues.apache.org/jira/browse/SPARK-45599?focusedCommentId=17806954&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17806954
-    // scalastyle:on line.size.limit
-    val spark45599Repro: DataFrame = Seq(
-        0.0,
-        2.0,
-        153.0,
-        168.0,
-        3252411229536261.0,
-        7.205759403792794e+16,
-        1.7976931348623157e+308,
-        0.25,
-        Double.NaN,
-        Double.NaN,
-        -0.0,
-        -128.0,
-        Double.NaN,
-        Double.NaN
-      ).toDF("val").coalesce(1)
-
-    checkAnswer(
-      spark45599Repro.agg(
-        percentile(col("val"), lit(0.1))
-      ),
-      // With the buggy implementation of OpenHashSet, this returns 
`0.050000000000000044`
-      // instead of `-0.0`.
-      List(Row(-0.0))
-    )
-  }
-
   test("any_value") {
     checkAnswer(
       courseSales.groupBy("course").agg(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch branch-3.5 updated: Revert "[SPARK-45599][CORE] Use object equality in OpenHashSet"

Reply via email to