This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push: new cbf25fb633f4 Revert "[SPARK-45599][CORE] Use object equality in OpenHashSet" cbf25fb633f4 is described below commit cbf25fb633f4bf2f83a6f6e39aafaa80bf47e160 Author: Dongjoon Hyun <dh...@apple.com> AuthorDate: Tue Feb 27 08:38:54 2024 -0800 Revert "[SPARK-45599][CORE] Use object equality in OpenHashSet" This reverts commit 588a55d010fefda7a63cde3b616ac38728fe4cfe. --- .../apache/spark/util/collection/OpenHashSet.scala | 16 ++------- .../spark/util/collection/OpenHashMapSuite.scala | 30 ----------------- .../spark/util/collection/OpenHashSetSuite.scala | 39 ---------------------- .../sql-tests/analyzer-results/ansi/array.sql.out | 14 -------- .../analyzer-results/ansi/literals.sql.out | 7 ---- .../sql-tests/analyzer-results/array.sql.out | 14 -------- .../sql-tests/analyzer-results/group-by.sql.out | 19 ----------- .../sql-tests/analyzer-results/literals.sql.out | 7 ---- .../src/test/resources/sql-tests/inputs/array.sql | 4 --- .../test/resources/sql-tests/inputs/group-by.sql | 15 --------- .../test/resources/sql-tests/inputs/literals.sql | 3 -- .../resources/sql-tests/results/ansi/array.sql.out | 16 --------- .../sql-tests/results/ansi/literals.sql.out | 8 ----- .../test/resources/sql-tests/results/array.sql.out | 16 --------- .../resources/sql-tests/results/group-by.sql.out | 22 ------------ .../resources/sql-tests/results/literals.sql.out | 8 ----- .../apache/spark/sql/DataFrameAggregateSuite.scala | 33 ------------------ 17 files changed, 3 insertions(+), 268 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 435cf1a03cbc..6815e47a198d 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -126,17 +126,6 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag]( this } - /** - * Check if a key exists at the provided position using object equality rather than - * cooperative equality. Otherwise, hash sets will mishandle values for which `==` - * and `equals` return different results, like 0.0/-0.0 and NaN/NaN. - * - * See: https://issues.apache.org/jira/browse/SPARK-45599 - */ - @annotation.nowarn("cat=other-non-cooperative-equals") - private def keyExistsAtPos(k: T, pos: Int) = - _data(pos) equals k - /** * Add an element to the set. This one differs from add in that it doesn't trigger rehashing. * The caller is responsible for calling rehashIfNeeded. @@ -157,7 +146,8 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag]( _bitset.set(pos) _size += 1 return pos | NONEXISTENCE_MASK - } else if (keyExistsAtPos(k, pos)) { + } else if (_data(pos) == k) { + // Found an existing key. return pos } else { // quadratic probing with values increase by 1, 2, 3, ... @@ -191,7 +181,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag]( while (true) { if (!_bitset.get(pos)) { return INVALID_POS - } else if (keyExistsAtPos(k, pos)) { + } else if (k == _data(pos)) { return pos } else { // quadratic probing with values increase by 1, 2, 3, ... diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index f7b026ab565f..1af99e9017c9 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -249,34 +249,4 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { map(null) = null assert(map.get(null) === Some(null)) } - - test("SPARK-45599: 0.0 and -0.0 should count distinctly; NaNs should count together") { - // Exactly these elements provided in roughly this order trigger a condition where lookups of - // 0.0 and -0.0 in the bitset happen to collide, causing their counts to be merged incorrectly - // and inconsistently if `==` is used to check for key equality. - val spark45599Repro = Seq( - Double.NaN, - 2.0, - 168.0, - Double.NaN, - Double.NaN, - -0.0, - 153.0, - 0.0 - ) - - val map1 = new OpenHashMap[Double, Int]() - spark45599Repro.foreach(map1.changeValue(_, 1, {_ + 1})) - assert(map1(0.0) == 1) - assert(map1(-0.0) == 1) - assert(map1(Double.NaN) == 3) - - val map2 = new OpenHashMap[Double, Int]() - // Simply changing the order in which the elements are added to the map should not change the - // counts for 0.0 and -0.0. - spark45599Repro.reverse.foreach(map2.changeValue(_, 1, {_ + 1})) - assert(map2(0.0) == 1) - assert(map2(-0.0) == 1) - assert(map2(Double.NaN) == 3) - } } diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index 0bc8aa067f57..89a308556d5d 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -269,43 +269,4 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { assert(pos1 == pos2) } } - - test("SPARK-45599: 0.0 and -0.0 are equal but not the same") { - // Therefore, 0.0 and -0.0 should get separate entries in the hash set. - // - // Exactly these elements provided in roughly this order will trigger the following scenario: - // When probing the bitset in `getPos(-0.0)`, the loop will happen upon the entry for 0.0. - // In the old logic pre-SPARK-45599, the loop will find that the bit is set and, because - // -0.0 == 0.0, it will think that's the position of -0.0. But in reality this is the position - // of 0.0. So -0.0 and 0.0 will be stored at different positions, but `getPos()` will return - // the same position for them. This can cause users of OpenHashSet, like OpenHashMap, to - // return the wrong value for a key based on whether or not this bitset lookup collision - // happens. - val spark45599Repro = Seq( - Double.NaN, - 2.0, - 168.0, - Double.NaN, - Double.NaN, - -0.0, - 153.0, - 0.0 - ) - val set = new OpenHashSet[Double]() - spark45599Repro.foreach(set.add) - assert(set.size == 6) - val zeroPos = set.getPos(0.0) - val negZeroPos = set.getPos(-0.0) - assert(zeroPos != negZeroPos) - } - - test("SPARK-45599: NaN and NaN are the same but not equal") { - // Any mathematical comparison to NaN will return false, but when we place it in - // a hash set we want the lookup to work like a "normal" value. - val set = new OpenHashSet[Double]() - set.add(Double.NaN) - set.add(Double.NaN) - assert(set.contains(Double.NaN)) - assert(set.size == 1) - } } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out index 3b196ea93e40..6fc308157933 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out @@ -747,17 +747,3 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String)) -- !query analysis Project [array_prepend(array(cast(null as string)), cast(null as string)) AS array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING))#x] +- OneRowRelation - - --- !query -select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN"))) --- !query analysis -Project [array_union(array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double)), array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double))) AS array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN))#x] -+- OneRowRelation - - --- !query -select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN"))) --- !query analysis -Project [array_distinct(array(cast(0.0 as double), cast(0.0 as double), cast(0.0 as double), cast(NaN as double), cast(NaN as double))) AS array_distinct(array(0.0, 0.0, 0.0, NaN, NaN))#x] -+- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out index 001dd4d64487..53c7327c5871 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/literals.sql.out @@ -692,10 +692,3 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "fragment" : "-x'2379ACFe'" } ] } - - --- !query -select -0, -0.0 --- !query analysis -Project [0 AS 0#x, 0.0 AS 0.0#x] -+- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out index ca6c89bfadc3..e0585b77cb6b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out @@ -747,17 +747,3 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String)) -- !query analysis Project [array_prepend(array(cast(null as string)), cast(null as string)) AS array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING))#x] +- OneRowRelation - - --- !query -select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN"))) --- !query analysis -Project [array_union(array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double)), array(cast(0.0 as double), cast(0.0 as double), cast(NaN as double))) AS array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN))#x] -+- OneRowRelation - - --- !query -select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN"))) --- !query analysis -Project [array_distinct(array(cast(0.0 as double), cast(0.0 as double), cast(0.0 as double), cast(NaN as double), cast(NaN as double))) AS array_distinct(array(0.0, 0.0, 0.0, NaN, NaN))#x] -+- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out index 93c463575dc1..202ceee18046 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by.sql.out @@ -1196,22 +1196,3 @@ Aggregate [c#x], [(c#x * 2) AS d#x] +- Project [if ((a#x < 0)) 0 else a#x AS b#x] +- SubqueryAlias t1 +- LocalRelation [a#x] - - --- !query -SELECT col1, count(*) AS cnt -FROM VALUES - (0.0), - (-0.0), - (double('NaN')), - (double('NaN')), - (double('Infinity')), - (double('Infinity')), - (-double('Infinity')), - (-double('Infinity')) -GROUP BY col1 -ORDER BY col1 --- !query analysis -Sort [col1#x ASC NULLS FIRST], true -+- Aggregate [col1#x], [col1#x, count(1) AS cnt#xL] - +- LocalRelation [col1#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out index 001dd4d64487..53c7327c5871 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/literals.sql.out @@ -692,10 +692,3 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "fragment" : "-x'2379ACFe'" } ] } - - --- !query -select -0, -0.0 --- !query analysis -Project [0 AS 0#x, 0.0 AS 0.0#x] -+- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/inputs/array.sql b/sql/core/src/test/resources/sql-tests/inputs/array.sql index 865dc8bac4ea..52a0906ea739 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/array.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/array.sql @@ -177,7 +177,3 @@ select array_prepend(CAST(null AS ARRAY<String>), CAST(null as String)); select array_prepend(array(), 1); select array_prepend(CAST(array() AS ARRAY<String>), CAST(NULL AS String)); select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String)); - --- SPARK-45599: Confirm 0.0, -0.0, and NaN are handled appropriately. -select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN"))); -select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN"))); diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index ce1b422de319..c35cdb0de271 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -264,18 +264,3 @@ FROM ( GROUP BY b ) t3 GROUP BY c; - --- SPARK-45599: Check that "weird" doubles group and sort as desired. -SELECT col1, count(*) AS cnt -FROM VALUES - (0.0), - (-0.0), - (double('NaN')), - (double('NaN')), - (double('Infinity')), - (double('Infinity')), - (-double('Infinity')), - (-double('Infinity')) -GROUP BY col1 -ORDER BY col1 -; diff --git a/sql/core/src/test/resources/sql-tests/inputs/literals.sql b/sql/core/src/test/resources/sql-tests/inputs/literals.sql index e1e4a370bffd..9f0eefc16a8c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/literals.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/literals.sql @@ -118,6 +118,3 @@ select +X'1'; select -date '1999-01-01'; select -timestamp '1999-01-01'; select -x'2379ACFe'; - --- normalize -0 and -0.0 -select -0, -0.0; diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out index 6a07d659e39b..49e18411ffa3 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out @@ -907,19 +907,3 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String)) struct<array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING)):array<string>> -- !query output [null,null] - - --- !query -select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN"))) --- !query schema -struct<array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN)):array<double>> --- !query output -[0.0,NaN] - - --- !query -select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN"))) --- !query schema -struct<array_distinct(array(0.0, 0.0, 0.0, NaN, NaN)):array<double>> --- !query output -[0.0,NaN] diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out index 452580e4f3c3..85bcc2713ff5 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out @@ -770,11 +770,3 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "fragment" : "-x'2379ACFe'" } ] } - - --- !query -select -0, -0.0 --- !query schema -struct<0:int,0.0:decimal(1,1)> --- !query output -0 0.0 diff --git a/sql/core/src/test/resources/sql-tests/results/array.sql.out b/sql/core/src/test/resources/sql-tests/results/array.sql.out index d33fc62f0d9a..e568f5fa7796 100644 --- a/sql/core/src/test/resources/sql-tests/results/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/array.sql.out @@ -788,19 +788,3 @@ select array_prepend(array(CAST(NULL AS String)), CAST(NULL AS String)) struct<array_prepend(array(CAST(NULL AS STRING)), CAST(NULL AS STRING)):array<string>> -- !query output [null,null] - - --- !query -select array_union(array(0.0, -0.0, DOUBLE("NaN")), array(0.0, -0.0, DOUBLE("NaN"))) --- !query schema -struct<array_union(array(0.0, 0.0, NaN), array(0.0, 0.0, NaN)):array<double>> --- !query output -[0.0,NaN] - - --- !query -select array_distinct(array(0.0, -0.0, -0.0, DOUBLE("NaN"), DOUBLE("NaN"))) --- !query schema -struct<array_distinct(array(0.0, 0.0, 0.0, NaN, NaN)):array<double>> --- !query output -[0.0,NaN] diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 548917ef79b2..db79646fe435 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1121,25 +1121,3 @@ struct<d:int> -- !query output 0 2 - - --- !query -SELECT col1, count(*) AS cnt -FROM VALUES - (0.0), - (-0.0), - (double('NaN')), - (double('NaN')), - (double('Infinity')), - (double('Infinity')), - (-double('Infinity')), - (-double('Infinity')) -GROUP BY col1 -ORDER BY col1 --- !query schema -struct<col1:double,cnt:bigint> --- !query output --Infinity 2 -0.0 2 -Infinity 2 -NaN 2 diff --git a/sql/core/src/test/resources/sql-tests/results/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/literals.sql.out index 452580e4f3c3..85bcc2713ff5 100644 --- a/sql/core/src/test/resources/sql-tests/results/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/literals.sql.out @@ -770,11 +770,3 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "fragment" : "-x'2379ACFe'" } ] } - - --- !query -select -0, -0.0 --- !query schema -struct<0:int,0.0:decimal(1,1)> --- !query output -0 0.0 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 1ba3f6c84d0a..631fcd8c0d87 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -1068,39 +1068,6 @@ class DataFrameAggregateSuite extends QueryTest ) } - test("SPARK-45599: Neither 0.0 nor -0.0 should be dropped when computing percentile") { - // To reproduce the bug described in SPARK-45599, we need exactly these rows in roughly - // this order in a DataFrame with exactly 1 partition. - // scalastyle:off line.size.limit - // See: https://issues.apache.org/jira/browse/SPARK-45599?focusedCommentId=17806954&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17806954 - // scalastyle:on line.size.limit - val spark45599Repro: DataFrame = Seq( - 0.0, - 2.0, - 153.0, - 168.0, - 3252411229536261.0, - 7.205759403792794e+16, - 1.7976931348623157e+308, - 0.25, - Double.NaN, - Double.NaN, - -0.0, - -128.0, - Double.NaN, - Double.NaN - ).toDF("val").coalesce(1) - - checkAnswer( - spark45599Repro.agg( - percentile(col("val"), lit(0.1)) - ), - // With the buggy implementation of OpenHashSet, this returns `0.050000000000000044` - // instead of `-0.0`. - List(Row(-0.0)) - ) - } - test("any_value") { checkAnswer( courseSales.groupBy("course").agg( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org