This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 1a577b8f8881 [SPARK-46980][SQL][MINOR] Avoid using internal APIs in dataframe end-to-end tests
1a577b8f8881 is described below
commit 1a577b8f88816eabd78b3924f1e159f2593209c5
Author: Mark Jarvin <[email protected]>
AuthorDate: Tue Feb 6 09:11:19 2024 +0800
[SPARK-46980][SQL][MINOR] Avoid using internal APIs in dataframe end-to-end tests
### What changes were proposed in this pull request?
Avoid using the internal `XORShiftRandom` API in tests; instead, use public APIs to collect the random values.
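To illustrate the pattern, here is a minimal, self-contained sketch (not part of the patch itself); it assumes a local `SparkSession` and mirrors the SPARK-9083 test updated below:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.rand

object RandViaPublicApi {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("sketch").getOrCreate()
    import spark.implicits._

    val seed = 33
    val df = (1 to 100).map(Tuple1.apply).toDF("i").repartition(1)

    // Derive the expected ordering from the public rand(seed) expression
    // itself instead of re-implementing it with the internal XORShiftRandom.
    val expected = df.select($"i", rand(seed)).as[(Int, Double)]
      .collect().sortBy(_._2).map(_._1)

    val actual = df.sort(rand(seed)).collect().map(_.getInt(0))
    assert(expected.sameElements(actual))
    spark.stop()
  }
}
```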
### Why are the changes needed?
Testing against internal APIs introduces unnecessary coupling between the implementation and the tests.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Ran tests using SBT.
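For example, something like `build/sbt "sql/testOnly org.apache.spark.sql.DataFrameSuite org.apache.spark.sql.DataFrameSetOperationsSuite"` (exact invocation assumed, not taken from the patch).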
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #45034 from markj-db/test-public-api-rand.
Authored-by: Mark Jarvin <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala | 7 ++-----
sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala | 4 +---
2 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala
index d6cf77572731..cbc39557ce4c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala
@@ -305,11 +305,8 @@ class DataFrameSetOperationsSuite extends QueryTest
    // When generating expected results at here, we need to follow the implementation of
    // Rand expression.
    def expected(df: DataFrame): Seq[Row] =
-      df.rdd.collectPartitions().zipWithIndex.flatMap {
-        case (data, index) =>
-          val rng = new org.apache.spark.util.random.XORShiftRandom(7 + index)
-          data.filter(_.getInt(0) < rng.nextDouble() * 10)
-      }.toSeq
+      df.select($"i", rand(7) * 10).as[(Long, Double)].collect()
+        .filter(r => r._1 < r._2).map(r => Row(r._1)).toImmutableArraySeq

    val union = df1.union(df2)
    checkAnswer(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 1442320e9a94..d85c7b7dfa3d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -57,7 +57,6 @@ import org.apache.spark.tags.SlowSQLTest
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.ArrayImplicits._
import org.apache.spark.util.Utils
-import org.apache.spark.util.random.XORShiftRandom

@SlowSQLTest
class DataFrameSuite extends QueryTest
@@ -1922,8 +1921,7 @@ class DataFrameSuite extends QueryTest
  test("SPARK-9083: sort with non-deterministic expressions") {
    val seed = 33
    val df = (1 to 100).map(Tuple1.apply).toDF("i").repartition(1)
-    val random = new XORShiftRandom(seed)
-    val expected = (1 to 100).map(_ -> random.nextDouble()).sortBy(_._2).map(_._1)
+    val expected = df.select($"i", rand(seed)).as[(Long, Double)].collect().sortBy(_._2).map(_._1)
    val actual = df.sort(rand(seed)).collect().map(_.getInt(0))
    assert(expected === actual)
  }
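A note on why collecting through the public API is stable: rand(seed) seeds its generator per partition (the removed code spelled this out as XORShiftRandom(7 + index)), so with a fixed seed and fixed partitioning the column reproduces the same values on every evaluation. A minimal sketch, assuming a local SparkSession named `spark` and `org.apache.spark.sql.functions.rand` in scope:

    // rand(seed) is deterministic for a fixed seed and fixed partitioning,
    // so expected values can be collected through the public API.
    val df = spark.range(0, 10, 1, numPartitions = 2)
    val first = df.select(rand(7)).collect().map(_.getDouble(0))
    val second = df.select(rand(7)).collect().map(_.getDouble(0))
    assert(first.sameElements(second))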
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]