This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 1a577b8f8881 [SPARK-46980][SQL][MINOR] Avoid using internal APIs in dataframe end-to-end tests
1a577b8f8881 is described below

commit 1a577b8f88816eabd78b3924f1e159f2593209c5
Author: Mark Jarvin <mark.jar...@databricks.com>
AuthorDate: Tue Feb 6 09:11:19 2024 +0800

    [SPARK-46980][SQL][MINOR] Avoid using internal APIs in dataframe end-to-end tests
    
    ### What changes were proposed in this pull request?
    
    Avoid using the internal `XORShiftRandom` API in tests by instead using public APIs to collect the random values.
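    As a minimal, self-contained sketch of the pattern (the object and app names are hypothetical; seed 33 mirrors the `DataFrameSuite` test below), the expected values are collected through the public `rand(seed)` column rather than re-derived with the internal `XORShiftRandom` class:

    ```scala
    // A sketch of the pattern, not the exact suite code.
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.rand

    object RandViaPublicApiExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .master("local[2]")
          .appName("rand-via-public-api")
          .getOrCreate()
        import spark.implicits._

        val df = (1 to 100).map(Tuple1.apply).toDF("i").repartition(1)

        // Expected order: collect the seeded rand column through the public
        // API and sort rows by it, instead of replaying XORShiftRandom by hand.
        val expected = df.select($"i", rand(33))
          .as[(Int, Double)]
          .collect()
          .sortBy(_._2)
          .map(_._1)

        // Actual order: the query under test sorts by the same seeded column.
        val actual = df.sort(rand(33)).collect().map(_.getInt(0))
        assert(expected.sameElements(actual))

        spark.stop()
      }
    }
    ```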
    
    ### Why are the changes needed?
    
    Testing against internal APIs introduces unnecessary coupling between the implementation and the test.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Ran tests using SBT.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #45034 from markj-db/test-public-api-rand.
    
    Authored-by: Mark Jarvin <mark.jar...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala   | 7 ++-----
 sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala  | 4 +---
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala
index d6cf77572731..cbc39557ce4c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala
@@ -305,11 +305,8 @@ class DataFrameSetOperationsSuite extends QueryTest
     // When generating expected results at here, we need to follow the implementation of
     // Rand expression.
     def expected(df: DataFrame): Seq[Row] =
-      df.rdd.collectPartitions().zipWithIndex.flatMap {
-        case (data, index) =>
-          val rng = new org.apache.spark.util.random.XORShiftRandom(7 + index)
-          data.filter(_.getInt(0) < rng.nextDouble() * 10)
-      }.toSeq
+      df.select($"i", rand(7) * 10).as[(Long, Double)].collect()
+        .filter(r => r._1 < r._2).map(r => Row(r._1)).toImmutableArraySeq
 
     val union = df1.union(df2)
     checkAnswer(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 1442320e9a94..d85c7b7dfa3d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -57,7 +57,6 @@ import org.apache.spark.tags.SlowSQLTest
 import org.apache.spark.unsafe.types.CalendarInterval
 import org.apache.spark.util.ArrayImplicits._
 import org.apache.spark.util.Utils
-import org.apache.spark.util.random.XORShiftRandom
 
 @SlowSQLTest
 class DataFrameSuite extends QueryTest
@@ -1922,8 +1921,7 @@ class DataFrameSuite extends QueryTest
   test("SPARK-9083: sort with non-deterministic expressions") {
     val seed = 33
     val df = (1 to 100).map(Tuple1.apply).toDF("i").repartition(1)
-    val random = new XORShiftRandom(seed)
-    val expected = (1 to 100).map(_ -> random.nextDouble()).sortBy(_._2).map(_._1)
+    val expected = df.select($"i", rand(seed)).as[(Long, Double)].collect().sortBy(_._2).map(_._1)
     val actual = df.sort(rand(seed)).collect().map(_.getInt(0))
     assert(expected === actual)
   }
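
The removed code above replicated an implementation detail that the public API already exposes deterministically: `Rand` seeds each partition's generator with `seed + partitionIndex`, which is why the old `expected()` built `new XORShiftRandom(7 + index)` per partition. A hedged sketch of that (now unnecessary) coupling, assuming a local `SparkSession` named `spark` with `spark.implicits._` imported:

```scala
import org.apache.spark.sql.functions.rand
import org.apache.spark.util.random.XORShiftRandom // internal API, shown only to illustrate the coupling

val seed = 7
// One partition, so Rand's per-partition seed is `seed + 0`.
val fromPublicApi =
  spark.range(0, 5, 1, numPartitions = 1).select(rand(seed)).as[Double].collect()
val rng = new XORShiftRandom(seed + 0)
val fromInternals = Array.fill(5)(rng.nextDouble())
// Bit-identical today, but only because of Rand's internals; the rewritten
// tests no longer depend on this.
assert(fromPublicApi.sameElements(fromInternals))
```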

