Github user maryannxue-databricks commented on a diff in the pull request:
https://github.com/apache/spark/pull/21531#discussion_r195236676
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala ---
@@ -96,4 +99,38 @@ class DatasetCacheSuite extends QueryTest with
SharedSQLContext {
agged.unpersist()
assert(agged.storageLevel == StorageLevel.NONE, "The Dataset agged
should not be cached.")
}
+
+ test("persist and then withColumn") {
+ val df = Seq(("test", 1)).toDF("s", "i")
+ // We should not invalidate the cached DataFrame
+ val df2 = df.withColumn("newColumn", lit(1))
+
+ df.cache()
+ assertCached(df)
+ assertCached(df2)
+
+ df.count()
+ assertCached(df2)
+
+ df.unpersist()
+ assert(df.storageLevel == StorageLevel.NONE)
+ }
+
+ test("cache UDF result correctly") {
+ val expensiveUDF = udf({x: Int => Thread.sleep(10000); x})
+ val df = spark.range(0, 10).toDF("a").withColumn("b",
expensiveUDF($"a"))
+ val df2 = df.agg(sum(df("b")))
+
+ df.cache()
+ df.count()
+
+ assertCached(df2)
+
+ failAfter(5 seconds) {
--- End diff --
Can you add a comment here, like "udf has been evaluated during caching,
and thus should not be re-evaluated"?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]