Github user viirya commented on a diff in the pull request:
https://github.com/apache/spark/pull/20142#discussion_r160017182
--- Diff: python/pyspark/sql/tests.py ---
@@ -3950,6 +3975,33 @@ def test_vectorized_udf_timestamps_respect_session_timezone(self):
finally:
self.spark.conf.set("spark.sql.session.timeZone", orig_tz)
+ def test_nondeterministic_udf(self):
+ # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations
+ from pyspark.sql.functions import udf, pandas_udf, col
+
+ @pandas_udf('double')
+ def plus_ten(v):
+ return v + 10
+ random_udf = self.random_udf
+
+ df = self.spark.range(10).withColumn('rand', random_udf(col('id')))
+ result1 = df.withColumn('plus_ten(rand)', plus_ten(df['rand'])).toPandas()
+
+ self.assertEqual(random_udf.deterministic, False)
+ self.assertTrue(result1['plus_ten(rand)'].equals(result1['rand'] + 10))
+
+ def test_nondeterministic_udf_in_aggregate(self):
--- End diff --
test_vectorized_nondeterministic_udf_in_aggregate
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]