Github user BryanCutler commented on a diff in the pull request:
https://github.com/apache/spark/pull/20142#discussion_r159983977
--- Diff: python/pyspark/sql/tests.py ---
@@ -3950,6 +3974,33 @@ def
test_vectorized_udf_timestamps_respect_session_timezone(self):
finally:
self.spark.conf.set("spark.sql.session.timeZone", orig_tz)
+ def test_nondeterministic_udf(self):
+ # Non-deterministic UDFs should be allowed in select and withColumn
+ from pyspark.sql.functions import pandas_udf, col
+
+ random_udf = self.random_udf
+ df = self.spark.range(10)
+
+ result1 = df.select(random_udf(col('id')).alias('rand')).collect()
+ result2 = df.withColumn('rand', random_udf(col('id'))).collect()
+
+ for row in result1:
+ self.assertTrue(0.0 <= row.rand < 1.0)
+ for row in result2:
+ self.assertTrue(0.0 <= row.rand < 1.0)
--- End diff --
Ideally we should be checking that the optimizer doesn't cache any previous
results. I think the non-pandas UDF test I linked above did that by comparing
the original non-deterministic data, with a constant added via a non-deterministic
UDF, against the result of adding the same constant via a deterministic UDF.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]