GitHub user icexelloss commented on a diff in the pull request:
https://github.com/apache/spark/pull/20142#discussion_r159994714
--- Diff: python/pyspark/sql/tests.py ---
@@ -3950,6 +3974,33 @@ def test_vectorized_udf_timestamps_respect_session_timezone(self):
finally:
self.spark.conf.set("spark.sql.session.timeZone", orig_tz)
+ def test_nondeterministic_udf(self):
+ # Non-deterministic UDFs should be allowed in select and withColumn
+ from pyspark.sql.functions import pandas_udf, col
+
+ random_udf = self.random_udf
+ df = self.spark.range(10)
+
+ result1 = df.select(random_udf(col('id')).alias('rand')).collect()
+ result2 = df.withColumn('rand', random_udf(col('id'))).collect()
+
+ for row in result1:
+ self.assertTrue(0.0 <= row.rand < 1.0)
+ for row in result2:
+ self.assertTrue(0.0 <= row.rand < 1.0)
--- End diff --
I changed the test to be similar to the non-pandas one.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]