Github user BryanCutler commented on a diff in the pull request: https://github.com/apache/spark/pull/18664#discussion_r145786734 --- Diff: python/pyspark/sql/tests.py --- @@ -3383,6 +3403,42 @@ def test_vectorized_udf_varargs(self): res = df.select(f(col('id'))) self.assertEquals(df.collect(), res.collect()) + def test_vectorized_udf_timestamps(self): + from pyspark.sql.functions import pandas_udf, col + from datetime import date, datetime + schema = StructType([ + StructField("idx", LongType(), True), + StructField("date", DateType(), True), + StructField("timestamp", TimestampType(), True)]) + data = [(0, date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1)), + (1, date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2)), + (2, date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3)), + (3, date(2104, 4, 4), datetime(2104, 4, 4, 4, 4, 4))] + + df = self.spark.createDataFrame(data, schema=schema) + + # Check that a timestamp passed through a pandas_udf will not be altered by timezone calc + identity = pandas_udf(lambda t: t, returnType=TimestampType()) + df = df.withColumn("timestamp_copy", identity(col("timestamp"))) + + @pandas_udf(returnType=BooleanType()) + def check_data(idx, date, timestamp, timestamp_copy): --- End diff -- Yeah, it is a little strange, but the point is to test that the user will see the expected timestamps in the UDF. We already test collect, so we know that works, but this is the easiest way I could think of to make sure the UDF values are also correct.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org