Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/18664#discussion_r145737159
--- Diff: python/pyspark/sql/tests.py ---
@@ -3383,6 +3403,42 @@ def test_vectorized_udf_varargs(self):
res = df.select(f(col('id')))
self.assertEquals(df.collect(), res.collect())
+ def test_vectorized_udf_timestamps(self):
+ from pyspark.sql.functions import pandas_udf, col
+ from datetime import date, datetime
+ schema = StructType([
+ StructField("idx", LongType(), True),
+ StructField("date", DateType(), True),
+ StructField("timestamp", TimestampType(), True)])
+ data = [(0, date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1)),
+ (1, date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2)),
+ (2, date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3)),
+ (3, date(2104, 4, 4), datetime(2104, 4, 4, 4, 4, 4))]
+
+ df = self.spark.createDataFrame(data, schema=schema)
+
+ # Check that a timestamp passed through a pandas_udf will not be
altered by timezone calc
+ identity = pandas_udf(lambda t: t, returnType=TimestampType())
+ df = df.withColumn("timestamp_copy", identity(col("timestamp")))
+
+ @pandas_udf(returnType=BooleanType())
+ def check_data(idx, date, timestamp, timestamp_copy):
--- End diff --
it's a little weird to check the values via a UDF — can't we collect the
dataframe and check the values of these 2 columns directly?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]