Github user BryanCutler commented on a diff in the pull request:

    https://github.com/apache/spark/pull/18664#discussion_r145786734
  
    --- Diff: python/pyspark/sql/tests.py ---
    @@ -3383,6 +3403,42 @@ def test_vectorized_udf_varargs(self):
             res = df.select(f(col('id')))
             self.assertEquals(df.collect(), res.collect())
     
    +    def test_vectorized_udf_timestamps(self):
    +        from pyspark.sql.functions import pandas_udf, col
    +        from datetime import date, datetime
    +        schema = StructType([
    +            StructField("idx", LongType(), True),
    +            StructField("date", DateType(), True),
    +            StructField("timestamp", TimestampType(), True)])
    +        data = [(0, date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1)),
    +                (1, date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2)),
    +                (2, date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3)),
    +                (3, date(2104, 4, 4), datetime(2104, 4, 4, 4, 4, 4))]
    +
    +        df = self.spark.createDataFrame(data, schema=schema)
    +
    +        # Check that a timestamp passed through a pandas_udf will not be 
altered by timezone calc
    +        identity = pandas_udf(lambda t: t, returnType=TimestampType())
    +        df = df.withColumn("timestamp_copy", identity(col("timestamp")))
    +
    +        @pandas_udf(returnType=BooleanType())
    +        def check_data(idx, date, timestamp, timestamp_copy):
    --- End diff --
    
    Yeah, it is a little strange, but the point is to test that the user will 
see the expected timestamps in the UDF.  We already test collect, so we know 
that works, but this is the easiest way I could think of to make sure the UDF 
values are also correct.


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to