[GitHub] spark pull request #21650: [SPARK-24624][SQL][PYTHON] Support mixture of Pyt...

HyukjinKwon Wed, 25 Jul 2018 18:55:06 -0700

Github user HyukjinKwon commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21650#discussion_r205311130
  
    --- Diff: python/pyspark/sql/tests.py ---
    @@ -5060,6 +5049,147 @@ def test_type_annotation(self):
             df = self.spark.range(1).select(pandas_udf(f=_locals['noop'], 
returnType='bigint')('id'))
             self.assertEqual(df.first()[0], 0)
     
    +    def test_mixed_udf(self):
    +        import pandas as pd
    +        from pyspark.sql.functions import col, udf, pandas_udf
    +
    +        df = self.spark.range(0, 1).toDF('v')
    +
    +        # Test mixture of multiple UDFs and Pandas UDFs
    +
    +        @udf('int')
    +        def f1(x):
    +            assert type(x) == int
    +            return x + 1
    +
    +        @pandas_udf('int')
    +        def f2(x):
    +            assert type(x) == pd.Series
    +            return x + 10
    +
    +        @udf('int')
    +        def f3(x):
    +            assert type(x) == int
    +            return x + 100
    +
    +        @pandas_udf('int')
    +        def f4(x):
    +            assert type(x) == pd.Series
    +            return x + 1000
    +
    +        # Test mixed udfs in a single projection
    +        df1 = df \
    +            .withColumn('f1', f1(col('v'))) \
    +            .withColumn('f2', f2(col('v'))) \
    +            .withColumn('f3', f3(col('v'))) \
    +            .withColumn('f4', f4(col('v'))) \
    +            .withColumn('f2_f1', f2(col('f1'))) \
    +            .withColumn('f3_f1', f3(col('f1'))) \
    +            .withColumn('f4_f1', f4(col('f1'))) \
    +            .withColumn('f3_f2', f3(col('f2'))) \
    +            .withColumn('f4_f2', f4(col('f2'))) \
    +            .withColumn('f4_f3', f4(col('f3'))) \
    +            .withColumn('f3_f2_f1', f3(col('f2_f1'))) \
    +            .withColumn('f4_f2_f1', f4(col('f2_f1'))) \
    +            .withColumn('f4_f3_f1', f4(col('f3_f1'))) \
    +            .withColumn('f4_f3_f2', f4(col('f3_f2'))) \
    +            .withColumn('f4_f3_f2_f1', f4(col('f3_f2_f1')))
    +
    +        # Test mixed udfs in a single expression
    +        df2 = df \
    +            .withColumn('f1', f1(col('v'))) \
    +            .withColumn('f2', f2(col('v'))) \
    +            .withColumn('f3', f3(col('v'))) \
    +            .withColumn('f4', f4(col('v'))) \
    +            .withColumn('f2_f1', f2(f1(col('v')))) \
    +            .withColumn('f3_f1', f3(f1(col('v')))) \
    +            .withColumn('f4_f1', f4(f1(col('v')))) \
    +            .withColumn('f3_f2', f3(f2(col('v')))) \
    +            .withColumn('f4_f2', f4(f2(col('v')))) \
    +            .withColumn('f4_f3', f4(f3(col('v')))) \
    +            .withColumn('f3_f2_f1', f3(f2(f1(col('v'))))) \
    +            .withColumn('f4_f2_f1', f4(f2(f1(col('v'))))) \
    +            .withColumn('f4_f3_f1', f4(f3(f1(col('v'))))) \
    +            .withColumn('f4_f3_f2', f4(f3(f2(col('v'))))) \
    +            .withColumn('f4_f3_f2_f1', f4(f3(f2(f1(col('v'))))))
    +
    +        # expected result
    +        df3 = df \
    +            .withColumn('f1', df['v'] + 1) \
    +            .withColumn('f2', df['v'] + 10) \
    +            .withColumn('f3', df['v'] + 100) \
    +            .withColumn('f4', df['v'] + 1000) \
    +            .withColumn('f2_f1', df['v'] + 11) \
    +            .withColumn('f3_f1', df['v'] + 101) \
    +            .withColumn('f4_f1', df['v'] + 1001) \
    +            .withColumn('f3_f2', df['v'] + 110) \
    +            .withColumn('f4_f2', df['v'] + 1010) \
    +            .withColumn('f4_f3', df['v'] + 1100) \
    +            .withColumn('f3_f2_f1', df['v'] + 111) \
    +            .withColumn('f4_f2_f1', df['v'] + 1011) \
    +            .withColumn('f4_f3_f1', df['v'] + 1101) \
    +            .withColumn('f4_f3_f2', df['v'] + 1110) \
    +            .withColumn('f4_f3_f2_f1', df['v'] + 1111)
    +
    +        self.assertEquals(df3.collect(), df1.collect())
    +        self.assertEquals(df3.collect(), df2.collect())
    +
    +    def test_mixed_udf_and_sql(self):
    +        import pandas as pd
    +        from pyspark.sql.functions import udf, pandas_udf
    +
    +        df = self.spark.range(0, 1).toDF('v')
    +
    +        # Test mixture of UDFs, Pandas UDFs and SQL expression.
    +
    +        @udf('int')
    +        def f1(x):
    +            assert type(x) == int
    +            return x + 1
    +
    +        def f2(x):
    +            return x + 10
    +
    +        @pandas_udf('int')
    +        def f3(x):
    +            assert type(x) == pd.Series
    +            return x + 100
    +
    +        df1 = df.withColumn('f1', f1(df['v'])) \
    +            .withColumn('f2', f2(df['v'])) \
    +            .withColumn('f3', f3(df['v'])) \
    +            .withColumn('f1_f2', f1(f2(df['v']))) \
    +            .withColumn('f1_f3', f1(f3(df['v']))) \
    +            .withColumn('f2_f1', f2(f1(df['v']))) \
    +            .withColumn('f2_f3', f2(f3(df['v']))) \
    +            .withColumn('f3_f1', f3(f1(df['v']))) \
    --- End diff --
    
    I am okay with leaving it here too, since it's clear they are virtually the 
same, but let's remove duplicated or orthogonal tests next time.



---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #21650: [SPARK-24624][SQL][PYTHON] Support mixture of Pyt...

Reply via email to