GitHub user icexelloss commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21650#discussion_r205243011
  
    --- Diff: python/pyspark/sql/tests.py ---
    @@ -5060,6 +5049,147 @@ def test_type_annotation(self):
             df = self.spark.range(1).select(pandas_udf(f=_locals['noop'], returnType='bigint')('id'))
             self.assertEqual(df.first()[0], 0)
     
    +    def test_mixed_udf(self):
    +        import pandas as pd
    +        from pyspark.sql.functions import col, udf, pandas_udf
    +
    +        df = self.spark.range(0, 1).toDF('v')
    +
    +        # Test mixture of multiple UDFs and Pandas UDFs
    +
    +        @udf('int')
    +        def f1(x):
    +            assert type(x) == int
    +            return x + 1
    +
    +        @pandas_udf('int')
    +        def f2(x):
    +            assert type(x) == pd.Series
    +            return x + 10
    +
    +        @udf('int')
    +        def f3(x):
    +            assert type(x) == int
    +            return x + 100
    +
    +        @pandas_udf('int')
    +        def f4(x):
    +            assert type(x) == pd.Series
    +            return x + 1000
    +
    +        # Test mixed udfs in a single projection
    +        df1 = df \
    +            .withColumn('f1', f1(col('v'))) \
    +            .withColumn('f2', f2(col('v'))) \
    +            .withColumn('f3', f3(col('v'))) \
    +            .withColumn('f4', f4(col('v'))) \
    +            .withColumn('f2_f1', f2(col('f1'))) \
    +            .withColumn('f3_f1', f3(col('f1'))) \
    +            .withColumn('f4_f1', f4(col('f1'))) \
    +            .withColumn('f3_f2', f3(col('f2'))) \
    +            .withColumn('f4_f2', f4(col('f2'))) \
    +            .withColumn('f4_f3', f4(col('f3'))) \
    +            .withColumn('f3_f2_f1', f3(col('f2_f1'))) \
    +            .withColumn('f4_f2_f1', f4(col('f2_f1'))) \
    +            .withColumn('f4_f3_f1', f4(col('f3_f1'))) \
    +            .withColumn('f4_f3_f2', f4(col('f3_f2'))) \
    +            .withColumn('f4_f3_f2_f1', f4(col('f3_f2_f1')))
    +
    +        # Test mixed udfs in a single expression
    +        df2 = df \
    +            .withColumn('f1', f1(col('v'))) \
    +            .withColumn('f2', f2(col('v'))) \
    +            .withColumn('f3', f3(col('v'))) \
    +            .withColumn('f4', f4(col('v'))) \
    +            .withColumn('f2_f1', f2(f1(col('v')))) \
    +            .withColumn('f3_f1', f3(f1(col('v')))) \
    +            .withColumn('f4_f1', f4(f1(col('v')))) \
    +            .withColumn('f3_f2', f3(f2(col('v')))) \
    +            .withColumn('f4_f2', f4(f2(col('v')))) \
    +            .withColumn('f4_f3', f4(f3(col('v')))) \
    +            .withColumn('f3_f2_f1', f3(f2(f1(col('v'))))) \
    +            .withColumn('f4_f2_f1', f4(f2(f1(col('v'))))) \
    +            .withColumn('f4_f3_f1', f4(f3(f1(col('v'))))) \
    +            .withColumn('f4_f3_f2', f4(f3(f2(col('v'))))) \
    +            .withColumn('f4_f3_f2_f1', f4(f3(f2(f1(col('v'))))))
    +
    +        # expected result
    +        df3 = df \
    +            .withColumn('f1', df['v'] + 1) \
    +            .withColumn('f2', df['v'] + 10) \
    +            .withColumn('f3', df['v'] + 100) \
    +            .withColumn('f4', df['v'] + 1000) \
    +            .withColumn('f2_f1', df['v'] + 11) \
    +            .withColumn('f3_f1', df['v'] + 101) \
    +            .withColumn('f4_f1', df['v'] + 1001) \
    +            .withColumn('f3_f2', df['v'] + 110) \
    +            .withColumn('f4_f2', df['v'] + 1010) \
    +            .withColumn('f4_f3', df['v'] + 1100) \
    +            .withColumn('f3_f2_f1', df['v'] + 111) \
    +            .withColumn('f4_f2_f1', df['v'] + 1011) \
    +            .withColumn('f4_f3_f1', df['v'] + 1101) \
    +            .withColumn('f4_f3_f2', df['v'] + 1110) \
    +            .withColumn('f4_f3_f2_f1', df['v'] + 1111)
    +
    +        self.assertEquals(df3.collect(), df1.collect())
    +        self.assertEquals(df3.collect(), df2.collect())
    +
    +    def test_mixed_udf_and_sql(self):
    +        import pandas as pd
    +        from pyspark.sql.functions import udf, pandas_udf
    +
    +        df = self.spark.range(0, 1).toDF('v')
    +
    +        # Test mixture of UDFs, Pandas UDFs and SQL expression.
    +
    +        @udf('int')
    +        def f1(x):
    +            assert type(x) == int
    +            return x + 1
    +
    +        def f2(x):
    +            return x + 10
    +
    +        @pandas_udf('int')
    +        def f3(x):
    +            assert type(x) == pd.Series
    +            return x + 100
    +
    +        df1 = df.withColumn('f1', f1(df['v'])) \
    +            .withColumn('f2', f2(df['v'])) \
    +            .withColumn('f3', f3(df['v'])) \
    +            .withColumn('f1_f2', f1(f2(df['v']))) \
    +            .withColumn('f1_f3', f1(f3(df['v']))) \
    +            .withColumn('f2_f1', f2(f1(df['v']))) \
    +            .withColumn('f2_f3', f2(f3(df['v']))) \
    +            .withColumn('f3_f1', f3(f1(df['v']))) \
    --- End diff --
    
    I see. I don't think it's necessary: we would only remove a few cases
    and, as you said, the test time is virtually the same. Keeping the full
    set also helps the readability of the tests, so it doesn't look like some
    test cases were missed.
    
    But if that's the preferred practice, I can remove the duplicate cases in
    the next commit.
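    
    To make concrete what removing the duplicates could look like, here is a
    sketch only (reusing df, f1-f4 and the col import from the test above, and
    assuming the duplicates you have in mind are the chains that repeat the
    same udf/pandas_udf ordering): the trimmed version could keep one
    representative per ordering.
    
        # One representative per combination: f3(f1) covers udf-after-udf,
        # f2(f1) pandas_udf-after-udf, f3(f2) udf-after-pandas_udf,
        # f4(f2) pandas_udf-after-pandas_udf.
        df_trimmed = df \
            .withColumn('f3_f1', f3(f1(col('v')))) \
            .withColumn('f2_f1', f2(f1(col('v')))) \
            .withColumn('f3_f2', f3(f2(col('v')))) \
            .withColumn('f4_f2', f4(f2(col('v'))))
    
        # Expected values follow from the increments: f1 +1, f2 +10,
        # f3 +100, f4 +1000.
        expected = df \
            .withColumn('f3_f1', df['v'] + 101) \
            .withColumn('f2_f1', df['v'] + 11) \
            .withColumn('f3_f2', df['v'] + 110) \
            .withColumn('f4_f2', df['v'] + 1010)
    
        self.assertEquals(expected.collect(), df_trimmed.collect())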


---
