Github user icexelloss commented on a diff in the pull request:
https://github.com/apache/spark/pull/19872#discussion_r162367851
--- Diff: python/pyspark/sql/tests.py ---
@@ -4279,6 +4273,425 @@ def test_unsupported_types(self):
df.groupby('id').apply(f).collect()
+@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
+class GroupbyAggPandasUDFTests(ReusedSQLTestCase):
+
+    @property
+    def data(self):
+        from pyspark.sql.functions import array, explode, col, lit
+        return self.spark.range(10).toDF('id') \
+            .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
+            .withColumn("v", explode(col('vs'))) \
+            .drop('vs') \
+            .withColumn('w', lit(1.0))
+
+    @property
+    def python_plus_one(self):
+        from pyspark.sql.functions import udf
+
+        @udf('double')
+        def plus_one(v):
+            assert isinstance(v, (int, float))
+            return v + 1
+        return plus_one
+
+    @property
+    def pandas_scalar_plus_two(self):
+        import pandas as pd
+        from pyspark.sql.functions import pandas_udf, PandasUDFType
+
+        @pandas_udf('double', PandasUDFType.SCALAR)
+        def plus_two(v):
+            assert isinstance(v, pd.Series)
+            return v + 2
+        return plus_two
+
+    @property
+    def pandas_agg_mean_udf(self):
+        from pyspark.sql.functions import pandas_udf, PandasUDFType
+
+        @pandas_udf('double', PandasUDFType.GROUP_AGG)
+        def avg(v):
+            return v.mean()
+        return avg
+
+    @property
+    def pandas_agg_sum_udf(self):
+        from pyspark.sql.functions import pandas_udf, PandasUDFType
+
+        @pandas_udf('double', PandasUDFType.GROUP_AGG)
+        def sum(v):
+            return v.sum()
+        return sum
+
+    @property
+    def pandas_agg_weighted_mean_udf(self):
+        import numpy as np
+        from pyspark.sql.functions import pandas_udf, PandasUDFType
+
+        @pandas_udf('double', PandasUDFType.GROUP_AGG)
+        def weighted_mean(v, w):
+            return np.average(v, weights=w)
+        return weighted_mean
+
+    def test_basic(self):
+        from pyspark.sql.functions import col, lit, sum, mean
+
+        df = self.data
+        weighted_mean_udf = self.pandas_agg_weighted_mean_udf
+
+        # Groupby one column and aggregate one UDF with literal
+        result1 = df.groupby('id').agg(weighted_mean_udf(df.v, lit(1.0))).sort('id')
+        expected1 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id')
--- End diff ---
Here `mean` is the Spark built-in aggregation function, so it is not a test target here. This is a behavior test that checks that an aggregate UDF produces the same result as the equivalent built-in function.
I generally prefer behavior tests over manual tests because they are more robust and we can write a lot of them. Maybe we can add one test against a manually computed expected result to be safe, what do you think?
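For example, such a manual test could hard-code expected values derived from the `data` fixture above (a minimal sketch, not from this PR; the test name and assertions are hypothetical):

```python
def test_manual(self):
    df = self.data  # every row has w == 1.0
    weighted_mean_udf = self.pandas_agg_weighted_mean_udf

    result = df.groupby('id').agg(weighted_mean_udf(df.v, df.w)).sort('id').collect()

    # For id == 0, v takes the values 20.0 .. 29.0 (lit(i * 1.0) + id),
    # and every weight is 1.0, so the weighted mean is the plain mean, 24.5.
    self.assertAlmostEqual(result[0][1], 24.5)
    # For id == 1 the values shift up by 1, giving 25.5.
    self.assertAlmostEqual(result[1][1], 25.5)
```

That way the built-in `mean` is cross-checked against at least one independently computed number.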
---