Github user icexelloss commented on a diff in the pull request:
https://github.com/apache/spark/pull/18732#discussion_r141885324
--- Diff: python/pyspark/sql/tests.py ---
@@ -3376,6 +3377,74 @@ def test_vectorized_udf_empty_partition(self):
res = df.select(f(col('id')))
self.assertEquals(df.collect(), res.collect())
[email protected](not _have_pandas or not _have_arrow, "Pandas or Arrow not
installed")
+class GroupbyApplyTests(ReusedPySparkTestCase):
+ @classmethod
+ def setUpClass(cls):
+ ReusedPySparkTestCase.setUpClass()
+ cls.spark = SparkSession(cls.sc)
+
+ @classmethod
+ def tearDownClass(cls):
+ ReusedPySparkTestCase.tearDownClass()
+ cls.spark.stop()
+
+ def assertFramesEqual(self, expected, result):
+ msg = ("DataFrames are not equal: " +
+ ("\n\nExpected:\n%s\n%s" % (expected, expected.dtypes)) +
+ ("\n\nResult:\n%s\n%s" % (result, result.dtypes)))
+ self.assertTrue(expected.equals(result), msg=msg)
+
+ @property
+ def data(self):
+ from pyspark.sql.functions import pandas_udf, array, explode, col,
lit
+ return self.spark.range(10).toDF('id') \
+ .withColumn("vs", array([lit(i) for i in range(20, 30)])) \
+ .withColumn("v", explode(col('vs'))).drop('vs')
+
+ def test_groupby_apply_simple(self):
+ from pyspark.sql.functions import pandas_udf
+ df = self.data
+
+ def foo(df):
+ ret = df
+ ret = ret.assign(v1=df.v * df.id * 1.0)
+ ret = ret.assign(v2=df.v + df.id)
+ return ret
+
+ foo_udf = pandas_udf(
+ foo,
+ StructType(
+ [StructField('id', LongType()),
+ StructField('v', IntegerType()),
+ StructField('v1', DoubleType()),
+ StructField('v2', LongType())]))
--- End diff --
Yes the column names are specified in the returnType and the returnType
must be a `StructType`.
The rationale is that `apply()` is a mapping from a `pd.DataFrame` ->
pd.DataFrame, therefore the returnType must be a `StructType`.
This is the best way I can think of to specify the column names and
returnType; it makes sense to me because there should be a one-to-one mapping
between the return value of the function (a `pd.DataFrame`) and its schema (a
`StructType` containing column names and dataTypes).
Also, because `pd.DataFrame` doesn't support nested types, there is no
ambiguity as to whether a `StructType` indicates a `pd.DataFrame` or a nested
type.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]