Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/20211#discussion_r160583899
--- Diff: python/pyspark/sql/tests.py ---
@@ -3995,23 +3995,49 @@ def test_coerce(self):
self.assertFramesEqual(expected, result)
def test_complex_groupby(self):
+ import pandas as pd
from pyspark.sql.functions import pandas_udf, col, PandasUDFType
df = self.data
+ pdf = df.toPandas()
@pandas_udf(
- 'id long, v int, norm double',
+ 'v int, v2 double',
PandasUDFType.GROUP_MAP
)
- def normalize(pdf):
+ def foo(pdf):
v = pdf.v
- return pdf.assign(norm=(v - v.mean()) / v.std())
-
- result = df.groupby(col('id') % 2 == 0).apply(normalize).sort('id', 'v').toPandas()
- pdf = df.toPandas()
- expected = pdf.groupby(pdf['id'] % 2 == 0).apply(normalize.func)
- expected = expected.sort_values(['id', 'v']).reset_index(drop=True)
- expected = expected.assign(norm=expected.norm.astype('float64'))
- self.assertFramesEqual(expected, result)
+ return pd.DataFrame({'v': v + 1, 'v2': v - v.mean()})[:]
--- End diff --
By the way, why do we need to make a copy here (presumably the trailing `[:]`)?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]