Github user icexelloss commented on a diff in the pull request:
https://github.com/apache/spark/pull/20211#discussion_r160862182
--- Diff: python/pyspark/sql/tests.py ---
@@ -3995,23 +3995,49 @@ def test_coerce(self):
         self.assertFramesEqual(expected, result)
 
     def test_complex_groupby(self):
+        import pandas as pd
         from pyspark.sql.functions import pandas_udf, col, PandasUDFType
         df = self.data
+        pdf = df.toPandas()
 
         @pandas_udf(
-            'id long, v int, norm double',
+            'v int, v2 double',
             PandasUDFType.GROUP_MAP
         )
-        def normalize(pdf):
+        def foo(pdf):
             v = pdf.v
-            return pdf.assign(norm=(v - v.mean()) / v.std())
-
-        result = df.groupby(col('id') % 2 == 0).apply(normalize).sort('id', 'v').toPandas()
-        pdf = df.toPandas()
-        expected = pdf.groupby(pdf['id'] % 2 == 0).apply(normalize.func)
-        expected = expected.sort_values(['id', 'v']).reset_index(drop=True)
-        expected = expected.assign(norm=expected.norm.astype('float64'))
-        self.assertFramesEqual(expected, result)
+            return pd.DataFrame({'v': v + 1, 'v2': v - v.mean()})[:]
--- End diff ---
This is just to simplify the test - pandas has very complicated behavior
when it comes to the index of the value returned from `groupby().apply()`.
If you're interested, take a look at
http://nbviewer.jupyter.org/gist/mbirdi/05f8a83d340476e5f03a
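
As a quick illustration, here is a minimal standalone sketch (not from the PR;
the toy data is made up, and the exact output can also shift between pandas
versions) of how the result index of a plain pandas `groupby().apply()` depends
on what the applied function returns:

```python
import pandas as pd

# Hypothetical toy frame, only to illustrate the comment above.
pdf = pd.DataFrame({'id': [0, 0, 1, 1], 'v': [1.0, 2.0, 3.0, 4.0]})

# Case 1: the function returns the group frame, keeping the group's own index.
r1 = pdf.groupby(pdf['id'] % 2 == 0).apply(
    lambda g: g.assign(norm=(g.v - g.v.mean()) / g.v.std()))

# Case 2: the function returns a newly constructed frame.
r2 = pdf.groupby(pdf['id'] % 2 == 0).apply(
    lambda g: pd.DataFrame({'v': g.v + 1, 'v2': g.v - g.v.mean()}))

# The two results need not have the same index shape, which is what makes
# building the expected value with pandas groupby/apply awkward in the test.
print(r1.index)
print(r2.index)
```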
---