This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push:
new 906df15 [SPARK-34703][PYSPARK][2.4] Fix pyspark test when using
sort_values on Pandas
906df15 is described below
commit 906df15f81c1e1c41a097f4230695da3a919227a
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Wed Mar 10 18:42:11 2021 -0800
[SPARK-34703][PYSPARK][2.4] Fix pyspark test when using sort_values on
Pandas
### What changes were proposed in this pull request?
This patch fixes a few PySpark test errors related to Pandas, in order to
restore the 2.4 Jenkins builds.
### Why are the changes needed?
Some Pandas APIs have changed since Pandas 0.24. If a name is both an index
level and a column label, `sort_values` raises an error.
Three PySpark tests are currently failing in the Jenkins 2.4 build:
`test_column_order`, `test_complex_groupby`, `test_udf_with_key`:
```
======================================================================
ERROR: test_column_order (pyspark.sql.tests.GroupedMapPandasUDFTests)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/spark/python/pyspark/sql/tests.py", line 5996, in test_column_order
expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line
4711, in sort_values
for x in by]
File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py",
line 1702, in _get_label_or_level_values
self._check_label_or_level_ambiguity(key, axis=axis)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py",
line 1656, in _check_label_or_level_ambiguity
raise ValueError(msg)
ValueError: 'id' is both an index level and a column label, which is
ambiguous.
======================================================================
ERROR: test_complex_groupby (pyspark.sql.tests.GroupedMapPandasUDFTests)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/spark/python/pyspark/sql/tests.py", line 5765, in
test_complex_groupby
expected = expected.sort_values(['id', 'v']).reset_index(drop=True)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line
4711, in sort_values
for x in by]
File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py",
line 1702, in _get_label_or_level_values
self._check_label_or_level_ambiguity(key, axis=axis)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py",
line 1656, in _check_label_or_level_ambiguity
raise ValueError(msg)
ValueError: 'id' is both an index level and a column label, which is
ambiguous.
======================================================================
ERROR: test_udf_with_key (pyspark.sql.tests.GroupedMapPandasUDFTests)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/spark/python/pyspark/sql/tests.py", line 5922, in test_udf_with_key
.sort_values(['id', 'v']).reset_index(drop=True)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line
4711, in sort_values
for x in by]
File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py",
line 1702, in _get_label_or_level_values
self._check_label_or_level_ambiguity(key, axis=axis)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py",
line 1656, in _check_label_or_level_ambiguity
raise ValueError(msg)
ValueError: 'id' is both an index level and a column label, which is
ambiguous.
```
### Does this PR introduce _any_ user-facing change?
No, dev only.
### How was this patch tested?
Verified by running the tests locally.
Closes #31803 from viirya/SPARK-34703.
Authored-by: Liang-Chi Hsieh <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
python/pyspark/sql/tests.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 70f3882..e3b8e19 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -5761,7 +5761,7 @@ class GroupedMapPandasUDFTests(ReusedSQLTestCase):
result = df.groupby(col('id') % 2 == 0).apply(normalize).sort('id',
'v').toPandas()
pdf = df.toPandas()
- expected = pdf.groupby(pdf['id'] % 2 == 0).apply(normalize.func)
+ expected = pdf.groupby(pdf['id'] % 2 == 0,
as_index=False).apply(normalize.func)
expected = expected.sort_values(['id', 'v']).reset_index(drop=True)
expected = expected.assign(norm=expected.norm.astype('float64'))
self.assertPandasEqual(expected, result)
@@ -5917,21 +5917,21 @@ class GroupedMapPandasUDFTests(ReusedSQLTestCase):
# Test groupby column
result1 = df.groupby('id').apply(udf1).sort('id', 'v').toPandas()
- expected1 = pdf.groupby('id')\
+ expected1 = pdf.groupby('id', as_index=False)\
.apply(lambda x: udf1.func((x.id.iloc[0],), x))\
.sort_values(['id', 'v']).reset_index(drop=True)
self.assertPandasEqual(expected1, result1)
# Test groupby expression
result2 = df.groupby(df.id % 2).apply(udf1).sort('id', 'v').toPandas()
- expected2 = pdf.groupby(pdf.id % 2)\
+ expected2 = pdf.groupby(pdf.id % 2, as_index=False)\
.apply(lambda x: udf1.func((x.id.iloc[0] % 2,), x))\
.sort_values(['id', 'v']).reset_index(drop=True)
self.assertPandasEqual(expected2, result2)
# Test complex groupby
result3 = df.groupby(df.id, df.v % 2).apply(udf2).sort('id',
'v').toPandas()
- expected3 = pdf.groupby([pdf.id, pdf.v % 2])\
+ expected3 = pdf.groupby([pdf.id, pdf.v % 2], as_index=False)\
.apply(lambda x: udf2.func((x.id.iloc[0], (x.v % 2).iloc[0],), x))\
.sort_values(['id', 'v']).reset_index(drop=True)
self.assertPandasEqual(expected3, result3)
@@ -5953,7 +5953,7 @@ class GroupedMapPandasUDFTests(ReusedSQLTestCase):
df = self.data
grouped_df = df.groupby('id')
- grouped_pdf = df.toPandas().groupby('id')
+ grouped_pdf = df.toPandas().groupby('id', as_index=False)
# Function returns a pdf with required column names, but order could
be arbitrary using dict
def change_col_order(pdf):
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]