This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new b2b7cca [SPARK-30777][PYTHON][TESTS] Fix test failures for Pandas >= 1.0.0 b2b7cca is described below commit b2b7cca6dec575b578f093bc7caa80f1b9d7b170 Author: Bryan Cutler <cutl...@gmail.com> AuthorDate: Tue Feb 11 10:03:01 2020 +0900 [SPARK-30777][PYTHON][TESTS] Fix test failures for Pandas >= 1.0.0 ### What changes were proposed in this pull request? Fix PySpark test failures when using Pandas >= 1.0.0. ### Why are the changes needed? Pandas 1.0.0 has recently been released and has API changes that result in PySpark test failures; this PR fixes the broken tests. ### Does this PR introduce any user-facing change? No ### How was this patch tested? Manually tested with Pandas 1.0.1 and PyArrow 0.16.0 Closes #27529 from BryanCutler/pandas-fix-tests-1.0-SPARK-30777. Authored-by: Bryan Cutler <cutl...@gmail.com> Signed-off-by: HyukjinKwon <gurwls...@apache.org> (cherry picked from commit 07a9885f2792be1353f4a923d649e90bc431cb38) Signed-off-by: HyukjinKwon <gurwls...@apache.org> --- python/pyspark/sql/tests/test_arrow.py | 4 ++-- python/pyspark/sql/tests/test_pandas_grouped_map.py | 6 +++--- python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index 98f44df..004c79f 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -297,9 +297,9 @@ class ArrowTests(ReusedSQLTestCase): # Some series get converted for Spark to consume, this makes sure input is unchanged pdf = self.create_pandas_data_frame() # Use a nanosecond value to make sure it is not truncated - pdf.ix[0, '8_timestamp_t'] = pd.Timestamp(1) + pdf.iloc[0, 7] = pd.Timestamp(1) # Integers with nulls will get NaNs filled with 0 and will be casted - pdf.ix[1, '2_int_t'] = None + pdf.iloc[1, 1] = None pdf_copy = pdf.copy(deep=True) 
self.spark.createDataFrame(pdf, schema=self.schema) self.assertTrue(pdf.equals(pdf_copy)) diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index 51dd07f..ff53a0c 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -390,11 +390,11 @@ class GroupedMapInPandasTests(ReusedSQLTestCase): # Function returns a pdf with required column names, but order could be arbitrary using dict def change_col_order(pdf): # Constructing a DataFrame from a dict should result in the same order, - # but use from_items to ensure the pdf column order is different than schema - return pd.DataFrame.from_items([ + # but use OrderedDict to ensure the pdf column order is different than schema + return pd.DataFrame.from_dict(OrderedDict([ ('id', pdf.id), ('u', pdf.v * 2), - ('v', pdf.v)]) + ('v', pdf.v)])) ordered_udf = pandas_udf( change_col_order, diff --git a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py index 974ad56..2167978 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py +++ b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py @@ -357,7 +357,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase): plus_one(sum_udf(col('v1'))), sum_udf(plus_one(col('v2')))) .sort(['id', '(v % 2)']) - .toPandas().sort_index(by=['id', '(v % 2)'])) + .toPandas().sort_values(by=['id', '(v % 2)'])) expected1 = (df.withColumn('v1', df.v + 1) .withColumn('v2', df.v + 2) @@ -368,7 +368,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase): plus_one(sum(col('v1'))), sum(plus_one(col('v2')))) .sort(['id', '(v % 2)']) - .toPandas().sort_index(by=['id', '(v % 2)'])) + .toPandas().sort_values(by=['id', '(v % 2)'])) # Test complex expressions with sql expression, scala pandas UDF and # group aggregate pandas UDF @@ -381,7 +381,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase): 
plus_two(sum_udf(col('v1'))), sum_udf(plus_two(col('v2')))) .sort(['id', '(v % 2)']) - .toPandas().sort_index(by=['id', '(v % 2)'])) + .toPandas().sort_values(by=['id', '(v % 2)'])) expected2 = (df.withColumn('v1', df.v + 1) .withColumn('v2', df.v + 2) @@ -392,7 +392,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase): plus_two(sum(col('v1'))), sum(plus_two(col('v2')))) .sort(['id', '(v % 2)']) - .toPandas().sort_index(by=['id', '(v % 2)'])) + .toPandas().sort_values(by=['id', '(v % 2)'])) # Test sequential groupby aggregate result3 = (df.groupby('id') --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org