This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new ac5ec646292 [SPARK-38821][PYTHON] Skip nsmall/nlarge nan test under pandas 1.4.[0,1,2] ac5ec646292 is described below commit ac5ec646292c9738c29b5e0b5d29a95ce593dbc6 Author: Yikun Jiang <yikunk...@gmail.com> AuthorDate: Tue Apr 26 19:14:20 2022 +0900 [SPARK-38821][PYTHON] Skip nsmall/nlarge nan test under pandas 1.4.[0,1,2] ### What changes were proposed in this pull request? Skip the nsmallest/nlargest NaN tests under pandas 1.4.[0,1,2]. pandas returns wrong results when ``np.nan`` is in the sorting column, starting with https://github.com/pandas-dev/pandas/commit/16d2f59589cfc09c2754c988497c27b56cb169c4 (v1.4.0). I confirmed that this issue is fixed by: https://github.com/pandas-dev/pandas/commit/28863884390468073b2522b6be23199d97d1eab1 ### Why are the changes needed? The affected pandas versions return wrong results for these cases, so comparing against pandas is not meaningful there. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed Closes #36356 from Yikun/SPARK-38821. Authored-by: Yikun Jiang <yikunk...@gmail.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/pandas/tests/test_dataframe.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py index 5fa6919c129..008da92c9a9 100644 --- a/python/pyspark/pandas/tests/test_dataframe.py +++ b/python/pyspark/pandas/tests/test_dataframe.py @@ -1814,8 +1814,12 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils): index=np.random.rand(7), ) psdf = ps.from_pandas(pdf) - self.assert_eq(psdf.nlargest(5, columns="a"), pdf.nlargest(5, columns="a")) - self.assert_eq(psdf.nlargest(5, columns=["a", "b"]), pdf.nlargest(5, columns=["a", "b"])) + # see also: https://github.com/pandas-dev/pandas/issues/46589 + if not (LooseVersion("1.4.0") <= LooseVersion(pd.__version__) <= LooseVersion("1.4.2")): + self.assert_eq(psdf.nlargest(5, columns="a"), pdf.nlargest(5, columns="a")) + self.assert_eq( 
psdf.nlargest(5, columns=["a", "b"]), pdf.nlargest(5, columns=["a", "b"]) + ) self.assert_eq(psdf.nlargest(5, columns=["c"]), pdf.nlargest(5, columns=["c"])) self.assert_eq( psdf.nlargest(5, columns=["c"], keep="first"), @@ -1838,10 +1842,12 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils): index=np.random.rand(7), ) psdf = ps.from_pandas(pdf) - self.assert_eq(psdf.nsmallest(n=5, columns="a"), pdf.nsmallest(5, columns="a")) - self.assert_eq( - psdf.nsmallest(n=5, columns=["a", "b"]), pdf.nsmallest(5, columns=["a", "b"]) - ) + # see also: https://github.com/pandas-dev/pandas/issues/46589 + if not (LooseVersion("1.4.0") <= LooseVersion(pd.__version__) <= LooseVersion("1.4.2")): + self.assert_eq(psdf.nsmallest(n=5, columns="a"), pdf.nsmallest(5, columns="a")) + self.assert_eq( + psdf.nsmallest(n=5, columns=["a", "b"]), pdf.nsmallest(5, columns=["a", "b"]) + ) self.assert_eq(psdf.nsmallest(n=5, columns=["c"]), pdf.nsmallest(5, columns=["c"])) self.assert_eq( psdf.nsmallest(n=5, columns=["c"], keep="first"), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org