[spark] 01/06: [SPARK-36345][SPARK-36367][INFRA][PYTHON] Disable tests failed by the incompatible behavior of pandas 1.3

gurwls223 Thu, 26 Aug 2021 18:04:50 -0700

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


commit cb075b5301e08b9d9b06f3d33a41b3d63d95378e
Author: Takuya UESHIN <[email protected]>
AuthorDate: Tue Aug 3 14:02:18 2021 +0900

    [SPARK-36345][SPARK-36367][INFRA][PYTHON] Disable tests failed by the 
incompatible behavior of pandas 1.3
    
    Disable tests failed by the incompatible behavior of pandas 1.3.
    
    Pandas 1.3 has been released.
    There are some behavior changes and we should follow it, but it's not ready 
yet.
    
    No.
    
    Disabled some tests related to the behavior change.
    
    Closes #33598 from ueshin/issues/SPARK-36367/disable_tests.
    
    Authored-by: Takuya UESHIN <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
    (cherry picked from commit 8cb9cf39b6a1899175aeaefb2a85480f5a514aac)
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 .github/workflows/build_and_test.yml               |  4 +-
 python/pyspark/pandas/groupby.py                   |  8 +++
 .../tests/data_type_ops/test_categorical_ops.py    |  6 +-
 python/pyspark/pandas/tests/indexes/test_base.py   | 76 +++++++++++---------
 .../pyspark/pandas/tests/indexes/test_category.py  |  5 +-
 python/pyspark/pandas/tests/test_categorical.py    | 82 ++++++++++++++++++----
 python/pyspark/pandas/tests/test_expanding.py      | 51 ++++++++------
 .../test_ops_on_diff_frames_groupby_expanding.py   | 13 ++--
 .../test_ops_on_diff_frames_groupby_rolling.py     | 14 ++--
 python/pyspark/pandas/tests/test_rolling.py        | 52 ++++++++------
 python/pyspark/pandas/tests/test_series.py         | 16 +++--
 11 files changed, 222 insertions(+), 105 deletions(-)

diff --git a/.github/workflows/build_and_test.yml 
b/.github/workflows/build_and_test.yml
index b518f87..7fc99ef 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -149,7 +149,7 @@ jobs:
     name: "Build modules: ${{ matrix.modules }}"
     runs-on: ubuntu-20.04
     container:
-      image: dongjoon/apache-spark-github-action-image:20210602
+      image: dongjoon/apache-spark-github-action-image:20210730
     strategy:
       fail-fast: false
       matrix:
@@ -227,8 +227,6 @@ jobs:
     # Run the tests.
     - name: Run tests
       run: |
-        # TODO(SPARK-36345): Install mlflow>=1.0 and sklearn in Python 3.9 of 
the base image
-        python3.9 -m pip install 'mlflow>=1.0' sklearn
         export PATH=$PATH:$HOME/miniconda/bin
         ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST"
     - name: Upload test results to report
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 70ece9c..faa1de6 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -20,6 +20,7 @@ A wrapper for GroupedData to behave similar to pandas GroupBy.
 """
 
 from abc import ABCMeta, abstractmethod
+import builtins
 import sys
 import inspect
 from collections import OrderedDict, namedtuple
@@ -43,6 +44,7 @@ from typing import (
     TYPE_CHECKING,
 )
 
+import numpy as np
 import pandas as pd
 from pandas.api.types import is_hashable, is_list_like
 
@@ -102,6 +104,12 @@ if TYPE_CHECKING:
 # to keep it the same as pandas
 NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
 
+_builtin_table = {
+    builtins.sum: np.sum,
+    builtins.max: np.max,
+    builtins.min: np.min,
+}  # type: Dict[Callable, Callable]
+
 
 class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
     """
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py 
b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py
index 6ac9073..11871ea 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_categorical_ops.py
@@ -190,8 +190,12 @@ class CategoricalOpsTest(PandasOnSparkTestCase, 
TestCasesUtils):
         self.assert_eq(pser.astype(str), psser.astype(str))
         self.assert_eq(pser.astype(bool), psser.astype(bool))
         self.assert_eq(pser.astype("category"), psser.astype("category"))
+
         cat_type = CategoricalDtype(categories=[3, 1, 2])
-        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
             self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
         else:
             self.assert_eq(pd.Series(data).astype(cat_type), 
psser.astype(cat_type))
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py 
b/python/pyspark/pandas/tests/indexes/test_base.py
index 8238b67..39e22bd 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -1478,25 +1478,30 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
             psidx2 = ps.from_pandas(pidx2)
 
             self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2))
-            self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
             self.assert_eq(
                 psidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 
6]), almost=True
             )
             self.assert_eq(
-                psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
-                pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
-                almost=True,
-            )
-            self.assert_eq(
                 psidx1.union(ps.Series([3, 4, 3, 3, 5, 6])),
                 pidx1.union(pd.Series([3, 4, 3, 4, 5, 6])),
                 almost=True,
             )
-            self.assert_eq(
-                psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])),
-                pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])),
-                almost=True,
-            )
+
+            if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+                # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+                pass
+            else:
+                self.assert_eq(psidx2.union(psidx1), pidx2.union(pidx1))
+                self.assert_eq(
+                    psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
+                    pidx2.union([1, 2, 3, 4, 3, 4, 3, 4]),
+                    almost=True,
+                )
+                self.assert_eq(
+                    psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])),
+                    pidx2.union(pd.Series([1, 2, 3, 4, 3, 4, 3, 4])),
+                    almost=True,
+                )
 
         # MultiIndex
         pmidx1 = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", 
"a"), ("x", "b")])
@@ -1508,30 +1513,37 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils):
         psmidx3 = ps.from_pandas(pmidx3)
         psmidx4 = ps.from_pandas(pmidx4)
 
-        self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2))
-        self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1))
-        self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4))
-        self.assert_eq(psmidx4.union(psmidx3), pmidx4.union(pmidx3))
-        self.assert_eq(
-            psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
-            pmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
-        )
-        self.assert_eq(
-            psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
-            pmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
-        )
-        self.assert_eq(
-            psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
-            pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
-        )
-        self.assert_eq(
-            psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
-            pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
-        )
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(psmidx1.union(psmidx2), pmidx1.union(pmidx2))
+            self.assert_eq(psmidx2.union(psmidx1), pmidx2.union(pmidx1))
+            self.assert_eq(psmidx3.union(psmidx4), pmidx3.union(pmidx4))
+            self.assert_eq(psmidx4.union(psmidx3), pmidx4.union(pmidx3))
+            self.assert_eq(
+                psmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", 
"d")]),
+                pmidx1.union([("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")]),
+            )
+            self.assert_eq(
+                psmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", 
"b")]),
+                pmidx2.union([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]),
+            )
+            self.assert_eq(
+                psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
+                pmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)]),
+            )
+            self.assert_eq(
+                psmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 
4)]),
+                pmidx4.union([(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)]),
+            )
 
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
         # Testing if the result is correct after sort=False.
         # The `sort` argument is added in pandas 0.24.
-        if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
+        elif LooseVersion(pd.__version__) >= LooseVersion("0.24"):
             self.assert_eq(
                 psmidx1.union(psmidx2, sort=False).sort_values(),
                 pmidx1.union(pmidx2, sort=False).sort_values(),
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py 
b/python/pyspark/pandas/tests/indexes/test_category.py
index 37216bd..f241918 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -176,7 +176,10 @@ class CategoricalIndexTest(PandasOnSparkTestCase, 
TestUtils):
 
         self.assert_eq(kcidx.astype("category"), pcidx.astype("category"))
 
-        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
             self.assert_eq(
                 kcidx.astype(CategoricalDtype(["b", "c", "a"])),
                 pcidx.astype(CategoricalDtype(["b", "c", "a"])),
diff --git a/python/pyspark/pandas/tests/test_categorical.py 
b/python/pyspark/pandas/tests/test_categorical.py
index 67cdf3c..1335d59 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -73,7 +73,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
 
         pser.cat.categories = ["z", "y", "x"]
         psser.cat.categories = ["z", "y", "x"]
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         with self.assertRaises(ValueError):
@@ -91,7 +95,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
 
         pser.cat.add_categories(4, inplace=True)
         psser.cat.add_categories(4, inplace=True)
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         self.assertRaises(ValueError, lambda: psser.cat.add_categories(4))
@@ -115,7 +123,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
 
         pser.cat.remove_categories(2, inplace=True)
         psser.cat.remove_categories(2, inplace=True)
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
@@ -138,7 +150,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
 
         pser.cat.remove_unused_categories(inplace=True)
         psser.cat.remove_unused_categories(inplace=True)
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
     def test_reorder_categories(self):
@@ -164,12 +180,20 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
 
         pser.cat.reorder_categories([1, 2, 3], inplace=True)
         psser.cat.reorder_categories([1, 2, 3], inplace=True)
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         pser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
         psser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1, 
2]))
@@ -189,7 +213,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
 
         pser.cat.as_ordered(inplace=True)
         psser.cat.as_ordered(inplace=True)
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         # as_unordered
@@ -215,7 +243,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
 
         self.assert_eq(kcser.astype("category"), pcser.astype("category"))
 
-        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
             self.assert_eq(
                 kcser.astype(CategoricalDtype(["b", "c", "a"])),
                 pcser.astype(CategoricalDtype(["b", "c", "a"])),
@@ -419,7 +450,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
         def astype(x) -> ps.Series[dtype]:
             return x.astype(dtype)
 
-        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
             self.assert_eq(
                 
psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
                 
pdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
@@ -637,17 +671,29 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
 
         pser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
         psser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         pser.cat.rename_categories(lambda x: x.upper(), inplace=True)
         psser.cat.rename_categories(lambda x: x.upper(), inplace=True)
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         pser.cat.rename_categories([0, 1, 3, 2], inplace=True)
         psser.cat.rename_categories([0, 1, 3, 2], inplace=True)
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         self.assertRaisesRegex(
@@ -717,12 +763,20 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
             pser.cat.set_categories(["a", "c", "b", "o"], inplace=True, 
rename=True),
             psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, 
rename=True),
         )
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
         psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
-        self.assert_eq(pser, psser)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         self.assertRaisesRegex(
diff --git a/python/pyspark/pandas/tests/test_expanding.py 
b/python/pyspark/pandas/tests/test_expanding.py
index 57b4e48..2cd5e52 100644
--- a/python/pyspark/pandas/tests/test_expanding.py
+++ b/python/pyspark/pandas/tests/test_expanding.py
@@ -145,18 +145,24 @@ class ExpandingTest(PandasOnSparkTestCase, TestUtils):
 
         pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0, 2.0], "b": [4.0, 2.0, 3.0, 
1.0]})
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(
-            getattr(psdf.groupby(psdf.a).expanding(2), f)().sort_index(),
-            getattr(pdf.groupby(pdf.a).expanding(2), f)().sort_index(),
-        )
-        self.assert_eq(
-            getattr(psdf.groupby(psdf.a).expanding(2), f)().sum(),
-            getattr(pdf.groupby(pdf.a).expanding(2), f)().sum(),
-        )
-        self.assert_eq(
-            getattr(psdf.groupby(psdf.a + 1).expanding(2), f)().sort_index(),
-            getattr(pdf.groupby(pdf.a + 1).expanding(2), f)().sort_index(),
-        )
+
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(
+                getattr(psdf.groupby(psdf.a).expanding(2), f)().sort_index(),
+                getattr(pdf.groupby(pdf.a).expanding(2), f)().sort_index(),
+            )
+            self.assert_eq(
+                getattr(psdf.groupby(psdf.a).expanding(2), f)().sum(),
+                getattr(pdf.groupby(pdf.a).expanding(2), f)().sum(),
+            )
+            self.assert_eq(
+                getattr(psdf.groupby(psdf.a + 1).expanding(2), 
f)().sort_index(),
+                getattr(pdf.groupby(pdf.a + 1).expanding(2), f)().sort_index(),
+            )
+
         self.assert_eq(
             getattr(psdf.b.groupby(psdf.a).expanding(2), f)().sort_index(),
             getattr(pdf.b.groupby(pdf.a).expanding(2), f)().sort_index(),
@@ -174,15 +180,20 @@ class ExpandingTest(PandasOnSparkTestCase, TestUtils):
         columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
         pdf.columns = columns
         psdf.columns = columns
-        self.assert_eq(
-            getattr(psdf.groupby(("a", "x")).expanding(2), f)().sort_index(),
-            getattr(pdf.groupby(("a", "x")).expanding(2), f)().sort_index(),
-        )
 
-        self.assert_eq(
-            getattr(psdf.groupby([("a", "x"), ("a", "y")]).expanding(2), 
f)().sort_index(),
-            getattr(pdf.groupby([("a", "x"), ("a", "y")]).expanding(2), 
f)().sort_index(),
-        )
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(
+                getattr(psdf.groupby(("a", "x")).expanding(2), 
f)().sort_index(),
+                getattr(pdf.groupby(("a", "x")).expanding(2), 
f)().sort_index(),
+            )
+
+            self.assert_eq(
+                getattr(psdf.groupby([("a", "x"), ("a", "y")]).expanding(2), 
f)().sort_index(),
+                getattr(pdf.groupby([("a", "x"), ("a", "y")]).expanding(2), 
f)().sort_index(),
+            )
 
     def test_groupby_expanding_count(self):
         # The behaviour of ExpandingGroupby.count are different between 
pandas>=1.0.0 and lower,
diff --git 
a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py 
b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py
index c6a2852..223adea 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py
@@ -52,10 +52,15 @@ class 
OpsOnDiffFramesGroupByExpandingTest(PandasOnSparkTestCase, TestUtils):
         psdf = ps.from_pandas(pdf)
         kkey = ps.from_pandas(pkey)
 
-        self.assert_eq(
-            getattr(psdf.groupby(kkey).expanding(2), f)().sort_index(),
-            getattr(pdf.groupby(pkey).expanding(2), f)().sort_index(),
-        )
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(
+                getattr(psdf.groupby(kkey).expanding(2), f)().sort_index(),
+                getattr(pdf.groupby(pkey).expanding(2), f)().sort_index(),
+            )
+
         self.assert_eq(
             getattr(psdf.groupby(kkey)["b"].expanding(2), f)().sort_index(),
             getattr(pdf.groupby(pkey)["b"].expanding(2), f)().sort_index(),
diff --git 
a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py 
b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
index 306a081..4f97769 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from distutils.version import LooseVersion
 
 import pandas as pd
 
@@ -49,10 +50,15 @@ class 
OpsOnDiffFramesGroupByRollingTest(PandasOnSparkTestCase, TestUtils):
         psdf = ps.from_pandas(pdf)
         kkey = ps.from_pandas(pkey)
 
-        self.assert_eq(
-            getattr(psdf.groupby(kkey).rolling(2), f)().sort_index(),
-            getattr(pdf.groupby(pkey).rolling(2), f)().sort_index(),
-        )
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(
+                getattr(psdf.groupby(kkey).rolling(2), f)().sort_index(),
+                getattr(pdf.groupby(pkey).rolling(2), f)().sort_index(),
+            )
+
         self.assert_eq(
             getattr(psdf.groupby(kkey)["b"].rolling(2), f)().sort_index(),
             getattr(pdf.groupby(pkey)["b"].rolling(2), f)().sort_index(),
diff --git a/python/pyspark/pandas/tests/test_rolling.py 
b/python/pyspark/pandas/tests/test_rolling.py
index 92373d2..7409d69 100644
--- a/python/pyspark/pandas/tests/test_rolling.py
+++ b/python/pyspark/pandas/tests/test_rolling.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
@@ -110,18 +111,24 @@ class RollingTest(PandasOnSparkTestCase, TestUtils):
 
         pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0, 2.0], "b": [4.0, 2.0, 3.0, 
1.0]})
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(
-            getattr(psdf.groupby(psdf.a).rolling(2), f)().sort_index(),
-            getattr(pdf.groupby(pdf.a).rolling(2), f)().sort_index(),
-        )
-        self.assert_eq(
-            getattr(psdf.groupby(psdf.a).rolling(2), f)().sum(),
-            getattr(pdf.groupby(pdf.a).rolling(2), f)().sum(),
-        )
-        self.assert_eq(
-            getattr(psdf.groupby(psdf.a + 1).rolling(2), f)().sort_index(),
-            getattr(pdf.groupby(pdf.a + 1).rolling(2), f)().sort_index(),
-        )
+
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(
+                getattr(psdf.groupby(psdf.a).rolling(2), f)().sort_index(),
+                getattr(pdf.groupby(pdf.a).rolling(2), f)().sort_index(),
+            )
+            self.assert_eq(
+                getattr(psdf.groupby(psdf.a).rolling(2), f)().sum(),
+                getattr(pdf.groupby(pdf.a).rolling(2), f)().sum(),
+            )
+            self.assert_eq(
+                getattr(psdf.groupby(psdf.a + 1).rolling(2), f)().sort_index(),
+                getattr(pdf.groupby(pdf.a + 1).rolling(2), f)().sort_index(),
+            )
+
         self.assert_eq(
             getattr(psdf.b.groupby(psdf.a).rolling(2), f)().sort_index(),
             getattr(pdf.b.groupby(pdf.a).rolling(2), f)().sort_index(),
@@ -139,15 +146,20 @@ class RollingTest(PandasOnSparkTestCase, TestUtils):
         columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
         pdf.columns = columns
         psdf.columns = columns
-        self.assert_eq(
-            getattr(psdf.groupby(("a", "x")).rolling(2), f)().sort_index(),
-            getattr(pdf.groupby(("a", "x")).rolling(2), f)().sort_index(),
-        )
 
-        self.assert_eq(
-            getattr(psdf.groupby([("a", "x"), ("a", "y")]).rolling(2), 
f)().sort_index(),
-            getattr(pdf.groupby([("a", "x"), ("a", "y")]).rolling(2), 
f)().sort_index(),
-        )
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+            pass
+        else:
+            self.assert_eq(
+                getattr(psdf.groupby(("a", "x")).rolling(2), f)().sort_index(),
+                getattr(pdf.groupby(("a", "x")).rolling(2), f)().sort_index(),
+            )
+
+            self.assert_eq(
+                getattr(psdf.groupby([("a", "x"), ("a", "y")]).rolling(2), 
f)().sort_index(),
+                getattr(pdf.groupby([("a", "x"), ("a", "y")]).rolling(2), 
f)().sort_index(),
+            )
 
     def test_groupby_rolling_count(self):
         self._test_groupby_rolling_func("count")
diff --git a/python/pyspark/pandas/tests/test_series.py 
b/python/pyspark/pandas/tests/test_series.py
index b42d3cd..d9ba3c76 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -1556,12 +1556,16 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         if extension_object_dtypes_available:
             from pandas import StringDtype
 
-            self._check_extension(
-                psser.astype("M").astype("string"), 
pser.astype("M").astype("string")
-            )
-            self._check_extension(
-                psser.astype("M").astype(StringDtype()), 
pser.astype("M").astype(StringDtype())
-            )
+            if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+                # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
+                pass
+            else:
+                self._check_extension(
+                    psser.astype("M").astype("string"), 
pser.astype("M").astype("string")
+                )
+                self._check_extension(
+                    psser.astype("M").astype(StringDtype()), 
pser.astype("M").astype(StringDtype())
+                )
 
         with self.assertRaisesRegex(TypeError, "not understood"):
             psser.astype("int63")

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] 01/06: [SPARK-36345][SPARK-36367][INFRA][PYTHON] Disable tests failed by the incompatible behavior of pandas 1.3

Reply via email to