This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git

commit 2dc15d9d8476da327c54577e3bbb261ad7923f2f
Author: itholic <[email protected]>
AuthorDate: Thu Aug 26 17:43:49 2021 +0900

    [SPARK-36537][PYTHON] Revisit disabled tests for CategoricalDtype
    
    This PR proposes to re-enable the tests that were disabled because of 
behavior differences with pandas 1.3.
    
    - The `inplace` argument of the `CategoricalDtype` functions is deprecated 
as of pandas 1.3 and appears to be buggy there, so we build the expected results 
manually and test against them.
    - Fixed `GroupBy.transform`, since it did not work properly for 
`CategoricalDtype`.
    
    We should enable the tests as much as possible even if pandas has a bug.
    
    And we should follow the behavior of the latest pandas.
    
    Yes, `GroupBy.transform` now follows the behavior of the latest pandas.
    
    Unittests.
    
    Closes #33817 from itholic/SPARK-36537.
    
    Authored-by: itholic <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
    (cherry picked from commit fe486185c4a3a05278b1f01884e2b95ed3ca31bc)
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 python/pyspark/pandas/groupby.py                |   1 +
 python/pyspark/pandas/tests/test_categorical.py | 116 +++++++++++++-----------
 2 files changed, 63 insertions(+), 54 deletions(-)

diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index c732dff..2815a6b 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -2264,6 +2264,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
                 for c in psdf._internal.data_spark_column_names
                 if c not in groupkey_names
             ]
+
             return_schema = StructType([field.struct_field for field in 
data_fields])
 
             sdf = GroupBy._spark_group_map_apply(
diff --git a/python/pyspark/pandas/tests/test_categorical.py 
b/python/pyspark/pandas/tests/test_categorical.py
index 1fb0d58..e55c08c 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -74,10 +74,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
         pser.cat.categories = ["z", "y", "x"]
         psser.cat.categories = ["z", "y", "x"]
         if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=["x", "y", "z"]))
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         with self.assertRaises(ValueError):
@@ -96,10 +96,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
         pser.cat.add_categories(4, inplace=True)
         psser.cat.add_categories(4, inplace=True)
         if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=[1, 2, 3, 4]))
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         self.assertRaises(ValueError, lambda: psser.cat.add_categories(4))
@@ -124,10 +124,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
         pser.cat.remove_categories(2, inplace=True)
         psser.cat.remove_categories(2, inplace=True)
         if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=[1, 3]))
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
@@ -151,10 +151,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
         pser.cat.remove_unused_categories(inplace=True)
         psser.cat.remove_unused_categories(inplace=True)
         if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=[1, 3]))
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
     def test_reorder_categories(self):
@@ -180,20 +180,17 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
 
         pser.cat.reorder_categories([1, 2, 3], inplace=True)
         psser.cat.reorder_categories([1, 2, 3], inplace=True)
-        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         pser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
         psser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
         if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=[3, 2, 1], 
ordered=True))
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1, 
2]))
@@ -214,10 +211,10 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
         pser.cat.as_ordered(inplace=True)
         psser.cat.as_ordered(inplace=True)
         if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=[1, 2, 3], 
ordered=True))
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         # as_unordered
@@ -225,6 +222,11 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
 
         pser.cat.as_unordered(inplace=True)
         psser.cat.as_unordered(inplace=True)
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=[1, 2, 3], 
ordered=False))
+            pdf.a = pser
+
         self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
@@ -445,13 +447,16 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
 
         dtype = CategoricalDtype(categories=["a", "b", "c", "d"])
 
-        def astype(x) -> ps.Series[dtype]:
+        # The behavior for CategoricalDtype is changed from pandas 1.3
+        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
+            ret_dtype = pdf.b.dtype
+        else:
+            ret_dtype = dtype
+
+        def astype(x) -> ps.Series[ret_dtype]:
             return x.astype(dtype)
 
-        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
+        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
             self.assert_eq(
                 
psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
                 
pdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
@@ -670,28 +675,30 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
         pser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
         psser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
         if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=["C", "b", "d", 
"A"]))
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         pser.cat.rename_categories(lambda x: x.upper(), inplace=True)
         psser.cat.rename_categories(lambda x: x.upper(), inplace=True)
         if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=["C", "B", "D", 
"A"]))
+            pdf.b = pser
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         pser.cat.rename_categories([0, 1, 3, 2], inplace=True)
         psser.cat.rename_categories([0, 1, 3, 2], inplace=True)
         if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=[0, 1, 3, 2]))
+            pdf.b = pser
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         self.assertRaisesRegex(
@@ -762,19 +769,20 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
             psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, 
rename=True),
         )
         if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=["a", "c", "b", 
"o"]))
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
         psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
         if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
-            # TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
-            pass
-        else:
-            self.assert_eq(pser, psser)
+            # Bug in pandas 1.3. dtype is not updated properly with `inplace` 
argument.
+            pser = pser.astype(CategoricalDtype(categories=[2, 3, 1, 0]))
+            pdf.b = pser
+
+        self.assert_eq(pser, psser)
         self.assert_eq(pdf, psdf)
 
         self.assertRaisesRegex(

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to