(spark) branch master updated: [SPARK-46462][PS][TESTS] Reorganize `OpsOnDiffFramesGroupByRollingTests`

gurwls223 Wed, 20 Dec 2023 15:58:42 -0800

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 9b583809dd94 [SPARK-46462][PS][TESTS] Reorganize `OpsOnDiffFramesGroupByRollingTests`
9b583809dd94 is described below

commit 9b583809dd9494ee8ed3c2e50356230e1ffae218
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Thu Dec 21 08:58:25 2023 +0900

    [SPARK-46462][PS][TESTS] Reorganize `OpsOnDiffFramesGroupByRollingTests`
    
    ### What changes were proposed in this pull request?
    Reorganize `OpsOnDiffFramesGroupByRollingTests`
    
    ### Why are the changes needed?
    To improve test parallelism: the single test module is split into three smaller modules.
    
    ### Does this PR introduce _any_ user-facing change?
    no, test-only
    
    ### How was this patch tested?
    CI
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #44420 from zhengruifeng/ps_test_diff_group_roll.
    
    Lead-authored-by: Ruifeng Zheng <[email protected]>
    Co-authored-by: Hyukjin Kwon <[email protected]>
    Co-authored-by: Hyukjin Kwon <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 dev/sparktestsupport/modules.py                    |  8 ++-
 .../test_parity_groupby_rolling.py}                | 13 ++---
 .../test_parity_groupby_rolling_adv.py}            | 13 ++---
 .../test_parity_groupby_rolling_count.py}          | 13 ++---
 .../test_groupby_rolling.py}                       | 66 ++++++----------------
 .../diff_frames_ops/test_groupby_rolling_adv.py    | 61 ++++++++++++++++++++
 .../test_groupby_rolling_count.py}                 | 62 +++-----------------
 7 files changed, 107 insertions(+), 129 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 0388f1812b0d..cbd3b35c0015 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -745,7 +745,9 @@ pyspark_pandas = Module(
         "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding",
         "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_adv",
         "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_count",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
+        "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling",
+        "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv",
+        "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count",
         "pyspark.pandas.tests.test_repr",
         "pyspark.pandas.tests.resample.test_on",
         "pyspark.pandas.tests.resample.test_error",
@@ -1170,7 +1172,9 @@ pyspark_pandas_connect_part2 = Module(
         "pyspark.pandas.tests.connect.window.test_parity_expanding_error",
         "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding",
         "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding_adv",
-        "pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling",
+        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling",
+        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_adv",
+        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_count",
         "pyspark.pandas.tests.connect.computation.test_parity_missing_data",
         "pyspark.pandas.tests.connect.groupby.test_parity_index",
         "pyspark.pandas.tests.connect.groupby.test_parity_describe",
diff --git a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling.py
similarity index 75%
copy from python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
copy to python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling.py
index 4a52bb0748f5..c8255d6f9c42 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
+++ b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling.py
@@ -16,24 +16,21 @@
 #
 import unittest
 
-from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import (
-    OpsOnDiffFramesGroupByRollingTestsMixin,
-)
+from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling import GroupByRollingMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class OpsOnDiffFramesGroupByRollingParityTests(
-    OpsOnDiffFramesGroupByRollingTestsMixin,
+class GroupByRollingParityTests(
+    GroupByRollingMixin,
     PandasOnSparkTestUtils,
-    TestUtils,
     ReusedConnectTestCase,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling import *
+    from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling import *  # noqa
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_adv.py
similarity index 75%
copy from python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
copy to python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_adv.py
index 4a52bb0748f5..f1793a1f8d82 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
+++ b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_adv.py
@@ -16,24 +16,21 @@
 #
 import unittest
 
-from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import (
-    OpsOnDiffFramesGroupByRollingTestsMixin,
-)
+from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv import GroupByRollingAdvMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class OpsOnDiffFramesGroupByRollingParityTests(
-    OpsOnDiffFramesGroupByRollingTestsMixin,
+class GroupByRollingAdvParityTests(
+    GroupByRollingAdvMixin,
     PandasOnSparkTestUtils,
-    TestUtils,
     ReusedConnectTestCase,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling import *
+    from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_adv import *  # noqa
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_count.py
similarity index 75%
rename from python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
rename to python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_count.py
index 4a52bb0748f5..b0316401700e 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
+++ b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_count.py
@@ -16,24 +16,21 @@
 #
 import unittest
 
-from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import (
-    OpsOnDiffFramesGroupByRollingTestsMixin,
-)
+from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count import GroupByRollingCountMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class OpsOnDiffFramesGroupByRollingParityTests(
-    OpsOnDiffFramesGroupByRollingTestsMixin,
+class GroupByRollingCountParityTests(
+    GroupByRollingCountMixin,
     PandasOnSparkTestUtils,
-    TestUtils,
     ReusedConnectTestCase,
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling import *
+    from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_count import *  # noqa
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling.py
similarity index 62%
copy from python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
copy to python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling.py
index 676eafa74eed..ea1489ad55cd 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling.py
@@ -19,20 +19,11 @@ import pandas as pd
 
 from pyspark import pandas as ps
 from pyspark.pandas.config import set_option, reset_option
-from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
 
 
-class OpsOnDiffFramesGroupByRollingTestsMixin:
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        set_option("compute.ops_on_diff_frames", True)
-
-    @classmethod
-    def tearDownClass(cls):
-        reset_option("compute.ops_on_diff_frames")
-        super().tearDownClass()
-
+class GroupByRollingTestingFuncMixin:
     def _test_groupby_rolling_func(self, f):
         pser = pd.Series([1, 2, 3], name="a")
         pkey = pd.Series([1, 2, 3], name="a")
@@ -63,35 +54,17 @@ class OpsOnDiffFramesGroupByRollingTestsMixin:
             getattr(pdf.groupby(pkey)[["b"]].rolling(2), f)().sort_index(),
         )
 
-    def test_groupby_rolling_count(self):
-        pser = pd.Series([1, 2, 3], name="a")
-        pkey = pd.Series([1, 2, 3], name="a")
-        psser = ps.from_pandas(pser)
-        kkey = ps.from_pandas(pkey)
 
-        # TODO(SPARK-43432): Fix `min_periods` for Rolling.count() to work same as pandas
-        self.assert_eq(
-            psser.groupby(kkey).rolling(2).count().sort_index(),
-            pser.groupby(pkey).rolling(2, min_periods=1).count().sort_index(),
-        )
-
-        pdf = pd.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
-        pkey = pd.Series([1, 2, 3, 2], name="a")
-        psdf = ps.from_pandas(pdf)
-        kkey = ps.from_pandas(pkey)
+class GroupByRollingMixin(GroupByRollingTestingFuncMixin):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        set_option("compute.ops_on_diff_frames", True)
 
-        self.assert_eq(
-            psdf.groupby(kkey).rolling(2).count().sort_index(),
-            pdf.groupby(pkey).rolling(2, min_periods=1).count().sort_index(),
-        )
-        self.assert_eq(
-            psdf.groupby(kkey)["b"].rolling(2).count().sort_index(),
-            pdf.groupby(pkey)["b"].rolling(2, min_periods=1).count().sort_index(),
-        )
-        self.assert_eq(
-            psdf.groupby(kkey)[["b"]].rolling(2).count().sort_index(),
-            pdf.groupby(pkey)[["b"]].rolling(2, min_periods=1).count().sort_index(),
-        )
+    @classmethod
+    def tearDownClass(cls):
+        reset_option("compute.ops_on_diff_frames")
+        super().tearDownClass()
 
     def test_groupby_rolling_min(self):
         self._test_groupby_rolling_func("min")
@@ -105,23 +78,18 @@ class OpsOnDiffFramesGroupByRollingTestsMixin:
     def test_groupby_rolling_sum(self):
         self._test_groupby_rolling_func("sum")
 
-    def test_groupby_rolling_std(self):
-        # TODO: `std` now raise error in pandas 1.0.0
-        self._test_groupby_rolling_func("std")
-
-    def test_groupby_rolling_var(self):
-        self._test_groupby_rolling_func("var")
-
 
-class OpsOnDiffFramesGroupByRollingTests(
-    OpsOnDiffFramesGroupByRollingTestsMixin, PandasOnSparkTestCase, TestUtils
+class GroupByRollingTests(
+    GroupByRollingMixin,
+    PandasOnSparkTestCase,
+    SQLTestUtils,
 ):
     pass
 
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import *  # noqa: F401
+    from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling import *  # noqa
 
     try:
         import xmlrunner
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_adv.py b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_adv.py
new file mode 100644
index 000000000000..48d75fbcaf89
--- /dev/null
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_adv.py
@@ -0,0 +1,61 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from pyspark.pandas.config import set_option, reset_option
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling import GroupByRollingTestingFuncMixin
+
+
+class GroupByRollingAdvMixin(GroupByRollingTestingFuncMixin):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        set_option("compute.ops_on_diff_frames", True)
+
+    @classmethod
+    def tearDownClass(cls):
+        reset_option("compute.ops_on_diff_frames")
+        super().tearDownClass()
+
+    def test_groupby_rolling_std(self):
+        # TODO: `std` now raise error in pandas 1.0.0
+        self._test_groupby_rolling_func("std")
+
+    def test_groupby_rolling_var(self):
+        self._test_groupby_rolling_func("var")
+
+
+class GroupByRollingAdvTests(
+    GroupByRollingAdvMixin,
+    PandasOnSparkTestCase,
+    SQLTestUtils,
+):
+    pass
+
+
+if __name__ == "__main__":
+    import unittest
+    from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv import *  # noqa
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_count.py
similarity index 56%
rename from python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
rename to python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_count.py
index 676eafa74eed..41ac982db745 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_count.py
@@ -19,10 +19,11 @@ import pandas as pd
 
 from pyspark import pandas as ps
 from pyspark.pandas.config import set_option, reset_option
-from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
 
 
-class OpsOnDiffFramesGroupByRollingTestsMixin:
+class GroupByRollingCountMixin:
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
@@ -33,36 +34,6 @@ class OpsOnDiffFramesGroupByRollingTestsMixin:
         reset_option("compute.ops_on_diff_frames")
         super().tearDownClass()
 
-    def _test_groupby_rolling_func(self, f):
-        pser = pd.Series([1, 2, 3], name="a")
-        pkey = pd.Series([1, 2, 3], name="a")
-        psser = ps.from_pandas(pser)
-        kkey = ps.from_pandas(pkey)
-
-        self.assert_eq(
-            getattr(psser.groupby(kkey).rolling(2), f)().sort_index(),
-            getattr(pser.groupby(pkey).rolling(2), f)().sort_index(),
-        )
-
-        pdf = pd.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
-        pkey = pd.Series([1, 2, 3, 2], name="a")
-        psdf = ps.from_pandas(pdf)
-        kkey = ps.from_pandas(pkey)
-
-        self.assert_eq(
-            getattr(psdf.groupby(kkey).rolling(2), f)().sort_index(),
-            getattr(pdf.groupby(pkey).rolling(2), f)().sort_index(),
-        )
-
-        self.assert_eq(
-            getattr(psdf.groupby(kkey)["b"].rolling(2), f)().sort_index(),
-            getattr(pdf.groupby(pkey)["b"].rolling(2), f)().sort_index(),
-        )
-        self.assert_eq(
-            getattr(psdf.groupby(kkey)[["b"]].rolling(2), f)().sort_index(),
-            getattr(pdf.groupby(pkey)[["b"]].rolling(2), f)().sort_index(),
-        )
-
     def test_groupby_rolling_count(self):
         pser = pd.Series([1, 2, 3], name="a")
         pkey = pd.Series([1, 2, 3], name="a")
@@ -93,35 +64,18 @@ class OpsOnDiffFramesGroupByRollingTestsMixin:
             pdf.groupby(pkey)[["b"]].rolling(2, min_periods=1).count().sort_index(),
         )
 
-    def test_groupby_rolling_min(self):
-        self._test_groupby_rolling_func("min")
-
-    def test_groupby_rolling_max(self):
-        self._test_groupby_rolling_func("max")
-
-    def test_groupby_rolling_mean(self):
-        self._test_groupby_rolling_func("mean")
-
-    def test_groupby_rolling_sum(self):
-        self._test_groupby_rolling_func("sum")
-
-    def test_groupby_rolling_std(self):
-        # TODO: `std` now raise error in pandas 1.0.0
-        self._test_groupby_rolling_func("std")
-
-    def test_groupby_rolling_var(self):
-        self._test_groupby_rolling_func("var")
-
 
-class OpsOnDiffFramesGroupByRollingTests(
-    OpsOnDiffFramesGroupByRollingTestsMixin, PandasOnSparkTestCase, TestUtils
+class GroupByRollingCountTests(
+    GroupByRollingCountMixin,
+    PandasOnSparkTestCase,
+    SQLTestUtils,
 ):
     pass
 
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import *  # noqa: F401
+    from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count import *  # noqa
 
     try:
         import xmlrunner


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-46462][PS][TESTS] Reorganize `OpsOnDiffFramesGroupByRollingTests`

Reply via email to