This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 9b583809dd94 [SPARK-46462][PS][TESTS] Reorganize `OpsOnDiffFramesGroupByRollingTests`
9b583809dd94 is described below
commit 9b583809dd9494ee8ed3c2e50356230e1ffae218
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Thu Dec 21 08:58:25 2023 +0900
[SPARK-46462][PS][TESTS] Reorganize `OpsOnDiffFramesGroupByRollingTests`
### What changes were proposed in this pull request?
Reorganize `OpsOnDiffFramesGroupByRollingTests`
### Why are the changes needed?
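To improve test parallelism: the single test module is split into three smaller modules (`test_groupby_rolling`, `test_groupby_rolling_adv`, `test_groupby_rolling_count`) that can be scheduled independently.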
### Does this PR introduce _any_ user-facing change?
No, this is a test-only change.
### How was this patch tested?
CI.
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44420 from zhengruifeng/ps_test_diff_group_roll.
Lead-authored-by: Ruifeng Zheng <[email protected]>
Co-authored-by: Hyukjin Kwon <[email protected]>
Co-authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
dev/sparktestsupport/modules.py | 8 ++-
.../test_parity_groupby_rolling.py} | 13 ++---
.../test_parity_groupby_rolling_adv.py} | 13 ++---
.../test_parity_groupby_rolling_count.py} | 13 ++---
.../test_groupby_rolling.py} | 66 ++++++----------------
.../diff_frames_ops/test_groupby_rolling_adv.py | 61 ++++++++++++++++++++
.../test_groupby_rolling_count.py} | 62 +++-----------------
7 files changed, 107 insertions(+), 129 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 0388f1812b0d..cbd3b35c0015 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -745,7 +745,9 @@ pyspark_pandas = Module(
"pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding",
"pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_adv",
"pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_count",
- "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
+ "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling",
+ "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv",
+ "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count",
"pyspark.pandas.tests.test_repr",
"pyspark.pandas.tests.resample.test_on",
"pyspark.pandas.tests.resample.test_error",
@@ -1170,7 +1172,9 @@ pyspark_pandas_connect_part2 = Module(
"pyspark.pandas.tests.connect.window.test_parity_expanding_error",
"pyspark.pandas.tests.connect.window.test_parity_groupby_expanding",
"pyspark.pandas.tests.connect.window.test_parity_groupby_expanding_adv",
- "pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling",
+ "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling",
+ "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_adv",
+ "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_count",
"pyspark.pandas.tests.connect.computation.test_parity_missing_data",
"pyspark.pandas.tests.connect.groupby.test_parity_index",
"pyspark.pandas.tests.connect.groupby.test_parity_describe",
diff --git
a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling.py
similarity index 75%
copy from
python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
copy to
python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling.py
index 4a52bb0748f5..c8255d6f9c42 100644
---
a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
+++
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling.py
@@ -16,24 +16,21 @@
#
import unittest
-from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import (
- OpsOnDiffFramesGroupByRollingTestsMixin,
-)
+from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling import GroupByRollingMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class OpsOnDiffFramesGroupByRollingParityTests(
- OpsOnDiffFramesGroupByRollingTestsMixin,
+class GroupByRollingParityTests(
+ GroupByRollingMixin,
PandasOnSparkTestUtils,
- TestUtils,
ReusedConnectTestCase,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling import *
+ from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling import * # noqa
try:
import xmlrunner # type: ignore[import]
diff --git
a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_adv.py
similarity index 75%
copy from
python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
copy to
python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_adv.py
index 4a52bb0748f5..f1793a1f8d82 100644
---
a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
+++
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_adv.py
@@ -16,24 +16,21 @@
#
import unittest
-from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import (
- OpsOnDiffFramesGroupByRollingTestsMixin,
-)
+from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv import GroupByRollingAdvMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class OpsOnDiffFramesGroupByRollingParityTests(
- OpsOnDiffFramesGroupByRollingTestsMixin,
+class GroupByRollingAdvParityTests(
+ GroupByRollingAdvMixin,
PandasOnSparkTestUtils,
- TestUtils,
ReusedConnectTestCase,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling import *
+ from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_adv import * # noqa
try:
import xmlrunner # type: ignore[import]
diff --git
a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_count.py
similarity index 75%
rename from
python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
rename to
python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_count.py
index 4a52bb0748f5..b0316401700e 100644
---
a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py
+++
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_count.py
@@ -16,24 +16,21 @@
#
import unittest
-from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import (
- OpsOnDiffFramesGroupByRollingTestsMixin,
-)
+from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count import GroupByRollingCountMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class OpsOnDiffFramesGroupByRollingParityTests(
- OpsOnDiffFramesGroupByRollingTestsMixin,
+class GroupByRollingCountParityTests(
+ GroupByRollingCountMixin,
PandasOnSparkTestUtils,
- TestUtils,
ReusedConnectTestCase,
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling import *
+ from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_count import * # noqa
try:
import xmlrunner # type: ignore[import]
diff --git
a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling.py
similarity index 62%
copy from python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
copy to python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling.py
index 676eafa74eed..ea1489ad55cd 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling.py
@@ -19,20 +19,11 @@ import pandas as pd
from pyspark import pandas as ps
from pyspark.pandas.config import set_option, reset_option
-from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
-class OpsOnDiffFramesGroupByRollingTestsMixin:
- @classmethod
- def setUpClass(cls):
- super().setUpClass()
- set_option("compute.ops_on_diff_frames", True)
-
- @classmethod
- def tearDownClass(cls):
- reset_option("compute.ops_on_diff_frames")
- super().tearDownClass()
-
+class GroupByRollingTestingFuncMixin:
def _test_groupby_rolling_func(self, f):
pser = pd.Series([1, 2, 3], name="a")
pkey = pd.Series([1, 2, 3], name="a")
@@ -63,35 +54,17 @@ class OpsOnDiffFramesGroupByRollingTestsMixin:
getattr(pdf.groupby(pkey)[["b"]].rolling(2), f)().sort_index(),
)
- def test_groupby_rolling_count(self):
- pser = pd.Series([1, 2, 3], name="a")
- pkey = pd.Series([1, 2, 3], name="a")
- psser = ps.from_pandas(pser)
- kkey = ps.from_pandas(pkey)
- # TODO(SPARK-43432): Fix `min_periods` for Rolling.count() to work same as pandas
- self.assert_eq(
- psser.groupby(kkey).rolling(2).count().sort_index(),
- pser.groupby(pkey).rolling(2, min_periods=1).count().sort_index(),
- )
-
- pdf = pd.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
- pkey = pd.Series([1, 2, 3, 2], name="a")
- psdf = ps.from_pandas(pdf)
- kkey = ps.from_pandas(pkey)
+class GroupByRollingMixin(GroupByRollingTestingFuncMixin):
+ @classmethod
+ def setUpClass(cls):
+ super().setUpClass()
+ set_option("compute.ops_on_diff_frames", True)
- self.assert_eq(
- psdf.groupby(kkey).rolling(2).count().sort_index(),
- pdf.groupby(pkey).rolling(2, min_periods=1).count().sort_index(),
- )
- self.assert_eq(
- psdf.groupby(kkey)["b"].rolling(2).count().sort_index(),
- pdf.groupby(pkey)["b"].rolling(2,
min_periods=1).count().sort_index(),
- )
- self.assert_eq(
- psdf.groupby(kkey)[["b"]].rolling(2).count().sort_index(),
- pdf.groupby(pkey)[["b"]].rolling(2,
min_periods=1).count().sort_index(),
- )
+ @classmethod
+ def tearDownClass(cls):
+ reset_option("compute.ops_on_diff_frames")
+ super().tearDownClass()
def test_groupby_rolling_min(self):
self._test_groupby_rolling_func("min")
@@ -105,23 +78,18 @@ class OpsOnDiffFramesGroupByRollingTestsMixin:
def test_groupby_rolling_sum(self):
self._test_groupby_rolling_func("sum")
- def test_groupby_rolling_std(self):
- # TODO: `std` now raise error in pandas 1.0.0
- self._test_groupby_rolling_func("std")
-
- def test_groupby_rolling_var(self):
- self._test_groupby_rolling_func("var")
-
-class OpsOnDiffFramesGroupByRollingTests(
- OpsOnDiffFramesGroupByRollingTestsMixin, PandasOnSparkTestCase, TestUtils
+class GroupByRollingTests(
+ GroupByRollingMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
):
pass
if __name__ == "__main__":
import unittest
- from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import * # noqa: F401
+ from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling import * # noqa
try:
import xmlrunner
diff --git
a/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_adv.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_adv.py
new file mode 100644
index 000000000000..48d75fbcaf89
--- /dev/null
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_adv.py
@@ -0,0 +1,61 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from pyspark.pandas.config import set_option, reset_option
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling import GroupByRollingTestingFuncMixin
+
+
+class GroupByRollingAdvMixin(GroupByRollingTestingFuncMixin):
+ @classmethod
+ def setUpClass(cls):
+ super().setUpClass()
+ set_option("compute.ops_on_diff_frames", True)
+
+ @classmethod
+ def tearDownClass(cls):
+ reset_option("compute.ops_on_diff_frames")
+ super().tearDownClass()
+
+ def test_groupby_rolling_std(self):
+ # TODO: `std` now raise error in pandas 1.0.0
+ self._test_groupby_rolling_func("std")
+
+ def test_groupby_rolling_var(self):
+ self._test_groupby_rolling_func("var")
+
+
+class GroupByRollingAdvTests(
+ GroupByRollingAdvMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
+):
+ pass
+
+
+if __name__ == "__main__":
+ import unittest
+ from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv import * # noqa
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git
a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_count.py
similarity index 56%
rename from
python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
rename to
python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_count.py
index 676eafa74eed..41ac982db745 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_count.py
@@ -19,10 +19,11 @@ import pandas as pd
from pyspark import pandas as ps
from pyspark.pandas.config import set_option, reset_option
-from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
-class OpsOnDiffFramesGroupByRollingTestsMixin:
+class GroupByRollingCountMixin:
@classmethod
def setUpClass(cls):
super().setUpClass()
@@ -33,36 +34,6 @@ class OpsOnDiffFramesGroupByRollingTestsMixin:
reset_option("compute.ops_on_diff_frames")
super().tearDownClass()
- def _test_groupby_rolling_func(self, f):
- pser = pd.Series([1, 2, 3], name="a")
- pkey = pd.Series([1, 2, 3], name="a")
- psser = ps.from_pandas(pser)
- kkey = ps.from_pandas(pkey)
-
- self.assert_eq(
- getattr(psser.groupby(kkey).rolling(2), f)().sort_index(),
- getattr(pser.groupby(pkey).rolling(2), f)().sort_index(),
- )
-
- pdf = pd.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
- pkey = pd.Series([1, 2, 3, 2], name="a")
- psdf = ps.from_pandas(pdf)
- kkey = ps.from_pandas(pkey)
-
- self.assert_eq(
- getattr(psdf.groupby(kkey).rolling(2), f)().sort_index(),
- getattr(pdf.groupby(pkey).rolling(2), f)().sort_index(),
- )
-
- self.assert_eq(
- getattr(psdf.groupby(kkey)["b"].rolling(2), f)().sort_index(),
- getattr(pdf.groupby(pkey)["b"].rolling(2), f)().sort_index(),
- )
- self.assert_eq(
- getattr(psdf.groupby(kkey)[["b"]].rolling(2), f)().sort_index(),
- getattr(pdf.groupby(pkey)[["b"]].rolling(2), f)().sort_index(),
- )
-
def test_groupby_rolling_count(self):
pser = pd.Series([1, 2, 3], name="a")
pkey = pd.Series([1, 2, 3], name="a")
@@ -93,35 +64,18 @@ class OpsOnDiffFramesGroupByRollingTestsMixin:
pdf.groupby(pkey)[["b"]].rolling(2,
min_periods=1).count().sort_index(),
)
- def test_groupby_rolling_min(self):
- self._test_groupby_rolling_func("min")
-
- def test_groupby_rolling_max(self):
- self._test_groupby_rolling_func("max")
-
- def test_groupby_rolling_mean(self):
- self._test_groupby_rolling_func("mean")
-
- def test_groupby_rolling_sum(self):
- self._test_groupby_rolling_func("sum")
-
- def test_groupby_rolling_std(self):
- # TODO: `std` now raise error in pandas 1.0.0
- self._test_groupby_rolling_func("std")
-
- def test_groupby_rolling_var(self):
- self._test_groupby_rolling_func("var")
-
-class OpsOnDiffFramesGroupByRollingTests(
- OpsOnDiffFramesGroupByRollingTestsMixin, PandasOnSparkTestCase, TestUtils
+class GroupByRollingCountTests(
+ GroupByRollingCountMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
):
pass
if __name__ == "__main__":
import unittest
- from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import *
# noqa: F401
+ from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count import * # noqa
try:
import xmlrunner
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]