(spark) branch master updated: [SPARK-46264][PS][CONNECT][TESTS] Re-organize the resampling tests

ruifengz Tue, 05 Dec 2023 02:31:49 -0800

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 12817035c575 [SPARK-46264][PS][CONNECT][TESTS] Re-organize the resampling tests
12817035c575 is described below

commit 12817035c57505c5eeea228d5184c4ab629c95d4
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Dec 5 18:31:24 2023 +0800

    [SPARK-46264][PS][CONNECT][TESTS] Re-organize the resampling tests
    
    ### What changes were proposed in this pull request?
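    Re-organize the resampling tests: split `python/pyspark/pandas/tests/test_resample.py` and its Spark Connect parity module `test_parity_resample.py` into topic-specific modules (`test_error`, `test_missing`, `test_on`, `test_timezone`) under new `resample/` test packages, and update the module lists in `dev/sparktestsupport/modules.py` accordingly.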
    
    ### Why are the changes needed?
    Organizing the resampling tests by topic makes their layout more consistent with the [pandas tests](https://github.com/pandas-dev/pandas/tree/main/pandas/tests).
    
    ### Does this PR introduce _any_ user-facing change?
    no
    
    ### How was this patch tested?
    CI.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #44180 from zhengruifeng/reorg_test_resample.
    
    Authored-by: Ruifeng Zheng <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 dev/sparktestsupport/modules.py                    |  10 +-
 .../pandas/tests/connect/resample/__init__.py      |  16 ++
 .../test_parity_error.py}                          |  16 +-
 .../test_parity_missing.py}                        |  16 +-
 .../test_parity_on.py}                             |  16 +-
 .../test_parity_timezone.py}                       |  16 +-
 python/pyspark/pandas/tests/resample/__init__.py   |  16 ++
 python/pyspark/pandas/tests/resample/test_error.py |  94 ++++++
 .../pyspark/pandas/tests/resample/test_missing.py  | 141 +++++++++
 python/pyspark/pandas/tests/resample/test_on.py    |  66 +++++
 .../pyspark/pandas/tests/resample/test_timezone.py |  83 ++++++
 python/pyspark/pandas/tests/test_resample.py       | 320 ---------------------
 12 files changed, 442 insertions(+), 368 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 15b2e8f186e5..f35c42d11e58 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -740,7 +740,10 @@ pyspark_pandas = Module(
         "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
         "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
         "pyspark.pandas.tests.test_repr",
-        "pyspark.pandas.tests.test_resample",
+        "pyspark.pandas.tests.resample.test_on",
+        "pyspark.pandas.tests.resample.test_error",
+        "pyspark.pandas.tests.resample.test_missing",
+        "pyspark.pandas.tests.resample.test_timezone",
         "pyspark.pandas.tests.test_frame_resample",
         "pyspark.pandas.tests.test_series_resample",
         "pyspark.pandas.tests.test_reshape",
@@ -984,7 +987,10 @@ pyspark_pandas_connect_part0 = Module(
         "pyspark.pandas.tests.connect.test_parity_namespace",
         "pyspark.pandas.tests.connect.test_parity_numpy_compat",
         "pyspark.pandas.tests.connect.test_parity_repr",
-        "pyspark.pandas.tests.connect.test_parity_resample",
+        "pyspark.pandas.tests.connect.resample.test_parity_error",
+        "pyspark.pandas.tests.connect.resample.test_parity_missing",
+        "pyspark.pandas.tests.connect.resample.test_parity_on",
+        "pyspark.pandas.tests.connect.resample.test_parity_timezone",
         "pyspark.pandas.tests.connect.test_parity_scalars",
         "pyspark.pandas.tests.connect.test_parity_series_conversion",
         "pyspark.pandas.tests.connect.test_parity_series_datetime",
diff --git a/python/pyspark/pandas/tests/connect/resample/__init__.py 
b/python/pyspark/pandas/tests/connect/resample/__init__.py
new file mode 100644
index 000000000000..cce3acad34a4
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/resample/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py 
b/python/pyspark/pandas/tests/connect/resample/test_parity_error.py
similarity index 70%
copy from python/pyspark/pandas/tests/connect/test_parity_resample.py
copy to python/pyspark/pandas/tests/connect/resample/test_parity_error.py
index caca2f957b50..f6365ee8dc87 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_resample.py
+++ b/python/pyspark/pandas/tests/connect/resample/test_parity_error.py
@@ -16,25 +16,17 @@
 #
 import unittest
 
-from pyspark.pandas.tests.test_resample import ResampleTestsMixin, 
ResampleWithTimezoneMixin
+from pyspark.pandas.tests.resample.test_error import ResampleErrorMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class ResampleParityTests(
-    ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils, 
ReusedConnectTestCase
-):
-    pass
-
-
-class ResampleWithTimezoneTests(
-    ResampleWithTimezoneMixin, PandasOnSparkTestUtils, TestUtils, 
ReusedConnectTestCase
-):
+class ResampleParityErrorTests(ResampleErrorMixin, PandasOnSparkTestUtils, 
ReusedConnectTestCase):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.test_parity_resample import *  # noqa: 
F401
+    from pyspark.pandas.tests.connect.resample.test_parity_error import *  # 
noqa: F401
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py 
b/python/pyspark/pandas/tests/connect/resample/test_parity_missing.py
similarity index 70%
copy from python/pyspark/pandas/tests/connect/test_parity_resample.py
copy to python/pyspark/pandas/tests/connect/resample/test_parity_missing.py
index caca2f957b50..49e541084609 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_resample.py
+++ b/python/pyspark/pandas/tests/connect/resample/test_parity_missing.py
@@ -16,25 +16,19 @@
 #
 import unittest
 
-from pyspark.pandas.tests.test_resample import ResampleTestsMixin, 
ResampleWithTimezoneMixin
+from pyspark.pandas.tests.resample.test_missing import ResampleMissingMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class ResampleParityTests(
-    ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils, 
ReusedConnectTestCase
-):
-    pass
-
-
-class ResampleWithTimezoneTests(
-    ResampleWithTimezoneMixin, PandasOnSparkTestUtils, TestUtils, 
ReusedConnectTestCase
+class ResampleParityMissingTests(
+    ResampleMissingMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.test_parity_resample import *  # noqa: 
F401
+    from pyspark.pandas.tests.connect.resample.test_parity_missing import *  # 
noqa: F401
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py 
b/python/pyspark/pandas/tests/connect/resample/test_parity_on.py
similarity index 70%
copy from python/pyspark/pandas/tests/connect/test_parity_resample.py
copy to python/pyspark/pandas/tests/connect/resample/test_parity_on.py
index caca2f957b50..d4b767329fb8 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_resample.py
+++ b/python/pyspark/pandas/tests/connect/resample/test_parity_on.py
@@ -16,25 +16,17 @@
 #
 import unittest
 
-from pyspark.pandas.tests.test_resample import ResampleTestsMixin, 
ResampleWithTimezoneMixin
+from pyspark.pandas.tests.resample.test_on import ResampleOnMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class ResampleParityTests(
-    ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils, 
ReusedConnectTestCase
-):
-    pass
-
-
-class ResampleWithTimezoneTests(
-    ResampleWithTimezoneMixin, PandasOnSparkTestUtils, TestUtils, 
ReusedConnectTestCase
-):
+class ResampleParityOnTests(ResampleOnMixin, PandasOnSparkTestUtils, 
ReusedConnectTestCase):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.test_parity_resample import *  # noqa: 
F401
+    from pyspark.pandas.tests.connect.resample.test_parity_on import *  # 
noqa: F401
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py 
b/python/pyspark/pandas/tests/connect/resample/test_parity_timezone.py
similarity index 70%
rename from python/pyspark/pandas/tests/connect/test_parity_resample.py
rename to python/pyspark/pandas/tests/connect/resample/test_parity_timezone.py
index caca2f957b50..0e8f36a51168 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_resample.py
+++ b/python/pyspark/pandas/tests/connect/resample/test_parity_timezone.py
@@ -16,25 +16,19 @@
 #
 import unittest
 
-from pyspark.pandas.tests.test_resample import ResampleTestsMixin, 
ResampleWithTimezoneMixin
+from pyspark.pandas.tests.resample.test_timezone import ResampleTimezoneMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class ResampleParityTests(
-    ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils, 
ReusedConnectTestCase
-):
-    pass
-
-
-class ResampleWithTimezoneTests(
-    ResampleWithTimezoneMixin, PandasOnSparkTestUtils, TestUtils, 
ReusedConnectTestCase
+class ResampleParityTimezoneTests(
+    ResampleTimezoneMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
 ):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.test_parity_resample import *  # noqa: 
F401
+    from pyspark.pandas.tests.connect.resample.test_parity_timezone import *  
# noqa: F401
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/resample/__init__.py 
b/python/pyspark/pandas/tests/resample/__init__.py
new file mode 100644
index 000000000000..cce3acad34a4
--- /dev/null
+++ b/python/pyspark/pandas/tests/resample/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/python/pyspark/pandas/tests/resample/test_error.py 
b/python/pyspark/pandas/tests/resample/test_error.py
new file mode 100644
index 000000000000..15b5df7b3b80
--- /dev/null
+++ b/python/pyspark/pandas/tests/resample/test_error.py
@@ -0,0 +1,94 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+import datetime
+
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class ResampleErrorMixin:
+    def test_resample_error(self):
+        psdf = ps.range(10)
+
+        with self.assertRaisesRegex(
+            NotImplementedError, "resample currently works only for 
DatetimeIndex"
+        ):
+            psdf.resample("3Y").sum()
+
+        with self.assertRaisesRegex(
+            NotImplementedError, "resample currently works only for 
DatetimeIndex"
+        ):
+            psdf.id.resample("3Y").sum()
+
+        dates = [
+            datetime.datetime(2012, 1, 2),
+            datetime.datetime(2012, 5, 3),
+            datetime.datetime(2022, 5, 3),
+            pd.NaT,
+        ]
+        pdf = pd.DataFrame(np.ones(len(dates)), index=pd.DatetimeIndex(dates), 
columns=["A"])
+        psdf = ps.from_pandas(pdf)
+
+        with self.assertRaisesRegex(ValueError, "rule code W-SUN is not 
supported"):
+            psdf.A.resample("3W").sum()
+
+        with self.assertRaisesRegex(ValueError, "rule offset must be 
positive"):
+            psdf.A.resample("0Y").sum()
+
+        with self.assertRaisesRegex(ValueError, "invalid closed: 'middle'"):
+            psdf.A.resample("3Y", closed="middle").sum()
+
+        with self.assertRaisesRegex(ValueError, "invalid label: 'both'"):
+            psdf.A.resample("3Y", label="both").sum()
+
+        with self.assertRaisesRegex(
+            NotImplementedError, "`on` currently works only for TimestampType"
+        ):
+            psdf.A.resample("2D", on=psdf.A).sum()
+
+        with self.assertRaisesRegex(
+            NotImplementedError, "`on` currently works only for TimestampType"
+        ):
+            psdf[["A"]].resample("2D", on=psdf.A).sum()
+
+        psdf["B"] = ["a", "b", "c", "d"]
+        with self.assertRaisesRegex(ValueError, "No available aggregation 
columns!"):
+            psdf.B.resample("2D").sum()
+
+        with self.assertRaisesRegex(ValueError, "No available aggregation 
columns!"):
+            psdf[[]].resample("2D").sum()
+
+
+class ResampleErrorTests(ResampleErrorMixin, PandasOnSparkTestCase, TestUtils):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.resample.test_error import *  # noqa: F401
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", 
verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/resample/test_missing.py 
b/python/pyspark/pandas/tests/resample/test_missing.py
new file mode 100644
index 000000000000..07dee20bad4f
--- /dev/null
+++ b/python/pyspark/pandas/tests/resample/test_missing.py
@@ -0,0 +1,141 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import unittest
+import inspect
+import datetime
+
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.pandas.exceptions import PandasNotImplementedError
+from pyspark.pandas.missing.resample import (
+    MissingPandasLikeDataFrameResampler,
+    MissingPandasLikeSeriesResampler,
+)
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class ResampleMissingMixin:
+    @property
+    def pdf1(self):
+        np.random.seed(11)
+        dates = [
+            pd.NaT,
+            datetime.datetime(2011, 12, 31),
+            datetime.datetime(2011, 12, 31, 0, 0, 1),
+            datetime.datetime(2011, 12, 31, 23, 59, 59),
+            datetime.datetime(2012, 1, 1),
+            datetime.datetime(2012, 1, 1, 0, 0, 1),
+            pd.NaT,
+            datetime.datetime(2012, 1, 1, 23, 59, 59),
+            datetime.datetime(2012, 1, 2),
+            pd.NaT,
+            datetime.datetime(2012, 1, 30, 23, 59, 59),
+            datetime.datetime(2012, 1, 31),
+            datetime.datetime(2012, 1, 31, 0, 0, 1),
+            datetime.datetime(2012, 3, 31),
+            datetime.datetime(2013, 5, 3),
+            datetime.datetime(2022, 5, 3),
+        ]
+        return pd.DataFrame(
+            np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), 
columns=list("AB")
+        )
+
+    @property
+    def psdf1(self):
+        return ps.from_pandas(self.pdf1)
+
+    def test_missing(self):
+        pdf_r = self.psdf1.resample("3Y")
+        pser_r = self.psdf1.A.resample("3Y")
+
+        # DataFrameResampler functions
+        missing_functions = inspect.getmembers(
+            MissingPandasLikeDataFrameResampler, inspect.isfunction
+        )
+        unsupported_functions = [
+            name for (name, type_) in missing_functions if type_.__name__ == 
"unsupported_function"
+        ]
+        for name in unsupported_functions:
+            with self.assertRaisesRegex(
+                PandasNotImplementedError,
+                "method.*Resampler.*{}.*not implemented( yet\\.|\\. 
.+)".format(name),
+            ):
+                getattr(pdf_r, name)()
+
+        # SeriesResampler functions
+        missing_functions = 
inspect.getmembers(MissingPandasLikeSeriesResampler, inspect.isfunction)
+        unsupported_functions = [
+            name for (name, type_) in missing_functions if type_.__name__ == 
"unsupported_function"
+        ]
+        for name in unsupported_functions:
+            with self.assertRaisesRegex(
+                PandasNotImplementedError,
+                "method.*Resampler.*{}.*not implemented( yet\\.|\\. 
.+)".format(name),
+            ):
+                getattr(pser_r, name)()
+
+        # DataFrameResampler properties
+        missing_properties = inspect.getmembers(
+            MissingPandasLikeDataFrameResampler, lambda o: isinstance(o, 
property)
+        )
+        unsupported_properties = [
+            name
+            for (name, type_) in missing_properties
+            if type_.fget.__name__ == "unsupported_property"
+        ]
+        for name in unsupported_properties:
+            with self.assertRaisesRegex(
+                PandasNotImplementedError,
+                "property.*Resampler.*{}.*not implemented( yet\\.|\\. 
.+)".format(name),
+            ):
+                getattr(pdf_r, name)
+
+        # SeriesResampler properties
+        missing_properties = inspect.getmembers(
+            MissingPandasLikeSeriesResampler, lambda o: isinstance(o, property)
+        )
+        unsupported_properties = [
+            name
+            for (name, type_) in missing_properties
+            if type_.fget.__name__ == "unsupported_property"
+        ]
+        for name in unsupported_properties:
+            with self.assertRaisesRegex(
+                PandasNotImplementedError,
+                "property.*Resampler.*{}.*not implemented( yet\\.|\\. 
.+)".format(name),
+            ):
+                getattr(pser_r, name)
+
+
+class ResampleMissingTests(ResampleMissingMixin, PandasOnSparkTestCase, 
TestUtils):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.resample.test_missing import *  # noqa: F401
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", 
verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/resample/test_on.py 
b/python/pyspark/pandas/tests/resample/test_on.py
new file mode 100644
index 000000000000..6062762fd2e3
--- /dev/null
+++ b/python/pyspark/pandas/tests/resample/test_on.py
@@ -0,0 +1,66 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import unittest
+import datetime
+
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class ResampleOnMixin:
+    def test_resample_on(self):
+        np.random.seed(77)
+        dates = [
+            datetime.datetime(2022, 5, 1, 4, 5, 6),
+            datetime.datetime(2022, 5, 3),
+            datetime.datetime(2022, 5, 3, 23, 59, 59),
+            datetime.datetime(2022, 5, 4),
+            pd.NaT,
+            datetime.datetime(2022, 5, 4, 0, 0, 1),
+            datetime.datetime(2022, 5, 11),
+        ]
+        pdf = pd.DataFrame(
+            np.random.rand(len(dates), 3), index=pd.DatetimeIndex(dates), 
columns=list("ABC")
+        )
+        pdf["X"] = pd.DatetimeIndex(dates)
+        psdf = ps.from_pandas(pdf)
+        self.assert_eq(
+            pdf.resample("2D", on="X").sum().sort_index(),
+            psdf.resample("2D", on=psdf.X).sum().sort_index(),
+            almost=True,
+        )
+
+
+class ResampleOnTests(ResampleOnMixin, PandasOnSparkTestCase, TestUtils):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.resample.test_on import *  # noqa: F401
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", 
verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/resample/test_timezone.py 
b/python/pyspark/pandas/tests/resample/test_timezone.py
new file mode 100644
index 000000000000..17c46dd26b35
--- /dev/null
+++ b/python/pyspark/pandas/tests/resample/test_timezone.py
@@ -0,0 +1,83 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import unittest
+import os
+
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class ResampleTimezoneMixin:
+    timezone = None
+
+    @classmethod
+    def setUpClass(cls):
+        cls.timezone = os.environ.get("TZ", None)
+        os.environ["TZ"] = "America/New_York"
+        super(ResampleTimezoneMixin, cls).setUpClass()
+
+    @classmethod
+    def tearDownClass(cls):
+        super(ResampleTimezoneMixin, cls).tearDownClass()
+        if cls.timezone is not None:
+            os.environ["TZ"] = cls.timezone
+
+    @property
+    def pdf(self):
+        np.random.seed(22)
+        index = pd.date_range(start="2011-01-02", end="2022-05-01", freq="1D")
+        return pd.DataFrame(np.random.rand(len(index), 2), index=index, 
columns=list("AB"))
+
+    @property
+    def psdf(self):
+        return ps.from_pandas(self.pdf)
+
+    def test_series_resample_with_timezone(self):
+        with self.sql_conf(
+            {
+                "spark.sql.session.timeZone": "Asia/Seoul",
+                "spark.sql.timestampType": "TIMESTAMP_NTZ",
+            }
+        ):
+            p_resample = self.pdf.resample(rule="1001H", closed="right", 
label="right")
+            ps_resample = self.psdf.resample(rule="1001H", closed="right", 
label="right")
+            self.assert_eq(
+                p_resample.sum().sort_index(),
+                ps_resample.sum().sort_index(),
+                almost=True,
+            )
+
+
+class ResampleTimezoneTests(ResampleTimezoneMixin, PandasOnSparkTestCase, 
TestUtils):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.resample.test_timezone import *  # noqa: F401
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", 
verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/test_resample.py 
b/python/pyspark/pandas/tests/test_resample.py
deleted file mode 100644
index c6005247f013..000000000000
--- a/python/pyspark/pandas/tests/test_resample.py
+++ /dev/null
@@ -1,320 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-
-import unittest
-import inspect
-import datetime
-import os
-
-import numpy as np
-import pandas as pd
-
-from pyspark import pandas as ps
-from pyspark.pandas.exceptions import PandasNotImplementedError
-from pyspark.pandas.missing.resample import (
-    MissingPandasLikeDataFrameResampler,
-    MissingPandasLikeSeriesResampler,
-)
-from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
-
-
-class ResampleTestsMixin:
-    @property
-    def pdf1(self):
-        np.random.seed(11)
-        dates = [
-            pd.NaT,
-            datetime.datetime(2011, 12, 31),
-            datetime.datetime(2011, 12, 31, 0, 0, 1),
-            datetime.datetime(2011, 12, 31, 23, 59, 59),
-            datetime.datetime(2012, 1, 1),
-            datetime.datetime(2012, 1, 1, 0, 0, 1),
-            pd.NaT,
-            datetime.datetime(2012, 1, 1, 23, 59, 59),
-            datetime.datetime(2012, 1, 2),
-            pd.NaT,
-            datetime.datetime(2012, 1, 30, 23, 59, 59),
-            datetime.datetime(2012, 1, 31),
-            datetime.datetime(2012, 1, 31, 0, 0, 1),
-            datetime.datetime(2012, 3, 31),
-            datetime.datetime(2013, 5, 3),
-            datetime.datetime(2022, 5, 3),
-        ]
-        return pd.DataFrame(
-            np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=list("AB")
-        )
-
-    @property
-    def pdf2(self):
-        np.random.seed(22)
-        dates = [
-            datetime.datetime(2022, 5, 1, 4, 5, 6),
-            datetime.datetime(2022, 5, 3),
-            datetime.datetime(2022, 5, 3, 23, 59, 59),
-            datetime.datetime(2022, 5, 4),
-            pd.NaT,
-            datetime.datetime(2022, 5, 4, 0, 0, 1),
-            datetime.datetime(2022, 5, 11),
-        ]
-        return pd.DataFrame(
-            np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=list("AB")
-        )
-
-    @property
-    def pdf3(self):
-        np.random.seed(22)
-        index = pd.date_range(start="2011-01-02", end="2022-05-01", freq="1D")
-        return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB"))
-
-    @property
-    def pdf4(self):
-        np.random.seed(33)
-        index = pd.date_range(start="2020-12-12", end="2022-05-01", freq="1H")
-        return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB"))
-
-    @property
-    def pdf5(self):
-        np.random.seed(44)
-        index = pd.date_range(start="2021-12-30 03:04:05", end="2022-01-02 06:07:08", freq="1T")
-        return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB"))
-
-    @property
-    def pdf6(self):
-        np.random.seed(55)
-        index = pd.date_range(start="2022-05-02 03:04:05", end="2022-05-02 06:07:08", freq="1S")
-        return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB"))
-
-    @property
-    def psdf1(self):
-        return ps.from_pandas(self.pdf1)
-
-    @property
-    def psdf2(self):
-        return ps.from_pandas(self.pdf2)
-
-    @property
-    def psdf3(self):
-        return ps.from_pandas(self.pdf3)
-
-    @property
-    def psdf4(self):
-        return ps.from_pandas(self.pdf4)
-
-    @property
-    def psdf5(self):
-        return ps.from_pandas(self.pdf5)
-
-    @property
-    def psdf6(self):
-        return ps.from_pandas(self.pdf6)
-
-    def test_resample_error(self):
-        psdf = ps.range(10)
-
-        with self.assertRaisesRegex(
-            NotImplementedError, "resample currently works only for DatetimeIndex"
-        ):
-            psdf.resample("3Y").sum()
-
-        with self.assertRaisesRegex(
-            NotImplementedError, "resample currently works only for DatetimeIndex"
-        ):
-            psdf.id.resample("3Y").sum()
-
-        dates = [
-            datetime.datetime(2012, 1, 2),
-            datetime.datetime(2012, 5, 3),
-            datetime.datetime(2022, 5, 3),
-            pd.NaT,
-        ]
-        pdf = pd.DataFrame(np.ones(len(dates)), index=pd.DatetimeIndex(dates), columns=["A"])
-        psdf = ps.from_pandas(pdf)
-
-        with self.assertRaisesRegex(ValueError, "rule code W-SUN is not supported"):
-            psdf.A.resample("3W").sum()
-
-        with self.assertRaisesRegex(ValueError, "rule offset must be positive"):
-            psdf.A.resample("0Y").sum()
-
-        with self.assertRaisesRegex(ValueError, "invalid closed: 'middle'"):
-            psdf.A.resample("3Y", closed="middle").sum()
-
-        with self.assertRaisesRegex(ValueError, "invalid label: 'both'"):
-            psdf.A.resample("3Y", label="both").sum()
-
-        with self.assertRaisesRegex(
-            NotImplementedError, "`on` currently works only for TimestampType"
-        ):
-            psdf.A.resample("2D", on=psdf.A).sum()
-
-        with self.assertRaisesRegex(
-            NotImplementedError, "`on` currently works only for TimestampType"
-        ):
-            psdf[["A"]].resample("2D", on=psdf.A).sum()
-
-        psdf["B"] = ["a", "b", "c", "d"]
-        with self.assertRaisesRegex(ValueError, "No available aggregation columns!"):
-            psdf.B.resample("2D").sum()
-
-        with self.assertRaisesRegex(ValueError, "No available aggregation columns!"):
-            psdf[[]].resample("2D").sum()
-
-    def test_missing(self):
-        pdf_r = self.psdf1.resample("3Y")
-        pser_r = self.psdf1.A.resample("3Y")
-
-        # DataFrameResampler functions
-        missing_functions = inspect.getmembers(
-            MissingPandasLikeDataFrameResampler, inspect.isfunction
-        )
-        unsupported_functions = [
-            name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function"
-        ]
-        for name in unsupported_functions:
-            with self.assertRaisesRegex(
-                PandasNotImplementedError,
-                "method.*Resampler.*{}.*not implemented( yet\\.|\\. .+)".format(name),
-            ):
-                getattr(pdf_r, name)()
-
-        # SeriesResampler functions
-        missing_functions = inspect.getmembers(MissingPandasLikeSeriesResampler, inspect.isfunction)
-        unsupported_functions = [
-            name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function"
-        ]
-        for name in unsupported_functions:
-            with self.assertRaisesRegex(
-                PandasNotImplementedError,
-                "method.*Resampler.*{}.*not implemented( yet\\.|\\. .+)".format(name),
-            ):
-                getattr(pser_r, name)()
-
-        # DataFrameResampler properties
-        missing_properties = inspect.getmembers(
-            MissingPandasLikeDataFrameResampler, lambda o: isinstance(o, property)
-        )
-        unsupported_properties = [
-            name
-            for (name, type_) in missing_properties
-            if type_.fget.__name__ == "unsupported_property"
-        ]
-        for name in unsupported_properties:
-            with self.assertRaisesRegex(
-                PandasNotImplementedError,
-                "property.*Resampler.*{}.*not implemented( yet\\.|\\. .+)".format(name),
-            ):
-                getattr(pdf_r, name)
-
-        # SeriesResampler properties
-        missing_properties = inspect.getmembers(
-            MissingPandasLikeSeriesResampler, lambda o: isinstance(o, property)
-        )
-        unsupported_properties = [
-            name
-            for (name, type_) in missing_properties
-            if type_.fget.__name__ == "unsupported_property"
-        ]
-        for name in unsupported_properties:
-            with self.assertRaisesRegex(
-                PandasNotImplementedError,
-                "property.*Resampler.*{}.*not implemented( yet\\.|\\. .+)".format(name),
-            ):
-                getattr(pser_r, name)
-
-    def test_resample_on(self):
-        np.random.seed(77)
-        dates = [
-            datetime.datetime(2022, 5, 1, 4, 5, 6),
-            datetime.datetime(2022, 5, 3),
-            datetime.datetime(2022, 5, 3, 23, 59, 59),
-            datetime.datetime(2022, 5, 4),
-            pd.NaT,
-            datetime.datetime(2022, 5, 4, 0, 0, 1),
-            datetime.datetime(2022, 5, 11),
-        ]
-        pdf = pd.DataFrame(
-            np.random.rand(len(dates), 3), index=pd.DatetimeIndex(dates), columns=list("ABC")
-        )
-        pdf["X"] = pd.DatetimeIndex(dates)
-        psdf = ps.from_pandas(pdf)
-        self.assert_eq(
-            pdf.resample("2D", on="X").sum().sort_index(),
-            psdf.resample("2D", on=psdf.X).sum().sort_index(),
-            almost=True,
-        )
-
-
-class ResampleWithTimezoneMixin:
-    timezone = None
-
-    @classmethod
-    def setUpClass(cls):
-        cls.timezone = os.environ.get("TZ", None)
-        os.environ["TZ"] = "America/New_York"
-        super(ResampleWithTimezoneMixin, cls).setUpClass()
-
-    @classmethod
-    def tearDownClass(cls):
-        super(ResampleWithTimezoneMixin, cls).tearDownClass()
-        if cls.timezone is not None:
-            os.environ["TZ"] = cls.timezone
-
-    @property
-    def pdf(self):
-        np.random.seed(22)
-        index = pd.date_range(start="2011-01-02", end="2022-05-01", freq="1D")
-        return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB"))
-
-    @property
-    def psdf(self):
-        return ps.from_pandas(self.pdf)
-
-    def test_series_resample_with_timezone(self):
-        with self.sql_conf(
-            {
-                "spark.sql.session.timeZone": "Asia/Seoul",
-                "spark.sql.timestampType": "TIMESTAMP_NTZ",
-            }
-        ):
-            p_resample = self.pdf.resample(rule="1001H", closed="right", label="right")
-            ps_resample = self.psdf.resample(rule="1001H", closed="right", label="right")
-            self.assert_eq(
-                p_resample.sum().sort_index(),
-                ps_resample.sum().sort_index(),
-                almost=True,
-            )
-
-
-class ResampleTests(ResampleTestsMixin, PandasOnSparkTestCase, TestUtils):
-    pass
-
-
-class ResampleWithTimezoneTests(ResampleWithTimezoneMixin, PandasOnSparkTestCase, TestUtils):
-    pass
-
-
-if __name__ == "__main__":
-    from pyspark.pandas.tests.test_resample import *  # noqa: F401
-
-    try:
-        import xmlrunner
-
-        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
-    except ImportError:
-        testRunner = None
-    unittest.main(testRunner=testRunner, verbosity=2)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-46264][PS][CONNECT][TESTS] Re-organize the resampling tests

Reply via email to