This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 12817035c575 [SPARK-46264][PS][CONNECT][TESTS] Re-organize the resampling tests 12817035c575 is described below commit 12817035c57505c5eeea228d5184c4ab629c95d4 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Tue Dec 5 18:31:24 2023 +0800 [SPARK-46264][PS][CONNECT][TESTS] Re-organize the resampling tests ### What changes were proposed in this pull request? Re-organize the resampling tests ### Why are the changes needed? Re-organize the resampling tests by topics to be more consistent with [Pandas tests](https://github.com/pandas-dev/pandas/tree/main/pandas/tests) ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44180 from zhengruifeng/reorg_test_resample. Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- dev/sparktestsupport/modules.py | 10 +- .../pandas/tests/connect/resample/__init__.py | 16 ++ .../test_parity_error.py} | 16 +- .../test_parity_missing.py} | 16 +- .../test_parity_on.py} | 16 +- .../test_parity_timezone.py} | 16 +- python/pyspark/pandas/tests/resample/__init__.py | 16 ++ python/pyspark/pandas/tests/resample/test_error.py | 94 ++++++ .../pyspark/pandas/tests/resample/test_missing.py | 141 +++++++++ python/pyspark/pandas/tests/resample/test_on.py | 66 +++++ .../pyspark/pandas/tests/resample/test_timezone.py | 83 ++++++ python/pyspark/pandas/tests/test_resample.py | 320 --------------------- 12 files changed, 442 insertions(+), 368 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 15b2e8f186e5..f35c42d11e58 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -740,7 +740,10 @@ pyspark_pandas = Module( "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding", 
"pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling", "pyspark.pandas.tests.test_repr", - "pyspark.pandas.tests.test_resample", + "pyspark.pandas.tests.resample.test_on", + "pyspark.pandas.tests.resample.test_error", + "pyspark.pandas.tests.resample.test_missing", + "pyspark.pandas.tests.resample.test_timezone", "pyspark.pandas.tests.test_frame_resample", "pyspark.pandas.tests.test_series_resample", "pyspark.pandas.tests.test_reshape", @@ -984,7 +987,10 @@ pyspark_pandas_connect_part0 = Module( "pyspark.pandas.tests.connect.test_parity_namespace", "pyspark.pandas.tests.connect.test_parity_numpy_compat", "pyspark.pandas.tests.connect.test_parity_repr", - "pyspark.pandas.tests.connect.test_parity_resample", + "pyspark.pandas.tests.connect.resample.test_parity_error", + "pyspark.pandas.tests.connect.resample.test_parity_missing", + "pyspark.pandas.tests.connect.resample.test_parity_on", + "pyspark.pandas.tests.connect.resample.test_parity_timezone", "pyspark.pandas.tests.connect.test_parity_scalars", "pyspark.pandas.tests.connect.test_parity_series_conversion", "pyspark.pandas.tests.connect.test_parity_series_datetime", diff --git a/python/pyspark/pandas/tests/connect/resample/__init__.py b/python/pyspark/pandas/tests/connect/resample/__init__.py new file mode 100644 index 000000000000..cce3acad34a4 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/resample/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py b/python/pyspark/pandas/tests/connect/resample/test_parity_error.py similarity index 70% copy from python/pyspark/pandas/tests/connect/test_parity_resample.py copy to python/pyspark/pandas/tests/connect/resample/test_parity_error.py index caca2f957b50..f6365ee8dc87 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_resample.py +++ b/python/pyspark/pandas/tests/connect/resample/test_parity_error.py @@ -16,25 +16,17 @@ # import unittest -from pyspark.pandas.tests.test_resample import ResampleTestsMixin, ResampleWithTimezoneMixin +from pyspark.pandas.tests.resample.test_error import ResampleErrorMixin from pyspark.testing.connectutils import ReusedConnectTestCase -from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class ResampleParityTests( - ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase -): - pass - - -class ResampleWithTimezoneTests( - ResampleWithTimezoneMixin, PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase -): +class ResampleParityErrorTests(ResampleErrorMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.test_parity_resample import * # noqa: F401 + from pyspark.pandas.tests.connect.resample.test_parity_error import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py 
b/python/pyspark/pandas/tests/connect/resample/test_parity_missing.py similarity index 70% copy from python/pyspark/pandas/tests/connect/test_parity_resample.py copy to python/pyspark/pandas/tests/connect/resample/test_parity_missing.py index caca2f957b50..49e541084609 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_resample.py +++ b/python/pyspark/pandas/tests/connect/resample/test_parity_missing.py @@ -16,25 +16,19 @@ # import unittest -from pyspark.pandas.tests.test_resample import ResampleTestsMixin, ResampleWithTimezoneMixin +from pyspark.pandas.tests.resample.test_missing import ResampleMissingMixin from pyspark.testing.connectutils import ReusedConnectTestCase -from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class ResampleParityTests( - ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase -): - pass - - -class ResampleWithTimezoneTests( - ResampleWithTimezoneMixin, PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase +class ResampleParityMissingTests( + ResampleMissingMixin, PandasOnSparkTestUtils, ReusedConnectTestCase ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.test_parity_resample import * # noqa: F401 + from pyspark.pandas.tests.connect.resample.test_parity_missing import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py b/python/pyspark/pandas/tests/connect/resample/test_parity_on.py similarity index 70% copy from python/pyspark/pandas/tests/connect/test_parity_resample.py copy to python/pyspark/pandas/tests/connect/resample/test_parity_on.py index caca2f957b50..d4b767329fb8 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_resample.py +++ b/python/pyspark/pandas/tests/connect/resample/test_parity_on.py @@ -16,25 +16,17 @@ # import unittest -from pyspark.pandas.tests.test_resample import 
ResampleTestsMixin, ResampleWithTimezoneMixin +from pyspark.pandas.tests.resample.test_on import ResampleOnMixin from pyspark.testing.connectutils import ReusedConnectTestCase -from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class ResampleParityTests( - ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase -): - pass - - -class ResampleWithTimezoneTests( - ResampleWithTimezoneMixin, PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase -): +class ResampleParityOnTests(ResampleOnMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.test_parity_resample import * # noqa: F401 + from pyspark.pandas.tests.connect.resample.test_parity_on import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py b/python/pyspark/pandas/tests/connect/resample/test_parity_timezone.py similarity index 70% rename from python/pyspark/pandas/tests/connect/test_parity_resample.py rename to python/pyspark/pandas/tests/connect/resample/test_parity_timezone.py index caca2f957b50..0e8f36a51168 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_resample.py +++ b/python/pyspark/pandas/tests/connect/resample/test_parity_timezone.py @@ -16,25 +16,19 @@ # import unittest -from pyspark.pandas.tests.test_resample import ResampleTestsMixin, ResampleWithTimezoneMixin +from pyspark.pandas.tests.resample.test_timezone import ResampleTimezoneMixin from pyspark.testing.connectutils import ReusedConnectTestCase -from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class ResampleParityTests( - ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase -): - pass - - -class ResampleWithTimezoneTests( - ResampleWithTimezoneMixin, 
PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase +class ResampleParityTimezoneTests( + ResampleTimezoneMixin, PandasOnSparkTestUtils, ReusedConnectTestCase ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.test_parity_resample import * # noqa: F401 + from pyspark.pandas.tests.connect.resample.test_parity_timezone import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/resample/__init__.py b/python/pyspark/pandas/tests/resample/__init__.py new file mode 100644 index 000000000000..cce3acad34a4 --- /dev/null +++ b/python/pyspark/pandas/tests/resample/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pyspark/pandas/tests/resample/test_error.py b/python/pyspark/pandas/tests/resample/test_error.py new file mode 100644 index 000000000000..15b5df7b3b80 --- /dev/null +++ b/python/pyspark/pandas/tests/resample/test_error.py @@ -0,0 +1,94 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest +import datetime + +import numpy as np +import pandas as pd + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils + + +class ResampleErrorMixin: + def test_resample_error(self): + psdf = ps.range(10) + + with self.assertRaisesRegex( + NotImplementedError, "resample currently works only for DatetimeIndex" + ): + psdf.resample("3Y").sum() + + with self.assertRaisesRegex( + NotImplementedError, "resample currently works only for DatetimeIndex" + ): + psdf.id.resample("3Y").sum() + + dates = [ + datetime.datetime(2012, 1, 2), + datetime.datetime(2012, 5, 3), + datetime.datetime(2022, 5, 3), + pd.NaT, + ] + pdf = pd.DataFrame(np.ones(len(dates)), index=pd.DatetimeIndex(dates), columns=["A"]) + psdf = ps.from_pandas(pdf) + + with self.assertRaisesRegex(ValueError, "rule code W-SUN is not supported"): + psdf.A.resample("3W").sum() + + with self.assertRaisesRegex(ValueError, "rule offset must be positive"): + psdf.A.resample("0Y").sum() + + with self.assertRaisesRegex(ValueError, "invalid closed: 'middle'"): + psdf.A.resample("3Y", closed="middle").sum() + + with self.assertRaisesRegex(ValueError, "invalid label: 'both'"): + psdf.A.resample("3Y", label="both").sum() + + with self.assertRaisesRegex( + NotImplementedError, "`on` currently works only for TimestampType" + ): + psdf.A.resample("2D", on=psdf.A).sum() + + with 
self.assertRaisesRegex( + NotImplementedError, "`on` currently works only for TimestampType" + ): + psdf[["A"]].resample("2D", on=psdf.A).sum() + + psdf["B"] = ["a", "b", "c", "d"] + with self.assertRaisesRegex(ValueError, "No available aggregation columns!"): + psdf.B.resample("2D").sum() + + with self.assertRaisesRegex(ValueError, "No available aggregation columns!"): + psdf[[]].resample("2D").sum() + + +class ResampleErrorTests(ResampleErrorMixin, PandasOnSparkTestCase, TestUtils): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.resample.test_error import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/resample/test_missing.py b/python/pyspark/pandas/tests/resample/test_missing.py new file mode 100644 index 000000000000..07dee20bad4f --- /dev/null +++ b/python/pyspark/pandas/tests/resample/test_missing.py @@ -0,0 +1,141 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +import unittest +import inspect +import datetime + +import numpy as np +import pandas as pd + +from pyspark import pandas as ps +from pyspark.pandas.exceptions import PandasNotImplementedError +from pyspark.pandas.missing.resample import ( + MissingPandasLikeDataFrameResampler, + MissingPandasLikeSeriesResampler, +) +from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils + + +class ResampleMissingMixin: + @property + def pdf1(self): + np.random.seed(11) + dates = [ + pd.NaT, + datetime.datetime(2011, 12, 31), + datetime.datetime(2011, 12, 31, 0, 0, 1), + datetime.datetime(2011, 12, 31, 23, 59, 59), + datetime.datetime(2012, 1, 1), + datetime.datetime(2012, 1, 1, 0, 0, 1), + pd.NaT, + datetime.datetime(2012, 1, 1, 23, 59, 59), + datetime.datetime(2012, 1, 2), + pd.NaT, + datetime.datetime(2012, 1, 30, 23, 59, 59), + datetime.datetime(2012, 1, 31), + datetime.datetime(2012, 1, 31, 0, 0, 1), + datetime.datetime(2012, 3, 31), + datetime.datetime(2013, 5, 3), + datetime.datetime(2022, 5, 3), + ] + return pd.DataFrame( + np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=list("AB") + ) + + @property + def psdf1(self): + return ps.from_pandas(self.pdf1) + + def test_missing(self): + pdf_r = self.psdf1.resample("3Y") + pser_r = self.psdf1.A.resample("3Y") + + # DataFrameResampler functions + missing_functions = inspect.getmembers( + MissingPandasLikeDataFrameResampler, inspect.isfunction + ) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] + for name in unsupported_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, + "method.*Resampler.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), + ): + getattr(pdf_r, name)() + + # SeriesResampler functions + missing_functions = inspect.getmembers(MissingPandasLikeSeriesResampler, inspect.isfunction) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] + for name in unsupported_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, + "method.*Resampler.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(pser_r, name)() + + # DataFrameResampler properties + missing_properties = inspect.getmembers( + MissingPandasLikeDataFrameResampler, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] + for name in unsupported_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, + "property.*Resampler.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(pdf_r, name) + + # SeriesResampler properties + missing_properties = inspect.getmembers( + MissingPandasLikeSeriesResampler, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] + for name in unsupported_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, + "property.*Resampler.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), + ): + getattr(pser_r, name) + + +class ResampleMissingTests(ResampleMissingMixin, PandasOnSparkTestCase, TestUtils): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.resample.test_missing import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/resample/test_on.py b/python/pyspark/pandas/tests/resample/test_on.py new file mode 100644 index 000000000000..6062762fd2e3 --- /dev/null +++ b/python/pyspark/pandas/tests/resample/test_on.py @@ -0,0 +1,66 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +import unittest +import datetime + +import numpy as np +import pandas as pd + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils + + +class ResampleOnMixin: + def test_resample_on(self): + np.random.seed(77) + dates = [ + datetime.datetime(2022, 5, 1, 4, 5, 6), + datetime.datetime(2022, 5, 3), + datetime.datetime(2022, 5, 3, 23, 59, 59), + datetime.datetime(2022, 5, 4), + pd.NaT, + datetime.datetime(2022, 5, 4, 0, 0, 1), + datetime.datetime(2022, 5, 11), + ] + pdf = pd.DataFrame( + np.random.rand(len(dates), 3), index=pd.DatetimeIndex(dates), columns=list("ABC") + ) + pdf["X"] = pd.DatetimeIndex(dates) + psdf = ps.from_pandas(pdf) + self.assert_eq( + pdf.resample("2D", on="X").sum().sort_index(), + psdf.resample("2D", on=psdf.X).sum().sort_index(), + almost=True, + ) + + +class ResampleOnTests(ResampleOnMixin, PandasOnSparkTestCase, TestUtils): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.resample.test_on import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/resample/test_timezone.py b/python/pyspark/pandas/tests/resample/test_timezone.py new file mode 100644 index 000000000000..17c46dd26b35 --- /dev/null +++ b/python/pyspark/pandas/tests/resample/test_timezone.py @@ -0,0 +1,83 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +import unittest +import os + +import numpy as np +import pandas as pd + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils + + +class ResampleTimezoneMixin: + timezone = None + + @classmethod + def setUpClass(cls): + cls.timezone = os.environ.get("TZ", None) + os.environ["TZ"] = "America/New_York" + super(ResampleTimezoneMixin, cls).setUpClass() + + @classmethod + def tearDownClass(cls): + super(ResampleTimezoneMixin, cls).tearDownClass() + if cls.timezone is not None: + os.environ["TZ"] = cls.timezone + + @property + def pdf(self): + np.random.seed(22) + index = pd.date_range(start="2011-01-02", end="2022-05-01", freq="1D") + return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB")) + + @property + def psdf(self): + return ps.from_pandas(self.pdf) + + def test_series_resample_with_timezone(self): + with self.sql_conf( + { + "spark.sql.session.timeZone": "Asia/Seoul", + "spark.sql.timestampType": "TIMESTAMP_NTZ", + } + ): + p_resample = self.pdf.resample(rule="1001H", closed="right", label="right") + ps_resample = self.psdf.resample(rule="1001H", closed="right", label="right") + self.assert_eq( + p_resample.sum().sort_index(), + ps_resample.sum().sort_index(), + almost=True, + ) + + +class ResampleTimezoneTests(ResampleTimezoneMixin, PandasOnSparkTestCase, TestUtils): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.resample.test_timezone import * # noqa: F401 + + try: + import xmlrunner + + testRunner = 
xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/test_resample.py b/python/pyspark/pandas/tests/test_resample.py deleted file mode 100644 index c6005247f013..000000000000 --- a/python/pyspark/pandas/tests/test_resample.py +++ /dev/null @@ -1,320 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - - -import unittest -import inspect -import datetime -import os - -import numpy as np -import pandas as pd - -from pyspark import pandas as ps -from pyspark.pandas.exceptions import PandasNotImplementedError -from pyspark.pandas.missing.resample import ( - MissingPandasLikeDataFrameResampler, - MissingPandasLikeSeriesResampler, -) -from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils - - -class ResampleTestsMixin: - @property - def pdf1(self): - np.random.seed(11) - dates = [ - pd.NaT, - datetime.datetime(2011, 12, 31), - datetime.datetime(2011, 12, 31, 0, 0, 1), - datetime.datetime(2011, 12, 31, 23, 59, 59), - datetime.datetime(2012, 1, 1), - datetime.datetime(2012, 1, 1, 0, 0, 1), - pd.NaT, - datetime.datetime(2012, 1, 1, 23, 59, 59), - datetime.datetime(2012, 1, 2), - pd.NaT, - datetime.datetime(2012, 1, 30, 23, 59, 59), - datetime.datetime(2012, 1, 31), - datetime.datetime(2012, 1, 31, 0, 0, 1), - datetime.datetime(2012, 3, 31), - datetime.datetime(2013, 5, 3), - datetime.datetime(2022, 5, 3), - ] - return pd.DataFrame( - np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=list("AB") - ) - - @property - def pdf2(self): - np.random.seed(22) - dates = [ - datetime.datetime(2022, 5, 1, 4, 5, 6), - datetime.datetime(2022, 5, 3), - datetime.datetime(2022, 5, 3, 23, 59, 59), - datetime.datetime(2022, 5, 4), - pd.NaT, - datetime.datetime(2022, 5, 4, 0, 0, 1), - datetime.datetime(2022, 5, 11), - ] - return pd.DataFrame( - np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates), columns=list("AB") - ) - - @property - def pdf3(self): - np.random.seed(22) - index = pd.date_range(start="2011-01-02", end="2022-05-01", freq="1D") - return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB")) - - @property - def pdf4(self): - np.random.seed(33) - index = pd.date_range(start="2020-12-12", end="2022-05-01", freq="1H") - return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB")) - - 
@property - def pdf5(self): - np.random.seed(44) - index = pd.date_range(start="2021-12-30 03:04:05", end="2022-01-02 06:07:08", freq="1T") - return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB")) - - @property - def pdf6(self): - np.random.seed(55) - index = pd.date_range(start="2022-05-02 03:04:05", end="2022-05-02 06:07:08", freq="1S") - return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB")) - - @property - def psdf1(self): - return ps.from_pandas(self.pdf1) - - @property - def psdf2(self): - return ps.from_pandas(self.pdf2) - - @property - def psdf3(self): - return ps.from_pandas(self.pdf3) - - @property - def psdf4(self): - return ps.from_pandas(self.pdf4) - - @property - def psdf5(self): - return ps.from_pandas(self.pdf5) - - @property - def psdf6(self): - return ps.from_pandas(self.pdf6) - - def test_resample_error(self): - psdf = ps.range(10) - - with self.assertRaisesRegex( - NotImplementedError, "resample currently works only for DatetimeIndex" - ): - psdf.resample("3Y").sum() - - with self.assertRaisesRegex( - NotImplementedError, "resample currently works only for DatetimeIndex" - ): - psdf.id.resample("3Y").sum() - - dates = [ - datetime.datetime(2012, 1, 2), - datetime.datetime(2012, 5, 3), - datetime.datetime(2022, 5, 3), - pd.NaT, - ] - pdf = pd.DataFrame(np.ones(len(dates)), index=pd.DatetimeIndex(dates), columns=["A"]) - psdf = ps.from_pandas(pdf) - - with self.assertRaisesRegex(ValueError, "rule code W-SUN is not supported"): - psdf.A.resample("3W").sum() - - with self.assertRaisesRegex(ValueError, "rule offset must be positive"): - psdf.A.resample("0Y").sum() - - with self.assertRaisesRegex(ValueError, "invalid closed: 'middle'"): - psdf.A.resample("3Y", closed="middle").sum() - - with self.assertRaisesRegex(ValueError, "invalid label: 'both'"): - psdf.A.resample("3Y", label="both").sum() - - with self.assertRaisesRegex( - NotImplementedError, "`on` currently works only for TimestampType" 
- ): - psdf.A.resample("2D", on=psdf.A).sum() - - with self.assertRaisesRegex( - NotImplementedError, "`on` currently works only for TimestampType" - ): - psdf[["A"]].resample("2D", on=psdf.A).sum() - - psdf["B"] = ["a", "b", "c", "d"] - with self.assertRaisesRegex(ValueError, "No available aggregation columns!"): - psdf.B.resample("2D").sum() - - with self.assertRaisesRegex(ValueError, "No available aggregation columns!"): - psdf[[]].resample("2D").sum() - - def test_missing(self): - pdf_r = self.psdf1.resample("3Y") - pser_r = self.psdf1.A.resample("3Y") - - # DataFrameResampler functions - missing_functions = inspect.getmembers( - MissingPandasLikeDataFrameResampler, inspect.isfunction - ) - unsupported_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" - ] - for name in unsupported_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Resampler.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(pdf_r, name)() - - # SeriesResampler functions - missing_functions = inspect.getmembers(MissingPandasLikeSeriesResampler, inspect.isfunction) - unsupported_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" - ] - for name in unsupported_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Resampler.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(pser_r, name)() - - # DataFrameResampler properties - missing_properties = inspect.getmembers( - MissingPandasLikeDataFrameResampler, lambda o: isinstance(o, property) - ) - unsupported_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "unsupported_property" - ] - for name in unsupported_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Resampler.*{}.*not implemented( yet\\.|\\. 
.+)".format(name), - ): - getattr(pdf_r, name) - - # SeriesResampler properties - missing_properties = inspect.getmembers( - MissingPandasLikeSeriesResampler, lambda o: isinstance(o, property) - ) - unsupported_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "unsupported_property" - ] - for name in unsupported_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Resampler.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(pser_r, name) - - def test_resample_on(self): - np.random.seed(77) - dates = [ - datetime.datetime(2022, 5, 1, 4, 5, 6), - datetime.datetime(2022, 5, 3), - datetime.datetime(2022, 5, 3, 23, 59, 59), - datetime.datetime(2022, 5, 4), - pd.NaT, - datetime.datetime(2022, 5, 4, 0, 0, 1), - datetime.datetime(2022, 5, 11), - ] - pdf = pd.DataFrame( - np.random.rand(len(dates), 3), index=pd.DatetimeIndex(dates), columns=list("ABC") - ) - pdf["X"] = pd.DatetimeIndex(dates) - psdf = ps.from_pandas(pdf) - self.assert_eq( - pdf.resample("2D", on="X").sum().sort_index(), - psdf.resample("2D", on=psdf.X).sum().sort_index(), - almost=True, - ) - - -class ResampleWithTimezoneMixin: - timezone = None - - @classmethod - def setUpClass(cls): - cls.timezone = os.environ.get("TZ", None) - os.environ["TZ"] = "America/New_York" - super(ResampleWithTimezoneMixin, cls).setUpClass() - - @classmethod - def tearDownClass(cls): - super(ResampleWithTimezoneMixin, cls).tearDownClass() - if cls.timezone is not None: - os.environ["TZ"] = cls.timezone - - @property - def pdf(self): - np.random.seed(22) - index = pd.date_range(start="2011-01-02", end="2022-05-01", freq="1D") - return pd.DataFrame(np.random.rand(len(index), 2), index=index, columns=list("AB")) - - @property - def psdf(self): - return ps.from_pandas(self.pdf) - - def test_series_resample_with_timezone(self): - with self.sql_conf( - { - "spark.sql.session.timeZone": "Asia/Seoul", - "spark.sql.timestampType": "TIMESTAMP_NTZ", - 
} - ): - p_resample = self.pdf.resample(rule="1001H", closed="right", label="right") - ps_resample = self.psdf.resample(rule="1001H", closed="right", label="right") - self.assert_eq( - p_resample.sum().sort_index(), - ps_resample.sum().sort_index(), - almost=True, - ) - - -class ResampleTests(ResampleTestsMixin, PandasOnSparkTestCase, TestUtils): - pass - - -class ResampleWithTimezoneTests(ResampleWithTimezoneMixin, PandasOnSparkTestCase, TestUtils): - pass - - -if __name__ == "__main__": - from pyspark.pandas.tests.test_resample import * # noqa: F401 - - try: - import xmlrunner - - testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) - except ImportError: - testRunner = None - unittest.main(testRunner=testRunner, verbosity=2) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org