This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 12817035c575 [SPARK-46264][PS][CONNECT][TESTS] Re-organize the resampling tests
12817035c575 is described below
commit 12817035c57505c5eeea228d5184c4ab629c95d4
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Dec 5 18:31:24 2023 +0800
[SPARK-46264][PS][CONNECT][TESTS] Re-organize the resampling tests
### What changes were proposed in this pull request?
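Re-organize the resampling tests: split the single `test_resample.py` module, and its Spark Connect parity counterpart `test_parity_resample.py`, into `resample/` test packages with one module per topic (`test_error`, `test_missing`, `test_on`, `test_timezone`), and list the new modules in `dev/sparktestsupport/modules.py`.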
### Why are the changes needed?
Re-organize the resampling tests by topic so that the layout is more consistent with the
[pandas tests](https://github.com/pandas-dev/pandas/tree/main/pandas/tests).
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
By CI: the relocated test modules are registered in `dev/sparktestsupport/modules.py`.
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44180 from zhengruifeng/reorg_test_resample.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
dev/sparktestsupport/modules.py | 10 +-
.../pandas/tests/connect/resample/__init__.py | 16 ++
.../test_parity_error.py} | 16 +-
.../test_parity_missing.py} | 16 +-
.../test_parity_on.py} | 16 +-
.../test_parity_timezone.py} | 16 +-
python/pyspark/pandas/tests/resample/__init__.py | 16 ++
python/pyspark/pandas/tests/resample/test_error.py | 94 ++++++
.../pyspark/pandas/tests/resample/test_missing.py | 141 +++++++++
python/pyspark/pandas/tests/resample/test_on.py | 66 +++++
.../pyspark/pandas/tests/resample/test_timezone.py | 83 ++++++
python/pyspark/pandas/tests/test_resample.py | 320 ---------------------
12 files changed, 442 insertions(+), 368 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 15b2e8f186e5..f35c42d11e58 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -740,7 +740,10 @@ pyspark_pandas = Module(
"pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
"pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
"pyspark.pandas.tests.test_repr",
- "pyspark.pandas.tests.test_resample",
+ "pyspark.pandas.tests.resample.test_on",
+ "pyspark.pandas.tests.resample.test_error",
+ "pyspark.pandas.tests.resample.test_missing",
+ "pyspark.pandas.tests.resample.test_timezone",
"pyspark.pandas.tests.test_frame_resample",
"pyspark.pandas.tests.test_series_resample",
"pyspark.pandas.tests.test_reshape",
@@ -984,7 +987,10 @@ pyspark_pandas_connect_part0 = Module(
"pyspark.pandas.tests.connect.test_parity_namespace",
"pyspark.pandas.tests.connect.test_parity_numpy_compat",
"pyspark.pandas.tests.connect.test_parity_repr",
- "pyspark.pandas.tests.connect.test_parity_resample",
+ "pyspark.pandas.tests.connect.resample.test_parity_error",
+ "pyspark.pandas.tests.connect.resample.test_parity_missing",
+ "pyspark.pandas.tests.connect.resample.test_parity_on",
+ "pyspark.pandas.tests.connect.resample.test_parity_timezone",
"pyspark.pandas.tests.connect.test_parity_scalars",
"pyspark.pandas.tests.connect.test_parity_series_conversion",
"pyspark.pandas.tests.connect.test_parity_series_datetime",
diff --git a/python/pyspark/pandas/tests/connect/resample/__init__.py
b/python/pyspark/pandas/tests/connect/resample/__init__.py
new file mode 100644
index 000000000000..cce3acad34a4
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/resample/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py
b/python/pyspark/pandas/tests/connect/resample/test_parity_error.py
similarity index 70%
copy from python/pyspark/pandas/tests/connect/test_parity_resample.py
copy to python/pyspark/pandas/tests/connect/resample/test_parity_error.py
index caca2f957b50..f6365ee8dc87 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_resample.py
+++ b/python/pyspark/pandas/tests/connect/resample/test_parity_error.py
@@ -16,25 +16,17 @@
#
import unittest
-from pyspark.pandas.tests.test_resample import ResampleTestsMixin,
ResampleWithTimezoneMixin
+from pyspark.pandas.tests.resample.test_error import ResampleErrorMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class ResampleParityTests(
- ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils,
ReusedConnectTestCase
-):
- pass
-
-
-class ResampleWithTimezoneTests(
- ResampleWithTimezoneMixin, PandasOnSparkTestUtils, TestUtils,
ReusedConnectTestCase
-):
+class ResampleParityErrorTests(ResampleErrorMixin, PandasOnSparkTestUtils,
ReusedConnectTestCase):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_resample import * # noqa:
F401
+ from pyspark.pandas.tests.connect.resample.test_parity_error import * #
noqa: F401
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py
b/python/pyspark/pandas/tests/connect/resample/test_parity_missing.py
similarity index 70%
copy from python/pyspark/pandas/tests/connect/test_parity_resample.py
copy to python/pyspark/pandas/tests/connect/resample/test_parity_missing.py
index caca2f957b50..49e541084609 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_resample.py
+++ b/python/pyspark/pandas/tests/connect/resample/test_parity_missing.py
@@ -16,25 +16,19 @@
#
import unittest
-from pyspark.pandas.tests.test_resample import ResampleTestsMixin,
ResampleWithTimezoneMixin
+from pyspark.pandas.tests.resample.test_missing import ResampleMissingMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class ResampleParityTests(
- ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils,
ReusedConnectTestCase
-):
- pass
-
-
-class ResampleWithTimezoneTests(
- ResampleWithTimezoneMixin, PandasOnSparkTestUtils, TestUtils,
ReusedConnectTestCase
+class ResampleParityMissingTests(
+ ResampleMissingMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_resample import * # noqa:
F401
+ from pyspark.pandas.tests.connect.resample.test_parity_missing import * #
noqa: F401
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py
b/python/pyspark/pandas/tests/connect/resample/test_parity_on.py
similarity index 70%
copy from python/pyspark/pandas/tests/connect/test_parity_resample.py
copy to python/pyspark/pandas/tests/connect/resample/test_parity_on.py
index caca2f957b50..d4b767329fb8 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_resample.py
+++ b/python/pyspark/pandas/tests/connect/resample/test_parity_on.py
@@ -16,25 +16,17 @@
#
import unittest
-from pyspark.pandas.tests.test_resample import ResampleTestsMixin,
ResampleWithTimezoneMixin
+from pyspark.pandas.tests.resample.test_on import ResampleOnMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class ResampleParityTests(
- ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils,
ReusedConnectTestCase
-):
- pass
-
-
-class ResampleWithTimezoneTests(
- ResampleWithTimezoneMixin, PandasOnSparkTestUtils, TestUtils,
ReusedConnectTestCase
-):
+class ResampleParityOnTests(ResampleOnMixin, PandasOnSparkTestUtils,
ReusedConnectTestCase):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_resample import * # noqa:
F401
+ from pyspark.pandas.tests.connect.resample.test_parity_on import * #
noqa: F401
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_resample.py
b/python/pyspark/pandas/tests/connect/resample/test_parity_timezone.py
similarity index 70%
rename from python/pyspark/pandas/tests/connect/test_parity_resample.py
rename to python/pyspark/pandas/tests/connect/resample/test_parity_timezone.py
index caca2f957b50..0e8f36a51168 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_resample.py
+++ b/python/pyspark/pandas/tests/connect/resample/test_parity_timezone.py
@@ -16,25 +16,19 @@
#
import unittest
-from pyspark.pandas.tests.test_resample import ResampleTestsMixin,
ResampleWithTimezoneMixin
+from pyspark.pandas.tests.resample.test_timezone import ResampleTimezoneMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
-from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class ResampleParityTests(
- ResampleTestsMixin, PandasOnSparkTestUtils, TestUtils,
ReusedConnectTestCase
-):
- pass
-
-
-class ResampleWithTimezoneTests(
- ResampleWithTimezoneMixin, PandasOnSparkTestUtils, TestUtils,
ReusedConnectTestCase
+class ResampleParityTimezoneTests(
+ ResampleTimezoneMixin, PandasOnSparkTestUtils, ReusedConnectTestCase
):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_resample import * # noqa:
F401
+ from pyspark.pandas.tests.connect.resample.test_parity_timezone import *
# noqa: F401
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/resample/__init__.py
b/python/pyspark/pandas/tests/resample/__init__.py
new file mode 100644
index 000000000000..cce3acad34a4
--- /dev/null
+++ b/python/pyspark/pandas/tests/resample/__init__.py
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/python/pyspark/pandas/tests/resample/test_error.py
b/python/pyspark/pandas/tests/resample/test_error.py
new file mode 100644
index 000000000000..15b5df7b3b80
--- /dev/null
+++ b/python/pyspark/pandas/tests/resample/test_error.py
@@ -0,0 +1,94 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+import datetime
+
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class ResampleErrorMixin:
+ def test_resample_error(self):
+ psdf = ps.range(10)
+
+ with self.assertRaisesRegex(
+ NotImplementedError, "resample currently works only for
DatetimeIndex"
+ ):
+ psdf.resample("3Y").sum()
+
+ with self.assertRaisesRegex(
+ NotImplementedError, "resample currently works only for
DatetimeIndex"
+ ):
+ psdf.id.resample("3Y").sum()
+
+ dates = [
+ datetime.datetime(2012, 1, 2),
+ datetime.datetime(2012, 5, 3),
+ datetime.datetime(2022, 5, 3),
+ pd.NaT,
+ ]
+ pdf = pd.DataFrame(np.ones(len(dates)), index=pd.DatetimeIndex(dates),
columns=["A"])
+ psdf = ps.from_pandas(pdf)
+
+ with self.assertRaisesRegex(ValueError, "rule code W-SUN is not
supported"):
+ psdf.A.resample("3W").sum()
+
+ with self.assertRaisesRegex(ValueError, "rule offset must be
positive"):
+ psdf.A.resample("0Y").sum()
+
+ with self.assertRaisesRegex(ValueError, "invalid closed: 'middle'"):
+ psdf.A.resample("3Y", closed="middle").sum()
+
+ with self.assertRaisesRegex(ValueError, "invalid label: 'both'"):
+ psdf.A.resample("3Y", label="both").sum()
+
+ with self.assertRaisesRegex(
+ NotImplementedError, "`on` currently works only for TimestampType"
+ ):
+ psdf.A.resample("2D", on=psdf.A).sum()
+
+ with self.assertRaisesRegex(
+ NotImplementedError, "`on` currently works only for TimestampType"
+ ):
+ psdf[["A"]].resample("2D", on=psdf.A).sum()
+
+ psdf["B"] = ["a", "b", "c", "d"]
+ with self.assertRaisesRegex(ValueError, "No available aggregation
columns!"):
+ psdf.B.resample("2D").sum()
+
+ with self.assertRaisesRegex(ValueError, "No available aggregation
columns!"):
+ psdf[[]].resample("2D").sum()
+
+
+class ResampleErrorTests(ResampleErrorMixin, PandasOnSparkTestCase, TestUtils):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.resample.test_error import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/resample/test_missing.py
b/python/pyspark/pandas/tests/resample/test_missing.py
new file mode 100644
index 000000000000..07dee20bad4f
--- /dev/null
+++ b/python/pyspark/pandas/tests/resample/test_missing.py
@@ -0,0 +1,141 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import unittest
+import inspect
+import datetime
+
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.pandas.exceptions import PandasNotImplementedError
+from pyspark.pandas.missing.resample import (
+ MissingPandasLikeDataFrameResampler,
+ MissingPandasLikeSeriesResampler,
+)
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class ResampleMissingMixin:
+ @property
+ def pdf1(self):
+ np.random.seed(11)
+ dates = [
+ pd.NaT,
+ datetime.datetime(2011, 12, 31),
+ datetime.datetime(2011, 12, 31, 0, 0, 1),
+ datetime.datetime(2011, 12, 31, 23, 59, 59),
+ datetime.datetime(2012, 1, 1),
+ datetime.datetime(2012, 1, 1, 0, 0, 1),
+ pd.NaT,
+ datetime.datetime(2012, 1, 1, 23, 59, 59),
+ datetime.datetime(2012, 1, 2),
+ pd.NaT,
+ datetime.datetime(2012, 1, 30, 23, 59, 59),
+ datetime.datetime(2012, 1, 31),
+ datetime.datetime(2012, 1, 31, 0, 0, 1),
+ datetime.datetime(2012, 3, 31),
+ datetime.datetime(2013, 5, 3),
+ datetime.datetime(2022, 5, 3),
+ ]
+ return pd.DataFrame(
+ np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates),
columns=list("AB")
+ )
+
+ @property
+ def psdf1(self):
+ return ps.from_pandas(self.pdf1)
+
+ def test_missing(self):
+ pdf_r = self.psdf1.resample("3Y")
+ pser_r = self.psdf1.A.resample("3Y")
+
+ # DataFrameResampler functions
+ missing_functions = inspect.getmembers(
+ MissingPandasLikeDataFrameResampler, inspect.isfunction
+ )
+ unsupported_functions = [
+ name for (name, type_) in missing_functions if type_.__name__ ==
"unsupported_function"
+ ]
+ for name in unsupported_functions:
+ with self.assertRaisesRegex(
+ PandasNotImplementedError,
+ "method.*Resampler.*{}.*not implemented( yet\\.|\\.
.+)".format(name),
+ ):
+ getattr(pdf_r, name)()
+
+ # SeriesResampler functions
+ missing_functions =
inspect.getmembers(MissingPandasLikeSeriesResampler, inspect.isfunction)
+ unsupported_functions = [
+ name for (name, type_) in missing_functions if type_.__name__ ==
"unsupported_function"
+ ]
+ for name in unsupported_functions:
+ with self.assertRaisesRegex(
+ PandasNotImplementedError,
+ "method.*Resampler.*{}.*not implemented( yet\\.|\\.
.+)".format(name),
+ ):
+ getattr(pser_r, name)()
+
+ # DataFrameResampler properties
+ missing_properties = inspect.getmembers(
+ MissingPandasLikeDataFrameResampler, lambda o: isinstance(o,
property)
+ )
+ unsupported_properties = [
+ name
+ for (name, type_) in missing_properties
+ if type_.fget.__name__ == "unsupported_property"
+ ]
+ for name in unsupported_properties:
+ with self.assertRaisesRegex(
+ PandasNotImplementedError,
+ "property.*Resampler.*{}.*not implemented( yet\\.|\\.
.+)".format(name),
+ ):
+ getattr(pdf_r, name)
+
+ # SeriesResampler properties
+ missing_properties = inspect.getmembers(
+ MissingPandasLikeSeriesResampler, lambda o: isinstance(o, property)
+ )
+ unsupported_properties = [
+ name
+ for (name, type_) in missing_properties
+ if type_.fget.__name__ == "unsupported_property"
+ ]
+ for name in unsupported_properties:
+ with self.assertRaisesRegex(
+ PandasNotImplementedError,
+ "property.*Resampler.*{}.*not implemented( yet\\.|\\.
.+)".format(name),
+ ):
+ getattr(pser_r, name)
+
+
+class ResampleMissingTests(ResampleMissingMixin, PandasOnSparkTestCase,
TestUtils):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.resample.test_missing import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/resample/test_on.py
b/python/pyspark/pandas/tests/resample/test_on.py
new file mode 100644
index 000000000000..6062762fd2e3
--- /dev/null
+++ b/python/pyspark/pandas/tests/resample/test_on.py
@@ -0,0 +1,66 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import unittest
+import datetime
+
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class ResampleOnMixin:
+ def test_resample_on(self):
+ np.random.seed(77)
+ dates = [
+ datetime.datetime(2022, 5, 1, 4, 5, 6),
+ datetime.datetime(2022, 5, 3),
+ datetime.datetime(2022, 5, 3, 23, 59, 59),
+ datetime.datetime(2022, 5, 4),
+ pd.NaT,
+ datetime.datetime(2022, 5, 4, 0, 0, 1),
+ datetime.datetime(2022, 5, 11),
+ ]
+ pdf = pd.DataFrame(
+ np.random.rand(len(dates), 3), index=pd.DatetimeIndex(dates),
columns=list("ABC")
+ )
+ pdf["X"] = pd.DatetimeIndex(dates)
+ psdf = ps.from_pandas(pdf)
+ self.assert_eq(
+ pdf.resample("2D", on="X").sum().sort_index(),
+ psdf.resample("2D", on=psdf.X).sum().sort_index(),
+ almost=True,
+ )
+
+
+class ResampleOnTests(ResampleOnMixin, PandasOnSparkTestCase, TestUtils):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.resample.test_on import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/resample/test_timezone.py
b/python/pyspark/pandas/tests/resample/test_timezone.py
new file mode 100644
index 000000000000..17c46dd26b35
--- /dev/null
+++ b/python/pyspark/pandas/tests/resample/test_timezone.py
@@ -0,0 +1,83 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import unittest
+import os
+
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class ResampleTimezoneMixin:
+ timezone = None
+
+ @classmethod
+ def setUpClass(cls):
+ cls.timezone = os.environ.get("TZ", None)
+ os.environ["TZ"] = "America/New_York"
+ super(ResampleTimezoneMixin, cls).setUpClass()
+
+ @classmethod
+ def tearDownClass(cls):
+ super(ResampleTimezoneMixin, cls).tearDownClass()
+ if cls.timezone is not None:
+ os.environ["TZ"] = cls.timezone
+
+ @property
+ def pdf(self):
+ np.random.seed(22)
+ index = pd.date_range(start="2011-01-02", end="2022-05-01", freq="1D")
+ return pd.DataFrame(np.random.rand(len(index), 2), index=index,
columns=list("AB"))
+
+ @property
+ def psdf(self):
+ return ps.from_pandas(self.pdf)
+
+ def test_series_resample_with_timezone(self):
+ with self.sql_conf(
+ {
+ "spark.sql.session.timeZone": "Asia/Seoul",
+ "spark.sql.timestampType": "TIMESTAMP_NTZ",
+ }
+ ):
+ p_resample = self.pdf.resample(rule="1001H", closed="right",
label="right")
+ ps_resample = self.psdf.resample(rule="1001H", closed="right",
label="right")
+ self.assert_eq(
+ p_resample.sum().sort_index(),
+ ps_resample.sum().sort_index(),
+ almost=True,
+ )
+
+
+class ResampleTimezoneTests(ResampleTimezoneMixin, PandasOnSparkTestCase,
TestUtils):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.resample.test_timezone import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/test_resample.py
b/python/pyspark/pandas/tests/test_resample.py
deleted file mode 100644
index c6005247f013..000000000000
--- a/python/pyspark/pandas/tests/test_resample.py
+++ /dev/null
@@ -1,320 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-
-import unittest
-import inspect
-import datetime
-import os
-
-import numpy as np
-import pandas as pd
-
-from pyspark import pandas as ps
-from pyspark.pandas.exceptions import PandasNotImplementedError
-from pyspark.pandas.missing.resample import (
- MissingPandasLikeDataFrameResampler,
- MissingPandasLikeSeriesResampler,
-)
-from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
-
-
-class ResampleTestsMixin:
- @property
- def pdf1(self):
- np.random.seed(11)
- dates = [
- pd.NaT,
- datetime.datetime(2011, 12, 31),
- datetime.datetime(2011, 12, 31, 0, 0, 1),
- datetime.datetime(2011, 12, 31, 23, 59, 59),
- datetime.datetime(2012, 1, 1),
- datetime.datetime(2012, 1, 1, 0, 0, 1),
- pd.NaT,
- datetime.datetime(2012, 1, 1, 23, 59, 59),
- datetime.datetime(2012, 1, 2),
- pd.NaT,
- datetime.datetime(2012, 1, 30, 23, 59, 59),
- datetime.datetime(2012, 1, 31),
- datetime.datetime(2012, 1, 31, 0, 0, 1),
- datetime.datetime(2012, 3, 31),
- datetime.datetime(2013, 5, 3),
- datetime.datetime(2022, 5, 3),
- ]
- return pd.DataFrame(
- np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates),
columns=list("AB")
- )
-
- @property
- def pdf2(self):
- np.random.seed(22)
- dates = [
- datetime.datetime(2022, 5, 1, 4, 5, 6),
- datetime.datetime(2022, 5, 3),
- datetime.datetime(2022, 5, 3, 23, 59, 59),
- datetime.datetime(2022, 5, 4),
- pd.NaT,
- datetime.datetime(2022, 5, 4, 0, 0, 1),
- datetime.datetime(2022, 5, 11),
- ]
- return pd.DataFrame(
- np.random.rand(len(dates), 2), index=pd.DatetimeIndex(dates),
columns=list("AB")
- )
-
- @property
- def pdf3(self):
- np.random.seed(22)
- index = pd.date_range(start="2011-01-02", end="2022-05-01", freq="1D")
- return pd.DataFrame(np.random.rand(len(index), 2), index=index,
columns=list("AB"))
-
- @property
- def pdf4(self):
- np.random.seed(33)
- index = pd.date_range(start="2020-12-12", end="2022-05-01", freq="1H")
- return pd.DataFrame(np.random.rand(len(index), 2), index=index,
columns=list("AB"))
-
- @property
- def pdf5(self):
- np.random.seed(44)
- index = pd.date_range(start="2021-12-30 03:04:05", end="2022-01-02
06:07:08", freq="1T")
- return pd.DataFrame(np.random.rand(len(index), 2), index=index,
columns=list("AB"))
-
- @property
- def pdf6(self):
- np.random.seed(55)
- index = pd.date_range(start="2022-05-02 03:04:05", end="2022-05-02
06:07:08", freq="1S")
- return pd.DataFrame(np.random.rand(len(index), 2), index=index,
columns=list("AB"))
-
- @property
- def psdf1(self):
- return ps.from_pandas(self.pdf1)
-
- @property
- def psdf2(self):
- return ps.from_pandas(self.pdf2)
-
- @property
- def psdf3(self):
- return ps.from_pandas(self.pdf3)
-
- @property
- def psdf4(self):
- return ps.from_pandas(self.pdf4)
-
- @property
- def psdf5(self):
- return ps.from_pandas(self.pdf5)
-
- @property
- def psdf6(self):
- return ps.from_pandas(self.pdf6)
-
- def test_resample_error(self):
- psdf = ps.range(10)
-
- with self.assertRaisesRegex(
- NotImplementedError, "resample currently works only for
DatetimeIndex"
- ):
- psdf.resample("3Y").sum()
-
- with self.assertRaisesRegex(
- NotImplementedError, "resample currently works only for
DatetimeIndex"
- ):
- psdf.id.resample("3Y").sum()
-
- dates = [
- datetime.datetime(2012, 1, 2),
- datetime.datetime(2012, 5, 3),
- datetime.datetime(2022, 5, 3),
- pd.NaT,
- ]
- pdf = pd.DataFrame(np.ones(len(dates)), index=pd.DatetimeIndex(dates),
columns=["A"])
- psdf = ps.from_pandas(pdf)
-
- with self.assertRaisesRegex(ValueError, "rule code W-SUN is not
supported"):
- psdf.A.resample("3W").sum()
-
- with self.assertRaisesRegex(ValueError, "rule offset must be
positive"):
- psdf.A.resample("0Y").sum()
-
- with self.assertRaisesRegex(ValueError, "invalid closed: 'middle'"):
- psdf.A.resample("3Y", closed="middle").sum()
-
- with self.assertRaisesRegex(ValueError, "invalid label: 'both'"):
- psdf.A.resample("3Y", label="both").sum()
-
- with self.assertRaisesRegex(
- NotImplementedError, "`on` currently works only for TimestampType"
- ):
- psdf.A.resample("2D", on=psdf.A).sum()
-
- with self.assertRaisesRegex(
- NotImplementedError, "`on` currently works only for TimestampType"
- ):
- psdf[["A"]].resample("2D", on=psdf.A).sum()
-
- psdf["B"] = ["a", "b", "c", "d"]
- with self.assertRaisesRegex(ValueError, "No available aggregation
columns!"):
- psdf.B.resample("2D").sum()
-
- with self.assertRaisesRegex(ValueError, "No available aggregation
columns!"):
- psdf[[]].resample("2D").sum()
-
- def test_missing(self):
- pdf_r = self.psdf1.resample("3Y")
- pser_r = self.psdf1.A.resample("3Y")
-
- # DataFrameResampler functions
- missing_functions = inspect.getmembers(
- MissingPandasLikeDataFrameResampler, inspect.isfunction
- )
- unsupported_functions = [
- name for (name, type_) in missing_functions if type_.__name__ ==
"unsupported_function"
- ]
- for name in unsupported_functions:
- with self.assertRaisesRegex(
- PandasNotImplementedError,
- "method.*Resampler.*{}.*not implemented( yet\\.|\\.
.+)".format(name),
- ):
- getattr(pdf_r, name)()
-
- # SeriesResampler functions
- missing_functions =
inspect.getmembers(MissingPandasLikeSeriesResampler, inspect.isfunction)
- unsupported_functions = [
- name for (name, type_) in missing_functions if type_.__name__ ==
"unsupported_function"
- ]
- for name in unsupported_functions:
- with self.assertRaisesRegex(
- PandasNotImplementedError,
- "method.*Resampler.*{}.*not implemented( yet\\.|\\.
.+)".format(name),
- ):
- getattr(pser_r, name)()
-
- # DataFrameResampler properties
- missing_properties = inspect.getmembers(
- MissingPandasLikeDataFrameResampler, lambda o: isinstance(o,
property)
- )
- unsupported_properties = [
- name
- for (name, type_) in missing_properties
- if type_.fget.__name__ == "unsupported_property"
- ]
- for name in unsupported_properties:
- with self.assertRaisesRegex(
- PandasNotImplementedError,
- "property.*Resampler.*{}.*not implemented( yet\\.|\\.
.+)".format(name),
- ):
- getattr(pdf_r, name)
-
- # SeriesResampler properties
- missing_properties = inspect.getmembers(
- MissingPandasLikeSeriesResampler, lambda o: isinstance(o, property)
- )
- unsupported_properties = [
- name
- for (name, type_) in missing_properties
- if type_.fget.__name__ == "unsupported_property"
- ]
- for name in unsupported_properties:
- with self.assertRaisesRegex(
- PandasNotImplementedError,
- "property.*Resampler.*{}.*not implemented( yet\\.|\\.
.+)".format(name),
- ):
- getattr(pser_r, name)
-
- def test_resample_on(self):
- np.random.seed(77)
- dates = [
- datetime.datetime(2022, 5, 1, 4, 5, 6),
- datetime.datetime(2022, 5, 3),
- datetime.datetime(2022, 5, 3, 23, 59, 59),
- datetime.datetime(2022, 5, 4),
- pd.NaT,
- datetime.datetime(2022, 5, 4, 0, 0, 1),
- datetime.datetime(2022, 5, 11),
- ]
- pdf = pd.DataFrame(
- np.random.rand(len(dates), 3), index=pd.DatetimeIndex(dates),
columns=list("ABC")
- )
- pdf["X"] = pd.DatetimeIndex(dates)
- psdf = ps.from_pandas(pdf)
- self.assert_eq(
- pdf.resample("2D", on="X").sum().sort_index(),
- psdf.resample("2D", on=psdf.X).sum().sort_index(),
- almost=True,
- )
-
-
-class ResampleWithTimezoneMixin:
- timezone = None
-
- @classmethod
- def setUpClass(cls):
- cls.timezone = os.environ.get("TZ", None)
- os.environ["TZ"] = "America/New_York"
- super(ResampleWithTimezoneMixin, cls).setUpClass()
-
- @classmethod
- def tearDownClass(cls):
- super(ResampleWithTimezoneMixin, cls).tearDownClass()
- if cls.timezone is not None:
- os.environ["TZ"] = cls.timezone
-
- @property
- def pdf(self):
- np.random.seed(22)
- index = pd.date_range(start="2011-01-02", end="2022-05-01", freq="1D")
- return pd.DataFrame(np.random.rand(len(index), 2), index=index,
columns=list("AB"))
-
- @property
- def psdf(self):
- return ps.from_pandas(self.pdf)
-
- def test_series_resample_with_timezone(self):
- with self.sql_conf(
- {
- "spark.sql.session.timeZone": "Asia/Seoul",
- "spark.sql.timestampType": "TIMESTAMP_NTZ",
- }
- ):
- p_resample = self.pdf.resample(rule="1001H", closed="right",
label="right")
- ps_resample = self.psdf.resample(rule="1001H", closed="right",
label="right")
- self.assert_eq(
- p_resample.sum().sort_index(),
- ps_resample.sum().sort_index(),
- almost=True,
- )
-
-
-class ResampleTests(ResampleTestsMixin, PandasOnSparkTestCase, TestUtils):
- pass
-
-
-class ResampleWithTimezoneTests(ResampleWithTimezoneMixin,
PandasOnSparkTestCase, TestUtils):
- pass
-
-
-if __name__ == "__main__":
- from pyspark.pandas.tests.test_resample import * # noqa: F401
-
- try:
- import xmlrunner
-
- testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
- except ImportError:
- testRunner = None
- unittest.main(testRunner=testRunner, verbosity=2)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]