This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 679926270c75 [SPARK-46504][PS][TESTS][FOLLOWUPS] Moving more slow tests out of `IndexesTests` 679926270c75 is described below commit 679926270c75e298373d7e59dad47145b41ec5ac Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Wed Jan 3 09:01:49 2024 +0900 [SPARK-46504][PS][TESTS][FOLLOWUPS] Moving more slow tests out of `IndexesTests` ### What changes were proposed in this pull request? Moving more slow tests out of `IndexesTests` ### Why are the changes needed? for testing parallelism ### Does this PR introduce _any_ user-facing change? no, test-only ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44562 from zhengruifeng/ps_test_index_base_last. Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- dev/sparktestsupport/modules.py | 8 + .../tests/connect/indexes/test_parity_level.py | 41 ++ .../tests/connect/indexes/test_parity_missing.py | 41 ++ .../tests/connect/indexes/test_parity_repeat.py | 41 ++ .../tests/connect/indexes/test_parity_stat.py | 41 ++ python/pyspark/pandas/tests/indexes/test_base.py | 561 +-------------------- python/pyspark/pandas/tests/indexes/test_level.py | 174 +++++++ .../pyspark/pandas/tests/indexes/test_missing.py | 244 +++++++++ python/pyspark/pandas/tests/indexes/test_rename.py | 71 +++ python/pyspark/pandas/tests/indexes/test_repeat.py | 65 +++ python/pyspark/pandas/tests/indexes/test_stat.py | 181 +++++++ 11 files changed, 914 insertions(+), 554 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 6aca31e5efdf..a97e6afdc356 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -798,7 +798,11 @@ pyspark_pandas_slow = Module( "pyspark.pandas.tests.indexes.test_base", "pyspark.pandas.tests.indexes.test_conversion", "pyspark.pandas.tests.indexes.test_drop", + "pyspark.pandas.tests.indexes.test_level", + "pyspark.pandas.tests.indexes.test_missing", + "pyspark.pandas.tests.indexes.test_repeat", "pyspark.pandas.tests.indexes.test_sort", + "pyspark.pandas.tests.indexes.test_stat", "pyspark.pandas.tests.indexes.test_symmetric_diff", "pyspark.pandas.tests.indexes.test_take", "pyspark.pandas.tests.indexes.test_unique", @@ -1094,7 +1098,11 @@ pyspark_pandas_connect_part0 = Module( "pyspark.pandas.tests.connect.indexes.test_parity_base", "pyspark.pandas.tests.connect.indexes.test_parity_conversion", "pyspark.pandas.tests.connect.indexes.test_parity_drop", + "pyspark.pandas.tests.connect.indexes.test_parity_level", + "pyspark.pandas.tests.connect.indexes.test_parity_missing", + "pyspark.pandas.tests.connect.indexes.test_parity_repeat", "pyspark.pandas.tests.connect.indexes.test_parity_sort", + "pyspark.pandas.tests.connect.indexes.test_parity_stat", "pyspark.pandas.tests.connect.indexes.test_parity_symmetric_diff", "pyspark.pandas.tests.connect.indexes.test_parity_take", "pyspark.pandas.tests.connect.indexes.test_parity_unique", diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_level.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_level.py new file mode 100644 index 000000000000..8bff94481131 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_level.py @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +from pyspark.pandas.tests.indexes.test_level import LevelMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestUtils + + +class LevelParityTests( + LevelMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.indexes.test_parity_level import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_missing.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_missing.py new file mode 100644 index 000000000000..0fe8b1a7159b --- /dev/null +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_missing.py @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +from pyspark.pandas.tests.indexes.test_missing import MissingMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestUtils + + +class MissingParityTests( + MissingMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.indexes.test_parity_missing import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_repeat.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_repeat.py new file mode 100644 index 000000000000..b81ff533bc50 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_repeat.py @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +from pyspark.pandas.tests.indexes.test_repeat import RepeatMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestUtils + + +class RepeatParityTests( + RepeatMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.indexes.test_parity_repeat import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_stat.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_stat.py new file mode 100644 index 000000000000..29b3a2ea5075 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_stat.py @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +from pyspark.pandas.tests.indexes.test_stat import StatMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestUtils + + +class StatParityTests( + StatMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.indexes.test_parity_stat import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index f7e6c553ac15..6671e3d93f75 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -15,9 +15,8 @@ # limitations under the License. # -import inspect import unittest -from datetime import datetime, timedelta +from datetime import datetime import numpy as np import pandas as pd @@ -25,13 +24,7 @@ import pandas as pd import pyspark.pandas as ps from pyspark.loose_version import LooseVersion from pyspark.pandas.exceptions import PandasNotImplementedError -from pyspark.pandas.missing.indexes import ( - MissingPandasLikeDatetimeIndex, - MissingPandasLikeIndex, - MissingPandasLikeMultiIndex, - MissingPandasLikeTimedeltaIndex, -) -from pyspark.testing.pandasutils import ComparisonTestBase, TestUtils, SPARK_CONF_ARROW_ENABLED +from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils, SPARK_CONF_ARROW_ENABLED class IndexesTestsMixin: @@ -42,6 +35,10 @@ class IndexesTestsMixin: index=[0, 1, 3, 5, 6, 8, 9, 9, 9], ) + @property + def psdf(self): + return ps.from_pandas(self.pdf) + def test_index_basic(self): for pdf in [ pd.DataFrame(np.random.randn(10, 5), index=np.random.randint(100, size=10)), @@ -172,69 +169,6 @@ class IndexesTestsMixin: with self.assertRaises(PandasNotImplementedError): psidx.name = "renamed" - def test_index_rename(self): - pdf = pd.DataFrame( - np.random.randn(10, 5), index=pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name="x") - ) - psdf = ps.from_pandas(pdf) - - pidx = pdf.index - psidx = psdf.index - - self.assert_eq(psidx.rename("y"), pidx.rename("y")) - self.assert_eq(psdf.index.names, pdf.index.names) - - # non-string names - self.assert_eq(psidx.rename(0), pidx.rename(0)) - self.assert_eq(psidx.rename(("y", 0)), pidx.rename(("y", 0))) - - psidx.rename("z", inplace=True) - pidx.rename("z", inplace=True) - - self.assert_eq(psidx, pidx) - self.assert_eq(psdf.index.names, pdf.index.names) - - self.assert_eq(psidx.rename(None), pidx.rename(None)) - self.assert_eq(psdf.index.names, pdf.index.names) - - self.assertRaises(TypeError, lambda: psidx.rename(["x", "y"])) - - def test_multi_index_rename(self): - arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] - idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) - pdf = pd.DataFrame(np.random.randn(4, 5), idx) - psdf = ps.from_pandas(pdf) - - pmidx = pdf.index - psmidx = psdf.index - - self.assert_eq(psmidx.rename(["n", "c"]), pmidx.rename(["n", "c"])) - self.assert_eq(psdf.index.names, pdf.index.names) - - # non-string names - self.assert_eq(psmidx.rename([0, 1]), pmidx.rename([0, 1])) - self.assert_eq( - psmidx.rename([("x", "a"), ("y", "b")]), pmidx.rename([("x", "a"), ("y", "b")]) - ) - - psmidx.rename(["num", "col"], inplace=True) - pmidx.rename(["num", "col"], inplace=True) - - self.assert_eq(psmidx, pmidx) - self.assert_eq(psdf.index.names, pdf.index.names) - - self.assert_eq(psmidx.rename([None, None]), pmidx.rename([None, None])) - self.assert_eq(psdf.index.names, pdf.index.names) - - self.assertRaises(TypeError, lambda: psmidx.rename("number")) - self.assertRaises(TypeError, lambda: psmidx.rename(None)) - self.assertRaises(ValueError, lambda: psmidx.rename(["number"])) - - def test_multi_index_levshape(self): - pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) - psidx = ps.from_pandas(pidx) - self.assertEqual(pidx.levshape, psidx.levshape) - def test_multi_index_copy(self): arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) @@ -243,269 +177,6 @@ class IndexesTestsMixin: self.assert_eq(psdf.index.copy(), pdf.index.copy()) - def test_missing(self): - psdf = ps.DataFrame( - { - "a": [1, 2, 3], - "b": [4, 5, 6], - "c": pd.date_range("2011-01-01", freq="D", periods=3), - "d": pd.Categorical(["a", "b", "c"]), - "e": [timedelta(1), timedelta(2), timedelta(3)], - } - ) - - # Index functions - missing_functions = inspect.getmembers(MissingPandasLikeIndex, inspect.isfunction) - unsupported_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" - ] - for name in unsupported_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.set_index("a").index, name)() - - deprecated_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" - ] - for name in deprecated_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) - ): - getattr(psdf.set_index("a").index, name)() - - # MultiIndex functions - missing_functions = inspect.getmembers(MissingPandasLikeMultiIndex, inspect.isfunction) - unsupported_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" - ] - for name in unsupported_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.set_index(["a", "b"]).index, name)() - - deprecated_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" - ] - for name in deprecated_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) - ): - getattr(psdf.set_index(["a", "b"]).index, name)() - - # DatetimeIndex functions - missing_functions = inspect.getmembers(MissingPandasLikeDatetimeIndex, inspect.isfunction) - unsupported_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" - ] - for name in unsupported_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.set_index("c").index, name)() - - deprecated_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" - ] - for name in deprecated_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) - ): - getattr(psdf.set_index("c").index, name)() - - # TimedeltaIndex functions - missing_functions = inspect.getmembers(MissingPandasLikeTimedeltaIndex, inspect.isfunction) - unsupported_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" - ] - for name in unsupported_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, - "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.set_index("e").index, name)() - - deprecated_functions = [ - name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" - ] - for name in deprecated_functions: - with self.assertRaisesRegex( - PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) - ): - getattr(psdf.set_index("e").index, name)() - - # Index properties - missing_properties = inspect.getmembers( - MissingPandasLikeIndex, lambda o: isinstance(o, property) - ) - unsupported_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "unsupported_property" - ] - for name in unsupported_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.set_index("a").index, name) - - deprecated_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "deprecated_property" - ] - for name in deprecated_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name) - ): - getattr(psdf.set_index("a").index, name) - - # MultiIndex properties - missing_properties = inspect.getmembers( - MissingPandasLikeMultiIndex, lambda o: isinstance(o, property) - ) - unsupported_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "unsupported_property" - ] - for name in unsupported_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.set_index(["a", "b"]).index, name) - - deprecated_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "deprecated_property" - ] - for name in deprecated_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name) - ): - getattr(psdf.set_index(["a", "b"]).index, name) - - # DatetimeIndex properties - missing_properties = inspect.getmembers( - MissingPandasLikeDatetimeIndex, lambda o: isinstance(o, property) - ) - unsupported_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "unsupported_property" - ] - for name in unsupported_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.set_index("c").index, name) - - # TimedeltaIndex properties - missing_properties = inspect.getmembers( - MissingPandasLikeDatetimeIndex, lambda o: isinstance(o, property) - ) - unsupported_properties = [ - name - for (name, type_) in missing_properties - if type_.fget.__name__ == "unsupported_property" - ] - for name in unsupported_properties: - with self.assertRaisesRegex( - PandasNotImplementedError, - "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), - ): - getattr(psdf.set_index("c").index, name) - - def test_multi_index_not_supported(self): - psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - - with self.assertRaisesRegex(TypeError, "cannot perform any with this index type"): - psdf.set_index(["a", "b"]).index.any() - - with self.assertRaisesRegex(TypeError, "cannot perform all with this index type"): - psdf.set_index(["a", "b"]).index.all() - - def test_index_nlevels(self): - pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(["a", "b", "c"])) - psdf = ps.from_pandas(pdf) - - self.assertEqual(psdf.index.nlevels, 1) - - def test_multiindex_nlevel(self): - pdf = pd.DataFrame({"a": [1, 2, 3]}, index=[list("abc"), list("def")]) - psdf = ps.from_pandas(pdf) - - self.assertEqual(psdf.index.nlevels, 2) - - def test_multiindex_swaplevel(self): - pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1)) - - pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", "number"]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1)) - - pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", None]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.swaplevel(-2, -1), psidx.swaplevel(-2, -1)) - self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1)) - self.assert_eq(pidx.swaplevel("word", 1), psidx.swaplevel("word", 1)) - - with self.assertRaisesRegex(IndexError, "Too many levels: Index"): - psidx.swaplevel(-3, "word") - with self.assertRaisesRegex(IndexError, "Too many levels: Index"): - psidx.swaplevel(0, 2) - with self.assertRaisesRegex(IndexError, "Too many levels: Index"): - psidx.swaplevel(0, -3) - with self.assertRaisesRegex(KeyError, "Level work not found"): - psidx.swaplevel(0, "work") - - def test_index_fillna(self): - pidx = pd.Index([1, 2, None]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(pidx.fillna(0), psidx.fillna(0), almost=True) - self.assert_eq(pidx.rename("name").fillna(0), psidx.rename("name").fillna(0), almost=True) - - with self.assertRaisesRegex(TypeError, "Unsupported type list"): - psidx.fillna([1, 2]) - - def test_multiindex_isna(self): - psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - - with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"): - psidx.isna() - - with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"): - psidx.isnull() - - with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): - psidx.notna() - - with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): - psidx.notnull() - - def test_multiindex_rename(self): - pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - psidx = ps.from_pandas(pidx) - - pidx = pidx.rename(list("ABC")) - psidx = psidx.rename(list("ABC")) - self.assert_eq(pidx, psidx) - - pidx = pidx.rename(["my", "name", "is"]) - psidx = psidx.rename(["my", "name", "is"]) - self.assert_eq(pidx, psidx) - def test_multiindex_set_names(self): pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) psidx = ps.from_pandas(pidx) @@ -549,150 +220,6 @@ class IndexesTestsMixin: psdf = ps.from_pandas(pdf) self.assert_eq(pdf, psdf) - def test_len(self): - pidx = pd.Index(range(10000)) - psidx = ps.from_pandas(pidx) - - self.assert_eq(len(pidx), len(psidx)) - - pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - - self.assert_eq(len(pidx), len(psidx)) - - def test_argmin(self): - pidx = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(pidx.argmin(), psidx.argmin()) - - # MultiIndex - psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - with self.assertRaisesRegex( - TypeError, "reduction operation 'argmin' not allowed for this dtype" - ): - psidx.argmin() - - def test_argmax(self): - pidx = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(pidx.argmax(), psidx.argmax()) - - # MultiIndex - psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) - with self.assertRaisesRegex( - TypeError, "reduction operation 'argmax' not allowed for this dtype" - ): - psidx.argmax() - - def test_min(self): - pidx = pd.Index([3, 2, 1]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(pidx.min(), psidx.min()) - - # MultiIndex - pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) - psmidx = ps.from_pandas(pmidx) - - self.assert_eq(pmidx.min(), psmidx.min()) - - pidx = pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(pidx.min(), psidx.min()) - - def test_max(self): - pidx = pd.Index([3, 2, 1]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(pidx.max(), psidx.max()) - - # MultiIndex - pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) - psmidx = ps.from_pandas(pmidx) - - self.assert_eq(pmidx.max(), psmidx.max()) - - pidx = pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(pidx.max(), psidx.max()) - - def test_repeat(self): - pidx = pd.Index(["a", "b", "c"]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(psidx.repeat(3).sort_values(), pidx.repeat(3).sort_values()) - self.assert_eq(psidx.repeat(0).sort_values(), pidx.repeat(0).sort_values()) - self.assert_eq((psidx + "x").repeat(3).sort_values(), (pidx + "x").repeat(3).sort_values()) - - self.assertRaises(ValueError, lambda: psidx.repeat(-1)) - self.assertRaises(TypeError, lambda: psidx.repeat("abc")) - - pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) - psmidx = ps.from_pandas(pmidx) - - self.assert_eq(psmidx.repeat(3).sort_values(), pmidx.repeat(3).sort_values()) - self.assert_eq(psmidx.repeat(0).sort_values(), pmidx.repeat(0).sort_values(), almost=True) - - self.assertRaises(ValueError, lambda: psmidx.repeat(-1)) - self.assertRaises(TypeError, lambda: psmidx.repeat("abc")) - - def test_index_get_level_values(self): - pidx = pd.Index([1, 2, 3], name="ks") - psidx = ps.from_pandas(pidx) - - for level in [0, "ks"]: - self.assert_eq(psidx.get_level_values(level), pidx.get_level_values(level)) - - def test_multiindex_get_level_values(self): - pmidx = pd.MultiIndex.from_tuples([("a", "d"), ("b", "e"), ("c", "f")]) - pmidx.names = ["level_1", "level_2"] - psmidx = ps.from_pandas(pmidx) - - for level in [0, 1, "level_1", "level_2"]: - self.assert_eq(psmidx.get_level_values(level), pmidx.get_level_values(level)) - - def test_index_get_level_number(self): - # name of two levels are the same, which is None - psdf = ps.DataFrame({"a": [1, 2, 3]}, index=[list("aac"), list("ddf")]) - with self.assertRaisesRegex( - ValueError, "The name None occurs multiple times, use a level number" - ): - psdf.index._get_level_number(None) - - mi = pd.MultiIndex.from_arrays((list("abc"), list("def"))) - mi.names = ["level_1", "level_2"] - psdf = ps.DataFrame({"a": [1, 2, 3]}, index=mi) - - # level is not int and not in the level name list - with self.assertRaisesRegex(KeyError, "Level lv_3 not found"): - psdf.index._get_level_number("lv_3") - - # level is int, but an invalid negative number - with self.assertRaisesRegex(IndexError, "Too many levels: Index has only"): - psdf.index._get_level_number(-3) - - # level is int, but an invalid positive number - with self.assertRaisesRegex(IndexError, "Too many levels: Index has only"): - psdf.index._get_level_number(3) - - # Correct and valid inputs in numbers - level_number = [-2, -1, 0, 1] - outputs = [0, 1, 0, 1] - - for lv, output in zip(level_number, outputs): - self.assertEqual(output, psdf.index._get_level_number(lv)) - - # Valid inputs as level names - level_names = ["level_1", "level_2"] - outputs = [0, 1] - - for lv, output in zip(level_names, outputs): - self.assertEqual(output, psdf.index._get_level_number(lv)) - def test_holds_integer(self): pidx = pd.Index([1, 2, 3, 4]) psidx = ps.from_pandas(pidx) @@ -715,45 +242,6 @@ class IndexesTestsMixin: psmidx = ps.from_pandas(pmidx) self.assert_eq(pmidx.holds_integer(), psmidx.holds_integer()) - def test_abs(self): - pidx = pd.Index([-2, -1, 0, 1]) - psidx = ps.from_pandas(pidx) - - self.assert_eq(abs(pidx), abs(psidx)) - self.assert_eq(np.abs(pidx), np.abs(psidx)) - - psidx = ps.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"]) - with self.assertRaisesRegex(TypeError, "perform __abs__ with this index"): - abs(psidx) - - def test_hasnans(self): - # BooleanType - pidx = pd.Index([True, False, True, True]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.hasnans, psidx.hasnans) - - pidx = pd.Index([True, False, np.nan, True]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.hasnans, psidx.hasnans) - - # TimestampType - pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, psser.hasnans) - - pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.hasnans, psser.hasnans) - - # empty - pidx = pd.Index([]) - psidx = ps.from_pandas(pidx) - self.assert_eq(pidx.hasnans, psidx.hasnans) - - # Not supported for MultiIndex - psmidx = ps.Index([("a", 1), ("b", 2)]) - self.assertRaises(NotImplementedError, lambda: psmidx.hasnans()) - def test_item(self): pidx = pd.Index([10]) psidx = ps.from_pandas(pidx) @@ -858,45 +346,10 @@ class IndexesTestsMixin: self.assertRaises(PandasNotImplementedError, lambda: psmidx.factorize()) - def test_multiindex_equal_levels(self): - pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) - pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) - psmidx1 = ps.from_pandas(pmidx1) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - - pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - - pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")]) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - - pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - - pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z")]) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - - pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")]) - pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")]) - psmidx1 = ps.from_pandas(pmidx1) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - - pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) - pmidx2 = pd.MultiIndex.from_tuples([("a", "x", "q"), ("b", "y", "w"), ("c", "z", "e")]) - psmidx1 = ps.from_pandas(pmidx1) - psmidx2 = ps.from_pandas(pmidx2) - self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) - class IndexesTests( IndexesTestsMixin, - ComparisonTestBase, + PandasOnSparkTestCase, TestUtils, ): pass diff --git a/python/pyspark/pandas/tests/indexes/test_level.py b/python/pyspark/pandas/tests/indexes/test_level.py new file mode 100644 index 000000000000..80eb769704d8 --- /dev/null +++ b/python/pyspark/pandas/tests/indexes/test_level.py @@ -0,0 +1,174 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +import pandas as pd + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils + + +class LevelMixin: + def test_multi_index_levshape(self): + pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) + psidx = ps.from_pandas(pidx) + self.assertEqual(pidx.levshape, psidx.levshape) + + def test_index_nlevels(self): + pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(["a", "b", "c"])) + psdf = ps.from_pandas(pdf) + + self.assertEqual(psdf.index.nlevels, 1) + + def test_multiindex_nlevel(self): + pdf = pd.DataFrame({"a": [1, 2, 3]}, index=[list("abc"), list("def")]) + psdf = ps.from_pandas(pdf) + + self.assertEqual(psdf.index.nlevels, 2) + + def test_multiindex_swaplevel(self): + pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]]) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1)) + + pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", "number"]) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1)) + + pidx = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]], names=["word", None]) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.swaplevel(-2, -1), psidx.swaplevel(-2, -1)) + self.assert_eq(pidx.swaplevel(0, 1), psidx.swaplevel(0, 1)) + self.assert_eq(pidx.swaplevel("word", 1), psidx.swaplevel("word", 1)) + + with self.assertRaisesRegex(IndexError, "Too many levels: Index"): + psidx.swaplevel(-3, "word") + with self.assertRaisesRegex(IndexError, "Too many levels: Index"): + psidx.swaplevel(0, 2) + with self.assertRaisesRegex(IndexError, "Too many levels: Index"): + psidx.swaplevel(0, -3) + with self.assertRaisesRegex(KeyError, "Level work not found"): + psidx.swaplevel(0, "work") + + def test_index_get_level_values(self): + pidx = pd.Index([1, 2, 3], name="ks") + psidx = ps.from_pandas(pidx) + + for level in [0, "ks"]: + self.assert_eq(psidx.get_level_values(level), pidx.get_level_values(level)) + + def test_multiindex_get_level_values(self): + pmidx = pd.MultiIndex.from_tuples([("a", "d"), ("b", "e"), ("c", "f")]) + pmidx.names = ["level_1", "level_2"] + psmidx = ps.from_pandas(pmidx) + + for level in [0, 1, "level_1", "level_2"]: + self.assert_eq(psmidx.get_level_values(level), pmidx.get_level_values(level)) + + def test_index_get_level_number(self): + # name of two levels are the same, which is None + psdf = ps.DataFrame({"a": [1, 2, 3]}, index=[list("aac"), list("ddf")]) + with self.assertRaisesRegex( + ValueError, "The name None occurs multiple times, use a level number" + ): + psdf.index._get_level_number(None) + + mi = pd.MultiIndex.from_arrays((list("abc"), list("def"))) + mi.names = ["level_1", "level_2"] + psdf = ps.DataFrame({"a": [1, 2, 3]}, index=mi) + + # level is not int and not in the level name list + with self.assertRaisesRegex(KeyError, "Level lv_3 not found"): + psdf.index._get_level_number("lv_3") + + # level is int, but an invalid negative number + with self.assertRaisesRegex(IndexError, "Too many levels: Index has only"): + psdf.index._get_level_number(-3) + + # level is int, but an invalid positive number + with self.assertRaisesRegex(IndexError, "Too many levels: Index has only"): + psdf.index._get_level_number(3) + + # Correct and valid inputs in numbers + level_number = [-2, -1, 0, 1] + outputs = [0, 1, 0, 1] + + for lv, output in zip(level_number, outputs): + self.assertEqual(output, psdf.index._get_level_number(lv)) + + # Valid inputs as level names + level_names = ["level_1", "level_2"] + outputs = [0, 1] + + for lv, output in zip(level_names, outputs): + self.assertEqual(output, psdf.index._get_level_number(lv)) + + def test_multiindex_equal_levels(self): + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + pmidx2 = pd.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c", "z")]) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "j")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "x")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z")]) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z"), ("a", "y")]) + pmidx2 = pd.MultiIndex.from_tuples([("a", "y"), ("b", "x"), ("c", "z"), ("c", "x")]) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")]) + pmidx2 = pd.MultiIndex.from_tuples([("a", "x", "q"), ("b", "y", "w"), ("c", "z", "e")]) + psmidx1 = ps.from_pandas(pmidx1) + psmidx2 = ps.from_pandas(pmidx2) + self.assert_eq(pmidx1.equal_levels(pmidx2), psmidx1.equal_levels(psmidx2)) + + +class LevelTests( + LevelMixin, + PandasOnSparkTestCase, + SQLTestUtils, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.indexes.test_level import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/indexes/test_missing.py b/python/pyspark/pandas/tests/indexes/test_missing.py new file mode 100644 index 000000000000..9ebbaf5dce2a --- /dev/null +++ b/python/pyspark/pandas/tests/indexes/test_missing.py @@ -0,0 +1,244 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest +import inspect +from datetime import timedelta + +import pandas as pd + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils +from pyspark.pandas.exceptions import PandasNotImplementedError +from pyspark.pandas.missing.indexes import ( + MissingPandasLikeDatetimeIndex, + MissingPandasLikeIndex, + MissingPandasLikeMultiIndex, + MissingPandasLikeTimedeltaIndex, +) + + +class MissingMixin: + def test_missing(self): + psdf = ps.DataFrame( + { + "a": [1, 2, 3], + "b": [4, 5, 6], + "c": pd.date_range("2011-01-01", freq="D", periods=3), + "d": pd.Categorical(["a", "b", "c"]), + "e": [timedelta(1), timedelta(2), timedelta(3)], + } + ) + + # Index functions + missing_functions = inspect.getmembers(MissingPandasLikeIndex, inspect.isfunction) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] + for name in unsupported_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, + "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.set_index("a").index, name)() + + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] + for name in deprecated_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) + ): + getattr(psdf.set_index("a").index, name)() + + # MultiIndex functions + missing_functions = inspect.getmembers(MissingPandasLikeMultiIndex, inspect.isfunction) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] + for name in unsupported_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, + "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.set_index(["a", "b"]).index, name)() + + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] + for name in deprecated_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) + ): + getattr(psdf.set_index(["a", "b"]).index, name)() + + # DatetimeIndex functions + missing_functions = inspect.getmembers(MissingPandasLikeDatetimeIndex, inspect.isfunction) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] + for name in unsupported_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, + "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.set_index("c").index, name)() + + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] + for name in deprecated_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) + ): + getattr(psdf.set_index("c").index, name)() + + # TimedeltaIndex functions + missing_functions = inspect.getmembers(MissingPandasLikeTimedeltaIndex, inspect.isfunction) + unsupported_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "unsupported_function" + ] + for name in unsupported_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, + "method.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.set_index("e").index, name)() + + deprecated_functions = [ + name for (name, type_) in missing_functions if type_.__name__ == "deprecated_function" + ] + for name in deprecated_functions: + with self.assertRaisesRegex( + PandasNotImplementedError, "method.*Index.*{}.*is deprecated".format(name) + ): + getattr(psdf.set_index("e").index, name)() + + # Index properties + missing_properties = inspect.getmembers( + MissingPandasLikeIndex, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] + for name in unsupported_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, + "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.set_index("a").index, name) + + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] + for name in deprecated_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name) + ): + getattr(psdf.set_index("a").index, name) + + # MultiIndex properties + missing_properties = inspect.getmembers( + MissingPandasLikeMultiIndex, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] + for name in unsupported_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, + "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.set_index(["a", "b"]).index, name) + + deprecated_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "deprecated_property" + ] + for name in deprecated_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, "property.*Index.*{}.*is deprecated".format(name) + ): + getattr(psdf.set_index(["a", "b"]).index, name) + + # DatetimeIndex properties + missing_properties = inspect.getmembers( + MissingPandasLikeDatetimeIndex, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] + for name in unsupported_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, + "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.set_index("c").index, name) + + # TimedeltaIndex properties + missing_properties = inspect.getmembers( + MissingPandasLikeDatetimeIndex, lambda o: isinstance(o, property) + ) + unsupported_properties = [ + name + for (name, type_) in missing_properties + if type_.fget.__name__ == "unsupported_property" + ] + for name in unsupported_properties: + with self.assertRaisesRegex( + PandasNotImplementedError, + "property.*Index.*{}.*not implemented( yet\\.|\\. .+)".format(name), + ): + getattr(psdf.set_index("c").index, name) + + def test_multi_index_not_supported(self): + psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + + with self.assertRaisesRegex(TypeError, "cannot perform any with this index type"): + psdf.set_index(["a", "b"]).index.any() + + with self.assertRaisesRegex(TypeError, "cannot perform all with this index type"): + psdf.set_index(["a", "b"]).index.all() + + +class MissingTests( + MissingMixin, + PandasOnSparkTestCase, + SQLTestUtils, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.indexes.test_missing import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/indexes/test_rename.py b/python/pyspark/pandas/tests/indexes/test_rename.py index 662071f420e9..07c70792f25c 100644 --- a/python/pyspark/pandas/tests/indexes/test_rename.py +++ b/python/pyspark/pandas/tests/indexes/test_rename.py @@ -16,6 +16,7 @@ # import unittest +import numpy as np import pandas as pd from pyspark import pandas as ps @@ -226,6 +227,76 @@ class FrameRenameMixin: psdf.rename_axis(index=str.upper, columns=str.upper).sort_index(), ) + def test_multi_index_rename(self): + arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) + pdf = pd.DataFrame(np.random.randn(4, 5), idx) + psdf = ps.from_pandas(pdf) + + pmidx = pdf.index + psmidx = psdf.index + + self.assert_eq(psmidx.rename(["n", "c"]), pmidx.rename(["n", "c"])) + self.assert_eq(psdf.index.names, pdf.index.names) + + # non-string names + self.assert_eq(psmidx.rename([0, 1]), pmidx.rename([0, 1])) + self.assert_eq( + psmidx.rename([("x", "a"), ("y", "b")]), pmidx.rename([("x", "a"), ("y", "b")]) + ) + + psmidx.rename(["num", "col"], inplace=True) + pmidx.rename(["num", "col"], inplace=True) + + self.assert_eq(psmidx, pmidx) + self.assert_eq(psdf.index.names, pdf.index.names) + + self.assert_eq(psmidx.rename([None, None]), pmidx.rename([None, None])) + self.assert_eq(psdf.index.names, pdf.index.names) + + self.assertRaises(TypeError, lambda: psmidx.rename("number")) + self.assertRaises(TypeError, lambda: psmidx.rename(None)) + self.assertRaises(ValueError, lambda: psmidx.rename(["number"])) + + def test_multiindex_rename(self): + pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + psidx = ps.from_pandas(pidx) + + pidx = pidx.rename(list("ABC")) + psidx = psidx.rename(list("ABC")) + self.assert_eq(pidx, psidx) + + pidx = pidx.rename(["my", "name", "is"]) + psidx = psidx.rename(["my", "name", "is"]) + self.assert_eq(pidx, psidx) + + def test_index_rename(self): + pdf = pd.DataFrame( + np.random.randn(10, 5), index=pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name="x") + ) + psdf = ps.from_pandas(pdf) + + pidx = pdf.index + psidx = psdf.index + + self.assert_eq(psidx.rename("y"), pidx.rename("y")) + self.assert_eq(psdf.index.names, pdf.index.names) + + # non-string names + self.assert_eq(psidx.rename(0), pidx.rename(0)) + self.assert_eq(psidx.rename(("y", 0)), pidx.rename(("y", 0))) + + psidx.rename("z", inplace=True) + pidx.rename("z", inplace=True) + + self.assert_eq(psidx, pidx) + self.assert_eq(psdf.index.names, pdf.index.names) + + self.assert_eq(psidx.rename(None), pidx.rename(None)) + self.assert_eq(psdf.index.names, pdf.index.names) + + self.assertRaises(TypeError, lambda: psidx.rename(["x", "y"])) + class FrameRenameTests(FrameRenameMixin, ComparisonTestBase, SQLTestUtils): pass diff --git a/python/pyspark/pandas/tests/indexes/test_repeat.py b/python/pyspark/pandas/tests/indexes/test_repeat.py new file mode 100644 index 000000000000..737fdb53622e --- /dev/null +++ b/python/pyspark/pandas/tests/indexes/test_repeat.py @@ -0,0 +1,65 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +import pandas as pd + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils + + +class RepeatMixin: + def test_repeat(self): + pidx = pd.Index(["a", "b", "c"]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(psidx.repeat(3).sort_values(), pidx.repeat(3).sort_values()) + self.assert_eq(psidx.repeat(0).sort_values(), pidx.repeat(0).sort_values()) + self.assert_eq((psidx + "x").repeat(3).sort_values(), (pidx + "x").repeat(3).sort_values()) + + self.assertRaises(ValueError, lambda: psidx.repeat(-1)) + self.assertRaises(TypeError, lambda: psidx.repeat("abc")) + + pmidx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")]) + psmidx = ps.from_pandas(pmidx) + + self.assert_eq(psmidx.repeat(3).sort_values(), pmidx.repeat(3).sort_values()) + self.assert_eq(psmidx.repeat(0).sort_values(), pmidx.repeat(0).sort_values(), almost=True) + + self.assertRaises(ValueError, lambda: psmidx.repeat(-1)) + self.assertRaises(TypeError, lambda: psmidx.repeat("abc")) + + +class RepeatTests( + RepeatMixin, + PandasOnSparkTestCase, + SQLTestUtils, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.indexes.test_repeat import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/indexes/test_stat.py b/python/pyspark/pandas/tests/indexes/test_stat.py new file mode 100644 index 000000000000..101a23c53127 --- /dev/null +++ b/python/pyspark/pandas/tests/indexes/test_stat.py @@ -0,0 +1,181 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +import numpy as np +import pandas as pd + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils + + +class StatMixin: + def test_len(self): + pidx = pd.Index(range(10000)) + psidx = ps.from_pandas(pidx) + + self.assert_eq(len(pidx), len(psidx)) + + pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + + self.assert_eq(len(pidx), len(psidx)) + + def test_abs(self): + pidx = pd.Index([-2, -1, 0, 1]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(abs(pidx), abs(psidx)) + self.assert_eq(np.abs(pidx), np.abs(psidx)) + + psidx = ps.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"]) + with self.assertRaisesRegex(TypeError, "perform __abs__ with this index"): + abs(psidx) + + def test_argmin(self): + pidx = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(pidx.argmin(), psidx.argmin()) + + # MultiIndex + psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + with self.assertRaisesRegex( + TypeError, "reduction operation 'argmin' not allowed for this dtype" + ): + psidx.argmin() + + def test_argmax(self): + pidx = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(pidx.argmax(), psidx.argmax()) + + # MultiIndex + psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + with self.assertRaisesRegex( + TypeError, "reduction operation 'argmax' not allowed for this dtype" + ): + psidx.argmax() + + def test_min(self): + pidx = pd.Index([3, 2, 1]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(pidx.min(), psidx.min()) + + # MultiIndex + pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) + psmidx = ps.from_pandas(pmidx) + + self.assert_eq(pmidx.min(), psmidx.min()) + + pidx = pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(pidx.min(), psidx.min()) + + def test_max(self): + pidx = pd.Index([3, 2, 1]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(pidx.max(), psidx.max()) + + # MultiIndex + pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2)]) + psmidx = ps.from_pandas(pmidx) + + self.assert_eq(pmidx.max(), psmidx.max()) + + pidx = pd.DatetimeIndex(["2021-02-01", "2021-01-01", "2021-04-01", "2021-03-01"]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(pidx.max(), psidx.max()) + + def test_hasnans(self): + # BooleanType + pidx = pd.Index([True, False, True, True]) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.hasnans, psidx.hasnans) + + pidx = pd.Index([True, False, np.nan, True]) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.hasnans, psidx.hasnans) + + # TimestampType + pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)]) + psser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, psser.hasnans) + + pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")]) + psser = ps.from_pandas(pser) + self.assert_eq(pser.hasnans, psser.hasnans) + + # empty + pidx = pd.Index([]) + psidx = ps.from_pandas(pidx) + self.assert_eq(pidx.hasnans, psidx.hasnans) + + # Not supported for MultiIndex + psmidx = ps.Index([("a", 1), ("b", 2)]) + self.assertRaises(NotImplementedError, lambda: psmidx.hasnans()) + + def test_index_fillna(self): + pidx = pd.Index([1, 2, None]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(pidx.fillna(0), psidx.fillna(0), almost=True) + self.assert_eq(pidx.rename("name").fillna(0), psidx.rename("name").fillna(0), almost=True) + + with self.assertRaisesRegex(TypeError, "Unsupported type list"): + psidx.fillna([1, 2]) + + def test_multiindex_isna(self): + psidx = ps.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)]) + + with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"): + psidx.isna() + + with self.assertRaisesRegex(NotImplementedError, "isna is not defined for MultiIndex"): + psidx.isnull() + + with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): + psidx.notna() + + with self.assertRaisesRegex(NotImplementedError, "notna is not defined for MultiIndex"): + psidx.notnull() + + +class StatTests( + StatMixin, + PandasOnSparkTestCase, + SQLTestUtils, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.indexes.test_stat import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org