This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new e9ba896f8446 [SPARK-46266][PS][CONNECT][TESTS] Re-organize `NumOpsTests` e9ba896f8446 is described below commit e9ba896f8446885d42bcb27953fe2d6050794be1 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Tue Dec 5 08:53:05 2023 -0800 [SPARK-46266][PS][CONNECT][TESTS] Re-organize `NumOpsTests` ### What changes were proposed in this pull request? Re-organize `NumOpsTests`, factor out the `astype` tests ### Why are the changes needed? group the tests by topics ### Does this PR introduce _any_ user-facing change? no, test-only ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44182 from zhengruifeng/ps_reorg_as_type. Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- dev/sparktestsupport/modules.py | 2 + .../connect/data_type_ops/test_parity_as_type.py | 43 +++++++++ .../pandas/tests/data_type_ops/test_as_type.py | 103 +++++++++++++++++++++ .../pandas/tests/data_type_ops/test_num_ops.py | 57 ------------ 4 files changed, 148 insertions(+), 57 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index f35c42d11e58..9bbe86baa1dc 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -698,6 +698,7 @@ pyspark_pandas = Module( "pyspark.pandas.spark.utils", "pyspark.pandas.typedef.typehints", # unittests + "pyspark.pandas.tests.data_type_ops.test_as_type", "pyspark.pandas.tests.data_type_ops.test_base", "pyspark.pandas.tests.data_type_ops.test_binary_ops", "pyspark.pandas.tests.data_type_ops.test_boolean_ops", @@ -952,6 +953,7 @@ pyspark_pandas_connect_part0 = Module( ], python_test_goals=[ # pandas-on-Spark unittests + "pyspark.pandas.tests.connect.data_type_ops.test_parity_as_type", "pyspark.pandas.tests.connect.data_type_ops.test_parity_base", "pyspark.pandas.tests.connect.data_type_ops.test_parity_binary_ops", "pyspark.pandas.tests.connect.data_type_ops.test_parity_boolean_ops", diff --git a/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_as_type.py b/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_as_type.py new file mode 100644 index 000000000000..a2a9e28a5ab5 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_as_type.py @@ -0,0 +1,43 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +from pyspark import pandas as ps +from pyspark.pandas.tests.data_type_ops.test_as_type import AsTypeTestsMixin +from pyspark.pandas.tests.connect.data_type_ops.testing_utils import OpsTestBase +from pyspark.testing.pandasutils import PandasOnSparkTestUtils +from pyspark.testing.connectutils import ReusedConnectTestCase + + +class AsTypeParityTests( + AsTypeTestsMixin, PandasOnSparkTestUtils, OpsTestBase, ReusedConnectTestCase +): + @property + def psdf(self): + return ps.from_pandas(self.pdf) + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.data_type_ops.test_parity_as_type import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_as_type.py b/python/pyspark/pandas/tests/data_type_ops/test_as_type.py new file mode 100644 index 000000000000..9d5c0d03d548 --- /dev/null +++ b/python/pyspark/pandas/tests/data_type_ops/test_as_type.py @@ -0,0 +1,103 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +import pandas as pd +import numpy as np +from pandas.api.types import CategoricalDtype + +from pyspark import pandas as ps +from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase +from pyspark.pandas.typedef.typehints import ( + extension_dtypes_available, + extension_float_dtypes_available, + extension_object_dtypes_available, +) + + +class AsTypeTestsMixin: + """Unit tests for arithmetic operations of numeric data types. + + A few test cases are disabled because pandas-on-Spark returns float64 whereas pandas + returns float32. + The underlying reason is the respective Spark operations return DoubleType always. + """ + + def test_astype(self): + pdf, psdf = self.pdf, self.psdf + for col in self.numeric_df_cols: + pser, psser = pdf[col], psdf[col] + + for int_type in [int, np.int32, np.int16, np.int8]: + if not pser.hasnans: + self.assert_eq(pser.astype(int_type), psser.astype(int_type)) + else: + self.assertRaisesRegex( + ValueError, + "Cannot convert %s with missing " + "values to integer" % psser._dtype_op.pretty_name, + lambda: psser.astype(int_type), + ) + + # TODO(SPARK-37039): the np.nan series.astype(bool) should be True + if not pser.hasnans: + self.assert_eq(pser.astype(bool), psser.astype(bool)) + + self.assert_eq(pser.astype(float), psser.astype(float)) + self.assert_eq(pser.astype(np.float32), psser.astype(np.float32)) + self.assert_eq(pser.astype(str), psser.astype(str)) + self.assert_eq(pser.astype("category"), psser.astype("category")) + cat_type = CategoricalDtype(categories=[2, 1, 3]) + self.assert_eq(pser.astype(cat_type), psser.astype(cat_type)) + if extension_object_dtypes_available and extension_float_dtypes_available: + pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype()) + psser = ps.from_pandas(pser) + self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype())) + + def test_astype_eager_check(self): + psser = self.psdf["float_nan"] + with ps.option_context("compute.eager_check", True), self.assertRaisesRegex( + ValueError, "Cannot convert" + ): + psser.astype(int) + with ps.option_context("compute.eager_check", False): + psser.astype(int) + + psser = self.psdf["decimal_nan"] + with ps.option_context("compute.eager_check", True), self.assertRaisesRegex( + ValueError, "Cannot convert" + ): + psser.astype(int) + with ps.option_context("compute.eager_check", False): + psser.astype(int) + + +class AsTypeTests(AsTypeTestsMixin, OpsTestBase): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.data_type_ops.test_as_type import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py index 70bb6a8da1c2..b1c80b31651b 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py @@ -19,7 +19,6 @@ import unittest import pandas as pd import numpy as np -from pandas.api.types import CategoricalDtype from pyspark import pandas as ps from pyspark.pandas.config import option_context @@ -40,14 +39,6 @@ class NumOpsTestsMixin: The underlying reason is the respective Spark operations return DoubleType always. """ - @property - def float_pser(self): - return pd.Series([1, 2, 3], dtype=float) - - @property - def float_psser(self): - return ps.from_pandas(self.float_pser) - def test_and(self): psdf = self.psdf for col in self.numeric_df_cols: @@ -117,54 +108,6 @@ class NumOpsTestsMixin: for col in self.numeric_df_cols: self.assert_eq(pdf[col].isnull(), psdf[col].isnull()) - def test_astype(self): - pdf, psdf = self.pdf, self.psdf - for col in self.numeric_df_cols: - pser, psser = pdf[col], psdf[col] - - for int_type in [int, np.int32, np.int16, np.int8]: - if not pser.hasnans: - self.assert_eq(pser.astype(int_type), psser.astype(int_type)) - else: - self.assertRaisesRegex( - ValueError, - "Cannot convert %s with missing " - "values to integer" % psser._dtype_op.pretty_name, - lambda: psser.astype(int_type), - ) - - # TODO(SPARK-37039): the np.nan series.astype(bool) should be True - if not pser.hasnans: - self.assert_eq(pser.astype(bool), psser.astype(bool)) - - self.assert_eq(pser.astype(float), psser.astype(float)) - self.assert_eq(pser.astype(np.float32), psser.astype(np.float32)) - self.assert_eq(pser.astype(str), psser.astype(str)) - self.assert_eq(pser.astype("category"), psser.astype("category")) - cat_type = CategoricalDtype(categories=[2, 1, 3]) - self.assert_eq(pser.astype(cat_type), psser.astype(cat_type)) - if extension_object_dtypes_available and extension_float_dtypes_available: - pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype()) - psser = ps.from_pandas(pser) - self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype())) - - def test_astype_eager_check(self): - psser = self.psdf["float_nan"] - with ps.option_context("compute.eager_check", True), self.assertRaisesRegex( - ValueError, "Cannot convert" - ): - psser.astype(int) - with ps.option_context("compute.eager_check", False): - psser.astype(int) - - psser = self.psdf["decimal_nan"] - with ps.option_context("compute.eager_check", True), self.assertRaisesRegex( - ValueError, "Cannot convert" - ): - psser.astype(int) - with ps.option_context("compute.eager_check", False): - psser.astype(int) - def test_neg(self): pdf, psdf = self.pdf, self.psdf for col in self.numeric_df_cols: --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org