This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e9ba896f8446 [SPARK-46266][PS][CONNECT][TESTS] Re-organize `NumOpsTests`
e9ba896f8446 is described below
commit e9ba896f8446885d42bcb27953fe2d6050794be1
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Dec 5 08:53:05 2023 -0800
[SPARK-46266][PS][CONNECT][TESTS] Re-organize `NumOpsTests`
### What changes were proposed in this pull request?
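Re-organize `NumOpsTests` by factoring the `astype` tests out into a new `AsTypeTestsMixin` (`test_as_type.py`), together with a matching Spark Connect parity test (`test_parity_as_type.py`).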
### Why are the changes needed?
To group the tests by topic.
### Does this PR introduce _any_ user-facing change?
No, this is a test-only change.
### How was this patch tested?
Covered by CI.
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44182 from zhengruifeng/ps_reorg_as_type.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
dev/sparktestsupport/modules.py | 2 +
.../connect/data_type_ops/test_parity_as_type.py | 43 +++++++++
.../pandas/tests/data_type_ops/test_as_type.py | 103 +++++++++++++++++++++
.../pandas/tests/data_type_ops/test_num_ops.py | 57 ------------
4 files changed, 148 insertions(+), 57 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index f35c42d11e58..9bbe86baa1dc 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -698,6 +698,7 @@ pyspark_pandas = Module(
"pyspark.pandas.spark.utils",
"pyspark.pandas.typedef.typehints",
# unittests
+ "pyspark.pandas.tests.data_type_ops.test_as_type",
"pyspark.pandas.tests.data_type_ops.test_base",
"pyspark.pandas.tests.data_type_ops.test_binary_ops",
"pyspark.pandas.tests.data_type_ops.test_boolean_ops",
@@ -952,6 +953,7 @@ pyspark_pandas_connect_part0 = Module(
],
python_test_goals=[
# pandas-on-Spark unittests
+ "pyspark.pandas.tests.connect.data_type_ops.test_parity_as_type",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_base",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_binary_ops",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_boolean_ops",
diff --git
a/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_as_type.py
b/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_as_type.py
new file mode 100644
index 000000000000..a2a9e28a5ab5
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_as_type.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark import pandas as ps
+from pyspark.pandas.tests.data_type_ops.test_as_type import AsTypeTestsMixin
+from pyspark.pandas.tests.connect.data_type_ops.testing_utils import
OpsTestBase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+from pyspark.testing.connectutils import ReusedConnectTestCase
+
+
+class AsTypeParityTests(
+ AsTypeTestsMixin, PandasOnSparkTestUtils, OpsTestBase,
ReusedConnectTestCase
+):
+ @property
+ def psdf(self):
+ return ps.from_pandas(self.pdf)
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.connect.data_type_ops.test_parity_as_type import
* # noqa: F401
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_as_type.py
b/python/pyspark/pandas/tests/data_type_ops/test_as_type.py
new file mode 100644
index 000000000000..9d5c0d03d548
--- /dev/null
+++ b/python/pyspark/pandas/tests/data_type_ops/test_as_type.py
@@ -0,0 +1,103 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+import pandas as pd
+import numpy as np
+from pandas.api.types import CategoricalDtype
+
+from pyspark import pandas as ps
+from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase
+from pyspark.pandas.typedef.typehints import (
+ extension_dtypes_available,
+ extension_float_dtypes_available,
+ extension_object_dtypes_available,
+)
+
+
+class AsTypeTestsMixin:
+ """Unit tests for arithmetic operations of numeric data types.
+
+ A few test cases are disabled because pandas-on-Spark returns float64
whereas pandas
+ returns float32.
+ The underlying reason is the respective Spark operations return DoubleType
always.
+ """
+
+ def test_astype(self):
+ pdf, psdf = self.pdf, self.psdf
+ for col in self.numeric_df_cols:
+ pser, psser = pdf[col], psdf[col]
+
+ for int_type in [int, np.int32, np.int16, np.int8]:
+ if not pser.hasnans:
+ self.assert_eq(pser.astype(int_type),
psser.astype(int_type))
+ else:
+ self.assertRaisesRegex(
+ ValueError,
+ "Cannot convert %s with missing "
+ "values to integer" % psser._dtype_op.pretty_name,
+ lambda: psser.astype(int_type),
+ )
+
+ # TODO(SPARK-37039): the np.nan series.astype(bool) should be True
+ if not pser.hasnans:
+ self.assert_eq(pser.astype(bool), psser.astype(bool))
+
+ self.assert_eq(pser.astype(float), psser.astype(float))
+ self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
+ self.assert_eq(pser.astype(str), psser.astype(str))
+ self.assert_eq(pser.astype("category"), psser.astype("category"))
+ cat_type = CategoricalDtype(categories=[2, 1, 3])
+ self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
+ if extension_object_dtypes_available and
extension_float_dtypes_available:
+ pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]),
dtype=pd.Float64Dtype())
+ psser = ps.from_pandas(pser)
+ self.assert_eq(pser.astype(pd.BooleanDtype()),
psser.astype(pd.BooleanDtype()))
+
+ def test_astype_eager_check(self):
+ psser = self.psdf["float_nan"]
+ with ps.option_context("compute.eager_check", True),
self.assertRaisesRegex(
+ ValueError, "Cannot convert"
+ ):
+ psser.astype(int)
+ with ps.option_context("compute.eager_check", False):
+ psser.astype(int)
+
+ psser = self.psdf["decimal_nan"]
+ with ps.option_context("compute.eager_check", True),
self.assertRaisesRegex(
+ ValueError, "Cannot convert"
+ ):
+ psser.astype(int)
+ with ps.option_context("compute.eager_check", False):
+ psser.astype(int)
+
+
+class AsTypeTests(AsTypeTestsMixin, OpsTestBase):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.data_type_ops.test_as_type import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
index 70bb6a8da1c2..b1c80b31651b 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
@@ -19,7 +19,6 @@ import unittest
import pandas as pd
import numpy as np
-from pandas.api.types import CategoricalDtype
from pyspark import pandas as ps
from pyspark.pandas.config import option_context
@@ -40,14 +39,6 @@ class NumOpsTestsMixin:
The underlying reason is the respective Spark operations return DoubleType
always.
"""
- @property
- def float_pser(self):
- return pd.Series([1, 2, 3], dtype=float)
-
- @property
- def float_psser(self):
- return ps.from_pandas(self.float_pser)
-
def test_and(self):
psdf = self.psdf
for col in self.numeric_df_cols:
@@ -117,54 +108,6 @@ class NumOpsTestsMixin:
for col in self.numeric_df_cols:
self.assert_eq(pdf[col].isnull(), psdf[col].isnull())
- def test_astype(self):
- pdf, psdf = self.pdf, self.psdf
- for col in self.numeric_df_cols:
- pser, psser = pdf[col], psdf[col]
-
- for int_type in [int, np.int32, np.int16, np.int8]:
- if not pser.hasnans:
- self.assert_eq(pser.astype(int_type),
psser.astype(int_type))
- else:
- self.assertRaisesRegex(
- ValueError,
- "Cannot convert %s with missing "
- "values to integer" % psser._dtype_op.pretty_name,
- lambda: psser.astype(int_type),
- )
-
- # TODO(SPARK-37039): the np.nan series.astype(bool) should be True
- if not pser.hasnans:
- self.assert_eq(pser.astype(bool), psser.astype(bool))
-
- self.assert_eq(pser.astype(float), psser.astype(float))
- self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
- self.assert_eq(pser.astype(str), psser.astype(str))
- self.assert_eq(pser.astype("category"), psser.astype("category"))
- cat_type = CategoricalDtype(categories=[2, 1, 3])
- self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
- if extension_object_dtypes_available and
extension_float_dtypes_available:
- pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]),
dtype=pd.Float64Dtype())
- psser = ps.from_pandas(pser)
- self.assert_eq(pser.astype(pd.BooleanDtype()),
psser.astype(pd.BooleanDtype()))
-
- def test_astype_eager_check(self):
- psser = self.psdf["float_nan"]
- with ps.option_context("compute.eager_check", True),
self.assertRaisesRegex(
- ValueError, "Cannot convert"
- ):
- psser.astype(int)
- with ps.option_context("compute.eager_check", False):
- psser.astype(int)
-
- psser = self.psdf["decimal_nan"]
- with ps.option_context("compute.eager_check", True),
self.assertRaisesRegex(
- ValueError, "Cannot convert"
- ):
- psser.astype(int)
- with ps.option_context("compute.eager_check", False):
- psser.astype(int)
-
def test_neg(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]