(spark) branch master updated: [SPARK-46266][PS][CONNECT][TESTS] Re-organize `NumOpsTests`

dongjoon Tue, 05 Dec 2023 08:53:25 -0800

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new e9ba896f8446 [SPARK-46266][PS][CONNECT][TESTS] Re-organize `NumOpsTests`
e9ba896f8446 is described below

commit e9ba896f8446885d42bcb27953fe2d6050794be1
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Dec 5 08:53:05 2023 -0800

    [SPARK-46266][PS][CONNECT][TESTS] Re-organize `NumOpsTests`
    
    ### What changes were proposed in this pull request?
    Re-organize `NumOpsTests`, factor out the `astype` tests
    
    ### Why are the changes needed?
    group the tests by topics
    
    ### Does this PR introduce _any_ user-facing change?
    no, test-only
    
    ### How was this patch tested?
    ci
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #44182 from zhengruifeng/ps_reorg_as_type.
    
    Authored-by: Ruifeng Zheng <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 dev/sparktestsupport/modules.py                    |   2 +
 .../connect/data_type_ops/test_parity_as_type.py   |  43 +++++++++
 .../pandas/tests/data_type_ops/test_as_type.py     | 103 +++++++++++++++++++++
 .../pandas/tests/data_type_ops/test_num_ops.py     |  57 ------------
 4 files changed, 148 insertions(+), 57 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index f35c42d11e58..9bbe86baa1dc 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -698,6 +698,7 @@ pyspark_pandas = Module(
         "pyspark.pandas.spark.utils",
         "pyspark.pandas.typedef.typehints",
         # unittests
+        "pyspark.pandas.tests.data_type_ops.test_as_type",
         "pyspark.pandas.tests.data_type_ops.test_base",
         "pyspark.pandas.tests.data_type_ops.test_binary_ops",
         "pyspark.pandas.tests.data_type_ops.test_boolean_ops",
@@ -952,6 +953,7 @@ pyspark_pandas_connect_part0 = Module(
     ],
     python_test_goals=[
         # pandas-on-Spark unittests
+        "pyspark.pandas.tests.connect.data_type_ops.test_parity_as_type",
         "pyspark.pandas.tests.connect.data_type_ops.test_parity_base",
         "pyspark.pandas.tests.connect.data_type_ops.test_parity_binary_ops",
         "pyspark.pandas.tests.connect.data_type_ops.test_parity_boolean_ops",
diff --git a/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_as_type.py b/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_as_type.py
new file mode 100644
index 000000000000..a2a9e28a5ab5
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_as_type.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark import pandas as ps
+from pyspark.pandas.tests.data_type_ops.test_as_type import AsTypeTestsMixin
+from pyspark.pandas.tests.connect.data_type_ops.testing_utils import OpsTestBase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+from pyspark.testing.connectutils import ReusedConnectTestCase
+
+
+class AsTypeParityTests(
+    AsTypeTestsMixin, PandasOnSparkTestUtils, OpsTestBase, ReusedConnectTestCase
+):
+    @property
+    def psdf(self):
+        return ps.from_pandas(self.pdf)
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.connect.data_type_ops.test_parity_as_type import *  # noqa: F401
+
+    try:
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_as_type.py b/python/pyspark/pandas/tests/data_type_ops/test_as_type.py
new file mode 100644
index 000000000000..9d5c0d03d548
--- /dev/null
+++ b/python/pyspark/pandas/tests/data_type_ops/test_as_type.py
@@ -0,0 +1,103 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+import pandas as pd
+import numpy as np
+from pandas.api.types import CategoricalDtype
+
+from pyspark import pandas as ps
+from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase
+from pyspark.pandas.typedef.typehints import (
+    extension_dtypes_available,
+    extension_float_dtypes_available,
+    extension_object_dtypes_available,
+)
+
+
+class AsTypeTestsMixin:
+    """Unit tests for arithmetic operations of numeric data types.
+
+    A few test cases are disabled because pandas-on-Spark returns float64 whereas pandas
+    returns float32.
+    The underlying reason is the respective Spark operations return DoubleType always.
+    """
+
+    def test_astype(self):
+        pdf, psdf = self.pdf, self.psdf
+        for col in self.numeric_df_cols:
+            pser, psser = pdf[col], psdf[col]
+
+            for int_type in [int, np.int32, np.int16, np.int8]:
+                if not pser.hasnans:
+                    self.assert_eq(pser.astype(int_type), psser.astype(int_type))
+                else:
+                    self.assertRaisesRegex(
+                        ValueError,
+                        "Cannot convert %s with missing "
+                        "values to integer" % psser._dtype_op.pretty_name,
+                        lambda: psser.astype(int_type),
+                    )
+
+            # TODO(SPARK-37039): the np.nan series.astype(bool) should be True
+            if not pser.hasnans:
+                self.assert_eq(pser.astype(bool), psser.astype(bool))
+
+            self.assert_eq(pser.astype(float), psser.astype(float))
+            self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
+            self.assert_eq(pser.astype(str), psser.astype(str))
+            self.assert_eq(pser.astype("category"), psser.astype("category"))
+            cat_type = CategoricalDtype(categories=[2, 1, 3])
+            self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
+        if extension_object_dtypes_available and extension_float_dtypes_available:
+            pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
+            psser = ps.from_pandas(pser)
+            self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype()))
+
+    def test_astype_eager_check(self):
+        psser = self.psdf["float_nan"]
+        with ps.option_context("compute.eager_check", True), self.assertRaisesRegex(
+            ValueError, "Cannot convert"
+        ):
+            psser.astype(int)
+        with ps.option_context("compute.eager_check", False):
+            psser.astype(int)
+
+        psser = self.psdf["decimal_nan"]
+        with ps.option_context("compute.eager_check", True), self.assertRaisesRegex(
+            ValueError, "Cannot convert"
+        ):
+            psser.astype(int)
+        with ps.option_context("compute.eager_check", False):
+            psser.astype(int)
+
+
+class AsTypeTests(AsTypeTestsMixin, OpsTestBase):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.data_type_ops.test_as_type import *  # noqa: F401
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
index 70bb6a8da1c2..b1c80b31651b 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
@@ -19,7 +19,6 @@ import unittest
 
 import pandas as pd
 import numpy as np
-from pandas.api.types import CategoricalDtype
 
 from pyspark import pandas as ps
 from pyspark.pandas.config import option_context
@@ -40,14 +39,6 @@ class NumOpsTestsMixin:
     The underlying reason is the respective Spark operations return DoubleType always.
     """
 
-    @property
-    def float_pser(self):
-        return pd.Series([1, 2, 3], dtype=float)
-
-    @property
-    def float_psser(self):
-        return ps.from_pandas(self.float_pser)
-
     def test_and(self):
         psdf = self.psdf
         for col in self.numeric_df_cols:
@@ -117,54 +108,6 @@ class NumOpsTestsMixin:
         for col in self.numeric_df_cols:
             self.assert_eq(pdf[col].isnull(), psdf[col].isnull())
 
-    def test_astype(self):
-        pdf, psdf = self.pdf, self.psdf
-        for col in self.numeric_df_cols:
-            pser, psser = pdf[col], psdf[col]
-
-            for int_type in [int, np.int32, np.int16, np.int8]:
-                if not pser.hasnans:
-                    self.assert_eq(pser.astype(int_type), psser.astype(int_type))
-                else:
-                    self.assertRaisesRegex(
-                        ValueError,
-                        "Cannot convert %s with missing "
-                        "values to integer" % psser._dtype_op.pretty_name,
-                        lambda: psser.astype(int_type),
-                    )
-
-            # TODO(SPARK-37039): the np.nan series.astype(bool) should be True
-            if not pser.hasnans:
-                self.assert_eq(pser.astype(bool), psser.astype(bool))
-
-            self.assert_eq(pser.astype(float), psser.astype(float))
-            self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
-            self.assert_eq(pser.astype(str), psser.astype(str))
-            self.assert_eq(pser.astype("category"), psser.astype("category"))
-            cat_type = CategoricalDtype(categories=[2, 1, 3])
-            self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
-        if extension_object_dtypes_available and extension_float_dtypes_available:
-            pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
-            psser = ps.from_pandas(pser)
-            self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype()))
-
-    def test_astype_eager_check(self):
-        psser = self.psdf["float_nan"]
-        with ps.option_context("compute.eager_check", True), self.assertRaisesRegex(
-            ValueError, "Cannot convert"
-        ):
-            psser.astype(int)
-        with ps.option_context("compute.eager_check", False):
-            psser.astype(int)
-
-        psser = self.psdf["decimal_nan"]
-        with ps.option_context("compute.eager_check", True), self.assertRaisesRegex(
-            ValueError, "Cannot convert"
-        ):
-            psser.astype(int)
-        with ps.option_context("compute.eager_check", False):
-            psser.astype(int)
-
     def test_neg(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-46266][PS][CONNECT][TESTS] Re-organize `NumOpsTests`

Reply via email to