(spark) branch master updated: [SPARK-46268][PS][CONNECT][TESTS] Re-organize `StatsTests`

dongjoon Tue, 05 Dec 2023 10:20:07 -0800

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new ef05fb632424 [SPARK-46268][PS][CONNECT][TESTS] Re-organize `StatsTests`
ef05fb632424 is described below

commit ef05fb632424f8f121bcd4518ab3a8815c295c85
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Dec 5 10:19:41 2023 -0800

    [SPARK-46268][PS][CONNECT][TESTS] Re-organize `StatsTests`
    
    ### What changes were proposed in this pull request?
    Re-organize `StatsTests`
    
    ### Why are the changes needed?
    Break up the large test file by grouping test cases by topic.
    
    ### Does this PR introduce _any_ user-facing change?
    no
    
    ### How was this patch tested?
    CI
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #44185 from zhengruifeng/ps_reorg_test_stats.
    
    Authored-by: Ruifeng Zheng <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 dev/sparktestsupport/modules.py                    |   8 +-
 .../pyspark/pandas/tests/computation/test_corr.py  | 222 ++++++++++++++++
 .../pandas/tests/{ => computation}/test_stats.py   | 278 +--------------------
 .../test_parity_corr.py}                           |   7 +-
 .../connect/{ => computation}/test_parity_stats.py |   4 +-
 .../test_parity_axis.py}                           |   6 +-
 python/pyspark/pandas/tests/frame/test_axis.py     | 135 ++++++++++
 7 files changed, 373 insertions(+), 287 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 9bbe86baa1dc..900329d07c00 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -789,6 +789,7 @@ pyspark_pandas_slow = Module(
         "pyspark.pandas.tests.computation.test_binary_ops",
         "pyspark.pandas.tests.computation.test_combine",
         "pyspark.pandas.tests.computation.test_compute",
+        "pyspark.pandas.tests.computation.test_corr",
         "pyspark.pandas.tests.computation.test_corrwith",
         "pyspark.pandas.tests.computation.test_cov",
         "pyspark.pandas.tests.computation.test_cumulative",
@@ -797,7 +798,9 @@ pyspark_pandas_slow = Module(
         "pyspark.pandas.tests.computation.test_melt",
         "pyspark.pandas.tests.computation.test_missing_data",
         "pyspark.pandas.tests.computation.test_pivot",
+        "pyspark.pandas.tests.computation.test_stats",
         "pyspark.pandas.tests.frame.test_attrs",
+        "pyspark.pandas.tests.frame.test_axis",
         "pyspark.pandas.tests.frame.test_constructor",
         "pyspark.pandas.tests.frame.test_conversion",
         "pyspark.pandas.tests.frame.test_reindexing",
@@ -841,7 +844,6 @@ pyspark_pandas_slow = Module(
         "pyspark.pandas.tests.series.test_series",
         "pyspark.pandas.tests.series.test_sort",
         "pyspark.pandas.tests.series.test_stat",
-        "pyspark.pandas.tests.test_stats",
     ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, 
pandas, and pyarrow and
@@ -1014,6 +1016,7 @@ pyspark_pandas_connect_part0 = Module(
         "pyspark.pandas.tests.connect.computation.test_parity_combine",
         "pyspark.pandas.tests.connect.computation.test_parity_compute",
         "pyspark.pandas.tests.connect.computation.test_parity_cov",
+        "pyspark.pandas.tests.connect.computation.test_parity_corr",
         "pyspark.pandas.tests.connect.computation.test_parity_corrwith",
         "pyspark.pandas.tests.connect.computation.test_parity_cumulative",
         "pyspark.pandas.tests.connect.computation.test_parity_describe",
@@ -1021,6 +1024,7 @@ pyspark_pandas_connect_part0 = Module(
         "pyspark.pandas.tests.connect.computation.test_parity_melt",
         "pyspark.pandas.tests.connect.groupby.test_parity_stat",
         "pyspark.pandas.tests.connect.frame.test_parity_attrs",
+        "pyspark.pandas.tests.connect.frame.test_parity_axis",
         "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_frame",
         "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_series",
     ],
@@ -1075,7 +1079,6 @@ pyspark_pandas_connect_part1 = Module(
         "pyspark.pandas.tests.connect.series.test_parity_stat",
         
"pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic",
         "pyspark.pandas.tests.connect.test_parity_reshape",
-        "pyspark.pandas.tests.connect.test_parity_stats",
         
"pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_expanding",
     ],
     excluded_python_implementations=[
@@ -1094,6 +1097,7 @@ pyspark_pandas_connect_part2 = Module(
     python_test_goals=[
         # pandas-on-Spark unittests
         "pyspark.pandas.tests.connect.computation.test_parity_pivot",
+        "pyspark.pandas.tests.connect.computation.test_parity_stats",
         "pyspark.pandas.tests.connect.indexes.test_parity_base_slow",
         "pyspark.pandas.tests.connect.indexes.test_parity_datetime_property",
         "pyspark.pandas.tests.connect.test_parity_frame_interpolate",
diff --git a/python/pyspark/pandas/tests/computation/test_corr.py b/python/pyspark/pandas/tests/computation/test_corr.py
new file mode 100644
index 000000000000..a7b06aa2928a
--- /dev/null
+++ b/python/pyspark/pandas/tests/computation/test_corr.py
@@ -0,0 +1,222 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, 
SPARK_CONF_ARROW_ENABLED
+from pyspark.testing.sqlutils import SQLTestUtils
+
+
+class FrameCorrMixin:
+    def test_dataframe_corr(self):
+        pdf = pd.DataFrame(
+            index=[
+                "".join(
+                    np.random.choice(
+                        
list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10
+                    )
+                )
+                for _ in range(30)
+            ],
+            columns=list("ABCD"),
+            dtype="float64",
+        )
+        psdf = ps.from_pandas(pdf)
+
+        with self.assertRaisesRegex(ValueError, "Invalid method"):
+            psdf.corr("std")
+        with self.assertRaisesRegex(TypeError, "Invalid min_periods type"):
+            psdf.corr(min_periods="3")
+
+        for method in ["pearson", "spearman", "kendall"]:
+            self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), 
check_exact=False)
+            self.assert_eq(
+                psdf.corr(method=method, min_periods=1),
+                pdf.corr(method=method, min_periods=1),
+                check_exact=False,
+            )
+            self.assert_eq(
+                psdf.corr(method=method, min_periods=3),
+                pdf.corr(method=method, min_periods=3),
+                check_exact=False,
+            )
+            self.assert_eq(
+                (psdf + 1).corr(method=method, min_periods=2),
+                (pdf + 1).corr(method=method, min_periods=2),
+                check_exact=False,
+            )
+
+        # multi-index columns
+        columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", 
"C"), ("Z", "D")])
+        pdf.columns = columns
+        psdf.columns = columns
+
+        for method in ["pearson", "spearman", "kendall"]:
+            self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), 
check_exact=False)
+            self.assert_eq(
+                psdf.corr(method=method, min_periods=1),
+                pdf.corr(method=method, min_periods=1),
+                check_exact=False,
+            )
+            self.assert_eq(
+                psdf.corr(method=method, min_periods=3),
+                pdf.corr(method=method, min_periods=3),
+                check_exact=False,
+            )
+            self.assert_eq(
+                (psdf + 1).corr(method=method, min_periods=2),
+                (pdf + 1).corr(method=method, min_periods=2),
+                check_exact=False,
+            )
+
+        # test with identical values
+        pdf = pd.DataFrame(
+            {
+                "a": [0, 1, 1, 1, 0],
+                "b": [2, 2, -1, 1, np.nan],
+                "c": [3, 3, 3, 3, 3],
+                "d": [np.nan, np.nan, np.nan, np.nan, np.nan],
+            }
+        )
+        psdf = ps.from_pandas(pdf)
+
+        for method in ["pearson", "spearman", "kendall"]:
+            self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), 
check_exact=False)
+            self.assert_eq(
+                psdf.corr(method=method, min_periods=1),
+                pdf.corr(method=method, min_periods=1),
+                check_exact=False,
+            )
+            self.assert_eq(
+                psdf.corr(method=method, min_periods=3),
+                pdf.corr(method=method, min_periods=3),
+                check_exact=False,
+            )
+
+    def test_series_corr(self):
+        pdf = pd.DataFrame(
+            index=[
+                "".join(
+                    np.random.choice(
+                        
list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10
+                    )
+                )
+                for _ in range(30)
+            ],
+            columns=list("ABCD"),
+            dtype="float64",
+        )
+        pser1 = pdf.A
+        pser2 = pdf.B
+        psdf = ps.from_pandas(pdf)
+        psser1 = psdf.A
+        psser2 = psdf.B
+
+        with self.assertRaisesRegex(ValueError, "Invalid method"):
+            psser1.corr(psser2, method="std")
+        with self.assertRaisesRegex(TypeError, "Invalid min_periods type"):
+            psser1.corr(psser2, min_periods="3")
+
+        for method in ["pearson", "spearman", "kendall"]:
+            self.assert_eq(
+                psser1.corr(psser2, method=method),
+                pser1.corr(pser2, method=method),
+                almost=True,
+            )
+            self.assert_eq(
+                psser1.corr(psser2, method=method, min_periods=1),
+                pser1.corr(pser2, method=method, min_periods=1),
+                almost=True,
+            )
+            self.assert_eq(
+                psser1.corr(psser2, method=method, min_periods=3),
+                pser1.corr(pser2, method=method, min_periods=3),
+                almost=True,
+            )
+            self.assert_eq(
+                (psser1 + 1).corr(psser2 - 2, method=method, min_periods=2),
+                (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2),
+                almost=True,
+            )
+
+        # different anchors
+        psser1 = ps.from_pandas(pser1)
+        psser2 = ps.from_pandas(pser2)
+
+        with self.assertRaisesRegex(ValueError, "Cannot combine the series or 
dataframe"):
+            psser1.corr(psser2)
+
+        for method in ["pearson", "spearman", "kendall"]:
+            with ps.option_context("compute.ops_on_diff_frames", True):
+                self.assert_eq(
+                    psser1.corr(psser2, method=method),
+                    pser1.corr(pser2, method=method),
+                    almost=True,
+                )
+                self.assert_eq(
+                    psser1.corr(psser2, method=method, min_periods=1),
+                    pser1.corr(pser2, method=method, min_periods=1),
+                    almost=True,
+                )
+                self.assert_eq(
+                    psser1.corr(psser2, method=method, min_periods=3),
+                    pser1.corr(pser2, method=method, min_periods=3),
+                    almost=True,
+                )
+                self.assert_eq(
+                    (psser1 + 1).corr(psser2 - 2, method=method, 
min_periods=2),
+                    (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2),
+                    almost=True,
+                )
+
+    def test_cov_corr_meta(self):
+        # Disable arrow execution since corr() is using UDT internally which 
is not supported.
+        with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
+            pdf = pd.DataFrame(
+                {
+                    "a": np.array([1, 2, 3], dtype="i1"),
+                    "b": np.array([1, 2, 3], dtype="i2"),
+                    "c": np.array([1, 2, 3], dtype="i4"),
+                    "d": np.array([1, 2, 3]),
+                    "e": np.array([1.0, 2.0, 3.0], dtype="f4"),
+                    "f": np.array([1.0, 2.0, 3.0]),
+                    "g": np.array([True, False, True]),
+                    "h": np.array(list("abc")),
+                },
+                index=pd.Index([1, 2, 3], name="myindex"),
+            )
+            psdf = ps.from_pandas(pdf)
+            self.assert_eq(psdf.corr(), pdf.corr(numeric_only=True), 
check_exact=False)
+
+
+class FrameCorrTests(FrameCorrMixin, PandasOnSparkTestCase, SQLTestUtils):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.computation.test_corr import *  # noqa: F401
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", 
verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/computation/test_stats.py
similarity index 53%
rename from python/pyspark/pandas/tests/test_stats.py
rename to python/pyspark/pandas/tests/computation/test_stats.py
index bdc83ad7d5f5..c18c489617c2 100644
--- a/python/pyspark/pandas/tests/test_stats.py
+++ b/python/pyspark/pandas/tests/computation/test_stats.py
@@ -15,13 +15,10 @@
 # limitations under the License.
 #
 
-import unittest
-
 import numpy as np
 import pandas as pd
 
 from pyspark import pandas as ps
-from pyspark.pandas.config import option_context
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, 
SPARK_CONF_ARROW_ENABLED
 from pyspark.testing.sqlutils import SQLTestUtils
 
@@ -160,99 +157,6 @@ class StatsTestsMixin:
         ):
             psdf.D.abs()
 
-    def test_axis_on_dataframe(self):
-        # The number of each count is intentionally big
-        # because when data is small, it executes a shortcut.
-        # Less than 'compute.shortcut_limit' will execute a shortcut
-        # by using collected pandas dataframe directly.
-        # now we set the 'compute.shortcut_limit' as 1000 explicitly
-        with option_context("compute.shortcut_limit", 1000):
-            pdf = pd.DataFrame(
-                {
-                    "A": [1, -2, 3, -4, 5] * 300,
-                    "B": [1.0, -2, 3, -4, 5] * 300,
-                    "C": [-6.0, -7, -8, -9, 10] * 300,
-                    "D": [True, False, True, False, False] * 300,
-                },
-                index=range(10, 15001, 10),
-            )
-            # TODO(SPARK-45228): Update `test_axis_on_dataframe` when Pandas 
regression is fixed
-            # There is a regression in Pandas 2.1.0,
-            # so we should manually cast to float until the regression is 
fixed.
-            # See https://github.com/pandas-dev/pandas/issues/55194.
-            pdf = pdf.astype(float)
-            psdf = ps.from_pandas(pdf)
-            self.assert_eq(psdf.count(axis=1), pdf.count(axis=1))
-            self.assert_eq(psdf.var(axis=1), pdf.var(axis=1))
-            self.assert_eq(psdf.var(axis=1, ddof=0), pdf.var(axis=1, ddof=0))
-            self.assert_eq(psdf.std(axis=1), pdf.std(axis=1))
-            self.assert_eq(psdf.std(axis=1, ddof=0), pdf.std(axis=1, ddof=0))
-            self.assert_eq(psdf.max(axis=1), pdf.max(axis=1))
-            self.assert_eq(psdf.min(axis=1), pdf.min(axis=1))
-            self.assert_eq(psdf.sum(axis=1), pdf.sum(axis=1))
-            self.assert_eq(psdf.product(axis=1), pdf.product(axis=1))
-            self.assert_eq(psdf.kurtosis(axis=0), pdf.kurtosis(axis=0), 
almost=True)
-            self.assert_eq(psdf.kurtosis(axis=1), pdf.kurtosis(axis=1))
-            self.assert_eq(psdf.skew(axis=0), pdf.skew(axis=0), almost=True)
-            self.assert_eq(psdf.skew(axis=1), pdf.skew(axis=1))
-            self.assert_eq(psdf.mean(axis=1), pdf.mean(axis=1))
-            self.assert_eq(psdf.sem(axis=1), pdf.sem(axis=1))
-            self.assert_eq(psdf.sem(axis=1, ddof=0), pdf.sem(axis=1, ddof=0))
-
-            self.assert_eq(
-                psdf.count(axis=1, numeric_only=True), pdf.count(axis=1, 
numeric_only=True)
-            )
-            self.assert_eq(psdf.var(axis=1, numeric_only=True), 
pdf.var(axis=1, numeric_only=True))
-            self.assert_eq(
-                psdf.var(axis=1, ddof=0, numeric_only=True),
-                pdf.var(axis=1, ddof=0, numeric_only=True),
-            )
-            self.assert_eq(psdf.std(axis=1, numeric_only=True), 
pdf.std(axis=1, numeric_only=True))
-            self.assert_eq(
-                psdf.std(axis=1, ddof=0, numeric_only=True),
-                pdf.std(axis=1, ddof=0, numeric_only=True),
-            )
-            self.assert_eq(
-                psdf.max(axis=1, numeric_only=True),
-                pdf.max(axis=1, numeric_only=True).astype(float),
-            )
-            self.assert_eq(
-                psdf.min(axis=1, numeric_only=True),
-                pdf.min(axis=1, numeric_only=True).astype(float),
-            )
-            self.assert_eq(
-                psdf.sum(axis=1, numeric_only=True),
-                pdf.sum(axis=1, numeric_only=True).astype(float),
-            )
-            self.assert_eq(
-                psdf.product(axis=1, numeric_only=True),
-                pdf.product(axis=1, numeric_only=True).astype(float),
-            )
-            self.assert_eq(
-                psdf.kurtosis(axis=0, numeric_only=True),
-                pdf.kurtosis(axis=0, numeric_only=True),
-                almost=True,
-            )
-            self.assert_eq(
-                psdf.kurtosis(axis=1, numeric_only=True), pdf.kurtosis(axis=1, 
numeric_only=True)
-            )
-            self.assert_eq(
-                psdf.skew(axis=0, numeric_only=True),
-                pdf.skew(axis=0, numeric_only=True),
-                almost=True,
-            )
-            self.assert_eq(
-                psdf.skew(axis=1, numeric_only=True), pdf.skew(axis=1, 
numeric_only=True)
-            )
-            self.assert_eq(
-                psdf.mean(axis=1, numeric_only=True), pdf.mean(axis=1, 
numeric_only=True)
-            )
-            self.assert_eq(psdf.sem(axis=1, numeric_only=True), 
pdf.sem(axis=1, numeric_only=True))
-            self.assert_eq(
-                psdf.sem(axis=1, ddof=0, numeric_only=True),
-                pdf.sem(axis=1, ddof=0, numeric_only=True),
-            )
-
     def test_skew_kurt_numerical_stability(self):
         pdf = pd.DataFrame(
             {
@@ -268,186 +172,6 @@ class StatsTestsMixin:
         self.assert_eq(psdf.skew(), pdf.skew(), almost=True)
         self.assert_eq(psdf.kurt(), pdf.kurt(), almost=True)
 
-    def test_dataframe_corr(self):
-        pdf = pd.DataFrame(
-            index=[
-                "".join(
-                    np.random.choice(
-                        
list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10
-                    )
-                )
-                for _ in range(30)
-            ],
-            columns=list("ABCD"),
-            dtype="float64",
-        )
-        psdf = ps.from_pandas(pdf)
-
-        with self.assertRaisesRegex(ValueError, "Invalid method"):
-            psdf.corr("std")
-        with self.assertRaisesRegex(TypeError, "Invalid min_periods type"):
-            psdf.corr(min_periods="3")
-
-        for method in ["pearson", "spearman", "kendall"]:
-            self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), 
check_exact=False)
-            self.assert_eq(
-                psdf.corr(method=method, min_periods=1),
-                pdf.corr(method=method, min_periods=1),
-                check_exact=False,
-            )
-            self.assert_eq(
-                psdf.corr(method=method, min_periods=3),
-                pdf.corr(method=method, min_periods=3),
-                check_exact=False,
-            )
-            self.assert_eq(
-                (psdf + 1).corr(method=method, min_periods=2),
-                (pdf + 1).corr(method=method, min_periods=2),
-                check_exact=False,
-            )
-
-        # multi-index columns
-        columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", 
"C"), ("Z", "D")])
-        pdf.columns = columns
-        psdf.columns = columns
-
-        for method in ["pearson", "spearman", "kendall"]:
-            self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), 
check_exact=False)
-            self.assert_eq(
-                psdf.corr(method=method, min_periods=1),
-                pdf.corr(method=method, min_periods=1),
-                check_exact=False,
-            )
-            self.assert_eq(
-                psdf.corr(method=method, min_periods=3),
-                pdf.corr(method=method, min_periods=3),
-                check_exact=False,
-            )
-            self.assert_eq(
-                (psdf + 1).corr(method=method, min_periods=2),
-                (pdf + 1).corr(method=method, min_periods=2),
-                check_exact=False,
-            )
-
-        # test with identical values
-        pdf = pd.DataFrame(
-            {
-                "a": [0, 1, 1, 1, 0],
-                "b": [2, 2, -1, 1, np.nan],
-                "c": [3, 3, 3, 3, 3],
-                "d": [np.nan, np.nan, np.nan, np.nan, np.nan],
-            }
-        )
-        psdf = ps.from_pandas(pdf)
-
-        for method in ["pearson", "spearman", "kendall"]:
-            self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), 
check_exact=False)
-            self.assert_eq(
-                psdf.corr(method=method, min_periods=1),
-                pdf.corr(method=method, min_periods=1),
-                check_exact=False,
-            )
-            self.assert_eq(
-                psdf.corr(method=method, min_periods=3),
-                pdf.corr(method=method, min_periods=3),
-                check_exact=False,
-            )
-
-    def test_series_corr(self):
-        pdf = pd.DataFrame(
-            index=[
-                "".join(
-                    np.random.choice(
-                        
list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10
-                    )
-                )
-                for _ in range(30)
-            ],
-            columns=list("ABCD"),
-            dtype="float64",
-        )
-        pser1 = pdf.A
-        pser2 = pdf.B
-        psdf = ps.from_pandas(pdf)
-        psser1 = psdf.A
-        psser2 = psdf.B
-
-        with self.assertRaisesRegex(ValueError, "Invalid method"):
-            psser1.corr(psser2, method="std")
-        with self.assertRaisesRegex(TypeError, "Invalid min_periods type"):
-            psser1.corr(psser2, min_periods="3")
-
-        for method in ["pearson", "spearman", "kendall"]:
-            self.assert_eq(
-                psser1.corr(psser2, method=method),
-                pser1.corr(pser2, method=method),
-                almost=True,
-            )
-            self.assert_eq(
-                psser1.corr(psser2, method=method, min_periods=1),
-                pser1.corr(pser2, method=method, min_periods=1),
-                almost=True,
-            )
-            self.assert_eq(
-                psser1.corr(psser2, method=method, min_periods=3),
-                pser1.corr(pser2, method=method, min_periods=3),
-                almost=True,
-            )
-            self.assert_eq(
-                (psser1 + 1).corr(psser2 - 2, method=method, min_periods=2),
-                (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2),
-                almost=True,
-            )
-
-        # different anchors
-        psser1 = ps.from_pandas(pser1)
-        psser2 = ps.from_pandas(pser2)
-
-        with self.assertRaisesRegex(ValueError, "Cannot combine the series or 
dataframe"):
-            psser1.corr(psser2)
-
-        for method in ["pearson", "spearman", "kendall"]:
-            with ps.option_context("compute.ops_on_diff_frames", True):
-                self.assert_eq(
-                    psser1.corr(psser2, method=method),
-                    pser1.corr(pser2, method=method),
-                    almost=True,
-                )
-                self.assert_eq(
-                    psser1.corr(psser2, method=method, min_periods=1),
-                    pser1.corr(pser2, method=method, min_periods=1),
-                    almost=True,
-                )
-                self.assert_eq(
-                    psser1.corr(psser2, method=method, min_periods=3),
-                    pser1.corr(pser2, method=method, min_periods=3),
-                    almost=True,
-                )
-                self.assert_eq(
-                    (psser1 + 1).corr(psser2 - 2, method=method, 
min_periods=2),
-                    (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2),
-                    almost=True,
-                )
-
-    def test_cov_corr_meta(self):
-        # Disable arrow execution since corr() is using UDT internally which 
is not supported.
-        with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
-            pdf = pd.DataFrame(
-                {
-                    "a": np.array([1, 2, 3], dtype="i1"),
-                    "b": np.array([1, 2, 3], dtype="i2"),
-                    "c": np.array([1, 2, 3], dtype="i4"),
-                    "d": np.array([1, 2, 3]),
-                    "e": np.array([1.0, 2.0, 3.0], dtype="f4"),
-                    "f": np.array([1.0, 2.0, 3.0]),
-                    "g": np.array([True, False, True]),
-                    "h": np.array(list("abc")),
-                },
-                index=pd.Index([1, 2, 3], name="myindex"),
-            )
-            psdf = ps.from_pandas(pdf)
-            self.assert_eq(psdf.corr(), pdf.corr(numeric_only=True), 
check_exact=False)
-
     def test_stats_on_boolean_dataframe(self):
         pdf = pd.DataFrame({"A": [True, False, True], "B": [False, False, 
True]})
         psdf = ps.from_pandas(pdf)
@@ -588,7 +312,7 @@ class StatsTests(StatsTestsMixin, PandasOnSparkTestCase, SQLTestUtils):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.pandas.tests.test_stats import *  # noqa: F401
+    from pyspark.pandas.tests.computation.test_stats import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/pandas/tests/connect/test_parity_stats.py b/python/pyspark/pandas/tests/connect/computation/test_parity_corr.py
similarity index 81%
copy from python/pyspark/pandas/tests/connect/test_parity_stats.py
copy to python/pyspark/pandas/tests/connect/computation/test_parity_corr.py
index 7eddc4c15d4f..acf36b07829a 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_stats.py
+++ b/python/pyspark/pandas/tests/connect/computation/test_parity_corr.py
@@ -16,17 +16,18 @@
 #
 import unittest
 
-from pyspark.pandas.tests.test_stats import StatsTestsMixin
+from pyspark import pandas as ps
+from pyspark.pandas.tests.computation.test_corr import FrameCorrMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
 from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class StatsParityTests(StatsTestsMixin, PandasOnSparkTestUtils, 
ReusedConnectTestCase):
+class FrameParityCorrTests(FrameCorrMixin, PandasOnSparkTestUtils, 
ReusedConnectTestCase):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.test_parity_stats import *  # noqa: F401
+    from pyspark.pandas.tests.connect.computation.test_parity_corr import *  # 
noqa: F401
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_stats.py b/python/pyspark/pandas/tests/connect/computation/test_parity_stats.py
similarity index 88%
copy from python/pyspark/pandas/tests/connect/test_parity_stats.py
copy to python/pyspark/pandas/tests/connect/computation/test_parity_stats.py
index 7eddc4c15d4f..14d37949590b 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_stats.py
+++ b/python/pyspark/pandas/tests/connect/computation/test_parity_stats.py
@@ -16,7 +16,7 @@
 #
 import unittest
 
-from pyspark.pandas.tests.test_stats import StatsTestsMixin
+from pyspark.pandas.tests.computation.test_stats import StatsTestsMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
 from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
@@ -26,7 +26,7 @@ class StatsParityTests(StatsTestsMixin, PandasOnSparkTestUtils, ReusedConnectTes
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.test_parity_stats import *  # noqa: F401
+    from pyspark.pandas.tests.connect.computation.test_parity_stats import *  
# noqa: F401
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_stats.py b/python/pyspark/pandas/tests/connect/frame/test_parity_axis.py
similarity index 83%
rename from python/pyspark/pandas/tests/connect/test_parity_stats.py
rename to python/pyspark/pandas/tests/connect/frame/test_parity_axis.py
index 7eddc4c15d4f..804ed97fa31c 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_stats.py
+++ b/python/pyspark/pandas/tests/connect/frame/test_parity_axis.py
@@ -16,17 +16,17 @@
 #
 import unittest
 
-from pyspark.pandas.tests.test_stats import StatsTestsMixin
+from pyspark.pandas.tests.frame.test_axis import FrameAxisMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
 from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class StatsParityTests(StatsTestsMixin, PandasOnSparkTestUtils, 
ReusedConnectTestCase):
+class FrameParityAxisTests(FrameAxisMixin, PandasOnSparkTestUtils, 
ReusedConnectTestCase):
     pass
 
 
 if __name__ == "__main__":
-    from pyspark.pandas.tests.connect.test_parity_stats import *  # noqa: F401
+    from pyspark.pandas.tests.connect.frame.test_parity_axis import *  # noqa: 
F401
 
     try:
         import xmlrunner  # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/frame/test_axis.py b/python/pyspark/pandas/tests/frame/test_axis.py
new file mode 100644
index 000000000000..ee67cf1b55ed
--- /dev/null
+++ b/python/pyspark/pandas/tests/frame/test_axis.py
@@ -0,0 +1,135 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import ComparisonTestBase
+from pyspark.testing.sqlutils import SQLTestUtils
+
+
+class FrameAxisMixin:
+    def test_axis_on_dataframe(self):
+        # The number of each count is intentionally big
+        # because when data is small, it executes a shortcut.
+        # Less than 'compute.shortcut_limit' will execute a shortcut
+        # by using collected pandas dataframe directly.
+        # now we set the 'compute.shortcut_limit' as 1000 explicitly
+        with ps.option_context("compute.shortcut_limit", 1000):
+            pdf = pd.DataFrame(
+                {
+                    "A": [1, -2, 3, -4, 5] * 300,
+                    "B": [1.0, -2, 3, -4, 5] * 300,
+                    "C": [-6.0, -7, -8, -9, 10] * 300,
+                    "D": [True, False, True, False, False] * 300,
+                },
+                index=range(10, 15001, 10),
+            )
+            # TODO(SPARK-45228): Update `test_axis_on_dataframe` when Pandas regression is fixed
+            # There is a regression in Pandas 2.1.0,
+            # so we should manually cast to float until the regression is fixed.
+            # See https://github.com/pandas-dev/pandas/issues/55194.
+            pdf = pdf.astype(float)
+            psdf = ps.from_pandas(pdf)
+            self.assert_eq(psdf.count(axis=1), pdf.count(axis=1))
+            self.assert_eq(psdf.var(axis=1), pdf.var(axis=1))
+            self.assert_eq(psdf.var(axis=1, ddof=0), pdf.var(axis=1, ddof=0))
+            self.assert_eq(psdf.std(axis=1), pdf.std(axis=1))
+            self.assert_eq(psdf.std(axis=1, ddof=0), pdf.std(axis=1, ddof=0))
+            self.assert_eq(psdf.max(axis=1), pdf.max(axis=1))
+            self.assert_eq(psdf.min(axis=1), pdf.min(axis=1))
+            self.assert_eq(psdf.sum(axis=1), pdf.sum(axis=1))
+            self.assert_eq(psdf.product(axis=1), pdf.product(axis=1))
+            self.assert_eq(psdf.kurtosis(axis=0), pdf.kurtosis(axis=0), almost=True)
+            self.assert_eq(psdf.kurtosis(axis=1), pdf.kurtosis(axis=1))
+            self.assert_eq(psdf.skew(axis=0), pdf.skew(axis=0), almost=True)
+            self.assert_eq(psdf.skew(axis=1), pdf.skew(axis=1))
+            self.assert_eq(psdf.mean(axis=1), pdf.mean(axis=1))
+            self.assert_eq(psdf.sem(axis=1), pdf.sem(axis=1))
+            self.assert_eq(psdf.sem(axis=1, ddof=0), pdf.sem(axis=1, ddof=0))
+
+            self.assert_eq(
+                psdf.count(axis=1, numeric_only=True), pdf.count(axis=1, numeric_only=True)
+            )
+            self.assert_eq(psdf.var(axis=1, numeric_only=True), pdf.var(axis=1, numeric_only=True))
+            self.assert_eq(
+                psdf.var(axis=1, ddof=0, numeric_only=True),
+                pdf.var(axis=1, ddof=0, numeric_only=True),
+            )
+            self.assert_eq(psdf.std(axis=1, numeric_only=True), pdf.std(axis=1, numeric_only=True))
+            self.assert_eq(
+                psdf.std(axis=1, ddof=0, numeric_only=True),
+                pdf.std(axis=1, ddof=0, numeric_only=True),
+            )
+            self.assert_eq(
+                psdf.max(axis=1, numeric_only=True),
+                pdf.max(axis=1, numeric_only=True).astype(float),
+            )
+            self.assert_eq(
+                psdf.min(axis=1, numeric_only=True),
+                pdf.min(axis=1, numeric_only=True).astype(float),
+            )
+            self.assert_eq(
+                psdf.sum(axis=1, numeric_only=True),
+                pdf.sum(axis=1, numeric_only=True).astype(float),
+            )
+            self.assert_eq(
+                psdf.product(axis=1, numeric_only=True),
+                pdf.product(axis=1, numeric_only=True).astype(float),
+            )
+            self.assert_eq(
+                psdf.kurtosis(axis=0, numeric_only=True),
+                pdf.kurtosis(axis=0, numeric_only=True),
+                almost=True,
+            )
+            self.assert_eq(
+                psdf.kurtosis(axis=1, numeric_only=True), pdf.kurtosis(axis=1, numeric_only=True)
+            )
+            self.assert_eq(
+                psdf.skew(axis=0, numeric_only=True),
+                pdf.skew(axis=0, numeric_only=True),
+                almost=True,
+            )
+            self.assert_eq(
+                psdf.skew(axis=1, numeric_only=True), pdf.skew(axis=1, numeric_only=True)
+            )
+            self.assert_eq(
+                psdf.mean(axis=1, numeric_only=True), pdf.mean(axis=1, numeric_only=True)
+            )
+            self.assert_eq(psdf.sem(axis=1, numeric_only=True), pdf.sem(axis=1, numeric_only=True))
+            self.assert_eq(
+                psdf.sem(axis=1, ddof=0, numeric_only=True),
+                pdf.sem(axis=1, ddof=0, numeric_only=True),
+            )
+
+
+class FrameAxisTests(FrameAxisMixin, ComparisonTestBase, SQLTestUtils):
+    pass
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.frame.test_axis import *  # noqa: F401
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-46268][PS][CONNECT][TESTS] Re-organize `StatsTests`

Reply via email to