This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new ef05fb632424 [SPARK-46268][PS][CONNECT][TESTS] Re-organize `StatsTests`
ef05fb632424 is described below
commit ef05fb632424f8f121bcd4518ab3a8815c295c85
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Dec 5 10:19:41 2023 -0800
[SPARK-46268][PS][CONNECT][TESTS] Re-organize `StatsTests`
### What changes were proposed in this pull request?
Re-organize `StatsTests`
### Why are the changes needed?
Break up the large test file by grouping its test cases by topic.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
CI
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44185 from zhengruifeng/ps_reorg_test_stats.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
dev/sparktestsupport/modules.py | 8 +-
.../pyspark/pandas/tests/computation/test_corr.py | 222 ++++++++++++++++
.../pandas/tests/{ => computation}/test_stats.py | 278 +--------------------
.../test_parity_corr.py} | 7 +-
.../connect/{ => computation}/test_parity_stats.py | 4 +-
.../test_parity_axis.py} | 6 +-
python/pyspark/pandas/tests/frame/test_axis.py | 135 ++++++++++
7 files changed, 373 insertions(+), 287 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 9bbe86baa1dc..900329d07c00 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -789,6 +789,7 @@ pyspark_pandas_slow = Module(
"pyspark.pandas.tests.computation.test_binary_ops",
"pyspark.pandas.tests.computation.test_combine",
"pyspark.pandas.tests.computation.test_compute",
+ "pyspark.pandas.tests.computation.test_corr",
"pyspark.pandas.tests.computation.test_corrwith",
"pyspark.pandas.tests.computation.test_cov",
"pyspark.pandas.tests.computation.test_cumulative",
@@ -797,7 +798,9 @@ pyspark_pandas_slow = Module(
"pyspark.pandas.tests.computation.test_melt",
"pyspark.pandas.tests.computation.test_missing_data",
"pyspark.pandas.tests.computation.test_pivot",
+ "pyspark.pandas.tests.computation.test_stats",
"pyspark.pandas.tests.frame.test_attrs",
+ "pyspark.pandas.tests.frame.test_axis",
"pyspark.pandas.tests.frame.test_constructor",
"pyspark.pandas.tests.frame.test_conversion",
"pyspark.pandas.tests.frame.test_reindexing",
@@ -841,7 +844,6 @@ pyspark_pandas_slow = Module(
"pyspark.pandas.tests.series.test_series",
"pyspark.pandas.tests.series.test_sort",
"pyspark.pandas.tests.series.test_stat",
- "pyspark.pandas.tests.test_stats",
],
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy,
pandas, and pyarrow and
@@ -1014,6 +1016,7 @@ pyspark_pandas_connect_part0 = Module(
"pyspark.pandas.tests.connect.computation.test_parity_combine",
"pyspark.pandas.tests.connect.computation.test_parity_compute",
"pyspark.pandas.tests.connect.computation.test_parity_cov",
+ "pyspark.pandas.tests.connect.computation.test_parity_corr",
"pyspark.pandas.tests.connect.computation.test_parity_corrwith",
"pyspark.pandas.tests.connect.computation.test_parity_cumulative",
"pyspark.pandas.tests.connect.computation.test_parity_describe",
@@ -1021,6 +1024,7 @@ pyspark_pandas_connect_part0 = Module(
"pyspark.pandas.tests.connect.computation.test_parity_melt",
"pyspark.pandas.tests.connect.groupby.test_parity_stat",
"pyspark.pandas.tests.connect.frame.test_parity_attrs",
+ "pyspark.pandas.tests.connect.frame.test_parity_axis",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_frame",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_series",
],
@@ -1075,7 +1079,6 @@ pyspark_pandas_connect_part1 = Module(
"pyspark.pandas.tests.connect.series.test_parity_stat",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic",
"pyspark.pandas.tests.connect.test_parity_reshape",
- "pyspark.pandas.tests.connect.test_parity_stats",
"pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_expanding",
],
excluded_python_implementations=[
@@ -1094,6 +1097,7 @@ pyspark_pandas_connect_part2 = Module(
python_test_goals=[
# pandas-on-Spark unittests
"pyspark.pandas.tests.connect.computation.test_parity_pivot",
+ "pyspark.pandas.tests.connect.computation.test_parity_stats",
"pyspark.pandas.tests.connect.indexes.test_parity_base_slow",
"pyspark.pandas.tests.connect.indexes.test_parity_datetime_property",
"pyspark.pandas.tests.connect.test_parity_frame_interpolate",
diff --git a/python/pyspark/pandas/tests/computation/test_corr.py
b/python/pyspark/pandas/tests/computation/test_corr.py
new file mode 100644
index 000000000000..a7b06aa2928a
--- /dev/null
+++ b/python/pyspark/pandas/tests/computation/test_corr.py
@@ -0,0 +1,222 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase,
SPARK_CONF_ARROW_ENABLED
+from pyspark.testing.sqlutils import SQLTestUtils
+
+
+class FrameCorrMixin:
+ def test_dataframe_corr(self):
+ pdf = pd.DataFrame(
+ index=[
+ "".join(
+ np.random.choice(
+
list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10
+ )
+ )
+ for _ in range(30)
+ ],
+ columns=list("ABCD"),
+ dtype="float64",
+ )
+ psdf = ps.from_pandas(pdf)
+
+ with self.assertRaisesRegex(ValueError, "Invalid method"):
+ psdf.corr("std")
+ with self.assertRaisesRegex(TypeError, "Invalid min_periods type"):
+ psdf.corr(min_periods="3")
+
+ for method in ["pearson", "spearman", "kendall"]:
+ self.assert_eq(psdf.corr(method=method), pdf.corr(method=method),
check_exact=False)
+ self.assert_eq(
+ psdf.corr(method=method, min_periods=1),
+ pdf.corr(method=method, min_periods=1),
+ check_exact=False,
+ )
+ self.assert_eq(
+ psdf.corr(method=method, min_periods=3),
+ pdf.corr(method=method, min_periods=3),
+ check_exact=False,
+ )
+ self.assert_eq(
+ (psdf + 1).corr(method=method, min_periods=2),
+ (pdf + 1).corr(method=method, min_periods=2),
+ check_exact=False,
+ )
+
+ # multi-index columns
+ columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y",
"C"), ("Z", "D")])
+ pdf.columns = columns
+ psdf.columns = columns
+
+ for method in ["pearson", "spearman", "kendall"]:
+ self.assert_eq(psdf.corr(method=method), pdf.corr(method=method),
check_exact=False)
+ self.assert_eq(
+ psdf.corr(method=method, min_periods=1),
+ pdf.corr(method=method, min_periods=1),
+ check_exact=False,
+ )
+ self.assert_eq(
+ psdf.corr(method=method, min_periods=3),
+ pdf.corr(method=method, min_periods=3),
+ check_exact=False,
+ )
+ self.assert_eq(
+ (psdf + 1).corr(method=method, min_periods=2),
+ (pdf + 1).corr(method=method, min_periods=2),
+ check_exact=False,
+ )
+
+ # test with identical values
+ pdf = pd.DataFrame(
+ {
+ "a": [0, 1, 1, 1, 0],
+ "b": [2, 2, -1, 1, np.nan],
+ "c": [3, 3, 3, 3, 3],
+ "d": [np.nan, np.nan, np.nan, np.nan, np.nan],
+ }
+ )
+ psdf = ps.from_pandas(pdf)
+
+ for method in ["pearson", "spearman", "kendall"]:
+ self.assert_eq(psdf.corr(method=method), pdf.corr(method=method),
check_exact=False)
+ self.assert_eq(
+ psdf.corr(method=method, min_periods=1),
+ pdf.corr(method=method, min_periods=1),
+ check_exact=False,
+ )
+ self.assert_eq(
+ psdf.corr(method=method, min_periods=3),
+ pdf.corr(method=method, min_periods=3),
+ check_exact=False,
+ )
+
+ def test_series_corr(self):
+ pdf = pd.DataFrame(
+ index=[
+ "".join(
+ np.random.choice(
+
list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10
+ )
+ )
+ for _ in range(30)
+ ],
+ columns=list("ABCD"),
+ dtype="float64",
+ )
+ pser1 = pdf.A
+ pser2 = pdf.B
+ psdf = ps.from_pandas(pdf)
+ psser1 = psdf.A
+ psser2 = psdf.B
+
+ with self.assertRaisesRegex(ValueError, "Invalid method"):
+ psser1.corr(psser2, method="std")
+ with self.assertRaisesRegex(TypeError, "Invalid min_periods type"):
+ psser1.corr(psser2, min_periods="3")
+
+ for method in ["pearson", "spearman", "kendall"]:
+ self.assert_eq(
+ psser1.corr(psser2, method=method),
+ pser1.corr(pser2, method=method),
+ almost=True,
+ )
+ self.assert_eq(
+ psser1.corr(psser2, method=method, min_periods=1),
+ pser1.corr(pser2, method=method, min_periods=1),
+ almost=True,
+ )
+ self.assert_eq(
+ psser1.corr(psser2, method=method, min_periods=3),
+ pser1.corr(pser2, method=method, min_periods=3),
+ almost=True,
+ )
+ self.assert_eq(
+ (psser1 + 1).corr(psser2 - 2, method=method, min_periods=2),
+ (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2),
+ almost=True,
+ )
+
+ # different anchors
+ psser1 = ps.from_pandas(pser1)
+ psser2 = ps.from_pandas(pser2)
+
+ with self.assertRaisesRegex(ValueError, "Cannot combine the series or
dataframe"):
+ psser1.corr(psser2)
+
+ for method in ["pearson", "spearman", "kendall"]:
+ with ps.option_context("compute.ops_on_diff_frames", True):
+ self.assert_eq(
+ psser1.corr(psser2, method=method),
+ pser1.corr(pser2, method=method),
+ almost=True,
+ )
+ self.assert_eq(
+ psser1.corr(psser2, method=method, min_periods=1),
+ pser1.corr(pser2, method=method, min_periods=1),
+ almost=True,
+ )
+ self.assert_eq(
+ psser1.corr(psser2, method=method, min_periods=3),
+ pser1.corr(pser2, method=method, min_periods=3),
+ almost=True,
+ )
+ self.assert_eq(
+ (psser1 + 1).corr(psser2 - 2, method=method,
min_periods=2),
+ (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2),
+ almost=True,
+ )
+
+ def test_cov_corr_meta(self):
+ # Disable arrow execution since corr() is using UDT internally which
is not supported.
+ with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
+ pdf = pd.DataFrame(
+ {
+ "a": np.array([1, 2, 3], dtype="i1"),
+ "b": np.array([1, 2, 3], dtype="i2"),
+ "c": np.array([1, 2, 3], dtype="i4"),
+ "d": np.array([1, 2, 3]),
+ "e": np.array([1.0, 2.0, 3.0], dtype="f4"),
+ "f": np.array([1.0, 2.0, 3.0]),
+ "g": np.array([True, False, True]),
+ "h": np.array(list("abc")),
+ },
+ index=pd.Index([1, 2, 3], name="myindex"),
+ )
+ psdf = ps.from_pandas(pdf)
+ self.assert_eq(psdf.corr(), pdf.corr(numeric_only=True),
check_exact=False)
+
+
+class FrameCorrTests(FrameCorrMixin, PandasOnSparkTestCase, SQLTestUtils):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.computation.test_corr import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/test_stats.py
b/python/pyspark/pandas/tests/computation/test_stats.py
similarity index 53%
rename from python/pyspark/pandas/tests/test_stats.py
rename to python/pyspark/pandas/tests/computation/test_stats.py
index bdc83ad7d5f5..c18c489617c2 100644
--- a/python/pyspark/pandas/tests/test_stats.py
+++ b/python/pyspark/pandas/tests/computation/test_stats.py
@@ -15,13 +15,10 @@
# limitations under the License.
#
-import unittest
-
import numpy as np
import pandas as pd
from pyspark import pandas as ps
-from pyspark.pandas.config import option_context
from pyspark.testing.pandasutils import PandasOnSparkTestCase,
SPARK_CONF_ARROW_ENABLED
from pyspark.testing.sqlutils import SQLTestUtils
@@ -160,99 +157,6 @@ class StatsTestsMixin:
):
psdf.D.abs()
- def test_axis_on_dataframe(self):
- # The number of each count is intentionally big
- # because when data is small, it executes a shortcut.
- # Less than 'compute.shortcut_limit' will execute a shortcut
- # by using collected pandas dataframe directly.
- # now we set the 'compute.shortcut_limit' as 1000 explicitly
- with option_context("compute.shortcut_limit", 1000):
- pdf = pd.DataFrame(
- {
- "A": [1, -2, 3, -4, 5] * 300,
- "B": [1.0, -2, 3, -4, 5] * 300,
- "C": [-6.0, -7, -8, -9, 10] * 300,
- "D": [True, False, True, False, False] * 300,
- },
- index=range(10, 15001, 10),
- )
- # TODO(SPARK-45228): Update `test_axis_on_dataframe` when Pandas
regression is fixed
- # There is a regression in Pandas 2.1.0,
- # so we should manually cast to float until the regression is
fixed.
- # See https://github.com/pandas-dev/pandas/issues/55194.
- pdf = pdf.astype(float)
- psdf = ps.from_pandas(pdf)
- self.assert_eq(psdf.count(axis=1), pdf.count(axis=1))
- self.assert_eq(psdf.var(axis=1), pdf.var(axis=1))
- self.assert_eq(psdf.var(axis=1, ddof=0), pdf.var(axis=1, ddof=0))
- self.assert_eq(psdf.std(axis=1), pdf.std(axis=1))
- self.assert_eq(psdf.std(axis=1, ddof=0), pdf.std(axis=1, ddof=0))
- self.assert_eq(psdf.max(axis=1), pdf.max(axis=1))
- self.assert_eq(psdf.min(axis=1), pdf.min(axis=1))
- self.assert_eq(psdf.sum(axis=1), pdf.sum(axis=1))
- self.assert_eq(psdf.product(axis=1), pdf.product(axis=1))
- self.assert_eq(psdf.kurtosis(axis=0), pdf.kurtosis(axis=0),
almost=True)
- self.assert_eq(psdf.kurtosis(axis=1), pdf.kurtosis(axis=1))
- self.assert_eq(psdf.skew(axis=0), pdf.skew(axis=0), almost=True)
- self.assert_eq(psdf.skew(axis=1), pdf.skew(axis=1))
- self.assert_eq(psdf.mean(axis=1), pdf.mean(axis=1))
- self.assert_eq(psdf.sem(axis=1), pdf.sem(axis=1))
- self.assert_eq(psdf.sem(axis=1, ddof=0), pdf.sem(axis=1, ddof=0))
-
- self.assert_eq(
- psdf.count(axis=1, numeric_only=True), pdf.count(axis=1,
numeric_only=True)
- )
- self.assert_eq(psdf.var(axis=1, numeric_only=True),
pdf.var(axis=1, numeric_only=True))
- self.assert_eq(
- psdf.var(axis=1, ddof=0, numeric_only=True),
- pdf.var(axis=1, ddof=0, numeric_only=True),
- )
- self.assert_eq(psdf.std(axis=1, numeric_only=True),
pdf.std(axis=1, numeric_only=True))
- self.assert_eq(
- psdf.std(axis=1, ddof=0, numeric_only=True),
- pdf.std(axis=1, ddof=0, numeric_only=True),
- )
- self.assert_eq(
- psdf.max(axis=1, numeric_only=True),
- pdf.max(axis=1, numeric_only=True).astype(float),
- )
- self.assert_eq(
- psdf.min(axis=1, numeric_only=True),
- pdf.min(axis=1, numeric_only=True).astype(float),
- )
- self.assert_eq(
- psdf.sum(axis=1, numeric_only=True),
- pdf.sum(axis=1, numeric_only=True).astype(float),
- )
- self.assert_eq(
- psdf.product(axis=1, numeric_only=True),
- pdf.product(axis=1, numeric_only=True).astype(float),
- )
- self.assert_eq(
- psdf.kurtosis(axis=0, numeric_only=True),
- pdf.kurtosis(axis=0, numeric_only=True),
- almost=True,
- )
- self.assert_eq(
- psdf.kurtosis(axis=1, numeric_only=True), pdf.kurtosis(axis=1,
numeric_only=True)
- )
- self.assert_eq(
- psdf.skew(axis=0, numeric_only=True),
- pdf.skew(axis=0, numeric_only=True),
- almost=True,
- )
- self.assert_eq(
- psdf.skew(axis=1, numeric_only=True), pdf.skew(axis=1,
numeric_only=True)
- )
- self.assert_eq(
- psdf.mean(axis=1, numeric_only=True), pdf.mean(axis=1,
numeric_only=True)
- )
- self.assert_eq(psdf.sem(axis=1, numeric_only=True),
pdf.sem(axis=1, numeric_only=True))
- self.assert_eq(
- psdf.sem(axis=1, ddof=0, numeric_only=True),
- pdf.sem(axis=1, ddof=0, numeric_only=True),
- )
-
def test_skew_kurt_numerical_stability(self):
pdf = pd.DataFrame(
{
@@ -268,186 +172,6 @@ class StatsTestsMixin:
self.assert_eq(psdf.skew(), pdf.skew(), almost=True)
self.assert_eq(psdf.kurt(), pdf.kurt(), almost=True)
- def test_dataframe_corr(self):
- pdf = pd.DataFrame(
- index=[
- "".join(
- np.random.choice(
-
list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10
- )
- )
- for _ in range(30)
- ],
- columns=list("ABCD"),
- dtype="float64",
- )
- psdf = ps.from_pandas(pdf)
-
- with self.assertRaisesRegex(ValueError, "Invalid method"):
- psdf.corr("std")
- with self.assertRaisesRegex(TypeError, "Invalid min_periods type"):
- psdf.corr(min_periods="3")
-
- for method in ["pearson", "spearman", "kendall"]:
- self.assert_eq(psdf.corr(method=method), pdf.corr(method=method),
check_exact=False)
- self.assert_eq(
- psdf.corr(method=method, min_periods=1),
- pdf.corr(method=method, min_periods=1),
- check_exact=False,
- )
- self.assert_eq(
- psdf.corr(method=method, min_periods=3),
- pdf.corr(method=method, min_periods=3),
- check_exact=False,
- )
- self.assert_eq(
- (psdf + 1).corr(method=method, min_periods=2),
- (pdf + 1).corr(method=method, min_periods=2),
- check_exact=False,
- )
-
- # multi-index columns
- columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y",
"C"), ("Z", "D")])
- pdf.columns = columns
- psdf.columns = columns
-
- for method in ["pearson", "spearman", "kendall"]:
- self.assert_eq(psdf.corr(method=method), pdf.corr(method=method),
check_exact=False)
- self.assert_eq(
- psdf.corr(method=method, min_periods=1),
- pdf.corr(method=method, min_periods=1),
- check_exact=False,
- )
- self.assert_eq(
- psdf.corr(method=method, min_periods=3),
- pdf.corr(method=method, min_periods=3),
- check_exact=False,
- )
- self.assert_eq(
- (psdf + 1).corr(method=method, min_periods=2),
- (pdf + 1).corr(method=method, min_periods=2),
- check_exact=False,
- )
-
- # test with identical values
- pdf = pd.DataFrame(
- {
- "a": [0, 1, 1, 1, 0],
- "b": [2, 2, -1, 1, np.nan],
- "c": [3, 3, 3, 3, 3],
- "d": [np.nan, np.nan, np.nan, np.nan, np.nan],
- }
- )
- psdf = ps.from_pandas(pdf)
-
- for method in ["pearson", "spearman", "kendall"]:
- self.assert_eq(psdf.corr(method=method), pdf.corr(method=method),
check_exact=False)
- self.assert_eq(
- psdf.corr(method=method, min_periods=1),
- pdf.corr(method=method, min_periods=1),
- check_exact=False,
- )
- self.assert_eq(
- psdf.corr(method=method, min_periods=3),
- pdf.corr(method=method, min_periods=3),
- check_exact=False,
- )
-
- def test_series_corr(self):
- pdf = pd.DataFrame(
- index=[
- "".join(
- np.random.choice(
-
list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10
- )
- )
- for _ in range(30)
- ],
- columns=list("ABCD"),
- dtype="float64",
- )
- pser1 = pdf.A
- pser2 = pdf.B
- psdf = ps.from_pandas(pdf)
- psser1 = psdf.A
- psser2 = psdf.B
-
- with self.assertRaisesRegex(ValueError, "Invalid method"):
- psser1.corr(psser2, method="std")
- with self.assertRaisesRegex(TypeError, "Invalid min_periods type"):
- psser1.corr(psser2, min_periods="3")
-
- for method in ["pearson", "spearman", "kendall"]:
- self.assert_eq(
- psser1.corr(psser2, method=method),
- pser1.corr(pser2, method=method),
- almost=True,
- )
- self.assert_eq(
- psser1.corr(psser2, method=method, min_periods=1),
- pser1.corr(pser2, method=method, min_periods=1),
- almost=True,
- )
- self.assert_eq(
- psser1.corr(psser2, method=method, min_periods=3),
- pser1.corr(pser2, method=method, min_periods=3),
- almost=True,
- )
- self.assert_eq(
- (psser1 + 1).corr(psser2 - 2, method=method, min_periods=2),
- (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2),
- almost=True,
- )
-
- # different anchors
- psser1 = ps.from_pandas(pser1)
- psser2 = ps.from_pandas(pser2)
-
- with self.assertRaisesRegex(ValueError, "Cannot combine the series or
dataframe"):
- psser1.corr(psser2)
-
- for method in ["pearson", "spearman", "kendall"]:
- with ps.option_context("compute.ops_on_diff_frames", True):
- self.assert_eq(
- psser1.corr(psser2, method=method),
- pser1.corr(pser2, method=method),
- almost=True,
- )
- self.assert_eq(
- psser1.corr(psser2, method=method, min_periods=1),
- pser1.corr(pser2, method=method, min_periods=1),
- almost=True,
- )
- self.assert_eq(
- psser1.corr(psser2, method=method, min_periods=3),
- pser1.corr(pser2, method=method, min_periods=3),
- almost=True,
- )
- self.assert_eq(
- (psser1 + 1).corr(psser2 - 2, method=method,
min_periods=2),
- (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2),
- almost=True,
- )
-
- def test_cov_corr_meta(self):
- # Disable arrow execution since corr() is using UDT internally which
is not supported.
- with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
- pdf = pd.DataFrame(
- {
- "a": np.array([1, 2, 3], dtype="i1"),
- "b": np.array([1, 2, 3], dtype="i2"),
- "c": np.array([1, 2, 3], dtype="i4"),
- "d": np.array([1, 2, 3]),
- "e": np.array([1.0, 2.0, 3.0], dtype="f4"),
- "f": np.array([1.0, 2.0, 3.0]),
- "g": np.array([True, False, True]),
- "h": np.array(list("abc")),
- },
- index=pd.Index([1, 2, 3], name="myindex"),
- )
- psdf = ps.from_pandas(pdf)
- self.assert_eq(psdf.corr(), pdf.corr(numeric_only=True),
check_exact=False)
-
def test_stats_on_boolean_dataframe(self):
pdf = pd.DataFrame({"A": [True, False, True], "B": [False, False,
True]})
psdf = ps.from_pandas(pdf)
@@ -588,7 +312,7 @@ class StatsTests(StatsTestsMixin, PandasOnSparkTestCase,
SQLTestUtils):
if __name__ == "__main__":
import unittest
- from pyspark.pandas.tests.test_stats import * # noqa: F401
+ from pyspark.pandas.tests.computation.test_stats import * # noqa: F401
try:
import xmlrunner
diff --git a/python/pyspark/pandas/tests/connect/test_parity_stats.py
b/python/pyspark/pandas/tests/connect/computation/test_parity_corr.py
similarity index 81%
copy from python/pyspark/pandas/tests/connect/test_parity_stats.py
copy to python/pyspark/pandas/tests/connect/computation/test_parity_corr.py
index 7eddc4c15d4f..acf36b07829a 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_stats.py
+++ b/python/pyspark/pandas/tests/connect/computation/test_parity_corr.py
@@ -16,17 +16,18 @@
#
import unittest
-from pyspark.pandas.tests.test_stats import StatsTestsMixin
+from pyspark import pandas as ps
+from pyspark.pandas.tests.computation.test_corr import FrameCorrMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class StatsParityTests(StatsTestsMixin, PandasOnSparkTestUtils,
ReusedConnectTestCase):
+class FrameParityCorrTests(FrameCorrMixin, PandasOnSparkTestUtils,
ReusedConnectTestCase):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_stats import * # noqa: F401
+ from pyspark.pandas.tests.connect.computation.test_parity_corr import * #
noqa: F401
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_stats.py
b/python/pyspark/pandas/tests/connect/computation/test_parity_stats.py
similarity index 88%
copy from python/pyspark/pandas/tests/connect/test_parity_stats.py
copy to python/pyspark/pandas/tests/connect/computation/test_parity_stats.py
index 7eddc4c15d4f..14d37949590b 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_stats.py
+++ b/python/pyspark/pandas/tests/connect/computation/test_parity_stats.py
@@ -16,7 +16,7 @@
#
import unittest
-from pyspark.pandas.tests.test_stats import StatsTestsMixin
+from pyspark.pandas.tests.computation.test_stats import StatsTestsMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
@@ -26,7 +26,7 @@ class StatsParityTests(StatsTestsMixin,
PandasOnSparkTestUtils, ReusedConnectTes
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_stats import * # noqa: F401
+ from pyspark.pandas.tests.connect.computation.test_parity_stats import *
# noqa: F401
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/connect/test_parity_stats.py
b/python/pyspark/pandas/tests/connect/frame/test_parity_axis.py
similarity index 83%
rename from python/pyspark/pandas/tests/connect/test_parity_stats.py
rename to python/pyspark/pandas/tests/connect/frame/test_parity_axis.py
index 7eddc4c15d4f..804ed97fa31c 100644
--- a/python/pyspark/pandas/tests/connect/test_parity_stats.py
+++ b/python/pyspark/pandas/tests/connect/frame/test_parity_axis.py
@@ -16,17 +16,17 @@
#
import unittest
-from pyspark.pandas.tests.test_stats import StatsTestsMixin
+from pyspark.pandas.tests.frame.test_axis import FrameAxisMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
-class StatsParityTests(StatsTestsMixin, PandasOnSparkTestUtils,
ReusedConnectTestCase):
+class FrameParityAxisTests(FrameAxisMixin, PandasOnSparkTestUtils,
ReusedConnectTestCase):
pass
if __name__ == "__main__":
- from pyspark.pandas.tests.connect.test_parity_stats import * # noqa: F401
+ from pyspark.pandas.tests.connect.frame.test_parity_axis import * # noqa:
F401
try:
import xmlrunner # type: ignore[import]
diff --git a/python/pyspark/pandas/tests/frame/test_axis.py
b/python/pyspark/pandas/tests/frame/test_axis.py
new file mode 100644
index 000000000000..ee67cf1b55ed
--- /dev/null
+++ b/python/pyspark/pandas/tests/frame/test_axis.py
@@ -0,0 +1,135 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import ComparisonTestBase
+from pyspark.testing.sqlutils import SQLTestUtils
+
+
+class FrameAxisMixin:
+ def test_axis_on_dataframe(self):
+ # The number of each count is intentionally big
+ # because when data is small, it executes a shortcut.
+ # Less than 'compute.shortcut_limit' will execute a shortcut
+ # by using collected pandas dataframe directly.
+ # now we set the 'compute.shortcut_limit' as 1000 explicitly
+ with ps.option_context("compute.shortcut_limit", 1000):
+ pdf = pd.DataFrame(
+ {
+ "A": [1, -2, 3, -4, 5] * 300,
+ "B": [1.0, -2, 3, -4, 5] * 300,
+ "C": [-6.0, -7, -8, -9, 10] * 300,
+ "D": [True, False, True, False, False] * 300,
+ },
+ index=range(10, 15001, 10),
+ )
+ # TODO(SPARK-45228): Update `test_axis_on_dataframe` when Pandas
regression is fixed
+ # There is a regression in Pandas 2.1.0,
+ # so we should manually cast to float until the regression is
fixed.
+ # See https://github.com/pandas-dev/pandas/issues/55194.
+ pdf = pdf.astype(float)
+ psdf = ps.from_pandas(pdf)
+ self.assert_eq(psdf.count(axis=1), pdf.count(axis=1))
+ self.assert_eq(psdf.var(axis=1), pdf.var(axis=1))
+ self.assert_eq(psdf.var(axis=1, ddof=0), pdf.var(axis=1, ddof=0))
+ self.assert_eq(psdf.std(axis=1), pdf.std(axis=1))
+ self.assert_eq(psdf.std(axis=1, ddof=0), pdf.std(axis=1, ddof=0))
+ self.assert_eq(psdf.max(axis=1), pdf.max(axis=1))
+ self.assert_eq(psdf.min(axis=1), pdf.min(axis=1))
+ self.assert_eq(psdf.sum(axis=1), pdf.sum(axis=1))
+ self.assert_eq(psdf.product(axis=1), pdf.product(axis=1))
+ self.assert_eq(psdf.kurtosis(axis=0), pdf.kurtosis(axis=0),
almost=True)
+ self.assert_eq(psdf.kurtosis(axis=1), pdf.kurtosis(axis=1))
+ self.assert_eq(psdf.skew(axis=0), pdf.skew(axis=0), almost=True)
+ self.assert_eq(psdf.skew(axis=1), pdf.skew(axis=1))
+ self.assert_eq(psdf.mean(axis=1), pdf.mean(axis=1))
+ self.assert_eq(psdf.sem(axis=1), pdf.sem(axis=1))
+ self.assert_eq(psdf.sem(axis=1, ddof=0), pdf.sem(axis=1, ddof=0))
+
+ self.assert_eq(
+ psdf.count(axis=1, numeric_only=True), pdf.count(axis=1,
numeric_only=True)
+ )
+ self.assert_eq(psdf.var(axis=1, numeric_only=True),
pdf.var(axis=1, numeric_only=True))
+ self.assert_eq(
+ psdf.var(axis=1, ddof=0, numeric_only=True),
+ pdf.var(axis=1, ddof=0, numeric_only=True),
+ )
+ self.assert_eq(psdf.std(axis=1, numeric_only=True),
pdf.std(axis=1, numeric_only=True))
+ self.assert_eq(
+ psdf.std(axis=1, ddof=0, numeric_only=True),
+ pdf.std(axis=1, ddof=0, numeric_only=True),
+ )
+ self.assert_eq(
+ psdf.max(axis=1, numeric_only=True),
+ pdf.max(axis=1, numeric_only=True).astype(float),
+ )
+ self.assert_eq(
+ psdf.min(axis=1, numeric_only=True),
+ pdf.min(axis=1, numeric_only=True).astype(float),
+ )
+ self.assert_eq(
+ psdf.sum(axis=1, numeric_only=True),
+ pdf.sum(axis=1, numeric_only=True).astype(float),
+ )
+ self.assert_eq(
+ psdf.product(axis=1, numeric_only=True),
+ pdf.product(axis=1, numeric_only=True).astype(float),
+ )
+ self.assert_eq(
+ psdf.kurtosis(axis=0, numeric_only=True),
+ pdf.kurtosis(axis=0, numeric_only=True),
+ almost=True,
+ )
+ self.assert_eq(
+ psdf.kurtosis(axis=1, numeric_only=True), pdf.kurtosis(axis=1,
numeric_only=True)
+ )
+ self.assert_eq(
+ psdf.skew(axis=0, numeric_only=True),
+ pdf.skew(axis=0, numeric_only=True),
+ almost=True,
+ )
+ self.assert_eq(
+ psdf.skew(axis=1, numeric_only=True), pdf.skew(axis=1,
numeric_only=True)
+ )
+ self.assert_eq(
+ psdf.mean(axis=1, numeric_only=True), pdf.mean(axis=1,
numeric_only=True)
+ )
+ self.assert_eq(psdf.sem(axis=1, numeric_only=True),
pdf.sem(axis=1, numeric_only=True))
+ self.assert_eq(
+ psdf.sem(axis=1, ddof=0, numeric_only=True),
+ pdf.sem(axis=1, ddof=0, numeric_only=True),
+ )
+
+
+class FrameAxisTests(FrameAxisMixin, ComparisonTestBase, SQLTestUtils):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.frame.test_axis import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]