This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 38a517de8e22 [SPARK-46856][PS][TESTS] Apply approximate equality in
ewm tests
38a517de8e22 is described below
commit 38a517de8e22183ac92c693e0137bd5bfb3c88a4
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Thu Jan 25 17:11:22 2024 +0800
[SPARK-46856][PS][TESTS] Apply approximate equality in ewm tests
### What changes were proposed in this pull request?
Apply approximate equality in the ewm tests by passing `almost=True` to `assert_eq`.
### Why are the changes needed?
The `ewm` function in Spark is based on the `EWM` expression in Scala, so the
tests do not need to compare its results exactly.
In various environments, some tests may fail with an error like:
```
Traceback (most recent call last):
File "/home/jenkins/python/pyspark/testing/pandasutils.py", line 91, in
_assert_pandas_equal
assert_frame_equal(
File
"/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py",
line 1308, in assert_frame_equal
assert_series_equal(
File
"/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py",
line 1018, in assert_series_equal
assert_numpy_array_equal(
File
"/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py",
line 741, in assert_numpy_array_equal
_raise(left, right, err_msg)
File
"/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py",
line 735, in _raise
raise_assert_detail(obj, msg, left, right, index_values=index_values)
File
"/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py",
line 665, in raise_assert_detail
raise AssertionError(msg)
AssertionError: DataFrame.iloc[:, 1] (column name="b") are different
DataFrame.iloc[:, 1] (column name="b") values are different (25.0 %)
[index]: [0.9781772871933869, 0.6938842103849581, 0.05954110855254491,
0.43191250286369276]
[left]: [4.0, 2.4615384615384617, 2.848920863309352, 1.5441072688779112]
[right]: [4.0, 2.4615384615384617, 2.8489208633093526, 1.5441072688779112]
```
### Does this PR introduce _any_ user-facing change?
no, test only
### How was this patch tested?
CI
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44879 from zhengruifeng/ps_test_ewm_almost.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.../pyspark/pandas/tests/window/test_ewm_mean.py | 106 +++++++++++++++++----
1 file changed, 90 insertions(+), 16 deletions(-)
diff --git a/python/pyspark/pandas/tests/window/test_ewm_mean.py
b/python/pyspark/pandas/tests/window/test_ewm_mean.py
index 00750b867610..5777445ecc57 100644
--- a/python/pyspark/pandas/tests/window/test_ewm_mean.py
+++ b/python/pyspark/pandas/tests/window/test_ewm_mean.py
@@ -25,56 +25,110 @@ class EWMMeanMixin:
def _test_ewm_func(self, f):
pser = pd.Series([1, 2, 3], index=np.random.rand(3), name="a")
psser = ps.from_pandas(pser)
- self.assert_eq(getattr(psser.ewm(com=0.2), f)(),
getattr(pser.ewm(com=0.2), f)())
self.assert_eq(
- getattr(psser.ewm(com=0.2), f)().sum(), getattr(pser.ewm(com=0.2),
f)().sum()
+ getattr(psser.ewm(com=0.2), f)(),
+ getattr(pser.ewm(com=0.2), f)(),
+ almost=True,
)
- self.assert_eq(getattr(psser.ewm(span=1.7), f)(),
getattr(pser.ewm(span=1.7), f)())
self.assert_eq(
- getattr(psser.ewm(span=1.7), f)().sum(),
getattr(pser.ewm(span=1.7), f)().sum()
+ getattr(psser.ewm(com=0.2), f)().sum(),
+ getattr(pser.ewm(com=0.2), f)().sum(),
+ almost=True,
)
- self.assert_eq(getattr(psser.ewm(halflife=0.5), f)(),
getattr(pser.ewm(halflife=0.5), f)())
self.assert_eq(
- getattr(psser.ewm(halflife=0.5), f)().sum(),
getattr(pser.ewm(halflife=0.5), f)().sum()
+ getattr(psser.ewm(span=1.7), f)(),
+ getattr(pser.ewm(span=1.7), f)(),
+ almost=True,
)
- self.assert_eq(getattr(psser.ewm(alpha=0.7), f)(),
getattr(pser.ewm(alpha=0.7), f)())
self.assert_eq(
- getattr(psser.ewm(alpha=0.7), f)().sum(),
getattr(pser.ewm(alpha=0.7), f)().sum()
+ getattr(psser.ewm(span=1.7), f)().sum(),
+ getattr(pser.ewm(span=1.7), f)().sum(),
+ almost=True,
+ )
+ self.assert_eq(
+ getattr(psser.ewm(halflife=0.5), f)(),
+ getattr(pser.ewm(halflife=0.5), f)(),
+ almost=True,
+ )
+ self.assert_eq(
+ getattr(psser.ewm(halflife=0.5), f)().sum(),
+ getattr(pser.ewm(halflife=0.5), f)().sum(),
+ almost=True,
+ )
+ self.assert_eq(
+ getattr(psser.ewm(alpha=0.7), f)(),
+ getattr(pser.ewm(alpha=0.7), f)(),
+ almost=True,
+ )
+ self.assert_eq(
+ getattr(psser.ewm(alpha=0.7), f)().sum(),
+ getattr(pser.ewm(alpha=0.7), f)().sum(),
+ almost=True,
)
self.assert_eq(
getattr(psser.ewm(alpha=0.7, min_periods=2), f)(),
getattr(pser.ewm(alpha=0.7, min_periods=2), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psser.ewm(alpha=0.7, min_periods=2), f)().sum(),
getattr(pser.ewm(alpha=0.7, min_periods=2), f)().sum(),
+ almost=True,
)
pdf = pd.DataFrame(
{"a": [1.0, 2.0, 3.0, 2.0], "b": [4.0, 2.0, 3.0, 1.0]},
index=np.random.rand(4)
)
psdf = ps.from_pandas(pdf)
- self.assert_eq(getattr(psdf.ewm(com=0.2), f)(),
getattr(pdf.ewm(com=0.2), f)())
- self.assert_eq(getattr(psdf.ewm(com=0.2), f)().sum(),
getattr(pdf.ewm(com=0.2), f)().sum())
- self.assert_eq(getattr(psdf.ewm(span=1.7), f)(),
getattr(pdf.ewm(span=1.7), f)())
self.assert_eq(
- getattr(psdf.ewm(span=1.7), f)().sum(), getattr(pdf.ewm(span=1.7),
f)().sum()
+ getattr(psdf.ewm(com=0.2), f)(),
+ getattr(pdf.ewm(com=0.2), f)(),
+ almost=True,
+ )
+ self.assert_eq(
+ getattr(psdf.ewm(com=0.2), f)().sum(),
+ getattr(pdf.ewm(com=0.2), f)().sum(),
+ almost=True,
+ )
+ self.assert_eq(
+ getattr(psdf.ewm(span=1.7), f)(),
+ getattr(pdf.ewm(span=1.7), f)(),
+ almost=True,
+ )
+ self.assert_eq(
+ getattr(psdf.ewm(span=1.7), f)().sum(),
+ getattr(pdf.ewm(span=1.7), f)().sum(),
+ almost=True,
+ )
+ self.assert_eq(
+ getattr(psdf.ewm(halflife=0.5), f)(),
+ getattr(pdf.ewm(halflife=0.5), f)(),
+ almost=True,
+ )
+ self.assert_eq(
+ getattr(psdf.ewm(halflife=0.5), f)().sum(),
+ getattr(pdf.ewm(halflife=0.5), f)().sum(),
+ almost=True,
)
- self.assert_eq(getattr(psdf.ewm(halflife=0.5), f)(),
getattr(pdf.ewm(halflife=0.5), f)())
self.assert_eq(
- getattr(psdf.ewm(halflife=0.5), f)().sum(),
getattr(pdf.ewm(halflife=0.5), f)().sum()
+ getattr(psdf.ewm(alpha=0.7), f)(),
+ getattr(pdf.ewm(alpha=0.7), f)(),
+ almost=True,
)
- self.assert_eq(getattr(psdf.ewm(alpha=0.7), f)(),
getattr(pdf.ewm(alpha=0.7), f)())
self.assert_eq(
- getattr(psdf.ewm(alpha=0.7), f)().sum(),
getattr(pdf.ewm(alpha=0.7), f)().sum()
+ getattr(psdf.ewm(alpha=0.7), f)().sum(),
+ getattr(pdf.ewm(alpha=0.7), f)().sum(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(alpha=0.7, min_periods=2), f)(),
getattr(pdf.ewm(alpha=0.7, min_periods=2), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(alpha=0.7, min_periods=2), f)().sum(),
getattr(pdf.ewm(alpha=0.7, min_periods=2), f)().sum(),
+ almost=True,
)
pdf = pd.DataFrame(
@@ -91,82 +145,102 @@ class EWMMeanMixin:
self.assert_eq(
getattr(psdf.ewm(com=0.2, ignore_na=True), f)(),
getattr(pdf.ewm(com=0.2, ignore_na=True), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(com=0.2, ignore_na=True), f)().sum(),
getattr(pdf.ewm(com=0.2, ignore_na=True), f)().sum(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(com=0.2, ignore_na=False), f)(),
getattr(pdf.ewm(com=0.2, ignore_na=False), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(com=0.2, ignore_na=False), f)().sum(),
getattr(pdf.ewm(com=0.2, ignore_na=False), f)().sum(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(span=1.7, ignore_na=True), f)(),
getattr(pdf.ewm(span=1.7, ignore_na=True), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(span=1.7, ignore_na=True), f)().sum(),
getattr(pdf.ewm(span=1.7, ignore_na=True), f)().sum(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(span=1.7, ignore_na=False), f)(),
getattr(pdf.ewm(span=1.7, ignore_na=False), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(span=1.7, ignore_na=False), f)().sum(),
getattr(pdf.ewm(span=1.7, ignore_na=False), f)().sum(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(halflife=0.5, ignore_na=True), f)(),
getattr(pdf.ewm(halflife=0.5, ignore_na=True), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(halflife=0.5, ignore_na=True), f)().sum(),
getattr(pdf.ewm(halflife=0.5, ignore_na=True), f)().sum(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(halflife=0.5, ignore_na=False), f)(),
getattr(pdf.ewm(halflife=0.5, ignore_na=False), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(halflife=0.5, ignore_na=False), f)().sum(),
getattr(pdf.ewm(halflife=0.5, ignore_na=False), f)().sum(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(alpha=0.7, ignore_na=True), f)(),
getattr(pdf.ewm(alpha=0.7, ignore_na=True), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(alpha=0.7, ignore_na=True), f)().sum(),
getattr(pdf.ewm(alpha=0.7, ignore_na=True), f)().sum(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(alpha=0.7, ignore_na=False), f)(),
getattr(pdf.ewm(alpha=0.7, ignore_na=False), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(alpha=0.7, ignore_na=False), f)().sum(),
getattr(pdf.ewm(alpha=0.7, ignore_na=False), f)().sum(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(alpha=0.7, ignore_na=True, min_periods=2), f)(),
getattr(pdf.ewm(alpha=0.7, ignore_na=True, min_periods=2), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(alpha=0.7, ignore_na=True, min_periods=2),
f)().sum(),
getattr(pdf.ewm(alpha=0.7, ignore_na=True, min_periods=2),
f)().sum(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(alpha=0.7, ignore_na=False, min_periods=2), f)(),
getattr(pdf.ewm(alpha=0.7, ignore_na=False, min_periods=2), f)(),
+ almost=True,
)
self.assert_eq(
getattr(psdf.ewm(alpha=0.7, ignore_na=False, min_periods=2),
f)().sum(),
getattr(pdf.ewm(alpha=0.7, ignore_na=False, min_periods=2),
f)().sum(),
+ almost=True,
)
def test_ewm_mean(self):
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]