This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 38a517de8e22 [SPARK-46856][PS][TESTS] Apply approximate equality in ewm tests 38a517de8e22 is described below commit 38a517de8e22183ac92c693e0137bd5bfb3c88a4 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Thu Jan 25 17:11:22 2024 +0800 [SPARK-46856][PS][TESTS] Apply approximate equality in ewm tests ### What changes were proposed in this pull request? Apply approximate equality in ewm tests ### Why are the changes needed? The `ewm` function in pandas API on Spark is based on the `EWM` expression implemented in Scala, so its results can differ from pandas in the last floating-point digits and the tests do not need to compare them exactly. In various environments, some tests may fail with errors like: ``` Traceback (most recent call last): File "/home/jenkins/python/pyspark/testing/pandasutils.py", line 91, in _assert_pandas_equal assert_frame_equal( File "/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py", line 1308, in assert_frame_equal assert_series_equal( File "/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py", line 1018, in assert_series_equal assert_numpy_array_equal( File "/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py", line 741, in assert_numpy_array_equal _raise(left, right, err_msg) File "/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py", line 735, in _raise raise_assert_detail(obj, msg, left, right, index_values=index_values) File "/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py", line 665, in raise_assert_detail raise AssertionError(msg) AssertionError: DataFrame.iloc[:, 1] (column name="b") are different DataFrame.iloc[:, 1] (column name="b") values are different (25.0 %) [index]: [0.9781772871933869, 0.6938842103849581, 0.05954110855254491, 0.43191250286369276] [left]: [4.0, 2.4615384615384617, 2.848920863309352, 1.5441072688779112] [right]: [4.0, 2.4615384615384617, 2.8489208633093526, 1.5441072688779112] ``` ### Does this PR introduce _any_ 
user-facing change? no, test only ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44879 from zhengruifeng/ps_test_ewm_almost. Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- .../pyspark/pandas/tests/window/test_ewm_mean.py | 106 +++++++++++++++++---- 1 file changed, 90 insertions(+), 16 deletions(-) diff --git a/python/pyspark/pandas/tests/window/test_ewm_mean.py b/python/pyspark/pandas/tests/window/test_ewm_mean.py index 00750b867610..5777445ecc57 100644 --- a/python/pyspark/pandas/tests/window/test_ewm_mean.py +++ b/python/pyspark/pandas/tests/window/test_ewm_mean.py @@ -25,56 +25,110 @@ class EWMMeanMixin: def _test_ewm_func(self, f): pser = pd.Series([1, 2, 3], index=np.random.rand(3), name="a") psser = ps.from_pandas(pser) - self.assert_eq(getattr(psser.ewm(com=0.2), f)(), getattr(pser.ewm(com=0.2), f)()) self.assert_eq( - getattr(psser.ewm(com=0.2), f)().sum(), getattr(pser.ewm(com=0.2), f)().sum() + getattr(psser.ewm(com=0.2), f)(), + getattr(pser.ewm(com=0.2), f)(), + almost=True, ) - self.assert_eq(getattr(psser.ewm(span=1.7), f)(), getattr(pser.ewm(span=1.7), f)()) self.assert_eq( - getattr(psser.ewm(span=1.7), f)().sum(), getattr(pser.ewm(span=1.7), f)().sum() + getattr(psser.ewm(com=0.2), f)().sum(), + getattr(pser.ewm(com=0.2), f)().sum(), + almost=True, ) - self.assert_eq(getattr(psser.ewm(halflife=0.5), f)(), getattr(pser.ewm(halflife=0.5), f)()) self.assert_eq( - getattr(psser.ewm(halflife=0.5), f)().sum(), getattr(pser.ewm(halflife=0.5), f)().sum() + getattr(psser.ewm(span=1.7), f)(), + getattr(pser.ewm(span=1.7), f)(), + almost=True, ) - self.assert_eq(getattr(psser.ewm(alpha=0.7), f)(), getattr(pser.ewm(alpha=0.7), f)()) self.assert_eq( - getattr(psser.ewm(alpha=0.7), f)().sum(), getattr(pser.ewm(alpha=0.7), f)().sum() + getattr(psser.ewm(span=1.7), f)().sum(), + getattr(pser.ewm(span=1.7), f)().sum(), + almost=True, + ) + 
self.assert_eq( + getattr(psser.ewm(halflife=0.5), f)(), + getattr(pser.ewm(halflife=0.5), f)(), + almost=True, + ) + self.assert_eq( + getattr(psser.ewm(halflife=0.5), f)().sum(), + getattr(pser.ewm(halflife=0.5), f)().sum(), + almost=True, + ) + self.assert_eq( + getattr(psser.ewm(alpha=0.7), f)(), + getattr(pser.ewm(alpha=0.7), f)(), + almost=True, + ) + self.assert_eq( + getattr(psser.ewm(alpha=0.7), f)().sum(), + getattr(pser.ewm(alpha=0.7), f)().sum(), + almost=True, ) self.assert_eq( getattr(psser.ewm(alpha=0.7, min_periods=2), f)(), getattr(pser.ewm(alpha=0.7, min_periods=2), f)(), + almost=True, ) self.assert_eq( getattr(psser.ewm(alpha=0.7, min_periods=2), f)().sum(), getattr(pser.ewm(alpha=0.7, min_periods=2), f)().sum(), + almost=True, ) pdf = pd.DataFrame( {"a": [1.0, 2.0, 3.0, 2.0], "b": [4.0, 2.0, 3.0, 1.0]}, index=np.random.rand(4) ) psdf = ps.from_pandas(pdf) - self.assert_eq(getattr(psdf.ewm(com=0.2), f)(), getattr(pdf.ewm(com=0.2), f)()) - self.assert_eq(getattr(psdf.ewm(com=0.2), f)().sum(), getattr(pdf.ewm(com=0.2), f)().sum()) - self.assert_eq(getattr(psdf.ewm(span=1.7), f)(), getattr(pdf.ewm(span=1.7), f)()) self.assert_eq( - getattr(psdf.ewm(span=1.7), f)().sum(), getattr(pdf.ewm(span=1.7), f)().sum() + getattr(psdf.ewm(com=0.2), f)(), + getattr(pdf.ewm(com=0.2), f)(), + almost=True, + ) + self.assert_eq( + getattr(psdf.ewm(com=0.2), f)().sum(), + getattr(pdf.ewm(com=0.2), f)().sum(), + almost=True, + ) + self.assert_eq( + getattr(psdf.ewm(span=1.7), f)(), + getattr(pdf.ewm(span=1.7), f)(), + almost=True, + ) + self.assert_eq( + getattr(psdf.ewm(span=1.7), f)().sum(), + getattr(pdf.ewm(span=1.7), f)().sum(), + almost=True, + ) + self.assert_eq( + getattr(psdf.ewm(halflife=0.5), f)(), + getattr(pdf.ewm(halflife=0.5), f)(), + almost=True, + ) + self.assert_eq( + getattr(psdf.ewm(halflife=0.5), f)().sum(), + getattr(pdf.ewm(halflife=0.5), f)().sum(), + almost=True, ) - self.assert_eq(getattr(psdf.ewm(halflife=0.5), f)(), 
getattr(pdf.ewm(halflife=0.5), f)()) self.assert_eq( - getattr(psdf.ewm(halflife=0.5), f)().sum(), getattr(pdf.ewm(halflife=0.5), f)().sum() + getattr(psdf.ewm(alpha=0.7), f)(), + getattr(pdf.ewm(alpha=0.7), f)(), + almost=True, ) - self.assert_eq(getattr(psdf.ewm(alpha=0.7), f)(), getattr(pdf.ewm(alpha=0.7), f)()) self.assert_eq( - getattr(psdf.ewm(alpha=0.7), f)().sum(), getattr(pdf.ewm(alpha=0.7), f)().sum() + getattr(psdf.ewm(alpha=0.7), f)().sum(), + getattr(pdf.ewm(alpha=0.7), f)().sum(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(alpha=0.7, min_periods=2), f)(), getattr(pdf.ewm(alpha=0.7, min_periods=2), f)(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(alpha=0.7, min_periods=2), f)().sum(), getattr(pdf.ewm(alpha=0.7, min_periods=2), f)().sum(), + almost=True, ) pdf = pd.DataFrame( @@ -91,82 +145,102 @@ class EWMMeanMixin: self.assert_eq( getattr(psdf.ewm(com=0.2, ignore_na=True), f)(), getattr(pdf.ewm(com=0.2, ignore_na=True), f)(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(com=0.2, ignore_na=True), f)().sum(), getattr(pdf.ewm(com=0.2, ignore_na=True), f)().sum(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(com=0.2, ignore_na=False), f)(), getattr(pdf.ewm(com=0.2, ignore_na=False), f)(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(com=0.2, ignore_na=False), f)().sum(), getattr(pdf.ewm(com=0.2, ignore_na=False), f)().sum(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(span=1.7, ignore_na=True), f)(), getattr(pdf.ewm(span=1.7, ignore_na=True), f)(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(span=1.7, ignore_na=True), f)().sum(), getattr(pdf.ewm(span=1.7, ignore_na=True), f)().sum(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(span=1.7, ignore_na=False), f)(), getattr(pdf.ewm(span=1.7, ignore_na=False), f)(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(span=1.7, ignore_na=False), f)().sum(), getattr(pdf.ewm(span=1.7, ignore_na=False), f)().sum(), + almost=True, ) self.assert_eq( 
getattr(psdf.ewm(halflife=0.5, ignore_na=True), f)(), getattr(pdf.ewm(halflife=0.5, ignore_na=True), f)(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(halflife=0.5, ignore_na=True), f)().sum(), getattr(pdf.ewm(halflife=0.5, ignore_na=True), f)().sum(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(halflife=0.5, ignore_na=False), f)(), getattr(pdf.ewm(halflife=0.5, ignore_na=False), f)(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(halflife=0.5, ignore_na=False), f)().sum(), getattr(pdf.ewm(halflife=0.5, ignore_na=False), f)().sum(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(alpha=0.7, ignore_na=True), f)(), getattr(pdf.ewm(alpha=0.7, ignore_na=True), f)(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(alpha=0.7, ignore_na=True), f)().sum(), getattr(pdf.ewm(alpha=0.7, ignore_na=True), f)().sum(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(alpha=0.7, ignore_na=False), f)(), getattr(pdf.ewm(alpha=0.7, ignore_na=False), f)(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(alpha=0.7, ignore_na=False), f)().sum(), getattr(pdf.ewm(alpha=0.7, ignore_na=False), f)().sum(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(alpha=0.7, ignore_na=True, min_periods=2), f)(), getattr(pdf.ewm(alpha=0.7, ignore_na=True, min_periods=2), f)(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(alpha=0.7, ignore_na=True, min_periods=2), f)().sum(), getattr(pdf.ewm(alpha=0.7, ignore_na=True, min_periods=2), f)().sum(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(alpha=0.7, ignore_na=False, min_periods=2), f)(), getattr(pdf.ewm(alpha=0.7, ignore_na=False, min_periods=2), f)(), + almost=True, ) self.assert_eq( getattr(psdf.ewm(alpha=0.7, ignore_na=False, min_periods=2), f)().sum(), getattr(pdf.ewm(alpha=0.7, ignore_na=False, min_periods=2), f)().sum(), + almost=True, ) def test_ewm_mean(self): --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional 
commands, e-mail: commits-h...@spark.apache.org