(spark) branch master updated: [SPARK-46856][PS][TESTS] Apply approximate equality in ewm tests

ruifengz Thu, 25 Jan 2024 01:11:47 -0800

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 38a517de8e22 [SPARK-46856][PS][TESTS] Apply approximate equality in ewm tests
38a517de8e22 is described below

commit 38a517de8e22183ac92c693e0137bd5bfb3c88a4
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Thu Jan 25 17:11:22 2024 +0800

    [SPARK-46856][PS][TESTS] Apply approximate equality in ewm tests
    
    ### What changes were proposed in this pull request?
    Apply approximate equality in ewm tests
    
    ### Why are the changes needed?
    The `ewm` function in Spark is based on the `EWM` expression in Scala, so
    its results do not need to be compared for exact equality.
    
    In various environments, some tests may fail like this:
    ```
    Traceback (most recent call last):
      File "/home/jenkins/python/pyspark/testing/pandasutils.py", line 91, in 
_assert_pandas_equal
        assert_frame_equal(
      File 
"/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py",
 line 1308, in assert_frame_equal
        assert_series_equal(
      File 
"/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py",
 line 1018, in assert_series_equal
        assert_numpy_array_equal(
      File 
"/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py",
 line 741, in assert_numpy_array_equal
        _raise(left, right, err_msg)
      File 
"/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py",
 line 735, in _raise
        raise_assert_detail(obj, msg, left, right, index_values=index_values)
      File 
"/databricks/python3/lib/python3.10/site-packages/pandas/_testing/asserters.py",
 line 665, in raise_assert_detail
        raise AssertionError(msg)
    AssertionError: DataFrame.iloc[:, 1] (column name="b") are different
    DataFrame.iloc[:, 1] (column name="b") values are different (25.0 %)
    [index]: [0.9781772871933869, 0.6938842103849581, 0.05954110855254491, 
0.43191250286369276]
    [left]:  [4.0, 2.4615384615384617, 2.848920863309352, 1.5441072688779112]
    [right]: [4.0, 2.4615384615384617, 2.8489208633093526, 1.5441072688779112]
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    no, test only
    
    ### How was this patch tested?
    ci
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #44879 from zhengruifeng/ps_test_ewm_almost.
    
    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 .../pyspark/pandas/tests/window/test_ewm_mean.py   | 106 +++++++++++++++++----
 1 file changed, 90 insertions(+), 16 deletions(-)

diff --git a/python/pyspark/pandas/tests/window/test_ewm_mean.py 
b/python/pyspark/pandas/tests/window/test_ewm_mean.py
index 00750b867610..5777445ecc57 100644
--- a/python/pyspark/pandas/tests/window/test_ewm_mean.py
+++ b/python/pyspark/pandas/tests/window/test_ewm_mean.py
@@ -25,56 +25,110 @@ class EWMMeanMixin:
     def _test_ewm_func(self, f):
         pser = pd.Series([1, 2, 3], index=np.random.rand(3), name="a")
         psser = ps.from_pandas(pser)
-        self.assert_eq(getattr(psser.ewm(com=0.2), f)(), 
getattr(pser.ewm(com=0.2), f)())
         self.assert_eq(
-            getattr(psser.ewm(com=0.2), f)().sum(), getattr(pser.ewm(com=0.2), 
f)().sum()
+            getattr(psser.ewm(com=0.2), f)(),
+            getattr(pser.ewm(com=0.2), f)(),
+            almost=True,
         )
-        self.assert_eq(getattr(psser.ewm(span=1.7), f)(), 
getattr(pser.ewm(span=1.7), f)())
         self.assert_eq(
-            getattr(psser.ewm(span=1.7), f)().sum(), 
getattr(pser.ewm(span=1.7), f)().sum()
+            getattr(psser.ewm(com=0.2), f)().sum(),
+            getattr(pser.ewm(com=0.2), f)().sum(),
+            almost=True,
         )
-        self.assert_eq(getattr(psser.ewm(halflife=0.5), f)(), 
getattr(pser.ewm(halflife=0.5), f)())
         self.assert_eq(
-            getattr(psser.ewm(halflife=0.5), f)().sum(), 
getattr(pser.ewm(halflife=0.5), f)().sum()
+            getattr(psser.ewm(span=1.7), f)(),
+            getattr(pser.ewm(span=1.7), f)(),
+            almost=True,
         )
-        self.assert_eq(getattr(psser.ewm(alpha=0.7), f)(), 
getattr(pser.ewm(alpha=0.7), f)())
         self.assert_eq(
-            getattr(psser.ewm(alpha=0.7), f)().sum(), 
getattr(pser.ewm(alpha=0.7), f)().sum()
+            getattr(psser.ewm(span=1.7), f)().sum(),
+            getattr(pser.ewm(span=1.7), f)().sum(),
+            almost=True,
+        )
+        self.assert_eq(
+            getattr(psser.ewm(halflife=0.5), f)(),
+            getattr(pser.ewm(halflife=0.5), f)(),
+            almost=True,
+        )
+        self.assert_eq(
+            getattr(psser.ewm(halflife=0.5), f)().sum(),
+            getattr(pser.ewm(halflife=0.5), f)().sum(),
+            almost=True,
+        )
+        self.assert_eq(
+            getattr(psser.ewm(alpha=0.7), f)(),
+            getattr(pser.ewm(alpha=0.7), f)(),
+            almost=True,
+        )
+        self.assert_eq(
+            getattr(psser.ewm(alpha=0.7), f)().sum(),
+            getattr(pser.ewm(alpha=0.7), f)().sum(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psser.ewm(alpha=0.7, min_periods=2), f)(),
             getattr(pser.ewm(alpha=0.7, min_periods=2), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psser.ewm(alpha=0.7, min_periods=2), f)().sum(),
             getattr(pser.ewm(alpha=0.7, min_periods=2), f)().sum(),
+            almost=True,
         )
 
         pdf = pd.DataFrame(
             {"a": [1.0, 2.0, 3.0, 2.0], "b": [4.0, 2.0, 3.0, 1.0]}, 
index=np.random.rand(4)
         )
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(getattr(psdf.ewm(com=0.2), f)(), 
getattr(pdf.ewm(com=0.2), f)())
-        self.assert_eq(getattr(psdf.ewm(com=0.2), f)().sum(), 
getattr(pdf.ewm(com=0.2), f)().sum())
-        self.assert_eq(getattr(psdf.ewm(span=1.7), f)(), 
getattr(pdf.ewm(span=1.7), f)())
         self.assert_eq(
-            getattr(psdf.ewm(span=1.7), f)().sum(), getattr(pdf.ewm(span=1.7), 
f)().sum()
+            getattr(psdf.ewm(com=0.2), f)(),
+            getattr(pdf.ewm(com=0.2), f)(),
+            almost=True,
+        )
+        self.assert_eq(
+            getattr(psdf.ewm(com=0.2), f)().sum(),
+            getattr(pdf.ewm(com=0.2), f)().sum(),
+            almost=True,
+        )
+        self.assert_eq(
+            getattr(psdf.ewm(span=1.7), f)(),
+            getattr(pdf.ewm(span=1.7), f)(),
+            almost=True,
+        )
+        self.assert_eq(
+            getattr(psdf.ewm(span=1.7), f)().sum(),
+            getattr(pdf.ewm(span=1.7), f)().sum(),
+            almost=True,
+        )
+        self.assert_eq(
+            getattr(psdf.ewm(halflife=0.5), f)(),
+            getattr(pdf.ewm(halflife=0.5), f)(),
+            almost=True,
+        )
+        self.assert_eq(
+            getattr(psdf.ewm(halflife=0.5), f)().sum(),
+            getattr(pdf.ewm(halflife=0.5), f)().sum(),
+            almost=True,
         )
-        self.assert_eq(getattr(psdf.ewm(halflife=0.5), f)(), 
getattr(pdf.ewm(halflife=0.5), f)())
         self.assert_eq(
-            getattr(psdf.ewm(halflife=0.5), f)().sum(), 
getattr(pdf.ewm(halflife=0.5), f)().sum()
+            getattr(psdf.ewm(alpha=0.7), f)(),
+            getattr(pdf.ewm(alpha=0.7), f)(),
+            almost=True,
         )
-        self.assert_eq(getattr(psdf.ewm(alpha=0.7), f)(), 
getattr(pdf.ewm(alpha=0.7), f)())
         self.assert_eq(
-            getattr(psdf.ewm(alpha=0.7), f)().sum(), 
getattr(pdf.ewm(alpha=0.7), f)().sum()
+            getattr(psdf.ewm(alpha=0.7), f)().sum(),
+            getattr(pdf.ewm(alpha=0.7), f)().sum(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(alpha=0.7, min_periods=2), f)(),
             getattr(pdf.ewm(alpha=0.7, min_periods=2), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(alpha=0.7, min_periods=2), f)().sum(),
             getattr(pdf.ewm(alpha=0.7, min_periods=2), f)().sum(),
+            almost=True,
         )
 
         pdf = pd.DataFrame(
@@ -91,82 +145,102 @@ class EWMMeanMixin:
         self.assert_eq(
             getattr(psdf.ewm(com=0.2, ignore_na=True), f)(),
             getattr(pdf.ewm(com=0.2, ignore_na=True), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(com=0.2, ignore_na=True), f)().sum(),
             getattr(pdf.ewm(com=0.2, ignore_na=True), f)().sum(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(com=0.2, ignore_na=False), f)(),
             getattr(pdf.ewm(com=0.2, ignore_na=False), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(com=0.2, ignore_na=False), f)().sum(),
             getattr(pdf.ewm(com=0.2, ignore_na=False), f)().sum(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(span=1.7, ignore_na=True), f)(),
             getattr(pdf.ewm(span=1.7, ignore_na=True), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(span=1.7, ignore_na=True), f)().sum(),
             getattr(pdf.ewm(span=1.7, ignore_na=True), f)().sum(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(span=1.7, ignore_na=False), f)(),
             getattr(pdf.ewm(span=1.7, ignore_na=False), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(span=1.7, ignore_na=False), f)().sum(),
             getattr(pdf.ewm(span=1.7, ignore_na=False), f)().sum(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(halflife=0.5, ignore_na=True), f)(),
             getattr(pdf.ewm(halflife=0.5, ignore_na=True), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(halflife=0.5, ignore_na=True), f)().sum(),
             getattr(pdf.ewm(halflife=0.5, ignore_na=True), f)().sum(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(halflife=0.5, ignore_na=False), f)(),
             getattr(pdf.ewm(halflife=0.5, ignore_na=False), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(halflife=0.5, ignore_na=False), f)().sum(),
             getattr(pdf.ewm(halflife=0.5, ignore_na=False), f)().sum(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(alpha=0.7, ignore_na=True), f)(),
             getattr(pdf.ewm(alpha=0.7, ignore_na=True), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(alpha=0.7, ignore_na=True), f)().sum(),
             getattr(pdf.ewm(alpha=0.7, ignore_na=True), f)().sum(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(alpha=0.7, ignore_na=False), f)(),
             getattr(pdf.ewm(alpha=0.7, ignore_na=False), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(alpha=0.7, ignore_na=False), f)().sum(),
             getattr(pdf.ewm(alpha=0.7, ignore_na=False), f)().sum(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(alpha=0.7, ignore_na=True, min_periods=2), f)(),
             getattr(pdf.ewm(alpha=0.7, ignore_na=True, min_periods=2), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(alpha=0.7, ignore_na=True, min_periods=2), 
f)().sum(),
             getattr(pdf.ewm(alpha=0.7, ignore_na=True, min_periods=2), 
f)().sum(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(alpha=0.7, ignore_na=False, min_periods=2), f)(),
             getattr(pdf.ewm(alpha=0.7, ignore_na=False, min_periods=2), f)(),
+            almost=True,
         )
         self.assert_eq(
             getattr(psdf.ewm(alpha=0.7, ignore_na=False, min_periods=2), 
f)().sum(),
             getattr(pdf.ewm(alpha=0.7, ignore_na=False, min_periods=2), 
f)().sum(),
+            almost=True,
         )
 
     def test_ewm_mean(self):


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-46856][PS][TESTS] Apply approximate equality in ewm tests

Reply via email to