(spark) branch master updated: [MINOR][TESTS][DOCS] Make bfill/backfill tests not dependent on natural order

ruifengz Thu, 06 Nov 2025 18:29:02 -0800

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 9894abcca68a [MINOR][TESTS][DOCS] Make bfill/backfill tests not 
dependent on natural order
9894abcca68a is described below

commit 9894abcca68aa9339a740c676d1656f24ca3cedc
Author: Yicong-Huang <[email protected]>
AuthorDate: Fri Nov 7 10:28:46 2025 +0800

    [MINOR][TESTS][DOCS] Make bfill/backfill tests not dependent on natural 
order
    
    ### What changes were proposed in this pull request?
    This PR adds explicit `.sort_index()` calls to the pandas API tests and 
docstring examples for the `bfill()` and `backfill()` operations to ensure 
deterministic results.
    
    ### Why are the changes needed?
    The `bfill()` (backward fill) and `backfill()` operations in the PySpark 
pandas API don't guarantee a specific output ordering. Tests that directly 
compared the results without sorting could fail intermittently due to 
differences in the natural ordering of results between the pandas and PySpark 
implementations.
    
    By adding `.sort_index()` to both sides of each test assertion (the pandas 
result and the pandas-on-Spark result), we ensure the tests are 
order-independent and deterministic.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Existing tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #52924 from 
Yicong-Huang/no-ticket/test/no-dependent-on-natural-order.
    
    Authored-by: Yicong-Huang <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 python/pyspark/pandas/generic.py                   |  4 +--
 .../pandas/tests/computation/test_missing_data.py  | 31 ++++++++++++++--------
 .../pandas/tests/series/test_missing_data.py       | 25 ++++++++++-------
 3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index dd9cc311f274..14c2ec410589 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -3370,7 +3370,7 @@ class Frame(object, metaclass=ABCMeta):
 
         Propagate non-null values backward.
 
-        >>> psdf.bfill()
+        >>> psdf.bfill().sort_index()
              A    B    C  D
         0  3.0  2.0  1.0  0
         1  3.0  4.0  1.0  1
@@ -3387,7 +3387,7 @@ class Frame(object, metaclass=ABCMeta):
         3    1.0
         dtype: float64
 
-        >>> psser.bfill()
+        >>> psser.bfill().sort_index()
         0    1.0
         1    1.0
         2    1.0
diff --git a/python/pyspark/pandas/tests/computation/test_missing_data.py 
b/python/pyspark/pandas/tests/computation/test_missing_data.py
index c777499aea85..d4a4332eb2c8 100644
--- a/python/pyspark/pandas/tests/computation/test_missing_data.py
+++ b/python/pyspark/pandas/tests/computation/test_missing_data.py
@@ -40,12 +40,12 @@ class FrameMissingDataMixin:
         )
         psdf = ps.from_pandas(pdf)
 
-        self.assert_eq(pdf.backfill(), psdf.backfill())
+        self.assert_eq(pdf.backfill().sort_index(), 
psdf.backfill().sort_index())
 
         # Test `inplace=True`
         pdf.backfill(inplace=True)
         psdf.backfill(inplace=True)
-        self.assert_eq(pdf, psdf)
+        self.assert_eq(pdf.sort_index(), psdf.sort_index())
 
     def _test_dropna(self, pdf, axis):
         psdf = ps.from_pandas(pdf)
@@ -191,14 +191,21 @@ class FrameMissingDataMixin:
         )
         self.assert_eq(pdf.fillna(method="ffill"), psdf.fillna(method="ffill"))
         self.assert_eq(pdf.fillna(method="ffill", limit=2), 
psdf.fillna(method="ffill", limit=2))
-        self.assert_eq(pdf.fillna(method="bfill"), psdf.fillna(method="bfill"))
-        self.assert_eq(pdf.fillna(method="bfill", limit=2), 
psdf.fillna(method="bfill", limit=2))
+        self.assert_eq(
+            pdf.fillna(method="bfill").sort_index(), 
psdf.fillna(method="bfill").sort_index()
+        )
+        self.assert_eq(
+            pdf.fillna(method="bfill", limit=2).sort_index(),
+            psdf.fillna(method="bfill", limit=2).sort_index(),
+        )
 
         pdf = pdf.set_index(["x", "y"])
         psdf = ps.from_pandas(pdf)
         # check multi index
         self.assert_eq(psdf.fillna(-1), pdf.fillna(-1))
-        self.assert_eq(pdf.fillna(method="bfill"), psdf.fillna(method="bfill"))
+        self.assert_eq(
+            pdf.fillna(method="bfill").sort_index(), 
psdf.fillna(method="bfill").sort_index()
+        )
         self.assert_eq(pdf.fillna(method="ffill"), psdf.fillna(method="ffill"))
 
         pser = pdf.z
@@ -253,7 +260,9 @@ class FrameMissingDataMixin:
         )
         self.assert_eq(pdf.fillna(method="ffill"), psdf.fillna(method="ffill"))
         self.assert_eq(pdf.fillna(method="ffill", limit=2), 
psdf.fillna(method="ffill", limit=2))
-        self.assert_eq(pdf.fillna(method="bfill"), psdf.fillna(method="bfill"))
+        self.assert_eq(
+            pdf.fillna(method="bfill").sort_index(), 
psdf.fillna(method="bfill").sort_index()
+        )
         self.assert_eq(pdf.fillna(method="bfill", limit=2), 
psdf.fillna(method="bfill", limit=2))
 
         self.assert_eq(psdf.fillna({"x": -1}), pdf.fillna({"x": -1}))
@@ -422,8 +431,8 @@ class FrameMissingDataMixin:
         )
         psdf = ps.from_pandas(pdf)
 
-        self.assert_eq(psdf.bfill(), pdf.bfill())
-        self.assert_eq(psdf.bfill(limit=1), pdf.bfill(limit=1))
+        self.assert_eq(psdf.bfill().sort_index(), pdf.bfill().sort_index())
+        self.assert_eq(psdf.bfill(limit=1).sort_index(), 
pdf.bfill(limit=1).sort_index())
 
         pser = pdf.x
         psser = psdf.x
@@ -431,9 +440,9 @@ class FrameMissingDataMixin:
         psdf.bfill(inplace=True)
         pdf.bfill(inplace=True)
 
-        self.assert_eq(psdf, pdf)
-        self.assert_eq(psser, pser)
-        self.assert_eq(psser[idx[0]], pser[idx[0]])
+        self.assert_eq(psdf.sort_index(), pdf.sort_index())
+        self.assert_eq(psser.sort_index(), pser.sort_index())
+        self.assert_eq(psser.sort_index()[idx[0]], pser.sort_index()[idx[0]])
 
     def test_pad(self):
         pdf = pd.DataFrame(
diff --git a/python/pyspark/pandas/tests/series/test_missing_data.py 
b/python/pyspark/pandas/tests/series/test_missing_data.py
index 7336ef23cc61..0457d35a8920 100644
--- a/python/pyspark/pandas/tests/series/test_missing_data.py
+++ b/python/pyspark/pandas/tests/series/test_missing_data.py
@@ -61,8 +61,13 @@ class SeriesMissingDataMixin:
 
         self.assert_eq(psser.fillna(0), pser.fillna(0))
         self.assert_eq(psser.fillna(method="ffill"), 
pser.fillna(method="ffill"))
-        self.assert_eq(psser.fillna(method="bfill"), 
pser.fillna(method="bfill"))
-        self.assert_eq(psser.fillna(method="backfill"), 
pser.fillna(method="backfill"))
+        self.assert_eq(
+            psser.fillna(method="bfill").sort_index(), 
pser.fillna(method="bfill").sort_index()
+        )
+        self.assert_eq(
+            psser.fillna(method="backfill").sort_index(),
+            pser.fillna(method="backfill").sort_index(),
+        )
 
         # inplace fillna on non-nullable column
         pdf = pd.DataFrame({"a": [1, 2, None], "b": [1, 2, 3]})
@@ -172,14 +177,14 @@ class SeriesMissingDataMixin:
         pser = pdf.x
         psser = psdf.x
 
-        self.assert_eq(psser.bfill(), pser.bfill())
-        self.assert_eq(psser.bfill()[0], pser.bfill()[0])
+        self.assert_eq(psser.bfill().sort_index(), pser.bfill().sort_index())
+        self.assert_eq(psser.bfill().sort_index()[0], 
pser.bfill().sort_index()[0])
 
         psser.bfill(inplace=True)
         pser.bfill(inplace=True)
-        self.assert_eq(psser, pser)
-        self.assert_eq(psser[0], pser[0])
-        self.assert_eq(psdf, pdf)
+        self.assert_eq(psser.sort_index(), pser.sort_index())
+        self.assert_eq(psser.sort_index()[0], pser.sort_index()[0])
+        self.assert_eq(psdf.sort_index(), pdf.sort_index())
 
     def test_ffill(self):
         pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 
2, 3, 4, np.nan, 6]})
@@ -215,13 +220,13 @@ class SeriesMissingDataMixin:
         psdf = ps.from_pandas(pdf)
         pser, psser = pdf.x, psdf.x
 
-        self.assert_eq(pser.backfill(), psser.backfill())
+        self.assert_eq(pser.backfill().sort_index(), 
psser.backfill().sort_index())
 
         # Test `inplace=True`
         pser.backfill(inplace=True)
         psser.backfill(inplace=True)
-        self.assert_eq(pser, psser)
-        self.assert_eq(pdf, psdf)
+        self.assert_eq(pser.sort_index(), psser.sort_index())
+        self.assert_eq(pdf.sort_index(), psdf.sort_index())
 
 
 class SeriesMissingDataTests(


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [MINOR][TESTS][DOCS] Make bfill/backfill tests not dependent on natural order

Reply via email to