This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 9894abcca68a [MINOR][TESTS][DOCS] Make bfill/backfill tests not
dependent on natural order
9894abcca68a is described below
commit 9894abcca68aa9339a740c676d1656f24ca3cedc
Author: Yicong-Huang <[email protected]>
AuthorDate: Fri Nov 7 10:28:46 2025 +0800
[MINOR][TESTS][DOCS] Make bfill/backfill tests not dependent on natural
order
### What changes were proposed in this pull request?
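This PR adds explicit `.sort_index()` calls to the pandas API on Spark tests
for `bfill()`, `backfill()`, and `fillna(method="bfill")`/`fillna(method="backfill")`,
and to the `bfill()` doctest examples in `generic.py`, to ensure deterministic
results.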
### Why are the changes needed?
The `bfill()` (backward fill) and `backfill()` operations in the PySpark pandas
API don't guarantee a specific output ordering. Tests that directly compared
the results without sorting could fail intermittently due to differences in the
natural ordering of results between the pandas and PySpark implementations.
By adding `.sort_index()` to both sides of the test assertions, we ensure
tests are order-independent and deterministic.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #52924 from
Yicong-Huang/no-ticket/test/no-dependent-on-natural-order.
Authored-by: Yicong-Huang <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/pandas/generic.py | 4 +--
.../pandas/tests/computation/test_missing_data.py | 31 ++++++++++++++--------
.../pandas/tests/series/test_missing_data.py | 25 ++++++++++-------
3 files changed, 37 insertions(+), 23 deletions(-)
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index dd9cc311f274..14c2ec410589 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -3370,7 +3370,7 @@ class Frame(object, metaclass=ABCMeta):
Propagate non-null values backward.
- >>> psdf.bfill()
+ >>> psdf.bfill().sort_index()
A B C D
0 3.0 2.0 1.0 0
1 3.0 4.0 1.0 1
@@ -3387,7 +3387,7 @@ class Frame(object, metaclass=ABCMeta):
3 1.0
dtype: float64
- >>> psser.bfill()
+ >>> psser.bfill().sort_index()
0 1.0
1 1.0
2 1.0
diff --git a/python/pyspark/pandas/tests/computation/test_missing_data.py
b/python/pyspark/pandas/tests/computation/test_missing_data.py
index c777499aea85..d4a4332eb2c8 100644
--- a/python/pyspark/pandas/tests/computation/test_missing_data.py
+++ b/python/pyspark/pandas/tests/computation/test_missing_data.py
@@ -40,12 +40,12 @@ class FrameMissingDataMixin:
)
psdf = ps.from_pandas(pdf)
- self.assert_eq(pdf.backfill(), psdf.backfill())
+ self.assert_eq(pdf.backfill().sort_index(),
psdf.backfill().sort_index())
# Test `inplace=True`
pdf.backfill(inplace=True)
psdf.backfill(inplace=True)
- self.assert_eq(pdf, psdf)
+ self.assert_eq(pdf.sort_index(), psdf.sort_index())
def _test_dropna(self, pdf, axis):
psdf = ps.from_pandas(pdf)
@@ -191,14 +191,21 @@ class FrameMissingDataMixin:
)
self.assert_eq(pdf.fillna(method="ffill"), psdf.fillna(method="ffill"))
self.assert_eq(pdf.fillna(method="ffill", limit=2),
psdf.fillna(method="ffill", limit=2))
- self.assert_eq(pdf.fillna(method="bfill"), psdf.fillna(method="bfill"))
- self.assert_eq(pdf.fillna(method="bfill", limit=2),
psdf.fillna(method="bfill", limit=2))
+ self.assert_eq(
+ pdf.fillna(method="bfill").sort_index(),
psdf.fillna(method="bfill").sort_index()
+ )
+ self.assert_eq(
+ pdf.fillna(method="bfill", limit=2).sort_index(),
+ psdf.fillna(method="bfill", limit=2).sort_index(),
+ )
pdf = pdf.set_index(["x", "y"])
psdf = ps.from_pandas(pdf)
# check multi index
self.assert_eq(psdf.fillna(-1), pdf.fillna(-1))
- self.assert_eq(pdf.fillna(method="bfill"), psdf.fillna(method="bfill"))
+ self.assert_eq(
+ pdf.fillna(method="bfill").sort_index(),
psdf.fillna(method="bfill").sort_index()
+ )
self.assert_eq(pdf.fillna(method="ffill"), psdf.fillna(method="ffill"))
pser = pdf.z
@@ -253,7 +260,9 @@ class FrameMissingDataMixin:
)
self.assert_eq(pdf.fillna(method="ffill"), psdf.fillna(method="ffill"))
self.assert_eq(pdf.fillna(method="ffill", limit=2),
psdf.fillna(method="ffill", limit=2))
- self.assert_eq(pdf.fillna(method="bfill"), psdf.fillna(method="bfill"))
+ self.assert_eq(
+ pdf.fillna(method="bfill").sort_index(),
psdf.fillna(method="bfill").sort_index()
+ )
self.assert_eq(pdf.fillna(method="bfill", limit=2),
psdf.fillna(method="bfill", limit=2))
self.assert_eq(psdf.fillna({"x": -1}), pdf.fillna({"x": -1}))
@@ -422,8 +431,8 @@ class FrameMissingDataMixin:
)
psdf = ps.from_pandas(pdf)
- self.assert_eq(psdf.bfill(), pdf.bfill())
- self.assert_eq(psdf.bfill(limit=1), pdf.bfill(limit=1))
+ self.assert_eq(psdf.bfill().sort_index(), pdf.bfill().sort_index())
+ self.assert_eq(psdf.bfill(limit=1).sort_index(),
pdf.bfill(limit=1).sort_index())
pser = pdf.x
psser = psdf.x
@@ -431,9 +440,9 @@ class FrameMissingDataMixin:
psdf.bfill(inplace=True)
pdf.bfill(inplace=True)
- self.assert_eq(psdf, pdf)
- self.assert_eq(psser, pser)
- self.assert_eq(psser[idx[0]], pser[idx[0]])
+ self.assert_eq(psdf.sort_index(), pdf.sort_index())
+ self.assert_eq(psser.sort_index(), pser.sort_index())
+ self.assert_eq(psser.sort_index()[idx[0]], pser.sort_index()[idx[0]])
def test_pad(self):
pdf = pd.DataFrame(
diff --git a/python/pyspark/pandas/tests/series/test_missing_data.py
b/python/pyspark/pandas/tests/series/test_missing_data.py
index 7336ef23cc61..0457d35a8920 100644
--- a/python/pyspark/pandas/tests/series/test_missing_data.py
+++ b/python/pyspark/pandas/tests/series/test_missing_data.py
@@ -61,8 +61,13 @@ class SeriesMissingDataMixin:
self.assert_eq(psser.fillna(0), pser.fillna(0))
self.assert_eq(psser.fillna(method="ffill"),
pser.fillna(method="ffill"))
- self.assert_eq(psser.fillna(method="bfill"),
pser.fillna(method="bfill"))
- self.assert_eq(psser.fillna(method="backfill"),
pser.fillna(method="backfill"))
+ self.assert_eq(
+ psser.fillna(method="bfill").sort_index(),
pser.fillna(method="bfill").sort_index()
+ )
+ self.assert_eq(
+ psser.fillna(method="backfill").sort_index(),
+ pser.fillna(method="backfill").sort_index(),
+ )
# inplace fillna on non-nullable column
pdf = pd.DataFrame({"a": [1, 2, None], "b": [1, 2, 3]})
@@ -172,14 +177,14 @@ class SeriesMissingDataMixin:
pser = pdf.x
psser = psdf.x
- self.assert_eq(psser.bfill(), pser.bfill())
- self.assert_eq(psser.bfill()[0], pser.bfill()[0])
+ self.assert_eq(psser.bfill().sort_index(), pser.bfill().sort_index())
+ self.assert_eq(psser.bfill().sort_index()[0],
pser.bfill().sort_index()[0])
psser.bfill(inplace=True)
pser.bfill(inplace=True)
- self.assert_eq(psser, pser)
- self.assert_eq(psser[0], pser[0])
- self.assert_eq(psdf, pdf)
+ self.assert_eq(psser.sort_index(), pser.sort_index())
+ self.assert_eq(psser.sort_index()[0], pser.sort_index()[0])
+ self.assert_eq(psdf.sort_index(), pdf.sort_index())
def test_ffill(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan,
2, 3, 4, np.nan, 6]})
@@ -215,13 +220,13 @@ class SeriesMissingDataMixin:
psdf = ps.from_pandas(pdf)
pser, psser = pdf.x, psdf.x
- self.assert_eq(pser.backfill(), psser.backfill())
+ self.assert_eq(pser.backfill().sort_index(),
psser.backfill().sort_index())
# Test `inplace=True`
pser.backfill(inplace=True)
psser.backfill(inplace=True)
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
+ self.assert_eq(pser.sort_index(), psser.sort_index())
+ self.assert_eq(pdf.sort_index(), psdf.sort_index())
class SeriesMissingDataTests(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]