This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 2d41e46338ec [SPARK-55283][PYTHON][PS][TESTS] Add a new argument ignore_null to assert_eq
2d41e46338ec is described below
commit 2d41e46338ecfb0447953e4e31c1100ebdc420fc
Author: Tian Gao <[email protected]>
AuthorDate: Mon Feb 2 06:53:29 2026 +0900
[SPARK-55283][PYTHON][PS][TESTS] Add a new argument ignore_null to assert_eq
### What changes were proposed in this pull request?
A new argument `ignore_null` is added to `assert_eq`. `ignore_null` is essentially an alias for `almost=True, atol=0, rtol=0`, so the comparison goes through our customized logic for DataFrame/Series/Index objects, which does not care about null equality.
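For illustration, a pandas-on-Spark test could use the new argument as in the sketch below (the test class name and data are illustrative, not part of this commit, and it assumes the usual PySpark test environment):

```python
# Minimal sketch of using ignore_null; the class name and data are illustrative only.
import numpy as np
import pandas as pd

from pyspark.testing.pandasutils import PandasOnSparkTestCase


class IgnoreNullExampleTest(PandasOnSparkTestCase):
    def test_null_like_values_compare_equal(self):
        left = pd.Series([1.0, None], dtype=object)
        right = pd.Series([1.0, np.nan], dtype=object)
        # ignore_null=True routes the comparison through the "almost" path
        # with atol=0 and rtol=0, which treats null-like values as equal.
        self.assert_eq(left, right, ignore_null=True)
```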
### Why are the changes needed?
pandas 3.0 changed its testing utilities: `assert_frame_equal` and `assert_series_equal` now raise an error on differing null-like values (for example, `None` vs `np.nan`). We need this argument to make our tests pass.
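Roughly, the behavior change that motivates this looks like the sketch below (the exact outcome depends on the installed pandas version and the dtype involved, so this is only an illustration):

```python
# Illustration of the motivating pandas behavior change; the exact outcome
# depends on the installed pandas version and the dtype involved.
import numpy as np
import pandas as pd
from pandas.testing import assert_series_equal

left = pd.Series([1.0, None], dtype=object)
right = pd.Series([1.0, np.nan], dtype=object)

try:
    assert_series_equal(left, right)
    print("None and np.nan compared as equal (pre-3.0 behavior)")
except AssertionError:
    print("None and np.nan compared as different (pandas 3.0 behavior)")
```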
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Locally it works for a fixed test; the test changes are not included in this PR.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54066 from gaogaotiantian/add-ignore-null.
Authored-by: Tian Gao <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/testing/pandasutils.py | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/python/pyspark/testing/pandasutils.py b/python/pyspark/testing/pandasutils.py
index 11b741947604..9a019d88f509 100644
--- a/python/pyspark/testing/pandasutils.py
+++ b/python/pyspark/testing/pandasutils.py
@@ -371,6 +371,7 @@ class PandasOnSparkTestUtils:
         rtol: float = 1e-5,
         atol: float = 1e-8,
         check_row_order: bool = True,
+        ignore_null: bool = False,
     ):
         """
         Asserts if two arbitrary objects are equal or not. If given objects are
@@ -389,10 +390,21 @@ class PandasOnSparkTestUtils:
                      float values in actual and expected. Set to 1e-8 by default.
         :param check_row_order: A flag indicating whether the order of rows should be considered
                                 in the comparison. If set to False, row order will be ignored.
+        :param ignore_null: if this is enabled, the comparison will ignore null values.
         """
         import pandas as pd
         from pandas.api.types import is_list_like
 
+        if ignore_null:
+            # We use _assert_pandas_almost_equal with atol=0 and rtol=0 to check if the
+            # values are equal because null values are properly handled by it
+            if not almost:
+                # It's possible to set almost=True and ignore_null=True. In that case,
+                # honor atol and rtol settings. ignore_null=True is implied by almost=True.
+                almost = True
+                rtol = 0
+                atol = 0
+
         # for pandas-on-Spark DataFrames, allow choice to ignore row order
         if isinstance(left, (ps.DataFrame, ps.Series, ps.Index)):
             if left is None and right is None:
@@ -453,7 +465,9 @@ class PandasOnSparkTestUtils:
         elif is_list_like(lobj) and is_list_like(robj):
             self.assertTrue(len(left) == len(right))
             for litem, ritem in zip(left, right):
-                self.assert_eq(litem, ritem, check_exact=check_exact, almost=almost)
+                self.assert_eq(
+                    litem, ritem, check_exact=check_exact, almost=almost, ignore_null=ignore_null
+                )
         elif (lobj is not None and pd.isna(lobj)) and (robj is not None and pd.isna(robj)):
             pass
         else:
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]