This is an automated email from the ASF dual-hosted git repository. ueshin pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 32b8512  [SPARK-36762][PYTHON] Fix Series.isin when Series has NaN values
32b8512 is described below

commit 32b8512912c211f7f12e717e7029e89645da9d3b
Author: dgd-contributor <dgd_contribu...@viettel.com.vn>
AuthorDate: Fri Sep 17 17:48:15 2021 -0700

    [SPARK-36762][PYTHON] Fix Series.isin when Series has NaN values

    ### What changes were proposed in this pull request?
    Fix Series.isin when Series has NaN values

    ### Why are the changes needed?
    Fix Series.isin when Series has NaN values

    ```python
    >>> pser = pd.Series([None, 5, None, 3, 2, 1, None, 0, 0])
    >>> psser = ps.from_pandas(pser)
    >>> pser.isin([1, 3, 5, None])
    0    False
    1     True
    2    False
    3     True
    4    False
    5     True
    6    False
    7    False
    8    False
    dtype: bool
    >>> psser.isin([1, 3, 5, None])
    0    None
    1    True
    2    None
    3    True
    4    None
    5    True
    6    None
    7    None
    8    None
    dtype: object
    ```

    ### Does this PR introduce _any_ user-facing change?
    After this PR

    ```python
    >>> pser = pd.Series([None, 5, None, 3, 2, 1, None, 0, 0])
    >>> psser = ps.from_pandas(pser)
    >>> psser.isin([1, 3, 5, None])
    0    False
    1     True
    2    False
    3     True
    4    False
    5     True
    6    False
    7    False
    8    False
    dtype: bool
    ```

    ### How was this patch tested?
    unit tests

    Closes #34005 from dgd-contributor/SPARK-36762_fix_series.isin_when_values_have_NaN.

Authored-by: dgd-contributor <dgd_contribu...@viettel.com.vn> Signed-off-by: Takuya UESHIN <ues...@databricks.com> --- python/pyspark/pandas/base.py | 10 ++++++++-- python/pyspark/pandas/tests/test_series.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py index 533460c..27f3d78 100644 --- a/python/pyspark/pandas/base.py +++ b/python/pyspark/pandas/base.py @@ -27,7 +27,7 @@ import numpy as np import pandas as pd # noqa: F401 from pandas.api.types import is_list_like, CategoricalDtype from pyspark.sql import functions as F, Column, Window -from pyspark.sql.types import LongType +from pyspark.sql.types import LongType, BooleanType from pyspark import pandas as ps # For running doctests and reference resolution in PyCharm. from pyspark.pandas._typing import Axis, Dtype, IndexOpsLike, Label, SeriesOrIndex @@ -873,7 +873,13 @@ class IndexOpsMixin(object, metaclass=ABCMeta): ) values = values.tolist() if isinstance(values, np.ndarray) else list(values) - return self._with_new_scol(self.spark.column.isin([SF.lit(v) for v in values])) + + other = [SF.lit(v) for v in values] + scol = self.spark.column.isin(other) + field = self._internal.data_fields[0].copy( + dtype=np.dtype("bool"), spark_type=BooleanType(), nullable=False + ) + return self._with_new_scol(scol=F.coalesce(scol, F.lit(False)), field=field) def isnull(self: IndexOpsLike) -> IndexOpsLike: """ diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index 1bf8388..09e5d30 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -394,6 +394,23 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): with self.assertRaisesRegex(TypeError, msg): psser.isin(1) + # when Series have NaN + pser = pd.Series(["lama", "cow", None, "lama", "beetle", "lama", "hippo", None], name="a") + psser = ps.from_pandas(pser) + + 
self.assert_eq(psser.isin(["cow", "lama"]), pser.isin(["cow", "lama"])) + + pser = pd.Series([None, 5, None, 3, 2, 1, None, 0, 0], name="a") + psser = ps.from_pandas(pser) + + if LooseVersion(pd.__version__) >= LooseVersion("1.2"): + self.assert_eq(psser.isin([1, 5, 0, None]), pser.isin([1, 5, 0, None])) + else: + expected = pd.Series( + [False, True, False, False, False, True, False, True, True], name="a" + ) + self.assert_eq(psser.isin([1, 5, 0, None]), expected) + def test_drop_duplicates(self): pdf = pd.DataFrame({"animal": ["lama", "cow", "lama", "beetle", "lama", "hippo"]}) psdf = ps.from_pandas(pdf) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org