This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new d2b60ff51fa [SPARK-43567][PS] Support `use_na_sentinel` for `factorize` d2b60ff51fa is described below commit d2b60ff51fabdb38899e649aa2e700112534d79c Author: itholic <haejoon....@databricks.com> AuthorDate: Tue Aug 8 16:16:11 2023 +0900 [SPARK-43567][PS] Support `use_na_sentinel` for `factorize` ### What changes were proposed in this pull request? This PR proposes to support `use_na_sentinel` for `factorize`. ### Why are the changes needed? To match the behavior with [pandas 2](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html) ### Does this PR introduce _any_ user-facing change? Yes, the `na_sentinel` is removed in favor of `use_na_sentinel`. ### How was this patch tested? Enabling the existing tests. Closes #42270 from itholic/pandas_use_na_sentinel. Authored-by: itholic <haejoon....@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../source/migration_guide/pyspark_upgrade.rst | 1 + python/pyspark/pandas/base.py | 39 ++++++++-------------- .../tests/connect/series/test_parity_compute.py | 4 +++ .../pyspark/pandas/tests/indexes/test_category.py | 8 ++--- python/pyspark/pandas/tests/series/test_compute.py | 20 +++++------ 5 files changed, 29 insertions(+), 43 deletions(-) diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst index 7a691ee2645..d26f1cbbe0d 100644 --- a/python/docs/source/migration_guide/pyspark_upgrade.rst +++ b/python/docs/source/migration_guide/pyspark_upgrade.rst @@ -29,6 +29,7 @@ Upgrading from PySpark 3.5 to 4.0 * In Spark 4.0, ``Series.append`` has been removed from pandas API on Spark, use ``ps.concat`` instead. * In Spark 4.0, ``DataFrame.mad`` has been removed from pandas API on Spark. * In Spark 4.0, ``Series.mad`` has been removed from pandas API on Spark. 
+* In Spark 4.0, ``na_sentinel`` parameter from ``Index.factorize`` and ``Series.factorize`` has been removed from pandas API on Spark, use ``use_na_sentinel`` instead. Upgrading from PySpark 3.3 to 3.4 diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py index 2de260e6e93..0685af76987 100644 --- a/python/pyspark/pandas/base.py +++ b/python/pyspark/pandas/base.py @@ -1614,7 +1614,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): return cast(IndexOpsLike, self._psdf.iloc[indices].index) def factorize( - self: IndexOpsLike, sort: bool = True, na_sentinel: Optional[int] = -1 + self: IndexOpsLike, sort: bool = True, use_na_sentinel: bool = True ) -> Tuple[IndexOpsLike, pd.Index]: """ Encode the object as an enumerated type or categorical variable. @@ -1625,11 +1625,11 @@ class IndexOpsMixin(object, metaclass=ABCMeta): Parameters ---------- sort : bool, default True - na_sentinel : int or None, default -1 - Value to mark "not found". If None, will not drop the NaN - from the uniques of the values. - - .. deprecated:: 3.4.0 + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NaN values, effectively assigning them + a distinct category. If False, NaN values will be encoded as non-negative integers, + treating them as unique categories in the encoding process and retaining them in the + set of unique categories in the data. 
Returns ------- @@ -1658,7 +1658,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): >>> uniques Index(['a', 'b', 'c'], dtype='object') - >>> codes, uniques = psser.factorize(na_sentinel=None) + >>> codes, uniques = psser.factorize(use_na_sentinel=False) >>> codes 0 1 1 3 @@ -1669,17 +1669,6 @@ class IndexOpsMixin(object, metaclass=ABCMeta): >>> uniques Index(['a', 'b', 'c', None], dtype='object') - >>> codes, uniques = psser.factorize(na_sentinel=-2) - >>> codes - 0 1 - 1 -2 - 2 0 - 3 2 - 4 1 - dtype: int32 - >>> uniques - Index(['a', 'b', 'c'], dtype='object') - For Index: >>> psidx = ps.Index(['b', None, 'a', 'c', 'b']) @@ -1691,8 +1680,8 @@ class IndexOpsMixin(object, metaclass=ABCMeta): """ from pyspark.pandas.series import first_series - assert (na_sentinel is None) or isinstance(na_sentinel, int) assert sort is True + use_na_sentinel = -1 if use_na_sentinel else False # type: ignore[assignment] warnings.warn( "Argument `na_sentinel` will be removed in 4.0.0.", @@ -1716,7 +1705,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): scol = map_scol[self.spark.column] codes, uniques = self._with_new_scol( scol.alias(self._internal.data_spark_column_names[0]) - ).factorize(na_sentinel=na_sentinel) + ).factorize(use_na_sentinel=use_na_sentinel) return codes, uniques.astype(self.dtype) uniq_sdf = self._internal.spark_frame.select(self.spark.column).distinct() @@ -1743,13 +1732,13 @@ class IndexOpsMixin(object, metaclass=ABCMeta): # Constructs `unique_to_code` mapping non-na unique to code unique_to_code = {} - if na_sentinel is not None: - na_sentinel_code = na_sentinel + if use_na_sentinel: + na_sentinel_code = use_na_sentinel code = 0 for unique in uniques_list: if pd.isna(unique): - if na_sentinel is None: - na_sentinel_code = code + if not use_na_sentinel: + na_sentinel_code = code # type: ignore[assignment] else: unique_to_code[unique] = code code += 1 @@ -1767,7 +1756,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): codes = 
self._with_new_scol(new_scol.alias(self._internal.data_spark_column_names[0])) - if na_sentinel is not None: + if use_na_sentinel: # Drops the NaN from the uniques of the values uniques_list = [x for x in uniques_list if not pd.isna(x)] diff --git a/python/pyspark/pandas/tests/connect/series/test_parity_compute.py b/python/pyspark/pandas/tests/connect/series/test_parity_compute.py index 8876fcb1398..31916f12b4e 100644 --- a/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +++ b/python/pyspark/pandas/tests/connect/series/test_parity_compute.py @@ -24,6 +24,10 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class SeriesParityComputeTests(SeriesComputeMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): pass + @unittest.skip("TODO(SPARK-43620): Support `Column` for SparkConnectColumn.__getitem__.") + def test_factorize(self): + super().test_factorize() + if __name__ == "__main__": from pyspark.pandas.tests.connect.series.test_parity_compute import * # noqa: F401 diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py index ffffae828c4..6aa92b7e1e3 100644 --- a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -210,10 +210,6 @@ class CategoricalIndexTestsMixin: self.assert_eq(pscidx.astype(str), pcidx.astype(str)) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43567): Enable CategoricalIndexTests.test_factorize for pandas 2.0.0.", - ) def test_factorize(self): pidx = pd.CategoricalIndex([1, 2, 3, None]) psidx = ps.from_pandas(pidx) @@ -224,8 +220,8 @@ class CategoricalIndexTestsMixin: self.assert_eq(kcodes.tolist(), pcodes.tolist()) self.assert_eq(kuniques, puniques) - pcodes, puniques = pidx.factorize(na_sentinel=-2) - kcodes, kuniques = psidx.factorize(na_sentinel=-2) + pcodes, puniques = pidx.factorize(use_na_sentinel=-2) + kcodes, kuniques = 
psidx.factorize(use_na_sentinel=-2) self.assert_eq(kcodes.tolist(), pcodes.tolist()) self.assert_eq(kuniques, puniques) diff --git a/python/pyspark/pandas/tests/series/test_compute.py b/python/pyspark/pandas/tests/series/test_compute.py index 155649179e6..784bf29e1a2 100644 --- a/python/pyspark/pandas/tests/series/test_compute.py +++ b/python/pyspark/pandas/tests/series/test_compute.py @@ -407,10 +407,6 @@ class SeriesComputeMixin: self.assert_eq(abs(psser), abs(pser)) self.assert_eq(np.abs(psser), np.abs(pser)) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43550): Enable SeriesTests.test_factorize for pandas 2.0.0.", - ) def test_factorize(self): pser = pd.Series(["a", "b", "a", "b"]) psser = ps.from_pandas(pser) @@ -492,27 +488,27 @@ class SeriesComputeMixin: pser = pd.Series(["a", "b", "a", np.nan, None]) psser = ps.from_pandas(pser) - pcodes, puniques = pser.factorize(sort=True, na_sentinel=-2) - kcodes, kuniques = psser.factorize(na_sentinel=-2) + pcodes, puniques = pser.factorize(sort=True, use_na_sentinel=-2) + kcodes, kuniques = psser.factorize(use_na_sentinel=-2) self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) - pcodes, puniques = pser.factorize(sort=True, na_sentinel=2) - kcodes, kuniques = psser.factorize(na_sentinel=2) + pcodes, puniques = pser.factorize(sort=True, use_na_sentinel=2) + kcodes, kuniques = psser.factorize(use_na_sentinel=2) self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) if not pd_below_1_1_2: - pcodes, puniques = pser.factorize(sort=True, na_sentinel=None) - kcodes, kuniques = psser.factorize(na_sentinel=None) + pcodes, puniques = pser.factorize(sort=True, use_na_sentinel=None) + kcodes, kuniques = psser.factorize(use_na_sentinel=None) self.assert_eq(pcodes.tolist(), kcodes.to_list()) # puniques is Index(['a', 'b', nan], dtype='object') self.assert_eq(ps.Index(["a", "b", None]), kuniques) psser = ps.Series([1, 2, np.nan, 
4, 5]) # Arrow takes np.nan as null psser.loc[3] = np.nan # Spark takes np.nan as NaN - kcodes, kuniques = psser.factorize(na_sentinel=None) - pcodes, puniques = psser._to_pandas().factorize(sort=True, na_sentinel=None) + kcodes, kuniques = psser.factorize(use_na_sentinel=None) + pcodes, puniques = psser._to_pandas().factorize(sort=True, use_na_sentinel=None) self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org