This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new d2b60ff51fa [SPARK-43567][PS] Support `use_na_sentinel` for `factorize` d2b60ff51fa is described below commit d2b60ff51fabdb38899e649aa2e700112534d79c Author: itholic <haejoon....@databricks.com> AuthorDate: Tue Aug 8 16:16:11 2023 +0900 [SPARK-43567][PS] Support `use_na_sentinel` for `factorize` ### What changes were proposed in this pull request? This PR proposes to support `use_na_sentinel` for `factorize`. ### Why are the changes needed? To match the behavior with [pandas 2](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html) ### Does this PR introduce _any_ user-facing change? Yes, the `na_sentinel` is removed in favor of `use_na_sentinel`. ### How was this patch tested? Enabling the existing tests. Closes #42270 from itholic/pandas_use_na_sentinel. Authored-by: itholic <haejoon....@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../source/migration_guide/pyspark_upgrade.rst | 1 + python/pyspark/pandas/base.py | 39 ++++++++-------------- .../tests/connect/series/test_parity_compute.py | 4 +++ .../pyspark/pandas/tests/indexes/test_category.py | 8 ++--- python/pyspark/pandas/tests/series/test_compute.py | 20 +++++------ 5 files changed, 29 insertions(+), 43 deletions(-) diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst index 7a691ee2645..d26f1cbbe0d 100644 --- a/python/docs/source/migration_guide/pyspark_upgrade.rst +++ b/python/docs/source/migration_guide/pyspark_upgrade.rst @@ -29,6 +29,7 @@ Upgrading from PySpark 3.5 to 4.0 * In Spark 4.0, ``Series.append`` has been removed from pandas API on Spark, use ``ps.concat`` instead. * In Spark 4.0, ``DataFrame.mad`` has been removed from pandas API on Spark. * In Spark 4.0, ``Series.mad`` has been removed from pandas API on Spark. 
+* In Spark 4.0, ``na_sentinel`` parameter from ``Index.factorize`` and ``Series.factorize`` has been removed from pandas API on Spark, use ``use_na_sentinel`` instead. Upgrading from PySpark 3.3 to 3.4 diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py index 2de260e6e93..0685af76987 100644 --- a/python/pyspark/pandas/base.py +++ b/python/pyspark/pandas/base.py @@ -1614,7 +1614,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): return cast(IndexOpsLike, self._psdf.iloc[indices].index) def factorize( - self: IndexOpsLike, sort: bool = True, na_sentinel: Optional[int] = -1 + self: IndexOpsLike, sort: bool = True, use_na_sentinel: bool = True ) -> Tuple[IndexOpsLike, pd.Index]: """ Encode the object as an enumerated type or categorical variable. @@ -1625,11 +1625,11 @@ class IndexOpsMixin(object, metaclass=ABCMeta): Parameters ---------- sort : bool, default True - na_sentinel : int or None, default -1 - Value to mark "not found". If None, will not drop the NaN - from the uniques of the values. - - .. deprecated:: 3.4.0 + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NaN values, effectively assigning them + a distinct category. If False, NaN values will be encoded as non-negative integers, + treating them as unique categories in the encoding process and retaining them in the + set of unique categories in the data. 
Returns ------- @@ -1658,7 +1658,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): >>> uniques Index(['a', 'b', 'c'], dtype='object') - >>> codes, uniques = psser.factorize(na_sentinel=None) + >>> codes, uniques = psser.factorize(use_na_sentinel=False) >>> codes 0 1 1 3 @@ -1669,17 +1669,6 @@ class IndexOpsMixin(object, metaclass=ABCMeta): >>> uniques Index(['a', 'b', 'c', None], dtype='object') - >>> codes, uniques = psser.factorize(na_sentinel=-2) - >>> codes - 0 1 - 1 -2 - 2 0 - 3 2 - 4 1 - dtype: int32 - >>> uniques - Index(['a', 'b', 'c'], dtype='object') - For Index: >>> psidx = ps.Index(['b', None, 'a', 'c', 'b']) @@ -1691,8 +1680,8 @@ class IndexOpsMixin(object, metaclass=ABCMeta): """ from pyspark.pandas.series import first_series - assert (na_sentinel is None) or isinstance(na_sentinel, int) assert sort is True + use_na_sentinel = -1 if use_na_sentinel else False # type: ignore[assignment] warnings.warn( "Argument `na_sentinel` will be removed in 4.0.0.", @@ -1716,7 +1705,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): scol = map_scol[self.spark.column] codes, uniques = self._with_new_scol( scol.alias(self._internal.data_spark_column_names[0]) - ).factorize(na_sentinel=na_sentinel) + ).factorize(use_na_sentinel=use_na_sentinel) return codes, uniques.astype(self.dtype) uniq_sdf = self._internal.spark_frame.select(self.spark.column).distinct() @@ -1743,13 +1732,13 @@ class IndexOpsMixin(object, metaclass=ABCMeta): # Constructs `unique_to_code` mapping non-na unique to code unique_to_code = {} - if na_sentinel is not None: - na_sentinel_code = na_sentinel + if use_na_sentinel: + na_sentinel_code = use_na_sentinel code = 0 for unique in uniques_list: if pd.isna(unique): - if na_sentinel is None: - na_sentinel_code = code + if not use_na_sentinel: + na_sentinel_code = code # type: ignore[assignment] else: unique_to_code[unique] = code code += 1 @@ -1767,7 +1756,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta): codes = 
self._with_new_scol(new_scol.alias(self._internal.data_spark_column_names[0])) - if na_sentinel is not None: + if use_na_sentinel: # Drops the NaN from the uniques of the values uniques_list = [x for x in uniques_list if not pd.isna(x)] diff --git a/python/pyspark/pandas/tests/connect/series/test_parity_compute.py b/python/pyspark/pandas/tests/connect/series/test_parity_compute.py index 8876fcb1398..31916f12b4e 100644 --- a/python/pyspark/pandas/tests/connect/series/test_parity_compute.py +++ b/python/pyspark/pandas/tests/connect/series/test_parity_compute.py @@ -24,6 +24,10 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class SeriesParityComputeTests(SeriesComputeMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): pass + @unittest.skip("TODO(SPARK-43620): Support `Column` for SparkConnectColumn.__getitem__.") + def test_factorize(self): + super().test_factorize() + if __name__ == "__main__": from pyspark.pandas.tests.connect.series.test_parity_compute import * # noqa: F401 diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py index ffffae828c4..6aa92b7e1e3 100644 --- a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -210,10 +210,6 @@ class CategoricalIndexTestsMixin: self.assert_eq(pscidx.astype(str), pcidx.astype(str)) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43567): Enable CategoricalIndexTests.test_factorize for pandas 2.0.0.", - ) def test_factorize(self): pidx = pd.CategoricalIndex([1, 2, 3, None]) psidx = ps.from_pandas(pidx) @@ -224,8 +220,8 @@ class CategoricalIndexTestsMixin: self.assert_eq(kcodes.tolist(), pcodes.tolist()) self.assert_eq(kuniques, puniques) - pcodes, puniques = pidx.factorize(na_sentinel=-2) - kcodes, kuniques = psidx.factorize(na_sentinel=-2) + pcodes, puniques = pidx.factorize(use_na_sentinel=-2) + kcodes, kuniques = 
psidx.factorize(use_na_sentinel=-2) self.assert_eq(kcodes.tolist(), pcodes.tolist()) self.assert_eq(kuniques, puniques) diff --git a/python/pyspark/pandas/tests/series/test_compute.py b/python/pyspark/pandas/tests/series/test_compute.py index 155649179e6..784bf29e1a2 100644 --- a/python/pyspark/pandas/tests/series/test_compute.py +++ b/python/pyspark/pandas/tests/series/test_compute.py @@ -407,10 +407,6 @@ class SeriesComputeMixin: self.assert_eq(abs(psser), abs(pser)) self.assert_eq(np.abs(psser), np.abs(pser)) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43550): Enable SeriesTests.test_factorize for pandas 2.0.0.", - ) def test_factorize(self): pser = pd.Series(["a", "b", "a", "b"]) psser = ps.from_pandas(pser) @@ -492,27 +488,27 @@ class SeriesComputeMixin: pser = pd.Series(["a", "b", "a", np.nan, None]) psser = ps.from_pandas(pser) - pcodes, puniques = pser.factorize(sort=True, na_sentinel=-2) - kcodes, kuniques = psser.factorize(na_sentinel=-2) + pcodes, puniques = pser.factorize(sort=True, use_na_sentinel=-2) + kcodes, kuniques = psser.factorize(use_na_sentinel=-2) self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) - pcodes, puniques = pser.factorize(sort=True, na_sentinel=2) - kcodes, kuniques = psser.factorize(na_sentinel=2) + pcodes, puniques = pser.factorize(sort=True, use_na_sentinel=2) + kcodes, kuniques = psser.factorize(use_na_sentinel=2) self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) if not pd_below_1_1_2: - pcodes, puniques = pser.factorize(sort=True, na_sentinel=None) - kcodes, kuniques = psser.factorize(na_sentinel=None) + pcodes, puniques = pser.factorize(sort=True, use_na_sentinel=None) + kcodes, kuniques = psser.factorize(use_na_sentinel=None) self.assert_eq(pcodes.tolist(), kcodes.to_list()) # puniques is Index(['a', 'b', nan], dtype='object') self.assert_eq(ps.Index(["a", "b", None]), kuniques) psser = ps.Series([1, 2, np.nan, 
4, 5]) # Arrow takes np.nan as null psser.loc[3] = np.nan # Spark takes np.nan as NaN - kcodes, kuniques = psser.factorize(na_sentinel=None) - pcodes, puniques = psser._to_pandas().factorize(sort=True, na_sentinel=None) + kcodes, kuniques = psser.factorize(use_na_sentinel=None) + pcodes, puniques = psser._to_pandas().factorize(sort=True, use_na_sentinel=None) self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org