This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new 37e5a10 [SPARK-36248][PYTHON] Add rename_categories to
CategoricalAccessor and CategoricalIndex
37e5a10 is described below
commit 37e5a1047780171ec9d496d9be4f94a5d3149bad
Author: Xinrong Meng <[email protected]>
AuthorDate: Fri Jul 23 12:26:24 2021 +0900
[SPARK-36248][PYTHON] Add rename_categories to CategoricalAccessor and
CategoricalIndex
### What changes were proposed in this pull request?
Add rename_categories to CategoricalAccessor and CategoricalIndex.
### Why are the changes needed?
rename_categories is supported in pandas CategoricalAccessor and
CategoricalIndex. We ought to follow pandas.
### Does this PR introduce _any_ user-facing change?
Yes. `rename_categories` is supported in pandas API on Spark now.
```py
# CategoricalIndex
>>> psser = ps.CategoricalIndex(["a", "a", "b"])
>>> psser.rename_categories([0, 1])
CategoricalIndex([0, 0, 1], categories=[0, 1], ordered=False,
dtype='category')
>>> psser.rename_categories({'a': 'A', 'c': 'C'})
CategoricalIndex(['A', 'A', 'b'], categories=['A', 'b'], ordered=False,
dtype='category')
>>> psser.rename_categories(lambda x: x.upper())
CategoricalIndex(['A', 'A', 'B'], categories=['A', 'B'], ordered=False,
dtype='category')
# CategoricalAccessor
>>> s = ps.Series(["a", "a", "b"], dtype="category")
>>> s.cat.rename_categories([0, 1])
0 0
1 0
2 1
dtype: category
Categories (2, int64): [0, 1]
>>> s.cat.rename_categories({'a': 'A', 'c': 'C'})
0 A
1 A
2 b
dtype: category
Categories (2, object): ['A', 'b']
>>> s.cat.rename_categories(lambda x: x.upper())
0 A
1 A
2 B
dtype: category
Categories (2, object): ['A', 'B']
```
### How was this patch tested?
Unit tests.
Closes #33471 from xinrong-databricks/category_rename_categories.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit 8b3d84bb7eeb798337f63c266686f2efeeaf9ea3)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../source/reference/pyspark.pandas/indexing.rst | 1 +
.../source/reference/pyspark.pandas/series.rst | 1 +
python/pyspark/pandas/categorical.py | 110 ++++++++++++++++++++-
python/pyspark/pandas/indexes/category.py | 72 +++++++++++++-
python/pyspark/pandas/missing/indexes.py | 1 -
.../pyspark/pandas/tests/indexes/test_category.py | 25 +++++
python/pyspark/pandas/tests/test_categorical.py | 51 ++++++++++
7 files changed, 255 insertions(+), 6 deletions(-)
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst
b/python/docs/source/reference/pyspark.pandas/indexing.rst
index 454d486..c2eae08 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -178,6 +178,7 @@ Categorical components
CategoricalIndex.remove_categories
CategoricalIndex.as_ordered
CategoricalIndex.as_unordered
+ CategoricalIndex.rename_categories
.. _api.multiindex:
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst
b/python/docs/source/reference/pyspark.pandas/series.rst
index 07944f2..877902c 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -403,6 +403,7 @@ the ``Series.cat`` accessor.
Series.cat.remove_categories
Series.cat.as_ordered
Series.cat.as_unordered
+ Series.cat.rename_categories
.. _api.series.plot:
diff --git a/python/pyspark/pandas/categorical.py
b/python/pyspark/pandas/categorical.py
index 529041b..c7f0923 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -14,10 +14,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-from typing import Any, List, Optional, Union, TYPE_CHECKING, cast
+from typing import Any, Callable, List, Optional, Union, TYPE_CHECKING, cast
import pandas as pd
-from pandas.api.types import CategoricalDtype, is_list_like
+from pandas.api.types import CategoricalDtype, is_dict_like, is_list_like
from pyspark.pandas.internal import InternalField
from pyspark.sql.types import StructField
@@ -442,8 +442,110 @@ class CategoricalAccessor(object):
def remove_unused_categories(self) -> "ps.Series":
raise NotImplementedError()
- def rename_categories(self, new_categories: pd.Index, inplace: bool =
False) -> "ps.Series":
- raise NotImplementedError()
+ def rename_categories(
+ self, new_categories: Union[list, dict, Callable], inplace: bool =
False
+ ) -> Optional["ps.Series"]:
+ """
+ Rename categories.
+
+ Parameters
+ ----------
+ new_categories : list-like, dict-like or callable
+
+ New categories which will replace old categories.
+
+ * list-like: all items must be unique and the number of items in
+ the new categories must match the existing number of categories.
+
+ * dict-like: specifies a mapping from
+ old categories to new. Categories not contained in the mapping
+ are passed through and extra categories in the mapping are
+ ignored.
+
+ * callable : a callable that is called on all items in the old
+ categories and whose return values comprise the new categories.
+
+ inplace : bool, default False
+ Whether or not to rename the categories inplace or return a copy of
+ this categorical with renamed categories.
+
+ Returns
+ -------
+ cat : Series or None
+ Categorical with renamed categories or None if ``inplace=True``.
+
+ Raises
+ ------
+ ValueError
+ If new categories are list-like and do not have the same number of
+ items as the current categories or do not validate as categories
+
+ See Also
+ --------
+ reorder_categories : Reorder categories.
+ add_categories : Add new categories.
+ remove_categories : Remove the specified categories.
+ remove_unused_categories : Remove categories which are not used.
+ set_categories : Set the categories to the specified ones.
+
+ Examples
+ --------
+ >>> s = ps.Series(["a", "a", "b"], dtype="category")
+ >>> s.cat.rename_categories([0, 1]) # doctest: +SKIP
+ 0 0
+ 1 0
+ 2 1
+ dtype: category
+ Categories (2, int64): [0, 1]
+
+ For dict-like ``new_categories``, extra keys are ignored and
+ categories not in the dictionary are passed through
+
+ >>> s.cat.rename_categories({'a': 'A', 'c': 'C'}) # doctest: +SKIP
+ 0 A
+ 1 A
+ 2 b
+ dtype: category
+ Categories (2, object): ['A', 'b']
+
+ You may also provide a callable to create the new categories
+
+ >>> s.cat.rename_categories(lambda x: x.upper()) # doctest: +SKIP
+ 0 A
+ 1 A
+ 2 B
+ dtype: category
+ Categories (2, object): ['A', 'B']
+ """
+ from pyspark.pandas.frame import DataFrame
+
+ if is_dict_like(new_categories):
+ categories = [cast(dict, new_categories).get(item, item) for item
in self.categories]
+ elif callable(new_categories):
+ categories = [new_categories(item) for item in self.categories]
+ elif is_list_like(new_categories):
+ if len(self.categories) != len(new_categories):
+ raise ValueError(
+ "new categories need to have the same number of items as
the old categories!"
+ )
+ categories = cast(list, new_categories)
+ else:
+ raise TypeError("new_categories must be list-like, dict-like or
callable.")
+
+ internal = self._data._psdf._internal.with_new_spark_column(
+ self._data._column_label,
+ self._data.spark.column,
+ field=self._data._internal.data_fields[0].copy(
+ dtype=CategoricalDtype(categories=categories,
ordered=self.ordered)
+ ),
+ )
+
+ if inplace:
+ self._data._psdf._update_internal_frame(internal)
+ return None
+ else:
+ psser = DataFrame(internal)._psser_for(self._data._column_label)
+ return psser._with_new_scol(psser.spark.column,
field=psser._internal.data_fields[0])
def reorder_categories(
self, new_categories: pd.Index, ordered: bool = None, inplace: bool =
False
diff --git a/python/pyspark/pandas/indexes/category.py
b/python/pyspark/pandas/indexes/category.py
index 28b5027..a745b25 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -15,7 +15,7 @@
# limitations under the License.
#
from functools import partial
-from typing import Any, List, Optional, Union, cast, no_type_check
+from typing import Any, Callable, List, Optional, Union, cast, no_type_check
import pandas as pd
from pandas.api.types import is_hashable, CategoricalDtype
@@ -364,6 +364,76 @@ class CategoricalIndex(Index):
return partial(property_or_func, self)
raise AttributeError("'CategoricalIndex' object has no attribute
'{}'".format(item))
+ def rename_categories(
+ self, new_categories: Union[list, dict, Callable], inplace: bool =
False
+ ) -> Optional["CategoricalIndex"]:
+ """
+ Rename categories.
+
+ Parameters
+ ----------
+ new_categories : list-like, dict-like or callable
+
+ New categories which will replace old categories.
+
+ * list-like: all items must be unique and the number of items in
+ the new categories must match the existing number of categories.
+
+ * dict-like: specifies a mapping from
+ old categories to new. Categories not contained in the mapping
+ are passed through and extra categories in the mapping are
+ ignored.
+
+ * callable : a callable that is called on all items in the old
+ categories and whose return values comprise the new categories.
+
+ inplace : bool, default False
+ Whether or not to rename the categories inplace or return a copy of
+ this categorical with renamed categories.
+
+ Returns
+ -------
+ cat : CategoricalIndex or None
+ Categorical with renamed categories or None if ``inplace=True``.
+
+ Raises
+ ------
+ ValueError
+ If new categories are list-like and do not have the same number of
+ items as the current categories or do not validate as categories
+
+ See Also
+ --------
+ reorder_categories : Reorder categories.
+ add_categories : Add new categories.
+ remove_categories : Remove the specified categories.
+ remove_unused_categories : Remove categories which are not used.
+ set_categories : Set the categories to the specified ones.
+
+ Examples
+ --------
+ >>> idx = ps.CategoricalIndex(["a", "a", "b"])
+ >>> idx.rename_categories([0, 1])
+ CategoricalIndex([0, 0, 1], categories=[0, 1], ordered=False,
dtype='category')
+
+ For dict-like ``new_categories``, extra keys are ignored and
+ categories not in the dictionary are passed through
+
+ >>> idx.rename_categories({'a': 'A', 'c': 'C'})
+ CategoricalIndex(['A', 'A', 'b'], categories=['A', 'b'],
ordered=False, dtype='category')
+
+ You may also provide a callable to create the new categories
+
+ >>> idx.rename_categories(lambda x: x.upper())
+ CategoricalIndex(['A', 'A', 'B'], categories=['A', 'B'],
ordered=False, dtype='category')
+ """
+ if inplace:
+ raise ValueError("cannot use inplace with CategoricalIndex")
+
+ return
CategoricalIndex(self.to_series().cat.rename_categories(new_categories)).rename(
+ self.name
+ )
+
def _test() -> None:
import os
diff --git a/python/pyspark/pandas/missing/indexes.py
b/python/pyspark/pandas/missing/indexes.py
index e550801..ef65da1 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -123,7 +123,6 @@ class
MissingPandasLikeDatetimeIndex(MissingPandasLikeIndex):
class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):
# Functions
- rename_categories = _unsupported_function("rename_categories",
cls="CategoricalIndex")
reorder_categories = _unsupported_function("reorder_categories",
cls="CategoricalIndex")
remove_unused_categories = _unsupported_function(
"remove_unused_categories", cls="CategoricalIndex"
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py
b/python/pyspark/pandas/tests/indexes/test_category.py
index ebda1be..a05eaef 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -257,6 +257,31 @@ class CategoricalIndexTest(PandasOnSparkTestCase,
TestUtils):
self.assert_eq(psidx.insert(1, "w"), pidx.insert(1, "w"))
+ def test_rename_categories(self):
+ pidx = pd.CategoricalIndex(["a", "b", "c", "d"])
+ psidx = ps.from_pandas(pidx)
+ self.assert_eq(pidx.rename_categories([0, 1, 3, 2]),
psidx.rename_categories([0, 1, 3, 2]))
+ self.assert_eq(
+ pidx.rename_categories({"a": "A", "c": "C"}),
+ psidx.rename_categories({"a": "A", "c": "C"}),
+ )
+ self.assert_eq(
+ pidx.rename_categories(lambda x: x.upper()),
+ psidx.rename_categories(lambda x: x.upper()),
+ )
+ self.assertRaises(
+ TypeError,
+ lambda: psidx.rename_categories(None),
+ )
+ self.assertRaises(
+ TypeError,
+ lambda: psidx.rename_categories(1),
+ )
+ self.assertRaises(
+ TypeError,
+ lambda: psidx.rename_categories("x"),
+ )
+
if __name__ == "__main__":
import unittest
diff --git a/python/pyspark/pandas/tests/test_categorical.py
b/python/pyspark/pandas/tests/test_categorical.py
index cf36563..e60426a 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -560,6 +560,57 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
self.assert_eq(psdf.a.unstack().sort_index(),
pdf.a.unstack().sort_index())
self.assert_eq(psdf.b.unstack().sort_index(),
pdf.b.unstack().sort_index())
+ def test_rename_categories(self):
+ pdf, psdf = self.df_pair
+
+ pser = pdf.b
+ psser = psdf.b
+
+ self.assert_eq(
+ pser.cat.rename_categories([0, 1, 3, 2]),
psser.cat.rename_categories([0, 1, 3, 2])
+ )
+ self.assert_eq(
+ pser.cat.rename_categories({"a": "A", "c": "C"}),
+ psser.cat.rename_categories({"a": "A", "c": "C"}),
+ )
+ self.assert_eq(
+ pser.cat.rename_categories(lambda x: x.upper()),
+ psser.cat.rename_categories(lambda x: x.upper()),
+ )
+
+ pser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
+ psser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
+ self.assert_eq(pser, psser)
+ self.assert_eq(pdf, psdf)
+
+ pser.cat.rename_categories(lambda x: x.upper(), inplace=True)
+ psser.cat.rename_categories(lambda x: x.upper(), inplace=True)
+ self.assert_eq(pser, psser)
+ self.assert_eq(pdf, psdf)
+
+ pser.cat.rename_categories([0, 1, 3, 2], inplace=True)
+ psser.cat.rename_categories([0, 1, 3, 2], inplace=True)
+ self.assert_eq(pser, psser)
+ self.assert_eq(pdf, psdf)
+
+ self.assertRaisesRegex(
+ ValueError,
+ "new categories need to have the same number of items as the old
categories",
+ lambda: psser.cat.rename_categories([0, 1, 2]),
+ )
+ self.assertRaises(
+ TypeError,
+ lambda: psser.cat.rename_categories(None),
+ )
+ self.assertRaises(
+ TypeError,
+ lambda: psser.cat.rename_categories(1),
+ )
+ self.assertRaises(
+ TypeError,
+ lambda: psser.cat.rename_categories("x"),
+ )
+
if __name__ == "__main__":
import unittest
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]