This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new 0e94e42 [SPARK-36249][PYTHON] Add remove_categories to
CategoricalAccessor and CategoricalIndex
0e94e42 is described below
commit 0e94e42cd3ffdf1b5cccf4a8281c27e66fcc59d3
Author: Takuya UESHIN <[email protected]>
AuthorDate: Thu Jul 22 17:06:12 2021 +0900
[SPARK-36249][PYTHON] Add remove_categories to CategoricalAccessor and
CategoricalIndex
### What changes were proposed in this pull request?
Add `remove_categories` to `CategoricalAccessor` and `CategoricalIndex`.
### Why are the changes needed?
We should implement `remove_categories` in `CategoricalAccessor` and
`CategoricalIndex`.
### Does this PR introduce _any_ user-facing change?
Yes, users will be able to use `remove_categories`.
### How was this patch tested?
Added some tests.
Closes #33474 from ueshin/issues/SPARK-36249/remove_categories.
Authored-by: Takuya UESHIN <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit a3c7ae18e2090623b758cdb6fb3b62413981610a)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../source/reference/pyspark.pandas/indexing.rst | 1 +
.../source/reference/pyspark.pandas/series.rst | 1 +
python/pyspark/pandas/categorical.py | 92 +++++++++++++++++++++-
python/pyspark/pandas/indexes/category.py | 43 ++++++++++
python/pyspark/pandas/missing/indexes.py | 1 -
.../pyspark/pandas/tests/indexes/test_category.py | 16 ++++
python/pyspark/pandas/tests/test_categorical.py | 24 ++++++
7 files changed, 175 insertions(+), 3 deletions(-)
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index b0b4cdd..60c4c4b 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -176,6 +176,7 @@ Categorical components
CategoricalIndex.categories
CategoricalIndex.ordered
CategoricalIndex.add_categories
+ CategoricalIndex.remove_categories
CategoricalIndex.as_ordered
CategoricalIndex.as_unordered
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index 6243a22..fec89de 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -402,6 +402,7 @@ the ``Series.cat`` accessor.
Series.cat.ordered
Series.cat.codes
Series.cat.add_categories
+ Series.cat.remove_categories
Series.cat.as_ordered
Series.cat.as_unordered
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index a83c3c7..529041b 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -348,8 +348,96 @@ class CategoricalAccessor(object):
"""
return self._set_ordered(ordered=False, inplace=inplace)
- def remove_categories(self, removals: pd.Index, inplace: bool = False) -> "ps.Series":
- raise NotImplementedError()
+ def remove_categories(
+ self, removals: Union[pd.Index, Any, List], inplace: bool = False
+ ) -> Optional["ps.Series"]:
+ """
+ Remove the specified categories.
+
+ `removals` must be included in the old categories. Values which were in
+ the removed categories will be set to NaN
+
+ Parameters
+ ----------
+ removals : category or list of categories
+ The categories which should be removed.
+ inplace : bool, default False
+ Whether or not to remove the categories inplace or return a copy of
+ this categorical with removed categories.
+
+ Returns
+ -------
+ Series or None
+ Categorical with removed categories or None if ``inplace=True``.
+
+ Raises
+ ------
+ ValueError
+ If the removals are not contained in the categories
+
+ Examples
+ --------
+ >>> s = ps.Series(list("abbccc"), dtype="category")
+ >>> s # doctest: +SKIP
+ 0 a
+ 1 b
+ 2 b
+ 3 c
+ 4 c
+ 5 c
+ dtype: category
+ Categories (3, object): ['a', 'b', 'c']
+
+ >>> s.cat.remove_categories('b') # doctest: +SKIP
+ 0 a
+ 1 NaN
+ 2 NaN
+ 3 c
+ 4 c
+ 5 c
+ dtype: category
+ Categories (2, object): ['a', 'c']
+ """
+ if is_list_like(removals):
+ categories = [cat for cat in removals if cat is not None] # type: List
+ elif removals is None:
+ categories = []
+ else:
+ categories = [removals]
+
+ if any(cat not in self.categories for cat in categories):
+ raise ValueError(
+ "removals must all be in old categories: {{{cats}}}".format(
+ cats=", ".join(
+ set(str(cat) for cat in categories if cat not in self.categories)
+ )
+ )
+ )
+
+ if len(categories) == 0:
+ if inplace:
+ return None
+ else:
+ psser = self._data
+ return psser._with_new_scol(
+ psser.spark.column, field=psser._internal.data_fields[0]
+ )
+ else:
+ dtype = CategoricalDtype(
+ [cat for cat in self.categories if cat not in categories], ordered=self.ordered
+ )
+ psser = self._data.astype(dtype)
+
+ if inplace:
+ internal = self._data._psdf._internal.with_new_spark_column(
+ self._data._column_label,
+ psser.spark.column,
+ field=psser._internal.data_fields[0],
+ )
+ self._data._psdf._update_internal_frame(internal)
+ return None
+ else:
+ return psser
def remove_unused_categories(self) -> "ps.Series":
raise NotImplementedError()
diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py
index 308043e..28b5027 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -312,6 +312,49 @@ class CategoricalIndex(Index):
return CategoricalIndex(self.to_series().cat.as_unordered()).rename(self.name)
+ def remove_categories(
+ self, removals: Union[pd.Index, Any, List], inplace: bool = False
+ ) -> Optional["CategoricalIndex"]:
+ """
+ Remove the specified categories.
+
+ `removals` must be included in the old categories. Values which were in
+ the removed categories will be set to NaN
+
+ Parameters
+ ----------
+ removals : category or list of categories
+ The categories which should be removed.
+ inplace : bool, default False
+ Whether or not to remove the categories inplace or return a copy of
+ this categorical with removed categories.
+
+ Returns
+ -------
+ CategoricalIndex or None
+ Categorical with removed categories or None if ``inplace=True``.
+
+ Raises
+ ------
+ ValueError
+ If the removals are not contained in the categories
+
+ Examples
+ --------
+ >>> idx = ps.CategoricalIndex(list("abbccc"))
+ >>> idx # doctest: +NORMALIZE_WHITESPACE
+ CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+ categories=['a', 'b', 'c'], ordered=False, dtype='category')
+
+ >>> idx.remove_categories('b') # doctest: +NORMALIZE_WHITESPACE
+ CategoricalIndex(['a', nan, nan, 'c', 'c', 'c'],
+ categories=['a', 'c'], ordered=False, dtype='category')
+ """
+ if inplace:
+ raise ValueError("cannot use inplace with CategoricalIndex")
+
+ return CategoricalIndex(self.to_series().cat.remove_categories(removals)).rename(self.name)
+
def __getattr__(self, item: str) -> Any:
if hasattr(MissingPandasLikeCategoricalIndex, item):
property_or_func = getattr(MissingPandasLikeCategoricalIndex, item)
diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py
index 2a5a4c90..e550801 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -125,7 +125,6 @@ class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):
# Functions
rename_categories = _unsupported_function("rename_categories", cls="CategoricalIndex")
reorder_categories = _unsupported_function("reorder_categories", cls="CategoricalIndex")
- remove_categories = _unsupported_function("remove_categories", cls="CategoricalIndex")
remove_unused_categories = _unsupported_function(
"remove_unused_categories", cls="CategoricalIndex"
)
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py
index 44e2703..ebda1be 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -106,6 +106,22 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
self.assertRaises(ValueError, lambda: psidx.add_categories(3))
self.assertRaises(ValueError, lambda: psidx.add_categories([4, 4]))
+ def test_remove_categories(self):
+ pidx = pd.CategoricalIndex([1, 2, 3], categories=[3, 2, 1])
+ psidx = ps.from_pandas(pidx)
+
+ self.assert_eq(pidx.remove_categories(2), psidx.remove_categories(2))
+ self.assert_eq(pidx.remove_categories([1, 3]), psidx.remove_categories([1, 3]))
+ self.assert_eq(pidx.remove_categories([]), psidx.remove_categories([]))
+ self.assert_eq(pidx.remove_categories([2, 2]), psidx.remove_categories([2, 2]))
+ self.assert_eq(pidx.remove_categories([1, 2, 3]), psidx.remove_categories([1, 2, 3]))
+ self.assert_eq(pidx.remove_categories(None), psidx.remove_categories(None))
+ self.assert_eq(pidx.remove_categories([None]), psidx.remove_categories([None]))
+
+ self.assertRaises(ValueError, lambda: pidx.remove_categories(4, inplace=True))
+ self.assertRaises(ValueError, lambda: psidx.remove_categories(4))
+ self.assertRaises(ValueError, lambda: psidx.remove_categories([4, None]))
+
def test_as_ordered_unordered(self):
pidx = pd.CategoricalIndex(["x", "y", "z"], categories=["z", "y", "x"])
psidx = ps.from_pandas(pidx)
diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py
index 1af03d6..cf36563 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -97,6 +97,30 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
self.assertRaises(ValueError, lambda: psser.cat.add_categories(4))
self.assertRaises(ValueError, lambda: psser.cat.add_categories([5, 5]))
+ def test_remove_categories(self):
+ pdf, psdf = self.df_pair
+
+ pser = pdf.a
+ psser = psdf.a
+
+ self.assert_eq(pser.cat.remove_categories(2), psser.cat.remove_categories(2))
+ self.assert_eq(pser.cat.remove_categories([1, 3]), psser.cat.remove_categories([1, 3]))
+ self.assert_eq(pser.cat.remove_categories([]), psser.cat.remove_categories([]))
+ self.assert_eq(pser.cat.remove_categories([2, 2]), psser.cat.remove_categories([2, 2]))
+ self.assert_eq(
+ pser.cat.remove_categories([1, 2, 3]), psser.cat.remove_categories([1, 2, 3])
+ )
+ self.assert_eq(pser.cat.remove_categories(None), psser.cat.remove_categories(None))
+ self.assert_eq(pser.cat.remove_categories([None]), psser.cat.remove_categories([None]))
+
+ pser.cat.remove_categories(2, inplace=True)
+ psser.cat.remove_categories(2, inplace=True)
+ self.assert_eq(pser, psser)
+ self.assert_eq(pdf, psdf)
+
+ self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
+ self.assertRaises(ValueError, lambda: psser.cat.remove_categories([4, None]))
+
def test_as_ordered_unordered(self):
pdf, psdf = self.df_pair
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]