This is an automated email from the ASF dual-hosted git repository.
ueshin pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new f83a9ec [SPARK-36214][PYTHON] Add add_categories to
CategoricalAccessor and CategoricalIndex
f83a9ec is described below
commit f83a9ec2fd71b6e808b6a70c9fd5d627cfd6ecfe
Author: Takuya UESHIN <[email protected]>
AuthorDate: Wed Jul 21 22:34:04 2021 -0700
[SPARK-36214][PYTHON] Add add_categories to CategoricalAccessor and
CategoricalIndex
### What changes were proposed in this pull request?
Add `add_categories` to `CategoricalAccessor` and `CategoricalIndex`.
### Why are the changes needed?
We should implement `add_categories` in `CategoricalAccessor` and
`CategoricalIndex`.
### Does this PR introduce _any_ user-facing change?
Yes, users will be able to use `add_categories`.
### How was this patch tested?
Added some tests.
Closes #33470 from ueshin/issues/SPARK-36214/add_categories.
Authored-by: Takuya UESHIN <[email protected]>
Signed-off-by: Takuya UESHIN <[email protected]>
(cherry picked from commit dcc0aaa3efb2d441b2dfadb0c64dbc28ee197de5)
Signed-off-by: Takuya UESHIN <[email protected]>
---
.../source/reference/pyspark.pandas/indexing.rst | 1 +
.../source/reference/pyspark.pandas/series.rst | 1 +
python/pyspark/pandas/categorical.py | 84 ++++++++++++++++++++--
python/pyspark/pandas/indexes/category.py | 46 ++++++++++++
python/pyspark/pandas/missing/indexes.py | 1 -
.../pyspark/pandas/tests/indexes/test_category.py | 12 ++++
python/pyspark/pandas/tests/test_categorical.py | 18 +++++
7 files changed, 158 insertions(+), 5 deletions(-)
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index 4f84d91..b0b4cdd 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -175,6 +175,7 @@ Categorical components
CategoricalIndex.codes
CategoricalIndex.categories
CategoricalIndex.ordered
+ CategoricalIndex.add_categories
CategoricalIndex.as_ordered
CategoricalIndex.as_unordered
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index b718d79..6243a22 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -401,6 +401,7 @@ the ``Series.cat`` accessor.
Series.cat.categories
Series.cat.ordered
Series.cat.codes
+ Series.cat.add_categories
Series.cat.as_ordered
Series.cat.as_unordered
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index aeba20d..a83c3c7 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -14,10 +14,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-from typing import List, Optional, Union, TYPE_CHECKING, cast
+from typing import Any, List, Optional, Union, TYPE_CHECKING, cast
import pandas as pd
-from pandas.api.types import CategoricalDtype
+from pandas.api.types import CategoricalDtype, is_list_like
from pyspark.pandas.internal import InternalField
from pyspark.sql.types import StructField
@@ -165,8 +165,84 @@ class CategoricalAccessor(object):
),
).rename()
- def add_categories(self, new_categories: pd.Index, inplace: bool = False) -> "ps.Series":
- raise NotImplementedError()
+ def add_categories(
+ self, new_categories: Union[pd.Index, Any, List], inplace: bool = False
+ ) -> Optional["ps.Series"]:
+ """
+ Add new categories.
+
+ `new_categories` will be included at the last/highest place in the
+ categories and will be unused directly after this call.
+
+ Parameters
+ ----------
+ new_categories : category or list-like of category
+ The new categories to be included.
+ inplace : bool, default False
+ Whether or not to add the categories inplace or return a copy of
+ this categorical with added categories.
+
+ Returns
+ -------
+ Series or None
+ Categorical with new categories added or None if ``inplace=True``.
+
+ Raises
+ ------
+ ValueError
+ If the new categories include old categories or do not validate as
+ categories
+
+ Examples
+ --------
+ >>> s = ps.Series(list("abbccc"), dtype="category")
+ >>> s # doctest: +SKIP
+ 0 a
+ 1 b
+ 2 b
+ 3 c
+ 4 c
+ 5 c
+ dtype: category
+ Categories (3, object): ['a', 'b', 'c']
+
+ >>> s.cat.add_categories('x') # doctest: +SKIP
+ 0 a
+ 1 b
+ 2 b
+ 3 c
+ 4 c
+ 5 c
+ dtype: category
+ Categories (4, object): ['a', 'b', 'c', 'x']
+ """
+ from pyspark.pandas.frame import DataFrame
+
+ if is_list_like(new_categories):
+ categories = list(new_categories) # type: List
+ else:
+ categories = [new_categories]
+
+ if any(cat in self.categories for cat in categories):
+ raise ValueError(
+ "new categories must not include old categories: {{{cats}}}".format(
+ cats=", ".join(set(str(cat) for cat in categories if cat in self.categories))
+ )
+ )
+
+ internal = self._data._psdf._internal.with_new_spark_column(
+ self._data._column_label,
+ self._data.spark.column,
+ field=self._data._internal.data_fields[0].copy(
+ dtype=CategoricalDtype(list(self.categories) + categories, ordered=self.ordered)
+ ),
+ )
+ if inplace:
+ self._data._psdf._update_internal_frame(internal)
+ return None
+ else:
+ psser = DataFrame(internal)._psser_for(self._data._column_label)
+ return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]:
from pyspark.pandas.frame import DataFrame
diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py
index 1b65886..308043e 100644
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -204,6 +204,52 @@ class CategoricalIndex(Index):
"""
return self.dtype.ordered
+ def add_categories(
+ self, new_categories: Union[pd.Index, Any, List], inplace: bool = False
+ ) -> Optional["CategoricalIndex"]:
+ """
+ Add new categories.
+
+ `new_categories` will be included at the last/highest place in the
+ categories and will be unused directly after this call.
+
+ Parameters
+ ----------
+ new_categories : category or list-like of category
+ The new categories to be included.
+ inplace : bool, default False
+ Whether or not to add the categories inplace or return a copy of
+ this categorical with added categories.
+
+ Returns
+ -------
+ CategoricalIndex or None
+ Categorical with new categories added or None if ``inplace=True``.
+
+ Raises
+ ------
+ ValueError
+ If the new categories include old categories or do not validate as
+ categories
+
+ Examples
+ --------
+ >>> idx = ps.CategoricalIndex(list("abbccc"))
+ >>> idx # doctest: +NORMALIZE_WHITESPACE
+ CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+ categories=['a', 'b', 'c'], ordered=False, dtype='category')
+
+ >>> idx.add_categories('x') # doctest: +NORMALIZE_WHITESPACE
+ CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+ categories=['a', 'b', 'c', 'x'], ordered=False, dtype='category')
+ """
+ if inplace:
+ raise ValueError("cannot use inplace with CategoricalIndex")
+
+ return CategoricalIndex(
+ self.to_series().cat.add_categories(new_categories=new_categories)
+ ).rename(self.name)
+
def as_ordered(self, inplace: bool = False) -> Optional["CategoricalIndex"]:
"""
Set the Categorical to be ordered.
diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py
index 6ca564f..2a5a4c90 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -125,7 +125,6 @@ class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):
# Functions
rename_categories = _unsupported_function("rename_categories", cls="CategoricalIndex")
reorder_categories = _unsupported_function("reorder_categories", cls="CategoricalIndex")
- add_categories = _unsupported_function("add_categories", cls="CategoricalIndex")
remove_categories = _unsupported_function("remove_categories", cls="CategoricalIndex")
remove_unused_categories = _unsupported_function(
"remove_unused_categories", cls="CategoricalIndex"
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py
index d04f896..44e2703 100644
--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -94,6 +94,18 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
with self.assertRaises(ValueError):
psidx.categories = [1, 2, 3, 4]
+ def test_add_categories(self):
+ pidx = pd.CategoricalIndex([1, 2, 3], categories=[3, 2, 1])
+ psidx = ps.from_pandas(pidx)
+
+ self.assert_eq(pidx.add_categories(4), psidx.add_categories(4))
+ self.assert_eq(pidx.add_categories([4, 5]), psidx.add_categories([4, 5]))
+ self.assert_eq(pidx.add_categories([]), psidx.add_categories([]))
+
+ self.assertRaises(ValueError, lambda: psidx.add_categories(4, inplace=True))
+ self.assertRaises(ValueError, lambda: psidx.add_categories(3))
+ self.assertRaises(ValueError, lambda: psidx.add_categories([4, 4]))
+
def test_as_ordered_unordered(self):
pidx = pd.CategoricalIndex(["x", "y", "z"], categories=["z", "y", "x"])
psidx = ps.from_pandas(pidx)
diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py
index fb0561d..1af03d6 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -79,6 +79,24 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
with self.assertRaises(ValueError):
psser.cat.categories = [1, 2, 3, 4]
+ def test_add_categories(self):
+ pdf, psdf = self.df_pair
+
+ pser = pdf.a
+ psser = psdf.a
+
+ self.assert_eq(pser.cat.add_categories(4), psser.cat.add_categories(4))
+ self.assert_eq(pser.cat.add_categories([4, 5]), psser.cat.add_categories([4, 5]))
+ self.assert_eq(pser.cat.add_categories([]), psser.cat.add_categories([]))
+
+ pser.cat.add_categories(4, inplace=True)
+ psser.cat.add_categories(4, inplace=True)
+ self.assert_eq(pser, psser)
+ self.assert_eq(pdf, psdf)
+
+ self.assertRaises(ValueError, lambda: psser.cat.add_categories(4))
+ self.assertRaises(ValueError, lambda: psser.cat.add_categories([5, 5]))
+
def test_as_ordered_unordered(self):
pdf, psdf = self.df_pair
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]