This is an automated email from the ASF dual-hosted git repository. ueshin pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push: new a3a13da [SPARK-36186][PYTHON] Add as_ordered/as_unordered to CategoricalAccessor and CategoricalIndex a3a13da is described below commit a3a13da26c19b4241bf2f76273a82fe8598eddf5 Author: Takuya UESHIN <ues...@databricks.com> AuthorDate: Tue Jul 20 18:23:54 2021 -0700 [SPARK-36186][PYTHON] Add as_ordered/as_unordered to CategoricalAccessor and CategoricalIndex ### What changes were proposed in this pull request? Add `as_ordered`/`as_unordered` to `CategoricalAccessor` and `CategoricalIndex`. ### Why are the changes needed? `as_ordered`/`as_unordered` are not yet implemented in `CategoricalAccessor` and `CategoricalIndex`; we should implement them. ### Does this PR introduce _any_ user-facing change? Yes, users will be able to use `as_ordered`/`as_unordered`. ### How was this patch tested? Added some tests. Closes #33400 from ueshin/issues/SPARK-36186/as_ordered_unordered. Authored-by: Takuya UESHIN <ues...@databricks.com> Signed-off-by: Takuya UESHIN <ues...@databricks.com> (cherry picked from commit 376fadc89cffac97aebe49a7cf4a4bc978b1d09e) Signed-off-by: Takuya UESHIN <ues...@databricks.com> --- .../source/reference/pyspark.pandas/indexing.rst | 2 + .../source/reference/pyspark.pandas/series.rst | 2 + python/pyspark/pandas/categorical.py | 116 +++++++++++++++++++-- python/pyspark/pandas/indexes/category.py | 72 ++++++++++++- python/pyspark/pandas/missing/indexes.py | 2 - .../pyspark/pandas/tests/indexes/test_category.py | 10 ++ python/pyspark/pandas/tests/test_categorical.py | 22 ++++ 7 files changed, 214 insertions(+), 12 deletions(-) diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst index e91f699..4f84d91 100644 --- a/python/docs/source/reference/pyspark.pandas/indexing.rst +++ b/python/docs/source/reference/pyspark.pandas/indexing.rst @@ -175,6 +175,8 @@ Categorical components CategoricalIndex.codes CategoricalIndex.categories 
CategoricalIndex.ordered + CategoricalIndex.as_ordered + CategoricalIndex.as_unordered .. _api.multiindex: diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst index a199d70..b718d79 100644 --- a/python/docs/source/reference/pyspark.pandas/series.rst +++ b/python/docs/source/reference/pyspark.pandas/series.rst @@ -401,6 +401,8 @@ the ``Series.cat`` accessor. Series.cat.categories Series.cat.ordered Series.cat.codes + Series.cat.as_ordered + Series.cat.as_unordered .. _api.series.plot: diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py index 3495b35..b8cc88c 100644 --- a/python/pyspark/pandas/categorical.py +++ b/python/pyspark/pandas/categorical.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from typing import TYPE_CHECKING, cast +from typing import Optional, TYPE_CHECKING, cast import pandas as pd from pandas.api.types import CategoricalDtype @@ -62,6 +62,10 @@ class CategoricalAccessor(object): self._data = series @property + def _dtype(self) -> CategoricalDtype: + return cast(CategoricalDtype, self._data.dtype) + + @property def categories(self) -> pd.Index: """ The categories of this categorical. 
@@ -82,7 +86,7 @@ class CategoricalAccessor(object): >>> s.cat.categories Index(['a', 'b', 'c'], dtype='object') """ - return cast(CategoricalDtype, self._data.dtype).categories + return self._dtype.categories @categories.setter def categories(self, categories: pd.Index) -> None: @@ -109,7 +113,7 @@ class CategoricalAccessor(object): >>> s.cat.ordered False """ - return cast(CategoricalDtype, self._data.dtype).ordered + return self._dtype.ordered @property def codes(self) -> "ps.Series": @@ -152,11 +156,109 @@ class CategoricalAccessor(object): def add_categories(self, new_categories: pd.Index, inplace: bool = False) -> "ps.Series": raise NotImplementedError() - def as_ordered(self, inplace: bool = False) -> "ps.Series": - raise NotImplementedError() + def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]: + from pyspark.pandas.frame import DataFrame + + if self.ordered == ordered: + if inplace: + return None + else: + psser = self._data + else: + internal = self._data._psdf._internal.with_new_spark_column( + self._data._column_label, + self._data.spark.column, + field=self._data._internal.data_fields[0].copy( + dtype=CategoricalDtype(categories=self.categories, ordered=ordered) + ), + ) + if inplace: + self._data._psdf._update_internal_frame(internal) + return None + else: + psser = DataFrame(internal)._psser_for(self._data._column_label) + + return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0]) + + def as_ordered(self, inplace: bool = False) -> Optional["ps.Series"]: + """ + Set the Categorical to be ordered. - def as_unordered(self, inplace: bool = False) -> "ps.Series": - raise NotImplementedError() + Parameters + ---------- + inplace : bool, default False + Whether or not to set the ordered attribute in-place or return + a copy of this categorical with ordered set to True. + + Returns + ------- + Series or None + Ordered Categorical or None if ``inplace=True``. 
+ + Examples + -------- + >>> s = ps.Series(list("abbccc"), dtype="category") + >>> s # doctest: +SKIP + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): ['a', 'b', 'c'] + + >>> s.cat.as_ordered() # doctest: +SKIP + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): ['a' < 'b' < 'c'] + """ + return self._set_ordered(ordered=True, inplace=inplace) + + def as_unordered(self, inplace: bool = False) -> Optional["ps.Series"]: + """ + Set the Categorical to be unordered. + + Parameters + ---------- + inplace : bool, default False + Whether or not to set the ordered attribute in-place or return + a copy of this categorical with ordered set to False. + + Returns + ------- + Series or None + Unordered Categorical or None if ``inplace=True``. + + Examples + -------- + >>> s = ps.Series(list("abbccc"), dtype="category").cat.as_ordered() + >>> s # doctest: +SKIP + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): ['a' < 'b' < 'c'] + + >>> s.cat.as_unordered() # doctest: +SKIP + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): ['a', 'b', 'c'] + """ + return self._set_ordered(ordered=False, inplace=inplace) def remove_categories(self, removals: pd.Index, inplace: bool = False) -> "ps.Series": raise NotImplementedError() diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py index 2f8ad17..a7ad2a0 100644 --- a/python/pyspark/pandas/indexes/category.py +++ b/python/pyspark/pandas/indexes/category.py @@ -15,7 +15,7 @@ # limitations under the License. 
# from functools import partial -from typing import Any, no_type_check, cast +from typing import Any, Optional, cast, no_type_check import pandas as pd from pandas.api.types import is_hashable, CategoricalDtype @@ -116,6 +116,10 @@ class CategoricalIndex(Index): ) @property + def dtype(self) -> CategoricalDtype: + return cast(CategoricalDtype, super().dtype) + + @property def codes(self) -> Index: """ The category codes of this categorical. @@ -167,7 +171,7 @@ class CategoricalIndex(Index): >>> idx.categories Index(['a', 'b', 'c'], dtype='object') """ - return cast(CategoricalDtype, self.dtype).categories + return self.dtype.categories @categories.setter def categories(self, categories: pd.Index) -> None: @@ -188,7 +192,69 @@ class CategoricalIndex(Index): >>> idx.ordered False """ - return cast(CategoricalDtype, self.dtype).ordered + return self.dtype.ordered + + def as_ordered(self, inplace: bool = False) -> Optional["CategoricalIndex"]: + """ + Set the Categorical to be ordered. + + Parameters + ---------- + inplace : bool, default False + Whether or not to set the ordered attribute in-place or return + a copy of this categorical with ordered set to True. + + Returns + ------- + CategoricalIndex or None + Ordered Categorical or None if ``inplace=True``. + + Examples + -------- + >>> idx = ps.CategoricalIndex(list("abbccc")) + >>> idx # doctest: +NORMALIZE_WHITESPACE + CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'], + categories=['a', 'b', 'c'], ordered=False, dtype='category') + + >>> idx.as_ordered() # doctest: +NORMALIZE_WHITESPACE + CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'], + categories=['a', 'b', 'c'], ordered=True, dtype='category') + """ + if inplace: + raise ValueError("cannot use inplace with CategoricalIndex") + + return CategoricalIndex(self.to_series().cat.as_ordered()).rename(self.name) + + def as_unordered(self, inplace: bool = False) -> Optional["CategoricalIndex"]: + """ + Set the Categorical to be unordered. 
+ + Parameters + ---------- + inplace : bool, default False + Whether or not to set the ordered attribute in-place or return + a copy of this categorical with ordered set to False. + + Returns + ------- + CategoricalIndex or None + Unordered Categorical or None if ``inplace=True``. + + Examples + -------- + >>> idx = ps.CategoricalIndex(list("abbccc")).as_ordered() + >>> idx # doctest: +NORMALIZE_WHITESPACE + CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'], + categories=['a', 'b', 'c'], ordered=True, dtype='category') + + >>> idx.as_unordered() # doctest: +NORMALIZE_WHITESPACE + CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'], + categories=['a', 'b', 'c'], ordered=False, dtype='category') + """ + if inplace: + raise ValueError("cannot use inplace with CategoricalIndex") + + return CategoricalIndex(self.to_series().cat.as_unordered()).rename(self.name) def __getattr__(self, item: str) -> Any: if hasattr(MissingPandasLikeCategoricalIndex, item): diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py index 0b8a975..6ca564f 100644 --- a/python/pyspark/pandas/missing/indexes.py +++ b/python/pyspark/pandas/missing/indexes.py @@ -131,8 +131,6 @@ class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex): "remove_unused_categories", cls="CategoricalIndex" ) set_categories = _unsupported_function("set_categories", cls="CategoricalIndex") - as_ordered = _unsupported_function("as_ordered", cls="CategoricalIndex") - as_unordered = _unsupported_function("as_unordered", cls="CategoricalIndex") map = _unsupported_function("map", cls="CategoricalIndex") diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py index fb72b30..02752ec 100644 --- a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -67,6 +67,16 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils): self.assert_eq(psidx.codes, 
pd.Index(pidx.codes)) self.assert_eq(psidx.ordered, pidx.ordered) + def test_as_ordered_unordered(self): + pidx = pd.CategoricalIndex(["x", "y", "z"], categories=["z", "y", "x"]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(pidx.as_ordered(), psidx.as_ordered()) + self.assert_eq(pidx.as_unordered(), psidx.as_unordered()) + + self.assertRaises(ValueError, lambda: psidx.as_ordered(inplace=True)) + self.assertRaises(ValueError, lambda: psidx.as_unordered(inplace=True)) + def test_astype(self): pidx = pd.Index(["a", "b", "c"]) psidx = ps.from_pandas(pidx) diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py index 7a1f7be..a4c9b148 100644 --- a/python/pyspark/pandas/tests/test_categorical.py +++ b/python/pyspark/pandas/tests/test_categorical.py @@ -65,6 +65,28 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): self.assert_eq(psser.cat.codes, pser.cat.codes) self.assert_eq(psser.cat.ordered, pser.cat.ordered) + def test_as_ordered_unordered(self): + pdf, psdf = self.df_pair + + pser = pdf.a + psser = psdf.a + + # as_ordered + self.assert_eq(pser.cat.as_ordered(), psser.cat.as_ordered()) + + pser.cat.as_ordered(inplace=True) + psser.cat.as_ordered(inplace=True) + self.assert_eq(pser, psser) + self.assert_eq(pdf, psdf) + + # as_unordered + self.assert_eq(pser.cat.as_unordered(), psser.cat.as_unordered()) + + pser.cat.as_unordered(inplace=True) + psser.cat.as_unordered(inplace=True) + self.assert_eq(pser, psser) + self.assert_eq(pdf, psdf) + def test_astype(self): pser = pd.Series(["a", "b", "c"]) psser = ps.from_pandas(pser) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org