This is an automated email from the ASF dual-hosted git repository. brycemecum pushed a commit to branch maint-19.0.x in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 8677e33380ef79f3344d667be4e74a42d7c4b810 Author: Joris Van den Bossche <[email protected]> AuthorDate: Tue Jan 7 17:47:14 2025 +0100 GH-45175: [Python] Honor the strings_to_categorical keyword in to_pandas for string view type (#45176) ### Rationale for this change Currently, this keyword works for string and large string columns: ```python >>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string())}) >>> table.to_pandas(strings_to_categorical=True).dtypes col category dtype: object >>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.large_string())}) >>> table.to_pandas(strings_to_categorical=True).dtypes col category dtype: object ``` but not for string view: ```python >>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string_view())}) >>> table.to_pandas(strings_to_categorical=True).dtypes col object dtype: object ``` For consistency, the keyword should be honored for string view columns as well. See https://github.com/apache/arrow/pull/44195/files#r1901831460 ### Are these changes tested? Yes ### Are there any user-facing changes? 
Yes, when using the `strings_to_categorical=True` keyword and having a string_view type, this column will now be converted to a pandas Categorical * GitHub Issue: #45175 Authored-by: Joris Van den Bossche <[email protected]> Signed-off-by: Raúl Cumplido <[email protected]> --- python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 6 ++-- python/pyarrow/tests/test_pandas.py | 32 +++++++++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 10c4d0e160..a0f1d5bbbe 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -2523,7 +2523,8 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr } if (options.strings_to_categorical) { for (int i = 0; i < static_cast<int>(arrays->size()); i++) { - if (is_base_binary_like((*arrays)[i]->type()->id())) { + if (is_base_binary_like((*arrays)[i]->type()->id()) || + is_binary_view_like((*arrays)[i]->type()->id())) { columns_to_encode.push_back(i); } } @@ -2557,7 +2558,8 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options, py_ref = nullptr; } - if (options.strings_to_categorical && is_base_binary_like(arr->type()->id())) { + if (options.strings_to_categorical && (is_base_binary_like(arr->type()->id()) || + is_binary_view_like(arr->type()->id()))) { if (options.zero_copy_only) { return Status::Invalid("Need to dictionary encode a column, but ", "only zero-copy conversions allowed"); diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 1186f87b03..d5c936df07 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1836,10 +1836,13 @@ class TestConvertStringLikeTypes: result = table.to_pandas(categories=['col']) assert table.to_pandas().equals(result) - def test_table_str_to_categorical_without_na(self): + 
@pytest.mark.parametrize( + "string_type", [pa.string(), pa.large_string(), pa.string_view()] + ) + def test_table_str_to_categorical_without_na(self, string_type): values = ['a', 'a', 'b', 'b', 'c'] df = pd.DataFrame({'strings': values}) - field = pa.field('strings', pa.string()) + field = pa.field('strings', string_type) schema = pa.schema([field]) table = pa.Table.from_pandas(df, schema=schema) @@ -1851,10 +1854,22 @@ class TestConvertStringLikeTypes: table.to_pandas(strings_to_categorical=True, zero_copy_only=True) - def test_table_str_to_categorical_with_na(self): + # chunked array + result = table["strings"].to_pandas(strings_to_categorical=True) + expected = pd.Series(pd.Categorical(values), name="strings") + tm.assert_series_equal(result, expected) + + with pytest.raises(pa.ArrowInvalid): + table["strings"].to_pandas(strings_to_categorical=True, + zero_copy_only=True) + + @pytest.mark.parametrize( + "string_type", [pa.string(), pa.large_string(), pa.string_view()] + ) + def test_table_str_to_categorical_with_na(self, string_type): values = [None, 'a', 'b', np.nan] df = pd.DataFrame({'strings': values}) - field = pa.field('strings', pa.string()) + field = pa.field('strings', string_type) schema = pa.schema([field]) table = pa.Table.from_pandas(df, schema=schema) @@ -1866,6 +1881,15 @@ class TestConvertStringLikeTypes: table.to_pandas(strings_to_categorical=True, zero_copy_only=True) + # chunked array + result = table["strings"].to_pandas(strings_to_categorical=True) + expected = pd.Series(pd.Categorical(values), name="strings") + tm.assert_series_equal(result, expected) + + with pytest.raises(pa.ArrowInvalid): + table["strings"].to_pandas(strings_to_categorical=True, + zero_copy_only=True) + # Regression test for ARROW-2101 def test_array_of_bytes_to_strings(self): converted = pa.array(np.array([b'x'], dtype=object), pa.string())
