This is an automated email from the ASF dual-hosted git repository.
raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2c5ae51a17 GH-45175: [Python] Honor the strings_to_categorical keyword
in to_pandas for string view type (#45176)
2c5ae51a17 is described below
commit 2c5ae51a17bf8e7f63b393e89e2aeb0c2b75b1b6
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Tue Jan 7 17:47:14 2025 +0100
GH-45175: [Python] Honor the strings_to_categorical keyword in to_pandas
for string view type (#45176)
### Rationale for this change
Currently, this keyword is honored for string and large string columns:
```python
>>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string())})
>>> table.to_pandas(strings_to_categorical=True).dtypes
col category
dtype: object
>>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.large_string())})
>>> table.to_pandas(strings_to_categorical=True).dtypes
col category
dtype: object
```
but not for string view:
```python
>>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string_view())})
>>> table.to_pandas(strings_to_categorical=True).dtypes
col object
dtype: object
```
For consistency, I think this keyword should be honored for string view
columns as well.
From https://github.com/apache/arrow/pull/44195/files#r1901831460
### Are these changes tested?
Yes
### Are there any user-facing changes?
Yes. When `strings_to_categorical=True` is passed and a column has a
string_view type, that column is now converted to a pandas Categorical.
* GitHub Issue: #45175
Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
---
python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 6 ++--
python/pyarrow/tests/test_pandas.py | 32 +++++++++++++++++++---
2 files changed, 32 insertions(+), 6 deletions(-)
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index 10c4d0e160..a0f1d5bbbe 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -2523,7 +2523,8 @@ Status ConvertCategoricals(const PandasOptions& options,
ChunkedArrayVector* arr
}
if (options.strings_to_categorical) {
for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
- if (is_base_binary_like((*arrays)[i]->type()->id())) {
+ if (is_base_binary_like((*arrays)[i]->type()->id()) ||
+ is_binary_view_like((*arrays)[i]->type()->id())) {
columns_to_encode.push_back(i);
}
}
@@ -2557,7 +2558,8 @@ Status ConvertChunkedArrayToPandas(const PandasOptions&
options,
py_ref = nullptr;
}
- if (options.strings_to_categorical &&
is_base_binary_like(arr->type()->id())) {
+ if (options.strings_to_categorical &&
(is_base_binary_like(arr->type()->id()) ||
+
is_binary_view_like(arr->type()->id()))) {
if (options.zero_copy_only) {
return Status::Invalid("Need to dictionary encode a column, but ",
"only zero-copy conversions allowed");
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index 1186f87b03..d5c936df07 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -1836,10 +1836,13 @@ class TestConvertStringLikeTypes:
result = table.to_pandas(categories=['col'])
assert table.to_pandas().equals(result)
- def test_table_str_to_categorical_without_na(self):
+ @pytest.mark.parametrize(
+ "string_type", [pa.string(), pa.large_string(), pa.string_view()]
+ )
+ def test_table_str_to_categorical_without_na(self, string_type):
values = ['a', 'a', 'b', 'b', 'c']
df = pd.DataFrame({'strings': values})
- field = pa.field('strings', pa.string())
+ field = pa.field('strings', string_type)
schema = pa.schema([field])
table = pa.Table.from_pandas(df, schema=schema)
@@ -1851,10 +1854,22 @@ class TestConvertStringLikeTypes:
table.to_pandas(strings_to_categorical=True,
zero_copy_only=True)
- def test_table_str_to_categorical_with_na(self):
+ # chunked array
+ result = table["strings"].to_pandas(strings_to_categorical=True)
+ expected = pd.Series(pd.Categorical(values), name="strings")
+ tm.assert_series_equal(result, expected)
+
+ with pytest.raises(pa.ArrowInvalid):
+ table["strings"].to_pandas(strings_to_categorical=True,
+ zero_copy_only=True)
+
+ @pytest.mark.parametrize(
+ "string_type", [pa.string(), pa.large_string(), pa.string_view()]
+ )
+ def test_table_str_to_categorical_with_na(self, string_type):
values = [None, 'a', 'b', np.nan]
df = pd.DataFrame({'strings': values})
- field = pa.field('strings', pa.string())
+ field = pa.field('strings', string_type)
schema = pa.schema([field])
table = pa.Table.from_pandas(df, schema=schema)
@@ -1866,6 +1881,15 @@ class TestConvertStringLikeTypes:
table.to_pandas(strings_to_categorical=True,
zero_copy_only=True)
+ # chunked array
+ result = table["strings"].to_pandas(strings_to_categorical=True)
+ expected = pd.Series(pd.Categorical(values), name="strings")
+ tm.assert_series_equal(result, expected)
+
+ with pytest.raises(pa.ArrowInvalid):
+ table["strings"].to_pandas(strings_to_categorical=True,
+ zero_copy_only=True)
+
# Regression test for ARROW-2101
def test_array_of_bytes_to_strings(self):
converted = pa.array(np.array([b'x'], dtype=object), pa.string())