(arrow) branch main updated: GH-45175: [Python] Honor the strings_to_categorical keyword in to_pandas for string view type (#45176)

raulcd Tue, 07 Jan 2025 08:47:27 -0800

This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 2c5ae51a17 GH-45175: [Python] Honor the strings_to_categorical keyword 
in to_pandas for string view type (#45176)
2c5ae51a17 is described below

commit 2c5ae51a17bf8e7f63b393e89e2aeb0c2b75b1b6
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Tue Jan 7 17:47:14 2025 +0100

    GH-45175: [Python] Honor the strings_to_categorical keyword in to_pandas 
for string view type (#45176)
    
    ### Rationale for this change
    
    Currently this keyword works for string or large string:
    
    ```python
    >>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string())})
    >>> table.to_pandas(strings_to_categorical=True).dtypes
    col    category
    dtype: object
    >>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.large_string())})
    >>> table.to_pandas(strings_to_categorical=True).dtypes
    col    category
    dtype: object
    ```
    
    but not for string view:
    
    ```python
    >>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string_view())})
    >>> table.to_pandas(strings_to_categorical=True).dtypes
    col    object
    dtype: object
    ```
    
    For consistency, I think the keyword should be honored for string view
    columns as well.
    
    From https://github.com/apache/arrow/pull/44195/files#r1901831460
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    Yes. When `strings_to_categorical=True` is passed, a column of string_view
    type is now converted to a pandas Categorical.
    
    * GitHub Issue: #45175
    
    Authored-by: Joris Van den Bossche <[email protected]>
    Signed-off-by: Raúl Cumplido <[email protected]>
---
 python/pyarrow/src/arrow/python/arrow_to_pandas.cc |  6 ++--
 python/pyarrow/tests/test_pandas.py                | 32 +++++++++++++++++++---
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc 
b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index 10c4d0e160..a0f1d5bbbe 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -2523,7 +2523,8 @@ Status ConvertCategoricals(const PandasOptions& options, 
ChunkedArrayVector* arr
   }
   if (options.strings_to_categorical) {
     for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
-      if (is_base_binary_like((*arrays)[i]->type()->id())) {
+      if (is_base_binary_like((*arrays)[i]->type()->id()) ||
+          is_binary_view_like((*arrays)[i]->type()->id())) {
         columns_to_encode.push_back(i);
       }
     }
@@ -2557,7 +2558,8 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& 
options,
     py_ref = nullptr;
   }
 
-  if (options.strings_to_categorical && 
is_base_binary_like(arr->type()->id())) {
+  if (options.strings_to_categorical && 
(is_base_binary_like(arr->type()->id()) ||
+                                         
is_binary_view_like(arr->type()->id()))) {
     if (options.zero_copy_only) {
       return Status::Invalid("Need to dictionary encode a column, but ",
                              "only zero-copy conversions allowed");
diff --git a/python/pyarrow/tests/test_pandas.py 
b/python/pyarrow/tests/test_pandas.py
index 1186f87b03..d5c936df07 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -1836,10 +1836,13 @@ class TestConvertStringLikeTypes:
         result = table.to_pandas(categories=['col'])
         assert table.to_pandas().equals(result)
 
-    def test_table_str_to_categorical_without_na(self):
+    @pytest.mark.parametrize(
+        "string_type", [pa.string(), pa.large_string(), pa.string_view()]
+    )
+    def test_table_str_to_categorical_without_na(self, string_type):
         values = ['a', 'a', 'b', 'b', 'c']
         df = pd.DataFrame({'strings': values})
-        field = pa.field('strings', pa.string())
+        field = pa.field('strings', string_type)
         schema = pa.schema([field])
         table = pa.Table.from_pandas(df, schema=schema)
 
@@ -1851,10 +1854,22 @@ class TestConvertStringLikeTypes:
             table.to_pandas(strings_to_categorical=True,
                             zero_copy_only=True)
 
-    def test_table_str_to_categorical_with_na(self):
+        # chunked array
+        result = table["strings"].to_pandas(strings_to_categorical=True)
+        expected = pd.Series(pd.Categorical(values), name="strings")
+        tm.assert_series_equal(result, expected)
+
+        with pytest.raises(pa.ArrowInvalid):
+            table["strings"].to_pandas(strings_to_categorical=True,
+                                       zero_copy_only=True)
+
+    @pytest.mark.parametrize(
+        "string_type", [pa.string(), pa.large_string(), pa.string_view()]
+    )
+    def test_table_str_to_categorical_with_na(self, string_type):
         values = [None, 'a', 'b', np.nan]
         df = pd.DataFrame({'strings': values})
-        field = pa.field('strings', pa.string())
+        field = pa.field('strings', string_type)
         schema = pa.schema([field])
         table = pa.Table.from_pandas(df, schema=schema)
 
@@ -1866,6 +1881,15 @@ class TestConvertStringLikeTypes:
             table.to_pandas(strings_to_categorical=True,
                             zero_copy_only=True)
 
+        # chunked array
+        result = table["strings"].to_pandas(strings_to_categorical=True)
+        expected = pd.Series(pd.Categorical(values), name="strings")
+        tm.assert_series_equal(result, expected)
+
+        with pytest.raises(pa.ArrowInvalid):
+            table["strings"].to_pandas(strings_to_categorical=True,
+                                       zero_copy_only=True)
+
     # Regression test for ARROW-2101
     def test_array_of_bytes_to_strings(self):
         converted = pa.array(np.array([b'x'], dtype=object), pa.string())

(arrow) branch main updated: GH-45175: [Python] Honor the strings_to_categorical keyword in to_pandas for string view type (#45176)

Reply via email to