This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new eb47fd653f GH-35081: [Python] construct pandas.DataFrame with public 
API in `to_pandas` (#40897)
eb47fd653f is described below

commit eb47fd653fbbe03efc18daf5488369cb87752f96
Author: Joris Van den Bossche <jorisvandenboss...@gmail.com>
AuthorDate: Tue Apr 16 09:59:51 2024 +0200

    GH-35081: [Python] construct pandas.DataFrame with public API in 
`to_pandas` (#40897)
    
    ### Rationale for this change
    
    Avoid using pandas internals to create Block objects ourselves, by using a 
new API available for pandas>=3
    
    * GitHub Issue: #35081
    
    Authored-by: Joris Van den Bossche <jorisvandenboss...@gmail.com>
    Signed-off-by: Joris Van den Bossche <jorisvandenboss...@gmail.com>
---
 python/pyarrow/pandas-shim.pxi  |  7 +++-
 python/pyarrow/pandas_compat.py | 75 +++++++++++++++++++++++------------------
 2 files changed, 48 insertions(+), 34 deletions(-)

diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi
index 0409e133ad..74f0d981b5 100644
--- a/python/pyarrow/pandas-shim.pxi
+++ b/python/pyarrow/pandas-shim.pxi
@@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object):
         object _array_like_types, _is_extension_array_dtype, _lock
         bint has_sparse
         bint _pd024
-        bint _is_v1, _is_ge_v21
+        bint _is_v1, _is_ge_v21, _is_ge_v3
 
     def __init__(self):
         self._lock = Lock()
@@ -79,6 +79,7 @@ cdef class _PandasAPIShim(object):
 
         self._is_v1 = self._loose_version < Version('2.0.0')
         self._is_ge_v21 = self._loose_version >= Version('2.1.0')
+        self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0')
 
         self._compat_module = pdcompat
         self._data_frame = pd.DataFrame
@@ -169,6 +170,10 @@ cdef class _PandasAPIShim(object):
         self._check_import()
         return self._is_ge_v21
 
+    def is_ge_v3(self):
+        self._check_import()
+        return self._is_ge_v3
+
     @property
     def categorical_type(self):
         self._check_import()
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 5bd0dfcf6b..00fa19604e 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -676,7 +676,7 @@ def get_datetimetz_type(values, dtype, type_):
 # Converting pyarrow.Table efficiently to pandas.DataFrame
 
 
-def _reconstruct_block(item, columns=None, extension_columns=None):
+def _reconstruct_block(item, columns=None, extension_columns=None, 
return_block=True):
     """
     Construct a pandas Block from the `item` dictionary coming from pyarrow's
     serialization or returned by arrow::python::ConvertTableToPandas.
@@ -709,22 +709,23 @@ def _reconstruct_block(item, columns=None, 
extension_columns=None):
     block_arr = item.get('block', None)
     placement = item['placement']
     if 'dictionary' in item:
-        cat = _pandas_api.categorical_type.from_codes(
+        arr = _pandas_api.categorical_type.from_codes(
             block_arr, categories=item['dictionary'],
             ordered=item['ordered'])
-        block = _int.make_block(cat, placement=placement)
     elif 'timezone' in item:
         unit, _ = np.datetime_data(block_arr.dtype)
         dtype = make_datetimetz(unit, item['timezone'])
         if _pandas_api.is_ge_v21():
-            pd_arr = _pandas_api.pd.array(
+            arr = _pandas_api.pd.array(
                 block_arr.view("int64"), dtype=dtype, copy=False
             )
-            block = _int.make_block(pd_arr, placement=placement)
         else:
-            block = _int.make_block(block_arr, placement=placement,
-                                    klass=_int.DatetimeTZBlock,
-                                    dtype=dtype)
+            arr = block_arr
+            if return_block:
+                block = _int.make_block(block_arr, placement=placement,
+                                        klass=_int.DatetimeTZBlock,
+                                        dtype=dtype)
+                return block
     elif 'py_array' in item:
         # create ExtensionBlock
         arr = item['py_array']
@@ -734,12 +735,14 @@ def _reconstruct_block(item, columns=None, 
extension_columns=None):
         if not hasattr(pandas_dtype, '__from_arrow__'):
             raise ValueError("This column does not support to be converted "
                              "to a pandas ExtensionArray")
-        pd_ext_arr = pandas_dtype.__from_arrow__(arr)
-        block = _int.make_block(pd_ext_arr, placement=placement)
+        arr = pandas_dtype.__from_arrow__(arr)
     else:
-        block = _int.make_block(block_arr, placement=placement)
+        arr = block_arr
 
-    return block
+    if return_block:
+        return _int.make_block(arr, placement=placement)
+    else:
+        return arr, placement
 
 
 def make_datetimetz(unit, tz):
@@ -752,9 +755,6 @@ def make_datetimetz(unit, tz):
 def table_to_dataframe(
     options, table, categories=None, ignore_metadata=False, types_mapper=None
 ):
-    from pandas.core.internals import BlockManager
-    from pandas import DataFrame
-
     all_columns = []
     column_indexes = []
     pandas_metadata = table.schema.pandas_metadata
@@ -774,15 +774,35 @@ def table_to_dataframe(
 
     _check_data_column_metadata_consistency(all_columns)
     columns = _deserialize_column_index(table, all_columns, column_indexes)
-    blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
 
-    axes = [columns, index]
-    mgr = BlockManager(blocks, axes)
-    if _pandas_api.is_ge_v21():
-        df = DataFrame._from_mgr(mgr, mgr.axes)
+    column_names = table.column_names
+    result = pa.lib.table_to_blocks(options, table, categories,
+                                    list(ext_columns_dtypes.keys()))
+    if _pandas_api.is_ge_v3():
+        from pandas.api.internals import create_dataframe_from_blocks
+
+        blocks = [
+            _reconstruct_block(
+                item, column_names, ext_columns_dtypes, return_block=False)
+            for item in result
+        ]
+        df = create_dataframe_from_blocks(blocks, index=index, columns=columns)
+        return df
     else:
-        df = DataFrame(mgr)
-    return df
+        from pandas.core.internals import BlockManager
+        from pandas import DataFrame
+
+        blocks = [
+            _reconstruct_block(item, column_names, ext_columns_dtypes)
+            for item in result
+        ]
+        axes = [columns, index]
+        mgr = BlockManager(blocks, axes)
+        if _pandas_api.is_ge_v21():
+            df = DataFrame._from_mgr(mgr, mgr.axes)
+        else:
+            df = DataFrame(mgr)
+        return df
 
 
 # Set of the string repr of all numpy dtypes that can be stored in a pandas
@@ -1099,17 +1119,6 @@ def _reconstruct_columns_from_metadata(columns, 
column_indexes):
         return pd.Index(new_levels[0], dtype=new_levels[0].dtype, 
name=columns.name)
 
 
-def _table_to_blocks(options, block_table, categories, extension_columns):
-    # Part of table_to_blockmanager
-
-    # Convert an arrow table to Block from the internal pandas API
-    columns = block_table.column_names
-    result = pa.lib.table_to_blocks(options, block_table, categories,
-                                    list(extension_columns.keys()))
-    return [_reconstruct_block(item, columns, extension_columns)
-            for item in result]
-
-
 def _add_any_metadata(table, pandas_metadata):
     modified_columns = {}
     modified_fields = {}

Reply via email to