This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new eb47fd653f GH-35081: [Python] construct pandas.DataFrame with public
API in `to_pandas` (#40897)
eb47fd653f is described below
commit eb47fd653fbbe03efc18daf5488369cb87752f96
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Tue Apr 16 09:59:51 2024 +0200
GH-35081: [Python] construct pandas.DataFrame with public API in
`to_pandas` (#40897)
### Rationale for this change
Avoiding using pandas internals to create Block objects ourselves, using a
new API for pandas>=3
* GitHub Issue: #35081
Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
python/pyarrow/pandas-shim.pxi | 7 +++-
python/pyarrow/pandas_compat.py | 75 +++++++++++++++++++++++------------------
2 files changed, 48 insertions(+), 34 deletions(-)
diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi
index 0409e133ad..74f0d981b5 100644
--- a/python/pyarrow/pandas-shim.pxi
+++ b/python/pyarrow/pandas-shim.pxi
@@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object):
object _array_like_types, _is_extension_array_dtype, _lock
bint has_sparse
bint _pd024
- bint _is_v1, _is_ge_v21
+ bint _is_v1, _is_ge_v21, _is_ge_v3
def __init__(self):
self._lock = Lock()
@@ -79,6 +79,7 @@ cdef class _PandasAPIShim(object):
self._is_v1 = self._loose_version < Version('2.0.0')
self._is_ge_v21 = self._loose_version >= Version('2.1.0')
+ self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0')
self._compat_module = pdcompat
self._data_frame = pd.DataFrame
@@ -169,6 +170,10 @@ cdef class _PandasAPIShim(object):
self._check_import()
return self._is_ge_v21
+ def is_ge_v3(self):
+ self._check_import()
+ return self._is_ge_v3
+
@property
def categorical_type(self):
self._check_import()
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 5bd0dfcf6b..00fa19604e 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -676,7 +676,7 @@ def get_datetimetz_type(values, dtype, type_):
# Converting pyarrow.Table efficiently to pandas.DataFrame
-def _reconstruct_block(item, columns=None, extension_columns=None):
+def _reconstruct_block(item, columns=None, extension_columns=None,
+                       return_block=True):
"""
Construct a pandas Block from the `item` dictionary coming from pyarrow's
serialization or returned by arrow::python::ConvertTableToPandas.
@@ -709,22 +709,23 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
block_arr = item.get('block', None)
placement = item['placement']
if 'dictionary' in item:
- cat = _pandas_api.categorical_type.from_codes(
+ arr = _pandas_api.categorical_type.from_codes(
block_arr, categories=item['dictionary'],
ordered=item['ordered'])
- block = _int.make_block(cat, placement=placement)
elif 'timezone' in item:
unit, _ = np.datetime_data(block_arr.dtype)
dtype = make_datetimetz(unit, item['timezone'])
if _pandas_api.is_ge_v21():
- pd_arr = _pandas_api.pd.array(
+ arr = _pandas_api.pd.array(
block_arr.view("int64"), dtype=dtype, copy=False
)
- block = _int.make_block(pd_arr, placement=placement)
else:
- block = _int.make_block(block_arr, placement=placement,
- klass=_int.DatetimeTZBlock,
- dtype=dtype)
+ arr = block_arr
+ if return_block:
+ block = _int.make_block(block_arr, placement=placement,
+ klass=_int.DatetimeTZBlock,
+ dtype=dtype)
+ return block
elif 'py_array' in item:
# create ExtensionBlock
arr = item['py_array']
@@ -734,12 +735,14 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
if not hasattr(pandas_dtype, '__from_arrow__'):
raise ValueError("This column does not support to be converted "
"to a pandas ExtensionArray")
- pd_ext_arr = pandas_dtype.__from_arrow__(arr)
- block = _int.make_block(pd_ext_arr, placement=placement)
+ arr = pandas_dtype.__from_arrow__(arr)
else:
- block = _int.make_block(block_arr, placement=placement)
+ arr = block_arr
- return block
+ if return_block:
+ return _int.make_block(arr, placement=placement)
+ else:
+ return arr, placement
def make_datetimetz(unit, tz):
@@ -752,9 +755,6 @@ def make_datetimetz(unit, tz):
def table_to_dataframe(
options, table, categories=None, ignore_metadata=False, types_mapper=None
):
- from pandas.core.internals import BlockManager
- from pandas import DataFrame
-
all_columns = []
column_indexes = []
pandas_metadata = table.schema.pandas_metadata
@@ -774,15 +774,35 @@ def table_to_dataframe(
_check_data_column_metadata_consistency(all_columns)
columns = _deserialize_column_index(table, all_columns, column_indexes)
- blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
- axes = [columns, index]
- mgr = BlockManager(blocks, axes)
- if _pandas_api.is_ge_v21():
- df = DataFrame._from_mgr(mgr, mgr.axes)
+ column_names = table.column_names
+ result = pa.lib.table_to_blocks(options, table, categories,
+ list(ext_columns_dtypes.keys()))
+ if _pandas_api.is_ge_v3():
+ from pandas.api.internals import create_dataframe_from_blocks
+
+ blocks = [
+ _reconstruct_block(
+ item, column_names, ext_columns_dtypes, return_block=False)
+ for item in result
+ ]
+ df = create_dataframe_from_blocks(blocks, index=index, columns=columns)
+ return df
else:
- df = DataFrame(mgr)
- return df
+ from pandas.core.internals import BlockManager
+ from pandas import DataFrame
+
+ blocks = [
+ _reconstruct_block(item, column_names, ext_columns_dtypes)
+ for item in result
+ ]
+ axes = [columns, index]
+ mgr = BlockManager(blocks, axes)
+ if _pandas_api.is_ge_v21():
+ df = DataFrame._from_mgr(mgr, mgr.axes)
+ else:
+ df = DataFrame(mgr)
+ return df
# Set of the string repr of all numpy dtypes that can be stored in a pandas
@@ -1099,17 +1119,6 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
return pd.Index(new_levels[0], dtype=new_levels[0].dtype,
name=columns.name)
-def _table_to_blocks(options, block_table, categories, extension_columns):
- # Part of table_to_blockmanager
-
- # Convert an arrow table to Block from the internal pandas API
- columns = block_table.column_names
- result = pa.lib.table_to_blocks(options, block_table, categories,
- list(extension_columns.keys()))
- return [_reconstruct_block(item, columns, extension_columns)
- for item in result]
-
-
def _add_any_metadata(table, pandas_metadata):
modified_columns = {}
modified_fields = {}