AlenkaF commented on code in PR #34559:
URL: https://github.com/apache/arrow/pull/34559#discussion_r1142094232
##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -1127,3 +1141,45 @@ def test_cpp_extension_in_python(tmpdir):
reconstructed_array = batch.column(0)
assert reconstructed_array.type == uuid_type
assert reconstructed_array == array
+
+
+def test_extension_to_pandas_storage_type(registered_period_type):
+ period_type, _ = registered_period_type
+ np_arr = np.array([1, 2, 3, 4])
+ storage = pa.array([1, 2, 3, 4], pa.int64())
+ arr = pa.ExtensionArray.from_storage(period_type, storage)
+
+ if isinstance(period_type, PeriodTypeWithToPandasDtype):
+ pandas_dtype = period_type.to_pandas_dtype()
+ else:
+ pandas_dtype = np_arr.dtype
+
+ # Test arrays
+ result = arr.to_pandas()
+ assert result.dtype == pandas_dtype
+
+ # Test the change in ConvertChunkedArrayToPandas
+ chunked_arr = pa.chunked_array([arr])
+ result = chunked_arr.to_numpy()
+ assert result.dtype == np_arr.dtype
+
+ result = chunked_arr.to_pandas()
+ # TODO: to_pandas should take use of to_pandas_dtype
+ # if defined!
+ # assert result.dtype == pandas_dtype
+ assert result.dtype == np_arr.dtype
+
+ # Test the change in ConvertTableToPandas
+ data = [
+ pa.array([1, 2, 3, 4]),
+ pa.array(['foo', 'bar', None, None]),
+ pa.array([True, None, True, False]),
+ arr
+ ]
+ my_schema = pa.schema([('f0', pa.int8()),
+ ('f1', pa.string()),
+ ('f2', pa.bool_()),
+ ('ext', period_type)])
+ table = pa.Table.from_arrays(data, schema=my_schema)
+ result = table.to_pandas()
+ assert result["ext"].dtype == pandas_dtype
Review Comment:
This doesn't work currently (even without the change in this PR). The
problem is that the extension type is unhashable, so we cannot even define a
`types_mapper` with an extension type:
```python
>>> period_type = PeriodType('D')
>>> {period_type: pd.Int64Dtype()}
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: unhashable type: 'PeriodType'
```
The `DataType` base class has the `__hash__` method implemented, so I guess
that should work for the `ExtensionType` subclass also? If we wanted the
`types_mapper` keyword to work for extension types, I guess we would have to
add a `__hash__` method in the definition of the type:
```python
class PeriodType(pa.ExtensionType):
...
def __hash__(self):
return hash(str(self))
```
But using `PeriodType` with the `__hash__` method and the `types_mapper`
(latest main, not this PR), I get an error on the pandas side:
```python
>>> types_mapper = {period_type: pd.Int64Dtype()}.get
>>> table.to_pandas(types_mapper=types_mapper)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "pyarrow/array.pxi", line 835, in
pyarrow.lib._PandasConvertible.to_pandas
return self._to_pandas(options, categories=categories,
File "pyarrow/table.pxi", line 4111, in pyarrow.lib.Table._to_pandas
mgr = table_to_blockmanager(
File "/Users/alenkafrim/repos/arrow/python/pyarrow/pandas_compat.py", line
820, in table_to_blockmanager
blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
File "/Users/alenkafrim/repos/arrow/python/pyarrow/pandas_compat.py", line
1171, in _table_to_blocks
return [_reconstruct_block(item, columns, extension_columns)
File "/Users/alenkafrim/repos/arrow/python/pyarrow/pandas_compat.py", line
1171, in <listcomp>
return [_reconstruct_block(item, columns, extension_columns)
File "/Users/alenkafrim/repos/arrow/python/pyarrow/pandas_compat.py", line
780, in _reconstruct_block
pd_ext_arr = pandas_dtype.__from_arrow__(arr)
File
"/Users/alenkafrim/repos/pyarrow-dev/lib/python3.10/site-packages/pandas/core/arrays/numeric.py",
line 88, in __from_arrow__
raise TypeError(
TypeError: Expected array of Int64 type, got
extension<test.period<PeriodType>> instead
```
This looks like it deserves a separate issue.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]