danking commented on issue #47279:
URL: https://github.com/apache/arrow/issues/47279#issuecomment-3214534792
I have likewise encountered this issue while implementing a Ray Datasource
for the [Vortex file format](https://github.com/vortex-data/vortex). It's somewhat
cheaper for Vortex to yield PyArrow Tables rather than Pandas DataFrames for
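fiddly reasons related to Pandas' Arrow support. The failure occurs when Ray
estimates the block size via `Table.nbytes`, which raises for `string_view`: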
```
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _
pyarrow/table.pxi:6307: in pyarrow.lib.concat_tables
???
test/test_datasource.py:38: in <genexpr>
tbl = pa.concat_tables(pa.Table.from_pydict(x) for x in
ds.iter_batches()) # pyright: ignore[reportUnknownMemberType,
reportArgumentType]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
../.venv/lib/python3.11/site-packages/ray/data/iterator.py:185: in
_create_iterator
) = self._to_ref_bundle_iterator()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
../.venv/lib/python3.11/site-packages/ray/data/_internal/iterator/iterator_impl.py:27:
in _to_ref_bundle_iterator
ref_bundles_iterator, stats = self._base_dataset._execute_to_iterator()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
../.venv/lib/python3.11/site-packages/ray/data/dataset.py:6103: in
_execute_to_iterator
bundle_iter, stats, executor = self._plan.execute_to_iterator()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E ray.exceptions.RayTaskError(ArrowTypeError):
ray::ReadVortex->SplitBlocks(100)() (pid=90297, ip=127.0.0.1)
E for b_out in
map_transformer.apply_transform(iter(blocks), ctx):
E File
"/Users/danielking/projects/vortex/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py",
line 601, in __call__
E for block in blocks:
E File
"/Users/danielking/projects/vortex/.venv/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py",
line 534, in __call__
E while output_buffer.has_next():
E ^^^^^^^^^^^^^^^^^^^^^^^^
E File
"/Users/danielking/projects/vortex/.venv/lib/python3.11/site-packages/ray/data/_internal/output_buffer.py",
line 95, in has_next
E self._exceeded_buffer_row_limit() or
self._exceeded_buffer_size_limit()
E
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E File
"/Users/danielking/projects/vortex/.venv/lib/python3.11/site-packages/ray/data/_internal/output_buffer.py",
line 85, in _exceeded_buffer_size_limit
E and self._buffer.get_estimated_memory_usage()
E ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E File
"/Users/danielking/projects/vortex/.venv/lib/python3.11/site-packages/ray/data/_internal/delegating_block_builder.py",
line 76, in get_estimated_memory_usage
E return self._builder.get_estimated_memory_usage()
E ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E File
"/Users/danielking/projects/vortex/.venv/lib/python3.11/site-packages/ray/data/_internal/table_block.py",
line 155, in get_estimated_memory_usage
E self._tables_size_bytes +=
BlockAccessor.for_block(table).size_bytes()
E
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E File
"/Users/danielking/projects/vortex/.venv/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py",
line 320, in size_bytes
E return self._table.nbytes
E ^^^^^^^^^^^^^^^^^^
E File "pyarrow/table.pxi", line 5303, in
pyarrow.lib.Table.nbytes.__get__
E File "pyarrow/error.pxi", line 155, in
pyarrow.lib.pyarrow_internal_check_status
E File "pyarrow/error.pxi", line 92, in
pyarrow.lib.check_status
E pyarrow.lib.ArrowTypeError: Extracting byte ranges not
supported for type string_view
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]