do-me commented on issue #14229:
URL: https://github.com/apache/arrow/issues/14229#issuecomment-1826259855
Same here. I'm trying to load a 6 GB parquet file with three columns (two string columns
and one embeddings column stored as arrays) in pandas with
```python
import pandas as pd

df = pd.read_parquet("test.parquet")
```
```
File size in bytes: 6207538015 bytes
File size in kilobytes: 6062048.84 KB
```
Tried with pandas 2.0.3 and the latest 2.1.3 on Windows (32 GB RAM) and Ubuntu
(128 GB RAM):
```
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
File <timed exec>:4

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/parquet.py:509, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, **kwargs)
    506     use_nullable_dtypes = False
    507 check_dtype_backend(dtype_backend)
--> 509 return impl.read(
    510     path,
    511     columns=columns,
    512     storage_options=storage_options,
    513     use_nullable_dtypes=use_nullable_dtypes,
    514     dtype_backend=dtype_backend,
    515     **kwargs,
    516 )

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/parquet.py:227, in PyArrowImpl.read(self, path, columns, use_nullable_dtypes, dtype_backend, storage_options, **kwargs)
    220 path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
    221     path,
    222     kwargs.pop("filesystem", None),
    223     storage_options=storage_options,
    224     mode="rb",
    225 )
    226 try:
--> 227     pa_table = self.api.parquet.read_table(
    228         path_or_handle, columns=columns, **kwargs
    229     )
    230     result = pa_table.to_pandas(**to_pandas_kwargs)
    232 if manager == "array":

File ~/anaconda3/lib/python3.11/site-packages/pyarrow/parquet/core.py:2973, in read_table(source, columns, use_threads, metadata, schema, use_pandas_metadata, read_dictionary, memory_map, buffer_size, partitioning, filesystem, filters, use_legacy_dataset, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit)
   2962 # TODO test that source is not a directory or a list
   2963 dataset = ParquetFile(
   2964     source, metadata=metadata, read_dictionary=read_dictionary,
   2965     memory_map=memory_map, buffer_size=buffer_size,
   (...)
   2970     thrift_container_size_limit=thrift_container_size_limit,
   2971 )
-> 2973 return dataset.read(columns=columns, use_threads=use_threads,
   2974                     use_pandas_metadata=use_pandas_metadata)
   2976 warnings.warn(
   2977     "Passing 'use_legacy_dataset=True' to get the legacy behaviour is "
   2978     "deprecated as of pyarrow 8.0.0, and the legacy implementation will "
   2979     "be removed in a future version.",
   2980     FutureWarning, stacklevel=2)
   2982 if ignore_prefixes is not None:

File ~/anaconda3/lib/python3.11/site-packages/pyarrow/parquet/core.py:2601, in _ParquetDatasetV2.read(self, columns, use_threads, use_pandas_metadata)
   2593 index_columns = [
   2594     col for col in _get_pandas_index_columns(metadata)
   2595     if not isinstance(col, dict)
   2596 ]
   2597 columns = (
   2598     list(columns) + list(set(index_columns) - set(columns))
   2599 )
-> 2601 table = self._dataset.to_table(
   2602     columns=columns, filter=self._filter_expression,
   2603     use_threads=use_threads
   2604 )
   2606 # if use_pandas_metadata, restore the pandas metadata (which gets
   2607 # lost if doing a specific `columns` selection in to_table)
   2608 if use_pandas_metadata:

File ~/anaconda3/lib/python3.11/site-packages/pyarrow/_dataset.pyx:369, in pyarrow._dataset.Dataset.to_table()
File ~/anaconda3/lib/python3.11/site-packages/pyarrow/_dataset.pyx:2818, in pyarrow._dataset.Scanner.to_table()
File ~/anaconda3/lib/python3.11/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File ~/anaconda3/lib/python3.11/site-packages/pyarrow/error.pxi:115, in pyarrow.lib.check_status()

OSError: List index overflow.
```
The weird thing is that I've processed 20 of these files with different file
sizes, and even bigger ones than this one (up to 7 GB) worked fine.
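
In case it helps anyone else hitting this: since `OSError: List index overflow` in Arrow typically means the int32 offsets of a plain `list` column are exhausted while the whole file is assembled into a single table, a possible workaround (a sketch only, untested on this particular file, and assuming `test.parquet` was written with more than one row group) is to read the file row group by row group with pyarrow and concatenate the chunks in pandas:

```python
import pandas as pd
import pyarrow.parquet as pq

pf = pq.ParquetFile("test.parquet")

# Reading one row group at a time keeps each chunk's list offsets small,
# which may sidestep the int32 offset overflow in one huge ListArray.
frames = [pf.read_row_group(i).to_pandas() for i in range(pf.num_row_groups)]
df = pd.concat(frames, ignore_index=True)
```

If the file was written as a single row group, `pf.iter_batches()` should allow the same chunked conversion, converting each `RecordBatch` with `.to_pandas()` instead.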