ghuls commented on issue #286:
URL: https://github.com/apache/arrow-rs/issues/286#issuecomment-865384731
@jorgecarleitao I think I might have figured out the problem.
```python
import polars as pl
import pyarrow as pa
import pandas as pd
# Read Feather file written with pandas, with pa.feather.read_feather (wrapped inside pl.read_ipc), into a Polars dataframe.
df_pl = pl.read_ipc('test_pandas.feather', use_pyarrow=True)
# Convert Polars dataframe to arrow table and write to Feather v2 file
without compression (with pyarrow).
pa.feather.write_feather(df_pl.to_arrow(),
'test_polars_to_arrow_uncompressed.feather', compression='uncompressed',
version=2)
# Convert Polars dataframe to arrow table and write to Feather v2 file with lz4 compression (with pyarrow).
pa.feather.write_feather(df_pl.to_arrow(),
'test_polars_to_arrow_lz4.feather', compression='lz4', version=2)
# Convert Polars dataframe to arrow table and convert arrow table to pandas
dataframe and write to Feather v2 file without compression (with pyarrow).
pa.feather.write_feather(df_pl.to_arrow().to_pandas(),
'test_polars_to_arrow_to_pandas_uncompressed.feather',
compression='uncompressed', version=2)
# Convert Polars dataframe to arrow table and convert arrow table to pandas
dataframe and write to Feather v2 file with lz4 compression (with pyarrow).
pa.feather.write_feather(df_pl.to_arrow().to_pandas(),
'test_polars_to_arrow_to_pandas_lz4.feather', compression='lz4', version=2)
# Now try to read all those files with polars without using the pyarrow
Feather reading code, but the arrow-rs code instead.
# Reading Feather v2 file without compression containing saved arrow table
data, works.
In [9]: pl.read_ipc('test_polars_to_arrow_uncompressed.feather',
use_pyarrow=False)
Out[9]:
shape: (7, 5)
╭────────────────────┬────────┬─────────────────────┬────────────────────┬─────────╮
│ motif1             ┆ motif2 ┆ motif3              ┆ motif4             ┆ regions │
│ ---                ┆ ---    ┆ ---                 ┆ ---                ┆ ---     │
│ f32                ┆ f32    ┆ f32                 ┆ f32                ┆ str     │
╞════════════════════╪════════╪═════════════════════╪════════════════════╪═════════╡
│ 1.2000000476837158 ┆ 3      ┆ 0.30000001192092896 ┆ 5.599999904632568  ┆ "reg1"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 6.699999809265137  ┆ 3      ┆ 4.300000190734863   ┆ 5.599999904632568  ┆ "reg2"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 3.5                ┆ 3      ┆ 0.0                 ┆ 0.0                ┆ "reg3"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 0.0                ┆ 3      ┆ 0.0                 ┆ 5.599999904632568  ┆ "reg4"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 2.4000000953674316 ┆ 3      ┆ 7.800000190734863   ┆ 1.2000000476837158 ┆ "reg5"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 2.4000000953674316 ┆ 3      ┆ 0.6000000238418579  ┆ 0.0                ┆ "reg6"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 2.4000000953674316 ┆ 3      ┆ 7.699999809265137   ┆ 0.0                ┆ "reg7"  │
╰────────────────────┴────────┴─────────────────────┴────────────────────┴─────────╯
# Reading Feather v2 file without compression containing saved pandas
dataframe, works.
In [10]: pl.read_ipc('test_polars_to_arrow_to_pandas_uncompressed.feather',
use_pyarrow=False)
Out[10]:
shape: (7, 5)
╭────────────────────┬────────┬─────────────────────┬────────────────────┬─────────╮
│ motif1             ┆ motif2 ┆ motif3              ┆ motif4             ┆ regions │
│ ---                ┆ ---    ┆ ---                 ┆ ---                ┆ ---     │
│ f32                ┆ f32    ┆ f32                 ┆ f32                ┆ str     │
╞════════════════════╪════════╪═════════════════════╪════════════════════╪═════════╡
│ 1.2000000476837158 ┆ 3      ┆ 0.30000001192092896 ┆ 5.599999904632568  ┆ "reg1"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 6.699999809265137  ┆ 3      ┆ 4.300000190734863   ┆ 5.599999904632568  ┆ "reg2"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 3.5                ┆ 3      ┆ 0.0                 ┆ 0.0                ┆ "reg3"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 0.0                ┆ 3      ┆ 0.0                 ┆ 5.599999904632568  ┆ "reg4"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 2.4000000953674316 ┆ 3      ┆ 7.800000190734863   ┆ 1.2000000476837158 ┆ "reg5"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 2.4000000953674316 ┆ 3      ┆ 0.6000000238418579  ┆ 0.0                ┆ "reg6"  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ 2.4000000953674316 ┆ 3      ┆ 7.699999809265137   ┆ 0.0                ┆ "reg7"  │
╰────────────────────┴────────┴─────────────────────┴────────────────────┴─────────╯
# Reading Feather v2 file with lz4 compression containing saved pandas
dataframe, gives the error from the first post.
In [11]: pl.read_ipc('test_polars_to_arrow_to_pandas_lz4.feather',
use_pyarrow=False)
thread '<unnamed>' panicked at 'assertion failed: prefix.is_empty() &&
suffix.is_empty()',
/github/home/.cargo/git/checkouts/arrow-rs-3b86e19e889d5acc/9f56afb/arrow/src/buffer/immutable.rs:179:9
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
---------------------------------------------------------------------------
PanicException Traceback (most recent call last)
<ipython-input-11-04613b1d0975> in <module>
----> 1 pl.read_ipc('test_polars_to_arrow_to_pandas_lz4.feather',
use_pyarrow=False)
/software/miniconda3/envs/cisTopic/lib/python3.7/site-packages/polars/functions.py
in read_ipc(file, use_pyarrow)
337 """
338 file = _prepare_file_arg(file)
--> 339 return DataFrame.read_ipc(file, use_pyarrow)
340
341
/software/miniconda3/envs/cisTopic/lib/python3.7/site-packages/polars/frame.py
in read_ipc(file, use_pyarrow)
302
303 self = DataFrame.__new__(DataFrame)
--> 304 self._df = PyDataFrame.read_ipc(file)
305 return self
306
PanicException: assertion failed: prefix.is_empty() && suffix.is_empty()
# Reading Feather v2 file with lz4 compression containing saved pyarrow table kills IPython, because it tries to allocate a far too large buffer.
In [12]: pl.read_ipc('test_polars_to_arrow_lz4.feather', use_pyarrow=False)
Out[12]: memory allocation of 2702793507844465093 bytes failed
Aborted
```
So to me it looks like arrow-rs is not detecting that pyarrow saved the
Feather file with compression, and I guess it is reading data (or offsets) from
the wrong locations.
[test_feather_polars_to_pyarrow.zip](https://github.com/apache/arrow-rs/files/6689794/test_feather_polars_to_pyarrow.zip)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]