ghuls edited a comment on issue #286:
URL: https://github.com/apache/arrow-rs/issues/286#issuecomment-865384731


   @jorgecarleitao I think I might have figured out the problem.
   
   ```python
   import polars as pl
   import pyarrow as pa
   import pandas as pd
   
   # Read Feather file written with pandas, with pa.feather.read_feather 
(wrapped inside pl.read_ipc) into a Polars dataframe.
   df_pl = pl.read_ipc('test_pandas.feather', use_pyarrow=True)
   
   # Convert Polars dataframe to arrow table and write to Feather v2 file 
without compression (with pyarrow).
   pa.feather.write_feather(df_pl.to_arrow(), 
'test_polars_to_arrow_uncompressed.feather', compression='uncompressed', 
version=2)
   
   # Convert Polars dataframe to arrow table and write to Feather v2 file 
with lz4 compression (with pyarrow).
   pa.feather.write_feather(df_pl.to_arrow(), 
'test_polars_to_arrow_lz4.feather', compression='lz4', version=2)
   
   # Convert Polars dataframe to arrow table and convert arrow table to pandas 
dataframe and write to Feather v2 file without compression (with pyarrow).
   pa.feather.write_feather(df_pl.to_arrow().to_pandas(), 
'test_polars_to_arrow_to_pandas_uncompressed.feather', 
compression='uncompressed', version=2)
   
   # Convert Polars dataframe to arrow table and convert arrow table to pandas 
dataframe and write to Feather v2 file with lz4 compression (with pyarrow).
   pa.feather.write_feather(df_pl.to_arrow().to_pandas(), 
'test_polars_to_arrow_to_pandas_lz4.feather', compression='lz4', version=2)
   
   
   # Now try to read all those files with polars without using the pyarrow 
Feather reading code, but the arrow-rs code instead.
   
   # Reading Feather v2 file without compression containing saved arrow table 
data, works.
   In [9]: pl.read_ipc('test_polars_to_arrow_uncompressed.feather', 
use_pyarrow=False)
   Out[9]: 
   shape: (7, 5)
   
╭────────────────────┬────────┬─────────────────────┬────────────────────┬─────────╮
   │ motif1             ┆ motif2 ┆ motif3              ┆ motif4             ┆ 
regions │
   │ ---                ┆ ---    ┆ ---                 ┆ ---                ┆ 
---     │
   │ f32                ┆ f32    ┆ f32                 ┆ f32                ┆ 
str     │
   
╞════════════════════╪════════╪═════════════════════╪════════════════════╪═════════╡
   │ 1.2000000476837158 ┆ 3      ┆ 0.30000001192092896 ┆ 5.599999904632568  ┆ 
"reg1"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 6.699999809265137  ┆ 3      ┆ 4.300000190734863   ┆ 5.599999904632568  ┆ 
"reg2"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 3.5                ┆ 3      ┆ 0.0                 ┆ 0.0                ┆ 
"reg3"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 0.0                ┆ 3      ┆ 0.0                 ┆ 5.599999904632568  ┆ 
"reg4"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 2.4000000953674316 ┆ 3      ┆ 7.800000190734863   ┆ 1.2000000476837158 ┆ 
"reg5"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 2.4000000953674316 ┆ 3      ┆ 0.6000000238418579  ┆ 0.0                ┆ 
"reg6"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 2.4000000953674316 ┆ 3      ┆ 7.699999809265137   ┆ 0.0                ┆ 
"reg7"  │
   
╰────────────────────┴────────┴─────────────────────┴────────────────────┴─────────╯
   
   
   # Reading Feather v2 file without compression containing saved pandas 
dataframe, works.
   In [10]: pl.read_ipc('test_polars_to_arrow_to_pandas_uncompressed.feather', 
use_pyarrow=False)
   Out[10]: 
   shape: (7, 5)
   
╭────────────────────┬────────┬─────────────────────┬────────────────────┬─────────╮
   │ motif1             ┆ motif2 ┆ motif3              ┆ motif4             ┆ 
regions │
   │ ---                ┆ ---    ┆ ---                 ┆ ---                ┆ 
---     │
   │ f32                ┆ f32    ┆ f32                 ┆ f32                ┆ 
str     │
   
╞════════════════════╪════════╪═════════════════════╪════════════════════╪═════════╡
   │ 1.2000000476837158 ┆ 3      ┆ 0.30000001192092896 ┆ 5.599999904632568  ┆ 
"reg1"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 6.699999809265137  ┆ 3      ┆ 4.300000190734863   ┆ 5.599999904632568  ┆ 
"reg2"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 3.5                ┆ 3      ┆ 0.0                 ┆ 0.0                ┆ 
"reg3"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 0.0                ┆ 3      ┆ 0.0                 ┆ 5.599999904632568  ┆ 
"reg4"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 2.4000000953674316 ┆ 3      ┆ 7.800000190734863   ┆ 1.2000000476837158 ┆ 
"reg5"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 2.4000000953674316 ┆ 3      ┆ 0.6000000238418579  ┆ 0.0                ┆ 
"reg6"  │
   
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
   │ 2.4000000953674316 ┆ 3      ┆ 7.699999809265137   ┆ 0.0                ┆ 
"reg7"  │
   
╰────────────────────┴────────┴─────────────────────┴────────────────────┴─────────╯
   
   
   # Reading Feather v2 file with lz4 compression containing saved pandas 
dataframe, gives the error from the first post.
   In [11]: pl.read_ipc('test_polars_to_arrow_to_pandas_lz4.feather', 
use_pyarrow=False)
   thread '<unnamed>' panicked at 'assertion failed: prefix.is_empty() && 
suffix.is_empty()', 
/github/home/.cargo/git/checkouts/arrow-rs-3b86e19e889d5acc/9f56afb/arrow/src/buffer/immutable.rs:179:9
   note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
   ---------------------------------------------------------------------------
   PanicException                            Traceback (most recent call last)
   <ipython-input-11-04613b1d0975> in <module>
   ----> 1 pl.read_ipc('test_polars_to_arrow_to_pandas_lz4.feather', 
use_pyarrow=False)
   
/software/miniconda3/envs/cisTopic/lib/python3.7/site-packages/polars/functions.py
 in read_ipc(file, use_pyarrow)
       337     """
       338     file = _prepare_file_arg(file)
   --> 339     return DataFrame.read_ipc(file, use_pyarrow)
       340 
       341 
   
   
/software/miniconda3/envs/cisTopic/lib/python3.7/site-packages/polars/frame.py 
in read_ipc(file, use_pyarrow)
       302 
       303         self = DataFrame.__new__(DataFrame)
   --> 304         self._df = PyDataFrame.read_ipc(file)
       305         return self
       306 
   
   PanicException: assertion failed: prefix.is_empty() && suffix.is_empty()
   
   
   # Reading Feather v2 file with lz4 compression containing saved pyarrow 
table, kills IPython because it tries to allocate a buffer that is far too big.
   In [12]: pl.read_ipc('test_polars_to_arrow_lz4.feather', use_pyarrow=False)
   Out[12]: memory allocation of 2702793507844465093 bytes failed
   Aborted
   ```
   
   So to me it looks like arrow-rs is not detecting that pyarrow saved the 
Feather file with lz4 compression, and I guess it is reading data (or offsets) 
from the wrong locations.
   
   ```python
   In [6]: ?pa.feather.write_feather
   Signature:
   pa.feather.write_feather(
       df,
       dest,
       compression=None,
       compression_level=None,
       chunksize=None,
       version=2,
   )
   Docstring:
   Write a pandas.DataFrame to Feather format.
   
   Parameters
   ----------
   df : pandas.DataFrame or pyarrow.Table
       Data to write out as Feather format.
   dest : str
       Local destination path.
   compression : string, default None
       Can be one of {"zstd", "lz4", "uncompressed"}. The default of None uses
       LZ4 for V2 files if it is available, otherwise uncompressed.
   compression_level : int, default None
       Use a compression level particular to the chosen compressor. If None
       use the default compression level
   chunksize : int, default None
       For V2 files, the internal maximum size of Arrow RecordBatch chunks
       when writing the Arrow IPC file format. None means use the default,
       which is currently 64K
   version : int, default 2
       Feather file version. Version 2 is the current. Version 1 is the more
       limited legacy format
   File:      
/software/miniconda3/envs/cisTopic/lib/python3.7/site-packages/pyarrow/feather.py
   Type:      function
   ```
   
   Feather files are attached:
   
[test_feather_polars_to_pyarrow.zip](https://github.com/apache/arrow-rs/files/6689794/test_feather_polars_to_pyarrow.zip)
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to