[ 
https://issues.apache.org/jira/browse/ARROW-7980?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Søren Fuglede Jørgensen updated ARROW-7980:
-------------------------------------------
    Description: 
When following the [procedure outlined 
here](https://stackoverflow.com/a/57986261/5085211) to use `pyarrow` to 
serialize/deserialize pandas data frames, the below example fails with the 
given traceback:

{{
import pandas as pd
 import pyarrow as pa
 df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}])
 df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
 df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
 context = pa.default_serialization_context()
 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())

--------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-9-6f75cc47c6d5> in <module>
----> 1 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.deserialize()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.deserialize_from()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.SerializedPyObject.deserialize()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.SerializationContext._deserialize_callback()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.py 
in _deserialize_pandas_dataframe(data)
    167 
    168     def _deserialize_pandas_dataframe(data):
--> 169         return pdcompat.serialized_dict_to_dataframe(data)
    170 
    171     def _serialize_pandas_series(obj):

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in serialized_dict_to_dataframe(data)
    661 def serialized_dict_to_dataframe(data):
    662     import pandas.core.internals as _int
--> 663     reconstructed_blocks = [_reconstruct_block(block)
    664                             for block in data['blocks']]
    665 

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in <listcomp>(.0)
    661 def serialized_dict_to_dataframe(data):
    662     import pandas.core.internals as _int
--> 663     reconstructed_blocks = [_reconstruct_block(block)
    664                             for block in data['blocks']]
    665 

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in _reconstruct_block(item, columns, extension_columns)
    707                                 klass=_int.CategoricalBlock)
    708     elif 'timezone' in item:
--> 709         dtype = make_datetimetz(item['timezone'])
    710         block = _int.make_block(block_arr, placement=placement,
    711                                 klass=_int.DatetimeTZBlock,

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in make_datetimetz(tz)
    734 def make_datetimetz(tz):
    735     tz = pa.lib.string_to_tzinfo(tz)
--> 736     return _pandas_api.datetimetz_type('ns', tz=tz)
    737 
    738 

TypeError: 'NoneType' object is not callable

}}

Interestingly, if I comment out the two `pd.to_datetime` lines, the example 
works (perhaps unsurprisingly). But if I then run the original reproducing 
example again afterwards, with those lines included, it suddenly works too. 
That is, the following works:

{{
 import pandas as pd 
 import pyarrow as pa 
 df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}])
 context = pa.default_serialization_context()
 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
 
 df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}])
 df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
 df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
 context = pa.default_serialization_context()
 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
}}

The issue occurs with pyarrow 0.16.0, and in both pandas 0.25.3 and 1.0.1.

  was:
When following the [procedure outlined 
here]([https://stackoverflow.com/a/57986261/5085211]) to use `pyarrow` to 
serialize/deserialize pandas data frames, the below example fails with the 
given traceback:

{{import pandas as pd
 import pyarrow as pa
 df = pd.DataFrame([

{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}

])
 df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
 df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
 context = pa.default_serialization_context()
 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())

--------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-9-6f75cc47c6d5> in <module>
----> 1 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.deserialize()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.deserialize_from()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.SerializedPyObject.deserialize()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.SerializationContext._deserialize_callback()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.py 
in _deserialize_pandas_dataframe(data)
 167 
 168 def _deserialize_pandas_dataframe(data):
--> 169 return pdcompat.serialized_dict_to_dataframe(data)
 170 
 171 def _serialize_pandas_series(obj):

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in serialized_dict_to_dataframe(data)
 661 def serialized_dict_to_dataframe(data):
 662 import pandas.core.internals as _int
--> 663 reconstructed_blocks = [_reconstruct_block(block)
 664 for block in data['blocks']]
 665

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in <listcomp>(.0)
 661 def serialized_dict_to_dataframe(data):
 662 import pandas.core.internals as _int
--> 663 reconstructed_blocks = [_reconstruct_block(block)
 664 for block in data['blocks']]
 665

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in _reconstruct_block(item, columns, extension_columns)
 707 klass=_int.CategoricalBlock)
 708 elif 'timezone' in item:
--> 709 dtype = make_datetimetz(item['timezone'])
 710 block = _int.make_block(block_arr, placement=placement,
 711 klass=_int.DatetimeTZBlock,

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in make_datetimetz(tz)
 734 def make_datetimetz(tz):
 735 tz = pa.lib.string_to_tzinfo(tz)
--> 736 return _pandas_api.datetimetz_type('ns', tz=tz)
 737 
 738

TypeError: 'NoneType' object is not callable

}}

Perhaps interestingly, if I comment out the two `pd.to_datetime` lines, the 
thing works (perhaps unsurprisingly), but if I then include them again, the 
original reproducing example all of a sudden works. That is, this works:

{{
 import pandas as pd 
 import pyarrow as pa 
 df = pd.DataFrame([\{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}])
 context = pa.default_serialization_context()
 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
 
 df = pd.DataFrame([\{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}])
 df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
 df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
 context = pa.default_serialization_context()
 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())}}

The issue occurs with pyarrow 0.16.0, and in both pandas 0.25.3 and 1.0.1.


> Deserialization with pyarrow fails for certain Timestamp-based data frame
> -------------------------------------------------------------------------
>
>                 Key: ARROW-7980
>                 URL: https://issues.apache.org/jira/browse/ARROW-7980
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 0.16.0
>            Reporter: Søren Fuglede Jørgensen
>            Priority: Major
>
> When following the [procedure outlined 
> here](https://stackoverflow.com/a/57986261/5085211) to use `pyarrow` to 
> serialize/deserialize pandas data frames, the below example fails with the 
> given traceback:
> {{
> import pandas as pd
>  import pyarrow as pa
>  df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 
> 'Minutes5DK': '2020-02-25T22:15:00'}])
>  df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
>  df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
>  context = pa.default_serialization_context()
>  pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
> --------------------------------------------------------------------------
> TypeError                                 Traceback (most recent call last)
> <ipython-input-9-6f75cc47c6d5> in <module>
> ----> 1 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
>  in pyarrow.lib.deserialize()
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
>  in pyarrow.lib.deserialize_from()
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
>  in pyarrow.lib.SerializedPyObject.deserialize()
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
>  in pyarrow.lib.SerializationContext._deserialize_callback()
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.py
>  in _deserialize_pandas_dataframe(data)
>     167 
>     168     def _deserialize_pandas_dataframe(data):
> --> 169         return pdcompat.serialized_dict_to_dataframe(data)
>     170 
>     171     def _serialize_pandas_series(obj):
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py
>  in serialized_dict_to_dataframe(data)
>     661 def serialized_dict_to_dataframe(data):
>     662     import pandas.core.internals as _int
> --> 663     reconstructed_blocks = [_reconstruct_block(block)
>     664                             for block in data['blocks']]
>     665 
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py
>  in <listcomp>(.0)
>     661 def serialized_dict_to_dataframe(data):
>     662     import pandas.core.internals as _int
> --> 663     reconstructed_blocks = [_reconstruct_block(block)
>     664                             for block in data['blocks']]
>     665 
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py
>  in _reconstruct_block(item, columns, extension_columns)
>     707                                 klass=_int.CategoricalBlock)
>     708     elif 'timezone' in item:
> --> 709         dtype = make_datetimetz(item['timezone'])
>     710         block = _int.make_block(block_arr, placement=placement,
>     711                                 klass=_int.DatetimeTZBlock,
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py
>  in make_datetimetz(tz)
>     734 def make_datetimetz(tz):
>     735     tz = pa.lib.string_to_tzinfo(tz)
> --> 736     return _pandas_api.datetimetz_type('ns', tz=tz)
>     737 
>     738 
> TypeError: 'NoneType' object is not callable
> }}
> Interestingly, if I comment out the two `pd.to_datetime` lines, the example 
> works (perhaps unsurprisingly). But if I then run the original reproducing 
> example again afterwards, with those lines included, it suddenly works too. 
> That is, the following works:
> {{
>  import pandas as pd 
>  import pyarrow as pa 
>  df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 
> 'Minutes5DK': '2020-02-25T22:15:00'}])
>  context = pa.default_serialization_context()
>  pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
>  
>  df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 
> 'Minutes5DK': '2020-02-25T22:15:00'}])
>  df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
>  df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
>  context = pa.default_serialization_context()
>  pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
> }}
> The issue occurs with pyarrow 0.16.0, and in both pandas 0.25.3 and 1.0.1.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to