[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16423461#comment-16423461 ] ASF GitHub Bot commented on ARROW-2122: --- xhochy closed pull request #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp. URL: https://github.com/apache/arrow/pull/1707 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 0bc47fc0d..fad943cce 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -128,7 +128,7 @@ def get_extension_dtype_info(column): } physical_dtype = str(cats.codes.dtype) elif hasattr(dtype, 'tz'): -metadata = {'timezone': str(dtype.tz)} +metadata = {'timezone': pa.lib.tzinfo_to_string(dtype.tz)} physical_dtype = 'datetime64[ns]' else: metadata = None @@ -418,7 +418,7 @@ def dataframe_to_serialized_dict(frame): block_data = {} if isinstance(block, _int.DatetimeTZBlock): -block_data['timezone'] = values.tz.zone +block_data['timezone'] = pa.lib.tzinfo_to_string(values.tz) values = values.values elif isinstance(block, _int.CategoricalBlock): block_data.update(dictionary=values.categories, @@ -482,6 +482,7 @@ def _reconstruct_block(item): def _make_datetimetz(tz): from pyarrow.compat import DatetimeTZDtype +tz = pa.lib.string_to_tzinfo(tz) return DatetimeTZDtype('ns', tz=tz) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index a801acd69..64d44a967 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -235,8 +235,7 @@ cdef class TimestampValue(ArrayValue): value = self.value if not dtype.timezone().empty(): -import pytz -tzinfo = pytz.timezone(frombytes(dtype.timezone())) +tzinfo = 
string_to_tzinfo(frombytes(dtype.timezone())) else: tzinfo = None diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index c27c0edd9..de7ddefc5 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -300,9 +300,11 @@ cdef class Column: result = pd.Series(values, name=self.name) if isinstance(self.type, TimestampType): -if self.type.tz is not None: +tz = self.type.tz +if tz is not None: +tz = string_to_tzinfo(tz) result = (result.dt.tz_localize('utc') - .dt.tz_convert(self.type.tz)) + .dt.tz_convert(tz)) return result diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 19b59a49b..988d512a8 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -26,6 +26,7 @@ import itertools import numpy as np import six +import pytz int_type_pairs = [ @@ -649,3 +650,14 @@ def test_decimal_array_with_none_and_nan(): array = pa.array(values, type=pa.decimal128(10, 4)) assert array.to_pylist() == [decimal.Decimal('1.2340'), None, None, None] + + +@pytest.mark.parametrize('tz,name', [ +(pytz.FixedOffset(90), '+01:30'), +(pytz.FixedOffset(-90), '-01:30'), +(pytz.utc, 'UTC'), +(pytz.timezone('America/New_York'), 'America/New_York') +]) +def test_timezone_string(tz, name): +assert pa.lib.tzinfo_to_string(tz) == name +assert pa.lib.string_to_tzinfo(name) == tz diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 5abc026bf..f90d805be 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -1000,6 +1000,16 @@ def test_array_from_pandas_date_with_mask(self): expected = pd.Series([None, date(1991, 1, 1), None]) assert pa.Array.from_pandas(expected).equals(result) +def test_fixed_offset_timezone(self): +df = pd.DataFrame({ +'a': [ +pd.Timestamp('2012-11-11 00:00:00+01:00'), +pd.NaT +] + }) +_check_pandas_roundtrip(df) 
+_check_serialize_components_roundtrip(df) + class TestConvertStringLikeTypes(object): """ diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 5f962901c..1ff42d70f 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +import re + # These are imprecise because the type (in pandas 0.x) depends
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16422475#comment-16422475 ] ASF GitHub Bot commented on ARROW-2122: --- adshieh commented on a change in pull request #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp. URL: https://github.com/apache/arrow/pull/1707#discussion_r178545777 ## File path: python/pyarrow/types.pxi ## @@ -847,6 +849,63 @@ cdef timeunit_to_string(TimeUnit unit): return 'ns' +_FIXED_OFFSET_RE = re.compile(r'([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])') Review comment: Good catch, thanks! This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Reporter: Robert Nishihara > Assignee: Albert Shieh > Priority: Major > Labels: pull-request-available > Fix For: 0.10.0 > > > The bug can be reproduced as follows. 
> {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16422080#comment-16422080 ] ASF GitHub Bot commented on ARROW-2122: --- pitrou commented on a change in pull request #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp. URL: https://github.com/apache/arrow/pull/1707#discussion_r178472077 ## File path: python/pyarrow/types.pxi ## @@ -847,6 +849,63 @@ cdef timeunit_to_string(TimeUnit unit): return 'ns' +_FIXED_OFFSET_RE = re.compile(r'([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])') Review comment: I *think* this needs a `$` at the end to avoid ignoring any trailing text. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Reporter: Robert Nishihara > Assignee: Albert Shieh > Priority: Major > Labels: pull-request-available > Fix For: 0.10.0 > > > The bug can be reproduced as follows. 
> {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16421775#comment-16421775 ] ASF GitHub Bot commented on ARROW-2122: --- adshieh commented on a change in pull request #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp. URL: https://github.com/apache/arrow/pull/1707#discussion_r178466966 ## File path: python/pyarrow/types.pxi ## @@ -847,6 +847,25 @@ cdef timeunit_to_string(TimeUnit unit): return 'ns' +FIXED_OFFSET_PREFIX = '+' + + +def tzinfo_to_string(tz): Review comment: Done! This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Reporter: Robert Nishihara > Assignee: Albert Shieh > Priority: Major > Labels: pull-request-available > Fix For: 0.10.0 > > > The bug can be reproduced as follows. 
> {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16421776#comment-16421776 ] ASF GitHub Bot commented on ARROW-2122: --- adshieh commented on a change in pull request #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp. URL: https://github.com/apache/arrow/pull/1707#discussion_r178466968 ## File path: python/pyarrow/tests/test_convert_pandas.py ## @@ -1001,6 +1001,17 @@ def test_array_from_pandas_date_with_mask(self): assert pa.Array.from_pandas(expected).equals(result) +def test_fixed_offset_timezone(): Review comment: Done! This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Reporter: Robert Nishihara > Assignee: Albert Shieh > Priority: Major > Labels: pull-request-available > Fix For: 0.10.0 > > > The bug can be reproduced as follows. 
> {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16421774#comment-16421774 ] ASF GitHub Bot commented on ARROW-2122: --- adshieh commented on a change in pull request #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp. URL: https://github.com/apache/arrow/pull/1707#discussion_r178466964 ## File path: python/pyarrow/types.pxi ## @@ -847,6 +847,25 @@ cdef timeunit_to_string(TimeUnit unit): return 'ns' +FIXED_OFFSET_PREFIX = '+' Review comment: Done! This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Reporter: Robert Nishihara > Assignee: Albert Shieh > Priority: Major > Labels: pull-request-available > Fix For: 0.10.0 > > > The bug can be reproduced as follows. 
> {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16396847#comment-16396847 ] ASF GitHub Bot commented on ARROW-2122: --- pitrou commented on a change in pull request #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp. URL: https://github.com/apache/arrow/pull/1707#discussion_r174097652 ## File path: python/pyarrow/tests/test_convert_pandas.py ## @@ -1001,6 +1001,17 @@ def test_array_from_pandas_date_with_mask(self): assert pa.Array.from_pandas(expected).equals(result) +def test_fixed_offset_timezone(): Review comment: Please put this under the class above. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Reporter: Robert Nishihara > Assignee: Albert Shieh > Priority: Major > Labels: pull-request-available > Fix For: 0.10.0 > > > The bug can be reproduced as follows. 
> {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16396844#comment-16396844 ] ASF GitHub Bot commented on ARROW-2122: --- pitrou commented on a change in pull request #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp. URL: https://github.com/apache/arrow/pull/1707#discussion_r174097132 ## File path: python/pyarrow/types.pxi ## @@ -847,6 +847,25 @@ cdef timeunit_to_string(TimeUnit unit): return 'ns' +FIXED_OFFSET_PREFIX = '+' Review comment: We probably want the offset to be encoded as `[+-]HH:MM`. See https://github.com/apache/arrow/blob/master/format/Schema.fbs#L162-L166 This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Reporter: Robert Nishihara > Assignee: Albert Shieh > Priority: Major > Labels: pull-request-available > Fix For: 0.10.0 > > > The bug can be reproduced as follows. 
> {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16396845#comment-16396845 ] ASF GitHub Bot commented on ARROW-2122: --- pitrou commented on a change in pull request #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp. URL: https://github.com/apache/arrow/pull/1707#discussion_r174097194 ## File path: python/pyarrow/types.pxi ## @@ -847,6 +847,25 @@ cdef timeunit_to_string(TimeUnit unit): return 'ns' +FIXED_OFFSET_PREFIX = '+' + + +def tzinfo_to_string(tz): Review comment: These two functions would deserve a docstring, and appropriate unit tests. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Reporter: Robert Nishihara > Assignee: Albert Shieh > Priority: Major > Labels: pull-request-available > Fix For: 0.10.0 > > > The bug can be reproduced as follows. 
> {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16396530#comment-16396530 ] ASF GitHub Bot commented on ARROW-2122: --- wesm commented on issue #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp. URL: https://github.com/apache/arrow/pull/1707#issuecomment-372543159 This needs a little more scrutiny (cc @jreback @cpcloud @pitrou) before we commit to something for 0.9.0 we might have to break later. I'm going to move this issue off 0.9.0 so we aren't blocking. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Reporter: Robert Nishihara > Assignee: Albert Shieh > Priority: Major > Labels: pull-request-available > Fix For: 0.10.0 > > > The bug can be reproduced as follows. 
> {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16387042#comment-16387042 ] ASF GitHub Bot commented on ARROW-2122: --- adshieh commented on a change in pull request #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp. URL: https://github.com/apache/arrow/pull/1707#discussion_r172375593 ## File path: python/pyarrow/types.pxi ## @@ -847,6 +847,25 @@ cdef timeunit_to_string(TimeUnit unit): return 'ns' +FIXED_OFFSET_PREFIX = '+' Review comment: @wesm suggestions? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python >Reporter: Robert Nishihara >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > The bug can be reproduced as follows. 
> {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16386615#comment-16386615 ] Albert Shieh commented on ARROW-2122: - How about '+{:d}'.format(tz._minutes), or some other prefix besides '+'? > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python >Reporter: Robert Nishihara >Priority: Major > Fix For: 0.9.0 > > > The bug can be reproduced as follows. > {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in 
__new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16386274#comment-16386274 ] Wes McKinney commented on ARROW-2122: - Ah, sounds like we need a convention to handle FixedOffset in Arrow so that things can be properly coerced going in and out of Python. > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python >Reporter: Robert Nishihara >Priority: Major > Fix For: 0.9.0 > > > The bug can be reproduced as follows. > {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > 
"/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2122) [Python] Pyarrow fails to serialize dataframe with timestamp.
[ https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16386217#comment-16386217 ] Albert Shieh commented on ARROW-2122: - The issue is that the timestamp has a pytz.FixedOffset timezone, which has a zone attribute of None where arrow expects a string. > [Python] Pyarrow fails to serialize dataframe with timestamp. > - > > Key: ARROW-2122 > URL: https://issues.apache.org/jira/browse/ARROW-2122 > Project: Apache Arrow > Issue Type: Bug > Components: Python >Reporter: Robert Nishihara >Priority: Major > Fix For: 0.9.0 > > > The bug can be reproduced as follows. > {code:java} > import pyarrow as pa > import pandas as pd > df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) > s = pa.serialize(df).to_buffer() > new_df = pa.deserialize(s) # this fails{code} > The last line fails with > {code:java} > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File "serialization.pxi", line 441, in pyarrow.lib.deserialize > File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from > File "serialization.pxi", line 257, in > pyarrow.lib.SerializedPyObject.deserialize > File "serialization.pxi", line 174, in > pyarrow.lib.SerializationContext._deserialize_callback > File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in > _deserialize_pandas_dataframe > return pdcompat.serialized_dict_to_dataframe(data) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > serialized_dict_to_dataframe > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in > <listcomp> > for block in data['blocks']] > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in > _reconstruct_block > dtype = _make_datetimetz(item['timezone']) > File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in > _make_datetimetz > return DatetimeTZDtype('ns', tz=tz) > File > 
"/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", > line 409, in __new__ > raise ValueError("DatetimeTZDtype constructor must have a tz " > ValueError: DatetimeTZDtype constructor must have a tz supplied{code} > -- This message was sent by Atlassian JIRA (v7.6.3#76005)