This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 6e46bdc ARROW-3703: [Python] DataFrame.to_parquet crashes if datetime column has time zones 6e46bdc is described below commit 6e46bdc9a354ebb15644e99a80f6cc07bb440b21 Author: Krisztián Szűcs <szucs.kriszt...@gmail.com> AuthorDate: Thu Nov 15 08:47:11 2018 -0500 ARROW-3703: [Python] DataFrame.to_parquet crashes if datetime column has time zones Author: Krisztián Szűcs <szucs.kriszt...@gmail.com> Closes #2975 from kszucs/ARROW-3703 and squashes the following commits: dba35f267 <Krisztián Szűcs> more robust timezone to string conversion --- python/pyarrow/tests/test_convert_pandas.py | 28 ++++++++++++++++++++++++++++ python/pyarrow/tests/test_parquet.py | 11 +++++++++++ python/pyarrow/types.pxi | 26 ++++++++++++++++++++++---- 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 0a0a524..7f672ea 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -15,6 +15,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + +import six import decimal import json import multiprocessing as mp @@ -26,6 +28,7 @@ import numpy.testing as npt import pandas as pd import pandas.util.testing as tm import pytest +import pytz import pyarrow as pa import pyarrow.types as patypes @@ -823,6 +826,31 @@ class TestConvertDateTimeLikeTypes(object): }) tm.assert_frame_equal(expected_df, result) + def test_python_datetime_with_pytz_tzinfo(self): + for tz in [pytz.utc, pytz.timezone('US/Eastern'), pytz.FixedOffset(1)]: + values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)] + df = pd.DataFrame({'datetime': values}) + _check_pandas_roundtrip(df) + + @pytest.mark.skipif(six.PY2, reason='datetime.timezone is available since ' + 'python version 3.2') + def test_python_datetime_with_timezone_tzinfo(self): + from datetime import timezone + + values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=pytz.utc)] + df = pd.DataFrame({'datetime': values}) + _check_pandas_roundtrip(df) + + # datetime.timezone is going to be pytz.FixedOffset + hours = 1 + tz_timezone = timezone(timedelta(hours=hours)) + tz_pytz = pytz.FixedOffset(hours * 60) + values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_timezone)] + values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)] + df = pd.DataFrame({'datetime': values}) + df_exp = pd.DataFrame({'datetime': values_exp}) + _check_pandas_roundtrip(df, expected=df_exp) + def test_python_datetime_subclass(self): class MyDatetime(datetime): diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index bacffdf..8217dd3 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -20,6 +20,7 @@ import decimal import io import json import os +import six import pytest import numpy as np @@ -244,6 +245,16 @@ def test_pandas_parquet_datetime_tz(): tm.assert_frame_equal(df, df_read) +@pytest.mark.skipif(six.PY2, reason='datetime.timezone is available since ' + 'python version 3.2') +def test_datetime_timezone_tzinfo(): + value = datetime.datetime(2018, 1, 1, 1, 23, 45, + tzinfo=datetime.timezone.utc) + df = pd.DataFrame({'foo': [value]}) + + _roundtrip_pandas_dataframe(df, write_kwargs={}) + + def test_pandas_parquet_custom_metadata(tempdir): df = alltypes_sample(size=10000) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index fb7d081..399f15e 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -962,12 +962,30 @@ def tzinfo_to_string(tz): name : string Time zone name """ - if tz.zone is None: - sign = '+' if tz._minutes >= 0 else '-' - hours, minutes = divmod(abs(tz._minutes), 60) + import pytz + import datetime + + def fixed_offset_to_string(offset): + seconds = int(offset.utcoffset(None).total_seconds()) + sign = '+' if seconds >= 0 else '-' + minutes, seconds = divmod(abs(seconds), 60) + hours, minutes = divmod(minutes, 60) + if seconds > 0: + raise ValueError('Offset must represent whole number of minutes') return '{}{:02d}:{:02d}'.format(sign, hours, minutes) - else: + + if isinstance(tz, pytz.tzinfo.BaseTzInfo): return tz.zone + elif isinstance(tz, pytz._FixedOffset): + return fixed_offset_to_string(tz) + elif isinstance(tz, datetime.tzinfo): + if six.PY3 and isinstance(tz, datetime.timezone): + return fixed_offset_to_string(tz) + else: + raise ValueError('Unable to convert timezone `{}` to string' + .format(tz)) + else: + raise TypeError('Must be an instance of `datetime.tzinfo`') def string_to_tzinfo(name):