kszucs commented on a change in pull request #7816: URL: https://github.com/apache/arrow/pull/7816#discussion_r458859987
########## File path: python/pyarrow/tests/test_convert_builtin.py ########## @@ -811,6 +875,125 @@ def test_sequence_timestamp(): 46, 57, 437699) +@pytest.mark.parametrize('timezone', [ + None, + 'UTC', + 'Europe/Budapest', +]) +@pytest.mark.parametrize('unit', [ + 's', + 'ms', + 'us', + 'ns' +]) +def test_sequence_timestamp_with_timezone(timezone, unit): + def expected_integer_value(dt): + units = ['s', 'ms', 'us', 'ns'] + multiplier = 10**(units.index(unit) * 3) + if dt is None: + return None + else: + # avoid float precision issues + ts = decimal.Decimal(str(dt.timestamp())) + return int(ts * multiplier) + + def expected_datetime_value(dt): + if dt is None: + return None + + if unit == 's': + dt = dt.replace(microsecond=0) + elif unit == 'ms': + dt = dt.replace(microsecond=(dt.microsecond // 1000) * 1000) + + # adjust the timezone + if timezone is None: + # make datetime timezone unaware + return dt.replace(tzinfo=None) + else: + # convert to the expected timezone + return dt.astimezone(pytz.timezone(timezone)) + + data = [ + datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive + pytz.utc.localize( + datetime.datetime(2008, 1, 5, 5, 0, 0, 1000) + ), + None, + pytz.timezone('US/Eastern').localize( + datetime.datetime(2006, 1, 13, 12, 34, 56, 432539) + ), + pytz.timezone('Europe/Moscow').localize( + datetime.datetime(2010, 8, 13, 5, 0, 0, 437699) + ), + ] + utcdata = [ + pytz.utc.localize(data[0]), + data[1], + None, + data[3].astimezone(pytz.utc), + data[4].astimezone(pytz.utc), + ] + + ty = pa.timestamp(unit, tz=timezone) + arr = pa.array(data, type=ty) + assert len(arr) == 5 + assert arr.type == ty + assert arr.null_count == 1 + + # test that the underlying integers are UTC values + values = arr.cast('int64') + expected = list(map(expected_integer_value, utcdata)) + assert values.to_pylist() == expected + + # test that the scalars are datetimes with the correct timezone + for i in range(len(arr)): + assert arr[i].as_py() == 
expected_datetime_value(utcdata[i]) + + +def test_sequence_timestamp_with_timezone_inference(): + data = [ + datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive + pytz.utc.localize( + datetime.datetime(2008, 1, 5, 5, 0, 0, 1000) + ), + None, + pytz.timezone('US/Eastern').localize( + datetime.datetime(2006, 1, 13, 12, 34, 56, 432539) + ), + pytz.timezone('Europe/Moscow').localize( + datetime.datetime(2010, 8, 13, 5, 0, 0, 437699) + ), + ] + expected = [ + pa.timestamp('us', tz=None), + pa.timestamp('us', tz='UTC'), + pa.timestamp('us', tz=None), + pa.timestamp('us', tz='US/Eastern'), + pa.timestamp('us', tz='Europe/Moscow') + ] + for dt, expected_type in zip(data, expected): + prepended = [dt] + data + arr = pa.array(prepended) + assert arr.type == expected_type + + +# @pytest.mark.pandas +# def test_nanosecond_resolution_timestamp(): Review comment: I'm a bit uncertain whether we should support inferring/converting from `pandas.Timestamp` objects in the Python-builtin-to-Arrow conversion code paths, since pandas is an optional dependency. If we leave it as is, the worst problem is that nanosecond-resolution `pandas.Timestamp` objects get truncated to microsecond resolution. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org