Repository: arrow Updated Branches: refs/heads/master 360942e61 -> e29a7d4ca
ARROW-668: [Python] Box timestamp values as pandas.Timestamp if available, attach tzinfo I'm not sure how to easily test the behavior if pandas is not present. I created an environment without pandas and added some fixes so that I could verify the behavior, but at some point we should create a "no pandas" test suite to see what using pyarrow is like without pandas installed. Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #487 from wesm/ARROW-668 and squashes the following commits: 554a647 [Wes McKinney] Remove cython from requirements.txt 649d28a [Wes McKinney] Box timestamp values as pandas.Timestamp if available, return timezone also if available Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/e29a7d4c Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/e29a7d4c Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/e29a7d4c Branch: refs/heads/master Commit: e29a7d4cae943312a1f8598e71c5d46c1954b5fa Parents: 360942e Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Tue Apr 4 16:22:29 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Tue Apr 4 16:22:29 2017 -0400 ---------------------------------------------------------------------- python/pyarrow/array.pyx | 25 ++++------ python/pyarrow/compat.py | 17 +++++++ python/pyarrow/scalar.pyx | 47 +++++++++++++++---- python/pyarrow/tests/test_scalars.py | 76 ++++++++++++++++++++----------- 4 files changed, 112 insertions(+), 53 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/array.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 67785e3..1f59556 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -29,7 +29,7 @@ cimport pyarrow.includes.pyarrow as pyarrow import pyarrow.config -from 
pyarrow.compat import frombytes, tobytes +from pyarrow.compat import frombytes, tobytes, PandasSeries, Categorical from pyarrow.error cimport check_status from pyarrow.memory cimport MemoryPool, maybe_unbox_memory_pool @@ -44,11 +44,6 @@ import pyarrow.schema as schema cimport cpython -cdef _pandas(): - import pandas as pd - return pd - - cdef maybe_coerce_datetime64(values, dtype, DataType type, timestamps_to_ms=False): @@ -66,7 +61,7 @@ cdef maybe_coerce_datetime64(values, dtype, DataType type, tz = dtype.tz unit = 'ms' if coerce_ms else dtype.unit type = schema.timestamp(unit, tz) - else: + elif type is None: # Trust the NumPy dtype type = schema.type_from_numpy_dtype(values.dtype) @@ -141,15 +136,13 @@ cdef class Array: shared_ptr[CDataType] c_type CMemoryPool* pool - pd = _pandas() - if mask is not None: mask = get_series_values(mask) values = get_series_values(obj) pool = maybe_unbox_memory_pool(memory_pool) - if isinstance(values, pd.Categorical): + if isinstance(values, Categorical): return DictionaryArray.from_arrays( values.codes, values.categories.values, mask=mask, memory_pool=memory_pool) @@ -397,9 +390,9 @@ cdef wrap_array_output(PyObject* output): cdef object obj = PyObject_to_object(output) if isinstance(obj, dict): - return _pandas().Categorical(obj['indices'], - categories=obj['dictionary'], - fastpath=True) + return Categorical(obj['indices'], + categories=obj['dictionary'], + fastpath=True) else: return obj @@ -622,14 +615,12 @@ cdef object box_tensor(const shared_ptr[CTensor]& sp_tensor): cdef object get_series_values(object obj): - import pandas as pd - - if isinstance(obj, pd.Series): + if isinstance(obj, PandasSeries): result = obj.values elif isinstance(obj, np.ndarray): result = obj else: - result = pd.Series(obj).values + result = PandasSeries(obj).values return result http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/compat.py ---------------------------------------------------------------------- diff --git 
a/python/pyarrow/compat.py b/python/pyarrow/compat.py index b9206aa..4dcc116 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -38,9 +38,26 @@ try: else: from pandas.types.dtypes import DatetimeTZDtype pdapi = pd.api.types + + PandasSeries = pd.Series + Categorical = pd.Categorical HAVE_PANDAS = True except: HAVE_PANDAS = False + class DatetimeTZDtype(object): + pass + + class ClassPlaceholder(object): + + def __init__(self, *args, **kwargs): + raise NotImplementedError + + class PandasSeries(ClassPlaceholder): + pass + + class Categorical(ClassPlaceholder): + pass + if PY26: import unittest2 as unittest http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/scalar.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx index 983a9a7..1c0790a 100644 --- a/python/pyarrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx @@ -26,6 +26,12 @@ cimport cpython as cp NA = None + +cdef _pandas(): + import pandas as pd + return pd + + cdef class NAType(Scalar): def __cinit__(self): @@ -146,16 +152,37 @@ cdef class TimestampValue(ArrayValue): CTimestampType* dtype = <CTimestampType*>ap.type().get() int64_t val = ap.Value(self.index) - if dtype.unit == TimeUnit_SECOND: - return datetime.datetime.utcfromtimestamp(val) - elif dtype.unit == TimeUnit_MILLI: - return datetime.datetime.utcfromtimestamp(float(val) / 1000) - elif dtype.unit == TimeUnit_MICRO: - return datetime.datetime.utcfromtimestamp(float(val) / 1000000) - else: - # TimeUnit_NANO - raise NotImplementedError("Cannot convert nanosecond timestamps " - "to datetime.datetime") + timezone = None + tzinfo = None + if dtype.timezone.size() > 0: + timezone = frombytes(dtype.timezone) + import pytz + tzinfo = pytz.timezone(timezone) + + try: + pd = _pandas() + if dtype.unit == TimeUnit_SECOND: + val = val * 1000000000 + elif dtype.unit == TimeUnit_MILLI: + val = val * 1000000 + elif dtype.unit == 
TimeUnit_MICRO: + val = val * 1000 + return pd.Timestamp(val, tz=tzinfo) + except ImportError: + if dtype.unit == TimeUnit_SECOND: + result = datetime.datetime.utcfromtimestamp(val) + elif dtype.unit == TimeUnit_MILLI: + result = datetime.datetime.utcfromtimestamp(float(val) / 1000) + elif dtype.unit == TimeUnit_MICRO: + result = datetime.datetime.utcfromtimestamp( + float(val) / 1000000) + else: + # TimeUnit_NANO + raise NotImplementedError("Cannot convert nanosecond " + "timestamps without pandas") + if timezone is not None: + result = result.replace(tzinfo=tzinfo) + return result cdef class FloatValue(ArrayValue): http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/tests/test_scalars.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index a5db7e0..f4f275b 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -19,69 +19,69 @@ import pandas as pd from pyarrow.compat import unittest, u, unicode_type -import pyarrow as A +import pyarrow as pa class TestScalars(unittest.TestCase): def test_null_singleton(self): with self.assertRaises(Exception): - A.NAType() + pa.NAType() def test_bool(self): - arr = A.from_pylist([True, None, False, None]) + arr = pa.from_pylist([True, None, False, None]) v = arr[0] - assert isinstance(v, A.BooleanValue) + assert isinstance(v, pa.BooleanValue) assert repr(v) == "True" assert v.as_py() is True - assert arr[1] is A.NA + assert arr[1] is pa.NA def test_int64(self): - arr = A.from_pylist([1, 2, None]) + arr = pa.from_pylist([1, 2, None]) v = arr[0] - assert isinstance(v, A.Int64Value) + assert isinstance(v, pa.Int64Value) assert repr(v) == "1" assert v.as_py() == 1 - assert arr[2] is A.NA + assert arr[2] is pa.NA def test_double(self): - arr = A.from_pylist([1.5, None, 3]) + arr = pa.from_pylist([1.5, None, 3]) v = arr[0] - assert isinstance(v, A.DoubleValue) + 
assert isinstance(v, pa.DoubleValue) assert repr(v) == "1.5" assert v.as_py() == 1.5 - assert arr[1] is A.NA + assert arr[1] is pa.NA v = arr[2] assert v.as_py() == 3.0 def test_string_unicode(self): - arr = A.from_pylist([u'foo', None, u'mañana']) + arr = pa.from_pylist([u'foo', None, u'mañana']) v = arr[0] - assert isinstance(v, A.StringValue) + assert isinstance(v, pa.StringValue) assert v.as_py() == 'foo' - assert arr[1] is A.NA + assert arr[1] is pa.NA v = arr[2].as_py() assert v == u'mañana' assert isinstance(v, unicode_type) def test_bytes(self): - arr = A.from_pylist([b'foo', None, u('bar')]) + arr = pa.from_pylist([b'foo', None, u('bar')]) v = arr[0] - assert isinstance(v, A.BinaryValue) + assert isinstance(v, pa.BinaryValue) assert v.as_py() == b'foo' - assert arr[1] is A.NA + assert arr[1] is pa.NA v = arr[2].as_py() assert v == b'bar' @@ -89,41 +89,65 @@ class TestScalars(unittest.TestCase): def test_fixed_size_bytes(self): data = [b'foof', None, b'barb'] - arr = A.from_pylist(data, type=A.binary(4)) + arr = pa.from_pylist(data, type=pa.binary(4)) v = arr[0] - assert isinstance(v, A.FixedSizeBinaryValue) + assert isinstance(v, pa.FixedSizeBinaryValue) assert v.as_py() == b'foof' - assert arr[1] is A.NA + assert arr[1] is pa.NA v = arr[2].as_py() assert v == b'barb' assert isinstance(v, bytes) def test_list(self): - arr = A.from_pylist([['foo', None], None, ['bar'], []]) + arr = pa.from_pylist([['foo', None], None, ['bar'], []]) v = arr[0] assert len(v) == 2 - assert isinstance(v, A.ListValue) + assert isinstance(v, pa.ListValue) assert repr(v) == "['foo', None]" assert v.as_py() == ['foo', None] assert v[0].as_py() == 'foo' - assert v[1] is A.NA + assert v[1] is pa.NA - assert arr[1] is A.NA + assert arr[1] is pa.NA v = arr[3] assert len(v) == 0 + def test_timestamp(self): + arr = pd.date_range('2000-01-01 12:34:56', periods=10).values + + units = ['s', 'ms', 'us', 'ns'] + + for unit in units: + dtype = 'datetime64[{0}]'.format(unit) + arrow_arr = 
pa.Array.from_numpy(arr.astype(dtype)) + expected = pd.Timestamp('2000-01-01 12:34:56') + + assert arrow_arr[0].as_py() == expected + + tz = 'America/New_York' + arrow_type = pa.timestamp(unit, tz=tz) + + dtype = 'datetime64[{0}]'.format(unit) + arrow_arr = pa.Array.from_numpy(arr.astype(dtype), + type=arrow_type) + expected = (pd.Timestamp('2000-01-01 12:34:56') + .tz_localize('utc') + .tz_convert(tz)) + + assert arrow_arr[0].as_py() == expected + def test_dictionary(self): colors = ['red', 'green', 'blue'] values = pd.Series(colors * 4) categorical = pd.Categorical(values, categories=colors) - v = A.DictionaryArray.from_arrays(categorical.codes, - categorical.categories) + v = pa.DictionaryArray.from_arrays(categorical.codes, + categorical.categories) for i, c in enumerate(values): assert v[i].as_py() == c