(arrow-nanoarrow) branch main updated: feat(python): Add Arrow->Python datetime support (#417)

paleolimbot Thu, 11 Apr 2024 10:32:39 -0700

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 1751bdd5 feat(python): Add Arrow->Python datetime support (#417)
1751bdd5 is described below

commit 1751bdd57046a6aef75dfd8aede09713ad7dc7fc
Author: Dewey Dunnington <[email protected]>
AuthorDate: Thu Apr 11 14:31:32 2024 -0300

    feat(python): Add Arrow->Python datetime support (#417)
    
    This PR adds support for converting Arrow date, time, timestamp, and
    duration arrays to Python objects.
    
    ```python
    import pyarrow as pa
    import datetime
    import zoneinfo
    import nanoarrow as na
    
    dt = datetime.datetime.now()
    list(na.Array(pa.array([dt])).iter_py())
    #> [datetime.datetime(2024, 4, 8, 16, 25, 41, 216438)]
    
    dt_tz = datetime.datetime.now(zoneinfo.ZoneInfo("America/Halifax"))
    list(na.Array(pa.array([dt_tz])).iter_py())
    #> [datetime.datetime(2024, 4, 8, 16, 29, 7, 226832, tzinfo=zoneinfo.ZoneInfo(key='America/Halifax'))]
    
    tdelta = datetime.timedelta(123, 456, 678)
    list(na.Array(pa.array([tdelta])).iter_py())
    #> [datetime.timedelta(days=123, seconds=456, microseconds=678)]
    
    just_time = datetime.time(15, 27, 43, 12)
    list(na.Array(pa.array([just_time])).iter_py())
    #> [datetime.time(15, 27, 43, 12)]
    ```
    
    It is probably faster to use the DateTime C API, but the timings seem
    reasonable:
    
    ```python
    import pyarrow as pa
    import datetime
    import zoneinfo
    import nanoarrow as na
    
    n = int(1e6)
    
    dt = datetime.datetime.now()
    dt_array = pa.array([dt + datetime.timedelta(i) for i in range(n)])
    %timeit dt_array.to_pylist()
    %timeit list(na.Array(dt_array).iter_py())
    #> 805 ms ± 21.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    #> 804 ms ± 1.79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    
    tdelta_array = pa.array([datetime.timedelta(123 + i, 456, 678) for i in range(n)])
    %timeit tdelta_array.to_pylist()
    #> 574 ms ± 3.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    %timeit list(na.Array(tdelta_array).iter_py())
    #> 399 ms ± 612 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
    
    just_time_array = pa.array([datetime.time(15, 27, 43, i) for i in range(n)])
    %timeit just_time_array.to_pylist()
    #> 831 ms ± 6.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    %timeit list(na.Array(just_time_array).iter_py())
    #> 399 ms ± 856 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
    ```
    
    ---------
    
    Co-authored-by: Joris Van den Bossche <[email protected]>
---
 python/pyproject.toml            |   4 +-
 python/src/nanoarrow/iterator.py | 187 ++++++++++++++++++++++++++++++++++++++-
 python/tests/test_iterator.py    | 184 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 370 insertions(+), 5 deletions(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index e176be26..2be6aebb 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -27,8 +27,8 @@ license = {text = "Apache-2.0"}
 requires-python = ">=3.8"
 
 [project.optional-dependencies]
-test = ["pyarrow", "pytest", "numpy"]
-verify = ["pytest", "numpy"]
+test = ["pyarrow", "python-dateutil", "pytest", "numpy"]
+verify = ["python-dateutil", "pytest", "numpy"]
 
 [project.urls]
 Homepage = "https://arrow.apache.org"
diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
index ae86fe51..111dbe0d 100644
--- a/python/src/nanoarrow/iterator.py
+++ b/python/src/nanoarrow/iterator.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import warnings
 from functools import cached_property
 from itertools import islice
 from typing import Iterable, Tuple
@@ -83,6 +84,14 @@ def iter_tuples(obj, schema=None) -> Iterable[Tuple]:
     return RowTupleIterator.get_iterator(obj, schema=schema)
 
 
+class InvalidArrayWarning(UserWarning):
+    pass
+
+
+class LossyConversionWarning(UserWarning):
+    pass
+
+
 class ArrayViewIterator:
     """Base class for iterators that use an internal ArrowArrayView
     as the basis for conversion to Python objects. Intended for internal use.
@@ -101,7 +110,7 @@ class ArrayViewIterator:
             map(self._make_child, self._schema.children, self._array_view.children)
         )
 
-        if schema.dictionary is None:
+        if self._schema.dictionary is None:
             self._dictionary = None
         else:
             self._dictionary = self._make_child(
@@ -115,6 +124,13 @@ class ArrayViewIterator:
     def _child_names(self):
         return [child.name for child in self._schema.children]
 
+    @cached_property
+    def _object_label(self):
+        if self._schema.name:
+            return f"{self._schema.name} <{self._schema_view.type}>"
+        else:
+            return f"<unnamed {self._schema_view.type}>"
+
     def _contains_nulls(self):
         return (
             self._schema_view.nullable
@@ -126,6 +142,9 @@ class ArrayViewIterator:
         self._array_view._set_array(array)
         return self
 
+    def _warn(self, message, category):
+        warnings.warn(f"{self._object_label}: {message}", category)
+
 
 class PyIterator(ArrayViewIterator):
     """Iterate over the Python object version of values in an ArrowArrayView.
@@ -244,6 +263,126 @@ class PyIterator(ArrayViewIterator):
             for start, end in zip(starts, ends):
                 yield bytes(data[start:end])
 
+    def _date_iter(self, offset, length):
+        from datetime import date, timedelta
+
+        storage = self._primitive_iter(offset, length)
+        epoch = date(1970, 1, 1)
+
+        if self._schema_view.type_id == CArrowType.DATE32:
+            for item in storage:
+                if item is None:
+                    yield item
+                else:
+                    yield epoch + timedelta(item)
+        else:
+            for item in storage:
+                if item is None:
+                    yield item
+                else:
+                    yield epoch + timedelta(milliseconds=item)
+
+    def _time_iter(self, offset, length):
+        from datetime import time
+
+        for item in self._iter_time_components(offset, length):
+            if item is None:
+                yield None
+            else:
+                days, hours, mins, secs, us = item
+                if days != 0:
+                    self._warn("days != 0", InvalidArrayWarning)
+
+                yield time(hours, mins, secs, us)
+
+    def _timestamp_iter(self, offset, length):
+        from datetime import datetime
+
+        epoch = datetime(1970, 1, 1, tzinfo=_get_tzinfo("UTC"))
+        parent = self._duration_iter(offset, length)
+
+        tz = self._schema_view.timezone
+        if tz:
+            tz = _get_tzinfo(tz)
+
+            for item in parent:
+                if item is None:
+                    yield None
+                else:
+                    yield (epoch + item).astimezone(tz)
+        else:
+            epoch = epoch.replace(tzinfo=None)
+            for item in parent:
+                if item is None:
+                    yield None
+                else:
+                    yield epoch + item
+
+    def _duration_iter(self, offset, length):
+        from datetime import timedelta
+
+        storage = self._primitive_iter(offset, length)
+
+        unit = self._schema_view.time_unit
+        if unit == "s":
+            to_us = 1_000_000
+        elif unit == "ms":
+            to_us = 1000
+        elif unit == "us":
+            to_us = 1
+        elif unit == "ns":
+            storage = self._iter_us_from_ns(storage)
+            to_us = 1
+
+        for item in storage:
+            if item is None:
+                yield None
+            else:
+                yield timedelta(microseconds=item * to_us)
+
+    def _iter_time_components(self, offset, length):
+        storage = self._primitive_iter(offset, length)
+
+        unit = self._schema_view.time_unit
+        if unit == "s":
+            to_us = 1_000_000
+        elif unit == "ms":
+            to_us = 1000
+        elif unit == "us":
+            to_us = 1
+        elif unit == "ns":
+            storage = self._iter_us_from_ns(storage)
+            to_us = 1
+
+        us_per_sec = 1_000_000
+        us_per_min = us_per_sec * 60
+        us_per_hour = us_per_min * 60
+        us_per_day = us_per_hour * 24
+
+        for item in storage:
+            if item is None:
+                yield None
+            else:
+                us = item * to_us
+                days = us // us_per_day
+                us = us % us_per_day
+                hours = us // us_per_hour
+                us = us % us_per_hour
+                mins = us // us_per_min
+                us = us % us_per_min
+                secs = us // us_per_sec
+                us = us % us_per_sec
+                yield days, hours, mins, secs, us
+
+    def _iter_us_from_ns(self, parent):
+        for item in parent:
+            if item is None:
+                yield None
+            else:
+                if item % 1000 != 0:
+                    self._warn("nanoseconds discarded", LossyConversionWarning)
+                yield item // 1000
+
     def _primitive_iter(self, offset, length):
         view = self._array_view
         offset += view.offset
@@ -258,7 +397,7 @@ class PyIterator(ArrayViewIterator):
 
 class RowTupleIterator(PyIterator):
     """Iterate over rows of a struct array (stream) where each row is a
-    tuple instead of a dictionary. This is ~3x faster and matches other
+    tuple instead of a dictionary. This is usually faster and matches other
     Python concepts more closely (e.g., dbapi's cursor, pandas itertuples).
     Intended for internal use.
     """
@@ -278,6 +417,44 @@ class RowTupleIterator(PyIterator):
         return self._struct_tuple_iter(offset, length)
 
 
+def _get_tzinfo(tz_string, strategy=None):
+    import re
+    from datetime import timedelta, timezone
+
+    # We can handle UTC without any imports
+    if tz_string.upper() == "UTC":
+        return timezone.utc
+
+    # Arrow also allows fixed-offset in the form +HH:MM
+    maybe_fixed_offset = re.search(r"^([+-])([0-9]{2}):([0-9]{2})$", tz_string)
+    if maybe_fixed_offset:
+        sign, hours, minutes = maybe_fixed_offset.groups()
+        sign = 1 if sign == "+" else -1
+        return timezone(sign * timedelta(hours=int(hours), minutes=int(minutes)))
+
+    # Try zoneinfo.ZoneInfo() (Python 3.9+)
+    if strategy is None or "zoneinfo" in strategy:
+        try:
+            from zoneinfo import ZoneInfo
+
+            return ZoneInfo(tz_string)
+        except ImportError:
+            pass
+
+    # Try dateutil.tz.gettz()
+    if strategy is None or "dateutil" in strategy:
+        try:
+            from dateutil.tz import gettz
+
+            return gettz(tz_string)
+        except ImportError:
+            pass
+
+    raise RuntimeError(
+        "zoneinfo (Python 3.9+) or dateutil is required to resolve timezone"
+    )
+
+
 _ITEMS_ITER_LOOKUP = {
     CArrowType.BINARY: "_binary_iter",
     CArrowType.LARGE_BINARY: "_binary_iter",
@@ -288,6 +465,12 @@ _ITEMS_ITER_LOOKUP = {
     CArrowType.LARGE_LIST: "_list_iter",
     CArrowType.FIXED_SIZE_LIST: "_fixed_size_list_iter",
     CArrowType.DICTIONARY: "_dictionary_iter",
+    CArrowType.DATE32: "_date_iter",
+    CArrowType.DATE64: "_date_iter",
+    CArrowType.TIME32: "_time_iter",
+    CArrowType.TIME64: "_time_iter",
+    CArrowType.TIMESTAMP: "_timestamp_iter",
+    CArrowType.DURATION: "_duration_iter",
 }
 
 _PRIMITIVE_TYPE_NAMES = [
diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
index abba0846..72a55414 100644
--- a/python/tests/test_iterator.py
+++ b/python/tests/test_iterator.py
@@ -15,12 +15,31 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import datetime
+
 import pytest
-from nanoarrow.iterator import iter_py, iter_tuples
+from nanoarrow.iterator import (
+    ArrayViewIterator,
+    InvalidArrayWarning,
+    LossyConversionWarning,
+    iter_py,
+    iter_tuples,
+)
 
 import nanoarrow as na
 
 
+def test_iterator_warnings():
+    msg_unnamed = "<unnamed int32>: something"
+    with pytest.warns(LossyConversionWarning, match=msg_unnamed):
+        ArrayViewIterator(na.int32())._warn("something", LossyConversionWarning)
+
+    msg_named = "some_colname <int32>: something"
+    with pytest.warns(LossyConversionWarning, match=msg_named):
+        iterator = ArrayViewIterator(na.Schema(na.Type.INT32, name="some_colname"))
+        iterator._warn("something", LossyConversionWarning)
+
+
 def test_iterator_primitive():
     array = na.c_array([1, 2, 3], na.int32())
     assert list(iter_py(array)) == [1, 2, 3]
@@ -313,3 +332,166 @@ def test_iterator_nullable_dictionary():
 
     sliced = array[1:]
     assert list(iter_py(sliced)) == ["cde", "ab", "def", "cde", None]
+
+
+def test_iterator_date():
+    pa = pytest.importorskip("pyarrow")
+
+    items = [
+        datetime.date(1970, 1, 2),
+        None,
+        datetime.date(2024, 4, 8),
+    ]
+
+    array = pa.array(items, pa.date32())
+    assert list(iter_py(array)) == items
+
+    array = pa.array(items, pa.date64())
+    assert list(iter_py(array)) == items
+
+
+def test_iterator_time():
+    pa = pytest.importorskip("pyarrow")
+
+    items = [
+        datetime.time(15, 45, 21, 12345),
+        None,
+        datetime.time(1, 23, 45),
+    ]
+
+    array = pa.array(items, pa.time64("ns"))
+    assert list(iter_py(array)) == items
+
+    array = pa.array(items, pa.time64("us"))
+    assert list(iter_py(array)) == items
+
+    items[0] = datetime.time(15, 45, 21, 123000)
+    array = pa.array(items, pa.time32("ms"))
+    assert list(iter_py(array)) == items
+
+    items[0] = datetime.time(15, 45, 21)
+    array = pa.array(items, pa.time32("s"))
+    assert list(iter_py(array)) == items
+
+
+def test_iterator_time_invalid():
+    time_invalid = na.c_array_from_buffers(
+        na.time32("s"), 1, [None, na.c_buffer([60 * 60 * 24], na.int32())]
+    )
+
+    with pytest.warns(InvalidArrayWarning):
+        list(iter_py(time_invalid))
+
+
+def test_iterator_timestamp():
+    pa = pytest.importorskip("pyarrow")
+
+    items = [
+        datetime.datetime(1900, 1, 1, 11, 59, 1, 123),
+        None,
+        datetime.datetime(2050, 1, 1, 23, 59, 1, 0),
+    ]
+
+    array = pa.array(items, pa.timestamp("ns"))
+    assert list(iter_py(array)) == items
+
+    array = pa.array(items, pa.timestamp("us"))
+    assert list(iter_py(array)) == items
+
+    items[0] = items[0].replace(microsecond=123000)
+    array = pa.array(items, pa.timestamp("ms"))
+    assert list(iter_py(array)) == items
+
+    items[0] = items[0].replace(microsecond=0)
+    array = pa.array(items, pa.timestamp("s"))
+    assert list(iter_py(array)) == items
+
+
+def test_iterator_timestamp_tz():
+    from nanoarrow.iterator import _get_tzinfo
+
+    pa = pytest.importorskip("pyarrow")
+
+    tz = _get_tzinfo("America/Halifax")
+
+    items = [
+        datetime.datetime(1900, 1, 1, 11, 59, 1, 1234, tzinfo=tz),
+        None,
+        datetime.datetime(2050, 1, 1, 23, 59, 1, 0, tzinfo=tz),
+    ]
+
+    array = pa.array(items, pa.timestamp("ns", "America/Halifax"))
+    assert list(iter_py(array)) == items
+
+    array = pa.array(items, pa.timestamp("us", "America/Halifax"))
+    assert list(iter_py(array)) == items
+
+    items[0] = items[0].replace(microsecond=123000)
+    array = pa.array(items, pa.timestamp("ms", "America/Halifax"))
+    assert list(iter_py(array)) == items
+
+    items[0] = items[0].replace(microsecond=0)
+    array = pa.array(items, pa.timestamp("s", "America/Halifax"))
+    assert list(iter_py(array)) == items
+
+
+def test_iterator_lossy_timestamp():
+    datetime_with_ns = na.c_array_from_buffers(
+        na.timestamp("ns"), 1, [None, na.c_buffer([1], na.int64())]
+    )
+
+    with pytest.warns(LossyConversionWarning):
+        list(iter_py(datetime_with_ns))
+
+
+def test_get_tzinfo():
+    from nanoarrow.iterator import _get_tzinfo
+
+    dt = datetime.datetime(2020, 1, 2, 3, 4, 5)
+
+    assert dt.replace(tzinfo=_get_tzinfo("UTC")).utcoffset() == datetime.timedelta(0)
+    assert dt.replace(tzinfo=_get_tzinfo("utc")).utcoffset() == datetime.timedelta(0)
+
+    assert dt.replace(tzinfo=_get_tzinfo("+03:30")).utcoffset() == datetime.timedelta(
+        hours=3, minutes=30
+    )
+
+    assert dt.replace(tzinfo=_get_tzinfo("-03:30")).utcoffset() == datetime.timedelta(
+        hours=-3, minutes=-30
+    )
+
+    pytest.importorskip("zoneinfo")
+    pytest.importorskip("dateutil")
+
+    tz_zoneinfo = _get_tzinfo("America/Halifax", strategy=["zoneinfo"])
+    tz_dateutil = _get_tzinfo("America/Halifax", strategy=["dateutil"])
+
+    for tz in [tz_zoneinfo, tz_dateutil]:
+        assert dt.replace(tzinfo=tz).utcoffset() == datetime.timedelta(hours=-4)
+
+    with pytest.raises(RuntimeError):
+        _get_tzinfo("America/Halifax", strategy=[])
+
+
+def test_iterator_duration():
+    pa = pytest.importorskip("pyarrow")
+
+    items = [
+        datetime.timedelta(days=-12, seconds=-345, microseconds=-6789),
+        None,
+        datetime.timedelta(days=12345, seconds=67890),
+    ]
+
+    array = pa.array(items, pa.duration("ns"))
+    assert list(iter_py(array)) == items
+
+    array = pa.array(items, pa.duration("us"))
+    assert list(iter_py(array)) == items
+
+    items[0] = datetime.timedelta(days=-12, seconds=-345, microseconds=-678000)
+    array = pa.array(items, pa.duration("ms"))
+    assert list(iter_py(array)) == items
+
+    items[0] = datetime.timedelta(days=-12, seconds=-345)
+    array = pa.array(items, pa.duration("s"))
+    assert list(iter_py(array)) == items

(arrow-nanoarrow) branch main updated: feat(python): Add Arrow->Python datetime support (#417)

Reply via email to