This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1751bdd5 feat(python): Add Arrow->Python datetime support (#417)
1751bdd5 is described below
commit 1751bdd57046a6aef75dfd8aede09713ad7dc7fc
Author: Dewey Dunnington <[email protected]>
AuthorDate: Thu Apr 11 14:31:32 2024 -0300
feat(python): Add Arrow->Python datetime support (#417)
This PR adds support for converting Arrow date, time, timestamp, and
duration arrays to Python objects.
```python
import pyarrow as pa
import datetime
import zoneinfo
import nanoarrow as na
dt = datetime.datetime.now()
list(na.Array(pa.array([dt])).iter_py())
#> [datetime.datetime(2024, 4, 8, 16, 25, 41, 216438)]
dt_tz = datetime.datetime.now(zoneinfo.ZoneInfo("America/Halifax"))
list(na.Array(pa.array([dt_tz])).iter_py())
#> [datetime.datetime(2024, 4, 8, 16, 29, 7, 226832,
#>  tzinfo=zoneinfo.ZoneInfo(key='America/Halifax'))]
tdelta = datetime.timedelta(123, 456, 678)
list(na.Array(pa.array([tdelta])).iter_py())
#> [datetime.timedelta(days=123, seconds=456, microseconds=678)]
just_time = datetime.time(15, 27, 43, 12)
list(na.Array(pa.array([just_time])).iter_py())
#> [datetime.time(15, 27, 43, 12)]
```
It is probably faster to use the DateTime C API, but the timings seem
reasonable:
```python
import pyarrow as pa
import datetime
import zoneinfo
import nanoarrow as na
n = int(1e6)
dt = datetime.datetime.now()
dt_array = pa.array([dt + datetime.timedelta(i) for i in range(n)])
%timeit dt_array.to_pylist()
%timeit list(na.Array(dt_array).iter_py())
#> 805 ms ± 21.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#> 804 ms ± 1.79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
tdelta_array = pa.array([datetime.timedelta(123 + i, 456, 678) for i in
range(n)])
%timeit tdelta_array.to_pylist()
#> 574 ms ± 3.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit list(na.Array(tdelta_array).iter_py())
#> 399 ms ± 612 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
just_time_array = pa.array([datetime.time(15, 27, 43, i) for i in range(n)])
%timeit just_time_array.to_pylist()
#> 831 ms ± 6.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit list(na.Array(just_time_array).iter_py())
#> 399 ms ± 856 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
```
---------
Co-authored-by: Joris Van den Bossche <[email protected]>
---
python/pyproject.toml | 4 +-
python/src/nanoarrow/iterator.py | 187 ++++++++++++++++++++++++++++++++++++++-
python/tests/test_iterator.py | 184 +++++++++++++++++++++++++++++++++++++-
3 files changed, 370 insertions(+), 5 deletions(-)
diff --git a/python/pyproject.toml b/python/pyproject.toml
index e176be26..2be6aebb 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -27,8 +27,8 @@ license = {text = "Apache-2.0"}
requires-python = ">=3.8"
[project.optional-dependencies]
-test = ["pyarrow", "pytest", "numpy"]
-verify = ["pytest", "numpy"]
+test = ["pyarrow", "python-dateutil", "pytest", "numpy"]
+verify = ["python-dateutil", "pytest", "numpy"]
[project.urls]
Homepage = "https://arrow.apache.org"
diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
index ae86fe51..111dbe0d 100644
--- a/python/src/nanoarrow/iterator.py
+++ b/python/src/nanoarrow/iterator.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+import warnings
from functools import cached_property
from itertools import islice
from typing import Iterable, Tuple
@@ -83,6 +84,14 @@ def iter_tuples(obj, schema=None) -> Iterable[Tuple]:
return RowTupleIterator.get_iterator(obj, schema=schema)
+class InvalidArrayWarning(UserWarning):
+ pass
+
+
+class LossyConversionWarning(UserWarning):
+ pass
+
+
class ArrayViewIterator:
"""Base class for iterators that use an internal ArrowArrayView
as the basis for conversion to Python objects. Intended for internal use.
@@ -101,7 +110,7 @@ class ArrayViewIterator:
map(self._make_child, self._schema.children, self._array_view.children)
)
- if schema.dictionary is None:
+ if self._schema.dictionary is None:
self._dictionary = None
else:
self._dictionary = self._make_child(
@@ -115,6 +124,13 @@ class ArrayViewIterator:
def _child_names(self):
return [child.name for child in self._schema.children]
+ @cached_property
+ def _object_label(self):
+ if self._schema.name:
+ return f"{self._schema.name} <{self._schema_view.type}>"
+ else:
+ return f"<unnamed {self._schema_view.type}>"
+
def _contains_nulls(self):
return (
self._schema_view.nullable
@@ -126,6 +142,9 @@ class ArrayViewIterator:
self._array_view._set_array(array)
return self
+ def _warn(self, message, category):
+ warnings.warn(f"{self._object_label}: {message}", category)
+
class PyIterator(ArrayViewIterator):
"""Iterate over the Python object version of values in an ArrowArrayView.
@@ -244,6 +263,126 @@ class PyIterator(ArrayViewIterator):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_time_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ if days != 0:
+ self._warn("days != 0", InvalidArrayWarning)
+
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ epoch = datetime(1970, 1, 1, tzinfo=_get_tzinfo("UTC"))
+ parent = self._duration_iter(offset, length)
+
+ tz = self._schema_view.timezone
+ if tz:
+ tz = _get_tzinfo(tz)
+
+ for item in parent:
+ if item is None:
+ yield None
+ else:
+ yield (epoch + item).astimezone(tz)
+ else:
+ epoch = epoch.replace(tzinfo=None)
+ for item in parent:
+ if item is None:
+ yield None
+ else:
+ yield epoch + item
+
+ def _duration_iter(self, offset, length):
+ from datetime import timedelta
+
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ to_us = 1_000_000
+ elif unit == "ms":
+ to_us = 1000
+ elif unit == "us":
+ to_us = 1
+ elif unit == "ns":
+ storage = self._iter_us_from_ns(storage)
+ to_us = 1
+
+ for item in storage:
+ if item is None:
+ yield None
+ else:
+ yield timedelta(microseconds=item * to_us)
+
+ def _iter_time_components(self, offset, length):
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ to_us = 1_000_000
+ elif unit == "ms":
+ to_us = 1000
+ elif unit == "us":
+ to_us = 1
+ elif unit == "ns":
+ storage = self._iter_us_from_ns(storage)
+ to_us = 1
+
+ us_per_sec = 1_000_000
+ us_per_min = us_per_sec * 60
+ us_per_hour = us_per_min * 60
+ us_per_day = us_per_hour * 24
+
+ for item in storage:
+ if item is None:
+ yield None
+ else:
+ us = item * to_us
+ days = us // us_per_day
+ us = us % us_per_day
+ hours = us // us_per_hour
+ us = us % us_per_hour
+ mins = us // us_per_min
+ us = us % us_per_min
+ secs = us // us_per_sec
+ us = us % us_per_sec
+ yield days, hours, mins, secs, us
+
+ def _iter_us_from_ns(self, parent):
+ for item in parent:
+ if item is None:
+ yield None
+ else:
+ if item % 1000 != 0:
+ self._warn("nanoseconds discarded", LossyConversionWarning)
+ yield item // 1000
+
def _primitive_iter(self, offset, length):
view = self._array_view
offset += view.offset
@@ -258,7 +397,7 @@ class PyIterator(ArrayViewIterator):
class RowTupleIterator(PyIterator):
"""Iterate over rows of a struct array (stream) where each row is a
- tuple instead of a dictionary. This is ~3x faster and matches other
+ tuple instead of a dictionary. This is usually faster and matches other
Python concepts more closely (e.g., dbapi's cursor, pandas itertuples).
Intended for internal use.
"""
@@ -278,6 +417,44 @@ class RowTupleIterator(PyIterator):
return self._struct_tuple_iter(offset, length)
+def _get_tzinfo(tz_string, strategy=None):
+ import re
+ from datetime import timedelta, timezone
+
+ # We can handle UTC without any imports
+ if tz_string.upper() == "UTC":
+ return timezone.utc
+
+ # Arrow also allows fixed-offset in the form +HH:MM
+ maybe_fixed_offset = re.search(r"^([+-])([0-9]{2}):([0-9]{2})$", tz_string)
+ if maybe_fixed_offset:
+ sign, hours, minutes = maybe_fixed_offset.groups()
+ sign = 1 if sign == "+" else -1
+ return timezone(sign * timedelta(hours=int(hours), minutes=int(minutes)))
+
+ # Try zoneinfo.ZoneInfo() (Python 3.9+)
+ if strategy is None or "zoneinfo" in strategy:
+ try:
+ from zoneinfo import ZoneInfo
+
+ return ZoneInfo(tz_string)
+ except ImportError:
+ pass
+
+ # Try dateutil.tz.gettz()
+ if strategy is None or "dateutil" in strategy:
+ try:
+ from dateutil.tz import gettz
+
+ return gettz(tz_string)
+ except ImportError:
+ pass
+
+ raise RuntimeError(
+ "zoneinfo (Python 3.9+) or dateutil is required to resolve timezone"
+ )
+
+
_ITEMS_ITER_LOOKUP = {
CArrowType.BINARY: "_binary_iter",
CArrowType.LARGE_BINARY: "_binary_iter",
@@ -288,6 +465,12 @@ _ITEMS_ITER_LOOKUP = {
CArrowType.LARGE_LIST: "_list_iter",
CArrowType.FIXED_SIZE_LIST: "_fixed_size_list_iter",
CArrowType.DICTIONARY: "_dictionary_iter",
+ CArrowType.DATE32: "_date_iter",
+ CArrowType.DATE64: "_date_iter",
+ CArrowType.TIME32: "_time_iter",
+ CArrowType.TIME64: "_time_iter",
+ CArrowType.TIMESTAMP: "_timestamp_iter",
+ CArrowType.DURATION: "_duration_iter",
}
_PRIMITIVE_TYPE_NAMES = [
diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
index abba0846..72a55414 100644
--- a/python/tests/test_iterator.py
+++ b/python/tests/test_iterator.py
@@ -15,12 +15,31 @@
# specific language governing permissions and limitations
# under the License.
+import datetime
+
import pytest
-from nanoarrow.iterator import iter_py, iter_tuples
+from nanoarrow.iterator import (
+ ArrayViewIterator,
+ InvalidArrayWarning,
+ LossyConversionWarning,
+ iter_py,
+ iter_tuples,
+)
import nanoarrow as na
+def test_iterator_warnings():
+ msg_unnamed = "<unnamed int32>: something"
+ with pytest.warns(LossyConversionWarning, match=msg_unnamed):
+ ArrayViewIterator(na.int32())._warn("something", LossyConversionWarning)
+
+ msg_named = "some_colname <int32>: something"
+ with pytest.warns(LossyConversionWarning, match=msg_named):
+ iterator = ArrayViewIterator(na.Schema(na.Type.INT32, name="some_colname"))
+ iterator._warn("something", LossyConversionWarning)
+
+
def test_iterator_primitive():
array = na.c_array([1, 2, 3], na.int32())
assert list(iter_py(array)) == [1, 2, 3]
@@ -313,3 +332,166 @@ def test_iterator_nullable_dictionary():
sliced = array[1:]
assert list(iter_py(sliced)) == ["cde", "ab", "def", "cde", None]
+
+
+def test_iterator_date():
+ pa = pytest.importorskip("pyarrow")
+
+ items = [
+ datetime.date(1970, 1, 2),
+ None,
+ datetime.date(2024, 4, 8),
+ ]
+
+ array = pa.array(items, pa.date32())
+ assert list(iter_py(array)) == items
+
+ array = pa.array(items, pa.date64())
+ assert list(iter_py(array)) == items
+
+
+def test_iterator_time():
+ pa = pytest.importorskip("pyarrow")
+
+ items = [
+ datetime.time(15, 45, 21, 12345),
+ None,
+ datetime.time(1, 23, 45),
+ ]
+
+ array = pa.array(items, pa.time64("ns"))
+ assert list(iter_py(array)) == items
+
+ array = pa.array(items, pa.time64("us"))
+ assert list(iter_py(array)) == items
+
+ items[0] = datetime.time(15, 45, 21, 123000)
+ array = pa.array(items, pa.time32("ms"))
+ assert list(iter_py(array)) == items
+
+ items[0] = datetime.time(15, 45, 21)
+ array = pa.array(items, pa.time32("s"))
+ assert list(iter_py(array)) == items
+
+
+def test_iterator_time_invalid():
+ time_invalid = na.c_array_from_buffers(
+ na.time32("s"), 1, [None, na.c_buffer([60 * 60 * 24], na.int32())]
+ )
+
+ with pytest.warns(InvalidArrayWarning):
+ list(iter_py(time_invalid))
+
+
+def test_iterator_timestamp():
+ pa = pytest.importorskip("pyarrow")
+
+ items = [
+ datetime.datetime(1900, 1, 1, 11, 59, 1, 123),
+ None,
+ datetime.datetime(2050, 1, 1, 23, 59, 1, 0),
+ ]
+
+ array = pa.array(items, pa.timestamp("ns"))
+ assert list(iter_py(array)) == items
+
+ array = pa.array(items, pa.timestamp("us"))
+ assert list(iter_py(array)) == items
+
+ items[0] = items[0].replace(microsecond=123000)
+ array = pa.array(items, pa.timestamp("ms"))
+ assert list(iter_py(array)) == items
+
+ items[0] = items[0].replace(microsecond=0)
+ array = pa.array(items, pa.timestamp("s"))
+ assert list(iter_py(array)) == items
+
+
+def test_iterator_timestamp_tz():
+ from nanoarrow.iterator import _get_tzinfo
+
+ pa = pytest.importorskip("pyarrow")
+
+ tz = _get_tzinfo("America/Halifax")
+
+ items = [
+ datetime.datetime(1900, 1, 1, 11, 59, 1, 1234, tzinfo=tz),
+ None,
+ datetime.datetime(2050, 1, 1, 23, 59, 1, 0, tzinfo=tz),
+ ]
+
+ array = pa.array(items, pa.timestamp("ns", "America/Halifax"))
+ assert list(iter_py(array)) == items
+
+ array = pa.array(items, pa.timestamp("us", "America/Halifax"))
+ assert list(iter_py(array)) == items
+
+ items[0] = items[0].replace(microsecond=123000)
+ array = pa.array(items, pa.timestamp("ms", "America/Halifax"))
+ assert list(iter_py(array)) == items
+
+ items[0] = items[0].replace(microsecond=0)
+ array = pa.array(items, pa.timestamp("s", "America/Halifax"))
+ assert list(iter_py(array)) == items
+
+
+def test_iterator_lossy_timestamp():
+ datetime_with_ns = na.c_array_from_buffers(
+ na.timestamp("ns"), 1, [None, na.c_buffer([1], na.int64())]
+ )
+
+ with pytest.warns(LossyConversionWarning):
+ list(iter_py(datetime_with_ns))
+
+
+def test_get_tzinfo():
+ from nanoarrow.iterator import _get_tzinfo
+
+ dt = datetime.datetime(2020, 1, 2, 3, 4, 5)
+
+ assert dt.replace(tzinfo=_get_tzinfo("UTC")).utcoffset() == datetime.timedelta(0)
+ assert dt.replace(tzinfo=_get_tzinfo("utc")).utcoffset() == datetime.timedelta(0)
+
+ assert dt.replace(tzinfo=_get_tzinfo("+03:30")).utcoffset() == datetime.timedelta(
+ hours=3, minutes=30
+ )
+
+ assert dt.replace(tzinfo=_get_tzinfo("-03:30")).utcoffset() == datetime.timedelta(
+ hours=-3, minutes=-30
+ )
+
+ pytest.importorskip("zoneinfo")
+ pytest.importorskip("dateutil")
+
+ tz_zoneinfo = _get_tzinfo("America/Halifax", strategy=["zoneinfo"])
+ tz_dateutil = _get_tzinfo("America/Halifax", strategy=["dateutil"])
+
+ for tz in [tz_zoneinfo, tz_dateutil]:
+ assert dt.replace(tzinfo=tz).utcoffset() == datetime.timedelta(hours=-4)
+
+ with pytest.raises(RuntimeError):
+ _get_tzinfo("America/Halifax", strategy=[])
+
+
+def test_iterator_duration():
+ pa = pytest.importorskip("pyarrow")
+
+ items = [
+ datetime.timedelta(days=-12, seconds=-345, microseconds=-6789),
+ None,
+ datetime.timedelta(days=12345, seconds=67890),
+ ]
+
+ array = pa.array(items, pa.duration("ns"))
+ assert list(iter_py(array)) == items
+
+ array = pa.array(items, pa.duration("us"))
+ assert list(iter_py(array)) == items
+
+ items[0] = datetime.timedelta(days=-12, seconds=-345, microseconds=-678000)
+ array = pa.array(items, pa.duration("ms"))
+ assert list(iter_py(array)) == items
+
+ items[0] = datetime.timedelta(days=-12, seconds=-345)
+ array = pa.array(items, pa.duration("s"))
+ assert list(iter_py(array)) == items