jorisvandenbossche commented on code in PR #417:
URL: https://github.com/apache/arrow-nanoarrow/pull/417#discussion_r1559107584
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ fromtimestamp = datetime.fromtimestamp
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ scale = 1
+ elif unit == "ms":
+ scale = 1000
+ elif unit == "us":
+ scale = 1_000_000
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
+ scale = 1_000_000
+
+ tz = self._schema_view.timezone
+ if tz:
+ tz = _get_tzinfo(tz)
+ tz_fromtimestamp = tz
+ else:
+ tz = None
+ tz_fromtimestamp = _get_tzinfo("UTC")
+
+ for parent in storage:
Review Comment:
```suggestion
for item in storage:
```
Nitpick: could we be consistent in naming? (Above you are using `item`.)
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ fromtimestamp = datetime.fromtimestamp
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ scale = 1
+ elif unit == "ms":
+ scale = 1000
+ elif unit == "us":
+ scale = 1_000_000
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
Review Comment:
Should we silently discard the nanoseconds? An alternative would be to raise
an error (or at least warn) if the nanoseconds are not zero.
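For example, something along these lines (just a sketch; the helper name is
made up and would replace the `_scale_and_round_maybe_none(storage, 0.001)`
call for the "ns" case):
```
def _ns_to_us_strict_maybe_none(storage):
    # Hypothetical helper: refuse to silently drop sub-microsecond precision
    for item in storage:
        if item is None:
            yield None
        elif item % 1000 != 0:
            raise ValueError(
                f"{item} nanoseconds is not a whole number of microseconds"
            )
        else:
            yield item // 1000
```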
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ fromtimestamp = datetime.fromtimestamp
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ scale = 1
+ elif unit == "ms":
+ scale = 1000
+ elif unit == "us":
+ scale = 1_000_000
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
+ scale = 1_000_000
+
+ tz = self._schema_view.timezone
+ if tz:
+ tz = _get_tzinfo(tz)
+ tz_fromtimestamp = tz
+ else:
+ tz = None
+ tz_fromtimestamp = _get_tzinfo("UTC")
+
+ for parent in storage:
+ if parent is None:
+ yield None
+ else:
+ s = parent // scale
+ us = parent % scale * (1_000_000 // scale)
+ yield fromtimestamp(s, tz_fromtimestamp).replace(
+ microsecond=us, tzinfo=tz
+ )
+
+ def _duration_iter(self, offset, length):
+ from datetime import timedelta
+
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ to_us = 1_000_000
+ elif unit == "ms":
+ to_us = 1000
+ elif unit == "us":
+ to_us = 1
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
+ to_us = 1
+
+ for item in storage:
+ if item is None:
+ yield None
+ else:
+ yield timedelta(microseconds=item * to_us)
+
+ def _iter_datetime_components(self, offset, length):
Review Comment:
Would `_iter_time_components` be a better name? I think this is only used
for the time type, right?
##########
python/src/nanoarrow/iterator.py:
##########
@@ -278,6 +398,50 @@ def _iter1(self, offset, length):
return self._struct_tuple_iter(offset, length)
+def _get_tzinfo(tz_string, strategy=None):
+ # We can handle UTC without any imports
+ if tz_string.upper() == "UTC":
+ try:
+ # Added in Python 3.11
+ from datetime import UTC
Review Comment:
This is an alias of `datetime.timezone.utc` (lower case), which is available
in older versions as well (since Python 3.2), so you could simplify this to
use that directly.
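The UTC branch of `_get_tzinfo` could then be reduced to something like this
(sketch of the simplification only):
```
from datetime import timezone

# timezone.utc has existed since Python 3.2, so no try/except around the import is needed
if tz_string.upper() == "UTC":
    return timezone.utc
```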
##########
python/src/nanoarrow/iterator.py:
##########
@@ -278,6 +398,50 @@ def _iter1(self, offset, length):
return self._struct_tuple_iter(offset, length)
+def _get_tzinfo(tz_string, strategy=None):
+ # We can handle UTC without any imports
+ if tz_string.upper() == "UTC":
+ try:
+ # Added in Python 3.11
+ from datetime import UTC
+
+ return UTC
+ except ImportError:
+ from datetime import timedelta, timezone
+
+ return timezone(timedelta(0), "UTC")
+
+ # Try zoneinfo.ZoneInfo() (Python 3.9+)
+ if strategy is None or "zoneinfo" in strategy:
+ try:
+ from zoneinfo import ZoneInfo
+
+ return ZoneInfo(tz_string)
+ except ImportError:
+ pass
Review Comment:
If the timezone doesn't exist (Arrow C++ itself never does any validation of
this, so this is typically "user input"), then this will actually raise a
`ZoneInfoNotFoundError`, not `ImportError`. And you could also catch a `KeyError`
to keep it generic.
```
In [76]: zoneinfo.ZoneInfo("bal")
...
ZoneInfoNotFoundError: 'No time zone found with key bal'
In [77]: zoneinfo.ZoneInfoNotFoundError.mro()
Out[77]:
[zoneinfo._common.ZoneInfoNotFoundError,
KeyError,
LookupError,
Exception,
BaseException,
object]
```
I suppose the ImportError is there for older Python versions, so I would do
that (`import zoneinfo`) in a separate try/except.
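For example, the zoneinfo part of `_get_tzinfo` could look roughly like this
(sketch only, keeping the existing fall-back-to-dateutil behaviour for a
failed import):
```
try:
    import zoneinfo
except ImportError:  # Python < 3.9 without the backport
    zoneinfo = None

if zoneinfo is not None and (strategy is None or "zoneinfo" in strategy):
    # An unknown timezone raises zoneinfo.ZoneInfoNotFoundError (a KeyError) here
    return zoneinfo.ZoneInfo(tz_string)
```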
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ fromtimestamp = datetime.fromtimestamp
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ scale = 1
+ elif unit == "ms":
+ scale = 1000
+ elif unit == "us":
+ scale = 1_000_000
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
+ scale = 1_000_000
+
+ tz = self._schema_view.timezone
+ if tz:
+ tz = _get_tzinfo(tz)
+ tz_fromtimestamp = tz
+ else:
+ tz = None
+ tz_fromtimestamp = _get_tzinfo("UTC")
+
+ for parent in storage:
+ if parent is None:
+ yield None
+ else:
+ s = parent // scale
+ us = parent % scale * (1_000_000 // scale)
+ yield fromtimestamp(s, tz_fromtimestamp).replace(
+ microsecond=us, tzinfo=tz
+ )
+
+ def _duration_iter(self, offset, length):
+ from datetime import timedelta
+
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ to_us = 1_000_000
Review Comment:
For a unit of seconds, it's probably a bit more efficient to pass the seconds
directly to `timedelta(seconds=..)` than to first convert to microseconds,
because the `timedelta` constructor will convert the microseconds back to
seconds internally.
I don't know if that is worth it though, because of course the current logic
keeps the loop a bit simpler.
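i.e. roughly this (a sketch of the alternative; the helper is only for
illustration):
```
from datetime import timedelta

def _duration_to_timedelta(item, unit, to_us):
    # Hypothetical helper: skip the microsecond round-trip for second-resolution durations
    if item is None:
        return None
    if unit == "s":
        return timedelta(seconds=item)
    return timedelta(microseconds=item * to_us)
```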
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
Review Comment:
`datetime.date` also has a `fromtimestamp` method that accepts seconds since
the epoch, so you could use that as well; it seems to be slightly faster:
```
In [53]: item = 10957
In [54]: epoch = datetime.date(1970, 1, 1)
In [55]: epoch + datetime.timedelta(item)
Out[55]: datetime.date(2000, 1, 1)
In [56]: datetime.date.fromtimestamp(60*60*24*item)
Out[56]: datetime.date(2000, 1, 1)
In [57]: %timeit epoch + datetime.timedelta(item)
164 ns ± 0.569 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
In [58]: %timeit datetime.date.fromtimestamp(60*60*24*item)
118 ns ± 0.0869 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
```
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ fromtimestamp = datetime.fromtimestamp
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ scale = 1
+ elif unit == "ms":
+ scale = 1000
+ elif unit == "us":
+ scale = 1_000_000
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
+ scale = 1_000_000
+
+ tz = self._schema_view.timezone
+ if tz:
+ tz = _get_tzinfo(tz)
+ tz_fromtimestamp = tz
+ else:
+ tz = None
+ tz_fromtimestamp = _get_tzinfo("UTC")
+
+ for parent in storage:
+ if parent is None:
+ yield None
+ else:
+ s = parent // scale
+ us = parent % scale * (1_000_000 // scale)
+ yield fromtimestamp(s, tz_fromtimestamp).replace(
+ microsecond=us, tzinfo=tz
+ )
Review Comment:
I think `fromtimestamp` should work with fractional seconds?
```
In [63]: import time
In [64]: time.time()
Out[64]: 1712740039.6300623
In [65]: datetime.datetime.fromtimestamp(time.time())
Out[65]: datetime.datetime(2024, 4, 10, 11, 7, 34, 904224)
```
So in that case, could we do a normal division and just pass the result to
`fromtimestamp`?
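i.e. something like this (just illustrating the question; the helper name is
made up, and float precision for timestamps far from the epoch might still
need a check):
```
from datetime import datetime, timezone

def _timestamp_to_datetime(item, scale, tz=timezone.utc):
    # Sketch only: let fromtimestamp handle the fractional seconds directly
    return None if item is None else datetime.fromtimestamp(item / scale, tz)
```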
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
Review Comment:
`days` is just ignored here. Should we assert that it is 0? (The Arrow spec
strictly speaking says the value should never exceed the number of
seconds/milliseconds/... in one day, so for valid data we can be sure there is
no `days` component here.)
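e.g. (sketch only; the helper name is made up):
```
from datetime import time

def _components_to_time(days, hours, mins, secs, us):
    # Per the Arrow spec, a valid time value never spans a full day
    if days != 0:
        raise ValueError(f"time value overflows a single day (days={days})")
    return time(hours, mins, secs, us)
```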
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
Review Comment:
And the same applies to DATE64, if the milliseconds are first converted to
seconds.
##########
python/src/nanoarrow/iterator.py:
##########
@@ -278,6 +398,50 @@ def _iter1(self, offset, length):
return self._struct_tuple_iter(offset, length)
+def _get_tzinfo(tz_string, strategy=None):
+ # We can handle UTC without any imports
+ if tz_string.upper() == "UTC":
+ try:
+ # Added in Python 3.11
+ from datetime import UTC
+
+ return UTC
+ except ImportError:
+ from datetime import timedelta, timezone
+
+ return timezone(timedelta(0), "UTC")
+
+ # Try zoneinfo.ZoneInfo() (Python 3.9+)
+ if strategy is None or "zoneinfo" in strategy:
+ try:
+ from zoneinfo import ZoneInfo
+
+ return ZoneInfo(tz_string)
+ except ImportError:
+ pass
Review Comment:
Ah, but of course you are just catching the ImportError so you can fall back
to dateutil, and the `ZoneInfoNotFoundError` will simply bubble up to the
user, which is fine. So ignore my comment ;)
##########
python/src/nanoarrow/iterator.py:
##########
@@ -278,6 +398,50 @@ def _iter1(self, offset, length):
return self._struct_tuple_iter(offset, length)
+def _get_tzinfo(tz_string, strategy=None):
+ # We can handle UTC without any imports
+ if tz_string.upper() == "UTC":
+ try:
+ # Added in Python 3.11
+ from datetime import UTC
+
+ return UTC
+ except ImportError:
+ from datetime import timedelta, timezone
+
+ return timezone(timedelta(0), "UTC")
+
+ # Try zoneinfo.ZoneInfo() (Python 3.9+)
+ if strategy is None or "zoneinfo" in strategy:
+ try:
+ from zoneinfo import ZoneInfo
+
+ return ZoneInfo(tz_string)
+ except ImportError:
+ pass
+
+ # Try dateutil.tz.gettz()
+ if strategy is None or "dateutil" in strategy:
+ try:
+ from dateutil.tz import gettz
+
+ return gettz(tz_string)
+ except ImportError:
+ pass
+
+ raise RuntimeError(
+        "zoneinfo (Python 3.9+), pytz, or dateutil is required to resolve timezone"
Review Comment:
```suggestion
"zoneinfo (Python 3.9+) or dateutil is required to resolve timezone"
```