jorisvandenbossche commented on code in PR #417:
URL: https://github.com/apache/arrow-nanoarrow/pull/417#discussion_r1560980528
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +263,125 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_time_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ if days != 0:
+ self._warn("days != 0", InvalidArrayWarning)
+
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ epoch = datetime(1970, 1, 1, tzinfo=_get_tzinfo("UTC"))
+ parent = self._duration_iter(offset, length)
+
+ tz = self._schema_view.timezone
+ if tz:
+ tz = _get_tzinfo(tz)
+
+ for item in parent:
+ if item is None:
+ yield None
+ else:
+ yield (epoch + item).astimezone(tz)
+ else:
+ for item in parent:
+ if item is None:
+ yield None
+ else:
+ yield (epoch + item).replace(tzinfo=None)
Review Comment:
```suggestion
epoch = epoch.replace(tzinfo=None)
for item in parent:
if item is None:
yield None
else:
yield epoch + item
```
No need to call `replace` on every iteration of the for loop, I think? The tzinfo can be stripped from `epoch` once, before the loop.
##########
python/src/nanoarrow/iterator.py:
##########
@@ -278,6 +416,44 @@ def _iter1(self, offset, length):
return self._struct_tuple_iter(offset, length)
+def _get_tzinfo(tz_string, strategy=None):
+ import re
+ from datetime import timedelta, timezone
+
+ # We can handle UTC without any imports
+ if re.search(r"^utc$", tz_string, re.IGNORECASE):
Review Comment:
Is there a reason you moved to this more complex check instead of
`tz_string.upper() == "UTC"`? Are there cases the simpler check would miss?
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ fromtimestamp = datetime.fromtimestamp
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ scale = 1
+ elif unit == "ms":
+ scale = 1000
+ elif unit == "us":
+ scale = 1_000_000
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
+ scale = 1_000_000
+
+ tz = self._schema_view.timezone
+ if tz:
+ tz = _get_tzinfo(tz)
+ tz_fromtimestamp = tz
+ else:
+ tz = None
+ tz_fromtimestamp = _get_tzinfo("UTC")
+
+ for parent in storage:
+ if parent is None:
+ yield None
+ else:
+ s = parent // scale
+ us = parent % scale * (1_000_000 // scale)
+ yield fromtimestamp(s, tz_fromtimestamp).replace(
+ microsecond=us, tzinfo=tz
+ )
+
+ def _duration_iter(self, offset, length):
+ from datetime import timedelta
+
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ to_us = 1_000_000
Review Comment:
> I think there is a lot that could be optimized here...this pass is mostly
> for completeness/correctness. Probably this is a job for C or C++ + and Python
> C API where we can do some of these things efficiently.

FWIW, I think it is also nice that this is just in Python (and it's still
faster than pyarrow's `to_pylist` anyway). But it's true that the bigger gain
will probably be found in moving this to C(ython). At least if we use numpy as
the baseline, this specific duration iteration can be improved 10x. Timings
for 1M elements: this PR: 340 ms; pyarrow: 480 ms; this PR but with directly
passing seconds: 280 ms; numpy: 30 ms.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]