Re: [PR] feat(python): Add Arrow->Python datetime support [arrow-nanoarrow]

via GitHub Thu, 11 Apr 2024 06:48:00 -0700


jorisvandenbossche commented on code in PR #417:
URL: https://github.com/apache/arrow-nanoarrow/pull/417#discussion_r1560980528



##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +263,125 @@ def _binary_iter(self, offset, length):
             for start, end in zip(starts, ends):
                 yield bytes(data[start:end])
 
+    def _date_iter(self, offset, length):
+        from datetime import date, timedelta
+
+        storage = self._primitive_iter(offset, length)
+        epoch = date(1970, 1, 1)
+
+        if self._schema_view.type_id == CArrowType.DATE32:
+            for item in storage:
+                if item is None:
+                    yield item
+                else:
+                    yield epoch + timedelta(item)
+        else:
+            for item in storage:
+                if item is None:
+                    yield item
+                else:
+                    yield epoch + timedelta(milliseconds=item)
+
+    def _time_iter(self, offset, length):
+        from datetime import time
+
+        for item in self._iter_time_components(offset, length):
+            if item is None:
+                yield None
+            else:
+                days, hours, mins, secs, us = item
+                if days != 0:
+                    self._warn("days != 0", InvalidArrayWarning)
+
+                yield time(hours, mins, secs, us)
+
+    def _timestamp_iter(self, offset, length):
+        from datetime import datetime
+
+        epoch = datetime(1970, 1, 1, tzinfo=_get_tzinfo("UTC"))
+        parent = self._duration_iter(offset, length)
+
+        tz = self._schema_view.timezone
+        if tz:
+            tz = _get_tzinfo(tz)
+
+            for item in parent:
+                if item is None:
+                    yield None
+                else:
+                    yield (epoch + item).astimezone(tz)
+        else:
+            for item in parent:
+                if item is None:
+                    yield None
+                else:
+                    yield (epoch + item).replace(tzinfo=None)

Review Comment:
   ```suggestion
               epoch = epoch.replace(tzinfo=None)
               for item in parent:
                   if item is None:
                       yield None
                   else:
                       yield epoch + item
   ```
   
   I don't think we need to call `replace` for every item inside the for loop: stripping the tzinfo from `epoch` once, before the loop, gives the same result.



##########
python/src/nanoarrow/iterator.py:
##########
@@ -278,6 +416,44 @@ def _iter1(self, offset, length):
         return self._struct_tuple_iter(offset, length)
 
 
+def _get_tzinfo(tz_string, strategy=None):
+    import re
+    from datetime import timedelta, timezone
+
+    # We can handle UTC without any imports
+    if re.search(r"^utc$", tz_string, re.IGNORECASE):

Review Comment:
   Is there a reason you moved to this more complex check compared to 
`tz_string.upper() == "UTC"`? Are there cases the simpler comparison would miss?



##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
             for start, end in zip(starts, ends):
                 yield bytes(data[start:end])
 
+    def _date_iter(self, offset, length):
+        from datetime import date, timedelta
+
+        storage = self._primitive_iter(offset, length)
+        epoch = date(1970, 1, 1)
+
+        if self._schema_view.type_id == CArrowType.DATE32:
+            for item in storage:
+                if item is None:
+                    yield item
+                else:
+                    yield epoch + timedelta(item)
+        else:
+            for item in storage:
+                if item is None:
+                    yield item
+                else:
+                    yield epoch + timedelta(milliseconds=item)
+
+    def _time_iter(self, offset, length):
+        from datetime import time
+
+        for item in self._iter_datetime_components(offset, length):
+            if item is None:
+                yield None
+            else:
+                days, hours, mins, secs, us = item
+                yield time(hours, mins, secs, us)
+
+    def _timestamp_iter(self, offset, length):
+        from datetime import datetime
+
+        fromtimestamp = datetime.fromtimestamp
+        storage = self._primitive_iter(offset, length)
+
+        unit = self._schema_view.time_unit
+        if unit == "s":
+            scale = 1
+        elif unit == "ms":
+            scale = 1000
+        elif unit == "us":
+            scale = 1_000_000
+        elif unit == "ns":
+            storage = _scale_and_round_maybe_none(storage, 0.001)
+            scale = 1_000_000
+
+        tz = self._schema_view.timezone
+        if tz:
+            tz = _get_tzinfo(tz)
+            tz_fromtimestamp = tz
+        else:
+            tz = None
+            tz_fromtimestamp = _get_tzinfo("UTC")
+
+        for parent in storage:
+            if parent is None:
+                yield None
+            else:
+                s = parent // scale
+                us = parent % scale * (1_000_000 // scale)
+                yield fromtimestamp(s, tz_fromtimestamp).replace(
+                    microsecond=us, tzinfo=tz
+                )
+
+    def _duration_iter(self, offset, length):
+        from datetime import timedelta
+
+        storage = self._primitive_iter(offset, length)
+
+        unit = self._schema_view.time_unit
+        if unit == "s":
+            to_us = 1_000_000

Review Comment:
   > I think there is a lot that could be optimized here...this pass is mostly 
for completeness/correctness. Probably this is a job for C or C++ + and Python 
C API where we can do some of these things efficiently.
   
   FWIW, I think it is also nice that this is just in Python (and it's still 
faster than pyarrow's `to_pylist` anyway). But it's true that the bigger gain 
will probably be found in moving this to C(ython). At least if we use numpy as 
the baseline, this specific duration iteration can be improved 10x. Timings 
for 1M elements: this PR: 340 ms; pyarrow: 480 ms; this PR but with directly 
passing seconds: 280 ms; numpy: 30 ms.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(python): Add Arrow->Python datetime support [arrow-nanoarrow]

Reply via email to