jorisvandenbossche commented on code in PR #417:
URL: https://github.com/apache/arrow-nanoarrow/pull/417#discussion_r1559107584
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ fromtimestamp = datetime.fromtimestamp
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ scale = 1
+ elif unit == "ms":
+ scale = 1000
+ elif unit == "us":
+ scale = 1_000_000
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
+ scale = 1_000_000
+
+ tz = self._schema_view.timezone
+ if tz:
+ tz = _get_tzinfo(tz)
+ tz_fromtimestamp = tz
+ else:
+ tz = None
+ tz_fromtimestamp = _get_tzinfo("UTC")
+
+ for parent in storage:
Review Comment:
```suggestion
for item in storage:
```
Nitpick: could we be consistent in naming? (Above you are using `item`.)
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ fromtimestamp = datetime.fromtimestamp
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ scale = 1
+ elif unit == "ms":
+ scale = 1000
+ elif unit == "us":
+ scale = 1_000_000
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
Review Comment:
Should we silently discard the nanoseconds? An alternative would be to raise
an error (or at least warn) if the nanoseconds are not zero.
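For example, something along these lines (just a sketch; the helper name is
made up and would replace the `_scale_and_round_maybe_none(storage, 0.001)`
call for the "ns" case):
```
def _ns_to_us_strict_maybe_none(storage):
    # Hypothetical helper: refuse to silently drop sub-microsecond precision
    for item in storage:
        if item is None:
            yield None
        elif item % 1000 != 0:
            raise ValueError(
                f"{item} nanoseconds is not a whole number of microseconds"
            )
        else:
            yield item // 1000
```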
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ fromtimestamp = datetime.fromtimestamp
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ scale = 1
+ elif unit == "ms":
+ scale = 1000
+ elif unit == "us":
+ scale = 1_000_000
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
+ scale = 1_000_000
+
+ tz = self._schema_view.timezone
+ if tz:
+ tz = _get_tzinfo(tz)
+ tz_fromtimestamp = tz
+ else:
+ tz = None
+ tz_fromtimestamp = _get_tzinfo("UTC")
+
+ for parent in storage:
+ if parent is None:
+ yield None
+ else:
+ s = parent // scale
+ us = parent % scale * (1_000_000 // scale)
+ yield fromtimestamp(s, tz_fromtimestamp).replace(
+ microsecond=us, tzinfo=tz
+ )
+
+ def _duration_iter(self, offset, length):
+ from datetime import timedelta
+
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ to_us = 1_000_000
+ elif unit == "ms":
+ to_us = 1000
+ elif unit == "us":
+ to_us = 1
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
+ to_us = 1
+
+ for item in storage:
+ if item is None:
+ yield None
+ else:
+ yield timedelta(microseconds=item * to_us)
+
+ def _iter_datetime_components(self, offset, length):
Review Comment:
Would `_iter_time_components` be a better name? I think this is only used
for the time type, right?
##########
python/src/nanoarrow/iterator.py:
##########
@@ -278,6 +398,50 @@ def _iter1(self, offset, length):
return self._struct_tuple_iter(offset, length)
+def _get_tzinfo(tz_string, strategy=None):
+ # We can handle UTC without any imports
+ if tz_string.upper() == "UTC":
+ try:
+ # Added in Python 3.11
+ from datetime import UTC
Review Comment:
This is an alias of `datetime.timezone.utc` (lower case), which is available
in older versions as well (since Python 3.2), so you could simplify this to
use that directly.
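The UTC branch of `_get_tzinfo` could then be reduced to something like this
(sketch of the simplification only):
```
from datetime import timezone

# timezone.utc has existed since Python 3.2, so no try/except around the import is needed
if tz_string.upper() == "UTC":
    return timezone.utc
```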
##########
python/src/nanoarrow/iterator.py:
##########
@@ -278,6 +398,50 @@ def _iter1(self, offset, length):
return self._struct_tuple_iter(offset, length)
+def _get_tzinfo(tz_string, strategy=None):
+ # We can handle UTC without any imports
+ if tz_string.upper() == "UTC":
+ try:
+ # Added in Python 3.11
+ from datetime import UTC
+
+ return UTC
+ except ImportError:
+ from datetime import timedelta, timezone
+
+ return timezone(timedelta(0), "UTC")
+
+ # Try zoneinfo.ZoneInfo() (Python 3.9+)
+ if strategy is None or "zoneinfo" in strategy:
+ try:
+ from zoneinfo import ZoneInfo
+
+ return ZoneInfo(tz_string)
+ except ImportError:
+ pass
Review Comment:
If the timezone doesn't exist (Arrow C++ itself never does any validation of
this, so this is typically "user input"), then this will actually raise a
`ZoneInfoNotFoundError`, not `ImportError`. And you could also catch a `KeyError`
to keep it generic.
```
In [76]: zoneinfo.ZoneInfo("bal")
...
ZoneInfoNotFoundError: 'No time zone found with key bal'
In [77]: zoneinfo.ZoneInfoNotFoundError.mro()
Out[77]:
[zoneinfo._common.ZoneInfoNotFoundError,
KeyError,
LookupError,
Exception,
BaseException,
object]
```
I suppose the ImportError is there for older Python versions, so I would do
that (`import zoneinfo`) in a separate try/except.
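For example, the zoneinfo part of `_get_tzinfo` could look roughly like this
(sketch only, keeping the existing fall-back-to-dateutil behaviour for a
failed import):
```
try:
    import zoneinfo
except ImportError:  # Python < 3.9 without the backport
    zoneinfo = None

if zoneinfo is not None and (strategy is None or "zoneinfo" in strategy):
    # An unknown timezone raises zoneinfo.ZoneInfoNotFoundError (a KeyError) here
    return zoneinfo.ZoneInfo(tz_string)
```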
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ fromtimestamp = datetime.fromtimestamp
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ scale = 1
+ elif unit == "ms":
+ scale = 1000
+ elif unit == "us":
+ scale = 1_000_000
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
+ scale = 1_000_000
+
+ tz = self._schema_view.timezone
+ if tz:
+ tz = _get_tzinfo(tz)
+ tz_fromtimestamp = tz
+ else:
+ tz = None
+ tz_fromtimestamp = _get_tzinfo("UTC")
+
+ for parent in storage:
+ if parent is None:
+ yield None
+ else:
+ s = parent // scale
+ us = parent % scale * (1_000_000 // scale)
+ yield fromtimestamp(s, tz_fromtimestamp).replace(
+ microsecond=us, tzinfo=tz
+ )
+
+ def _duration_iter(self, offset, length):
+ from datetime import timedelta
+
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ to_us = 1_000_000
Review Comment:
For a unit of seconds, it's probably a bit more efficient to pass the seconds
directly to `timedelta(seconds=..)` than to first convert to microseconds,
because the `timedelta` constructor will convert the microseconds back to
seconds internally.
I don't know if that is worth it though, because of course the current logic
keeps the loop a bit simpler.
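i.e. roughly this (a sketch of the alternative; the helper is only for
illustration):
```
from datetime import timedelta

def _duration_to_timedelta(item, unit, to_us):
    # Hypothetical helper: skip the microsecond round-trip for second-resolution durations
    if item is None:
        return None
    if unit == "s":
        return timedelta(seconds=item)
    return timedelta(microseconds=item * to_us)
```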
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
Review Comment:
`datetime.date` also has a `fromtimestamp` method that accepts seconds since
the epoch, so you could use that as well; it seems to be slightly faster:
```
In [53]: item = 10957
In [54]: epoch = datetime.date(1970, 1, 1)
In [55]: epoch + datetime.timedelta(item)
Out[55]: datetime.date(2000, 1, 1)
In [56]: datetime.date.fromtimestamp(60*60*24*item)
Out[56]: datetime.date(2000, 1, 1)
In [57]: %timeit epoch + datetime.timedelta(item)
164 ns ± 0.569 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
In [58]: %timeit datetime.date.fromtimestamp(60*60*24*item)
118 ns ± 0.0869 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
```
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
+
+ def _timestamp_iter(self, offset, length):
+ from datetime import datetime
+
+ fromtimestamp = datetime.fromtimestamp
+ storage = self._primitive_iter(offset, length)
+
+ unit = self._schema_view.time_unit
+ if unit == "s":
+ scale = 1
+ elif unit == "ms":
+ scale = 1000
+ elif unit == "us":
+ scale = 1_000_000
+ elif unit == "ns":
+ storage = _scale_and_round_maybe_none(storage, 0.001)
+ scale = 1_000_000
+
+ tz = self._schema_view.timezone
+ if tz:
+ tz = _get_tzinfo(tz)
+ tz_fromtimestamp = tz
+ else:
+ tz = None
+ tz_fromtimestamp = _get_tzinfo("UTC")
+
+ for parent in storage:
+ if parent is None:
+ yield None
+ else:
+ s = parent // scale
+ us = parent % scale * (1_000_000 // scale)
+ yield fromtimestamp(s, tz_fromtimestamp).replace(
+ microsecond=us, tzinfo=tz
+ )
Review Comment:
I think `fromtimestamp` should work with fractional seconds?
```
In [63]: import time
In [64]: time.time()
Out[64]: 1712740039.6300623
In [65]: datetime.datetime.fromtimestamp(time.time())
Out[65]: datetime.datetime(2024, 4, 10, 11, 7, 34, 904224)
```
So in that case, could we do a normal division and just pass the result to
`fromtimestamp`?
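i.e. something like this (just illustrating the question; the helper name is
made up, and float precision for timestamps far from the epoch might still
need a check):
```
from datetime import datetime, timezone

def _timestamp_to_datetime(item, scale, tz=timezone.utc):
    # Sketch only: let fromtimestamp handle the fractional seconds directly
    return None if item is None else datetime.fromtimestamp(item / scale, tz)
```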
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
+ else:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(milliseconds=item)
+
+ def _time_iter(self, offset, length):
+ from datetime import time
+
+ for item in self._iter_datetime_components(offset, length):
+ if item is None:
+ yield None
+ else:
+ days, hours, mins, secs, us = item
+ yield time(hours, mins, secs, us)
Review Comment:
`days` is just ignored here. Should we assert that it is 0? (The Arrow spec
strictly speaking says the value should never exceed the number of
seconds/milliseconds/... in one day, so for valid data we can be sure there is
no `days` component here.)
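e.g. (sketch only; the helper name is made up):
```
from datetime import time

def _components_to_time(days, hours, mins, secs, us):
    # Per the Arrow spec, a valid time value never spans a full day
    if days != 0:
        raise ValueError(f"time value overflows a single day (days={days})")
    return time(hours, mins, secs, us)
```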
##########
python/src/nanoarrow/iterator.py:
##########
@@ -244,6 +244,126 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _date_iter(self, offset, length):
+ from datetime import date, timedelta
+
+ storage = self._primitive_iter(offset, length)
+ epoch = date(1970, 1, 1)
+
+ if self._schema_view.type_id == CArrowType.DATE32:
+ for item in storage:
+ if item is None:
+ yield item
+ else:
+ yield epoch + timedelta(item)
Review Comment:
And the same applies to DATE64, if the milliseconds are first converted to
seconds.
##########
python/src/nanoarrow/iterator.py:
##########
@@ -278,6 +398,50 @@ def _iter1(self, offset, length):
return self._struct_tuple_iter(offset, length)
+def _get_tzinfo(tz_string, strategy=None):
+ # We can handle UTC without any imports
+ if tz_string.upper() == "UTC":
+ try:
+ # Added in Python 3.11
+ from datetime import UTC
+
+ return UTC
+ except ImportError:
+ from datetime import timedelta, timezone
+
+ return timezone(timedelta(0), "UTC")
+
+ # Try zoneinfo.ZoneInfo() (Python 3.9+)
+ if strategy is None or "zoneinfo" in strategy:
+ try:
+ from zoneinfo import ZoneInfo
+
+ return ZoneInfo(tz_string)
+ except ImportError:
+ pass
Review Comment:
Ah, but of course you are just catching the ImportError so you can fall back
to dateutil, and the `ZoneInfoNotFoundError` will simply bubble up to the
user, which is fine. So ignore my comment ;)
##########
python/src/nanoarrow/iterator.py:
##########
@@ -278,6 +398,50 @@ def _iter1(self, offset, length):
return self._struct_tuple_iter(offset, length)
+def _get_tzinfo(tz_string, strategy=None):
+ # We can handle UTC without any imports
+ if tz_string.upper() == "UTC":
+ try:
+ # Added in Python 3.11
+ from datetime import UTC
+
+ return UTC
+ except ImportError:
+ from datetime import timedelta, timezone
+
+ return timezone(timedelta(0), "UTC")
+
+ # Try zoneinfo.ZoneInfo() (Python 3.9+)
+ if strategy is None or "zoneinfo" in strategy:
+ try:
+ from zoneinfo import ZoneInfo
+
+ return ZoneInfo(tz_string)
+ except ImportError:
+ pass
+
+ # Try dateutil.tz.gettz()
+ if strategy is None or "dateutil" in strategy:
+ try:
+ from dateutil.tz import gettz
+
+ return gettz(tz_string)
+ except ImportError:
+ pass
+
+ raise RuntimeError(
+        "zoneinfo (Python 3.9+), pytz, or dateutil is required to resolve timezone"
Review Comment:
```suggestion
"zoneinfo (Python 3.9+) or dateutil is required to resolve timezone"
```