This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 9c63a259 feat(python) : Added support to create timestamp/date32/date64 Array from integer values (#502)
9c63a259 is described below

commit 9c63a259b2f746690336fcc6d7d0bb8d1f4ee71f
Author: Abhishek Singh <[email protected]>
AuthorDate: Sat Jun 8 04:01:37 2024 -0700

    feat(python) : Added support to create timestamp/date32/date64 Array from integer values (#502)
    
    Addresses https://github.com/apache/arrow-nanoarrow/issues/478
    
    Currently, if you create a `c_array` from an iterable of integer
    timestamp values, an error is raised.
    
    Code:
    ```python
    from datetime import datetime
    
    import nanoarrow as na
    import pyarrow as pa
    
    
    def gen_timestamp():
        for i in range(100):
            yield int(round(datetime.now().timestamp()))
    
    timestamp_array = na.c_array(gen_timestamp(), na.timestamp("s"))
    
    parray = pa.Table.from_arrays([timestamp_array], names=["timestamp"])
    
    print(parray.to_pandas())
    ```
    Error:
    ```
    Traceback (most recent call last):
      File "/Users/as/nanoarrow_poc/.venv/lib/python3.12/site-packages/nanoarrow/c_array.py", line 128, in c_array
        builder = builder_cls(schema)
                  ^^^^^^^^^^^^^^^^^^^
      File "/Users/as/nanoarrow_poc/.venv/lib/python3.12/site-packages/nanoarrow/c_array.py", line 457, in __init__
        raise ValueError(
    ValueError: Can't build array of type timestamp from iterable
    ```
    
    Expected output:
    ```
                 timestamp
    0  2024-05-31 18:43:10
    1  2024-05-31 18:43:10
    2  2024-05-31 18:43:10
    3  2024-05-31 18:43:10
    4  2024-05-31 18:43:10
    ..                 ...
    95 2024-05-31 18:43:10
    96 2024-05-31 18:43:10
    97 2024-05-31 18:43:10
    98 2024-05-31 18:43:10
    99 2024-05-31 18:43:10
    
    [100 rows x 1 columns]
    ```
    
    Benefits:
    - PyArrow Table and pandas DataFrame can use the corresponding nanoarrow
    array type to cast the values when using `to_pandas`.
    
    ---------
    
    Co-authored-by: Dewey Dunnington <[email protected]>
---
 python/src/nanoarrow/_lib.pyx   | 11 +++++
 python/src/nanoarrow/c_array.py |  8 +++-
 python/tests/test_array.py      | 65 ++++++++++++++++++++++++++++
 python/tests/test_c_array.py    | 95 +++++++++++++++++++++++++++++++++++++++++
 python/tests/test_c_buffer.py   | 35 ++++++++++++++-
 5 files changed, 211 insertions(+), 3 deletions(-)

diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 73da100a..058c27c9 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -1095,6 +1095,17 @@ cdef class CSchemaView:
     def storage_type_id(self):
         return self._schema_view.storage_type
 
+    @property
+    def storage_buffer_format(self):
+        if self.buffer_format is not None:
+            return self.buffer_format
+        elif self._schema_view.type == NANOARROW_TYPE_DATE32:
+            return 'i'
+        elif self._schema_view.type in (NANOARROW_TYPE_TIMESTAMP, 
NANOARROW_TYPE_DATE64, NANOARROW_TYPE_DURATION):
+            return 'q'
+        else:
+            return None
+
     @property
     def buffer_format(self):
         """The Python struct format representing an element of this type
diff --git a/python/src/nanoarrow/c_array.py b/python/src/nanoarrow/c_array.py
index 64723325..db33c22e 100644
--- a/python/src/nanoarrow/c_array.py
+++ b/python/src/nanoarrow/c_array.py
@@ -406,7 +406,7 @@ class ArrayFromPyBufferBuilder(ArrayBuilder):
     def __init__(self, schema):
         super().__init__(schema)
 
-        if self._schema_view.buffer_format is None:
+        if self._schema_view.storage_buffer_format is None:
             raise ValueError(
                 f"Can't build array of type {self._schema_view.type} from 
PyBuffer"
             )
@@ -508,7 +508,7 @@ class ArrayFromIterableBuilder(ArrayBuilder):
     def _append_using_array(self, obj: Iterable) -> None:
         from array import array
 
-        py_array = array(self._schema_view.buffer_format, obj)
+        py_array = array(self._schema_view.storage_buffer_format, obj)
         buffer = CBuffer.from_pybuffer(py_array)
         self._c_builder.set_buffer(1, buffer, move=True)
         self._c_builder.set_length(len(buffer))
@@ -549,6 +549,10 @@ _ARRAY_BUILDER_FROM_ITERABLE_METHOD = {
     CArrowType.UINT64: "_append_using_array",
     CArrowType.FLOAT: "_append_using_array",
     CArrowType.DOUBLE: "_append_using_array",
+    CArrowType.TIMESTAMP: "_append_using_array",
+    CArrowType.DATE32: "_append_using_array",
+    CArrowType.DATE64: "_append_using_array",
+    CArrowType.DURATION: "_append_using_array",
 }
 
 _ARRAY_BUILDER_FROM_NULLABLE_ITERABLE_METHOD = {
diff --git a/python/tests/test_array.py b/python/tests/test_array.py
index 8eb1e96a..2dfd83c8 100644
--- a/python/tests/test_array.py
+++ b/python/tests/test_array.py
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+from datetime import date, datetime, timedelta, timezone
 
 import pytest
 from nanoarrow.c_array_stream import CArrayStream
@@ -354,3 +355,67 @@ def test_array_inspect(capsys):
     array.inspect()
     captured = capsys.readouterr()
     assert captured.out.startswith("<ArrowArray struct<col0: int32")
+
+
+def test_timestamp_array():
+    d1 = int(round(datetime(1985, 12, 31, 0, 0, 
tzinfo=timezone.utc).timestamp() * 1e3))
+    d2 = int(round(datetime(2005, 3, 4, 0, 0, tzinfo=timezone.utc).timestamp() 
* 1e3))
+    array = na.Array([d1, d2], na.timestamp("ms"))
+    assert list(array.to_pysequence()) == [
+        datetime(1985, 12, 31, 0, 0),
+        datetime(2005, 3, 4, 0, 0),
+    ]
+    assert array.to_pylist() == [
+        datetime(1985, 12, 31, 0, 0),
+        datetime(2005, 3, 4, 0, 0),
+    ]
+    assert repr(array).startswith("nanoarrow.Array<timestamp('ms', '')>")
+
+
+def test_date64_array():
+    unix_epoch = date(1970, 1, 1)
+    d1, d2 = date(1970, 1, 2), date(1970, 1, 3)
+    d1_date64 = int(round((d1 - unix_epoch).total_seconds() * 1e3))
+    d2_date64 = int(round((d2 - unix_epoch).total_seconds() * 1e3))
+    array = na.Array([d1_date64, d2_date64], na.date64())
+    assert list(array.to_pysequence()) == [d1, d2]
+    assert array.to_pylist() == [d1, d2]
+
+
+def test_duration_array():
+    unix_epoch = date(1970, 1, 1)
+    d1, d2 = date(1970, 1, 2), date(1970, 1, 3)
+    d1_date64 = int(round((d1 - unix_epoch).total_seconds() * 1e3))
+    d2_date64 = int(round((d2 - unix_epoch).total_seconds() * 1e3))
+    array = na.Array([d1_date64, d2_date64], na.duration("ms"))
+    assert list(array.to_pysequence()) == [timedelta(days=1), 
timedelta(days=2)]
+    assert array.to_pylist() == [timedelta(days=1), timedelta(days=2)]
+
+
+def test_timestamp_array_using_struct():
+    schema = na.struct(
+        {
+            "creation_timestamp": na.timestamp("ms"),
+        }
+    )
+
+    d1 = int(round(datetime(1985, 12, 31, 0, 0, 
tzinfo=timezone.utc).timestamp() * 1e3))
+    d2 = int(round(datetime(2005, 3, 4, 0, 0, tzinfo=timezone.utc).timestamp() 
* 1e3))
+
+    columns = [
+        na.c_array([d1, d2], na.timestamp("ms")),
+    ]
+
+    c_array = na.c_array_from_buffers(
+        schema, length=columns[0].length, buffers=[None], children=columns
+    )
+    array = na.Array(c_array)
+    names, columns = array.to_columns_pysequence()
+    assert names == ["creation_timestamp"]
+    assert list(array.to_pysequence()) == [
+        {"creation_timestamp": datetime(1985, 12, 31, 0, 0)},
+        {"creation_timestamp": datetime(2005, 3, 4, 0, 0)},
+    ]
+    assert repr(array).startswith(
+        "nanoarrow.Array<struct<creation_timestamp: timestamp('ms', '')>"
+    )
diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
index b40febcd..2db9bdb8 100644
--- a/python/tests/test_c_array.py
+++ b/python/tests/test_c_array.py
@@ -15,6 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import array
+from datetime import date, datetime
+
 import pytest
 from nanoarrow._lib import CArrayBuilder, NanoarrowException
 from nanoarrow.c_schema import c_schema_view
@@ -514,3 +517,95 @@ def test_c_array_from_buffers_validation():
             validation_level=validation_level,
         )
         assert c_array.length == 2
+
+
+def test_c_array_timestamp_seconds():
+    d1 = int(round(datetime(1970, 1, 1).timestamp()))
+    d2 = int(round(datetime(1985, 12, 31).timestamp()))
+    d3 = int(round(datetime(2005, 3, 4).timestamp()))
+    c_array = na.c_array([d1, d2, d3], na.timestamp("s"))
+    assert c_array.length == 3
+    assert c_array.null_count == 0
+    view = c_array.view()
+    assert list(view.buffer(0)) == []
+    assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_timestamp_seconds_from_pybuffer():
+    d1 = int(round(datetime(1970, 1, 1).timestamp()))
+    d2 = int(round(datetime(1985, 12, 31).timestamp()))
+    d3 = int(round(datetime(2005, 3, 4).timestamp()))
+    c_array = na.c_array(array.array("q", [d1, d2, d3]), na.timestamp("s"))
+    assert c_array.length == 3
+    assert c_array.null_count == 0
+    view = c_array.view()
+    assert list(view.buffer(0)) == []
+    assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_timestamp_milliseconds():
+    d1 = int(round(datetime(1970, 1, 1).timestamp() * 1e3))
+    d2 = int(round(datetime(1985, 12, 31).timestamp() * 1e3))
+    d3 = int(round(datetime(2005, 3, 4).timestamp() * 1e3))
+    c_array = na.c_array([d1, d2, d3], na.timestamp("ms"))
+    assert c_array.length == 3
+    assert c_array.null_count == 0
+    view = c_array.view()
+    assert list(view.buffer(0)) == []
+    assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_timestamp_milliseconds_from_pybuffer():
+    d1 = int(round(datetime(1970, 1, 1).timestamp() * 1e3))
+    d2 = int(round(datetime(1985, 12, 31).timestamp() * 1e3))
+    d3 = int(round(datetime(2005, 3, 4).timestamp() * 1e3))
+    c_array = na.c_array(array.array("q", [d1, d2, d3]), na.timestamp("ms"))
+    assert c_array.length == 3
+    assert c_array.null_count == 0
+    view = c_array.view()
+    assert list(view.buffer(0)) == []
+    assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_timestamp_microseconds():
+    d1 = int(round(datetime(1970, 1, 1).timestamp() * 1e6))
+    d2 = int(round(datetime(1985, 12, 31).timestamp() * 1e6))
+    d3 = int(round(datetime(2005, 3, 4).timestamp() * 1e6))
+    c_array = na.c_array([d1, d2, d3], na.timestamp("us"))
+    assert c_array.length == 3
+    assert c_array.null_count == 0
+    view = c_array.view()
+    assert list(view.buffer(0)) == []
+    assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_timestamp_nanoseconds():
+    d1 = int(round(datetime(1970, 1, 1).timestamp() * 1e9))
+    d2 = int(round(datetime(1985, 12, 31).timestamp() * 1e9))
+    d3 = int(round(datetime(2005, 3, 4).timestamp() * 1e9))
+    c_array = na.c_array([d1, d2, d3], na.timestamp("ns"))
+    assert c_array.length == 3
+    assert c_array.null_count == 0
+    view = c_array.view()
+    assert list(view.buffer(0)) == []
+    assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_duration():
+    unix_epoch = date(1970, 1, 1)
+    d1, d2, d3 = date(1970, 1, 2), date(1970, 1, 3), date(1970, 1, 4)
+    d1_duration_in_ms = int(round((d1 - unix_epoch).total_seconds() * 1e3))
+    d2_duration_in_ms = int(round((d2 - unix_epoch).total_seconds() * 1e3))
+    d3_duration_in_ms = int(round((d3 - unix_epoch).total_seconds() * 1e3))
+    c_array = na.c_array(
+        [d1_duration_in_ms, d2_duration_in_ms, d3_duration_in_ms], 
na.duration("ms")
+    )
+    assert c_array.length == 3
+    assert c_array.null_count == 0
+    view = c_array.view()
+    assert list(view.buffer(0)) == []
+    assert list(view.buffer(1)) == [
+        d1_duration_in_ms,
+        d2_duration_in_ms,
+        d3_duration_in_ms,
+    ]
diff --git a/python/tests/test_c_buffer.py b/python/tests/test_c_buffer.py
index 6a38d127..94111d5f 100644
--- a/python/tests/test_c_buffer.py
+++ b/python/tests/test_c_buffer.py
@@ -17,6 +17,7 @@
 
 import struct
 import sys
+from datetime import date, datetime
 
 import pytest
 from nanoarrow._lib import CBuffer, CBufferBuilder
@@ -260,7 +261,7 @@ def test_c_buffer_from_iterable():
     # An Arrow type whose storage type is not the same as its top-level
     # type will error.
     with pytest.raises(ValueError, match="Can't create buffer"):
-        na.c_buffer([1, 2, 3], na.date32())
+        na.c_buffer([1, 2, 3], na.dictionary(na.int32(), na.string()))
 
     with pytest.raises(ValueError, match="Can't create buffer"):
         na.c_buffer([1, 2, 3], na.extension_type(na.int32(), "arrow.test"))
@@ -362,3 +363,35 @@ def test_c_buffer_bitmap_from_iterable():
     builder.write_elements([True, False])
     with pytest.raises(NotImplementedError, match="Append to bitmap"):
         builder.write_elements([True])
+
+
+def test_c_buffer_from_timestamp_iterable():
+    d1 = int(round(datetime(1970, 1, 1).timestamp() * 1e3))
+    d2 = int(round(datetime(1985, 12, 31).timestamp() * 1e3))
+    d3 = int(round(datetime(2005, 3, 4).timestamp() * 1e3))
+    with pytest.raises(ValueError):
+        na.c_buffer([d1, d2, d3], na.timestamp("ms"))
+
+
+def test_c_buffer_from_date64_iterable():
+    unix_epoch = date(1970, 1, 1)
+    d1 = date(1970, 1, 2)
+    diff_in_milliseconds = int(round((d1 - unix_epoch).total_seconds() * 1e3))
+    with pytest.raises(ValueError):
+        na.c_buffer([diff_in_milliseconds], na.date64())
+
+
+def test_c_buffer_from_date32_iterable():
+    unix_epoch = date(1970, 1, 1)
+    d1 = date(1970, 1, 2)
+    diff_in_days = (d1 - unix_epoch).days
+    with pytest.raises(ValueError):
+        na.c_buffer([diff_in_days], na.date32())
+
+
+def test_c_buffer_from_duration_iterable():
+    unix_epoch = date(1970, 1, 1)
+    d1 = date(1970, 1, 2)
+    diff_in_milliseconds = int(round((d1 - unix_epoch).total_seconds() * 1e3))
+    with pytest.raises(ValueError):
+        na.c_buffer([diff_in_milliseconds], na.duration("ms"))

Reply via email to