This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 9c63a259 feat(python) : Added support to create
timestamp/date32/date64 Array from integer values (#502)
9c63a259 is described below
commit 9c63a259b2f746690336fcc6d7d0bb8d1f4ee71f
Author: Abhishek Singh <[email protected]>
AuthorDate: Sat Jun 8 04:01:37 2024 -0700
feat(python) : Added support to create timestamp/date32/date64 Array from
integer values (#502)
Addresses https://github.com/apache/arrow-nanoarrow/issues/478
Currently, if you create a `c_array` from an iterable of timestamps, an
error is raised.
Code:
```python
from datetime import datetime
import nanoarrow as na
import pyarrow as pa
def gen_timestamp():
    """Yield 100 current Unix timestamps as whole-second integers."""
    for _ in range(100):
        yield int(round(datetime.now().timestamp()))
timestamp_array = na.c_array(gen_timestamp(), na.timestamp("s"))
parray = pa.Table.from_arrays([timestamp_array], names=["timestamp"])
print(parray.to_pandas())
```
Error:
```
Traceback (most recent call last):
File
"/Users/as/nanoarrow_poc/.venv/lib/python3.12/site-packages/nanoarrow/c_array.py",
line 128, in c_array
builder = builder_cls(schema)
^^^^^^^^^^^^^^^^^^^
File
"/Users/as/nanoarrow_poc/.venv/lib/python3.12/site-packages/nanoarrow/c_array.py",
line 457, in __init__
raise ValueError(
ValueError: Can't build array of type timestamp from iterable
```
Expected output:
```
timestamp
0 2024-05-31 18:43:10
1 2024-05-31 18:43:10
2 2024-05-31 18:43:10
3 2024-05-31 18:43:10
4 2024-05-31 18:43:10
.. ...
95 2024-05-31 18:43:10
96 2024-05-31 18:43:10
97 2024-05-31 18:43:10
98 2024-05-31 18:43:10
99 2024-05-31 18:43:10
[100 rows x 1 columns]
```
Benefits
- PyArrow Table and pandas DataFrame can use the corresponding nanoarrow
Array type to cast the values when using `to_pandas`.
---------
Co-authored-by: Dewey Dunnington <[email protected]>
---
python/src/nanoarrow/_lib.pyx | 11 +++++
python/src/nanoarrow/c_array.py | 8 +++-
python/tests/test_array.py | 65 ++++++++++++++++++++++++++++
python/tests/test_c_array.py | 95 +++++++++++++++++++++++++++++++++++++++++
python/tests/test_c_buffer.py | 35 ++++++++++++++-
5 files changed, 211 insertions(+), 3 deletions(-)
diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 73da100a..058c27c9 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -1095,6 +1095,17 @@ cdef class CSchemaView:
def storage_type_id(self):
return self._schema_view.storage_type
+ @property
+ def storage_buffer_format(self):
+ if self.buffer_format is not None:
+ return self.buffer_format
+ elif self._schema_view.type == NANOARROW_TYPE_DATE32:
+ return 'i'
+ elif self._schema_view.type in (NANOARROW_TYPE_TIMESTAMP,
NANOARROW_TYPE_DATE64, NANOARROW_TYPE_DURATION):
+ return 'q'
+ else:
+ return None
+
@property
def buffer_format(self):
"""The Python struct format representing an element of this type
diff --git a/python/src/nanoarrow/c_array.py b/python/src/nanoarrow/c_array.py
index 64723325..db33c22e 100644
--- a/python/src/nanoarrow/c_array.py
+++ b/python/src/nanoarrow/c_array.py
@@ -406,7 +406,7 @@ class ArrayFromPyBufferBuilder(ArrayBuilder):
def __init__(self, schema):
super().__init__(schema)
- if self._schema_view.buffer_format is None:
+ if self._schema_view.storage_buffer_format is None:
raise ValueError(
f"Can't build array of type {self._schema_view.type} from
PyBuffer"
)
@@ -508,7 +508,7 @@ class ArrayFromIterableBuilder(ArrayBuilder):
def _append_using_array(self, obj: Iterable) -> None:
from array import array
- py_array = array(self._schema_view.buffer_format, obj)
+ py_array = array(self._schema_view.storage_buffer_format, obj)
buffer = CBuffer.from_pybuffer(py_array)
self._c_builder.set_buffer(1, buffer, move=True)
self._c_builder.set_length(len(buffer))
@@ -549,6 +549,10 @@ _ARRAY_BUILDER_FROM_ITERABLE_METHOD = {
CArrowType.UINT64: "_append_using_array",
CArrowType.FLOAT: "_append_using_array",
CArrowType.DOUBLE: "_append_using_array",
+ CArrowType.TIMESTAMP: "_append_using_array",
+ CArrowType.DATE32: "_append_using_array",
+ CArrowType.DATE64: "_append_using_array",
+ CArrowType.DURATION: "_append_using_array",
}
_ARRAY_BUILDER_FROM_NULLABLE_ITERABLE_METHOD = {
diff --git a/python/tests/test_array.py b/python/tests/test_array.py
index 8eb1e96a..2dfd83c8 100644
--- a/python/tests/test_array.py
+++ b/python/tests/test_array.py
@@ -14,6 +14,7 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+from datetime import date, datetime, timedelta, timezone
import pytest
from nanoarrow.c_array_stream import CArrayStream
@@ -354,3 +355,67 @@ def test_array_inspect(capsys):
array.inspect()
captured = capsys.readouterr()
assert captured.out.startswith("<ArrowArray struct<col0: int32")
+
+
+def test_timestamp_array():
+ d1 = int(round(datetime(1985, 12, 31, 0, 0,
tzinfo=timezone.utc).timestamp() * 1e3))
+ d2 = int(round(datetime(2005, 3, 4, 0, 0, tzinfo=timezone.utc).timestamp()
* 1e3))
+ array = na.Array([d1, d2], na.timestamp("ms"))
+ assert list(array.to_pysequence()) == [
+ datetime(1985, 12, 31, 0, 0),
+ datetime(2005, 3, 4, 0, 0),
+ ]
+ assert array.to_pylist() == [
+ datetime(1985, 12, 31, 0, 0),
+ datetime(2005, 3, 4, 0, 0),
+ ]
+ assert repr(array).startswith("nanoarrow.Array<timestamp('ms', '')>")
+
+
+def test_date64_array():
+ unix_epoch = date(1970, 1, 1)
+ d1, d2 = date(1970, 1, 2), date(1970, 1, 3)
+ d1_date64 = int(round((d1 - unix_epoch).total_seconds() * 1e3))
+ d2_date64 = int(round((d2 - unix_epoch).total_seconds() * 1e3))
+ array = na.Array([d1_date64, d2_date64], na.date64())
+ assert list(array.to_pysequence()) == [d1, d2]
+ assert array.to_pylist() == [d1, d2]
+
+
+def test_duration_array():
+ unix_epoch = date(1970, 1, 1)
+ d1, d2 = date(1970, 1, 2), date(1970, 1, 3)
+ d1_date64 = int(round((d1 - unix_epoch).total_seconds() * 1e3))
+ d2_date64 = int(round((d2 - unix_epoch).total_seconds() * 1e3))
+ array = na.Array([d1_date64, d2_date64], na.duration("ms"))
+ assert list(array.to_pysequence()) == [timedelta(days=1),
timedelta(days=2)]
+ assert array.to_pylist() == [timedelta(days=1), timedelta(days=2)]
+
+
+def test_timestamp_array_using_struct():
+ schema = na.struct(
+ {
+ "creation_timestamp": na.timestamp("ms"),
+ }
+ )
+
+ d1 = int(round(datetime(1985, 12, 31, 0, 0,
tzinfo=timezone.utc).timestamp() * 1e3))
+ d2 = int(round(datetime(2005, 3, 4, 0, 0, tzinfo=timezone.utc).timestamp()
* 1e3))
+
+ columns = [
+ na.c_array([d1, d2], na.timestamp("ms")),
+ ]
+
+ c_array = na.c_array_from_buffers(
+ schema, length=columns[0].length, buffers=[None], children=columns
+ )
+ array = na.Array(c_array)
+ names, columns = array.to_columns_pysequence()
+ assert names == ["creation_timestamp"]
+ assert list(array.to_pysequence()) == [
+ {"creation_timestamp": datetime(1985, 12, 31, 0, 0)},
+ {"creation_timestamp": datetime(2005, 3, 4, 0, 0)},
+ ]
+ assert repr(array).startswith(
+ "nanoarrow.Array<struct<creation_timestamp: timestamp('ms', '')>"
+ )
diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
index b40febcd..2db9bdb8 100644
--- a/python/tests/test_c_array.py
+++ b/python/tests/test_c_array.py
@@ -15,6 +15,9 @@
# specific language governing permissions and limitations
# under the License.
+import array
+from datetime import date, datetime
+
import pytest
from nanoarrow._lib import CArrayBuilder, NanoarrowException
from nanoarrow.c_schema import c_schema_view
@@ -514,3 +517,95 @@ def test_c_array_from_buffers_validation():
validation_level=validation_level,
)
assert c_array.length == 2
+
+
+def test_c_array_timestamp_seconds():
+ d1 = int(round(datetime(1970, 1, 1).timestamp()))
+ d2 = int(round(datetime(1985, 12, 31).timestamp()))
+ d3 = int(round(datetime(2005, 3, 4).timestamp()))
+ c_array = na.c_array([d1, d2, d3], na.timestamp("s"))
+ assert c_array.length == 3
+ assert c_array.null_count == 0
+ view = c_array.view()
+ assert list(view.buffer(0)) == []
+ assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_timestamp_seconds_from_pybuffer():
+ d1 = int(round(datetime(1970, 1, 1).timestamp()))
+ d2 = int(round(datetime(1985, 12, 31).timestamp()))
+ d3 = int(round(datetime(2005, 3, 4).timestamp()))
+ c_array = na.c_array(array.array("q", [d1, d2, d3]), na.timestamp("s"))
+ assert c_array.length == 3
+ assert c_array.null_count == 0
+ view = c_array.view()
+ assert list(view.buffer(0)) == []
+ assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_timestamp_milliseconds():
+ d1 = int(round(datetime(1970, 1, 1).timestamp() * 1e3))
+ d2 = int(round(datetime(1985, 12, 31).timestamp() * 1e3))
+ d3 = int(round(datetime(2005, 3, 4).timestamp() * 1e3))
+ c_array = na.c_array([d1, d2, d3], na.timestamp("ms"))
+ assert c_array.length == 3
+ assert c_array.null_count == 0
+ view = c_array.view()
+ assert list(view.buffer(0)) == []
+ assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_timestamp_milliseconds_from_pybuffer():
+ d1 = int(round(datetime(1970, 1, 1).timestamp() * 1e3))
+ d2 = int(round(datetime(1985, 12, 31).timestamp() * 1e3))
+ d3 = int(round(datetime(2005, 3, 4).timestamp() * 1e3))
+ c_array = na.c_array(array.array("q", [d1, d2, d3]), na.timestamp("ms"))
+ assert c_array.length == 3
+ assert c_array.null_count == 0
+ view = c_array.view()
+ assert list(view.buffer(0)) == []
+ assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_timestamp_microseconds():
+ d1 = int(round(datetime(1970, 1, 1).timestamp() * 1e6))
+ d2 = int(round(datetime(1985, 12, 31).timestamp() * 1e6))
+ d3 = int(round(datetime(2005, 3, 4).timestamp() * 1e6))
+ c_array = na.c_array([d1, d2, d3], na.timestamp("us"))
+ assert c_array.length == 3
+ assert c_array.null_count == 0
+ view = c_array.view()
+ assert list(view.buffer(0)) == []
+ assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_timestamp_nanoseconds():
+ d1 = int(round(datetime(1970, 1, 1).timestamp() * 1e9))
+ d2 = int(round(datetime(1985, 12, 31).timestamp() * 1e9))
+ d3 = int(round(datetime(2005, 3, 4).timestamp() * 1e9))
+ c_array = na.c_array([d1, d2, d3], na.timestamp("ns"))
+ assert c_array.length == 3
+ assert c_array.null_count == 0
+ view = c_array.view()
+ assert list(view.buffer(0)) == []
+ assert list(view.buffer(1)) == [d1, d2, d3]
+
+
+def test_c_array_duration():
+ unix_epoch = date(1970, 1, 1)
+ d1, d2, d3 = date(1970, 1, 2), date(1970, 1, 3), date(1970, 1, 4)
+ d1_duration_in_ms = int(round((d1 - unix_epoch).total_seconds() * 1e3))
+ d2_duration_in_ms = int(round((d2 - unix_epoch).total_seconds() * 1e3))
+ d3_duration_in_ms = int(round((d3 - unix_epoch).total_seconds() * 1e3))
+ c_array = na.c_array(
+ [d1_duration_in_ms, d2_duration_in_ms, d3_duration_in_ms],
na.duration("ms")
+ )
+ assert c_array.length == 3
+ assert c_array.null_count == 0
+ view = c_array.view()
+ assert list(view.buffer(0)) == []
+ assert list(view.buffer(1)) == [
+ d1_duration_in_ms,
+ d2_duration_in_ms,
+ d3_duration_in_ms,
+ ]
diff --git a/python/tests/test_c_buffer.py b/python/tests/test_c_buffer.py
index 6a38d127..94111d5f 100644
--- a/python/tests/test_c_buffer.py
+++ b/python/tests/test_c_buffer.py
@@ -17,6 +17,7 @@
import struct
import sys
+from datetime import date, datetime
import pytest
from nanoarrow._lib import CBuffer, CBufferBuilder
@@ -260,7 +261,7 @@ def test_c_buffer_from_iterable():
# An Arrow type whose storage type is not the same as its top-level
# type will error.
with pytest.raises(ValueError, match="Can't create buffer"):
- na.c_buffer([1, 2, 3], na.date32())
+ na.c_buffer([1, 2, 3], na.dictionary(na.int32(), na.string()))
with pytest.raises(ValueError, match="Can't create buffer"):
na.c_buffer([1, 2, 3], na.extension_type(na.int32(), "arrow.test"))
@@ -362,3 +363,35 @@ def test_c_buffer_bitmap_from_iterable():
builder.write_elements([True, False])
with pytest.raises(NotImplementedError, match="Append to bitmap"):
builder.write_elements([True])
+
+
+def test_c_buffer_from_timestamp_iterable():
+ d1 = int(round(datetime(1970, 1, 1).timestamp() * 1e3))
+ d2 = int(round(datetime(1985, 12, 31).timestamp() * 1e3))
+ d3 = int(round(datetime(2005, 3, 4).timestamp() * 1e3))
+ with pytest.raises(ValueError):
+ na.c_buffer([d1, d2, d3], na.timestamp("ms"))
+
+
+def test_c_buffer_from_date64_iterable():
+ unix_epoch = date(1970, 1, 1)
+ d1 = date(1970, 1, 2)
+ diff_in_milliseconds = int(round((d1 - unix_epoch).total_seconds() * 1e3))
+ with pytest.raises(ValueError):
+ na.c_buffer([diff_in_milliseconds], na.date64())
+
+
+def test_c_buffer_from_date32_iterable():
+ unix_epoch = date(1970, 1, 1)
+ d1 = date(1970, 1, 2)
+ diff_in_days = (d1 - unix_epoch).days
+ with pytest.raises(ValueError):
+ na.c_buffer([diff_in_days], na.date32())
+
+
+def test_c_buffer_from_duration_iterable():
+ unix_epoch = date(1970, 1, 1)
+ d1 = date(1970, 1, 2)
+ diff_in_milliseconds = int(round((d1 - unix_epoch).total_seconds() * 1e3))
+ with pytest.raises(ValueError):
+ na.c_buffer([diff_in_milliseconds], na.duration("ms"))