This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new d6ef4800 feat(python): Add StringView and BinaryView IO to Python
bindings (#637)
d6ef4800 is described below
commit d6ef48000d2855007efd18025a48a0b3bc7fadfe
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Sep 30 10:25:36 2024 -0500
feat(python): Add StringView and BinaryView IO to Python bindings (#637)
This PR implements StringView and BinaryView support in the Python
bindings. It is a thin wrapper around the C functions that were recently
added for these types, although we should perhaps move some of the buffer
info calculation into the C library, since I had to work around the same
gap in the R bindings as well.
```python
import nanoarrow as na
array = na.Array(["abc", "def", None, "longer than 12 bytes"],
na.string_view())
array
#> nanoarrow.Array<string_view>[4]
#> 'abc'
#> 'def'
#> None
#> 'longer than 12 bytes'
array.buffers
#> (nanoarrow.c_buffer.CBufferView(bool[1 b] 11010000),
#> nanoarrow.c_buffer.CBufferView(string_view[64 b] b'\x03\x00\x00\x00abc\x00\x00\x00\x00\x00\x00\x00\x00\x00'...),
#> nanoarrow.c_buffer.CBufferView(string[20 b] b'longer than 12 bytes'),
#> nanoarrow.c_buffer.CBufferView(int64[8 b] 20))
```
---------
Co-authored-by: William Ayd <[email protected]>
---
python/src/nanoarrow/_array.pyx | 83 +++++++++++++++++++++++++++++++++++-----
python/src/nanoarrow/_types.pxd | 2 +
python/src/nanoarrow/_types.pyi | 7 ++++
python/src/nanoarrow/_types.pyx | 11 ++++++
python/src/nanoarrow/c_array.py | 2 +
python/src/nanoarrow/iterator.py | 8 ++++
python/tests/test_c_array.py | 40 +++++++++++++++++++
python/tests/test_iterator.py | 28 ++++++++++----
8 files changed, 163 insertions(+), 18 deletions(-)
diff --git a/python/src/nanoarrow/_array.pyx b/python/src/nanoarrow/_array.pyx
index 3baaf16e..0bd3a961 100644
--- a/python/src/nanoarrow/_array.pyx
+++ b/python/src/nanoarrow/_array.pyx
@@ -22,10 +22,12 @@ from cpython.pycapsule cimport PyCapsule_GetPointer
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from cpython cimport (
Py_buffer,
- PyObject_GetBuffer,
PyBuffer_Release,
PyBUF_ANY_CONTIGUOUS,
PyBUF_FORMAT,
+ PyBytes_FromStringAndSize,
+ PyObject_GetBuffer,
+ PyUnicode_FromStringAndSize,
)
from nanoarrow_c cimport (
@@ -43,6 +45,9 @@ from nanoarrow_c cimport (
ArrowArrayView,
ArrowArrayViewComputeNullCount,
ArrowArrayViewInitFromSchema,
+ ArrowArrayViewIsNull,
+ ArrowArrayViewGetStringUnsafe,
+ ArrowArrayViewGetBytesUnsafe,
ArrowArrayViewSetArray,
ArrowArrayViewSetArrayMinimal,
ArrowBitCountSet,
@@ -57,6 +62,7 @@ from nanoarrow_c cimport (
ArrowValidationLevel,
NANOARROW_BUFFER_TYPE_DATA,
NANOARROW_BUFFER_TYPE_DATA_OFFSET,
+ NANOARROW_BUFFER_TYPE_DATA_VIEW,
NANOARROW_BUFFER_TYPE_TYPE_ID,
NANOARROW_BUFFER_TYPE_UNION_OFFSET,
NANOARROW_BUFFER_TYPE_VALIDITY,
@@ -78,6 +84,7 @@ from nanoarrow._device cimport Device, CSharedSyncEvent
from nanoarrow._buffer cimport CBuffer, CBufferView
from nanoarrow._schema cimport CSchema, CLayout
+from nanoarrow cimport _types
from nanoarrow._utils cimport (
alloc_c_array,
alloc_c_device_array,
@@ -189,13 +196,48 @@ cdef class CArrayView:
@property
def n_buffers(self):
+ if _types.is_data_view(self._ptr.storage_type):
+ return 2 + self._ptr.n_variadic_buffers + 1
+
return self.layout.n_buffers
- def buffer_type(self, int64_t i):
+ def _buffer_info(self, int64_t i):
if i < 0 or i >= self.n_buffers:
raise IndexError(f"{i} out of range [0, {self.n_buffers}]")
- buffer_type = self._ptr.layout.buffer_type[i]
+ if (
+ _types.is_data_view(self._ptr.storage_type)
+ and i == (2 + self._ptr.n_variadic_buffers)
+ ):
+ return (
+ NANOARROW_BUFFER_TYPE_DATA,
+ _types.INT64,
+ 64,
+ <uintptr_t>self._ptr.array.buffers[i],
+ (self._ptr.n_variadic_buffers) * 8
+ )
+ elif (
+ _types.is_data_view(self._ptr.storage_type)
+ and i >= 2
+ ):
+ return (
+ NANOARROW_BUFFER_TYPE_DATA,
+ _types.STRING if int(self._ptr.storage_type) ==
_types.STRING_VIEW else _types.BINARY,
+ 0,
+ <uintptr_t>self._ptr.array.buffers[i],
+ (<int64_t*>self._ptr.array.buffers[2 +
self._ptr.n_variadic_buffers])[i - 2]
+ )
+
+ return (
+ self._ptr.layout.buffer_type[i],
+ self._ptr.layout.buffer_data_type[i],
+ self._ptr.layout.element_size_bits[i],
+ <uintptr_t>self._ptr.buffer_views[i].data.data,
+ self._ptr.buffer_views[i].size_bytes
+ )
+
+ def buffer_type(self, int64_t i):
+ buffer_type = self._buffer_info(i)[0]
if buffer_type == NANOARROW_BUFFER_TYPE_VALIDITY:
return "validity"
elif buffer_type == NANOARROW_BUFFER_TYPE_TYPE_ID:
@@ -206,14 +248,17 @@ cdef class CArrayView:
return "data_offset"
elif buffer_type == NANOARROW_BUFFER_TYPE_DATA:
return "data"
+ elif buffer_type == NANOARROW_BUFFER_TYPE_DATA_VIEW:
+ return "data_view"
else:
return "none"
def buffer(self, int64_t i):
- if i < 0 or i >= self.n_buffers:
- raise IndexError(f"{i} out of range [0, {self.n_buffers}]")
+ _, data_type, element_size_bits, addr, size = self._buffer_info(i)
- cdef ArrowBufferView* buffer_view = &(self._ptr.buffer_views[i])
+ cdef ArrowBufferView buffer_view
+ buffer_view.data.data = <void*>addr
+ buffer_view.size_bytes = size
# Check the buffer size here because the error later is cryptic.
# Buffer sizes are set to -1 when they are "unknown", so because of
errors
@@ -224,10 +269,10 @@ cdef class CArrayView:
return CBufferView(
self._array_base,
- <uintptr_t>buffer_view.data.data,
- buffer_view.size_bytes,
- self._ptr.layout.buffer_data_type[i],
- self._ptr.layout.element_size_bits[i],
+ addr,
+ size,
+ data_type,
+ element_size_bits,
self._event
)
@@ -249,6 +294,24 @@ cdef class CArrayView:
return dictionary
+ def _iter_bytes(self, int64_t offset, int64_t length) -> bytes | None:
+ cdef ArrowBufferView item_view
+ for i in range(offset, length):
+ if ArrowArrayViewIsNull(self._ptr, i):
+ yield None
+ else:
+ item_view = ArrowArrayViewGetBytesUnsafe(self._ptr, i)
+ yield PyBytes_FromStringAndSize(item_view.data.as_char,
item_view.size_bytes)
+
+ def _iter_str(self, int64_t offset, int64_t length) -> str | None:
+ cdef ArrowStringView item_view
+ for i in range(offset, length):
+ if ArrowArrayViewIsNull(self._ptr, i):
+ yield None
+ else:
+ item_view = ArrowArrayViewGetStringUnsafe(self._ptr, i)
+ yield PyUnicode_FromStringAndSize(item_view.data,
item_view.size_bytes)
+
def __repr__(self):
return _repr_utils.array_view_repr(self)
diff --git a/python/src/nanoarrow/_types.pxd b/python/src/nanoarrow/_types.pxd
index 4a53fe31..3dc727c4 100644
--- a/python/src/nanoarrow/_types.pxd
+++ b/python/src/nanoarrow/_types.pxd
@@ -90,6 +90,8 @@ cpdef bint has_time_unit(int type_id)
cpdef bint is_union(int type_id)
+cpdef bint is_data_view(int type_id)
+
cdef int to_format(int type_id, int element_size_bits, size_t out_size, char*
out)
cdef tuple from_format(format)
diff --git a/python/src/nanoarrow/_types.pyi b/python/src/nanoarrow/_types.pyi
index 23c8cf8c..e55097b2 100644
--- a/python/src/nanoarrow/_types.pyi
+++ b/python/src/nanoarrow/_types.pyi
@@ -20,6 +20,7 @@ import enum
from typing import Callable, ClassVar
BINARY: CArrowType
+BINARY_VIEW: CArrowType
BOOL: CArrowType
DATE32: CArrowType
DATE64: CArrowType
@@ -47,8 +48,10 @@ LARGE_STRING: CArrowType
LIST: CArrowType
MAP: CArrowType
NA: CArrowType
+RUN_END_ENCODED: CArrowType
SPARSE_UNION: CArrowType
STRING: CArrowType
+STRING_VIEW: CArrowType
STRUCT: CArrowType
TIME32: CArrowType
TIME64: CArrowType
@@ -61,6 +64,7 @@ UNINITIALIZED: CArrowType
__pyx_capi__: dict
__test__: dict
has_time_unit: _cython_3_0_11.cython_function_or_method
+is_data_view: _cython_3_0_11.cython_function_or_method
is_decimal: _cython_3_0_11.cython_function_or_method
is_fixed_size: _cython_3_0_11.cython_function_or_method
is_floating_point: _cython_3_0_11.cython_function_or_method
@@ -72,6 +76,7 @@ sys_byteorder: str
class CArrowType(enum.IntFlag):
__new__: ClassVar[Callable] = ...
BINARY: ClassVar[CArrowType] = ...
+ BINARY_VIEW: ClassVar[CArrowType] = ...
BOOL: ClassVar[CArrowType] = ...
DATE32: ClassVar[CArrowType] = ...
DATE64: ClassVar[CArrowType] = ...
@@ -99,8 +104,10 @@ class CArrowType(enum.IntFlag):
LIST: ClassVar[CArrowType] = ...
MAP: ClassVar[CArrowType] = ...
NA: ClassVar[CArrowType] = ...
+ RUN_END_ENCODED: ClassVar[CArrowType] = ...
SPARSE_UNION: ClassVar[CArrowType] = ...
STRING: ClassVar[CArrowType] = ...
+ STRING_VIEW: ClassVar[CArrowType] = ...
STRUCT: ClassVar[CArrowType] = ...
TIME32: ClassVar[CArrowType] = ...
TIME64: ClassVar[CArrowType] = ...
diff --git a/python/src/nanoarrow/_types.pyx b/python/src/nanoarrow/_types.pyx
index c10463b0..e43545f2 100644
--- a/python/src/nanoarrow/_types.pyx
+++ b/python/src/nanoarrow/_types.pyx
@@ -109,6 +109,14 @@ cpdef bint is_union(int type_id):
)
+cpdef bint is_data_view(int type_id):
+ """Check if type_id is a binary view or string view type"""
+ return type_id in (
+ _types.BINARY_VIEW,
+ _types.STRING_VIEW
+ )
+
+
cdef tuple from_format(format):
"""Convert a Python buffer protocol format string to a itemsize/type_id
tuple
@@ -236,6 +244,9 @@ cdef int to_format(int type_id, int element_size_bits,
size_t out_size, char* ou
elif type_id == _types.DECIMAL256:
format_const = "32s"
element_size_bits_calc = 256
+ elif is_data_view(type_id):
+ format_const = "16s"
+ element_size_bits_calc = 128
else:
raise ValueError(f"Unsupported Arrow type_id for format conversion:
{type_id}")
diff --git a/python/src/nanoarrow/c_array.py b/python/src/nanoarrow/c_array.py
index d0250456..0c71bda4 100644
--- a/python/src/nanoarrow/c_array.py
+++ b/python/src/nanoarrow/c_array.py
@@ -547,8 +547,10 @@ _ARRAY_BUILDER_FROM_ITERABLE_METHOD = {
_types.BINARY: "_append_bytes",
_types.LARGE_BINARY: "_append_bytes",
_types.FIXED_SIZE_BINARY: "_append_bytes",
+ _types.BINARY_VIEW: "_append_bytes",
_types.STRING: "_append_strings",
_types.LARGE_STRING: "_append_strings",
+ _types.STRING_VIEW: "_append_strings",
_types.INT8: "_append_using_array",
_types.UINT8: "_append_using_array",
_types.INT16: "_append_using_array",
diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
index 874f285f..fc6e1428 100644
--- a/python/src/nanoarrow/iterator.py
+++ b/python/src/nanoarrow/iterator.py
@@ -322,6 +322,12 @@ class PyIterator(ArrayViewBaseIterator):
for start, end in zip(starts, ends):
yield bytes(data[start:end])
+ def _binary_view_iter(self, offset, length):
+ return self._array_view._iter_bytes(offset, length)
+
+ def _string_view_iter(self, offset, length):
+ return self._array_view._iter_str(offset, length)
+
def _decimal_iter(self, offset, length):
from decimal import Context, Decimal
from sys import byteorder
@@ -564,6 +570,8 @@ _ITEMS_ITER_LOOKUP = {
_types.DURATION: "_duration_iter",
_types.DECIMAL128: "_decimal_iter",
_types.DECIMAL256: "_decimal_iter",
+ _types.STRING_VIEW: "_string_view_iter",
+ _types.BINARY_VIEW: "_binary_view_iter",
}
_PRIMITIVE_TYPE_NAMES = [
diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
index e4d98f17..d2993ec6 100644
--- a/python/tests/test_c_array.py
+++ b/python/tests/test_c_array.py
@@ -288,6 +288,32 @@ def test_c_array_from_iterable_string():
na.c_array([b"1234"], na.string())
+def test_c_array_from_iterable_string_view():
+ string = na.c_array(
+ ["abc", None, "a string longer than 12 bytes"], na.string_view()
+ )
+ assert string.length == 3
+ assert string.null_count == 1
+ assert string.n_buffers == 4
+
+ array_view = string.view()
+ assert len(array_view.buffer(0)) == 1
+ assert bytes(array_view.buffer(2)) == b"a string longer than 12 bytes"
+ assert list(array_view.buffer(3)) == [len("a string longer than 12 bytes")]
+
+ # Make sure this also works when all strings are inlined (i.e., no
variadic buffers)
+ string = na.c_array(["abc", None, "short string"], na.string_view())
+ assert string.length == 3
+ assert string.null_count == 1
+ assert string.n_buffers == 3
+
+ array_view = string.view()
+ assert len(array_view.buffer(0)) == 1
+ assert len(array_view.buffer(1)) == 3
+ assert len(bytes(array_view.buffer(1))) == 3 * 16
+ assert list(array_view.buffer(2)) == []
+
+
def test_c_array_from_iterable_bytes():
string = na.c_array([b"abc", None, b"defg"], na.binary())
assert string.length == 3
@@ -311,6 +337,20 @@ def test_c_array_from_iterable_bytes():
na.c_array([buf_2d], na.binary())
+def test_c_array_from_iterable__view():
+ string = na.c_array(
+ [b"abc", None, b"a string longer than 12 bytes"], na.binary_view()
+ )
+ assert string.length == 3
+ assert string.null_count == 1
+ assert string.n_buffers == 4
+
+ array_view = string.view()
+ assert len(array_view.buffer(0)) == 1
+ assert bytes(array_view.buffer(2)) == b"a string longer than 12 bytes"
+ assert list(array_view.buffer(3)) == [len("a string longer than 12 bytes")]
+
+
def test_c_array_from_iterable_non_empty_nullable_without_nulls():
c_array = na.c_array([1, 2, 3], na.int32())
assert c_array.length == 3
diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
index fe6e8bbd..0c0e0474 100644
--- a/python/tests/test_iterator.py
+++ b/python/tests/test_iterator.py
@@ -68,8 +68,11 @@ def test_iterator_nullable_primitive():
assert list(iter_py(sliced)) == [2, 3, None]
-def test_iterator_string():
- array = na.c_array(["ab", "cde"], na.string())
[email protected](
+ "arrow_type", [na.string(), na.large_string(), na.string_view()]
+)
+def test_iterator_string(arrow_type):
+ array = na.c_array(["ab", "cde"], arrow_type)
assert list(iter_py(array)) == ["ab", "cde"]
@@ -77,8 +80,11 @@ def test_iterator_string():
assert list(iter_py(sliced)) == ["cde"]
-def test_iterator_nullable_string():
- array = na.c_array(["ab", "cde", None], na.string())
[email protected](
+ "arrow_type", [na.string(), na.large_string(), na.string_view()]
+)
+def test_iterator_nullable_string(arrow_type):
+ array = na.c_array(["ab", "cde", None], arrow_type)
assert list(iter_py(array)) == ["ab", "cde", None]
@@ -86,8 +92,11 @@ def test_iterator_nullable_string():
assert list(iter_py(sliced)) == ["cde", None]
-def test_iterator_binary():
- array = na.c_array([b"ab", b"cde"], na.binary())
[email protected](
+ "arrow_type", [na.binary(), na.large_binary(), na.binary_view()]
+)
+def test_iterator_binary(arrow_type):
+ array = na.c_array([b"ab", b"cde"], arrow_type)
assert list(iter_py(array)) == [b"ab", b"cde"]
@@ -95,8 +104,11 @@ def test_iterator_binary():
assert list(iter_py(sliced)) == [b"cde"]
-def test_iterator_nullable_binary():
- array = na.c_array([b"ab", b"cde", None], na.binary())
[email protected](
+ "arrow_type", [na.binary(), na.large_binary(), na.binary_view()]
+)
+def test_iterator_nullable_binary(arrow_type):
+ array = na.c_array([b"ab", b"cde", None], arrow_type)
assert list(iter_py(array)) == [b"ab", b"cde", None]