This is an automated email from the ASF dual-hosted git repository.
raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new aa26f28a64 GH-44651: [Python] Allow from_buffers to work with
StringView on Python (#44701)
aa26f28a64 is described below
commit aa26f28a64b7638f01756d78a2ea8fbddceafc65
Author: Raúl Cumplido <[email protected]>
AuthorDate: Mon Nov 18 12:10:57 2024 +0100
GH-44651: [Python] Allow from_buffers to work with StringView on Python
(#44701)
### Rationale for this change
Currently `from_buffers` is not working with StringView on Python because
we validate against num_buffers. This only takes into account the mandatory
buffers but does not take into account the variadic_spec that can be present
for both string_view and binary_view.
### What changes are included in this PR?
Take into account whether the type contains a variadic_spec for the
non-mandatory buffers and, if so, only check that the number of buffers
passed is at least num_buffers (a lower bound) instead of an exact match.
### Are these changes tested?
Yes, I've added a couple of tests.
### Are there any user-facing changes?
We are exposing a new property on the Python DataType, `has_variadic_buffers`,
which tells us whether the number of buffers expected is only lower-bounded by
num_buffers.
* GitHub Issue: #44651
Authored-by: Raúl Cumplido <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
---
python/pyarrow/array.pxi | 7 ++++++-
python/pyarrow/includes/libarrow.pxd | 1 +
python/pyarrow/tests/test_array.py | 26 ++++++++++++++++++++++++++
python/pyarrow/tests/test_types.py | 8 ++++++++
python/pyarrow/types.pxi | 16 ++++++++++++++++
5 files changed, 57 insertions(+), 1 deletion(-)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index eaedbf1e38..8bddc34e10 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1174,7 +1174,12 @@ cdef class Array(_PandasConvertible):
"({0}) did not match the passed number "
"({1}).".format(type.num_fields, len(children)))
- if type.num_buffers != len(buffers):
+ if type.has_variadic_buffers:
+ if type.num_buffers > len(buffers):
+ raise ValueError("Type's expected number of buffers is at
least "
+ "{0}, but the passed number is "
+ "{1}.".format(type.num_buffers, len(buffers)))
+ elif type.num_buffers != len(buffers):
raise ValueError("Type's expected number of buffers "
"({0}) did not match the passed number "
"({1}).".format(type.num_buffers, len(buffers)))
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index a70cb91873..8bf61b73cc 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -158,6 +158,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CDataTypeLayout" arrow::DataTypeLayout":
vector[CBufferSpec] buffers
+ optional[CBufferSpec] variadic_spec
c_bool has_dictionary
cdef cppclass CDataType" arrow::DataType":
diff --git a/python/pyarrow/tests/test_array.py
b/python/pyarrow/tests/test_array.py
index 4160d64829..885442b079 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -651,6 +651,32 @@ def test_string_binary_from_buffers():
assert copied.null_count == 0
+def test_string_view_from_buffers():
+ array = pa.array(
+ [
+ "String longer than 12 characters",
+ None,
+ "short",
+ "Length is 12"
+ ], type=pa.string_view())
+
+ buffers = array.buffers()
+ copied = pa.StringViewArray.from_buffers(
+ pa.string_view(), len(array), buffers)
+ copied.validate(full=True)
+ assert copied.to_pylist() == [
+ "String longer than 12 characters",
+ None,
+ "short",
+ "Length is 12"
+ ]
+
+ match = r"number of buffers is at least 2"
+ with pytest.raises(ValueError, match=match):
+ pa.StringViewArray.from_buffers(
+ pa.string_view(), len(array), buffers[0:1])
+
+
@pytest.mark.parametrize('list_type_factory', [
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
def test_list_from_buffers(list_type_factory):
diff --git a/python/pyarrow/tests/test_types.py
b/python/pyarrow/tests/test_types.py
index fef350d5de..de439b6bb8 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -887,6 +887,14 @@ def test_types_weakref():
assert wr() is None # not a singleton
+def test_types_has_variadic_buffers():
+ for ty in get_many_types():
+ if ty in (pa.string_view(), pa.binary_view()):
+ assert ty.has_variadic_buffers
+ else:
+ assert not ty.has_variadic_buffers
+
+
def test_fields_hashable():
in_dict = {}
fields = [pa.field('a', pa.int32()),
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 4aa8238556..0d6787cf2a 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -326,6 +326,22 @@ cdef class DataType(_Weakrefable):
"""
return self.type.layout().buffers.size()
+ @property
+ def has_variadic_buffers(self):
+ """
+ If True, the number of expected buffers is only
+ lower-bounded by num_buffers.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> pa.int64().has_variadic_buffers
+ False
+ >>> pa.string_view().has_variadic_buffers
+ True
+ """
+ return self.type.layout().variadic_spec.has_value()
+
def __str__(self):
return frombytes(self.type.ToString(), safe=True)