This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 787afa1594 GH-39651: [Python] Basic pyarrow bindings for
Binary/StringView classes (#39652)
787afa1594 is described below
commit 787afa1594586d2d556d21471647f9cd2c55b18f
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Tue Jan 30 12:54:19 2024 +0100
GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes
(#39652)
### Rationale for this change
First step for https://github.com/apache/arrow/issues/39633: exposing the
Array, DataType and Scalar classes for BinaryView and StringView, such that
those can already be represented in pyarrow.
(I exposed a variant of StringBuilder as well, just for now to be able to
create test data)
* Closes: #39651
Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
docs/source/python/api/arrays.rst | 4 ++
docs/source/python/api/datatypes.rst | 4 ++
python/pyarrow/__init__.py | 7 ++--
python/pyarrow/array.pxi | 14 +++++++
python/pyarrow/builder.pxi | 66 ++++++++++++++++++++++++++++++
python/pyarrow/includes/libarrow.pxd | 9 ++++
python/pyarrow/lib.pxd | 8 ++++
python/pyarrow/lib.pyx | 2 +
python/pyarrow/scalar.pxi | 10 +++++
python/pyarrow/src/arrow/python/helpers.cc | 2 +
python/pyarrow/tests/test_builder.py | 21 +++++++++-
python/pyarrow/tests/test_misc.py | 4 ++
python/pyarrow/tests/test_scalars.py | 28 ++++++++++++-
python/pyarrow/tests/test_types.py | 8 ++++
python/pyarrow/types.pxi | 32 +++++++++++++++
python/pyarrow/types.py | 10 +++++
16 files changed, 223 insertions(+), 6 deletions(-)
diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst
index 73b5e063ff..b858862dcf 100644
--- a/docs/source/python/api/arrays.rst
+++ b/docs/source/python/api/arrays.rst
@@ -63,6 +63,8 @@ may expose data type-specific methods or properties.
FixedSizeBinaryArray
LargeBinaryArray
LargeStringArray
+ BinaryViewArray
+ StringViewArray
Time32Array
Time64Array
Date32Array
@@ -119,6 +121,8 @@ classes may expose data type-specific methods or properties.
FixedSizeBinaryScalar
LargeBinaryScalar
LargeStringScalar
+ BinaryViewScalar
+ StringViewScalar
Time32Scalar
Time64Scalar
Date32Scalar
diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst
index 4066ef3142..642c243b21 100644
--- a/docs/source/python/api/datatypes.rst
+++ b/docs/source/python/api/datatypes.rst
@@ -55,6 +55,8 @@ These should be used to create Arrow data types and schemas.
large_binary
large_string
large_utf8
+ binary_view
+ string_view
decimal128
list_
large_list
@@ -168,6 +170,8 @@ represents a given data type (such as ``int32``) or general category
is_large_binary
is_large_unicode
is_large_string
+ is_binary_view
+ is_string_view
is_fixed_size_binary
is_map
is_dictionary
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 9da94885ec..4dbd1258d3 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -163,7 +163,7 @@ from pyarrow.lib import (null, bool_,
time32, time64, timestamp, date32, date64, duration,
month_day_nano_interval,
float16, float32, float64,
- binary, string, utf8,
+ binary, string, utf8, binary_view, string_view,
large_binary, large_string, large_utf8,
decimal128, decimal256,
list_, large_list, map_, struct,
@@ -205,6 +205,7 @@ from pyarrow.lib import (null, bool_,
FixedSizeListArray, UnionArray,
BinaryArray, StringArray,
LargeBinaryArray, LargeStringArray,
+ BinaryViewArray, StringViewArray,
FixedSizeBinaryArray,
DictionaryArray,
Date32Array, Date64Array, TimestampArray,
@@ -223,8 +224,8 @@ from pyarrow.lib import (null, bool_,
Time32Scalar, Time64Scalar,
TimestampScalar, DurationScalar,
MonthDayNanoIntervalScalar,
- BinaryScalar, LargeBinaryScalar,
- StringScalar, LargeStringScalar,
+ BinaryScalar, LargeBinaryScalar, BinaryViewScalar,
+ StringScalar, LargeStringScalar, StringViewScalar,
FixedSizeBinaryScalar, DictionaryScalar,
MapScalar, StructScalar, UnionScalar,
RunEndEncodedScalar, ExtensionScalar)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 1416f5f434..1029f3a629 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -2942,6 +2942,12 @@ cdef class LargeStringArray(Array):
null_count, offset)
+cdef class StringViewArray(Array):
+ """
+ Concrete class for Arrow arrays of string (or utf8) view data type.
+ """
+
+
cdef class BinaryArray(Array):
"""
Concrete class for Arrow arrays of variable-sized binary data type.
@@ -2968,6 +2974,12 @@ cdef class LargeBinaryArray(Array):
return (<CLargeBinaryArray*> self.ap).total_values_length()
+cdef class BinaryViewArray(Array):
+ """
+ Concrete class for Arrow arrays of variable-sized binary view data type.
+ """
+
+
cdef class DictionaryArray(Array):
"""
Concrete class for dictionary-encoded Arrow arrays.
@@ -3669,6 +3681,8 @@ cdef dict _array_classes = {
_Type_STRING: StringArray,
_Type_LARGE_BINARY: LargeBinaryArray,
_Type_LARGE_STRING: LargeStringArray,
+ _Type_BINARY_VIEW: BinaryViewArray,
+ _Type_STRING_VIEW: StringViewArray,
_Type_DICTIONARY: DictionaryArray,
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray,
_Type_DECIMAL128: Decimal128Array,
diff --git a/python/pyarrow/builder.pxi b/python/pyarrow/builder.pxi
index a34ea5412e..2af39e2c58 100644
--- a/python/pyarrow/builder.pxi
+++ b/python/pyarrow/builder.pxi
@@ -80,3 +80,69 @@ cdef class StringBuilder(_Weakrefable):
def __len__(self):
return self.builder.get().length()
+
+
+cdef class StringViewBuilder(_Weakrefable):
+ """
+ Builder class for UTF8 string views.
+
+ This class exposes facilities for incrementally adding string values and
+ building the null bitmap for a pyarrow.Array (type='string_view').
+ """
+ cdef:
+ unique_ptr[CStringViewBuilder] builder
+
+ def __cinit__(self, MemoryPool memory_pool=None):
+ cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
+ self.builder.reset(new CStringViewBuilder(pool))
+
+ def append(self, value):
+ """
+ Append a single value to the builder.
+
+ The value can either be a string/bytes object or a null value
+ (np.nan or None).
+
+ Parameters
+ ----------
+ value : string/bytes or np.nan/None
+ The value to append to the string array builder.
+ """
+ if value is None or value is np.nan:
+ self.builder.get().AppendNull()
+ elif isinstance(value, (bytes, str)):
+ self.builder.get().Append(tobytes(value))
+ else:
+ raise TypeError('StringViewBuilder only accepts string objects')
+
+ def append_values(self, values):
+ """
+ Append all the values from an iterable.
+
+ Parameters
+ ----------
+ values : iterable of string/bytes or np.nan/None values
+ The values to append to the string array builder.
+ """
+ for value in values:
+ self.append(value)
+
+ def finish(self):
+ """
+ Return result of builder as an Array object; also resets the builder.
+
+ Returns
+ -------
+ array : pyarrow.Array
+ """
+ cdef shared_ptr[CArray] out
+ with nogil:
+ self.builder.get().Finish(&out)
+ return pyarrow_wrap_array(out)
+
+ @property
+ def null_count(self):
+ return self.builder.get().null_count()
+
+ def __len__(self):
+ return self.builder.get().length()
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 74e92594b0..d92f09da77 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -126,6 +126,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
_Type_LARGE_BINARY" arrow::Type::LARGE_BINARY"
_Type_LARGE_STRING" arrow::Type::LARGE_STRING"
_Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"
+ _Type_BINARY_VIEW" arrow::Type::BINARY_VIEW"
+ _Type_STRING_VIEW" arrow::Type::STRING_VIEW"
_Type_LIST" arrow::Type::LIST"
_Type_LARGE_LIST" arrow::Type::LARGE_LIST"
@@ -1295,7 +1297,14 @@ cdef extern from "arrow/builder.h" namespace "arrow" nogil:
cdef cppclass CStringBuilder" arrow::StringBuilder"(CBinaryBuilder):
CStringBuilder(CMemoryPool* pool)
+ CStatus Append(const c_string& value)
+
+ cdef cppclass CBinaryViewBuilder" arrow::BinaryViewBuilder"(CArrayBuilder):
+ CBinaryViewBuilder(shared_ptr[CDataType], CMemoryPool* pool)
+ CStatus Append(const char* value, int32_t length)
+ cdef cppclass CStringViewBuilder" arrow::StringViewBuilder"(CBinaryViewBuilder):
+ CStringViewBuilder(CMemoryPool* pool)
CStatus Append(const c_string& value)
cdef cppclass CTimestampBuilder "arrow::TimestampBuilder"(CArrayBuilder):
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 58ec34addb..c110486406 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -445,6 +445,14 @@ cdef class BinaryArray(Array):
pass
+cdef class StringViewArray(Array):
+ pass
+
+
+cdef class BinaryViewArray(Array):
+ pass
+
+
cdef class DictionaryArray(Array):
cdef:
object _indices, _dictionary
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 29a0bed559..b0368b67f7 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -106,6 +106,8 @@ Type_STRING = _Type_STRING
Type_LARGE_BINARY = _Type_LARGE_BINARY
Type_LARGE_STRING = _Type_LARGE_STRING
Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
+Type_BINARY_VIEW = _Type_BINARY_VIEW
+Type_STRING_VIEW = _Type_STRING_VIEW
Type_LIST = _Type_LIST
Type_LARGE_LIST = _Type_LARGE_LIST
Type_MAP = _Type_MAP
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 9a66dc8122..2772acf818 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -665,6 +665,14 @@ cdef class LargeStringScalar(StringScalar):
pass
+cdef class BinaryViewScalar(BinaryScalar):
+ pass
+
+
+cdef class StringViewScalar(StringScalar):
+ pass
+
+
cdef class ListScalar(Scalar):
"""
Concrete class for list-like scalars.
@@ -1051,8 +1059,10 @@ cdef dict _scalar_classes = {
_Type_BINARY: BinaryScalar,
_Type_LARGE_BINARY: LargeBinaryScalar,
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryScalar,
+ _Type_BINARY_VIEW: BinaryViewScalar,
_Type_STRING: StringScalar,
_Type_LARGE_STRING: LargeStringScalar,
+ _Type_STRING_VIEW: StringViewScalar,
_Type_LIST: ListScalar,
_Type_LARGE_LIST: LargeListScalar,
_Type_FIXED_SIZE_LIST: FixedSizeListScalar,
diff --git a/python/pyarrow/src/arrow/python/helpers.cc b/python/pyarrow/src/arrow/python/helpers.cc
index c266abc169..2c86c86a91 100644
--- a/python/pyarrow/src/arrow/python/helpers.cc
+++ b/python/pyarrow/src/arrow/python/helpers.cc
@@ -63,6 +63,8 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
GET_PRIMITIVE_TYPE(STRING, utf8);
GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
+ GET_PRIMITIVE_TYPE(BINARY_VIEW, binary_view);
+ GET_PRIMITIVE_TYPE(STRING_VIEW, utf8_view);
GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval);
default:
return nullptr;
diff --git a/python/pyarrow/tests/test_builder.py b/python/pyarrow/tests/test_builder.py
index 50d801026b..abc8a0013d 100644
--- a/python/pyarrow/tests/test_builder.py
+++ b/python/pyarrow/tests/test_builder.py
@@ -20,7 +20,7 @@ import weakref
import numpy as np
import pyarrow as pa
-from pyarrow.lib import StringBuilder
+from pyarrow.lib import StringBuilder, StringViewBuilder
def test_weakref():
@@ -65,3 +65,22 @@ def test_string_builder_append_after_finish():
sbuilder.append("No effect")
expected = [None, None, "text", None, "other text"]
assert arr.to_pylist() == expected
+
+
+def test_string_view_builder():
+ builder = StringViewBuilder()
+ builder.append(b"a byte string")
+ builder.append("a string")
+ builder.append("a longer not-inlined string")
+ builder.append(np.nan)
+ builder.append_values([None, "text"])
+ assert len(builder) == 6
+ assert builder.null_count == 2
+ arr = builder.finish()
+ assert isinstance(arr, pa.Array)
+ assert arr.null_count == 2
+ assert arr.type == 'string_view'
+ expected = [
+ "a byte string", "a string", "a longer not-inlined string", None, None, "text"
+ ]
+ assert arr.to_pylist() == expected
diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py
index 8b8c50882b..8cec878328 100644
--- a/python/pyarrow/tests/test_misc.py
+++ b/python/pyarrow/tests/test_misc.py
@@ -185,6 +185,8 @@ def test_set_timezone_db_path_non_windows():
pa.UnionArray,
pa.BinaryArray,
pa.StringArray,
+ pa.BinaryViewArray,
+ pa.StringViewArray,
pa.FixedSizeBinaryArray,
pa.DictionaryArray,
pa.Date32Array,
@@ -221,6 +223,8 @@ def test_set_timezone_db_path_non_windows():
pa.StringScalar,
pa.BinaryScalar,
pa.FixedSizeBinaryScalar,
+ pa.BinaryViewScalar,
+ pa.StringViewScalar,
pa.ListScalar,
pa.LargeListScalar,
pa.MapScalar,
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index 74dee59558..4a239b23d5 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -51,6 +51,9 @@ from pyarrow.tests import util
(b"bytes", None, pa.BinaryScalar),
("largestring", pa.large_string(), pa.LargeStringScalar),
(b"largebytes", pa.large_binary(), pa.LargeBinaryScalar),
+ # TODO(GH-39633) pa.scalar(..) requires python->arrow conversion to be implemented
+ # ("string_view", pa.string_view(), pa.StringViewScalar),
+ # (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar),
(b"abc", pa.binary(3), pa.FixedSizeBinaryScalar),
([1, 2, 3], None, pa.ListScalar),
([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar),
@@ -488,7 +491,8 @@ def test_month_day_nano_interval():
@pytest.mark.parametrize('value', ['foo', 'mañana'])
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
(pa.string(), pa.StringScalar),
- (pa.large_string(), pa.LargeStringScalar)
+ (pa.large_string(), pa.LargeStringScalar),
+ # (pa.string_view(), pa.StringViewScalar),
])
def test_string(value, ty, scalar_typ):
s = pa.scalar(value, type=ty)
@@ -503,10 +507,30 @@ def test_string(value, ty, scalar_typ):
assert buf.to_pybytes() == value.encode()
+@pytest.mark.parametrize('value', ['foo', 'mañana'])
+def test_string_view(value):
+ # TODO: replace with normal scalar construction
+ builder = pa.lib.StringViewBuilder()
+ builder.append(value)
+ arr = builder.finish()
+
+ s = arr[0]
+ assert isinstance(s, pa.StringViewScalar)
+ assert s.as_py() == value
+ assert s.as_py() != 'something'
+ assert repr(value) in repr(s)
+ assert str(s) == str(value)
+
+ buf = s.as_buffer()
+ assert isinstance(buf, pa.Buffer)
+ assert buf.to_pybytes() == value.encode()
+
+
@pytest.mark.parametrize('value', [b'foo', b'bar'])
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
(pa.binary(), pa.BinaryScalar),
- (pa.large_binary(), pa.LargeBinaryScalar)
+ (pa.large_binary(), pa.LargeBinaryScalar),
+ # (pa.binary_view(), pa.BinaryViewScalar),
])
def test_binary(value, ty, scalar_typ):
s = pa.scalar(value, type=ty)
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index c8a52c6b62..a5ab3128dc 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -61,6 +61,8 @@ def get_many_types():
pa.binary(10),
pa.large_string(),
pa.large_binary(),
+ pa.string_view(),
+ pa.binary_view(),
pa.list_(pa.int32()),
pa.list_(pa.int32(), 2),
pa.large_list(pa.uint16()),
@@ -244,6 +246,12 @@ def test_is_binary_string():
assert types.is_fixed_size_binary(pa.binary(5))
assert not types.is_fixed_size_binary(pa.binary())
+ assert types.is_string_view(pa.string_view())
+ assert not types.is_string_view(pa.string())
+ assert types.is_binary_view(pa.binary_view())
+ assert not types.is_binary_view(pa.binary())
+ assert not types.is_binary_view(pa.string_view())
+
def test_is_temporal_date_time_timestamp():
date_types = [pa.date32(), pa.date64()]
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index b6dc53d633..ce3736b5af 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -4375,6 +4375,36 @@ def large_utf8():
return large_string()
+def binary_view():
+ """
+ Create a variable-length binary view type.
+
+ Examples
+ --------
+ Create an instance of a binary view type:
+
+ >>> import pyarrow as pa
+ >>> pa.binary_view()
+ DataType(binary_view)
+ """
+ return primitive_type(_Type_BINARY_VIEW)
+
+
+def string_view():
+ """
+ Create UTF8 variable-length string view type.
+
+ Examples
+ --------
+ Create an instance of a string view type:
+
+ >>> import pyarrow as pa
+ >>> pa.string_view()
+ DataType(string_view)
+ """
+ return primitive_type(_Type_STRING_VIEW)
+
+
def list_(value_type, int list_size=-1):
"""
Create ListType instance from child data type or field.
@@ -4991,6 +5021,8 @@ cdef dict _type_aliases = {
'large_str': large_string,
'large_utf8': large_string,
'large_binary': large_binary,
+ 'binary_view': binary_view,
+ 'string_view': string_view,
'date32': date32,
'date64': date64,
'date32[day]': date32,
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index 5d7dbe4b45..32398dac9c 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -243,6 +243,16 @@ def is_fixed_size_binary(t):
return t.id == lib.Type_FIXED_SIZE_BINARY
+@doc(is_null, datatype="variable-length binary view")
+def is_binary_view(t):
+ return t.id == lib.Type_BINARY_VIEW
+
+
+@doc(is_null, datatype="variable-length string (utf-8) view")
+def is_string_view(t):
+ return t.id == lib.Type_STRING_VIEW
+
+
@doc(is_null, datatype="date")
def is_date(t):
return t.id in _DATE_TYPES