This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new caa7898125 GH-34568: [C++][Python] Expose Run-End Encoded arrays in
Python Arrow (#34570)
caa7898125 is described below
commit caa78981259fc029182e677f13183371822ce410
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Thu Mar 23 05:20:06 2023 -0300
GH-34568: [C++][Python] Expose Run-End Encoded arrays in Python Arrow
(#34570)
* Closes: #34568
Authored-by: Felipe Oliveira Carvalho <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
docs/source/python/api/arrays.rst | 1 +
docs/source/python/api/compute.rst | 3 +
docs/source/python/api/datatypes.rst | 3 +
python/pyarrow/__init__.py | 3 +
python/pyarrow/_compute.pyx | 22 +++++
python/pyarrow/array.pxi | 165 +++++++++++++++++++++++++++++++++++
python/pyarrow/compute.py | 1 +
python/pyarrow/includes/libarrow.pxd | 39 +++++++++
python/pyarrow/lib.pxd | 5 ++
python/pyarrow/lib.pyx | 1 +
python/pyarrow/public-api.pxi | 2 +
python/pyarrow/tests/test_array.py | 71 +++++++++++++++
python/pyarrow/tests/test_compute.py | 16 ++++
python/pyarrow/tests/test_misc.py | 1 +
python/pyarrow/tests/test_types.py | 28 +++++-
python/pyarrow/types.pxi | 54 ++++++++++--
python/pyarrow/types.py | 11 +++
17 files changed, 418 insertions(+), 8 deletions(-)
diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst
index 59543db370..2d96bedca1 100644
--- a/docs/source/python/api/arrays.rst
+++ b/docs/source/python/api/arrays.rst
@@ -73,6 +73,7 @@ may expose data type-specific methods or properties.
FixedSizeListArray
LargeListArray
MapArray
+ RunEndEncodedArray
StructArray
UnionArray
ExtensionArray
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index f0fd3efd19..c04652e79c 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -402,6 +402,8 @@ Conversions
ceil_temporal
floor_temporal
round_temporal
+ run_end_decode
+ run_end_encode
strftime
strptime
@@ -546,6 +548,7 @@ Compute Options
RoundOptions
RoundTemporalOptions
RoundToMultipleOptions
+ RunEndEncodeOptions
ScalarAggregateOptions
ScalarAggregateOptions
SelectKOptions
diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst
index 48a254a001..0b3f5e2d56 100644
--- a/docs/source/python/api/datatypes.rst
+++ b/docs/source/python/api/datatypes.rst
@@ -61,6 +61,7 @@ These should be used to create Arrow data types and schemas.
map_
struct
dictionary
+ run_end_encoded
field
schema
from_numpy_dtype
@@ -98,6 +99,7 @@ functions above.
Decimal128Type
Field
Schema
+ RunEndEncodedType
Specific classes and functions for extension types.
@@ -145,6 +147,7 @@ represents a given data type (such as ``int32``) or general category
is_struct
is_union
is_nested
+ is_run_end_encoded
is_temporal
is_timestamp
is_date
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 9fd5c290d1..8fd2553e4c 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -169,6 +169,7 @@ from pyarrow.lib import (null, bool_,
list_, large_list, map_, struct,
union, sparse_union, dense_union,
dictionary,
+ run_end_encoded,
field,
type_for_alias,
DataType, DictionaryType, StructType,
@@ -177,6 +178,7 @@ from pyarrow.lib import (null, bool_,
TimestampType, Time32Type, Time64Type, DurationType,
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
+ RunEndEncodedType,
PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
@@ -207,6 +209,7 @@ from pyarrow.lib import (null, bool_,
Time32Array, Time64Array, DurationArray,
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray,
ExtensionArray,
+ RunEndEncodedArray,
scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
index cf9924baee..dd980abad7 100644
--- a/python/pyarrow/_compute.pyx
+++ b/python/pyarrow/_compute.pyx
@@ -1335,6 +1335,28 @@ class DictionaryEncodeOptions(_DictionaryEncodeOptions):
self._set_options(null_encoding)
+cdef class _RunEndEncodeOptions(FunctionOptions):
+ def _set_options(self, run_end_type):
+ run_end_ty = ensure_type(run_end_type)
+ self.wrapped.reset(new CRunEndEncodeOptions(pyarrow_unwrap_data_type(run_end_ty)))
+
+
+class RunEndEncodeOptions(_RunEndEncodeOptions):
+ """
+ Options for run-end encoding.
+
+ Parameters
+ ----------
+ run_end_type : DataType, default pyarrow.int32()
+ The data type of the run_ends array.
+
+ Accepted values are pyarrow.{int16(), int32(), int64()}.
+ """
+
+ def __init__(self, run_end_type=lib.int32()):
+ self._set_options(run_end_type)
+
+
cdef class _TakeOptions(FunctionOptions):
def _set_options(self, boundscheck):
self.wrapped.reset(new CTakeOptions(boundscheck))
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 9deccfa117..4f8f9b6bf9 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -2851,6 +2851,170 @@ cdef class StructArray(Array):
return self.take(indices)
+cdef class RunEndEncodedArray(Array):
+ """
+ Concrete class for Arrow run-end encoded arrays.
+ """
+
+ @staticmethod
+ def _from_arrays(type, allow_none_for_type, logical_length, run_ends, values, logical_offset):
+ cdef:
+ int64_t _logical_length
+ Array _run_ends
+ Array _values
+ int64_t _logical_offset
+ shared_ptr[CDataType] c_type
+ shared_ptr[CRunEndEncodedArray] ree_array
+
+ _logical_length = <int64_t>logical_length
+ _logical_offset = <int64_t>logical_offset
+
+ type = ensure_type(type, allow_none=allow_none_for_type)
+ if type is not None:
+ _run_ends = asarray(run_ends, type=type.run_end_type)
+ _values = asarray(values, type=type.value_type)
+ c_type = pyarrow_unwrap_data_type(type)
+ with nogil:
+ ree_array = GetResultValue(CRunEndEncodedArray.Make(
+ c_type, _logical_length, _run_ends.sp_array, _values.sp_array, _logical_offset))
+ else:
+ _run_ends = asarray(run_ends)
+ _values = asarray(values)
+ with nogil:
+ ree_array = GetResultValue(CRunEndEncodedArray.MakeFromArrays(
+ _logical_length, _run_ends.sp_array, _values.sp_array, _logical_offset))
+ cdef Array result = pyarrow_wrap_array(<shared_ptr[CArray]>ree_array)
+ result.validate(full=True)
+ return result
+
+ @staticmethod
+ def from_arrays(run_ends, values, type=None):
+ """
+ Construct RunEndEncodedArray from run_ends and values arrays.
+
+ Parameters
+ ----------
+ run_ends : Array (int16, int32, or int64 type)
+ The run_ends array.
+ values : Array (any type)
+ The values array.
+ type : pyarrow.DataType, optional
+ The run_end_encoded(run_end_type, value_type) array type.
+
+ Returns
+ -------
+ RunEndEncodedArray
+ """
+ logical_length = run_ends[-1] if len(run_ends) > 0 else 0
+ return RunEndEncodedArray._from_arrays(type, True, logical_length,
+ run_ends, values, 0)
+
+ @staticmethod
+ def from_buffers(DataType type, length, buffers, null_count=-1, offset=0,
+ children=None):
+ """
+ Construct a RunEndEncodedArray from all the parameters that make up an
+ Array.
+
+ RunEndEncodedArrays do not have buffers, only children arrays, but this
+ implementation is needed to satisfy the Array interface.
+
+ Parameters
+ ----------
+ type : DataType
+ The run_end_encoded(run_end_type, value_type) type.
+ length : int
+ The logical length of the run-end encoded array. Expected to match
+ the last value of the run_ends array (children[0]) minus the offset.
+ buffers : List[Buffer]
+ Empty List or [None].
+ null_count : int, default -1
+ The number of null entries in the array. Run-end encoded arrays
+ are specified to not have valid bits and null_count always equals 0.
+ offset : int, default 0
+ The array's logical offset (in values, not in bytes) from the
+ start of each buffer.
+ children : List[Array]
+ Nested type children containing the run_ends and values arrays.
+
+ Returns
+ -------
+ RunEndEncodedArray
+ """
+ children = children or []
+
+ if type.num_fields != len(children):
+ raise ValueError("RunEndEncodedType's expected number of children "
+ "({0}) did not match the passed number "
+ "({1}).".format(type.num_fields, len(children)))
+
+ # buffers are validated as if we needed to pass them to C++, but
+ # _from_arrays will take care of filling in the expected
+ # buffers array containing a single NULL buffer on the C++ side
+ if len(buffers) == 0:
+ buffers = [None]
+ if buffers[0] is not None:
+ raise ValueError("RunEndEncodedType expects None as validity "
+ "bitmap, buffers[0] is not None")
+ if type.num_buffers != len(buffers):
+ raise ValueError("RunEndEncodedType's expected number of buffers "
+ "({0}) did not match the passed number "
+ "({1}).".format(type.num_buffers, len(buffers)))
+
+ # null_count is also validated as if we needed it
+ if null_count != -1 and null_count != 0:
+ raise ValueError("RunEndEncodedType's expected null_count (0) "
+ "did not match passed number ({0})".format(null_count))
+
+ return RunEndEncodedArray._from_arrays(type, False, length, children[0],
+ children[1], offset)
+
+ @property
+ def run_ends(self):
+ """
+ An array holding the logical indexes of each run-end.
+
+ The physical offset to the array is applied.
+ """
+ cdef CRunEndEncodedArray* ree_array = <CRunEndEncodedArray*>(self.ap)
+ return pyarrow_wrap_array(ree_array.run_ends())
+
+ @property
+ def values(self):
+ """
+ An array holding the values of each run.
+
+ The physical offset to the array is applied.
+ """
+ cdef CRunEndEncodedArray* ree_array = <CRunEndEncodedArray*>(self.ap)
+ return pyarrow_wrap_array(ree_array.values())
+
+ def find_physical_offset(self):
+ """
+ Find the physical offset of this REE array.
+
+ This is the offset of the run that contains the value of the first
+ logical element of this array considering its offset.
+
+ This function uses binary search, so it has an O(log N) cost.
+ """
+ cdef CRunEndEncodedArray* ree_array = <CRunEndEncodedArray*>(self.ap)
+ return ree_array.FindPhysicalOffset()
+
+ def find_physical_length(self):
+ """
+ Find the physical length of this REE array.
+
+ The physical length of an REE is the number of physical values (and
+ run-ends) necessary to represent the logical range of values from offset
+ to length.
+
+ This function uses binary search, so it has an O(log N) cost.
+ """
+ cdef CRunEndEncodedArray* ree_array = <CRunEndEncodedArray*>(self.ap)
+ return ree_array.FindPhysicalLength()
+
+
cdef class ExtensionArray(Array):
"""
Concrete class for Arrow extension arrays.
@@ -2960,6 +3124,7 @@ cdef dict _array_classes = {
_Type_DECIMAL128: Decimal128Array,
_Type_DECIMAL256: Decimal256Array,
_Type_STRUCT: StructArray,
+ _Type_RUN_END_ENCODED: RunEndEncodedArray,
_Type_EXTENSION: ExtensionArray,
}
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index e4daaf5e58..efc1e1d956 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -36,6 +36,7 @@ from pyarrow._compute import ( # noqa
CumulativeSumOptions,
DayOfWeekOptions,
DictionaryEncodeOptions,
+ RunEndEncodeOptions,
ElementWiseAggregateOptions,
ExtractRegexOptions,
FilterOptions,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 73f7294142..47180ff291 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -129,6 +129,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
_Type_SPARSE_UNION" arrow::Type::SPARSE_UNION"
_Type_DENSE_UNION" arrow::Type::DENSE_UNION"
_Type_DICTIONARY" arrow::Type::DICTIONARY"
+ _Type_RUN_END_ENCODED" arrow::Type::RUN_END_ENCODED"
_Type_MAP" arrow::Type::MAP"
_Type_EXTENSION" arrow::Type::EXTENSION"
@@ -396,6 +397,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
int precision()
int scale()
+ cdef cppclass CRunEndEncodedType " arrow::RunEndEncodedType"(CDataType):
+ CRunEndEncodedType(const shared_ptr[CDataType]& run_end_type,
+ const shared_ptr[CDataType]& value_type)
+ const shared_ptr[CDataType]& run_end_type()
+ const shared_ptr[CDataType]& value_type()
+
cdef cppclass CField" arrow::Field":
cppclass CMergeOptions "arrow::Field::MergeOptions":
c_bool promote_nullability
@@ -463,6 +470,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
vector[shared_ptr[CField]] fields,
vector[int8_t] type_codes)
+ cdef shared_ptr[CDataType] CMakeRunEndEncodedType" arrow::run_end_encoded"(
+ shared_ptr[CDataType] run_end_type,
+ shared_ptr[CDataType] value_type)
+
cdef cppclass CSchema" arrow::Schema":
CSchema(const vector[shared_ptr[CField]]& fields)
CSchema(const vector[shared_ptr[CField]]& fields,
@@ -771,6 +782,28 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
CResult[vector[shared_ptr[CArray]]] Flatten(CMemoryPool* pool)
+ cdef cppclass CRunEndEncodedArray" arrow::RunEndEncodedArray"(CArray):
+ @staticmethod
+ CResult[shared_ptr[CRunEndEncodedArray]] Make(
+ const shared_ptr[CDataType]& type,
+ int64_t logical_length,
+ const shared_ptr[CArray]& run_ends,
+ const shared_ptr[CArray]& values,
+ int64_t logical_offset)
+
+ @staticmethod
+ CResult[shared_ptr[CRunEndEncodedArray]] MakeFromArrays "Make"(
+ int64_t logical_length,
+ const shared_ptr[CArray]& run_ends,
+ const shared_ptr[CArray]& values,
+ int64_t logical_offset)
+
+ shared_ptr[CArray]& run_ends()
+ shared_ptr[CArray]& values()
+
+ int64_t FindPhysicalOffset()
+ int64_t FindPhysicalLength()
+
cdef cppclass CChunkedArray" arrow::ChunkedArray":
CChunkedArray(const vector[shared_ptr[CArray]]& arrays)
CChunkedArray(const vector[shared_ptr[CArray]]& arrays,
@@ -2201,6 +2234,12 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
CDictionaryEncodeNullEncodingBehavior null_encoding)
CDictionaryEncodeNullEncodingBehavior null_encoding
+ cdef cppclass CRunEndEncodeOptions \
+ "arrow::compute::RunEndEncodeOptions"(CFunctionOptions):
+ CRunEndEncodeOptions()
+ CRunEndEncodeOptions(shared_ptr[CDataType] run_end_type)
+ shared_ptr[CDataType] run_end_type
+
cdef cppclass CTakeOptions \
" arrow::compute::TakeOptions"(CFunctionOptions):
CTakeOptions(c_bool boundscheck)
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index faa98d150f..6a3726e964 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -184,6 +184,11 @@ cdef class Decimal256Type(FixedSizeBinaryType):
const CDecimal256Type* decimal256_type
+cdef class RunEndEncodedType(DataType):
+ cdef:
+ const CRunEndEncodedType* run_end_encoded_type
+
+
cdef class BaseExtensionType(DataType):
cdef:
const CExtensionType* ext_type
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 8d7a5f61ee..6603947113 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -114,6 +114,7 @@ Type_STRUCT = _Type_STRUCT
Type_SPARSE_UNION = _Type_SPARSE_UNION
Type_DENSE_UNION = _Type_DENSE_UNION
Type_DICTIONARY = _Type_DICTIONARY
+Type_RUN_END_ENCODED = _Type_RUN_END_ENCODED
UnionMode_SPARSE = _UnionMode_SPARSE
UnionMode_DENSE = _UnionMode_DENSE
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 1849ecab09..fadc659d45 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -111,6 +111,8 @@ cdef api object pyarrow_wrap_data_type(
out = Decimal128Type.__new__(Decimal128Type)
elif type.get().id() == _Type_DECIMAL256:
out = Decimal256Type.__new__(Decimal256Type)
+ elif type.get().id() == _Type_RUN_END_ENCODED:
+ out = RunEndEncodedType.__new__(RunEndEncodedType)
elif type.get().id() == _Type_EXTENSION:
ext_type = <const CExtensionType*> type.get()
cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type)
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index d974af99c6..b4ad080005 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -3407,3 +3407,74 @@ def test_array_accepts_pyarrow_array():
# Test memory_pool keyword is accepted
result = pa.array(arr, memory_pool=pa.default_memory_pool())
assert arr == result
+
+
+def check_run_end_encoded(ree_array, run_ends, values, logical_length, physical_length,
+ physical_offset):
+ assert ree_array.run_ends.to_pylist() == run_ends
+ assert ree_array.values.to_pylist() == values
+ assert len(ree_array) == logical_length
+ assert ree_array.find_physical_length() == physical_length
+ assert ree_array.find_physical_offset() == physical_offset
+
+
+def check_run_end_encoded_from_arrays_with_type(ree_type=None):
+ run_ends = [3, 5, 10, 19]
+ values = [1, 2, 1, 3]
+ ree_array = pa.RunEndEncodedArray.from_arrays(run_ends, values, ree_type)
+ check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0)
+
+
+def test_run_end_encoded_from_arrays():
+ check_run_end_encoded_from_arrays_with_type()
+ for run_end_type in [pa.int16(), pa.int32(), pa.int64()]:
+ for value_type in [pa.uint32(), pa.int32(), pa.uint64(), pa.int64()]:
+ ree_type = pa.run_end_encoded(run_end_type, value_type)
+ check_run_end_encoded_from_arrays_with_type(ree_type)
+
+
+def test_run_end_encoded_from_buffers():
+ run_ends = [3, 5, 10, 19]
+ values = [1, 2, 1, 3]
+
+ ree_type = pa.run_end_encoded(run_end_type=pa.int32(), value_type=pa.uint8())
+ length = 19
+ buffers = [None]
+ null_count = 0
+ offset = 0
+ children = [run_ends, values]
+
+ ree_array = pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers,
+ null_count, offset,
+ children)
+ check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0)
+ # buffers = []
+ ree_array = pa.RunEndEncodedArray.from_buffers(ree_type, length, [],
+ null_count, offset,
+ children)
+ check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0)
+ # null_count = -1
+ ree_array = pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers,
+ -1, offset,
+ children)
+ check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0)
+ # offset = 4
+ ree_array = pa.RunEndEncodedArray.from_buffers(ree_type, length - 4, buffers,
+ null_count, 4, children)
+ check_run_end_encoded(ree_array, run_ends, values, length - 4, 3, 1)
+ # buffers = [None, None]
+ with pytest.raises(ValueError):
+ pa.RunEndEncodedArray.from_buffers(ree_type, length, [None, None],
+ null_count, offset, children)
+ # children = None
+ with pytest.raises(ValueError):
+ pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers,
+ null_count, offset, None)
+ # len(children) == 1
+ with pytest.raises(ValueError):
+ pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers,
+ null_count, offset, [run_ends])
+ # null_count = 1
+ with pytest.raises(ValueError):
+ pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers,
+ 1, offset, children)
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 92560e4070..875d0e613b 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -134,6 +134,7 @@ def test_option_class_equality():
pc.CountOptions(),
pc.DayOfWeekOptions(count_from_zero=False, week_start=0),
pc.DictionaryEncodeOptions(),
+ pc.RunEndEncodeOptions(),
pc.ElementWiseAggregateOptions(skip_nulls=True),
pc.ExtractRegexOptions("pattern"),
pc.FilterOptions(),
@@ -3108,3 +3109,18 @@ def test_list_slice_bad_parameters():
pc.list_slice(arr, 0, 1, step=0)
with pytest.raises(pa.ArrowInvalid, match=msg + "-1"):
pc.list_slice(arr, 0, 1, step=-1)
+
+
+def check_run_end_encode_decode(run_end_encode_opts=None):
+ arr = pa.array([1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3])
+ encoded = pc.run_end_encode(arr, options=run_end_encode_opts)
+ decoded = pc.run_end_decode(encoded)
+ assert decoded.type == arr.type
+ assert decoded.equals(arr)
+
+
+def test_run_end_encode():
+ check_run_end_encode_decode()
+ check_run_end_encode_decode(pc.RunEndEncodeOptions(pa.int16()))
+ check_run_end_encode_decode(pc.RunEndEncodeOptions('int32'))
+ check_run_end_encode_decode(pc.RunEndEncodeOptions(pa.int64()))
diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py
index 457d7d396a..af6a516279 100644
--- a/python/pyarrow/tests/test_misc.py
+++ b/python/pyarrow/tests/test_misc.py
@@ -171,6 +171,7 @@ def test_runtime_info():
pa.Decimal128Array,
pa.Decimal256Array,
pa.StructArray,
+ pa.RunEndEncodedArray,
pa.Scalar,
pa.BooleanScalar,
pa.Int8Scalar,
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index c780cd80c7..09f0dfe7d3 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -83,7 +83,10 @@ def get_many_types():
pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
pa.union([pa.field('a', pa.binary(10), nullable=False),
pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
- pa.dictionary(pa.int32(), pa.string())
+ pa.dictionary(pa.int32(), pa.string()),
+ pa.run_end_encoded(pa.int16(), pa.int32()),
+ pa.run_end_encoded(pa.int32(), pa.string()),
+ pa.run_end_encoded(pa.int64(), pa.uint8())
)
@@ -209,6 +212,11 @@ def test_is_union():
assert not types.is_union(pa.list_(pa.int32()))
+def test_is_run_end_encoded():
+ assert types.is_run_end_encoded(pa.run_end_encoded(pa.int32(), pa.int64()))
+ assert not types.is_run_end_encoded(pa.utf8())
+
+
# TODO(wesm): is_map, once implemented
@@ -818,6 +826,24 @@ def test_fields_weakrefable():
assert wr() is None
+def test_run_end_encoded_type():
+ ty = pa.run_end_encoded(pa.int64(), pa.utf8())
+ assert isinstance(ty, pa.RunEndEncodedType)
+ assert ty.run_end_type == pa.int64()
+ assert ty.value_type == pa.utf8()
+ assert ty.num_buffers == 1 # buffers expected to be {NULLPTR}
+ assert ty.num_fields == 2
+
+ with pytest.raises(TypeError):
+ pa.run_end_encoded(pa.int64(), None)
+
+ with pytest.raises(TypeError):
+ pa.run_end_encoded(None, pa.utf8())
+
+ with pytest.raises(ValueError):
+ pa.run_end_encoded(pa.int8(), pa.utf8())
+
+
@pytest.mark.parametrize('t,check_func', [
(pa.date32(), types.is_date32),
(pa.date64(), types.is_date64),
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 0a68daf333..0e81706660 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1244,6 +1244,27 @@ cdef class Decimal256Type(FixedSizeBinaryType):
return self.decimal256_type.scale()
+cdef class RunEndEncodedType(DataType):
+ """
+ Concrete class for run-end encoded types.
+ """
+
+ cdef void init(self, const shared_ptr[CDataType]& type) except *:
+ DataType.init(self, type)
+ self.run_end_encoded_type = <const CRunEndEncodedType*> type.get()
+
+ def __reduce__(self):
+ return run_end_encoded, (self.run_end_type, self.value_type)
+
+ @property
+ def run_end_type(self):
+ return pyarrow_wrap_data_type(self.run_end_encoded_type.run_end_type())
+
+ @property
+ def value_type(self):
+ return pyarrow_wrap_data_type(self.run_end_encoded_type.value_type())
+
+
cdef class BaseExtensionType(DataType):
"""
Concrete base class for extension types.
@@ -4479,13 +4500,6 @@ def union(child_fields, mode, type_codes=None):
-------
type : UnionType
"""
- cdef:
- Field child_field
- vector[shared_ptr[CField]] c_fields
- vector[int8_t] c_type_codes
- shared_ptr[CDataType] union_type
- int i
-
if isinstance(mode, int):
if mode not in (_UnionMode_SPARSE, _UnionMode_DENSE):
raise ValueError("Invalid union mode {0!r}".format(mode))
@@ -4503,6 +4517,32 @@ def union(child_fields, mode, type_codes=None):
return dense_union(child_fields, type_codes)
+def run_end_encoded(run_end_type, value_type):
+ """
+ Create RunEndEncodedType from run-end and value types.
+
+ Parameters
+ ----------
+ run_end_type : pyarrow.DataType
+ The integer type of the run_ends array. Must be 'int16', 'int32', or 'int64'.
+ value_type : pyarrow.DataType
+ The type of the values array.
+
+ Returns
+ -------
+ type : RunEndEncodedType
+ """
+ cdef:
+ DataType _run_end_type = ensure_type(run_end_type, allow_none=False)
+ DataType _value_type = ensure_type(value_type, allow_none=False)
+ shared_ptr[CDataType] ree_type
+
+ if not _run_end_type.type.id() in [_Type_INT16, _Type_INT32, _Type_INT64]:
+ raise ValueError("The run_end_type should be 'int16', 'int32', or 'int64'")
+ ree_type = CMakeRunEndEncodedType(_run_end_type.sp_type, _value_type.sp_type)
+ return pyarrow_wrap_data_type(ree_type)
+
+
cdef dict _type_aliases = {
'null': null,
'bool': bool_,
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index f239c883b4..a88ec2ad7e 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -296,6 +296,17 @@ def is_nested(t):
return t.id in _NESTED_TYPES
+def is_run_end_encoded(t):
+ """
+ Return True if value is an instance of a run-end encoded type.
+
+ Parameters
+ ----------
+ t : DataType
+ """
+ return t.id == lib.Type_RUN_END_ENCODED
+
+
def is_temporal(t):
"""
Return True if value is an instance of date, time, timestamp or duration.