This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new caa7898125 GH-34568: [C++][Python] Expose Run-End Encoded arrays in Python Arrow (#34570)
caa7898125 is described below

commit caa78981259fc029182e677f13183371822ce410
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Thu Mar 23 05:20:06 2023 -0300

    GH-34568: [C++][Python] Expose Run-End Encoded arrays in Python Arrow (#34570)
    
    
    * Closes: #34568
    
    Authored-by: Felipe Oliveira Carvalho <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 docs/source/python/api/arrays.rst    |   1 +
 docs/source/python/api/compute.rst   |   3 +
 docs/source/python/api/datatypes.rst |   3 +
 python/pyarrow/__init__.py           |   3 +
 python/pyarrow/_compute.pyx          |  22 +++++
 python/pyarrow/array.pxi             | 165 +++++++++++++++++++++++++++++++++++
 python/pyarrow/compute.py            |   1 +
 python/pyarrow/includes/libarrow.pxd |  39 +++++++++
 python/pyarrow/lib.pxd               |   5 ++
 python/pyarrow/lib.pyx               |   1 +
 python/pyarrow/public-api.pxi        |   2 +
 python/pyarrow/tests/test_array.py   |  71 +++++++++++++++
 python/pyarrow/tests/test_compute.py |  16 ++++
 python/pyarrow/tests/test_misc.py    |   1 +
 python/pyarrow/tests/test_types.py   |  28 +++++-
 python/pyarrow/types.pxi             |  54 ++++++++++--
 python/pyarrow/types.py              |  11 +++
 17 files changed, 418 insertions(+), 8 deletions(-)

diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst
index 59543db370..2d96bedca1 100644
--- a/docs/source/python/api/arrays.rst
+++ b/docs/source/python/api/arrays.rst
@@ -73,6 +73,7 @@ may expose data type-specific methods or properties.
    FixedSizeListArray
    LargeListArray
    MapArray
+   RunEndEncodedArray
    StructArray
    UnionArray
    ExtensionArray
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index f0fd3efd19..c04652e79c 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -402,6 +402,8 @@ Conversions
    ceil_temporal
    floor_temporal
    round_temporal
+   run_end_decode
+   run_end_encode
    strftime
    strptime
 
@@ -546,6 +548,7 @@ Compute Options
    RoundOptions
    RoundTemporalOptions
    RoundToMultipleOptions
+   RunEndEncodeOptions
    ScalarAggregateOptions
    ScalarAggregateOptions
    SelectKOptions
diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst
index 48a254a001..0b3f5e2d56 100644
--- a/docs/source/python/api/datatypes.rst
+++ b/docs/source/python/api/datatypes.rst
@@ -61,6 +61,7 @@ These should be used to create Arrow data types and schemas.
    map_
    struct
    dictionary
+   run_end_encoded
    field
    schema
    from_numpy_dtype
@@ -98,6 +99,7 @@ functions above.
    Decimal128Type
    Field
    Schema
+   RunEndEncodedType
 
 Specific classes and functions for extension types.
 
@@ -145,6 +147,7 @@ represents a given data type (such as ``int32``) or general category
    is_struct
    is_union
    is_nested
+   is_run_end_encoded
    is_temporal
    is_timestamp
    is_date
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 9fd5c290d1..8fd2553e4c 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -169,6 +169,7 @@ from pyarrow.lib import (null, bool_,
                          list_, large_list, map_, struct,
                          union, sparse_union, dense_union,
                          dictionary,
+                         run_end_encoded,
                          field,
                          type_for_alias,
                          DataType, DictionaryType, StructType,
@@ -177,6 +178,7 @@ from pyarrow.lib import (null, bool_,
                          TimestampType, Time32Type, Time64Type, DurationType,
                          FixedSizeBinaryType, Decimal128Type, Decimal256Type,
                          BaseExtensionType, ExtensionType,
+                         RunEndEncodedType,
                          PyExtensionType, UnknownExtensionType,
                          register_extension_type, unregister_extension_type,
                          DictionaryMemo,
@@ -207,6 +209,7 @@ from pyarrow.lib import (null, bool_,
                          Time32Array, Time64Array, DurationArray,
                          MonthDayNanoIntervalArray,
                          Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
+                         RunEndEncodedArray,
                          scalar, NA, _NULL as NULL, Scalar,
                          NullScalar, BooleanScalar,
                          Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
index cf9924baee..dd980abad7 100644
--- a/python/pyarrow/_compute.pyx
+++ b/python/pyarrow/_compute.pyx
@@ -1335,6 +1335,28 @@ class DictionaryEncodeOptions(_DictionaryEncodeOptions):
         self._set_options(null_encoding)
 
 
+cdef class _RunEndEncodeOptions(FunctionOptions):
+    def _set_options(self, run_end_type):
+        run_end_ty = ensure_type(run_end_type)
+        self.wrapped.reset(new CRunEndEncodeOptions(pyarrow_unwrap_data_type(run_end_ty)))
+
+
+class RunEndEncodeOptions(_RunEndEncodeOptions):
+    """
+    Options for run-end encoding.
+
+    Parameters
+    ----------
+    run_end_type : DataType, default pyarrow.int32()
+        The data type of the run_ends array.
+
+        Accepted values are pyarrow.{int16(), int32(), int64()}.
+    """
+
+    def __init__(self, run_end_type=lib.int32()):
+        self._set_options(run_end_type)
+
+
 cdef class _TakeOptions(FunctionOptions):
     def _set_options(self, boundscheck):
         self.wrapped.reset(new CTakeOptions(boundscheck))
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 9deccfa117..4f8f9b6bf9 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -2851,6 +2851,170 @@ cdef class StructArray(Array):
         return self.take(indices)
 
 
+cdef class RunEndEncodedArray(Array):
+    """
+    Concrete class for Arrow run-end encoded arrays.
+    """
+
+    @staticmethod
+    def _from_arrays(type, allow_none_for_type, logical_length, run_ends, values, logical_offset):
+        cdef:
+            int64_t _logical_length
+            Array _run_ends
+            Array _values
+            int64_t _logical_offset
+            shared_ptr[CDataType] c_type
+            shared_ptr[CRunEndEncodedArray] ree_array
+
+        _logical_length = <int64_t>logical_length
+        _logical_offset = <int64_t>logical_offset
+
+        type = ensure_type(type, allow_none=allow_none_for_type)
+        if type is not None:
+            _run_ends = asarray(run_ends, type=type.run_end_type)
+            _values = asarray(values, type=type.value_type)
+            c_type = pyarrow_unwrap_data_type(type)
+            with nogil:
+                ree_array = GetResultValue(CRunEndEncodedArray.Make(
+                    c_type, _logical_length, _run_ends.sp_array, _values.sp_array, _logical_offset))
+        else:
+            _run_ends = asarray(run_ends)
+            _values = asarray(values)
+            with nogil:
+                ree_array = GetResultValue(CRunEndEncodedArray.MakeFromArrays(
+                    _logical_length, _run_ends.sp_array, _values.sp_array, _logical_offset))
+        cdef Array result = pyarrow_wrap_array(<shared_ptr[CArray]>ree_array)
+        result.validate(full=True)
+        return result
+
+    @staticmethod
+    def from_arrays(run_ends, values, type=None):
+        """
+        Construct RunEndEncodedArray from run_ends and values arrays.
+
+        Parameters
+        ----------
+        run_ends : Array (int16, int32, or int64 type)
+            The run_ends array.
+        values : Array (any type)
+            The values array.
+        type : pyarrow.DataType, optional
+            The run_end_encoded(run_end_type, value_type) array type.
+
+        Returns
+        -------
+        RunEndEncodedArray
+        """
+        logical_length = run_ends[-1] if len(run_ends) > 0 else 0
+        return RunEndEncodedArray._from_arrays(type, True, logical_length,
+                                               run_ends, values, 0)
+
+    @staticmethod
+    def from_buffers(DataType type, length, buffers, null_count=-1, offset=0,
+                     children=None):
+        """
+        Construct a RunEndEncodedArray from all the parameters that make up an
+        Array.
+
+        RunEndEncodedArrays do not have buffers, only children arrays, but this
+        implementation is needed to satisfy the Array interface.
+
+        Parameters
+        ----------
+        type : DataType
+            The run_end_encoded(run_end_type, value_type) type.
+        length : int
+            The logical length of the run-end encoded array. Expected to match
+            the last value of the run_ends array (children[0]) minus the offset.
+        buffers : List[Buffer]
+            Empty List or [None].
+        null_count : int, default -1
+            The number of null entries in the array. Run-end encoded arrays
+            are specified to not have valid bits and null_count always equals 0.
+        offset : int, default 0
+            The array's logical offset (in values, not in bytes) from the
+            start of each buffer.
+        children : List[Array]
+            Nested type children containing the run_ends and values arrays.
+
+        Returns
+        -------
+        RunEndEncodedArray
+        """
+        children = children or []
+
+        if type.num_fields != len(children):
+            raise ValueError("RunEndEncodedType's expected number of children "
+                             "({0}) did not match the passed number "
+                             "({1}).".format(type.num_fields, len(children)))
+
+        # buffers are validated as if we needed to pass them to C++, but
+        # _make_from_arrays will take care of filling in the expected
+        # buffers array containing a single NULL buffer on the C++ side
+        if len(buffers) == 0:
+            buffers = [None]
+        if buffers[0] is not None:
+            raise ValueError("RunEndEncodedType expects None as validity "
+                             "bitmap, buffers[0] is not None")
+        if type.num_buffers != len(buffers):
+            raise ValueError("RunEndEncodedType's expected number of buffers "
+                             "({0}) did not match the passed number "
+                             "({1}).".format(type.num_buffers, len(buffers)))
+
+        # null_count is also validated as if we needed it
+        if null_count != -1 and null_count != 0:
+            raise ValueError("RunEndEncodedType's expected null_count (0) "
+                             "did not match passed number ({0})".format(null_count))
+
+        return RunEndEncodedArray._from_arrays(type, False, length, children[0],
+                                               children[1], offset)
+
+    @property
+    def run_ends(self):
+        """
+        An array holding the logical indexes of each run-end.
+
+        The physical offset to the array is applied.
+        """
+        cdef CRunEndEncodedArray* ree_array = <CRunEndEncodedArray*>(self.ap)
+        return pyarrow_wrap_array(ree_array.run_ends())
+
+    @property
+    def values(self):
+        """
+        An array holding the values of each run.
+
+        The physical offset to the array is applied.
+        """
+        cdef CRunEndEncodedArray* ree_array = <CRunEndEncodedArray*>(self.ap)
+        return pyarrow_wrap_array(ree_array.values())
+
+    def find_physical_offset(self):
+        """
+        Find the physical offset of this REE array.
+
+        This is the offset of the run that contains the value of the first
+        logical element of this array considering its offset.
+
+        This function uses binary-search, so it has a O(log N) cost.
+        """
+        cdef CRunEndEncodedArray* ree_array = <CRunEndEncodedArray*>(self.ap)
+        return ree_array.FindPhysicalOffset()
+
+    def find_physical_length(self):
+        """
+        Find the physical length of this REE array.
+
+        The physical length of an REE is the number of physical values (and
+        run-ends) necessary to represent the logical range of values from offset
+        to length.
+
+        This function uses binary-search, so it has a O(log N) cost.
+        """
+        cdef CRunEndEncodedArray* ree_array = <CRunEndEncodedArray*>(self.ap)
+        return ree_array.FindPhysicalLength()
+
+
 cdef class ExtensionArray(Array):
     """
     Concrete class for Arrow extension arrays.
@@ -2960,6 +3124,7 @@ cdef dict _array_classes = {
     _Type_DECIMAL128: Decimal128Array,
     _Type_DECIMAL256: Decimal256Array,
     _Type_STRUCT: StructArray,
+    _Type_RUN_END_ENCODED: RunEndEncodedArray,
     _Type_EXTENSION: ExtensionArray,
 }
 
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index e4daaf5e58..efc1e1d956 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -36,6 +36,7 @@ from pyarrow._compute import (  # noqa
     CumulativeSumOptions,
     DayOfWeekOptions,
     DictionaryEncodeOptions,
+    RunEndEncodeOptions,
     ElementWiseAggregateOptions,
     ExtractRegexOptions,
     FilterOptions,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 73f7294142..47180ff291 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -129,6 +129,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         _Type_SPARSE_UNION" arrow::Type::SPARSE_UNION"
         _Type_DENSE_UNION" arrow::Type::DENSE_UNION"
         _Type_DICTIONARY" arrow::Type::DICTIONARY"
+        _Type_RUN_END_ENCODED" arrow::Type::RUN_END_ENCODED"
         _Type_MAP" arrow::Type::MAP"
 
         _Type_EXTENSION" arrow::Type::EXTENSION"
@@ -396,6 +397,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         int precision()
         int scale()
 
+    cdef cppclass CRunEndEncodedType " arrow::RunEndEncodedType"(CDataType):
+        CRunEndEncodedType(const shared_ptr[CDataType]& run_end_type,
+                           const shared_ptr[CDataType]& value_type)
+        const shared_ptr[CDataType]& run_end_type()
+        const shared_ptr[CDataType]& value_type()
+
     cdef cppclass CField" arrow::Field":
         cppclass CMergeOptions "arrow::Field::MergeOptions":
             c_bool promote_nullability
@@ -463,6 +470,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         vector[shared_ptr[CField]] fields,
         vector[int8_t] type_codes)
 
+    cdef shared_ptr[CDataType] CMakeRunEndEncodedType" arrow::run_end_encoded"(
+        shared_ptr[CDataType] run_end_type,
+        shared_ptr[CDataType] value_type)
+
     cdef cppclass CSchema" arrow::Schema":
         CSchema(const vector[shared_ptr[CField]]& fields)
         CSchema(const vector[shared_ptr[CField]]& fields,
@@ -771,6 +782,28 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
 
         CResult[vector[shared_ptr[CArray]]] Flatten(CMemoryPool* pool)
 
+    cdef cppclass CRunEndEncodedArray" arrow::RunEndEncodedArray"(CArray):
+        @staticmethod
+        CResult[shared_ptr[CRunEndEncodedArray]] Make(
+            const shared_ptr[CDataType]& type,
+            int64_t logical_length,
+            const shared_ptr[CArray]& run_ends,
+            const shared_ptr[CArray]& values,
+            int64_t logical_offset)
+
+        @staticmethod
+        CResult[shared_ptr[CRunEndEncodedArray]] MakeFromArrays "Make"(
+            int64_t logical_length,
+            const shared_ptr[CArray]& run_ends,
+            const shared_ptr[CArray]& values,
+            int64_t logical_offset)
+
+        shared_ptr[CArray]& run_ends()
+        shared_ptr[CArray]& values()
+
+        int64_t FindPhysicalOffset()
+        int64_t FindPhysicalLength()
+
     cdef cppclass CChunkedArray" arrow::ChunkedArray":
         CChunkedArray(const vector[shared_ptr[CArray]]& arrays)
         CChunkedArray(const vector[shared_ptr[CArray]]& arrays,
@@ -2201,6 +2234,12 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
             CDictionaryEncodeNullEncodingBehavior null_encoding)
         CDictionaryEncodeNullEncodingBehavior null_encoding
 
+    cdef cppclass CRunEndEncodeOptions \
+            "arrow::compute::RunEndEncodeOptions"(CFunctionOptions):
+        CRunEndEncodeOptions()
+        CRunEndEncodeOptions(shared_ptr[CDataType] run_end_type)
+        shared_ptr[CDataType] run_end_type
+
     cdef cppclass CTakeOptions \
             " arrow::compute::TakeOptions"(CFunctionOptions):
         CTakeOptions(c_bool boundscheck)
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index faa98d150f..6a3726e964 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -184,6 +184,11 @@ cdef class Decimal256Type(FixedSizeBinaryType):
         const CDecimal256Type* decimal256_type
 
 
+cdef class RunEndEncodedType(DataType):
+    cdef:
+        const CRunEndEncodedType* run_end_encoded_type
+
+
 cdef class BaseExtensionType(DataType):
     cdef:
         const CExtensionType* ext_type
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 8d7a5f61ee..6603947113 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -114,6 +114,7 @@ Type_STRUCT = _Type_STRUCT
 Type_SPARSE_UNION = _Type_SPARSE_UNION
 Type_DENSE_UNION = _Type_DENSE_UNION
 Type_DICTIONARY = _Type_DICTIONARY
+Type_RUN_END_ENCODED = _Type_RUN_END_ENCODED
 
 UnionMode_SPARSE = _UnionMode_SPARSE
 UnionMode_DENSE = _UnionMode_DENSE
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 1849ecab09..fadc659d45 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -111,6 +111,8 @@ cdef api object pyarrow_wrap_data_type(
         out = Decimal128Type.__new__(Decimal128Type)
     elif type.get().id() == _Type_DECIMAL256:
         out = Decimal256Type.__new__(Decimal256Type)
+    elif type.get().id() == _Type_RUN_END_ENCODED:
+        out = RunEndEncodedType.__new__(RunEndEncodedType)
     elif type.get().id() == _Type_EXTENSION:
         ext_type = <const CExtensionType*> type.get()
         cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type)
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index d974af99c6..b4ad080005 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -3407,3 +3407,74 @@ def test_array_accepts_pyarrow_array():
     # Test memory_pool keyword is accepted
     result = pa.array(arr, memory_pool=pa.default_memory_pool())
     assert arr == result
+
+
+def check_run_end_encoded(ree_array, run_ends, values, logical_length, physical_length,
+                          physical_offset):
+    assert ree_array.run_ends.to_pylist() == run_ends
+    assert ree_array.values.to_pylist() == values
+    assert len(ree_array) == logical_length
+    assert ree_array.find_physical_length() == physical_length
+    assert ree_array.find_physical_offset() == physical_offset
+
+
+def check_run_end_encoded_from_arrays_with_type(ree_type=None):
+    run_ends = [3, 5, 10, 19]
+    values = [1, 2, 1, 3]
+    ree_array = pa.RunEndEncodedArray.from_arrays(run_ends, values, ree_type)
+    check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0)
+
+
+def test_run_end_encoded_from_arrays():
+    check_run_end_encoded_from_arrays_with_type()
+    for run_end_type in [pa.int16(), pa.int32(), pa.int64()]:
+        for value_type in [pa.uint32(), pa.int32(), pa.uint64(), pa.int64()]:
+            ree_type = pa.run_end_encoded(run_end_type, value_type)
+            check_run_end_encoded_from_arrays_with_type(ree_type)
+
+
+def test_run_end_encoded_from_buffers():
+    run_ends = [3, 5, 10, 19]
+    values = [1, 2, 1, 3]
+
+    ree_type = pa.run_end_encoded(run_end_type=pa.int32(), value_type=pa.uint8())
+    length = 19
+    buffers = [None]
+    null_count = 0
+    offset = 0
+    children = [run_ends, values]
+
+    ree_array = pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers,
+                                                   null_count, offset,
+                                                   children)
+    check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0)
+    # buffers = []
+    ree_array = pa.RunEndEncodedArray.from_buffers(ree_type, length, [],
+                                                   null_count, offset,
+                                                   children)
+    check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0)
+    # null_count = -1
+    ree_array = pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers,
+                                                   -1, offset,
+                                                   children)
+    check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0)
+    # offset = 4
+    ree_array = pa.RunEndEncodedArray.from_buffers(ree_type, length - 4, buffers,
+                                                   null_count, 4, children)
+    check_run_end_encoded(ree_array, run_ends, values, length - 4, 3, 1)
+    # buffers = [None, None]
+    with pytest.raises(ValueError):
+        pa.RunEndEncodedArray.from_buffers(ree_type, length, [None, None],
+                                           null_count, offset, children)
+    # children = None
+    with pytest.raises(ValueError):
+        pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers,
+                                           null_count, offset, None)
+    # len(children) == 1
+    with pytest.raises(ValueError):
+        pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers,
+                                           null_count, offset, [run_ends])
+    # null_count = 1
+    with pytest.raises(ValueError):
+        pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers,
+                                           1, offset, children)
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 92560e4070..875d0e613b 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -134,6 +134,7 @@ def test_option_class_equality():
         pc.CountOptions(),
         pc.DayOfWeekOptions(count_from_zero=False, week_start=0),
         pc.DictionaryEncodeOptions(),
+        pc.RunEndEncodeOptions(),
         pc.ElementWiseAggregateOptions(skip_nulls=True),
         pc.ExtractRegexOptions("pattern"),
         pc.FilterOptions(),
@@ -3108,3 +3109,18 @@ def test_list_slice_bad_parameters():
         pc.list_slice(arr, 0, 1, step=0)
     with pytest.raises(pa.ArrowInvalid, match=msg + "-1"):
         pc.list_slice(arr, 0, 1, step=-1)
+
+
+def check_run_end_encode_decode(run_end_encode_opts=None):
+    arr = pa.array([1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3])
+    encoded = pc.run_end_encode(arr, options=run_end_encode_opts)
+    decoded = pc.run_end_decode(encoded)
+    assert decoded.type == arr.type
+    assert decoded.equals(arr)
+
+
+def test_run_end_encode():
+    check_run_end_encode_decode()
+    check_run_end_encode_decode(pc.RunEndEncodeOptions(pa.int16()))
+    check_run_end_encode_decode(pc.RunEndEncodeOptions('int32'))
+    check_run_end_encode_decode(pc.RunEndEncodeOptions(pa.int64()))
diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py
index 457d7d396a..af6a516279 100644
--- a/python/pyarrow/tests/test_misc.py
+++ b/python/pyarrow/tests/test_misc.py
@@ -171,6 +171,7 @@ def test_runtime_info():
     pa.Decimal128Array,
     pa.Decimal256Array,
     pa.StructArray,
+    pa.RunEndEncodedArray,
     pa.Scalar,
     pa.BooleanScalar,
     pa.Int8Scalar,
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index c780cd80c7..09f0dfe7d3 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -83,7 +83,10 @@ def get_many_types():
                   pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
         pa.union([pa.field('a', pa.binary(10), nullable=False),
                   pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
-        pa.dictionary(pa.int32(), pa.string())
+        pa.dictionary(pa.int32(), pa.string()),
+        pa.run_end_encoded(pa.int16(), pa.int32()),
+        pa.run_end_encoded(pa.int32(), pa.string()),
+        pa.run_end_encoded(pa.int64(), pa.uint8())
     )
 
 
@@ -209,6 +212,11 @@ def test_is_union():
     assert not types.is_union(pa.list_(pa.int32()))
 
 
+def test_is_run_end_encoded():
+    assert types.is_run_end_encoded(pa.run_end_encoded(pa.int32(), pa.int64()))
+    assert not types.is_run_end_encoded(pa.utf8())
+
+
 # TODO(wesm): is_map, once implemented
 
 
@@ -818,6 +826,24 @@ def test_fields_weakrefable():
     assert wr() is None
 
 
+def test_run_end_encoded_type():
+    ty = pa.run_end_encoded(pa.int64(), pa.utf8())
+    assert isinstance(ty, pa.RunEndEncodedType)
+    assert ty.run_end_type == pa.int64()
+    assert ty.value_type == pa.utf8()
+    assert ty.num_buffers == 1  # buffers expected to be {NULLPTR}
+    assert ty.num_fields == 2
+
+    with pytest.raises(TypeError):
+        pa.run_end_encoded(pa.int64(), None)
+
+    with pytest.raises(TypeError):
+        pa.run_end_encoded(None, pa.utf8())
+
+    with pytest.raises(ValueError):
+        pa.run_end_encoded(pa.int8(), pa.utf8())
+
+
 @pytest.mark.parametrize('t,check_func', [
     (pa.date32(), types.is_date32),
     (pa.date64(), types.is_date64),
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 0a68daf333..0e81706660 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1244,6 +1244,27 @@ cdef class Decimal256Type(FixedSizeBinaryType):
         return self.decimal256_type.scale()
 
 
+cdef class RunEndEncodedType(DataType):
+    """
+    Concrete class for run-end encoded types.
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        DataType.init(self, type)
+        self.run_end_encoded_type = <const CRunEndEncodedType*> type.get()
+
+    def __reduce__(self):
+        return run_end_encoded, (self.run_end_type, self.value_type)
+
+    @property
+    def run_end_type(self):
+        return pyarrow_wrap_data_type(self.run_end_encoded_type.run_end_type())
+
+    @property
+    def value_type(self):
+        return pyarrow_wrap_data_type(self.run_end_encoded_type.value_type())
+
+
 cdef class BaseExtensionType(DataType):
     """
     Concrete base class for extension types.
@@ -4479,13 +4500,6 @@ def union(child_fields, mode, type_codes=None):
     -------
     type : UnionType
     """
-    cdef:
-        Field child_field
-        vector[shared_ptr[CField]] c_fields
-        vector[int8_t] c_type_codes
-        shared_ptr[CDataType] union_type
-        int i
-
     if isinstance(mode, int):
         if mode not in (_UnionMode_SPARSE, _UnionMode_DENSE):
             raise ValueError("Invalid union mode {0!r}".format(mode))
@@ -4503,6 +4517,32 @@ def union(child_fields, mode, type_codes=None):
         return dense_union(child_fields, type_codes)
 
 
+def run_end_encoded(run_end_type, value_type):
+    """
+    Create RunEndEncodedType from run-end and value types.
+
+    Parameters
+    ----------
+    run_end_type : pyarrow.DataType
+        The integer type of the run_ends array. Must be 'int16', 'int32', or 'int64'.
+    value_type : pyarrow.DataType
+        The type of the values array.
+
+    Returns
+    -------
+    type : RunEndEncodedType
+    """
+    cdef:
+        DataType _run_end_type = ensure_type(run_end_type, allow_none=False)
+        DataType _value_type = ensure_type(value_type, allow_none=False)
+        shared_ptr[CDataType] ree_type
+
+    if not _run_end_type.type.id() in [_Type_INT16, _Type_INT32, _Type_INT64]:
+        raise ValueError("The run_end_type should be 'int16', 'int32', or 'int64'")
+    ree_type = CMakeRunEndEncodedType(_run_end_type.sp_type, _value_type.sp_type)
+    return pyarrow_wrap_data_type(ree_type)
+
+
 cdef dict _type_aliases = {
     'null': null,
     'bool': bool_,
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index f239c883b4..a88ec2ad7e 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -296,6 +296,17 @@ def is_nested(t):
     return t.id in _NESTED_TYPES
 
 
+def is_run_end_encoded(t):
+    """
+    Return True if value is an instance of a run-end encoded type.
+
+    Parameters
+    ----------
+    t : DataType
+    """
+    return t.id == lib.Type_RUN_END_ENCODED
+
+
 def is_temporal(t):
     """
     Return True if value is an instance of date, time, timestamp or duration.

Reply via email to