This is an automated email from the ASF dual-hosted git repository.

felipecrv pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 525881987d GH-17682: [C++][Python] Bool8 Extension Type Implementation 
(#43488)
525881987d is described below

commit 525881987d0b9b4f464c3e3593a9a7b4e3c767d0
Author: Joel Lubinitsky <[email protected]>
AuthorDate: Tue Aug 20 20:25:19 2024 -0400

    GH-17682: [C++][Python] Bool8 Extension Type Implementation (#43488)
    
    
    
    ### Rationale for this change
    
    C++ and Python implementations of #43234
    
    ### What changes are included in this PR?
    
    - Implement C++ `Bool8Type`, `Bool8Array`, `Bool8Scalar`, and tests
    - Implement Python bindings to C++, as well as zero-copy numpy conversion 
methods
    - TODO: docs waiting for rebase on #43458
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    Bool8 extension type will be available in C++ and Python libraries
    
    * GitHub Issue: #17682
    
    Authored-by: Joel Lubinitsky <[email protected]>
    Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
---
 cpp/src/arrow/CMakeLists.txt                |   1 +
 cpp/src/arrow/extension/CMakeLists.txt      |   6 ++
 cpp/src/arrow/extension/bool8.cc            |  61 +++++++++++
 cpp/src/arrow/extension/bool8.h             |  58 +++++++++++
 cpp/src/arrow/extension/bool8_test.cc       |  91 +++++++++++++++++
 cpp/src/arrow/extension_type.cc             |   7 +-
 python/pyarrow/__init__.py                  |   7 +-
 python/pyarrow/array.pxi                    | 114 ++++++++++++++++++++-
 python/pyarrow/includes/libarrow.pxd        |   9 ++
 python/pyarrow/lib.pxd                      |   3 +
 python/pyarrow/public-api.pxi               |   2 +
 python/pyarrow/scalar.pxi                   |  23 ++++-
 python/pyarrow/tests/test_extension_type.py | 152 ++++++++++++++++++++++++++++
 python/pyarrow/tests/test_misc.py           |   3 +
 python/pyarrow/types.pxi                    |  74 ++++++++++++++
 15 files changed, 604 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index fb785e1e95..fb7253b6fd 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -906,6 +906,7 @@ endif()
 
 if(ARROW_JSON)
   arrow_add_object_library(ARROW_JSON
+                           extension/bool8.cc
                            extension/fixed_shape_tensor.cc
                            extension/opaque.cc
                            json/options.cc
diff --git a/cpp/src/arrow/extension/CMakeLists.txt 
b/cpp/src/arrow/extension/CMakeLists.txt
index 6741ab602f..fcd5fa529a 100644
--- a/cpp/src/arrow/extension/CMakeLists.txt
+++ b/cpp/src/arrow/extension/CMakeLists.txt
@@ -15,6 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 
+add_arrow_test(test
+               SOURCES
+               bool8_test.cc
+               PREFIX
+               "arrow-extension-bool8")
+
 add_arrow_test(test
                SOURCES
                fixed_shape_tensor_test.cc
diff --git a/cpp/src/arrow/extension/bool8.cc b/cpp/src/arrow/extension/bool8.cc
new file mode 100644
index 0000000000..c081f0c2b2
--- /dev/null
+++ b/cpp/src/arrow/extension/bool8.cc
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sstream>
+
+#include "arrow/extension/bool8.h"
+#include "arrow/util/logging.h"
+
+namespace arrow::extension {
+
+// Equality is decided purely by comparing the extension names of the two
+// types; nothing else is inspected.
+bool Bool8Type::ExtensionEquals(const ExtensionType& other) const {
+  const std::string other_name = other.extension_name();
+  return this->extension_name() == other_name;
+}
+
+// Renders the type as "extension<NAME>", where NAME is extension_name().
+// The show_metadata flag is accepted but does not influence the output.
+std::string Bool8Type::ToString(bool show_metadata) const {
+  std::string repr = "extension<";
+  repr += this->extension_name();
+  repr += ">";
+  return repr;
+}
+
+// The serialized form of Bool8Type is always the empty string.
+std::string Bool8Type::Serialize() const {
+  return std::string();
+}
+
+// Rebuilds a Bool8Type from its storage type and serialized payload.
+//
+// Returns Status::Invalid when the storage type is not INT8 or when the
+// payload is non-empty; otherwise returns the result of bool8().
+Result<std::shared_ptr<DataType>> Bool8Type::Deserialize(
+    std::shared_ptr<DataType> storage_type, const std::string& serialized_data) const {
+  const bool storage_is_int8 = storage_type->id() == Type::INT8;
+  if (!storage_is_int8) {
+    return Status::Invalid("Expected INT8 storage type, got ", storage_type->ToString());
+  }
+  if (!serialized_data.empty()) {
+    return Status::Invalid("Serialize data must be empty, got ", serialized_data);
+  }
+  return bool8();
+}
+
+// Wraps the given ArrayData in a Bool8Array.
+//
+// The two DCHECK_EQ calls assert that `data` is typed as an extension type
+// whose extension name is "arrow.bool8".
+// NOTE(review): DCHECK_* is presumably compiled out of release builds, in
+// which case mismatching data is not rejected there -- confirm.
+std::shared_ptr<Array> Bool8Type::MakeArray(std::shared_ptr<ArrayData> data) 
const {
+  DCHECK_EQ(data->type->id(), Type::EXTENSION);
+  DCHECK_EQ("arrow.bool8",
+            internal::checked_cast<const 
ExtensionType&>(*data->type).extension_name());
+  return std::make_shared<Bool8Array>(data);
+}
+
+// Result-returning factory: constructs a fresh Bool8Type instance.
+Result<std::shared_ptr<DataType>> Bool8Type::Make() {
+  std::shared_ptr<DataType> type = std::make_shared<Bool8Type>();
+  return type;
+}
+
+// Convenience factory: constructs a fresh Bool8Type instance.
+std::shared_ptr<DataType> bool8() {
+  auto type = std::make_shared<Bool8Type>();
+  return type;
+}
+
+}  // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/bool8.h b/cpp/src/arrow/extension/bool8.h
new file mode 100644
index 0000000000..02e629b28a
--- /dev/null
+++ b/cpp/src/arrow/extension/bool8.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/extension_type.h"
+
+namespace arrow::extension {
+
+/// \brief Array class for the Bool8 extension type.
+///
+/// Bool8 is an alternate representation for boolean
+/// arrays using 8 bits instead of 1 bit per value. The underlying
+/// storage type is int8.
+class ARROW_EXPORT Bool8Array : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+/// \brief Extension type describing Bool8 data.
+///
+/// Bool8 is an alternate representation for boolean
+/// arrays using 8 bits instead of 1 bit per value. The underlying
+/// storage type is int8.
+class ARROW_EXPORT Bool8Type : public ExtensionType {
+ public:
+  /// \brief Construct a Bool8Type (storage type is always int8).
+  Bool8Type() : ExtensionType(int8()) {}
+
+  /// \brief Name under which this extension type is identified: "arrow.bool8".
+  std::string extension_name() const override { return "arrow.bool8"; }
+
+  /// \brief Return "extension<arrow.bool8>"; show_metadata has no effect.
+  std::string ToString(bool show_metadata = false) const override;
+
+  /// \brief Return true if `other` has the same extension name.
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  /// \brief Return the serialized representation, which is always empty.
+  std::string Serialize() const override;
+
+  /// \brief Reconstruct the type from its storage type and serialized data.
+  ///
+  /// \param[in] storage_type must be int8
+  /// \param[in] serialized_data must be the empty string
+  /// \return a Bool8Type, or Status::Invalid if either argument is rejected
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized_data) const override;
+
+  /// Create a Bool8Array from ArrayData
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  /// \brief Create a Bool8Type instance wrapped in a Result.
+  static Result<std::shared_ptr<DataType>> Make();
+};
+
+/// \brief Return a Bool8Type instance.
+ARROW_EXPORT std::shared_ptr<DataType> bool8();
+
+}  // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/bool8_test.cc 
b/cpp/src/arrow/extension/bool8_test.cc
new file mode 100644
index 0000000000..eabcfcf62d
--- /dev/null
+++ b/cpp/src/arrow/extension/bool8_test.cc
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/extension/bool8.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/testing/extension_type.h"
+#include "arrow/testing/gtest_util.h"
+
+namespace arrow {
+
+// Checks the static properties of Bool8Type and its equality semantics.
+TEST(Bool8Type, Basics) {
+  // Two independently created instances, so equality is checked by value.
+  const auto first =
+      internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+  const auto second =
+      internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+
+  // Name, storage type, serialization and string rendering.
+  ASSERT_EQ("arrow.bool8", first->extension_name());
+  ASSERT_EQ(*arrow::int8(), *first->storage_type());
+  ASSERT_EQ("", first->Serialize());
+  ASSERT_EQ("extension<arrow.bool8>", first->ToString(false));
+
+  // Equality with itself and another instance; inequality with null().
+  ASSERT_EQ(*first, *first);
+  ASSERT_EQ(*first, *second);
+  ASSERT_NE(*arrow::null(), *first);
+}
+
+// Wrapping int8 storage in the extension type keeps length and null count.
+TEST(Bool8Type, CreateFromArray) {
+  auto bool8_type =
+      internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+  auto int8_values = ArrayFromJSON(int8(), "[-1,0,1,2,null]");
+  auto wrapped = ExtensionType::WrapArray(bool8_type, int8_values);
+
+  ASSERT_EQ(1, wrapped->null_count());
+  ASSERT_EQ(5, wrapped->length());
+}
+
+// Deserialize accepts only (int8 storage, empty payload); anything else must
+// be rejected with Status::Invalid.
+TEST(Bool8Type, Deserialize) {
+  auto type = internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+
+  // Valid input round-trips to an equal type.
+  ASSERT_OK_AND_ASSIGN(auto deserialized, type->Deserialize(type->storage_type(), ""));
+  ASSERT_EQ(*type, *deserialized);
+
+  // Non-empty serialized data is rejected. Pin the error code rather than
+  // accepting any failure.
+  ASSERT_RAISES(Invalid, type->Deserialize(type->storage_type(), "must be empty"));
+
+  // A storage type other than int8 is rejected.
+  ASSERT_RAISES(Invalid, type->Deserialize(uint8(), ""));
+}
+
+// Serialize() followed by Deserialize() yields an equal type.
+TEST(Bool8Type, MetadataRoundTrip) {
+  auto original =
+      internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+  const std::string payload = original->Serialize();
+  const auto storage = original->storage_type();
+
+  ASSERT_OK_AND_ASSIGN(auto round_tripped, original->Deserialize(storage, payload));
+  ASSERT_EQ(*original, *round_tripped);
+}
+
+// Writes a record batch holding a Bool8 column to an in-memory IPC stream and
+// reads it back, expecting an identical schema and identical data.
+TEST(Bool8Type, BatchRoundTrip) {
+  auto type = internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+
+  auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]");
+  auto array = ExtensionType::WrapArray(type, storage);
+  auto batch =
+      RecordBatch::Make(schema({field("field", type)}), array->length(), {array});
+
+  std::shared_ptr<RecordBatch> written;
+  {
+    ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
+    ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
+                                          out_stream.get()));
+
+    ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());
+
+    io::BufferReader reader(complete_ipc_stream);
+    std::shared_ptr<RecordBatchReader> batch_reader;
+    ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
+    ASSERT_OK(batch_reader->ReadNext(&written));
+  }
+
+  // Fail the test, rather than dereferencing a null pointer below, if the
+  // reader did not produce a batch.
+  ASSERT_TRUE(written != nullptr);
+
+  ASSERT_EQ(*batch->schema(), *written->schema());
+  ASSERT_BATCHES_EQUAL(*batch, *written);
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc
index cf8dda7a85..685018f7de 100644
--- a/cpp/src/arrow/extension_type.cc
+++ b/cpp/src/arrow/extension_type.cc
@@ -28,6 +28,7 @@
 #include "arrow/chunked_array.h"
 #include "arrow/config.h"
 #ifdef ARROW_JSON
+#include "arrow/extension/bool8.h"
 #include "arrow/extension/fixed_shape_tensor.h"
 #endif
 #include "arrow/status.h"
@@ -146,10 +147,12 @@ static void CreateGlobalRegistry() {
 
 #ifdef ARROW_JSON
   // Register canonical extension types
-  auto ext_type =
+  auto fst_ext_type =
       
checked_pointer_cast<ExtensionType>(extension::fixed_shape_tensor(int64(), {}));
+  ARROW_CHECK_OK(g_registry->RegisterType(fst_ext_type));
 
-  ARROW_CHECK_OK(g_registry->RegisterType(ext_type));
+  auto bool8_ext_type = 
checked_pointer_cast<ExtensionType>(extension::bool8());
+  ARROW_CHECK_OK(g_registry->RegisterType(bool8_ext_type));
 #endif
 }
 
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index aa7bab9f97..807bcdc315 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -174,6 +174,7 @@ from pyarrow.lib import (null, bool_,
                          run_end_encoded,
                          fixed_shape_tensor,
                          opaque,
+                         bool8,
                          field,
                          type_for_alias,
                          DataType, DictionaryType, StructType,
@@ -184,7 +185,7 @@ from pyarrow.lib import (null, bool_,
                          FixedSizeBinaryType, Decimal128Type, Decimal256Type,
                          BaseExtensionType, ExtensionType,
                          RunEndEncodedType, FixedShapeTensorType, OpaqueType,
-                         PyExtensionType, UnknownExtensionType,
+                         Bool8Type, PyExtensionType, UnknownExtensionType,
                          register_extension_type, unregister_extension_type,
                          DictionaryMemo,
                          KeyValueMetadata,
@@ -218,7 +219,7 @@ from pyarrow.lib import (null, bool_,
                          MonthDayNanoIntervalArray,
                          Decimal128Array, Decimal256Array, StructArray, 
ExtensionArray,
                          RunEndEncodedArray, FixedShapeTensorArray, 
OpaqueArray,
-                         scalar, NA, _NULL as NULL, Scalar,
+                         Bool8Array, scalar, NA, _NULL as NULL, Scalar,
                          NullScalar, BooleanScalar,
                          Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
                          UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar,
@@ -235,7 +236,7 @@ from pyarrow.lib import (null, bool_,
                          FixedSizeBinaryScalar, DictionaryScalar,
                          MapScalar, StructScalar, UnionScalar,
                          RunEndEncodedScalar, ExtensionScalar,
-                         FixedShapeTensorScalar, OpaqueScalar)
+                         FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar)
 
 # Buffers, allocation
 from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 6c40a21db9..4c3eb93232 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1581,7 +1581,7 @@ cdef class Array(_PandasConvertible):
 
     def to_numpy(self, zero_copy_only=True, writable=False):
         """
-        Return a NumPy view or copy of this array (experimental).
+        Return a NumPy view or copy of this array.
 
         By default, tries to return a view of this array. This is only
         supported for primitive arrays with the same memory layout as NumPy
@@ -4476,6 +4476,118 @@ cdef class OpaqueArray(ExtensionArray):
     """
 
 
+cdef class Bool8Array(ExtensionArray):
+    """
+    Concrete class for bool8 extension arrays.
+
+    Examples
+    --------
+    Define the extension type for a bool8 array
+
+    >>> import pyarrow as pa
+    >>> bool8_type = pa.bool8()
+
+    Create an extension array
+
+    >>> arr = [-1, 0, 1, 2, None]
+    >>> storage = pa.array(arr, pa.int8())
+    >>> pa.ExtensionArray.from_storage(bool8_type, storage)
+    <pyarrow.lib.Bool8Array object at ...>
+    [
+      -1,
+      0,
+      1,
+      2,
+      null
+    ]
+    """
+
+    def to_numpy(self, zero_copy_only=True, writable=False):
+        """
+        Return a NumPy bool view or copy of this array.
+
+        By default, tries to return a view of this array. This is only
+        supported for arrays without any nulls.
+
+        Parameters
+        ----------
+        zero_copy_only : bool, default True
+            If True, an exception will be raised if the conversion to a numpy
+            array would require copying the underlying data (e.g. in presence
+            of nulls).
+        writable : bool, default False
+            For numpy arrays created with zero copy (view on the Arrow data),
+            the resulting array is not writable (Arrow data is immutable).
+            By setting this to True, a copy of the array is made to ensure
+            it is writable.
+
+        Returns
+        -------
+        array : numpy.ndarray
+        """
+        # Fast path: convert the int8 storage with its default (zero-copy)
+        # settings and reinterpret the result as numpy bool. If that raises
+        # ArrowInvalid, the error is only propagated when the caller insisted
+        # on zero-copy; otherwise fall through to the copying path below.
+        if not writable:
+            try:
+                return self.storage.to_numpy().view(np.bool_)
+            except ArrowInvalid as e:
+                if zero_copy_only:
+                    raise e
+
+        # Copying path: compute `storage != 0` and convert that result,
+        # forwarding the caller's zero_copy_only / writable settings.
+        return _pc().not_equal(self.storage, 
0).to_numpy(zero_copy_only=zero_copy_only, writable=writable)
+
+    @staticmethod
+    def from_storage(Int8Array storage):
+        """
+        Construct Bool8Array from Int8Array storage.
+
+        Parameters
+        ----------
+        storage : Int8Array
+            The underlying storage for the result array.
+
+        Returns
+        -------
+        bool8_array : Bool8Array
+        """
+        return ExtensionArray.from_storage(bool8(), storage)
+
+    @staticmethod
+    def from_numpy(obj):
+        """
+        Convert numpy array to a bool8 extension array without making a copy.
+        The input array must be 1-dimensional, with either bool_ or int8 dtype.
+
+        Parameters
+        ----------
+        obj : numpy.ndarray
+
+        Returns
+        -------
+        bool8_array : Bool8Array
+
+        Raises
+        ------
+        ValueError
+            If ``obj`` is not 1-dimensional.
+        TypeError
+            If the dtype of ``obj`` is neither bool_ nor int8.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import numpy as np
+        >>> arr = np.array([True, False, True], dtype=np.bool_)
+        >>> pa.Bool8Array.from_numpy(arr)
+        <pyarrow.lib.Bool8Array object at ...>
+        [
+          1,
+          0,
+          1
+        ]
+        """
+
+        if obj.ndim != 1:
+            raise ValueError(f"Cannot convert {obj.ndim}-D array to bool8 
array")
+
+        if obj.dtype not in [np.bool_, np.int8]:
+            raise TypeError(f"Array dtype {obj.dtype} incompatible with bool8 
storage")
+
+        # Reinterpret the input as int8 so it can serve as the storage array.
+        storage_arr = array(obj.view(np.int8), type=int8())
+        return Bool8Array.from_storage(storage_arr)
+
+
 cdef dict _array_classes = {
     _Type_NA: NullArray,
     _Type_BOOL: BooleanArray,
diff --git a/python/pyarrow/includes/libarrow.pxd 
b/python/pyarrow/includes/libarrow.pxd
index 9b008d150f..a54a1db292 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2895,6 +2895,15 @@ cdef extern from "arrow/extension/opaque.h" namespace 
"arrow::extension" nogil:
         pass
 
 
+# Cython declarations for the C++ Bool8 extension type and array classes
+# declared in arrow/extension/bool8.h.
+cdef extern from "arrow/extension/bool8.h" namespace "arrow::extension" nogil:
+    cdef cppclass CBool8Type" arrow::extension::Bool8Type"(CExtensionType):
+
+        # Static factory returning a Bool8Type wrapped in a Result.
+        @staticmethod
+        CResult[shared_ptr[CDataType]] Make()
+
+    # No additional members are declared beyond CExtensionArray.
+    cdef cppclass CBool8Array" arrow::extension::Bool8Array"(CExtensionArray):
+        pass
+
 cdef extern from "arrow/util/compression.h" namespace "arrow" nogil:
     cdef enum CCompressionType" arrow::Compression::type":
         CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED"
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 2cb302d20a..e3625c1815 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -214,6 +214,9 @@ cdef class FixedShapeTensorType(BaseExtensionType):
     cdef:
         const CFixedShapeTensorType* tensor_ext_type
 
+# Python-level wrapper declaration for the C++ Bool8 extension type.
+cdef class Bool8Type(BaseExtensionType):
+    cdef:
+        # Typed pointer to the wrapped C++ Bool8Type object.
+        const CBool8Type* bool8_ext_type
 
 cdef class OpaqueType(BaseExtensionType):
     cdef:
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 2f9fc1c554..19a26bd6c6 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -126,6 +126,8 @@ cdef api object pyarrow_wrap_data_type(
             out = FixedShapeTensorType.__new__(FixedShapeTensorType)
         elif ext_type.extension_name() == b"arrow.opaque":
             out = OpaqueType.__new__(OpaqueType)
+        elif ext_type.extension_name() == b"arrow.bool8":
+            out = Bool8Type.__new__(Bool8Type)
         else:
             out = BaseExtensionType.__new__(BaseExtensionType)
     else:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 12a99c2aec..72ae2aee5f 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -1091,6 +1091,18 @@ cdef class OpaqueScalar(ExtensionScalar):
     """
 
 
+cdef class Bool8Scalar(ExtensionScalar):
+    """
+    Concrete class for bool8 extension scalar.
+    """
+
+    def as_py(self):
+        """
+        Return this scalar as a Python object.
+
+        The value produced by the parent class is mapped to a bool by
+        comparing it with zero; None is passed through unchanged.
+        """
+        storage_value = super().as_py()
+        if storage_value is None:
+            return None
+        return storage_value != 0
+
 cdef dict _scalar_classes = {
     _Type_BOOL: BooleanScalar,
     _Type_UINT8: UInt8Scalar,
@@ -1199,6 +1211,11 @@ def scalar(value, type=None, *, from_pandas=None, 
MemoryPool memory_pool=None):
     type = ensure_type(type, allow_none=True)
     pool = maybe_unbox_memory_pool(memory_pool)
 
+    extension_type = None
+    if type is not None and type.id == _Type_EXTENSION:
+        extension_type = type
+        type = type.storage_type
+
     if _is_array_like(value):
         value = get_values(value, &is_pandas_object)
 
@@ -1223,4 +1240,8 @@ def scalar(value, type=None, *, from_pandas=None, 
MemoryPool memory_pool=None):
 
     # retrieve the scalar from the first position
     scalar = GetResultValue(array.get().GetScalar(0))
-    return Scalar.wrap(scalar)
+    result = Scalar.wrap(scalar)
+
+    if extension_type is not None:
+        result = ExtensionScalar.from_storage(extension_type, result)
+    return result
diff --git a/python/pyarrow/tests/test_extension_type.py 
b/python/pyarrow/tests/test_extension_type.py
index 58c54189f2..b04ee85ec9 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -1707,3 +1707,155 @@ def test_opaque_type(pickle_module, storage_type, 
storage):
     # cast extension type -> storage type
     inner = arr.cast(storage_type)
     assert inner == storage
+
+
+def test_bool8_type(pickle_module):
+    """
+    Check the basic properties of the bool8 type (name, storage type, string
+    form, equality) and that it survives pickle, IPC and cast round trips.
+    """
+    bool8_type = pa.bool8()
+    storage_type = pa.int8()
+    assert bool8_type.extension_name == "arrow.bool8"
+    assert bool8_type.storage_type == storage_type
+    assert str(bool8_type) == "extension<arrow.bool8>"
+
+    assert bool8_type == bool8_type
+    assert bool8_type == pa.bool8()
+    assert bool8_type != storage_type
+
+    # Pickle roundtrip
+    result = pickle_module.loads(pickle_module.dumps(bool8_type))
+    assert result == bool8_type
+
+    # IPC roundtrip
+    storage = pa.array([-1, 0, 1, 2, None], storage_type)
+    arr = pa.ExtensionArray.from_storage(bool8_type, storage)
+    assert isinstance(arr, pa.Bool8Array)
+
+    # extension is registered by default
+    buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"]))
+    batch = ipc_read_batch(buf)
+
+    assert batch.column(0).type.extension_name == "arrow.bool8"
+    assert isinstance(batch.column(0), pa.Bool8Array)
+
+    # cast storage -> extension type
+    result = storage.cast(bool8_type)
+    assert result == arr
+
+    # cast extension type -> storage type
+    inner = arr.cast(storage_type)
+    assert inner == storage
+
+
+def test_bool8_to_bool_conversion():
+    """
+    Check casting between bool8 and the regular Arrow boolean type in both
+    directions, including nulls.
+    """
+    bool_arr = pa.array([True, False, True, True, None], pa.bool_())
+    bool8_arr = pa.ExtensionArray.from_storage(
+        pa.bool8(),
+        pa.array([-1, 0, 1, 2, None], pa.int8()),
+    )
+
+    # cast extension type -> arrow boolean type
+    assert bool8_arr.cast(pa.bool_()) == bool_arr
+
+    # cast arrow boolean type -> extension type, expecting canonical values
+    canonical_storage = pa.array([1, 0, 1, 1, None], pa.int8())
+    canonical_bool8_arr = pa.ExtensionArray.from_storage(pa.bool8(), 
canonical_storage)
+    assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr
+
+
+def test_bool8_to_numpy_conversion():
+    """
+    Check Bool8Array.to_numpy(): error on nulls with zero-copy, object-dtype
+    copy with nulls, zero-copy view without nulls, and a separate buffer when
+    a writable array is requested.
+    """
+    arr = pa.ExtensionArray.from_storage(
+        pa.bool8(),
+        pa.array([-1, 0, 1, 2, None], pa.int8()),
+    )
+
+    # cannot zero-copy with nulls
+    with pytest.raises(
+        pa.ArrowInvalid,
+        match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was 
True",
+    ):
+        arr.to_numpy()
+
+    # nullable conversion possible with a copy, but dest dtype is object
+    assert np.array_equal(
+        arr.to_numpy(zero_copy_only=False),
+        np.array([True, False, True, True, None], dtype=np.object_),
+    )
+
+    # zero-copy possible with non-null array
+    np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_)
+    arr_no_nulls = pa.ExtensionArray.from_storage(
+        pa.bool8(),
+        pa.array([-1, 0, 1, 2], pa.int8()),
+    )
+
+    arr_to_np = arr_no_nulls.to_numpy()
+    assert np.array_equal(arr_to_np, np_arr_no_nulls)
+
+    # same underlying buffer
+    assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address
+
+    # if the user requests a writable array, a copy should be performed
+    arr_to_np_writable = arr_no_nulls.to_numpy(zero_copy_only=False, 
writable=True)
+    assert np.array_equal(arr_to_np_writable, np_arr_no_nulls)
+
+    # different underlying buffer
+    assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address
+
+
+def test_bool8_from_numpy_conversion():
+    """
+    Check Bool8Array.from_numpy(): zero-copy conversion of a 1-D bool array,
+    and the errors raised for non-1-D input and for incompatible dtypes.
+    """
+    np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_)
+    canonical_bool8_arr_no_nulls = pa.ExtensionArray.from_storage(
+        pa.bool8(),
+        pa.array([1, 0, 1, 1], pa.int8()),
+    )
+
+    arr_from_np = pa.Bool8Array.from_numpy(np_arr_no_nulls)
+    assert arr_from_np == canonical_bool8_arr_no_nulls
+
+    # same underlying buffer
+    assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data
+
+    # conversion only valid for 1-D arrays
+    with pytest.raises(
+        ValueError,
+        match="Cannot convert 2-D array to bool8 array",
+    ):
+        pa.Bool8Array.from_numpy(
+            np.array([[True, False], [False, True]], dtype=np.bool_),
+        )
+
+    # a numpy scalar is 0-dimensional and is rejected as well
+    with pytest.raises(
+        ValueError,
+        match="Cannot convert 0-D array to bool8 array",
+    ):
+        pa.Bool8Array.from_numpy(np.bool_())
+
+    # must use compatible storage type
+    with pytest.raises(
+        TypeError,
+        match="Array dtype float64 incompatible with bool8 storage",
+    ):
+        pa.Bool8Array.from_numpy(np.array([1, 2, 3], dtype=np.float64))
+
+
+def test_bool8_scalar():
+    assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() is True
+    assert pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py() is False
+    assert pa.ExtensionScalar.from_storage(pa.bool8(), 1).as_py() is True
+    assert pa.ExtensionScalar.from_storage(pa.bool8(), 2).as_py() is True
+    assert pa.ExtensionScalar.from_storage(pa.bool8(), None).as_py() is None
+
+    arr = pa.ExtensionArray.from_storage(
+        pa.bool8(),
+        pa.array([-1, 0, 1, 2, None], pa.int8()),
+    )
+    assert arr[0].as_py() is True
+    assert arr[1].as_py() is False
+    assert arr[2].as_py() is True
+    assert arr[3].as_py() is True
+    assert arr[4].as_py() is None
+
+    assert pa.scalar(-1, type=pa.bool8()).as_py() is True
+    assert pa.scalar(0, type=pa.bool8()).as_py() is False
+    assert pa.scalar(1, type=pa.bool8()).as_py() is True
+    assert pa.scalar(2, type=pa.bool8()).as_py() is True
+    assert pa.scalar(None, type=pa.bool8()).as_py() is None
diff --git a/python/pyarrow/tests/test_misc.py 
b/python/pyarrow/tests/test_misc.py
index 9a55a38177..5d3471c7c3 100644
--- a/python/pyarrow/tests/test_misc.py
+++ b/python/pyarrow/tests/test_misc.py
@@ -250,6 +250,9 @@ def test_set_timezone_db_path_non_windows():
     pa.OpaqueArray,
     pa.OpaqueScalar,
     pa.OpaqueType,
+    pa.Bool8Array,
+    pa.Bool8Scalar,
+    pa.Bool8Type,
 ])
 def test_extension_type_constructor_errors(klass):
     # ARROW-2638: prevent calling extension class constructors directly
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index dcd2b61c33..563782f0c2 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1837,6 +1837,37 @@ cdef class FixedShapeTensorType(BaseExtensionType):
         return FixedShapeTensorScalar
 
 
+cdef class Bool8Type(BaseExtensionType):
+    """
+    Concrete class for bool8 extension type.
+
+    Bool8 is an alternate representation for boolean
+    arrays using 8 bits instead of 1 bit per value. The underlying
+    storage type is int8.
+
+    Examples
+    --------
+    Create an instance of bool8 extension type:
+
+    >>> import pyarrow as pa
+    >>> pa.bool8()
+    Bool8Type(extension<arrow.bool8>)
+    """
+
+    # Run the base-class initialisation, then keep a typed pointer to the
+    # same underlying C++ object.
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        BaseExtensionType.init(self, type)
+        self.bool8_ext_type = <const CBool8Type*> type.get()
+
+    # Array class associated with this extension type.
+    def __arrow_ext_class__(self):
+        return Bool8Array
+
+    # Pickle support: the type has no parameters, so it is rebuilt by calling
+    # bool8() with no arguments.
+    def __reduce__(self):
+        return bool8, ()
+
+    # Scalar class associated with this extension type.
+    def __arrow_ext_scalar_class__(self):
+        return Bool8Scalar
+
+
 cdef class OpaqueType(BaseExtensionType):
     """
     Concrete class for opaque extension type.
@@ -5278,6 +5309,49 @@ def fixed_shape_tensor(DataType value_type, shape, 
dim_names=None, permutation=N
     return out
 
 
+def bool8():
+    """
+    Create instance of bool8 extension type.
+
+    Returns
+    -------
+    type : Bool8Type
+
+    Examples
+    --------
+    Create an instance of bool8 extension type:
+
+    >>> import pyarrow as pa
+    >>> type = pa.bool8()
+    >>> type
+    Bool8Type(extension<arrow.bool8>)
+
+    Inspect the data type:
+
+    >>> type.storage_type
+    DataType(int8)
+
+    Create a table with a bool8 array:
+
+    >>> arr = [-1, 0, 1, 2, None]
+    >>> storage = pa.array(arr, pa.int8())
+    >>> other = pa.ExtensionArray.from_storage(type, storage)
+    >>> pa.table([other], names=["unknown_col"])
+    pyarrow.Table
+    unknown_col: extension<arrow.bool8>
+    ----
+    unknown_col: [[-1,0,1,2,null]]
+    """
+    # Build the C++ type first, then wrap it in the Python-level class.
+    cdef shared_ptr[CDataType] c_bool8_type = GetResultValue(CBool8Type.Make())
+    cdef Bool8Type wrapper = Bool8Type.__new__(Bool8Type)
+
+    wrapper.init(c_bool8_type)
+    return wrapper
+
+
 def opaque(DataType storage_type, str type_name not None, str vendor_name not 
None):
     """
     Create instance of opaque extension type.

Reply via email to