This is an automated email from the ASF dual-hosted git repository.
felipecrv pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 525881987d GH-17682: [C++][Python] Bool8 Extension Type Implementation
(#43488)
525881987d is described below
commit 525881987d0b9b4f464c3e3593a9a7b4e3c767d0
Author: Joel Lubinitsky <[email protected]>
AuthorDate: Tue Aug 20 20:25:19 2024 -0400
GH-17682: [C++][Python] Bool8 Extension Type Implementation (#43488)
### Rationale for this change
C++ and Python implementations of #43234
### What changes are included in this PR?
- Implement C++ `Bool8Type`, `Bool8Array`, `Bool8Scalar`, and tests
- Implement Python bindings to C++, as well as zero-copy numpy conversion
methods
- TODO: docs waiting for rebase on #43458
### Are these changes tested?
Yes
### Are there any user-facing changes?
Bool8 extension type will be available in C++ and Python libraries
* GitHub Issue: #17682
Authored-by: Joel Lubinitsky <[email protected]>
Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
---
cpp/src/arrow/CMakeLists.txt | 1 +
cpp/src/arrow/extension/CMakeLists.txt | 6 ++
cpp/src/arrow/extension/bool8.cc | 61 +++++++++++
cpp/src/arrow/extension/bool8.h | 58 +++++++++++
cpp/src/arrow/extension/bool8_test.cc | 91 +++++++++++++++++
cpp/src/arrow/extension_type.cc | 7 +-
python/pyarrow/__init__.py | 7 +-
python/pyarrow/array.pxi | 114 ++++++++++++++++++++-
python/pyarrow/includes/libarrow.pxd | 9 ++
python/pyarrow/lib.pxd | 3 +
python/pyarrow/public-api.pxi | 2 +
python/pyarrow/scalar.pxi | 23 ++++-
python/pyarrow/tests/test_extension_type.py | 152 ++++++++++++++++++++++++++++
python/pyarrow/tests/test_misc.py | 3 +
python/pyarrow/types.pxi | 74 ++++++++++++++
15 files changed, 604 insertions(+), 7 deletions(-)
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index fb785e1e95..fb7253b6fd 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -906,6 +906,7 @@ endif()
if(ARROW_JSON)
arrow_add_object_library(ARROW_JSON
+ extension/bool8.cc
extension/fixed_shape_tensor.cc
extension/opaque.cc
json/options.cc
diff --git a/cpp/src/arrow/extension/CMakeLists.txt
b/cpp/src/arrow/extension/CMakeLists.txt
index 6741ab602f..fcd5fa529a 100644
--- a/cpp/src/arrow/extension/CMakeLists.txt
+++ b/cpp/src/arrow/extension/CMakeLists.txt
@@ -15,6 +15,12 @@
# specific language governing permissions and limitations
# under the License.
+# Build bool8_test.cc as a test target under the "arrow-extension-bool8" prefix.
+add_arrow_test(test
+               SOURCES
+               bool8_test.cc
+               PREFIX
+               "arrow-extension-bool8")
+
add_arrow_test(test
SOURCES
fixed_shape_tensor_test.cc
diff --git a/cpp/src/arrow/extension/bool8.cc b/cpp/src/arrow/extension/bool8.cc
new file mode 100644
index 0000000000..c081f0c2b2
--- /dev/null
+++ b/cpp/src/arrow/extension/bool8.cc
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sstream>
+
+#include "arrow/extension/bool8.h"
+#include "arrow/util/logging.h"
+
+namespace arrow::extension {
+
+// Two extension types are considered equal here when their extension names match.
+bool Bool8Type::ExtensionEquals(const ExtensionType& other) const {
+  const std::string other_name = other.extension_name();
+  return this->extension_name() == other_name;
+}
+
+// Render the type as "extension<NAME>". `show_metadata` does not affect the output.
+std::string Bool8Type::ToString(bool show_metadata) const {
+  std::string repr = "extension<";
+  repr += this->extension_name();
+  repr += ">";
+  return repr;
+}
+
+// The serialized form of Bool8Type is always the empty string.
+std::string Bool8Type::Serialize() const {
+  return std::string();
+}
+
+// Rebuild a Bool8Type from a storage type and its serialized form.
+//
+// Returns Status::Invalid when `storage_type` is not INT8 or when
+// `serialized_data` is not empty; the storage type is checked first.
+Result<std::shared_ptr<DataType>> Bool8Type::Deserialize(
+    std::shared_ptr<DataType> storage_type, const std::string& serialized_data) const {
+  const bool has_int8_storage = storage_type->id() == Type::INT8;
+  if (!has_int8_storage) {
+    return Status::Invalid("Expected INT8 storage type, got ", storage_type->ToString());
+  }
+  if (!serialized_data.empty()) {
+    return Status::Invalid("Serialize data must be empty, got ", serialized_data);
+  }
+  return bool8();
+}
+
+// Wrap `data` in a Bool8Array. The DCHECKs assert that `data` is typed as an
+// extension whose name is "arrow.bool8".
+std::shared_ptr<Array> Bool8Type::MakeArray(std::shared_ptr<ArrayData> data) const {
+  DCHECK_EQ(data->type->id(), Type::EXTENSION);
+  DCHECK_EQ("arrow.bool8",
+            internal::checked_cast<const ExtensionType&>(*data->type).extension_name());
+  std::shared_ptr<Array> wrapped = std::make_shared<Bool8Array>(data);
+  return wrapped;
+}
+
+// Factory returning a newly allocated Bool8Type wrapped in a Result.
+Result<std::shared_ptr<DataType>> Bool8Type::Make() {
+  std::shared_ptr<DataType> type = std::make_shared<Bool8Type>();
+  return type;
+}
+
+// Convenience factory: return a newly allocated Bool8Type as a DataType pointer.
+std::shared_ptr<DataType> bool8() {
+  std::shared_ptr<DataType> type = std::make_shared<Bool8Type>();
+  return type;
+}
+
+} // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/bool8.h b/cpp/src/arrow/extension/bool8.h
new file mode 100644
index 0000000000..02e629b28a
--- /dev/null
+++ b/cpp/src/arrow/extension/bool8.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/extension_type.h"
+
+namespace arrow::extension {
+
+/// \brief Array class for the Bool8 extension type.
+///
+/// It inherits ExtensionArray's constructors and adds no members of its own.
+class ARROW_EXPORT Bool8Array : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+/// \brief Bool8 is an alternate representation for boolean
+/// arrays using 8 bits instead of 1 bit per value. The underlying
+/// storage type is int8.
+class ARROW_EXPORT Bool8Type : public ExtensionType {
+ public:
+  /// \brief Construct a Bool8Type.
+  Bool8Type() : ExtensionType(int8()) {}
+
+  /// \brief Return the extension name, "arrow.bool8".
+  std::string extension_name() const override { return "arrow.bool8"; }
+
+  /// \brief Return a string representation of the type.
+  std::string ToString(bool show_metadata = false) const override;
+
+  /// \brief Compare this type with another extension type.
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  /// \brief Return the serialized representation of the type.
+  std::string Serialize() const override;
+
+  /// \brief Rebuild the type from a storage type and serialized data.
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized_data) const override;
+
+  /// Create a Bool8Array from ArrayData
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  /// \brief Create a Bool8Type instance wrapped in a Result.
+  static Result<std::shared_ptr<DataType>> Make();
+};
+
+/// \brief Return a Bool8Type instance.
+ARROW_EXPORT std::shared_ptr<DataType> bool8();
+
+}  // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/bool8_test.cc
b/cpp/src/arrow/extension/bool8_test.cc
new file mode 100644
index 0000000000..eabcfcf62d
--- /dev/null
+++ b/cpp/src/arrow/extension/bool8_test.cc
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/extension/bool8.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/testing/extension_type.h"
+#include "arrow/testing/gtest_util.h"
+
+namespace arrow {
+
+// Name, equality, storage type, serialization and string form of Bool8Type.
+TEST(Bool8Type, Basics) {
+  auto bool8_type =
+      internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+  auto other_bool8_type =
+      internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+
+  ASSERT_EQ("arrow.bool8", bool8_type->extension_name());
+
+  // Equal to itself and to a second instance, but not to an unrelated type.
+  ASSERT_EQ(*bool8_type, *bool8_type);
+  ASSERT_NE(*arrow::null(), *bool8_type);
+  ASSERT_EQ(*bool8_type, *other_bool8_type);
+
+  ASSERT_EQ(*arrow::int8(), *bool8_type->storage_type());
+  ASSERT_EQ("", bool8_type->Serialize());
+  ASSERT_EQ("extension<arrow.bool8>", bool8_type->ToString(false));
+}
+
+// Wrapping an int8 storage array keeps its length and null count.
+TEST(Bool8Type, CreateFromArray) {
+  auto bool8_type =
+      internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+  auto int8_storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]");
+  auto wrapped = ExtensionType::WrapArray(bool8_type, int8_storage);
+
+  ASSERT_EQ(5, wrapped->length());
+  ASSERT_EQ(1, wrapped->null_count());
+}
+
+// Deserialize accepts int8 storage with empty data and rejects anything else.
+TEST(Bool8Type, Deserialize) {
+  auto bool8_type =
+      internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+  const auto storage_type = bool8_type->storage_type();
+
+  // Accepted: int8 storage with empty serialized data.
+  ASSERT_OK_AND_ASSIGN(auto deserialized, bool8_type->Deserialize(storage_type, ""));
+  ASSERT_EQ(*bool8_type, *deserialized);
+
+  // Rejected: non-empty serialized data.
+  ASSERT_NOT_OK(bool8_type->Deserialize(storage_type, "must be empty"));
+  ASSERT_EQ(*bool8_type, *deserialized);
+
+  // Rejected: a storage type other than int8.
+  ASSERT_NOT_OK(bool8_type->Deserialize(uint8(), ""));
+  ASSERT_EQ(*bool8_type, *deserialized);
+}
+
+// Serialize() output fed back into Deserialize() yields an equal type.
+TEST(Bool8Type, MetadataRoundTrip) {
+  auto bool8_type =
+      internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+  const std::string metadata = bool8_type->Serialize();
+
+  ASSERT_OK_AND_ASSIGN(auto round_tripped,
+                       bool8_type->Deserialize(bool8_type->storage_type(), metadata));
+  ASSERT_EQ(*bool8_type, *round_tripped);
+}
+
+// A record batch holding a bool8 column survives an IPC stream round trip.
+TEST(Bool8Type, BatchRoundTrip) {
+  auto bool8_type =
+      internal::checked_pointer_cast<extension::Bool8Type>(extension::bool8());
+
+  auto int8_storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]");
+  auto bool8_array = ExtensionType::WrapArray(bool8_type, int8_storage);
+  auto batch_schema = schema({field("field", bool8_type)});
+  auto batch = RecordBatch::Make(batch_schema, bool8_array->length(), {bool8_array});
+
+  std::shared_ptr<RecordBatch> round_tripped;
+  {
+    // Write the batch to an in-memory IPC stream.
+    ASSERT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create());
+    ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
+                                          sink.get()));
+    ASSERT_OK_AND_ASSIGN(auto ipc_buffer, sink->Finish());
+
+    // Read the first batch back from that buffer.
+    io::BufferReader source(ipc_buffer);
+    ASSERT_OK_AND_ASSIGN(auto batch_reader,
+                         ipc::RecordBatchStreamReader::Open(&source));
+    ASSERT_OK(batch_reader->ReadNext(&round_tripped));
+  }
+
+  ASSERT_EQ(*batch->schema(), *round_tripped->schema());
+  ASSERT_BATCHES_EQUAL(*batch, *round_tripped);
+}
+
+} // namespace arrow
diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc
index cf8dda7a85..685018f7de 100644
--- a/cpp/src/arrow/extension_type.cc
+++ b/cpp/src/arrow/extension_type.cc
@@ -28,6 +28,7 @@
#include "arrow/chunked_array.h"
#include "arrow/config.h"
#ifdef ARROW_JSON
+#include "arrow/extension/bool8.h"
#include "arrow/extension/fixed_shape_tensor.h"
#endif
#include "arrow/status.h"
@@ -146,10 +147,12 @@ static void CreateGlobalRegistry() {
#ifdef ARROW_JSON
// Register canonical extension types
- auto ext_type =
+ auto fst_ext_type =
checked_pointer_cast<ExtensionType>(extension::fixed_shape_tensor(int64(), {}));
+ ARROW_CHECK_OK(g_registry->RegisterType(fst_ext_type));
- ARROW_CHECK_OK(g_registry->RegisterType(ext_type));
+ auto bool8_ext_type =
checked_pointer_cast<ExtensionType>(extension::bool8());
+ ARROW_CHECK_OK(g_registry->RegisterType(bool8_ext_type));
#endif
}
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index aa7bab9f97..807bcdc315 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -174,6 +174,7 @@ from pyarrow.lib import (null, bool_,
run_end_encoded,
fixed_shape_tensor,
opaque,
+ bool8,
field,
type_for_alias,
DataType, DictionaryType, StructType,
@@ -184,7 +185,7 @@ from pyarrow.lib import (null, bool_,
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
RunEndEncodedType, FixedShapeTensorType, OpaqueType,
- PyExtensionType, UnknownExtensionType,
+ Bool8Type, PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
KeyValueMetadata,
@@ -218,7 +219,7 @@ from pyarrow.lib import (null, bool_,
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray,
ExtensionArray,
RunEndEncodedArray, FixedShapeTensorArray,
OpaqueArray,
- scalar, NA, _NULL as NULL, Scalar,
+ Bool8Array, scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar,
@@ -235,7 +236,7 @@ from pyarrow.lib import (null, bool_,
FixedSizeBinaryScalar, DictionaryScalar,
MapScalar, StructScalar, UnionScalar,
RunEndEncodedScalar, ExtensionScalar,
- FixedShapeTensorScalar, OpaqueScalar)
+ FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar)
# Buffers, allocation
from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 6c40a21db9..4c3eb93232 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1581,7 +1581,7 @@ cdef class Array(_PandasConvertible):
def to_numpy(self, zero_copy_only=True, writable=False):
"""
- Return a NumPy view or copy of this array (experimental).
+ Return a NumPy view or copy of this array.
By default, tries to return a view of this array. This is only
supported for primitive arrays with the same memory layout as NumPy
@@ -4476,6 +4476,118 @@ cdef class OpaqueArray(ExtensionArray):
"""
+cdef class Bool8Array(ExtensionArray):
+    """
+    Concrete class for bool8 extension arrays.
+
+    Examples
+    --------
+    Define the extension type for a bool8 array
+
+    >>> import pyarrow as pa
+    >>> bool8_type = pa.bool8()
+
+    Create an extension array
+
+    >>> arr = [-1, 0, 1, 2, None]
+    >>> storage = pa.array(arr, pa.int8())
+    >>> pa.ExtensionArray.from_storage(bool8_type, storage)
+    <pyarrow.lib.Bool8Array object at ...>
+    [
+      -1,
+      0,
+      1,
+      2,
+      null
+    ]
+    """
+
+    def to_numpy(self, zero_copy_only=True, writable=False):
+        """
+        Return a NumPy bool view or copy of this array.
+
+        By default, tries to return a view of this array. This is only
+        supported for arrays without any nulls.
+
+        Parameters
+        ----------
+        zero_copy_only : bool, default True
+            If True, an exception will be raised if the conversion to a numpy
+            array would require copying the underlying data (e.g. in presence
+            of nulls).
+        writable : bool, default False
+            For numpy arrays created with zero copy (view on the Arrow data),
+            the resulting array is not writable (Arrow data is immutable).
+            By setting this to True, a copy of the array is made to ensure
+            it is writable.
+
+        Returns
+        -------
+        array : numpy.ndarray
+        """
+        if not writable:
+            # Fast path: reinterpret the int8 storage as numpy bools.
+            try:
+                storage_np = self.storage.to_numpy()
+            except ArrowInvalid as exc:
+                # Re-raise only when the caller insisted on zero copy;
+                # otherwise fall through to the copying conversion below.
+                if zero_copy_only:
+                    raise exc
+            else:
+                return storage_np.view(np.bool_)
+
+        # Copying path: compute `storage != 0` and convert that result.
+        nonzero = _pc().not_equal(self.storage, 0)
+        return nonzero.to_numpy(zero_copy_only=zero_copy_only, writable=writable)
+
+    @staticmethod
+    def from_storage(Int8Array storage):
+        """
+        Construct Bool8Array from Int8Array storage.
+
+        Parameters
+        ----------
+        storage : Int8Array
+            The underlying storage for the result array.
+
+        Returns
+        -------
+        bool8_array : Bool8Array
+        """
+        bool8_type = bool8()
+        return ExtensionArray.from_storage(bool8_type, storage)
+
+    @staticmethod
+    def from_numpy(obj):
+        """
+        Convert numpy array to a bool8 extension array without making a copy.
+        The input array must be 1-dimensional, with either bool_ or int8 dtype.
+
+        Parameters
+        ----------
+        obj : numpy.ndarray
+
+        Returns
+        -------
+        bool8_array : Bool8Array
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import numpy as np
+        >>> arr = np.array([True, False, True], dtype=np.bool_)
+        >>> pa.Bool8Array.from_numpy(arr)
+        <pyarrow.lib.Bool8Array object at ...>
+        [
+          1,
+          0,
+          1
+        ]
+        """
+        # Validate dimensionality first, then dtype.
+        if obj.ndim != 1:
+            raise ValueError(f"Cannot convert {obj.ndim}-D array to bool8 array")
+
+        supported_dtypes = [np.bool_, np.int8]
+        if obj.dtype not in supported_dtypes:
+            raise TypeError(f"Array dtype {obj.dtype} incompatible with bool8 storage")
+
+        # View the input as int8 and wrap the resulting int8 array as bool8 storage.
+        int8_view = obj.view(np.int8)
+        storage_arr = array(int8_view, type=int8())
+        return Bool8Array.from_storage(storage_arr)
+
+
cdef dict _array_classes = {
_Type_NA: NullArray,
_Type_BOOL: BooleanArray,
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index 9b008d150f..a54a1db292 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2895,6 +2895,15 @@ cdef extern from "arrow/extension/opaque.h" namespace
"arrow::extension" nogil:
pass
+# C++ declarations for the Bool8 extension type and array (arrow/extension/bool8.h).
+cdef extern from "arrow/extension/bool8.h" namespace "arrow::extension" nogil:
+    cdef cppclass CBool8Type" arrow::extension::Bool8Type"(CExtensionType):
+        # Static factory declared as returning a Result-wrapped data type.
+        @staticmethod
+        CResult[shared_ptr[CDataType]] Make()
+
+    cdef cppclass CBool8Array" arrow::extension::Bool8Array"(CExtensionArray):
+        pass
+
cdef extern from "arrow/util/compression.h" namespace "arrow" nogil:
cdef enum CCompressionType" arrow::Compression::type":
CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED"
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 2cb302d20a..e3625c1815 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -214,6 +214,9 @@ cdef class FixedShapeTensorType(BaseExtensionType):
cdef:
const CFixedShapeTensorType* tensor_ext_type
+cdef class Bool8Type(BaseExtensionType):
+    # Pointer to the wrapped C++ Bool8Type.
+    cdef const CBool8Type* bool8_ext_type
cdef class OpaqueType(BaseExtensionType):
cdef:
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 2f9fc1c554..19a26bd6c6 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -126,6 +126,8 @@ cdef api object pyarrow_wrap_data_type(
out = FixedShapeTensorType.__new__(FixedShapeTensorType)
elif ext_type.extension_name() == b"arrow.opaque":
out = OpaqueType.__new__(OpaqueType)
+ elif ext_type.extension_name() == b"arrow.bool8":
+ out = Bool8Type.__new__(Bool8Type)
else:
out = BaseExtensionType.__new__(BaseExtensionType)
else:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 12a99c2aec..72ae2aee5f 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -1091,6 +1091,18 @@ cdef class OpaqueScalar(ExtensionScalar):
"""
+cdef class Bool8Scalar(ExtensionScalar):
+    """
+    Concrete class for bool8 extension scalar.
+    """
+
+    def as_py(self):
+        """
+        Return this scalar as a Python object.
+
+        A None storage value stays None; any other value is returned as
+        the result of comparing it against zero.
+        """
+        storage_value = super().as_py()
+        if storage_value is None:
+            return None
+        return storage_value != 0
+
cdef dict _scalar_classes = {
_Type_BOOL: BooleanScalar,
_Type_UINT8: UInt8Scalar,
@@ -1199,6 +1211,11 @@ def scalar(value, type=None, *, from_pandas=None,
MemoryPool memory_pool=None):
type = ensure_type(type, allow_none=True)
pool = maybe_unbox_memory_pool(memory_pool)
+ extension_type = None
+ if type is not None and type.id == _Type_EXTENSION:
+ extension_type = type
+ type = type.storage_type
+
if _is_array_like(value):
value = get_values(value, &is_pandas_object)
@@ -1223,4 +1240,8 @@ def scalar(value, type=None, *, from_pandas=None,
MemoryPool memory_pool=None):
# retrieve the scalar from the first position
scalar = GetResultValue(array.get().GetScalar(0))
- return Scalar.wrap(scalar)
+ result = Scalar.wrap(scalar)
+
+ if extension_type is not None:
+ result = ExtensionScalar.from_storage(extension_type, result)
+ return result
diff --git a/python/pyarrow/tests/test_extension_type.py
b/python/pyarrow/tests/test_extension_type.py
index 58c54189f2..b04ee85ec9 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -1707,3 +1707,155 @@ def test_opaque_type(pickle_module, storage_type,
storage):
# cast extension type -> storage type
inner = arr.cast(storage_type)
assert inner == storage
+
+
+def test_bool8_type(pickle_module):
+ bool8_type = pa.bool8()
+ storage_type = pa.int8()
+ assert bool8_type.extension_name == "arrow.bool8"
+ assert bool8_type.storage_type == storage_type
+ assert str(bool8_type) == "extension<arrow.bool8>"
+
+ assert bool8_type == bool8_type
+ assert bool8_type == pa.bool8()
+ assert bool8_type != storage_type
+
+ # Pickle roundtrip
+ result = pickle_module.loads(pickle_module.dumps(bool8_type))
+ assert result == bool8_type
+
+ # IPC roundtrip
+ storage = pa.array([-1, 0, 1, 2, None], storage_type)
+ arr = pa.ExtensionArray.from_storage(bool8_type, storage)
+ assert isinstance(arr, pa.Bool8Array)
+
+ # extension is registered by default
+ buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"]))
+ batch = ipc_read_batch(buf)
+
+ assert batch.column(0).type.extension_name == "arrow.bool8"
+ assert isinstance(batch.column(0), pa.Bool8Array)
+
+ # cast storage -> extension type
+ result = storage.cast(bool8_type)
+ assert result == arr
+
+ # cast extension type -> storage type
+ inner = arr.cast(storage_type)
+ assert inner == storage
+
+
+def test_bool8_to_bool_conversion():
+ bool_arr = pa.array([True, False, True, True, None], pa.bool_())
+ bool8_arr = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2, None], pa.int8()),
+ )
+
+ # cast extension type -> arrow boolean type
+ assert bool8_arr.cast(pa.bool_()) == bool_arr
+
+ # cast arrow boolean type -> extension type, expecting canonical values
+ canonical_storage = pa.array([1, 0, 1, 1, None], pa.int8())
+ canonical_bool8_arr = pa.ExtensionArray.from_storage(pa.bool8(),
canonical_storage)
+ assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr
+
+
+def test_bool8_to_numpy_conversion():
+ arr = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2, None], pa.int8()),
+ )
+
+ # cannot zero-copy with nulls
+ with pytest.raises(
+ pa.ArrowInvalid,
+ match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was
True",
+ ):
+ arr.to_numpy()
+
+ # nullable conversion possible with a copy, but dest dtype is object
+ assert np.array_equal(
+ arr.to_numpy(zero_copy_only=False),
+ np.array([True, False, True, True, None], dtype=np.object_),
+ )
+
+ # zero-copy possible with non-null array
+ np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_)
+ arr_no_nulls = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2], pa.int8()),
+ )
+
+ arr_to_np = arr_no_nulls.to_numpy()
+ assert np.array_equal(arr_to_np, np_arr_no_nulls)
+
+ # same underlying buffer
+ assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address
+
+ # if the user requests a writable array, a copy should be performed
+ arr_to_np_writable = arr_no_nulls.to_numpy(zero_copy_only=False,
writable=True)
+ assert np.array_equal(arr_to_np_writable, np_arr_no_nulls)
+
+ # different underlying buffer
+ assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address
+
+
+def test_bool8_from_numpy_conversion():
+ np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_)
+ canonical_bool8_arr_no_nulls = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([1, 0, 1, 1], pa.int8()),
+ )
+
+ arr_from_np = pa.Bool8Array.from_numpy(np_arr_no_nulls)
+ assert arr_from_np == canonical_bool8_arr_no_nulls
+
+ # same underlying buffer
+ assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data
+
+ # conversion only valid for 1-D arrays
+ with pytest.raises(
+ ValueError,
+ match="Cannot convert 2-D array to bool8 array",
+ ):
+ pa.Bool8Array.from_numpy(
+ np.array([[True, False], [False, True]], dtype=np.bool_),
+ )
+
+ with pytest.raises(
+ ValueError,
+ match="Cannot convert 0-D array to bool8 array",
+ ):
+ pa.Bool8Array.from_numpy(np.bool_())
+
+ # must use compatible storage type
+ with pytest.raises(
+ TypeError,
+ match="Array dtype float64 incompatible with bool8 storage",
+ ):
+ pa.Bool8Array.from_numpy(np.array([1, 2, 3], dtype=np.float64))
+
+
+def test_bool8_scalar():
+ assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() is True
+ assert pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py() is False
+ assert pa.ExtensionScalar.from_storage(pa.bool8(), 1).as_py() is True
+ assert pa.ExtensionScalar.from_storage(pa.bool8(), 2).as_py() is True
+ assert pa.ExtensionScalar.from_storage(pa.bool8(), None).as_py() is None
+
+ arr = pa.ExtensionArray.from_storage(
+ pa.bool8(),
+ pa.array([-1, 0, 1, 2, None], pa.int8()),
+ )
+ assert arr[0].as_py() is True
+ assert arr[1].as_py() is False
+ assert arr[2].as_py() is True
+ assert arr[3].as_py() is True
+ assert arr[4].as_py() is None
+
+ assert pa.scalar(-1, type=pa.bool8()).as_py() is True
+ assert pa.scalar(0, type=pa.bool8()).as_py() is False
+ assert pa.scalar(1, type=pa.bool8()).as_py() is True
+ assert pa.scalar(2, type=pa.bool8()).as_py() is True
+ assert pa.scalar(None, type=pa.bool8()).as_py() is None
diff --git a/python/pyarrow/tests/test_misc.py
b/python/pyarrow/tests/test_misc.py
index 9a55a38177..5d3471c7c3 100644
--- a/python/pyarrow/tests/test_misc.py
+++ b/python/pyarrow/tests/test_misc.py
@@ -250,6 +250,9 @@ def test_set_timezone_db_path_non_windows():
pa.OpaqueArray,
pa.OpaqueScalar,
pa.OpaqueType,
+ pa.Bool8Array,
+ pa.Bool8Scalar,
+ pa.Bool8Type,
])
def test_extension_type_constructor_errors(klass):
# ARROW-2638: prevent calling extension class constructors directly
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index dcd2b61c33..563782f0c2 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1837,6 +1837,37 @@ cdef class FixedShapeTensorType(BaseExtensionType):
return FixedShapeTensorScalar
+cdef class Bool8Type(BaseExtensionType):
+    """
+    Concrete class for bool8 extension type.
+
+    Bool8 is an alternate representation for boolean
+    arrays using 8 bits instead of 1 bit per value. The underlying
+    storage type is int8.
+
+    Examples
+    --------
+    Create an instance of bool8 extension type:
+
+    >>> import pyarrow as pa
+    >>> pa.bool8()
+    Bool8Type(extension<arrow.bool8>)
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        # Run the base initialisation, then keep a pointer typed as CBool8Type.
+        BaseExtensionType.init(self, type)
+        self.bool8_ext_type = <const CBool8Type*> type.get()
+
+    def __arrow_ext_class__(self):
+        # Array class used for arrays of this type.
+        return Bool8Array
+
+    def __arrow_ext_scalar_class__(self):
+        # Scalar class used for scalars of this type.
+        return Bool8Scalar
+
+    def __reduce__(self):
+        # Pickle by calling the argument-less `bool8` factory.
+        return (bool8, ())
+
+
cdef class OpaqueType(BaseExtensionType):
"""
Concrete class for opaque extension type.
@@ -5278,6 +5309,49 @@ def fixed_shape_tensor(DataType value_type, shape,
dim_names=None, permutation=N
return out
+def bool8():
+    """
+    Create instance of bool8 extension type.
+
+    Returns
+    -------
+    type : Bool8Type
+
+    Examples
+    --------
+    Create an instance of bool8 extension type:
+
+    >>> import pyarrow as pa
+    >>> type = pa.bool8()
+    >>> type
+    Bool8Type(extension<arrow.bool8>)
+
+    Inspect the data type:
+
+    >>> type.storage_type
+    DataType(int8)
+
+    Create a table with a bool8 array:
+
+    >>> arr = [-1, 0, 1, 2, None]
+    >>> storage = pa.array(arr, pa.int8())
+    >>> other = pa.ExtensionArray.from_storage(type, storage)
+    >>> pa.table([other], names=["unknown_col"])
+    pyarrow.Table
+    unknown_col: extension<arrow.bool8>
+    ----
+    unknown_col: [[-1,0,1,2,null]]
+    """
+
+    cdef:
+        shared_ptr[CDataType] c_type = GetResultValue(CBool8Type.Make())
+        Bool8Type out = Bool8Type.__new__(Bool8Type)
+
+    # Bind the Python wrapper to the C++ type created above.
+    out.init(c_type)
+    return out
+
+
def opaque(DataType storage_type, str type_name not None, str vendor_name not
None):
"""
Create instance of opaque extension type.