This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2328b6ee39 GH-15058: [C++][Python] Native support for UUID (#37298)
2328b6ee39 is described below
commit 2328b6ee39b497d9f48e6d342db9f7d0c34d9791
Author: Rok Mihevc <[email protected]>
AuthorDate: Mon Aug 26 16:34:18 2024 +0200
GH-15058: [C++][Python] Native support for UUID (#37298)
### Rationale for this change
See #15058.
The UUID datatype is common throughout the ecosystem, and supporting it as a
native type in Arrow would reduce friction.
### What changes are included in this PR?
This PR implements the logic for the Arrow canonical UUID extension type in
C++ and adds a Python wrapper.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes, a new extension type is added.
* Closes: #15058
Authored-by: Rok Mihevc <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/CMakeLists.txt | 3 +-
cpp/src/arrow/acero/hash_join_node_test.cc | 1 +
cpp/src/arrow/extension/CMakeLists.txt | 2 +-
cpp/src/arrow/extension/fixed_shape_tensor_test.cc | 17 +---
cpp/src/arrow/extension/uuid.cc | 58 ++++++++++++
cpp/src/arrow/extension/uuid.h | 61 ++++++++++++
cpp/src/arrow/extension/uuid_test.cc | 72 ++++++++++++++
cpp/src/arrow/extension_type.cc | 4 +-
cpp/src/arrow/extension_type_test.cc | 19 +---
cpp/src/arrow/integration/json_integration_test.cc | 2 +-
cpp/src/arrow/ipc/test_common.cc | 35 +++++--
cpp/src/arrow/ipc/test_common.h | 3 +
cpp/src/arrow/scalar_test.cc | 5 +-
cpp/src/arrow/testing/extension_type.h | 6 +-
cpp/src/arrow/testing/gtest_util.cc | 16 ++--
dev/archery/archery/integration/datagen.py | 2 +-
docs/source/format/CanonicalExtensions.rst | 2 +
docs/source/status.rst | 2 +-
python/pyarrow/__init__.py | 18 ++--
python/pyarrow/array.pxi | 6 ++
python/pyarrow/includes/libarrow.pxd | 10 ++
python/pyarrow/lib.pxd | 3 +
python/pyarrow/public-api.pxi | 11 ++-
python/pyarrow/scalar.pxi | 10 ++
python/pyarrow/src/arrow/python/gdb.cc | 27 +-----
python/pyarrow/tests/extensions.pyx | 2 +-
python/pyarrow/tests/test_extension_type.py | 105 ++++++++++++++-------
python/pyarrow/tests/test_gdb.py | 8 +-
python/pyarrow/types.pxi | 34 +++++++
29 files changed, 412 insertions(+), 132 deletions(-)
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 89f28ee416..6b0ac8c23c 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -375,6 +375,7 @@ set(ARROW_SRCS
device.cc
extension_type.cc
extension/bool8.cc
+ extension/uuid.cc
pretty_print.cc
record_batch.cc
result.cc
@@ -1225,6 +1226,7 @@ add_subdirectory(testing)
add_subdirectory(array)
add_subdirectory(c)
add_subdirectory(compute)
+add_subdirectory(extension)
add_subdirectory(io)
add_subdirectory(tensor)
add_subdirectory(util)
@@ -1267,7 +1269,6 @@ endif()
if(ARROW_JSON)
add_subdirectory(json)
- add_subdirectory(extension)
endif()
if(ARROW_ORC)
diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc
b/cpp/src/arrow/acero/hash_join_node_test.cc
index 9065e286a2..76ad9c7d65 100644
--- a/cpp/src/arrow/acero/hash_join_node_test.cc
+++ b/cpp/src/arrow/acero/hash_join_node_test.cc
@@ -29,6 +29,7 @@
#include "arrow/compute/kernels/test_util.h"
#include "arrow/compute/light_array_internal.h"
#include "arrow/compute/row/row_encoder_internal.h"
+#include "arrow/extension/uuid.h"
#include "arrow/testing/extension_type.h"
#include "arrow/testing/generator.h"
#include "arrow/testing/gtest_util.h"
diff --git a/cpp/src/arrow/extension/CMakeLists.txt
b/cpp/src/arrow/extension/CMakeLists.txt
index 5cb4bc77af..065ea3f1dd 100644
--- a/cpp/src/arrow/extension/CMakeLists.txt
+++ b/cpp/src/arrow/extension/CMakeLists.txt
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-set(CANONICAL_EXTENSION_TESTS bool8_test.cc)
+set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc)
if(ARROW_JSON)
list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc
opaque_test.cc)
diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc
b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc
index 3fd39a11ff..842a78e1a4 100644
--- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc
+++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc
@@ -23,7 +23,7 @@
#include "arrow/array/array_primitive.h"
#include "arrow/io/memory.h"
#include "arrow/ipc/reader.h"
-#include "arrow/ipc/writer.h"
+#include "arrow/ipc/test_common.h"
#include "arrow/record_batch.h"
#include "arrow/tensor.h"
#include "arrow/testing/gtest_util.h"
@@ -33,6 +33,7 @@
namespace arrow {
using FixedShapeTensorType = extension::FixedShapeTensorType;
+using arrow::ipc::test::RoundtripBatch;
using extension::fixed_shape_tensor;
using extension::FixedShapeTensorArray;
@@ -71,20 +72,6 @@ class TestExtensionType : public ::testing::Test {
std::string serialized_;
};
-auto RoundtripBatch = [](const std::shared_ptr<RecordBatch>& batch,
- std::shared_ptr<RecordBatch>* out) {
- ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
- ASSERT_OK(ipc::WriteRecordBatchStream({batch},
ipc::IpcWriteOptions::Defaults(),
- out_stream.get()));
-
- ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());
-
- io::BufferReader reader(complete_ipc_stream);
- std::shared_ptr<RecordBatchReader> batch_reader;
- ASSERT_OK_AND_ASSIGN(batch_reader,
ipc::RecordBatchStreamReader::Open(&reader));
- ASSERT_OK(batch_reader->ReadNext(out));
-};
-
TEST_F(TestExtensionType, CheckDummyRegistration) {
// We need a registered dummy type at runtime to allow for IPC
deserialization
auto registered_type = GetExtensionType("arrow.fixed_shape_tensor");
diff --git a/cpp/src/arrow/extension/uuid.cc b/cpp/src/arrow/extension/uuid.cc
new file mode 100644
index 0000000000..43b917a17f
--- /dev/null
+++ b/cpp/src/arrow/extension/uuid.cc
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sstream>
+
+#include "arrow/extension_type.h"
+#include "arrow/util/logging.h"
+
+#include "arrow/extension/uuid.h"
+
+namespace arrow::extension {
+
+bool UuidType::ExtensionEquals(const ExtensionType& other) const {
+ return (other.extension_name() == this->extension_name());
+}
+
+std::shared_ptr<Array> UuidType::MakeArray(std::shared_ptr<ArrayData> data)
const {
+ DCHECK_EQ(data->type->id(), Type::EXTENSION);
+ DCHECK_EQ("arrow.uuid",
+ static_cast<const ExtensionType&>(*data->type).extension_name());
+ return std::make_shared<UuidArray>(data);
+}
+
+Result<std::shared_ptr<DataType>> UuidType::Deserialize(
+ std::shared_ptr<DataType> storage_type, const std::string& serialized)
const {
+ if (!serialized.empty()) {
+ return Status::Invalid("Unexpected serialized metadata: '", serialized,
"'");
+ }
+ if (!storage_type->Equals(*fixed_size_binary(16))) {
+ return Status::Invalid("Invalid storage type for UuidType: ",
+ storage_type->ToString());
+ }
+ return std::make_shared<UuidType>();
+}
+
+std::string UuidType::ToString(bool show_metadata) const {
+ std::stringstream ss;
+ ss << "extension<" << this->extension_name() << ">";
+ return ss.str();
+}
+
+std::shared_ptr<DataType> uuid() { return std::make_shared<UuidType>(); }
+
+} // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/uuid.h b/cpp/src/arrow/extension/uuid.h
new file mode 100644
index 0000000000..42bb21cf0b
--- /dev/null
+++ b/cpp/src/arrow/extension/uuid.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/extension_type.h"
+
+namespace arrow::extension {
+
+/// \brief UuidArray stores array of UUIDs. Underlying storage type is
+/// FixedSizeBinary(16).
+class ARROW_EXPORT UuidArray : public ExtensionArray {
+ public:
+ using ExtensionArray::ExtensionArray;
+};
+
+/// \brief UuidType is a canonical arrow extension type for UUIDs.
+/// UUIDs are stored as FixedSizeBinary(16) with big-endian notation and this
+/// does not interpret the bytes in any way. Specific UUID version is not
+/// required or guaranteed.
+class ARROW_EXPORT UuidType : public ExtensionType {
+ public:
+ /// \brief Construct a UuidType.
+ UuidType() : ExtensionType(fixed_size_binary(16)) {}
+
+ std::string extension_name() const override { return "arrow.uuid"; }
+ std::string ToString(bool show_metadata = false) const override;
+
+ bool ExtensionEquals(const ExtensionType& other) const override;
+
+ /// Create a UuidArray from ArrayData
+ std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const
override;
+
+ Result<std::shared_ptr<DataType>> Deserialize(
+ std::shared_ptr<DataType> storage_type,
+ const std::string& serialized) const override;
+
+ std::string Serialize() const override { return ""; }
+
+ /// \brief Create a UuidType instance
+ static Result<std::shared_ptr<DataType>> Make() { return
std::make_shared<UuidType>(); }
+};
+
+/// \brief Return a UuidType instance.
+ARROW_EXPORT std::shared_ptr<DataType> uuid();
+
+} // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/uuid_test.cc
b/cpp/src/arrow/extension/uuid_test.cc
new file mode 100644
index 0000000000..3bbb6eeb4a
--- /dev/null
+++ b/cpp/src/arrow/extension/uuid_test.cc
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/extension/uuid.h"
+
+#include "arrow/testing/matchers.h"
+
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/test_common.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/key_value_metadata.h"
+
+#include "arrow/testing/extension_type.h"
+
+namespace arrow {
+
+using arrow::ipc::test::RoundtripBatch;
+
+TEST(TestUuuidExtensionType, ExtensionTypeTest) {
+ auto type = uuid();
+ ASSERT_EQ(type->id(), Type::EXTENSION);
+
+ const auto& ext_type = static_cast<const ExtensionType&>(*type);
+ std::string serialized = ext_type.Serialize();
+
+ ASSERT_OK_AND_ASSIGN(auto deserialized,
+ ext_type.Deserialize(fixed_size_binary(16),
serialized));
+ ASSERT_TRUE(deserialized->Equals(*type));
+ ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16)));
+}
+
+TEST(TestUuuidExtensionType, RoundtripBatch) {
+ auto ext_type = extension::uuid();
+ auto exact_ext_type =
internal::checked_pointer_cast<extension::UuidType>(ext_type);
+ auto arr = ArrayFromJSON(fixed_size_binary(16), R"(["abcdefghijklmnop",
null])");
+ auto ext_arr = ExtensionType::WrapArray(ext_type, arr);
+
+ // Pass extension array, expect getting back extension array
+ std::shared_ptr<RecordBatch> read_batch;
+ auto ext_field = field(/*name=*/"f0", /*type=*/ext_type);
+ auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(),
{ext_arr});
+ RoundtripBatch(batch, &read_batch);
+ CompareBatch(*batch, *read_batch, /*compare_metadata=*/true);
+
+ // Pass extension metadata and storage array, expect getting back extension
array
+ std::shared_ptr<RecordBatch> read_batch2;
+ auto ext_metadata =
+ key_value_metadata({{"ARROW:extension:name",
exact_ext_type->extension_name()},
+ {"ARROW:extension:metadata", ""}});
+ ext_field = field(/*name=*/"f0", /*type=*/exact_ext_type->storage_type(),
+ /*nullable=*/true, /*metadata=*/ext_metadata);
+ auto batch2 = RecordBatch::Make(schema({ext_field}), arr->length(), {arr});
+ RoundtripBatch(batch2, &read_batch2);
+ CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true);
+}
+
+} // namespace arrow
diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc
index 83c7ebed4f..fc220f73a6 100644
--- a/cpp/src/arrow/extension_type.cc
+++ b/cpp/src/arrow/extension_type.cc
@@ -32,6 +32,7 @@
#include "arrow/extension/fixed_shape_tensor.h"
#include "arrow/extension/opaque.h"
#endif
+#include "arrow/extension/uuid.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
@@ -147,14 +148,13 @@ static void CreateGlobalRegistry() {
// Register canonical extension types
g_registry = std::make_shared<ExtensionTypeRegistryImpl>();
- std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8()};
+ std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8(),
extension::uuid()};
#ifdef ARROW_JSON
ext_types.push_back(extension::fixed_shape_tensor(int64(), {}));
ext_types.push_back(extension::opaque(null(), "", ""));
#endif
- // Register canonical extension types
for (const auto& ext_type : ext_types) {
ARROW_CHECK_OK(
g_registry->RegisterType(checked_pointer_cast<ExtensionType>(ext_type)));
diff --git a/cpp/src/arrow/extension_type_test.cc
b/cpp/src/arrow/extension_type_test.cc
index f104c984a6..f49ffc5cba 100644
--- a/cpp/src/arrow/extension_type_test.cc
+++ b/cpp/src/arrow/extension_type_test.cc
@@ -30,6 +30,7 @@
#include "arrow/io/memory.h"
#include "arrow/ipc/options.h"
#include "arrow/ipc/reader.h"
+#include "arrow/ipc/test_common.h"
#include "arrow/ipc/writer.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
@@ -41,6 +42,8 @@
namespace arrow {
+using arrow::ipc::test::RoundtripBatch;
+
class Parametric1Array : public ExtensionArray {
public:
using ExtensionArray::ExtensionArray;
@@ -178,7 +181,7 @@ class ExtStructType : public ExtensionType {
class TestExtensionType : public ::testing::Test {
public:
- void SetUp() {
ASSERT_OK(RegisterExtensionType(std::make_shared<UuidType>())); }
+ void SetUp() {
ASSERT_OK(RegisterExtensionType(std::make_shared<ExampleUuidType>())); }
void TearDown() {
if (GetExtensionType("uuid")) {
@@ -211,20 +214,6 @@ TEST_F(TestExtensionType, ExtensionTypeTest) {
ASSERT_EQ(deserialized->byte_width(), 16);
}
-auto RoundtripBatch = [](const std::shared_ptr<RecordBatch>& batch,
- std::shared_ptr<RecordBatch>* out) {
- ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
- ASSERT_OK(ipc::WriteRecordBatchStream({batch},
ipc::IpcWriteOptions::Defaults(),
- out_stream.get()));
-
- ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());
-
- io::BufferReader reader(complete_ipc_stream);
- std::shared_ptr<RecordBatchReader> batch_reader;
- ASSERT_OK_AND_ASSIGN(batch_reader,
ipc::RecordBatchStreamReader::Open(&reader));
- ASSERT_OK(batch_reader->ReadNext(out));
-};
-
TEST_F(TestExtensionType, IpcRoundtrip) {
auto ext_arr = ExampleUuid();
auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr});
diff --git a/cpp/src/arrow/integration/json_integration_test.cc
b/cpp/src/arrow/integration/json_integration_test.cc
index 9b56928c68..0e84ea6124 100644
--- a/cpp/src/arrow/integration/json_integration_test.cc
+++ b/cpp/src/arrow/integration/json_integration_test.cc
@@ -1046,7 +1046,7 @@ TEST(TestJsonFileReadWrite, JsonExample2) {
auto storage_array =
ArrayFromJSON(fixed_size_binary(16), R"(["0123456789abcdef", null])");
- AssertArraysEqual(*batch->column(0), UuidArray(uuid_type, storage_array));
+ AssertArraysEqual(*batch->column(0), ExampleUuidArray(uuid_type,
storage_array));
AssertArraysEqual(*batch->column(1), NullArray(2));
}
diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc
index 87c02e2d87..fb4f6bd8ea 100644
--- a/cpp/src/arrow/ipc/test_common.cc
+++ b/cpp/src/arrow/ipc/test_common.cc
@@ -27,8 +27,10 @@
#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_primitive.h"
-#include "arrow/array/builder_time.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
#include "arrow/ipc/test_common.h"
+#include "arrow/ipc/writer.h"
#include "arrow/pretty_print.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
@@ -242,11 +244,11 @@ Status MakeRandomBooleanArray(const int length, bool
include_nulls,
std::shared_ptr<Array>* out) {
std::vector<uint8_t> values(length);
random_null_bytes(length, 0.5, values.data());
- ARROW_ASSIGN_OR_RAISE(auto data, internal::BytesToBits(values));
+ ARROW_ASSIGN_OR_RAISE(auto data, arrow::internal::BytesToBits(values));
if (include_nulls) {
std::vector<uint8_t> valid_bytes(length);
- ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
internal::BytesToBits(valid_bytes));
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
arrow::internal::BytesToBits(valid_bytes));
random_null_bytes(length, 0.1, valid_bytes.data());
*out = std::make_shared<BooleanArray>(length, data, null_bitmap, -1);
} else {
@@ -596,7 +598,7 @@ Status MakeStruct(std::shared_ptr<RecordBatch>* out) {
std::shared_ptr<Array> no_nulls(new StructArray(type,
list_batch->num_rows(), columns));
std::vector<uint8_t> null_bytes(list_batch->num_rows(), 1);
null_bytes[0] = 0;
- ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes));
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
arrow::internal::BytesToBits(null_bytes));
std::shared_ptr<Array> with_nulls(
new StructArray(type, list_batch->num_rows(), columns, null_bitmap, 1));
@@ -1088,9 +1090,9 @@ Status MakeUuid(std::shared_ptr<RecordBatch>* out) {
auto f1 = field("f1", uuid_type, /*nullable=*/false);
auto schema = ::arrow::schema({f0, f1});
- auto a0 = std::make_shared<UuidArray>(
+ auto a0 = std::make_shared<ExampleUuidArray>(
uuid_type, ArrayFromJSON(storage_type, R"(["0123456789abcdef", null])"));
- auto a1 = std::make_shared<UuidArray>(
+ auto a1 = std::make_shared<ExampleUuidArray>(
uuid_type,
ArrayFromJSON(storage_type, R"(["ZYXWVUTSRQPONMLK",
"JIHGFEDBA9876543"])"));
@@ -1176,12 +1178,13 @@ enable_if_t<std::is_floating_point<CValueType>::value,
void> FillRandomData(
Status MakeRandomTensor(const std::shared_ptr<DataType>& type,
const std::vector<int64_t>& shape, bool row_major_p,
std::shared_ptr<Tensor>* out, uint32_t seed) {
- const auto& element_type = internal::checked_cast<const
FixedWidthType&>(*type);
+ const auto& element_type = arrow::internal::checked_cast<const
FixedWidthType&>(*type);
std::vector<int64_t> strides;
if (row_major_p) {
- RETURN_NOT_OK(internal::ComputeRowMajorStrides(element_type, shape,
&strides));
+ RETURN_NOT_OK(arrow::internal::ComputeRowMajorStrides(element_type, shape,
&strides));
} else {
- RETURN_NOT_OK(internal::ComputeColumnMajorStrides(element_type, shape,
&strides));
+ RETURN_NOT_OK(
+ arrow::internal::ComputeColumnMajorStrides(element_type, shape,
&strides));
}
const int64_t element_size = element_type.bit_width() / CHAR_BIT;
@@ -1233,6 +1236,20 @@ Status MakeRandomTensor(const std::shared_ptr<DataType>&
type,
return Tensor::Make(type, buf, shape, strides).Value(out);
}
+void RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
+ std::shared_ptr<RecordBatch>* out) {
+ ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
+ ASSERT_OK(ipc::WriteRecordBatchStream({batch},
ipc::IpcWriteOptions::Defaults(),
+ out_stream.get()));
+
+ ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());
+
+ io::BufferReader reader(complete_ipc_stream);
+ std::shared_ptr<RecordBatchReader> batch_reader;
+ ASSERT_OK_AND_ASSIGN(batch_reader,
ipc::RecordBatchStreamReader::Open(&reader));
+ ASSERT_OK(batch_reader->ReadNext(out));
+}
+
} // namespace test
} // namespace ipc
} // namespace arrow
diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h
index db8613cbb1..9b7e7f13e3 100644
--- a/cpp/src/arrow/ipc/test_common.h
+++ b/cpp/src/arrow/ipc/test_common.h
@@ -184,6 +184,9 @@ Status MakeRandomTensor(const std::shared_ptr<DataType>&
type,
const std::vector<int64_t>& shape, bool row_major_p,
std::shared_ptr<Tensor>* out, uint32_t seed = 0);
+ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr<RecordBatch>&
batch,
+ std::shared_ptr<RecordBatch>* out);
+
} // namespace test
} // namespace ipc
} // namespace arrow
diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc
index 104a5697b5..e9ec13e98b 100644
--- a/cpp/src/arrow/scalar_test.cc
+++ b/cpp/src/arrow/scalar_test.cc
@@ -43,7 +43,6 @@ namespace arrow {
using compute::Cast;
using compute::CastOptions;
-
using internal::checked_cast;
using internal::checked_pointer_cast;
@@ -2038,7 +2037,7 @@ class TestExtensionScalar : public ::testing::Test {
void SetUp() {
type_ = uuid();
storage_type_ = fixed_size_binary(16);
- uuid_type_ = checked_cast<const UuidType*>(type_.get());
+ uuid_type_ = checked_cast<const ExampleUuidType*>(type_.get());
}
protected:
@@ -2049,7 +2048,7 @@ class TestExtensionScalar : public ::testing::Test {
}
std::shared_ptr<DataType> type_, storage_type_;
- const UuidType* uuid_type_{nullptr};
+ const ExampleUuidType* uuid_type_{nullptr};
const std::string_view uuid_string1_{UUID_STRING1};
const std::string_view uuid_string2_{UUID_STRING2};
diff --git a/cpp/src/arrow/testing/extension_type.h
b/cpp/src/arrow/testing/extension_type.h
index 6515631f20..a4526e31c2 100644
--- a/cpp/src/arrow/testing/extension_type.h
+++ b/cpp/src/arrow/testing/extension_type.h
@@ -27,14 +27,14 @@
namespace arrow {
-class ARROW_TESTING_EXPORT UuidArray : public ExtensionArray {
+class ARROW_TESTING_EXPORT ExampleUuidArray : public ExtensionArray {
public:
using ExtensionArray::ExtensionArray;
};
-class ARROW_TESTING_EXPORT UuidType : public ExtensionType {
+class ARROW_TESTING_EXPORT ExampleUuidType : public ExtensionType {
public:
- UuidType() : ExtensionType(fixed_size_binary(16)) {}
+ ExampleUuidType() : ExtensionType(fixed_size_binary(16)) {}
std::string extension_name() const override { return "uuid"; }
diff --git a/cpp/src/arrow/testing/gtest_util.cc
b/cpp/src/arrow/testing/gtest_util.cc
index 95de16c715..ae2e53b30a 100644
--- a/cpp/src/arrow/testing/gtest_util.cc
+++ b/cpp/src/arrow/testing/gtest_util.cc
@@ -49,9 +49,13 @@
#include "arrow/buffer.h"
#include "arrow/compute/api_vector.h"
#include "arrow/datum.h"
+#include "arrow/io/memory.h"
#include "arrow/ipc/json_simple.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
#include "arrow/pretty_print.h"
+#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/tensor.h"
@@ -847,17 +851,17 @@ Future<> SleepABitAsync() {
///////////////////////////////////////////////////////////////////////////
// Extension types
-bool UuidType::ExtensionEquals(const ExtensionType& other) const {
+bool ExampleUuidType::ExtensionEquals(const ExtensionType& other) const {
return (other.extension_name() == this->extension_name());
}
-std::shared_ptr<Array> UuidType::MakeArray(std::shared_ptr<ArrayData> data)
const {
+std::shared_ptr<Array> ExampleUuidType::MakeArray(std::shared_ptr<ArrayData>
data) const {
DCHECK_EQ(data->type->id(), Type::EXTENSION);
DCHECK_EQ("uuid", static_cast<const
ExtensionType&>(*data->type).extension_name());
- return std::make_shared<UuidArray>(data);
+ return std::make_shared<ExampleUuidArray>(data);
}
-Result<std::shared_ptr<DataType>> UuidType::Deserialize(
+Result<std::shared_ptr<DataType>> ExampleUuidType::Deserialize(
std::shared_ptr<DataType> storage_type, const std::string& serialized)
const {
if (serialized != "uuid-serialized") {
return Status::Invalid("Type identifier did not match: '", serialized,
"'");
@@ -866,7 +870,7 @@ Result<std::shared_ptr<DataType>> UuidType::Deserialize(
return Status::Invalid("Invalid storage type for UuidType: ",
storage_type->ToString());
}
- return std::make_shared<UuidType>();
+ return std::make_shared<ExampleUuidType>();
}
bool SmallintType::ExtensionEquals(const ExtensionType& other) const {
@@ -982,7 +986,7 @@ Result<std::shared_ptr<DataType>>
Complex128Type::Deserialize(
return std::make_shared<Complex128Type>();
}
-std::shared_ptr<DataType> uuid() { return std::make_shared<UuidType>(); }
+std::shared_ptr<DataType> uuid() { return std::make_shared<ExampleUuidType>();
}
std::shared_ptr<DataType> smallint() { return
std::make_shared<SmallintType>(); }
diff --git a/dev/archery/archery/integration/datagen.py
b/dev/archery/archery/integration/datagen.py
index d395d26cb7..f63aa0d95a 100644
--- a/dev/archery/archery/integration/datagen.py
+++ b/dev/archery/archery/integration/datagen.py
@@ -1845,7 +1845,7 @@ def generate_nested_dictionary_case():
def generate_extension_case():
dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0')
- uuid_type = ExtensionType('uuid', 'uuid-serialized',
+ uuid_type = ExtensionType('arrow.uuid', '',
FixedSizeBinaryField('', 16))
dict_ext_type = ExtensionType(
'dict-extension', 'dict-extension-serialized',
diff --git a/docs/source/format/CanonicalExtensions.rst
b/docs/source/format/CanonicalExtensions.rst
index 5658f949ce..1106f8aaff 100644
--- a/docs/source/format/CanonicalExtensions.rst
+++ b/docs/source/format/CanonicalExtensions.rst
@@ -272,6 +272,8 @@ JSON
In the future, additional fields may be added, but they are not required
to interpret the array.
+.. _uuid_extension:
+
UUID
====
diff --git a/docs/source/status.rst b/docs/source/status.rst
index 5e2c2cc19c..b685d4bbf8 100644
--- a/docs/source/status.rst
+++ b/docs/source/status.rst
@@ -121,7 +121,7 @@ Data Types
+-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| JSON | | | ✓ | | | |
| |
+-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+
-| UUID | | | ✓ | | | |
| |
+| UUID | ✓ | | ✓ | | | |
| |
+-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| 8-bit Boolean | ✓ | | ✓ | | | |
| |
+-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 807bcdc315..d31c93119b 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -172,9 +172,7 @@ from pyarrow.lib import (null, bool_,
union, sparse_union, dense_union,
dictionary,
run_end_encoded,
- fixed_shape_tensor,
- opaque,
- bool8,
+ bool8, fixed_shape_tensor, opaque, uuid,
field,
type_for_alias,
DataType, DictionaryType, StructType,
@@ -184,8 +182,9 @@ from pyarrow.lib import (null, bool_,
TimestampType, Time32Type, Time64Type, DurationType,
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
- RunEndEncodedType, FixedShapeTensorType, OpaqueType,
- Bool8Type, PyExtensionType, UnknownExtensionType,
+ RunEndEncodedType, Bool8Type, FixedShapeTensorType,
+ OpaqueType, UuidType,
+ PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
KeyValueMetadata,
@@ -218,8 +217,9 @@ from pyarrow.lib import (null, bool_,
Time32Array, Time64Array, DurationArray,
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray,
ExtensionArray,
- RunEndEncodedArray, FixedShapeTensorArray,
OpaqueArray,
- Bool8Array, scalar, NA, _NULL as NULL, Scalar,
+ RunEndEncodedArray, Bool8Array, FixedShapeTensorArray,
+ OpaqueArray, UuidArray,
+ scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar,
@@ -235,8 +235,8 @@ from pyarrow.lib import (null, bool_,
StringScalar, LargeStringScalar, StringViewScalar,
FixedSizeBinaryScalar, DictionaryScalar,
MapScalar, StructScalar, UnionScalar,
- RunEndEncodedScalar, ExtensionScalar,
- FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar)
+ RunEndEncodedScalar, Bool8Scalar, ExtensionScalar,
+ FixedShapeTensorScalar, OpaqueScalar, UuidScalar)
# Buffers, allocation
from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 77d6c9c06d..1587de0e6b 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -4338,6 +4338,12 @@ cdef class ExtensionArray(Array):
return result
+class UuidArray(ExtensionArray):
+ """
+ Concrete class for Arrow arrays of UUID data type.
+ """
+
+
cdef class FixedShapeTensorArray(ExtensionArray):
"""
Concrete class for fixed shape tensor extension arrays.
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index 6f510cfc0c..c2346750a1 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2865,6 +2865,16 @@ cdef extern from "arrow/extension_type.h" namespace
"arrow":
shared_ptr[CArray] storage()
+cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil:
+ cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType):
+
+ @staticmethod
+ CResult[shared_ptr[CDataType]] Make()
+
+ cdef cppclass CUuidArray" arrow::extension::UuidArray"(CExtensionArray):
+ pass
+
+
cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace
"arrow::extension" nogil:
cdef cppclass CFixedShapeTensorType \
" arrow::extension::FixedShapeTensorType"(CExtensionType):
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index a7c3b496a0..5c3d981c3a 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -222,6 +222,9 @@ cdef class OpaqueType(BaseExtensionType):
cdef:
const COpaqueType* opaque_ext_type
+cdef class UuidType(BaseExtensionType):
+ cdef:
+ const CUuidType* uuid_ext_type
cdef class PyExtensionType(ExtensionType):
pass
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 19a26bd6c6..d3e2ff2e99 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -120,14 +120,17 @@ cdef api object pyarrow_wrap_data_type(
elif type.get().id() == _Type_EXTENSION:
ext_type = <const CExtensionType*> type.get()
cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type)
+ extension_name = ext_type.extension_name()
if cpy_ext_type != nullptr:
return cpy_ext_type.GetInstance()
- elif ext_type.extension_name() == b"arrow.fixed_shape_tensor":
+ elif extension_name == b"arrow.bool8":
+ out = Bool8Type.__new__(Bool8Type)
+ elif extension_name == b"arrow.fixed_shape_tensor":
out = FixedShapeTensorType.__new__(FixedShapeTensorType)
- elif ext_type.extension_name() == b"arrow.opaque":
+ elif extension_name == b"arrow.opaque":
out = OpaqueType.__new__(OpaqueType)
- elif ext_type.extension_name() == b"arrow.bool8":
- out = Bool8Type.__new__(Bool8Type)
+ elif extension_name == b"arrow.uuid":
+ out = UuidType.__new__(UuidType)
else:
out = BaseExtensionType.__new__(BaseExtensionType)
else:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 72ae2aee5f..68f77832c4 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -17,6 +17,7 @@
import collections
from cython cimport binding
+from uuid import UUID
cdef class Scalar(_Weakrefable):
@@ -1043,6 +1044,15 @@ cdef class ExtensionScalar(Scalar):
return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)
+class UuidScalar(ExtensionScalar):
+ """
+ Concrete class for Uuid extension scalar.
+ """
+
+ def as_py(self):
+ return None if self.value is None else UUID(bytes=self.value.as_py())
+
+
cdef class FixedShapeTensorScalar(ExtensionScalar):
"""
Concrete class for fixed shape tensor extension scalar.
diff --git a/python/pyarrow/src/arrow/python/gdb.cc b/python/pyarrow/src/arrow/python/gdb.cc
index 6941769e4e..7c58bae334 100644
--- a/python/pyarrow/src/arrow/python/gdb.cc
+++ b/python/pyarrow/src/arrow/python/gdb.cc
@@ -22,7 +22,7 @@
#include "arrow/array.h"
#include "arrow/chunked_array.h"
#include "arrow/datum.h"
-#include "arrow/extension_type.h"
+#include "arrow/extension/uuid.h"
#include "arrow/ipc/json_simple.h"
#include "arrow/python/gdb.h"
#include "arrow/record_batch.h"
@@ -37,6 +37,8 @@
namespace arrow {
+using extension::uuid;
+using extension::UuidType;
using ipc::internal::json::ArrayFromJSON;
using ipc::internal::json::ChunkedArrayFromJSON;
using ipc::internal::json::ScalarFromJSON;
@@ -56,29 +58,6 @@ class CustomStatusDetail : public StatusDetail {
std::string ToString() const override { return "This is a detail"; }
};
-class UuidType : public ExtensionType {
- public:
- UuidType() : ExtensionType(fixed_size_binary(16)) {}
-
- std::string extension_name() const override { return "uuid"; }
-
- bool ExtensionEquals(const ExtensionType& other) const override {
- return (other.extension_name() == this->extension_name());
- }
-
- std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override {
- return std::make_shared<ExtensionArray>(data);
- }
-
- Result<std::shared_ptr<DataType>> Deserialize(
- std::shared_ptr<DataType> storage_type,
- const std::string& serialized) const override {
- return Status::NotImplemented("");
- }
-
- std::string Serialize() const override { return "uuid-serialized"; }
-};
-
std::shared_ptr<Array> SliceArrayFromJSON(const std::shared_ptr<DataType>& ty,
std::string_view json, int64_t offset = 0,
int64_t length = -1) {
diff --git a/python/pyarrow/tests/extensions.pyx b/python/pyarrow/tests/extensions.pyx
index c1bf9aae1e..309b574dc0 100644
--- a/python/pyarrow/tests/extensions.pyx
+++ b/python/pyarrow/tests/extensions.pyx
@@ -37,7 +37,7 @@ cdef extern from * namespace "arrow::py" nogil:
class UuidType : public ExtensionType {
public:
UuidType() : ExtensionType(fixed_size_binary(16)) {}
- std::string extension_name() const override { return "uuid"; }
+ std::string extension_name() const override { return "example-uuid"; }
bool ExtensionEquals(const ExtensionType& other) const override {
return other.extension_name() == this->extension_name();
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index 0d50c467e9..aacbd2cb6e 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -95,18 +95,21 @@ class IntegerEmbeddedType(pa.ExtensionType):
return cls()
-class UuidScalarType(pa.ExtensionScalar):
+class ExampleUuidScalarType(pa.ExtensionScalar):
def as_py(self):
return None if self.value is None else UUID(bytes=self.value.as_py())
-class UuidType(pa.ExtensionType):
+class ExampleUuidType(pa.ExtensionType):
def __init__(self):
- super().__init__(pa.binary(16), 'pyarrow.tests.UuidType')
+ super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType')
+
+ def __reduce__(self):
+ return ExampleUuidType, ()
def __arrow_ext_scalar_class__(self):
- return UuidScalarType
+ return ExampleUuidScalarType
def __arrow_ext_serialize__(self):
return b''
@@ -116,10 +119,10 @@ class UuidType(pa.ExtensionType):
return cls()
-class UuidType2(pa.ExtensionType):
+class ExampleUuidType2(pa.ExtensionType):
def __init__(self):
- super().__init__(pa.binary(16), 'pyarrow.tests.UuidType2')
+ super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType2')
def __arrow_ext_serialize__(self):
return b''
@@ -250,8 +253,8 @@ def ipc_read_batch(buf):
def test_ext_type_basics():
- ty = UuidType()
- assert ty.extension_name == "pyarrow.tests.UuidType"
+ ty = ExampleUuidType()
+ assert ty.extension_name == "pyarrow.tests.ExampleUuidType"
def test_ext_type_str():
@@ -267,16 +270,16 @@ def test_ext_type_repr():
def test_ext_type_lifetime():
- ty = UuidType()
+ ty = ExampleUuidType()
wr = weakref.ref(ty)
del ty
assert wr() is None
def test_ext_type_storage_type():
- ty = UuidType()
+ ty = ExampleUuidType()
assert ty.storage_type == pa.binary(16)
- assert ty.__class__ is UuidType
+ assert ty.__class__ is ExampleUuidType
ty = ParamExtType(5)
assert ty.storage_type == pa.binary(5)
assert ty.__class__ is ParamExtType
@@ -284,7 +287,7 @@ def test_ext_type_storage_type():
def test_ext_type_byte_width():
# Test for fixed-size binary types
- ty = UuidType()
+ ty = pa.uuid()
assert ty.byte_width == 16
ty = ParamExtType(5)
assert ty.byte_width == 5
@@ -297,7 +300,7 @@ def test_ext_type_byte_width():
def test_ext_type_bit_width():
# Test for fixed-size binary types
- ty = UuidType()
+ ty = pa.uuid()
assert ty.bit_width == 128
ty = ParamExtType(5)
assert ty.bit_width == 40
@@ -309,7 +312,7 @@ def test_ext_type_bit_width():
def test_ext_type_as_py():
- ty = UuidType()
+ ty = ExampleUuidType()
expected = uuid4()
scalar = pa.ExtensionScalar.from_storage(ty, expected.bytes)
assert scalar.as_py() == expected
@@ -342,12 +345,22 @@ def test_ext_type_as_py():
def test_uuid_type_pickle(pickle_module):
for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
- ty = UuidType()
+ ty = ExampleUuidType()
ser = pickle_module.dumps(ty, protocol=proto)
del ty
ty = pickle_module.loads(ser)
wr = weakref.ref(ty)
- assert ty.extension_name == "pyarrow.tests.UuidType"
+ assert ty.extension_name == "pyarrow.tests.ExampleUuidType"
+ del ty
+ assert wr() is None
+
+ for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1):
+ ty = pa.uuid()
+ ser = pickle_module.dumps(ty, protocol=proto)
+ del ty
+ ty = pickle_module.loads(ser)
+ wr = weakref.ref(ty)
+ assert ty.extension_name == "arrow.uuid"
del ty
assert wr() is None
@@ -358,8 +371,8 @@ def test_ext_type_equality():
c = ParamExtType(6)
assert a != b
assert b == c
- d = UuidType()
- e = UuidType()
+ d = ExampleUuidType()
+ e = ExampleUuidType()
assert a != d
assert d == e
@@ -403,7 +416,7 @@ def test_ext_array_equality():
storage1 = pa.array([b"0123456789abcdef"], type=pa.binary(16))
storage2 = pa.array([b"0123456789abcdef"], type=pa.binary(16))
storage3 = pa.array([], type=pa.binary(16))
- ty1 = UuidType()
+ ty1 = ExampleUuidType()
ty2 = ParamExtType(16)
a = pa.ExtensionArray.from_storage(ty1, storage1)
@@ -451,9 +464,9 @@ def test_ext_scalar_from_array():
data = [b"0123456789abcdef", b"0123456789abcdef",
b"zyxwvutsrqponmlk", None]
storage = pa.array(data, type=pa.binary(16))
- ty1 = UuidType()
+ ty1 = ExampleUuidType()
ty2 = ParamExtType(16)
- ty3 = UuidType2()
+ ty3 = ExampleUuidType2()
a = pa.ExtensionArray.from_storage(ty1, storage)
b = pa.ExtensionArray.from_storage(ty2, storage)
@@ -462,9 +475,9 @@ def test_ext_scalar_from_array():
scalars_a = list(a)
assert len(scalars_a) == 4
- assert ty1.__arrow_ext_scalar_class__() == UuidScalarType
- assert isinstance(a[0], UuidScalarType)
- assert isinstance(scalars_a[0], UuidScalarType)
+ assert ty1.__arrow_ext_scalar_class__() == ExampleUuidScalarType
+ assert isinstance(a[0], ExampleUuidScalarType)
+ assert isinstance(scalars_a[0], ExampleUuidScalarType)
for s, val in zip(scalars_a, data):
assert isinstance(s, pa.ExtensionScalar)
@@ -505,7 +518,7 @@ def test_ext_scalar_from_array():
def test_ext_scalar_from_storage():
- ty = UuidType()
+ ty = ExampleUuidType()
s = pa.ExtensionScalar.from_storage(ty, None)
assert isinstance(s, pa.ExtensionScalar)
@@ -706,14 +719,14 @@ def test_cast_between_extension_types():
tiny_int_arr.cast(pa.int64()).cast(IntegerType())
# Between the same extension types is okay
- array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(UuidType())
- out = array.cast(UuidType())
- assert out.type == UuidType()
+ array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(ExampleUuidType())
+ out = array.cast(ExampleUuidType())
+ assert out.type == ExampleUuidType()
# Will still fail casting between extensions who share storage type,
# can only cast between exactly the same extension types.
with pytest.raises(TypeError, match='Casting from *'):
- array.cast(UuidType2())
+ array.cast(ExampleUuidType2())
def test_cast_to_extension_with_extension_storage():
@@ -744,10 +757,10 @@ def test_cast_nested_extension_types(data, type_factory):
def test_casting_dict_array_to_extension_type():
storage = pa.array([b"0123456789abcdef"], type=pa.binary(16))
- arr = pa.ExtensionArray.from_storage(UuidType(), storage)
+ arr = pa.ExtensionArray.from_storage(ExampleUuidType(), storage)
dict_arr = pa.DictionaryArray.from_arrays(pa.array([0, 0], pa.int32()),
arr)
- out = dict_arr.cast(UuidType())
+ out = dict_arr.cast(ExampleUuidType())
assert isinstance(out, pa.ExtensionArray)
assert out.to_pylist() == [UUID('30313233-3435-3637-3839-616263646566'),
UUID('30313233-3435-3637-3839-616263646566')]
@@ -1347,7 +1360,7 @@ def test_cpp_extension_in_python(tmpdir):
mod = __import__('extensions')
uuid_type = mod._make_uuid_type()
- assert uuid_type.extension_name == "uuid"
+ assert uuid_type.extension_name == "example-uuid"
assert uuid_type.storage_type == pa.binary(16)
array = mod._make_uuid_array()
@@ -1356,6 +1369,31 @@ def test_cpp_extension_in_python(tmpdir):
assert array[0].as_py() == b'abcdefghijklmno0'
assert array[1].as_py() == b'0onmlkjihgfedcba'
+ buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["example-uuid"]))
+
+ batch = ipc_read_batch(buf)
+ reconstructed_array = batch.column(0)
+ assert reconstructed_array.type == uuid_type
+ assert reconstructed_array == array
+
+
+def test_uuid_extension():
+ data = [b"0123456789abcdef", b"0123456789abcdef",
+ b"zyxwvutsrqponmlk", None]
+
+ uuid_type = pa.uuid()
+ assert uuid_type.extension_name == "arrow.uuid"
+ assert uuid_type.storage_type == pa.binary(16)
+ assert uuid_type.__class__ is pa.UuidType
+
+ storage = pa.array(data, pa.binary(16))
+ array = pa.ExtensionArray.from_storage(uuid_type, storage)
+ assert array.type == uuid_type
+
+ assert array.to_pylist() == [x if x is None else UUID(bytes=x) for x in data]
+ assert array[0].as_py() == UUID(bytes=data[0])
+ assert array[3].as_py() is None
+
buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["uuid"]))
batch = ipc_read_batch(buf)
@@ -1363,6 +1401,9 @@ def test_cpp_extension_in_python(tmpdir):
assert reconstructed_array.type == uuid_type
assert reconstructed_array == array
+ assert uuid_type.__arrow_ext_scalar_class__() == pa.UuidScalar
+ assert isinstance(array[0], pa.UuidScalar)
+
def test_tensor_type():
tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py
index 0d12d710dc..2ac2f55754 100644
--- a/python/pyarrow/tests/test_gdb.py
+++ b/python/pyarrow/tests/test_gdb.py
@@ -409,7 +409,7 @@ def test_types_stack(gdb_arrow):
check_stack_repr(
gdb_arrow, "uuid_type",
- ('arrow::ExtensionType "extension<uuid>" '
+ ('arrow::ExtensionType "extension<arrow.uuid>" '
'with storage type arrow::fixed_size_binary(16)'))
@@ -447,7 +447,7 @@ def test_types_heap(gdb_arrow):
check_heap_repr(
gdb_arrow, "heap_uuid_type",
- ('arrow::ExtensionType "extension<uuid>" '
+ ('arrow::ExtensionType "extension<arrow.uuid>" '
'with storage type arrow::fixed_size_binary(16)'))
@@ -716,12 +716,12 @@ def test_scalars_stack(gdb_arrow):
check_stack_repr(
gdb_arrow, "extension_scalar",
- ('arrow::ExtensionScalar of type "extension<uuid>", '
+ ('arrow::ExtensionScalar of type "extension<arrow.uuid>", '
'value arrow::FixedSizeBinaryScalar of size 16, '
'value "0123456789abcdef"'))
check_stack_repr(
gdb_arrow, "extension_scalar_null",
- 'arrow::ExtensionScalar of type "extension<uuid>", null value')
+ 'arrow::ExtensionScalar of type "extension<arrow.uuid>", null value')
def test_scalars_heap(gdb_arrow):
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 563782f0c2..f83ecc3aa4 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1765,6 +1765,25 @@ cdef class ExtensionType(BaseExtensionType):
return ExtensionScalar
+cdef class UuidType(BaseExtensionType):
+ """
+ Concrete class for UUID extension type.
+ """
+
+ cdef void init(self, const shared_ptr[CDataType]& type) except *:
+ BaseExtensionType.init(self, type)
+ self.uuid_ext_type = <const CUuidType*> type.get()
+
+ def __arrow_ext_class__(self):
+ return UuidArray
+
+ def __reduce__(self):
+ return uuid, ()
+
+ def __arrow_ext_scalar_class__(self):
+ return UuidScalar
+
+
cdef class FixedShapeTensorType(BaseExtensionType):
"""
Concrete class for fixed shape tensor extension type.
@@ -5208,6 +5227,21 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)
+def uuid():
+ """
+ Create UuidType instance.
+
+ Returns
+ -------
+ type : UuidType
+ """
+
+ cdef UuidType out = UuidType.__new__(UuidType)
+ c_uuid_ext_type = GetResultValue(CUuidType.Make())
+ out.init(c_uuid_ext_type)
+ return out
+
+
def fixed_shape_tensor(DataType value_type, shape, dim_names=None,
permutation=None):
"""
Create instance of fixed shape tensor extension type with shape and
optional