pitrou commented on code in PR #37298:
URL: https://github.com/apache/arrow/pull/37298#discussion_r1726690200


##########
cpp/src/arrow/extension/uuid_test.cc:
##########
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/extension/uuid.h"
+
+#include "arrow/testing/matchers.h"
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/array_primitive.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/testing/gtest_util.h"
+
+#include "arrow/testing/extension_type.h"
+
+namespace arrow {
+
+TEST(TestUuuidExtensionType, ExtensionTypeTest) {
+  auto type = uuid();
+  ASSERT_EQ(type->id(), Type::EXTENSION);
+
+  const auto& ext_type = static_cast<const ExtensionType&>(*type);
+  std::string serialized = ext_type.Serialize();
+
+  ASSERT_OK_AND_ASSIGN(auto deserialized,
+                       ext_type.Deserialize(fixed_size_binary(16), 
serialized));
+  ASSERT_TRUE(deserialized->Equals(*type));
+  ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16)));
+}

Review Comment:
   Should there be a IPC array roundtrip test as well? This would check that 
the type is properly registered.



##########
cpp/src/arrow/extension/CMakeLists.txt:
##########
@@ -15,22 +15,16 @@
 # specific language governing permissions and limitations
 # under the License.
 
-add_arrow_test(test
-               SOURCES
-               bool8_test.cc
-               PREFIX
-               "arrow-extension-bool8")
+set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc)
 
-add_arrow_test(test
-               SOURCES
-               fixed_shape_tensor_test.cc
-               PREFIX
-               "arrow-fixed-shape-tensor")
+if(ARROW_JSON)
+  list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc 
opaque_test.cc)
+endif()
 
 add_arrow_test(test
                SOURCES
-               opaque_test.cc
+               ${CANONICAL_EXTENSION_TESTS}
                PREFIX
-               "arrow-extension-opaque")
+               "arrow-canonical-extensions")

Review Comment:
   Good idea @rok.



##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -266,25 +269,25 @@ def test_ext_type_repr():
     assert repr(ty) == "IntegerType(DataType(int64))"
 
 
-def test_ext_type_lifetime():
-    ty = UuidType()
+def test_ext_type__lifetime():

Review Comment:
   Not sure why the additional underscore?



##########
python/pyarrow/includes/libarrow.pxd:
##########
@@ -2865,6 +2865,19 @@ cdef extern from "arrow/extension_type.h" namespace 
"arrow":
         shared_ptr[CArray] storage()
 
 
+cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension":
+    cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType):
+
+        @staticmethod
+        CResult[shared_ptr[CDataType]] Make()
+
+        CResult[shared_ptr[CDataType]] Deserialize(const c_string& 
serialized_data) const
+
+        c_string Serialize() const
+
+        const shared_ptr[CDataType] value_type()

Review Comment:
   These 3 declarations should probably not be necessary.



##########
cpp/src/arrow/extension_type.cc:
##########
@@ -146,14 +147,17 @@ static void CreateGlobalRegistry() {
   g_registry = std::make_shared<ExtensionTypeRegistryImpl>();
 
 #ifdef ARROW_JSON
-  // Register canonical extension types
-  auto fst_ext_type =
-      
checked_pointer_cast<ExtensionType>(extension::fixed_shape_tensor(int64(), {}));
-  ARROW_CHECK_OK(g_registry->RegisterType(fst_ext_type));
-
-  auto bool8_ext_type = 
checked_pointer_cast<ExtensionType>(extension::bool8());
-  ARROW_CHECK_OK(g_registry->RegisterType(bool8_ext_type));
+  std::vector<std::shared_ptr<DataType>> ext_types{
+      extension::bool8(), extension::fixed_shape_tensor(int64(), {}), 
extension::uuid()};
+#else
+  std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8(), 
extension::uuid()};
 #endif

Review Comment:
   Suggestion to make this neater and less repetitive:
   ```c++
     std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8(), 
extension::uuid()};
   #ifdef ARROW_JSON
     ext_types.push_back(extension::fixed_shape_tensor(int64(), {}));
   #endif
   ```



##########
cpp/src/arrow/extension/uuid.h:
##########
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/extension_type.h"
+
+namespace arrow {
+namespace extension {
+
+class ARROW_EXPORT UuidArray : public ExtensionArray {

Review Comment:
   Can you add docstrings for these classes, even if they end up very concise?



##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -342,12 +345,12 @@ def test_ext_type_as_py():
 
 def test_uuid_type_pickle(pickle_module):

Review Comment:
   Can we test pickling of _both_ the canonical uuid type and the example uuid 
type? Since their implementations are quite different.



##########
dev/archery/archery/integration/datagen.py:
##########
@@ -1845,7 +1845,7 @@ def generate_nested_dictionary_case():
 def generate_extension_case():
     dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0')
 
-    uuid_type = ExtensionType('uuid', 'uuid-serialized',
+    uuid_type = ExtensionType('arrow.uuid', '',
                               FixedSizeBinaryField('', 16))

Review Comment:
   I was expecting this to perhaps fail the integration tests on other 
implementations, so I'm pleasantly surprised that it succeeds.



##########
cpp/src/arrow/testing/extension_type.h:
##########
@@ -27,14 +27,14 @@
 
 namespace arrow {
 
-class ARROW_TESTING_EXPORT UuidArray : public ExtensionArray {
+class ARROW_TESTING_EXPORT ExampleUuidArray : public ExtensionArray {
  public:
   using ExtensionArray::ExtensionArray;
 };
 
-class ARROW_TESTING_EXPORT UuidType : public ExtensionType {
+class ARROW_TESTING_EXPORT ExampleUuidType : public ExtensionType {

Review Comment:
   @joellubi I think this approach (having a separate "example uuid" type for 
testing) is better than the one you adopted for the Go implementation, FTR.



##########
python/pyarrow/tests/test_extension_type.py:
##########
@@ -266,25 +269,25 @@ def test_ext_type_repr():
     assert repr(ty) == "IntegerType(DataType(int64))"
 
 
-def test_ext_type_lifetime():
-    ty = UuidType()
+def test_ext_type__lifetime():
+    ty = ExampleUuidType()
     wr = weakref.ref(ty)
     del ty
     assert wr() is None
 
 
-def test_ext_type_storage_type():
-    ty = UuidType()
+def test_ext_type__storage_type():

Review Comment:
   Here too.



##########
cpp/src/arrow/extension/uuid.h:
##########
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/extension_type.h"
+
+namespace arrow {
+namespace extension {
+
+class ARROW_EXPORT UuidArray : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+class ARROW_EXPORT UuidType : public ExtensionType {
+ public:
+  UuidType() : ExtensionType(fixed_size_binary(16)) {}
+
+  std::string extension_name() const override { return "arrow.uuid"; }
+
+  const std::shared_ptr<DataType> value_type() const { return 
fixed_size_binary(16); }

Review Comment:
   Is this different from `storage_type()`?



##########
cpp/src/arrow/testing/gtest_util.h:
##########
@@ -309,6 +309,9 @@ ARROW_TESTING_EXPORT void ApproxCompareBatch(
     const RecordBatch& left, const RecordBatch& right, bool compare_metadata = 
true,
     const EqualOptions& options = TestingEqualOptions());
 
+ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr<RecordBatch>& 
batch,
+                                         std::shared_ptr<RecordBatch>* out);

Review Comment:
   Since this depends on IPC, I would move it into 
`arrow/ipc/test_common.{h,cc}`.



##########
python/pyarrow/types.pxi:
##########
@@ -1765,6 +1765,38 @@ cdef class ExtensionType(BaseExtensionType):
         return ExtensionScalar
 
 
+cdef class UuidType(BaseExtensionType):
+    """
+    Concrete class for UUID extension type.
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        BaseExtensionType.init(self, type)
+        self.uuid_ext_type = <const CUuidType*> type.get()
+
+    def __arrow_ext_serialize__(self):
+        """
+        Serialized representation of metadata to reconstruct the type object.
+        """
+        return self.uuid_ext_type.Serialize()
+
+    @classmethod
+    def __arrow_ext_deserialize__(self, storage_type, serialized):
+        """
+        Return an UuidType instance from the storage type.
+        """
+        return self.uuid_ext_type.Deserialize(storage_type, serialized)

Review Comment:
   These are not defined for `Bool8Type` and `FixedShapeTensorType`, so I'm 
surprised you need them here. What happens if you remove them?



##########
python/pyarrow/public-api.pxi:
##########
@@ -122,12 +122,14 @@ cdef api object pyarrow_wrap_data_type(
         cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type)
         if cpy_ext_type != nullptr:
             return cpy_ext_type.GetInstance()
+        elif ext_type.extension_name() == b"arrow.bool8":
+            out = Bool8Type.__new__(Bool8Type)
         elif ext_type.extension_name() == b"arrow.fixed_shape_tensor":

Review Comment:
   I'm not sure whether Cython optimizes this, but it looks like we're going to 
convert the C++ extension name into a bytes object in each `if` statement. 
Perhaps we can store the conversion in a local variable? Something like:
   ```cython
           ext_name = ext_type.extension_name()
           if ext_name == b"arrow.bool8":
               out = Bool8Type.__new__(Bool8Type)
           elif ext_name == b"arrow.fixed_shape_tensor":
           # etc.
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to