wgtmac commented on code in PR #45866:
URL: https://github.com/apache/arrow/pull/45866#discussion_r2038754141


##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -1027,6 +1028,68 @@ TEST_F(TestConvertParquetSchema, 
ParquetSchemaArrowExtensions) {
   }
 }
 
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make("uuid", Repetition::OPTIONAL,
+                                               LogicalType::UUID(),
+                                               
ParquetType::FIXED_LEN_BYTE_ARRAY, 16));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, field should be treated as fixed_size_binary(16) in Arrow.
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("uuid", 
::arrow::fixed_size_binary(16), true)});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // If Arrow extensions are enabled, field will be interpreted as uuid()
+    // extension field.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(true);
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("uuid", ::arrow::extension::uuid(), 
true)});

Review Comment:
   ```suggestion
           ::arrow::schema({::arrow::field("uuid", 
::arrow::extension::uuid())});
   ```



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -1027,6 +1028,68 @@ TEST_F(TestConvertParquetSchema, 
ParquetSchemaArrowExtensions) {
   }
 }
 
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make("uuid", Repetition::OPTIONAL,
+                                               LogicalType::UUID(),
+                                               
ParquetType::FIXED_LEN_BYTE_ARRAY, 16));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, field should be treated as fixed_size_binary(16) in Arrow.
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("uuid", 
::arrow::fixed_size_binary(16), true)});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // If Arrow extensions are enabled, field will be interpreted as uuid()
+    // extension field.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(true);
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("uuid", ::arrow::extension::uuid(), 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file contains Arrow schema.
+    // uuid will be interpreted as uuid() field even though extensions are not 
enabled.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(false);
+    std::shared_ptr<KeyValueMetadata> field_metadata =
+        ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"});
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("uuid", ::arrow::extension::uuid(), true, 
field_metadata)});
+
+    std::shared_ptr<KeyValueMetadata> metadata;
+    ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata));
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema, true /* check_metadata */);
+  }
+
+  {
+    // Parquet file contains Arrow schema.
+    // uuid will be interpreted as uuid() field also when extensions *are* 
enabled
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(true);
+    std::shared_ptr<KeyValueMetadata> field_metadata =
+        ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"});
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("uuid", ::arrow::extension::uuid(), true, 
field_metadata)});

Review Comment:
   ```suggestion
           {::arrow::field("uuid", ::arrow::extension::uuid(), 
/*nullable=*/true, field_metadata)});
   ```



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -1055,57 +1061,62 @@ Result<bool> ApplyOriginalMetadata(const Field& 
origin_field, SchemaField* infer
   const auto& inferred_type = inferred->field->type();
 
   if (origin_type->id() == ::arrow::Type::EXTENSION) {
-    const auto& ex_type = checked_cast<const 
::arrow::ExtensionType&>(*origin_type);
-    if (inferred_type->id() != ::arrow::Type::EXTENSION &&
-        ex_type.extension_name() == std::string("arrow.json") &&
-        ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
-            inferred_type->id())) {
-      // Schema mismatch.
-      //
-      // Arrow extensions are DISABLED in Parquet.
-      // origin_type is ::arrow::extension::json()
-      // inferred_type is ::arrow::utf8()
-      //
-      // Origin type is restored as Arrow should be considered the source of 
truth.
-      inferred->field = inferred->field->WithType(origin_type);
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred));
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("arrow.json")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is arrow::extension::json(...)
-      // inferred_type is arrow::extension::json(arrow::utf8())
-      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
-      inferred->field = inferred->field->WithType(origin_type);
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("parquet.variant")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is parquet::arrow::variant(...)
-      // inferred_type is
-      // parquet::arrow::variant(struct(arrow::binary(),arrow::binary()))
-      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
+    const auto& origin_extension_type =
+        checked_cast<const ::arrow::ExtensionType&>(*origin_type);
+    std::string origin_extension_name = origin_extension_type.extension_name();
+
+    // Whether or not the inferred type is also an extension type. This can 
occur because
+    // arrow_extensions_enabled is true in the ArrowReaderProperties or because
+    // the field contained an ARROW:extension:name metadata field and that 
extension name
+    // is registered.
+    bool arrow_extension_inferred = inferred_type->id() == 
::arrow::Type::EXTENSION;
+
+    // Check if the inferred storage type is compatible with the extension type
+    // we're about to apply. We assume that if an extension type was inferred
+    // that it was constructed with a valid storage type.
+    bool extension_supports_inferred_storage;
+    if (origin_extension_name == "arrow.json") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
+              inferred_type->id());
+    } else if (origin_extension_name == "arrow.uuid") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          (inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY &&

Review Comment:
   Is it better to add `UuidType::IsSupportedStorageType(const 
std::shared_ptr<::arrow::DataType>&)` for this?



##########
cpp/src/parquet/arrow/schema_internal.cc:
##########
@@ -134,16 +135,22 @@ Result<std::shared_ptr<ArrowType>> FromByteArray(
   }
 }
 
-Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
-                                            int32_t physical_length) {
+Result<std::shared_ptr<ArrowType>> FromFLBA(
+    const LogicalType& logical_type, int32_t physical_length,
+    const ArrowReaderProperties& reader_properties) {
   switch (logical_type.type()) {
     case LogicalType::Type::DECIMAL:
       return MakeArrowDecimal(logical_type);
     case LogicalType::Type::FLOAT16:
       return ::arrow::float16();
     case LogicalType::Type::NONE:
     case LogicalType::Type::INTERVAL:
+      return ::arrow::fixed_size_binary(physical_length);
     case LogicalType::Type::UUID:
+      if (physical_length == 16 && 
reader_properties.get_arrow_extensions_enabled()) {
+        return ::arrow::extension::uuid();
+      }
+
       return ::arrow::fixed_size_binary(physical_length);
     default:
       return Status::NotImplemented("Unhandled logical logical_type ",

Review Comment:
   ```suggestion
         return Status::NotImplemented("Unhandled logical_type ",
   ```
   Just found that this error message can be improved.



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -1055,57 +1061,62 @@ Result<bool> ApplyOriginalMetadata(const Field& 
origin_field, SchemaField* infer
   const auto& inferred_type = inferred->field->type();
 
   if (origin_type->id() == ::arrow::Type::EXTENSION) {
-    const auto& ex_type = checked_cast<const 
::arrow::ExtensionType&>(*origin_type);
-    if (inferred_type->id() != ::arrow::Type::EXTENSION &&
-        ex_type.extension_name() == std::string("arrow.json") &&
-        ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
-            inferred_type->id())) {
-      // Schema mismatch.
-      //
-      // Arrow extensions are DISABLED in Parquet.
-      // origin_type is ::arrow::extension::json()
-      // inferred_type is ::arrow::utf8()
-      //
-      // Origin type is restored as Arrow should be considered the source of 
truth.
-      inferred->field = inferred->field->WithType(origin_type);
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred));
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("arrow.json")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is arrow::extension::json(...)
-      // inferred_type is arrow::extension::json(arrow::utf8())
-      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
-      inferred->field = inferred->field->WithType(origin_type);
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("parquet.variant")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is parquet::arrow::variant(...)
-      // inferred_type is
-      // parquet::arrow::variant(struct(arrow::binary(),arrow::binary()))
-      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
+    const auto& origin_extension_type =
+        checked_cast<const ::arrow::ExtensionType&>(*origin_type);
+    std::string origin_extension_name = origin_extension_type.extension_name();
+
+    // Whether or not the inferred type is also an extension type. This can 
occur because
+    // arrow_extensions_enabled is true in the ArrowReaderProperties or because
+    // the field contained an ARROW:extension:name metadata field and that 
extension name
+    // is registered.
+    bool arrow_extension_inferred = inferred_type->id() == 
::arrow::Type::EXTENSION;
+
+    // Check if the inferred storage type is compatible with the extension type
+    // we're about to apply. We assume that if an extension type was inferred
+    // that it was constructed with a valid storage type.
+    bool extension_supports_inferred_storage;
+    if (origin_extension_name == "arrow.json") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
+              inferred_type->id());
+    } else if (origin_extension_name == "arrow.uuid") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          (inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY &&
+           checked_cast<const ::arrow::FixedSizeBinaryType&>(*inferred_type)
+                   .byte_width() == 16);
+    } else if (origin_extension_name == "parquet.variant") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          VariantExtensionType::IsSupportedStorageType(inferred_type);
+    } else if (arrow_extension_inferred) {
+      extension_supports_inferred_storage = 
origin_extension_type.Equals(*inferred_type);
+    } else {
+      extension_supports_inferred_storage =

Review Comment:
   Similar to the above comment, do we need to check this? It will go to the 
else branch at line 1108 and the same check is performed there too.



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -1055,57 +1061,62 @@ Result<bool> ApplyOriginalMetadata(const Field& 
origin_field, SchemaField* infer
   const auto& inferred_type = inferred->field->type();
 
   if (origin_type->id() == ::arrow::Type::EXTENSION) {
-    const auto& ex_type = checked_cast<const 
::arrow::ExtensionType&>(*origin_type);
-    if (inferred_type->id() != ::arrow::Type::EXTENSION &&
-        ex_type.extension_name() == std::string("arrow.json") &&
-        ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
-            inferred_type->id())) {
-      // Schema mismatch.
-      //
-      // Arrow extensions are DISABLED in Parquet.
-      // origin_type is ::arrow::extension::json()
-      // inferred_type is ::arrow::utf8()
-      //
-      // Origin type is restored as Arrow should be considered the source of 
truth.
-      inferred->field = inferred->field->WithType(origin_type);
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred));
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("arrow.json")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is arrow::extension::json(...)
-      // inferred_type is arrow::extension::json(arrow::utf8())
-      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
-      inferred->field = inferred->field->WithType(origin_type);
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("parquet.variant")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is parquet::arrow::variant(...)
-      // inferred_type is
-      // parquet::arrow::variant(struct(arrow::binary(),arrow::binary()))
-      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
+    const auto& origin_extension_type =
+        checked_cast<const ::arrow::ExtensionType&>(*origin_type);
+    std::string origin_extension_name = origin_extension_type.extension_name();
+
+    // Whether or not the inferred type is also an extension type. This can 
occur because
+    // arrow_extensions_enabled is true in the ArrowReaderProperties or because
+    // the field contained an ARROW:extension:name metadata field and that 
extension name
+    // is registered.
+    bool arrow_extension_inferred = inferred_type->id() == 
::arrow::Type::EXTENSION;
+
+    // Check if the inferred storage type is compatible with the extension type
+    // we're about to apply. We assume that if an extension type was inferred
+    // that it was constructed with a valid storage type.
+    bool extension_supports_inferred_storage;
+    if (origin_extension_name == "arrow.json") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
+              inferred_type->id());
+    } else if (origin_extension_name == "arrow.uuid") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          (inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY &&
+           checked_cast<const ::arrow::FixedSizeBinaryType&>(*inferred_type)
+                   .byte_width() == 16);
+    } else if (origin_extension_name == "parquet.variant") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          VariantExtensionType::IsSupportedStorageType(inferred_type);
+    } else if (arrow_extension_inferred) {
+      extension_supports_inferred_storage = 
origin_extension_type.Equals(*inferred_type);
+    } else {
+      extension_supports_inferred_storage =
+          origin_extension_type.storage_type()->Equals(*inferred_type);
+    }
+
+    if (arrow_extension_inferred || extension_supports_inferred_storage) {
+      // i.e., arrow_extensions_enabled is true or arrow_extensions_enabled is 
false but
+      // we still restore the extension type because Arrow is the source of 
truth if we
+      // are asked to apply the original metadata
+      auto origin_storage_field =
+          origin_field.WithType(origin_extension_type.storage_type());
       RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
       inferred->field = inferred->field->WithType(origin_type);
     } else {
-      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
+      // If we are here, we still *might* be able to restore the extension type
+      // if we first apply metadata to children (e.g., if the extension type
+      // is nested and its children are extension type).
+      auto origin_storage_field =
+          origin_field.WithType(origin_extension_type.storage_type());
       RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
-
-      // Restore extension type, if the storage type is the same as inferred
-      // from the Parquet type
-      if (ex_type.storage_type()->Equals(*inferred->field->type())) {
+      if 
(origin_extension_type.storage_type()->Equals(*inferred->field->type())) {

Review Comment:
   ```suggestion
         if (origin_extension_type.storage_type()->Equals(*inferred_type)) {
   ```



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -1027,6 +1028,68 @@ TEST_F(TestConvertParquetSchema, 
ParquetSchemaArrowExtensions) {
   }
 }
 
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make("uuid", Repetition::OPTIONAL,
+                                               LogicalType::UUID(),
+                                               
ParquetType::FIXED_LEN_BYTE_ARRAY, 16));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, field should be treated as fixed_size_binary(16) in Arrow.
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("uuid", 
::arrow::fixed_size_binary(16), true)});

Review Comment:
   ```suggestion
           ::arrow::schema({::arrow::field("uuid", 
::arrow::fixed_size_binary(16))});
   ```



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -1027,6 +1028,68 @@ TEST_F(TestConvertParquetSchema, 
ParquetSchemaArrowExtensions) {
   }
 }
 
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make("uuid", Repetition::OPTIONAL,
+                                               LogicalType::UUID(),
+                                               
ParquetType::FIXED_LEN_BYTE_ARRAY, 16));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, field should be treated as fixed_size_binary(16) in Arrow.
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("uuid", 
::arrow::fixed_size_binary(16), true)});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // If Arrow extensions are enabled, field will be interpreted as uuid()
+    // extension field.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(true);
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("uuid", ::arrow::extension::uuid(), 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file contains Arrow schema.
+    // uuid will be interpreted as uuid() field even though extensions are not 
enabled.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(false);
+    std::shared_ptr<KeyValueMetadata> field_metadata =
+        ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"});
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("uuid", ::arrow::extension::uuid(), true, 
field_metadata)});

Review Comment:
   ```suggestion
           {::arrow::field("uuid", ::arrow::extension::uuid(), 
/*nullable=*/true, field_metadata)});
   ```



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -1055,57 +1061,62 @@ Result<bool> ApplyOriginalMetadata(const Field& 
origin_field, SchemaField* infer
   const auto& inferred_type = inferred->field->type();
 
   if (origin_type->id() == ::arrow::Type::EXTENSION) {
-    const auto& ex_type = checked_cast<const 
::arrow::ExtensionType&>(*origin_type);
-    if (inferred_type->id() != ::arrow::Type::EXTENSION &&
-        ex_type.extension_name() == std::string("arrow.json") &&
-        ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
-            inferred_type->id())) {
-      // Schema mismatch.
-      //
-      // Arrow extensions are DISABLED in Parquet.
-      // origin_type is ::arrow::extension::json()
-      // inferred_type is ::arrow::utf8()
-      //
-      // Origin type is restored as Arrow should be considered the source of 
truth.
-      inferred->field = inferred->field->WithType(origin_type);
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred));
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("arrow.json")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is arrow::extension::json(...)
-      // inferred_type is arrow::extension::json(arrow::utf8())
-      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
-      inferred->field = inferred->field->WithType(origin_type);
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("parquet.variant")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is parquet::arrow::variant(...)
-      // inferred_type is
-      // parquet::arrow::variant(struct(arrow::binary(),arrow::binary()))
-      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
+    const auto& origin_extension_type =
+        checked_cast<const ::arrow::ExtensionType&>(*origin_type);
+    std::string origin_extension_name = origin_extension_type.extension_name();
+
+    // Whether or not the inferred type is also an extension type. This can 
occur because
+    // arrow_extensions_enabled is true in the ArrowReaderProperties or because
+    // the field contained an ARROW:extension:name metadata field and that 
extension name
+    // is registered.
+    bool arrow_extension_inferred = inferred_type->id() == 
::arrow::Type::EXTENSION;
+
+    // Check if the inferred storage type is compatible with the extension type
+    // we're about to apply. We assume that if an extension type was inferred
+    // that it was constructed with a valid storage type.
+    bool extension_supports_inferred_storage;
+    if (origin_extension_name == "arrow.json") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
+              inferred_type->id());
+    } else if (origin_extension_name == "arrow.uuid") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          (inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY &&
+           checked_cast<const ::arrow::FixedSizeBinaryType&>(*inferred_type)
+                   .byte_width() == 16);
+    } else if (origin_extension_name == "parquet.variant") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          VariantExtensionType::IsSupportedStorageType(inferred_type);
+    } else if (arrow_extension_inferred) {
+      extension_supports_inferred_storage = 
origin_extension_type.Equals(*inferred_type);

Review Comment:
   Why do we need this branch? It will for sure fall into the if statement at 
line 1100.



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -1055,57 +1061,62 @@ Result<bool> ApplyOriginalMetadata(const Field& 
origin_field, SchemaField* infer
   const auto& inferred_type = inferred->field->type();
 
   if (origin_type->id() == ::arrow::Type::EXTENSION) {
-    const auto& ex_type = checked_cast<const 
::arrow::ExtensionType&>(*origin_type);
-    if (inferred_type->id() != ::arrow::Type::EXTENSION &&
-        ex_type.extension_name() == std::string("arrow.json") &&
-        ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
-            inferred_type->id())) {
-      // Schema mismatch.
-      //
-      // Arrow extensions are DISABLED in Parquet.
-      // origin_type is ::arrow::extension::json()
-      // inferred_type is ::arrow::utf8()
-      //
-      // Origin type is restored as Arrow should be considered the source of 
truth.
-      inferred->field = inferred->field->WithType(origin_type);
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred));
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("arrow.json")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is arrow::extension::json(...)
-      // inferred_type is arrow::extension::json(arrow::utf8())
-      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
-      inferred->field = inferred->field->WithType(origin_type);
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("parquet.variant")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is parquet::arrow::variant(...)
-      // inferred_type is
-      // parquet::arrow::variant(struct(arrow::binary(),arrow::binary()))
-      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
+    const auto& origin_extension_type =
+        checked_cast<const ::arrow::ExtensionType&>(*origin_type);
+    std::string origin_extension_name = origin_extension_type.extension_name();
+
+    // Whether or not the inferred type is also an extension type. This can 
occur because
+    // arrow_extensions_enabled is true in the ArrowReaderProperties or because
+    // the field contained an ARROW:extension:name metadata field and that 
extension name
+    // is registered.
+    bool arrow_extension_inferred = inferred_type->id() == 
::arrow::Type::EXTENSION;
+
+    // Check if the inferred storage type is compatible with the extension type
+    // we're about to apply. We assume that if an extension type was inferred
+    // that it was constructed with a valid storage type.
+    bool extension_supports_inferred_storage;
+    if (origin_extension_name == "arrow.json") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
+              inferred_type->id());
+    } else if (origin_extension_name == "arrow.uuid") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          (inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY &&
+           checked_cast<const ::arrow::FixedSizeBinaryType&>(*inferred_type)
+                   .byte_width() == 16);
+    } else if (origin_extension_name == "parquet.variant") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          VariantExtensionType::IsSupportedStorageType(inferred_type);
+    } else if (arrow_extension_inferred) {
+      extension_supports_inferred_storage = 
origin_extension_type.Equals(*inferred_type);
+    } else {
+      extension_supports_inferred_storage =
+          origin_extension_type.storage_type()->Equals(*inferred_type);
+    }
+
+    if (arrow_extension_inferred || extension_supports_inferred_storage) {
+      // i.e., arrow_extensions_enabled is true or arrow_extensions_enabled is 
false but
+      // we still restore the extension type because Arrow is the source of 
truth if we
+      // are asked to apply the original metadata
+      auto origin_storage_field =

Review Comment:
   ```
         auto origin_storage_field =
             origin_field.WithType(origin_extension_type.storage_type());
         RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
   ```
   
   These lines are the same in the branches. Should we move them out?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to