wgtmac commented on code in PR #45375:
URL: https://github.com/apache/arrow/pull/45375#discussion_r2012188224


##########
cpp/src/parquet/arrow/variant_test.cc:
##########
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/variant.h"
+
+#include "arrow/array/validate.h"
+#include "arrow/ipc/test_common.h"
+#include "arrow/record_batch.h"
+#include "arrow/testing/gtest_util.h"
+#include "parquet/exception.h"
+
+namespace parquet::arrow {
+
+using ::arrow::binary;
+using ::arrow::struct_;
+
+class TestVariantExtensionType : public ::testing::Test {};
+
+TEST_F(TestVariantExtensionType, StorageTypeValidation) {

Review Comment:
   ```suggestion
   TEST(TestVariantExtensionType, StorageTypeValidation) {
   ```



##########
cpp/src/parquet/arrow/variant.h:
##########
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+#include "arrow/extension_type.h"
+#include "parquet/platform.h"
+
+namespace parquet::arrow {
+
+using ::arrow::Array;

Review Comment:
   We should avoid these `using` statements in the header file.



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -1053,6 +1077,19 @@ Result<bool> ApplyOriginalMetadata(const Field& 
origin_field, SchemaField* infer
       // inferred_type is arrow::extension::json(arrow::utf8())
       auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
 
+      // Apply metadata recursively to storage type
+      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
+      inferred->field = inferred->field->WithType(origin_type);
+    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
+               ex_type.extension_name() == std::string("parquet.variant")) {
+      // Potential schema mismatch.
+      //
+      // Arrow extensions are ENABLED in Parquet.
+      // origin_type is arrow::extension::variant(...)

Review Comment:
   ```suggestion
         // origin_type is parquet::arrow::variant(...)
   ```



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -1053,6 +1077,19 @@ Result<bool> ApplyOriginalMetadata(const Field& 
origin_field, SchemaField* infer
       // inferred_type is arrow::extension::json(arrow::utf8())
       auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
 
+      // Apply metadata recursively to storage type
+      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
+      inferred->field = inferred->field->WithType(origin_type);
+    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
+               ex_type.extension_name() == std::string("parquet.variant")) {
+      // Potential schema mismatch.
+      //
+      // Arrow extensions are ENABLED in Parquet.
+      // origin_type is arrow::extension::variant(...)
+      // inferred_type is
+      // arrow::extension::variant(struct(arrow::binary(),arrow::binary()))

Review Comment:
   ```suggestion
         // parquet::arrow::variant(struct(arrow::binary(),arrow::binary()))
   ```



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -114,6 +115,24 @@ Status MapToNode(const std::shared_ptr<::arrow::MapType>& 
type, const std::strin
   return Status::OK();
 }
 
+Status VariantToNode(const std::shared_ptr<VariantExtensionType>& type,
+                     const std::string& name, bool nullable, int field_id,
+                     const WriterProperties& properties,
+                     const ArrowWriterProperties& arrow_properties, NodePtr* 
out) {
+  NodePtr metadata_node;
+  RETURN_NOT_OK(FieldToNode("metadata", type->metadata_field(), properties,
+                            arrow_properties, &metadata_node));
+
+  NodePtr value_node;
+  RETURN_NOT_OK(FieldToNode("value", type->value_field(), properties, 
arrow_properties,
+                            &value_node));
+
+  NodePtr variant_node = *out =
+      GroupNode::Make("variant", Repetition::OPTIONAL, {metadata_node, 
value_node});

Review Comment:
   ```suggestion
         GroupNode::Make("variant", RepetitionFromNullable(nullable), 
{metadata_node, value_node});
   ```



##########
cpp/src/parquet/CMakeLists.txt:
##########
@@ -399,7 +400,8 @@ add_parquet_test(arrow-test
                  arrow/arrow_metadata_test.cc
                  arrow/arrow_reader_writer_test.cc
                  arrow/arrow_schema_test.cc
-                 arrow/arrow_statistics_test.cc)
+                 arrow/arrow_statistics_test.cc
+                 arrow/variant_test.cc)

Review Comment:
   OK, I got it. You are following json_test.cc and uuid_test.cc for canonical 
extension types.



##########
cpp/src/parquet/arrow/variant.h:
##########
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+#include "arrow/extension_type.h"
+#include "parquet/platform.h"
+
+namespace parquet::arrow {
+
+using ::arrow::Array;
+using ::arrow::ArrayData;
+using ::arrow::DataType;
+using ::arrow::ExtensionType;
+using ::arrow::Result;
+
+class PARQUET_EXPORT VariantExtensionType : public ExtensionType {
+ public:
+  explicit VariantExtensionType(const std::shared_ptr<DataType>& storage_type)
+      : ExtensionType(storage_type), storage_type_(storage_type) {}
+
+  std::string extension_name() const override { return "parquet.variant"; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized_data) const override;
+
+  std::string Serialize() const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const 
override;
+
+  static Result<std::shared_ptr<DataType>> Make(std::shared_ptr<DataType> 
storage_type);
+
+  static bool IsSupportedStorageType(std::shared_ptr<DataType> storage_type);

Review Comment:
   ```suggestion
     static bool IsSupportedStorageType(const std::shared_ptr<DataType>& 
storage_type);
   ```



##########
cpp/src/parquet/arrow/variant.h:
##########
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+#include "arrow/extension_type.h"
+#include "parquet/platform.h"
+
+namespace parquet::arrow {
+
+using ::arrow::Array;
+using ::arrow::ArrayData;
+using ::arrow::DataType;
+using ::arrow::ExtensionType;
+using ::arrow::Result;
+
+class PARQUET_EXPORT VariantExtensionType : public ExtensionType {

Review Comment:
   Please add `/// EXPERIMENTAL: ` to indicate this is an unstable class.



##########
cpp/src/parquet/arrow/variant_test.cc:
##########
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/variant.h"
+
+#include "arrow/array/validate.h"
+#include "arrow/ipc/test_common.h"
+#include "arrow/record_batch.h"
+#include "arrow/testing/gtest_util.h"
+#include "parquet/exception.h"
+
+namespace parquet::arrow {
+
+using ::arrow::binary;
+using ::arrow::struct_;
+
+class TestVariantExtensionType : public ::testing::Test {};
+
+TEST_F(TestVariantExtensionType, StorageTypeValidation) {
+  auto variant1 =
+      variant(struct_({field("metadata", binary()), field("value", 
binary())}));
+  auto variant2 =
+      variant(struct_({field("metadata", binary()), field("value", 
binary())}));
+
+  ASSERT_TRUE(variant1->Equals(variant2));
+
+  auto missingValue = struct_({field("metadata", binary())});
+  auto missingMetadata = struct_({field("value", binary())});
+  auto badValueType =
+      struct_({field("metadata", binary()), field("value", ::arrow::int32())});
+  auto extraField = struct_(
+      {field("metadata", binary()), field("value", binary()), field("extra", 
binary())});

Review Comment:
   ```suggestion
     auto missing_value = struct_({field("metadata", binary())});
     auto missing_metadata = struct_({field("value", binary())});
     auto bad_value_type =
         struct_({field("metadata", binary()), field("value", 
::arrow::int32())});
     auto extra_field = struct_(
         {field("metadata", binary()), field("value", binary()), field("extra", 
binary())});
   ```



##########
cpp/src/parquet/CMakeLists.txt:
##########
@@ -399,7 +400,8 @@ add_parquet_test(arrow-test
                  arrow/arrow_metadata_test.cc
                  arrow/arrow_reader_writer_test.cc
                  arrow/arrow_schema_test.cc
-                 arrow/arrow_statistics_test.cc)
+                 arrow/arrow_statistics_test.cc
+                 arrow/variant_test.cc)

Review Comment:
   Why adding a new file? Is `arrow_schema_test.cc` too long?



##########
cpp/src/parquet/arrow/schema_internal.cc:
##########
@@ -16,6 +16,7 @@
 // under the License.
 
 #include "parquet/arrow/schema_internal.h"
+#include "parquet/arrow/variant.h"

Review Comment:
   Why do we need this?



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -114,6 +115,24 @@ Status MapToNode(const std::shared_ptr<::arrow::MapType>& 
type, const std::strin
   return Status::OK();
 }
 
+Status VariantToNode(const std::shared_ptr<VariantExtensionType>& type,
+                     const std::string& name, bool nullable, int field_id,
+                     const WriterProperties& properties,
+                     const ArrowWriterProperties& arrow_properties, NodePtr* 
out) {
+  NodePtr metadata_node;
+  RETURN_NOT_OK(FieldToNode("metadata", type->metadata_field(), properties,
+                            arrow_properties, &metadata_node));
+
+  NodePtr value_node;
+  RETURN_NOT_OK(FieldToNode("value", type->value_field(), properties, 
arrow_properties,
+                            &value_node));
+
+  NodePtr variant_node = *out =

Review Comment:
   ```suggestion
     *out =
   ```



##########
cpp/src/parquet/arrow/variant.h:
##########
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+#include "arrow/extension_type.h"
+#include "parquet/platform.h"
+
+namespace parquet::arrow {
+
+using ::arrow::Array;
+using ::arrow::ArrayData;
+using ::arrow::DataType;
+using ::arrow::ExtensionType;
+using ::arrow::Result;
+
+class PARQUET_EXPORT VariantExtensionType : public ExtensionType {

Review Comment:
   BTW, how do you want to support sheredded form? A separate extension type or 
reuse this one?



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -114,6 +115,24 @@ Status MapToNode(const std::shared_ptr<::arrow::MapType>& 
type, const std::strin
   return Status::OK();
 }
 
+Status VariantToNode(const std::shared_ptr<VariantExtensionType>& type,
+                     const std::string& name, bool nullable, int field_id,
+                     const WriterProperties& properties,
+                     const ArrowWriterProperties& arrow_properties, NodePtr* 
out) {
+  NodePtr metadata_node;
+  RETURN_NOT_OK(FieldToNode("metadata", type->metadata_field(), properties,
+                            arrow_properties, &metadata_node));
+
+  NodePtr value_node;
+  RETURN_NOT_OK(FieldToNode("value", type->value_field(), properties, 
arrow_properties,
+                            &value_node));
+
+  NodePtr variant_node = *out =
+      GroupNode::Make("variant", Repetition::OPTIONAL, {metadata_node, 
value_node});

Review Comment:
   `field_id` should also be set to this node.



##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -114,6 +115,24 @@ Status MapToNode(const std::shared_ptr<::arrow::MapType>& 
type, const std::strin
   return Status::OK();
 }
 
+Status VariantToNode(const std::shared_ptr<VariantExtensionType>& type,
+                     const std::string& name, bool nullable, int field_id,
+                     const WriterProperties& properties,
+                     const ArrowWriterProperties& arrow_properties, NodePtr* 
out) {
+  NodePtr metadata_node;
+  RETURN_NOT_OK(FieldToNode("metadata", type->metadata_field(), properties,
+                            arrow_properties, &metadata_node));
+
+  NodePtr value_node;
+  RETURN_NOT_OK(FieldToNode("value", type->value_field(), properties, 
arrow_properties,
+                            &value_node));
+
+  NodePtr variant_node = *out =
+      GroupNode::Make("variant", Repetition::OPTIONAL, {metadata_node, 
value_node});

Review Comment:
   It seems that VariantLogicalType should also be set to the group node?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to