This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new a82edf90ce GH-49081: [C++][Parquet] Correct variant's extension name
(#49082)
a82edf90ce is described below
commit a82edf90ce66eb9a9a9e3bbac514e5d51f531c1f
Author: Zehua Zou <[email protected]>
AuthorDate: Mon Feb 9 11:09:04 2026 +0800
GH-49081: [C++][Parquet] Correct variant's extension name (#49082)
### Rationale for this change
Correct variant extension according to arrow's specification.
### What changes are included in this PR?
Modified variant's hardcoded extension name.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* GitHub Issue: #49081
Authored-by: Zehua Zou <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
cpp/src/arrow/CMakeLists.txt | 1 +
.../extension/parquet_variant.cc} | 30 +++++----------
.../extension/parquet_variant.h} | 43 ++++++++++------------
cpp/src/parquet/CMakeLists.txt | 1 -
cpp/src/parquet/arrow/arrow_schema_test.cc | 4 +-
cpp/src/parquet/arrow/schema.cc | 23 +++++++-----
cpp/src/parquet/arrow/variant_test.cc | 21 ++++++-----
7 files changed, 57 insertions(+), 66 deletions(-)
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index d9f04a627b..6e9d76a61e 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -366,6 +366,7 @@ set(ARROW_SRCS
extension_type.cc
extension/bool8.cc
extension/json.cc
+ extension/parquet_variant.cc
extension/uuid.cc
pretty_print.cc
record_batch.cc
diff --git a/cpp/src/parquet/arrow/variant_internal.cc
b/cpp/src/arrow/extension/parquet_variant.cc
similarity index 84%
rename from cpp/src/parquet/arrow/variant_internal.cc
rename to cpp/src/arrow/extension/parquet_variant.cc
index 87f88efaac..95aa5a0eb6 100644
--- a/cpp/src/parquet/arrow/variant_internal.cc
+++ b/cpp/src/arrow/extension/parquet_variant.cc
@@ -15,28 +15,19 @@
// specific language governing permissions and limitations
// under the License.
-#include "parquet/arrow/variant_internal.h"
+#include "arrow/extension/parquet_variant.h"
#include <string>
#include "arrow/extension_type.h"
#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/type_fwd.h"
#include "arrow/util/logging_internal.h"
-namespace parquet::arrow {
+namespace arrow::extension {
-using ::arrow::Array;
-using ::arrow::ArrayData;
-using ::arrow::DataType;
-using ::arrow::ExtensionType;
-using ::arrow::Result;
-using ::arrow::Type;
-
-VariantExtensionType::VariantExtensionType(
- const std::shared_ptr<::arrow::DataType>& storage_type)
- : ::arrow::ExtensionType(storage_type) {
+VariantExtensionType::VariantExtensionType(const std::shared_ptr<DataType>&
storage_type)
+ : ExtensionType(storage_type) {
// GH-45948: Shredded variants will need to handle an optional
shredded_value as
// well as value_ becoming optional.
@@ -66,14 +57,13 @@ std::string VariantExtensionType::Serialize() const {
return ""; }
std::shared_ptr<Array> VariantExtensionType::MakeArray(
std::shared_ptr<ArrayData> data) const {
DCHECK_EQ(data->type->id(), Type::EXTENSION);
- DCHECK_EQ("parquet.variant",
- ::arrow::internal::checked_cast<const ExtensionType&>(*data->type)
- .extension_name());
+ DCHECK_EQ("arrow.parquet.variant",
+ internal::checked_cast<const
ExtensionType&>(*data->type).extension_name());
return std::make_shared<VariantArray>(data);
}
namespace {
-bool IsBinaryField(const std::shared_ptr<::arrow::Field> field) {
+bool IsBinaryField(const std::shared_ptr<Field> field) {
return field->type()->storage_id() == Type::BINARY ||
field->type()->storage_id() == Type::LARGE_BINARY;
}
@@ -116,8 +106,8 @@ bool VariantExtensionType::IsSupportedStorageType(
Result<std::shared_ptr<DataType>> VariantExtensionType::Make(
std::shared_ptr<DataType> storage_type) {
if (!IsSupportedStorageType(storage_type)) {
- return ::arrow::Status::Invalid("Invalid storage type for
VariantExtensionType: ",
- storage_type->ToString());
+ return Status::Invalid("Invalid storage type for VariantExtensionType: ",
+ storage_type->ToString());
}
return std::make_shared<VariantExtensionType>(std::move(storage_type));
@@ -130,4 +120,4 @@ std::shared_ptr<DataType> variant(std::shared_ptr<DataType>
storage_type) {
return VariantExtensionType::Make(std::move(storage_type)).ValueOrDie();
}
-} // namespace parquet::arrow
+} // namespace arrow::extension
diff --git a/cpp/src/parquet/arrow/variant_internal.h
b/cpp/src/arrow/extension/parquet_variant.h
similarity index 56%
rename from cpp/src/parquet/arrow/variant_internal.h
rename to cpp/src/arrow/extension/parquet_variant.h
index d0b77c72c6..be90923f14 100644
--- a/cpp/src/parquet/arrow/variant_internal.h
+++ b/cpp/src/arrow/extension/parquet_variant.h
@@ -17,17 +17,16 @@
#pragma once
-#include <stdexcept>
#include <string>
#include "arrow/extension_type.h"
-#include "parquet/platform.h"
+#include "arrow/util/visibility.h"
-namespace parquet::arrow {
+namespace arrow::extension {
-class PARQUET_EXPORT VariantArray : public ::arrow::ExtensionArray {
+class ARROW_EXPORT VariantArray : public ExtensionArray {
public:
- using ::arrow::ExtensionArray::ExtensionArray;
+ using ExtensionArray::ExtensionArray;
};
/// EXPERIMENTAL: Variant is not yet fully supported.
@@ -46,41 +45,37 @@ class PARQUET_EXPORT VariantArray : public
::arrow::ExtensionArray {
///
/// To read more about variant shredding, see the variant shredding spec at
/// https://github.com/apache/parquet-format/blob/master/VariantShredding.md
-class PARQUET_EXPORT VariantExtensionType : public ::arrow::ExtensionType {
+class ARROW_EXPORT VariantExtensionType : public ExtensionType {
public:
- explicit VariantExtensionType(const std::shared_ptr<::arrow::DataType>&
storage_type);
+ explicit VariantExtensionType(const std::shared_ptr<DataType>& storage_type);
- std::string extension_name() const override { return "parquet.variant"; }
+ std::string extension_name() const override { return
"arrow.parquet.variant"; }
- bool ExtensionEquals(const ::arrow::ExtensionType& other) const override;
+ bool ExtensionEquals(const ExtensionType& other) const override;
- ::arrow::Result<std::shared_ptr<::arrow::DataType>> Deserialize(
- std::shared_ptr<::arrow::DataType> storage_type,
+ Result<std::shared_ptr<DataType>> Deserialize(
+ std::shared_ptr<DataType> storage_type,
const std::string& serialized_data) const override;
std::string Serialize() const override;
- std::shared_ptr<::arrow::Array> MakeArray(
- std::shared_ptr<::arrow::ArrayData> data) const override;
+ std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const
override;
- static ::arrow::Result<std::shared_ptr<::arrow::DataType>> Make(
- std::shared_ptr<::arrow::DataType> storage_type);
+ static Result<std::shared_ptr<DataType>> Make(std::shared_ptr<DataType>
storage_type);
- static bool IsSupportedStorageType(
- const std::shared_ptr<::arrow::DataType>& storage_type);
+ static bool IsSupportedStorageType(const std::shared_ptr<DataType>&
storage_type);
- std::shared_ptr<::arrow::Field> metadata() const { return metadata_; }
+ std::shared_ptr<Field> metadata() const { return metadata_; }
- std::shared_ptr<::arrow::Field> value() const { return value_; }
+ std::shared_ptr<Field> value() const { return value_; }
private:
// TODO GH-45948 added shredded_value
- std::shared_ptr<::arrow::Field> metadata_;
- std::shared_ptr<::arrow::Field> value_;
+ std::shared_ptr<Field> metadata_;
+ std::shared_ptr<Field> value_;
};
/// \brief Return a VariantExtensionType instance.
-PARQUET_EXPORT std::shared_ptr<::arrow::DataType> variant(
- std::shared_ptr<::arrow::DataType> storage_type);
+ARROW_EXPORT std::shared_ptr<DataType> variant(std::shared_ptr<DataType>
storage_type);
-} // namespace parquet::arrow
+} // namespace arrow::extension
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index feeb1805f6..6c1550dcc2 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -157,7 +157,6 @@ set(PARQUET_SRCS
arrow/reader_internal.cc
arrow/schema.cc
arrow/schema_internal.cc
- arrow/variant_internal.cc
arrow/writer.cc
bloom_filter.cc
bloom_filter_reader.cc
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc
b/cpp/src/parquet/arrow/arrow_schema_test.cc
index 73ce8ea69e..f930d3d7bd 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -25,7 +25,6 @@
#include "parquet/arrow/reader.h"
#include "parquet/arrow/reader_internal.h"
#include "parquet/arrow/schema.h"
-#include "parquet/arrow/variant_internal.h"
#include "parquet/file_reader.h"
#include "parquet/schema.h"
#include "parquet/schema_internal.h"
@@ -34,6 +33,7 @@
#include "arrow/array.h"
#include "arrow/extension/json.h"
+#include "arrow/extension/parquet_variant.h"
#include "arrow/extension/uuid.h"
#include "arrow/ipc/writer.h"
#include "arrow/testing/extension_type.h"
@@ -950,7 +950,7 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) {
auto arrow_metadata = ::arrow::field("metadata", ::arrow::binary(),
/*nullable=*/false);
auto arrow_value = ::arrow::field("value", ::arrow::binary(),
/*nullable=*/false);
auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value});
- auto variant_extension =
std::make_shared<VariantExtensionType>(arrow_variant);
+ auto variant_extension = ::arrow::extension::variant(arrow_variant);
{
// Parquet file does not contain Arrow schema.
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index 266215a810..9c0db1d533 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -22,6 +22,7 @@
#include <vector>
#include "arrow/extension/json.h"
+#include "arrow/extension/parquet_variant.h"
#include "arrow/extension/uuid.h"
#include "arrow/extension_type.h"
#include "arrow/io/memory.h"
@@ -36,7 +37,6 @@
#include "arrow/util/value_parsing.h"
#include "parquet/arrow/schema_internal.h"
-#include "parquet/arrow/variant_internal.h"
#include "parquet/exception.h"
#include "parquet/geospatial/util_json_internal.h"
#include "parquet/metadata.h"
@@ -129,10 +129,11 @@ Status MapToNode(const std::shared_ptr<::arrow::MapType>&
type, const std::strin
return Status::OK();
}
-Status VariantToNode(const std::shared_ptr<VariantExtensionType>& type,
- const std::string& name, bool nullable, int field_id,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr*
out) {
+Status VariantToNode(
+ const std::shared_ptr<::arrow::extension::VariantExtensionType>& type,
+ const std::string& name, bool nullable, int field_id,
+ const WriterProperties& properties, const ArrowWriterProperties&
arrow_properties,
+ NodePtr* out) {
NodePtr metadata_node;
RETURN_NOT_OK(FieldToNode("metadata", type->metadata(), properties,
arrow_properties,
&metadata_node));
@@ -485,8 +486,10 @@ Status FieldToNode(const std::string& name, const
std::shared_ptr<Field>& field,
ARROW_ASSIGN_OR_RAISE(logical_type,
LogicalTypeFromGeoArrowMetadata(ext_type->Serialize()));
break;
- } else if (ext_type->extension_name() == std::string("parquet.variant"))
{
- auto variant_type =
std::static_pointer_cast<VariantExtensionType>(field->type());
+ } else if (ext_type->extension_name() ==
std::string("arrow.parquet.variant")) {
+ auto variant_type =
+ std::static_pointer_cast<::arrow::extension::VariantExtensionType>(
+ field->type());
return VariantToNode(variant_type, name, field->nullable(), field_id,
properties,
arrow_properties, out);
@@ -597,7 +600,7 @@ Status GroupToStruct(const GroupNode& node, LevelInfo
current_levels,
auto struct_type = ::arrow::struct_(arrow_fields);
if (ctx->properties.get_arrow_extensions_enabled() &&
node.logical_type()->is_variant()) {
- auto extension_type = ::arrow::GetExtensionType("parquet.variant");
+ auto extension_type = ::arrow::GetExtensionType("arrow.parquet.variant");
if (extension_type) {
ARROW_ASSIGN_OR_RAISE(
struct_type,
@@ -1147,10 +1150,10 @@ Result<bool> ApplyOriginalMetadata(const Field&
origin_field, SchemaField* infer
extension_supports_inferred_storage =
arrow_extension_inferred ||
::arrow::extension::UuidType::IsSupportedStorageType(inferred_type);
- } else if (origin_extension_name == "parquet.variant") {
+ } else if (origin_extension_name == "arrow.parquet.variant") {
extension_supports_inferred_storage =
arrow_extension_inferred ||
- VariantExtensionType::IsSupportedStorageType(inferred_type);
+
::arrow::extension::VariantExtensionType::IsSupportedStorageType(inferred_type);
} else {
extension_supports_inferred_storage =
origin_extension_type.storage_type()->Equals(*inferred_type);
diff --git a/cpp/src/parquet/arrow/variant_test.cc
b/cpp/src/parquet/arrow/variant_test.cc
index caf63d8e3d..04f46d2e44 100644
--- a/cpp/src/parquet/arrow/variant_test.cc
+++ b/cpp/src/parquet/arrow/variant_test.cc
@@ -15,9 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-#include "parquet/arrow/variant_internal.h"
-
#include "arrow/array/validate.h"
+#include "arrow/extension/parquet_variant.h"
#include "arrow/ipc/test_common.h"
#include "arrow/record_batch.h"
#include "arrow/testing/gtest_util.h"
@@ -29,16 +28,20 @@ using ::arrow::binary;
using ::arrow::struct_;
TEST(TestVariantExtensionType, StorageTypeValidation) {
- auto variant1 = variant(struct_({field("metadata", binary(),
/*nullable=*/false),
- field("value", binary(),
/*nullable=*/false)}));
- auto variant2 = variant(struct_({field("metadata", binary(),
/*nullable=*/false),
- field("value", binary(),
/*nullable=*/false)}));
+ auto variant1 = ::arrow::extension::variant(
+ struct_({field("metadata", binary(), /*nullable=*/false),
+ field("value", binary(), /*nullable=*/false)}));
+ auto variant2 = ::arrow::extension::variant(
+ struct_({field("metadata", binary(), /*nullable=*/false),
+ field("value", binary(), /*nullable=*/false)}));
ASSERT_TRUE(variant1->Equals(variant2));
// Metadata and value fields can be provided in either order
- auto variantFieldsFlipped = std::dynamic_pointer_cast<VariantExtensionType>(
- variant(struct_({field("value", binary(), /*nullable=*/false),
+ auto variantFieldsFlipped =
+ std::dynamic_pointer_cast<::arrow::extension::VariantExtensionType>(
+ ::arrow::extension::variant(
+ struct_({field("value", binary(), /*nullable=*/false),
field("metadata", binary(), /*nullable=*/false)})));
ASSERT_EQ("metadata", variantFieldsFlipped->metadata()->name());
@@ -62,7 +65,7 @@ TEST(TestVariantExtensionType, StorageTypeValidation) {
Invalid,
"Invalid: Invalid storage type for VariantExtensionType: " +
storage_type->ToString(),
- VariantExtensionType::Make(storage_type));
+ ::arrow::extension::VariantExtensionType::Make(storage_type));
}
}