This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 5f616db149 GH-47838: [C++][Parquet] Set Variant specification version 
to 1 to align with the variant spec (#47835)
5f616db149 is described below

commit 5f616db1491478dc85cc62eb0771730224479750
Author: Aihua Xu <[email protected]>
AuthorDate: Mon Oct 20 04:55:30 2025 -0700

    GH-47838: [C++][Parquet] Set Variant specification version to 1 to align 
with the variant spec (#47835)
    
    ### Rationale for this change
    According to the [Variant 
specification](https://github.com/apache/parquet-format/blob/master/VariantEncoding.md),
 the specification_version field must be set to 1 to indicate Variant encoding 
version 1. Currently, this field defaults to 0, which violates the 
specification. Parquet readers that strictly enforce specification version 
validation will fail to read files containing Variant types.
    <img width="624" height="185" alt="image" 
src="https://github.com/user-attachments/assets/b0f1deb9-0301-4b94-a472-17fd9cc0df5d";
 />
    
    ### What changes are included in this PR?
    The change includes defaulting the specification version to 1.
    ### Are these changes tested?
    The change is covered by unit test.
    ### Are there any user-facing changes?
    The Parquet files produced the variant logical type annotation `VARIANT(1)`.
    
    ```
    Schema:
    message schema {
      optional group V (VARIANT(1)) = 1 {
        required binary metadata;
        required binary value;
      }
    }
    ```
    
    * GitHub Issue: #47838
    
    Lead-authored-by: Aihua <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/schema_test.cc | 34 ++++++++++++++++++++++++-
 cpp/src/parquet/types.cc       | 58 ++++++++++++++++++++++++++++++++++++------
 cpp/src/parquet/types.h        | 11 ++++++--
 3 files changed, 92 insertions(+), 11 deletions(-)

diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc
index c33e5ccf4a..2950a7df70 100644
--- a/cpp/src/parquet/schema_test.cc
+++ b/cpp/src/parquet/schema_test.cc
@@ -1580,7 +1580,8 @@ TEST(TestLogicalTypeOperation, LogicalTypeRepresentation) 
{
                               LogicalType::EdgeInterpolationAlgorithm::KARNEY),
        "Geography(crs=srid:1234, algorithm=karney)",
        R"({"Type": "Geography", "crs": "srid:1234", "algorithm": "karney"})"},
-      {LogicalType::Variant(), "Variant", R"({"Type": "Variant"})"},
+      {LogicalType::Variant(), "Variant(1)", R"({"Type": "Variant", 
"SpecVersion": 1})"},
+      {LogicalType::Variant(2), "Variant(2)", R"({"Type": "Variant", 
"SpecVersion": 2})"},
       {LogicalType::None(), "None", R"({"Type": "None"})"},
   };
 
@@ -2353,6 +2354,37 @@ TEST(TestLogicalTypeSerialization, Roundtrips) {
   // Group nodes ...
   ConfirmGroupNodeRoundtrip("map", LogicalType::Map());
   ConfirmGroupNodeRoundtrip("list", LogicalType::List());
+  ConfirmGroupNodeRoundtrip("variant", LogicalType::Variant());
+}
+
+TEST(TestLogicalTypeSerialization, VariantSpecificationVersion) {
+  // Confirm that Variant logical type sets specification_version to expected 
value in
+  // thrift serialization
+  constexpr int8_t spec_version = 2;
+  auto metadata = PrimitiveNode::Make("metadata", Repetition::REQUIRED, 
Type::BYTE_ARRAY);
+  auto value = PrimitiveNode::Make("value", Repetition::REQUIRED, 
Type::BYTE_ARRAY);
+  NodePtr variant_node =
+      GroupNode::Make("variant", Repetition::REQUIRED, {metadata, value},
+                      LogicalType::Variant(spec_version));
+
+  // Verify variant logical type
+  auto logical_type = variant_node->logical_type();
+  ASSERT_TRUE(logical_type->is_variant());
+  const auto& variant_type = checked_cast<const 
VariantLogicalType&>(*logical_type);
+  ASSERT_EQ(variant_type.spec_version(), spec_version);
+
+  // Verify thrift serialization
+  std::vector<format::SchemaElement> elements;
+  ToParquet(reinterpret_cast<GroupNode*>(variant_node.get()), &elements);
+
+  // Verify that logicalType is set and is VARIANT
+  ASSERT_EQ(elements[0].name, "variant");
+  ASSERT_TRUE(elements[0].__isset.logicalType);
+  ASSERT_TRUE(elements[0].logicalType.__isset.VARIANT);
+
+  // Verify that specification_version is set properly
+  ASSERT_TRUE(elements[0].logicalType.VARIANT.__isset.specification_version);
+  ASSERT_EQ(elements[0].logicalType.VARIANT.specification_version, 
spec_version);
 }
 
 }  // namespace schema
diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index 7109c3d1c8..fb4eb92a75 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -596,7 +596,12 @@ std::shared_ptr<const LogicalType> LogicalType::FromThrift(
 
     return GeographyLogicalType::Make(std::move(crs), algorithm);
   } else if (type.__isset.VARIANT) {
-    return VariantLogicalType::Make();
+    int8_t spec_version = kVariantSpecVersion;
+    if (type.VARIANT.__isset.specification_version) {
+      spec_version = type.VARIANT.specification_version;
+    }
+
+    return VariantLogicalType::Make(spec_version);
   } else {
     // Sentinel type for one we do not recognize
     return UndefinedLogicalType::Make();
@@ -664,8 +669,8 @@ std::shared_ptr<const LogicalType> LogicalType::Geography(
   return GeographyLogicalType::Make(std::move(crs), algorithm);
 }
 
-std::shared_ptr<const LogicalType> LogicalType::Variant() {
-  return VariantLogicalType::Make();
+std::shared_ptr<const LogicalType> LogicalType::Variant(int8_t spec_version) {
+  return VariantLogicalType::Make(spec_version);
 }
 
 std::shared_ptr<const LogicalType> LogicalType::None() { return 
NoLogicalType::Make(); }
@@ -1963,16 +1968,53 @@ class LogicalType::Impl::Variant final : public 
LogicalType::Impl::Incompatible,
  public:
   friend class VariantLogicalType;
 
-  OVERRIDE_TOSTRING(Variant)
-  OVERRIDE_TOTHRIFT(VariantType, VARIANT)
+  std::string ToString() const override;
+  std::string ToJSON() const override;
+  format::LogicalType ToThrift() const override;
+
+  int8_t spec_version() const { return spec_version_; }
 
  private:
-  Variant()
+  explicit Variant(const int8_t spec_version)
       : LogicalType::Impl(LogicalType::Type::VARIANT, SortOrder::UNKNOWN),
-        LogicalType::Impl::Inapplicable() {}
+        LogicalType::Impl::Inapplicable() {
+    this->spec_version_ = spec_version;
+  }
+
+  int8_t spec_version_;
 };
 
-GENERATE_MAKE(Variant)
+int8_t VariantLogicalType::spec_version() const {
+  return (dynamic_cast<const 
LogicalType::Impl::Variant&>(*impl_)).spec_version();
+}
+
+std::string LogicalType::Impl::Variant::ToString() const {
+  std::stringstream type;
+  type << "Variant(" << static_cast<int>(spec_version_) << ")";
+  return type.str();
+}
+
+std::string LogicalType::Impl::Variant::ToJSON() const {
+  std::stringstream json;
+  json << R"({"Type": "Variant", "SpecVersion": )" << 
static_cast<int>(spec_version_)
+       << "}";
+
+  return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Variant::ToThrift() const {
+  format::LogicalType type;
+  format::VariantType variant_type;
+  variant_type.__set_specification_version(spec_version_);
+  type.__set_VARIANT(variant_type);
+  return type;
+}
+
+std::shared_ptr<const LogicalType> VariantLogicalType::Make(const int8_t 
spec_version) {
+  auto logical_type = std::shared_ptr<VariantLogicalType>(new 
VariantLogicalType());
+  logical_type->impl_.reset(new LogicalType::Impl::Variant(spec_version));
+  return logical_type;
+}
 
 class LogicalType::Impl::No final : public LogicalType::Impl::SimpleCompatible,
                                     public 
LogicalType::Impl::UniversalApplicable {
diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h
index c2040e555f..7e8a18fc94 100644
--- a/cpp/src/parquet/types.h
+++ b/cpp/src/parquet/types.h
@@ -178,6 +178,9 @@ class PARQUET_EXPORT LogicalType {
     KARNEY = 5
   };
 
+  /// \brief The latest supported Variant specification version by this library
+  static constexpr int8_t kVariantSpecVersion = 1;
+
   /// \brief If possible, return a logical type equivalent to the given legacy
   /// converted type (and decimal metadata if applicable).
   static std::shared_ptr<const LogicalType> FromConvertedType(
@@ -224,7 +227,8 @@ class PARQUET_EXPORT LogicalType {
   static std::shared_ptr<const LogicalType> BSON();
   static std::shared_ptr<const LogicalType> UUID();
   static std::shared_ptr<const LogicalType> Float16();
-  static std::shared_ptr<const LogicalType> Variant();
+  static std::shared_ptr<const LogicalType> Variant(
+      int8_t specVersion = kVariantSpecVersion);
 
   static std::shared_ptr<const LogicalType> Geometry(std::string crs = "");
 
@@ -495,7 +499,10 @@ class PARQUET_EXPORT GeographyLogicalType : public 
LogicalType {
 /// \brief Allowed for group nodes only.
 class PARQUET_EXPORT VariantLogicalType : public LogicalType {
  public:
-  static std::shared_ptr<const LogicalType> Make();
+  static std::shared_ptr<const LogicalType> Make(
+      int8_t specVersion = kVariantSpecVersion);
+
+  int8_t spec_version() const;
 
  private:
   VariantLogicalType() = default;

Reply via email to