qzyu999 commented on code in PR #50121:
URL: https://github.com/apache/arrow/pull/50121#discussion_r3485321197


##########
cpp/src/arrow/extension/variant_internal.h:
##########
@@ -0,0 +1,347 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow::extension::variant_internal {
+
+/// \file variant_internal.h
+/// \brief Utilities for Variant binary encoding/decoding.
+///
+/// Implements parsing logic per the Variant Encoding Spec:
+/// https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
+///
+/// The "internal" in the filename refers to the binary encoding internals
+/// of the Variant type, not the visibility of this header. This header is
+/// installed and provides the public C++ API for working with Variant
+/// binary data (independent of the VariantExtensionType in parquet_variant.h).
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+/// Variant encoding spec version 1; VariantMetadata::version is expected to equal this.
+constexpr uint8_t kVariantVersion = 1;
+
+/// Maximum nesting depth for recursive value decoding.
+/// Prevents stack overflow on deeply nested (possibly malicious) input.
+constexpr int32_t kMaxNestingDepth = 128;
+
+// ---------------------------------------------------------------------------
+// Enumerations
+// ---------------------------------------------------------------------------
+
+/// \brief Basic type codes from bits 0-1 of the value header byte.
+///
+/// See: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
+enum class BasicType : uint8_t {
+  kPrimitive = 0,  // bits 2-7 of the header then carry a PrimitiveType code
+  kShortString = 1,
+  kObject = 2,
+  kArray = 3,  // largest value representable in the 2-bit field (mask 0x03)
+};
+
+/// \brief Primitive type codes from bits 2-7 when basic_type == kPrimitive.
+///
+/// See: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
+enum class PrimitiveType : uint8_t {
+  kNull = 0,
+  kTrue = 1,
+  kFalse = 2,
+  kInt8 = 3,
+  kInt16 = 4,
+  kInt32 = 5,
+  kInt64 = 6,
+  kDouble = 7,
+  kDecimal4 = 8,
+  kDecimal8 = 9,
+  kDecimal16 = 10,
+  kDate = 11,
+  kTimestampMicros = 12,
+  kTimestampMicrosNTZ = 13,
+  kFloat = 14,
+  kBinary = 15,  // variable-length payload (PrimitiveValueSize is documented to return -1)
+  kString = 16,  // variable-length payload (PrimitiveValueSize is documented to return -1)
+  kTimeNTZ = 17,
+  kTimestampNanos = 18,
+  kTimestampNanosNTZ = 19,
+  kUUID = 20,  // highest code named here; the 6-bit header field itself can hold 0-63
+};
+
+// ---------------------------------------------------------------------------
+// Metadata
+// ---------------------------------------------------------------------------
+
+/// \brief Parsed variant metadata (string dictionary).
+///
+/// The metadata buffer contains a header byte followed by a dictionary of
+/// interned strings. String views reference the raw buffer and are valid
+/// only as long as the underlying buffer is alive.
+struct ARROW_EXPORT VariantMetadata {
+  /// Spec version (must be kVariantVersion).
+  uint8_t version = 0;  // the default 0 differs from kVariantVersion (1)
+
+  /// Whether the dictionary strings are sorted lexicographically.
+  bool is_sorted = false;
+
+  /// Number of bytes used for each offset (1, 2, 3, or 4); the default 0 is outside that range.
+  int32_t offset_size = 0;
+
+  /// Dictionary of interned strings. Views into the raw metadata buffer (non-owning).
+  std::vector<std::string_view> strings;
+};
+
+/// \brief Decode a variant metadata buffer.
+///
+/// Parses the header byte and string dictionary from the raw metadata
+/// buffer. The returned VariantMetadata contains string_views that
+/// reference the input buffer directly (zero-copy).
+///
+/// \param[in] data Pointer to the metadata buffer (must not be null)
+/// \param[in] length Length of the metadata buffer in bytes
+/// \return Parsed VariantMetadata on success, Status::Invalid on
+///         malformed input
+///
+/// \note The input buffer must outlive the returned VariantMetadata.
+ARROW_EXPORT Result<VariantMetadata> DecodeMetadata(const uint8_t* data, int64_t length);
+
+// ---------------------------------------------------------------------------
+// Value header utilities
+// ---------------------------------------------------------------------------
+
+/// \brief Extract the basic type from a value header byte.
+///
+/// \param[in] header The first byte of a variant value
+/// \return The BasicType held in bits 0-1 (the two low-order bits) of the header
+inline BasicType GetBasicType(uint8_t header) {
+  return static_cast<BasicType>(header & 0x03);  // 0x03 keeps only bits 0-1
+}
+
+/// \brief Extract the primitive type from a value header byte.
+///
+/// Only valid when GetBasicType(header) == BasicType::kPrimitive.
+/// No range check is done: the 6-bit field can hold codes that have no PrimitiveType enumerator.
+/// \param[in] header The first byte of a variant value
+/// \return The PrimitiveType (bits 2-7)
+inline PrimitiveType GetPrimitiveType(uint8_t header) {
+  return static_cast<PrimitiveType>((header >> 2) & 0x3F);  // drop bits 0-1, keep the 6 above
+}
+
+/// \brief Get the byte size of a primitive value (excluding header).
+/// NOTE(review): the result for codes with no PrimitiveType enumerator is not stated -- confirm.
+/// \param[in] primitive_type The primitive type code
+/// \return Number of bytes for the value payload, or -1 for
+///         variable-length types (Binary, String)
+ARROW_EXPORT int32_t PrimitiveValueSize(PrimitiveType primitive_type);
+
+// ---------------------------------------------------------------------------
+// Value decoding
+// ---------------------------------------------------------------------------
+
+/// \brief Visitor interface for variant value decoding.
+///
+/// Implement this interface to receive callbacks during variant value
+/// traversal. The visitor pattern avoids materializing a tree of objects,
+/// which is important when scanning millions of rows.
+///
+/// All methods return Status::OK() to continue traversal, or any error
+/// Status to abort.
+///
+/// \note String values passed to String() and FieldName() are raw bytes from
+///       the variant buffer without UTF-8 validation. Per spec, all strings
+///       must be valid UTF-8, but validation is the responsibility of a
+///       higher-level consumer (e.g., when materializing to Arrow StringArray).
+class ARROW_EXPORT VariantVisitor {
+ public:
+  virtual ~VariantVisitor() = default;
+
+  /// @name Primitive value callbacks
+  /// @{
+  virtual Status Null() = 0;
+  virtual Status Bool(bool value) = 0;  // NOTE(review): presumably serves both kTrue and kFalse -- confirm
+  virtual Status Int8(int8_t value) = 0;
+  virtual Status Int16(int16_t value) = 0;
+  virtual Status Int32(int32_t value) = 0;
+  virtual Status Int64(int64_t value) = 0;
+  virtual Status Float(float value) = 0;
+  virtual Status Double(double value) = 0;
+  virtual Status Decimal4(const uint8_t* bytes, int32_t scale) = 0;  // NOTE(review): no length passed; presumably 4 bytes -- confirm size and byte order
+  virtual Status Decimal8(const uint8_t* bytes, int32_t scale) = 0;  // NOTE(review): no length passed; presumably 8 bytes -- confirm size and byte order
+  virtual Status Decimal16(const uint8_t* bytes, int32_t scale) = 0;  // NOTE(review): no length passed; presumably 16 bytes -- confirm size and byte order
+  virtual Status Date(int32_t days_since_epoch) = 0;
+  virtual Status TimestampMicros(int64_t micros_since_epoch) = 0;
+  virtual Status TimestampMicrosNTZ(int64_t micros_since_epoch) = 0;
+  virtual Status String(std::string_view value) = 0;
+  virtual Status Binary(std::string_view value) = 0;
+  virtual Status TimeNTZ(int64_t micros_since_midnight) = 0;
+  virtual Status TimestampNanos(int64_t nanos_since_epoch) = 0;
+  virtual Status TimestampNanosNTZ(int64_t nanos_since_epoch) = 0;
+  virtual Status UUID(const uint8_t* bytes) = 0;  // NOTE(review): no length passed; presumably 16 bytes -- confirm
+  /// @}
+
+  /// @name Container callbacks
+  /// @{
+
+  /// \brief Called at the start of an object with the number of fields.
+  virtual Status StartObject(int32_t num_fields) = 0;
+
+  /// \brief Called for each object field name, before the field value.
+  virtual Status FieldName(std::string_view name) = 0;
+
+  /// \brief Called after all fields of an object have been visited.
+  virtual Status EndObject() = 0;
+
+  /// \brief Called at the start of an array with the number of elements.
+  virtual Status StartArray(int32_t num_elements) = 0;
+
+  /// \brief Called after all elements of an array have been visited.
+  virtual Status EndArray() = 0;
+  /// @}
+};
+
+/// \brief Decode a variant value buffer using a visitor.
+///
+/// Recursively traverses the variant value, calling the appropriate
+/// visitor methods for each element. Objects and arrays trigger
+/// Start/End pairs with nested visits for their contents.
+///
+/// \param[in] metadata Parsed metadata (for resolving string dictionary)
+/// \param[in] data Pointer to the value buffer
+/// \param[in] length Length of the value buffer in bytes
+/// \param[in] visitor Callback interface for decoded values
+/// \return Status::OK on success, Status::Invalid on malformed input
+///
+/// \note The data buffer must remain valid for the duration of the call.
+ARROW_EXPORT Status DecodeVariantValue(const VariantMetadata& metadata,

Review Comment:
   Implemented in #50232 (the shredding PR in this stack). 
`ReconstructVariantColumn()` handles the "unshredding" path, reassembling typed 
Parquet columns back into variant binary.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to