misiek1984 commented on code in PR #50122: URL: https://github.com/apache/arrow/pull/50122#discussion_r3441878179
########## cpp/src/arrow/extension/variant_internal.h: ########## @@ -0,0 +1,479 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstdint> +#include <string> +#include <string_view> +#include <unordered_map> +#include <vector> + +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow::extension::variant_internal { + +/// \file variant_internal.h +/// \brief Utilities for Variant binary encoding/decoding. +/// +/// Implements parsing logic per the Variant Encoding Spec: +/// https://github.com/apache/parquet-format/blob/master/VariantEncoding.md +/// +/// The "internal" in the filename refers to the binary encoding internals +/// of the Variant type, not the visibility of this header. This header is +/// installed and provides the public C++ API for working with Variant +/// binary data (independent of the VariantExtensionType in parquet_variant.h). + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +/// Variant encoding spec version 1. 
+constexpr uint8_t kVariantVersion = 1; + +/// Maximum nesting depth for recursive value decoding. +/// Prevents stack overflow on deeply nested (possibly malicious) input. +constexpr int32_t kMaxNestingDepth = 128; + +// --------------------------------------------------------------------------- +// Enumerations +// --------------------------------------------------------------------------- + +/// \brief Basic type codes from bits 0-1 of the value header byte. +/// +/// Variant Encoding Spec §3: "Value encoding" +enum class BasicType : uint8_t { + kPrimitive = 0, + kShortString = 1, + kObject = 2, + kArray = 3, +}; + +/// \brief Primitive type codes from bits 2-7 when basic_type == kPrimitive. +/// +/// Variant Encoding Spec §3.1: "Primitive types" +enum class PrimitiveType : uint8_t { + kNull = 0, + kTrue = 1, + kFalse = 2, + kInt8 = 3, + kInt16 = 4, + kInt32 = 5, + kInt64 = 6, + kDouble = 7, + kDecimal4 = 8, + kDecimal8 = 9, + kDecimal16 = 10, + kDate = 11, + kTimestampMicros = 12, + kTimestampMicrosNTZ = 13, + kFloat = 14, + kBinary = 15, + kString = 16, + kTimeNTZ = 17, + kTimestampNanos = 18, + kTimestampNanosNTZ = 19, + kUUID = 20, +}; + +// --------------------------------------------------------------------------- +// Metadata +// --------------------------------------------------------------------------- + +/// \brief Parsed variant metadata (string dictionary). +/// +/// The metadata buffer contains a header byte followed by a dictionary of +/// interned strings. String views reference the raw buffer and are valid +/// only as long as the underlying buffer is alive. +struct ARROW_EXPORT VariantMetadata { + /// Spec version (must be kVariantVersion). + uint8_t version = 0; + + /// Whether the dictionary strings are sorted lexicographically. + bool is_sorted = false; + + /// Number of bytes used for each offset (1, 2, 3, or 4). + int32_t offset_size = 0; + + /// Dictionary of interned strings. Views into the raw metadata buffer. 
+ std::vector<std::string_view> strings; +}; + +/// \brief Decode a variant metadata buffer. +/// +/// Parses the header byte and string dictionary from the raw metadata +/// buffer. The returned VariantMetadata contains string_views that +/// reference the input buffer directly (zero-copy). +/// +/// \param[in] data Pointer to the metadata buffer (must not be null) +/// \param[in] length Length of the metadata buffer in bytes +/// \return Parsed VariantMetadata on success, Status::Invalid on +/// malformed input +/// +/// \note The input buffer must outlive the returned VariantMetadata. +ARROW_EXPORT Result<VariantMetadata> DecodeMetadata(const uint8_t* data, int64_t length); + +// --------------------------------------------------------------------------- +// Value header utilities +// --------------------------------------------------------------------------- + +/// \brief Extract the basic type from a value header byte. +/// +/// \param[in] header The first byte of a variant value +/// \return The BasicType (bits 0-1) +inline BasicType GetBasicType(uint8_t header) { + return static_cast<BasicType>(header & 0x03); +} + +/// \brief Extract the primitive type from a value header byte. +/// +/// Only valid when GetBasicType(header) == BasicType::kPrimitive. +/// +/// \param[in] header The first byte of a variant value +/// \return The PrimitiveType (bits 2-7) +inline PrimitiveType GetPrimitiveType(uint8_t header) { + return static_cast<PrimitiveType>((header >> 2) & 0x3F); +} + +/// \brief Get the byte size of a primitive value (excluding header). 
+/// +/// \param[in] primitive_type The primitive type code +/// \return Number of bytes for the value payload, or -1 for +/// variable-length types (Binary, String) +ARROW_EXPORT int32_t PrimitiveValueSize(PrimitiveType primitive_type); + +// --------------------------------------------------------------------------- +// Value decoding +// --------------------------------------------------------------------------- + +/// \brief Visitor interface for variant value decoding. +/// +/// Implement this interface to receive callbacks during variant value +/// traversal. The visitor pattern avoids materializing a tree of objects, +/// which is important when scanning millions of rows. +/// +/// All methods return Status::OK() to continue traversal, or any error +/// Status to abort. +/// +/// \note String values passed to String() and FieldName() are raw bytes from +/// the variant buffer without UTF-8 validation. Per spec, all strings +/// must be valid UTF-8, but validation is the responsibility of a +/// higher-level consumer (e.g., when materializing to Arrow StringArray). 
+class ARROW_EXPORT VariantVisitor { + public: + virtual ~VariantVisitor() = default; + + /// @name Primitive value callbacks + /// @{ + virtual Status Null() = 0; + virtual Status Bool(bool value) = 0; + virtual Status Int8(int8_t value) = 0; + virtual Status Int16(int16_t value) = 0; + virtual Status Int32(int32_t value) = 0; + virtual Status Int64(int64_t value) = 0; + virtual Status Float(float value) = 0; + virtual Status Double(double value) = 0; + virtual Status Decimal4(const uint8_t* bytes, int32_t scale) = 0; + virtual Status Decimal8(const uint8_t* bytes, int32_t scale) = 0; + virtual Status Decimal16(const uint8_t* bytes, int32_t scale) = 0; + virtual Status Date(int32_t days_since_epoch) = 0; + virtual Status TimestampMicros(int64_t micros_since_epoch) = 0; + virtual Status TimestampMicrosNTZ(int64_t micros_since_epoch) = 0; + virtual Status String(std::string_view value) = 0; + virtual Status Binary(std::string_view value) = 0; + virtual Status TimeNTZ(int64_t micros_since_midnight) = 0; + virtual Status TimestampNanos(int64_t nanos_since_epoch) = 0; + virtual Status TimestampNanosNTZ(int64_t nanos_since_epoch) = 0; + virtual Status UUID(const uint8_t* bytes) = 0; + /// @} + + /// @name Container callbacks + /// @{ + + /// \brief Called at the start of an object with the number of fields. + virtual Status StartObject(int32_t num_fields) = 0; + + /// \brief Called for each object field name, before the field value. + virtual Status FieldName(std::string_view name) = 0; + + /// \brief Called after all fields of an object have been visited. + virtual Status EndObject() = 0; + + /// \brief Called at the start of an array with the number of elements. + virtual Status StartArray(int32_t num_elements) = 0; + + /// \brief Called after all elements of an array have been visited. + virtual Status EndArray() = 0; + /// @} +}; + +/// \brief Decode a variant value buffer using a visitor. 
+/// +/// Recursively traverses the variant value, calling the appropriate +/// visitor methods for each element. Objects and arrays trigger +/// Start/End pairs with nested visits for their contents. +/// +/// \param[in] metadata Parsed metadata (for resolving string dictionary) +/// \param[in] data Pointer to the value buffer +/// \param[in] length Length of the value buffer in bytes +/// \param[in] visitor Callback interface for decoded values +/// \return Status::OK on success, Status::Invalid on malformed input +/// +/// \note The data buffer must remain valid for the duration of the call. +ARROW_EXPORT Status DecodeVariantValue(const VariantMetadata& metadata, + const uint8_t* data, int64_t length, + VariantVisitor* visitor); + +/// \brief Get the basic type of a variant value without full decoding. +/// +/// \param[in] data Pointer to the value buffer +/// \param[in] length Length of the value buffer in bytes +/// \return The BasicType of the value, or Status::Invalid if the +/// buffer is empty +ARROW_EXPORT Result<BasicType> GetValueBasicType(const uint8_t* data, int64_t length); + +/// \brief Get the number of fields in a variant object. +/// +/// \param[in] data Pointer to the value buffer (must start with an object) +/// \param[in] length Length of the value buffer in bytes +/// \return The number of fields, or Status::Invalid if not an object +ARROW_EXPORT Result<int32_t> GetObjectFieldCount(const uint8_t* data, int64_t length); + +/// \brief Get the number of elements in a variant array. 
+/// +/// \param[in] data Pointer to the value buffer (must start with an array) +/// \param[in] length Length of the value buffer in bytes +/// \return The number of elements, or Status::Invalid if not an array +ARROW_EXPORT Result<int32_t> GetArrayElementCount(const uint8_t* data, int64_t length); + +// --------------------------------------------------------------------------- +// Value size computation +// --------------------------------------------------------------------------- + +/// \brief Compute the total byte size of a variant value (header + data). +/// +/// Determines how many bytes a variant value occupies by examining +/// its header and (for containers/variable-length types) reading +/// size information. Does NOT recursively validate the contents. +/// +/// \param[in] data Pointer to the start of a variant value +/// \param[in] length Maximum bytes available +/// \return Total byte count of the value, or Status::Invalid if truncated +ARROW_EXPORT Result<int64_t> ValueSize(const uint8_t* data, int64_t length); + +// --------------------------------------------------------------------------- +// Random access utilities +// --------------------------------------------------------------------------- + +/// \brief Find an object field by name and return the offset/size of its value. +/// +/// Searches the field IDs in the object, resolving each against the +/// metadata dictionary. Per spec, field IDs are in lexicographic order +/// of their corresponding key names, enabling binary search for large +/// objects (>=32 fields). For smaller objects, linear scan is used. 
+/// +/// \param[in] metadata Parsed metadata (for resolving field IDs to names) +/// \param[in] data Pointer to the object value buffer +/// \param[in] length Length of the value buffer in bytes +/// \param[in] field_name The field name to search for +/// \param[out] field_offset Set to the byte offset of the field's value +/// within data, or -1 if not found +/// \param[out] field_size Set to the byte size of the field's value, +/// or 0 if not found +/// \return Status::OK if search completed (field may or may not exist), +/// Status::Invalid if the buffer is malformed +ARROW_EXPORT Status FindObjectField(const VariantMetadata& metadata, const uint8_t* data, + int64_t length, std::string_view field_name, + int64_t* field_offset, int64_t* field_size); + +/// \brief Get the i-th element of a variant array by index (O(1) access). +/// +/// Uses the offset table for random access without traversing preceding +/// elements. +/// +/// \param[in] data Pointer to the array value buffer +/// \param[in] length Length of the value buffer in bytes +/// \param[in] index Zero-based element index +/// \param[out] element_offset Set to the byte offset of the element within data +/// \param[out] element_size Set to the byte size of the element +/// \return Status::OK on success, Status::Invalid if not an array or +/// index is out of range +ARROW_EXPORT Status GetArrayElement(const uint8_t* data, int64_t length, int32_t index, + int64_t* element_offset, int64_t* element_size); + +/// \brief Get the i-th field of a variant object by position. +/// +/// Returns both the field name (resolved from metadata) and a pointer +/// to the field's value. 
+/// +/// \param[in] metadata Parsed metadata +/// \param[in] data Pointer to the object value buffer +/// \param[in] length Length of the value buffer in bytes +/// \param[in] index Zero-based field index +/// \param[out] field_name Set to the field's key name +/// \param[out] field_offset Set to the byte offset of the field's value +/// \param[out] field_size Set to the byte size of the field's value +/// \return Status::OK on success, Status::Invalid if not an object or +/// index is out of range +ARROW_EXPORT Status GetObjectFieldAt(const VariantMetadata& metadata, const uint8_t* data, + int64_t length, int32_t index, + std::string_view* field_name, int64_t* field_offset, + int64_t* field_size); + +/// \brief Find the dictionary ID for a given key name. +/// +/// Uses binary search if the metadata is sorted, otherwise linear scan. +/// +/// \param[in] metadata Parsed metadata +/// \param[in] key The key to search for +/// \return The dictionary ID if found, or -1 if not present +ARROW_EXPORT int32_t FindMetadataKey(const VariantMetadata& metadata, + std::string_view key); + +// --------------------------------------------------------------------------- +// Variant Builder (Encoder) +// --------------------------------------------------------------------------- + +/// \brief Builder for constructing Variant binary values. +/// +/// Mirrors the Go implementation's Builder pattern. Values are written +/// into an internal buffer; containers (objects/arrays) use a start-offset +/// + finish pattern that shifts data to insert headers. 
+/// +/// Usage: +/// VariantBuilder builder; +/// auto start = builder.Offset(); +/// std::vector<VariantBuilder::FieldEntry> fields; +/// fields.push_back(builder.NextField(start, "name")); +/// builder.String("Alice"); +/// fields.push_back(builder.NextField(start, "age")); +/// builder.Int(30); +/// builder.FinishObject(start, fields); +/// ARROW_ASSIGN_OR_RAISE(auto result, builder.Finish()); +class ARROW_EXPORT VariantBuilder { Review Comment: This API is great for building new variants. Did you also consider adding an API that allows modifying existing Variant values? We would need to add a function to `VariantBuilder` similar to `FindObjectField` from the decoding PR, which would "move" the context of `VariantBuilder` to a specific place/field. Once called, you would then be able to overwrite the existing value. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
