xxubai commented on code in PR #46372: URL: https://github.com/apache/arrow/pull/46372#discussion_r2101769873
########## cpp/src/parquet/variant.h: ########## @@ -0,0 +1,307 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstdint> +#include <optional> +#include <string_view> +#include <vector> + +#include <arrow/util/decimal.h> +#include <arrow/util/small_vector.h> + +#include "parquet/platform.h" + +namespace parquet::variant { + +// TODO(mwish): Should I use parquet::ByteArray rather than +// std::string_view? 
+ +enum class VariantBasicType { + /// One of the primitive types + Primitive = 0, + /// A string with a length less than 64 bytes + ShortString = 1, + /// A collection of (string-key, variant-value) pairs + Object = 2, + /// An ordered sequence of variant values + Array = 3 +}; + +PARQUET_EXPORT std::string VariantBasicTypeToString(VariantBasicType type); + +enum class VariantPrimitiveType : int8_t { + /// Equivalent Parquet Type: UNKNOWN + NullType = 0, + /// Equivalent Parquet Type: BOOLEAN + BooleanTrue = 1, + /// Equivalent Parquet Type: BOOLEAN + BooleanFalse = 2, + /// Equivalent Parquet Type: INT(8, signed) + Int8 = 3, + /// Equivalent Parquet Type: INT(16, signed) + Int16 = 4, + /// Equivalent Parquet Type: INT(32, signed) + Int32 = 5, + /// Equivalent Parquet Type: INT(64, signed) + Int64 = 6, + /// Equivalent Parquet Type: DOUBLE + Double = 7, + /// Equivalent Parquet Type: DECIMAL(precision, scale) + Decimal4 = 8, + /// Equivalent Parquet Type: DECIMAL(precision, scale) + Decimal8 = 9, + /// Equivalent Parquet Type: DECIMAL(precision, scale) + Decimal16 = 10, + /// Equivalent Parquet Type: DATE + Date = 11, + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=true, MICROS) + TimestampMicros = 12, + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=false, MICROS) + TimestampMicrosNtz = 13, + /// Equivalent Parquet Type: FLOAT + Float = 14, + /// Equivalent Parquet Type: BINARY + Binary = 15, + /// Equivalent Parquet Type: STRING + String = 16, + /// Equivalent Parquet Type: TIME(isAdjustedToUTC=false, MICROS) + TimeMicrosNtz = 17, + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=true, NANOS) + TimestampNanosTz = 18, // Assuming TZ stands for TimeZone, and follows the document's + // 'timestamp with time zone' + /// Equivalent Parquet Type: TIMESTAMP(isAdjustedToUTC=false, NANOS) + TimestampNanosNtz = 19, // Differentiating from TimestampNtz (MICROS) + /// Equivalent Parquet Type: UUID + Uuid = 20 +}; + +PARQUET_EXPORT std::string 
VariantPrimitiveTypeToString(VariantPrimitiveType type); + +/// VariantType is from basic type and primitive type. +enum class VariantType { + Object, + Array, + Null, + Boolean, + Int8, + Int16, + Int32, + Int64, + String, + Double, + Decimal4, + Decimal8, + Decimal16, + Date, + TimestampMicrosTz, + TimestampMicrosNtz, + Float, + Binary, + Time, + TimestampNanosTz, + TimestampNanosNtz, + Uuid +}; + +PARQUET_EXPORT std::string VariantTypeToString(VariantType type); + +class PARQUET_EXPORT VariantMetadata { + public: + explicit VariantMetadata(std::string_view metadata); + /// \brief Get the variant metadata version. Currently, always 1. + uint8_t version() const; + /// \brief Get the metadata key for a given variant field id. + /// \throw ParquetException if the variant_id is out of range(larger than + /// dictionary_size). + std::string_view GetMetadataKey(uint32_t variant_id) const; + /// \brief Get the metadata id for a given key. + /// From the discussion in ML: + /// https://lists.apache.org/thread/b68tjmrjmy64mbv9dknpmqs28vnzjj96 if + /// !sorted_and_unique(), the metadata key is not guaranteed to be unique, so we use a Review Comment: I checked the Iceberg implementation, and it just returns the index of the first matching string (https://github.com/apache/iceberg/blob/1911c94ea605a3d3f10a1994b046f00a5e9fdceb/api/src/main/java/org/apache/iceberg/variants/SerializedMetadata.java#L88-L102). May I ask why we need to return a vector here? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org