wgtmac commented on code in PR #159: URL: https://github.com/apache/iceberg-cpp/pull/159#discussion_r2268673543
########## src/iceberg/parquet/parquet_schema_util.cc: ########## @@ -17,20 +17,397 @@ * under the License. */ +#include <charconv> + +#include <arrow/type.h> +#include <arrow/type_fwd.h> +#include <arrow/util/key_value_metadata.h> +#include <parquet/arrow/schema.h> #include <parquet/schema.h> +#include "iceberg/metadata_columns.h" #include "iceberg/parquet/parquet_schema_util_internal.h" +#include "iceberg/result.h" +#include "iceberg/schema_util_internal.h" #include "iceberg/util/checked_cast.h" +#include "iceberg/util/formatter.h" +#include "iceberg/util/macros.h" namespace iceberg::parquet { +namespace { + +constexpr std::string_view kParquetFieldIdKey = "PARQUET:field_id"; + +std::optional<int32_t> FieldIdFromMetadata( + const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) { + if (!metadata) { + return std::nullopt; + } + int key = metadata->FindKey(kParquetFieldIdKey); + if (key < 0) { + return std::nullopt; + } + std::string field_id_str = metadata->value(key); + int32_t field_id = -1; + auto [_, ec] = std::from_chars(field_id_str.data(), + field_id_str.data() + field_id_str.size(), field_id); + if (ec != std::errc() || field_id < 0) { + return std::nullopt; + } + return field_id; +} + +std::optional<int32_t> GetFieldId(const ::parquet::arrow::SchemaField& parquet_field) { + return FieldIdFromMetadata(parquet_field.field->metadata()); +} + +// TODO(gangwu): support v3 unknown type +Status ValidateParquetSchemaEvolution( + const Type& expected_type, const ::parquet::arrow::SchemaField& parquet_field) { + const auto& arrow_type = parquet_field.field->type(); + switch (expected_type.type_id()) { + case TypeId::kBoolean: + if (arrow_type->id() == ::arrow::Type::BOOL) { + return {}; + } + break; + case TypeId::kInt: + if (arrow_type->id() == ::arrow::Type::INT32) { + return {}; + } + break; + case TypeId::kLong: + if (arrow_type->id() == ::arrow::Type::INT64 || + arrow_type->id() == ::arrow::Type::INT32) { + return {}; + } + break; + case TypeId::kFloat: + if (arrow_type->id() == ::arrow::Type::FLOAT) { + return {}; + } + break; + case TypeId::kDouble: + if (arrow_type->id() == ::arrow::Type::DOUBLE || + arrow_type->id() == ::arrow::Type::FLOAT) { + return {}; + } + break; + case TypeId::kDate: + if (arrow_type->id() == ::arrow::Type::DATE32) { + return {}; + } + break; + case TypeId::kTime: + if (arrow_type->id() == ::arrow::Type::TIME64) { + const auto& time_type = + internal::checked_cast<const ::arrow::Time64Type&>(*arrow_type); + if (time_type.unit() == ::arrow::TimeUnit::MICRO) { + return {}; + } + } + break; + case TypeId::kTimestamp: + if (arrow_type->id() == ::arrow::Type::TIMESTAMP) { + const auto& timestamp_type = + internal::checked_cast<const ::arrow::TimestampType&>(*arrow_type); + if (timestamp_type.unit() == ::arrow::TimeUnit::MICRO && + timestamp_type.timezone().empty()) { + return {}; + } + } + break; + case TypeId::kTimestampTz: + if (arrow_type->id() == ::arrow::Type::TIMESTAMP) { + const auto& timestamp_type = + internal::checked_cast<const ::arrow::TimestampType&>(*arrow_type); + if (timestamp_type.unit() == ::arrow::TimeUnit::MICRO && + !timestamp_type.timezone().empty()) { + return {}; + } + } + break; + case TypeId::kString: + if (arrow_type->id() == ::arrow::Type::STRING) { + return {}; + } + break; + case TypeId::kBinary: + if (arrow_type->id() == ::arrow::Type::BINARY) { + return {}; + } + break; + case TypeId::kDecimal: + if (arrow_type->id() == ::arrow::Type::DECIMAL128) { + const auto& decimal_type = + internal::checked_cast<const DecimalType&>(expected_type); + const auto& arrow_decimal = + internal::checked_cast<const ::arrow::Decimal128Type&>(*arrow_type); + if (decimal_type.scale() == arrow_decimal.scale() && + decimal_type.precision() >= arrow_decimal.precision()) { + return {}; + } + } + break; + case TypeId::kUuid: + if (arrow_type->id() == ::arrow::Type::FIXED_SIZE_BINARY) { + const auto& fixed_binary = + internal::checked_cast<const ::arrow::FixedSizeBinaryType&>(*arrow_type); + if (fixed_binary.byte_width() == 16) { + return {}; + } + } Review Comment: Good catch! Fixed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org