wgtmac commented on code in PR #159:
URL: https://github.com/apache/iceberg-cpp/pull/159#discussion_r2268664195


##########
src/iceberg/parquet/parquet_schema_util.cc:
##########
@@ -17,20 +17,397 @@
  * under the License.
  */
 
+#include <charconv>
+
+#include <arrow/type.h>
+#include <arrow/type_fwd.h>
+#include <arrow/util/key_value_metadata.h>
+#include <parquet/arrow/schema.h>
 #include <parquet/schema.h>
 
+#include "iceberg/metadata_columns.h"
 #include "iceberg/parquet/parquet_schema_util_internal.h"
+#include "iceberg/result.h"
+#include "iceberg/schema_util_internal.h"
 #include "iceberg/util/checked_cast.h"
+#include "iceberg/util/formatter.h"
+#include "iceberg/util/macros.h"
 
 namespace iceberg::parquet {
 
+namespace {
+
+// Key looked up in Arrow field metadata by FieldIdFromMetadata; its value is
+// parsed as the field id.
+constexpr std::string_view kParquetFieldIdKey = "PARQUET:field_id";
+
+/// \brief Extract the field id stored in Arrow key-value metadata.
+///
+/// Looks up the `kParquetFieldIdKey` entry and parses its value as a
+/// non-negative base-10 int32.
+///
+/// \param metadata Key-value metadata of an Arrow field; may be null.
+/// \return The parsed field id, or std::nullopt when `metadata` is null, the key
+/// is absent, or the value is not in its entirety a valid non-negative int32.
+std::optional<int32_t> FieldIdFromMetadata(
+    const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+  if (!metadata) {
+    return std::nullopt;
+  }
+  int key = metadata->FindKey(kParquetFieldIdKey);
+  if (key < 0) {
+    return std::nullopt;
+  }
+  // Bind by const reference instead of copying; this is valid whether value()
+  // yields a reference or a temporary (whose lifetime is then extended).
+  const std::string& field_id_str = metadata->value(key);
+  const char* const first = field_id_str.data();
+  const char* const last = first + field_id_str.size();
+  int32_t field_id = -1;
+  auto [ptr, ec] = std::from_chars(first, last, field_id);
+  // from_chars stops at the first non-digit and still reports success, so
+  // require full consumption to reject values such as "12abc".
+  if (ec != std::errc() || ptr != last || field_id < 0) {
+    return std::nullopt;
+  }
+  return field_id;
+}
+
+/// \brief Read the field id recorded on a Parquet schema field.
+///
+/// Forwards the metadata of the wrapped Arrow field to FieldIdFromMetadata and
+/// returns its result unchanged.
+std::optional<int32_t> GetFieldId(const ::parquet::arrow::SchemaField& parquet_field) {
+  const auto& arrow_field = parquet_field.field;
+  return FieldIdFromMetadata(arrow_field->metadata());
+}
+
+// TODO(gangwu): support v3 unknown type
+Status ValidateParquetSchemaEvolution(
+    const Type& expected_type, const ::parquet::arrow::SchemaField& 
parquet_field) {
+  const auto& arrow_type = parquet_field.field->type();
+  switch (expected_type.type_id()) {
+    case TypeId::kBoolean:
+      if (arrow_type->id() == ::arrow::Type::BOOL) {
+        return {};
+      }
+      break;
+    case TypeId::kInt:
+      if (arrow_type->id() == ::arrow::Type::INT32) {
+        return {};
+      }
+      break;
+    case TypeId::kLong:
+      if (arrow_type->id() == ::arrow::Type::INT64 ||
+          arrow_type->id() == ::arrow::Type::INT32) {
+        return {};
+      }
+      break;
+    case TypeId::kFloat:
+      if (arrow_type->id() == ::arrow::Type::FLOAT) {
+        return {};
+      }
+      break;
+    case TypeId::kDouble:
+      if (arrow_type->id() == ::arrow::Type::DOUBLE ||
+          arrow_type->id() == ::arrow::Type::FLOAT) {
+        return {};
+      }
+      break;
+    case TypeId::kDate:
+      if (arrow_type->id() == ::arrow::Type::DATE32) {
+        return {};
+      }
+      break;
+    case TypeId::kTime:
+      if (arrow_type->id() == ::arrow::Type::TIME64) {
+        const auto& time_type =
+            internal::checked_cast<const ::arrow::Time64Type&>(*arrow_type);
+        if (time_type.unit() == ::arrow::TimeUnit::MICRO) {
+          return {};
+        }
+      }
+      break;
+    case TypeId::kTimestamp:
+      if (arrow_type->id() == ::arrow::Type::TIMESTAMP) {
+        const auto& timestamp_type =
+            internal::checked_cast<const ::arrow::TimestampType&>(*arrow_type);
+        if (timestamp_type.unit() == ::arrow::TimeUnit::MICRO &&
+            timestamp_type.timezone().empty()) {
+          return {};
+        }
+      }
+      break;
+    case TypeId::kTimestampTz:
+      if (arrow_type->id() == ::arrow::Type::TIMESTAMP) {
+        const auto& timestamp_type =
+            internal::checked_cast<const ::arrow::TimestampType&>(*arrow_type);
+        if (timestamp_type.unit() == ::arrow::TimeUnit::MICRO &&
+            !timestamp_type.timezone().empty()) {
+          return {};
+        }
+      }
+      break;
+    case TypeId::kString:
+      if (arrow_type->id() == ::arrow::Type::STRING) {
+        return {};
+      }
+      break;
+    case TypeId::kBinary:
+      if (arrow_type->id() == ::arrow::Type::BINARY) {
+        return {};
+      }
+      break;
+    case TypeId::kDecimal:
+      if (arrow_type->id() == ::arrow::Type::DECIMAL128) {
+        const auto& decimal_type =
+            internal::checked_cast<const DecimalType&>(expected_type);
+        const auto& arrow_decimal =
+            internal::checked_cast<const 
::arrow::Decimal128Type&>(*arrow_type);
+        if (decimal_type.scale() == arrow_decimal.scale() &&
+            decimal_type.precision() >= arrow_decimal.precision()) {
+          return {};
+        }
+      }
+      break;
+    case TypeId::kUuid:
+      if (arrow_type->id() == ::arrow::Type::FIXED_SIZE_BINARY) {
+        const auto& fixed_binary =
+            internal::checked_cast<const 
::arrow::FixedSizeBinaryType&>(*arrow_type);
+        if (fixed_binary.byte_width() == 16) {
+          return {};
+        }
+      }
+      break;
+    case TypeId::kFixed:
+      if (arrow_type->id() == ::arrow::Type::FIXED_SIZE_BINARY) {
+        const auto& fixed_binary =
+            internal::checked_cast<const 
::arrow::FixedSizeBinaryType&>(*arrow_type);
+        if (fixed_binary.byte_width() ==
+            internal::checked_cast<const FixedType&>(expected_type).length()) {
+          return {};
+        }
+      }
+      break;
+    case TypeId::kStruct:
+      if (arrow_type->id() == ::arrow::Type::STRUCT) {
+        return {};
+      }
+      break;
+    case TypeId::kList:
+      if (arrow_type->id() == ::arrow::Type::LIST) {

Review Comment:
   ListView is not supported by parquet-cpp yet. I think we should support only 
the simple list and binary type variants in the early versions of iceberg-cpp. 
Once parquet-cpp fully supports the other variants, we can adopt them.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to