wgtmac commented on code in PR #430:
URL: https://github.com/apache/iceberg-cpp/pull/430#discussion_r2641943225
##########
src/iceberg/schema.h:
##########
@@ -103,26 +110,46 @@ class ICEBERG_EXPORT Schema : public StructType {
Result<std::unique_ptr<Schema>> Project(
const std::unordered_set<int32_t>& field_ids) const;
+ std::vector<int32_t> IdentifierFieldIds() const;
+ void SetIdentifierFieldIds(std::vector<int32_t> identifier_field_ids);
Review Comment:
Can we avoid adding setter like this to keep `Schema` immutable?
##########
src/iceberg/util/type_util.cc:
##########
@@ -22,11 +22,15 @@
#include <stack>
#include "iceberg/result.h"
+#include "iceberg/schema.h"
#include "iceberg/util/checked_cast.h"
#include "iceberg/util/formatter_internal.h"
#include "iceberg/util/string_util.h"
#include "iceberg/util/visit_type.h"
+using std::ranges::to;
Review Comment:
Should we limit the scope to the function that uses these?
##########
src/iceberg/schema.h:
##########
@@ -103,26 +110,46 @@ class ICEBERG_EXPORT Schema : public StructType {
Result<std::unique_ptr<Schema>> Project(
const std::unordered_set<int32_t>& field_ids) const;
+ std::vector<int32_t> IdentifierFieldIds() const;
Review Comment:
```suggestion
const std::vector<int32_t>& IdentifierFieldIds() const;
```
Or `std::span<const int32_t>`
##########
src/iceberg/schema.h:
##########
@@ -78,6 +79,12 @@ class ICEBERG_EXPORT Schema : public StructType {
Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
int32_t field_id) const;
+ /// \brief Returns the full column name for the given id.
+ ///
+ /// \param field_id The id of the field to get the full name for.
+ /// \return The full name of the field with the given id, or std::nullopt if
not found.
+ Result<std::optional<std::string>> FindColumnNameById(int32_t field_id)
const;
Review Comment:
```suggestion
Result<std::optional<std::string_view>> FindColumnNameById(int32_t
field_id) const;
```
Use `std::string_view` to avoid a string copy.
##########
src/iceberg/schema.cc:
##########
@@ -179,4 +192,25 @@ Result<std::unique_ptr<Schema>> Schema::Project(
std::nullopt);
}
+std::vector<int32_t> Schema::IdentifierFieldIds() const { return
identifier_field_ids_; }
+
+void Schema::SetIdentifierFieldIds(std::vector<int32_t> identifier_field_ids) {
+ identifier_field_ids_ = std::move(identifier_field_ids);
+}
+
+Result<std::vector<std::string>> Schema::IdentifierFieldNames() const {
+ using std::ranges::to;
+ using std::views::transform;
+ std::vector<std::string> names;
+ names.reserve(identifier_field_ids_.size());
+ for (auto id : identifier_field_ids_) {
+ ICEBERG_ASSIGN_OR_RAISE(auto name, FindColumnNameById(id));
+ if (!name.has_value()) {
+ return InvalidSchema("Can not find the field of the specified field id:
{}", id);
Review Comment:
```suggestion
return InvalidSchema("Cannot find the field of the specified field id:
{}", id);
```
##########
src/iceberg/schema.cc:
##########
@@ -77,21 +81,21 @@ Schema::InitIdToFieldMap(const Schema& self) {
return id_to_field;
}
-Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
-Schema::InitNameToIdMap(const Schema& self) {
- std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
name_to_id;
- NameToIdVisitor visitor(name_to_id, /*case_sensitive=*/true);
+Result<Schema::NameIdMap> Schema::InitNameIdMap(const Schema& self) {
+ NameIdMap name_id_map;
+ NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name,
+ /*case_sensitive=*/true);
ICEBERG_RETURN_UNEXPECTED(
VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
visitor.Finish();
- return name_to_id;
+ return name_id_map;
}
Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
Schema::InitLowerCaseNameToIdMap(const Schema& self) {
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
lowercase_name_to_id;
- NameToIdVisitor visitor(lowercase_name_to_id, /*case_sensitive=*/false);
+ NameToIdVisitor visitor(lowercase_name_to_id, nullptr,
/*case_sensitive=*/false);
Review Comment:
```suggestion
NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr,
/*case_sensitive=*/false);
```
##########
src/iceberg/test/assign_id_visitor_test.cc:
##########
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <memory>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "iceberg/schema.h"
+#include "iceberg/schema_field.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/type.h"
+#include "iceberg/util/type_util.h"
+
+namespace iceberg {
+
+namespace {
+
+Schema CreateFlatSchema() {
+ return Schema({
+ SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
+ SchemaField::MakeOptional(/*field_id=*/20, "name", iceberg::string()),
+ SchemaField::MakeOptional(/*field_id=*/30, "age", iceberg::int32()),
+ SchemaField::MakeRequired(/*field_id=*/40, "data", iceberg::float64()),
+ });
+}
+
+std::shared_ptr<Type> CreateListOfStruct() {
+ return std::make_shared<ListType>(SchemaField::MakeOptional(
+ /*field_id=*/101, "element",
+ std::make_shared<StructType>(std::vector<SchemaField>{
+ SchemaField::MakeOptional(/*field_id=*/102, "x", iceberg::int32()),
+ SchemaField::MakeRequired(/*field_id=*/103, "y", iceberg::string()),
+ })));
+}
+
+std::shared_ptr<Type> CreateMapWithStructValue() {
+ return std::make_shared<MapType>(
+ SchemaField::MakeRequired(/*field_id=*/201, "key", iceberg::string()),
+ SchemaField::MakeRequired(
+ /*field_id=*/202, "value",
+ std::make_shared<StructType>(std::vector<SchemaField>{
+ SchemaField::MakeRequired(/*field_id=*/203, "id",
iceberg::int64()),
+ SchemaField::MakeOptional(/*field_id=*/204, "name",
iceberg::string()),
+ })));
+}
+
+std::shared_ptr<Type> CreateNestedStruct() {
+ return std::make_shared<StructType>(std::vector<SchemaField>{
+ SchemaField::MakeRequired(/*field_id=*/301, "outer_id",
iceberg::int64()),
+ SchemaField::MakeRequired(
+ /*field_id=*/302, "nested",
+ std::make_shared<StructType>(std::vector<SchemaField>{
+ SchemaField::MakeOptional(/*field_id=*/303, "inner_id",
iceberg::int32()),
+ SchemaField::MakeRequired(/*field_id=*/304, "inner_name",
+ iceberg::string()),
+ })),
+ });
+}
+
+Schema CreateNestedSchema(std::vector<int32_t> identifier_field_ids = {}) {
+ return Schema(
+ {
+ SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
+ SchemaField::MakeOptional(/*field_id=*/20, "list",
CreateListOfStruct()),
+ SchemaField::MakeOptional(/*field_id=*/30, "map",
CreateMapWithStructValue()),
+ SchemaField::MakeRequired(/*field_id=*/40, "struct",
CreateNestedStruct()),
+ },
+ Schema::kInitialSchemaId, std::move(identifier_field_ids));
+}
+
+void ExpectStructEQ(const StructType& expect, const StructType& actual) {
Review Comment:
Can we simply call `EXPECT_EQ(expect, actual)` since we already have `bool
operator==(const Type& lhs, const Type& rhs)`?
##########
src/iceberg/util/type_util.cc:
##########
@@ -294,4 +305,87 @@ std::unordered_map<int32_t, int32_t> IndexParents(const
StructType& root_struct)
return id_to_parent;
}
+AssignFreshIdVisitor::AssignFreshIdVisitor(std::function<int32_t()> next_id)
+ : next_id_(std::move(next_id)) {}
+
+std::shared_ptr<Type> AssignFreshIdVisitor::Visit(
+ const std::shared_ptr<Type>& type) const {
+ switch (type->type_id()) {
+ case TypeId::kStruct:
+ return Visit(*internal::checked_pointer_cast<StructType>(type));
+ case TypeId::kMap:
+ return Visit(*internal::checked_pointer_cast<MapType>(type));
+ case TypeId::kList:
+ return Visit(*internal::checked_pointer_cast<ListType>(type));
+ default:
+ return type;
+ }
+}
+
+std::shared_ptr<StructType> AssignFreshIdVisitor::Visit(const StructType&
type) const {
+ auto fresh_ids = type.fields() |
+ transform([&](const auto& field) { return next_id_(); }) |
+ to<std::vector<int32_t>>();
+ std::vector<SchemaField> fresh_fields;
+ for (size_t i = 0; i < type.fields().size(); ++i) {
+ const auto& field = type.fields()[i];
+ fresh_fields.emplace_back(fresh_ids[i], std::string(field.name()),
+ Visit(field.type()), field.optional(),
+ std::string(field.doc()));
+ }
+ return std::make_shared<StructType>(std::move(fresh_fields));
+}
+
+std::shared_ptr<ListType> AssignFreshIdVisitor::Visit(const ListType& type)
const {
+ const auto& elem_field = type.fields()[0];
+ int32_t fresh_id = next_id_();
+ SchemaField fresh_elem_field(fresh_id, std::string(elem_field.name()),
+ Visit(elem_field.type()), elem_field.optional(),
+ std::string(elem_field.doc()));
+ return std::make_shared<ListType>(std::move(fresh_elem_field));
+}
+
+std::shared_ptr<MapType> AssignFreshIdVisitor::Visit(const MapType& type)
const {
+ const auto& key_field = type.fields()[0];
+ const auto& value_field = type.fields()[1];
+
+ int32_t fresh_key_id = next_id_();
+ int32_t fresh_value_id = next_id_();
+
+ SchemaField fresh_key_field(fresh_key_id, std::string(key_field.name()),
+ Visit(key_field.type()), key_field.optional(),
+ std::string(key_field.doc()));
+ SchemaField fresh_value_field(fresh_value_id,
std::string(value_field.name()),
+ Visit(value_field.type()),
value_field.optional(),
+ std::string(value_field.doc()));
+ return std::make_shared<MapType>(std::move(fresh_key_field),
+ std::move(fresh_value_field));
+}
+
+Status RefreshIdentifierFields(Schema& fresh_schema, const Schema&
base_schema) {
+ ICEBERG_ASSIGN_OR_RAISE(auto identifier_field_names,
+ base_schema.IdentifierFieldNames());
+ std::vector<int32_t> fresh_identifier_ids;
+ for (const auto& name : identifier_field_names) {
+ ICEBERG_ASSIGN_OR_RAISE(auto field, fresh_schema.FindFieldByName(name));
+ if (!field) {
+ return InvalidSchema("Cannot find identifier field: {}", name);
+ }
+ fresh_identifier_ids.push_back(field.value().get().field_id());
+ }
+ fresh_schema.SetIdentifierFieldIds(std::move(fresh_identifier_ids));
+ return {};
+}
+
+Result<std::shared_ptr<Schema>> AssignFreshIds(int32_t schema_id, const
Schema& schema,
+ std::function<int32_t()>
next_id) {
+ auto fresh_type = AssignFreshIdVisitor(std::move(next_id))
+ .Visit(dynamic_cast<const StructType&>(schema));
Review Comment:
```suggestion
.Visit(internal::checked_cast<const
StructType&>(schema));
```
##########
src/iceberg/util/type_util.h:
##########
@@ -56,7 +56,7 @@ class NameToIdVisitor {
public:
explicit NameToIdVisitor(
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>&
name_to_id,
- bool case_sensitive = true,
+ std::unordered_map<int32_t, std::string>* id_to_name, bool
case_sensitive = true,
Review Comment:
Please update the comment at line 54
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]