wgtmac commented on code in PR #127: URL: https://github.com/apache/iceberg-cpp/pull/127#discussion_r2267061473
########## src/iceberg/avro/avro_schema_util.cc: ########## @@ -773,4 +775,247 @@ Result<SchemaProjection> Project(const Schema& expected_schema, return SchemaProjection{std::move(field_projection.children)}; } +namespace { + +Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + auto new_record_node = std::make_shared<::avro::NodeRecord>(); + new_record_node->setName(original_node->name()); + + for (size_t i = 0; i < original_node->leaves(); ++i) { + if (i >= original_node->names()) { + return InvalidSchema("Index {} is out of bounds for names (size: {})", i, + original_node->names()); + } + const std::string& field_name = original_node->nameAt(i); + ::avro::NodePtr field_node = original_node->leafAt(i); + + // TODO(liuxiaoyu): Add support for case sensitivity in name matching. + // Try to find nested field by name + const MappedField* nested_field = nullptr; + if (field.nested_mapping) { + auto fields_span = field.nested_mapping->fields(); + for (const auto& f : fields_span) { + if (f.names.find(field_name) != f.names.end()) { + nested_field = &f; + break; + } + } + } + + if (!nested_field) { + return InvalidSchema("Field '{}' not found in nested mapping", field_name); + } + + if (!nested_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for field '{}' in nested mapping", + field_name); + } + + // Preserve existing custom attributes for this field + ::avro::CustomAttributes attributes; + if (i < original_node->customAttributes()) { + // Copy all existing attributes from the original node + for (const auto& attr_pair : original_node->customAttributesAt(i).attributes()) { + // Copy each existing attribute to preserve original metadata + attributes.addAttribute(attr_pair.first, attr_pair.second, /*addQuote=*/false); + } + } + + // Add field ID attribute to the new node (preserving existing attributes) + attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(nested_field->field_id.value()), + /*addQuote=*/false); + + new_record_node->addCustomAttributesForField(attributes); + + // Recursively apply field IDs to nested fields + ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node, + MakeAvroNodeWithFieldIds(field_node, *nested_field)); + new_record_node->addName(field_name); + new_record_node->addLeaf(new_nested_node); + } + + return new_record_node; +} + +Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + if (original_node->leaves() != 1) { + return InvalidSchema("Array type must have exactly one leaf"); + } + + auto new_array_node = std::make_shared<::avro::NodeArray>(); + + // Check if this is a map represented as array + if (HasMapLogicalType(original_node)) { + ICEBERG_ASSIGN_OR_RAISE(auto new_element_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), field)); + new_array_node->addLeaf(new_element_node); Review Comment: We need to check `if (original_node->customAttributes() > 0)` and add it to `new_array_node`. ########## src/iceberg/avro/avro_schema_util.cc: ########## @@ -773,4 +775,247 @@ Result<SchemaProjection> Project(const Schema& expected_schema, return SchemaProjection{std::move(field_projection.children)}; } +namespace { + +Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + auto new_record_node = std::make_shared<::avro::NodeRecord>(); + new_record_node->setName(original_node->name()); + + for (size_t i = 0; i < original_node->leaves(); ++i) { + if (i >= original_node->names()) { + return InvalidSchema("Index {} is out of bounds for names (size: {})", i, + original_node->names()); + } + const std::string& field_name = original_node->nameAt(i); + ::avro::NodePtr field_node = original_node->leafAt(i); + + // TODO(liuxiaoyu): Add support for case sensitivity in name matching. + // Try to find nested field by name + const MappedField* nested_field = nullptr; + if (field.nested_mapping) { + auto fields_span = field.nested_mapping->fields(); + for (const auto& f : fields_span) { + if (f.names.find(field_name) != f.names.end()) { + nested_field = &f; + break; + } + } + } + + if (!nested_field) { + return InvalidSchema("Field '{}' not found in nested mapping", field_name); + } + + if (!nested_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for field '{}' in nested mapping", + field_name); + } + + // Preserve existing custom attributes for this field + ::avro::CustomAttributes attributes; + if (i < original_node->customAttributes()) { + // Copy all existing attributes from the original node + for (const auto& attr_pair : original_node->customAttributesAt(i).attributes()) { + // Copy each existing attribute to preserve original metadata + attributes.addAttribute(attr_pair.first, attr_pair.second, /*addQuote=*/false); + } + } + + // Add field ID attribute to the new node (preserving existing attributes) + attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(nested_field->field_id.value()), + /*addQuote=*/false); + + new_record_node->addCustomAttributesForField(attributes); + + // Recursively apply field IDs to nested fields + ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node, + MakeAvroNodeWithFieldIds(field_node, *nested_field)); + new_record_node->addName(field_name); + new_record_node->addLeaf(new_nested_node); + } + + return new_record_node; +} + +Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + if (original_node->leaves() != 1) { + return InvalidSchema("Array type must have exactly one leaf"); + } + + auto new_array_node = std::make_shared<::avro::NodeArray>(); + + // Check if this is a map represented as array + if (HasMapLogicalType(original_node)) { + ICEBERG_ASSIGN_OR_RAISE(auto new_element_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), field)); + new_array_node->addLeaf(new_element_node); + return new_array_node; + } + + // For regular arrays, use the first field from nested mapping as element field + if (!field.nested_mapping || field.nested_mapping->fields().empty()) { + return InvalidSchema("Array type requires nested mapping with element field"); + } + + const auto& element_field = field.nested_mapping->fields()[0]; + + if (!element_field.field_id.has_value()) { + return InvalidSchema("Field ID is missing for element field in array"); + } + + ICEBERG_ASSIGN_OR_RAISE( + auto new_element_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), element_field)); + new_array_node->addLeaf(new_element_node); + + // Add element field ID as custom attribute + ::avro::CustomAttributes element_attributes; + element_attributes.addAttribute(std::string(kElementIdProp), Review Comment: Ditto, we may need to merge attributes from the original node if exist. ########## src/iceberg/avro/avro_schema_util.cc: ########## @@ -773,4 +775,247 @@ Result<SchemaProjection> Project(const Schema& expected_schema, return SchemaProjection{std::move(field_projection.children)}; } +namespace { + +Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + auto new_record_node = std::make_shared<::avro::NodeRecord>(); + new_record_node->setName(original_node->name()); + + for (size_t i = 0; i < original_node->leaves(); ++i) { + if (i >= original_node->names()) { + return InvalidSchema("Index {} is out of bounds for names (size: {})", i, + original_node->names()); + } + const std::string& field_name = original_node->nameAt(i); + ::avro::NodePtr field_node = original_node->leafAt(i); + + // TODO(liuxiaoyu): Add support for case sensitivity in name matching. + // Try to find nested field by name + const MappedField* nested_field = nullptr; + if (field.nested_mapping) { + auto fields_span = field.nested_mapping->fields(); + for (const auto& f : fields_span) { + if (f.names.find(field_name) != f.names.end()) { + nested_field = &f; + break; + } + } + } + + if (!nested_field) { + return InvalidSchema("Field '{}' not found in nested mapping", field_name); + } + + if (!nested_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for field '{}' in nested mapping", + field_name); + } + + // Preserve existing custom attributes for this field + ::avro::CustomAttributes attributes; + if (i < original_node->customAttributes()) { + // Copy all existing attributes from the original node + for (const auto& attr_pair : original_node->customAttributesAt(i).attributes()) { + // Copy each existing attribute to preserve original metadata + attributes.addAttribute(attr_pair.first, attr_pair.second, /*addQuote=*/false); + } + } + + // Add field ID attribute to the new node (preserving existing attributes) + attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(nested_field->field_id.value()), + /*addQuote=*/false); + + new_record_node->addCustomAttributesForField(attributes); + + // Recursively apply field IDs to nested fields + ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node, + MakeAvroNodeWithFieldIds(field_node, *nested_field)); + new_record_node->addName(field_name); + new_record_node->addLeaf(new_nested_node); + } + + return new_record_node; +} + +Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + if (original_node->leaves() != 1) { + return InvalidSchema("Array type must have exactly one leaf"); + } + + auto new_array_node = std::make_shared<::avro::NodeArray>(); + + // Check if this is a map represented as array + if (HasMapLogicalType(original_node)) { + ICEBERG_ASSIGN_OR_RAISE(auto new_element_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), field)); + new_array_node->addLeaf(new_element_node); + return new_array_node; + } + + // For regular arrays, use the first field from nested mapping as element field + if (!field.nested_mapping || field.nested_mapping->fields().empty()) { + return InvalidSchema("Array type requires nested mapping with element field"); + } + + const auto& element_field = field.nested_mapping->fields()[0]; + + if (!element_field.field_id.has_value()) { + return InvalidSchema("Field ID is missing for element field in array"); + } + + ICEBERG_ASSIGN_OR_RAISE( + auto new_element_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), element_field)); + new_array_node->addLeaf(new_element_node); + + // Add element field ID as custom attribute + ::avro::CustomAttributes element_attributes; + element_attributes.addAttribute(std::string(kElementIdProp), + std::to_string(*element_field.field_id), + /*addQuote=*/false); + new_array_node->addCustomAttributesForField(element_attributes); + + return new_array_node; +} + +Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + if (original_node->leaves() != 2) { + return InvalidSchema("Map type must have exactly two leaves"); + } + + auto new_map_node = std::make_shared<::avro::NodeMap>(); + + // For map types, we need to extract key and value field mappings from the nested + // mapping + if (!field.nested_mapping) { + return InvalidSchema("Map type requires nested mapping for key and value fields"); + } + + // For map types, use the first two fields from nested mapping as key and value + if (!field.nested_mapping || field.nested_mapping->fields().size() < 2) { + return InvalidSchema("Map type requires nested mapping with key and value fields"); + } + + const auto& key_mapped_field = field.nested_mapping->fields()[0]; + const auto& value_mapped_field = field.nested_mapping->fields()[1]; + + if (!key_mapped_field.field_id || !value_mapped_field.field_id) { + return InvalidSchema("Map key and value fields must have field IDs"); + } + + // Add key and value nodes + ICEBERG_ASSIGN_OR_RAISE( + auto new_key_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), key_mapped_field)); + ICEBERG_ASSIGN_OR_RAISE( + auto new_value_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(1), value_mapped_field)); + new_map_node->addLeaf(new_key_node); + new_map_node->addLeaf(new_value_node); + + // Preserve existing custom attributes from the original node and add field ID + // attributes Copy existing attributes from the original node (if any) + if (original_node->customAttributes() > 0) { + const auto& original_attrs = original_node->customAttributesAt(0); + const auto& existing_attrs = original_attrs.attributes(); + for (const auto& attr_pair : existing_attrs) { + // Copy each existing attribute to preserve original metadata + ::avro::CustomAttributes attributes; + attributes.addAttribute(attr_pair.first, attr_pair.second, /*addQuote=*/false); + new_map_node->addCustomAttributesForField(attributes); Review Comment: I think this is wrong. Perhaps existing test cases do not cover this so they are not revealed. We need to merge `original_node->customAttributesAt(0)` to `key_attributes`, and merge `original_node->customAttributesAt(1)` to `value_attributes`, if they exist. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org