MisterRaindrop commented on code in PR #127: URL: https://github.com/apache/iceberg-cpp/pull/127#discussion_r2281109179
########## src/iceberg/avro/avro_schema_util.cc: ########## @@ -773,4 +775,301 @@ Result<SchemaProjection> Project(const Schema& expected_schema, return SchemaProjection{std::move(field_projection.children)}; } +namespace { + +Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + auto new_record_node = std::make_shared<::avro::NodeRecord>(); + new_record_node->setName(original_node->name()); + + for (size_t i = 0; i < original_node->leaves(); ++i) { + if (i >= original_node->names()) { + return InvalidSchema("Index {} is out of bounds for names (size: {})", i, + original_node->names()); + } + const std::string& field_name = original_node->nameAt(i); + ::avro::NodePtr field_node = original_node->leafAt(i); + + // TODO(liuxiaoyu): Add support for case sensitivity in name matching. + // Try to find nested field by name + const MappedField* nested_field = nullptr; + if (field.nested_mapping) { + auto fields_span = field.nested_mapping->fields(); + for (const auto& f : fields_span) { + if (f.names.find(field_name) != f.names.end()) { + nested_field = &f; + break; + } + } + } + + if (!nested_field) { + return InvalidSchema("Field '{}' not found in nested mapping", field_name); + } + + if (!nested_field->field_id.has_value()) { + return InvalidSchema("Field ID is missing for field '{}' in nested mapping", + field_name); + } + + // Preserve existing custom attributes for this field + ::avro::CustomAttributes attributes; + if (i < original_node->customAttributes()) { + // Copy all existing attributes from the original node + for (const auto& attr_pair : original_node->customAttributesAt(i).attributes()) { + // Copy each existing attribute to preserve original metadata + attributes.addAttribute(attr_pair.first, attr_pair.second, /*addQuote=*/false); + } + } + + // Add field ID attribute to the new node (preserving existing attributes) + attributes.addAttribute(std::string(kFieldIdProp), + std::to_string(nested_field->field_id.value()), + /*addQuote=*/false); + + new_record_node->addCustomAttributesForField(attributes); + + // Recursively apply field IDs to nested fields + ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node, + MakeAvroNodeWithFieldIds(field_node, *nested_field)); + new_record_node->addName(field_name); + new_record_node->addLeaf(new_nested_node); + } + + return new_record_node; +} + +Result<::avro::NodePtr> CreateArrayNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + if (original_node->leaves() != 1) { + return InvalidSchema("Array type must have exactly one leaf"); + } + + auto new_array_node = std::make_shared<::avro::NodeArray>(); + + // Check if this is a map represented as array + if (HasMapLogicalType(original_node)) { + ICEBERG_ASSIGN_OR_RAISE(auto new_element_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), field)); + new_array_node->addLeaf(new_element_node); + + // Check and add custom attributes + if (original_node->customAttributes() > 0) { + ::avro::CustomAttributes merged_attributes; + const auto& original_attrs = original_node->customAttributesAt(0); + const auto& existing_attrs = original_attrs.attributes(); + for (const auto& attr_pair : existing_attrs) { + // Skip element-id as we might set it differently + if (attr_pair.first != kElementIdProp) { + merged_attributes.addAttribute(attr_pair.first, attr_pair.second, + /*addQuote=*/false); + } + } + // Add merged attributes if we found any + if (merged_attributes.attributes().size() > 0) { + new_array_node->addCustomAttributesForField(merged_attributes); + } + } + + return new_array_node; + } + + // For regular arrays, use the first field from nested mapping as element field + if (!field.nested_mapping || field.nested_mapping->fields().empty()) { + return InvalidSchema("Array type requires nested mapping with element field"); + } + + const auto& element_field = field.nested_mapping->fields()[0]; + + if (!element_field.field_id.has_value()) { + return InvalidSchema("Field ID is missing for element field in array"); + } + + ICEBERG_ASSIGN_OR_RAISE( + auto new_element_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), element_field)); + new_array_node->addLeaf(new_element_node); + + // Create merged custom attributes with element field ID + ::avro::CustomAttributes merged_attributes; + + // First add our element field ID (highest priority) + merged_attributes.addAttribute(std::string(kElementIdProp), + std::to_string(*element_field.field_id), + /*addQuote=*/false); + + // Then merge any custom attributes from original node (except element-id) + if (original_node->customAttributes() > 0) { + const auto& original_attrs = original_node->customAttributesAt(0); + const auto& existing_attrs = original_attrs.attributes(); + for (const auto& attr_pair : existing_attrs) { + // Skip element-id as we've already set it above + if (attr_pair.first != kElementIdProp) { + merged_attributes.addAttribute(attr_pair.first, attr_pair.second, + /*addQuote=*/false); + } + } + } + + // Add all attributes at once + new_array_node->addCustomAttributesForField(merged_attributes); + + return new_array_node; +} + +Result<::avro::NodePtr> CreateMapNodeWithFieldIds(const ::avro::NodePtr& original_node, + const MappedField& field) { + if (original_node->leaves() != 2) { + return InvalidSchema("Map type must have exactly two leaves"); + } + + auto new_map_node = std::make_shared<::avro::NodeMap>(); + + // For map types, we need to extract key and value field mappings from the nested + // mapping + if (!field.nested_mapping) { + return InvalidSchema("Map type requires nested mapping for key and value fields"); + } + + // For map types, use the first two fields from nested mapping as key and value + if (!field.nested_mapping || field.nested_mapping->fields().size() < 2) { + return InvalidSchema("Map type requires nested mapping with key and value fields"); + } + + const auto& key_mapped_field = field.nested_mapping->fields()[0]; + const auto& value_mapped_field = field.nested_mapping->fields()[1]; + + if (!key_mapped_field.field_id || !value_mapped_field.field_id) { + return InvalidSchema("Map key and value fields must have field IDs"); + } + + // Add key and value nodes + ICEBERG_ASSIGN_OR_RAISE( + auto new_key_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(0), key_mapped_field)); + ICEBERG_ASSIGN_OR_RAISE( + auto new_value_node, + MakeAvroNodeWithFieldIds(original_node->leafAt(1), value_mapped_field)); + new_map_node->addLeaf(new_key_node); + new_map_node->addLeaf(new_value_node); + + // Create key and value attributes + ::avro::CustomAttributes key_attributes; + ::avro::CustomAttributes value_attributes; + + // Add required field IDs + key_attributes.addAttribute(std::string(kKeyIdProp), + std::to_string(*key_mapped_field.field_id), + /*addQuote=*/false); + value_attributes.addAttribute(std::string(kValueIdProp), + std::to_string(*value_mapped_field.field_id), + /*addQuote=*/false); + + // Merge custom attributes from original node if they exist + if (original_node->customAttributes() > 0) { + // Merge attributes for key (index 0) + const auto& original_key_attrs = original_node->customAttributesAt(0); + const auto& existing_key_attrs = original_key_attrs.attributes(); + for (const auto& attr_pair : existing_key_attrs) { + // Skip if it's the key ID property we're already setting + if (attr_pair.first != kKeyIdProp) { + key_attributes.addAttribute(attr_pair.first, attr_pair.second, + /*addQuote=*/false); + } + } + + // Merge attributes for value (index 1) + if (original_node->customAttributes() > 1) { + const auto& original_value_attrs = original_node->customAttributesAt(1); + const auto& existing_value_attrs = original_value_attrs.attributes(); + for (const auto& attr_pair : existing_value_attrs) { + // Skip if it's the value ID property we're already setting + if (attr_pair.first != kValueIdProp) { Review Comment: @wgtmac I see there are multiple similar parts in the code. I think it's worth adding a function. What about you? If it's okay, I will add a function. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org