jeffkbkim commented on code in PR #14182: URL: https://github.com/apache/kafka/pull/14182#discussion_r1309478864
########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/OptimizedUniformAssignmentBuilder.java: ########## @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.coordinator.group.assignor; + +import org.apache.kafka.common.Uuid; +import org.apache.kafka.coordinator.group.common.TopicIdPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.stream.IntStream; + +import static java.lang.Math.min; + +/** + * Assigns partitions to members of a consumer group ensuring a balanced distribution with + * considerations for sticky assignments and rack-awareness. + * The order of priority of properties during the assignment will be: + * balance > rack matching (when applicable) > stickiness. 
+ * + * <p> Here's the step-by-step breakdown of the assignment process: + * + * <ul> + * <li> Compute the quotas of partitions for each member based on the total partitions and member count.</li> + * <li> For existing assignments, retain partitions based on the determined quota and member's rack compatibility. + * <li> If a partition's rack mismatches with its member, track it with its prior owner.</li> + * <li> Identify members that haven't fulfilled their partition quota or are eligible to receive extra partitions.</li> + * <li> Derive the unassigned partitions by taking the difference between total partitions and the sticky assignments.</li> + * <li> Depending on members needing extra partitions, select members from the potentially unfilled list + * and add them to the unfilled list.</li> + * <li> Proceed with a round-robin assignment adhering to rack awareness. + * For each unassigned partition, locate the first compatible member from the unfilled list.</li> + * <li> If no rack-compatible member is found, revert to the tracked current owner. + * If that member can't accommodate the partition due to quota limits, resort to a generic round-robin assignment.</li> + * </ul> + */ +public class OptimizedUniformAssignmentBuilder extends UniformAssignor.AbstractAssignmentBuilder { + private static final Logger log = LoggerFactory.getLogger(OptimizedUniformAssignmentBuilder.class); + private final AssignmentSpec assignmentSpec; + private final SubscribedTopicDescriber subscribedTopicDescriber; + // List of topics subscribed to by all members. + private final List<Uuid> subscriptionList; + private final RackInfo rackInfo; + // Count of members to receive an extra partition beyond the minimum quota, + // to account for the distribution of the remaining partitions. + private int remainingMembersToGetAnExtraPartition; + // Map of members to the remaining number of partitions needed to meet the minimum quota, + // including members eligible for an extra partition. 
+ private final Map<String, Integer> potentiallyUnfilledMembers; + // Members mapped to the remaining number of partitions needed to meet the full quota. + // Full quota = minQuota + one extra partition (if applicable). + private Map<String, Integer> unfilledMembers; + private List<TopicIdPartition> unassignedPartitions; + private final Map<String, MemberAssignment> newAssignment; + // Tracks the current owner of each partition when using rack-aware strategy. + // Current refers to the existing assignment. + private final Map<TopicIdPartition, String> currentPartitionOwners; + // Indicates if a rack aware assignment can be done. + // True if racks are defined for both members and partitions. + boolean useRackAwareStrategy; + + OptimizedUniformAssignmentBuilder(AssignmentSpec assignmentSpec, SubscribedTopicDescriber subscribedTopicDescriber) { + this.assignmentSpec = assignmentSpec; + this.subscribedTopicDescriber = subscribedTopicDescriber; + this.subscriptionList = new ArrayList<>(assignmentSpec.members().values().iterator().next().subscribedTopicIds()); + this.rackInfo = new RackInfo(assignmentSpec, subscribedTopicDescriber, subscriptionList); + this.potentiallyUnfilledMembers = new HashMap<>(); + this.unfilledMembers = new HashMap<>(); + this.newAssignment = new HashMap<>(); + this.useRackAwareStrategy = rackInfo.useRackStrategy; + // Without rack-aware strategy, tracking current owners of unassigned partitions is unnecessary + // as all sticky partitions are retained until a member meets its quota. + this. currentPartitionOwners = useRackAwareStrategy ? new HashMap<>() : Collections.emptyMap(); Review Comment: space after `this.` ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/OptimizedUniformAssignmentBuilder.java: ########## @@ -0,0 +1,399 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.coordinator.group.assignor; + +import org.apache.kafka.common.Uuid; +import org.apache.kafka.coordinator.group.common.TopicIdPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; + +import static java.lang.Math.min; + +/** + * Assigns Kafka partitions to members of a consumer group ensuring a balanced distribution with + * considerations for sticky assignments and rack-awareness. + * The order of priority of properties during the assignment will be: balance > rack matching (when applicable) > stickiness. + * + * <p> Here's the step-by-step breakdown of the assignment process: + * + * <ul> + * <li> Compute the quotas of partitions for each member based on the total partitions and member count.</li> + * <li> For existing assignments, retain partitions based on the determined quota and member's rack compatibility. 
+ * <li> If a partition's rack mismatches with its member, track it with its prior owner.</li> + * <li> Identify members that haven't fulfilled their partition quota or are eligible to receive extra partitions.</li> + * <li> Derive the unassigned partitions by taking the difference between total partitions and the sticky assignments.</li> + * <li> Depending on members needing extra partitions, select members from the potentially unfilled list and add them to the unfilled list.</li> + * <li> Proceed with a round-robin assignment adhering to rack awareness. + * For each unassigned partition, locate the first compatible member from the unfilled list.</li> + * <li> If no rack-compatible member is found, revert to the tracked current owner. + * If that member can't accommodate the partition due to quota limits, resort to a generic round-robin assignment.</li> + * </ul> + */ +public class OptimizedUniformAssignmentBuilder extends UniformAssignor.AbstractAssignmentBuilder { + private static final Logger log = LoggerFactory.getLogger(OptimizedUniformAssignmentBuilder.class); + private final AssignmentSpec assignmentSpec; + private final SubscribedTopicDescriber subscribedTopicDescriber; + // List of topics subscribed to by all members. + private final List<Uuid> subscriptionList; + private final RackInfo rackInfo; + // Count of members to receive an extra partition beyond the minimum quota, + // to account for the distribution of the remaining partitions. + private int remainingMembersToGetExtraPartition; + // Map of members to the remaining number of partitions needed to meet the minimum quota, + // including members eligible for an extra partition. + private final Map<String, Integer> potentiallyUnfilledMembers; + // Members mapped to the remaining number of partitions needed to meet the full quota. + // Full quota = minQuota + one extra partition (if applicable). 
+ private Map<String, Integer> unfilledMembers; + private List<TopicIdPartition> unassignedPartitions; + private final Map<String, MemberAssignment> newAssignment; + // Tracks the current owner of each partition when using rack-aware strategy. + // Current refers to the existing assignment. + private final Map<TopicIdPartition, String> currentPartitionOwners; + // Indicates if a rack aware assignment can be done. + // True if racks are defined for both members and partitions. + boolean useRackAwareStrategy; + + OptimizedUniformAssignmentBuilder(AssignmentSpec assignmentSpec, SubscribedTopicDescriber subscribedTopicDescriber) { + this.assignmentSpec = assignmentSpec; + this.subscribedTopicDescriber = subscribedTopicDescriber; + subscriptionList = new ArrayList<>(assignmentSpec.members().values().iterator().next().subscribedTopicIds()); + + RackInfo rackInfo = new RackInfo(assignmentSpec, subscribedTopicDescriber, subscriptionList); + this.rackInfo = rackInfo; + + potentiallyUnfilledMembers = new HashMap<>(); + unfilledMembers = new HashMap<>(); + newAssignment = new HashMap<>(); + + // Without rack-aware strategy, tracking current owners of unassigned partitions is unnecessary + // as all sticky partitions are retained until a member meets its quota. + if (rackInfo.memberRacks.isEmpty() || rackInfo.partitionRacks.isEmpty()) { + this.useRackAwareStrategy = false; + currentPartitionOwners = Collections.emptyMap(); + } else { + this.useRackAwareStrategy = true; + currentPartitionOwners = new HashMap<>(); + } + } + + @Override + protected GroupAssignment buildAssignment() { + int totalPartitionsCount = 0; + // Removes the current topic from subscriptionList if the topic doesn't exist in the topic metadata. 
+ Iterator<Uuid> iterator = subscriptionList.iterator(); + while (iterator.hasNext()) { + Uuid topicId = iterator.next(); + int partitionCount = subscribedTopicDescriber.numPartitions(topicId); + if (partitionCount == -1) { + log.warn("Members are subscribed to topic " + topicId + " which doesn't exist in the topic metadata."); + iterator.remove(); + } else { + totalPartitionsCount += partitionCount; + } + } + + if (subscriptionList.isEmpty()) { + log.info("The subscription list is empty, returning an empty assignment"); + return new GroupAssignment(Collections.emptyMap()); + } + + // The minimum required quota that each member needs to meet for a balanced assignment. + // This is the same for all members. + final int minQuota; + int numberOfMembers = assignmentSpec.members().size(); + minQuota = (int) Math.floor(((double) totalPartitionsCount) / numberOfMembers); + remainingMembersToGetExtraPartition = totalPartitionsCount % numberOfMembers; + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> + newAssignment.put(memberId, new MemberAssignment(new HashMap<>())) + ); + + Set<TopicIdPartition> allAssignedStickyPartitions = computeAssignedStickyPartitions(minQuota); + + unassignedPartitions = getUnassignedPartitions(allAssignedStickyPartitions); + unfilledMembers = getUnfilledMembers(); + + if (!isTotalUnassignedPartitionsEqualsTotalRemainingAssignments()) { + log.warn("Number of available partitions is not equal to the total requirement"); + } + + if (useRackAwareStrategy) rackAwareRoundRobinAssignment(); + unassignedPartitionsRoundRobinAssignment(); + + return new GroupAssignment(newAssignment); + } + + /** + * Retrieves a set of partitions that were currently assigned to members and will be retained in the new assignment, + * by ensuring that the partitions are still relevant based on current topic metadata and subscriptions. + * If rack awareness is enabled, it ensures that a partition's rack matches the member's rack. 
+ * + * <p> For each member, it: + * <ul> + * <li> Finds the valid current assignment considering topic subscriptions and metadata.</li> + * <li> If current assignments exist, retains up to the minimum quota of assignments.</li> + * <li> If there are members that should get an extra partition, assigns the next partition after the retained ones.</li> + * <li> For members with assignments not exceeding the minimum quota, + * it identifies them as potentially unfilled members and tracks the remaining quota.</li> + * </ul> + * + * @return A set containing all the sticky partitions that have been retained in the new assignment. + */ + private Set<TopicIdPartition> computeAssignedStickyPartitions(Integer minQuota) { + Set<TopicIdPartition> allAssignedStickyPartitions = new HashSet<>(); + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> { + // Remove all the topics that aren't in the subscriptions or the topic metadata anymore. + // If rack awareness is enabled, only add partitions if the consumers rack matches the partitions rack. + List<TopicIdPartition> validCurrentAssignment = getValidCurrentAssignment(memberId, assignmentMemberSpec.assignedPartitions()); + + int currentAssignmentSize = validCurrentAssignment.size(); + int remaining = minQuota - currentAssignmentSize; + + if (currentAssignmentSize > 0) { + int retainedPartitionsCount = min(currentAssignmentSize, minQuota); + for (int i = 0; i < retainedPartitionsCount; i++) { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(i).topicId(), k -> new HashSet<>()) + .add(validCurrentAssignment.get(i).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(i)); + } + + // The extra partition is located at the index "retainedPartitionsCount" from the current step. 
+ if (remaining < 0 && remainingMembersToGetExtraPartition > 0) { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(retainedPartitionsCount).topicId(), k -> new HashSet<>()) + .add(validCurrentAssignment.get(retainedPartitionsCount).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(retainedPartitionsCount)); + remainingMembersToGetExtraPartition--; + } + } + if (remaining >= 0) { + potentiallyUnfilledMembers.put(memberId, remaining); + } + }); + + return allAssignedStickyPartitions; + } + + /** + * Filters the current assignment of partitions for a given member. + * + * If a partition is assigned to a member not subscribed to its topic or + * if the rack-aware strategy is to be used but there is a mismatch, + * the partition is excluded from the valid assignment and stored for future consideration. + * + * @param memberId The Id of the member whose assignment is being validated. + * @param assignedPartitions The partitions currently assigned to the member. + * + * @return List of valid partitions after applying the filters. 
+ */ + private List<TopicIdPartition> getValidCurrentAssignment(String memberId, Map<Uuid, Set<Integer>> assignedPartitions) { + List<TopicIdPartition> validCurrentAssignmentList = new ArrayList<>(); + + assignedPartitions.forEach((topicId, currentAssignment) -> { + List<Integer> currentAssignmentList = new ArrayList<>(currentAssignment); + if (subscriptionList.contains(topicId)) { + for (Integer partition : currentAssignmentList) { + TopicIdPartition topicIdPartition = new TopicIdPartition(topicId, partition); + if (useRackAwareStrategy && rackInfo.racksMismatch(memberId, topicIdPartition)) { + currentPartitionOwners.put(topicIdPartition, memberId); + } else { + validCurrentAssignmentList.add(topicIdPartition); + } + } + } + }); + + return validCurrentAssignmentList; + } + + /** + * This method iterates over the unassigned partitions and attempts to allocate them + * to members while considering their rack affiliations. + */ + private void rackAwareRoundRobinAssignment() { + Queue<String> roundRobinMembers = new LinkedList<>(unfilledMembers.keySet()); + + // Sorts partitions in ascending order by number of potential consumers with matching racks. + // Partitions with no potential members aren't included in this list. 
+ List<TopicIdPartition> sortedPartitions = rackInfo.sortPartitionsByRackConsumers(unassignedPartitions); + + Iterator<TopicIdPartition> partitionIterator = sortedPartitions.iterator(); + while (partitionIterator.hasNext()) { + TopicIdPartition partition = partitionIterator.next(); + boolean assigned = false; + for (int i = 0; i < roundRobinMembers.size() && !assigned; i++) { + String memberId = roundRobinMembers.poll(); + Integer remainingPartitionCount = unfilledMembers.get(memberId); + + if (remainingPartitionCount != null && remainingPartitionCount > 0 && !rackInfo.racksMismatch(memberId, partition)) { + assignPartitionToMember(memberId, partition); + assigned = true; + partitionIterator.remove(); + unassignedPartitions.remove(partition); + } + + // Only re-add to the end of the queue if it's still in the unfilledMembers map + if (unfilledMembers.containsKey(memberId)) { + roundRobinMembers.add(memberId); + } + } + } + } + + /** + * Allocates the unassigned partitions to available members. + * + * If the rack-aware strategy is enabled, partitions are attempted to be assigned back to their current owners first. 
+ * + * If a partition couldn't be assigned to its current owner due to quotas or + * if the rack-aware strategy is not enabled, the partitions are allocated to members in a round-robin fashion.</p> + */ + private void unassignedPartitionsRoundRobinAssignment() { + Queue<String> roundRobinMembers = new LinkedList<>(unfilledMembers.keySet()); + Iterator<TopicIdPartition> partitionIterator = unassignedPartitions.iterator(); + + while (partitionIterator.hasNext()) { + TopicIdPartition partition = partitionIterator.next(); + boolean assigned = false; + + if (useRackAwareStrategy && currentPartitionOwners.containsKey(partition)) { + String prevOwner = currentPartitionOwners.get(partition); + if (unfilledMembers.containsKey(prevOwner)) { + assignPartitionToMember(prevOwner, partition); + assigned = true; + partitionIterator.remove(); + if (!unfilledMembers.containsKey(prevOwner)) { + roundRobinMembers.remove(prevOwner); + } + } + } + + // Only re-add the member to the end of the queue if it's still available for assignment. + for (int i = 0; i < unfilledMembers.size() && !assigned; i++) { Review Comment: can you remind me again why we need to do at least n number of iterations? ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/RangeAssignor.java: ########## @@ -17,6 +17,8 @@ package org.apache.kafka.coordinator.group.assignor; import org.apache.kafka.common.Uuid; +import org.slf4j.Logger; Review Comment: Ok ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/OptimizedUniformAssignmentBuilder.java: ########## @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.coordinator.group.assignor; + +import org.apache.kafka.common.Uuid; +import org.apache.kafka.coordinator.group.common.TopicIdPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.stream.IntStream; + +import static java.lang.Math.min; + +/** + * Assigns Kafka partitions to members of a consumer group ensuring a balanced distribution with + * considerations for sticky assignments and rack-awareness. + * The order of priority of properties during the assignment will be: balance > rack matching (when applicable) > stickiness. + * + * <p> Here's the step-by-step breakdown of the assignment process: + * + * <ul> + * <li> Compute the quotas of partitions for each member based on the total partitions and member count.</li> + * <li> For existing assignments, retain partitions based on the determined quota and member's rack compatibility. 
+ * <li> If a partition's rack mismatches with its member, track it with its prior owner.</li> + * <li> Identify members that haven't fulfilled their partition quota or are eligible to receive extra partitions.</li> + * <li> Derive the unassigned partitions by taking the difference between total partitions and the sticky assignments.</li> + * <li> Depending on members needing extra partitions, select members from the potentially unfilled list + * and add them to the unfilled list.</li> + * <li> Proceed with a round-robin assignment adhering to rack awareness. + * For each unassigned partition, locate the first compatible member from the unfilled list.</li> + * <li> If no rack-compatible member is found, revert to the tracked current owner. + * If that member can't accommodate the partition due to quota limits, resort to a generic round-robin assignment.</li> + * </ul> + */ +public class OptimizedUniformAssignmentBuilder extends UniformAssignor.AbstractAssignmentBuilder { + private static final Logger log = LoggerFactory.getLogger(OptimizedUniformAssignmentBuilder.class); + private final AssignmentSpec assignmentSpec; + private final SubscribedTopicDescriber subscribedTopicDescriber; + // List of topics subscribed to by all members. + private final List<Uuid> subscriptionList; + private final RackInfo rackInfo; + // Count of members to receive an extra partition beyond the minimum quota, + // to account for the distribution of the remaining partitions. + private int remainingMembersToGetAnExtraPartition; + // Map of members to the remaining number of partitions needed to meet the minimum quota, + // including members eligible for an extra partition. + private final Map<String, Integer> potentiallyUnfilledMembers; + // Members mapped to the remaining number of partitions needed to meet the full quota. + // Full quota = minQuota + one extra partition (if applicable). 
+ private Map<String, Integer> unfilledMembers; + private List<TopicIdPartition> unassignedPartitions; + private final Map<String, MemberAssignment> newAssignment; + // Tracks the current owner of each partition when using rack-aware strategy. + // Current refers to the existing assignment. + private final Map<TopicIdPartition, String> currentPartitionOwners; + // Indicates if a rack aware assignment can be done. + // True if racks are defined for both members and partitions. + boolean useRackAwareStrategy; + + OptimizedUniformAssignmentBuilder(AssignmentSpec assignmentSpec, SubscribedTopicDescriber subscribedTopicDescriber) { + this.assignmentSpec = assignmentSpec; + this.subscribedTopicDescriber = subscribedTopicDescriber; + this.subscriptionList = new ArrayList<>(assignmentSpec.members().values().iterator().next().subscribedTopicIds()); + + this.rackInfo = new RackInfo(assignmentSpec, subscribedTopicDescriber, subscriptionList); + this.potentiallyUnfilledMembers = new HashMap<>(); + this.unfilledMembers = new HashMap<>(); + this.newAssignment = new HashMap<>(); + + // If consumer or member racks are unavailable, rack aware strategy is not used. + this.useRackAwareStrategy = !(rackInfo.memberRacks.isEmpty() || rackInfo.partitionRacks.isEmpty()); + // Without rack-aware strategy, tracking current owners of unassigned partitions is unnecessary + // as all sticky partitions are retained until a member meets its quota. + this. currentPartitionOwners = useRackAwareStrategy ? new HashMap<>() : Collections.emptyMap(); + } + + @Override + protected GroupAssignment buildAssignment() throws PartitionAssignorException{ + int totalPartitionsCount = 0; + // Removes the current topic from subscriptionList if the topic doesn't exist in the topic metadata. 
+ Iterator<Uuid> iterator = subscriptionList.iterator(); + while (iterator.hasNext()) { + Uuid topicId = iterator.next(); + int partitionCount = subscribedTopicDescriber.numPartitions(topicId); + if (partitionCount == -1) { + log.warn("Members are subscribed to topic " + topicId + " which doesn't exist in the topic metadata."); + iterator.remove(); + } else { + totalPartitionsCount += partitionCount; + } + } + + if (subscriptionList.isEmpty()) { + log.info("The subscription list is empty, returning an empty assignment"); + return new GroupAssignment(Collections.emptyMap()); + } + + // The minimum required quota that each member needs to meet for a balanced assignment. + // This is the same for all members. + int numberOfMembers = assignmentSpec.members().size(); + final int minQuota = (int) Math.floor(((double) totalPartitionsCount) / numberOfMembers); + remainingMembersToGetAnExtraPartition = totalPartitionsCount % numberOfMembers; + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> + newAssignment.put(memberId, new MemberAssignment(new HashMap<>())) + ); + + Set<TopicIdPartition> allAssignedStickyPartitions = computeAssignedStickyPartitions(minQuota); + unassignedPartitions = computeUnassignedPartitions(allAssignedStickyPartitions); + unfilledMembers = computeUnfilledMembers(); + + if (!isTotalUnassignedPartitionsEqualsTotalRemainingAssignments()) { + throw new PartitionAssignorException("Number of available partitions is not equal to the total requirement"); + } + + if (useRackAwareStrategy) rackAwareRoundRobinAssignment(); + unassignedPartitionsRoundRobinAssignment(); + + return new GroupAssignment(newAssignment); + } + + /** + * Retrieves a set of partitions that were currently assigned to members and will be retained in the new assignment, + * by ensuring that the partitions are still relevant based on current topic metadata and subscriptions. + * If rack awareness is enabled, it ensures that a partition's rack matches the member's rack. 
+ * + * <p> For each member, it: + * <ul> + * <li> Finds the valid current assignment considering topic subscriptions and metadata.</li> + * <li> If current assignments exist, retains up to the minimum quota of assignments.</li> + * <li> If there are members that should get an extra partition, + * assigns the next partition after the retained ones.</li> + * <li> For members with assignments not exceeding the minimum quota, + * it identifies them as potentially unfilled members and tracks the remaining quota.</li> + * </ul> + * + * @return A set containing all the sticky partitions that have been retained in the new assignment. + */ + private Set<TopicIdPartition> computeAssignedStickyPartitions(Integer minQuota) { + Set<TopicIdPartition> allAssignedStickyPartitions = new HashSet<>(); + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> { + // Remove all the topics that aren't in the subscriptions or the topic metadata anymore. + // If rack awareness is enabled, only add partitions if the consumers rack matches the partitions rack. + List<TopicIdPartition> validCurrentAssignment = validCurrentAssignment( + memberId, + assignmentMemberSpec.assignedPartitions() + ); + + int currentAssignmentSize = validCurrentAssignment.size(); + int remaining = minQuota - currentAssignmentSize; + + if (currentAssignmentSize > 0) { + int retainedPartitionsCount = min(currentAssignmentSize, minQuota); + IntStream.range(0, retainedPartitionsCount).forEach(i -> { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(i).topicId(), __ -> new HashSet<>()) + .add(validCurrentAssignment.get(i).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(i)); + }); + + // The extra partition is located at the index "retainedPartitionsCount" from the current step. 
+ if (remaining < 0 && remainingMembersToGetAnExtraPartition > 0) { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(retainedPartitionsCount).topicId(), __ -> new HashSet<>()) + .add(validCurrentAssignment.get(retainedPartitionsCount).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(retainedPartitionsCount)); + remainingMembersToGetAnExtraPartition--; + } + } + + if (remaining >= 0) { + potentiallyUnfilledMembers.put(memberId, remaining); + } + + }); + + return allAssignedStickyPartitions; + } + + /** + * Filters the current assignment of partitions for a given member. + * + * If a partition is assigned to a member not subscribed to its topic or + * if the rack-aware strategy is to be used but there is a mismatch, + * the partition is excluded from the valid assignment and stored for future consideration. + * + * @param memberId The Id of the member whose assignment is being validated. + * @param assignedPartitions The partitions currently assigned to the member. + * + * @return List of valid partitions after applying the filters. 
+ */ + private List<TopicIdPartition> validCurrentAssignment( + String memberId, + Map<Uuid, Set<Integer>> assignedPartitions + ) { + List<TopicIdPartition> validCurrentAssignmentList = new ArrayList<>(); + Set<Uuid> subscriptionSet = new HashSet<>(subscriptionList); + + assignedPartitions.forEach((topicId, currentAssignment) -> { + if (subscriptionSet.contains(topicId)) { + currentAssignment.forEach((partition) -> { + TopicIdPartition topicIdPartition = new TopicIdPartition(topicId, partition); + if (useRackAwareStrategy && rackInfo.racksMismatch(memberId, topicIdPartition)) { + currentPartitionOwners.put(topicIdPartition, memberId); + } else { + validCurrentAssignmentList.add(topicIdPartition); + } + }); + } + }); + + return validCurrentAssignmentList; + } + + /** + * This method iterates over the unassigned partitions and attempts to allocate them + * to members while considering their rack affiliations. + */ + private void rackAwareRoundRobinAssignment() { + Queue<String> roundRobinMembers = new LinkedList<>(unfilledMembers.keySet()); + + // Sorts partitions in ascending order by number of potential consumers with matching racks. + // Partitions with no potential members aren't included in this list. + List<TopicIdPartition> sortedPartitions = rackInfo.sortPartitionsByRackConsumers(unassignedPartitions); + + sortedPartitions.forEach((partition) -> { + boolean assigned = false; + for (int i = 0; i < roundRobinMembers.size() && !assigned; i++) { Review Comment: the while loop will still iterate through all members at the very least right? since we poll one every iteration can you help me understand why the while loop can be infinite whereas the for loop cannot? ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/OptimizedUniformAssignmentBuilder.java: ########## @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.coordinator.group.assignor; + +import org.apache.kafka.common.Uuid; +import org.apache.kafka.coordinator.group.common.TopicIdPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.stream.IntStream; + +import static java.lang.Math.min; + +/** + * Assigns Kafka partitions to members of a consumer group ensuring a balanced distribution with + * considerations for sticky assignments and rack-awareness. + * The order of priority of properties during the assignment will be: balance > rack matching (when applicable) > stickiness. + * + * <p> Here's the step-by-step breakdown of the assignment process: + * + * <ul> + * <li> Compute the quotas of partitions for each member based on the total partitions and member count.</li> + * <li> For existing assignments, retain partitions based on the determined quota and member's rack compatibility. 
+ * <li> If a partition's rack mismatches with its member, track it with its prior owner.</li> + * <li> Identify members that haven't fulfilled their partition quota or are eligible to receive extra partitions.</li> + * <li> Derive the unassigned partitions by taking the difference between total partitions and the sticky assignments.</li> + * <li> Depending on members needing extra partitions, select members from the potentially unfilled list + * and add them to the unfilled list.</li> + * <li> Proceed with a round-robin assignment adhering to rack awareness. + * For each unassigned partition, locate the first compatible member from the unfilled list.</li> + * <li> If no rack-compatible member is found, revert to the tracked current owner. + * If that member can't accommodate the partition due to quota limits, resort to a generic round-robin assignment.</li> + * </ul> + */ +public class OptimizedUniformAssignmentBuilder extends UniformAssignor.AbstractAssignmentBuilder { + private static final Logger log = LoggerFactory.getLogger(OptimizedUniformAssignmentBuilder.class); + private final AssignmentSpec assignmentSpec; + private final SubscribedTopicDescriber subscribedTopicDescriber; + // List of topics subscribed to by all members. + private final List<Uuid> subscriptionList; + private final RackInfo rackInfo; + // Count of members to receive an extra partition beyond the minimum quota, + // to account for the distribution of the remaining partitions. + private int remainingMembersToGetAnExtraPartition; + // Map of members to the remaining number of partitions needed to meet the minimum quota, + // including members eligible for an extra partition. + private final Map<String, Integer> potentiallyUnfilledMembers; + // Members mapped to the remaining number of partitions needed to meet the full quota. + // Full quota = minQuota + one extra partition (if applicable). 
+ private Map<String, Integer> unfilledMembers; + private List<TopicIdPartition> unassignedPartitions; + private final Map<String, MemberAssignment> newAssignment; + // Tracks the current owner of each partition when using rack-aware strategy. + // Current refers to the existing assignment. + private final Map<TopicIdPartition, String> currentPartitionOwners; + // Indicates if a rack aware assignment can be done. + // True if racks are defined for both members and partitions. + boolean useRackAwareStrategy; + + OptimizedUniformAssignmentBuilder(AssignmentSpec assignmentSpec, SubscribedTopicDescriber subscribedTopicDescriber) { + this.assignmentSpec = assignmentSpec; + this.subscribedTopicDescriber = subscribedTopicDescriber; + this.subscriptionList = new ArrayList<>(assignmentSpec.members().values().iterator().next().subscribedTopicIds()); + + this.rackInfo = new RackInfo(assignmentSpec, subscribedTopicDescriber, subscriptionList); + this.potentiallyUnfilledMembers = new HashMap<>(); + this.unfilledMembers = new HashMap<>(); + this.newAssignment = new HashMap<>(); + + // If consumer or member racks are unavailable, rack aware strategy is not used. + this.useRackAwareStrategy = !(rackInfo.memberRacks.isEmpty() || rackInfo.partitionRacks.isEmpty()); + // Without rack-aware strategy, tracking current owners of unassigned partitions is unnecessary + // as all sticky partitions are retained until a member meets its quota. + this. currentPartitionOwners = useRackAwareStrategy ? new HashMap<>() : Collections.emptyMap(); + } + + @Override + protected GroupAssignment buildAssignment() throws PartitionAssignorException{ + int totalPartitionsCount = 0; + // Removes the current topic from subscriptionList if the topic doesn't exist in the topic metadata. 
+ Iterator<Uuid> iterator = subscriptionList.iterator(); + while (iterator.hasNext()) { + Uuid topicId = iterator.next(); + int partitionCount = subscribedTopicDescriber.numPartitions(topicId); + if (partitionCount == -1) { + log.warn("Members are subscribed to topic " + topicId + " which doesn't exist in the topic metadata."); + iterator.remove(); + } else { + totalPartitionsCount += partitionCount; + } + } + + if (subscriptionList.isEmpty()) { + log.info("The subscription list is empty, returning an empty assignment"); + return new GroupAssignment(Collections.emptyMap()); + } + + // The minimum required quota that each member needs to meet for a balanced assignment. + // This is the same for all members. + int numberOfMembers = assignmentSpec.members().size(); + final int minQuota = (int) Math.floor(((double) totalPartitionsCount) / numberOfMembers); + remainingMembersToGetAnExtraPartition = totalPartitionsCount % numberOfMembers; + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> + newAssignment.put(memberId, new MemberAssignment(new HashMap<>())) + ); + + Set<TopicIdPartition> allAssignedStickyPartitions = computeAssignedStickyPartitions(minQuota); + unassignedPartitions = computeUnassignedPartitions(allAssignedStickyPartitions); + unfilledMembers = computeUnfilledMembers(); + + if (!isTotalUnassignedPartitionsEqualsTotalRemainingAssignments()) { + throw new PartitionAssignorException("Number of available partitions is not equal to the total requirement"); + } + + if (useRackAwareStrategy) rackAwareRoundRobinAssignment(); + unassignedPartitionsRoundRobinAssignment(); + + return new GroupAssignment(newAssignment); + } + + /** + * Retrieves a set of partitions that were currently assigned to members and will be retained in the new assignment, + * by ensuring that the partitions are still relevant based on current topic metadata and subscriptions. + * If rack awareness is enabled, it ensures that a partition's rack matches the member's rack. 
+ * + * <p> For each member, it: + * <ul> + * <li> Finds the valid current assignment considering topic subscriptions and metadata.</li> + * <li> If current assignments exist, retains up to the minimum quota of assignments.</li> + * <li> If there are members that should get an extra partition, + * assigns the next partition after the retained ones.</li> + * <li> For members with assignments not exceeding the minimum quota, + * it identifies them as potentially unfilled members and tracks the remaining quota.</li> + * </ul> + * + * @return A set containing all the sticky partitions that have been retained in the new assignment. + */ + private Set<TopicIdPartition> computeAssignedStickyPartitions(Integer minQuota) { + Set<TopicIdPartition> allAssignedStickyPartitions = new HashSet<>(); + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> { + // Remove all the topics that aren't in the subscriptions or the topic metadata anymore. + // If rack awareness is enabled, only add partitions if the consumers rack matches the partitions rack. + List<TopicIdPartition> validCurrentAssignment = validCurrentAssignment( + memberId, + assignmentMemberSpec.assignedPartitions() + ); + + int currentAssignmentSize = validCurrentAssignment.size(); + int remaining = minQuota - currentAssignmentSize; + + if (currentAssignmentSize > 0) { + int retainedPartitionsCount = min(currentAssignmentSize, minQuota); + IntStream.range(0, retainedPartitionsCount).forEach(i -> { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(i).topicId(), __ -> new HashSet<>()) + .add(validCurrentAssignment.get(i).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(i)); + }); + + // The extra partition is located at the index "retainedPartitionsCount" from the current step. 
+ if (remaining < 0 && remainingMembersToGetAnExtraPartition > 0) { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(retainedPartitionsCount).topicId(), __ -> new HashSet<>()) + .add(validCurrentAssignment.get(retainedPartitionsCount).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(retainedPartitionsCount)); + remainingMembersToGetAnExtraPartition--; + } + } + + if (remaining >= 0) { + potentiallyUnfilledMembers.put(memberId, remaining); + } + + }); + + return allAssignedStickyPartitions; + } + + /** + * Filters the current assignment of partitions for a given member. + * + * If a partition is assigned to a member not subscribed to its topic or + * if the rack-aware strategy is to be used but there is a mismatch, + * the partition is excluded from the valid assignment and stored for future consideration. + * + * @param memberId The Id of the member whose assignment is being validated. + * @param assignedPartitions The partitions currently assigned to the member. + * + * @return List of valid partitions after applying the filters. 
+ */ + private List<TopicIdPartition> validCurrentAssignment( + String memberId, + Map<Uuid, Set<Integer>> assignedPartitions + ) { + List<TopicIdPartition> validCurrentAssignmentList = new ArrayList<>(); + Set<Uuid> subscriptionSet = new HashSet<>(subscriptionList); + + assignedPartitions.forEach((topicId, currentAssignment) -> { + if (subscriptionSet.contains(topicId)) { + currentAssignment.forEach((partition) -> { + TopicIdPartition topicIdPartition = new TopicIdPartition(topicId, partition); + if (useRackAwareStrategy && rackInfo.racksMismatch(memberId, topicIdPartition)) { + currentPartitionOwners.put(topicIdPartition, memberId); + } else { + validCurrentAssignmentList.add(topicIdPartition); + } + }); + } + }); + + return validCurrentAssignmentList; + } + + /** + * This method iterates over the unassigned partitions and attempts to allocate them + * to members while considering their rack affiliations. + */ + private void rackAwareRoundRobinAssignment() { + Queue<String> roundRobinMembers = new LinkedList<>(unfilledMembers.keySet()); + + // Sorts partitions in ascending order by number of potential consumers with matching racks. + // Partitions with no potential members aren't included in this list. 
+ List<TopicIdPartition> sortedPartitions = rackInfo.sortPartitionsByRackConsumers(unassignedPartitions); + + sortedPartitions.forEach((partition) -> { + boolean assigned = false; + for (int i = 0; i < roundRobinMembers.size() && !assigned; i++) { + String memberId = roundRobinMembers.poll(); + Integer remainingPartitionCount = unfilledMembers.getOrDefault(memberId, 0); + + if (remainingPartitionCount > 0 && !rackInfo.racksMismatch(memberId, partition)) { + assignPartitionToMember(memberId, partition); + assigned = true; + unassignedPartitions.remove(partition); + } + + // Only re-add to the end of the queue if it's still in the unfilledMembers map + if (unfilledMembers.containsKey(memberId)) { + roundRobinMembers.add(memberId); + } + } + }); + } + + /** + * Allocates the unassigned partitions to available members. + * + * If the rack-aware strategy is enabled, partitions are attempted to be assigned back to their current owners first. + * + * If a partition couldn't be assigned to its current owner due to quotas or + * if the rack-aware strategy is not enabled, the partitions are allocated to members in a round-robin fashion.</p> + */ + private void unassignedPartitionsRoundRobinAssignment() { + Queue<String> roundRobinMembers = new LinkedList<>(unfilledMembers.keySet()); + + unassignedPartitions.forEach((partition) -> { + boolean assigned = false; + + if (useRackAwareStrategy && currentPartitionOwners.containsKey(partition)) { + String prevOwner = currentPartitionOwners.get(partition); + if (unfilledMembers.containsKey(prevOwner)) { + assignPartitionToMember(prevOwner, partition); + assigned = true; + if (!unfilledMembers.containsKey(prevOwner)) { + roundRobinMembers.remove(prevOwner); + } + } + } + + // Only re-add the member to the end of the queue if it's still available for assignment. 
+ for (int i = 0; i < unfilledMembers.size() && !assigned; i++) { + String memberId = roundRobinMembers.poll(); + if (unfilledMembers.get(memberId) > 0) { + assignPartitionToMember(memberId, partition); + assigned = true; + } + if (unfilledMembers.containsKey(memberId)) { + roundRobinMembers.add(memberId); + } + } + }); + } + + /** + * Assigns the specified partition to the given member. + * + * <p> + * If the member has met their allocation quota, the member is removed from the + * tracking map of members with their remaining allocations. + * Otherwise, the count of remaining partitions that can be assigned to the member is updated. + * </p> + * + * @param memberId The Id of the member to which the partition will be assigned. + * @param partition The partition to be assigned. + */ + private void assignPartitionToMember(String memberId, TopicIdPartition partition) { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(partition.topicId(), __ -> new HashSet<>()) + .add(partition.partition()); + + int remainingPartitionCount = unfilledMembers.get(memberId) - 1; + if (remainingPartitionCount == 0) { + unfilledMembers.remove(memberId); + } else { + unfilledMembers.put(memberId, remainingPartitionCount); + } + } + + /** + * Determines which members can still be assigned partitions to meet the full quota. + * + * @return A map of member IDs and their capacity for additional partitions. + */ + private Map<String, Integer> computeUnfilledMembers() { + Map<String, Integer> unfilledMembers = new HashMap<>(); + + potentiallyUnfilledMembers.forEach((memberId, remaining) -> { + if (remainingMembersToGetAnExtraPartition > 0) { + remaining++; + remainingMembersToGetAnExtraPartition--; + } + if (remaining > 0) { + unfilledMembers.put(memberId, remaining); + } + }); + + return unfilledMembers; + } + + /** + * This method compares the full list of partitions against the set of already + * assigned sticky partitions to identify those that still need to be allocated. 
+ * + * @param allAssignedStickyPartitions Set of partitions that have already been assigned. + * @return List of unassigned partitions. + */ + private List<TopicIdPartition> computeUnassignedPartitions(Set<TopicIdPartition> allAssignedStickyPartitions) { + List<TopicIdPartition> unassignedPartitions = new ArrayList<>(); + List<Uuid> sortedAllTopics = new ArrayList<>(subscriptionList); Review Comment: > All means every single topic, right? If it was just sorted topics, which/whose topics? That would make sense if we are using collections that form a subset of all topics somewhere in the code. Otherwise it seems redundant. I'm not sure I follow. Even if the topics weren't sorted, we would still ensure unassigned partitions are present, no? What makes them distribute more evenly? ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/OptimizedUniformAssignmentBuilder.java: ########## @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.kafka.coordinator.group.assignor; + +import org.apache.kafka.common.Uuid; +import org.apache.kafka.coordinator.group.common.TopicIdPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.stream.IntStream; + +import static java.lang.Math.min; + +/** + * Assigns Kafka partitions to members of a consumer group ensuring a balanced distribution with + * considerations for sticky assignments and rack-awareness. + * The order of priority of properties during the assignment will be: balance > rack matching (when applicable) > stickiness. + * + * <p> Here's the step-by-step breakdown of the assignment process: + * + * <ul> + * <li> Compute the quotas of partitions for each member based on the total partitions and member count.</li> + * <li> For existing assignments, retain partitions based on the determined quota and member's rack compatibility. + * <li> If a partition's rack mismatches with its member, track it with its prior owner.</li> + * <li> Identify members that haven't fulfilled their partition quota or are eligible to receive extra partitions.</li> + * <li> Derive the unassigned partitions by taking the difference between total partitions and the sticky assignments.</li> + * <li> Depending on members needing extra partitions, select members from the potentially unfilled list + * and add them to the unfilled list.</li> + * <li> Proceed with a round-robin assignment adhering to rack awareness. + * For each unassigned partition, locate the first compatible member from the unfilled list.</li> + * <li> If no rack-compatible member is found, revert to the tracked current owner. 
+ * If that member can't accommodate the partition due to quota limits, resort to a generic round-robin assignment.</li> + * </ul> + */ +public class OptimizedUniformAssignmentBuilder extends UniformAssignor.AbstractAssignmentBuilder { + private static final Logger log = LoggerFactory.getLogger(OptimizedUniformAssignmentBuilder.class); + private final AssignmentSpec assignmentSpec; + private final SubscribedTopicDescriber subscribedTopicDescriber; + // List of topics subscribed to by all members. + private final List<Uuid> subscriptionList; + private final RackInfo rackInfo; + // Count of members to receive an extra partition beyond the minimum quota, + // to account for the distribution of the remaining partitions. + private int remainingMembersToGetAnExtraPartition; + // Map of members to the remaining number of partitions needed to meet the minimum quota, + // including members eligible for an extra partition. + private final Map<String, Integer> potentiallyUnfilledMembers; + // Members mapped to the remaining number of partitions needed to meet the full quota. + // Full quota = minQuota + one extra partition (if applicable). + private Map<String, Integer> unfilledMembers; + private List<TopicIdPartition> unassignedPartitions; + private final Map<String, MemberAssignment> newAssignment; + // Tracks the current owner of each partition when using rack-aware strategy. + // Current refers to the existing assignment. + private final Map<TopicIdPartition, String> currentPartitionOwners; + // Indicates if a rack aware assignment can be done. + // True if racks are defined for both members and partitions. 
+ boolean useRackAwareStrategy; + + OptimizedUniformAssignmentBuilder(AssignmentSpec assignmentSpec, SubscribedTopicDescriber subscribedTopicDescriber) { + this.assignmentSpec = assignmentSpec; + this.subscribedTopicDescriber = subscribedTopicDescriber; + this.subscriptionList = new ArrayList<>(assignmentSpec.members().values().iterator().next().subscribedTopicIds()); + + this.rackInfo = new RackInfo(assignmentSpec, subscribedTopicDescriber, subscriptionList); + this.potentiallyUnfilledMembers = new HashMap<>(); + this.unfilledMembers = new HashMap<>(); + this.newAssignment = new HashMap<>(); + + // If consumer or member racks are unavailable, rack aware strategy is not used. + this.useRackAwareStrategy = !(rackInfo.memberRacks.isEmpty() || rackInfo.partitionRacks.isEmpty()); + // Without rack-aware strategy, tracking current owners of unassigned partitions is unnecessary + // as all sticky partitions are retained until a member meets its quota. + this. currentPartitionOwners = useRackAwareStrategy ? new HashMap<>() : Collections.emptyMap(); + } + + @Override + protected GroupAssignment buildAssignment() throws PartitionAssignorException{ + int totalPartitionsCount = 0; + // Removes the current topic from subscriptionList if the topic doesn't exist in the topic metadata. + Iterator<Uuid> iterator = subscriptionList.iterator(); + while (iterator.hasNext()) { + Uuid topicId = iterator.next(); + int partitionCount = subscribedTopicDescriber.numPartitions(topicId); + if (partitionCount == -1) { + log.warn("Members are subscribed to topic " + topicId + " which doesn't exist in the topic metadata."); + iterator.remove(); + } else { + totalPartitionsCount += partitionCount; + } + } + + if (subscriptionList.isEmpty()) { + log.info("The subscription list is empty, returning an empty assignment"); + return new GroupAssignment(Collections.emptyMap()); + } + + // The minimum required quota that each member needs to meet for a balanced assignment. 
+ // This is the same for all members. + int numberOfMembers = assignmentSpec.members().size(); + final int minQuota = (int) Math.floor(((double) totalPartitionsCount) / numberOfMembers); + remainingMembersToGetAnExtraPartition = totalPartitionsCount % numberOfMembers; + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> + newAssignment.put(memberId, new MemberAssignment(new HashMap<>())) + ); + + Set<TopicIdPartition> allAssignedStickyPartitions = computeAssignedStickyPartitions(minQuota); + unassignedPartitions = computeUnassignedPartitions(allAssignedStickyPartitions); + unfilledMembers = computeUnfilledMembers(); + + if (!isTotalUnassignedPartitionsEqualsTotalRemainingAssignments()) { + throw new PartitionAssignorException("Number of available partitions is not equal to the total requirement"); + } + + if (useRackAwareStrategy) rackAwareRoundRobinAssignment(); + unassignedPartitionsRoundRobinAssignment(); + + return new GroupAssignment(newAssignment); + } + + /** + * Retrieves a set of partitions that were currently assigned to members and will be retained in the new assignment, + * by ensuring that the partitions are still relevant based on current topic metadata and subscriptions. + * If rack awareness is enabled, it ensures that a partition's rack matches the member's rack. + * + * <p> For each member, it: + * <ul> + * <li> Finds the valid current assignment considering topic subscriptions and metadata.</li> + * <li> If current assignments exist, retains up to the minimum quota of assignments.</li> + * <li> If there are members that should get an extra partition, + * assigns the next partition after the retained ones.</li> + * <li> For members with assignments not exceeding the minimum quota, + * it identifies them as potentially unfilled members and tracks the remaining quota.</li> + * </ul> + * + * @return A set containing all the sticky partitions that have been retained in the new assignment. 
+ */ + private Set<TopicIdPartition> computeAssignedStickyPartitions(Integer minQuota) { + Set<TopicIdPartition> allAssignedStickyPartitions = new HashSet<>(); + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> { + // Remove all the topics that aren't in the subscriptions or the topic metadata anymore. + // If rack awareness is enabled, only add partitions if the consumers rack matches the partitions rack. + List<TopicIdPartition> validCurrentAssignment = validCurrentAssignment( + memberId, + assignmentMemberSpec.assignedPartitions() + ); + + int currentAssignmentSize = validCurrentAssignment.size(); + int remaining = minQuota - currentAssignmentSize; + + if (currentAssignmentSize > 0) { + int retainedPartitionsCount = min(currentAssignmentSize, minQuota); + IntStream.range(0, retainedPartitionsCount).forEach(i -> { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(i).topicId(), __ -> new HashSet<>()) + .add(validCurrentAssignment.get(i).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(i)); + }); + + // The extra partition is located at the index "retainedPartitionsCount" from the current step. + if (remaining < 0 && remainingMembersToGetAnExtraPartition > 0) { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(retainedPartitionsCount).topicId(), __ -> new HashSet<>()) + .add(validCurrentAssignment.get(retainedPartitionsCount).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(retainedPartitionsCount)); + remainingMembersToGetAnExtraPartition--; + } + } + + if (remaining >= 0) { + potentiallyUnfilledMembers.put(memberId, remaining); + } + + }); + + return allAssignedStickyPartitions; + } + + /** + * Filters the current assignment of partitions for a given member. 
+ * + * If a partition is assigned to a member not subscribed to its topic or + * if the rack-aware strategy is to be used but there is a mismatch, + * the partition is excluded from the valid assignment and stored for future consideration. + * + * @param memberId The Id of the member whose assignment is being validated. + * @param assignedPartitions The partitions currently assigned to the member. + * + * @return List of valid partitions after applying the filters. + */ + private List<TopicIdPartition> validCurrentAssignment( + String memberId, + Map<Uuid, Set<Integer>> assignedPartitions + ) { + List<TopicIdPartition> validCurrentAssignmentList = new ArrayList<>(); + Set<Uuid> subscriptionSet = new HashSet<>(subscriptionList); + + assignedPartitions.forEach((topicId, currentAssignment) -> { + if (subscriptionSet.contains(topicId)) { + currentAssignment.forEach((partition) -> { + TopicIdPartition topicIdPartition = new TopicIdPartition(topicId, partition); + if (useRackAwareStrategy && rackInfo.racksMismatch(memberId, topicIdPartition)) { + currentPartitionOwners.put(topicIdPartition, memberId); + } else { + validCurrentAssignmentList.add(topicIdPartition); + } + }); + } + }); + + return validCurrentAssignmentList; + } + + /** + * This method iterates over the unassigned partitions and attempts to allocate them + * to members while considering their rack affiliations. + */ + private void rackAwareRoundRobinAssignment() { + Queue<String> roundRobinMembers = new LinkedList<>(unfilledMembers.keySet()); + + // Sorts partitions in ascending order by number of potential consumers with matching racks. + // Partitions with no potential members aren't included in this list. 
+ List<TopicIdPartition> sortedPartitions = rackInfo.sortPartitionsByRackConsumers(unassignedPartitions); + + sortedPartitions.forEach((partition) -> { + boolean assigned = false; + for (int i = 0; i < roundRobinMembers.size() && !assigned; i++) { + String memberId = roundRobinMembers.poll(); + Integer remainingPartitionCount = unfilledMembers.getOrDefault(memberId, 0); + + if (remainingPartitionCount > 0 && !rackInfo.racksMismatch(memberId, partition)) { + assignPartitionToMember(memberId, partition); + assigned = true; + unassignedPartitions.remove(partition); + } + + // Only re-add to the end of the queue if it's still in the unfilledMembers map + if (unfilledMembers.containsKey(memberId)) { + roundRobinMembers.add(memberId); + } + } + }); + } + + /** + * Allocates the unassigned partitions to available members. + * + * If the rack-aware strategy is enabled, partitions are attempted to be assigned back to their current owners first. + * + * If a partition couldn't be assigned to its current owner due to quotas or + * if the rack-aware strategy is not enabled, the partitions are allocated to members in a round-robin fashion.</p> + */ + private void unassignedPartitionsRoundRobinAssignment() { + Queue<String> roundRobinMembers = new LinkedList<>(unfilledMembers.keySet()); + + unassignedPartitions.forEach((partition) -> { + boolean assigned = false; + + if (useRackAwareStrategy && currentPartitionOwners.containsKey(partition)) { + String prevOwner = currentPartitionOwners.get(partition); + if (unfilledMembers.containsKey(prevOwner)) { + assignPartitionToMember(prevOwner, partition); + assigned = true; + if (!unfilledMembers.containsKey(prevOwner)) { + roundRobinMembers.remove(prevOwner); + } + } + } + + // Only re-add the member to the end of the queue if it's still available for assignment. 
+ for (int i = 0; i < unfilledMembers.size() && !assigned; i++) { + String memberId = roundRobinMembers.poll(); + if (unfilledMembers.get(memberId) > 0) { + assignPartitionToMember(memberId, partition); + assigned = true; + } + if (unfilledMembers.containsKey(memberId)) { + roundRobinMembers.add(memberId); + } + } + }); + } + + /** + * Assigns the specified partition to the given member. + * + * <p> + * If the member has met their allocation quota, the member is removed from the + * tracking map of members with their remaining allocations. + * Otherwise, the count of remaining partitions that can be assigned to the member is updated. + * </p> + * + * @param memberId The Id of the member to which the partition will be assigned. + * @param partition The partition to be assigned. + */ + private void assignPartitionToMember(String memberId, TopicIdPartition partition) { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(partition.topicId(), __ -> new HashSet<>()) + .add(partition.partition()); + + int remainingPartitionCount = unfilledMembers.get(memberId) - 1; + if (remainingPartitionCount == 0) { + unfilledMembers.remove(memberId); + } else { + unfilledMembers.put(memberId, remainingPartitionCount); + } + } + + /** + * Determines which members can still be assigned partitions to meet the full quota. + * + * @return A map of member IDs and their capacity for additional partitions. + */ + private Map<String, Integer> computeUnfilledMembers() { + Map<String, Integer> unfilledMembers = new HashMap<>(); + + potentiallyUnfilledMembers.forEach((memberId, remaining) -> { + if (remainingMembersToGetAnExtraPartition > 0) { + remaining++; + remainingMembersToGetAnExtraPartition--; + } + if (remaining > 0) { + unfilledMembers.put(memberId, remaining); + } + }); + + return unfilledMembers; + } + + /** + * This method compares the full list of partitions against the set of already + * assigned sticky partitions to identify those that still need to be allocated. 
+ * + * @param allAssignedStickyPartitions Set of partitions that have already been assigned. + * @return List of unassigned partitions. + */ + private List<TopicIdPartition> computeUnassignedPartitions(Set<TopicIdPartition> allAssignedStickyPartitions) { + List<TopicIdPartition> unassignedPartitions = new ArrayList<>(); + List<Uuid> sortedAllTopics = new ArrayList<>(subscriptionList); + Collections.sort(sortedAllTopics); + + if (allAssignedStickyPartitions.isEmpty()) { + return getAllTopicIdPartitions(sortedAllTopics, subscribedTopicDescriber); + } + + sortedAllTopics.forEach((topic) -> { + int partitionCount = subscribedTopicDescriber.numPartitions(topic); + IntStream.range(0, partitionCount) + .mapToObj(i -> new TopicIdPartition(topic, i)) + .filter(partition -> !allAssignedStickyPartitions.contains(partition)) + .forEach(unassignedPartitions::add); + }); + + return unassignedPartitions; + } + + private boolean isTotalUnassignedPartitionsEqualsTotalRemainingAssignments() { Review Comment: "is __ equals __" is not grammatically correct. if we want to, we can do `is __ equal to __` but i felt that was too long. do we ever have a case where we handle a subset of unassigned partitions? even the variable is called unassignedPartitions ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/OptimizedUniformAssignmentBuilder.java: ########## @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.coordinator.group.assignor; + +import org.apache.kafka.common.Uuid; +import org.apache.kafka.coordinator.group.common.TopicIdPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.stream.IntStream; + +import static java.lang.Math.min; + +/** + * Assigns Kafka partitions to members of a consumer group ensuring a balanced distribution with Review Comment: we have the step by step details at `assign()` for the range assignor right ########## reviewers.py: ########## @@ -28,7 +28,7 @@ def prompt_for_user(): while True: try: - user_input = input("\nName or email (case insensitive): ") + user_input = input("\nName or email (case insensitive): ") Review Comment: we should probably fix this.. ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/common/TopicIdPartition.java: ########## @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.coordinator.group.common; + +import org.apache.kafka.common.Uuid; + +import java.util.Objects; + +public class TopicIdPartition { Review Comment: Ah, I see. I'm still leaning towards reusing that one, but will leave it up to David to decide. ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/UniformAssignor.java: ########## @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.kafka.coordinator.group.assignor; + +import org.apache.kafka.coordinator.group.common.TopicIdPartition; +import org.apache.kafka.common.Uuid; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * The Uniform Assignor distributes Kafka topic partitions among group members for balanced assignment. + * The assignor employs two different strategies based on the nature of topic + * subscriptions across the group members: + * <ul> + * <li> + * <b> Optimized Uniform Assignment Builder: </b> This strategy is used when all members have subscribed + * to the same set of topics. + * </li> + * <li> + * <b> General Uniform Assignment Builder: </b> This strategy is used when members have varied topic + * subscriptions. + * </li> + * </ul> + * + * The appropriate strategy is automatically chosen based on the current members' topic subscriptions. + * + * @see OptimizedUniformAssignmentBuilder + * @see GeneralUniformAssignmentBuilder + */ +public class UniformAssignor implements PartitionAssignor { + private static final Logger log = LoggerFactory.getLogger(UniformAssignor.class); + public static final String UNIFORM_ASSIGNOR_NAME = "uniform"; + + @Override + public String name() { + return UNIFORM_ASSIGNOR_NAME; + } + + /** + * Perform the group assignment given the current members and + * topic metadata. + * + * @param assignmentSpec The member assignment spec. + * @param subscribedTopicDescriber The topic and cluster metadata describer {@link SubscribedTopicDescriber}. + * @return The new assignment for the group. 
+ */ + @Override + public GroupAssignment assign( + AssignmentSpec assignmentSpec, + SubscribedTopicDescriber subscribedTopicDescriber + ) throws PartitionAssignorException { + + AbstractAssignmentBuilder assignmentBuilder; + if (allSubscriptionsEqual(assignmentSpec.members())) { + log.debug("Detected that all members are subscribed to the same set of topics, invoking the " + + "optimized assignment algorithm"); + assignmentBuilder = new OptimizedUniformAssignmentBuilder(assignmentSpec, subscribedTopicDescriber); + } else { + assignmentBuilder = new GeneralUniformAssignmentBuilder(); + log.debug("Detected that all members are subscribed to a different set of topics, invoking the " + + "general assignment algorithm"); + } + return assignmentBuilder.buildAssignment(); + } + + /** + * Determines if all members are subscribed to the same list of topic IDs. + * + * @param members A map of member identifiers to their respective {@code AssignmentMemberSpec}. + * Assumes the map is non-empty. + * @return true if all members have the same subscription list of topic IDs, + * false otherwise. + */ + private boolean allSubscriptionsEqual(Map<String, AssignmentMemberSpec> members) { + Set<Uuid> firstSubscriptionSet = new HashSet<>(members.values().iterator().next().subscribedTopicIds()); Review Comment: this can get NPE if the members is empty right? ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/OptimizedUniformAssignmentBuilder.java: ########## @@ -0,0 +1,399 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.coordinator.group.assignor; + +import org.apache.kafka.common.Uuid; +import org.apache.kafka.coordinator.group.common.TopicIdPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; + +import static java.lang.Math.min; + +/** + * Assigns Kafka partitions to members of a consumer group ensuring a balanced distribution with + * considerations for sticky assignments and rack-awareness. + * The order of priority of properties during the assignment will be: balance > rack matching (when applicable) > stickiness. + * + * <p> Here's the step-by-step breakdown of the assignment process: + * + * <ul> + * <li> Compute the quotas of partitions for each member based on the total partitions and member count.</li> + * <li> For existing assignments, retain partitions based on the determined quota and member's rack compatibility. 
+ * <li> If a partition's rack mismatches with its member, track it with its prior owner.</li> + * <li> Identify members that haven't fulfilled their partition quota or are eligible to receive extra partitions.</li> + * <li> Derive the unassigned partitions by taking the difference between total partitions and the sticky assignments.</li> + * <li> Depending on members needing extra partitions, select members from the potentially unfilled list and add them to the unfilled list.</li> + * <li> Proceed with a round-robin assignment adhering to rack awareness. + * For each unassigned partition, locate the first compatible member from the unfilled list.</li> + * <li> If no rack-compatible member is found, revert to the tracked current owner. + * If that member can't accommodate the partition due to quota limits, resort to a generic round-robin assignment.</li> + * </ul> + */ +public class OptimizedUniformAssignmentBuilder extends UniformAssignor.AbstractAssignmentBuilder { + private static final Logger log = LoggerFactory.getLogger(OptimizedUniformAssignmentBuilder.class); + private final AssignmentSpec assignmentSpec; + private final SubscribedTopicDescriber subscribedTopicDescriber; + // List of topics subscribed to by all members. + private final List<Uuid> subscriptionList; + private final RackInfo rackInfo; + // Count of members to receive an extra partition beyond the minimum quota, + // to account for the distribution of the remaining partitions. + private int remainingMembersToGetExtraPartition; + // Map of members to the remaining number of partitions needed to meet the minimum quota, + // including members eligible for an extra partition. + private final Map<String, Integer> potentiallyUnfilledMembers; + // Members mapped to the remaining number of partitions needed to meet the full quota. + // Full quota = minQuota + one extra partition (if applicable). 
+ private Map<String, Integer> unfilledMembers; + private List<TopicIdPartition> unassignedPartitions; + private final Map<String, MemberAssignment> newAssignment; + // Tracks the current owner of each partition when using rack-aware strategy. + // Current refers to the existing assignment. + private final Map<TopicIdPartition, String> currentPartitionOwners; + // Indicates if a rack aware assignment can be done. + // True if racks are defined for both members and partitions. + boolean useRackAwareStrategy; + + OptimizedUniformAssignmentBuilder(AssignmentSpec assignmentSpec, SubscribedTopicDescriber subscribedTopicDescriber) { + this.assignmentSpec = assignmentSpec; + this.subscribedTopicDescriber = subscribedTopicDescriber; + subscriptionList = new ArrayList<>(assignmentSpec.members().values().iterator().next().subscribedTopicIds()); + + RackInfo rackInfo = new RackInfo(assignmentSpec, subscribedTopicDescriber, subscriptionList); + this.rackInfo = rackInfo; + + potentiallyUnfilledMembers = new HashMap<>(); + unfilledMembers = new HashMap<>(); + newAssignment = new HashMap<>(); + + // Without rack-aware strategy, tracking current owners of unassigned partitions is unnecessary + // as all sticky partitions are retained until a member meets its quota. + if (rackInfo.memberRacks.isEmpty() || rackInfo.partitionRacks.isEmpty()) { + this.useRackAwareStrategy = false; + currentPartitionOwners = Collections.emptyMap(); + } else { + this.useRackAwareStrategy = true; + currentPartitionOwners = new HashMap<>(); + } + } + + @Override + protected GroupAssignment buildAssignment() { + int totalPartitionsCount = 0; + // Removes the current topic from subscriptionList if the topic doesn't exist in the topic metadata. 
+ Iterator<Uuid> iterator = subscriptionList.iterator(); + while (iterator.hasNext()) { + Uuid topicId = iterator.next(); + int partitionCount = subscribedTopicDescriber.numPartitions(topicId); + if (partitionCount == -1) { + log.warn("Members are subscribed to topic " + topicId + " which doesn't exist in the topic metadata."); + iterator.remove(); + } else { + totalPartitionsCount += partitionCount; + } + } + + if (subscriptionList.isEmpty()) { + log.info("The subscription list is empty, returning an empty assignment"); + return new GroupAssignment(Collections.emptyMap()); + } + + // The minimum required quota that each member needs to meet for a balanced assignment. + // This is the same for all members. + final int minQuota; + int numberOfMembers = assignmentSpec.members().size(); + minQuota = (int) Math.floor(((double) totalPartitionsCount) / numberOfMembers); + remainingMembersToGetExtraPartition = totalPartitionsCount % numberOfMembers; + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> + newAssignment.put(memberId, new MemberAssignment(new HashMap<>())) + ); + + Set<TopicIdPartition> allAssignedStickyPartitions = computeAssignedStickyPartitions(minQuota); + + unassignedPartitions = getUnassignedPartitions(allAssignedStickyPartitions); + unfilledMembers = getUnfilledMembers(); + + if (!isTotalUnassignedPartitionsEqualsTotalRemainingAssignments()) { + log.warn("Number of available partitions is not equal to the total requirement"); + } + + if (useRackAwareStrategy) rackAwareRoundRobinAssignment(); + unassignedPartitionsRoundRobinAssignment(); + + return new GroupAssignment(newAssignment); + } + + /** + * Retrieves a set of partitions that were currently assigned to members and will be retained in the new assignment, + * by ensuring that the partitions are still relevant based on current topic metadata and subscriptions. + * If rack awareness is enabled, it ensures that a partition's rack matches the member's rack. 
+ * + * <p> For each member, it: + * <ul> + * <li> Finds the valid current assignment considering topic subscriptions and metadata.</li> + * <li> If current assignments exist, retains up to the minimum quota of assignments.</li> + * <li> If there are members that should get an extra partition, assigns the next partition after the retained ones.</li> + * <li> For members with assignments not exceeding the minimum quota, + * it identifies them as potentially unfilled members and tracks the remaining quota.</li> + * </ul> + * + * @return A set containing all the sticky partitions that have been retained in the new assignment. + */ + private Set<TopicIdPartition> computeAssignedStickyPartitions(Integer minQuota) { + Set<TopicIdPartition> allAssignedStickyPartitions = new HashSet<>(); + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> { + // Remove all the topics that aren't in the subscriptions or the topic metadata anymore. + // If rack awareness is enabled, only add partitions if the consumers rack matches the partitions rack. + List<TopicIdPartition> validCurrentAssignment = getValidCurrentAssignment(memberId, assignmentMemberSpec.assignedPartitions()); + + int currentAssignmentSize = validCurrentAssignment.size(); + int remaining = minQuota - currentAssignmentSize; + + if (currentAssignmentSize > 0) { + int retainedPartitionsCount = min(currentAssignmentSize, minQuota); + for (int i = 0; i < retainedPartitionsCount; i++) { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(i).topicId(), k -> new HashSet<>()) + .add(validCurrentAssignment.get(i).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(i)); + } + + // The extra partition is located at the index "retainedPartitionsCount" from the current step. 
+ if (remaining < 0 && remainingMembersToGetExtraPartition > 0) { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(retainedPartitionsCount).topicId(), k -> new HashSet<>()) + .add(validCurrentAssignment.get(retainedPartitionsCount).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(retainedPartitionsCount)); + remainingMembersToGetExtraPartition--; + } + } + if (remaining >= 0) { + potentiallyUnfilledMembers.put(memberId, remaining); + } + }); + + return allAssignedStickyPartitions; + } + + /** + * Filters the current assignment of partitions for a given member. + * + * If a partition is assigned to a member not subscribed to its topic or + * if the rack-aware strategy is to be used but there is a mismatch, + * the partition is excluded from the valid assignment and stored for future consideration. + * + * @param memberId The Id of the member whose assignment is being validated. + * @param assignedPartitions The partitions currently assigned to the member. + * + * @return List of valid partitions after applying the filters. 
+ */ + private List<TopicIdPartition> getValidCurrentAssignment(String memberId, Map<Uuid, Set<Integer>> assignedPartitions) { + List<TopicIdPartition> validCurrentAssignmentList = new ArrayList<>(); + + assignedPartitions.forEach((topicId, currentAssignment) -> { + List<Integer> currentAssignmentList = new ArrayList<>(currentAssignment); + if (subscriptionList.contains(topicId)) { + for (Integer partition : currentAssignmentList) { + TopicIdPartition topicIdPartition = new TopicIdPartition(topicId, partition); + if (useRackAwareStrategy && rackInfo.racksMismatch(memberId, topicIdPartition)) { + currentPartitionOwners.put(topicIdPartition, memberId); + } else { + validCurrentAssignmentList.add(topicIdPartition); + } + } + } + }); + + return validCurrentAssignmentList; + } + + /** + * This method iterates over the unassigned partitions and attempts to allocate them + * to members while considering their rack affiliations. + */ + private void rackAwareRoundRobinAssignment() { + Queue<String> roundRobinMembers = new LinkedList<>(unfilledMembers.keySet()); + + // Sorts partitions in ascending order by number of potential consumers with matching racks. + // Partitions with no potential members aren't included in this list. 
+ List<TopicIdPartition> sortedPartitions = rackInfo.sortPartitionsByRackConsumers(unassignedPartitions); + + Iterator<TopicIdPartition> partitionIterator = sortedPartitions.iterator(); + while (partitionIterator.hasNext()) { + TopicIdPartition partition = partitionIterator.next(); + boolean assigned = false; + for (int i = 0; i < roundRobinMembers.size() && !assigned; i++) { + String memberId = roundRobinMembers.poll(); + Integer remainingPartitionCount = unfilledMembers.get(memberId); + + if (remainingPartitionCount != null && remainingPartitionCount > 0 && !rackInfo.racksMismatch(memberId, partition)) { + assignPartitionToMember(memberId, partition); + assigned = true; + partitionIterator.remove(); + unassignedPartitions.remove(partition); + } + + // Only re-add to the end of the queue if it's still in the unfilledMembers map + if (unfilledMembers.containsKey(memberId)) { + roundRobinMembers.add(memberId); + } + } + } + } + + /** + * Allocates the unassigned partitions to available members. + * + * If the rack-aware strategy is enabled, partitions are attempted to be assigned back to their current owners first. 
+ * + * If a partition couldn't be assigned to its current owner due to quotas or + * if the rack-aware strategy is not enabled, the partitions are allocated to members in a round-robin fashion.</p> + */ + private void unassignedPartitionsRoundRobinAssignment() { + Queue<String> roundRobinMembers = new LinkedList<>(unfilledMembers.keySet()); + Iterator<TopicIdPartition> partitionIterator = unassignedPartitions.iterator(); + + while (partitionIterator.hasNext()) { + TopicIdPartition partition = partitionIterator.next(); + boolean assigned = false; + + if (useRackAwareStrategy && currentPartitionOwners.containsKey(partition)) { + String prevOwner = currentPartitionOwners.get(partition); + if (unfilledMembers.containsKey(prevOwner)) { + assignPartitionToMember(prevOwner, partition); + assigned = true; + partitionIterator.remove(); + if (!unfilledMembers.containsKey(prevOwner)) { + roundRobinMembers.remove(prevOwner); + } + } + } + + // Only re-add the member to the end of the queue if it's still available for assignment. + for (int i = 0; i < unfilledMembers.size() && !assigned; i++) { + String memberId = roundRobinMembers.poll(); + if (unfilledMembers.get(memberId) > 0) { + assignPartitionToMember(memberId, partition); + assigned = true; + partitionIterator.remove(); + } + if (unfilledMembers.containsKey(memberId)) { + roundRobinMembers.add(memberId); + } + } + } + } + + /** + * Assigns the specified partition to the given member. + * + * <p> + * If the member has met their allocation quota, the member is removed from the + * tracking map of members with their remaining allocations. + * Otherwise, the count of remaining partitions that can be assigned to the member is updated. + * </p> + * + * @param memberId The Id of the member to which the partition will be assigned. + * @param partition The partition to be assigned. 
+ */ + private void assignPartitionToMember(String memberId, TopicIdPartition partition) { Review Comment: that seems like a big assumption; let's at least add something to the javadocs ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/UniformAssignor.java: ########## @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.coordinator.group.assignor; + +import org.apache.kafka.coordinator.group.common.TopicIdPartition; +import org.apache.kafka.common.Uuid; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * The Uniform Assignor distributes Kafka topic partitions among group members for balanced assignment. 
+ * The assignor employs two different strategies based on the nature of topic + * subscriptions across the group members: + * <ul> + * <li> + * <b> Optimized Uniform Assignment Builder: </b> This strategy is used when all members have subscribed + * to the same set of topics. + * </li> + * <li> + * <b> General Uniform Assignment Builder: </b> This strategy is used when members have varied topic + * subscriptions. + * </li> + * </ul> + * + * The appropriate strategy is automatically chosen based on the current members' topic subscriptions. + * + * @see OptimizedUniformAssignmentBuilder + * @see GeneralUniformAssignmentBuilder + */ +public class UniformAssignor implements PartitionAssignor { + private static final Logger log = LoggerFactory.getLogger(UniformAssignor.class); + public static final String UNIFORM_ASSIGNOR_NAME = "uniform"; + + @Override + public String name() { + return UNIFORM_ASSIGNOR_NAME; + } + + /** + * Perform the group assignment given the current members and + * topic metadata. + * + * @param assignmentSpec The member assignment spec. + * @param subscribedTopicDescriber The topic and cluster metadata describer {@link SubscribedTopicDescriber}. + * @return The new assignment for the group. 
+ */ + @Override + public GroupAssignment assign( + AssignmentSpec assignmentSpec, + SubscribedTopicDescriber subscribedTopicDescriber + ) throws PartitionAssignorException { + + AbstractAssignmentBuilder assignmentBuilder; + if (allSubscriptionsEqual(assignmentSpec.members())) { + log.debug("Detected that all members are subscribed to the same set of topics, invoking the " + + "optimized assignment algorithm"); + assignmentBuilder = new OptimizedUniformAssignmentBuilder(assignmentSpec, subscribedTopicDescriber); + } else { + assignmentBuilder = new GeneralUniformAssignmentBuilder(); + log.debug("Detected that all members are subscribed to a different set of topics, invoking the " + + "general assignment algorithm"); + } + return assignmentBuilder.buildAssignment(); + } + + /** + * Determines if all members are subscribed to the same list of topic IDs. + * + * @param members A map of member identifiers to their respective {@code AssignmentMemberSpec}. + * Assumes the map is non-empty. + * @return true if all members have the same subscription list of topic IDs, + * false otherwise. 
+ */ + private boolean allSubscriptionsEqual(Map<String, AssignmentMemberSpec> members) { + boolean allSubscriptionsEqual = true; + Collection<Uuid> firstSubscriptionList = members.values().iterator().next().subscribedTopicIds(); + for (AssignmentMemberSpec memberSpec : members.values()) { + if (!firstSubscriptionList.equals(memberSpec.subscribedTopicIds())) { + allSubscriptionsEqual = false; + break; + } + } + return allSubscriptionsEqual; + } + + protected static abstract class AbstractAssignmentBuilder { + protected abstract GroupAssignment buildAssignment(); + + protected boolean useRackAwareAssignment( + Set<String> consumerRacks, + Set<String> partitionRacks, + Map<TopicIdPartition, Set<String>> racksPerPartition + ) { + if (consumerRacks.isEmpty() || Collections.disjoint(consumerRacks, partitionRacks)) + return false; + else { + return !racksPerPartition.values().stream().allMatch(partitionRacks::equals); + } + } + + protected List<TopicIdPartition> getAllTopicIdPartitions( + List<Uuid> listAllTopics, + SubscribedTopicDescriber subscribedTopicDescriber + ) { + List<TopicIdPartition> allTopicIdPartitions = new ArrayList<>(); + listAllTopics.forEach(topic -> + IntStream.range(0, subscribedTopicDescriber.numPartitions((topic))) + .forEach(i -> allTopicIdPartitions.add(new TopicIdPartition(topic, i)) + ) + ); + + return allTopicIdPartitions; + } + + protected class RackInfo { + protected final Map<String, String> memberRacks; + protected final Map<TopicIdPartition, Set<String>> partitionRacks; + private final Map<TopicIdPartition, Integer> numMembersByPartition; + + public RackInfo( + AssignmentSpec assignmentSpec, + SubscribedTopicDescriber subscribedTopicDescriber, + List<Uuid> topicIds + ) { + Map<String, List<String>> membersByRack = new HashMap<>(); + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> + assignmentMemberSpec.rackId().filter(r -> !r.isEmpty()).ifPresent( + rackId -> membersByRack.computeIfAbsent(rackId, k -> new 
ArrayList<>()).add(memberId) + ) + ); + + Set<String> allPartitionRacks; + Map<TopicIdPartition, Set<String>> partitionRacks; + List<TopicIdPartition> topicIdPartitions = getAllTopicIdPartitions(topicIds, subscribedTopicDescriber); + + if (membersByRack.isEmpty()) { + allPartitionRacks = Collections.emptySet(); + partitionRacks = Collections.emptyMap(); + } else { + partitionRacks = new HashMap<>(); + allPartitionRacks = new HashSet<>(); + topicIdPartitions.forEach(tp -> { + Set<String> racks = subscribedTopicDescriber.racksForPartition(tp.topicId(), tp.partition()); + partitionRacks.put(tp, racks); + if (!racks.isEmpty()) allPartitionRacks.addAll(racks); + }); + } + + if (useRackAwareAssignment(membersByRack.keySet(), allPartitionRacks, partitionRacks)) { + this.memberRacks = new HashMap<>(assignmentSpec.members().size()); + membersByRack.forEach((rack, rackConsumers) -> rackConsumers.forEach(c -> memberRacks.put(c, rack))); + this.partitionRacks = partitionRacks; + } else { + this.memberRacks = Collections.emptyMap(); + this.partitionRacks = Collections.emptyMap(); + } + + numMembersByPartition = partitionRacks.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().stream() + .map(r -> membersByRack.getOrDefault(r, Collections.emptyList()).size()) + .reduce(0, Integer::sum))); + } + + /** + * Determines if there's a mismatch between the memberId's rack and the partition's replica racks. + * + * <p> Mismatch conditions (returns {@code true}): + * <ul> + * <li> Consumer lacks an associated rack.</li> + * <li> Partition lacks associated replica racks.</li> + * <li> Consumer's rack isn't among the partition's replica racks.</li> + * </ul> + * + * @param memberId The memberId identifier. + * @param tp The topic partition in question. + * @return {@code true} for a mismatch; {@code false} if member and partition racks exist and align. 
+ */ + protected boolean racksMismatch(String memberId, TopicIdPartition tp) { + String consumerRack = memberRacks.get(memberId); + Set<String> replicaRacks = partitionRacks.get(tp); + return consumerRack == null || (replicaRacks == null || !replicaRacks.contains(consumerRack)); + } + + /** + * Sorts the given list of partitions based on the number of consumers available for each partition + * in a rack-aware manner. + * + * @param partitions The list of partitions to be sorted. + * @return A sorted linked list of partitions with potential members in the same rack. + */ + protected List<TopicIdPartition> sortPartitionsByRackConsumers(List<TopicIdPartition> partitions) { Review Comment: on > available for each partition in a rack-aware manner. i don't see which part is "in a rack-aware manner" in the code. > test case can we test that the sorting happens as expected ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/OptimizedUniformAssignmentBuilder.java: ########## @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.kafka.coordinator.group.assignor; + +import org.apache.kafka.common.Uuid; +import org.apache.kafka.coordinator.group.common.TopicIdPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.stream.IntStream; + +import static java.lang.Math.min; + +/** + * Assigns Kafka partitions to members of a consumer group ensuring a balanced distribution with + * considerations for sticky assignments and rack-awareness. + * The order of priority of properties during the assignment will be: balance > rack matching (when applicable) > stickiness. + * + * <p> Here's the step-by-step breakdown of the assignment process: + * + * <ul> + * <li> Compute the quotas of partitions for each member based on the total partitions and member count.</li> + * <li> For existing assignments, retain partitions based on the determined quota and member's rack compatibility. + * <li> If a partition's rack mismatches with its member, track it with its prior owner.</li> + * <li> Identify members that haven't fulfilled their partition quota or are eligible to receive extra partitions.</li> + * <li> Derive the unassigned partitions by taking the difference between total partitions and the sticky assignments.</li> + * <li> Depending on members needing extra partitions, select members from the potentially unfilled list + * and add them to the unfilled list.</li> + * <li> Proceed with a round-robin assignment adhering to rack awareness. + * For each unassigned partition, locate the first compatible member from the unfilled list.</li> + * <li> If no rack-compatible member is found, revert to the tracked current owner. 
+ * If that member can't accommodate the partition due to quota limits, resort to a generic round-robin assignment.</li> + * </ul> + */ +public class OptimizedUniformAssignmentBuilder extends UniformAssignor.AbstractAssignmentBuilder { + private static final Logger log = LoggerFactory.getLogger(OptimizedUniformAssignmentBuilder.class); + private final AssignmentSpec assignmentSpec; + private final SubscribedTopicDescriber subscribedTopicDescriber; + // List of topics subscribed to by all members. + private final List<Uuid> subscriptionList; + private final RackInfo rackInfo; + // Count of members to receive an extra partition beyond the minimum quota, + // to account for the distribution of the remaining partitions. + private int remainingMembersToGetAnExtraPartition; + // Map of members to the remaining number of partitions needed to meet the minimum quota, + // including members eligible for an extra partition. + private final Map<String, Integer> potentiallyUnfilledMembers; + // Members mapped to the remaining number of partitions needed to meet the full quota. + // Full quota = minQuota + one extra partition (if applicable). + private Map<String, Integer> unfilledMembers; + private List<TopicIdPartition> unassignedPartitions; + private final Map<String, MemberAssignment> newAssignment; + // Tracks the current owner of each partition when using rack-aware strategy. + // Current refers to the existing assignment. + private final Map<TopicIdPartition, String> currentPartitionOwners; + // Indicates if a rack aware assignment can be done. + // True if racks are defined for both members and partitions. 
+ boolean useRackAwareStrategy; + + OptimizedUniformAssignmentBuilder(AssignmentSpec assignmentSpec, SubscribedTopicDescriber subscribedTopicDescriber) { + this.assignmentSpec = assignmentSpec; + this.subscribedTopicDescriber = subscribedTopicDescriber; + this.subscriptionList = new ArrayList<>(assignmentSpec.members().values().iterator().next().subscribedTopicIds()); + + this.rackInfo = new RackInfo(assignmentSpec, subscribedTopicDescriber, subscriptionList); + this.potentiallyUnfilledMembers = new HashMap<>(); + this.unfilledMembers = new HashMap<>(); + this.newAssignment = new HashMap<>(); + + // If consumer or member racks are unavailable, rack aware strategy is not used. + this.useRackAwareStrategy = !(rackInfo.memberRacks.isEmpty() || rackInfo.partitionRacks.isEmpty()); + // Without rack-aware strategy, tracking current owners of unassigned partitions is unnecessary + // as all sticky partitions are retained until a member meets its quota. + this. currentPartitionOwners = useRackAwareStrategy ? new HashMap<>() : Collections.emptyMap(); + } + + @Override + protected GroupAssignment buildAssignment() throws PartitionAssignorException{ + int totalPartitionsCount = 0; + // Removes the current topic from subscriptionList if the topic doesn't exist in the topic metadata. + Iterator<Uuid> iterator = subscriptionList.iterator(); + while (iterator.hasNext()) { + Uuid topicId = iterator.next(); + int partitionCount = subscribedTopicDescriber.numPartitions(topicId); + if (partitionCount == -1) { + log.warn("Members are subscribed to topic " + topicId + " which doesn't exist in the topic metadata."); + iterator.remove(); + } else { + totalPartitionsCount += partitionCount; + } + } + + if (subscriptionList.isEmpty()) { + log.info("The subscription list is empty, returning an empty assignment"); + return new GroupAssignment(Collections.emptyMap()); + } + + // The minimum required quota that each member needs to meet for a balanced assignment. 
+ // This is the same for all members. + int numberOfMembers = assignmentSpec.members().size(); + final int minQuota = (int) Math.floor(((double) totalPartitionsCount) / numberOfMembers); + remainingMembersToGetAnExtraPartition = totalPartitionsCount % numberOfMembers; + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> + newAssignment.put(memberId, new MemberAssignment(new HashMap<>())) + ); + + Set<TopicIdPartition> allAssignedStickyPartitions = computeAssignedStickyPartitions(minQuota); + unassignedPartitions = computeUnassignedPartitions(allAssignedStickyPartitions); + unfilledMembers = computeUnfilledMembers(); + + if (!isTotalUnassignedPartitionsEqualsTotalRemainingAssignments()) { + throw new PartitionAssignorException("Number of available partitions is not equal to the total requirement"); + } + + if (useRackAwareStrategy) rackAwareRoundRobinAssignment(); + unassignedPartitionsRoundRobinAssignment(); + + return new GroupAssignment(newAssignment); + } + + /** + * Retrieves a set of partitions that were currently assigned to members and will be retained in the new assignment, + * by ensuring that the partitions are still relevant based on current topic metadata and subscriptions. + * If rack awareness is enabled, it ensures that a partition's rack matches the member's rack. + * + * <p> For each member, it: + * <ul> + * <li> Finds the valid current assignment considering topic subscriptions and metadata.</li> + * <li> If current assignments exist, retains up to the minimum quota of assignments.</li> + * <li> If there are members that should get an extra partition, + * assigns the next partition after the retained ones.</li> + * <li> For members with assignments not exceeding the minimum quota, + * it identifies them as potentially unfilled members and tracks the remaining quota.</li> + * </ul> + * + * @return A set containing all the sticky partitions that have been retained in the new assignment. 
+ */ + private Set<TopicIdPartition> computeAssignedStickyPartitions(Integer minQuota) { + Set<TopicIdPartition> allAssignedStickyPartitions = new HashSet<>(); + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> { + // Remove all the topics that aren't in the subscriptions or the topic metadata anymore. + // If rack awareness is enabled, only add partitions if the consumers rack matches the partitions rack. + List<TopicIdPartition> validCurrentAssignment = validCurrentAssignment( + memberId, + assignmentMemberSpec.assignedPartitions() + ); + + int currentAssignmentSize = validCurrentAssignment.size(); + int remaining = minQuota - currentAssignmentSize; + + if (currentAssignmentSize > 0) { Review Comment: i meant if currentAssignmentSize is 0, then retainedPartitionsCount will be 0 and hence we won't iterate anyways ########## group-coordinator/src/main/java/org/apache/kafka/coordinator/group/assignor/OptimizedUniformAssignmentBuilder.java: ########## @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.kafka.coordinator.group.assignor; + +import org.apache.kafka.common.Uuid; +import org.apache.kafka.coordinator.group.common.TopicIdPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.stream.IntStream; + +import static java.lang.Math.min; + +/** + * Assigns Kafka partitions to members of a consumer group ensuring a balanced distribution with + * considerations for sticky assignments and rack-awareness. + * The order of priority of properties during the assignment will be: balance > rack matching (when applicable) > stickiness. + * + * <p> Here's the step-by-step breakdown of the assignment process: + * + * <ul> + * <li> Compute the quotas of partitions for each member based on the total partitions and member count.</li> + * <li> For existing assignments, retain partitions based on the determined quota and member's rack compatibility. + * <li> If a partition's rack mismatches with its member, track it with its prior owner.</li> + * <li> Identify members that haven't fulfilled their partition quota or are eligible to receive extra partitions.</li> + * <li> Derive the unassigned partitions by taking the difference between total partitions and the sticky assignments.</li> + * <li> Depending on members needing extra partitions, select members from the potentially unfilled list + * and add them to the unfilled list.</li> + * <li> Proceed with a round-robin assignment adhering to rack awareness. + * For each unassigned partition, locate the first compatible member from the unfilled list.</li> + * <li> If no rack-compatible member is found, revert to the tracked current owner. 
+ * If that member can't accommodate the partition due to quota limits, resort to a generic round-robin assignment.</li> + * </ul> + */ +public class OptimizedUniformAssignmentBuilder extends UniformAssignor.AbstractAssignmentBuilder { + private static final Logger log = LoggerFactory.getLogger(OptimizedUniformAssignmentBuilder.class); + private final AssignmentSpec assignmentSpec; + private final SubscribedTopicDescriber subscribedTopicDescriber; + // List of topics subscribed to by all members. + private final List<Uuid> subscriptionList; + private final RackInfo rackInfo; + // Count of members to receive an extra partition beyond the minimum quota, + // to account for the distribution of the remaining partitions. + private int remainingMembersToGetAnExtraPartition; + // Map of members to the remaining number of partitions needed to meet the minimum quota, + // including members eligible for an extra partition. + private final Map<String, Integer> potentiallyUnfilledMembers; + // Members mapped to the remaining number of partitions needed to meet the full quota. + // Full quota = minQuota + one extra partition (if applicable). + private Map<String, Integer> unfilledMembers; + private List<TopicIdPartition> unassignedPartitions; + private final Map<String, MemberAssignment> newAssignment; + // Tracks the current owner of each partition when using rack-aware strategy. + // Current refers to the existing assignment. + private final Map<TopicIdPartition, String> currentPartitionOwners; + // Indicates if a rack aware assignment can be done. + // True if racks are defined for both members and partitions. 
+ boolean useRackAwareStrategy; + + OptimizedUniformAssignmentBuilder(AssignmentSpec assignmentSpec, SubscribedTopicDescriber subscribedTopicDescriber) { + this.assignmentSpec = assignmentSpec; + this.subscribedTopicDescriber = subscribedTopicDescriber; + this.subscriptionList = new ArrayList<>(assignmentSpec.members().values().iterator().next().subscribedTopicIds()); + + this.rackInfo = new RackInfo(assignmentSpec, subscribedTopicDescriber, subscriptionList); + this.potentiallyUnfilledMembers = new HashMap<>(); + this.unfilledMembers = new HashMap<>(); + this.newAssignment = new HashMap<>(); + + // If consumer or member racks are unavailable, rack aware strategy is not used. + this.useRackAwareStrategy = !(rackInfo.memberRacks.isEmpty() || rackInfo.partitionRacks.isEmpty()); + // Without rack-aware strategy, tracking current owners of unassigned partitions is unnecessary + // as all sticky partitions are retained until a member meets its quota. + this. currentPartitionOwners = useRackAwareStrategy ? new HashMap<>() : Collections.emptyMap(); + } + + @Override + protected GroupAssignment buildAssignment() throws PartitionAssignorException{ + int totalPartitionsCount = 0; + // Removes the current topic from subscriptionList if the topic doesn't exist in the topic metadata. + Iterator<Uuid> iterator = subscriptionList.iterator(); + while (iterator.hasNext()) { + Uuid topicId = iterator.next(); + int partitionCount = subscribedTopicDescriber.numPartitions(topicId); + if (partitionCount == -1) { + log.warn("Members are subscribed to topic " + topicId + " which doesn't exist in the topic metadata."); + iterator.remove(); + } else { + totalPartitionsCount += partitionCount; + } + } + + if (subscriptionList.isEmpty()) { + log.info("The subscription list is empty, returning an empty assignment"); + return new GroupAssignment(Collections.emptyMap()); + } + + // The minimum required quota that each member needs to meet for a balanced assignment. 
+ // This is the same for all members. + int numberOfMembers = assignmentSpec.members().size(); + final int minQuota = (int) Math.floor(((double) totalPartitionsCount) / numberOfMembers); + remainingMembersToGetAnExtraPartition = totalPartitionsCount % numberOfMembers; + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> + newAssignment.put(memberId, new MemberAssignment(new HashMap<>())) + ); + + Set<TopicIdPartition> allAssignedStickyPartitions = computeAssignedStickyPartitions(minQuota); + unassignedPartitions = computeUnassignedPartitions(allAssignedStickyPartitions); + unfilledMembers = computeUnfilledMembers(); + + if (!isTotalUnassignedPartitionsEqualsTotalRemainingAssignments()) { + throw new PartitionAssignorException("Number of available partitions is not equal to the total requirement"); + } + + if (useRackAwareStrategy) rackAwareRoundRobinAssignment(); + unassignedPartitionsRoundRobinAssignment(); + + return new GroupAssignment(newAssignment); + } + + /** + * Retrieves a set of partitions that were currently assigned to members and will be retained in the new assignment, + * by ensuring that the partitions are still relevant based on current topic metadata and subscriptions. + * If rack awareness is enabled, it ensures that a partition's rack matches the member's rack. + * + * <p> For each member, it: + * <ul> + * <li> Finds the valid current assignment considering topic subscriptions and metadata.</li> + * <li> If current assignments exist, retains up to the minimum quota of assignments.</li> + * <li> If there are members that should get an extra partition, + * assigns the next partition after the retained ones.</li> + * <li> For members with assignments not exceeding the minimum quota, + * it identifies them as potentially unfilled members and tracks the remaining quota.</li> + * </ul> + * + * @return A set containing all the sticky partitions that have been retained in the new assignment. 
+ */ + private Set<TopicIdPartition> computeAssignedStickyPartitions(Integer minQuota) { + Set<TopicIdPartition> allAssignedStickyPartitions = new HashSet<>(); + + assignmentSpec.members().forEach((memberId, assignmentMemberSpec) -> { + // Remove all the topics that aren't in the subscriptions or the topic metadata anymore. + // If rack awareness is enabled, only add partitions if the consumers rack matches the partitions rack. + List<TopicIdPartition> validCurrentAssignment = validCurrentAssignment( + memberId, + assignmentMemberSpec.assignedPartitions() + ); + + int currentAssignmentSize = validCurrentAssignment.size(); + int remaining = minQuota - currentAssignmentSize; + + if (currentAssignmentSize > 0) { + int retainedPartitionsCount = min(currentAssignmentSize, minQuota); + IntStream.range(0, retainedPartitionsCount).forEach(i -> { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(i).topicId(), __ -> new HashSet<>()) + .add(validCurrentAssignment.get(i).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(i)); + }); + + // The extra partition is located at the index "retainedPartitionsCount" from the current step. + if (remaining < 0 && remainingMembersToGetAnExtraPartition > 0) { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(validCurrentAssignment.get(retainedPartitionsCount).topicId(), __ -> new HashSet<>()) + .add(validCurrentAssignment.get(retainedPartitionsCount).partition()); + allAssignedStickyPartitions.add(validCurrentAssignment.get(retainedPartitionsCount)); + remainingMembersToGetAnExtraPartition--; + } + } + + if (remaining >= 0) { + potentiallyUnfilledMembers.put(memberId, remaining); + } + + }); + + return allAssignedStickyPartitions; + } + + /** + * Filters the current assignment of partitions for a given member. 
+ * + * If a partition is assigned to a member not subscribed to its topic or + * if the rack-aware strategy is to be used but there is a mismatch, + * the partition is excluded from the valid assignment and stored for future consideration. + * + * @param memberId The Id of the member whose assignment is being validated. + * @param assignedPartitions The partitions currently assigned to the member. + * + * @return List of valid partitions after applying the filters. + */ + private List<TopicIdPartition> validCurrentAssignment( + String memberId, + Map<Uuid, Set<Integer>> assignedPartitions + ) { + List<TopicIdPartition> validCurrentAssignmentList = new ArrayList<>(); + Set<Uuid> subscriptionSet = new HashSet<>(subscriptionList); + + assignedPartitions.forEach((topicId, currentAssignment) -> { + if (subscriptionSet.contains(topicId)) { + currentAssignment.forEach((partition) -> { + TopicIdPartition topicIdPartition = new TopicIdPartition(topicId, partition); + if (useRackAwareStrategy && rackInfo.racksMismatch(memberId, topicIdPartition)) { + currentPartitionOwners.put(topicIdPartition, memberId); + } else { + validCurrentAssignmentList.add(topicIdPartition); + } + }); + } + }); + + return validCurrentAssignmentList; + } + + /** + * This method iterates over the unassigned partitions and attempts to allocate them + * to members while considering their rack affiliations. + */ + private void rackAwareRoundRobinAssignment() { + Queue<String> roundRobinMembers = new LinkedList<>(unfilledMembers.keySet()); + + // Sorts partitions in ascending order by number of potential consumers with matching racks. + // Partitions with no potential members aren't included in this list. 
+ List<TopicIdPartition> sortedPartitions = rackInfo.sortPartitionsByRackConsumers(unassignedPartitions); + + sortedPartitions.forEach((partition) -> { + boolean assigned = false; + for (int i = 0; i < roundRobinMembers.size() && !assigned; i++) { + String memberId = roundRobinMembers.poll(); + Integer remainingPartitionCount = unfilledMembers.getOrDefault(memberId, 0); + + if (remainingPartitionCount > 0 && !rackInfo.racksMismatch(memberId, partition)) { + assignPartitionToMember(memberId, partition); + assigned = true; + unassignedPartitions.remove(partition); + } + + // Only re-add to the end of the queue if it's still in the unfilledMembers map + if (unfilledMembers.containsKey(memberId)) { + roundRobinMembers.add(memberId); + } + } + }); + } + + /** + * Allocates the unassigned partitions to available members. + * + * If the rack-aware strategy is enabled, partitions are attempted to be assigned back to their current owners first. + * + * If a partition couldn't be assigned to its current owner due to quotas or + * if the rack-aware strategy is not enabled, the partitions are allocated to members in a round-robin fashion.</p> + */ + private void unassignedPartitionsRoundRobinAssignment() { + Queue<String> roundRobinMembers = new LinkedList<>(unfilledMembers.keySet()); + + unassignedPartitions.forEach((partition) -> { + boolean assigned = false; + + if (useRackAwareStrategy && currentPartitionOwners.containsKey(partition)) { + String prevOwner = currentPartitionOwners.get(partition); + if (unfilledMembers.containsKey(prevOwner)) { + assignPartitionToMember(prevOwner, partition); + assigned = true; + if (!unfilledMembers.containsKey(prevOwner)) { + roundRobinMembers.remove(prevOwner); + } + } + } + + // Only re-add the member to the end of the queue if it's still available for assignment. 
+ for (int i = 0; i < unfilledMembers.size() && !assigned; i++) { + String memberId = roundRobinMembers.poll(); + if (unfilledMembers.get(memberId) > 0) { + assignPartitionToMember(memberId, partition); + assigned = true; + } + if (unfilledMembers.containsKey(memberId)) { + roundRobinMembers.add(memberId); + } + } + }); + } + + /** + * Assigns the specified partition to the given member. + * + * <p> + * If the member has met their allocation quota, the member is removed from the + * tracking map of members with their remaining allocations. + * Otherwise, the count of remaining partitions that can be assigned to the member is updated. + * </p> + * + * @param memberId The Id of the member to which the partition will be assigned. + * @param partition The partition to be assigned. + */ + private void assignPartitionToMember(String memberId, TopicIdPartition partition) { + newAssignment.get(memberId) + .targetPartitions() + .computeIfAbsent(partition.topicId(), __ -> new HashSet<>()) + .add(partition.partition()); + + int remainingPartitionCount = unfilledMembers.get(memberId) - 1; + if (remainingPartitionCount == 0) { + unfilledMembers.remove(memberId); + } else { + unfilledMembers.put(memberId, remainingPartitionCount); + } + } + + /** + * Determines which members can still be assigned partitions to meet the full quota. + * + * @return A map of member IDs and their capacity for additional partitions. + */ + private Map<String, Integer> computeUnfilledMembers() { + Map<String, Integer> unfilledMembers = new HashMap<>(); + + potentiallyUnfilledMembers.forEach((memberId, remaining) -> { + if (remainingMembersToGetAnExtraPartition > 0) { + remaining++; + remainingMembersToGetAnExtraPartition--; + } + if (remaining > 0) { + unfilledMembers.put(memberId, remaining); + } + }); + + return unfilledMembers; + } + + /** + * This method compares the full list of partitions against the set of already + * assigned sticky partitions to identify those that still need to be allocated. 
+ * + * @param allAssignedStickyPartitions Set of partitions that have already been assigned. + * @return List of unassigned partitions. + */ + private List<TopicIdPartition> computeUnassignedPartitions(Set<TopicIdPartition> allAssignedStickyPartitions) { Review Comment: Not sure why I left this comment; please disregard it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
