junrao commented on code in PR #16969: URL: https://github.com/apache/kafka/pull/16969#discussion_r1729499882
########## core/src/main/java/kafka/server/share/DelayedShareFetch.java: ########## @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package kafka.server.share; + +import kafka.server.DelayedOperation; +import kafka.server.QuotaFactory; +import kafka.server.ReplicaManager; + +import org.apache.kafka.common.TopicIdPartition; +import org.apache.kafka.common.message.ShareFetchResponseData; +import org.apache.kafka.common.protocol.Errors; +import org.apache.kafka.common.record.FileRecords; +import org.apache.kafka.common.requests.FetchRequest; +import org.apache.kafka.common.requests.ListOffsetsRequest; +import org.apache.kafka.storage.internals.log.FetchPartitionData; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; + +import scala.Option; +import scala.Tuple2; +import scala.jdk.javaapi.CollectionConverters; +import scala.runtime.BoxedUnit; + +/** + * A delayed share fetch operation has been introduced in case there is no share partition for which we can acquire records. We will try to wait + * for MaxWaitMs for records to be released else complete the share fetch request. + */ +public class DelayedShareFetch extends DelayedOperation { + private final SharePartitionManager.ShareFetchPartitionData shareFetchPartitionData; + private final ReplicaManager replicaManager; + private final Map<SharePartitionManager.SharePartitionKey, SharePartition> partitionCacheMap; + + private static final Logger log = LoggerFactory.getLogger(DelayedShareFetch.class); + + DelayedShareFetch( + SharePartitionManager.ShareFetchPartitionData shareFetchPartitionData, + ReplicaManager replicaManager, + Map<SharePartitionManager.SharePartitionKey, SharePartition> partitionCacheMap) { + super(shareFetchPartitionData.fetchParams().maxWaitMs, Option.empty()); + this.shareFetchPartitionData = shareFetchPartitionData; + this.replicaManager = replicaManager; + this.partitionCacheMap = partitionCacheMap; + } + + @Override + public void onExpiration() { + } + + /** + * Complete the share fetch operation by fetching records for all partitions in the share fetch request irrespective + * of whether they have any acquired records. This is called when the fetch operation is forced to complete either + * because records can be acquired for some partitions or due to MaxWaitMs timeout. + */ + @Override + public void onComplete() { + log.trace("onCompletion of delayed share fetch request for group {}, member {}, " + + "topic partitions {}", shareFetchPartitionData.groupId(), + shareFetchPartitionData.memberId(), shareFetchPartitionData.partitionMaxBytes().keySet()); + + Map<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData = topicPartitionDataForAcquirablePartitions(); + try { + if (topicPartitionData.isEmpty()) { + // No locks for share partitions could be acquired, so we complete the request with an empty response. + shareFetchPartitionData.future().complete(Collections.emptyMap()); + return; + } + log.trace("Fetchable share partitions data: {} with groupId: {} fetch params: {}", + topicPartitionData, shareFetchPartitionData.groupId(), shareFetchPartitionData.fetchParams()); + + replicaManager.fetchMessages( + shareFetchPartitionData.fetchParams(), + CollectionConverters.asScala( + topicPartitionData.entrySet().stream().map(entry -> + new Tuple2<>(entry.getKey(), entry.getValue())).collect(Collectors.toList()) + ), + QuotaFactory.UnboundedQuota$.MODULE$, + responsePartitionData -> { + log.trace("Data successfully retrieved by replica manager: {}", responsePartitionData); + List<Tuple2<TopicIdPartition, FetchPartitionData>> responseData = CollectionConverters.asJava( + responsePartitionData); + processFetchResponse(shareFetchPartitionData, responseData).whenComplete( + (result, throwable) -> { + if (throwable != null) { + log.error("Error processing fetch response for share partitions", throwable); + shareFetchPartitionData.future().completeExceptionally(throwable); + } else { + shareFetchPartitionData.future().complete(result); + } + // Releasing the lock to move ahead with the next request in queue. + releasePartitionsLock(shareFetchPartitionData.groupId(), topicPartitionData.keySet()); + }); + return BoxedUnit.UNIT; + }); + } catch (Exception e) { + // Release the locks acquired for the partitions in the share fetch request in case there is an exception + log.error("Error processing delayed share fetch request", e); + shareFetchPartitionData.future().completeExceptionally(e); + releasePartitionsLock(shareFetchPartitionData.groupId(), topicPartitionData.keySet()); + } + } + + /** + * Try to complete the fetch operation if we can acquire records for any partition in the share fetch request. + */ + @Override + public boolean tryComplete() { Review Comment: I guess we haven't added the logic to call `DelayedShareFetch.tryComplete`? ########## core/src/main/java/kafka/server/share/DelayedShareFetchKey.java: ########## @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package kafka.server.share; + +import kafka.server.DelayedOperationKey; + +import org.apache.kafka.common.TopicIdPartition; + +import java.util.Objects; +import java.util.Set; + +/** + * A key for delayed operations that fetch data for share consumers. + */ +public class DelayedShareFetchKey implements DelayedOperationKey { + private final Set<TopicIdPartition> topicIdPartitions; Review Comment: Yes, I agree that it would be useful to think through the key of DelayedShareOperation. Intuitive, two things could trigger the evaluation of a DelayedShareOperation: (1) new data on the partition; (2) new acknowledgement from a member. If (1) occurs, it affects every share fetch request on that partition. If (2) occurs, it affects every share fetch request on that partition in the same group. To cover both case, having a key with partition + groupId seems enough. Not sure why we need memberId as part of the key. ########## core/src/main/java/kafka/server/share/DelayedShareFetch.java: ########## @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package kafka.server.share; + +import kafka.server.DelayedOperation; +import kafka.server.QuotaFactory; +import kafka.server.ReplicaManager; + +import org.apache.kafka.common.TopicIdPartition; +import org.apache.kafka.common.message.ShareFetchResponseData; +import org.apache.kafka.common.protocol.Errors; +import org.apache.kafka.common.record.FileRecords; +import org.apache.kafka.common.requests.FetchRequest; +import org.apache.kafka.common.requests.ListOffsetsRequest; +import org.apache.kafka.storage.internals.log.FetchPartitionData; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; + +import scala.Option; +import scala.Tuple2; +import scala.jdk.javaapi.CollectionConverters; +import scala.runtime.BoxedUnit; + +/** + * A delayed share fetch operation has been introduced in case there is no share partition for which we can acquire records. We will try to wait + * for MaxWaitMs for records to be released else complete the share fetch request. + */ +public class DelayedShareFetch extends DelayedOperation { + private final SharePartitionManager.ShareFetchPartitionData shareFetchPartitionData; + private final ReplicaManager replicaManager; + private final Map<SharePartitionManager.SharePartitionKey, SharePartition> partitionCacheMap; + + private static final Logger log = LoggerFactory.getLogger(DelayedShareFetch.class); + + DelayedShareFetch( + SharePartitionManager.ShareFetchPartitionData shareFetchPartitionData, + ReplicaManager replicaManager, + Map<SharePartitionManager.SharePartitionKey, SharePartition> partitionCacheMap) { + super(shareFetchPartitionData.fetchParams().maxWaitMs, Option.empty()); + this.shareFetchPartitionData = shareFetchPartitionData; + this.replicaManager = replicaManager; + this.partitionCacheMap = partitionCacheMap; + } + + @Override + public void onExpiration() { + } + + /** + * Complete the share fetch operation by fetching records for all partitions in the share fetch request irrespective + * of whether they have any acquired records. This is called when the fetch operation is forced to complete either + * because records can be acquired for some partitions or due to MaxWaitMs timeout. + */ + @Override + public void onComplete() { + log.trace("onCompletion of delayed share fetch request for group {}, member {}, " + + "topic partitions {}", shareFetchPartitionData.groupId(), + shareFetchPartitionData.memberId(), shareFetchPartitionData.partitionMaxBytes().keySet()); + + Map<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData = topicPartitionDataForAcquirablePartitions(); + try { + if (topicPartitionData.isEmpty()) { + // No locks for share partitions could be acquired, so we complete the request with an empty response. + shareFetchPartitionData.future().complete(Collections.emptyMap()); + return; + } + log.trace("Fetchable share partitions data: {} with groupId: {} fetch params: {}", + topicPartitionData, shareFetchPartitionData.groupId(), shareFetchPartitionData.fetchParams()); + + replicaManager.fetchMessages( + shareFetchPartitionData.fetchParams(), + CollectionConverters.asScala( + topicPartitionData.entrySet().stream().map(entry -> + new Tuple2<>(entry.getKey(), entry.getValue())).collect(Collectors.toList()) + ), + QuotaFactory.UnboundedQuota$.MODULE$, + responsePartitionData -> { + log.trace("Data successfully retrieved by replica manager: {}", responsePartitionData); + List<Tuple2<TopicIdPartition, FetchPartitionData>> responseData = CollectionConverters.asJava( + responsePartitionData); + processFetchResponse(shareFetchPartitionData, responseData).whenComplete( + (result, throwable) -> { + if (throwable != null) { + log.error("Error processing fetch response for share partitions", throwable); + shareFetchPartitionData.future().completeExceptionally(throwable); + } else { + shareFetchPartitionData.future().complete(result); + } + // Releasing the lock to move ahead with the next request in queue. + releasePartitionsLock(shareFetchPartitionData.groupId(), topicPartitionData.keySet()); + }); + return BoxedUnit.UNIT; + }); + } catch (Exception e) { + // Release the locks acquired for the partitions in the share fetch request in case there is an exception + log.error("Error processing delayed share fetch request", e); + shareFetchPartitionData.future().completeExceptionally(e); + releasePartitionsLock(shareFetchPartitionData.groupId(), topicPartitionData.keySet()); + } + } + + /** + * Try to complete the fetch operation if we can acquire records for any partition in the share fetch request. + */ + @Override + public boolean tryComplete() { + log.trace("onTry of delayed share fetch request for group {}, member {}, topic partitions {}", + shareFetchPartitionData.groupId(), shareFetchPartitionData.memberId(), + shareFetchPartitionData.partitionMaxBytes().keySet()); + + boolean canAnyPartitionBeAcquired = false; + for (TopicIdPartition topicIdPartition: shareFetchPartitionData.partitionMaxBytes().keySet()) { + SharePartition sharePartition = partitionCacheMap.get(new SharePartitionManager.SharePartitionKey( + shareFetchPartitionData.groupId(), topicIdPartition)); + if (sharePartition.maybeAcquireFetchLock()) { + if (sharePartition.canAcquireRecords()) { + canAnyPartitionBeAcquired = true; + } + sharePartition.releaseFetchLock(); + if (canAnyPartitionBeAcquired) Review Comment: 1. This is not very accurate. `canAnyPartitionBeAcquired` doesn't mean that there is enough bytes to satisfy the minByte requirement for the request. We need to take into consideration the size of all available batches in SharePartition. 2. There is also a synchronization issue. In DelayedFetch, there is only 1 fetch request per partition per group. So, after checking that a particular fetch request has enough bytes, we can by sure that when `onComplete` is called, there is indeed enough bytes. In DelayedShareFetch, there could be multiple share fetch requests per partition per group. What could happen is that in one thread, we check that request 1 can be completed, but by the time we try to complete the request, another thread has given the data in that partition to request 2. This means that we may return fewer bytes for request 1. ########## core/src/main/java/kafka/server/share/DelayedShareFetch.java: ########## @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package kafka.server.share; + +import kafka.server.DelayedOperation; +import kafka.server.QuotaFactory; +import kafka.server.ReplicaManager; + +import org.apache.kafka.common.TopicIdPartition; +import org.apache.kafka.common.message.ShareFetchResponseData; +import org.apache.kafka.common.protocol.Errors; +import org.apache.kafka.common.record.FileRecords; +import org.apache.kafka.common.requests.FetchRequest; +import org.apache.kafka.common.requests.ListOffsetsRequest; +import org.apache.kafka.storage.internals.log.FetchPartitionData; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; + +import scala.Option; +import scala.Tuple2; +import scala.jdk.javaapi.CollectionConverters; +import scala.runtime.BoxedUnit; + +/** + * A delayed share fetch operation has been introduced in case there is no share partition for which we can acquire records. We will try to wait + * for MaxWaitMs for records to be released else complete the share fetch request. + */ +public class DelayedShareFetch extends DelayedOperation { + private final SharePartitionManager.ShareFetchPartitionData shareFetchPartitionData; + private final ReplicaManager replicaManager; + private final Map<SharePartitionManager.SharePartitionKey, SharePartition> partitionCacheMap; + + private static final Logger log = LoggerFactory.getLogger(DelayedShareFetch.class); + + DelayedShareFetch( + SharePartitionManager.ShareFetchPartitionData shareFetchPartitionData, + ReplicaManager replicaManager, + Map<SharePartitionManager.SharePartitionKey, SharePartition> partitionCacheMap) { + super(shareFetchPartitionData.fetchParams().maxWaitMs, Option.empty()); + this.shareFetchPartitionData = shareFetchPartitionData; + this.replicaManager = replicaManager; + this.partitionCacheMap = partitionCacheMap; + } + + @Override + public void onExpiration() { + } + + /** + * Complete the share fetch operation by fetching records for all partitions in the share fetch request irrespective + * of whether they have any acquired records. This is called when the fetch operation is forced to complete either + * because records can be acquired for some partitions or due to MaxWaitMs timeout. + */ + @Override + public void onComplete() { + log.trace("onCompletion of delayed share fetch request for group {}, member {}, " + + "topic partitions {}", shareFetchPartitionData.groupId(), + shareFetchPartitionData.memberId(), shareFetchPartitionData.partitionMaxBytes().keySet()); + + Map<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData = topicPartitionDataForAcquirablePartitions(); + try { + if (topicPartitionData.isEmpty()) { + // No locks for share partitions could be acquired, so we complete the request with an empty response. + shareFetchPartitionData.future().complete(Collections.emptyMap()); + return; + } + log.trace("Fetchable share partitions data: {} with groupId: {} fetch params: {}", + topicPartitionData, shareFetchPartitionData.groupId(), shareFetchPartitionData.fetchParams()); + + replicaManager.fetchMessages( Review Comment: We probably should just use r`eplicaManager.readFromLog` like `DelayedFetch`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: jira-unsubscr...@kafka.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org