chia7712 commented on a change in pull request #8657:
URL: https://github.com/apache/kafka/pull/8657#discussion_r431209368



##########
File path: core/src/main/scala/kafka/coordinator/group/DelayedJoin.scala
##########
@@ -33,11 +34,40 @@ import scala.math.{max, min}
  */
 private[group] class DelayedJoin(coordinator: GroupCoordinator,
                                  group: GroupMetadata,
-                                 rebalanceTimeout: Long) extends 
DelayedOperation(rebalanceTimeout, Some(group.lock)) {
+                                 rebalanceTimeout: Long) extends 
DelayedOperation(rebalanceTimeout, None) {
 
-  override def tryComplete(): Boolean = coordinator.tryCompleteJoin(group, 
forceComplete _)
-  override def onExpiration() = coordinator.onExpireJoin()
-  override def onComplete() = coordinator.onCompleteJoin(group)
+  /**
+   * The delayed requests should be completed without holding group lock so we 
keep those partitions and then
+   * complete them after releasing lock.
+   */
+  private[group] var partitionsToComplete: 
scala.collection.Map[TopicPartition, LeaderHWChange] = Map.empty
+
+  /**
+   * It controls the lock manually since GroupCoordinator#onCompleteJoin() 
invoked by onComplete() can't be within a
+   * group lock since GroupCoordinator#onCompleteJoin() tries to complete 
delayed requests.
+   *
+   */
+  override def tryComplete(): Boolean = try group.inLock {
+    /**
+     * holds the group lock for both the "group.hasAllMembersJoined" check and 
the call to forceComplete()
+     */
+    if (group.hasAllMembersJoined) forceComplete()
+    else false
+  } finally completeDelayedRequests()
+  override def onExpiration(): Unit = coordinator.onExpireJoin()
+  override def onComplete(): Unit = try partitionsToComplete = 
coordinator.onCompleteJoin(group)
+  finally completeDelayedRequests()
+
+  /**
+   * try to complete delayed requests only if the caller does not hold the 
group lock.
+   * This method is called in the following cases:
+   * 1) tryComplete -> hold lock -> onComplete -> release lock -> 
completeDelayedRequests
+   * 2) onComplete -> completeDelayedRequests
+   */
+  private[group] def completeDelayedRequests(): Unit = if 
(!group.lock.isHeldByCurrentThread) {

Review comment:
       This is a workaround to deal with the deadlock caused by taking multiple 
group locks.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to