himanshug commented on a change in pull request #6212: fix TaskQueue-HRTR deadlock
URL: https://github.com/apache/incubator-druid/pull/6212#discussion_r212374994
 
 

 ##########
 File path: indexing-service/src/main/java/io/druid/indexing/overlord/hrtr/HttpRemoteTaskRunner.java
 ##########
 @@ -376,36 +376,46 @@ private boolean runTaskOnWorker(
       // on a worker - this avoids overflowing a worker with tasks
       long waitMs = 
config.getTaskAssignmentTimeout().toStandardDuration().getMillis();
       long waitStart = System.currentTimeMillis();
+      boolean isTaskAssignmentTimedout = false;
       synchronized (statusLock) {
         while (tasks.containsKey(taskId)
                && tasks.get(taskId).getState() == 
HttpRemoteTaskRunnerWorkItem.State.PENDING) {
           long remaining = waitMs - (System.currentTimeMillis() - waitStart);
           if (remaining > 0) {
             statusLock.wait(remaining);
           } else {
-            log.makeAlert(
-                "Task assignment timed out on worker [%s], never ran task [%s] 
in timeout[%s]!",
-                workerHost,
-                taskId,
-                config.getTaskAssignmentTimeout()
-            ).emit();
-            taskComplete(workItem, workerHolder, TaskStatus.failure(taskId));
-            return true;
+            isTaskAssignmentTimedout = true;
+            break;
           }
         }
-        return true;
       }
+
+      if (isTaskAssignmentTimedout) {
+        log.makeAlert(
+            "Task assignment timed out on worker [%s], never ran task [%s] in 
timeout[%s]!",
+            workerHost,
+            taskId,
+            config.getTaskAssignmentTimeout()
+        ).emit();
+        taskComplete(workItem, workerHolder, TaskStatus.failure(taskId));
+      }
+
+      return true;
     } else {
       return false;
     }
   }
 
+  // CAUTION: This method calls RemoteTaskRunnerWorkItem.setResult(..) which results in TaskQueue.notifyStatus() being called
+  // because that is attached by TaskQueue to task result future. So, this method must not be called with "statusLock"
+  // held. See https://github.com/apache/incubator-druid/issues/6201
   private void taskComplete(
       HttpRemoteTaskRunnerWorkItem taskRunnerWorkItem,
       WorkerHolder workerHolder,
       TaskStatus taskStatus
   )
   {
+    Preconditions.checkArgument(!Thread.holdsLock(statusLock), "Current thread 
must not hold statusLock.");
 
 Review comment:
   makes sense, fixed

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to