This is an automated email from the ASF dual-hosted git repository.

stack pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.3 by this push:
     new 41096a8  HBASE-22287 infinite retries on failed server in RSProcedureDispatcher (#1800)
41096a8 is described below

commit 41096a83a92ae0e1e3bab0b57df298388b59d61d
Author: Michael Stack <saintst...@users.noreply.github.com>
AuthorDate: Fri May 29 10:00:53 2020 -0700

    HBASE-22287 infinite retries on failed server in RSProcedureDispatcher (#1800)
    
    Adds backoff in place of retry every 100ms.
    
    Signed-off-by: Duo Zhang <zhang...@apache.org>
---
 .../master/procedure/RSProcedureDispatcher.java    | 37 ++++++++++++++--------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
index 9072c3a..74e00f5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
@@ -35,6 +35,7 @@ import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher;
 import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException;
 import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.hadoop.ipc.RemoteException;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
@@ -256,31 +257,33 @@ public class RSProcedureDispatcher
     }
 
     private boolean scheduleForRetry(IOException e) {
-      LOG.debug("request to {} failed, try={}", serverName, 
numberOfAttemptsSoFar, e);
+      LOG.debug("Request to {} failed, try={}", serverName, 
numberOfAttemptsSoFar, e);
       // Should we wait a little before retrying? If the server is starting it's yes.
       if (e instanceof ServerNotRunningYetException) {
         long remainingTime = getMaxWaitTime() - EnvironmentEdgeManager.currentTime();
         if (remainingTime > 0) {
-          LOG.warn("waiting a little before trying on the same server={}," +
-            " try={}, can wait up to {}ms", serverName, numberOfAttemptsSoFar, remainingTime);
+          LOG.warn("Waiting a little before retrying {}, try={}, can wait up to {}ms",
+            serverName, numberOfAttemptsSoFar, remainingTime);
           numberOfAttemptsSoFar++;
+          // Retry every 100ms up to maximum wait time.
           submitTask(this, 100, TimeUnit.MILLISECONDS);
           return true;
         }
-        LOG.warn("server {} is not up for a while; try a new one", serverName);
+        LOG.warn("{} is throwing ServerNotRunningYetException for {}ms; trying 
another server",
+          serverName, getMaxWaitTime());
         return false;
       }
       if (e instanceof DoNotRetryIOException) {
-        LOG.warn("server {} tells us do not retry due to {}, try={}, give up", 
serverName,
+        LOG.warn("{} tells us DoNotRetry due to {}, try={}, give up", 
serverName,
           e.toString(), numberOfAttemptsSoFar);
         return false;
       }
-      // this exception is thrown in the rpc framework, where we can make sure that the call has not
+      // This exception is thrown in the rpc framework, where we can make sure that the call has not
       // been executed yet, so it is safe to mark it as fail. Especially for open a region, we'd
-      // better choose another region server
-      // notice that, it is safe to quit only if this is the first time we send request to region
-      // server. Maybe the region server has accept our request the first time, and then there is a
-      // network error which prevents we receive the response, and the second time we hit a
+      // better choose another region server.
+      // Notice that, it is safe to quit only if this is the first time we send request to region
+      // server. Maybe the region server has accepted our request the first time, and then there is
+      // a network error which prevents we receive the response, and the second time we hit a
       // CallQueueTooBigException, obviously it is not safe to quit here, otherwise it may lead to a
       // double assign...
       if (e instanceof CallQueueTooBigException && numberOfAttemptsSoFar == 0) {
@@ -290,7 +293,7 @@ public class RSProcedureDispatcher
       }
       // Always retry for other exception types if the region server is not dead yet.
       if (!master.getServerManager().isServerOnline(serverName)) {
-        LOG.warn("request to {} failed due to {}, try={}, and the server is 
dead, give up",
+        LOG.warn("Request to {} failed due to {}, try={} and the server is not 
online, give up",
           serverName, e.toString(), numberOfAttemptsSoFar);
         return false;
       }
@@ -299,14 +302,20 @@ public class RSProcedureDispatcher
         // background task to check whether the region server is dead. And if it is dead, call
         // remoteCallFailed to tell the upper layer. Keep retrying here does not lead to incorrect
         // result, but waste some resources.
-        LOG.warn("server {} is aborted or stopped, for safety we still need 
to" +
+        LOG.warn("{} is aborted or stopped, for safety we still need to" +
           " wait until it is fully dead, try={}", serverName, 
numberOfAttemptsSoFar);
       } else {
-        LOG.warn("request to server {} failed due to {}, try={}, retrying...", 
serverName,
+        LOG.warn("request to {} failed due to {}, try={}, retrying...", 
serverName,
           e.toString(), numberOfAttemptsSoFar);
       }
       numberOfAttemptsSoFar++;
-      submitTask(this, 100, TimeUnit.MILLISECONDS);
+      // Add some backoff here as the attempts rise otherwise if a stuck condition, will fill logs
+      // with failed attempts. None of our backoff classes -- RetryCounter or ClientBackoffPolicy
+      // -- fit here nicely so just do something simple; increment by 100ms * retry^2 on each try
+      // up to max of 10 seconds (don't want to back off too much in case of situation change).
+      submitTask(this,
+        Math.min(100 * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar), 10 * 1000),
+        TimeUnit.MILLISECONDS);
       return true;
     }
 

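For context, here is a minimal standalone sketch of the retry-delay schedule this patch introduces: 100 ms times the square of the attempt count, capped at 10 seconds. It is illustrative only; the class and method names below are not part of RSProcedureDispatcher.

public class QuadraticBackoffSketch {
  // Cap so the dispatcher does not back off too long if the situation changes.
  private static final long MAX_BACKOFF_MS = 10 * 1000;

  /** Delay in milliseconds before the next retry, given the attempts made so far. */
  static long backoffMillis(int numberOfAttemptsSoFar) {
    return Math.min(100L * numberOfAttemptsSoFar * numberOfAttemptsSoFar, MAX_BACKOFF_MS);
  }

  public static void main(String[] args) {
    // Prints 100, 400, 900, 1600, ... and reaches the 10000 ms cap at try 10.
    for (int attempt = 1; attempt <= 12; attempt++) {
      System.out.println("try=" + attempt + " backoff=" + backoffMillis(attempt) + "ms");
    }
  }
}

Because numberOfAttemptsSoFar is incremented before the task is resubmitted, the first few delays are 100 ms, 400 ms, and 900 ms, and the schedule stays at the 10 second cap from the tenth attempt onward.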