This is an automated email from the ASF dual-hosted git repository.

vjasani pushed a commit to branch branch-2.2
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.2 by this push:
     new 1a4b474  HBASE-24511 Ability to configure timeout between RPC retry to 
RS from master (#1861)
1a4b474 is described below

commit 1a4b474c2b057bdcc48a4bb2eeb17ca77b111bc8
Author: sguggilam <sandeepbit...@gmail.com>
AuthorDate: Wed Jun 10 00:23:08 2020 -0700

    HBASE-24511 Ability to configure timeout between RPC retry to RS from 
master (#1861)
    
    Signed-off-by: Viraj Jasani <vjas...@apache.org>
---
 .../master/procedure/RSProcedureDispatcher.java    | 20 ++++++++++--
 .../TestRegionServerReportForDuty.java             | 38 +++++++++++++++++++++-
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
index b469cb8..ae60848 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
@@ -211,12 +211,19 @@ public class RSProcedureDispatcher
     private int numberOfAttemptsSoFar = 0;
     private long maxWaitTime = -1;
 
+    private final long rsRpcRetryInterval;
+    private static final String RS_RPC_RETRY_INTERVAL_CONF_KEY =
+        "hbase.regionserver.rpc.retry.interval";
+    private static final int DEFAULT_RS_RPC_RETRY_INTERVAL = 100;
+
     private ExecuteProceduresRequest.Builder request = null;
 
     public ExecuteProceduresRemoteCall(final ServerName serverName,
         final Set<RemoteProcedure> remoteProcedures) {
       this.serverName = serverName;
       this.remoteProcedures = remoteProcedures;
+      this.rsRpcRetryInterval = 
master.getConfiguration().getLong(RS_RPC_RETRY_INTERVAL_CONF_KEY,
+        DEFAULT_RS_RPC_RETRY_INTERVAL);
     }
 
     private AdminService.BlockingInterface getRsAdmin() throws IOException {
@@ -241,7 +248,8 @@ public class RSProcedureDispatcher
           LOG.warn("waiting a little before trying on the same server={}," +
             " try={}, can wait up to {}ms", serverName, numberOfAttemptsSoFar, 
remainingTime);
           numberOfAttemptsSoFar++;
-          submitTask(this, 100, TimeUnit.MILLISECONDS);
+          // Retry every rsRpcRetryInterval millis up to maximum wait time.
+          submitTask(this, rsRpcRetryInterval, TimeUnit.MILLISECONDS);
           return true;
         }
         LOG.warn("server {} is not up for a while; try a new one", serverName);
@@ -283,7 +291,15 @@ public class RSProcedureDispatcher
           e.toString(), numberOfAttemptsSoFar);
       }
       numberOfAttemptsSoFar++;
-      submitTask(this, 100, TimeUnit.MILLISECONDS);
+      // Add some backoff here as the attempts rise; otherwise, in a stuck 
condition, we will fill logs
+      // with failed attempts. None of our backoff classes -- RetryCounter or 
ClientBackoffPolicy
+      // -- fit here nicely so just do something simple; increment by 
rsRpcRetryInterval millis *
+      // retry^2 on each try
+      // up to max of 10 seconds (don't want to back off too much in case of 
situation change).
+      submitTask(this,
+        Math.min(rsRpcRetryInterval * (this.numberOfAttemptsSoFar * 
this.numberOfAttemptsSoFar),
+          10 * 1000),
+        TimeUnit.MILLISECONDS);
       return true;
     }
 
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
index aaf2d2e..2821e36 100644
--- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java
@@ -22,7 +22,8 @@ import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.io.StringWriter;
-
+import java.util.concurrent.ScheduledThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
@@ -38,6 +39,7 @@ import org.apache.hadoop.hbase.master.ServerManager;
 import org.apache.hadoop.hbase.testclassification.MediumTests;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
+import org.apache.hadoop.hbase.util.Threads;
 import org.apache.log4j.Appender;
 import org.apache.log4j.Layout;
 import org.apache.log4j.PatternLayout;
@@ -221,6 +223,40 @@ public class TestRegionServerReportForDuty {
       tablesOnMaster? 3: 2);
 
   }
+  
+  /**
+   * Tests region server reportForDuty with RS RPC retry
+   */
+  @Test
+  public void testReportForDutyWithRSRpcRetry() throws Exception {
+    ScheduledThreadPoolExecutor scheduledThreadPoolExecutor =
+        new ScheduledThreadPoolExecutor(1, 
Threads.newDaemonThreadFactory("RSDelayedStart"));
+
+    // Start a master and wait for it to become the active/primary master.
+    // Use a random unique port
+    cluster.getConfiguration().setInt(HConstants.MASTER_PORT, 
HBaseTestingUtility.randomFreePort());
+    // Override the default RS RPC retry interval of 100ms to 300ms
+    
cluster.getConfiguration().setLong("hbase.regionserver.rpc.retry.interval", 
300);
+    // master has a rs. defaultMinToStart = 2
+    boolean tablesOnMaster = 
LoadBalancer.isTablesOnMaster(testUtil.getConfiguration());
+    
cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART,
+      tablesOnMaster ? 2 : 1);
+    
cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART,
+      tablesOnMaster ? 2 : 1);
+    master = cluster.addMaster();
+    rs = cluster.addRegionServer();
+    LOG.debug("Starting master: " + master.getMaster().getServerName());
+    master.start();
+    // Delay the RS start so that the meta assignment fails in first attempt 
and goes to retry block
+    scheduledThreadPoolExecutor.schedule(new Runnable() {
+      @Override
+      public void run() {
+        rs.start();
+      }
+    }, 1000, TimeUnit.MILLISECONDS);
+
+    waitForClusterOnline(master);
+  }
 
   private void waitForClusterOnline(MasterThread master) throws 
InterruptedException {
     while (true) {

Reply via email to