This is an automated email from the ASF dual-hosted git repository. vjasani pushed a commit to branch branch-2.2 in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.2 by this push: new 1a4b474 HBASE-24511 Ability to configure timeout between RPC retry to RS from master (#1861) 1a4b474 is described below commit 1a4b474c2b057bdcc48a4bb2eeb17ca77b111bc8 Author: sguggilam <sandeepbit...@gmail.com> AuthorDate: Wed Jun 10 00:23:08 2020 -0700 HBASE-24511 Ability to configure timeout between RPC retry to RS from master (#1861) Signed-off-by: Viraj Jasani <vjas...@apache.org> --- .../master/procedure/RSProcedureDispatcher.java | 20 ++++++++++-- .../TestRegionServerReportForDuty.java | 38 +++++++++++++++++++++- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java index b469cb8..ae60848 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java @@ -211,12 +211,19 @@ public class RSProcedureDispatcher private int numberOfAttemptsSoFar = 0; private long maxWaitTime = -1; + private final long rsRpcRetryInterval; + private static final String RS_RPC_RETRY_INTERVAL_CONF_KEY = + "hbase.regionserver.rpc.retry.interval"; + private static final int DEFAULT_RS_RPC_RETRY_INTERVAL = 100; + private ExecuteProceduresRequest.Builder request = null; public ExecuteProceduresRemoteCall(final ServerName serverName, final Set<RemoteProcedure> remoteProcedures) { this.serverName = serverName; this.remoteProcedures = remoteProcedures; + this.rsRpcRetryInterval = master.getConfiguration().getLong(RS_RPC_RETRY_INTERVAL_CONF_KEY, + DEFAULT_RS_RPC_RETRY_INTERVAL); } private AdminService.BlockingInterface getRsAdmin() throws IOException { @@ -241,7 +248,8 @@ public class RSProcedureDispatcher LOG.warn("waiting a little before trying on the same server={}," + " 
try={}, can wait up to {}ms", serverName, numberOfAttemptsSoFar, remainingTime); numberOfAttemptsSoFar++; - submitTask(this, 100, TimeUnit.MILLISECONDS); + // Retry every rsRpcRetryInterval millis up to maximum wait time. + submitTask(this, rsRpcRetryInterval, TimeUnit.MILLISECONDS); return true; } LOG.warn("server {} is not up for a while; try a new one", serverName); @@ -283,7 +291,15 @@ public class RSProcedureDispatcher e.toString(), numberOfAttemptsSoFar); } numberOfAttemptsSoFar++; - submitTask(this, 100, TimeUnit.MILLISECONDS); + // Add some backoff here as the attempts rise otherwise if a stuck condition, will fill logs + // with failed attempts. None of our backoff classes -- RetryCounter or ClientBackoffPolicy + // -- fit here nicely so just do something simple; increment by rsRpcRetryInterval millis * + // retry^2 on each try + // up to max of 10 seconds (don't want to back off too much in case of situation change). + submitTask(this, + Math.min(rsRpcRetryInterval * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar), + 10 * 1000), + TimeUnit.MILLISECONDS); return true; } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java index aaf2d2e..2821e36 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java @@ -22,7 +22,8 @@ import static org.junit.Assert.assertTrue; import java.io.IOException; import java.io.StringWriter; - +import java.util.concurrent.ScheduledThreadPoolExecutor; +import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseClassTestRule; @@ -38,6 +39,7 @@ import org.apache.hadoop.hbase.master.ServerManager; import 
org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread; +import org.apache.hadoop.hbase.util.Threads; import org.apache.log4j.Appender; import org.apache.log4j.Layout; import org.apache.log4j.PatternLayout; @@ -221,6 +223,40 @@ public class TestRegionServerReportForDuty { tablesOnMaster? 3: 2); } + + /** + * Tests region server reportForDuty with RS RPC retry + */ + @Test + public void testReportForDutyWithRSRpcRetry() throws Exception { + ScheduledThreadPoolExecutor scheduledThreadPoolExecutor = + new ScheduledThreadPoolExecutor(1, Threads.newDaemonThreadFactory("RSDelayedStart")); + + // Start a master and wait for it to become the active/primary master. + // Use a random unique port + cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtility.randomFreePort()); + // Override the default RS RPC retry interval of 100ms to 300ms + cluster.getConfiguration().setLong("hbase.regionserver.rpc.retry.interval", 300); + // master has a rs. defaultMinToStart = 2 + boolean tablesOnMaster = LoadBalancer.isTablesOnMaster(testUtil.getConfiguration()); + cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, + tablesOnMaster ? 2 : 1); + cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, + tablesOnMaster ? 2 : 1); + master = cluster.addMaster(); + rs = cluster.addRegionServer(); + LOG.debug("Starting master: " + master.getMaster().getServerName()); + master.start(); + // Delay the RS start so that the meta assignment fails in first attempt and goes to retry block + scheduledThreadPoolExecutor.schedule(new Runnable() { + @Override + public void run() { + rs.start(); + } + }, 1000, TimeUnit.MILLISECONDS); + + waitForClusterOnline(master); + } private void waitForClusterOnline(MasterThread master) throws InterruptedException { while (true) {