This is an automated email from the ASF dual-hosted git repository. apurtell pushed a commit to branch branch-1 in repository https://gitbox.apache.org/repos/asf/hbase.git
commit f3f3798575527df45961f4cbdb6d4c1d04cfb1e3 Author: Mingliang Liu <lium...@apache.org> AuthorDate: Thu Sep 6 23:01:52 2018 -0700 HBASE-21164 reportForDuty should do backoff rather than retry Remove unused methods from Sleeper (its ok, its @Private). Remove notion of startTime from Sleeper handling (it is is unused). Allow passing in how long to sleep so can maintain externally. In HRS, use a RetryCounter to calculate backoff sleep time for when reportForDuty is failing against a struggling Master. Conflicts: hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java Amending-Author: Andrew Purtell <apurt...@apache.org> --- .../org/apache/hadoop/hbase/util/RetryCounter.java | 10 +++ .../java/org/apache/hadoop/hbase/util/Sleeper.java | 31 +++----- .../org/apache/hadoop/hbase/master/HMaster.java | 13 +++- .../hadoop/hbase/regionserver/HRegionServer.java | 15 +++- .../TestRegionServerReportForDuty.java | 89 ++++++++++++++++++++++ 5 files changed, 132 insertions(+), 26 deletions(-) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java index 73512fa..881dd36 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java @@ -174,4 +174,14 @@ public class RetryCounter { public int getAttemptTimes() { return attempts; } + + public long getBackoffTime() { + return this.retryConfig.backoffPolicy.getBackoffTime(this.retryConfig, getAttemptTimes()); + } + + public long getBackoffTimeAndIncrementAttempts() { + long backoffTime = getBackoffTime(); + useRetry(); + return backoffTime; + } } diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java index a60c571..a07ee9c 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java @@ -50,13 +50,6 @@ public class Sleeper { } /** - * Sleep for period. - */ - public void sleep() { - sleep(System.currentTimeMillis()); - } - - /** * If currently asleep, stops sleeping; if not asleep, will skip the next * sleep cycle. */ @@ -68,28 +61,24 @@ public class Sleeper { } /** - * Sleep for period adjusted by passed <code>startTime</code> - * @param startTime Time some task started previous to now. Time to sleep - * will be docked current time minus passed <code>startTime</code>. + * Sleep for period. */ - public void sleep(final long startTime) { + public void sleep() { + sleep(this.period); + } + + public void sleep(long sleepTime) { if (this.stopper.isStopped()) { return; } long now = System.currentTimeMillis(); - long waitTime = this.period - (now - startTime); - if (waitTime > this.period) { - LOG.warn("Calculated wait time > " + this.period + - "; setting to this.period: " + System.currentTimeMillis() + ", " + - startTime); - waitTime = this.period; - } - while (waitTime > 0) { + long currentSleepTime = sleepTime; + while (currentSleepTime > 0) { long woke = -1; try { synchronized (sleepLock) { if (triggerWake) break; - sleepLock.wait(waitTime); + sleepLock.wait(currentSleepTime); } woke = System.currentTimeMillis(); long slept = woke - now; @@ -108,7 +97,7 @@ public class Sleeper { } // Recalculate waitTime. woke = (woke == -1)? System.currentTimeMillis(): woke; - waitTime = this.period - (woke - startTime); + currentSleepTime = this.period - (woke - now); } synchronized(sleepLock) { triggerWake = false; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index c2f99f4..d17e297 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -2656,7 +2656,18 @@ public class HMaster extends HRegionServer implements MasterServices, Server { stop("Stopped by " + Thread.currentThread().getName()); } - void checkServiceStarted() throws ServerNotRunningYetException { + @Override + public void stop(String msg) { + if (!isStopped()) { + super.stop(msg); + if (this.activeMasterManager != null) { + this.activeMasterManager.stop(); + } + } + } + + @VisibleForTesting + protected void checkServiceStarted() throws ServerNotRunningYetException { if (!serviceStarted) { throw new ServerNotRunningYetException("Server is not running yet"); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 8547839..6e5ce80 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -178,6 +178,8 @@ import org.apache.hadoop.hbase.util.HasThread; import org.apache.hadoop.hbase.util.JSONBean; import org.apache.hadoop.hbase.util.JvmPauseMonitor; import org.apache.hadoop.hbase.util.MBeanUtil; +import org.apache.hadoop.hbase.util.RetryCounter; +import org.apache.hadoop.hbase.util.RetryCounterFactory; import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil; import org.apache.hadoop.hbase.util.Sleeper; import org.apache.hadoop.hbase.util.Threads; @@ -963,13 +965,18 @@ public class HRegionServer extends HasThread implements this.rsHost = new RegionServerCoprocessorHost(this, this.conf); } - // Try and register with the Master; tell it we are here. Break if - // server is stopped or the clusterup flag is down or hdfs went wacky. + // Try and register with the Master; tell it we are here. Break if server is stopped or the + // clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and start + // up all Services. Use RetryCounter to get backoff in case Master is struggling to come up. + RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE, + this.sleeper.getPeriod(), 1000 * 60 * 5); + RetryCounter rc = rcf.create(); while (keepLooping()) { RegionServerStartupResponse w = reportForDuty(); if (w == null) { - LOG.warn("reportForDuty failed; sleeping and then retrying."); - this.sleeper.sleep(); + long sleepTime = rc.getBackoffTimeAndIncrementAttempts(); + LOG.warn("reportForDuty failed; sleeping " + sleepTime + " ms and then retrying"); + this.sleeper.sleep(sleepTime); } else { handleReportForDutyResponse(w); break; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java index 2f60bef..792843c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java @@ -21,7 +21,9 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.IOException; +import java.io.StringWriter; +import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -30,12 +32,19 @@ import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.LocalHBaseCluster; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException; import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer; +import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread; +import org.apache.log4j.Appender; +import org.apache.log4j.Layout; +import org.apache.log4j.PatternLayout; +import org.apache.log4j.WriterAppender; import org.apache.zookeeper.KeeperException; + import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -73,6 +82,86 @@ public class TestRegionServerReportForDuty { } /** + * LogCapturer is similar to {@link org.apache.hadoop.test.GenericTestUtils.LogCapturer} + * except that this implementation has a default appender to the root logger. + * Hadoop 2.8+ supports the default appender in the LogCapture it ships and this can be replaced. + * TODO: This class can be removed after we upgrade Hadoop dependency. + */ + static class LogCapturer { + private StringWriter sw = new StringWriter(); + private WriterAppender appender; + private org.apache.log4j.Logger logger; + + LogCapturer(org.apache.log4j.Logger logger) { + this.logger = logger; + Appender defaultAppender = org.apache.log4j.Logger.getRootLogger().getAppender("stdout"); + if (defaultAppender == null) { + defaultAppender = org.apache.log4j.Logger.getRootLogger().getAppender("console"); + } + final Layout layout = (defaultAppender == null) ? new PatternLayout() : + defaultAppender.getLayout(); + this.appender = new WriterAppender(layout, sw); + this.logger.addAppender(this.appender); + } + + String getOutput() { + return sw.toString(); + } + + public void stopCapturing() { + this.logger.removeAppender(this.appender); + } + } + + /** + * This test HMaster class will always throw ServerNotRunningYetException if checked. + */ + public static class NeverInitializedMaster extends HMaster { + public NeverInitializedMaster(Configuration conf, CoordinatedStateManager csm) + throws IOException, KeeperException, InterruptedException { + super(conf, csm); + } + + @Override + protected void checkServiceStarted() throws ServerNotRunningYetException { + throw new ServerNotRunningYetException("Server is not running yet"); + } + } + + /** + * Tests region server should backoff to report for duty if master is not ready. + */ + @Test + public void testReportForDutyBackoff() throws IOException, InterruptedException { + cluster.getConfiguration().set(HConstants.MASTER_IMPL, NeverInitializedMaster.class.getName()); + master = cluster.addMaster(); + master.start(); + + LogCapturer capturer = new LogCapturer(org.apache.log4j.Logger.getLogger(HRegionServer.class)); + // Set sleep interval relatively low so that exponential backoff is more demanding. + int msginterval = 100; + cluster.getConfiguration().setInt("hbase.regionserver.msginterval", msginterval); + rs = cluster.addRegionServer(); + rs.start(); + + int interval = 10_000; + Thread.sleep(interval); + capturer.stopCapturing(); + String output = capturer.getOutput(); + LOG.info(output); + String failMsg = "reportForDuty failed;"; + int count = StringUtils.countMatches(output, failMsg); + + // Following asserts the actual retry number is in range (expectedRetry/2, expectedRetry*2). + // Ideally we can assert the exact retry count. We relax here to tolerate contention error. + int expectedRetry = (int)Math.ceil(Math.log(interval - msginterval)); + assertTrue(String.format("reportForDuty retries %d times, less than expected min %d", + count, expectedRetry / 2), count > expectedRetry / 2); + assertTrue(String.format("reportForDuty retries %d times, more than expected max %d", + count, expectedRetry * 2), count < expectedRetry * 2); + } + + /** * Tests region sever reportForDuty with backup master becomes primary master after * the first master goes away. */