HBASE-15615 Wrong sleep time when RegionServerCallable need retry (Guanghao Zhang)
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/2482062d Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/2482062d Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/2482062d Branch: refs/heads/HBASE-14850 Commit: 2482062d3475ae2ad9fc32f0f346e83892f81ad0 Parents: 6904430 Author: Mikhail Antonov <[email protected]> Authored: Sun May 15 20:37:02 2016 -0700 Committer: Mikhail Antonov <[email protected]> Committed: Sun May 15 20:37:26 2016 -0700 ---------------------------------------------------------------------- .../client/AbstractRegionServerCallable.java | 3 +- .../hadoop/hbase/client/ConnectionUtils.java | 3 + .../client/RegionAdminServiceCallable.java | 2 +- .../hbase/client/RpcRetryingCallerImpl.java | 4 +- .../hbase/client/TestConnectionUtils.java | 20 +++ .../org/apache/hadoop/hbase/client/TestHCM.java | 136 +++++++++++++++---- 6 files changed, 137 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AbstractRegionServerCallable.java ---------------------------------------------------------------------- diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AbstractRegionServerCallable.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AbstractRegionServerCallable.java index 4a0ea28..7279d81 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AbstractRegionServerCallable.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AbstractRegionServerCallable.java @@ -100,8 +100,7 @@ abstract class AbstractRegionServerCallable<T> implements RetryingCallable<T> { @Override public long sleep(long pause, int tries) { - // Tries hasn't been bumped up yet so we use "tries + 1" to get right pause time - long sleep = ConnectionUtils.getPauseTime(pause, tries + 1); + long sleep = ConnectionUtils.getPauseTime(pause, tries); if (sleep < MIN_WAIT_DEAD_SERVER && (location == null || getConnection().isDeadServer(location.getServerName()))) { sleep = ConnectionUtils.addJitter(MIN_WAIT_DEAD_SERVER, 0.10f); http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java ---------------------------------------------------------------------- diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java index 82c2fc4..363a0e0 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java @@ -55,6 +55,9 @@ public final class ConnectionUtils { if (ntries >= HConstants.RETRY_BACKOFF.length) { ntries = HConstants.RETRY_BACKOFF.length - 1; } + if (ntries < 0) { + ntries = 0; + } long normalPause = pause * HConstants.RETRY_BACKOFF[ntries]; long jitter = (long)(normalPause * RANDOM.nextFloat() * 0.01f); // 1% possible jitter http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java ---------------------------------------------------------------------- diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java index 8ff8b8b..725bec0 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java @@ -140,7 +140,7 @@ public abstract class RegionAdminServiceCallable<T> implements RetryingCallable< @Override public long sleep(long pause, int tries) { - long sleep = ConnectionUtils.getPauseTime(pause, tries + 1); + long sleep = ConnectionUtils.getPauseTime(pause, tries); if (sleep < MIN_WAIT_DEAD_SERVER && (location == null || connection.isDeadServer(location.getServerName()))) { sleep = ConnectionUtils.addJitter(MIN_WAIT_DEAD_SERVER, 0.10f); http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCallerImpl.java ---------------------------------------------------------------------- diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCallerImpl.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCallerImpl.java index 41405ec..8f28796 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCallerImpl.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCallerImpl.java @@ -124,8 +124,8 @@ public class RpcRetryingCallerImpl<T> implements RpcRetryingCaller<T> { } // If the server is dead, we need to wait a little before retrying, to give // a chance to the regions to be - // tries hasn't been bumped up yet so we use "tries + 1" to get right pause time - expectedSleep = callable.sleep(pause, tries + 1); + // get right pause time, start by RETRY_BACKOFF[0] * pause + expectedSleep = callable.sleep(pause, tries); // If, after the planned sleep, there won't be enough time left, we stop now. long duration = singleCallDuration(expectedSleep); http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java index ac0a0bd..aea8a2b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java @@ -19,6 +19,7 @@ */ package org.apache.hadoop.hbase.client; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.testclassification.ClientTests; import org.apache.hadoop.hbase.testclassification.SmallTests; import org.junit.Test; @@ -54,4 +55,23 @@ public class TestConnectionUtils { assertTrue(retyTimeSet.size() > (retries.length * 0.80)); } + @Test + public void testGetPauseTime() { + long pauseTime; + long baseTime = 100; + pauseTime = ConnectionUtils.getPauseTime(baseTime, -1); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[0])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[0] * 1.01f)); + + for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) { + pauseTime = ConnectionUtils.getPauseTime(baseTime, i); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f)); + } + + int length = HConstants.RETRY_BACKOFF.length; + pauseTime = ConnectionUtils.getPauseTime(baseTime, length); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[length - 1])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[length - 1] * 1.01f)); + } } http://git-wip-us.apache.org/repos/asf/hbase/blob/2482062d/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java index a0f91f4..289e6f2 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java @@ -62,6 +62,7 @@ import org.apache.hadoop.hbase.exceptions.RegionMovedException; import org.apache.hadoop.hbase.filter.Filter; import org.apache.hadoop.hbase.filter.FilterBase; import org.apache.hadoop.hbase.ipc.RpcClient; +import org.apache.hadoop.hbase.ipc.RpcControllerFactory; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.Region; @@ -114,14 +115,25 @@ public class TestHCM { */ public static class SleepAndFailFirstTime extends BaseRegionObserver { static final AtomicLong ct = new AtomicLong(0); + static final String SLEEP_TIME_CONF_KEY = + "hbase.coprocessor.SleepAndFailFirstTime.sleepTime"; + static final long DEFAULT_SLEEP_TIME = 20000; + static final AtomicLong sleepTime = new AtomicLong(DEFAULT_SLEEP_TIME); public SleepAndFailFirstTime() { } @Override + public void postOpen(ObserverContext<RegionCoprocessorEnvironment> c) { + RegionCoprocessorEnvironment env = c.getEnvironment(); + Configuration conf = env.getConfiguration(); + sleepTime.set(conf.getLong(SLEEP_TIME_CONF_KEY, DEFAULT_SLEEP_TIME)); + } + + @Override public void preGetOp(final ObserverContext<RegionCoprocessorEnvironment> e, final Get get, final List<Cell> results) throws IOException { - Threads.sleep(20000); + Threads.sleep(sleepTime.get()); if (ct.incrementAndGet() == 1){ throw new IOException("first call I fail"); } @@ -352,6 +364,95 @@ public class TestHCM { } } + /** + * Test starting from 0 index when RpcRetryingCaller calculate the backoff time. + */ + @Test + public void testRpcRetryingCallerSleep() throws Exception { + HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testRpcRetryingCallerSleep"); + hdt.addCoprocessorWithSpec("|" + SleepAndFailFirstTime.class.getName() + "||" + + SleepAndFailFirstTime.SLEEP_TIME_CONF_KEY + "=2000"); + TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }).close(); + + Configuration c = new Configuration(TEST_UTIL.getConfiguration()); + c.setInt(HConstants.HBASE_CLIENT_PAUSE, 3000); + c.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 4000); + + Connection connection = ConnectionFactory.createConnection(c); + Table t = connection.getTable(TableName.valueOf("HCM-testRpcRetryingCallerSleep")); + if (t instanceof HTable) { + HTable table = (HTable) t; + table.setOperationTimeout(8000); + // Check that it works. Because 2s + 3s * RETRY_BACKOFF[0] + 2s < 8s + table.get(new Get(FAM_NAM)); + + // Resetting and retrying. + SleepAndFailFirstTime.ct.set(0); + try { + table.setOperationTimeout(6000); + // Will fail this time. After sleep, there are not enough time for second retry + // Beacuse 2s + 3s + 2s > 6s + table.get(new Get(FAM_NAM)); + Assert.fail("We expect an exception here"); + } catch (SocketTimeoutException e) { + LOG.info("We received an exception, as expected ", e); + } catch (IOException e) { + Assert.fail("Wrong exception:" + e.getMessage()); + } finally { + table.close(); + connection.close(); + } + } + } + + @Test + public void testCallableSleep() throws Exception { + long pauseTime; + long baseTime = 100; + TableName tableName = TableName.valueOf("HCM-testCallableSleep"); + HTable table = TEST_UTIL.createTable(tableName, FAM_NAM); + RegionServerCallable<Object> regionServerCallable = new RegionServerCallable<Object>( + TEST_UTIL.getConnection(), tableName, ROW) { + public Object call(int timeout) throws IOException { + return null; + } + }; + + regionServerCallable.prepare(false); + for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) { + pauseTime = regionServerCallable.sleep(baseTime, i); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f)); + } + + RegionAdminServiceCallable<Object> regionAdminServiceCallable = + new RegionAdminServiceCallable<Object>( + (ClusterConnection) TEST_UTIL.getConnection(), new RpcControllerFactory( + TEST_UTIL.getConfiguration()), tableName, ROW) { + public Object call(int timeout) throws IOException { + return null; + } + }; + + regionAdminServiceCallable.prepare(false); + for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) { + pauseTime = regionAdminServiceCallable.sleep(baseTime, i); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f)); + } + + MasterCallable masterCallable = new MasterCallable((HConnection) TEST_UTIL.getConnection()) { + public Object call(int timeout) throws IOException { + return null; + } + }; + + for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) { + pauseTime = masterCallable.sleep(baseTime, i); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f)); + } + } private void testConnectionClose(boolean allowsInterrupt) throws Exception { TableName tableName = TableName.valueOf("HCM-testConnectionClose" + allowsInterrupt); @@ -1032,7 +1133,7 @@ public class TestHCM { } } - @Ignore ("Test presumes RETRY_BACKOFF will never change; it has") @Test + @Test public void testErrorBackoffTimeCalculation() throws Exception { // TODO: This test would seem to presume hardcoded RETRY_BACKOFF which it should not. final long ANY_PAUSE = 100; @@ -1052,40 +1153,23 @@ public class TestHCM { // Check some backoff values from HConstants sequence. tracker.reportServerError(location); - assertEqualsWithJitter(ANY_PAUSE, tracker.calculateBackoffTime(location, ANY_PAUSE)); + assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[0], + tracker.calculateBackoffTime(location, ANY_PAUSE)); tracker.reportServerError(location); tracker.reportServerError(location); tracker.reportServerError(location); - assertEqualsWithJitter(ANY_PAUSE * 5, tracker.calculateBackoffTime(location, ANY_PAUSE)); + assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[3], + tracker.calculateBackoffTime(location, ANY_PAUSE)); // All of this shouldn't affect backoff for different location. assertEquals(0, tracker.calculateBackoffTime(diffLocation, ANY_PAUSE)); tracker.reportServerError(diffLocation); - assertEqualsWithJitter(ANY_PAUSE, tracker.calculateBackoffTime(diffLocation, ANY_PAUSE)); + assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[0], + tracker.calculateBackoffTime(diffLocation, ANY_PAUSE)); // Check with different base. - assertEqualsWithJitter(ANY_PAUSE * 10, + assertEqualsWithJitter(ANY_PAUSE * 2 * HConstants.RETRY_BACKOFF[3], tracker.calculateBackoffTime(location, ANY_PAUSE * 2)); - - // See that time from last error is taken into account. Time shift is applied after jitter, - // so pass the original expected backoff as the base for jitter. - long timeShift = (long)(ANY_PAUSE * 0.5); - timeMachine.setValue(timeBase + timeShift); - assertEqualsWithJitter((ANY_PAUSE * 5) - timeShift, - tracker.calculateBackoffTime(location, ANY_PAUSE), ANY_PAUSE * 2); - - // However we should not go into negative. - timeMachine.setValue(timeBase + ANY_PAUSE * 100); - assertEquals(0, tracker.calculateBackoffTime(location, ANY_PAUSE)); - - // We also should not go over the boundary; last retry would be on it. - long timeLeft = (long)(ANY_PAUSE * 0.5); - timeMachine.setValue(timeBase + largeAmountOfTime - timeLeft); - assertTrue(tracker.canTryMore(1)); - tracker.reportServerError(location); - assertEquals(timeLeft, tracker.calculateBackoffTime(location, ANY_PAUSE)); - timeMachine.setValue(timeBase + largeAmountOfTime); - assertFalse(tracker.canTryMore(1)); } finally { EnvironmentEdgeManager.reset(); }
