Repository: hbase Updated Branches: refs/heads/master a1f59d8e1 -> 3c06b4818
HBASE-12403 IntegrationTestMTTR flaky due to aggressive RS restart timeout Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/3c06b481 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/3c06b481 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/3c06b481 Branch: refs/heads/master Commit: 3c06b48181e22eb4ce91d6d8a455a1617f13d85f Parents: a1f59d8 Author: Nick Dimiduk <[email protected]> Authored: Fri Oct 31 16:34:48 2014 -0700 Committer: Nick Dimiduk <[email protected]> Committed: Sat Nov 1 10:34:59 2014 -0700 ---------------------------------------------------------------------- .../hadoop/hbase/chaos/actions/Action.java | 32 +++++++++++++++++--- .../hadoop/hbase/mttr/IntegrationTestMTTR.java | 4 +++ 2 files changed, 32 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/3c06b481/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java ---------------------------------------------------------------------- diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java index dea412f..c01ce0f 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java @@ -43,19 +43,43 @@ import org.apache.hadoop.hbase.util.Bytes; */ public class Action { + public static final String KILL_MASTER_TIMEOUT_KEY = + "hbase.chaosmonkey.action.killmastertimeout"; + public static final String START_MASTER_TIMEOUT_KEY = + "hbase.chaosmonkey.action.startmastertimeout"; + public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout"; + public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout"; + protected static Log LOG = LogFactory.getLog(Action.class); + protected static final long KILL_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; + protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; + protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; + protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT; + protected ActionContext context; protected HBaseCluster cluster; protected ClusterStatus initialStatus; protected ServerName[] initialServers; + protected long killMasterTimeout; + protected long startMasterTimeout; + protected long killRsTimeout; + protected long startRsTimeout; + public void init(ActionContext context) throws IOException { this.context = context; cluster = context.getHBaseCluster(); initialStatus = cluster.getInitialClusterStatus(); Collection<ServerName> regionServers = initialStatus.getServers(); initialServers = regionServers.toArray(new ServerName[regionServers.size()]); + + killMasterTimeout = cluster.getConf().getLong(KILL_MASTER_TIMEOUT_KEY, + KILL_MASTER_TIMEOUT_DEFAULT); + startMasterTimeout = cluster.getConf().getLong(START_MASTER_TIMEOUT_KEY, + START_MASTER_TIMEOUT_DEFAULT); + killRsTimeout = cluster.getConf().getLong(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT); + startRsTimeout = cluster.getConf().getLong(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT); } public void perform() throws Exception { } @@ -84,21 +108,21 @@ public class Action { protected void killMaster(ServerName server) throws IOException { LOG.info("Killing master:" + server); cluster.killMaster(server); - cluster.waitForMasterToStop(server, PolicyBasedChaosMonkey.TIMEOUT); + cluster.waitForMasterToStop(server, killMasterTimeout); LOG.info("Killed master server:" + server); } protected void startMaster(ServerName server) throws IOException { LOG.info("Starting master:" + server.getHostname()); cluster.startMaster(server.getHostname()); - cluster.waitForActiveAndReadyMaster(PolicyBasedChaosMonkey.TIMEOUT); + cluster.waitForActiveAndReadyMaster(startMasterTimeout); LOG.info("Started master: " + server); } protected void killRs(ServerName server) throws IOException { LOG.info("Killing region server:" + server); cluster.killRegionServer(server); - cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT); + cluster.waitForRegionServerToStop(server, killRsTimeout); LOG.info("Killed region server:" + server + ". Reported num of rs:" + cluster.getClusterStatus().getServersSize()); } @@ -106,7 +130,7 @@ public class Action { protected void startRs(ServerName server) throws IOException { LOG.info("Starting region server:" + server.getHostname()); cluster.startRegionServer(server.getHostname()); - cluster.waitForRegionServerToStart(server.getHostname(), PolicyBasedChaosMonkey.TIMEOUT); + cluster.waitForRegionServerToStart(server.getHostname(), startRsTimeout); LOG.info("Started region server:" + server + ". Reported num of rs:" + cluster.getClusterStatus().getServersSize()); } http://git-wip-us.apache.org/repos/asf/hbase/blob/3c06b481/hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java ---------------------------------------------------------------------- diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java index 1484873..12adc80 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java @@ -183,6 +183,10 @@ public class IntegrationTestMTTR { } private static void setupActions() throws IOException { + // allow a little more time for RS restart actions because RS start depends on having a master + // to report to and the master is also being monkeyed. + util.getConfiguration().setLong(Action.START_RS_TIMEOUT_KEY, 3 * 60 * 1000); + // Set up the action that will restart a region server holding a region from our table // because this table should only have one region we should be good. restartRSAction = new RestartRsHoldingTableAction(sleepTime, tableName.getNameAsString());
