Author: liyin
Date: Tue Oct 30 06:14:06 2012
New Revision: 1403626
URL: http://svn.apache.org/viewvc?rev=1403626&view=rev
Log: [master] [0.89-fb] Ensure that new master does not split logs of recently checked in RS
Author: aaiyer Summary: Sev on cell 13 saw a scenario where, upon master failover, the new master was splitting the logs for regionservers that were still running. This happens when there is an error in the SplitLog for one of the logs and the master does not acknowledge new servers. Test Plan: run MR tests. One failure. Also fails without the diff. Also, adding a unit test. Reviewers: kranganathan, kannan Reviewed By: kranganathan CC: hbase-eng@ Differential Revision: https://phabricator.fb.com/D611291 Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/InjectionEvent.java hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestLogSplitOnMasterFailover.java Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1403626&r1=1403625&r2=1403626&view=diff ============================================================================== --- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original) +++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue Oct 30 06:14:06 2012 @@ -1061,7 +1061,9 @@ public class HMaster extends HasThread i Path logDir = status.getPath(); String serverName = logDir.getName(); LOG.info("Found log folder : " + serverName); - if (!clusterStateRecovery.liveRegionServersAtStartup().contains(serverName)) { + if (!clusterStateRecovery.liveRegionServersAtStartup().contains(serverName) + // If a server now checked in with the new master, don't kill it. 
+ && serverManager.getServerInfo(serverName) == null) { LOG.info("Log folder " + status.getPath() + " doesn't belong " + "to a known region server, splitting"); serverNames.add(serverName); Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java?rev=1403626&r1=1403625&r2=1403626&view=diff ============================================================================== --- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java (original) +++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java Tue Oct 30 06:14:06 2012 @@ -41,6 +41,8 @@ import org.apache.hadoop.hbase.master.Sp import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter; import org.apache.hadoop.hbase.util.CancelableProgressable; import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.util.InjectionEvent; +import org.apache.hadoop.hbase.util.InjectionHandler; import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper; import org.apache.hadoop.hbase.zookeeper.ZKSplitLog; import org.apache.hadoop.hbase.zookeeper.ZKSplitLog.TaskState; @@ -136,6 +138,8 @@ public class SplitLogWorker implements R try { FileStatus st; try { + InjectionHandler.processEventIO(InjectionEvent.SPLITLOGWORKER_SPLIT_LOG_START); + st = fs.getFileStatus(new Path(filename)); t1 = System.currentTimeMillis(); Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/InjectionEvent.java URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/InjectionEvent.java?rev=1403626&r1=1403625&r2=1403626&view=diff ============================================================================== --- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/InjectionEvent.java (original) +++ 
hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/InjectionEvent.java Tue Oct 30 06:14:06 2012 @@ -32,5 +32,6 @@ public enum InjectionEvent { HMASTER_ALTER_TABLE, HMASTER_ENABLE_TABLE, HMASTER_DISABLE_TABLE, - ZKUNASSIGNEDWATCHER_REGION_OPENED + ZKUNASSIGNEDWATCHER_REGION_OPENED, + SPLITLOGWORKER_SPLIT_LOG_START } Modified: hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestLogSplitOnMasterFailover.java URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestLogSplitOnMasterFailover.java?rev=1403626&r1=1403625&r2=1403626&view=diff ============================================================================== --- hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestLogSplitOnMasterFailover.java (original) +++ hbase/branches/0.89-fb/src/test/java/org/apache/hadoop/hbase/master/TestLogSplitOnMasterFailover.java Tue Oct 30 06:14:06 2012 @@ -48,8 +48,10 @@ import org.apache.hadoop.hbase.io.hfile. import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.util.InjectionEvent; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper; +import org.apache.hadoop.hbase.util.InjectionHandler; import org.junit.Test; /** @@ -248,6 +250,33 @@ public class TestLogSplitOnMasterFailove runTest(); } + @Test(timeout=180000) + public void testWithDistributedLogSplittingAndErrors() throws Exception { + // add a split log worker to handle InjectionEvent.SPLITLOGWORKER_SPLIT_LOG_START. 
+ ZooKeeperWrapper.setNamespaceForTesting(); + conf.setBoolean(HConstants.DISTRIBUTED_LOG_SPLITTING_KEY, true); + InjectionHandler.set(new SplitLogKillInjectionHandler()); + runTest(); + } + + static class SplitLogKillInjectionHandler extends InjectionHandler { + static int count = 0; + + @Override + // kill split log workers the first few times. + protected void _processEventIO(InjectionEvent event, Object... args) throws IOException{ + if (event == InjectionEvent.SPLITLOGWORKER_SPLIT_LOG_START) { + count++; + LOG.debug("Processing a split log event. Count = " + count); + Threads.sleep(50); // make it take a bit of time. sleep 50ms. + if (count < 5) { + throw new IOException("Failing for the test"); + } + } + } + } + + private void runTest() throws Exception { startMiniCluster(NUM_MASTERS, NUM_RS); Thread.currentThread().setName(getClass().getSimpleName()); @@ -301,7 +330,15 @@ public class TestLogSplitOnMasterFailove masters = miniCluster().getMasters(); assertEquals(1, masters.size()); - + + // Start a few new regionservers. + final int EXTRA_RS = 2; + for (int i = NUM_RS; i < NUM_RS + EXTRA_RS; ++i) { + miniCluster().startRegionServer(); + otherRsNames.add( + miniCluster().getRegionServer(i).getServerInfo().getServerName()); + } + // wait for an active master to show up and be ready assertTrue(miniCluster().waitForActiveAndReadyMaster());
