This is an automated email from the ASF dual-hosted git repository.
zghao pushed a commit to branch branch-2.2
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.2 by this push:
new 5e98129 HBASE-22964 Fix flaky TestClusterRestartFailover and
TestClusterRestartFailoverSplitWithoutZk (#574)
5e98129 is described below
commit 5e98129e048936fc7d0dd392b314f82d1cbbda1a
Author: Guanghao Zhang <[email protected]>
AuthorDate: Sun Sep 8 18:46:52 2019 +0800
HBASE-22964 Fix flaky TestClusterRestartFailover and
TestClusterRestartFailoverSplitWithoutZk (#574)
Signed-off-by: Duo Zhang <[email protected]>
---
.../apache/hadoop/hbase/HBaseTestingUtility.java | 11 +-
.../hbase/master/TestClusterRestartFailover.java | 166 +++++++++++++++------
2 files changed, 131 insertions(+), 46 deletions(-)
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
index 5155d72..50e0f1b 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
@@ -1251,6 +1251,13 @@ public class HBaseTestingUtility extends HBaseZKTestingUtility {
public void restartHBaseCluster(int servers, List<Integer> ports)
throws IOException, InterruptedException {
+    StartMiniClusterOption option =
+        StartMiniClusterOption.builder().numRegionServers(servers).rsPorts(ports).build();
+ restartHBaseCluster(option);
+ }
+
+ public void restartHBaseCluster(StartMiniClusterOption option)
+ throws IOException, InterruptedException {
if (hbaseAdmin != null) {
hbaseAdmin.close();
hbaseAdmin = null;
@@ -1259,7 +1266,9 @@ public class HBaseTestingUtility extends HBaseZKTestingUtility {
this.connection.close();
this.connection = null;
}
-    this.hbaseCluster = new MiniHBaseCluster(this.conf, 1, servers, ports, null, null);
+    this.hbaseCluster =
+        new MiniHBaseCluster(this.conf, option.getNumMasters(), option.getNumRegionServers(),
+          option.getRsPorts(), option.getMasterClass(), option.getRsClass());
// Don't leave here till we've done a successful scan of the hbase:meta
Connection conn = ConnectionFactory.createConnection(this.conf);
Table t = conn.getTable(TableName.META_TABLE_NAME);
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java
index a3d8061..2ecef5c 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java
@@ -17,19 +17,33 @@
*/
package org.apache.hadoop.hbase.master;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
import java.util.List;
+import java.util.Optional;
+import java.util.concurrent.CountDownLatch;
import java.util.stream.Collectors;
+
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.StartMiniClusterOption;
import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.ServerState;
import org.apache.hadoop.hbase.master.assignment.ServerStateNode;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
-import org.junit.Assert;
+import org.apache.hadoop.hbase.util.JVMClusterUtil;
+import org.apache.zookeeper.KeeperException;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@@ -45,6 +59,9 @@ public class TestClusterRestartFailover extends AbstractTestRestartCluster {
   private static final Logger LOG =
       LoggerFactory.getLogger(TestClusterRestartFailover.class);
+ private static CountDownLatch SCP_LATCH;
+ private static ServerName SERVER_FOR_TEST;
+
@Override
protected boolean splitWALCoordinatedByZk() {
return true;
@@ -55,60 +72,119 @@ public class TestClusterRestartFailover extends AbstractTestRestartCluster {
.getServerNode(serverName);
}
+ /**
+ * Test for HBASE-22964
+ */
@Test
public void test() throws Exception {
- UTIL.startMiniCluster(3);
+ setupCluster();
+ setupTable();
+
+ // Find the server which not carry hbase:namespace
+ for (JVMClusterUtil.RegionServerThread thread : UTIL.getHBaseCluster()
+ .getRegionServerThreads()) {
+      if (!thread.getRegionServer().getOnlineTables().contains(TableName.NAMESPACE_TABLE_NAME)) {
+ SERVER_FOR_TEST = thread.getRegionServer().getServerName();
+ break;
+ }
+ }
+ UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
+ ServerStateNode serverNode = getServerStateNode(SERVER_FOR_TEST);
+ assertNotNull(serverNode);
+ assertTrue("serverNode should be ONLINE when cluster runs normally",
+ serverNode.isInState(ServerState.ONLINE));
+
+ SCP_LATCH = new CountDownLatch(1);
+
+ // Shutdown cluster and restart
+    List<Integer> ports =
+        UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList().stream()
+            .map(serverName -> serverName.getPort()).collect(Collectors.toList());
+ LOG.info("Shutting down cluster");
+ UTIL.getHBaseCluster().killAll();
+ UTIL.getHBaseCluster().waitUntilShutDown();
+ LOG.info("Restarting cluster");
+    UTIL.restartHBaseCluster(StartMiniClusterOption.builder().masterClass(HMasterForTest.class)
+        .numMasters(1).numRegionServers(3).rsPorts(ports).build());
+    UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().isInitialized());
+
+ UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
+ serverNode = getServerStateNode(SERVER_FOR_TEST);
+ assertFalse("serverNode should not be ONLINE during SCP processing",
+ serverNode.isInState(ServerState.ONLINE));
+    Optional<Procedure<?>> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream()
+        .filter(p -> (p instanceof ServerCrashProcedure) &&
+            ((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST)).findAny();
+    assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent());
+    assertFalse("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
+        UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));
+
+ // Wait the SCP to finish
+ SCP_LATCH.countDown();
+ UTIL.waitFor(60000, () -> procedure.get().isFinished());
+
+    assertFalse("Even when the SCP is finished, the duplicate SCP should not be scheduled for " +
+        SERVER_FOR_TEST,
+        UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));
+    serverNode = UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
+        .getServerNode(SERVER_FOR_TEST);
+ assertNull("serverNode should be deleted after SCP finished", serverNode);
+ }
+
+ private void setupCluster() throws Exception {
+    UTIL.startMiniCluster(
+        StartMiniClusterOption.builder().masterClass(HMasterForTest.class).numMasters(1)
+            .numRegionServers(3).build());
     UTIL.waitFor(60000, () -> UTIL.getMiniHBaseCluster().getMaster().isInitialized());
// wait for all SCPs finished
     UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().getProcedures().stream()
- .noneMatch(p -> p instanceof ServerCrashProcedure));
+ .noneMatch(p -> p instanceof ServerCrashProcedure));
+ UTIL.getHBaseCluster().getMaster().balanceSwitch(false);
+ }
+
+ private void setupTable() throws Exception {
TableName tableName = TABLES[0];
-    ServerName testServer = UTIL.getHBaseCluster().getRegionServer(0).getServerName();
- UTIL.waitFor(30000, () -> getServerStateNode(testServer) != null);
- ServerStateNode serverNode = getServerStateNode(testServer);
- Assert.assertNotNull(serverNode);
- Assert.assertTrue("serverNode should be ONLINE when cluster runs normally",
- serverNode.isInState(ServerState.ONLINE));
UTIL.createMultiRegionTable(tableName, FAMILY);
- UTIL.waitTableEnabled(tableName);
+ UTIL.waitTableAvailable(tableName);
Table table = UTIL.getConnection().getTable(tableName);
for (int i = 0; i < 100; i++) {
UTIL.loadTable(table, FAMILY);
}
-    List<Integer> ports =
-        UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList().stream()
-        .map(serverName -> serverName.getPort()).collect(Collectors.toList());
-    LOG.info("Shutting down cluster");
-    UTIL.getHBaseCluster().killAll();
-    UTIL.getHBaseCluster().waitUntilShutDown();
-    LOG.info("Starting cluster the second time");
-    UTIL.restartHBaseCluster(3, ports);
-    UTIL.waitFor(30000, () -> UTIL.getHBaseCluster().getMaster().isInitialized());
-    serverNode = UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
-        .getServerNode(testServer);
-    Assert.assertNotNull("serverNode should not be null when restart whole cluster", serverNode);
-    Assert.assertFalse(serverNode.isInState(ServerState.ONLINE));
-    LOG.info("start to find the procedure of SCP for the severName we choose");
-    UTIL.waitFor(60000,
-        () -> UTIL.getHBaseCluster().getMaster().getProcedures().stream()
-            .anyMatch(procedure -> (procedure instanceof ServerCrashProcedure) &&
-                ((ServerCrashProcedure) procedure).getServerName().equals(testServer)));
-    Assert.assertFalse("serverNode should not be ONLINE during SCP processing",
-        serverNode.isInState(ServerState.ONLINE));
-    LOG.info("start to submit the SCP for the same serverName {} which should fail", testServer);
-    Assert
-        .assertFalse(UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(testServer));
-    Procedure<?> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream()
-        .filter(p -> (p instanceof ServerCrashProcedure) &&
-            ((ServerCrashProcedure) p).getServerName().equals(testServer))
-        .findAny().get();
-    UTIL.waitFor(60000, () -> procedure.isFinished());
-    LOG.info("even when the SCP is finished, the duplicate SCP should not be scheduled for {}",
-        testServer);
-    Assert
-        .assertFalse(UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(testServer));
-    serverNode = UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
-        .getServerNode(testServer);
-    Assert.assertNull("serverNode should be deleted after SCP finished", serverNode);
+ }
+
+ public static final class HMasterForTest extends HMaster {
+
+    public HMasterForTest(Configuration conf) throws IOException, KeeperException {
+ super(conf);
+ }
+
+ @Override
+    protected AssignmentManager createAssignmentManager(MasterServices master) {
+ return new AssignmentManagerForTest(master);
+ }
+ }
+
+  private static final class AssignmentManagerForTest extends AssignmentManager {
+
+ public AssignmentManagerForTest(MasterServices master) {
+ super(master);
+ }
+
+ @Override
+ public List<RegionInfo> getRegionsOnServer(ServerName serverName) {
+ List<RegionInfo> regions = super.getRegionsOnServer(serverName);
+      // ServerCrashProcedure will call this method, so wait the CountDownLatch here
+      if (SCP_LATCH != null && SERVER_FOR_TEST != null && serverName.equals(SERVER_FOR_TEST)) {
+
+ try {
+ LOG.info("ServerCrashProcedure wait the CountDownLatch here");
+ SCP_LATCH.await();
+ LOG.info("Continue the ServerCrashProcedure");
+ SCP_LATCH = null;
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ return regions;
+ }
}
}