ndimiduk commented on a change in pull request #1141: HBASE-23808 [Flakey Test] 
TestMasterShutdown#testMasterShutdownBefore…
URL: https://github.com/apache/hbase/pull/1141#discussion_r377988314
 
 

 ##########
 File path: 
hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
 ##########
 @@ -66,103 +82,172 @@ public void testMasterShutdown() throws Exception {
     Configuration conf = HBaseConfiguration.create();
 
     // Start the cluster
-    HBaseTestingUtility htu = new HBaseTestingUtility(conf);
-    StartMiniClusterOption option = StartMiniClusterOption.builder()
-        
.numMasters(NUM_MASTERS).numRegionServers(NUM_RS).numDataNodes(NUM_RS).build();
-    htu.startMiniCluster(option);
-    MiniHBaseCluster cluster = htu.getHBaseCluster();
-
-    // get all the master threads
-    List<MasterThread> masterThreads = cluster.getMasterThreads();
-
-    // wait for each to come online
-    for (MasterThread mt : masterThreads) {
-      assertTrue(mt.isAlive());
-    }
-
-    // find the active master
-    HMaster active = null;
-    for (int i = 0; i < masterThreads.size(); i++) {
-      if (masterThreads.get(i).getMaster().isActiveMaster()) {
-        active = masterThreads.get(i).getMaster();
-        break;
+    try {
+      htu = new HBaseTestingUtility(conf);
+      StartMiniClusterOption option = StartMiniClusterOption.builder()
+        .numMasters(NUM_MASTERS)
+        .numRegionServers(NUM_RS)
+        .numDataNodes(NUM_RS)
+        .build();
+      final MiniHBaseCluster cluster = htu.startMiniCluster(option);
+
+      // wait for all master thread to spawn and start their run loop.
+      final long thirtySeconds = TimeUnit.SECONDS.toMillis(30);
+      final long oneSecond = TimeUnit.SECONDS.toMillis(1);
+      assertNotEquals(-1, htu.waitFor(thirtySeconds, oneSecond, () -> {
+        final List<MasterThread> masterThreads = cluster.getMasterThreads();
+        return masterThreads != null
+          && masterThreads.size() >= 3
+          && masterThreads.stream().allMatch(Thread::isAlive);
+      }));
+
+      // find the active master
+      final HMaster active = cluster.getMaster();
+      assertNotNull(active);
+
+      // make sure the other two are backup masters
+      ClusterMetrics status = active.getClusterMetrics();
+      assertEquals(2, status.getBackupMasterNames().size());
+
+      // tell the active master to shutdown the cluster
+      active.shutdown();
+      assertNotEquals(-1, htu.waitFor(thirtySeconds, oneSecond,
+        () -> CollectionUtils.isEmpty(cluster.getLiveMasterThreads())));
+      assertNotEquals(-1, htu.waitFor(thirtySeconds, oneSecond,
+        () -> CollectionUtils.isEmpty(cluster.getLiveRegionServerThreads())));
+    } finally {
+      if (htu != null) {
+        htu.shutdownMiniCluster();
+        htu = null;
       }
     }
-    assertNotNull(active);
-    // make sure the other two are backup masters
-    ClusterMetrics status = active.getClusterMetrics();
-    assertEquals(2, status.getBackupMasterNames().size());
-
-    // tell the active master to shutdown the cluster
-    active.shutdown();
-
-    for (int i = NUM_MASTERS - 1; i >= 0 ;--i) {
-      cluster.waitOnMaster(i);
-    }
-    // make sure all the masters properly shutdown
-    assertEquals(0, masterThreads.size());
-
-    htu.shutdownMiniCluster();
   }
 
-  private Connection createConnection(HBaseTestingUtility util) throws 
InterruptedException {
-    // the cluster may have not been initialized yet which means we can not 
get the cluster id thus
-    // an exception will be thrown. So here we need to retry.
-    for (;;) {
-      try {
-        return ConnectionFactory.createConnection(util.getConfiguration());
-      } catch (Exception e) {
-        Thread.sleep(10);
+  /**
+   * This test appears to be an intentional race between a thread that issues 
a shutdown RPC to the
+   * master, while the master is concurrently realizing it cannot initialize 
because there are no
+   * region servers available to it. The expected behavior is that master 
initialization is
+   * interruptable via said shutdown RPC.
+   */
+  @Test
+  public void testMasterShutdownBeforeStartingAnyRegionServer() throws 
Exception {
+    LocalHBaseCluster hbaseCluster = null;
+    try {
+      htu =  new HBaseTestingUtility(
+        createMasterShutdownBeforeStartingAnyRegionServerConfiguration());
+
+      // configure a cluster with
+      final StartMiniClusterOption options = StartMiniClusterOption.builder()
+        .numDataNodes(1)
+        .numMasters(1)
+        .numRegionServers(0)
+        .masterClass(HMaster.class)
+        .rsClass(MiniHBaseCluster.MiniHBaseClusterRegionServer.class)
+        .createRootDir(true)
+        .build();
+
+      // Can't simply `htu.startMiniCluster(options)` because that method 
waits for the master to
+      // start completely. However, this test's premise is that a partially 
started master should
+      // still respond to a shutdown RPC. So instead, we manage each component 
lifecycle
+      // independently.
+      // I think it's not worth refactoring HTU's helper methods just for this 
class.
+      htu.startMiniDFSCluster(options.getNumDataNodes());
+      htu.startMiniZKCluster(options.getNumZkServers());
+      htu.createRootDir();
+      hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), 
options.getNumMasters(),
+        options.getNumRegionServers(), options.getMasterClass(), 
options.getRsClass());
+      final MasterThread masterThread = hbaseCluster.getMasters().get(0);
+
+      final CompletableFuture<Void> shutdownFuture = 
CompletableFuture.runAsync(() -> {
+        // Switching to master registry exacerbated a race in the master 
bootstrap that can result
+        // in a lost shutdown command (HBASE-8422). The race is essentially 
because the server
+        // manager in HMaster is not initialized by the time shutdown() RPC 
(below) is made to the
+        // master. The suspected reason as to why it was uncommon before 
HBASE-18095 is because the
+        // connection creation with ZK registry is so slow that by then the 
server manager is
+        // usually init'ed in time for the RPC to be made. For now, adding an 
explicit wait() in
+        // the test, waiting for the server manager to become available.
+        final long timeout = TimeUnit.MINUTES.toMillis(10);
+//        assertNotEquals("timeout waiting for server manager to become 
available.",
 
 Review comment:
   Bah. need to delete this commented code.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to