[hbase] branch branch-2.3 updated: HBASE-24327 : Handle shutdown() if master cannot be contacted (#1684)

vjasani Sun, 10 May 2020 04:46:22 -0700

This is an automated email from the ASF dual-hosted git repository.

vjasani pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/hbase.git



The following commit(s) were added to refs/heads/branch-2.3 by this push:
     new d155f53  HBASE-24327 : Handle shutdown() if master cannot be contacted (#1684)
d155f53 is described below

commit d155f5363e1a87ca0122d4ebc9c0df5e4bb46659
Author: Viraj Jasani <[email protected]>
AuthorDate: Sun May 10 17:09:51 2020 +0530

    HBASE-24327 : Handle shutdown() if master cannot be contacted (#1684)
    
    Signed-off-by: Bharath Vissapragada <[email protected]>
---
 .../hadoop/hbase/master/TestMasterShutdown.java    | 59 +++++++++++++++++-----
 1 file changed, 46 insertions(+), 13 deletions(-)

diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
index 7b3921e..703455a 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
@@ -21,7 +21,10 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertNotNull;
 import java.io.IOException;
+import java.time.Duration;
 import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
 import java.util.concurrent.TimeUnit;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.ClusterMetrics;
@@ -31,7 +34,8 @@ import org.apache.hadoop.hbase.HBaseTestingUtility;
 import org.apache.hadoop.hbase.LocalHBaseCluster;
 import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.StartMiniClusterOption;
-import org.apache.hadoop.hbase.Waiter;
+import org.apache.hadoop.hbase.client.AsyncConnection;
+import org.apache.hadoop.hbase.client.ConnectionFactory;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.testclassification.MasterTests;
 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
@@ -129,6 +133,7 @@ public class TestMasterShutdown {
     try {
       htu =  new HBaseTestingUtility(
         createMasterShutdownBeforeStartingAnyRegionServerConfiguration());
+      htu.getConfiguration().setInt("hbase.client.retries.number", 3);
 
       // configure a cluster with
       final StartMiniClusterOption options = StartMiniClusterOption.builder()
@@ -151,19 +156,47 @@ public class TestMasterShutdown {
       hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), 
options.getNumMasters(),
         options.getNumRegionServers(), options.getMasterClass(), 
options.getRsClass());
       final MasterThread masterThread = hbaseCluster.getMasters().get(0);
+
       masterThread.start();
-      // Switching to master registry exacerbated a race in the master 
bootstrap that can result
-      // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is 
essentially because
-      // the server manager in HMaster is not initialized by the time 
shutdown() RPC (below) is
-      // made to the master. The suspected reason as to why it was uncommon 
before HBASE-18095
-      // is because the connection creation with ZK registry is so slow that 
by then the server
-      // manager is usually init'ed in time for the RPC to be made. For now, 
adding an explicit
-      // wait() in the test, waiting for the server manager to become 
available.
-      final long timeout = TimeUnit.MINUTES.toMillis(10);
-      assertNotEquals("Timeout waiting for server manager to become 
available.",
-        -1, Waiter.waitFor(htu.getConfiguration(), timeout,
-          () -> masterThread.getMaster().getServerManager() != null));
-      htu.getConnection().getAdmin().shutdown();
+      final CompletableFuture<Void> shutdownFuture = 
CompletableFuture.runAsync(() -> {
+        // Switching to master registry exacerbated a race in the master 
bootstrap that can result
+        // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is 
essentially because
+        // the server manager in HMaster is not initialized by the time 
shutdown() RPC (below) is
+        // made to the master. The suspected reason as to why it was uncommon 
before HBASE-18095
+        // is because the connection creation with ZK registry is so slow that 
by then the server
+        // manager is usually init'ed in time for the RPC to be made. For now, 
adding an explicit
+        // wait() in the test, waiting for the server manager to become 
available.
+        final long timeout = TimeUnit.MINUTES.toMillis(10);
+        assertNotEquals("timeout waiting for server manager to become 
available.", -1,
+          htu.waitFor(timeout, () -> 
masterThread.getMaster().getServerManager() != null));
+
+        // Master has come up far enough that we can terminate it without 
creating a zombie.
+        final long result = htu.waitFor(timeout, 1000, () -> {
+          final Configuration conf = 
createResponsiveZkConfig(htu.getConfiguration());
+          LOG.debug("Attempting to establish connection.");
+          final CompletableFuture<AsyncConnection> connFuture =
+            ConnectionFactory.createAsyncConnection(conf);
+          try (final AsyncConnection conn = connFuture.join()) {
+            LOG.info("Sending shutdown RPC.");
+            try {
+              conn.getAdmin().shutdown().join();
+              LOG.info("Shutdown RPC sent.");
+              return true;
+            } catch (CompletionException e) {
+              LOG.error("Failure sending shutdown RPC.");
+            }
+          } catch (IOException|CompletionException e) {
+            LOG.error("Failed to establish connection.");
+          } catch (Throwable e) {
+            LOG.error("Something unexpected happened.", e);
+          }
+          return false;
+        });
+        assertNotEquals("Failed to issue shutdown RPC after " + 
Duration.ofMillis(timeout),
+          -1, result);
+      });
+
+      shutdownFuture.join();
       masterThread.join();
     } finally {
       if (hbaseCluster != null) {

[hbase] branch branch-2.3 updated: HBASE-24327 : Handle shutdown() if master cannot be contacted (#1684)

Reply via email to