This is an automated email from the ASF dual-hosted git repository.
vjasani pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.3 by this push:
new d155f53 HBASE-24327 : Handle shutdown() if master cannot be contacted (#1684)
d155f53 is described below
commit d155f5363e1a87ca0122d4ebc9c0df5e4bb46659
Author: Viraj Jasani <[email protected]>
AuthorDate: Sun May 10 17:09:51 2020 +0530
HBASE-24327 : Handle shutdown() if master cannot be contacted (#1684)
Signed-off-by: Bharath Vissapragada <[email protected]>
---
.../hadoop/hbase/master/TestMasterShutdown.java | 59 +++++++++++++++++-----
1 file changed, 46 insertions(+), 13 deletions(-)
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
index 7b3921e..703455a 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
@@ -21,7 +21,10 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import java.io.IOException;
+import java.time.Duration;
import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterMetrics;
@@ -31,7 +34,8 @@ import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.LocalHBaseCluster;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.StartMiniClusterOption;
-import org.apache.hadoop.hbase.Waiter;
+import org.apache.hadoop.hbase.client.AsyncConnection;
+import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
@@ -129,6 +133,7 @@ public class TestMasterShutdown {
try {
htu = new HBaseTestingUtility(
createMasterShutdownBeforeStartingAnyRegionServerConfiguration());
+ htu.getConfiguration().setInt("hbase.client.retries.number", 3);
// configure a cluster with
final StartMiniClusterOption options = StartMiniClusterOption.builder()
@@ -151,19 +156,47 @@ public class TestMasterShutdown {
hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(),
options.getNumMasters(),
options.getNumRegionServers(), options.getMasterClass(),
options.getRsClass());
final MasterThread masterThread = hbaseCluster.getMasters().get(0);
+
masterThread.start();
- // Switching to master registry exacerbated a race in the master
bootstrap that can result
- // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is
essentially because
- // the server manager in HMaster is not initialized by the time
shutdown() RPC (below) is
- // made to the master. The suspected reason as to why it was uncommon
before HBASE-18095
- // is because the connection creation with ZK registry is so slow that
by then the server
- // manager is usually init'ed in time for the RPC to be made. For now,
adding an explicit
- // wait() in the test, waiting for the server manager to become
available.
- final long timeout = TimeUnit.MINUTES.toMillis(10);
- assertNotEquals("Timeout waiting for server manager to become
available.",
- -1, Waiter.waitFor(htu.getConfiguration(), timeout,
- () -> masterThread.getMaster().getServerManager() != null));
- htu.getConnection().getAdmin().shutdown();
+ final CompletableFuture<Void> shutdownFuture =
CompletableFuture.runAsync(() -> {
+ // Switching to master registry exacerbated a race in the master
bootstrap that can result
+ // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is
essentially because
+ // the server manager in HMaster is not initialized by the time
shutdown() RPC (below) is
+ // made to the master. The suspected reason as to why it was uncommon
before HBASE-18095
+ // is because the connection creation with ZK registry is so slow that
by then the server
+ // manager is usually init'ed in time for the RPC to be made. For now,
adding an explicit
+ // wait() in the test, waiting for the server manager to become
available.
+ final long timeout = TimeUnit.MINUTES.toMillis(10);
+ assertNotEquals("timeout waiting for server manager to become
available.", -1,
+ htu.waitFor(timeout, () ->
masterThread.getMaster().getServerManager() != null));
+
+ // Master has come up far enough that we can terminate it without
creating a zombie.
+ final long result = htu.waitFor(timeout, 1000, () -> {
+ final Configuration conf =
createResponsiveZkConfig(htu.getConfiguration());
+ LOG.debug("Attempting to establish connection.");
+ final CompletableFuture<AsyncConnection> connFuture =
+ ConnectionFactory.createAsyncConnection(conf);
+ try (final AsyncConnection conn = connFuture.join()) {
+ LOG.info("Sending shutdown RPC.");
+ try {
+ conn.getAdmin().shutdown().join();
+ LOG.info("Shutdown RPC sent.");
+ return true;
+ } catch (CompletionException e) {
+ LOG.error("Failure sending shutdown RPC.");
+ }
+ } catch (IOException|CompletionException e) {
+ LOG.error("Failed to establish connection.");
+ } catch (Throwable e) {
+ LOG.error("Something unexpected happened.", e);
+ }
+ return false;
+ });
+ assertNotEquals("Failed to issue shutdown RPC after " +
Duration.ofMillis(timeout),
+ -1, result);
+ });
+
+ shutdownFuture.join();
masterThread.join();
} finally {
if (hbaseCluster != null) {