This is an automated email from the ASF dual-hosted git repository.

apurtell pushed a commit to branch branch-1
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-1 by this push:
     new e7ff91f  HBASE-21325 Force to terminate regionserver when abort hang in somewhere
e7ff91f is described below

commit e7ff91f35ee9fbf9f66ef6e999a8155368d91753
Author: Guanghao Zhang <zg...@apache.org>
AuthorDate: Fri Oct 19 19:34:04 2018 +0800

    HBASE-21325 Force to terminate regionserver when abort hang in somewhere
    
    Conflicts:
        hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
        hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java
    
    Amending-Author: Andrew Purtell <apurt...@apache.org>
---
 .../hadoop/hbase/regionserver/HRegionServer.java   |  39 +++++-
 .../regionserver/TestRegionServerAbortTimeout.java | 137 +++++++++++++++++++++
 2 files changed, 175 insertions(+), 1 deletion(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 6e5ce80..5f250c0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -51,6 +51,8 @@ import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
 import java.util.SortedMap;
+import java.util.Timer;
+import java.util.TimerTask;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
@@ -312,6 +314,11 @@ public class HRegionServer extends HasThread implements
   // Go down hard. Used if file system becomes unavailable and also in
   // debugging and unit tests.
   private volatile boolean abortRequested;
+  public static final String ABORT_TIMEOUT = 
"hbase.regionserver.abort.timeout";
+  // Default abort timeout in milliseconds: 1200 seconds (20 minutes), chosen conservatively
+  private static final long DEFAULT_ABORT_TIMEOUT = 1200000;
+  // Config key naming the TimerTask class to run when the abort times out
+  public static final String ABORT_TIMEOUT_TASK = 
"hbase.regionserver.abort.timeout.task";
 
   ConcurrentMap<String, Integer> rowlocks = new ConcurrentHashMap<String, 
Integer>();
 
@@ -1041,12 +1048,31 @@ public class HRegionServer extends HasThread implements
         abort(prefix + t.getMessage(), t);
       }
     }
+
     // Run shutdown.
     if (mxBean != null) {
       MBeanUtil.unregisterMBean(mxBean);
       mxBean = null;
     }
-    if (this.leases != null) this.leases.closeAfterLeasesExpire();
+
+    if (abortRequested) {
+      Timer abortMonitor = new Timer("Abort regionserver monitor", true);
+      TimerTask abortTimeoutTask = null;
+      try {
+        abortTimeoutTask =
+            Class.forName(conf.get(ABORT_TIMEOUT_TASK, 
SystemExitWhenAbortTimeout.class.getName()))
+                
.asSubclass(TimerTask.class).getDeclaredConstructor().newInstance();
+      } catch (Exception e) {
+        LOG.warn("Initialize abort timeout task failed", e);
+      }
+      if (abortTimeoutTask != null) {
+        abortMonitor.schedule(abortTimeoutTask, conf.getLong(ABORT_TIMEOUT, 
DEFAULT_ABORT_TIMEOUT));
+      }
+    }
+
+    if (this.leases != null) {
+      this.leases.closeAfterLeasesExpire();
+    }
     if (this.splitLogWorker != null) {
       splitLogWorker.stop();
     }
@@ -3552,4 +3578,15 @@ public class HRegionServer extends HasThread implements
   public void unassign(byte[] regionName) throws IOException {
     clusterConnection.getAdmin().unassign(regionName, false);
   }
+
+  /**
+   * Timer task that forcibly exits the JVM with status 1 when the region server abort times out.
+   */
+  private static class SystemExitWhenAbortTimeout extends TimerTask {
+    @Override
+    public void run() {
+      LOG.warn("Aborting region server timed out, terminate forcibly...");
+      System.exit(1);
+    }
+  }
 }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java
new file mode 100644
index 0000000..ed129c5
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.util.TimerTask;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
+import org.apache.hadoop.hbase.coprocessor.ObserverContext;
+import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.testclassification.RegionServerTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.Threads;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Category({ RegionServerTests.class, MediumTests.class })
+public class TestRegionServerAbortTimeout {
+  private static final Logger LOG = 
LoggerFactory.getLogger(TestRegionServerAbortTimeout.class);
+
+  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+
+  private static TableName TABLE_NAME = TableName.valueOf("RSAbort");
+
+  private static byte[] CF = Bytes.toBytes("cf");
+
+  private static byte[] CQ = Bytes.toBytes("cq");
+
+  private static final int REGIONS_NUM = 5;
+
+  private static final int SLEEP_TIME_WHEN_CLOSE_REGION = 1000;
+
+  private static volatile boolean abortTimeoutTaskScheduled = false;
+
+  @BeforeClass
+  public static void setUp() throws Exception {
+    Configuration conf = UTIL.getConfiguration();
+    // Set the abort timeout to SLEEP_TIME_WHEN_CLOSE_REGION ms and plug in the test timeout task
+    conf.setLong(HRegionServer.ABORT_TIMEOUT, SLEEP_TIME_WHEN_CLOSE_REGION);
+    conf.set(HRegionServer.ABORT_TIMEOUT_TASK, 
TestAbortTimeoutTask.class.getName());
+    UTIL.startMiniCluster(2);
+    HTableDescriptor td = new HTableDescriptor(TABLE_NAME);
+    td.addCoprocessor(SleepWhenCloseCoprocessor.class.getName());
+    td.addFamily(new HColumnDescriptor(CF));
+    UTIL.getHBaseAdmin().createTable(td, Bytes.toBytes("0"), 
Bytes.toBytes("9"), REGIONS_NUM);
+  }
+
+  @AfterClass
+  public static void tearDown() throws Exception {
+    UTIL.getHBaseAdmin().disableTable(TABLE_NAME);
+    UTIL.getHBaseAdmin().deleteTable(TABLE_NAME);
+    UTIL.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testAbortTimeout() throws Exception {
+    Thread writer = new Thread() {
+      public void run() {
+        try {
+          try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
+            for (int i = 0; i < 10000; i++) {
+              table.put(new Put(Bytes.toBytes(i)).addColumn(CF, CQ, 
Bytes.toBytes(i)));
+            }
+          }
+        } catch (IOException e) {
+          LOG.warn("Failed to load data");
+        }
+      }
+    };
+    writer.setDaemon(true);
+    writer.start();
+
+    // Abort the region server at index 0 of the mini cluster
+    UTIL.getMiniHBaseCluster().getRegionServer(0).abort("Abort RS for test");
+
+    long startTime = System.currentTimeMillis();
+    long timeout = REGIONS_NUM * SLEEP_TIME_WHEN_CLOSE_REGION * 10;
+    while (System.currentTimeMillis() - startTime < timeout) {
+      if (UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == 1) 
{
+        assertTrue("Abort timer task should be scheduled", 
abortTimeoutTaskScheduled);
+        return;
+      }
+      Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION);
+    }
+    fail("Failed to abort a region server in " + timeout + " ms");
+  }
+
+  static class TestAbortTimeoutTask extends TimerTask {
+
+    public TestAbortTimeoutTask() {
+    }
+
+    @Override
+    public void run() {
+      LOG.info("TestAbortTimeoutTask was scheduled");
+      abortTimeoutTaskScheduled = true;
+    }
+  }
+
+  public static class SleepWhenCloseCoprocessor extends BaseRegionObserver {
+    @Override
+    public void preClose(ObserverContext<RegionCoprocessorEnvironment> c, 
boolean abortRequested)
+        throws IOException {
+      Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION);
+    }
+  }
+}

Reply via email to