This is an automated email from the ASF dual-hosted git repository.

pankajkumar pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new 82a63ab  HBASE-25447 remoteProc is suspended due to OOM ERROR (#2824)
82a63ab is described below

commit 82a63abc8a5ae2c366d47800de0f97d328844020
Author: Bo Cui <[email protected]>
AuthorDate: Mon Jan 4 23:34:38 2021 +0800

    HBASE-25447 remoteProc is suspended due to OOM ERROR (#2824)
    
    Some OOMEs do not cause the JVM to exit, such as "java.lang.OutOfMemoryError: 
Direct buffer memory" and "java.lang.OutOfMemoryError: unable to create new native 
thread", as they don't call vmError#next_OnError_command. So abort the HMaster when 
an uncaught exception occurs in TimeoutExecutor; the new active HMaster will then 
resume the suspended procedure.
    
    Signed-off-by: Duo Zhang <[email protected]>
    Signed-off-by: stack <[email protected]>
    Signed-off-by: Pankaj Kumar<[email protected]>
    (cherry picked from commit 600be60a4bd4d3b3e9652027a0cb8bdd32016c6b)
    (cherry picked from commit ae77f81e7eaf07e9943688f10f7f7e14211151c2)
---
 .../apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java | 4 ++++
 .../hadoop/hbase/master/procedure/RSProcedureDispatcher.java      | 8 ++++++++
 2 files changed, 12 insertions(+)

diff --git 
a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
 
b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
index 71d55ea..8a5a19e 100644
--- 
a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
+++ 
b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
@@ -106,6 +106,10 @@ public abstract class RemoteProcedureDispatcher<TEnv, 
TRemote extends Comparable
     return true;
   }
 
+  protected void 
setTimeoutExecutorUncaughtExceptionHandler(UncaughtExceptionHandler eh) {
+    timeoutExecutor.setUncaughtExceptionHandler(eh);
+  }
+
   public boolean stop() {
     if (!running.getAndSet(false)) {
       return false;
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
index 17e5522..3c554c4 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
@@ -94,6 +94,7 @@ public class RSProcedureDispatcher
     if (!super.start()) {
       return false;
     }
+    setTimeoutExecutorUncaughtExceptionHandler(this::abort);
     if (master.isStopped()) {
       LOG.debug("Stopped");
       return false;
@@ -126,6 +127,13 @@ public class RSProcedureDispatcher
     return true;
   }
 
+  private void abort(Thread t, Throwable e) {
+    LOG.error("Caught error", e);
+    if (!master.isStopped() && !master.isStopping() && !master.isAborted()) {
+      master.abort("Aborting master", e);
+    }
+  }
+
   @Override
   public boolean stop() {
     if (!super.stop()) {

Reply via email to