This is an automated email from the ASF dual-hosted git repository.
pankajkumar pushed a commit to branch branch-2.2
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.2 by this push:
new 3352d84 HBASE-25447 remoteProc is suspended due to OOM ERROR (#2824)
3352d84 is described below
commit 3352d84811b98cd1b254718f029560f0e3b88688
Author: Bo Cui <[email protected]>
AuthorDate: Mon Jan 4 23:34:38 2021 +0800
HBASE-25447 remoteProc is suspended due to OOM ERROR (#2824)
Some OOMEs do not cause the JVM to exit, e.g. "java.lang.OutOfMemoryError:
Direct buffer memory" and "java.lang.OutOfMemoryError: unable to create new native
thread", as they don't call vmError#next_OnError_command. So abort the HMaster when
an uncaught exception occurs in TimeoutExecutor; the new active HMaster will
resume the suspended procedure.
Signed-off-by: Duo Zhang <[email protected]>
Signed-off-by: stack <[email protected]>
Signed-off-by: Pankaj Kumar <[email protected]>
(cherry picked from commit 600be60a4bd4d3b3e9652027a0cb8bdd32016c6b)
(cherry picked from commit ae77f81e7eaf07e9943688f10f7f7e14211151c2)
---
.../apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java | 4 ++++
.../hadoop/hbase/master/procedure/RSProcedureDispatcher.java | 8 ++++++++
2 files changed, 12 insertions(+)
diff --git
a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
index ecb4ffc..fadf0fb 100644
---
a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
+++
b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
@@ -105,6 +105,10 @@ public abstract class RemoteProcedureDispatcher<TEnv,
TRemote extends Comparable
return true;
}
+ protected void
setTimeoutExecutorUncaughtExceptionHandler(UncaughtExceptionHandler eh) {
+ timeoutExecutor.setUncaughtExceptionHandler(eh);
+ }
+
public boolean stop() {
if (!running.getAndSet(false)) {
return false;
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
index d4670ae..8a49c6a 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
@@ -95,6 +95,7 @@ public class RSProcedureDispatcher
if (!super.start()) {
return false;
}
+ setTimeoutExecutorUncaughtExceptionHandler(this::abort);
if (master.isStopped()) {
LOG.debug("Stopped");
return false;
@@ -127,6 +128,13 @@ public class RSProcedureDispatcher
return true;
}
+ private void abort(Thread t, Throwable e) {
+ LOG.error("Caught error", e);
+ if (!master.isStopped() && !master.isStopping() && !master.isAborted()) {
+ master.abort("Aborting master", e);
+ }
+ }
+
@Override
public boolean stop() {
if (!super.stop()) {