Repository: asterixdb Updated Branches: refs/heads/master 31aacc7be -> bc918f7ff
[NO ISSUE][CLUS] Prevent Possible Deadlock in GlobalRecoveryManager - user model changes: no - storage format changes: no - interface changes: no Details: - Perform global recovery on a different thread to prevent possible deadlocks in ClusterStateManager. - Shutdown the system in case of global recovery failure since the system will be in inconsistent state. Change-Id: I3b0491a84fb24b428d5ce98f392adafc6dfacff9 Reviewed-on: https://asterix-gerrit.ics.uci.edu/2072 Sonar-Qube: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Integration-Tests: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Tested-by: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Contrib: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Reviewed-by: Ian Maxon <ima...@apache.org> Reviewed-by: Michael Blow <mb...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/asterixdb/repo Commit: http://git-wip-us.apache.org/repos/asf/asterixdb/commit/bc918f7f Tree: http://git-wip-us.apache.org/repos/asf/asterixdb/tree/bc918f7f Diff: http://git-wip-us.apache.org/repos/asf/asterixdb/diff/bc918f7f Branch: refs/heads/master Commit: bc918f7ff2ced1af32bf595689406bad7e950721 Parents: 31aacc7 Author: Murtadha Hubail <mhub...@apache.org> Authored: Sat Oct 14 02:45:21 2017 +0300 Committer: Murtadha Hubail <mhub...@apache.org> Committed: Tue Oct 17 03:05:23 2017 -0700 ---------------------------------------------------------------------- .../bootstrap/GlobalRecoveryManager.java | 52 +++++++++++--------- .../common/cluster/IGlobalRecoveryManager.java | 8 +-- .../hyracks/control/nc/NCShutdownHook.java | 1 + 3 files changed, 31 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/asterixdb/blob/bc918f7f/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java b/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java index 13f3afa..d6854b1 100644 --- a/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java +++ b/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java @@ -48,6 +48,8 @@ import org.apache.hyracks.api.client.IHyracksClientConnection; import org.apache.hyracks.api.exceptions.HyracksDataException; import org.apache.hyracks.api.job.JobId; import org.apache.hyracks.api.job.JobSpecification; +import org.apache.hyracks.control.nc.NCShutdownHook; +import org.apache.hyracks.util.ExitUtil; public class GlobalRecoveryManager implements IGlobalRecoveryManager { @@ -56,6 +58,7 @@ public class GlobalRecoveryManager implements IGlobalRecoveryManager { protected final ICCServiceContext serviceCtx; protected IHyracksClientConnection hcc; protected volatile boolean recoveryCompleted; + protected volatile boolean recovering; public GlobalRecoveryManager(ICCServiceContext serviceCtx, IHyracksClientConnection hcc, IStorageComponentProvider componentProvider) { @@ -81,41 +84,42 @@ public class GlobalRecoveryManager implements IGlobalRecoveryManager { } @Override - public void startGlobalRecovery(ICcApplicationContext appCtx) throws HyracksDataException { - if (!recoveryCompleted) { - recover(appCtx); + public void startGlobalRecovery(ICcApplicationContext appCtx) { + if (!recoveryCompleted && !recovering) { + synchronized (this) { + if (!recovering) { + recovering = true; + /** + * Perform recovery on a different thread to avoid deadlocks in + * {@link org.apache.asterix.common.cluster.IClusterStateManager} + */ + serviceCtx.getControllerService().getExecutor().submit(() -> { + try { + recover(appCtx); + } catch (HyracksDataException e) { + LOGGER.log(Level.SEVERE, "Global recovery failed. Shutting down...", e); + ExitUtil.exit(NCShutdownHook.FAILED_TO_RECOVER_EXIT_CODE); + } + }); + } + } } } protected void recover(ICcApplicationContext appCtx) throws HyracksDataException { - LOGGER.info("Starting Global Recovery"); - MetadataTransactionContext mdTxnCtx = null; try { + LOGGER.info("Starting Global Recovery"); MetadataManager.INSTANCE.init(); - mdTxnCtx = MetadataManager.INSTANCE.beginTransaction(); + MetadataTransactionContext mdTxnCtx = MetadataManager.INSTANCE.beginTransaction(); mdTxnCtx = doRecovery(appCtx, mdTxnCtx); MetadataManager.INSTANCE.commitTransaction(mdTxnCtx); + recoveryCompleted = true; + recovering = false; + LOGGER.info("Global Recovery Completed. Refreshing cluster state..."); + appCtx.getClusterStateManager().refreshState(); } catch (Exception e) { - // This needs to be fixed <-- Needs to shutdown the system --> - /* - * Note: Throwing this illegal state exception will terminate this thread - * and feeds listeners will not be notified. - */ - LOGGER.log(Level.SEVERE, "Global recovery was not completed successfully: ", e); - if (mdTxnCtx != null) { - try { - MetadataManager.INSTANCE.abortTransaction(mdTxnCtx); - } catch (Exception e1) { - LOGGER.log(Level.SEVERE, "Exception aborting metadata transaction", e1); - e.addSuppressed(e1); - throw new IllegalStateException(e); - } - } throw HyracksDataException.create(e); } - recoveryCompleted = true; - LOGGER.info("Global Recovery Completed"); - appCtx.getClusterStateManager().refreshState(); } protected MetadataTransactionContext doRecovery(ICcApplicationContext appCtx, MetadataTransactionContext mdTxnCtx) http://git-wip-us.apache.org/repos/asf/asterixdb/blob/bc918f7f/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cluster/IGlobalRecoveryManager.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cluster/IGlobalRecoveryManager.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cluster/IGlobalRecoveryManager.java index b4559c8..a3add90 100644 --- a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cluster/IGlobalRecoveryManager.java +++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cluster/IGlobalRecoveryManager.java @@ -20,19 +20,15 @@ package org.apache.asterix.common.cluster; import org.apache.asterix.common.api.IClusterEventsSubscriber; import org.apache.asterix.common.dataflow.ICcApplicationContext; -import org.apache.hyracks.api.exceptions.HyracksDataException; public interface IGlobalRecoveryManager extends IClusterEventsSubscriber { /** * Starts the global recovery process after the cluster state has changed to ACTIVE. * - * @param appCtx - * the application context - * @throws HyracksDataException - * if the global recovery fails + * @param appCtx the application context */ - void startGlobalRecovery(ICcApplicationContext appCtx) throws HyracksDataException; + void startGlobalRecovery(ICcApplicationContext appCtx); /** * @return true, if global recovery has been completed successfully http://git-wip-us.apache.org/repos/asf/asterixdb/blob/bc918f7f/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NCShutdownHook.java ---------------------------------------------------------------------- diff --git a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NCShutdownHook.java b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NCShutdownHook.java index 0a02635..6308373 100644 --- a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NCShutdownHook.java +++ b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NCShutdownHook.java @@ -31,6 +31,7 @@ import org.apache.hyracks.util.ThreadDumpUtil; public class NCShutdownHook extends Thread { public static final int FAILED_TO_STARTUP_EXIT_CODE = 2; + public static final int FAILED_TO_RECOVER_EXIT_CODE = 3; private static final Logger LOGGER = Logger.getLogger(NCShutdownHook.class.getName()); private static final long SHUTDOWN_WAIT_TIME = 10 * 60 * 1000L; private final Thread watchDog;