abdullah alamoudi has submitted this change and it was merged. Change subject: [NO ISSUE][ING] Fix race between active recovery and rebalance ......................................................................
[NO ISSUE][ING] Fix race between active recovery and rebalance - user model changes: no - storage format changes: no - interface changes: no Details: - In certain cases, a rebalance active suspend starts before recovery of an active job starts. - When that happens, sometimes, the recovery task exists and the active job is not resumed after rebalance. Change-Id: I66edb73950bb82baa1a1dfd892cb4b23bb7046be Reviewed-on: https://asterix-gerrit.ics.uci.edu/2950 Sonar-Qube: Jenkins <[email protected]> Tested-by: Jenkins <[email protected]> Integration-Tests: Jenkins <[email protected]> Reviewed-by: abdullah alamoudi <[email protected]> --- M asterixdb/asterix-app/src/main/java/org/apache/asterix/app/active/RecoveryTask.java 1 file changed, 36 insertions(+), 45 deletions(-) Approvals: Anon. E. Moose #1000171: abdullah alamoudi: Looks good to me, approved Jenkins: Verified; No violations found; Verified diff --git a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/active/RecoveryTask.java b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/active/RecoveryTask.java index 5d722e7..0172b28 100644 --- a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/active/RecoveryTask.java +++ b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/active/RecoveryTask.java @@ -47,7 +47,6 @@ private final IRetryPolicyFactory retryPolicyFactory; private final MetadataProvider metadataProvider; private final IClusterStateManager clusterStateManager; - private Exception failure; public RecoveryTask(ICcApplicationContext appCtx, ActiveEntityEventsListener listener, IRetryPolicyFactory retryPolicyFactory) { @@ -105,50 +104,46 @@ } } - protected Void doRecover(IRetryPolicy policy) - throws AlgebricksException, HyracksDataException, InterruptedException { + protected Void doRecover(IRetryPolicy policy) throws AlgebricksException, InterruptedException { LOGGER.log(level, "Actual Recovery task has started"); - if (listener.getState() != ActivityState.TEMPORARILY_FAILED) { - LOGGER.log(level, "but its state is not temp failure and so we're just returning"); - return null; - } - LOGGER.log(level, "calling the policy"); + Exception failure = null; while (policy.retry(failure)) { synchronized (listener) { - if (cancelRecovery) { - return null; - } - while (clusterStateManager.getState() != ClusterState.ACTIVE) { - if (cancelRecovery) { - return null; - } + while (!cancelRecovery && clusterStateManager.getState() != ClusterState.ACTIVE) { listener.wait(); + } + if (cancelRecovery) { + LOGGER.log(level, "Recovery has been cancelled"); + return null; } } IMetadataLockManager lockManager = metadataProvider.getApplicationContext().getMetadataLockManager(); - lockManager.acquireActiveEntityWriteLock(metadataProvider.getLocks(), - listener.getEntityId().getDataverse() + '.' + listener.getEntityId().getEntityName()); - for (Dataset dataset : listener.getDatasets()) { - lockManager.acquireDataverseReadLock(metadataProvider.getLocks(), dataset.getDataverseName()); - lockManager.acquireDatasetExclusiveModificationLock(metadataProvider.getLocks(), - DatasetUtil.getFullyQualifiedName(dataset)); - } - synchronized (listener) { - try { - if (cancelRecovery) { - return null; - } - listener.setState(ActivityState.RECOVERING); - listener.doStart(metadataProvider); - return null; - } catch (Exception e) { - LOGGER.log(level, "Attempt to revive " + listener.getEntityId() + " failed", e); - listener.setState(ActivityState.TEMPORARILY_FAILED); - failure = e; - } finally { - metadataProvider.getLocks().reset(); + try { + lockManager.acquireActiveEntityWriteLock(metadataProvider.getLocks(), + listener.getEntityId().getDataverse() + '.' + listener.getEntityId().getEntityName()); + for (Dataset dataset : listener.getDatasets()) { + lockManager.acquireDataverseReadLock(metadataProvider.getLocks(), dataset.getDataverseName()); + lockManager.acquireDatasetExclusiveModificationLock(metadataProvider.getLocks(), + DatasetUtil.getFullyQualifiedName(dataset)); } - listener.notifyAll(); + synchronized (listener) { + try { + if (!cancelRecovery && listener.getState() == ActivityState.TEMPORARILY_FAILED) { + listener.setState(ActivityState.RECOVERING); + listener.doStart(metadataProvider); + } + LOGGER.log(level, "Recovery completed successfully"); + return null; + } finally { + listener.notifyAll(); + } + } + } catch (Exception e) { + LOGGER.log(level, "Attempt to revive " + listener.getEntityId() + " failed", e); + listener.setState(ActivityState.TEMPORARILY_FAILED); + failure = e; + } finally { + metadataProvider.getLocks().reset(); } } // Recovery task is essntially over now either through failure or through cancellation(stop) @@ -160,6 +155,9 @@ // 1. set the state to permanent failure. // 2. set the entity to not running to avoid auto recovery attempt && listener.getState() != ActivityState.SUSPENDED) { + LOGGER.log(level, "Recovery is cancelled because the current state {} is neither {} nor {}", + listener.getState(), ActivityState.TEMPORARILY_FAILED, + listener.getState() != ActivityState.SUSPENDED); return null; } } @@ -172,10 +170,7 @@ DatasetUtil.getFullyQualifiedName(dataset)); } synchronized (listener) { - if (cancelRecovery) { - return null; - } - if (listener.getState() == ActivityState.TEMPORARILY_FAILED) { + if (!cancelRecovery && listener.getState() == ActivityState.TEMPORARILY_FAILED) { LOGGER.warn("Recovery for {} permanently failed", listener.getEntityId()); listener.setState(ActivityState.STOPPED); listener.setRunning(metadataProvider, false); @@ -186,9 +181,5 @@ metadataProvider.getLocks().reset(); } return null; - } - - public Exception getFailure() { - return failure; } } -- To view, visit https://asterix-gerrit.ics.uci.edu/2950 To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-MessageType: merged Gerrit-Change-Id: I66edb73950bb82baa1a1dfd892cb4b23bb7046be Gerrit-PatchSet: 6 Gerrit-Project: asterixdb Gerrit-Branch: stabilization-f69489 Gerrit-Owner: abdullah alamoudi <[email protected]> Gerrit-Reviewer: Anon. E. Moose #1000171 Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Michael Blow <[email protected]> Gerrit-Reviewer: Murtadha Hubail <[email protected]> Gerrit-Reviewer: Till Westmann <[email protected]> Gerrit-Reviewer: abdullah alamoudi <[email protected]>
