arp7 commented on a change in pull request #1129:
URL: https://github.com/apache/hadoop-ozone/pull/1129#discussion_r447280689
##########
File path:
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
##########
@@ -3040,45 +3024,61 @@ public TermIndex installSnapshot(String leaderId) {
} catch (Exception e) {
LOG.error("Failed to stop/ pause the services. Cannot proceed with " +
"installing the new checkpoint.", e);
+
+ // During stopServices, if KeyManager was stopped successfully and
+ // OMMetadataManager stop failed, we should restart the KeyManager.
+ keyManager.start(configuration);
+
return null;
}
- //TODO: un-pause SM if any failures and retry?
+ File dbBackup;
+ TermIndex termIndex = omRatisServer.getLastAppliedTermIndex();
+ long currentTerm = termIndex.getTerm();
+ long lastAppliedIndex = termIndex.getIndex();
+ boolean loadSuccess = false;
- long lastAppliedIndex = omRatisServer.getLastAppliedTermIndex().getIndex();
+ try {
+ // Check if current applied log index is smaller than the downloaded
+ // checkpoint transaction index. If yes, proceed by stopping the ratis
+ // server so that the OM state can be re-initialized. If no then do not
+ // proceed with installSnapshot.
+ boolean canProceed = OzoneManagerRatisUtils.verifyTransactionInfo(
+ omTransactionInfo, lastAppliedIndex, leaderId, newDBLocation);
+ if (!canProceed) {
+ return null;
+ }
- boolean canProceed =
- OzoneManagerRatisUtils.verifyTransactionInfo(omTransactionInfo,
- lastAppliedIndex, leaderId, newDBlocation);
+ try {
+ dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation,
+ newDBLocation);
+ } catch (Exception e) {
+ LOG.error("OM DB checkpoint replacement with new downloaded " +
+ "checkpoint failed.", e);
+ return null;
+ }
- // If downloaded DB has transaction info less than current one, return.
- if (!canProceed) {
- return null;
+ loadSuccess = true;
+ } finally {
+ if (!loadSuccess) {
Review comment:
Remove this unpause, we already do a reload and unpause below.
##########
File path:
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
##########
@@ -3040,45 +3024,61 @@ public TermIndex installSnapshot(String leaderId) {
} catch (Exception e) {
LOG.error("Failed to stop/ pause the services. Cannot proceed with " +
"installing the new checkpoint.", e);
+
+ // During stopServices, if KeyManager was stopped successfully and
+ // OMMetadataManager stop failed, we should restart the KeyManager.
+ keyManager.start(configuration);
+
return null;
}
- //TODO: un-pause SM if any failures and retry?
+ File dbBackup;
+ TermIndex termIndex = omRatisServer.getLastAppliedTermIndex();
+ long currentTerm = termIndex.getTerm();
+ long lastAppliedIndex = termIndex.getIndex();
+ boolean loadSuccess = false;
- long lastAppliedIndex = omRatisServer.getLastAppliedTermIndex().getIndex();
+ try {
+ // Check if current applied log index is smaller than the downloaded
+ // checkpoint transaction index. If yes, proceed by stopping the ratis
+ // server so that the OM state can be re-initialized. If no then do not
+ // proceed with installSnapshot.
+ boolean canProceed = OzoneManagerRatisUtils.verifyTransactionInfo(
+ omTransactionInfo, lastAppliedIndex, leaderId, newDBLocation);
+ if (!canProceed) {
+ return null;
+ }
- boolean canProceed =
- OzoneManagerRatisUtils.verifyTransactionInfo(omTransactionInfo,
- lastAppliedIndex, leaderId, newDBlocation);
+ try {
+ dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation,
+ newDBLocation);
+ } catch (Exception e) {
+ LOG.error("OM DB checkpoint replacement with new downloaded " +
+ "checkpoint failed.", e);
+ return null;
+ }
- // If downloaded DB has transaction info less than current one, return.
- if (!canProceed) {
- return null;
+ loadSuccess = true;
Review comment:
Don't need `loadSuccess` any more.
##########
File path:
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
##########
@@ -3040,45 +3024,61 @@ public TermIndex installSnapshot(String leaderId) {
} catch (Exception e) {
LOG.error("Failed to stop/ pause the services. Cannot proceed with " +
"installing the new checkpoint.", e);
+
+ // During stopServices, if KeyManager was stopped successfully and
+ // OMMetadataManager stop failed, we should restart the KeyManager.
+ keyManager.start(configuration);
+
return null;
}
- //TODO: un-pause SM if any failures and retry?
+ File dbBackup;
+ TermIndex termIndex = omRatisServer.getLastAppliedTermIndex();
+ long currentTerm = termIndex.getTerm();
+ long lastAppliedIndex = termIndex.getIndex();
+ boolean loadSuccess = false;
- long lastAppliedIndex = omRatisServer.getLastAppliedTermIndex().getIndex();
+ try {
+ // Check if current applied log index is smaller than the downloaded
+ // checkpoint transaction index. If yes, proceed by stopping the ratis
+ // server so that the OM state can be re-initialized. If no then do not
+ // proceed with installSnapshot.
+ boolean canProceed = OzoneManagerRatisUtils.verifyTransactionInfo(
+ omTransactionInfo, lastAppliedIndex, leaderId, newDBLocation);
+ if (!canProceed) {
+ return null;
+ }
- boolean canProceed =
- OzoneManagerRatisUtils.verifyTransactionInfo(omTransactionInfo,
- lastAppliedIndex, leaderId, newDBlocation);
+ try {
+ dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation,
+ newDBLocation);
+ } catch (Exception e) {
+ LOG.error("OM DB checkpoint replacement with new downloaded " +
+ "checkpoint failed.", e);
+ return null;
+ }
- // If downloaded DB has transaction info less than current one, return.
- if (!canProceed) {
- return null;
+ loadSuccess = true;
+ } finally {
+ if (!loadSuccess) {
+ omRatisServer.getOmStateMachine().unpause(lastAppliedIndex,
+ currentTerm);
+ }
}
long leaderIndex = omTransactionInfo.getTransactionIndex();
Review comment:
Move these inside the try block, just after you invoke
`replaceOMDBWithCheckpoint`. Else these variables should be left to values that
reflect the old checkpoint.
##########
File path:
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
##########
@@ -3040,45 +3024,61 @@ public TermIndex installSnapshot(String leaderId) {
} catch (Exception e) {
LOG.error("Failed to stop/ pause the services. Cannot proceed with " +
"installing the new checkpoint.", e);
+
+ // During stopServices, if KeyManager was stopped successfully and
+ // OMMetadataManager stop failed, we should restart the KeyManager.
+ keyManager.start(configuration);
+
return null;
}
- //TODO: un-pause SM if any failures and retry?
+ File dbBackup;
+ TermIndex termIndex = omRatisServer.getLastAppliedTermIndex();
+ long currentTerm = termIndex.getTerm();
+ long lastAppliedIndex = termIndex.getIndex();
+ boolean loadSuccess = false;
- long lastAppliedIndex = omRatisServer.getLastAppliedTermIndex().getIndex();
+ try {
+ // Check if current applied log index is smaller than the downloaded
+ // checkpoint transaction index. If yes, proceed by stopping the ratis
+ // server so that the OM state can be re-initialized. If no then do not
+ // proceed with installSnapshot.
+ boolean canProceed = OzoneManagerRatisUtils.verifyTransactionInfo(
+ omTransactionInfo, lastAppliedIndex, leaderId, newDBLocation);
+ if (!canProceed) {
+ return null;
+ }
- boolean canProceed =
- OzoneManagerRatisUtils.verifyTransactionInfo(omTransactionInfo,
- lastAppliedIndex, leaderId, newDBlocation);
+ try {
+ dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation,
Review comment:
`replaceOMDBWithCheckpoint` contract should be that it either leaves the
current state or the old state. If it leaves inconsistent state then it should
also leave some persistent marker on disk so OM doesn't try to restart with a
bad checkpoint.
##########
File path:
hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
##########
@@ -3040,45 +3024,61 @@ public TermIndex installSnapshot(String leaderId) {
} catch (Exception e) {
LOG.error("Failed to stop/ pause the services. Cannot proceed with " +
"installing the new checkpoint.", e);
+
+ // During stopServices, if KeyManager was stopped successfully and
+ // OMMetadataManager stop failed, we should restart the KeyManager.
+ keyManager.start(configuration);
+
return null;
}
- //TODO: un-pause SM if any failures and retry?
+ File dbBackup;
+ TermIndex termIndex = omRatisServer.getLastAppliedTermIndex();
+ long currentTerm = termIndex.getTerm();
+ long lastAppliedIndex = termIndex.getIndex();
+ boolean loadSuccess = false;
- long lastAppliedIndex = omRatisServer.getLastAppliedTermIndex().getIndex();
+ try {
+ // Check if current applied log index is smaller than the downloaded
+ // checkpoint transaction index. If yes, proceed by stopping the ratis
+ // server so that the OM state can be re-initialized. If no then do not
+ // proceed with installSnapshot.
+ boolean canProceed = OzoneManagerRatisUtils.verifyTransactionInfo(
+ omTransactionInfo, lastAppliedIndex, leaderId, newDBLocation);
+ if (!canProceed) {
+ return null;
+ }
- boolean canProceed =
- OzoneManagerRatisUtils.verifyTransactionInfo(omTransactionInfo,
- lastAppliedIndex, leaderId, newDBlocation);
+ try {
+ dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation,
+ newDBLocation);
+ } catch (Exception e) {
+ LOG.error("OM DB checkpoint replacement with new downloaded " +
+ "checkpoint failed.", e);
+ return null;
+ }
- // If downloaded DB has transaction info less than current one, return.
- if (!canProceed) {
- return null;
+ loadSuccess = true;
+ } finally {
+ if (!loadSuccess) {
+ omRatisServer.getOmStateMachine().unpause(lastAppliedIndex,
+ currentTerm);
+ }
}
long leaderIndex = omTransactionInfo.getTransactionIndex();
long leaderTerm = omTransactionInfo.getCurrentTerm();
-
- File dbBackup;
- try {
- dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation,
- newDBlocation);
- } catch (Exception e) {
- LOG.error("OM DB checkpoint replacement with new downloaded checkpoint "
+
- "failed.", e);
- return null;
- }
-
// Reload the OM DB store with the new checkpoint.
// Restart (unpause) the state machine and update its last applied index
// to the installed checkpoint's snapshot index.
try {
reloadOMState(leaderIndex, leaderTerm);
Review comment:
Before calling reloadState check for existence of temp_marker file. Also
on OM restart.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]