This is an automated email from the ASF dual-hosted git repository.
bharat pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git
The following commit(s) were added to refs/heads/master by this push:
new aa04ac0 HDDS-3642. Stop/Pause Background services while replacing OM
DB with checkpoint from Leader (#1002)
aa04ac0 is described below
commit aa04ac0a894e15c98b05b1acef110c6e26bb01dc
Author: Hanisha Koneru <[email protected]>
AuthorDate: Sun Jun 14 00:49:01 2020 -0700
HDDS-3642. Stop/Pause Background services while replacing OM DB with
checkpoint from Leader (#1002)
---
.../hadoop/ozone/om/TestOMRatisSnapshots.java | 7 ++-
.../org/apache/hadoop/ozone/om/OzoneManager.java | 50 +++++++++++++---------
2 files changed, 34 insertions(+), 23 deletions(-)
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java
index cccdb7c..2ddd289 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java
@@ -16,6 +16,7 @@
*/
package org.apache.hadoop.ozone.om;
+import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -172,10 +173,12 @@ public class TestOMRatisSnapshots {
followerOMLastAppliedIndex < leaderOMSnaphsotIndex);
// Install leader OM's db checkpoint on the lagging OM.
+ File oldDbLocation = followerOM.getMetadataManager().getStore()
+ .getDbLocation();
followerOM.getOmRatisServer().getOmStateMachine().pause();
followerOM.getMetadataManager().getStore().close();
- followerOM.replaceOMDBWithCheckpoint(
- leaderOMSnaphsotIndex, leaderDbCheckpoint.getCheckpointLocation());
+ followerOM.replaceOMDBWithCheckpoint(leaderOMSnaphsotIndex, oldDbLocation,
+ leaderDbCheckpoint.getCheckpointLocation());
// Reload the follower OM with new DB checkpoint from the leader OM.
followerOM.reloadOMState(leaderOMSnaphsotIndex, leaderOMSnapshotTermIndex);
diff --git
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
index a791d28..2fde315 100644
---
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
+++
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
@@ -78,7 +78,6 @@ import org.apache.hadoop.hdds.utils.HddsServerUtil;
import org.apache.hadoop.hdds.utils.ProtocolMessageMetrics;
import org.apache.hadoop.hdds.utils.RetriableTask;
import org.apache.hadoop.hdds.utils.db.DBCheckpoint;
-import org.apache.hadoop.hdds.utils.db.DBStore;
import org.apache.hadoop.hdds.utils.db.DBUpdatesWrapper;
import org.apache.hadoop.hdds.utils.db.SequenceNumberNotFoundException;
import org.apache.hadoop.io.Text;
@@ -3038,14 +3037,20 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
return null;
}
- // Pause the State Machine so that no new transactions can be applied.
- // This action also clears the OM Double Buffer so that if there are any
- // pending transactions in the buffer, they are discarded.
- // TODO: The Ratis server should also be paused here. This is required
- // because a leader election might happen while the snapshot
- // installation is in progress and the new leader might start sending
- // append log entries to the ratis server.
- omRatisServer.getOmStateMachine().pause();
+ File oldDBLocation = metadataManager.getStore().getDbLocation();
+ try {
+ // Stop Background services
+ stopServices();
+
+ // Pause the State Machine so that no new transactions can be applied.
+ // This action also clears the OM Double Buffer so that if there are any
+ // pending transactions in the buffer, they are discarded.
+ omRatisServer.getOmStateMachine().pause();
+ } catch (Exception e) {
+ LOG.error("Failed to stop/ pause the services. Cannot proceed with " +
+ "installing the new checkpoint.", e);
+ return null;
+ }
//TODO: un-pause SM if any failures and retry?
@@ -3066,7 +3071,8 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
File dbBackup;
try {
- dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, newDBlocation);
+ dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation,
+ newDBlocation);
} catch (Exception e) {
LOG.error("OM DB checkpoint replacement with new downloaded checkpoint "
+
"failed.", e);
@@ -3116,6 +3122,12 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
return null;
}
+ void stopServices() throws Exception {
+ keyManager.stop();
+ stopSecretManager();
+ metadataManager.stop();
+ }
+
/**
* Replace the current OM DB with the new DB checkpoint.
*
@@ -3124,20 +3136,16 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
* @return location of the backup of the original DB
* @throws Exception
*/
- File replaceOMDBWithCheckpoint(long lastAppliedIndex, Path checkpointPath)
- throws Exception {
- // Stop the DB first
- DBStore store = metadataManager.getStore();
- store.close();
+ File replaceOMDBWithCheckpoint(long lastAppliedIndex, File oldDB,
+ Path checkpointPath) throws Exception {
// Take a backup of the current DB
- File db = store.getDbLocation();
String dbBackupName = OzoneConsts.OM_DB_BACKUP_PREFIX +
lastAppliedIndex + "_" + System.currentTimeMillis();
- File dbBackup = new File(db.getParentFile(), dbBackupName);
+ File dbBackup = new File(oldDB.getParentFile(), dbBackupName);
try {
- Files.move(db.toPath(), dbBackup.toPath());
+ Files.move(oldDB.toPath(), dbBackup.toPath());
} catch (IOException e) {
LOG.error("Failed to create a backup of the current DB. Aborting " +
"snapshot installation.");
@@ -3146,12 +3154,12 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
// Move the new DB checkpoint into the om metadata dir
try {
- Files.move(checkpointPath, db.toPath());
+ Files.move(checkpointPath, oldDB.toPath());
} catch (IOException e) {
LOG.error("Failed to move downloaded DB checkpoint {} to metadata " +
"directory {}. Resetting to original DB.", checkpointPath,
- db.toPath());
- Files.move(dbBackup.toPath(), db.toPath());
+ oldDB.toPath());
+ Files.move(dbBackup.toPath(), oldDB.toPath());
throw e;
}
return dbBackup;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]