This is an automated email from the ASF dual-hosted git repository.

siyao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 57d8a4144a HDDS-8832. [Snapshot] Synchronized add snapshotInfo to 
snapshot chain and snapshot cache (#4909)
57d8a4144a is described below

commit 57d8a4144a2668e104f9ffeebb9e80134a3930da
Author: Hemant Kumar <[email protected]>
AuthorDate: Thu Jun 15 22:57:54 2023 -0700

    HDDS-8832. [Snapshot] Synchronized add snapshotInfo to snapshot chain and 
snapshot cache (#4909)
---
 .../hadoop/ozone/om/SnapshotChainManager.java      |  30 +++---
 .../request/snapshot/OMSnapshotCreateRequest.java  | 106 ++++++++++++++-------
 .../apache/hadoop/ozone/om/TestSnapshotChain.java  |   2 +-
 3 files changed, 81 insertions(+), 57 deletions(-)

diff --git 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
index 0e343631f6..c147c6fdc3 100644
--- 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
+++ 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
@@ -16,7 +16,6 @@
  */
 package org.apache.hadoop.ozone.om;
 
-import com.google.common.annotations.VisibleForTesting;
 import org.apache.hadoop.hdds.utils.db.Table;
 import org.apache.hadoop.hdds.utils.db.TableIterator;
 import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
@@ -56,8 +55,7 @@ public class SnapshotChainManager {
   private final ConcurrentMap<UUID, String> snapshotIdToTableKey;
   private UUID latestGlobalSnapshotId;
 
-  public SnapshotChainManager(OMMetadataManager metadataManager)
-      throws IOException {
+  public SnapshotChainManager(OMMetadataManager metadataManager) {
     globalSnapshotChain = Collections.synchronizedMap(new LinkedHashMap<>());
     snapshotChainByPath = new ConcurrentHashMap<>();
     latestSnapshotIdByPath = new ConcurrentHashMap<>();
@@ -230,8 +228,7 @@ public class SnapshotChainManager {
   /**
    * Loads the snapshot chain from SnapshotInfo table.
    */
-  private void loadFromSnapshotInfoTable(OMMetadataManager metadataManager)
-      throws IOException {
+  private void loadFromSnapshotInfoTable(OMMetadataManager metadataManager) {
     // read from snapshotInfo table to populate
     // snapshot chains - both global and local path
     try (TableIterator<String, ? extends Table.KeyValue<String, SnapshotInfo>>
@@ -250,17 +247,22 @@ public class SnapshotChainManager {
       for (SnapshotInfo snapshotInfo : snaps.values()) {
         addSnapshot(snapshotInfo);
       }
+    } catch (IOException ioException) {
+      // TODO: [SNAPSHOT] Fail gracefully.
+      throw new RuntimeException(ioException);
     }
   }
 
   /**
    * Add snapshot to snapshot chain.
    */
-  public void addSnapshot(SnapshotInfo snapshotInfo) throws IOException {
+  public synchronized void addSnapshot(SnapshotInfo snapshotInfo)
+      throws IOException {
     addSnapshotGlobal(snapshotInfo.getSnapshotId(),
         snapshotInfo.getGlobalPreviousSnapshotId());
     addSnapshotPath(snapshotInfo.getSnapshotPath(),
-        snapshotInfo.getSnapshotId(), 
snapshotInfo.getPathPreviousSnapshotId());
+        snapshotInfo.getSnapshotId(),
+        snapshotInfo.getPathPreviousSnapshotId());
     // store snapshot ID to snapshot DB table key in the map
     snapshotIdToTableKey.put(snapshotInfo.getSnapshotId(),
         snapshotInfo.getTableKey());
@@ -269,7 +271,8 @@ public class SnapshotChainManager {
   /**
    * Delete snapshot from snapshot chain.
    */
-  public boolean deleteSnapshot(SnapshotInfo snapshotInfo) throws IOException {
+  public synchronized boolean deleteSnapshot(SnapshotInfo snapshotInfo)
+      throws IOException {
     boolean status = deleteSnapshotGlobal(snapshotInfo.getSnapshotId()) &&
         deleteSnapshotPath(snapshotInfo.getSnapshotPath(),
             snapshotInfo.getSnapshotId());
@@ -422,15 +425,4 @@ public class SnapshotChainManager {
   public String getTableKey(UUID snapshotId) {
     return snapshotIdToTableKey.get(snapshotId);
   }
-
-  @VisibleForTesting
-  void loadSnapshotInfo(OMMetadataManager metadataManager) throws IOException {
-    loadFromSnapshotInfoTable(metadataManager);
-  }
-
-  @VisibleForTesting
-  public LinkedHashMap<UUID, SnapshotChainInfo> getSnapshotChainPath(
-      String path) {
-    return snapshotChainByPath.get(path);
-  }
 }
diff --git 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
index 5903edb520..ebe1dbc80a 100644
--- 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
+++ 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
@@ -50,7 +50,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
-import java.util.Objects;
 import java.util.UUID;
 
 import static org.apache.hadoop.hdds.HddsUtils.fromProtobuf;
@@ -129,8 +128,6 @@ public class OMSnapshotCreateRequest extends 
OMClientRequest {
     IOException exception = null;
     OmMetadataManagerImpl omMetadataManager = (OmMetadataManagerImpl)
         ozoneManager.getMetadataManager();
-    SnapshotChainManager snapshotChainManager =
-        omMetadataManager.getSnapshotChainManager();
 
     OMResponse.Builder omResponse = OmResponseUtil.getOMResponseBuilder(
         getOmRequest());
@@ -157,26 +154,14 @@ public class OMSnapshotCreateRequest extends 
OMClientRequest {
       }
 
       // Note down RDB latest transaction sequence number, which is used
-      // as snapshot generation in the differ.
+      // as snapshot generation in the Differ.
       final long dbLatestSequenceNumber =
           ((RDBStore) omMetadataManager.getStore()).getDb()
               .getLatestSequenceNumber();
       snapshotInfo.setDbTxSequenceNumber(dbLatestSequenceNumber);
 
-      // Set previous path and global snapshot
-      UUID latestPathSnapshot =
-          snapshotChainManager.getLatestPathSnapshotId(snapshotPath);
-      UUID latestGlobalSnapshot =
-          snapshotChainManager.getLatestGlobalSnapshotId();
-
-      snapshotInfo.setPathPreviousSnapshotId(latestPathSnapshot);
-      snapshotInfo.setGlobalPreviousSnapshotId(latestGlobalSnapshot);
-
-      snapshotChainManager.addSnapshot(snapshotInfo);
-
-      omMetadataManager.getSnapshotInfoTable()
-          .addCacheEntry(new CacheKey<>(key),
-            CacheValue.get(transactionLogIndex, snapshotInfo));
+      addSnapshotInfoToSnapshotChainAndCache(omMetadataManager,
+          transactionLogIndex);
 
       omResponse.setCreateSnapshotResponse(
           CreateSnapshotResponse.newBuilder()
@@ -184,21 +169,6 @@ public class OMSnapshotCreateRequest extends 
OMClientRequest {
       omClientResponse = new OMSnapshotCreateResponse(
           omResponse.build(), snapshotInfo);
     } catch (IOException ex) {
-      // Remove snapshot from the SnapshotChainManager in case of any failure.
-      // It is possible that createSnapshot request fails after snapshot gets
-      // added to snapshot chain manager because couldn't add it to cache/DB.
-      // In that scenario, SnapshotChainManager#globalSnapshotId will point to
-      // failed createSnapshot request's snapshotId but in actual it doesn't
-      // exist in the SnapshotInfo table.
-      // If it doesn't get removed, OM restart will crash on
-      // SnapshotChainManager#loadFromSnapshotInfoTable because it could not
-      // find the previous snapshot which doesn't exist because it was never
-      // added to the SnapshotInfo table.
-      if (Objects.equals(snapshotInfo.getSnapshotId(),
-          snapshotChainManager.getLatestGlobalSnapshotId())) {
-        removeSnapshotInfoFromSnapshotChainManager(snapshotChainManager,
-            snapshotInfo);
-      }
       exception = ex;
       omClientResponse = new OMSnapshotCreateResponse(
           createErrorOMResponse(omResponse, exception));
@@ -220,17 +190,79 @@ public class OMSnapshotCreateRequest extends 
OMClientRequest {
         snapshotInfo.toAuditMap(), exception, userInfo));
     
     if (exception == null) {
-      LOG.info("Created snapshot '{}' under path '{}'",
-          snapshotName, snapshotPath);
+      LOG.info("Created snapshot: '{}' with snapshotId: '{}' under path '{}'",
+          snapshotName, snapshotInfo.getSnapshotId(), snapshotPath);
       omMetrics.incNumSnapshotActive();
     } else {
       omMetrics.incNumSnapshotCreateFails();
-      LOG.error("Failed to create snapshot '{}' under path '{}'",
-          snapshotName, snapshotPath);
+      LOG.error("Failed to create snapshot '{}' with snapshotId: '{}' under " +
+              "path '{}'",
+          snapshotName, snapshotInfo.getSnapshotId(), snapshotPath);
     }
     return omClientResponse;
   }
 
+  /**
+   * Add snapshot to snapshot chain and snapshot cache as a single atomic
+   * operation.
+   * If it is not done as a single atomic operation, can cause data corruption
+   * if there is any failure in adding snapshot entry to cache.
+   * For example, when two threads create snapshots on different
+   * buckets.
+   * T-1: Thread-1 adds the snapshot-1 to chain first. Previous snapshot for
+   * snapshot-1 would be null (let's assume first snapshot)
+   * T-2: Thread-2 adds snapshot-2, previous would be snapshot-1
+   * T-3: Thread-1 tries to update cache with snapshot-1 and fails.
+   * T-4: Thread-2 tries to update cache with snapshot-2 and succeeds.
+   * T-5: Thread-1 removes the snapshot from chain because it failed to
+   * update the cache.
+   * Now, snapshot-2's previous is snapshot-1 but, it doesn't exist because
+   * it was removed at T-5.
+   */
+  private void addSnapshotInfoToSnapshotChainAndCache(
+      OmMetadataManagerImpl omMetadataManager,
+      long transactionLogIndex
+  ) throws IOException {
+    // It is synchronized on SnapshotChainManager object so that this block is
+    // synchronized with OMSnapshotPurgeResponse#cleanupSnapshotChain and only
+    // one of these two operation gets executed at a time otherwise we could be
+    // in similar situation explained above if snapshot gets deleted.
+    synchronized (omMetadataManager.getSnapshotChainManager()) {
+      SnapshotChainManager snapshotChainManager =
+          omMetadataManager.getSnapshotChainManager();
+
+      try {
+        UUID latestPathSnapshot =
+            snapshotChainManager.getLatestPathSnapshotId(snapshotPath);
+        UUID latestGlobalSnapshot =
+            snapshotChainManager.getLatestGlobalSnapshotId();
+
+        snapshotInfo.setPathPreviousSnapshotId(latestPathSnapshot);
+        snapshotInfo.setGlobalPreviousSnapshotId(latestGlobalSnapshot);
+
+        snapshotChainManager.addSnapshot(snapshotInfo);
+
+        omMetadataManager.getSnapshotInfoTable()
+            .addCacheEntry(new CacheKey<>(snapshotInfo.getTableKey()),
+                CacheValue.get(transactionLogIndex, snapshotInfo));
+      } catch (IOException ioException) {
+        // Remove snapshot from the SnapshotChainManager in case of any 
failure.
+        // It is possible that createSnapshot request fails after snapshot gets
+        // added to snapshot chain manager because couldn't add it to cache/DB.
+        // In that scenario, SnapshotChainManager#globalSnapshotId will point 
to
+        // failed createSnapshot request's snapshotId but in actual it doesn't
+        // exist in the SnapshotInfo table.
+        // If it doesn't get removed, OM restart will crash on
+        // SnapshotChainManager#loadFromSnapshotInfoTable because it could not
+        // find the previous snapshot which doesn't exist because it was never
+        // added to the SnapshotInfo table.
+        removeSnapshotInfoFromSnapshotChainManager(snapshotChainManager,
+            snapshotInfo);
+        throw ioException;
+      }
+    }
+  }
+
   /**
    * Removes the snapshot from the SnapshotChainManager.
    * In case of any failure, it logs the exception as an error and swallow it.
diff --git 
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
 
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
index 4a63eb6887..6613ffeb7e 100644
--- 
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
+++ 
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
@@ -261,7 +261,7 @@ public class TestSnapshotChain {
       prevSnapshotID = snapshotID;
     }
 
-    chainManager.loadSnapshotInfo(omMetadataManager);
+    chainManager = new SnapshotChainManager(omMetadataManager);
     // check if snapshots loaded correctly from snapshotInfoTable
     assertEquals(snapshotID2, chainManager.getLatestGlobalSnapshotId());
     assertEquals(snapshotID2, chainManager.nextGlobalSnapshot(snapshotID1));


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to