This is an automated email from the ASF dual-hosted git repository.

umamahesh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new d088d199bf HDDS-6982. EC: Attempt to cleanup the RECOVERING container 
when reconstruction failed at coordinator. (#3583)
d088d199bf is described below

commit d088d199bf506d76f7046a4b19b690e2a9e6915e
Author: Uma Maheswara Rao G <[email protected]>
AuthorDate: Sat Jul 9 21:45:52 2022 -0700

    HDDS-6982. EC: Attempt to cleanup the RECOVERING container when 
reconstruction failed at coordinator. (#3583)
---
 .../reconstruction/ECContainerOperationClient.java | 25 +++++++++
 .../ECReconstructionCoordinator.java               | 61 ++++++++++++++++------
 .../ozone/container/keyvalue/KeyValueHandler.java  |  8 ++-
 3 files changed, 76 insertions(+), 18 deletions(-)

diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
index adebe11776..43a96cb524 100644
--- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
@@ -111,6 +111,31 @@ public class ECContainerOperationClient implements 
Closeable {
     }
   }
 
+  public void deleteRecoveringContainer(long containerID, DatanodeDetails dn,
+      ECReplicationConfig repConfig, String encodedToken) throws IOException {
+    XceiverClientSpi xceiverClient = this.xceiverClientManager
+        .acquireClient(singleNodePipeline(dn, repConfig));
+    try {
+      // Before deleting the recovering container, make sure that its state is
+      // RECOVERING. A race condition is still possible, but this check avoids
+      // the most common case.
+      ContainerProtos.ReadContainerResponseProto readContainerResponseProto =
+          ContainerProtocolCalls
+              .readContainer(xceiverClient, containerID, encodedToken);
+      if (readContainerResponseProto
+          .getContainerData()
+          .getState() == ContainerProtos.ContainerDataProto.State.RECOVERING) {
+        ContainerProtocolCalls
+            .deleteContainer(xceiverClient, containerID, true, encodedToken);
+      } else {
+        LOG.warn("Container will not be deleted as it is not a recovering"
+            + " container {}", containerID);
+      }
+    } finally {
+      this.xceiverClientManager.releaseClient(xceiverClient, false);
+    }
+  }
+
   public void createRecoveringContainer(long containerID, DatanodeDetails dn,
       ECReplicationConfig repConfig, String encodedToken, int replicaIndex)
       throws IOException {
diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
index 9b00596d58..b0f33e56a7 100644
--- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
@@ -135,25 +135,52 @@ public class ECReconstructionCoordinator implements 
Closeable {
 
     // 1. create target recovering containers.
     String containerToken = encode(tokenHelper.getContainerToken(cid));
-    for (Map.Entry<Integer, DatanodeDetails> indexDnPair : targetNodeMap
-        .entrySet()) {
-      DatanodeDetails dn = indexDnPair.getValue();
-      Integer index = indexDnPair.getKey();
-      containerOperationClient.createRecoveringContainer(containerID, dn,
-          repConfig, containerToken, index);
-    }
+    List<DatanodeDetails> recoveringContainersCreatedDNs = new ArrayList<>();
+    try {
+      for (Map.Entry<Integer, DatanodeDetails> indexDnPair : targetNodeMap
+          .entrySet()) {
+        DatanodeDetails dn = indexDnPair.getValue();
+        Integer index = indexDnPair.getKey();
+        containerOperationClient
+            .createRecoveringContainer(containerID, dn, repConfig,
+                containerToken, index);
+        recoveringContainersCreatedDNs.add(dn);
+      }
 
-    // 2. Reconstruct and transfer to targets
-    for (BlockLocationInfo blockLocationInfo : blockLocationInfoMap.values()) {
-      reconstructECBlockGroup(blockLocationInfo, repConfig, targetNodeMap);
-    }
+      // 2. Reconstruct and transfer to targets
+      for (BlockLocationInfo blockLocationInfo : blockLocationInfoMap
+          .values()) {
+        reconstructECBlockGroup(blockLocationInfo, repConfig, targetNodeMap);
+      }
 
-    // 3. Close containers
-    for (Map.Entry<Integer, DatanodeDetails> indexDnPair : targetNodeMap
-        .entrySet()) {
-      DatanodeDetails dn = indexDnPair.getValue();
-      containerOperationClient.closeContainer(containerID, dn, repConfig,
-          containerToken);
+      // 3. Close containers
+      for (DatanodeDetails dn: recoveringContainersCreatedDNs) {
+        containerOperationClient
+            .closeContainer(containerID, dn, repConfig, containerToken);
+      }
+    } catch (Exception e) {
+      // On any exception, delete the recovering containers that were created.
+      LOG.warn(
+          "Exception while reconstructing the container {}. Cleaning up"
+              + " all the recovering containers in the reconstruction process.",
+          containerID, e);
+      // Delete only the current thread successfully created recovering
+      // containers.
+      for (DatanodeDetails dn : recoveringContainersCreatedDNs) {
+        try {
+          containerOperationClient
+              .deleteRecoveringContainer(containerID, dn, repConfig,
+                  containerToken);
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Deleted the container {}, at the target: {}",
+                containerID, dn);
+          }
+        } catch (IOException ioe) {
+          LOG.error("Exception while deleting the container {} at target: {}",
+              containerID, dn, ioe);
+        }
+      }
+      throw e;
     }
 
   }
diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
index 1c79683326..7d88aefeed 100644
--- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
@@ -84,6 +84,7 @@ import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_DATANODE_VOLUME_CHOOSING_POLICY;
 import static 
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result.CLOSED_CONTAINER_IO;
+import static 
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result.CONTAINER_ALREADY_EXISTS;
 import static 
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result.CONTAINER_INTERNAL_ERROR;
 import static 
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result.CONTAINER_UNHEALTHY;
 import static 
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result.DELETE_ON_NON_EMPTY_CONTAINER;
@@ -268,7 +269,12 @@ public class KeyValueHandler extends Handler {
     }
     // Create Container request should be passed a null container as the
     // container would be created here.
-    Preconditions.checkArgument(kvContainer == null);
+    if (kvContainer != null) {
+      return ContainerUtils.logAndReturnError(LOG,
+          new StorageContainerException(
+              "Container creation failed because " + "key value container" +
+                  " already exists", null, CONTAINER_ALREADY_EXISTS), request);
+    }
 
     long containerID = request.getContainerID();
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to