This is an automated email from the ASF dual-hosted git repository.
umamahesh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new d088d199bf HDDS-6982. EC: Attempt to cleanup the RECOVERING container
when reconstruction failed at coordinator. (#3583)
d088d199bf is described below
commit d088d199bf506d76f7046a4b19b690e2a9e6915e
Author: Uma Maheswara Rao G <[email protected]>
AuthorDate: Sat Jul 9 21:45:52 2022 -0700
HDDS-6982. EC: Attempt to cleanup the RECOVERING container when
reconstruction failed at coordinator. (#3583)
---
.../reconstruction/ECContainerOperationClient.java | 25 +++++++++
.../ECReconstructionCoordinator.java | 61 ++++++++++++++++------
.../ozone/container/keyvalue/KeyValueHandler.java | 8 ++-
3 files changed, 76 insertions(+), 18 deletions(-)
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
index adebe11776..43a96cb524 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
@@ -111,6 +111,31 @@ public class ECContainerOperationClient implements
Closeable {
}
}
+ public void deleteRecoveringContainer(long containerID, DatanodeDetails dn,
+ ECReplicationConfig repConfig, String encodedToken) throws IOException {
+ XceiverClientSpi xceiverClient = this.xceiverClientManager
+ .acquireClient(singleNodePipeline(dn, repConfig));
+ try {
+      // Before deleting the recovering container, make sure its state is
+      // RECOVERING. A race condition is still possible, but this check
+      // avoids the most common case.
+ ContainerProtos.ReadContainerResponseProto readContainerResponseProto =
+ ContainerProtocolCalls
+ .readContainer(xceiverClient, containerID, encodedToken);
+ if (readContainerResponseProto
+ .getContainerData()
+ .getState() == ContainerProtos.ContainerDataProto.State.RECOVERING) {
+ ContainerProtocolCalls
+ .deleteContainer(xceiverClient, containerID, true, encodedToken);
+ } else {
+ LOG.warn("Container will not be deleted as it is not a recovering"
+ + " container {}", containerID);
+ }
+ } finally {
+ this.xceiverClientManager.releaseClient(xceiverClient, false);
+ }
+ }
+
public void createRecoveringContainer(long containerID, DatanodeDetails dn,
ECReplicationConfig repConfig, String encodedToken, int replicaIndex)
throws IOException {
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
index 9b00596d58..b0f33e56a7 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
@@ -135,25 +135,52 @@ public class ECReconstructionCoordinator implements
Closeable {
// 1. create target recovering containers.
String containerToken = encode(tokenHelper.getContainerToken(cid));
- for (Map.Entry<Integer, DatanodeDetails> indexDnPair : targetNodeMap
- .entrySet()) {
- DatanodeDetails dn = indexDnPair.getValue();
- Integer index = indexDnPair.getKey();
- containerOperationClient.createRecoveringContainer(containerID, dn,
- repConfig, containerToken, index);
- }
+ List<DatanodeDetails> recoveringContainersCreatedDNs = new ArrayList<>();
+ try {
+ for (Map.Entry<Integer, DatanodeDetails> indexDnPair : targetNodeMap
+ .entrySet()) {
+ DatanodeDetails dn = indexDnPair.getValue();
+ Integer index = indexDnPair.getKey();
+ containerOperationClient
+ .createRecoveringContainer(containerID, dn, repConfig,
+ containerToken, index);
+ recoveringContainersCreatedDNs.add(dn);
+ }
- // 2. Reconstruct and transfer to targets
- for (BlockLocationInfo blockLocationInfo : blockLocationInfoMap.values()) {
- reconstructECBlockGroup(blockLocationInfo, repConfig, targetNodeMap);
- }
+ // 2. Reconstruct and transfer to targets
+ for (BlockLocationInfo blockLocationInfo : blockLocationInfoMap
+ .values()) {
+ reconstructECBlockGroup(blockLocationInfo, repConfig, targetNodeMap);
+ }
- // 3. Close containers
- for (Map.Entry<Integer, DatanodeDetails> indexDnPair : targetNodeMap
- .entrySet()) {
- DatanodeDetails dn = indexDnPair.getValue();
- containerOperationClient.closeContainer(containerID, dn, repConfig,
- containerToken);
+ // 3. Close containers
+ for (DatanodeDetails dn: recoveringContainersCreatedDNs) {
+ containerOperationClient
+ .closeContainer(containerID, dn, repConfig, containerToken);
+ }
+ } catch (Exception e) {
+      // On any exception, delete the recovering containers created so far.
+      LOG.warn(
+          "Exception while reconstructing the container {}. Cleaning up"
+              + " all the recovering containers in the reconstruction process.",
+          containerID, e);
+ // Delete only the current thread successfully created recovering
+ // containers.
+ for (DatanodeDetails dn : recoveringContainersCreatedDNs) {
+ try {
+ containerOperationClient
+ .deleteRecoveringContainer(containerID, dn, repConfig,
+ containerToken);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Deleted the container {}, at the target: {}",
+ containerID, dn);
+ }
+ } catch (IOException ioe) {
+ LOG.error("Exception while deleting the container {} at target: {}",
+ containerID, dn, ioe);
+ }
+      }
+      throw e;
}
}
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
index 1c79683326..7d88aefeed 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
@@ -84,6 +84,7 @@ import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_DATANODE_VOLUME_CHOOSING_POLICY;
import static
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result.CLOSED_CONTAINER_IO;
+import static
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result.CONTAINER_ALREADY_EXISTS;
import static
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result.CONTAINER_INTERNAL_ERROR;
import static
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result.CONTAINER_UNHEALTHY;
import static
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result.DELETE_ON_NON_EMPTY_CONTAINER;
@@ -268,7 +269,12 @@ public class KeyValueHandler extends Handler {
}
// Create Container request should be passed a null container as the
// container would be created here.
- Preconditions.checkArgument(kvContainer == null);
+ if (kvContainer != null) {
+ return ContainerUtils.logAndReturnError(LOG,
+ new StorageContainerException(
"Container creation failed because key value container" +
" already exists", null, CONTAINER_ALREADY_EXISTS), request);
+ }
long containerID = request.getContainerID();
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]