This is an automated email from the ASF dual-hosted git repository.

adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 4670a5ef8c HDDS-11498. Improve SCM deletion efficiency. (#7249)
4670a5ef8c is described below

commit 4670a5ef8c98ae236d1acbaf93e9609309f88094
Author: slfan1989 <[email protected]>
AuthorDate: Thu Oct 17 13:12:17 2024 +0800

    HDDS-11498. Improve SCM deletion efficiency. (#7249)
---
 .../commandhandler/DeleteBlocksCommandHandler.java |  8 ++--
 .../block/DatanodeDeletedBlockTransactions.java    |  3 +-
 .../hadoop/hdds/scm/block/DeletedBlockLogImpl.java | 56 ++++++++++++----------
 .../hdds/scm/block/SCMBlockDeletingService.java    |  3 +-
 .../scm/block/ScmBlockDeletingServiceMetrics.java  | 34 +++++++++++++
 5 files changed, 75 insertions(+), 29 deletions(-)

diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java
index 1a630f8f0b..136c580582 100644
--- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java
@@ -359,10 +359,11 @@ public class DeleteBlocksCommandHandler implements 
CommandHandler {
       DeletedContainerBlocksSummary summary =
           DeletedContainerBlocksSummary.getFrom(containerBlocks);
       LOG.info("Summary of deleting container blocks, numOfTransactions={}, "
-              + "numOfContainers={}, numOfBlocks={}",
+              + "numOfContainers={}, numOfBlocks={}, commandId={}.",
           summary.getNumOfTxs(),
           summary.getNumOfContainers(),
-          summary.getNumOfBlocks());
+          summary.getNumOfBlocks(),
+          cmd.getCmd().getId());
       if (LOG.isDebugEnabled()) {
         LOG.debug("Start to delete container blocks, TXIDs={}",
             summary.getTxIDSummary());
@@ -389,7 +390,8 @@ public class DeleteBlocksCommandHandler implements 
CommandHandler {
           LOG.debug("Sending following block deletion ACK to SCM");
           for (DeleteBlockTransactionResult result : blockDeletionACK
               .getResultsList()) {
-            LOG.debug("{} : {}", result.getTxID(), result.getSuccess());
+            LOG.debug("TxId = {} : ContainerId = {} : {}",
+                result.getTxID(), result.getContainerID(), 
result.getSuccess());
           }
         }
       }
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DatanodeDeletedBlockTransactions.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DatanodeDeletedBlockTransactions.java
index e485fcc98d..99fd9c7b43 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DatanodeDeletedBlockTransactions.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DatanodeDeletedBlockTransactions.java
@@ -47,7 +47,8 @@ class DatanodeDeletedBlockTransactions {
     blocksDeleted += tx.getLocalIDCount();
     if (SCMBlockDeletingService.LOG.isDebugEnabled()) {
       SCMBlockDeletingService.LOG
-          .debug("Transaction added: {} <- TX({})", dnID, tx.getTxID());
+          .debug("Transaction added: {} <- TX({}), DN {} <- blocksDeleted Add 
{}.",
+          dnID, tx.getTxID(), dnID, tx.getLocalIDCount());
     }
   }
 
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java
index 9d5377b9e3..987f293dd1 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java
@@ -35,7 +35,6 @@ import org.apache.hadoop.hdds.conf.ConfigurationSource;
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
 import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.CommandStatus;
 import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerBlocksDeletionACKProto;
-import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerBlocksDeletionACKProto.DeleteBlockTransactionResult;
 import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.DeletedBlocksTransaction;
 import 
org.apache.hadoop.hdds.scm.command.CommandStatusReportHandler.DeleteBlockStatus;
 import org.apache.hadoop.hdds.scm.container.ContainerInfo;
@@ -200,20 +199,6 @@ public class DeletedBlockLogImpl
         .build();
   }
 
-  private boolean isTransactionFailed(DeleteBlockTransactionResult result) {
-    if (LOG.isDebugEnabled()) {
-      LOG.debug(
-          "Got block deletion ACK from datanode, TXIDs={}, " + "success={}",
-          result.getTxID(), result.getSuccess());
-    }
-    if (!result.getSuccess()) {
-      LOG.warn("Got failed ACK for TXID={}, prepare to resend the "
-          + "TX in next interval", result.getTxID());
-      return true;
-    }
-    return false;
-  }
-
   @Override
   public int getNumOfValidTransactions() throws IOException {
     lock.lock();
@@ -300,26 +285,46 @@ public class DeletedBlockLogImpl
             .setCount(transactionStatusManager.getOrDefaultRetryCount(
               tx.getTxID(), 0))
             .build();
+
     for (ContainerReplica replica : replicas) {
       DatanodeDetails details = replica.getDatanodeDetails();
-      if (!dnList.contains(details)) {
-        continue;
-      }
       if (!transactionStatusManager.isDuplication(
           details, updatedTxn.getTxID(), commandStatus)) {
         transactions.addTransactionToDN(details.getUuid(), updatedTxn);
+        metrics.incrProcessedTransaction();
       }
     }
   }
 
   private Boolean checkInadequateReplica(Set<ContainerReplica> replicas,
-      DeletedBlocksTransaction txn) throws ContainerNotFoundException {
+      DeletedBlocksTransaction txn,
+      Set<DatanodeDetails> dnList) throws ContainerNotFoundException {
     ContainerInfo containerInfo = containerManager
         .getContainer(ContainerID.valueOf(txn.getContainerID()));
     ReplicationManager replicationManager =
         scmContext.getScm().getReplicationManager();
     ContainerHealthResult result = replicationManager
         .getContainerReplicationHealth(containerInfo, replicas);
+
+    // We have made an improvement here, and we expect that all replicas
+    // of the Container being sent will be included in the dnList.
+    // This change benefits ACK confirmation and improves deletion speed.
+    // The principle behind it is that
+    // DN can receive the command to delete a certain Container at the same 
time and provide
+    // feedback to SCM at roughly the same time.
+    // This avoids the issue of deletion blocking,
+    // where some replicas of a Container are deleted while others do not 
receive the delete command.
+    long containerId = txn.getContainerID();
+    for (ContainerReplica replica : replicas) {
+      DatanodeDetails datanodeDetails = replica.getDatanodeDetails();
+      if (!dnList.contains(datanodeDetails)) {
+        DatanodeDetails dnDetail = replica.getDatanodeDetails();
+        LOG.debug("Skip Container = {}, because DN = {} is not in dnList.",
+            containerId, dnDetail.getUuid());
+        return true;
+      }
+    }
+
     return result.getHealthState() != 
ContainerHealthResult.HealthState.HEALTHY;
   }
 
@@ -345,6 +350,7 @@ public class DeletedBlockLogImpl
                 .getCommandStatusByTxId(dnList.stream().
                 map(DatanodeDetails::getUuid).collect(Collectors.toSet()));
         ArrayList<Long> txIDs = new ArrayList<>();
+        metrics.setNumBlockDeletionTransactionDataNodes(dnList.size());
         // Here takes block replica count as the threshold to avoid the case
         // that part of replicas committed the TXN and recorded in the
         // SCMDeletedBlockTransactionStatusManager, while they are counted
@@ -358,23 +364,25 @@ public class DeletedBlockLogImpl
             // HDDS-7126. When container is under replicated, it is possible
             // that container is deleted, but transactions are not deleted.
             if (containerManager.getContainer(id).isDeleted()) {
-              LOG.warn("Container: " + id + " was deleted for the " +
-                  "transaction: " + txn);
+              LOG.warn("Container: {} was deleted for the " +
+                  "transaction: {}.", id, txn);
               txIDs.add(txn.getTxID());
             } else if (txn.getCount() > -1 && txn.getCount() <= maxRetry
                 && !containerManager.getContainer(id).isOpen()) {
               Set<ContainerReplica> replicas = containerManager
                   .getContainerReplicas(
                       ContainerID.valueOf(txn.getContainerID()));
-              if (checkInadequateReplica(replicas, txn)) {
+              if (checkInadequateReplica(replicas, txn, dnList)) {
+                metrics.incrSkippedTransaction();
                 continue;
               }
               getTransaction(
                   txn, transactions, dnList, replicas, commandStatus);
+            } else if (txn.getCount() >= maxRetry || 
containerManager.getContainer(id).isOpen()) {
+              metrics.incrSkippedTransaction();
             }
           } catch (ContainerNotFoundException ex) {
-            LOG.warn("Container: " + id + " was not found for the transaction: 
"
-                + txn);
+            LOG.warn("Container: {} was not found for the transaction: {}.", 
id, txn);
             txIDs.add(txn.getTxID());
           }
         }
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java
index 7271d9dcba..e6fc45cb5e 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java
@@ -203,9 +203,10 @@ public class SCMBlockDeletingService extends 
BackgroundService
             }
           }
           LOG.info("Totally added {} blocks to be deleted for"
-                  + " {} datanodes, task elapsed time: {}ms",
+                  + " {} datanodes / {} totalnodes, task elapsed time: {}ms",
               transactions.getBlocksDeleted(),
               transactions.getDatanodeTransactionMap().size(),
+              included.size(),
               Time.monotonicNow() - startTime);
           deletedBlockLog.incrementCount(new ArrayList<>(processedTxIDs));
         } catch (NotLeaderException nle) {
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/ScmBlockDeletingServiceMetrics.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/ScmBlockDeletingServiceMetrics.java
index 2cadca1d92..6637bd1832 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/ScmBlockDeletingServiceMetrics.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/ScmBlockDeletingServiceMetrics.java
@@ -24,6 +24,7 @@ import org.apache.hadoop.metrics2.annotation.Metric;
 import org.apache.hadoop.metrics2.annotation.Metrics;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.metrics2.lib.MutableCounterLong;
+import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
 
 /**
  * Metrics related to Block Deleting Service running in SCM.
@@ -76,6 +77,15 @@ public final class ScmBlockDeletingServiceMetrics {
   @Metric(about = "The number of created txs which are added into DB.")
   private MutableCounterLong numBlockDeletionTransactionCreated;
 
+  @Metric(about = "The number of skipped transactions")
+  private MutableCounterLong numSkippedTransactions;
+
+  @Metric(about = "The number of processed transactions")
+  private MutableCounterLong numProcessedTransactions;
+
+  @Metric(about = "The number of dataNodes of delete transactions.")
+  private MutableGaugeLong numBlockDeletionTransactionDataNodes;
+
   private ScmBlockDeletingServiceMetrics() {
   }
 
@@ -130,6 +140,18 @@ public final class ScmBlockDeletingServiceMetrics {
     this.numBlockDeletionTransactionCreated.incr(count);
   }
 
+  public void incrSkippedTransaction() {
+    this.numSkippedTransactions.incr();
+  }
+
+  public void incrProcessedTransaction() {
+    this.numProcessedTransactions.incr();
+  }
+
+  public void setNumBlockDeletionTransactionDataNodes(long dataNodes) {
+    this.numBlockDeletionTransactionDataNodes.set(dataNodes);
+  }
+
   public long getNumBlockDeletionCommandSent() {
     return numBlockDeletionCommandSent.value();
   }
@@ -162,6 +184,18 @@ public final class ScmBlockDeletingServiceMetrics {
     return numBlockDeletionTransactionCreated.value();
   }
 
+  public long getNumSkippedTransactions() {
+    return numSkippedTransactions.value();
+  }
+
+  public long getNumProcessedTransactions() {
+    return numProcessedTransactions.value();
+  }
+
+  public long getNumBlockDeletionTransactionDataNodes() {
+    return numBlockDeletionTransactionDataNodes.value();
+  }
+
   @Override
   public String toString() {
     StringBuffer buffer = new StringBuffer();


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to