This is an automated email from the ASF dual-hosted git repository.

weichiu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new a9a42eb842 HDDS-10941. Add a few interesting ContainerStateMachine metrics in CSMMetrics (#6782)
a9a42eb842 is described below

commit a9a42eb842b7eb9881b091accb9c529416045a7e
Author: Wei-Chiu Chuang <[email protected]>
AuthorDate: Mon Jul 8 11:44:18 2024 -0700

    HDDS-10941. Add a few interesting ContainerStateMachine metrics in CSMMetrics (#6782)
---
 .../common/transport/server/ratis/CSMMetrics.java  | 46 ++++++++++++++++++++--
 .../server/ratis/ContainerStateMachine.java        | 21 +++++++++-
 .../transport/server/ratis/DispatcherContext.java  |  8 ++++
 3 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java
index 6bd1594618..3634ae34ac 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java
@@ -39,6 +39,7 @@ import java.util.EnumMap;
 public class CSMMetrics {
   public static final String SOURCE_NAME =
       CSMMetrics.class.getSimpleName();
+  private RaftGroupId gid;
 
   // ratis op metrics metrics
   private @Metric MutableCounterLong numWriteStateMachineOps;
@@ -50,6 +51,7 @@ public class CSMMetrics {
 
   private @Metric MutableRate transactionLatencyMs;
   private final EnumMap<Type, MutableRate> opsLatencyMs;
+  private final EnumMap<Type, MutableRate> opsQueueingDelay;
   private MetricsRegistry registry = null;
 
   // Failure Metrics
@@ -64,15 +66,22 @@ public class CSMMetrics {
   private @Metric MutableCounterLong numDataCacheMiss;
   private @Metric MutableCounterLong numDataCacheHit;
   private @Metric MutableCounterLong numEvictedCacheCount;
+  private @Metric MutableCounterLong pendingApplyTransactions;
 
   private @Metric MutableRate applyTransactionNs;
   private @Metric MutableRate writeStateMachineDataNs;
+  private @Metric MutableRate writeStateMachineQueueingLatencyNs;
+  private @Metric MutableRate untilApplyTransactionNs;
+  private @Metric MutableRate startTransactionCompleteNs;
 
-  public CSMMetrics() {
+  public CSMMetrics(RaftGroupId gid) {
+    this.gid = gid;
     this.opsLatencyMs = new EnumMap<>(ContainerProtos.Type.class);
+    this.opsQueueingDelay = new EnumMap<>(ContainerProtos.Type.class);
     this.registry = new MetricsRegistry(CSMMetrics.class.getSimpleName());
     for (ContainerProtos.Type type : ContainerProtos.Type.values()) {
       opsLatencyMs.put(type, registry.newRate(type.toString() + "Ms", type + " 
op"));
+      opsQueueingDelay.put(type, registry.newRate("queueingDelay" + 
type.toString() + "Ns", type + " op"));
     }
   }
 
@@ -80,7 +89,12 @@ public class CSMMetrics {
     MetricsSystem ms = DefaultMetricsSystem.instance();
     return ms.register(SOURCE_NAME + gid.toString(),
         "Container State Machine",
-        new CSMMetrics());
+        new CSMMetrics(gid));
+  }
+
+  @Metric
+  public String getRaftGroupId() {
+    return gid.toString();
   }
 
   public void incNumWriteStateMachineOps() {
@@ -189,6 +203,11 @@ public class CSMMetrics {
     transactionLatencyMs.add(latencyMillis);
   }
 
+  public void recordQueueingDelay(ContainerProtos.Type type,
+                                  long latencyNanos) {
+    opsQueueingDelay.get(type).add(latencyNanos);
+  }
+
   public void incNumStartTransactionVerifyFailures() {
     numStartTransactionVerifyFailures.incr();
   }
@@ -205,6 +224,19 @@ public class CSMMetrics {
     writeStateMachineDataNs.add(latencyNanos);
   }
 
+
+  public void recordWriteStateMachineQueueingLatencyNs(long latencyNanos) {
+    writeStateMachineQueueingLatencyNs.add(latencyNanos);
+  }
+
+  public void recordUntilApplyTransactionNs(long latencyNanos) {
+    untilApplyTransactionNs.add(latencyNanos);
+  }
+
+  public void recordStartTransactionCompleteNs(long latencyNanos) {
+    startTransactionCompleteNs.add(latencyNanos);
+  }
+
   public void incNumDataCacheMiss() {
     numDataCacheMiss.incr();
   }
@@ -216,8 +248,16 @@ public class CSMMetrics {
     numEvictedCacheCount.incr();
   }
 
+  public void incPendingApplyTransactions() {
+    pendingApplyTransactions.incr();
+  }
+
+  public void decPendingApplyTransactions() {
+    pendingApplyTransactions.incr(-1);
+  }
+
   public void unRegister() {
     MetricsSystem ms = DefaultMetricsSystem.instance();
-    ms.unregisterSource(SOURCE_NAME);
+    ms.unregisterSource(SOURCE_NAME + gid.toString());
   }
 }
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
index 6351c746fe..f583eadd7e 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
@@ -402,6 +402,7 @@ public class ContainerStateMachine extends BaseStateMachine {
   @Override
   public TransactionContext startTransaction(RaftClientRequest request)
       throws IOException {
+    long startTime = Time.monotonicNowNanos();
     final ContainerCommandRequestProto proto =
         message2ContainerCommandRequestProto(request.getMessage());
     Preconditions.checkArgument(request.getRaftGroupId().equals(gid));
@@ -411,6 +412,8 @@ public class ContainerStateMachine extends BaseStateMachine {
         .setStateMachine(this)
         .setServerRole(RaftPeerRole.LEADER);
 
+    metrics.incPendingApplyTransactions();
+
     try {
       dispatcher.validateContainerCommand(proto);
     } catch (IOException ioe) {
@@ -440,9 +443,11 @@ public class ContainerStateMachine extends BaseStateMachine {
       builder.setStateMachineData(write.getData());
     }
     final ContainerCommandRequestProto containerCommandRequestProto = 
protoBuilder.build();
-    return builder.setStateMachineContext(new Context(proto, 
containerCommandRequestProto))
+    TransactionContext txnContext = builder.setStateMachineContext(new 
Context(proto, containerCommandRequestProto))
         .setLogData(containerCommandRequestProto.toByteString())
         .build();
+    metrics.recordStartTransactionCompleteNs(Time.monotonicNowNanos() - 
startTime);
+    return txnContext;
   }
 
   private static ContainerCommandRequestProto getContainerCommandRequestProto(
@@ -521,6 +526,8 @@ public class ContainerStateMachine extends BaseStateMachine {
     CompletableFuture<ContainerCommandResponseProto> writeChunkFuture =
         CompletableFuture.supplyAsync(() -> {
           try {
+            metrics.recordWriteStateMachineQueueingLatencyNs(
+                Time.monotonicNowNanos() - startTime);
             return dispatchCommand(requestProto, context);
           } catch (Exception e) {
             LOG.error("{}: writeChunk writeStateMachineData failed: blockId" +
@@ -884,6 +891,11 @@ public class ContainerStateMachine extends BaseStateMachine {
     final CheckedSupplier<ContainerCommandResponseProto, Exception> task
         = () -> {
           try {
+            long timeNow = Time.monotonicNowNanos();
+            long queueingDelay = timeNow - context.getStartTime();
+            metrics.recordQueueingDelay(request.getCmdType(), queueingDelay);
+            // TODO: add a counter to track number of executing 
applyTransaction
+            // and queue size
             return dispatchCommand(request, context);
           } catch (Exception e) {
             exceptionHandler.accept(e);
@@ -932,11 +944,13 @@ public class ContainerStateMachine extends BaseStateMachine {
           .setTerm(trx.getLogEntry().getTerm())
           .setLogIndex(index);
 
+      final Context context = (Context) trx.getStateMachineContext();
       long applyTxnStartTime = Time.monotonicNowNanos();
+      metrics.recordUntilApplyTransactionNs(applyTxnStartTime - 
context.getStartTime());
       applyTransactionSemaphore.acquire();
       metrics.incNumApplyTransactionsOps();
 
-      final Context context = (Context) trx.getStateMachineContext();
+
       Objects.requireNonNull(context, "context == null");
       final ContainerCommandRequestProto requestProto = context.getLogProto();
       final Type cmdType = requestProto.getCmdType();
@@ -1021,6 +1035,9 @@ public class ContainerStateMachine extends BaseStateMachine {
         applyTransactionSemaphore.release();
         metrics.recordApplyTransactionCompletionNs(
             Time.monotonicNowNanos() - applyTxnStartTime);
+        if (trx.getServerRole() == RaftPeerRole.LEADER) {
+          metrics.decPendingApplyTransactions();
+        }
       });
       return applyTransactionFuture;
     } catch (InterruptedException e) {
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/DispatcherContext.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/DispatcherContext.java
index d6c976cb38..15af264535 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/DispatcherContext.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/DispatcherContext.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.ozone.container.common.transport.server.ratis;
 
 import org.apache.hadoop.hdds.annotation.InterfaceAudience;
 import org.apache.hadoop.hdds.annotation.InterfaceStability;
+import org.apache.hadoop.util.Time;
 import org.apache.ratis.server.protocol.TermIndex;
 
 import java.util.Map;
@@ -118,12 +119,15 @@ public final class DispatcherContext {
 
   private final Map<Long, Long> container2BCSIDMap;
 
+  private long startTime;
+
   private DispatcherContext(Builder b) {
     this.op = Objects.requireNonNull(b.op, "op == null");
     this.term = b.term;
     this.logIndex = b.logIndex;
     this.stage = b.stage;
     this.container2BCSIDMap = b.container2BCSIDMap;
+    this.startTime = Time.monotonicNowNanos();
   }
 
   /** Use {@link DispatcherContext#op(DispatcherContext)} for handling null. */
@@ -147,6 +151,10 @@ public final class DispatcherContext {
     return container2BCSIDMap;
   }
 
+  public long getStartTime() {
+    return startTime;
+  }
+
   @Override
   public String toString() {
     return op + "-" + stage + TermIndex.valueOf(term, logIndex);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to