This is an automated email from the ASF dual-hosted git repository.
weichiu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new a9a42eb842 HDDS-10941. Add a few interesting ContainerStateMachine
metrics in CSMMetrics (#6782)
a9a42eb842 is described below
commit a9a42eb842b7eb9881b091accb9c529416045a7e
Author: Wei-Chiu Chuang <[email protected]>
AuthorDate: Mon Jul 8 11:44:18 2024 -0700
HDDS-10941. Add a few interesting ContainerStateMachine metrics in
CSMMetrics (#6782)
---
.../common/transport/server/ratis/CSMMetrics.java | 46 ++++++++++++++++++++--
.../server/ratis/ContainerStateMachine.java | 21 +++++++++-
.../transport/server/ratis/DispatcherContext.java | 8 ++++
3 files changed, 70 insertions(+), 5 deletions(-)
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java
index 6bd1594618..3634ae34ac 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java
@@ -39,6 +39,7 @@ import java.util.EnumMap;
public class CSMMetrics {
public static final String SOURCE_NAME =
CSMMetrics.class.getSimpleName();
+ private RaftGroupId gid;
// ratis op metrics metrics
private @Metric MutableCounterLong numWriteStateMachineOps;
@@ -50,6 +51,7 @@ public class CSMMetrics {
private @Metric MutableRate transactionLatencyMs;
private final EnumMap<Type, MutableRate> opsLatencyMs;
+ private final EnumMap<Type, MutableRate> opsQueueingDelay;
private MetricsRegistry registry = null;
// Failure Metrics
@@ -64,15 +66,22 @@ public class CSMMetrics {
private @Metric MutableCounterLong numDataCacheMiss;
private @Metric MutableCounterLong numDataCacheHit;
private @Metric MutableCounterLong numEvictedCacheCount;
+ private @Metric MutableCounterLong pendingApplyTransactions;
private @Metric MutableRate applyTransactionNs;
private @Metric MutableRate writeStateMachineDataNs;
+ private @Metric MutableRate writeStateMachineQueueingLatencyNs;
+ private @Metric MutableRate untilApplyTransactionNs;
+ private @Metric MutableRate startTransactionCompleteNs;
- public CSMMetrics() {
+ public CSMMetrics(RaftGroupId gid) {
+ this.gid = gid;
this.opsLatencyMs = new EnumMap<>(ContainerProtos.Type.class);
+ this.opsQueueingDelay = new EnumMap<>(ContainerProtos.Type.class);
this.registry = new MetricsRegistry(CSMMetrics.class.getSimpleName());
for (ContainerProtos.Type type : ContainerProtos.Type.values()) {
opsLatencyMs.put(type, registry.newRate(type.toString() + "Ms", type + " op"));
+ opsQueueingDelay.put(type, registry.newRate("queueingDelay" + type.toString() + "Ns", type + " op"));
}
}
@@ -80,7 +89,12 @@ public class CSMMetrics {
MetricsSystem ms = DefaultMetricsSystem.instance();
return ms.register(SOURCE_NAME + gid.toString(),
"Container State Machine",
- new CSMMetrics());
+ new CSMMetrics(gid));
+ }
+
+ @Metric
+ public String getRaftGroupId() {
+ return gid.toString();
}
public void incNumWriteStateMachineOps() {
@@ -189,6 +203,11 @@ public class CSMMetrics {
transactionLatencyMs.add(latencyMillis);
}
+ public void recordQueueingDelay(ContainerProtos.Type type,
+ long latencyNanos) {
+ opsQueueingDelay.get(type).add(latencyNanos);
+ }
+
public void incNumStartTransactionVerifyFailures() {
numStartTransactionVerifyFailures.incr();
}
@@ -205,6 +224,19 @@ public class CSMMetrics {
writeStateMachineDataNs.add(latencyNanos);
}
+
+ public void recordWriteStateMachineQueueingLatencyNs(long latencyNanos) {
+ writeStateMachineQueueingLatencyNs.add(latencyNanos);
+ }
+
+ public void recordUntilApplyTransactionNs(long latencyNanos) {
+ untilApplyTransactionNs.add(latencyNanos);
+ }
+
+ public void recordStartTransactionCompleteNs(long latencyNanos) {
+ startTransactionCompleteNs.add(latencyNanos);
+ }
+
public void incNumDataCacheMiss() {
numDataCacheMiss.incr();
}
@@ -216,8 +248,16 @@ public class CSMMetrics {
numEvictedCacheCount.incr();
}
+ public void incPendingApplyTransactions() {
+ pendingApplyTransactions.incr();
+ }
+
+ public void decPendingApplyTransactions() {
+ pendingApplyTransactions.incr(-1);
+ }
+
public void unRegister() {
MetricsSystem ms = DefaultMetricsSystem.instance();
- ms.unregisterSource(SOURCE_NAME);
+ ms.unregisterSource(SOURCE_NAME + gid.toString());
}
}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
index 6351c746fe..f583eadd7e 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java
@@ -402,6 +402,7 @@ public class ContainerStateMachine extends BaseStateMachine {
@Override
public TransactionContext startTransaction(RaftClientRequest request)
throws IOException {
+ long startTime = Time.monotonicNowNanos();
final ContainerCommandRequestProto proto =
message2ContainerCommandRequestProto(request.getMessage());
Preconditions.checkArgument(request.getRaftGroupId().equals(gid));
@@ -411,6 +412,8 @@ public class ContainerStateMachine extends BaseStateMachine {
.setStateMachine(this)
.setServerRole(RaftPeerRole.LEADER);
+ metrics.incPendingApplyTransactions();
+
try {
dispatcher.validateContainerCommand(proto);
} catch (IOException ioe) {
@@ -440,9 +443,11 @@ public class ContainerStateMachine extends BaseStateMachine {
builder.setStateMachineData(write.getData());
}
final ContainerCommandRequestProto containerCommandRequestProto =
protoBuilder.build();
- return builder.setStateMachineContext(new Context(proto, containerCommandRequestProto))
+ TransactionContext txnContext = builder.setStateMachineContext(new Context(proto, containerCommandRequestProto))
.setLogData(containerCommandRequestProto.toByteString())
.build();
+ metrics.recordStartTransactionCompleteNs(Time.monotonicNowNanos() - startTime);
+ return txnContext;
}
private static ContainerCommandRequestProto getContainerCommandRequestProto(
@@ -521,6 +526,8 @@ public class ContainerStateMachine extends BaseStateMachine {
CompletableFuture<ContainerCommandResponseProto> writeChunkFuture =
CompletableFuture.supplyAsync(() -> {
try {
+ metrics.recordWriteStateMachineQueueingLatencyNs(
+ Time.monotonicNowNanos() - startTime);
return dispatchCommand(requestProto, context);
} catch (Exception e) {
LOG.error("{}: writeChunk writeStateMachineData failed: blockId" +
@@ -884,6 +891,11 @@ public class ContainerStateMachine extends BaseStateMachine {
final CheckedSupplier<ContainerCommandResponseProto, Exception> task
= () -> {
try {
+ long timeNow = Time.monotonicNowNanos();
+ long queueingDelay = timeNow - context.getStartTime();
+ metrics.recordQueueingDelay(request.getCmdType(), queueingDelay);
+ // TODO: add a counter to track number of executing applyTransaction
+ // and queue size
return dispatchCommand(request, context);
} catch (Exception e) {
exceptionHandler.accept(e);
@@ -932,11 +944,13 @@ public class ContainerStateMachine extends BaseStateMachine {
.setTerm(trx.getLogEntry().getTerm())
.setLogIndex(index);
+ final Context context = (Context) trx.getStateMachineContext();
long applyTxnStartTime = Time.monotonicNowNanos();
+ metrics.recordUntilApplyTransactionNs(applyTxnStartTime - context.getStartTime());
applyTransactionSemaphore.acquire();
metrics.incNumApplyTransactionsOps();
- final Context context = (Context) trx.getStateMachineContext();
+
Objects.requireNonNull(context, "context == null");
final ContainerCommandRequestProto requestProto = context.getLogProto();
final Type cmdType = requestProto.getCmdType();
@@ -1021,6 +1035,9 @@ public class ContainerStateMachine extends BaseStateMachine {
applyTransactionSemaphore.release();
metrics.recordApplyTransactionCompletionNs(
Time.monotonicNowNanos() - applyTxnStartTime);
+ if (trx.getServerRole() == RaftPeerRole.LEADER) {
+ metrics.decPendingApplyTransactions();
+ }
});
return applyTransactionFuture;
} catch (InterruptedException e) {
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/DispatcherContext.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/DispatcherContext.java
index d6c976cb38..15af264535 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/DispatcherContext.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/DispatcherContext.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.ozone.container.common.transport.server.ratis;
import org.apache.hadoop.hdds.annotation.InterfaceAudience;
import org.apache.hadoop.hdds.annotation.InterfaceStability;
+import org.apache.hadoop.util.Time;
import org.apache.ratis.server.protocol.TermIndex;
import java.util.Map;
@@ -118,12 +119,15 @@ public final class DispatcherContext {
private final Map<Long, Long> container2BCSIDMap;
+ private long startTime;
+
private DispatcherContext(Builder b) {
this.op = Objects.requireNonNull(b.op, "op == null");
this.term = b.term;
this.logIndex = b.logIndex;
this.stage = b.stage;
this.container2BCSIDMap = b.container2BCSIDMap;
+ this.startTime = Time.monotonicNowNanos();
}
/** Use {@link DispatcherContext#op(DispatcherContext)} for handling null. */
@@ -147,6 +151,10 @@ public final class DispatcherContext {
return container2BCSIDMap;
}
+ public long getStartTime() {
+ return startTime;
+ }
+
@Override
public String toString() {
return op + "-" + stage + TermIndex.valueOf(term, logIndex);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]