This is an automated email from the ASF dual-hosted git repository.

adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 361ad068ed HDDS-9719. Add performance audit log for Datanode (#5660)
361ad068ed is described below

commit 361ad068ed56d2cb8e1343f0d14969792f118228
Author: XiChen <[email protected]>
AuthorDate: Thu Feb 1 19:06:11 2024 +0800

    HDDS-9719. Add performance audit log for Datanode (#5660)
---
 .../org/apache/hadoop/hdds/HddsConfigKeys.java     |  5 +++
 .../common/src/main/resources/ozone-default.xml    |  8 ++++
 .../container/common/impl/HddsDispatcher.java      | 48 +++++++++++++++++++++-
 .../ozone/container/keyvalue/KeyValueHandler.java  |  1 +
 .../org/apache/hadoop/ozone/audit/AuditLogger.java | 21 ++++++++++
 .../org/apache/hadoop/ozone/audit/AuditMarker.java |  3 +-
 6 files changed, 83 insertions(+), 3 deletions(-)

diff --git 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
index 7b007fdca1..787f023df2 100644
--- 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
+++ 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
@@ -392,4 +392,9 @@ public final class HddsConfigKeys {
 
   public static final String OZONE_AUDIT_LOG_DEBUG_CMD_LIST_DNAUDIT =
       "ozone.audit.log.debug.cmd.list.dnaudit";
+
+  public static final String HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_KEY =
+      "hdds.datanode.slow.op.warning.threshold";
+  public static final String HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_DEFAULT =
+      "500ms";
 }
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml 
b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index 6a29dc81ec..c166e534d5 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -3418,6 +3418,14 @@
       Timeout for the request submitted directly to Ratis in datanode.
     </description>
   </property>
+  <property>
+    <name>hdds.datanode.slow.op.warning.threshold</name>
+    <tag>OZONE, DATANODE, PERFORMANCE</tag>
+    <value>500ms</value>
+    <description>
+      Thresholds for printing slow-operation audit logs.
+    </description>
+  </property>
   <property>
     <name>ozone.om.keyname.character.check.enabled</name>
     <tag>OZONE, OM</tag>
diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java
index eeb3548d5f..63e51bdfd5 100644
--- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java
@@ -75,7 +75,9 @@ import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
 import java.util.TreeMap;
+import java.util.concurrent.TimeUnit;
 
+import static 
org.apache.hadoop.ozone.audit.AuditLogger.PerformanceStringBuilder;
 import static 
org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.malformedRequest;
 import static 
org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.unsupportedRequest;
 import static 
org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult;
@@ -101,6 +103,7 @@ public class HddsDispatcher implements ContainerDispatcher, 
Auditor {
   private String clusterId;
   private ContainerMetrics metrics;
   private final TokenVerifier tokenVerifier;
+  private long slowOpThresholdMs;
 
   /**
    * Constructs an OzoneContainer that receives calls from
@@ -121,6 +124,7 @@ public class HddsDispatcher implements ContainerDispatcher, 
Auditor {
         HddsConfigKeys.HDDS_CONTAINER_CLOSE_THRESHOLD_DEFAULT);
     this.tokenVerifier = tokenVerifier != null ? tokenVerifier
         : new NoopTokenVerifier();
+    this.slowOpThresholdMs = getSlowOpThresholdMs(conf);
 
     protocolMetrics =
         new ProtocolMessageMetrics<>(
@@ -196,6 +200,7 @@ public class HddsDispatcher implements ContainerDispatcher, 
Auditor {
     AuditAction action = getAuditAction(msg.getCmdType());
     EventType eventType = getEventType(msg);
     Map<String, String> params = getAuditParams(msg);
+    PerformanceStringBuilder perf = new PerformanceStringBuilder();
 
     ContainerType containerType;
     ContainerCommandResponseProto responseProto = null;
@@ -326,10 +331,11 @@ public class HddsDispatcher implements 
ContainerDispatcher, Auditor {
       audit(action, eventType, params, AuditEventStatus.FAILURE, ex);
       return ContainerUtils.logAndReturnError(LOG, ex, msg);
     }
+    perf.appendPreOpLatencyMs(Time.monotonicNow() - startTime);
     responseProto = handler.handle(msg, container, dispatcherContext);
+    long oPLatencyMS = Time.monotonicNow() - startTime;
     if (responseProto != null) {
-      metrics.incContainerOpsLatencies(cmdType,
-              Time.monotonicNow() - startTime);
+      metrics.incContainerOpsLatencies(cmdType, oPLatencyMS);
 
       // If the request is of Write Type and the container operation
       // is unsuccessful, it implies the applyTransaction on the container
@@ -402,6 +408,8 @@ public class HddsDispatcher implements ContainerDispatcher, 
Auditor {
         audit(action, eventType, params, AuditEventStatus.FAILURE,
             new Exception(responseProto.getMessage()));
       }
+      perf.appendOpLatencyMs(oPLatencyMS);
+      performanceAudit(action, params, perf, oPLatencyMS);
 
       return responseProto;
     } else {
@@ -412,6 +420,13 @@ public class HddsDispatcher implements 
ContainerDispatcher, Auditor {
     }
   }
 
+  private long getSlowOpThresholdMs(ConfigurationSource config) {
+    return config.getTimeDuration(
+        HddsConfigKeys.HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_KEY,
+        HddsConfigKeys.HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_DEFAULT,
+            TimeUnit.MILLISECONDS);
+  }
+
   private void updateBCSID(Container container,
       DispatcherContext dispatcherContext, Type cmdType) {
     if (dispatcherContext != null && (cmdType == Type.PutBlock
@@ -682,6 +697,26 @@ public class HddsDispatcher implements 
ContainerDispatcher, Auditor {
     }
   }
 
+  private void performanceAudit(AuditAction action, Map<String, String> params,
+      PerformanceStringBuilder performance, long opLatencyMs) {
+    if (isOperationSlow(opLatencyMs)) {
+      AuditMessage msg =
+          buildAuditMessageForPerformance(action, params, performance);
+      AUDIT.logPerformance(msg);
+    }
+  }
+
+  public AuditMessage buildAuditMessageForPerformance(AuditAction op,
+      Map<String, String> auditMap, PerformanceStringBuilder performance) {
+    return new AuditMessage.Builder()
+        .setUser(null)
+        .atIp(null)
+        .forOperation(op)
+        .withParams(auditMap)
+        .setPerformance(performance)
+        .build();
+  }
+
   //TODO: use GRPC to fetch user and ip details
   @Override
   public AuditMessage buildAuditMessageForSuccess(AuditAction op,
@@ -845,6 +880,8 @@ public class HddsDispatcher implements ContainerDispatcher, 
Auditor {
     case ReadChunk:
       auditParams.put("blockData",
           BlockID.getFromProtobuf(msg.getReadChunk().getBlockID()).toString());
+      auditParams.put("blockDataSize",
+          String.valueOf(msg.getReadChunk().getChunkData().getLen()));
       return auditParams;
 
     case DeleteChunk:
@@ -857,6 +894,8 @@ public class HddsDispatcher implements ContainerDispatcher, 
Auditor {
       auditParams.put("blockData",
           BlockID.getFromProtobuf(msg.getWriteChunk().getBlockID())
               .toString());
+      auditParams.put("blockDataSize",
+          String.valueOf(msg.getWriteChunk().getChunkData().getLen()));
       return auditParams;
 
     case ListChunk:
@@ -873,6 +912,8 @@ public class HddsDispatcher implements ContainerDispatcher, 
Auditor {
         auditParams.put("blockData",
             BlockData.getFromProtoBuf(msg.getPutSmallFile()
                 .getBlock().getBlockData()).toString());
+        auditParams.put("blockDataSize",
+            String.valueOf(msg.getPutSmallFile().getChunkInfo().getLen()));
       } catch (IOException ex) {
         if (LOG.isTraceEnabled()) {
           LOG.trace("Encountered error parsing BlockData from protobuf: "
@@ -904,4 +945,7 @@ public class HddsDispatcher implements ContainerDispatcher, 
Auditor {
 
   }
 
+  private boolean isOperationSlow(long opLatencyMs) {
+    return opLatencyMs >= slowOpThresholdMs;
+  }
 }
diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
index 1501f862cf..c9d6672ee8 100644
--- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
@@ -782,6 +782,7 @@ public class KeyValueHandler extends Handler {
       WriteChunkRequestProto writeChunk = request.getWriteChunk();
       BlockID blockID = BlockID.getFromProtobuf(writeChunk.getBlockID());
       ContainerProtos.ChunkInfo chunkInfoProto = writeChunk.getChunkData();
+
       ChunkInfo chunkInfo = ChunkInfo.getFromProtoBuf(chunkInfoProto);
       Preconditions.checkNotNull(chunkInfo);
 
diff --git 
a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditLogger.java
 
b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditLogger.java
index 33ad708698..042887e4e5 100644
--- 
a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditLogger.java
+++ 
b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditLogger.java
@@ -49,6 +49,7 @@ public class AuditLogger {
   private static final Marker WRITE_MARKER = AuditMarker.WRITE.getMarker();
   private static final Marker READ_MARKER = AuditMarker.READ.getMarker();
   private static final Marker AUTH_MARKER = AuditMarker.AUTH.getMarker();
+  private static final Marker PERFORMANCE = 
AuditMarker.PERFORMANCE.getMarker();
   private final AtomicReference<Set<String>> debugCmdSetRef =
       new AtomicReference<>(new HashSet<>());
   public static final String AUDIT_LOG_DEBUG_CMD_LIST_PREFIX =
@@ -118,6 +119,10 @@ public class AuditLogger {
     }
   }
 
+  public void logPerformance(AuditMessage msg) {
+    this.logger.logIfEnabled(FQCN, Level.INFO, PERFORMANCE, msg, null);
+  }
+
   public void refreshDebugCmdSet() {
     OzoneConfiguration conf = new OzoneConfiguration();
     refreshDebugCmdSet(conf);
@@ -161,6 +166,22 @@ public class AuditLogger {
       append("opLatencyMs", TimeUnit.NANOSECONDS.toMillis(nanos));
     }
 
+    /**
+     * Appends pre-operation operation latency in milliseconds.
+     * @param millis Latency in nanoseconds.
+     */
+    public void appendPreOpLatencyMs(long millis) {
+      append("preOpLatencyMs", millis);
+    }
+
+    /**
+     * Appends whole operation latency in milliseconds.
+     * @param millis Latency in milliseconds.
+     */
+    public void appendOpLatencyMs(long millis) {
+      append("opLatencyMs", millis);
+    }
+
     /**
      * Appends the size in bytes.
      * @param bytes Size in bytes.
diff --git 
a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditMarker.java
 
b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditMarker.java
index d9aed2bb65..2e991a6ea2 100644
--- 
a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditMarker.java
+++ 
b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/ozone/audit/AuditMarker.java
@@ -25,7 +25,8 @@ import org.apache.logging.log4j.MarkerManager;
 public enum AuditMarker {
   WRITE(MarkerManager.getMarker("WRITE")),
   READ(MarkerManager.getMarker("READ")),
-  AUTH(MarkerManager.getMarker("AUTH")),;
+  AUTH(MarkerManager.getMarker("AUTH")),
+  PERFORMANCE(MarkerManager.getMarker("PERFORMANCE"));
 
   private Marker marker;
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to