This is an automated email from the ASF dual-hosted git repository.

erose pushed a commit to branch HDDS-10239-container-reconciliation
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to 
refs/heads/HDDS-10239-container-reconciliation by this push:
     new bde10ab836 HDDS-10373. Implement framework for capturing Merkle Tree 
Metrics. (#6864)
bde10ab836 is described below

commit bde10ab836930850c41487edfc851b68cdbc1e83
Author: Aswin Shakil Balasubramanian <[email protected]>
AuthorDate: Wed Jul 24 14:10:21 2024 -0700

    HDDS-10373. Implement framework for capturing Merkle Tree Metrics. (#6864)
---
 .../checksum/ContainerChecksumTreeManager.java     | 40 +++++++----
 .../checksum/ContainerMerkleTreeMetrics.java       | 77 ++++++++++++++++++++++
 .../checksum/TestContainerChecksumTreeManager.java | 46 ++++++++++++-
 3 files changed, 150 insertions(+), 13 deletions(-)

diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java
index 939c6d08b3..46dc4aa0ba 100644
--- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java
@@ -16,6 +16,7 @@
  */
 package org.apache.hadoop.ozone.container.checksum;
 
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
 import 
org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration;
 import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
@@ -34,6 +35,8 @@ import org.apache.hadoop.hdds.utils.SimpleStriped;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static org.apache.hadoop.util.MetricUtil.captureLatencyNs;
+
 /**
  * This class coordinates reading and writing Container checksum information 
for all containers.
  */
@@ -44,12 +47,15 @@ public class ContainerChecksumTreeManager {
   // Used to coordinate reads and writes to each container's checksum file.
   // Each container ID is mapped to a stripe.
   private final Striped<ReadWriteLock> fileLock;
+  private final ContainerMerkleTreeMetrics metrics;
 
   /**
    * Creates one instance that should be used to coordinate all container 
checksum info within a datanode.
    */
   public ContainerChecksumTreeManager(DatanodeConfiguration dnConf) {
     fileLock = 
SimpleStriped.readWriteLock(dnConf.getContainerChecksumLockStripes(), true);
+    // TODO: TO unregister metrics on stop.
+    metrics = ContainerMerkleTreeMetrics.create();
   }
 
   /**
@@ -63,7 +69,7 @@ public class ContainerChecksumTreeManager {
     writeLock.lock();
     try {
       ContainerProtos.ContainerChecksumInfo newChecksumInfo = 
read(data).toBuilder()
-          .setContainerMerkleTree(tree.toProto())
+          
.setContainerMerkleTree(captureLatencyNs(metrics.getCreateMerkleTreeLatencyNS(),
 tree::toProto))
           .build();
       write(data, newChecksumInfo);
       LOG.debug("Data merkle tree for container {} updated", 
data.getContainerID());
@@ -99,7 +105,7 @@ public class ContainerChecksumTreeManager {
     }
   }
 
-  public ContainerDiff diff(KeyValueContainerData thisContainer, File 
otherContainerTree)
+  public ContainerDiff diff(KeyValueContainerData thisContainer, 
ContainerProtos.ContainerChecksumInfo otherInfo)
       throws IOException {
     // TODO HDDS-10928 compare the checksum info of the two containers and 
return a summary.
     //  Callers can act on this summary to repair their container replica 
using the peer's replica.
@@ -107,13 +113,6 @@ public class ContainerChecksumTreeManager {
     return new ContainerDiff();
   }
 
-  /**
-   * Returns the container checksum tree file for the specified container 
without deserializing it.
-   */
-  public File getContainerChecksumFile(KeyValueContainerData data) {
-    return new File(data.getMetadataPath(), data.getContainerID() + ".tree");
-  }
-
   private Lock getReadLock(long containerID) {
     return fileLock.get(containerID).readLock();
   }
@@ -139,8 +138,13 @@ public class ContainerChecksumTreeManager {
             .build();
       }
       try (FileInputStream inStream = new FileInputStream(checksumFile)) {
-        return ContainerProtos.ContainerChecksumInfo.parseFrom(inStream);
+        return captureLatencyNs(metrics.getReadContainerMerkleTreeLatencyNS(),
+            () -> ContainerProtos.ContainerChecksumInfo.parseFrom(inStream));
       }
+    } catch (IOException ex) {
+      metrics.incrementMerkleTreeReadFailures();
+      throw new IOException("Error occurred when reading container merkle tree 
for containerID "
+              + data.getContainerID(), ex);
     } finally {
       readLock.unlock();
     }
@@ -151,12 +155,26 @@ public class ContainerChecksumTreeManager {
     Lock writeLock = getWriteLock(data.getContainerID());
     writeLock.lock();
     try (FileOutputStream outStream = new 
FileOutputStream(getContainerChecksumFile(data))) {
-      checksumInfo.writeTo(outStream);
+      captureLatencyNs(metrics.getWriteContainerMerkleTreeLatencyNS(),
+          () -> checksumInfo.writeTo(outStream));
+    } catch (IOException ex) {
+      metrics.incrementMerkleTreeWriteFailures();
+      throw new IOException("Error occurred when writing container merkle tree 
for containerID "
+          + data.getContainerID(), ex);
     } finally {
       writeLock.unlock();
     }
   }
 
+  public File getContainerChecksumFile(KeyValueContainerData data) {
+    return new File(data.getMetadataPath(), data.getContainerID() + ".tree");
+  }
+
+  @VisibleForTesting
+  public ContainerMerkleTreeMetrics getMetrics() {
+    return this.metrics;
+  }
+
   /**
    * This class represents the difference between our replica of a container 
and a peer's replica of a container.
    * It summarizes the operations we need to do to reconcile our replica with 
the peer replica it was compared to.
diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeMetrics.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeMetrics.java
new file mode 100644
index 0000000000..a288e15f6b
--- /dev/null
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeMetrics.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ozone.container.checksum;
+
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.MutableCounterLong;
+import org.apache.hadoop.metrics2.lib.MutableRate;
+
+/**
+ * Class to collect metrics related to container merkle tree.
+ */
+public class ContainerMerkleTreeMetrics {
+  private static final String METRICS_SOURCE_NAME = 
ContainerMerkleTreeMetrics.class.getSimpleName();
+
+  public static ContainerMerkleTreeMetrics create() {
+    MetricsSystem ms = DefaultMetricsSystem.instance();
+    return ms.register(METRICS_SOURCE_NAME, "Container Merkle Tree Metrics",
+        new ContainerMerkleTreeMetrics());
+  }
+
+  public void unregister() {
+    MetricsSystem ms = DefaultMetricsSystem.instance();
+    ms.unregisterSource(METRICS_SOURCE_NAME);
+  }
+
+  @Metric(about = "Number of Merkle tree write failure")
+  private MutableCounterLong numMerkleTreeWriteFailure;
+
+  @Metric(about = "Number of Merkle tree read failure")
+  private MutableCounterLong numMerkleTreeReadFailure;
+
+  @Metric(about = "Merkle tree write latency")
+  private MutableRate merkleTreeWriteLatencyNS;
+
+  @Metric(about = "Merkle tree read latency")
+  private MutableRate merkleTreeReadLatencyNS;
+
+  @Metric(about = "Merkle tree creation latency")
+  private MutableRate merkleTreeCreateLatencyNS;
+
+  public void incrementMerkleTreeWriteFailures() {
+    this.numMerkleTreeWriteFailure.incr();
+  }
+
+  public void incrementMerkleTreeReadFailures() {
+    this.numMerkleTreeReadFailure.incr();
+  }
+
+  public MutableRate getWriteContainerMerkleTreeLatencyNS() {
+    return this.merkleTreeWriteLatencyNS;
+  }
+
+  public MutableRate getReadContainerMerkleTreeLatencyNS() {
+    return this.merkleTreeReadLatencyNS;
+  }
+
+  public MutableRate getCreateMerkleTreeLatencyNS() {
+    return this.merkleTreeCreateLatencyNS;
+  }
+}
diff --git 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java
 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java
index 767eed8a73..56a5dbfd55 100644
--- 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java
+++ 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java
@@ -47,6 +47,7 @@ class TestContainerChecksumTreeManager {
   private KeyValueContainerData container;
   private File checksumFile;
   private ContainerChecksumTreeManager checksumManager;
+  private ContainerMerkleTreeMetrics metrics;
 
   @BeforeEach
   public void init() {
@@ -55,11 +56,17 @@ class TestContainerChecksumTreeManager {
     when(container.getMetadataPath()).thenReturn(testDir.getAbsolutePath());
     checksumFile = new File(testDir, CONTAINER_ID + ".tree");
     checksumManager = new ContainerChecksumTreeManager(new 
DatanodeConfiguration());
+    metrics = checksumManager.getMetrics();
   }
 
   @Test
   public void testWriteEmptyTreeToFile() throws Exception {
+    
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
+    assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0);
     checksumManager.writeContainerDataTree(container, new 
ContainerMerkleTree());
+    
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 
0);
+    assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0);
+
     ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
 
     assertEquals(CONTAINER_ID, checksumInfo.getContainerID());
@@ -71,7 +78,10 @@ class TestContainerChecksumTreeManager {
 
   @Test
   public void testWriteEmptyBlockListToFile() throws Exception {
+    
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
     checksumManager.markBlocksAsDeleted(container, new TreeSet<>());
+    
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 
0);
+
     ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
 
     assertEquals(CONTAINER_ID, checksumInfo.getContainerID());
@@ -83,11 +93,14 @@ class TestContainerChecksumTreeManager {
 
   @Test
   public void testWriteOnlyTreeToFile() throws Exception {
+    
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
+    assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0);
     ContainerMerkleTree tree = buildTestTree();
     checksumManager.writeContainerDataTree(container, tree);
-
+    
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 
0);
     ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
 
+    assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0);
     assertEquals(CONTAINER_ID, checksumInfo.getContainerID());
     assertTrue(checksumInfo.getDeletedBlocksList().isEmpty());
     // TestContainerMerkleTree verifies that going from ContainerMerkleTree to 
its proto is consistent.
@@ -97,8 +110,10 @@ class TestContainerChecksumTreeManager {
 
   @Test
   public void testWriteOnlyDeletedBlocksToFile() throws Exception {
+    
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
     List<Long> expectedBlocksToDelete = Arrays.asList(1L, 2L, 3L);
     checksumManager.markBlocksAsDeleted(container, new 
TreeSet<>(expectedBlocksToDelete));
+    assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().changed());
 
     ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
 
@@ -111,13 +126,20 @@ class TestContainerChecksumTreeManager {
 
   @Test
   public void testDeletedBlocksPreservedOnTreeWrite() throws Exception {
+    
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
+    assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0);
+    
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
     List<Long> expectedBlocksToDelete = Arrays.asList(1L, 2L, 3L);
     checksumManager.markBlocksAsDeleted(container, new 
TreeSet<>(expectedBlocksToDelete));
+    
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
     ContainerMerkleTree tree = buildTestTree();
     checksumManager.writeContainerDataTree(container, tree);
+    
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 
0);
+    
assertTrue(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total() > 
0);
 
     ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
 
+    assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0);
     assertEquals(CONTAINER_ID, checksumInfo.getContainerID());
     assertEquals(expectedBlocksToDelete, checksumInfo.getDeletedBlocksList());
     assertTreesSortedAndMatch(tree.toProto(), 
checksumInfo.getContainerMerkleTree());
@@ -125,21 +147,41 @@ class TestContainerChecksumTreeManager {
 
   @Test
   public void testTreePreservedOnDeletedBlocksWrite() throws Exception {
+    
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
+    assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0);
+    
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
     ContainerMerkleTree tree = buildTestTree();
     checksumManager.writeContainerDataTree(container, tree);
+    
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
     List<Long> expectedBlocksToDelete = Arrays.asList(1L, 2L, 3L);
     checksumManager.markBlocksAsDeleted(container, new 
TreeSet<>(expectedBlocksToDelete));
+    
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 
0);
+    
assertTrue(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total() > 
0);
 
     ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
 
+    assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0);
     assertEquals(CONTAINER_ID, checksumInfo.getContainerID());
     assertEquals(expectedBlocksToDelete, checksumInfo.getDeletedBlocksList());
     assertTreesSortedAndMatch(tree.toProto(), 
checksumInfo.getContainerMerkleTree());
   }
 
+  @Test
+  public void testReadContainerMerkleTreeMetric() throws Exception {
+    
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
+    
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
+    ContainerMerkleTree tree = buildTestTree();
+    checksumManager.writeContainerDataTree(container, tree);
+    
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 
0);
+    checksumManager.writeContainerDataTree(container, tree);
+    
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 
0);
+    
assertTrue(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total() > 
0);
+  }
+
   @Test
   public void testChecksumTreeFilePath() {
-    assertEquals(checksumFile.getAbsolutePath(), 
checksumManager.getContainerChecksumFile(container).getAbsolutePath());
+    assertEquals(checksumFile.getAbsolutePath(),
+        checksumManager.getContainerChecksumFile(container).getAbsolutePath());
   }
 
   private ContainerMerkleTree buildTestTree() throws Exception {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to