This is an automated email from the ASF dual-hosted git repository.
erose pushed a commit to branch HDDS-10239-container-reconciliation
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to
refs/heads/HDDS-10239-container-reconciliation by this push:
new bde10ab836 HDDS-10373. Implement framework for capturing Merkle Tree
Metrics. (#6864)
bde10ab836 is described below
commit bde10ab836930850c41487edfc851b68cdbc1e83
Author: Aswin Shakil Balasubramanian <[email protected]>
AuthorDate: Wed Jul 24 14:10:21 2024 -0700
HDDS-10373. Implement framework for capturing Merkle Tree Metrics. (#6864)
---
.../checksum/ContainerChecksumTreeManager.java | 40 +++++++----
.../checksum/ContainerMerkleTreeMetrics.java | 77 ++++++++++++++++++++++
.../checksum/TestContainerChecksumTreeManager.java | 46 ++++++++++++-
3 files changed, 150 insertions(+), 13 deletions(-)
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java
index 939c6d08b3..46dc4aa0ba 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java
@@ -16,6 +16,7 @@
*/
package org.apache.hadoop.ozone.container.checksum;
+import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
import
org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
@@ -34,6 +35,8 @@ import org.apache.hadoop.hdds.utils.SimpleStriped;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static org.apache.hadoop.util.MetricUtil.captureLatencyNs;
+
/**
* This class coordinates reading and writing Container checksum information
for all containers.
*/
@@ -44,12 +47,15 @@ public class ContainerChecksumTreeManager {
// Used to coordinate reads and writes to each container's checksum file.
// Each container ID is mapped to a stripe.
private final Striped<ReadWriteLock> fileLock;
+ private final ContainerMerkleTreeMetrics metrics;
/**
* Creates one instance that should be used to coordinate all container
checksum info within a datanode.
*/
public ContainerChecksumTreeManager(DatanodeConfiguration dnConf) {
fileLock =
SimpleStriped.readWriteLock(dnConf.getContainerChecksumLockStripes(), true);
+ // TODO: TO unregister metrics on stop.
+ metrics = ContainerMerkleTreeMetrics.create();
}
/**
@@ -63,7 +69,7 @@ public class ContainerChecksumTreeManager {
writeLock.lock();
try {
ContainerProtos.ContainerChecksumInfo newChecksumInfo =
read(data).toBuilder()
- .setContainerMerkleTree(tree.toProto())
+
.setContainerMerkleTree(captureLatencyNs(metrics.getCreateMerkleTreeLatencyNS(),
tree::toProto))
.build();
write(data, newChecksumInfo);
LOG.debug("Data merkle tree for container {} updated",
data.getContainerID());
@@ -99,7 +105,7 @@ public class ContainerChecksumTreeManager {
}
}
- public ContainerDiff diff(KeyValueContainerData thisContainer, File
otherContainerTree)
+ public ContainerDiff diff(KeyValueContainerData thisContainer,
ContainerProtos.ContainerChecksumInfo otherInfo)
throws IOException {
// TODO HDDS-10928 compare the checksum info of the two containers and
return a summary.
// Callers can act on this summary to repair their container replica
using the peer's replica.
@@ -107,13 +113,6 @@ public class ContainerChecksumTreeManager {
return new ContainerDiff();
}
- /**
- * Returns the container checksum tree file for the specified container
without deserializing it.
- */
- public File getContainerChecksumFile(KeyValueContainerData data) {
- return new File(data.getMetadataPath(), data.getContainerID() + ".tree");
- }
-
private Lock getReadLock(long containerID) {
return fileLock.get(containerID).readLock();
}
@@ -139,8 +138,13 @@ public class ContainerChecksumTreeManager {
.build();
}
try (FileInputStream inStream = new FileInputStream(checksumFile)) {
- return ContainerProtos.ContainerChecksumInfo.parseFrom(inStream);
+ return captureLatencyNs(metrics.getReadContainerMerkleTreeLatencyNS(),
+ () -> ContainerProtos.ContainerChecksumInfo.parseFrom(inStream));
}
+ } catch (IOException ex) {
+ metrics.incrementMerkleTreeReadFailures();
+ throw new IOException("Error occurred when reading container merkle tree
for containerID "
+ + data.getContainerID(), ex);
} finally {
readLock.unlock();
}
@@ -151,12 +155,26 @@ public class ContainerChecksumTreeManager {
Lock writeLock = getWriteLock(data.getContainerID());
writeLock.lock();
try (FileOutputStream outStream = new
FileOutputStream(getContainerChecksumFile(data))) {
- checksumInfo.writeTo(outStream);
+ captureLatencyNs(metrics.getWriteContainerMerkleTreeLatencyNS(),
+ () -> checksumInfo.writeTo(outStream));
+ } catch (IOException ex) {
+ metrics.incrementMerkleTreeWriteFailures();
+ throw new IOException("Error occurred when writing container merkle tree
for containerID "
+ + data.getContainerID(), ex);
} finally {
writeLock.unlock();
}
}
+ public File getContainerChecksumFile(KeyValueContainerData data) {
+ return new File(data.getMetadataPath(), data.getContainerID() + ".tree");
+ }
+
+ @VisibleForTesting
+ public ContainerMerkleTreeMetrics getMetrics() {
+ return this.metrics;
+ }
+
/**
* This class represents the difference between our replica of a container
and a peer's replica of a container.
* It summarizes the operations we need to do to reconcile our replica with
the peer replica it was compared to.
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeMetrics.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeMetrics.java
new file mode 100644
index 0000000000..a288e15f6b
--- /dev/null
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeMetrics.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ozone.container.checksum;
+
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.MutableCounterLong;
+import org.apache.hadoop.metrics2.lib.MutableRate;
+
+/**
+ * Class to collect metrics related to container merkle tree.
+ */
+public class ContainerMerkleTreeMetrics {
+ private static final String METRICS_SOURCE_NAME =
ContainerMerkleTreeMetrics.class.getSimpleName();
+
+ public static ContainerMerkleTreeMetrics create() {
+ MetricsSystem ms = DefaultMetricsSystem.instance();
+ return ms.register(METRICS_SOURCE_NAME, "Container Merkle Tree Metrics",
+ new ContainerMerkleTreeMetrics());
+ }
+
+ public void unregister() {
+ MetricsSystem ms = DefaultMetricsSystem.instance();
+ ms.unregisterSource(METRICS_SOURCE_NAME);
+ }
+
+ @Metric(about = "Number of Merkle tree write failure")
+ private MutableCounterLong numMerkleTreeWriteFailure;
+
+ @Metric(about = "Number of Merkle tree read failure")
+ private MutableCounterLong numMerkleTreeReadFailure;
+
+ @Metric(about = "Merkle tree write latency")
+ private MutableRate merkleTreeWriteLatencyNS;
+
+ @Metric(about = "Merkle tree read latency")
+ private MutableRate merkleTreeReadLatencyNS;
+
+ @Metric(about = "Merkle tree creation latency")
+ private MutableRate merkleTreeCreateLatencyNS;
+
+ public void incrementMerkleTreeWriteFailures() {
+ this.numMerkleTreeWriteFailure.incr();
+ }
+
+ public void incrementMerkleTreeReadFailures() {
+ this.numMerkleTreeReadFailure.incr();
+ }
+
+ public MutableRate getWriteContainerMerkleTreeLatencyNS() {
+ return this.merkleTreeWriteLatencyNS;
+ }
+
+ public MutableRate getReadContainerMerkleTreeLatencyNS() {
+ return this.merkleTreeReadLatencyNS;
+ }
+
+ public MutableRate getCreateMerkleTreeLatencyNS() {
+ return this.merkleTreeCreateLatencyNS;
+ }
+}
diff --git
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java
index 767eed8a73..56a5dbfd55 100644
---
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java
+++
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java
@@ -47,6 +47,7 @@ class TestContainerChecksumTreeManager {
private KeyValueContainerData container;
private File checksumFile;
private ContainerChecksumTreeManager checksumManager;
+ private ContainerMerkleTreeMetrics metrics;
@BeforeEach
public void init() {
@@ -55,11 +56,17 @@ class TestContainerChecksumTreeManager {
when(container.getMetadataPath()).thenReturn(testDir.getAbsolutePath());
checksumFile = new File(testDir, CONTAINER_ID + ".tree");
checksumManager = new ContainerChecksumTreeManager(new
DatanodeConfiguration());
+ metrics = checksumManager.getMetrics();
}
@Test
public void testWriteEmptyTreeToFile() throws Exception {
+
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(),
0);
+ assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0);
checksumManager.writeContainerDataTree(container, new
ContainerMerkleTree());
+
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() >
0);
+ assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0);
+
ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
assertEquals(CONTAINER_ID, checksumInfo.getContainerID());
@@ -71,7 +78,10 @@ class TestContainerChecksumTreeManager {
@Test
public void testWriteEmptyBlockListToFile() throws Exception {
+
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(),
0);
checksumManager.markBlocksAsDeleted(container, new TreeSet<>());
+
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() >
0);
+
ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
assertEquals(CONTAINER_ID, checksumInfo.getContainerID());
@@ -83,11 +93,14 @@ class TestContainerChecksumTreeManager {
@Test
public void testWriteOnlyTreeToFile() throws Exception {
+
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(),
0);
+ assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0);
ContainerMerkleTree tree = buildTestTree();
checksumManager.writeContainerDataTree(container, tree);
-
+
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() >
0);
ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
+ assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0);
assertEquals(CONTAINER_ID, checksumInfo.getContainerID());
assertTrue(checksumInfo.getDeletedBlocksList().isEmpty());
// TestContainerMerkleTree verifies that going from ContainerMerkleTree to
its proto is consistent.
@@ -97,8 +110,10 @@ class TestContainerChecksumTreeManager {
@Test
public void testWriteOnlyDeletedBlocksToFile() throws Exception {
+
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(),
0);
List<Long> expectedBlocksToDelete = Arrays.asList(1L, 2L, 3L);
checksumManager.markBlocksAsDeleted(container, new
TreeSet<>(expectedBlocksToDelete));
+ assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().changed());
ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
@@ -111,13 +126,20 @@ class TestContainerChecksumTreeManager {
@Test
public void testDeletedBlocksPreservedOnTreeWrite() throws Exception {
+
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(),
0);
+ assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0);
+
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(),
0);
List<Long> expectedBlocksToDelete = Arrays.asList(1L, 2L, 3L);
checksumManager.markBlocksAsDeleted(container, new
TreeSet<>(expectedBlocksToDelete));
+
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(),
0);
ContainerMerkleTree tree = buildTestTree();
checksumManager.writeContainerDataTree(container, tree);
+
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() >
0);
+
assertTrue(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total() >
0);
ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
+ assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0);
assertEquals(CONTAINER_ID, checksumInfo.getContainerID());
assertEquals(expectedBlocksToDelete, checksumInfo.getDeletedBlocksList());
assertTreesSortedAndMatch(tree.toProto(),
checksumInfo.getContainerMerkleTree());
@@ -125,21 +147,41 @@ class TestContainerChecksumTreeManager {
@Test
public void testTreePreservedOnDeletedBlocksWrite() throws Exception {
+
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(),
0);
+ assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0);
+
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(),
0);
ContainerMerkleTree tree = buildTestTree();
checksumManager.writeContainerDataTree(container, tree);
+
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(),
0);
List<Long> expectedBlocksToDelete = Arrays.asList(1L, 2L, 3L);
checksumManager.markBlocksAsDeleted(container, new
TreeSet<>(expectedBlocksToDelete));
+
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() >
0);
+
assertTrue(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total() >
0);
ContainerProtos.ContainerChecksumInfo checksumInfo = readFile();
+ assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0);
assertEquals(CONTAINER_ID, checksumInfo.getContainerID());
assertEquals(expectedBlocksToDelete, checksumInfo.getDeletedBlocksList());
assertTreesSortedAndMatch(tree.toProto(),
checksumInfo.getContainerMerkleTree());
}
+ @Test
+ public void testReadContainerMerkleTreeMetric() throws Exception {
+
assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(),
0);
+
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(),
0);
+ ContainerMerkleTree tree = buildTestTree();
+ checksumManager.writeContainerDataTree(container, tree);
+
assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(),
0);
+ checksumManager.writeContainerDataTree(container, tree);
+
assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() >
0);
+
assertTrue(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total() >
0);
+ }
+
@Test
public void testChecksumTreeFilePath() {
- assertEquals(checksumFile.getAbsolutePath(),
checksumManager.getContainerChecksumFile(container).getAbsolutePath());
+ assertEquals(checksumFile.getAbsolutePath(),
+ checksumManager.getContainerChecksumFile(container).getAbsolutePath());
}
private ContainerMerkleTree buildTestTree() throws Exception {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]