errose28 commented on code in PR #7293:
URL: https://github.com/apache/ozone/pull/7293#discussion_r1833088126
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java:
##########
@@ -245,11 +361,47 @@ public static boolean checksumFileExist(Container
container) {
* This class represents the difference between our replica of a container
and a peer's replica of a container.
* It summarizes the operations we need to do to reconcile our replica with
the peer replica it was compared to.
*
- * TODO HDDS-10928
*/
public static class ContainerDiff {
+ private final List<ContainerProtos.BlockMerkleTree> missingBlocks;
+ private final SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
missingChunks;
+ private final SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
corruptChunks;
+
public ContainerDiff() {
+ this.missingBlocks = new ArrayList<>();
+ this.missingChunks = new TreeMap<>();
+ this.corruptChunks = new TreeMap<>();
+ }
+
+ public void addMissingBlock(ContainerProtos.BlockMerkleTree
missingBlockMerkleTree) {
+ this.missingBlocks.add(missingBlockMerkleTree);
+ }
+ public void addMissingChunk(long blockId, ContainerProtos.ChunkMerkleTree
missingChunkMerkleTree) {
+ this.missingChunks.computeIfAbsent(blockId, any -> new
ArrayList<>()).add(missingChunkMerkleTree);
+ }
+
+ public void addCorruptChunks(long blockId, ContainerProtos.ChunkMerkleTree
corruptChunk) {
+ this.corruptChunks.computeIfAbsent(blockId, any -> new
ArrayList<>()).add(corruptChunk);
+ }
+
+ public List<ContainerProtos.BlockMerkleTree> getMissingBlocks() {
+ return missingBlocks;
+ }
+
+ public SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
getMissingChunks() {
+ return missingChunks;
+ }
+
+ public SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
getCorruptChunks() {
+ return corruptChunks;
+ }
+
+ public boolean isHealthy() {
Review Comment:
I would add javadoc to this method clarifying that `isHealthy = true` does
not mean that the trees match. It means that we don't need to make
modifications to our replica. The peer's replica may have corruption or missing
chunks itself.
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java:
##########
@@ -245,11 +361,47 @@ public static boolean checksumFileExist(Container
container) {
* This class represents the difference between our replica of a container
and a peer's replica of a container.
* It summarizes the operations we need to do to reconcile our replica with
the peer replica it was compared to.
*
- * TODO HDDS-10928
*/
public static class ContainerDiff {
+ private final List<ContainerProtos.BlockMerkleTree> missingBlocks;
+ private final SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
missingChunks;
+ private final SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
corruptChunks;
Review Comment:
Do these need to be sorted? The caller will use the IDs to pull from the
peer.
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java:
##########
@@ -143,12 +147,124 @@ public void markBlocksAsDeleted(KeyValueContainerData
data, Collection<Long> del
}
}
- public ContainerDiff diff(KeyValueContainerData thisContainer,
ContainerProtos.ContainerChecksumInfo otherInfo)
+ public ContainerDiff diff(KeyValueContainerData thisContainer,
ContainerProtos.ContainerChecksumInfo peerChecksumInfo)
throws IOException {
- // TODO HDDS-10928 compare the checksum info of the two containers and
return a summary.
- // Callers can act on this summary to repair their container replica
using the peer's replica.
- // This method will use the read lock, which is unused in the current
implementation.
- return new ContainerDiff();
+ Preconditions.assertNotNull(thisContainer, "Container data is null");
+ Preconditions.assertNotNull(peerChecksumInfo, "Peer checksum info is
null");
+ Optional<ContainerProtos.ContainerChecksumInfo.Builder>
thisContainerChecksumInfoBuilder =
+ read(thisContainer);
+ if (!thisContainerChecksumInfoBuilder.isPresent()) {
+ throw new IOException("The container #" + thisContainer.getContainerID()
+ " doesn't have container checksum");
+ }
+
+ if (thisContainer.getContainerID() != peerChecksumInfo.getContainerID()) {
+ throw new IOException("Container Id does not match");
Review Comment:
Lets add the container IDs to the exception message.
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java:
##########
@@ -245,11 +361,47 @@ public static boolean checksumFileExist(Container
container) {
* This class represents the difference between our replica of a container
and a peer's replica of a container.
* It summarizes the operations we need to do to reconcile our replica with
the peer replica it was compared to.
*
- * TODO HDDS-10928
*/
public static class ContainerDiff {
+ private final List<ContainerProtos.BlockMerkleTree> missingBlocks;
+ private final SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
missingChunks;
+ private final SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
corruptChunks;
+
public ContainerDiff() {
+ this.missingBlocks = new ArrayList<>();
+ this.missingChunks = new TreeMap<>();
+ this.corruptChunks = new TreeMap<>();
+ }
+
+ public void addMissingBlock(ContainerProtos.BlockMerkleTree
missingBlockMerkleTree) {
+ this.missingBlocks.add(missingBlockMerkleTree);
+ }
+ public void addMissingChunk(long blockId, ContainerProtos.ChunkMerkleTree
missingChunkMerkleTree) {
+ this.missingChunks.computeIfAbsent(blockId, any -> new
ArrayList<>()).add(missingChunkMerkleTree);
+ }
+
+ public void addCorruptChunks(long blockId, ContainerProtos.ChunkMerkleTree
corruptChunk) {
+ this.corruptChunks.computeIfAbsent(blockId, any -> new
ArrayList<>()).add(corruptChunk);
+ }
+
+ public List<ContainerProtos.BlockMerkleTree> getMissingBlocks() {
+ return missingBlocks;
+ }
+
+ public SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
getMissingChunks() {
+ return missingChunks;
+ }
+
+ public SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
getCorruptChunks() {
+ return corruptChunks;
+ }
+
+ public boolean isHealthy() {
+ if (missingBlocks.isEmpty() && missingChunks.isEmpty() &&
corruptChunks.isEmpty()) {
+ return true;
+ }
+ return false;
Review Comment:
```suggestion
return missingBlocks.isEmpty() && missingChunks.isEmpty() &&
corruptChunks.isEmpty();
```
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java:
##########
@@ -143,12 +147,124 @@ public void markBlocksAsDeleted(KeyValueContainerData
data, Collection<Long> del
}
}
- public ContainerDiff diff(KeyValueContainerData thisContainer,
ContainerProtos.ContainerChecksumInfo otherInfo)
+ public ContainerDiff diff(KeyValueContainerData thisContainer,
ContainerProtos.ContainerChecksumInfo peerChecksumInfo)
throws IOException {
- // TODO HDDS-10928 compare the checksum info of the two containers and
return a summary.
- // Callers can act on this summary to repair their container replica
using the peer's replica.
- // This method will use the read lock, which is unused in the current
implementation.
- return new ContainerDiff();
+ Preconditions.assertNotNull(thisContainer, "Container data is null");
+ Preconditions.assertNotNull(peerChecksumInfo, "Peer checksum info is
null");
+ Optional<ContainerProtos.ContainerChecksumInfo.Builder>
thisContainerChecksumInfoBuilder =
+ read(thisContainer);
+ if (!thisContainerChecksumInfoBuilder.isPresent()) {
+ throw new IOException("The container #" + thisContainer.getContainerID()
+ " doesn't have container checksum");
+ }
+
+ if (thisContainer.getContainerID() != peerChecksumInfo.getContainerID()) {
+ throw new IOException("Container Id does not match");
+ }
+
+ ContainerProtos.ContainerChecksumInfo thisChecksumInfo =
thisContainerChecksumInfoBuilder.get().build();
+
+ ContainerProtos.ContainerMerkleTree thisMerkleTree =
thisChecksumInfo.getContainerMerkleTree();
+ ContainerProtos.ContainerMerkleTree peerMerkleTree =
peerChecksumInfo.getContainerMerkleTree();
+
+ return compareContainerMerkleTree(thisMerkleTree, peerMerkleTree);
+ }
+
+ private ContainerDiff
compareContainerMerkleTree(ContainerProtos.ContainerMerkleTree thisMerkleTree,
+
ContainerProtos.ContainerMerkleTree peerMerkleTree) {
+
+ ContainerDiff report = new ContainerDiff();
+ if (thisMerkleTree.getDataChecksum() == peerMerkleTree.getDataChecksum()) {
+ return new ContainerDiff();
+ }
+
+ List<ContainerProtos.BlockMerkleTree> thisBlockMerkleTreeList =
thisMerkleTree.getBlockMerkleTreeList();
+ List<ContainerProtos.BlockMerkleTree> peerBlockMerkleTreeList =
peerMerkleTree.getBlockMerkleTreeList();
+ // Since we are reconciling our container with a peer, We only need to go
through the peer's block list
+ int peerIdx = 0, thisIdx = 0;
+ while (peerIdx < peerBlockMerkleTreeList.size() || thisIdx <
thisBlockMerkleTreeList.size()) {
+ ContainerProtos.BlockMerkleTree thisBlockMerkleTree = thisIdx <
thisBlockMerkleTreeList.size() ?
+ thisBlockMerkleTreeList.get(thisIdx) : null;
+ ContainerProtos.BlockMerkleTree peerBlockMerkleTree = peerIdx <
peerBlockMerkleTreeList.size() ?
+ peerBlockMerkleTreeList.get(peerIdx) : null;
+
+ // We have checked all the peer blocks, So we can return the
ContainerDiff report
+ if (peerBlockMerkleTree == null) {
+ return report;
+ }
+
+ // peerBlockMerkleTree is not null, but thisBlockMerkleTree is null.
Means we have missing blocks.
+ if (thisBlockMerkleTree == null) {
+ report.addMissingBlock(peerBlockMerkleTree);
+ peerIdx++;
+ continue;
+ }
+
+ // BlockId matches for both thisBlockMerkleTree and peerBlockMerkleTree
+ if (thisBlockMerkleTree.getBlockID() ==
peerBlockMerkleTree.getBlockID()) {
+ if (thisBlockMerkleTree.getBlockChecksum() !=
peerBlockMerkleTree.getBlockChecksum()) {
+ compareBlockMerkleTree(report, thisBlockMerkleTree,
peerBlockMerkleTree);
+ }
+ peerIdx++;
+ thisIdx++;
+ } else {
+ if (peerBlockMerkleTree.getBlockID() >
thisBlockMerkleTree.getBlockID()) {
+ thisIdx++;
+ } else {
+ report.addMissingBlock(peerBlockMerkleTree);
+ peerIdx++;
+ }
+ }
+ }
+ return report;
+ }
+
+ private void compareBlockMerkleTree(ContainerDiff report,
ContainerProtos.BlockMerkleTree thisBlockMerkleTree,
+ ContainerProtos.BlockMerkleTree
peerBlockMerkleTree) {
+
+ List<ContainerProtos.ChunkMerkleTree> thisChunkMerkleTreeList =
thisBlockMerkleTree.getChunkMerkleTreeList();
+ List<ContainerProtos.ChunkMerkleTree> peerChunkMerkleTreeList =
peerBlockMerkleTree.getChunkMerkleTreeList();
+
+ // Since we are reconciling our container with a peer, We only need to go
through the peer's chunk list
+ int peerIdx = 0, thisIdx = 0;
+ while (peerIdx < peerChunkMerkleTreeList.size() || thisIdx <
thisChunkMerkleTreeList.size()) {
Review Comment:
Can simplify this to the more "traditional" way to merge two sorted lists?
This will help future developers recognize what is happening here.
IME there is usually one while loop handling the case where the lists
interleave. This is followed by two more loops to handle the remainder of
either list. This differs from what is here where there is one loop and null is
being used as a placeholder to handle the list remainders.
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java:
##########
@@ -143,12 +148,81 @@ public void markBlocksAsDeleted(KeyValueContainerData
data, Collection<Long> del
}
}
- public ContainerDiff diff(KeyValueContainerData thisContainer,
ContainerProtos.ContainerChecksumInfo otherInfo)
+ public ContainerDiff diff(KeyValueContainerData thisContainer,
ContainerProtos.ContainerChecksumInfo peerChecksumInfo)
Review Comment:
In addition to a diff time metric, it would be good to have counters for
number of diffs that indicated repairs/no repairs were needed on our tree.
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java:
##########
@@ -245,11 +361,47 @@ public static boolean checksumFileExist(Container
container) {
* This class represents the difference between our replica of a container
and a peer's replica of a container.
* It summarizes the operations we need to do to reconcile our replica with
the peer replica it was compared to.
*
- * TODO HDDS-10928
*/
public static class ContainerDiff {
+ private final List<ContainerProtos.BlockMerkleTree> missingBlocks;
+ private final SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
missingChunks;
+ private final SortedMap<Long, List<ContainerProtos.ChunkMerkleTree>>
corruptChunks;
+
public ContainerDiff() {
+ this.missingBlocks = new ArrayList<>();
+ this.missingChunks = new TreeMap<>();
+ this.corruptChunks = new TreeMap<>();
+ }
+
+ public void addMissingBlock(ContainerProtos.BlockMerkleTree
missingBlockMerkleTree) {
+ this.missingBlocks.add(missingBlockMerkleTree);
+ }
+ public void addMissingChunk(long blockId, ContainerProtos.ChunkMerkleTree
missingChunkMerkleTree) {
+ this.missingChunks.computeIfAbsent(blockId, any -> new
ArrayList<>()).add(missingChunkMerkleTree);
+ }
+
+ public void addCorruptChunks(long blockId, ContainerProtos.ChunkMerkleTree
corruptChunk) {
Review Comment:
```suggestion
public void addCorruptChunk(long blockId,
ContainerProtos.ChunkMerkleTree corruptChunk) {
```
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java:
##########
@@ -143,12 +147,124 @@ public void markBlocksAsDeleted(KeyValueContainerData
data, Collection<Long> del
}
}
- public ContainerDiff diff(KeyValueContainerData thisContainer,
ContainerProtos.ContainerChecksumInfo otherInfo)
+ public ContainerDiff diff(KeyValueContainerData thisContainer,
ContainerProtos.ContainerChecksumInfo peerChecksumInfo)
throws IOException {
- // TODO HDDS-10928 compare the checksum info of the two containers and
return a summary.
- // Callers can act on this summary to repair their container replica
using the peer's replica.
- // This method will use the read lock, which is unused in the current
implementation.
- return new ContainerDiff();
+ Preconditions.assertNotNull(thisContainer, "Container data is null");
+ Preconditions.assertNotNull(peerChecksumInfo, "Peer checksum info is
null");
+ Optional<ContainerProtos.ContainerChecksumInfo.Builder>
thisContainerChecksumInfoBuilder =
+ read(thisContainer);
+ if (!thisContainerChecksumInfoBuilder.isPresent()) {
+ throw new IOException("The container #" + thisContainer.getContainerID()
+ " doesn't have container checksum");
+ }
+
+ if (thisContainer.getContainerID() != peerChecksumInfo.getContainerID()) {
+ throw new IOException("Container Id does not match");
+ }
+
+ ContainerProtos.ContainerChecksumInfo thisChecksumInfo =
thisContainerChecksumInfoBuilder.get().build();
+
+ ContainerProtos.ContainerMerkleTree thisMerkleTree =
thisChecksumInfo.getContainerMerkleTree();
+ ContainerProtos.ContainerMerkleTree peerMerkleTree =
peerChecksumInfo.getContainerMerkleTree();
+
+ return compareContainerMerkleTree(thisMerkleTree, peerMerkleTree);
Review Comment:
We need to use the deleted block lists in the comparison as well. They were
removed when we just pulled the tree out of the checksum info.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]