aswinshakil commented on code in PR #8402:
URL: https://github.com/apache/ozone/pull/8402#discussion_r2079392229
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java:
##########
@@ -1796,6 +1866,34 @@ private void
verifyChunksLength(ContainerProtos.ChunkInfo peerChunkInfo, Contain
}
}
+ /**
+ * If we do not have the previous chunk for the current entry, abort the
reconciliation here. Currently we do
+ * not support repairing around holes in a block, the missing chunk must be
obtained first.
+ */
+ private boolean previousChunkPresent(BlockID blockID, long chunkOffset,
+ NavigableMap<Long,
ContainerProtos.ChunkInfo> localOffset2Chunk) {
+ long localID = blockID.getLocalID();
+ long containerID = blockID.getContainerID();
+ if (chunkOffset != 0) {
Review Comment:
We can do this in reverse and reduce one level of indentation
```suggestion
if (chunkOffset == 0) {
return true;
}
```
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java:
##########
@@ -1796,6 +1866,34 @@ private void
verifyChunksLength(ContainerProtos.ChunkInfo peerChunkInfo, Contain
}
}
+ /**
+ * If we do not have the previous chunk for the current entry, abort the
reconciliation here. Currently we do
+ * not support repairing around holes in a block, the missing chunk must be
obtained first.
+ */
+ private boolean previousChunkPresent(BlockID blockID, long chunkOffset,
+ NavigableMap<Long,
ContainerProtos.ChunkInfo> localOffset2Chunk) {
+ long localID = blockID.getLocalID();
+ long containerID = blockID.getContainerID();
+ if (chunkOffset != 0) {
+ Map.Entry<Long, ContainerProtos.ChunkInfo> prevEntry =
localOffset2Chunk.lowerEntry(chunkOffset);
+ if (prevEntry == null) {
+ // We are trying to write a chunk that is not the first, but we
currently have no chunks in the block.
+ LOG.warn("Exiting reconciliation for block {} in container {} at
length {}. The previous chunk is not " +
Review Comment:
We can also include the `offset` for which we are throwing this error in the
log message.
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java:
##########
@@ -1599,119 +1696,56 @@ private ContainerProtos.ContainerChecksumInfo
updateAndGetContainerChecksum(KeyV
BlockData blockData = blockIterator.nextBlock();
List<ContainerProtos.ChunkInfo> chunkInfos = blockData.getChunks();
// TODO: Add empty blocks to the merkle tree. Done in HDDS-10374,
needs to be backported.
- merkleTree.addChunks(blockData.getLocalID(), chunkInfos);
+ // Assume all chunks are healthy when building the tree from metadata.
Scanner will identify corruption when
+ // it runs after.
+ merkleTree.addChunks(blockData.getLocalID(), true, chunkInfos);
}
}
- ContainerProtos.ContainerChecksumInfo checksumInfo = checksumManager
- .writeContainerDataTree(containerData, merkleTree);
- return checksumInfo;
+ return checksumManager.writeContainerDataTree(containerData, merkleTree);
}
/**
- * Handle missing block. It reads the missing block data from the peer
datanode and writes it to the local container.
- * If the block write fails, the block commit sequence id of the container
and the block are not updated.
+ * Read chunks from a peer datanode and use them to repair our container.
+ *
+ * We will keep pulling chunks from the peer until we encounter an error. At
that point we will stop trying to
+ * reconcile this block instead of trying to write it with holes. Whatever
data we have pulled up to that point will
Review Comment:
Are we no longer doing reconciliation after a chunk failure, even if we have
healthy chunks after the failed chunk?
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java:
##########
@@ -1753,30 +1796,57 @@ private void reconcileChunksPerBlock(KeyValueContainer
container, Pipeline pipel
chunkByteBuffer.clear();
chunkByteBuffer.limit(chunkLength);
int bytesRead = blockInputStream.read(chunkByteBuffer);
+ // Make sure we read exactly the same amount of data we expected so
it fits in the block.
if (bytesRead != chunkLength) {
- throw new IOException("Error while reading chunk data from block
input stream. Expected length: " +
+ throw new IOException("Error while reading chunk data from peer "
+ peer + ". Expected length: " +
chunkLength + ", Actual length: " + bytesRead);
}
chunkByteBuffer.flip();
ChunkBuffer chunkBuffer = ChunkBuffer.wrap(chunkByteBuffer);
+ ChunkInfo chunkInfo = ChunkInfo.getFromProtoBuf(chunkInfoProto);
+ chunkInfo.addMetadata(OzoneConsts.CHUNK_OVERWRITE, "true");
writeChunkForClosedContainer(chunkInfo, blockID, chunkBuffer,
container);
- // In reconciling missing chunks which happens at the end of the
block, we are expected to have holes in
- // the blockData's chunk list because we continue to reconcile even
if there are failures while reconciling
- // chunks which is fine as we don't update the bcsId.
- localChunksMap.put(chunkInfo.getOffset(), chunkInfoProto);
+ localOffset2Chunk.put(chunkOffset, chunkInfoProto);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Successfully ingested chunk at offset {} into block {}
of container {} from peer {}",
+ chunkOffset, localID, containerID, peer);
+ }
+ numSuccessfulChunks++;
} catch (IOException ex) {
- overwriteBcsId = false;
- LOG.error("Error while reconciling chunk {} for block {} in
container {}",
- chunkOffset, blockID, containerData.getContainerID(), ex);
+ // The peer's chunk was expected to be healthy. Log a stack trace
for more info as to why this failed.
+ LOG.error("Failed to ingest chunk at offset {} for block {} in
container {} from peer {}",
+ chunkOffset, localID, containerID, peer, ex);
+ allChunksSuccessful = false;
}
+ // Stop block repair once we fail to pull a chunk from the peer.
+ // Our write chunk API currently does not have a good way to handle
writing around holes in a block.
+ if (!allChunksSuccessful) {
+ break;
+ }
+ }
+
+ // Do not update block metadata in this container if we did not ingest
any chunks for the block.
+ if (!localOffset2Chunk.isEmpty()) {
+ BlockData putBlockData = BlockData.getFromProtoBuf(peerBlockData);
Review Comment:
If we already have a `localBlockData` we should use that instead of
replacing it with `peerBlockData`. Ideally both should be the same, but it's
better to update the existing one than to replace it with `peerBlockData`.
##########
hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeWriter.java:
##########
@@ -49,21 +49,67 @@ public class ContainerMerkleTreeWriter {
public static final Supplier<ChecksumByteBuffer> CHECKSUM_BUFFER_SUPPLIER =
ChecksumByteBufferFactory::crc32CImpl;
/**
- * Constructs an empty Container merkle tree object.
+ * Constructs a writer for an initially empty container merkle tree.
*/
public ContainerMerkleTreeWriter() {
id2Block = new TreeMap<>();
}
+ /**
+ * Constructs a writer for a container merkle tree which initially contains
all the information from the specified
+ * proto.
+ */
+ public ContainerMerkleTreeWriter(ContainerProtos.ContainerMerkleTree
fromTree) {
Review Comment:
We need to add a unit test for this in `TestContainerMerkleTreeWriter` and
check that both trees are the same.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]