[
https://issues.apache.org/jira/browse/HDFS-12914?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16844537#comment-16844537
]
Santosh Marella commented on HDFS-12914:
----------------------------------------
Thanks [~hexiaoqiao]. I realize I may not have the right permissions, which is
why I am unable to find the "Submit Patch" button. I created INFRA-18411 to
request the permissions.
In the meantime, here is the diff of my changes:
{code:java}
diff --git
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index ccd5931..78125d5 100644
---
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -2162,12 +2162,6 @@ public boolean processReport(final DatanodeID nodeID,
blockReportLeaseManager.removeLease(node);
return !node.hasStaleStorages();
}
- if (context != null) {
- if (!blockReportLeaseManager.checkLease(node, startTime,
- context.getLeaseId())) {
- return false;
- }
- }
if (storageInfo.getBlockReportCount() == 0) {
// The first block report can be processed a lot more efficiently than
@@ -2231,6 +2225,18 @@ public void removeBRLeaseIfNeeded(final DatanodeID
nodeID,
}
/**
+ * Checks if the block report lease for {@param nodeId} has expired or not.
+ *
+ * @param nodeId data node id
+ * @param leaseId lease id
+ * @return true if the lease is still good. false if it has expired.
+ * @throws UnregisteredNodeException if the data node hasn't registered yet
+ */
+ public boolean checkLease(DatanodeID nodeId, long leaseId) throws
UnregisteredNodeException {
+ return
blockReportLeaseManager.checkLease(datanodeManager.getDatanode(nodeId),
Time.monotonicNow(), leaseId);
+ }
+
+ /**
* Rescan the list of blocks which were previously postponed.
*/
void rescanPostponedMisreplicatedBlocks() {
diff --git
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockReportLeaseManager.java
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockReportLeaseManager.java
index 7db05c7..22f1728 100644
---
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockReportLeaseManager.java
+++
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockReportLeaseManager.java
@@ -236,7 +236,7 @@ public synchronized long requestLease(DatanodeDescriptor
dn) {
// The DataNode wants a new lease, even though it already has one.
// This can happen if the DataNode is restarted in between requesting
// a lease and using it.
- LOG.debug("Removing existing BR lease 0x{} for DN {} in order to " +
+ LOG.warn("Removing existing BR lease 0x{} for DN {} in order to " +
"issue a new one.", Long.toHexString(node.leaseId),
dn.getDatanodeUuid());
}
diff --git
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
index 89571f4..0738d99 100644
---
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
+++
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
@@ -152,6 +152,7 @@
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.NodeRegistration;
+import org.apache.hadoop.hdfs.server.protocol.RegisterCommand;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.server.protocol.SlowDiskReports;
import org.apache.hadoop.hdfs.server.protocol.SlowPeerReports;
@@ -1445,7 +1446,18 @@ public DatanodeCommand blockReport(final
DatanodeRegistration nodeReg,
blockStateChangeLog.debug("*BLOCK* NameNode.blockReport: "
+ "from " + nodeReg + ", reports.length=" + reports.length);
}
- final BlockManager bm = namesystem.getBlockManager();
+ final BlockManager bm = namesystem.getBlockManager();
+ // Process the FBR iff the lease hasn't expired.
+ // If the lease has expired, we ask the DN to re-register and ask for
+ // a lease in subsequent heart beat.
+ if (context.getLeaseId() != 0 && !bm.checkLease(nodeReg,
context.getLeaseId())) {
+ blockStateChangeLog.warn("*BLOCK* NameNode.blockReport: Rejecting full block
report "
+ + "from " + nodeReg + ", reports.length=" + reports.length
+ + ", as the leaseId=" + Long.toHexString(context.getLeaseId()) + " has
expired. "
+ + "Asking it to re-register to obtain a new lease id and then send a FBR.");
+ bm.removeBRLeaseIfNeeded(nodeReg, context);
+ return RegisterCommand.REGISTER;
+ }
boolean noStaleStorages = false;
for (int r = 0; r < reports.length; r++) {
final BlockListAsLongs blocks = reports[r].getBlocks();{code}
> Block report leases cause missing blocks until next report
> ----------------------------------------------------------
>
> Key: HDFS-12914
> URL: https://issues.apache.org/jira/browse/HDFS-12914
> Project: Hadoop HDFS
> Issue Type: Bug
> Components: namenode
> Affects Versions: 2.8.0
> Reporter: Daryn Sharp
> Priority: Critical
>
> {{BlockReportLeaseManager#checkLease}} will reject FBRs from DNs for
> conditions such as "unknown datanode", "not in pending set", "lease has
> expired", wrong lease id, etc. Lease rejection does not throw an exception.
> It returns false, which bubbles up to {{NameNodeRpcServer#blockReport}} and
> is interpreted as {{noStaleStorages}}.
> A re-registering node whose FBR is rejected because of an invalid lease becomes
> active with _no blocks_. A replication storm ensues, possibly causing DNs to
> temporarily go dead (HDFS-12645), leading to more FBR lease rejections on
> re-registration. The cluster will have many "missing blocks" until the DNs'
> next FBR is sent and/or forced.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]