Repository: hadoop
Updated Branches:
  refs/heads/branch-2.7 653ef52ef -> 1d23e1ec0


HDFS-7725. Incorrect 'nodes in service' metrics caused all writes to fail. Contributed by Ming Ma.
(cherry picked from commit 8104d522690fe9556177893770a388291cea0749)
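Why an incorrect 'nodes in service' count can block every write: with load-aware placement (dfs.namenode.replication.considerLoad) enabled, the default block placement policy can skip datanodes whose xceiver (transfer thread) load is more than roughly twice the cluster average, and that average uses the in-service node count as its denominator. If dead nodes are wrongly counted as in service, the average is deflated and every real datanode can look overloaded. Below is a minimal, self-contained sketch of that arithmetic; the class and method names are hypothetical and this is not Hadoop's BlockPlacementPolicyDefault code.

// Hypothetical sketch: how an inflated "nodes in service" count can make a
// considerLoad-style check reject every datanode.
public class ConsiderLoadSketch {

  /** Returns true if the node passes the load check. */
  static boolean passesLoadCheck(int nodeXceivers, int totalXceivers, int nodesInService) {
    double avgLoad = nodesInService > 0 ? (double) totalXceivers / nodesInService : 0.0;
    // Reject nodes whose load is more than twice the cluster average.
    return nodeXceivers <= 2.0 * avgLoad;
  }

  public static void main(String[] args) {
    int liveNodes = 3;
    int xceiversPerNode = 10;                          // every live node carries the same load
    int totalXceivers = liveNodes * xceiversPerNode;

    // Correct metric: average is 10, threshold is 20, every node passes.
    System.out.println("correct count:  "
        + passesLoadCheck(xceiversPerNode, totalXceivers, liveNodes));

    // Inflated metric (dead nodes wrongly counted as in service): the average drops
    // to about 4.3, the threshold to about 8.6, and the same node is rejected.
    int inflatedCount = liveNodes + 4;
    System.out.println("inflated count: "
        + passesLoadCheck(xceiversPerNode, totalXceivers, inflatedCount));
  }
}

With the correct count the check passes; with an inflated count the identical node fails it, which matches the 'all writes fail' symptom in the JIRA title.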


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/1d23e1ec
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/1d23e1ec
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/1d23e1ec

Branch: refs/heads/branch-2.7
Commit: 1d23e1ec073489bfc8a04a08350a2c46efbd466f
Parents: 653ef52
Author: Kihwal Lee <kih...@apache.org>
Authored: Tue Oct 27 11:09:05 2015 -0500
Committer: Kihwal Lee <kih...@apache.org>
Committed: Tue Oct 27 11:09:05 2015 -0500

----------------------------------------------------------------------
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt     |  3 ++
 .../blockmanagement/DecommissionManager.java    | 28 +++++++++----------
 .../blockmanagement/HeartbeatManager.java       | 29 ++++++++++++++------
 .../namenode/TestNamenodeCapacityReport.java    |  5 ++++
 4 files changed, 41 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/1d23e1ec/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 50bc0c4..7c09896 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -28,6 +28,9 @@ Release 2.7.2 - UNRELEASED
     HDFS-6945. BlockManager should remove a block from excessReplicateMap and
     decrement ExcessBlocks metric when the block is removed. (aajisaka)
 
+    HDFS-7725. Incorrect "nodes in service" metrics caused all writes to fail.
+    (Ming Ma via wang)
+
     HDFS-8806. Inconsistent metrics: number of missing blocks with replication
     factor 1 not properly cleared. (Zhe Zhang via aajisaka)
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/1d23e1ec/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java
index 9355329..7f3d778 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java
@@ -197,23 +197,21 @@ public class DecommissionManager {
    */
   @VisibleForTesting
   public void startDecommission(DatanodeDescriptor node) {
-    if (!node.isDecommissionInProgress()) {
-      if (!node.isAlive) {
-        LOG.info("Dead node {} is decommissioned immediately.", node);
-        node.setDecommissioned();
-      } else if (!node.isDecommissioned()) {
+    if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
+      // Update DN stats maintained by HeartbeatManager
+      hbManager.startDecommission(node);
+      // hbManager.startDecommission will set dead node to decommissioned.
+      if (node.isDecommissionInProgress()) {
         for (DatanodeStorageInfo storage : node.getStorageInfos()) {
-          LOG.info("Starting decommission of {} {} with {} blocks", 
+          LOG.info("Starting decommission of {} {} with {} blocks",
               node, storage, storage.numBlocks());
         }
-        // Update DN stats maintained by HeartbeatManager
-        hbManager.startDecommission(node);
         node.decommissioningStatus.setStartTime(monotonicNow());
         pendingNodes.add(node);
       }
     } else {
-      LOG.trace("startDecommission: Node {} is already decommission in "
-              + "progress, nothing to do.", node);
+      LOG.trace("startDecommission: Node {} in {}, nothing to do." +
+          node, node.getAdminState());
     }
   }
 
@@ -221,12 +219,12 @@ public class DecommissionManager {
    * Stop decommissioning the specified datanode. 
    * @param node
    */
-  void stopDecommission(DatanodeDescriptor node) {
+  @VisibleForTesting
+  public void stopDecommission(DatanodeDescriptor node) {
     if (node.isDecommissionInProgress() || node.isDecommissioned()) {
-      LOG.info("Stopping decommissioning of node {}", node);
       // Update DN stats maintained by HeartbeatManager
       hbManager.stopDecommission(node);
-      // Over-replicated blocks will be detected and processed when 
+      // Over-replicated blocks will be detected and processed when
       // the dead node comes back and send in its full block report.
       if (node.isAlive) {
         blockManager.processOverReplicatedBlocksOnReCommission(node);
@@ -235,8 +233,8 @@ public class DecommissionManager {
       pendingNodes.remove(node);
       decomNodeBlocks.remove(node);
     } else {
-      LOG.trace("stopDecommission: Node {} is not decommission in progress " +
-          "or decommissioned, nothing to do.", node);
+      LOG.trace("stopDecommission: Node {} in {}, nothing to do." +
+          node, node.getAdminState());
     }
   }
 

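The restructured startDecommission above delegates the stats update, including the dead-node special case, to HeartbeatManager and then re-checks the node's state: only a node that actually moved into DECOMMISSION_INPROGRESS gets a start time and is queued for block scanning, while a dead node jumps straight to DECOMMISSIONED. A small illustrative sketch of that flow, using hypothetical types rather than Hadoop's DatanodeDescriptor and admin-state enum:

// Hypothetical sketch of the new startDecommission control flow.
import java.util.ArrayDeque;
import java.util.Queue;

public class DecommissionFlowSketch {
  enum AdminState { NORMAL, DECOMMISSION_INPROGRESS, DECOMMISSIONED }

  static class Node {
    final String name;
    final boolean alive;
    AdminState state = AdminState.NORMAL;
    Node(String name, boolean alive) { this.name = name; this.alive = alive; }
  }

  /** Stands in for HeartbeatManager.startDecommission: dead nodes skip the stats path. */
  static void hbStartDecommission(Node node) {
    node.state = node.alive ? AdminState.DECOMMISSION_INPROGRESS : AdminState.DECOMMISSIONED;
  }

  public static void main(String[] args) {
    Queue<Node> pendingNodes = new ArrayDeque<>();
    for (Node node : new Node[] { new Node("live-dn", true), new Node("dead-dn", false) }) {
      hbStartDecommission(node);
      // Only nodes that really entered DECOMMISSION_INPROGRESS are queued for scanning.
      if (node.state == AdminState.DECOMMISSION_INPROGRESS) {
        pendingNodes.add(node);
      }
      System.out.println(node.name + " -> " + node.state);
    }
    System.out.println("queued for block scanning: " + pendingNodes.size()); // prints 1
  }
}

The isDecommissionInProgress() re-check replaces the old isAlive branch because the heartbeat manager now decides whether the node enters DECOMMISSION_INPROGRESS or goes directly to DECOMMISSIONED.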
http://git-wip-us.apache.org/repos/asf/hadoop/blob/1d23e1ec/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
index d2905a2..b0ab315 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
@@ -20,8 +20,6 @@ package org.apache.hadoop.hdfs.server.blockmanagement;
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSUtil;
@@ -31,6 +29,8 @@ import org.apache.hadoop.hdfs.server.protocol.StorageReport;
 import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
 import org.apache.hadoop.util.Daemon;
 import org.apache.hadoop.util.Time;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Manage the heartbeats received from datanodes.
@@ -38,7 +38,7 @@ import org.apache.hadoop.util.Time;
  * by the heartbeat manager lock.
  */
 class HeartbeatManager implements DatanodeStatistics {
-  static final Log LOG = LogFactory.getLog(HeartbeatManager.class);
+  static final Logger LOG = LoggerFactory.getLogger(HeartbeatManager.class);
 
   /**
    * Stores a subset of the datanodeMap in DatanodeManager,
@@ -227,15 +227,26 @@ class HeartbeatManager implements DatanodeStatistics {
   }
 
   synchronized void startDecommission(final DatanodeDescriptor node) {
-    stats.subtract(node);
-    node.startDecommission();
-    stats.add(node);
+    if (!node.isAlive) {
+      LOG.info("Dead node {} is decommissioned immediately.", node);
+      node.setDecommissioned();
+    } else {
+      stats.subtract(node);
+      node.startDecommission();
+      stats.add(node);
+    }
   }
 
   synchronized void stopDecommission(final DatanodeDescriptor node) {
-    stats.subtract(node);
-    node.stopDecommission();
-    stats.add(node);
+    LOG.info("Stopping decommissioning of {} node {}",
+        node.isAlive ? "live" : "dead", node);
+    if (!node.isAlive) {
+      node.stopDecommission();
+    } else {
+      stats.subtract(node);
+      node.stopDecommission();
+      stats.add(node);
+    }
   }
   
   /**
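This hunk is the core of the fix: dead nodes are not tracked by the heartbeat statistics, so pushing one through an unconditional subtract()/add() round-trip (as the old stopDecommission path did during recommission) can leave a dead node counted as in service. A minimal model of that effect, with hypothetical names rather than Hadoop's DatanodeStats:

// Hypothetical model of why the isAlive guards matter for nodesInService.
public class NodesInServiceSketch {

  static class Node {
    final boolean alive;
    boolean decommissioned;
    Node(boolean alive) { this.alive = alive; }
  }

  /** Stands in for the aggregate heartbeat statistics. */
  static class Stats {
    int nodesInService;
    void add(Node n)      { if (!n.decommissioned) { nodesInService++; } }
    void subtract(Node n) { if (!n.decommissioned) { nodesInService--; } }
  }

  public static void main(String[] args) {
    Stats stats = new Stats();

    Node live = new Node(true);
    stats.add(live);                      // only live nodes are ever added: count is 1

    Node dead = new Node(false);          // dead node: never contributed to the stats
    dead.decommissioned = true;           // it was decommissioned while dead

    // Old behavior: recommission ran subtract()/add() unconditionally.
    stats.subtract(dead);                 // no-op, the node is still marked decommissioned
    dead.decommissioned = false;
    stats.add(dead);                      // dead node now counted: count becomes 2
    System.out.println("without isAlive guard: " + stats.nodesInService);

    // Fixed behavior: a dead node only flips its admin state; stats stay untouched.
    stats.nodesInService = 1;             // reset the model
    dead.decommissioned = true;
    if (dead.alive) { stats.subtract(dead); }
    dead.decommissioned = false;          // stopDecommission still clears the state
    if (dead.alive) { stats.add(dead); }
    System.out.println("with isAlive guard: " + stats.nodesInService);  // stays at 1
  }
}

The guarded version matches the added checks: the admin state still changes for a dead node, but the aggregate counters that feed the 'nodes in service' metric are left alone.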

http://git-wip-us.apache.org/repos/asf/hadoop/blob/1d23e1ec/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java
index fd611ce..6f54722 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java
@@ -202,9 +202,14 @@ public class TestNamenodeCapacityReport {
         dn.shutdown();
         DFSTestUtil.setDatanodeDead(dnd);
         BlockManagerTestUtil.checkHeartbeat(namesystem.getBlockManager());
+        //Verify decommission of dead node won't impact nodesInService metrics.
+        dnm.getDecomManager().startDecommission(dnd);
         expectedInServiceNodes--;
         assertEquals(expectedInServiceNodes, namesystem.getNumLiveDataNodes());
         assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
+        //Verify recommission of dead node won't impact nodesInService metrics.
+        dnm.getDecomManager().stopDecommission(dnd);
+        assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
       }
 
       // restart the nodes to verify that counts are correct after
