Author: acmurthy Date: Tue Mar 6 00:52:58 2012 New Revision: 1297311 URL: http://svn.apache.org/viewvc?rev=1297311&view=rev Log: Merge -c 1297310 from trunk to branch-0.23 to fix MAPREDUCE-3034. Ensure NodeManager reboots itself on direction from ResourceManager. Contributed by Devaraj K & Eric Payne.
Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/dev-support/findbugs-exclude.xml hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt?rev=1297311&r1=1297310&r2=1297311&view=diff ============================================================================== --- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt (original) +++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/CHANGES.txt Tue Mar 6 00:52:58 2012 @@ -191,6 +191,9 @@ Release 0.23.2 - UNRELEASED MAPREDUCE-3964. ResourceManager does not have JVM metrics (Jason Lowe via bobby) + MAPREDUCE-3034. Ensure NodeManager reboots itself on direction from + ResourceManager. 
(Devaraj K & Eric Payne via acmurthy) + Release 0.23.1 - 2012-02-17 NEW FEATURES Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/dev-support/findbugs-exclude.xml URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/dev-support/findbugs-exclude.xml?rev=1297311&r1=1297310&r2=1297311&view=diff ============================================================================== --- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/dev-support/findbugs-exclude.xml (original) +++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/dev-support/findbugs-exclude.xml Tue Mar 6 00:52:58 2012 @@ -198,8 +198,12 @@ <Method name="run" /> <Bug pattern="DM_EXIT" /> </Match> + <Match> + <Class name="org.apache.hadoop.yarn.server.nodemanager.NodeManager" /> + <Method name="initAndStartNodeManager" /> + <Bug pattern="DM_EXIT" /> + </Match> - <!-- Ignore heartbeat exception when killing localizer --> <Match> <Class name="org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer" /> Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java?rev=1297311&r1=1297310&r2=1297311&view=diff ============================================================================== --- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java (original) +++ 
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java Tue Mar 6 00:52:58 2012 @@ -60,7 +60,8 @@ public class NodeManager extends Composi private ApplicationACLsManager aclsManager; private NodeHealthCheckerService nodeHealthChecker; private LocalDirsHandlerService dirsHandler; - + private static CompositeServiceShutdownHook nodeManagerShutdownHook; + public NodeManager() { super(NodeManager.class.getName()); } @@ -226,25 +227,52 @@ public class NodeManager extends Composi @Override public void stateChanged(Service service) { - // Shutdown the Nodemanager when the NodeStatusUpdater is stopped. if (NodeStatusUpdaterImpl.class.getName().equals(service.getName()) && STATE.STOPPED.equals(service.getServiceState())) { + + boolean hasToReboot = ((NodeStatusUpdaterImpl) service).hasToRebootNode(); + + // Shutdown the Nodemanager when the NodeStatusUpdater is stopped. stop(); + + // Reboot the whole node-manager if NodeStatusUpdater got a reboot command + // from the RM. + if (hasToReboot) { + LOG.info("Rebooting the node manager."); + NodeManager nodeManager = createNewNodeManager(); + nodeManager.initAndStartNodeManager(hasToReboot); + } } } - public static void main(String[] args) { - StringUtils.startupShutdownMessage(NodeManager.class, args, LOG); + private void initAndStartNodeManager(boolean hasToReboot) { try { - NodeManager nodeManager = new NodeManager(); - Runtime.getRuntime().addShutdownHook( - new CompositeServiceShutdownHook(nodeManager)); + + // Remove the old hook if we are rebooting. 
+ if (hasToReboot && null != nodeManagerShutdownHook) { + Runtime.getRuntime().removeShutdownHook(nodeManagerShutdownHook); + } + + nodeManagerShutdownHook = new CompositeServiceShutdownHook(this); + Runtime.getRuntime().addShutdownHook(nodeManagerShutdownHook); + YarnConfiguration conf = new YarnConfiguration(); - nodeManager.init(conf); - nodeManager.start(); + this.init(conf); + this.start(); } catch (Throwable t) { LOG.fatal("Error starting NodeManager", t); System.exit(-1); } } + + // For testing + NodeManager createNewNodeManager() { + return new NodeManager(); + } + + public static void main(String[] args) { + StringUtils.startupShutdownMessage(NodeManager.class, args, LOG); + NodeManager nodeManager = new NodeManager(); + nodeManager.initAndStartNodeManager(false); + } } Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java?rev=1297311&r1=1297310&r2=1297311&view=diff ============================================================================== --- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java (original) +++ hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java Tue Mar 6 00:52:58 2012 @@ -91,6 +91,8 @@ public class NodeStatusUpdaterImpl exten private final NodeHealthCheckerService healthChecker; private final NodeManagerMetrics metrics; + private boolean 
hasToRebootNode; + public NodeStatusUpdaterImpl(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics, ContainerTokenSecretManager containerTokenSecretManager) { @@ -156,6 +158,18 @@ public class NodeStatusUpdaterImpl exten this.isStopped = true; super.stop(); } + + private synchronized void reboot() { + this.hasToRebootNode = true; + // Stop the status-updater. This will trigger a sub-service state change in + // the NodeManager which will then decide to reboot or not based on + // hasToRebootNode(). + this.stop(); + } + + synchronized boolean hasToRebootNode() { + return this.hasToRebootNode; + } protected boolean isSecurityEnabled() { return UserGroupInformation.isSecurityEnabled(); @@ -336,8 +350,8 @@ public class NodeStatusUpdaterImpl exten } if (response.getNodeAction() == NodeAction.REBOOT) { LOG.info("Node is out of sync with ResourceManager," - + " hence shutting down."); - NodeStatusUpdaterImpl.this.stop(); + + " hence rebooting."); + NodeStatusUpdaterImpl.this.reboot(); break; } Modified: hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java?rev=1297311&r1=1297310&r2=1297311&view=diff ============================================================================== --- hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java (original) +++ 
hadoop/common/branches/branch-0.23/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java Tue Mar 6 00:52:58 2012 @@ -18,6 +18,8 @@ package org.apache.hadoop.yarn.server.nodemanager; +import static org.mockito.Mockito.mock; + import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; @@ -71,7 +73,6 @@ import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import static org.mockito.Mockito.mock; public class TestNodeStatusUpdater { @@ -91,6 +92,7 @@ public class TestNodeStatusUpdater { private final List<NodeId> registeredNodes = new ArrayList<NodeId>(); private final Configuration conf = new YarnConfiguration(); private NodeManager nm; + protected NodeManager rebootedNodeManager; @After public void tearDown() { @@ -496,8 +498,28 @@ public class TestNodeStatusUpdater { LOG.info("Waiting for NM to stop.."); Thread.sleep(1000); } - Assert.assertEquals(STATE.STOPPED, nm.getServiceState()); + + waitCount = 0; + while (null == rebootedNodeManager && waitCount++ != 20) { + LOG.info("Waiting for NM to reinitialize.."); + Thread.sleep(1000); + } + + waitCount = 0; + while (rebootedNodeManager.getServiceState() != STATE.STARTED && waitCount++ != 20) { + LOG.info("Waiting for NM to start.."); + Thread.sleep(1000); + } + Assert.assertEquals(STATE.STARTED, rebootedNodeManager.getServiceState()); + + rebootedNodeManager.stop(); + waitCount = 0; + while (rebootedNodeManager.getServiceState() != STATE.STOPPED && waitCount++ != 20) { + LOG.info("Waiting for NM to stop.."); + Thread.sleep(1000); + } + Assert.assertEquals(STATE.STOPPED, rebootedNodeManager.getServiceState()); } @Test @@ -642,6 +664,12 @@ public class TestNodeStatusUpdater { myNodeStatusUpdater.resourceTracker = myResourceTracker2; return myNodeStatusUpdater; } + + @Override + NodeManager createNewNodeManager() { + 
rebootedNodeManager = getNodeManager(NodeAction.NORMAL); + return rebootedNodeManager; + } }; } }