YARN-3826. Race condition in ResourceTrackerService leads to wrong diagnostics messages. Contributed by Chengbing Liu.
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/5614d87d
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/5614d87d
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/5614d87d

Branch: refs/heads/YARN-2928
Commit: 5614d87dfb53fe79a6fa69c8908f8757e7e3a51d
Parents: af19e92
Author: Devaraj K <deva...@apache.org>
Authored: Thu Jun 25 16:13:59 2015 +0530
Committer: Zhijie Shen <zjs...@apache.org>
Committed: Mon Jun 29 10:28:25 2015 -0700

----------------------------------------------------------------------
 hadoop-yarn-project/CHANGES.txt                 |  3 +++
 .../server/utils/YarnServerBuilderUtils.java    | 11 ++++++++--
 .../resourcemanager/ResourceTrackerService.java | 23 +++++---------------
 3 files changed, 18 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/5614d87d/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index fccc5e2..d980e21 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -656,6 +656,9 @@ Release 2.8.0 - UNRELEASED
     YARN-3790. usedResource from rootQueue metrics may get stale data for FS
     scheduler after recovering the container (Zhihai Xu via rohithsharmaks)
 
+    YARN-3826. Race condition in ResourceTrackerService leads to
+    wrong diagnostics messages. (Chengbing Liu via devaraj)
+
 Release 2.7.1 - UNRELEASED
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/5614d87d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/utils/YarnServerBuilderUtils.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/utils/YarnServerBuilderUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/utils/YarnServerBuilderUtils.java
index 8bdff62..f333185 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/utils/YarnServerBuilderUtils.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/utils/YarnServerBuilderUtils.java
@@ -22,13 +22,11 @@ import java.util.List;
 
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ContainerId;
-import org.apache.hadoop.yarn.api.records.SerializedException;
 import org.apache.hadoop.yarn.factories.RecordFactory;
 import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
 import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
 import org.apache.hadoop.yarn.server.api.records.MasterKey;
 import org.apache.hadoop.yarn.server.api.records.NodeAction;
-import org.apache.hadoop.yarn.util.Records;
 
 /**
  * Server Builder utilities to construct various objects.
@@ -39,6 +37,15 @@ public class YarnServerBuilderUtils {
   private static final RecordFactory recordFactory = RecordFactoryProvider
       .getRecordFactory(null);
 
+  public static NodeHeartbeatResponse newNodeHeartbeatResponse(
+      NodeAction action, String diagnosticsMessage) {
+    NodeHeartbeatResponse response = recordFactory
+        .newRecordInstance(NodeHeartbeatResponse.class);
+    response.setNodeAction(action);
+    response.setDiagnosticsMessage(diagnosticsMessage);
+    return response;
+  }
+
   public static NodeHeartbeatResponse newNodeHeartbeatResponse(int responseId,
       NodeAction action, List<ContainerId> containersToCleanUp,
       List<ApplicationId> applicationsToCleanUp,

http://git-wip-us.apache.org/repos/asf/hadoop/blob/5614d87d/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
index 5e240e5..a053155 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
@@ -103,22 +103,11 @@ public class ResourceTrackerService extends AbstractService implements
   private InetSocketAddress resourceTrackerAddress;
   private String minimumNodeManagerVersion;
 
-  private static final NodeHeartbeatResponse resync = recordFactory
-      .newRecordInstance(NodeHeartbeatResponse.class);
-  private static final NodeHeartbeatResponse shutDown = recordFactory
-      .newRecordInstance(NodeHeartbeatResponse.class);
-
   private int minAllocMb;
   private int minAllocVcores;
 
   private boolean isDistributedNodeLabelsConf;
 
-  static {
-    resync.setNodeAction(NodeAction.RESYNC);
-
-    shutDown.setNodeAction(NodeAction.SHUTDOWN);
-  }
-
   public ResourceTrackerService(RMContext rmContext,
       NodesListManager nodesListManager,
       NMLivelinessMonitor nmLivelinessMonitor,
@@ -417,8 +406,8 @@ public class ResourceTrackerService extends AbstractService implements
           "Disallowed NodeManager nodeId: " + nodeId + " hostname: "
               + nodeId.getHost();
       LOG.info(message);
-      shutDown.setDiagnosticsMessage(message);
-      return shutDown;
+      return YarnServerBuilderUtils.newNodeHeartbeatResponse(
+          NodeAction.SHUTDOWN, message);
     }
 
     // 2. Check if it's a registered node
@@ -427,8 +416,8 @@ public class ResourceTrackerService extends AbstractService implements
       /* node does not exist */
       String message = "Node not found resyncing " + remoteNodeStatus.getNodeId();
       LOG.info(message);
-      resync.setDiagnosticsMessage(message);
-      return resync;
+      return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.RESYNC,
+          message);
     }
 
     // Send ping
@@ -448,11 +437,11 @@ public class ResourceTrackerService extends AbstractService implements
               + lastNodeHeartbeatResponse.getResponseId()
               + " nm response id:" + remoteNodeStatus.getResponseId();
       LOG.info(message);
-      resync.setDiagnosticsMessage(message);
       // TODO: Just sending reboot is not enough. Think more.
       this.rmContext.getDispatcher().getEventHandler().handle(
           new RMNodeEvent(nodeId, RMNodeEventType.REBOOTING));
-      return resync;
+      return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.RESYNC,
+          message);
     }
 
     // Check & update collectors info from request.