Repository: ambari Updated Branches: refs/heads/branch-2.5 4418358f8 -> 91a7d0efa
AMBARI-21142. Log more info about heartbeat message/response when server - agent communication gets out of sync. (stoader) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/91a7d0ef Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/91a7d0ef Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/91a7d0ef Branch: refs/heads/branch-2.5 Commit: 91a7d0efadd1522a9f736a1c8006f47457bef9af Parents: 4418358 Author: Toader, Sebastian <stoa...@hortonworks.com> Authored: Fri Jun 2 23:09:56 2017 +0200 Committer: Toader, Sebastian <stoa...@hortonworks.com> Committed: Fri Jun 2 23:09:56 2017 +0200 ---------------------------------------------------------------------- .../src/main/python/ambari_agent/Controller.py | 6 +++++- .../ambari/server/agent/HeartBeatHandler.java | 18 ++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/91a7d0ef/ambari-agent/src/main/python/ambari_agent/Controller.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/ambari_agent/Controller.py b/ambari-agent/src/main/python/ambari_agent/Controller.py index 83f1da8..5fab595 100644 --- a/ambari-agent/src/main/python/ambari_agent/Controller.py +++ b/ambari-agent/src/main/python/ambari_agent/Controller.py @@ -320,6 +320,7 @@ class Controller(threading.Thread): logger.log(logging_level, "Sending Heartbeat (id = %s)", self.responseId) response = self.sendRequest(self.heartbeatUrl, data) + exitStatus = 0 if 'exitstatus' in response.keys(): exitStatus = int(response['exitstatus']) @@ -365,7 +366,9 @@ class Controller(threading.Thread): self.restartAgent() if serverId != self.responseId + 1: - logger.error("Error in responseId sequence - restarting") + logger.error("Error in responseId sequence - received responseId={0} from server while expecting {1} - restarting..." + .format(serverId, self.responseId + 1)) + self.restartAgent() else: self.responseId = serverId @@ -464,6 +467,7 @@ class Controller(threading.Thread): #randomize the heartbeat delay = randint(0, self.max_reconnect_retry_delay) + logger.info("Waiting {0} seconds before reconnecting to {1}".format(delay, self.heartbeatUrl)) time.sleep(delay) # Sleep for some time http://git-wip-us.apache.org/repos/asf/ambari/blob/91a7d0ef/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java index 6b93462..fd43de5 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java @@ -207,10 +207,20 @@ public class HeartBeatHandler { + ", receivedResponseId=" + heartbeat.getResponseId()); if (heartbeat.getResponseId() == currentResponseId - 1) { - LOG.warn("Old responseId received - response was lost - returning cached response"); - return hostResponses.get(hostname); + HeartBeatResponse heartBeatResponse = hostResponses.get(hostname); + + LOG.warn("Old responseId={} received form host {} - response was lost - returning cached response with responseId={}", + heartbeat.getResponseId(), + hostname, + heartBeatResponse.getResponseId()); + + return heartBeatResponse; } else if (heartbeat.getResponseId() != currentResponseId) { - LOG.error("Error in responseId sequence - sending agent restart command"); + LOG.error("Error in responseId sequence - received responseId={} from host {} - sending agent restart command with responseId={}", + heartbeat.getResponseId(), + hostname, + currentResponseId); + return createRestartCommand(currentResponseId); } @@ -232,7 +242,7 @@ public class HeartBeatHandler { if (hostObject.getState().equals(HostState.HEARTBEAT_LOST)) { // After loosing heartbeat agent should reregister - LOG.warn("Host is in HEARTBEAT_LOST state - sending register command"); + LOG.warn("Host {} is in HEARTBEAT_LOST state - sending register command", hostname); return createRegisterCommand(); }