Repository: ambari
Updated Branches:
  refs/heads/branch-2.5 1e72a0957 -> 4b3d2848d
  refs/heads/trunk 7b30be6dd -> aa588ca86
AMBARI-18704. Add code to improve debugging of ambari-agent related problems. (aonishuk) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/aa588ca8 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/aa588ca8 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/aa588ca8 Branch: refs/heads/trunk Commit: aa588ca8667f5f67b6be2251a6dad37230172fb4 Parents: 7b30be6 Author: Andrew Onishuk <[email protected]> Authored: Wed Oct 26 20:08:46 2016 +0300 Committer: Andrew Onishuk <[email protected]> Committed: Wed Oct 26 20:08:46 2016 +0300 ---------------------------------------------------------------------- .../python/ambari_agent/HeartbeatHandlers.py | 13 ++++-------- .../python/ambari_agent/RemoteDebugUtils.py | 21 +++++++++++++++++++- .../ambari_agent/StatusCommandsExecutor.py | 12 ++++++++--- 3 files changed, 33 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/aa588ca8/ambari-agent/src/main/python/ambari_agent/HeartbeatHandlers.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/ambari_agent/HeartbeatHandlers.py b/ambari-agent/src/main/python/ambari_agent/HeartbeatHandlers.py index 4a3d372..836ab07 100644 --- a/ambari-agent/src/main/python/ambari_agent/HeartbeatHandlers.py +++ b/ambari-agent/src/main/python/ambari_agent/HeartbeatHandlers.py @@ -26,9 +26,10 @@ import signal import threading import traceback from ambari_commons.os_family_impl import OsFamilyImpl -from RemoteDebugUtils import remote_debug import sys +from ambari_agent.RemoteDebugUtils import bind_debug_signal_handlers + logger = logging.getLogger() _handler = None @@ -128,14 +129,8 @@ def bind_signal_handlers(agentPid): if os.getpid() == agentPid: signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) - 
signal.signal(signal.SIGUSR2, remote_debug) # Interrupt running process, and provide a python prompt for it - try: - import faulthandler # This is not default module, has to be installed separately - faulthandler.enable(file=sys.stderr, all_threads=True) - faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True, chain=False) - sys.stderr.write("Registered faulthandler\n") - except ImportError: - pass # Module is not included into python distribution + + bind_debug_signal_handlers() _handler = HeartbeatStopHandlersLinux() else: http://git-wip-us.apache.org/repos/asf/ambari/blob/aa588ca8/ambari-agent/src/main/python/ambari_agent/RemoteDebugUtils.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/ambari_agent/RemoteDebugUtils.py b/ambari-agent/src/main/python/ambari_agent/RemoteDebugUtils.py index f2a462b..ae997ac 100644 --- a/ambari-agent/src/main/python/ambari_agent/RemoteDebugUtils.py +++ b/ambari-agent/src/main/python/ambari_agent/RemoteDebugUtils.py @@ -21,7 +21,26 @@ limitations under the License. try: import readline # For readline input support except: pass -import sys, os, traceback, codeop, cStringIO, cPickle, tempfile +import sys, signal, os, traceback, codeop, cStringIO, cPickle, tempfile + +def bind_debug_signal_handlers(): + signal.signal(signal.SIGUSR1, print_threads_stack_traces) # prints process threads current stack trace to the err stream. (can be found in ambari-agent.out) + signal.signal(signal.SIGUSR2, remote_debug) # provide a read-only python shell, which represent the process state at time of signal arrival. 
+ +def print_threads_stack_traces(sig, frame): + print >> sys.stderr, "\n*** STACKTRACE - START ***\n" + code = [] + for threadId, stack in sys._current_frames().items(): + code.append("\n# ThreadID: %s" % threadId) + for filename, lineno, name, line in traceback.extract_stack(stack): + code.append('File: "%s", line %d, in %s' % (filename, + lineno, name)) + if line: + code.append(" %s" % (line.strip())) + + for line in code: + print >> sys.stderr, line + print >> sys.stderr, "\n*** STACKTRACE - END ***\n" def pipename(pid): """Return name of pipe to use""" http://git-wip-us.apache.org/repos/asf/ambari/blob/aa588ca8/ambari-agent/src/main/python/ambari_agent/StatusCommandsExecutor.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/ambari_agent/StatusCommandsExecutor.py b/ambari-agent/src/main/python/ambari_agent/StatusCommandsExecutor.py index 8959640..20acee4 100644 --- a/ambari-agent/src/main/python/ambari_agent/StatusCommandsExecutor.py +++ b/ambari-agent/src/main/python/ambari_agent/StatusCommandsExecutor.py @@ -22,7 +22,8 @@ import signal import threading import logging import multiprocessing -from PythonReflectiveExecutor import PythonReflectiveExecutor +from ambari_agent.PythonReflectiveExecutor import PythonReflectiveExecutor +from ambari_agent.RemoteDebugUtils import bind_debug_signal_handlers logger = logging.getLogger(__name__) @@ -43,8 +44,10 @@ class StatusCommandsExecutor(multiprocessing.Process): def run(self): try: + bind_debug_signal_handlers() while True: command = self.actionQueue.statusCommandQueue.get(True) # blocks until status status command appears + logger.info("Running status command for {0}".format(command['componentName'])) # TODO: change to logger.debug once fixed timeout_timer = threading.Timer( self.status_command_timeout, self.respawn, [command]) timeout_timer.start() @@ -52,6 +55,7 @@ class StatusCommandsExecutor(multiprocessing.Process): 
self.process_status_command(command) timeout_timer.cancel() + logger.info("Completed status command for {0}".format(command['componentName'])) # TODO: change to logger.debug once fixed except: logger.exception("StatusCommandsExecutor process failed with exception:") raise @@ -67,8 +71,10 @@ class StatusCommandsExecutor(multiprocessing.Process): def respawn(self, command): try: - # Force context to reset to normal. By context we mean sys.path, imports, etc. They are set by specific status command, and are not relevant to ambari-agent. - PythonReflectiveExecutor.last_context.revert() + if hasattr(PythonReflectiveExecutor, "last_context"): + # Force context to reset to normal. By context we mean sys.path, imports, etc. They are set by specific status command, and are not relevant to ambari-agent. + PythonReflectiveExecutor.last_context.revert() + logger.warn("Command {0} for {1} is running for more than {2} seconds. Terminating it due to timeout.".format(command['commandType'], command['componentName'], self.status_command_timeout)) self.hasTimeoutedEvent.set()
