[ambari] branch trunk updated: AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)
This is an automated email from the ASF dual-hosted git repository. aonishuk pushed a commit to branch trunk in repository https://gitbox.apache.org/repos/asf/ambari.git The following commit(s) were added to refs/heads/trunk by this push: new 5ecf5e8 AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) 5ecf5e8 is described below commit 5ecf5e8c4ecbd1cbd21699e53e27786729df8dbb Author: Andrew Onishuk AuthorDate: Mon Jul 23 16:38:53 2018 +0300 AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) --- .../src/main/python/ambari_agent/CustomServiceOrchestrator.py | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py index 32c4094..c66a623 100644 --- a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py +++ b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py @@ -23,6 +23,7 @@ import os import ambari_simplejson as json import sys import time +import uuid from ambari_commons import shell import threading from collections import defaultdict @@ -516,9 +517,9 @@ class CustomServiceOrchestrator(): if logger.level == logging.DEBUG: override_output_files = False -timestamp = time.time() -status_commands_stdout = self.status_commands_stdout.format(timestamp) -status_commands_stderr = self.status_commands_stderr.format(timestamp) +# make sure status commands that run in parallel don't use the same files +status_commands_stdout = self.status_commands_stdout.format(uuid.uuid4()) +status_commands_stderr = self.status_commands_stderr.format(uuid.uuid4()) try: res = self.runCommand(command_header, status_commands_stdout, @@ -568,8 +569,8 @@ class CustomServiceOrchestrator(): command_type = command['commandType'] from ActionQueue import ActionQueue # To avoid cyclic dependency if command_type == ActionQueue.STATUS_COMMAND: - timestamp = time.time() - file_path = os.path.join(self.tmp_dir, "status_command_{0}.json".format(timestamp)) + # make sure status commands that run in parallel don't use the same files + file_path = os.path.join(self.tmp_dir, "status_command_{0}.json".format(uuid.uuid4())) else: task_id = command['taskId'] file_path = os.path.join(self.tmp_dir, "command-{0}.json".format(task_id))
[ambari] branch trunk updated: AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)
This is an automated email from the ASF dual-hosted git repository. aonishuk pushed a commit to branch trunk in repository https://gitbox.apache.org/repos/asf/ambari.git The following commit(s) were added to refs/heads/trunk by this push: new 11ea165 AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) 11ea165 is described below commit 11ea165b7c6d8da5462011fff10b6420b4f410c1 Author: Andrew Onishuk AuthorDate: Wed Jul 11 10:07:05 2018 +0300 AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) --- .../main/python/ambari_agent/CustomServiceOrchestrator.py | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py index 9c1fb3e..32c4094 100644 --- a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py +++ b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py @@ -456,7 +456,13 @@ class CustomServiceOrchestrator(): if incremented_commands_for_component: self.commands_for_component_in_progress[cluster_id][command['role']] -= 1 - self.conditionally_remove_command_file(json_path, ret) + if is_status_command and json_path: +try: + os.unlink(json_path) +except OSError: + pass # Ignore failure + else: +self.conditionally_remove_command_file(json_path, ret) return ret @@ -562,9 +568,8 @@ class CustomServiceOrchestrator(): command_type = command['commandType'] from ActionQueue import ActionQueue # To avoid cyclic dependency if command_type == ActionQueue.STATUS_COMMAND: - # These files are frequently created, that's why we don't - # store them all, but only the latest one - file_path = os.path.join(self.tmp_dir, "status_command.json") + timestamp = time.time() + file_path = os.path.join(self.tmp_dir, "status_command_{0}.json".format(timestamp)) else: task_id = command['taskId'] file_path = os.path.join(self.tmp_dir, "command-{0}.json".format(task_id))
[ambari] branch trunk updated: AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)
This is an automated email from the ASF dual-hosted git repository. aonishuk pushed a commit to branch trunk in repository https://gitbox.apache.org/repos/asf/ambari.git The following commit(s) were added to refs/heads/trunk by this push: new d3cf7ac AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) d3cf7ac is described below commit d3cf7ac2c137189bea82d77d20ebf2dc891e7e71 Author: Andrew Onishuk AuthorDate: Tue Jul 10 21:28:26 2018 +0300 AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) --- .../ambari_agent/CustomServiceOrchestrator.py | 30 +- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py index 51a0d59..9c1fb3e 100644 --- a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py +++ b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py @@ -22,6 +22,7 @@ import logging import os import ambari_simplejson as json import sys +import time from ambari_commons import shell import threading from collections import defaultdict @@ -86,9 +87,9 @@ class CustomServiceOrchestrator(): self.exec_tmp_dir = AGENT_TMP_DIR self.file_cache = initializer_module.file_cache self.status_commands_stdout = os.path.join(self.tmp_dir, - 'status_command_stdout.txt') + 'status_command_stdout_{0}.txt') self.status_commands_stderr = os.path.join(self.tmp_dir, - 'status_command_stderr.txt') + 'status_command_stderr_{0}.txt') # Construct the hadoop credential lib JARs path self.credential_shell_lib_path = os.path.join(self.config.get('security', 'credential_lib_dir', @@ -97,13 +98,6 @@ class CustomServiceOrchestrator(): self.credential_conf_dir = self.config.get('security', 'credential_conf_dir', self.DEFAULT_CREDENTIAL_CONF_DIR) self.credential_shell_cmd = self.config.get('security', 'credential_shell_cmd', self.DEFAULT_CREDENTIAL_SHELL_CMD) - -# Clean up old status command files if any -try: - os.unlink(self.status_commands_stdout) - os.unlink(self.status_commands_stderr) -except OSError: - pass # Ignore fail self.commands_in_progress_lock = threading.RLock() self.commands_in_progress = {} @@ -516,9 +510,21 @@ class CustomServiceOrchestrator(): if logger.level == logging.DEBUG: override_output_files = False -res = self.runCommand(command_header, self.status_commands_stdout, - self.status_commands_stderr, self.COMMAND_NAME_STATUS, - override_output_files=override_output_files, is_status_command=True) +timestamp = time.time() +status_commands_stdout = self.status_commands_stdout.format(timestamp) +status_commands_stderr = self.status_commands_stderr.format(timestamp) + +try: + res = self.runCommand(command_header, status_commands_stdout, +status_commands_stderr, self.COMMAND_NAME_STATUS, +override_output_files=override_output_files, is_status_command=True) +finally: + try: +os.unlink(status_commands_stdout) +os.unlink(status_commands_stderr) + except OSError: +pass # Ignore failure + return res def resolve_script_path(self, base_dir, script):
[ambari] branch trunk updated: AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) (#1727)
This is an automated email from the ASF dual-hosted git repository. avijayan pushed a commit to branch trunk in repository https://gitbox.apache.org/repos/asf/ambari.git The following commit(s) were added to refs/heads/trunk by this push: new eca3633 AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) (#1727) eca3633 is described below commit eca3633d0ef1d7a1f464d14d3729a3ae11abe420 Author: aonishuk AuthorDate: Mon Jul 9 23:42:26 2018 +0300 AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) (#1727) --- .../main/python/ambari_agent/ComponentStatusExecutor.py| 2 +- .../main/python/ambari_agent/CustomServiceOrchestrator.py | 14 ++ .../src/main/python/ambari_agent/PythonExecutor.py | 11 +-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/ambari-agent/src/main/python/ambari_agent/ComponentStatusExecutor.py b/ambari-agent/src/main/python/ambari_agent/ComponentStatusExecutor.py index 5d20495..65af9b0 100644 --- a/ambari-agent/src/main/python/ambari_agent/ComponentStatusExecutor.py +++ b/ambari-agent/src/main/python/ambari_agent/ComponentStatusExecutor.py @@ -136,7 +136,7 @@ class ComponentStatusExecutor(threading.Thread): if status == LiveStatus.DEAD_STATUS: stderr = component_status_result['stderr'] if not "ComponentIsNotRunning" in stderr and not "ClientComponentHasNoStatus" in stderr: -logger.info("Status command for {0} failed:\n{1}".format(component_name, stderr)) +logger.info("Status command for {0} failed ({1}) :\n{2}".format(component_name, component_status_result, stderr)) result = { 'serviceName': service_name, diff --git a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py index 51a0d59..bcb3e9b 100644 --- a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py +++ b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py @@ -326,6 +326,7 @@ class CustomServiceOrchestrator(): try: command = self.generate_command(command_header) + logger.info("Generated command") script_type = command['commandParams']['script_type'] script = command['commandParams']['script'] timeout = int(command['commandParams']['command_timeout']) @@ -339,6 +340,8 @@ class CustomServiceOrchestrator(): command_name = command['roleCommand'] else: task_id = 'status' + + logger.info("[{0}]".format(task_id)) if forced_command_name is not None: # If not supplied as an argument command_name = forced_command_name @@ -422,11 +425,18 @@ class CustomServiceOrchestrator(): if log_out_files: script_params.append("-o") +logger.info("[{0}] python_executor.run_file({1})".format(task_id, (py_file, script_params, + tmpoutfile, tmperrfile, timeout, + tmpstrucoutfile, self.map_task_to_process, + task_id, override_output_files, backup_log_files, + handle, log_info_on_failure))) + ret = python_executor.run_file(py_file, script_params, tmpoutfile, tmperrfile, timeout, tmpstrucoutfile, self.map_task_to_process, task_id, override_output_files, backup_log_files = backup_log_files, handle = handle, log_info_on_failure=log_info_on_failure) +logger.info("[{0}] python_executor.run_file() returned {1}", task_id, ret) # Next run_file() invocations should always append to current output override_output_files = False if ret['exitcode'] != 0: @@ -437,6 +447,7 @@ class CustomServiceOrchestrator(): # if canceled and not background command if handle is None: +logger.info("[{0}] canceled", task_id) cancel_reason = self.command_canceled_reason(task_id) if cancel_reason is not None: ret['stdout'] += cancel_reason @@ -512,6 +523,7 @@ class CustomServiceOrchestrator(): Exit code 0 means that component is running and any other exit code means that component is not running """ +logger.info("Requesting component status {0}".format(command_header)) override_output_files=True # by default, we override status command output if logger.level == logging.DEBUG: override_output_files = False @@ -519,6 +531,8 @@ class CustomServiceOrchestrator(): res = self.runCommand(command_header, self.status_commands_stdout, self.status_commands_stderr, self.COMMAND_NAME_STATUS, override_output_files=override_output_files, is_status_command=True) + +