[ambari] branch trunk updated: AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)

2018-07-23 Thread aonishuk
This is an automated email from the ASF dual-hosted git repository.

aonishuk pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/ambari.git


The following commit(s) were added to refs/heads/trunk by this push:
 new 5ecf5e8  AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)
5ecf5e8 is described below

commit 5ecf5e8c4ecbd1cbd21699e53e27786729df8dbb
Author: Andrew Onishuk 
AuthorDate: Mon Jul 23 16:38:53 2018 +0300

AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)
---
 .../src/main/python/ambari_agent/CustomServiceOrchestrator.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
index 32c4094..c66a623 100644
--- a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
+++ b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
@@ -23,6 +23,7 @@ import os
 import ambari_simplejson as json
 import sys
 import time
+import uuid
 from ambari_commons import shell
 import threading
 from collections import defaultdict
@@ -516,9 +517,9 @@ class CustomServiceOrchestrator():
 if logger.level == logging.DEBUG:
   override_output_files = False
 
-timestamp = time.time()
-status_commands_stdout = self.status_commands_stdout.format(timestamp)
-status_commands_stderr = self.status_commands_stderr.format(timestamp)
+# make sure status commands that run in parallel don't use the same files
+status_commands_stdout = self.status_commands_stdout.format(uuid.uuid4())
+status_commands_stderr = self.status_commands_stderr.format(uuid.uuid4())
 
 try:
   res = self.runCommand(command_header, status_commands_stdout,
@@ -568,8 +569,8 @@ class CustomServiceOrchestrator():
 command_type = command['commandType']
 from ActionQueue import ActionQueue  # To avoid cyclic dependency
 if command_type == ActionQueue.STATUS_COMMAND:
-  timestamp = time.time()
-  file_path = os.path.join(self.tmp_dir, 
"status_command_{0}.json".format(timestamp))
+  # make sure status commands that run in parallel don't use the same files
+  file_path = os.path.join(self.tmp_dir, 
"status_command_{0}.json".format(uuid.uuid4()))
 else:
   task_id = command['taskId']
   file_path = os.path.join(self.tmp_dir, 
"command-{0}.json".format(task_id))



[ambari] branch trunk updated: AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)

2018-07-11 Thread aonishuk
This is an automated email from the ASF dual-hosted git repository.

aonishuk pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/ambari.git


The following commit(s) were added to refs/heads/trunk by this push:
 new 11ea165  AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)
11ea165 is described below

commit 11ea165b7c6d8da5462011fff10b6420b4f410c1
Author: Andrew Onishuk 
AuthorDate: Wed Jul 11 10:07:05 2018 +0300

AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)
---
 .../main/python/ambari_agent/CustomServiceOrchestrator.py   | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
index 9c1fb3e..32c4094 100644
--- a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
+++ b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
@@ -456,7 +456,13 @@ class CustomServiceOrchestrator():
   if incremented_commands_for_component:
 self.commands_for_component_in_progress[cluster_id][command['role']] 
-= 1
 
-  self.conditionally_remove_command_file(json_path, ret)
+  if is_status_command and json_path:
+try:
+  os.unlink(json_path)
+except OSError:
+  pass  # Ignore failure
+  else:
+self.conditionally_remove_command_file(json_path, ret)
 
 return ret
 
@@ -562,9 +568,8 @@ class CustomServiceOrchestrator():
 command_type = command['commandType']
 from ActionQueue import ActionQueue  # To avoid cyclic dependency
 if command_type == ActionQueue.STATUS_COMMAND:
-  # These files are frequently created, that's why we don't
-  # store them all, but only the latest one
-  file_path = os.path.join(self.tmp_dir, "status_command.json")
+  timestamp = time.time()
+  file_path = os.path.join(self.tmp_dir, 
"status_command_{0}.json".format(timestamp))
 else:
   task_id = command['taskId']
   file_path = os.path.join(self.tmp_dir, 
"command-{0}.json".format(task_id))



[ambari] branch trunk updated: AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)

2018-07-10 Thread aonishuk
This is an automated email from the ASF dual-hosted git repository.

aonishuk pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/ambari.git


The following commit(s) were added to refs/heads/trunk by this push:
 new d3cf7ac  AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)
d3cf7ac is described below

commit d3cf7ac2c137189bea82d77d20ebf2dc891e7e71
Author: Andrew Onishuk 
AuthorDate: Tue Jul 10 21:28:26 2018 +0300

AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk)
---
 .../ambari_agent/CustomServiceOrchestrator.py  | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
index 51a0d59..9c1fb3e 100644
--- a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
+++ b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
@@ -22,6 +22,7 @@ import logging
 import os
 import ambari_simplejson as json
 import sys
+import time
 from ambari_commons import shell
 import threading
 from collections import defaultdict
@@ -86,9 +87,9 @@ class CustomServiceOrchestrator():
 self.exec_tmp_dir = AGENT_TMP_DIR
 self.file_cache = initializer_module.file_cache
 self.status_commands_stdout = os.path.join(self.tmp_dir,
-   'status_command_stdout.txt')
+   'status_command_stdout_{0}.txt')
 self.status_commands_stderr = os.path.join(self.tmp_dir,
-   'status_command_stderr.txt')
+   'status_command_stderr_{0}.txt')
 
 # Construct the hadoop credential lib JARs path
 self.credential_shell_lib_path = os.path.join(self.config.get('security', 
'credential_lib_dir',
@@ -97,13 +98,6 @@ class CustomServiceOrchestrator():
 self.credential_conf_dir = self.config.get('security', 
'credential_conf_dir', self.DEFAULT_CREDENTIAL_CONF_DIR)
 
 self.credential_shell_cmd = self.config.get('security', 
'credential_shell_cmd', self.DEFAULT_CREDENTIAL_SHELL_CMD)
-
-# Clean up old status command files if any
-try:
-  os.unlink(self.status_commands_stdout)
-  os.unlink(self.status_commands_stderr)
-except OSError:
-  pass # Ignore fail
 self.commands_in_progress_lock = threading.RLock()
 self.commands_in_progress = {}
 
@@ -516,9 +510,21 @@ class CustomServiceOrchestrator():
 if logger.level == logging.DEBUG:
   override_output_files = False
 
-res = self.runCommand(command_header, self.status_commands_stdout,
-  self.status_commands_stderr, 
self.COMMAND_NAME_STATUS,
-  override_output_files=override_output_files, 
is_status_command=True)
+timestamp = time.time()
+status_commands_stdout = self.status_commands_stdout.format(timestamp)
+status_commands_stderr = self.status_commands_stderr.format(timestamp)
+
+try:
+  res = self.runCommand(command_header, status_commands_stdout,
+status_commands_stderr, self.COMMAND_NAME_STATUS,
+override_output_files=override_output_files, 
is_status_command=True)
+finally:
+  try:
+os.unlink(status_commands_stdout)
+os.unlink(status_commands_stderr)
+  except OSError:
+pass # Ignore failure
+
 return res
 
   def resolve_script_path(self, base_dir, script):



[ambari] branch trunk updated: AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) (#1727)

2018-07-09 Thread avijayan
This is an automated email from the ASF dual-hosted git repository.

avijayan pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/ambari.git


The following commit(s) were added to refs/heads/trunk by this push:
 new eca3633  AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) (#1727)
eca3633 is described below

commit eca3633d0ef1d7a1f464d14d3729a3ae11abe420
Author: aonishuk 
AuthorDate: Mon Jul 9 23:42:26 2018 +0300

AMBARI-24270. Agent Status Command Are Randomly Failing With Empty stderr (aonishuk) (#1727)
---
 .../main/python/ambari_agent/ComponentStatusExecutor.py|  2 +-
 .../main/python/ambari_agent/CustomServiceOrchestrator.py  | 14 ++
 .../src/main/python/ambari_agent/PythonExecutor.py | 11 +--
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/ambari-agent/src/main/python/ambari_agent/ComponentStatusExecutor.py b/ambari-agent/src/main/python/ambari_agent/ComponentStatusExecutor.py
index 5d20495..65af9b0 100644
--- a/ambari-agent/src/main/python/ambari_agent/ComponentStatusExecutor.py
+++ b/ambari-agent/src/main/python/ambari_agent/ComponentStatusExecutor.py
@@ -136,7 +136,7 @@ class ComponentStatusExecutor(threading.Thread):
 if status == LiveStatus.DEAD_STATUS:
   stderr = component_status_result['stderr']
   if not "ComponentIsNotRunning" in stderr and not 
"ClientComponentHasNoStatus" in stderr:
-logger.info("Status command for {0} 
failed:\n{1}".format(component_name, stderr))
+logger.info("Status command for {0} failed ({1}) 
:\n{2}".format(component_name, component_status_result, stderr))
 
 result = {
   'serviceName': service_name,
diff --git a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
index 51a0d59..bcb3e9b 100644
--- a/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
+++ b/ambari-agent/src/main/python/ambari_agent/CustomServiceOrchestrator.py
@@ -326,6 +326,7 @@ class CustomServiceOrchestrator():
 
 try:
   command = self.generate_command(command_header)
+  logger.info("Generated command")
   script_type = command['commandParams']['script_type']
   script = command['commandParams']['script']
   timeout = int(command['commandParams']['command_timeout'])
@@ -339,6 +340,8 @@ class CustomServiceOrchestrator():
 command_name = command['roleCommand']
   else:
 task_id = 'status'
+
+  logger.info("[{0}]".format(task_id))
 
   if forced_command_name is not None:  # If not supplied as an argument
 command_name = forced_command_name
@@ -422,11 +425,18 @@ class CustomServiceOrchestrator():
 if log_out_files:
   script_params.append("-o")
 
+logger.info("[{0}] python_executor.run_file({1})".format(task_id, 
(py_file, script_params,
+   tmpoutfile, tmperrfile, timeout,
+   tmpstrucoutfile, self.map_task_to_process,
+   task_id, override_output_files, 
backup_log_files,
+   handle, log_info_on_failure)))
+   
 ret = python_executor.run_file(py_file, script_params,
tmpoutfile, tmperrfile, timeout,
tmpstrucoutfile, self.map_task_to_process,
task_id, override_output_files, 
backup_log_files = backup_log_files,
handle = handle, 
log_info_on_failure=log_info_on_failure)
+logger.info("[{0}] python_executor.run_file() returned {1}", task_id, 
ret)
 # Next run_file() invocations should always append to current output
 override_output_files = False
 if ret['exitcode'] != 0:
@@ -437,6 +447,7 @@ class CustomServiceOrchestrator():
 
   # if canceled and not background command
   if handle is None:
+logger.info("[{0}] canceled", task_id)
 cancel_reason = self.command_canceled_reason(task_id)
 if cancel_reason is not None:
   ret['stdout'] += cancel_reason
@@ -512,6 +523,7 @@ class CustomServiceOrchestrator():
  Exit code 0 means that component is running and any other exit code means 
that
  component is not running
 """
+logger.info("Requesting component status {0}".format(command_header))
 override_output_files=True # by default, we override status command output
 if logger.level == logging.DEBUG:
   override_output_files = False
@@ -519,6 +531,8 @@ class CustomServiceOrchestrator():
 res = self.runCommand(command_header, self.status_commands_stdout,
   self.status_commands_stderr, 
self.COMMAND_NAME_STATUS,
   override_output_files=override_output_files, 
is_status_command=True)
+  
+