AMBARI-18997 ambari-server.pid might not be created on slow hardware (dsen)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/d8ba7f1b
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/d8ba7f1b
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/d8ba7f1b

Branch: refs/heads/branch-feature-AMBARI-18456
Commit: d8ba7f1bad5f9bb96256a5ca64c0568d74b874f8
Parents: 1fb8d07
Author: Dmytro Sen <d...@apache.org>
Authored: Wed Nov 30 13:50:37 2016 +0200
Committer: Dmytro Sen <d...@apache.org>
Committed: Wed Nov 30 13:50:37 2016 +0200

----------------------------------------------------------------------
 .../src/main/python/ambari_server/utils.py      | 60 ++++-------------
 .../src/main/python/ambari_server_main.py       | 70 ++++++++++++++------
 .../src/test/python/TestAmbariServer.py         |  6 +-
 ambari-server/src/test/python/TestUtils.py      | 37 +----------
 4 files changed, 66 insertions(+), 107 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/d8ba7f1b/ambari-server/src/main/python/ambari_server/utils.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/python/ambari_server/utils.py b/ambari-server/src/main/python/ambari_server/utils.py
index 5188a48..26e59ae 100644
--- a/ambari-server/src/main/python/ambari_server/utils.py
+++ b/ambari-server/src/main/python/ambari_server/utils.py
@@ -119,19 +119,19 @@ def save_pid(pid, pidfile):
       pass
 
 
-def save_main_pid_ex(pids, pidfile, exclude_list=[], kill_exclude_list=False, skip_daemonize=False):
+def save_main_pid_ex(pids, pidfile, exclude_list=[], skip_daemonize=False):
   """
     Save pid which is not included to exclude_list to pidfile.
-    If kill_exclude_list is set to true,  all processes in that
-    list would be killed. It's might be useful to daemonize child process
 
     exclude_list contains list of full executable paths which should be 
excluded
   """
+  pid_saved = False
   try:
     pfile = open(pidfile, "w")
     for item in pids:
       if pid_exists(item["pid"]) and (item["exe"] not in exclude_list):
         pfile.write("%s\n" % item["pid"])
+        pid_saved = True
         logger.info("Ambari server started with PID " + str(item["pid"]))
       if pid_exists(item["pid"]) and (item["exe"] in exclude_list) and not 
skip_daemonize:
         try:
@@ -147,67 +147,33 @@ def save_main_pid_ex(pids, pidfile, exclude_list=[], kill_exclude_list=False, sk
     except Exception as e:
       logger.error("Failed to close PID file " + pidfile + " due to " + str(e))
       pass
+  return pid_saved
 
-
-def wait_for_pid(pids, server_init_timeout, occupy_port_timeout, 
init_web_ui_timeout,
-                 server_out_file, db_check_log, properties):
+def get_live_pids_count(pids):
   """
-    Check pid for existence during timeout
+    Check pids for existence
   """
-  from ambari_server.serverConfiguration import get_ambari_server_ui_port
-  ambari_server_ui_port = int(get_ambari_server_ui_port(properties))
-  server_ui_port_occupied = False
-  tstart = time.time()
-  pid_live = 0
-  while int(time.time()-tstart) <= occupy_port_timeout and len(pids) > 0:
-    sys.stdout.write('.')
-    sys.stdout.flush()
-    pid_live = 0
-    for item in pids:
-      if pid_exists(item["pid"]):
-        pid_live += 1
-    time.sleep(1)
+  return len([pid for pid in pids if pid_exists(pid)])
+
+def wait_for_ui_start(ambari_server_ui_port, timeout=1):
 
+  tstart = time.time()
+  while int(time.time()-tstart) <= timeout:
     try:
       sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
       sock.settimeout(1)
       sock.connect(('localhost', ambari_server_ui_port))
       print "\nServer started listening on " + str(ambari_server_ui_port)
-      server_ui_port_occupied = True
-      break
+      return True
     except Exception as e:
       #print str(e)
       pass
 
-  if 'Database consistency check: failed' in open(server_out_file).read():
-    print "\nDB configs consistency check failed. Run \"ambari-server start 
--skip-database-check\" to skip. " \
-          "If you use this \"--skip-database-check\" option, do not make any 
changes to your cluster topology " \
-          "or perform a cluster upgrade until you correct the database 
consistency issues. See " + \
-          db_check_log + "for more details on the consistency issues."
-  elif 'Database consistency check: warning' in open(server_out_file).read():
-    print "\nDB configs consistency check found warnings. See " + db_check_log 
+ " for more details."
-  else:
-    print "\nDB configs consistency check: no errors and warnings were found."
-
-
-  if not server_ui_port_occupied:
-    raise FatalException(1, "Server not yet listening on http port " + 
str(ambari_server_ui_port) +
-                            " after " + str(occupy_port_timeout + 
server_init_timeout) + " seconds. Exiting.")
-
-  tstart = time.time()
-  print "Waiting for 10 seconds, for server WEB UI initialization"
-  while int(time.time()-tstart) <= init_web_ui_timeout and len(pids) > 0:
     sys.stdout.write('.')
     sys.stdout.flush()
-    pid_live = 0
-    for item in pids:
-      if pid_exists(item["pid"]):
-        pid_live += 1
     time.sleep(1)
 
-
-  return pid_live
-
+  return False
 
 def get_symlink_path(path_to_link):
   """

http://git-wip-us.apache.org/repos/asf/ambari/blob/d8ba7f1b/ambari-server/src/main/python/ambari_server_main.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/python/ambari_server_main.py b/ambari-server/src/main/python/ambari_server_main.py
index 62f8980..b642cea 100644
--- a/ambari-server/src/main/python/ambari_server_main.py
+++ b/ambari-server/src/main/python/ambari_server_main.py
@@ -1,5 +1,3 @@
-SERVER_INIT_TIMEOUT = 5
-SERVER_START_TIMEOUT = 30
 #!/usr/bin/env python
 
 '''
@@ -33,15 +31,15 @@ from ambari_commons.os_utils import is_root, run_os_command
 from ambari_server.dbConfiguration import ensure_dbms_is_running, 
ensure_jdbc_driver_is_installed
 from ambari_server.serverConfiguration import configDefaults, find_jdk, 
get_ambari_properties, \
   get_conf_dir, get_is_persisted, get_is_secure, get_java_exe_path, 
get_original_master_key, read_ambari_user, \
-  get_is_active_instance, update_properties, \
+  get_is_active_instance, update_properties, get_ambari_server_ui_port, \
   PID_NAME, SECURITY_KEY_ENV_VAR_NAME, SECURITY_MASTER_KEY_LOCATION, \
   SETUP_OR_UPGRADE_MSG, check_database_name_property, parse_properties_file, 
get_missing_properties
 from ambari_server.serverUtils import refresh_stack_hash
 from ambari_server.setupHttps import get_fqdn
 from ambari_server.setupSecurity import generate_env, \
   ensure_can_start_under_current_user
-from ambari_server.utils import check_reverse_lookup, save_pid, locate_file, 
locate_all_file_paths, looking_for_pid, wait_for_pid, \
-  save_main_pid_ex, check_exitcode
+from ambari_server.utils import check_reverse_lookup, save_pid, locate_file, locate_all_file_paths, looking_for_pid, \
+  save_main_pid_ex, check_exitcode, get_live_pids_count, wait_for_ui_start
 from ambari_server.serverClassPath import ServerClassPath
 
 logger = logging.getLogger(__name__)
@@ -103,9 +101,9 @@ SERVER_START_CMD_DEBUG_WINDOWS = "{0} " \
     "-cp {3} " \
     "org.apache.ambari.server.controller.AmbariServer"
 
-SERVER_INIT_TIMEOUT = 5   #seconds
-WEB_UI_INIT_TIME = 10     #seconds
-SERVER_START_TIMEOUT = 50 #seconds
+SERVER_START_TIMEOUT = 5  #seconds
+SERVER_START_RETRIES = 4
+WEB_UI_INIT_TIME = 50     #seconds
 
 SERVER_PING_TIMEOUT_WINDOWS = 5
 SERVER_PING_ATTEMPTS_WINDOWS = 4
@@ -117,6 +115,7 @@ EXITCODE_NAME = "ambari-server.exitcode"
 CHECK_DATABASE_SKIPPED_PROPERTY = "check_database_skipped"
 
 AMBARI_SERVER_DIE_MSG = "Ambari Server java process died with exitcode {0}. 
Check {1} for more information."
+AMBARI_SERVER_NOT_STARTED_MSG = "Ambari Server java process hasn't been started or can't be determined."
 
 # linux open-file limit
 ULIMIT_OPEN_FILES_KEY = 'ulimit.open.files'
@@ -208,21 +207,48 @@ def wait_for_server_start(pidFile, scmStatus):
   #wait for server process for SERVER_START_TIMEOUT seconds
   sys.stdout.write('Waiting for server start...')
   sys.stdout.flush()
-
-  pids = looking_for_pid(SERVER_SEARCH_PATTERN, SERVER_INIT_TIMEOUT)
-  found_pids = wait_for_pid(pids, SERVER_INIT_TIMEOUT, SERVER_START_TIMEOUT, 
WEB_UI_INIT_TIME,
-                            configDefaults.SERVER_OUT_FILE, 
configDefaults.DB_CHECK_LOG, properties)
-
-  sys.stdout.write('\n')
-  sys.stdout.flush()
-
-  if found_pids <= 0:
+  pids = []
+  server_started = False
+  # looking_for_pid() might return partial pid list on slow hardware
+  for i in range(1, SERVER_START_RETRIES):
+    pids = looking_for_pid(SERVER_SEARCH_PATTERN, SERVER_START_TIMEOUT)
+
+    sys.stdout.write('\n')
+    sys.stdout.flush()
+
+    if save_main_pid_ex(pids, pidFile, locate_all_file_paths('sh', '/bin') +
+                        locate_all_file_paths('bash', '/bin') +
+                        locate_all_file_paths('dash', '/bin'), IS_FOREGROUND):
+      server_started = True
+      break
+    else:
+      sys.stdout.write("Unable to determine server PID. Retrying...\n")
+      sys.stdout.flush()
+
+  exception = None
+  if server_started:
+    ambari_server_ui_port = get_ambari_server_ui_port(properties)
+    if not wait_for_ui_start(int(ambari_server_ui_port), WEB_UI_INIT_TIME):
+      exception = FatalException(1, "Server not yet listening on http port " + ambari_server_ui_port + \
+                                 " after " + str(WEB_UI_INIT_TIME) + " seconds. Exiting.")
+  elif get_live_pids_count(pids) <= 0:
     exitcode = check_exitcode(os.path.join(configDefaults.PID_DIR, 
EXITCODE_NAME))
-    raise FatalException(-1, AMBARI_SERVER_DIE_MSG.format(exitcode, 
configDefaults.SERVER_OUT_FILE))
+    exception = FatalException(-1, AMBARI_SERVER_DIE_MSG.format(exitcode, configDefaults.SERVER_OUT_FILE))
+  else:
+    exception = FatalException(-1, AMBARI_SERVER_NOT_STARTED_MSG)
+
+  if 'Database consistency check: failed' in open(configDefaults.SERVER_OUT_FILE).read():
+    print "DB configs consistency check failed. Run \"ambari-server start --skip-database-check\" to skip. " \
+          "If you use this \"--skip-database-check\" option, do not make any changes to your cluster topology " \
+          "or perform a cluster upgrade until you correct the database consistency issues. See " + \
+          configDefaults.DB_CHECK_LOG + " for more details on the consistency issues."
+  elif 'Database consistency check: warning' in open(configDefaults.SERVER_OUT_FILE).read():
+    print "DB configs consistency check found warnings. See " + configDefaults.DB_CHECK_LOG + " for more details."
   else:
-    save_main_pid_ex(pids, pidFile, locate_all_file_paths('sh', '/bin') +
-                                     locate_all_file_paths('bash', '/bin') +
-                                     locate_all_file_paths('dash', '/bin'), 
True, IS_FOREGROUND)
+    print "DB configs consistency check: no errors and warnings were found."
+
+  if exception:
+    raise exception
 
 
 def server_process_main(options, scmStatus=None):
@@ -355,7 +381,7 @@ def server_process_main(options, scmStatus=None):
     raise FatalException(-1, AMBARI_SERVER_DIE_MSG.format(exitcode, 
configDefaults.SERVER_OUT_FILE))
   else:
     pidfile = os.path.join(configDefaults.PID_DIR, PID_NAME)
-    save_pid(pidJava, pidfile)
+
     print "Server PID at: "+pidfile
     print "Server out at: "+configDefaults.SERVER_OUT_FILE
     print "Server log at: "+configDefaults.SERVER_LOG_FILE

http://git-wip-us.apache.org/repos/asf/ambari/blob/d8ba7f1b/ambari-server/src/test/python/TestAmbariServer.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/TestAmbariServer.py b/ambari-server/src/test/python/TestAmbariServer.py
index 76857cd..81e1a22 100644
--- a/ambari-server/src/test/python/TestAmbariServer.py
+++ b/ambari-server/src/test/python/TestAmbariServer.py
@@ -4398,7 +4398,7 @@ class TestAmbariServer(TestCase):
   @patch("sys.stdout.flush")
   @patch("sys.stdout.write")
   @patch("ambari_server_main.looking_for_pid")
-  @patch("ambari_server_main.wait_for_pid")
+  @patch("ambari_server_main.wait_for_ui_start")
   @patch("ambari_server_main.save_main_pid_ex")
   @patch("ambari_server_main.check_exitcode")
   @patch("os.makedirs")
@@ -4448,7 +4448,7 @@ class TestAmbariServer(TestCase):
                  save_master_key_method, get_master_key_location_method,
                  os_chown_mock, is_server_running_mock, locate_file_mock,
                  os_makedirs_mock, check_exitcode_mock, save_main_pid_ex_mock,
-                 wait_for_pid_mock, looking_for_pid_mock, stdout_write_mock, 
stdout_flush_mock,
+                 wait_for_ui_start_mock, looking_for_pid_mock, stdout_write_mock, stdout_flush_mock,
                  get_is_active_instance_mock):
 
     def reset_mocks():
@@ -4485,7 +4485,7 @@ class TestAmbariServer(TestCase):
         "exe": "/test",
         "cmd": "test arg"
     }]
-    wait_for_pid_mock.return_value = 1
+    wait_for_ui_start_mock.return_value = True
     check_exitcode_mock.return_value = 0
 
     p = Properties()

http://git-wip-us.apache.org/repos/asf/ambari/blob/d8ba7f1b/ambari-server/src/test/python/TestUtils.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/TestUtils.py b/ambari-server/src/test/python/TestUtils.py
index bedd75c..268d7e9 100644
--- a/ambari-server/src/test/python/TestUtils.py
+++ b/ambari-server/src/test/python/TestUtils.py
@@ -120,39 +120,6 @@ class TestUtils(TestCase):
     normpath_mock.return_value = "test value"
     self.assertEquals(utils.get_symlink_path("/"), "test value")
 
-  @patch('time.time')
-  @patch.object(utils, 'pid_exists')
-  @patch('time.sleep')
-  @patch("socket.socket")
-  @patch('__builtin__.open')
-  def test_wait_for_pid(self, open_mock, socket_mock, sleep_mock, 
pid_exists_mock, time_mock):
-    from ambari_server.serverConfiguration import SSL_API, 
CLIENT_API_PORT_PROPERTY
-    pid_exists_mock.return_value = True
-    time_mock.side_effect = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51]
-    s = socket_mock.return_value
-    s.connect = MagicMock()
-    properties = FakeProperties({
-      SSL_API: "false",
-      CLIENT_API_PORT_PROPERTY: "8080"
-    })
-    out = StringIO.StringIO()
-    sys.stdout = out
-    live_pids = utils.wait_for_pid([
-                                   {"pid": "111",
-                                    "exe": "",
-                                    "cmd": ""
-                                    },
-                                   {"pid": "222",
-                                    "exe": "",
-                                    "cmd": ""
-                                    },
-                                   ], 5, 40, 10, '', '', properties)
-    self.assertEqual(".\nServer started listening on 8080\n\nDB configs 
consistency check: no errors and warnings were "
-                     "found.\nWaiting for 10 seconds, for server WEB UI 
initialization\n........", out.getvalue())
-    sys.stdout = sys.__stdout__
-
-    self.assertEquals(2, live_pids)
-
   @patch.object(utils, 'pid_exists')
   @patch('__builtin__.open')
   @patch('os.kill')
@@ -179,7 +146,7 @@ class TestUtils(TestCase):
                              "exe": "/exe2",
                              "cmd": ""
                              },
-                            ], "/pidfile", ["/exe1"], True)
+                            ], "/pidfile", ["/exe1"])
     self.assertEquals(open_mock.call_count, 1)
     self.assertEquals(pid_exists_mock.call_count, 4)
     self.assertEquals(kill_mock.call_count, 1)
@@ -257,4 +224,4 @@ class FakeProperties(object):
     self.prop_map = prop_map
 
   def get_property(self, prop_name):
-    return self.prop_map[prop_name]
\ No newline at end of file
+    return self.prop_map[prop_name]

Reply via email to