AMBARI-18997 ambari-server.pid might not be created on slow hardware (dsen)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/d8ba7f1b Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/d8ba7f1b Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/d8ba7f1b Branch: refs/heads/branch-feature-AMBARI-18456 Commit: d8ba7f1bad5f9bb96256a5ca64c0568d74b874f8 Parents: 1fb8d07 Author: Dmytro Sen <d...@apache.org> Authored: Wed Nov 30 13:50:37 2016 +0200 Committer: Dmytro Sen <d...@apache.org> Committed: Wed Nov 30 13:50:37 2016 +0200 ---------------------------------------------------------------------- .../src/main/python/ambari_server/utils.py | 60 ++++------------- .../src/main/python/ambari_server_main.py | 70 ++++++++++++++------ .../src/test/python/TestAmbariServer.py | 6 +- ambari-server/src/test/python/TestUtils.py | 37 +---------- 4 files changed, 66 insertions(+), 107 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/d8ba7f1b/ambari-server/src/main/python/ambari_server/utils.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/python/ambari_server/utils.py b/ambari-server/src/main/python/ambari_server/utils.py index 5188a48..26e59ae 100644 --- a/ambari-server/src/main/python/ambari_server/utils.py +++ b/ambari-server/src/main/python/ambari_server/utils.py @@ -119,19 +119,19 @@ def save_pid(pid, pidfile): pass -def save_main_pid_ex(pids, pidfile, exclude_list=[], kill_exclude_list=False, skip_daemonize=False): +def save_main_pid_ex(pids, pidfile, exclude_list=[], skip_daemonize=False): """ Save pid which is not included to exclude_list to pidfile. - If kill_exclude_list is set to true, all processes in that - list would be killed. It's might be useful to daemonize child process exclude_list contains list of full executable paths which should be excluded """ + pid_saved = False try: pfile = open(pidfile, "w") for item in pids: if pid_exists(item["pid"]) and (item["exe"] not in exclude_list): pfile.write("%s\n" % item["pid"]) + pid_saved = True logger.info("Ambari server started with PID " + str(item["pid"])) if pid_exists(item["pid"]) and (item["exe"] in exclude_list) and not skip_daemonize: try: @@ -147,67 +147,33 @@ def save_main_pid_ex(pids, pidfile, exclude_list=[], kill_exclude_list=False, sk except Exception as e: logger.error("Failed to close PID file " + pidfile + " due to " + str(e)) pass + return pid_saved - -def wait_for_pid(pids, server_init_timeout, occupy_port_timeout, init_web_ui_timeout, - server_out_file, db_check_log, properties): +def get_live_pids_count(pids): """ - Check pid for existence during timeout + Check pids for existence """ - from ambari_server.serverConfiguration import get_ambari_server_ui_port - ambari_server_ui_port = int(get_ambari_server_ui_port(properties)) - server_ui_port_occupied = False - tstart = time.time() - pid_live = 0 - while int(time.time()-tstart) <= occupy_port_timeout and len(pids) > 0: - sys.stdout.write('.') - sys.stdout.flush() - pid_live = 0 - for item in pids: - if pid_exists(item["pid"]): - pid_live += 1 - time.sleep(1) + return len([pid for pid in pids if pid_exists(pid)]) + +def wait_for_ui_start(ambari_server_ui_port, timeout=1): + tstart = time.time() + while int(time.time()-tstart) <= timeout: try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.settimeout(1) sock.connect(('localhost', ambari_server_ui_port)) print "\nServer started listening on " + str(ambari_server_ui_port) - server_ui_port_occupied = True - break + return True except Exception as e: #print str(e) pass - if 'Database consistency check: failed' in open(server_out_file).read(): - print "\nDB configs consistency check failed. Run \"ambari-server start --skip-database-check\" to skip. " \ - "If you use this \"--skip-database-check\" option, do not make any changes to your cluster topology " \ - "or perform a cluster upgrade until you correct the database consistency issues. See " + \ - db_check_log + "for more details on the consistency issues." - elif 'Database consistency check: warning' in open(server_out_file).read(): - print "\nDB configs consistency check found warnings. See " + db_check_log + " for more details." - else: - print "\nDB configs consistency check: no errors and warnings were found." - - - if not server_ui_port_occupied: - raise FatalException(1, "Server not yet listening on http port " + str(ambari_server_ui_port) + - " after " + str(occupy_port_timeout + server_init_timeout) + " seconds. Exiting.") - - tstart = time.time() - print "Waiting for 10 seconds, for server WEB UI initialization" - while int(time.time()-tstart) <= init_web_ui_timeout and len(pids) > 0: sys.stdout.write('.') sys.stdout.flush() - pid_live = 0 - for item in pids: - if pid_exists(item["pid"]): - pid_live += 1 time.sleep(1) - - return pid_live - + return False def get_symlink_path(path_to_link): """ http://git-wip-us.apache.org/repos/asf/ambari/blob/d8ba7f1b/ambari-server/src/main/python/ambari_server_main.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/python/ambari_server_main.py b/ambari-server/src/main/python/ambari_server_main.py index 62f8980..b642cea 100644 --- a/ambari-server/src/main/python/ambari_server_main.py +++ b/ambari-server/src/main/python/ambari_server_main.py @@ -1,5 +1,3 @@ -SERVER_INIT_TIMEOUT = 5 -SERVER_START_TIMEOUT = 30 #!/usr/bin/env python ''' @@ -33,15 +31,15 @@ from ambari_commons.os_utils import is_root, run_os_command from ambari_server.dbConfiguration import ensure_dbms_is_running, ensure_jdbc_driver_is_installed from ambari_server.serverConfiguration import configDefaults, find_jdk, get_ambari_properties, \ get_conf_dir, get_is_persisted, get_is_secure, get_java_exe_path, get_original_master_key, read_ambari_user, \ - get_is_active_instance, update_properties, \ + get_is_active_instance, update_properties, get_ambari_server_ui_port, \ PID_NAME, SECURITY_KEY_ENV_VAR_NAME, SECURITY_MASTER_KEY_LOCATION, \ SETUP_OR_UPGRADE_MSG, check_database_name_property, parse_properties_file, get_missing_properties from ambari_server.serverUtils import refresh_stack_hash from ambari_server.setupHttps import get_fqdn from ambari_server.setupSecurity import generate_env, \ ensure_can_start_under_current_user -from ambari_server.utils import check_reverse_lookup, save_pid, locate_file, locate_all_file_paths, looking_for_pid, wait_for_pid, \ - save_main_pid_ex, check_exitcode +from ambari_server.utils import check_reverse_lookup, save_pid, locate_file, locate_all_file_paths, looking_for_pid, \ + save_main_pid_ex, check_exitcode, get_live_pids_count, wait_for_ui_start from ambari_server.serverClassPath import ServerClassPath logger = logging.getLogger(__name__) @@ -103,9 +101,9 @@ SERVER_START_CMD_DEBUG_WINDOWS = "{0} " \ "-cp {3} " \ "org.apache.ambari.server.controller.AmbariServer" -SERVER_INIT_TIMEOUT = 5 #seconds -WEB_UI_INIT_TIME = 10 #seconds -SERVER_START_TIMEOUT = 50 #seconds +SERVER_START_TIMEOUT = 5 #seconds +SERVER_START_RETRIES = 4 +WEB_UI_INIT_TIME = 50 #seconds SERVER_PING_TIMEOUT_WINDOWS = 5 SERVER_PING_ATTEMPTS_WINDOWS = 4 @@ -117,6 +115,7 @@ EXITCODE_NAME = "ambari-server.exitcode" CHECK_DATABASE_SKIPPED_PROPERTY = "check_database_skipped" AMBARI_SERVER_DIE_MSG = "Ambari Server java process died with exitcode {0}. Check {1} for more information." +AMBARI_SERVER_NOT_STARTED_MSG = "Ambari Server java process hasn't been started or can't be determined." # linux open-file limit ULIMIT_OPEN_FILES_KEY = 'ulimit.open.files' @@ -208,21 +207,48 @@ def wait_for_server_start(pidFile, scmStatus): #wait for server process for SERVER_START_TIMEOUT seconds sys.stdout.write('Waiting for server start...') sys.stdout.flush() - - pids = looking_for_pid(SERVER_SEARCH_PATTERN, SERVER_INIT_TIMEOUT) - found_pids = wait_for_pid(pids, SERVER_INIT_TIMEOUT, SERVER_START_TIMEOUT, WEB_UI_INIT_TIME, - configDefaults.SERVER_OUT_FILE, configDefaults.DB_CHECK_LOG, properties) - - sys.stdout.write('\n') - sys.stdout.flush() - - if found_pids <= 0: + pids = [] + server_started = False + # looking_for_pid() might return partrial pid list on slow hardware + for i in range(1, SERVER_START_RETRIES): + pids = looking_for_pid(SERVER_SEARCH_PATTERN, SERVER_START_TIMEOUT) + + sys.stdout.write('\n') + sys.stdout.flush() + + if save_main_pid_ex(pids, pidFile, locate_all_file_paths('sh', '/bin') + + locate_all_file_paths('bash', '/bin') + + locate_all_file_paths('dash', '/bin'), IS_FOREGROUND): + server_started = True + break + else: + sys.stdout.write("Unable to determine server PID. Retrying...\n") + sys.stdout.flush() + + exception = None + if server_started: + ambari_server_ui_port = get_ambari_server_ui_port(properties) + if not wait_for_ui_start(int(ambari_server_ui_port), WEB_UI_INIT_TIME): + exception = FatalException(1, "Server not yet listening on http port " + ambari_server_ui_port + \ + " after " + str(WEB_UI_INIT_TIME) + " seconds. Exiting.") + elif get_live_pids_count(pids) <= 0: exitcode = check_exitcode(os.path.join(configDefaults.PID_DIR, EXITCODE_NAME)) - raise FatalException(-1, AMBARI_SERVER_DIE_MSG.format(exitcode, configDefaults.SERVER_OUT_FILE)) + exception = FatalException(-1, AMBARI_SERVER_DIE_MSG.format(exitcode, configDefaults.SERVER_OUT_FILE)) + else: + exception = FatalException(-1, AMBARI_SERVER_NOT_STARTED_MSG) + + if 'Database consistency check: failed' in open(configDefaults.SERVER_OUT_FILE).read(): + print "DB configs consistency check failed. Run \"ambari-server start --skip-database-check\" to skip. " \ + "If you use this \"--skip-database-check\" option, do not make any changes to your cluster topology " \ + "or perform a cluster upgrade until you correct the database consistency issues. See " + \ + configDefaults.DB_CHECK_LOG + " for more details on the consistency issues." + elif 'Database consistency check: warning' in open(configDefaults.SERVER_OUT_FILE).read(): + print "DB configs consistency check found warnings. See " + configDefaults.DB_CHECK_LOG + " for more details." else: - save_main_pid_ex(pids, pidFile, locate_all_file_paths('sh', '/bin') + - locate_all_file_paths('bash', '/bin') + - locate_all_file_paths('dash', '/bin'), True, IS_FOREGROUND) + print "DB configs consistency check: no errors and warnings were found." + + if exception: + raise exception def server_process_main(options, scmStatus=None): @@ -355,7 +381,7 @@ def server_process_main(options, scmStatus=None): raise FatalException(-1, AMBARI_SERVER_DIE_MSG.format(exitcode, configDefaults.SERVER_OUT_FILE)) else: pidfile = os.path.join(configDefaults.PID_DIR, PID_NAME) - save_pid(pidJava, pidfile) + print "Server PID at: "+pidfile print "Server out at: "+configDefaults.SERVER_OUT_FILE print "Server log at: "+configDefaults.SERVER_LOG_FILE http://git-wip-us.apache.org/repos/asf/ambari/blob/d8ba7f1b/ambari-server/src/test/python/TestAmbariServer.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/TestAmbariServer.py b/ambari-server/src/test/python/TestAmbariServer.py index 76857cd..81e1a22 100644 --- a/ambari-server/src/test/python/TestAmbariServer.py +++ b/ambari-server/src/test/python/TestAmbariServer.py @@ -4398,7 +4398,7 @@ class TestAmbariServer(TestCase): @patch("sys.stdout.flush") @patch("sys.stdout.write") @patch("ambari_server_main.looking_for_pid") - @patch("ambari_server_main.wait_for_pid") + @patch("ambari_server_main.wait_for_ui_start") @patch("ambari_server_main.save_main_pid_ex") @patch("ambari_server_main.check_exitcode") @patch("os.makedirs") @@ -4448,7 +4448,7 @@ class TestAmbariServer(TestCase): save_master_key_method, get_master_key_location_method, os_chown_mock, is_server_running_mock, locate_file_mock, os_makedirs_mock, check_exitcode_mock, save_main_pid_ex_mock, - wait_for_pid_mock, looking_for_pid_mock, stdout_write_mock, stdout_flush_mock, + wait_for_ui_start_mock, looking_for_pid_mock, stdout_write_mock, stdout_flush_mock, get_is_active_instance_mock): def reset_mocks(): @@ -4485,7 +4485,7 @@ class TestAmbariServer(TestCase): "exe": "/test", "cmd": "test arg" }] - wait_for_pid_mock.return_value = 1 + wait_for_ui_start_mock.return_value = True check_exitcode_mock.return_value = 0 p = Properties() http://git-wip-us.apache.org/repos/asf/ambari/blob/d8ba7f1b/ambari-server/src/test/python/TestUtils.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/TestUtils.py b/ambari-server/src/test/python/TestUtils.py index bedd75c..268d7e9 100644 --- a/ambari-server/src/test/python/TestUtils.py +++ b/ambari-server/src/test/python/TestUtils.py @@ -120,39 +120,6 @@ class TestUtils(TestCase): normpath_mock.return_value = "test value" self.assertEquals(utils.get_symlink_path("/"), "test value") - @patch('time.time') - @patch.object(utils, 'pid_exists') - @patch('time.sleep') - @patch("socket.socket") - @patch('__builtin__.open') - def test_wait_for_pid(self, open_mock, socket_mock, sleep_mock, pid_exists_mock, time_mock): - from ambari_server.serverConfiguration import SSL_API, CLIENT_API_PORT_PROPERTY - pid_exists_mock.return_value = True - time_mock.side_effect = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51] - s = socket_mock.return_value - s.connect = MagicMock() - properties = FakeProperties({ - SSL_API: "false", - CLIENT_API_PORT_PROPERTY: "8080" - }) - out = StringIO.StringIO() - sys.stdout = out - live_pids = utils.wait_for_pid([ - {"pid": "111", - "exe": "", - "cmd": "" - }, - {"pid": "222", - "exe": "", - "cmd": "" - }, - ], 5, 40, 10, '', '', properties) - self.assertEqual(".\nServer started listening on 8080\n\nDB configs consistency check: no errors and warnings were " - "found.\nWaiting for 10 seconds, for server WEB UI initialization\n........", out.getvalue()) - sys.stdout = sys.__stdout__ - - self.assertEquals(2, live_pids) - @patch.object(utils, 'pid_exists') @patch('__builtin__.open') @patch('os.kill') @@ -179,7 +146,7 @@ class TestUtils(TestCase): "exe": "/exe2", "cmd": "" }, - ], "/pidfile", ["/exe1"], True) + ], "/pidfile", ["/exe1"]) self.assertEquals(open_mock.call_count, 1) self.assertEquals(pid_exists_mock.call_count, 4) self.assertEquals(kill_mock.call_count, 1) @@ -257,4 +224,4 @@ class FakeProperties(object): self.prop_map = prop_map def get_property(self, prop_name): - return self.prop_map[prop_name] \ No newline at end of file + return self.prop_map[prop_name]