Repository: incubator-slider Updated Branches: refs/heads/develop d8b36cafe -> 961e17043
SLIDER-341. Add a window based failure count for auto-start to limit indefinite attempt Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/961e1704 Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/961e1704 Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/961e1704 Branch: refs/heads/develop Commit: 961e17043f589c4096dce896d64a582dd000dd4e Parents: d8b36ca Author: Sumit Mohanty <[email protected]> Authored: Mon Oct 6 20:47:44 2014 -0700 Committer: Sumit Mohanty <[email protected]> Committed: Mon Oct 6 20:47:44 2014 -0700 ---------------------------------------------------------------------- slider-agent/conf/agent.ini | 1 + .../src/main/python/agent/AgentConfig.py | 14 +++++ .../src/main/python/agent/Controller.py | 35 ++++++++++- .../src/test/python/agent/TestController.py | 63 ++++++++++++++++++++ slider-agent/src/test/python/agent/TestMain.py | 37 ++++++++++++ 5 files changed, 148 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/961e1704/slider-agent/conf/agent.ini ---------------------------------------------------------------------- diff --git a/slider-agent/conf/agent.ini b/slider-agent/conf/agent.ini index 7b9d57d..48113e3 100644 --- a/slider-agent/conf/agent.ini +++ b/slider-agent/conf/agent.ini @@ -43,6 +43,7 @@ log_level=INFO [command] max_retries=2 sleep_between_retries=1 +auto_restart=5,5 [security] http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/961e1704/slider-agent/src/main/python/agent/AgentConfig.py ---------------------------------------------------------------------- diff --git a/slider-agent/src/main/python/agent/AgentConfig.py b/slider-agent/src/main/python/agent/AgentConfig.py index e45ba23..86925b1 100644 --- a/slider-agent/src/main/python/agent/AgentConfig.py +++ b/slider-agent/src/main/python/agent/AgentConfig.py @@ -61,6 +61,7 @@ log_level=INFO [command] max_retries=2 sleep_between_retries=1 +auto_restart=5,5 [security] keysdir=security/keys @@ -109,6 +110,8 @@ class AgentConfig: # agent version file VERSION_FILE = "version_file" + AUTO_RESTART = "auto_restart" + FOLDER_MAPPING = { APP_PACKAGE_DIR: "WORK", APP_INSTALL_DIR: "WORK", @@ -164,6 +167,17 @@ class AgentConfig: return "" return command + # return max, window - max failures within window minutes + def getErrorWindow(self): + window = config.get(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART) + if window != None: + parts = window.split(',') + if len(parts) == 2: + if parts[0].isdigit() and parts[1].isdigit(): + return (int(parts[0]), int(parts[1])) + pass + return (0, 0) + def set(self, category, name, value): global config return config.set(category, name, value) http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/961e1704/slider-agent/src/main/python/agent/Controller.py ---------------------------------------------------------------------- diff --git a/slider-agent/src/main/python/agent/Controller.py b/slider-agent/src/main/python/agent/Controller.py index 11db21c..77f932c 100644 --- a/slider-agent/src/main/python/agent/Controller.py +++ b/slider-agent/src/main/python/agent/Controller.py @@ -27,6 +27,7 @@ import time import threading import urllib2 import pprint +import math from random import randint from AgentConfig import AgentConfig @@ -86,7 +87,8 @@ class Controller(threading.Thread): self.statusCommand = None self.failureCount = 0 self.heartBeatRetryCount = 0 - self.autoRestart = False + self.autoRestartFailures = 0 + self.autoRestartTrackingSince = 0 def __del__(self): @@ -275,7 +277,7 @@ class Controller(threading.Thread): stored_command = self.actionQueue.customServiceOrchestrator.stored_command if len(stored_command) > 0: auto_start_command = self.create_start_command(stored_command) - if auto_start_command: + if auto_start_command and self.shouldAutoRestart(): logger.info("Automatically adding a start command.") logger.debug("Auto start command: " + pprint.pformat(auto_start_command)) self.updateStateBasedOnCommand([auto_start_command], False) @@ -486,6 +488,35 @@ class Controller(threading.Thread): return {'exitstatus': 1, 'log': err_msg} + # Basic window that only counts failures till the window duration expires + def shouldAutoRestart(self): + max, window = self.config.getErrorWindow() + if max <= 0 or window <= 0: + return True + + seconds_now = time.time() + if self.autoRestartTrackingSince == 0: + self.autoRestartTrackingSince = seconds_now + self.autoRestartFailures = 1 + return True + + self.autoRestartFailures += 1 + minutes = math.floor((seconds_now - self.autoRestartTrackingSince) / 60) + if self.autoRestartFailures > max: + logger.info("Auto restart not allowed due to " + str(self.autoRestartFailures) + " failures in " + str(minutes) + + " minutes. Max restarts allowed is " + str(max) + " in " + str(window) + " minutes.") + return False + + if minutes > window: + logger.info("Resetting window as number of minutes passed is " + str(minutes)) + self.autoRestartTrackingSince = seconds_now + self.autoRestartFailures = 1 + return True + return True + + pass + + def main(argv=None): # Allow Ctrl-C signal.signal(signal.SIGINT, signal.SIG_DFL) http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/961e1704/slider-agent/src/test/python/agent/TestController.py ---------------------------------------------------------------------- diff --git a/slider-agent/src/test/python/agent/TestController.py b/slider-agent/src/test/python/agent/TestController.py index 401d69a..02b0d0e 100644 --- a/slider-agent/src/test/python/agent/TestController.py +++ b/slider-agent/src/test/python/agent/TestController.py @@ -25,6 +25,7 @@ import unittest, threading from agent import Controller, ActionQueue from agent import hostname import sys +import time from Controller import AGENT_AUTO_RESTART_EXIT_CODE from Controller import State from AgentConfig import AgentConfig @@ -255,6 +256,68 @@ class TestController(unittest.TestCase): self.assertTrue(os_exit_mock.call_args[0][0] == AGENT_AUTO_RESTART_EXIT_CODE) + @patch("time.time") + def test_failure_window(self, mock_time): + config = AgentConfig("", "") + original_config = config.get(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART) + config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '2,1') + ## The behavior of side_effect is different when you run tests in command line and when you do it through IDE + ## So few extra items are there in the list + mock_time.side_effect = [200, 500, 500] + controller5 = Controller.Controller(config) + + try: + self.assertTrue(controller5.shouldAutoRestart()) + self.assertTrue(controller5.shouldAutoRestart()) + finally: + config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, original_config) + + + @patch("time.time") + def test_failure_window(self, mock_time): + config = AgentConfig("", "") + original_config = config.get(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART) + config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '3,1') + ## The behavior of side_effect is different when you run tests in command line and when you do it through IDE + ## So few extra items are there in the list + mock_time.side_effect = [200, 210, 220, 230, 240, 250] + controller5 = Controller.Controller(config) + + try: + self.assertTrue(controller5.shouldAutoRestart()) + self.assertTrue(controller5.shouldAutoRestart()) + self.assertTrue(controller5.shouldAutoRestart()) + self.assertFalse(controller5.shouldAutoRestart()) + finally: + config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, original_config) + + + def test_failure_window2(self): + config = MagicMock() + config.getErrorWindow.return_value = (0, 0) + controller = Controller.Controller(config) + + self.assertTrue(controller.shouldAutoRestart()) + + config.getErrorWindow.return_value = (0, 1) + self.assertTrue(controller.shouldAutoRestart()) + + config.getErrorWindow.return_value = (1, 0) + self.assertTrue(controller.shouldAutoRestart()) + + config.getErrorWindow.return_value = (-1, -1) + self.assertTrue(controller.shouldAutoRestart()) + + config.getErrorWindow.return_value = (1, 1) + self.assertTrue(controller.shouldAutoRestart()) + + #second failure within a minute + self.assertFalse(controller.shouldAutoRestart()) + + #do not reset unless window expires + self.assertFalse(controller.shouldAutoRestart()) + + @patch("urllib2.urlopen") def test_sendRequest(self, requestMock): http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/961e1704/slider-agent/src/test/python/agent/TestMain.py ---------------------------------------------------------------------- diff --git a/slider-agent/src/test/python/agent/TestMain.py b/slider-agent/src/test/python/agent/TestMain.py index e73a05a..7c0036b 100644 --- a/slider-agent/src/test/python/agent/TestMain.py +++ b/slider-agent/src/test/python/agent/TestMain.py @@ -312,6 +312,43 @@ class TestMain(unittest.TestCase): AgentConfig_set_mock.assert_any_call("server", "zk_reg_path", "/registry/org-apache-slider/cl1") + def test_config1(self): + config = AgentConfig("", "") + (max, window) = config.getErrorWindow() + self.assertEqual(max, 5) + self.assertEqual(window, 5) + + config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '') + (max, window) = config.getErrorWindow() + self.assertEqual(max, 0) + self.assertEqual(window, 0) + + config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '33') + (max, window) = config.getErrorWindow() + self.assertEqual(max, 0) + self.assertEqual(window, 0) + + config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '-4,-6') + (max, window) = config.getErrorWindow() + self.assertEqual(max, 0) + self.assertEqual(window, 0) + + config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, 'wd,er') + (max, window) = config.getErrorWindow() + self.assertEqual(max, 0) + self.assertEqual(window, 0) + + config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '2,20') + (max, window) = config.getErrorWindow() + self.assertEqual(max, 2) + self.assertEqual(window, 20) + + config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, ' 2, 30') + (max, window) = config.getErrorWindow() + self.assertEqual(max, 0) + self.assertEqual(window, 0) + + if __name__ == "__main__": logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG) unittest.main() \ No newline at end of file
