Repository: ambari Updated Branches: refs/heads/trunk cc5aded3b -> 7232ea8dc
AMBARI-15991. DataNode and RegionServer during upgrade are reported as "failed" incorrectly (Daniel Gergely via oleewere) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/7232ea8d Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/7232ea8d Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/7232ea8d Branch: refs/heads/trunk Commit: 7232ea8dc26d8b7a4668c00c392b949af8aeb41e Parents: cc5aded Author: Daniel Gergely <[email protected]> Authored: Fri Apr 22 17:46:36 2016 +0200 Committer: oleewere <[email protected]> Committed: Fri Apr 22 17:46:36 2016 +0200 ---------------------------------------------------------------------- .../HBASE/0.96.0.2.0/package/scripts/upgrade.py | 21 +++++++++++-- .../package/scripts/datanode_upgrade.py | 17 +++++++++++ .../2.1.0.2.0/package/scripts/params_linux.py | 1 + .../2.0.6/HBASE/test_hbase_regionserver.py | 32 +++++++++++--------- .../python/stacks/2.0.6/HDFS/test_datanode.py | 14 ++++----- 5 files changed, 60 insertions(+), 25 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/7232ea8d/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py b/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py index 01a8156..bc68cc6 100644 --- a/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py +++ b/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py @@ -19,18 +19,18 @@ limitations under the License. 
""" from resource_management import * -from resource_management.core.resources.system import Execute from resource_management.core import shell from resource_management.libraries.functions import conf_select from resource_management.libraries.functions import stack_select from resource_management.libraries.functions import StackFeature from resource_management.libraries.functions.stack_features import check_stack_feature from resource_management.libraries.functions.decorator import retry +from resource_management.libraries.functions import check_process_status def prestart(env, stack_component): import params - if params.version and check_stack_feature(StackFeature.ROLLING_UPGRADE, params.version): + if params.version and check_stack_feature(StackFeature.ROLLING_UPGRADE, params.version): conf_select.select(params.stack_name, "hbase", params.version) stack_select.select(stack_component, params.version) @@ -41,7 +41,22 @@ def post_regionserver(env): check_cmd = "echo 'status \"simple\"' | {0} shell".format(params.hbase_cmd) exec_cmd = "{0} {1}".format(params.kinit_cmd, check_cmd) - call_and_match(exec_cmd, params.hbase_user, params.hostname + ":", re.IGNORECASE) + _wait_for_region_server_to_start(exec_cmd, params.hbase_user, params.hostname + ":", re.IGNORECASE) + +@retry(times=3, sleep_time=300, err_class=Fail) +def _wait_for_region_server_to_start(cmd, user, regex, regex_search_flags): + if not is_region_server_process_running(): + Logger.info("RegionServer process is not running") + raise Fail("RegionServer process is not running") + call_and_match(cmd, user, regex, regex_search_flags) + +def is_region_server_process_running(): + try: + pid_file = format("{pid_dir}/hbase-{hbase_user}-regionserver.pid") + check_process_status(pid_file) + return True + except ComponentIsNotRunning: + return False @retry(times=15, sleep_time=2, err_class=Fail) def call_and_match(cmd, user, regex, regex_search_flags): 
http://git-wip-us.apache.org/repos/asf/ambari/blob/7232ea8d/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py index 8f36001..c8e2eab 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py @@ -24,6 +24,8 @@ from resource_management.core.resources.system import Execute from resource_management.core import shell from resource_management.libraries.functions import format from resource_management.libraries.functions.decorator import retry +from resource_management.libraries.functions import check_process_status +from resource_management.core import ComponentIsNotRunning from utils import get_dfsadmin_base_command @@ -71,9 +73,24 @@ def post_upgrade_check(hdfs_binary): Execute(params.dn_kinit_cmd, user=params.hdfs_user) # verify that the datanode has started and rejoined the HDFS cluster + _wait_for_datanode_to_join(hdfs_binary) + +@retry(times=3, sleep_time=300, err_class=Fail) +def _wait_for_datanode_to_join(hdfs_binary): + if not is_datanode_process_running(): + Logger.info("DataNode process is not running") + raise Fail("DataNode process is not running") _check_datanode_startup(hdfs_binary) +def is_datanode_process_running(): + import params + try: + check_process_status(params.datanode_pid_file) + return True + except ComponentIsNotRunning: + return False + @retry(times=24, sleep_time=5, err_class=Fail) def _check_datanode_shutdown(hdfs_binary): """ 
http://git-wip-us.apache.org/repos/asf/ambari/blob/7232ea8d/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py index d8ff3c5..5e855e7 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py @@ -74,6 +74,7 @@ root_user = "root" hadoop_pid_dir_prefix = status_params.hadoop_pid_dir_prefix namenode_pid_file = status_params.namenode_pid_file zkfc_pid_file = status_params.zkfc_pid_file +datanode_pid_file = status_params.datanode_pid_file # Some datanode settings dfs_dn_addr = default('/configurations/hdfs-site/dfs.datanode.address', None) http://git-wip-us.apache.org/repos/asf/ambari/blob/7232ea8d/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py index 8d187ec..5913b62 100644 --- a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py +++ b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py @@ -18,9 +18,11 @@ See the License for the specific language governing permissions and limitations under the License. 
''' import json +import upgrade from mock.mock import MagicMock, patch from stacks.utils.RMFTestCase import * +@patch.object(upgrade, 'check_process_status', new = MagicMock()) @patch("platform.linux_distribution", new = MagicMock(return_value="Linux")) @patch("os.path.exists", new = MagicMock(return_value=True)) class TestHbaseRegionServer(RMFTestCase): @@ -36,10 +38,10 @@ class TestHbaseRegionServer(RMFTestCase): stack_version = self.STACK_VERSION, target = RMFTestCase.TARGET_COMMON_SERVICES ) - + self.assert_configure_default() self.assertNoMoreResources() - + def test_start_default(self): self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/hbase_regionserver.py", classname = "HbaseRegionServer", @@ -48,14 +50,14 @@ class TestHbaseRegionServer(RMFTestCase): stack_version = self.STACK_VERSION, target = RMFTestCase.TARGET_COMMON_SERVICES ) - + self.assert_configure_default() self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf start regionserver', not_if = 'ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1', user = 'hbase' ) self.assertNoMoreResources() - + def test_stop_default(self): self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/hbase_regionserver.py", classname = "HbaseRegionServer", @@ -64,19 +66,19 @@ class TestHbaseRegionServer(RMFTestCase): stack_version = self.STACK_VERSION, target = RMFTestCase.TARGET_COMMON_SERVICES ) - + self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver', only_if = 'ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1', on_timeout = '! 
( ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1 ) || ambari-sudo.sh -H -E kill -9 `ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid`', timeout = 30, user = 'hbase', ) - + self.assertResourceCalled('File', '/var/run/hbase/hbase-hbase-regionserver.pid', action = ['delete'], ) self.assertNoMoreResources() - + def test_configure_secured(self): self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/hbase_regionserver.py", classname = "HbaseRegionServer", @@ -85,10 +87,10 @@ class TestHbaseRegionServer(RMFTestCase): stack_version = self.STACK_VERSION, target = RMFTestCase.TARGET_COMMON_SERVICES ) - + self.assert_configure_secured() self.assertNoMoreResources() - + def test_start_secured(self): self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/hbase_regionserver.py", classname = "HbaseRegionServer", @@ -97,14 +99,14 @@ class TestHbaseRegionServer(RMFTestCase): stack_version = self.STACK_VERSION, target = RMFTestCase.TARGET_COMMON_SERVICES ) - + self.assert_configure_secured() self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf start regionserver', not_if = 'ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1', user = 'hbase', ) self.assertNoMoreResources() - + def test_stop_secured(self): self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/hbase_regionserver.py", classname = "HbaseRegionServer", @@ -120,7 +122,7 @@ class TestHbaseRegionServer(RMFTestCase): timeout = 30, user = 'hbase', ) - + self.assertResourceCalled('File', '/var/run/hbase/hbase-hbase-regionserver.pid', action = ['delete'], ) @@ -329,7 +331,7 @@ class 
TestHbaseRegionServer(RMFTestCase): config_file="hbase-rs-2.2.json", stack_version = self.STACK_VERSION, target = RMFTestCase.TARGET_COMMON_SERVICES) - + self.assertResourceCalled('Directory', '/etc/hbase', mode = 0755) @@ -675,8 +677,8 @@ class TestHbaseRegionServer(RMFTestCase): ('ambari-python-wrap', '/usr/bin/hdp-select', 'set', 'hbase-regionserver', version), sudo=True,) self.assertNoMoreResources() - - def test_post_rolling_restart(self): + @patch('time.sleep') + def test_post_rolling_restart(self, sleep_mock): config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json" with open(config_file, "r") as f: json_content = json.load(f) http://git-wip-us.apache.org/repos/asf/ambari/blob/7232ea8d/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py index 78b8171..dbd76cf 100644 --- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py +++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py @@ -25,7 +25,7 @@ from resource_management.core import shell from resource_management.core.exceptions import Fail import resource_management.libraries.functions.dfs_datanode_helper - +@patch.object(resource_management.libraries.functions, 'check_process_status', new = MagicMock()) class TestDatanode(RMFTestCase): COMMON_SERVICES_PACKAGE_DIR = "HDFS/2.1.0.2.0/package" STACK_VERSION = "2.0.6" @@ -515,10 +515,10 @@ class TestDatanode(RMFTestCase): config_file = "default.json", stack_version = self.STACK_VERSION, target = RMFTestCase.TARGET_COMMON_SERVICES, - call_mocks = [(0, shell_call_output)], + call_mocks = [(0, shell_call_output)] * 3, mocks_dict = mocks_dict ) - + self.assertTrue(mocks_dict['call'].called) self.assertEqual(mocks_dict['call'].call_count,1) @@ -535,13 +535,13 @@ class 
TestDatanode(RMFTestCase): config_file = "default.json", stack_version = self.STACK_VERSION, target = RMFTestCase.TARGET_COMMON_SERVICES, - call_mocks = [(0, 'There are no DataNodes here!')], + call_mocks = [(0, 'There are no DataNodes here!')] * 36, mocks_dict = mocks_dict ) self.fail('Missing DataNode should have caused a failure') except Fail,fail: self.assertTrue(mocks_dict['call'].called) - self.assertEqual(mocks_dict['call'].call_count,12) + self.assertEqual(mocks_dict['call'].call_count,36) @patch("socket.gethostbyname") @@ -556,13 +556,13 @@ class TestDatanode(RMFTestCase): config_file = "default.json", stack_version = self.STACK_VERSION, target = RMFTestCase.TARGET_COMMON_SERVICES, - call_mocks = [(0, 'some')], + call_mocks = [(1, 'some')] * 36, mocks_dict = mocks_dict ) self.fail('Invalid return code should cause a failure') except Fail,fail: self.assertTrue(mocks_dict['call'].called) - self.assertEqual(mocks_dict['call'].call_count,12) + self.assertEqual(mocks_dict['call'].call_count,36) @patch("resource_management.core.shell.call")
