Repository: ambari Updated Branches: refs/heads/trunk b6c115ba2 -> b40d808d3
AMBARI-11624 - Datanode Shutdown Retries During Upgrade Are Too Long (jonathanhurley) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/b40d808d Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/b40d808d Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/b40d808d Branch: refs/heads/trunk Commit: b40d808d3a4ee9c7855532e555d89dd5910301f3 Parents: b6c115b Author: Jonathan Hurley <[email protected]> Authored: Tue Jun 2 10:29:53 2015 -0400 Committer: Jonathan Hurley <[email protected]> Committed: Tue Jun 2 13:48:03 2015 -0400 ---------------------------------------------------------------------- .../package/scripts/datanode_upgrade.py | 13 ++++++++-- .../python/stacks/2.0.6/HDFS/test_datanode.py | 27 ++++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/b40d808d/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py index 529ca4438..29af5bd 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py @@ -62,18 +62,27 @@ def post_upgrade_check(): _check_datanode_startup() -@retry(times=12, sleep_time=10, err_class=Fail) +@retry(times=24, sleep_time=5, err_class=Fail) def _check_datanode_shutdown(): """ Checks that a DataNode is down by running "hdfs dfsadmin getDatanodeInfo" several times, pausing in between runs. 
Once the DataNode stops responding this method will return, otherwise it will raise a Fail(...) and retry automatically. + The stack defaults for retrying for HDFS are also way too slow for this + command; they are set to wait about 45 seconds between client retries. As + a result, a single execution of dfsadmin will take 45 seconds to retry and + the DataNode may be marked as dead, causing problems with HBase. + https://issues.apache.org/jira/browse/HDFS-8510 tracks reducing the + times for ipc.client.connect.retry.interval. In the meantime, override them + here, but only for RU. :return: """ import params - command = format('hdfs dfsadmin -getDatanodeInfo {dfs_dn_ipc_address}') + # override stock retry timeouts since after 30 seconds, the datanode is + # marked as dead and can affect HBase during RU + command = format('hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo {dfs_dn_ipc_address}') try: Execute(command, user=params.hdfs_user, tries=1) http://git-wip-us.apache.org/repos/asf/ambari/blob/b40d808d/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py index a310bf4..2440145 100644 --- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py +++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py @@ -544,6 +544,33 @@ class TestDatanode(RMFTestCase): self.assertTrue(mocks_dict['call'].called) self.assertEqual(mocks_dict['call'].call_count,12) + + @patch('time.sleep') + def test_stop_during_upgrade(self, time_mock): + config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json" + with open(config_file, "r") as f: + json_content = json.load(f) + + version = '2.2.1.0-3242' + json_content['commandParams']['version'] = version + + try: + 
self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/datanode.py", + classname = "DataNode", + command = "stop", + config_dict = json_content, + hdp_stack_version = self.STACK_VERSION, + target = RMFTestCase.TARGET_COMMON_SERVICES, + command_args=[True]) + + raise Fail("Expected a fail since datanode didn't report a shutdown") + except: + pass + + self.assertResourceCalled('Execute', 'hdfs dfsadmin -shutdownDatanode 0.0.0.0:8010 upgrade', user="hdfs", tries=1) + self.assertResourceCalled('Execute', 'hdfs dfsadmin -D ipc.client.connect.max.retries=5 -D ipc.client.connect.retry.interval=1000 -getDatanodeInfo 0.0.0.0:8010', user="hdfs", tries=1) + + @patch("resource_management.libraries.functions.security_commons.build_expectations") @patch("resource_management.libraries.functions.security_commons.get_params_from_filesystem") @patch("resource_management.libraries.functions.security_commons.validate_security_config_properties")
