Repository: ambari
Updated Branches:
  refs/heads/trunk cc5aded3b -> 7232ea8dc


AMBARI-15991. DataNode and RegionServer during upgrade are reported as "failed" 
incorrectly (Daniel Gergely via oleewere)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/7232ea8d
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/7232ea8d
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/7232ea8d

Branch: refs/heads/trunk
Commit: 7232ea8dc26d8b7a4668c00c392b949af8aeb41e
Parents: cc5aded
Author: Daniel Gergely <[email protected]>
Authored: Fri Apr 22 17:46:36 2016 +0200
Committer: oleewere <[email protected]>
Committed: Fri Apr 22 17:46:36 2016 +0200

----------------------------------------------------------------------
 .../HBASE/0.96.0.2.0/package/scripts/upgrade.py | 21 +++++++++++--
 .../package/scripts/datanode_upgrade.py         | 17 +++++++++++
 .../2.1.0.2.0/package/scripts/params_linux.py   |  1 +
 .../2.0.6/HBASE/test_hbase_regionserver.py      | 32 +++++++++++---------
 .../python/stacks/2.0.6/HDFS/test_datanode.py   | 14 ++++-----
 5 files changed, 60 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/7232ea8d/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py
 
b/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py
index 01a8156..bc68cc6 100644
--- 
a/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py
+++ 
b/ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/package/scripts/upgrade.py
@@ -19,18 +19,18 @@ limitations under the License.
 
 """
 from resource_management import *
-from resource_management.core.resources.system import Execute
 from resource_management.core import shell
 from resource_management.libraries.functions import conf_select
 from resource_management.libraries.functions import stack_select
 from resource_management.libraries.functions import StackFeature
 from resource_management.libraries.functions.stack_features import 
check_stack_feature
 from resource_management.libraries.functions.decorator import retry
+from resource_management.libraries.functions import check_process_status
 
 def prestart(env, stack_component):
   import params
 
-  if params.version and check_stack_feature(StackFeature.ROLLING_UPGRADE, 
params.version):    
+  if params.version and check_stack_feature(StackFeature.ROLLING_UPGRADE, 
params.version):
     conf_select.select(params.stack_name, "hbase", params.version)
     stack_select.select(stack_component, params.version)
 
@@ -41,7 +41,22 @@ def post_regionserver(env):
   check_cmd = "echo 'status \"simple\"' | {0} shell".format(params.hbase_cmd)
 
   exec_cmd = "{0} {1}".format(params.kinit_cmd, check_cmd)
-  call_and_match(exec_cmd, params.hbase_user, params.hostname + ":", 
re.IGNORECASE)
+  _wait_for_region_server_to_start(exec_cmd, params.hbase_user, 
params.hostname + ":", re.IGNORECASE)
+
+@retry(times=3, sleep_time=300, err_class=Fail)
+def _wait_for_region_server_to_start(cmd, user, regex, regex_search_flags):
+  if not is_region_server_process_running():
+    Logger.info("RegionServer process is not running")
+    raise Fail("RegionServer process is not running")
+  call_and_match(cmd, user, regex, regex_search_flags)
+
+def is_region_server_process_running():
+  try:
+    pid_file = format("{pid_dir}/hbase-{hbase_user}-regionserver.pid")
+    check_process_status(pid_file)
+    return True
+  except ComponentIsNotRunning:
+    return False
 
 @retry(times=15, sleep_time=2, err_class=Fail)
 def call_and_match(cmd, user, regex, regex_search_flags):

http://git-wip-us.apache.org/repos/asf/ambari/blob/7232ea8d/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
 
b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
index 8f36001..c8e2eab 100644
--- 
a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
+++ 
b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/datanode_upgrade.py
@@ -24,6 +24,8 @@ from resource_management.core.resources.system import Execute
 from resource_management.core import shell
 from resource_management.libraries.functions import format
 from resource_management.libraries.functions.decorator import retry
+from resource_management.libraries.functions import check_process_status
+from resource_management.core import ComponentIsNotRunning
 from utils import get_dfsadmin_base_command
 
 
@@ -71,9 +73,24 @@ def post_upgrade_check(hdfs_binary):
     Execute(params.dn_kinit_cmd, user=params.hdfs_user)
 
   # verify that the datanode has started and rejoined the HDFS cluster
+  _wait_for_datanode_to_join(hdfs_binary)
+
+@retry(times=3, sleep_time=300, err_class=Fail)
+def _wait_for_datanode_to_join(hdfs_binary):
+  if not is_datanode_process_running():
+    Logger.info("DataNode process is not running")
+    raise Fail("DataNode process is not running")
   _check_datanode_startup(hdfs_binary)
 
 
+def is_datanode_process_running():
+  import params
+  try:
+    check_process_status(params.datanode_pid_file)
+    return True
+  except ComponentIsNotRunning:
+    return False
+
 @retry(times=24, sleep_time=5, err_class=Fail)
 def _check_datanode_shutdown(hdfs_binary):
   """

http://git-wip-us.apache.org/repos/asf/ambari/blob/7232ea8d/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py
 
b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py
index d8ff3c5..5e855e7 100644
--- 
a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py
+++ 
b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/params_linux.py
@@ -74,6 +74,7 @@ root_user = "root"
 hadoop_pid_dir_prefix = status_params.hadoop_pid_dir_prefix
 namenode_pid_file = status_params.namenode_pid_file
 zkfc_pid_file = status_params.zkfc_pid_file
+datanode_pid_file = status_params.datanode_pid_file
 
 # Some datanode settings
 dfs_dn_addr = default('/configurations/hdfs-site/dfs.datanode.address', None)

http://git-wip-us.apache.org/repos/asf/ambari/blob/7232ea8d/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py
----------------------------------------------------------------------
diff --git 
a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py 
b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py
index 8d187ec..5913b62 100644
--- 
a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py
+++ 
b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py
@@ -18,9 +18,11 @@ See the License for the specific language governing 
permissions and
 limitations under the License.
 '''
 import json
+import upgrade
 from mock.mock import MagicMock, patch
 from stacks.utils.RMFTestCase import *
 
+@patch.object(upgrade, 'check_process_status', new = MagicMock())
 @patch("platform.linux_distribution", new = MagicMock(return_value="Linux"))
 @patch("os.path.exists", new = MagicMock(return_value=True))
 class TestHbaseRegionServer(RMFTestCase):
@@ -36,10 +38,10 @@ class TestHbaseRegionServer(RMFTestCase):
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES
     )
-    
+
     self.assert_configure_default()
     self.assertNoMoreResources()
-    
+
   def test_start_default(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + 
"/scripts/hbase_regionserver.py",
                    classname = "HbaseRegionServer",
@@ -48,14 +50,14 @@ class TestHbaseRegionServer(RMFTestCase):
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES
     )
-    
+
     self.assert_configure_default()
     self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh 
--config /etc/hbase/conf start regionserver',
       not_if = 'ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f 
/var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh 
[RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` 
>/dev/null 2>&1',
       user = 'hbase'
     )
     self.assertNoMoreResources()
-    
+
   def test_stop_default(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + 
"/scripts/hbase_regionserver.py",
                    classname = "HbaseRegionServer",
@@ -64,19 +66,19 @@ class TestHbaseRegionServer(RMFTestCase):
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES
     )
-    
+
     self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh 
--config /etc/hbase/conf stop regionserver',
         only_if = 'ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f 
/var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh 
[RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` 
>/dev/null 2>&1',
         on_timeout = '! ( ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f 
/var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh 
[RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` 
>/dev/null 2>&1 ) || ambari-sudo.sh -H -E kill -9 `ambari-sudo.sh 
[RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid`',
         timeout = 30,
         user = 'hbase',
     )
-    
+
     self.assertResourceCalled('File', 
'/var/run/hbase/hbase-hbase-regionserver.pid',
         action = ['delete'],
     )
     self.assertNoMoreResources()
-    
+
   def test_configure_secured(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + 
"/scripts/hbase_regionserver.py",
                    classname = "HbaseRegionServer",
@@ -85,10 +87,10 @@ class TestHbaseRegionServer(RMFTestCase):
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES
     )
-    
+
     self.assert_configure_secured()
     self.assertNoMoreResources()
-    
+
   def test_start_secured(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + 
"/scripts/hbase_regionserver.py",
                    classname = "HbaseRegionServer",
@@ -97,14 +99,14 @@ class TestHbaseRegionServer(RMFTestCase):
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES
     )
-    
+
     self.assert_configure_secured()
     self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh 
--config /etc/hbase/conf start regionserver',
       not_if = 'ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f 
/var/run/hbase/hbase-hbase-regionserver.pid && ps -p `ambari-sudo.sh 
[RMF_ENV_PLACEHOLDER] -H -E cat /var/run/hbase/hbase-hbase-regionserver.pid` 
>/dev/null 2>&1',
       user = 'hbase',
     )
     self.assertNoMoreResources()
-    
+
   def test_stop_secured(self):
     self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + 
"/scripts/hbase_regionserver.py",
                    classname = "HbaseRegionServer",
@@ -120,7 +122,7 @@ class TestHbaseRegionServer(RMFTestCase):
         timeout = 30,
         user = 'hbase',
     )
-    
+
     self.assertResourceCalled('File', 
'/var/run/hbase/hbase-hbase-regionserver.pid',
         action = ['delete'],
     )
@@ -329,7 +331,7 @@ class TestHbaseRegionServer(RMFTestCase):
                    config_file="hbase-rs-2.2.json",
                    stack_version = self.STACK_VERSION,
                    target = RMFTestCase.TARGET_COMMON_SERVICES)
-    
+
     self.assertResourceCalled('Directory', '/etc/hbase',
       mode = 0755)
 
@@ -675,8 +677,8 @@ class TestHbaseRegionServer(RMFTestCase):
                               ('ambari-python-wrap', '/usr/bin/hdp-select', 
'set', 'hbase-regionserver', version), sudo=True,)
     self.assertNoMoreResources()
 
-
-  def test_post_rolling_restart(self):
+  @patch('time.sleep')
+  def test_post_rolling_restart(self, sleep_mock):
     config_file = 
self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"
     with open(config_file, "r") as f:
       json_content = json.load(f)

http://git-wip-us.apache.org/repos/asf/ambari/blob/7232ea8d/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py 
b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
index 78b8171..dbd76cf 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_datanode.py
@@ -25,7 +25,7 @@ from resource_management.core import shell
 from resource_management.core.exceptions import Fail
 import resource_management.libraries.functions.dfs_datanode_helper
 
-
+@patch.object(resource_management.libraries.functions, 'check_process_status', 
new = MagicMock())
 class TestDatanode(RMFTestCase):
   COMMON_SERVICES_PACKAGE_DIR = "HDFS/2.1.0.2.0/package"
   STACK_VERSION = "2.0.6"
@@ -515,10 +515,10 @@ class TestDatanode(RMFTestCase):
                        config_file = "default.json",
                        stack_version = self.STACK_VERSION,
                        target = RMFTestCase.TARGET_COMMON_SERVICES,
-                       call_mocks = [(0, shell_call_output)],
+                       call_mocks = [(0, shell_call_output)] * 3,
                        mocks_dict = mocks_dict
     )
-    
+
     self.assertTrue(mocks_dict['call'].called)
     self.assertEqual(mocks_dict['call'].call_count,1)
 
@@ -535,13 +535,13 @@ class TestDatanode(RMFTestCase):
                          config_file = "default.json",
                          stack_version = self.STACK_VERSION,
                          target = RMFTestCase.TARGET_COMMON_SERVICES,
-                         call_mocks = [(0, 'There are no DataNodes here!')],
+                         call_mocks = [(0, 'There are no DataNodes here!')] * 
36,
                          mocks_dict = mocks_dict
       )
       self.fail('Missing DataNode should have caused a failure')
     except Fail,fail:
       self.assertTrue(mocks_dict['call'].called)
-      self.assertEqual(mocks_dict['call'].call_count,12)
+      self.assertEqual(mocks_dict['call'].call_count,36)
 
 
   @patch("socket.gethostbyname")
@@ -556,13 +556,13 @@ class TestDatanode(RMFTestCase):
                          config_file = "default.json",
                          stack_version = self.STACK_VERSION,
                          target = RMFTestCase.TARGET_COMMON_SERVICES,
-                         call_mocks = [(0, 'some')],
+                         call_mocks = [(1, 'some')] * 36,
                          mocks_dict = mocks_dict
       )
       self.fail('Invalid return code should cause a failure')
     except Fail,fail:
       self.assertTrue(mocks_dict['call'].called)
-      self.assertEqual(mocks_dict['call'].call_count,12)
+      self.assertEqual(mocks_dict['call'].call_count,36)
 
 
   @patch("resource_management.core.shell.call")

Reply via email to