borisstoyanov commented on a change in pull request #3239: [WIP DO NOT MERGE] 
KVM: Fix agents dont reconnect post maintenance
URL: https://github.com/apache/cloudstack/pull/3239#discussion_r274804027
 
 

 ##########
 File path: test/integration/smoke/test_host_maintenance.py
 ##########
 @@ -290,3 +274,313 @@ def 
test_02_cancel_host_maintenace_with_migration_jobs(self):
         return
 
 
+class TestHostMaintenanceAgents(cloudstackTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.testClient = super(TestHostMaintenanceAgents, 
cls).getClsTestClient()
+        cls.apiclient = cls.testClient.getApiClient()
+        cls.hypervisor = cls.testClient.getHypervisorInfo()
+        cls.dbclient = cls.testClient.getDbConnection()
+        cls.zone = get_zone(cls.apiclient, cls.testClient.getZoneForTests())
+        cls.pod = get_pod(cls.apiclient, cls.zone.id)
+        cls.services = cls.testClient.getParsedTestDataConfig()
+
+        cls.logger = logging.getLogger('TestHMAgents')
+        cls.stream_handler = logging.StreamHandler()
+        cls.logger.setLevel(logging.DEBUG)
+        cls.logger.addHandler(cls.stream_handler)
+
+        cls._cleanup = []
+        cls.hypervisorNotSupported = False
+        if cls.hypervisor.lower() not in ['kvm', 'lxc']:
+            cls.hypervisorNotSupported = True
+
+        if not cls.hypervisorNotSupported:
+            cls.initialsshvalue = cls.is_ssh_enabled()
+            cls.template = get_template(
+                cls.apiclient,
+                cls.zone.id,
+                cls.hypervisor
+            )
+            cls.services["virtual_machine"]["zoneid"] = cls.zone.id
+            cls.services["virtual_machine"]["template"] = cls.template.id
+            cls.services["virtual_machine"]["hypervisor"] = cls.hypervisor
+            cls.service_offering = ServiceOffering.create(
+                cls.apiclient,
+                cls.services["service_offerings"]["tiny"]
+            )
+            cls._cleanup.append(cls.service_offering)
+            cls.network_offering = NetworkOffering.create(
+                cls.apiclient,
+                cls.services["l2-network_offering"],
+            )
+            cls.network_offering.update(cls.apiclient, state='Enabled')
+            cls.services["network"]["networkoffering"] = 
cls.network_offering.id
+            cls.l2_network = Network.create(
+                cls.apiclient,
+                cls.services["l2-network"],
+                zoneid=cls.zone.id,
+                networkofferingid=cls.network_offering.id
+            )
+            cls._cleanup.append(cls.l2_network)
+            cls._cleanup.append(cls.network_offering)
+
+        cls.hostConfig = 
cls.config.__dict__["zones"][0].__dict__["pods"][0].__dict__["clusters"][0].__dict__["hosts"][0].__dict__
+
+
+    @classmethod
+    def tearDownClass(cls):
+        try:
+            if not cls.hypervisorNotSupported:
+                # Revert setting value to the original
+                cls.set_ssh_enabled(cls.initialsshvalue)
+            cleanup_resources(cls.apiclient, cls._cleanup)
+        except Exception as e:
+            raise Exception("Warning: Exception during cleanup : %s" % e)
+
+    def setUp(self):
+        if not self.hypervisorNotSupported:
+            self.host = self.get_enabled_host_connected_agent()
+        self.cleanup = []
+
+    def tearDown(self):
+        try:
+            cleanup_resources(self.apiclient, self.cleanup)
+        except Exception as e:
+            raise Exception("Warning: Exception during cleanup : %s" % e)
+
+
+    @classmethod
+    def is_ssh_enabled(cls):
+        conf = Configurations.list(cls.apiclient, name="kvm.ssh.to.agent")
+        if not conf:
+            return False
+        else:
+            return bool(strtobool(conf[0].value)) if conf[0].value else False
+
+    @classmethod
+    def set_ssh_enabled(cls, on):
+        value = "true" if on else "false"
+        sql = "update configuration set value = '%s' where name = 
'kvm.ssh.to.agent';" % value
+        cls.dbclient.execute(sql)
+
+    def prepare_host_for_maintenance(self, hostid):
+        cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd()
+        cmd.id = hostid
+        self.apiclient.prepareHostForMaintenance(cmd)
+        self.logger.debug('Host with id %s is in prepareHostForMaintenance' % 
hostid)
+
+    def wait_until_host_is_in_state(self, hostid, resourcestate, interval=3, 
retries=20):
+        def check_resource_state():
+            response = Host.list(
+                self.apiclient,
+                id=hostid
+            )
+            if isinstance(response, list):
+                if response[0].resourcestate == resourcestate:
+                    self.logger.debug('Host with id %s is in resource state = 
%s' % (hostid, resourcestate))
+                    return True, None
+            return False, None
+
+        done, _ = wait_until(interval, retries, check_resource_state)
+        if not done:
+            raise Exception("Failed to wait for host %s to be on resource 
state %s" % (hostid, resourcestate))
+        return True
+
+    def wait_until_agent_is_in_state(self, hostid, state, interval=3, 
retries=20):
+        def check_agent_state():
+            response = Host.list(
+                self.apiclient,
+                id=hostid
+            )
+            if isinstance(response, list):
+                if response[0].state == state:
+                    self.logger.debug('Host agent with id %s is in state = %s' 
% (hostid, state))
+                    return True, None
+            return False, None
+
+        done, _ = wait_until(interval, retries, check_agent_state)
+        if not done:
+            raise Exception("Failed to wait for host agent %s to be on state 
%s" % (hostid, state))
+        return True
+
+    def cancel_host_maintenance(self, hostid):
+        cmd = cancelHostMaintenance.cancelHostMaintenanceCmd()
+        cmd.id = hostid
+        self.apiclient.cancelHostMaintenance(cmd)
+        self.logger.debug('Host with id %s is cancelling maintenance' % hostid)
+
+    def get_enabled_host_connected_agent(self):
+        hosts = Host.list(
+            self.apiclient,
+            type='Routing',
+            zoneid=self.zone.id,
+            podid=self.pod.id,
+            hypervisor=self.hypervisor,
+            resourcestate='Enabled',
+            state='Up'
+        )
+        if len(hosts) < 2:
+            raise unittest.SkipTest("Cancel host maintenance must be tested 
for 2 or more hosts")
+        return hosts[0]
+
+    def deploy_vm_on_host(self, hostid):
+        return VirtualMachine.create(
+            self.apiclient,
+            self.services["virtual_machine"],
+            serviceofferingid=self.service_offering.id,
+            networkids=self.l2_network.id,
+            hostid=hostid
+        )
+
+    def assert_host_is_functional_after_cancelling_maintenance(self, hostid):
+        self.wait_until_agent_is_in_state(hostid, "Up")
+        self.logger.debug('Deploying VM on host %s' % hostid)
+        vm = self.deploy_vm_on_host(hostid)
+        self.assertEqual(
+            vm.state,
+            "Running",
+            "Check VM is running on the host"
+        )
+        self.cleanup.append(vm)
+
+    @skipTestIf("hypervisorNotSupported")
+    @attr(tags=["advanced", "advancedns", "smoke", "basic", "eip", "sg"], 
required_hardware="true")
+    def test_01_cancel_host_maintenance_ssh_enabled_agent_connected(self):
+        """
+        Test cancel maintenance when: 'kvm.ssh.to.agent' = true, agent state = 
'Up'
+
+        1) Put host on Maintenance
+        2) Cancel maintenance on host
+        4) Assert agent is still connected after cancelling maintenance
+        3) Deploy VM on the host after cancelling maintenance
+        """
+
+        if not self.is_ssh_enabled():
+            self.set_ssh_enabled(True)
+
+        try:
+            self.prepare_host_for_maintenance(self.host.id)
+            self.wait_until_host_is_in_state(self.host.id, "Maintenance")
+            self.cancel_host_maintenance(self.host.id)
+            self.wait_until_host_is_in_state(self.host.id, "Enabled")
+            
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
+        except Exception as e:
+            self.fail(e)
 
 Review comment:
   I think it's vital to have a call to a method here that will iterate through 
all hosts and set them all back to Up. If there's an error within this 'try', it 
will leave the host in maintenance and the rest of the test run will be 
doomed. 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to