borisstoyanov commented on a change in pull request #3239: [WIP DO NOT MERGE]
KVM: Fix agents dont reconnect post maintenance
URL: https://github.com/apache/cloudstack/pull/3239#discussion_r274804231
##########
File path: test/integration/smoke/test_host_maintenance.py
##########
@@ -290,3 +274,313 @@ def
test_02_cancel_host_maintenace_with_migration_jobs(self):
return
+class TestHostMaintenanceAgents(cloudstackTestCase):
+
+ @classmethod
+ def setUpClass(cls):  # One-time class fixture: clients, logger, and the offerings / L2 network that deploy_vm_on_host uses
+ cls.testClient = super(TestHostMaintenanceAgents,
cls).getClsTestClient()
+ cls.apiclient = cls.testClient.getApiClient()
+ cls.hypervisor = cls.testClient.getHypervisorInfo()
+ cls.dbclient = cls.testClient.getDbConnection()  # set_ssh_enabled runs its UPDATE statement through this connection
+ cls.zone = get_zone(cls.apiclient, cls.testClient.getZoneForTests())
+ cls.pod = get_pod(cls.apiclient, cls.zone.id)
+ cls.services = cls.testClient.getParsedTestDataConfig()
+
+ cls.logger = logging.getLogger('TestHMAgents')  # dedicated DEBUG-level logger with its own stream handler
+ cls.stream_handler = logging.StreamHandler()
+ cls.logger.setLevel(logging.DEBUG)
+ cls.logger.addHandler(cls.stream_handler)
+
+ cls._cleanup = []  # handed to cleanup_resources() in tearDownClass
+ cls.hypervisorNotSupported = False
+ if cls.hypervisor.lower() not in ['kvm', 'lxc']:  # only KVM and LXC count as supported
+ cls.hypervisorNotSupported = True
+
+ if not cls.hypervisorNotSupported:
+ cls.initialsshvalue = cls.is_ssh_enabled()  # original 'kvm.ssh.to.agent' value; tearDownClass writes it back
+ cls.template = get_template(
+ cls.apiclient,
+ cls.zone.id,
+ cls.hypervisor
+ )
+ cls.services["virtual_machine"]["zoneid"] = cls.zone.id
+ cls.services["virtual_machine"]["template"] = cls.template.id
+ cls.services["virtual_machine"]["hypervisor"] = cls.hypervisor
+ cls.service_offering = ServiceOffering.create(
+ cls.apiclient,
+ cls.services["service_offerings"]["tiny"]
+ )
+ cls._cleanup.append(cls.service_offering)
+ cls.network_offering = NetworkOffering.create(
+ cls.apiclient,
+ cls.services["l2-network_offering"],
+ )
+ cls.network_offering.update(cls.apiclient, state='Enabled')  # presumably required before Network.create can use the offering - confirm
+ cls.services["network"]["networkoffering"] =
cls.network_offering.id
+ cls.l2_network = Network.create(
+ cls.apiclient,
+ cls.services["l2-network"],
+ zoneid=cls.zone.id,
+ networkofferingid=cls.network_offering.id
+ )
+ cls._cleanup.append(cls.l2_network)  # queued ahead of its offering - presumably so the network is removed first; confirm cleanup_resources ordering
+ cls._cleanup.append(cls.network_offering)
+
+ cls.hostConfig =
cls.config.__dict__["zones"][0].__dict__["pods"][0].__dict__["clusters"][0].__dict__["hosts"][0].__dict__  # first host entry of the first zone/pod/cluster; test_02 reads its "username" and "password"
+
+
+ @classmethod
+ def tearDownClass(cls):  # Class-level teardown: write back the saved SSH setting and release what was queued in cls._cleanup
+ try:
+ if not cls.hypervisorNotSupported:
+ # Revert setting value to the original
+ cls.set_ssh_enabled(cls.initialsshvalue)
+ cleanup_resources(cls.apiclient, cls._cleanup)
+ except Exception as e:
+ raise Exception("Warning: Exception during cleanup : %s" % e)  # re-raised as a plain Exception that carries the original message
+
+ def setUp(self):  # Per-test fixture: pick the host under test and start an empty cleanup list
+ if not self.hypervisorNotSupported:
+ self.host = self.get_enabled_host_connected_agent()  # raises unittest.SkipTest when fewer than two Enabled/Up hosts are found
+ self.cleanup = []  # released by tearDown through cleanup_resources
+
+ def tearDown(self):
+ try:
+ cleanup_resources(self.apiclient, self.cleanup)
+ except Exception as e:
+ raise Exception("Warning: Exception during cleanup : %s" % e)
+
+
+ @classmethod
+ def is_ssh_enabled(cls):
+ conf = Configurations.list(cls.apiclient, name="kvm.ssh.to.agent")
+ if not conf:
+ return False
+ else:
+ return bool(strtobool(conf[0].value)) if conf[0].value else False
+
+ @classmethod
+ def set_ssh_enabled(cls, on):
+ value = "true" if on else "false"
+ sql = "update configuration set value = '%s' where name =
'kvm.ssh.to.agent';" % value
+ cls.dbclient.execute(sql)
+
+ def prepare_host_for_maintenance(self, hostid):
+ cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd()
+ cmd.id = hostid
+ self.apiclient.prepareHostForMaintenance(cmd)
+ self.logger.debug('Host with id %s is in prepareHostForMaintenance' %
hostid)
+
+ def wait_until_host_is_in_state(self, hostid, resourcestate, interval=3,
retries=20):
+ def check_resource_state():
+ response = Host.list(
+ self.apiclient,
+ id=hostid
+ )
+ if isinstance(response, list):
+ if response[0].resourcestate == resourcestate:
+ self.logger.debug('Host with id %s is in resource state =
%s' % (hostid, resourcestate))
+ return True, None
+ return False, None
+
+ done, _ = wait_until(interval, retries, check_resource_state)
+ if not done:
+ raise Exception("Failed to wait for host %s to be on resource
state %s" % (hostid, resourcestate))
+ return True
+
+ def wait_until_agent_is_in_state(self, hostid, state, interval=3,
retries=20):
+ def check_agent_state():
+ response = Host.list(
+ self.apiclient,
+ id=hostid
+ )
+ if isinstance(response, list):
+ if response[0].state == state:
+ self.logger.debug('Host agent with id %s is in state = %s'
% (hostid, state))
+ return True, None
+ return False, None
+
+ done, _ = wait_until(interval, retries, check_agent_state)
+ if not done:
+ raise Exception("Failed to wait for host agent %s to be on state
%s" % (hostid, state))
+ return True
+
+ def cancel_host_maintenance(self, hostid):
+ cmd = cancelHostMaintenance.cancelHostMaintenanceCmd()
+ cmd.id = hostid
+ self.apiclient.cancelHostMaintenance(cmd)
+ self.logger.debug('Host with id %s is cancelling maintenance' % hostid)
+
+ def get_enabled_host_connected_agent(self):
+ hosts = Host.list(
+ self.apiclient,
+ type='Routing',
+ zoneid=self.zone.id,
+ podid=self.pod.id,
+ hypervisor=self.hypervisor,
+ resourcestate='Enabled',
+ state='Up'
+ )
+ if len(hosts) < 2:
+ raise unittest.SkipTest("Cancel host maintenance must be tested
for 2 or more hosts")
+ return hosts[0]
+
+ def deploy_vm_on_host(self, hostid):
+ return VirtualMachine.create(
+ self.apiclient,
+ self.services["virtual_machine"],
+ serviceofferingid=self.service_offering.id,
+ networkids=self.l2_network.id,
+ hostid=hostid
+ )
+
+ def assert_host_is_functional_after_cancelling_maintenance(self, hostid):
+ self.wait_until_agent_is_in_state(hostid, "Up")
+ self.logger.debug('Deploying VM on host %s' % hostid)
+ vm = self.deploy_vm_on_host(hostid)
+ self.assertEqual(
+ vm.state,
+ "Running",
+ "Check VM is running on the host"
+ )
+ self.cleanup.append(vm)
+
+ @skipTestIf("hypervisorNotSupported")
+ @attr(tags=["advanced", "advancedns", "smoke", "basic", "eip", "sg"],
required_hardware="true")
+ def test_01_cancel_host_maintenance_ssh_enabled_agent_connected(self):
+ """
+ Test cancel maintenance when: 'kvm.ssh.to.agent' = true, agent state =
'Up'
+
+ 1) Put host on Maintenance
+ 2) Cancel maintenance on host
+ 4) Assert agent is still connected after cancelling maintenance
+ 3) Deploy VM on the host after cancelling maintenance
+ """
+
+ if not self.is_ssh_enabled():
+ self.set_ssh_enabled(True)
+
+ try:
+ self.prepare_host_for_maintenance(self.host.id)
+ self.wait_until_host_is_in_state(self.host.id, "Maintenance")
+ self.cancel_host_maintenance(self.host.id)
+ self.wait_until_host_is_in_state(self.host.id, "Enabled")
+
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
+ except Exception as e:
+ self.fail(e)
+
+ def get_ssh_client(self, ip, username, password, retries=10):
+ """ Setup ssh client connection and return connection """
+
+ try:
+ ssh_client = SshClient(ip, 22, username, password, retries)
+ except Exception as e:
+ raise unittest.SkipTest("Unable to create ssh connection: " % e)
+
+ self.assertIsNotNone(
+ ssh_client, "Failed to setup ssh connection to ip=%s" % ip)
+
+ return ssh_client
+
+ @skipTestIf("hypervisorNotSupported")
+ @attr(tags=["boris", "advancedns", "smoke", "basic", "eip", "sg"],
required_hardware="true")
+ def test_02_cancel_host_maintenance_ssh_enabled_agent_disconnected(self):
+ """
+ Test cancel maintenance when: 'kvm.ssh.to.agent' = true, agent state
!= 'Up'
+
+ 1) Put host on maintenance
+ 2) SSH into host and stop cloudstack-agent service - host gets
Disconnected
+ 3) Cancel maintenance on host
+ 4) Assert agent is connected after cancelling maintenance
+ 5) Deploy VM on the host
+ """
+
+ if not self.is_ssh_enabled():
+ self.set_ssh_enabled(True)
+ # username, password = self.get_host_credentials(self.host.id)
+ username = self.hostConfig["username"]
+ password = self.hostConfig["password"]
+
+ try:
+ self.prepare_host_for_maintenance(self.host.id)
+ self.wait_until_host_is_in_state(self.host.id, "Maintenance")
+
+ ssh_client = self.get_ssh_client(self.host.ipaddress,
self.hostConfig["username"],
+ self.hostConfig["password"])
+ ssh_client.execute("service cloudstack-agent stop")
+ self.wait_until_agent_is_in_state(self.host.id, "Disconnected")
+
+ self.cancel_host_maintenance(self.host.id)
+ self.wait_until_host_is_in_state(self.host.id, "Enabled")
+
+
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
+ except Exception as e:
+ self.fail(e)
+
+ @skipTestIf("hypervisorNotSupported")
+ @attr(tags=["advanced", "advancedns", "smoke", "basic", "eip", "sg"],
required_hardware="true")
+ def test_03_cancel_host_maintenance_ssh_disabled_agent_connected(self):
+ """
+ Test cancel maintenance when: 'kvm.ssh.to.agent' = false, agent state
= 'Up'
+
+ 1) Put host on Maintenance
+ 2) Cancel maintenance on host
+ 4) Assert agent is still connected after cancelling maintenance
+ 3) Deploy VM on the host after cancelling maintenance
+ """
+
+ if self.is_ssh_enabled():
+ self.set_ssh_enabled(False)
+
+ try:
+ self.prepare_host_for_maintenance(self.host.id)
+ self.wait_until_host_is_in_state(self.host.id, "Maintenance")
+ self.cancel_host_maintenance(self.host.id)
+ self.wait_until_host_is_in_state(self.host.id, "Enabled")
+
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
+ except Exception as e:
Review comment:
Same here: please add a cleanup method that makes sure all hosts are back in the Up state.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services