GutoVeronezi commented on a change in pull request #4978: URL: https://github.com/apache/cloudstack/pull/4978#discussion_r634353753
########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java ########## @@ -0,0 +1,295 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import com.cloud.host.Host; +import com.cloud.utils.exception.CloudRuntimeException; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.VirtualMachine; +import com.cloud.vm.dao.VMInstanceDao; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.apache.commons.httpclient.HttpStatus; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.client.utils.URIBuilder; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.Nullable; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * This class provides a client that checks Agent status via a webserver. + * <br> + * The additional webserver exposes a simple JSON API which returns a list + * of Virtual Machines that are running on that host according to Libvirt. + * <br> + * This way, KVM HA can verify, via Libvirt, VMs status with an HTTP-call + * to this simple webserver and determine if the host is actually down + * or if it is just the Java Agent which has crashed. + */ +public class KvmHaAgentClient { + + private static final Logger LOGGER = Logger.getLogger(KvmHaAgentClient.class); + private static final int ERROR_CODE = -1; + private static final String EXPECTED_HTTP_STATUS = "2XX"; + private static final String VM_COUNT = "count"; + private static final String STATUS = "status"; + private static final String CHECK = "check"; + private static final String UP = "Up"; + private static final int WAIT_FOR_REQUEST_RETRY = 2; + private static final int MAX_REQUEST_RETRIES = 2; + private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; + private Host agent; + + /** + * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running according to the Libvirt + */ + public KvmHaAgentClient(Host agent) { + this.agent = agent; + } + + /** + * Returns the number of VMs running on the KVM host according to Libvirt. + */ + protected int countRunningVmsOnAgent() { + String url = String.format("http://%s:%d", agent.getPrivateIpAddress(), getKvmHaMicroservicePortValue()); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return ERROR_CODE; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return ERROR_CODE; + } + + return responseInJson.get(VM_COUNT).getAsInt(); + } + + /** + * Executes ping command from the host executing the KVM HA Agent webservice to a target IP Address. + * The webserver serves a JSON Object such as {"status": "Up"} if the IP address is reachable OR {"status": "Down"} if could not ping the IP + */ + protected boolean isTargetHostReachable(String ipAddress) { + int port = getKvmHaMicroservicePortValue(); + String url = String.format("http://%s:%d/%s/%s:%d", agent.getPrivateIpAddress(), port, CHECK, ipAddress, port); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return false; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return false; + } + + return UP.equals(responseInJson.get(STATUS).getAsString()); + } + + protected int getKvmHaMicroservicePortValue() { + Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); + if (haAgentPort == null) { + LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", + KVMHAConfig.KvmHaWebservicePort.defaultValue(), agent.getClusterId(), agent)); + haAgentPort = Integer.parseInt(KVMHAConfig.KvmHaWebservicePort.defaultValue()); + } + return haAgentPort; + } + + /** + * Checks if the KVM HA Webservice is enabled or not; if disabled then CloudStack ignores HA validation via the webservice. + */ + public boolean isKvmHaWebserviceEnabled() { + return KVMHAConfig.IsKvmHaWebserviceEnabled.value(); + } + + /** + * Lists VMs on host according to vm_instance DB table. The states considered for such listing are: 'Running', 'Stopping', 'Migrating'. + * <br> + * <br> + * Note that VMs on state 'Starting' are not common to be at the host, therefore this method does not list them. + * However, there is still a probability of a VM in 'Starting' state be already listed on the KVM via '$virsh list', + * but that's not likely and thus it is not relevant for this very context. + */ + protected List<VMInstanceVO> listVmsOnHost(Host host, VMInstanceDao vmInstanceDao) { + List<VMInstanceVO> listByHostAndStateRunning = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running); + List<VMInstanceVO> listByHostAndStateStopping = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Stopping); + List<VMInstanceVO> listByHostAndStateMigrating = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Migrating); + + List<VMInstanceVO> listByHostAndState = new ArrayList<>(); + listByHostAndState.addAll(listByHostAndStateRunning); + listByHostAndState.addAll(listByHostAndStateStopping); + listByHostAndState.addAll(listByHostAndStateMigrating); + + if (LOGGER.isTraceEnabled()) { + List<VMInstanceVO> listByHostAndStateStarting = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Starting); + int startingVMs = listByHostAndStateStarting.size(); + int runningVMs = listByHostAndStateRunning.size(); + int stoppingVms = listByHostAndStateStopping.size(); + int migratingVms = listByHostAndStateMigrating.size(); + int countRunningVmsOnAgent = countRunningVmsOnAgent(); + LOGGER.trace( + String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", agent.getName(), startingVMs, runningVMs, + stoppingVms, migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); + } + + return listByHostAndState; + } Review comment: As the method `listByHostAndStat` receive a varargs of state as parameter, we could simplify this method by joining the three first requests and filtering they if needed: ```java List<VMInstanceVO> listByHostAndStates = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running, VirtualMachine.State.Stopping, VirtualMachine.State.Migrating); if (LOGGER.isTraceEnabled()) { List<VMInstanceVO> listByHostAndStateStarting = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Starting); int startingVMs = listByHostAndStateStarting.size(); int runningVMs = listByHostAndStates.stream().filter...; int stoppingVms = listByHostAndStates.stream().filter...; int migratingVms = listByHostAndStates.stream().filter...; int countRunningVmsOnAgent = countRunningVmsOnAgent(); LOGGER.trace( String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", agent.getName(), startingVMs, runningVMs, stoppingVms, migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); } return listByHostAndStates; ``` ########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java ########## @@ -81,7 +98,63 @@ public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException @Override public boolean isHealthy(Host r) { - return isAgentActive(r); + boolean isHealthy = true; + boolean isHostServedByNfsPool = isHostServedByNfsPool(r); + boolean isKvmHaWebserviceEnabled = isKvmHaWebserviceEnabled(r); + + isHealthy = isHealthViaNfs(r); + + if (!isKvmHaWebserviceEnabled) { + return isHealthy; + } + + //TODO Review comment: What this comment means? ########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java ########## @@ -81,7 +98,63 @@ public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException @Override public boolean isHealthy(Host r) { Review comment: We can rename the parameter to a more intuitive name. ########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java ########## @@ -0,0 +1,295 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import com.cloud.host.Host; +import com.cloud.utils.exception.CloudRuntimeException; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.VirtualMachine; +import com.cloud.vm.dao.VMInstanceDao; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.apache.commons.httpclient.HttpStatus; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.client.utils.URIBuilder; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.Nullable; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * This class provides a client that checks Agent status via a webserver. + * <br> + * The additional webserver exposes a simple JSON API which returns a list + * of Virtual Machines that are running on that host according to Libvirt. + * <br> + * This way, KVM HA can verify, via Libvirt, VMs status with an HTTP-call + * to this simple webserver and determine if the host is actually down + * or if it is just the Java Agent which has crashed. + */ +public class KvmHaAgentClient { + + private static final Logger LOGGER = Logger.getLogger(KvmHaAgentClient.class); + private static final int ERROR_CODE = -1; + private static final String EXPECTED_HTTP_STATUS = "2XX"; + private static final String VM_COUNT = "count"; + private static final String STATUS = "status"; + private static final String CHECK = "check"; + private static final String UP = "Up"; + private static final int WAIT_FOR_REQUEST_RETRY = 2; + private static final int MAX_REQUEST_RETRIES = 2; + private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; + private Host agent; + + /** + * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running according to the Libvirt + */ + public KvmHaAgentClient(Host agent) { + this.agent = agent; + } + + /** + * Returns the number of VMs running on the KVM host according to Libvirt. + */ + protected int countRunningVmsOnAgent() { + String url = String.format("http://%s:%d", agent.getPrivateIpAddress(), getKvmHaMicroservicePortValue()); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return ERROR_CODE; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return ERROR_CODE; + } + + return responseInJson.get(VM_COUNT).getAsInt(); + } + + /** + * Executes ping command from the host executing the KVM HA Agent webservice to a target IP Address. + * The webserver serves a JSON Object such as {"status": "Up"} if the IP address is reachable OR {"status": "Down"} if could not ping the IP + */ + protected boolean isTargetHostReachable(String ipAddress) { + int port = getKvmHaMicroservicePortValue(); + String url = String.format("http://%s:%d/%s/%s:%d", agent.getPrivateIpAddress(), port, CHECK, ipAddress, port); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return false; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return false; + } + + return UP.equals(responseInJson.get(STATUS).getAsString()); + } + + protected int getKvmHaMicroservicePortValue() { + Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); + if (haAgentPort == null) { + LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", + KVMHAConfig.KvmHaWebservicePort.defaultValue(), agent.getClusterId(), agent)); + haAgentPort = Integer.parseInt(KVMHAConfig.KvmHaWebservicePort.defaultValue()); + } + return haAgentPort; + } + + /** + * Checks if the KVM HA Webservice is enabled or not; if disabled then CloudStack ignores HA validation via the webservice. + */ + public boolean isKvmHaWebserviceEnabled() { + return KVMHAConfig.IsKvmHaWebserviceEnabled.value(); + } + + /** + * Lists VMs on host according to vm_instance DB table. The states considered for such listing are: 'Running', 'Stopping', 'Migrating'. + * <br> + * <br> + * Note that VMs on state 'Starting' are not common to be at the host, therefore this method does not list them. + * However, there is still a probability of a VM in 'Starting' state be already listed on the KVM via '$virsh list', + * but that's not likely and thus it is not relevant for this very context. + */ + protected List<VMInstanceVO> listVmsOnHost(Host host, VMInstanceDao vmInstanceDao) { + List<VMInstanceVO> listByHostAndStateRunning = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running); + List<VMInstanceVO> listByHostAndStateStopping = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Stopping); + List<VMInstanceVO> listByHostAndStateMigrating = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Migrating); + + List<VMInstanceVO> listByHostAndState = new ArrayList<>(); + listByHostAndState.addAll(listByHostAndStateRunning); + listByHostAndState.addAll(listByHostAndStateStopping); + listByHostAndState.addAll(listByHostAndStateMigrating); + + if (LOGGER.isTraceEnabled()) { + List<VMInstanceVO> listByHostAndStateStarting = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Starting); + int startingVMs = listByHostAndStateStarting.size(); + int runningVMs = listByHostAndStateRunning.size(); + int stoppingVms = listByHostAndStateStopping.size(); + int migratingVms = listByHostAndStateMigrating.size(); + int countRunningVmsOnAgent = countRunningVmsOnAgent(); + LOGGER.trace( + String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", agent.getName(), startingVMs, runningVMs, + stoppingVms, migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); + } + + return listByHostAndState; + } + + /** + * Returns true in case of the expected number of VMs matches with the VMs running on the KVM host according to Libvirt. <br><br> + * + * IF: <br> + * (i) KVM HA agent finds 0 running but CloudStack considers that the host has 2 or more VMs running: returns false as could not find VMs running but it expected at least + * 2 VMs running, fencing/recovering host would avoid downtime to VMs in this case.<br> + * (ii) KVM HA agent finds 0 VM running but CloudStack considers that the host has 1 VM running: return true and log WARN messages and avoids triggering HA recovery/fencing + * when it could be a inconsistency when migrating a VM.<br> + * (iii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly + */ + public boolean isKvmHaAgentHealthy(Host host, VMInstanceDao vmInstanceDao) { + int numberOfVmsOnHostAccordingToDb = listVmsOnHost(host, vmInstanceDao).size(); + int numberOfVmsOnAgent = countRunningVmsOnAgent(); + if (numberOfVmsOnAgent < 0) { + LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed.", agent)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); + return false; + } + if (numberOfVmsOnHostAccordingToDb == numberOfVmsOnAgent) { + return true; + } + if (numberOfVmsOnAgent == 0 && numberOfVmsOnHostAccordingToDb > CAUTIOUS_MARGIN_OF_VMS_ON_HOST) { + // Return false as could not find VMs running but it expected at least one VM running, fencing/recovering host would avoid downtime to VMs in this case. + // There is cautious margin added on the conditional. This avoids fencing/recovering hosts when there is one VM migrating to a host that had zero VMs. + // If there are more VMs than the CAUTIOUS_MARGIN_OF_VMS_ON_HOST) the Host should be treated as not healthy and fencing/recovering process might be triggered. Review comment: Javadoc already explain a little bit of the context, could we remove this comment and improve javadoc? ########## File path: plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java ########## @@ -0,0 +1,278 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.math.NumberUtils; +import org.apache.http.HttpEntity; +import org.apache.http.HttpResponse; +import org.apache.http.HttpStatus; +import org.apache.http.ProtocolVersion; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.entity.InputStreamEntity; +import org.apache.http.message.BasicStatusLine; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.junit.MockitoJUnitRunner; + +import com.cloud.host.HostVO; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.dao.VMInstanceDaoImpl; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; + +@RunWith(MockitoJUnitRunner.class) +public class KvmHaAgentClientTest { + + private static final int ERROR_CODE = -1; + private HostVO agent = Mockito.mock(HostVO.class); + private KvmHaAgentClient kvmHaAgentClient = Mockito.spy(new KvmHaAgentClient(agent)); + private static final int DEFAULT_PORT = 8080; + private static final String PRIVATE_IP_ADDRESS = "1.2.3.4"; + private static final String JSON_STRING_EXAMPLE_3VMs = "{\"count\":3,\"virtualmachines\":[\"r-123-VM\",\"v-134-VM\",\"s-111-VM\"]}"; + private static final int EXPECTED_RUNNING_VMS_EXAMPLE_3VMs = 3; + private static final String JSON_STRING_EXAMPLE_0VMs = "{\"count\":0,\"virtualmachines\":[]}"; + private static final int EXPECTED_RUNNING_VMS_EXAMPLE_0VMs = 0; + private static final String EXPECTED_URL = String.format("http://%s:%d", PRIVATE_IP_ADDRESS, DEFAULT_PORT); + private static final HttpRequestBase HTTP_REQUEST_BASE = new HttpGet(EXPECTED_URL); + private static final String VMS_COUNT = "count"; + private static final String VIRTUAL_MACHINES = "virtualmachines"; + private static final int MAX_REQUEST_RETRIES = 2; + private static final int KVM_HA_WEBSERVICE_PORT = 8080; + + @Mock + HttpClient client; + + @Mock + VMInstanceDaoImpl vmInstanceDao; + + @Test + public void isKvmHaAgentHealthyTestAllGood() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, EXPECTED_RUNNING_VMS_EXAMPLE_3VMs); + Assert.assertTrue(result); + } + + @Test + public void isKvmHaAgentHealthyTestVMsDoNotMatchButDoNotReturnFalse() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 1); + Assert.assertTrue(result); + } + + @Test + public void isKvmHaAgentHealthyTestExpectedRunningVmsButNoneListed() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 0); + Assert.assertFalse(result); + } + + @Test + public void isKvmHaAgentHealthyTestReceivedErrorCode() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, ERROR_CODE); + Assert.assertFalse(result); + } + + private boolean isKvmHaAgentHealthyTests(int expectedNumberOfVms, int vmsRunningOnAgent) { + List<VMInstanceVO> vmsOnHostList = new ArrayList<>(); + for (int i = 0; i < expectedNumberOfVms; i++) { + VMInstanceVO vmInstance = Mockito.mock(VMInstanceVO.class); + vmsOnHostList.add(vmInstance); + } + + Mockito.doReturn(vmsOnHostList).when(kvmHaAgentClient).listVmsOnHost(Mockito.any(), Mockito.any()); + Mockito.doReturn(vmsRunningOnAgent).when(kvmHaAgentClient).countRunningVmsOnAgent(); + + return kvmHaAgentClient.isKvmHaAgentHealthy(agent, vmInstanceDao); + } + + @Test + public void processHttpResponseIntoJsonTestNull() { + JsonObject responseJson = kvmHaAgentClient.processHttpResponseIntoJson(null); + Assert.assertNull(responseJson); + } + + @Test + public void processHttpResponseIntoJsonTest() throws IOException { + prepareAndTestProcessHttpResponseIntoJson(JSON_STRING_EXAMPLE_3VMs, 3l); + } + + @Test + public void processHttpResponseIntoJsonTestOtherJsonExample() throws IOException { + prepareAndTestProcessHttpResponseIntoJson(JSON_STRING_EXAMPLE_0VMs, 0l); + } + + private void prepareAndTestProcessHttpResponseIntoJson(String jsonString, long expectedVmsCount) throws IOException { + CloseableHttpResponse mockedResponse = mockResponse(HttpStatus.SC_OK, jsonString); + JsonObject responseJson = kvmHaAgentClient.processHttpResponseIntoJson(mockedResponse); + + Assert.assertNotNull(responseJson); + JsonElement jsonElementVmsCount = responseJson.get(VMS_COUNT); + JsonElement jsonElementVmsArray = responseJson.get(VIRTUAL_MACHINES); + JsonArray jsonArray = jsonElementVmsArray.getAsJsonArray(); + + Assert.assertEquals(expectedVmsCount, jsonArray.size()); + Assert.assertEquals(expectedVmsCount, jsonElementVmsCount.getAsLong()); + Assert.assertEquals(jsonString, responseJson.toString()); + } + + private CloseableHttpResponse mockResponse(int httpStatusCode, String jsonString) throws IOException { + BasicStatusLine basicStatusLine = new BasicStatusLine(new ProtocolVersion("HTTP", 1000, 123), httpStatusCode, "Status"); + CloseableHttpResponse response = Mockito.mock(CloseableHttpResponse.class); + InputStream in = IOUtils.toInputStream(jsonString, StandardCharsets.UTF_8); + Mockito.when(response.getStatusLine()).thenReturn(basicStatusLine); + HttpEntity httpEntity = new InputStreamEntity(in); + Mockito.when(response.getEntity()).thenReturn(httpEntity); + return response; + } + + @Test + public void countRunningVmsOnAgentTest() throws IOException { + prepareAndRunCountRunningVmsOnAgent(JSON_STRING_EXAMPLE_3VMs, EXPECTED_RUNNING_VMS_EXAMPLE_3VMs); + } + + @Test + public void countRunningVmsOnAgentTestBlankNoVmsListed() throws IOException { + prepareAndRunCountRunningVmsOnAgent(JSON_STRING_EXAMPLE_0VMs, EXPECTED_RUNNING_VMS_EXAMPLE_0VMs); + } + + private void prepareAndRunCountRunningVmsOnAgent(String jsonStringExample, int expectedListedVms) throws IOException { + Mockito.when(agent.getPrivateIpAddress()).thenReturn(PRIVATE_IP_ADDRESS); + Mockito.doReturn(mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_3VMs)).when(kvmHaAgentClient).executeHttpRequest(EXPECTED_URL); + + JsonObject jObject = new JsonParser().parse(jsonStringExample).getAsJsonObject(); + Mockito.doReturn(jObject).when(kvmHaAgentClient).processHttpResponseIntoJson(Mockito.any(HttpResponse.class)); + + int result = kvmHaAgentClient.countRunningVmsOnAgent(); + Assert.assertEquals(expectedListedVms, result); + } + + @Test + public void retryHttpRequestTest() throws IOException { + kvmHaAgentClient.retryHttpRequest(EXPECTED_URL, HTTP_REQUEST_BASE, client); + Mockito.verify(client, Mockito.times(1)).execute(Mockito.any()); + Mockito.verify(kvmHaAgentClient, Mockito.times(1)).retryUntilGetsHttpResponse(Mockito.anyString(), Mockito.any(), Mockito.any()); + } + + @Test + public void retryHttpRequestTestNullResponse() throws IOException { + Mockito.doReturn(null).when(kvmHaAgentClient).retryUntilGetsHttpResponse(Mockito.anyString(), Mockito.any(), Mockito.any()); + HttpResponse response = kvmHaAgentClient.retryHttpRequest(EXPECTED_URL, HTTP_REQUEST_BASE, client); + Assert.assertNull(response); + } + + @Test + public void retryHttpRequestTestForbidden() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_FORBIDDEN, true); + } + + @Test + public void retryHttpRequestTestMultipleChoices() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_MULTIPLE_CHOICES, true); + } + + @Test + public void retryHttpRequestTestProcessing() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_PROCESSING, true); + } + + @Test + public void retryHttpRequestTestTimeout() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_GATEWAY_TIMEOUT, true); + } + + @Test + public void retryHttpRequestTestVersionNotSupported() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_HTTP_VERSION_NOT_SUPPORTED, true); + } + + @Test + public void retryHttpRequestTestOk() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_OK, false); + } + + private void prepareAndRunRetryHttpRequestTest(int scMultipleChoices, boolean expectNull) throws IOException { + HttpResponse mockedResponse = mockResponse(scMultipleChoices, JSON_STRING_EXAMPLE_3VMs); + Mockito.doReturn(mockedResponse).when(kvmHaAgentClient).retryUntilGetsHttpResponse(Mockito.anyString(), Mockito.any(), Mockito.any()); + HttpResponse response = kvmHaAgentClient.retryHttpRequest(EXPECTED_URL, HTTP_REQUEST_BASE, client); + if (expectNull) { + Assert.assertNull(response); + } else { + Assert.assertEquals(mockedResponse, response); + } + } + + @Test + public void retryHttpRequestTestHttpOk() throws IOException { + HttpResponse mockedResponse = mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_3VMs); + Mockito.doReturn(mockedResponse).when(kvmHaAgentClient).retryUntilGetsHttpResponse(Mockito.anyString(), Mockito.any(), Mockito.any()); + HttpResponse result = kvmHaAgentClient.retryHttpRequest(EXPECTED_URL, HTTP_REQUEST_BASE, client); + Mockito.verify(kvmHaAgentClient, Mockito.times(1)).retryUntilGetsHttpResponse(Mockito.anyString(), Mockito.any(), Mockito.any()); + Assert.assertEquals(mockedResponse, result); + } + + @Test + public void retryUntilGetsHttpResponseTestOneIOException() throws IOException { + Mockito.when(client.execute(HTTP_REQUEST_BASE)).thenThrow(IOException.class).thenReturn(mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_3VMs)); + HttpResponse result = kvmHaAgentClient.retryUntilGetsHttpResponse(EXPECTED_URL, HTTP_REQUEST_BASE, client); + Mockito.verify(client, Mockito.times(MAX_REQUEST_RETRIES)).execute(Mockito.any()); + Assert.assertNotNull(result); + } + + @Test + public void retryUntilGetsHttpResponseTestTwoIOException() throws IOException { + Mockito.when(client.execute(HTTP_REQUEST_BASE)).thenThrow(IOException.class).thenThrow(IOException.class); + HttpResponse result = kvmHaAgentClient.retryUntilGetsHttpResponse(EXPECTED_URL, HTTP_REQUEST_BASE, client); + Mockito.verify(client, Mockito.times(MAX_REQUEST_RETRIES)).execute(Mockito.any()); + Assert.assertNull(result); + } + + @Test + public void isKvmHaWebserviceEnabledTestDefault() { + Assert.assertFalse(kvmHaAgentClient.isKvmHaWebserviceEnabled()); + } + + @Test + public void getKvmHaMicroservicePortValueTestDefault() { + Assert.assertEquals(KVM_HA_WEBSERVICE_PORT, kvmHaAgentClient.getKvmHaMicroservicePortValue()); + } + +// private void prepareAndRunCountRunningVmsOnAgent(String jsonStringExample, int expectedListedVms) throws IOException { +// Mockito.when(agent.getPrivateIpAddress()).thenReturn(PRIVATE_IP_ADDRESS); +// Mockito.doReturn(mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_3VMs)).when(kvmHaAgentClient).executeHttpRequest(EXPECTED_URL); +// +// JsonObject jObject = new JsonParser().parse(jsonStringExample).getAsJsonObject(); +// Mockito.doReturn(jObject).when(kvmHaAgentClient).processHttpResponseIntoJson(Mockito.any(HttpResponse.class)); +// +// int result = kvmHaAgentClient.countRunningVmsOnAgent(); +// Assert.assertEquals(expectedListedVms, result); +// } +//TODO +// @Test +// public void isTargetHostReachableTest() { +// kvmHaAgentClient.isTargetHostReachable(PRIVATE_IP_ADDRESS); +// } Review comment: Do we need commented code? ########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java ########## @@ -0,0 +1,295 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import com.cloud.host.Host; +import com.cloud.utils.exception.CloudRuntimeException; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.VirtualMachine; +import com.cloud.vm.dao.VMInstanceDao; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.apache.commons.httpclient.HttpStatus; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.client.utils.URIBuilder; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.Nullable; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * This class provides a client that checks Agent status via a webserver. + * <br> + * The additional webserver exposes a simple JSON API which returns a list + * of Virtual Machines that are running on that host according to Libvirt. + * <br> + * This way, KVM HA can verify, via Libvirt, VMs status with an HTTP-call + * to this simple webserver and determine if the host is actually down + * or if it is just the Java Agent which has crashed. + */ +public class KvmHaAgentClient { + + private static final Logger LOGGER = Logger.getLogger(KvmHaAgentClient.class); + private static final int ERROR_CODE = -1; + private static final String EXPECTED_HTTP_STATUS = "2XX"; + private static final String VM_COUNT = "count"; + private static final String STATUS = "status"; + private static final String CHECK = "check"; + private static final String UP = "Up"; + private static final int WAIT_FOR_REQUEST_RETRY = 2; + private static final int MAX_REQUEST_RETRIES = 2; + private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; + private Host agent; + + /** + * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running according to the Libvirt + */ + public KvmHaAgentClient(Host agent) { + this.agent = agent; + } + + /** + * Returns the number of VMs running on the KVM host according to Libvirt. + */ + protected int countRunningVmsOnAgent() { + String url = String.format("http://%s:%d", agent.getPrivateIpAddress(), getKvmHaMicroservicePortValue()); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return ERROR_CODE; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return ERROR_CODE; + } + + return responseInJson.get(VM_COUNT).getAsInt(); + } + + /** + * Executes ping command from the host executing the KVM HA Agent webservice to a target IP Address. + * The webserver serves a JSON Object such as {"status": "Up"} if the IP address is reachable OR {"status": "Down"} if could not ping the IP + */ + protected boolean isTargetHostReachable(String ipAddress) { + int port = getKvmHaMicroservicePortValue(); + String url = String.format("http://%s:%d/%s/%s:%d", agent.getPrivateIpAddress(), port, CHECK, ipAddress, port); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return false; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return false; + } + + return UP.equals(responseInJson.get(STATUS).getAsString()); + } + + protected int getKvmHaMicroservicePortValue() { + Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); + if (haAgentPort == null) { + LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", + KVMHAConfig.KvmHaWebservicePort.defaultValue(), agent.getClusterId(), agent)); + haAgentPort = Integer.parseInt(KVMHAConfig.KvmHaWebservicePort.defaultValue()); + } + return haAgentPort; + } + + /** + * Checks if the KVM HA Webservice is enabled or not; if disabled then CloudStack ignores HA validation via the webservice. + */ + public boolean isKvmHaWebserviceEnabled() { + return KVMHAConfig.IsKvmHaWebserviceEnabled.value(); + } + + /** + * Lists VMs on host according to vm_instance DB table. The states considered for such listing are: 'Running', 'Stopping', 'Migrating'. + * <br> + * <br> + * Note that VMs on state 'Starting' are not common to be at the host, therefore this method does not list them. + * However, there is still a probability of a VM in 'Starting' state be already listed on the KVM via '$virsh list', + * but that's not likely and thus it is not relevant for this very context. + */ + protected List<VMInstanceVO> listVmsOnHost(Host host, VMInstanceDao vmInstanceDao) { + List<VMInstanceVO> listByHostAndStateRunning = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running); + List<VMInstanceVO> listByHostAndStateStopping = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Stopping); + List<VMInstanceVO> listByHostAndStateMigrating = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Migrating); + + List<VMInstanceVO> listByHostAndState = new ArrayList<>(); + listByHostAndState.addAll(listByHostAndStateRunning); + listByHostAndState.addAll(listByHostAndStateStopping); + listByHostAndState.addAll(listByHostAndStateMigrating); + + if (LOGGER.isTraceEnabled()) { + List<VMInstanceVO> listByHostAndStateStarting = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Starting); + int startingVMs = listByHostAndStateStarting.size(); + int runningVMs = listByHostAndStateRunning.size(); + int stoppingVms = listByHostAndStateStopping.size(); + int migratingVms = listByHostAndStateMigrating.size(); + int countRunningVmsOnAgent = countRunningVmsOnAgent(); + LOGGER.trace( + String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", agent.getName(), startingVMs, runningVMs, + stoppingVms, migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); + } + + return listByHostAndState; + } + + /** + * Returns true in case of the expected number of VMs matches with the VMs running on the KVM host according to Libvirt. <br><br> + * + * IF: <br> + * (i) KVM HA agent finds 0 running but CloudStack considers that the host has 2 or more VMs running: returns false as could not find VMs running but it expected at least + * 2 VMs running, fencing/recovering host would avoid downtime to VMs in this case.<br> + * (ii) KVM HA agent finds 0 VM running but CloudStack considers that the host has 1 VM running: return true and log WARN messages and avoids triggering HA recovery/fencing + * when it could be a inconsistency when migrating a VM.<br> + * (iii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly + */ + public boolean isKvmHaAgentHealthy(Host host, VMInstanceDao vmInstanceDao) { + int numberOfVmsOnHostAccordingToDb = listVmsOnHost(host, vmInstanceDao).size(); + int numberOfVmsOnAgent = countRunningVmsOnAgent(); + if (numberOfVmsOnAgent < 0) { + LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed.", agent)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); + return false; + } + if (numberOfVmsOnHostAccordingToDb == numberOfVmsOnAgent) { + return true; + } + if (numberOfVmsOnAgent == 0 && numberOfVmsOnHostAccordingToDb > CAUTIOUS_MARGIN_OF_VMS_ON_HOST) { + // Return false as could not find VMs running but it expected at least one VM running, fencing/recovering host would avoid downtime to VMs in this case. + // There is cautious margin added on the conditional. This avoids fencing/recovering hosts when there is one VM migrating to a host that had zero VMs. + // If there are more VMs than the CAUTIOUS_MARGIN_OF_VMS_ON_HOST) the Host should be treated as not healthy and fencing/recovering process might be triggered. + LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", agent, numberOfVmsOnHostAccordingToDb)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); + return false; + } + // In order to have a less "aggressive" health-check, the KvmHaAgentClient will not return false; fencing/recovering could bring downtime to existing VMs + // Additionally, the inconsistency can also be due to jobs in progress to migrate/stop/start VMs + // Either way, WARN messages should be presented to Admins so they can look closely to what is happening on the host + LOGGER.warn(String.format("KVM HA Agent %s listed %d VMs; however, it was expected %d VMs.", agent, numberOfVmsOnAgent, numberOfVmsOnHostAccordingToDb)); + return true; + } + + /** + * Executes a GET request for the given URL address. + */ + protected HttpResponse executeHttpRequest(String url) { + HttpGet httpReq = prepareHttpRequestForUrl(url); + if (httpReq == null) { + return null; + } + + HttpClient client = HttpClientBuilder.create().build(); + HttpResponse response = null; + try { + response = client.execute(httpReq); + } catch (IOException e) { + if (MAX_REQUEST_RETRIES == 0) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); + return null; + } + retryHttpRequest(url, httpReq, client); + } + return response; + } + + @Nullable + private HttpGet prepareHttpRequestForUrl(String url) { + HttpGet httpReq = null; + try { + URIBuilder builder = new URIBuilder(url); + httpReq = new HttpGet(builder.build()); + } catch (URISyntaxException e) { + LOGGER.error(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); + return null; + } + return httpReq; + } + + /** + * Re-executes the HTTP GET request until it gets a response or it reaches the maximum request retries {@link #MAX_REQUEST_RETRIES} + */ + protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, HttpClient client) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s]. Executing the request again.", httpReq.getMethod(), url)); + HttpResponse response = retryUntilGetsHttpResponse(url, httpReq, client); + + if (response == null) { + LOGGER.error(String.format("Failed to execute HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; + } + + int statusCode = response.getStatusLine().getStatusCode(); + if (statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES) { + LOGGER.error( + String.format("Failed to get VMs information with a %s request to URL '%s'. The expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, + EXPECTED_HTTP_STATUS, statusCode)); + return null; + } + + LOGGER.debug(String.format("Successfully executed HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; + } + + protected HttpResponse retryUntilGetsHttpResponse(String url, HttpRequestBase httpReq, HttpClient client) { + for (int attempt = 1; attempt < MAX_REQUEST_RETRIES + 1; attempt++) { Review comment: ```suggestion for (int attempt = 1; attempt <= MAX_REQUEST_RETRIES; attempt++) { ``` ########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java ########## @@ -0,0 +1,295 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import com.cloud.host.Host; +import com.cloud.utils.exception.CloudRuntimeException; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.VirtualMachine; +import com.cloud.vm.dao.VMInstanceDao; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.apache.commons.httpclient.HttpStatus; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.client.utils.URIBuilder; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.Nullable; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * This class provides a client that checks Agent status via a webserver. + * <br> + * The additional webserver exposes a simple JSON API which returns a list + * of Virtual Machines that are running on that host according to Libvirt. + * <br> + * This way, KVM HA can verify, via Libvirt, VMs status with an HTTP-call + * to this simple webserver and determine if the host is actually down + * or if it is just the Java Agent which has crashed. + */ +public class KvmHaAgentClient { + + private static final Logger LOGGER = Logger.getLogger(KvmHaAgentClient.class); + private static final int ERROR_CODE = -1; + private static final String EXPECTED_HTTP_STATUS = "2XX"; + private static final String VM_COUNT = "count"; + private static final String STATUS = "status"; + private static final String CHECK = "check"; + private static final String UP = "Up"; + private static final int WAIT_FOR_REQUEST_RETRY = 2; + private static final int MAX_REQUEST_RETRIES = 2; + private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; + private Host agent; + + /** + * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running according to the Libvirt + */ + public KvmHaAgentClient(Host agent) { + this.agent = agent; + } + + /** + * Returns the number of VMs running on the KVM host according to Libvirt. + */ + protected int countRunningVmsOnAgent() { + String url = String.format("http://%s:%d", agent.getPrivateIpAddress(), getKvmHaMicroservicePortValue()); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return ERROR_CODE; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return ERROR_CODE; + } + + return responseInJson.get(VM_COUNT).getAsInt(); + } + + /** + * Executes ping command from the host executing the KVM HA Agent webservice to a target IP Address. + * The webserver serves a JSON Object such as {"status": "Up"} if the IP address is reachable OR {"status": "Down"} if could not ping the IP + */ + protected boolean isTargetHostReachable(String ipAddress) { + int port = getKvmHaMicroservicePortValue(); + String url = String.format("http://%s:%d/%s/%s:%d", agent.getPrivateIpAddress(), port, CHECK, ipAddress, port); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return false; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return false; + } + + return UP.equals(responseInJson.get(STATUS).getAsString()); + } + + protected int getKvmHaMicroservicePortValue() { + Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); + if (haAgentPort == null) { + LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", + KVMHAConfig.KvmHaWebservicePort.defaultValue(), agent.getClusterId(), agent)); + haAgentPort = Integer.parseInt(KVMHAConfig.KvmHaWebservicePort.defaultValue()); + } + return haAgentPort; + } + + /** + * Checks if the KVM HA Webservice is enabled or not; if disabled then CloudStack ignores HA validation via the webservice. + */ + public boolean isKvmHaWebserviceEnabled() { + return KVMHAConfig.IsKvmHaWebserviceEnabled.value(); + } + + /** + * Lists VMs on host according to vm_instance DB table. The states considered for such listing are: 'Running', 'Stopping', 'Migrating'. + * <br> + * <br> + * Note that VMs on state 'Starting' are not common to be at the host, therefore this method does not list them. + * However, there is still a probability of a VM in 'Starting' state be already listed on the KVM via '$virsh list', + * but that's not likely and thus it is not relevant for this very context. + */ + protected List<VMInstanceVO> listVmsOnHost(Host host, VMInstanceDao vmInstanceDao) { + List<VMInstanceVO> listByHostAndStateRunning = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running); + List<VMInstanceVO> listByHostAndStateStopping = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Stopping); + List<VMInstanceVO> listByHostAndStateMigrating = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Migrating); + + List<VMInstanceVO> listByHostAndState = new ArrayList<>(); + listByHostAndState.addAll(listByHostAndStateRunning); + listByHostAndState.addAll(listByHostAndStateStopping); + listByHostAndState.addAll(listByHostAndStateMigrating); + + if (LOGGER.isTraceEnabled()) { + List<VMInstanceVO> listByHostAndStateStarting = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Starting); + int startingVMs = listByHostAndStateStarting.size(); + int runningVMs = listByHostAndStateRunning.size(); + int stoppingVms = listByHostAndStateStopping.size(); + int migratingVms = listByHostAndStateMigrating.size(); + int countRunningVmsOnAgent = countRunningVmsOnAgent(); + LOGGER.trace( + String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", agent.getName(), startingVMs, runningVMs, + stoppingVms, migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); + } + + return listByHostAndState; + } + + /** + * Returns true in case of the expected number of VMs matches with the VMs running on the KVM host according to Libvirt. <br><br> + * + * IF: <br> + * (i) KVM HA agent finds 0 running but CloudStack considers that the host has 2 or more VMs running: returns false as could not find VMs running but it expected at least + * 2 VMs running, fencing/recovering host would avoid downtime to VMs in this case.<br> + * (ii) KVM HA agent finds 0 VM running but CloudStack considers that the host has 1 VM running: return true and log WARN messages and avoids triggering HA recovery/fencing + * when it could be a inconsistency when migrating a VM.<br> + * (iii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly + */ + public boolean isKvmHaAgentHealthy(Host host, VMInstanceDao vmInstanceDao) { + int numberOfVmsOnHostAccordingToDb = listVmsOnHost(host, vmInstanceDao).size(); + int numberOfVmsOnAgent = countRunningVmsOnAgent(); + if (numberOfVmsOnAgent < 0) { + LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed.", agent)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); + return false; + } + if (numberOfVmsOnHostAccordingToDb == numberOfVmsOnAgent) { + return true; + } + if (numberOfVmsOnAgent == 0 && numberOfVmsOnHostAccordingToDb > CAUTIOUS_MARGIN_OF_VMS_ON_HOST) { + // Return false as could not find VMs running but it expected at least one VM running, fencing/recovering host would avoid downtime to VMs in this case. + // There is cautious margin added on the conditional. This avoids fencing/recovering hosts when there is one VM migrating to a host that had zero VMs. + // If there are more VMs than the CAUTIOUS_MARGIN_OF_VMS_ON_HOST) the Host should be treated as not healthy and fencing/recovering process might be triggered. + LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", agent, numberOfVmsOnHostAccordingToDb)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); + return false; + } + // In order to have a less "aggressive" health-check, the KvmHaAgentClient will not return false; fencing/recovering could bring downtime to existing VMs + // Additionally, the inconsistency can also be due to jobs in progress to migrate/stop/start VMs + // Either way, WARN messages should be presented to Admins so they can look closely to what is happening on the host + LOGGER.warn(String.format("KVM HA Agent %s listed %d VMs; however, it was expected %d VMs.", agent, numberOfVmsOnAgent, numberOfVmsOnHostAccordingToDb)); + return true; + } + + /** + * Executes a GET request for the given URL address. + */ + protected HttpResponse executeHttpRequest(String url) { + HttpGet httpReq = prepareHttpRequestForUrl(url); + if (httpReq == null) { + return null; + } + + HttpClient client = HttpClientBuilder.create().build(); + HttpResponse response = null; + try { + response = client.execute(httpReq); + } catch (IOException e) { + if (MAX_REQUEST_RETRIES == 0) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); + return null; + } + retryHttpRequest(url, httpReq, client); + } + return response; + } + + @Nullable + private HttpGet prepareHttpRequestForUrl(String url) { + HttpGet httpReq = null; + try { + URIBuilder builder = new URIBuilder(url); + httpReq = new HttpGet(builder.build()); + } catch (URISyntaxException e) { + LOGGER.error(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); + return null; + } + return httpReq; + } + + /** + * Re-executes the HTTP GET request until it gets a response or it reaches the maximum request retries {@link #MAX_REQUEST_RETRIES} + */ + protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, HttpClient client) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s]. Executing the request again.", httpReq.getMethod(), url)); + HttpResponse response = retryUntilGetsHttpResponse(url, httpReq, client); + + if (response == null) { + LOGGER.error(String.format("Failed to execute HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; + } + + int statusCode = response.getStatusLine().getStatusCode(); + if (statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES) { + LOGGER.error( + String.format("Failed to get VMs information with a %s request to URL '%s'. The expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, + EXPECTED_HTTP_STATUS, statusCode)); + return null; + } + + LOGGER.debug(String.format("Successfully executed HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; + } + + protected HttpResponse retryUntilGetsHttpResponse(String url, HttpRequestBase httpReq, HttpClient client) { + for (int attempt = 1; attempt < MAX_REQUEST_RETRIES + 1; attempt++) { + try { + TimeUnit.SECONDS.sleep(WAIT_FOR_REQUEST_RETRY); + LOGGER.debug(String.format("Retry HTTP %s request [URL: %s], attempt %d/%d.", httpReq.getMethod(), url, attempt, MAX_REQUEST_RETRIES)); + return client.execute(httpReq); + } catch (IOException | InterruptedException e) { + String errorMessage = String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", + httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, url, e); + LOGGER.error(errorMessage); + } + } + return null; + } + + /** + * Processes the response of request GET System ID as a JSON object.<br> + * Json example: {"count": 3, "virtualmachines": ["r-123-VM", "v-134-VM", "s-111-VM"]}<br><br> + * + * Note: this method can return NULL JsonObject in case HttpResponse is NULL. + */ + protected JsonObject processHttpResponseIntoJson(HttpResponse response) { + InputStream in; + String jsonString; + if (response == null) { + return null; + } + try { + in = response.getEntity().getContent(); + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); + jsonString = streamReader.readLine(); + } catch (UnsupportedOperationException | IOException e) { + throw new CloudRuntimeException("Failed to process response", e); + } + + return new JsonParser().parse(jsonString).getAsJsonObject(); Review comment: If needed, could we improve the exception with some `response` context? ########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java ########## @@ -0,0 +1,295 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import com.cloud.host.Host; +import com.cloud.utils.exception.CloudRuntimeException; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.VirtualMachine; +import com.cloud.vm.dao.VMInstanceDao; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.apache.commons.httpclient.HttpStatus; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.client.utils.URIBuilder; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.Nullable; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * This class provides a client that checks Agent status via a webserver. + * <br> + * The additional webserver exposes a simple JSON API which returns a list + * of Virtual Machines that are running on that host according to Libvirt. + * <br> + * This way, KVM HA can verify, via Libvirt, VMs status with an HTTP-call + * to this simple webserver and determine if the host is actually down + * or if it is just the Java Agent which has crashed. + */ +public class KvmHaAgentClient { + + private static final Logger LOGGER = Logger.getLogger(KvmHaAgentClient.class); + private static final int ERROR_CODE = -1; + private static final String EXPECTED_HTTP_STATUS = "2XX"; + private static final String VM_COUNT = "count"; + private static final String STATUS = "status"; + private static final String CHECK = "check"; + private static final String UP = "Up"; + private static final int WAIT_FOR_REQUEST_RETRY = 2; + private static final int MAX_REQUEST_RETRIES = 2; + private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; + private Host agent; + + /** + * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running according to the Libvirt + */ + public KvmHaAgentClient(Host agent) { + this.agent = agent; + } + + /** + * Returns the number of VMs running on the KVM host according to Libvirt. + */ + protected int countRunningVmsOnAgent() { + String url = String.format("http://%s:%d", agent.getPrivateIpAddress(), getKvmHaMicroservicePortValue()); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return ERROR_CODE; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return ERROR_CODE; + } + + return responseInJson.get(VM_COUNT).getAsInt(); + } + + /** + * Executes ping command from the host executing the KVM HA Agent webservice to a target IP Address. + * The webserver serves a JSON Object such as {"status": "Up"} if the IP address is reachable OR {"status": "Down"} if could not ping the IP + */ + protected boolean isTargetHostReachable(String ipAddress) { + int port = getKvmHaMicroservicePortValue(); + String url = String.format("http://%s:%d/%s/%s:%d", agent.getPrivateIpAddress(), port, CHECK, ipAddress, port); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return false; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return false; + } + + return UP.equals(responseInJson.get(STATUS).getAsString()); + } + + protected int getKvmHaMicroservicePortValue() { + Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); + if (haAgentPort == null) { + LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", + KVMHAConfig.KvmHaWebservicePort.defaultValue(), agent.getClusterId(), agent)); + haAgentPort = Integer.parseInt(KVMHAConfig.KvmHaWebservicePort.defaultValue()); + } + return haAgentPort; + } + + /** + * Checks if the KVM HA Webservice is enabled or not; if disabled then CloudStack ignores HA validation via the webservice. + */ + public boolean isKvmHaWebserviceEnabled() { + return KVMHAConfig.IsKvmHaWebserviceEnabled.value(); + } + + /** + * Lists VMs on host according to vm_instance DB table. The states considered for such listing are: 'Running', 'Stopping', 'Migrating'. + * <br> + * <br> + * Note that VMs on state 'Starting' are not common to be at the host, therefore this method does not list them. + * However, there is still a probability of a VM in 'Starting' state be already listed on the KVM via '$virsh list', + * but that's not likely and thus it is not relevant for this very context. + */ + protected List<VMInstanceVO> listVmsOnHost(Host host, VMInstanceDao vmInstanceDao) { + List<VMInstanceVO> listByHostAndStateRunning = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running); + List<VMInstanceVO> listByHostAndStateStopping = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Stopping); + List<VMInstanceVO> listByHostAndStateMigrating = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Migrating); + + List<VMInstanceVO> listByHostAndState = new ArrayList<>(); + listByHostAndState.addAll(listByHostAndStateRunning); + listByHostAndState.addAll(listByHostAndStateStopping); + listByHostAndState.addAll(listByHostAndStateMigrating); + + if (LOGGER.isTraceEnabled()) { + List<VMInstanceVO> listByHostAndStateStarting = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Starting); + int startingVMs = listByHostAndStateStarting.size(); + int runningVMs = listByHostAndStateRunning.size(); + int stoppingVms = listByHostAndStateStopping.size(); + int migratingVms = listByHostAndStateMigrating.size(); + int countRunningVmsOnAgent = countRunningVmsOnAgent(); + LOGGER.trace( + String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", agent.getName(), startingVMs, runningVMs, + stoppingVms, migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); + } + + return listByHostAndState; + } + + /** + * Returns true in case of the expected number of VMs matches with the VMs running on the KVM host according to Libvirt. <br><br> + * + * IF: <br> + * (i) KVM HA agent finds 0 running but CloudStack considers that the host has 2 or more VMs running: returns false as could not find VMs running but it expected at least + * 2 VMs running, fencing/recovering host would avoid downtime to VMs in this case.<br> + * (ii) KVM HA agent finds 0 VM running but CloudStack considers that the host has 1 VM running: return true and log WARN messages and avoids triggering HA recovery/fencing + * when it could be a inconsistency when migrating a VM.<br> + * (iii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly + */ + public boolean isKvmHaAgentHealthy(Host host, VMInstanceDao vmInstanceDao) { + int numberOfVmsOnHostAccordingToDb = listVmsOnHost(host, vmInstanceDao).size(); + int numberOfVmsOnAgent = countRunningVmsOnAgent(); + if (numberOfVmsOnAgent < 0) { + LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed.", agent)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); + return false; + } + if (numberOfVmsOnHostAccordingToDb == numberOfVmsOnAgent) { + return true; + } + if (numberOfVmsOnAgent == 0 && numberOfVmsOnHostAccordingToDb > CAUTIOUS_MARGIN_OF_VMS_ON_HOST) { + // Return false as could not find VMs running but it expected at least one VM running, fencing/recovering host would avoid downtime to VMs in this case. + // There is cautious margin added on the conditional. This avoids fencing/recovering hosts when there is one VM migrating to a host that had zero VMs. + // If there are more VMs than the CAUTIOUS_MARGIN_OF_VMS_ON_HOST) the Host should be treated as not healthy and fencing/recovering process might be triggered. + LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", agent, numberOfVmsOnHostAccordingToDb)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); + return false; + } + // In order to have a less "aggressive" health-check, the KvmHaAgentClient will not return false; fencing/recovering could bring downtime to existing VMs + // Additionally, the inconsistency can also be due to jobs in progress to migrate/stop/start VMs + // Either way, WARN messages should be presented to Admins so they can look closely to what is happening on the host + LOGGER.warn(String.format("KVM HA Agent %s listed %d VMs; however, it was expected %d VMs.", agent, numberOfVmsOnAgent, numberOfVmsOnHostAccordingToDb)); + return true; + } + + /** + * Executes a GET request for the given URL address. + */ + protected HttpResponse executeHttpRequest(String url) { + HttpGet httpReq = prepareHttpRequestForUrl(url); + if (httpReq == null) { + return null; + } + + HttpClient client = HttpClientBuilder.create().build(); + HttpResponse response = null; + try { + response = client.execute(httpReq); + } catch (IOException e) { + if (MAX_REQUEST_RETRIES == 0) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); + return null; + } + retryHttpRequest(url, httpReq, client); + } + return response; + } + + @Nullable + private HttpGet prepareHttpRequestForUrl(String url) { + HttpGet httpReq = null; + try { + URIBuilder builder = new URIBuilder(url); + httpReq = new HttpGet(builder.build()); + } catch (URISyntaxException e) { + LOGGER.error(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); + return null; + } + return httpReq; + } + + /** + * Re-executes the HTTP GET request until it gets a response or it reaches the maximum request retries {@link #MAX_REQUEST_RETRIES} + */ + protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, HttpClient client) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s]. Executing the request again.", httpReq.getMethod(), url)); + HttpResponse response = retryUntilGetsHttpResponse(url, httpReq, client); + + if (response == null) { + LOGGER.error(String.format("Failed to execute HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; + } + + int statusCode = response.getStatusLine().getStatusCode(); + if (statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES) { + LOGGER.error( + String.format("Failed to get VMs information with a %s request to URL '%s'. The expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, + EXPECTED_HTTP_STATUS, statusCode)); + return null; + } + + LOGGER.debug(String.format("Successfully executed HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; + } + + protected HttpResponse retryUntilGetsHttpResponse(String url, HttpRequestBase httpReq, HttpClient client) { + for (int attempt = 1; attempt < MAX_REQUEST_RETRIES + 1; attempt++) { + try { + TimeUnit.SECONDS.sleep(WAIT_FOR_REQUEST_RETRY); + LOGGER.debug(String.format("Retry HTTP %s request [URL: %s], attempt %d/%d.", httpReq.getMethod(), url, attempt, MAX_REQUEST_RETRIES)); + return client.execute(httpReq); + } catch (IOException | InterruptedException e) { + String errorMessage = String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", + httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, url, e); + LOGGER.error(errorMessage); + } + } + return null; + } + + /** + * Processes the response of request GET System ID as a JSON object.<br> + * Json example: {"count": 3, "virtualmachines": ["r-123-VM", "v-134-VM", "s-111-VM"]}<br><br> + * + * Note: this method can return NULL JsonObject in case HttpResponse is NULL. + */ + protected JsonObject processHttpResponseIntoJson(HttpResponse response) { + InputStream in; + String jsonString; + if (response == null) { + return null; + } + try { + in = response.getEntity().getContent(); + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); + jsonString = streamReader.readLine(); + } catch (UnsupportedOperationException | IOException e) { + throw new CloudRuntimeException("Failed to process response", e); + } + + return new JsonParser().parse(jsonString).getAsJsonObject(); Review comment: ```suggestion if (response == null) { return null; } try { InputStream in = response.getEntity().getContent(); BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); String jsonString = streamReader.readLine(); return new JsonParser().parse(jsonString).getAsJsonObject(); } catch (UnsupportedOperationException | IOException e) { throw new CloudRuntimeException("Failed to process response", e); } ``` ########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java ########## @@ -0,0 +1,295 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import com.cloud.host.Host; +import com.cloud.utils.exception.CloudRuntimeException; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.VirtualMachine; +import com.cloud.vm.dao.VMInstanceDao; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.apache.commons.httpclient.HttpStatus; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.client.utils.URIBuilder; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.Nullable; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * This class provides a client that checks Agent status via a webserver. + * <br> + * The additional webserver exposes a simple JSON API which returns a list + * of Virtual Machines that are running on that host according to Libvirt. + * <br> + * This way, KVM HA can verify, via Libvirt, VMs status with an HTTP-call + * to this simple webserver and determine if the host is actually down + * or if it is just the Java Agent which has crashed. + */ +public class KvmHaAgentClient { + + private static final Logger LOGGER = Logger.getLogger(KvmHaAgentClient.class); + private static final int ERROR_CODE = -1; + private static final String EXPECTED_HTTP_STATUS = "2XX"; + private static final String VM_COUNT = "count"; + private static final String STATUS = "status"; + private static final String CHECK = "check"; + private static final String UP = "Up"; + private static final int WAIT_FOR_REQUEST_RETRY = 2; + private static final int MAX_REQUEST_RETRIES = 2; + private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; + private Host agent; + + /** + * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running according to the Libvirt + */ + public KvmHaAgentClient(Host agent) { + this.agent = agent; + } + + /** + * Returns the number of VMs running on the KVM host according to Libvirt. + */ + protected int countRunningVmsOnAgent() { + String url = String.format("http://%s:%d", agent.getPrivateIpAddress(), getKvmHaMicroservicePortValue()); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return ERROR_CODE; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return ERROR_CODE; + } + + return responseInJson.get(VM_COUNT).getAsInt(); + } + + /** + * Executes ping command from the host executing the KVM HA Agent webservice to a target IP Address. + * The webserver serves a JSON Object such as {"status": "Up"} if the IP address is reachable OR {"status": "Down"} if could not ping the IP + */ + protected boolean isTargetHostReachable(String ipAddress) { + int port = getKvmHaMicroservicePortValue(); + String url = String.format("http://%s:%d/%s/%s:%d", agent.getPrivateIpAddress(), port, CHECK, ipAddress, port); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return false; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return false; + } + + return UP.equals(responseInJson.get(STATUS).getAsString()); + } + + protected int getKvmHaMicroservicePortValue() { + Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); + if (haAgentPort == null) { + LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", + KVMHAConfig.KvmHaWebservicePort.defaultValue(), agent.getClusterId(), agent)); + haAgentPort = Integer.parseInt(KVMHAConfig.KvmHaWebservicePort.defaultValue()); + } + return haAgentPort; + } + + /** + * Checks if the KVM HA Webservice is enabled or not; if disabled then CloudStack ignores HA validation via the webservice. + */ + public boolean isKvmHaWebserviceEnabled() { + return KVMHAConfig.IsKvmHaWebserviceEnabled.value(); + } + + /** + * Lists VMs on host according to vm_instance DB table. The states considered for such listing are: 'Running', 'Stopping', 'Migrating'. + * <br> + * <br> + * Note that VMs on state 'Starting' are not common to be at the host, therefore this method does not list them. + * However, there is still a probability of a VM in 'Starting' state be already listed on the KVM via '$virsh list', + * but that's not likely and thus it is not relevant for this very context. + */ + protected List<VMInstanceVO> listVmsOnHost(Host host, VMInstanceDao vmInstanceDao) { + List<VMInstanceVO> listByHostAndStateRunning = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running); + List<VMInstanceVO> listByHostAndStateStopping = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Stopping); + List<VMInstanceVO> listByHostAndStateMigrating = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Migrating); + + List<VMInstanceVO> listByHostAndState = new ArrayList<>(); + listByHostAndState.addAll(listByHostAndStateRunning); + listByHostAndState.addAll(listByHostAndStateStopping); + listByHostAndState.addAll(listByHostAndStateMigrating); + + if (LOGGER.isTraceEnabled()) { + List<VMInstanceVO> listByHostAndStateStarting = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Starting); + int startingVMs = listByHostAndStateStarting.size(); + int runningVMs = listByHostAndStateRunning.size(); + int stoppingVms = listByHostAndStateStopping.size(); + int migratingVms = listByHostAndStateMigrating.size(); + int countRunningVmsOnAgent = countRunningVmsOnAgent(); + LOGGER.trace( + String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", agent.getName(), startingVMs, runningVMs, + stoppingVms, migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); + } + + return listByHostAndState; + } + + /** + * Returns true in case of the expected number of VMs matches with the VMs running on the KVM host according to Libvirt. <br><br> + * + * IF: <br> + * (i) KVM HA agent finds 0 running but CloudStack considers that the host has 2 or more VMs running: returns false as could not find VMs running but it expected at least + * 2 VMs running, fencing/recovering host would avoid downtime to VMs in this case.<br> + * (ii) KVM HA agent finds 0 VM running but CloudStack considers that the host has 1 VM running: return true and log WARN messages and avoids triggering HA recovery/fencing + * when it could be a inconsistency when migrating a VM.<br> + * (iii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly + */ + public boolean isKvmHaAgentHealthy(Host host, VMInstanceDao vmInstanceDao) { + int numberOfVmsOnHostAccordingToDb = listVmsOnHost(host, vmInstanceDao).size(); + int numberOfVmsOnAgent = countRunningVmsOnAgent(); + if (numberOfVmsOnAgent < 0) { + LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed.", agent)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); + return false; + } + if (numberOfVmsOnHostAccordingToDb == numberOfVmsOnAgent) { + return true; + } + if (numberOfVmsOnAgent == 0 && numberOfVmsOnHostAccordingToDb > CAUTIOUS_MARGIN_OF_VMS_ON_HOST) { + // Return false as could not find VMs running but it expected at least one VM running, fencing/recovering host would avoid downtime to VMs in this case. + // There is cautious margin added on the conditional. This avoids fencing/recovering hosts when there is one VM migrating to a host that had zero VMs. + // If there are more VMs than the CAUTIOUS_MARGIN_OF_VMS_ON_HOST) the Host should be treated as not healthy and fencing/recovering process might be triggered. + LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", agent, numberOfVmsOnHostAccordingToDb)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); + return false; + } + // In order to have a less "aggressive" health-check, the KvmHaAgentClient will not return false; fencing/recovering could bring downtime to existing VMs + // Additionally, the inconsistency can also be due to jobs in progress to migrate/stop/start VMs + // Either way, WARN messages should be presented to Admins so they can look closely to what is happening on the host + LOGGER.warn(String.format("KVM HA Agent %s listed %d VMs; however, it was expected %d VMs.", agent, numberOfVmsOnAgent, numberOfVmsOnHostAccordingToDb)); + return true; + } + + /** + * Executes a GET request for the given URL address. + */ + protected HttpResponse executeHttpRequest(String url) { + HttpGet httpReq = prepareHttpRequestForUrl(url); + if (httpReq == null) { + return null; + } + + HttpClient client = HttpClientBuilder.create().build(); + HttpResponse response = null; + try { + response = client.execute(httpReq); + } catch (IOException e) { + if (MAX_REQUEST_RETRIES == 0) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); + return null; + } + retryHttpRequest(url, httpReq, client); + } + return response; + } + + @Nullable + private HttpGet prepareHttpRequestForUrl(String url) { + HttpGet httpReq = null; + try { + URIBuilder builder = new URIBuilder(url); + httpReq = new HttpGet(builder.build()); + } catch (URISyntaxException e) { + LOGGER.error(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); + return null; + } + return httpReq; + } + + /** + * Re-executes the HTTP GET request until it gets a response or it reaches the maximum request retries {@link #MAX_REQUEST_RETRIES} + */ + protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, HttpClient client) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s]. Executing the request again.", httpReq.getMethod(), url)); + HttpResponse response = retryUntilGetsHttpResponse(url, httpReq, client); + + if (response == null) { + LOGGER.error(String.format("Failed to execute HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; + } + + int statusCode = response.getStatusLine().getStatusCode(); + if (statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES) { + LOGGER.error( + String.format("Failed to get VMs information with a %s request to URL '%s'. The expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, + EXPECTED_HTTP_STATUS, statusCode)); + return null; + } + + LOGGER.debug(String.format("Successfully executed HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; + } + + protected HttpResponse retryUntilGetsHttpResponse(String url, HttpRequestBase httpReq, HttpClient client) { + for (int attempt = 1; attempt < MAX_REQUEST_RETRIES + 1; attempt++) { + try { + TimeUnit.SECONDS.sleep(WAIT_FOR_REQUEST_RETRY); + LOGGER.debug(String.format("Retry HTTP %s request [URL: %s], attempt %d/%d.", httpReq.getMethod(), url, attempt, MAX_REQUEST_RETRIES)); + return client.execute(httpReq); + } catch (IOException | InterruptedException e) { + String errorMessage = String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", + httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, url, e); + LOGGER.error(errorMessage); + } + } + return null; + } + + /** + * Processes the response of request GET System ID as a JSON object.<br> + * Json example: {"count": 3, "virtualmachines": ["r-123-VM", "v-134-VM", "s-111-VM"]}<br><br> + * + * Note: this method can return NULL JsonObject in case HttpResponse is NULL. + */ + protected JsonObject processHttpResponseIntoJson(HttpResponse response) { + InputStream in; + String jsonString; + if (response == null) { + return null; + } + try { + in = response.getEntity().getContent(); + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); + jsonString = streamReader.readLine(); + } catch (UnsupportedOperationException | IOException e) { + throw new CloudRuntimeException("Failed to process response", e); + } + + return new JsonParser().parse(jsonString).getAsJsonObject(); Review comment: Seems to me that this method will be called several times, should we extract this `new JsonParser()` to a constant? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
