GutoVeronezi commented on a change in pull request #4978: URL: https://github.com/apache/cloudstack/pull/4978#discussion_r657921262
########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java ########## @@ -0,0 +1,256 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import com.cloud.host.Host; +import com.cloud.host.Status; +import com.cloud.utils.exception.CloudRuntimeException; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.VirtualMachine; +import com.cloud.vm.dao.VMInstanceDao; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.apache.commons.httpclient.HttpStatus; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.client.utils.URIBuilder; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.Nullable; + +import javax.inject.Inject; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * This class provides a client that checks Agent status via a webserver. + * <br> + * The additional webserver exposes a simple JSON API which returns a list + * of Virtual Machines that are running on that host according to Libvirt. + * <br> + * This way, KVM HA can verify, via Libvirt, VMs status with an HTTP-call + * to this simple webserver and determine if the host is actually down + * or if it is just the Java Agent which has crashed. + */ +public class KvmHaAgentClient { + + private static final Logger LOGGER = Logger.getLogger(KvmHaAgentClient.class); + private static final int ERROR_CODE = -1; + private static final String EXPECTED_HTTP_STATUS = "2XX"; + private static final String VM_COUNT = "count"; + private static final String STATUS = "status"; + private static final String CHECK_NEIGHBOUR = "check-neighbour"; + private static final int WAIT_FOR_REQUEST_RETRY = 2; + private static final int MAX_REQUEST_RETRIES = 2; + private static final JsonParser JSON_PARSER = new JsonParser(); + + @Inject + private VMInstanceDao vmInstanceDao; + + /** + * Returns the number of VMs running on the KVM host according to Libvirt. + */ + public int countRunningVmsOnAgent(Host host) { + String url = String.format("http://%s:%d", host.getPrivateIpAddress(), getKvmHaMicroservicePortValue(host)); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return ERROR_CODE; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return ERROR_CODE; + } + + return responseInJson.get(VM_COUNT).getAsInt(); + } + + protected int getKvmHaMicroservicePortValue(Host host) { + Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); + if (haAgentPort == null) { + LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", + KVMHAConfig.KvmHaWebservicePort.defaultValue(), host.getClusterId(), host)); + haAgentPort = Integer.parseInt(KVMHAConfig.KvmHaWebservicePort.defaultValue()); + } + return haAgentPort; + } + + /** + * Lists VMs on host according to vm_instance DB table. The states considered for such listing are: 'Running', 'Stopping', 'Migrating'. + * <br> + * <br> + * Note that VMs on state 'Starting' are not common to be at the host, therefore this method does not list them. + * However, there is still a probability of a VM in 'Starting' state be already listed on the KVM via '$virsh list', + * but that's not likely and thus it is not relevant for this very context. + */ + public List<VMInstanceVO> listVmsOnHost(Host host) { + List<VMInstanceVO> listByHostAndStates = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running, VirtualMachine.State.Stopping, VirtualMachine.State.Migrating); + + if (LOGGER.isTraceEnabled()) { + List<VMInstanceVO> listByHostAndStateStarting = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Starting); + int startingVMs = listByHostAndStateStarting.size(); + long runningVMs = listByHostAndStates.stream().filter(vm -> vm.getState() == VirtualMachine.State.Running).count(); + long stoppingVms = listByHostAndStates.stream().filter(vm -> vm.getState() == VirtualMachine.State.Stopping).count(); + long migratingVms = listByHostAndStates.stream().filter(vm -> vm.getState() == VirtualMachine.State.Migrating).count(); + int countRunningVmsOnAgent = countRunningVmsOnAgent(host); + LOGGER.trace( + String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", host.getName(), startingVMs, runningVMs, + stoppingVms, migratingVms, listByHostAndStates.size(), countRunningVmsOnAgent)); + } + + return listByHostAndStates; + } + + /** + * Sends HTTP GET request from the host executing the KVM HA Agent webservice to a target Host (expected to also be running the KVM HA Agent). + * The webserver serves a JSON Object such as {"status": "Up"} if the request gets a HTTP_OK OR {"status": "Down"} if HTTP GET failed + */ + public boolean isHostReachableByNeighbour(Host neighbour, Host target) { + String neighbourHostAddress = neighbour.getPrivateIpAddress(); + String targetHostAddress = target.getPrivateIpAddress(); + int port = getKvmHaMicroservicePortValue(neighbour); + String url = String.format("http://%s:%d/%s/%s:%d", neighbourHostAddress, port, CHECK_NEIGHBOUR, targetHostAddress, port); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return false; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) + return false; + + int statusCode = response.getStatusLine().getStatusCode(); + if (isHttpStatusCodNotOk(statusCode)) { + LOGGER.error( + String.format("Failed HTTP %s Request %s; the expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, EXPECTED_HTTP_STATUS, statusCode)); + return false; + } + + String hostStatusFromJson = responseInJson.get(STATUS).getAsString(); + return Status.Up.toString().equals(hostStatusFromJson); + } + + protected boolean isHttpStatusCodNotOk(int statusCode) { + return statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES; + } + + /** + * Executes a GET request for the given URL address. + */ + @Nullable + protected HttpResponse executeHttpRequest(String url) { + HttpGet httpReq = prepareHttpRequestForUrl(url); + if (httpReq == null) { + return null; + } + + HttpClient client = HttpClientBuilder.create().build(); + HttpResponse response = null; + try { + response = client.execute(httpReq); + } catch (IOException e) { + if (MAX_REQUEST_RETRIES == 0) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); + return null; + } + response = retryHttpRequest(url, httpReq, client); + } + return response; + } + + @Nullable + private HttpGet prepareHttpRequestForUrl(String url) { + HttpGet httpReq = null; + try { + URIBuilder builder = new URIBuilder(url); + httpReq = new HttpGet(builder.build()); + } catch (URISyntaxException e) { + LOGGER.error(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); + return null; + } + return httpReq; Review comment: ```suggestion try { URIBuilder builder = new URIBuilder(url); return new HttpGet(builder.build()); } catch (URISyntaxException e) { LOGGER.error(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); return null; } ``` ########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java ########## @@ -59,29 +68,63 @@ @Inject private AgentManager agentMgr; @Inject - private PrimaryDataStoreDao storagePool; - @Inject private StorageManager storageManager; @Inject + private PrimaryDataStoreDao storagePool; + @Inject private ResourceManager resourceManager; + @Inject + private StoragePoolHostDao storagePoolHostDao; + @Inject + private KvmHaHelper kvmHaHelper; + + private static final Set<Storage.StoragePoolType> NFS_POOL_TYPE = new HashSet<>(Arrays.asList(Storage.StoragePoolType.NetworkFilesystem, Storage.StoragePoolType.ManagedNFS)); + private static final Set<Hypervisor.HypervisorType> KVM_OR_LXC = new HashSet<>(Arrays.asList(Hypervisor.HypervisorType.KVM, Hypervisor.HypervisorType.LXC)); @Override - public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException { + public boolean isActive(Host host, DateTime suspectTime) throws HACheckerException { try { - return isVMActivtyOnHost(r, suspectTime); + return isVMActivtyOnHost(host, suspectTime); } catch (HACheckerException e) { //Re-throwing the exception to avoid poluting the 'HACheckerException' already thrown throw e; - } catch (Exception e){ - String message = String.format("Operation timed out, probably the %s is not reachable.", r.toString()); + } catch (Exception e) { + String message = String.format("Operation timed out, probably the %s is not reachable.", host.toString()); LOG.warn(message, e); throw new HACheckerException(message, e); } } @Override - public boolean isHealthy(Host r) { - return isAgentActive(r); + public boolean isHealthy(Host host) { + boolean isHealthy = true; + boolean isHostServedByNfsPool = isHostServedByNfsPool(host); + boolean isKvmHaWebserviceEnabled = kvmHaHelper.isKvmHaWebserviceEnabled(host); + + if (isHostServedByNfsPool) { + isHealthy = isHealthViaNfs(host); + } + + if (!isKvmHaWebserviceEnabled) { + return isHealthy; + } + + if (kvmHaHelper.isKvmHealthyCheckViaLibvirt(host) && !isHealthy) { + return true; + } + + return isHealthy; + } + + private boolean isHealthViaNfs(Host r) { + boolean isHealthy = true; + if (isHostServedByNfsPool(r)) { + isHealthy = isAgentActive(r); + if (!isHealthy) { + LOG.warn(String.format("NFS storage health check failed for %s. It seems that a storage does not have activity.", r.toString())); + } + } Review comment: ```suggestion private boolean isHealthViaNfs(Host r) { if (!isHostServedByNfsPool(r)) { return true; } boolean isHealthy = isAgentActive(r); if (!isHealthy) { LOG.warn(String.format("NFS storage health check failed for %s. It seems that a storage does not have activity.", r.toString())); } ``` ########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java ########## @@ -59,29 +68,63 @@ @Inject private AgentManager agentMgr; @Inject - private PrimaryDataStoreDao storagePool; - @Inject private StorageManager storageManager; @Inject + private PrimaryDataStoreDao storagePool; + @Inject private ResourceManager resourceManager; + @Inject + private StoragePoolHostDao storagePoolHostDao; + @Inject + private KvmHaHelper kvmHaHelper; + + private static final Set<Storage.StoragePoolType> NFS_POOL_TYPE = new HashSet<>(Arrays.asList(Storage.StoragePoolType.NetworkFilesystem, Storage.StoragePoolType.ManagedNFS)); + private static final Set<Hypervisor.HypervisorType> KVM_OR_LXC = new HashSet<>(Arrays.asList(Hypervisor.HypervisorType.KVM, Hypervisor.HypervisorType.LXC)); @Override - public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException { + public boolean isActive(Host host, DateTime suspectTime) throws HACheckerException { try { - return isVMActivtyOnHost(r, suspectTime); + return isVMActivtyOnHost(host, suspectTime); } catch (HACheckerException e) { //Re-throwing the exception to avoid poluting the 'HACheckerException' already thrown throw e; - } catch (Exception e){ - String message = String.format("Operation timed out, probably the %s is not reachable.", r.toString()); + } catch (Exception e) { + String message = String.format("Operation timed out, probably the %s is not reachable.", host.toString()); LOG.warn(message, e); throw new HACheckerException(message, e); } } @Override - public boolean isHealthy(Host r) { - return isAgentActive(r); + public boolean isHealthy(Host host) { + boolean isHealthy = true; + boolean isHostServedByNfsPool = isHostServedByNfsPool(host); + boolean isKvmHaWebserviceEnabled = kvmHaHelper.isKvmHaWebserviceEnabled(host); + + if (isHostServedByNfsPool) { + isHealthy = isHealthViaNfs(host); + } + + if (!isKvmHaWebserviceEnabled) { + return isHealthy; + } + + if (kvmHaHelper.isKvmHealthyCheckViaLibvirt(host) && !isHealthy) { + return true; + } + + return isHealthy; + } + + private boolean isHealthViaNfs(Host r) { + boolean isHealthy = true; + if (isHostServedByNfsPool(r)) { + isHealthy = isAgentActive(r); + if (!isHealthy) { + LOG.warn(String.format("NFS storage health check failed for %s. It seems that a storage does not have activity.", r.toString())); + } + } Review comment: This was suggested to avoid nested `if`. Maybe we could add some log to the first return too, like `...host is not served by a NFS pool...`. ########## File path: plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java ########## @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import com.cloud.dc.ClusterVO; +import com.cloud.dc.dao.ClusterDao; +import com.cloud.host.Host; +import com.cloud.host.HostVO; +import com.cloud.host.Status; +import com.cloud.resource.ResourceManager; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.NotNull; + +import javax.inject.Inject; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * This class provides methods that help the KVM HA process on checking hosts status as well as deciding if a host should be fenced/recovered or not. + */ +public class KvmHaHelper { + + @Inject + protected ResourceManager resourceManager; + @Inject + protected KvmHaAgentClient kvmHaAgentClient; + @Inject + protected ClusterDao clusterDao; + + private static final Logger LOGGER = Logger.getLogger(KvmHaHelper.class); + private static final double PROBLEMATIC_HOSTS_RATIO_ACCEPTED = 0.3; + private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; + + private static final Set<Status> PROBLEMATIC_HOST_STATUS = new HashSet<>(Arrays.asList(Status.Alert, Status.Disconnected, Status.Down, Status.Error)); + + /** + * It checks the KVM node status via KVM HA Agent. + * If the agent is healthy it returns Status.Up, otherwise it keeps the provided Status as it is. + */ + public Status checkAgentStatusViaKvmHaAgent(Host host, Status agentStatus) { + boolean isVmsCountOnKvmMatchingWithDatabase = isKvmHaAgentHealthy(host); + if (isVmsCountOnKvmMatchingWithDatabase) { + agentStatus = Status.Up; + LOGGER.debug(String.format("Checking agent %s status; KVM HA Agent is Running as expected.", agentStatus)); + } else { + LOGGER.warn(String.format("Checking agent %s status. Failed to check host status via KVM HA Agent", agentStatus)); + } + return agentStatus; + } + + /** + * Given a List of Hosts, it lists Hosts that are in the following states: + * <ul> + * <li> Status.Alert; + * <li> Status.Disconnected; + * <li> Status.Down; + * <li> Status.Error. + * </ul> + */ + @NotNull + protected List<HostVO> listProblematicHosts(List<HostVO> hostsInCluster) { + return hostsInCluster.stream().filter(neighbour -> PROBLEMATIC_HOST_STATUS.contains(neighbour.getStatus())).collect(Collectors.toList()); + } + + /** + * Returns false if the cluster has no problematic hosts or a small fraction of it.<br><br> + * Returns true if the cluster is problematic. A cluster is problematic if many hosts are in Down or Disconnected states, in such case it should not recover/fence.<br> + * Instead, Admins should be warned and check as it could be networking problems and also might not even have resources capacity on the few Healthy hosts at the cluster. + * <br><br> + * Admins can change the accepted ration of problematic hosts via global settings by updating configuration: "kvm.ha.accepted.problematic.hosts.ratio". + */ + protected boolean isClusteProblematic(Host host) { + List<HostVO> hostsInCluster = resourceManager.listAllHostsInCluster(host.getClusterId()); + List<HostVO> problematicNeighbors = listProblematicHosts(hostsInCluster); + int problematicHosts = problematicNeighbors.size(); + int problematicHostsRatioAccepted = (int) (hostsInCluster.size() * KVMHAConfig.KvmHaAcceptedProblematicHostsRatio.value()); + + if (problematicHosts > problematicHostsRatioAccepted) { + ClusterVO cluster = clusterDao.findById(host.getClusterId()); + LOGGER.warn(String.format("%s is problematic but HA will not fence/recover due to its cluster [id: %d, name: %s] containing %d problematic hosts (Down, Disconnected, " + + "Alert or Error states). Maximum problematic hosts accepted for this cluster is %d.", + host, cluster.getId(), cluster.getName(), problematicHosts, problematicHostsRatioAccepted)); + return true; + } + return false; + } + + /** + * Returns true if the given Host KVM-HA-Helper is reachable by another host in the same cluster. + */ + protected boolean isHostAgentReachableByNeighbour(Host host) { + List<HostVO> neighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up); + for (HostVO neighbor : neighbors) { + boolean isVmActivtyOnNeighborHost = isKvmHaAgentHealthy(neighbor); + if (isVmActivtyOnNeighborHost) { + boolean isReachable = kvmHaAgentClient.isHostReachableByNeighbour(neighbor, host); + if (isReachable) { + String.format("%s is reachable by neighbour %s. If CloudStack is failing to reach the respective host then it is probably a network issue between the host " + + "and CloudStack management server.", host, neighbor); + return true; + } + } + } + return false; + } + + /** + * Returns true if the host is healthy. The health-check is performed via HTTP GET request to a service that retrieves Running KVM instances via Libvirt. <br> + * The health-check is executed on the KVM node and verifies the amount of VMs running and if the Libvirt service is running. + */ + public boolean isKvmHealthyCheckViaLibvirt(Host host) { + boolean isKvmHaAgentHealthy = isKvmHaAgentHealthy(host); + + if (!isKvmHaAgentHealthy) { + if (isClusteProblematic(host) || isHostAgentReachableByNeighbour(host)) { + return true; + } + } + + return isKvmHaAgentHealthy; Review comment: We could use a ternary on return here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
