rhtyd closed pull request #3095: Prevent corner case for infinite PrepareForMaintenance URL: https://github.com/apache/cloudstack/pull/3095
This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/engine/components-api/src/com/cloud/resource/ResourceManager.java b/engine/components-api/src/com/cloud/resource/ResourceManager.java index 1f7d3cbea54..720a980f4e7 100755 --- a/engine/components-api/src/com/cloud/resource/ResourceManager.java +++ b/engine/components-api/src/com/cloud/resource/ResourceManager.java @@ -38,12 +38,20 @@ import com.cloud.hypervisor.Hypervisor.HypervisorType; import com.cloud.resource.ResourceState.Event; import com.cloud.utils.fsm.NoTransitionException; +import org.apache.cloudstack.framework.config.ConfigKey; +import org.apache.cloudstack.framework.config.Configurable; /** * ResourceManager manages how physical resources are organized within the * CloudStack. It also manages the life cycle of the physical resources. */ -public interface ResourceManager extends ResourceService { +public interface ResourceManager extends ResourceService, Configurable { + + ConfigKey<Integer> HostMaintenanceRetries = new ConfigKey<>("Advanced", Integer.class, + "host.maintenance.retries","20", + "Number of retries when preparing a host into Maintenance Mode is faulty before failing", + true, ConfigKey.Scope.Cluster); + /** * Register a listener for different types of resource life cycle events. * There can only be one type of listener per type of host. diff --git a/server/src/com/cloud/resource/ResourceManagerImpl.java b/server/src/com/cloud/resource/ResourceManagerImpl.java index ba3c157cf9f..59a7fa85d7a 100755 --- a/server/src/com/cloud/resource/ResourceManagerImpl.java +++ b/server/src/com/cloud/resource/ResourceManagerImpl.java @@ -26,11 +26,13 @@ import java.util.List; import java.util.Map; import java.util.Random; +import java.util.concurrent.ConcurrentHashMap; import javax.inject.Inject; import javax.naming.ConfigurationException; import com.cloud.vm.dao.UserVmDetailsDao; +import org.apache.cloudstack.framework.config.ConfigKey; import org.apache.commons.lang.ObjectUtils; import org.apache.log4j.Logger; import org.springframework.stereotype.Component; @@ -271,6 +273,8 @@ public void setDiscoverers(final List<? extends Discoverer> discoverers) { private SearchBuilder<HostGpuGroupsVO> _gpuAvailability; + private Map<Long,Integer> retryHostMaintenance = new ConcurrentHashMap<>(); + private void insertListener(final Integer event, final ResourceListener listener) { List<ResourceListener> lst = _lifeCycleListeners.get(event); if (lst == null) { @@ -1224,6 +1228,7 @@ private boolean doMaintain(final long hostId) { ActionEventUtils.onStartedActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(), EventTypes.EVENT_MAINTENANCE_PREPARE, "starting maintenance for host " + hostId, true, 0); _agentMgr.pullAgentToMaintenance(hostId); + setHostMaintenanceRetries(host); /* TODO: move below to listener */ if (host.getType() == Host.Type.Routing) { @@ -1251,6 +1256,16 @@ private boolean doMaintain(final long hostId) { return true; } + /** + * Set retries for transiting the host into Maintenance + */ + protected void setHostMaintenanceRetries(HostVO host) { + Integer retries = HostMaintenanceRetries.valueIn(host.getClusterId()); + retryHostMaintenance.put(host.getId(), retries); + s_logger.debug(String.format("Setting the host %s (%s) retries for Maintenance mode: %s", + host.getId(), host.getName(), retries)); + } + @Override public boolean maintain(final long hostId) throws AgentUnavailableException { final Boolean result = propagateResourceEvent(hostId, ResourceState.Event.AdminAskMaintenace); @@ -1350,7 +1365,23 @@ protected boolean isHostInMaintenance(HostVO host, List<VMInstanceVO> runningVms return CollectionUtils.isEmpty(failedMigrations) ? setHostIntoMaintenance(host) : setHostIntoErrorInMaintenance(host, failedMigrations); + } else if (retryHostMaintenance.containsKey(host.getId())) { + Integer retriesLeft = retryHostMaintenance.get(host.getId()); + if (retriesLeft != null) { + if (retriesLeft <= 0) { + retryHostMaintenance.remove(host.getId()); + s_logger.debug(String.format("No retries left while preparing KVM host %s (%s) for Maintenance, " + + "please investigate this connection.", + host.getId(), host.getName())); + return setHostIntoErrorInMaintenance(host, failedMigrations); + } + retriesLeft--; + retryHostMaintenance.put(host.getId(), retriesLeft); + s_logger.debug(String.format("Retries left preparing KVM host %s (%s) for Maintenance: %s", + host.getId(), host.getName(), retriesLeft)); + } } + return false; } @@ -2316,6 +2347,7 @@ private boolean doCancelMaintenance(final long hostId) { try { resourceStateTransitTo(host, ResourceState.Event.AdminCancelMaintenance, _nodeId); _agentMgr.pullAgentOutMaintenance(hostId); + retryHostMaintenance.remove(hostId); // for kvm, need to log into kvm host, restart cloudstack-agent if ((host.getHypervisorType() == HypervisorType.KVM && !vms_migrating) || host.getHypervisorType() == HypervisorType.LXC) { @@ -2924,4 +2956,13 @@ public boolean start() { return super.start(); } + @Override + public String getConfigComponentName() { + return ResourceManagerImpl.class.getSimpleName(); + } + + @Override + public ConfigKey<?>[] getConfigKeys() { + return new ConfigKey<?>[] {HostMaintenanceRetries}; + } } diff --git a/server/test/com/cloud/resource/MockResourceManagerImpl.java b/server/test/com/cloud/resource/MockResourceManagerImpl.java index e3e46de7ac1..82a1e923cb1 100755 --- a/server/test/com/cloud/resource/MockResourceManagerImpl.java +++ b/server/test/com/cloud/resource/MockResourceManagerImpl.java @@ -56,6 +56,7 @@ import com.cloud.resource.ResourceState.Event; import com.cloud.utils.component.ManagerBase; import com.cloud.utils.fsm.NoTransitionException; +import org.apache.cloudstack.framework.config.ConfigKey; public class MockResourceManagerImpl extends ManagerBase implements ResourceManager { @@ -625,4 +626,14 @@ public boolean isHostGpuEnabled(final long hostId) { // TODO Auto-generated method stub return false; } + + @Override + public String getConfigComponentName() { + return null; + } + + @Override + public ConfigKey<?>[] getConfigKeys() { + return new ConfigKey[0]; + } } diff --git a/server/test/com/cloud/resource/ResourceManagerImplTest.java b/server/test/com/cloud/resource/ResourceManagerImplTest.java index 525ccd0c071..6d14410fc1c 100644 --- a/server/test/com/cloud/resource/ResourceManagerImplTest.java +++ b/server/test/com/cloud/resource/ResourceManagerImplTest.java @@ -118,6 +118,7 @@ public void setup() throws Exception { when(host.getId()).thenReturn(hostId); when(host.getResourceState()).thenReturn(ResourceState.Enabled); when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.VMware); + when(host.getClusterId()).thenReturn(1L); when(hostDao.findById(hostId)).thenReturn(host); when(vm1.getId()).thenReturn(vm1Id); when(vm2.getId()).thenReturn(vm2Id); @@ -188,4 +189,21 @@ public void testConfigureVncAccessForKVMHostFailedMigrations() { verify(userVmDetailsDao).addDetail(eq(vm2Id), eq("kvm.vnc.port"), eq(String.valueOf(vm2VncPort)), anyBoolean()); verify(agentManager).pullAgentToMaintenance(hostId); } + + @Test + public void testCheckAndMaintainErrorInMaintenanceRetries() throws NoTransitionException { + resourceManager.setHostMaintenanceRetries(host); + + List<VMInstanceVO> failedMigrations = Arrays.asList(vm1, vm2); + when(vmInstanceDao.listByHostId(host.getId())).thenReturn(failedMigrations); + when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(host.getId())).thenReturn(failedMigrations); + + Integer retries = ResourceManager.HostMaintenanceRetries.valueIn(host.getClusterId()); + for (int i = 0; i <= retries; i++) { + resourceManager.checkAndMaintain(host.getId()); + } + + verify(resourceManager, times(retries + 1)).isHostInMaintenance(host, failedMigrations, new ArrayList<>(), failedMigrations); + verify(resourceManager).setHostIntoErrorInMaintenance(host, failedMigrations); + } } ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services