This is an automated email from the ASF dual-hosted git repository. rohit pushed a commit to branch 4.11 in repository https://gitbox.apache.org/repos/asf/cloudstack.git
The following commit(s) were added to refs/heads/4.11 by this push: new 13c81a8 server: Prevent corner case for infinite PrepareForMaintenance (#3095) 13c81a8 is described below commit 13c81a8ee441e239fb4afe98b5b3da4455fe6459 Author: Nicolas Vazquez <nicovazque...@gmail.com> AuthorDate: Fri Dec 28 06:44:16 2018 -0300 server: Prevent corner case for infinite PrepareForMaintenance (#3095) A corner case was found on 4.11.2 for #2493 leading to an infinite loop in state PrepareForMaintenance To prevent such cases, in which failed migrations are detected but still running on the host, this feature adds a new cluster setting host.maintenance.retries which is the number of retries before marking the host as ErrorInMaintenance if migration errors persist. How Has This Been Tested? - 2 KVM hosts, pick one which has running VMs as H - Block migrations ports on H to simulate failures on migrations: iptables -I OUTPUT -j REJECT -m state --state NEW -m tcp -p tcp --dport 49152:49215 -m comment --comment 'test block migrations' iptables -I OUTPUT -j REJECT -m state --state NEW -m tcp -p tcp --dport 16509 -m comment --comment 'test block migrations - Put host H in Maintenance - Observe that host is indefinitely in PrepareForMaintenance state (after this fix it goes into ErrorInMaintenance after retrying host.maintenance.retries times) --- .../src/com/cloud/resource/ResourceManager.java | 10 +++++- .../com/cloud/resource/ResourceManagerImpl.java | 41 ++++++++++++++++++++++ .../cloud/resource/MockResourceManagerImpl.java | 11 ++++++ .../cloud/resource/ResourceManagerImplTest.java | 18 ++++++++++ 4 files changed, 79 insertions(+), 1 deletion(-) diff --git a/engine/components-api/src/com/cloud/resource/ResourceManager.java b/engine/components-api/src/com/cloud/resource/ResourceManager.java index 1f7d3cb..720a980 100755 --- a/engine/components-api/src/com/cloud/resource/ResourceManager.java +++ b/engine/components-api/src/com/cloud/resource/ResourceManager.java @@ -38,12 +38,20 @@ import com.cloud.host.Status; import com.cloud.hypervisor.Hypervisor.HypervisorType; import com.cloud.resource.ResourceState.Event; import com.cloud.utils.fsm.NoTransitionException; +import org.apache.cloudstack.framework.config.ConfigKey; +import org.apache.cloudstack.framework.config.Configurable; /** * ResourceManager manages how physical resources are organized within the * CloudStack. It also manages the life cycle of the physical resources. */ -public interface ResourceManager extends ResourceService { +public interface ResourceManager extends ResourceService, Configurable { + + ConfigKey<Integer> HostMaintenanceRetries = new ConfigKey<>("Advanced", Integer.class, + "host.maintenance.retries","20", + "Number of retries when preparing a host into Maintenance Mode is faulty before failing", + true, ConfigKey.Scope.Cluster); + /** * Register a listener for different types of resource life cycle events. * There can only be one type of listener per type of host. diff --git a/server/src/com/cloud/resource/ResourceManagerImpl.java b/server/src/com/cloud/resource/ResourceManagerImpl.java index ba3c157..59a7fa8 100755 --- a/server/src/com/cloud/resource/ResourceManagerImpl.java +++ b/server/src/com/cloud/resource/ResourceManagerImpl.java @@ -26,11 +26,13 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Random; +import java.util.concurrent.ConcurrentHashMap; import javax.inject.Inject; import javax.naming.ConfigurationException; import com.cloud.vm.dao.UserVmDetailsDao; +import org.apache.cloudstack.framework.config.ConfigKey; import org.apache.commons.lang.ObjectUtils; import org.apache.log4j.Logger; import org.springframework.stereotype.Component; @@ -271,6 +273,8 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, private SearchBuilder<HostGpuGroupsVO> _gpuAvailability; + private Map<Long,Integer> retryHostMaintenance = new ConcurrentHashMap<>(); + private void insertListener(final Integer event, final ResourceListener listener) { List<ResourceListener> lst = _lifeCycleListeners.get(event); if (lst == null) { @@ -1224,6 +1228,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, ActionEventUtils.onStartedActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(), EventTypes.EVENT_MAINTENANCE_PREPARE, "starting maintenance for host " + hostId, true, 0); _agentMgr.pullAgentToMaintenance(hostId); + setHostMaintenanceRetries(host); /* TODO: move below to listener */ if (host.getType() == Host.Type.Routing) { @@ -1251,6 +1256,16 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, return true; } + /** + * Set retries for transiting the host into Maintenance + */ + protected void setHostMaintenanceRetries(HostVO host) { + Integer retries = HostMaintenanceRetries.valueIn(host.getClusterId()); + retryHostMaintenance.put(host.getId(), retries); + s_logger.debug(String.format("Setting the host %s (%s) retries for Maintenance mode: %s", + host.getId(), host.getName(), retries)); + } + @Override public boolean maintain(final long hostId) throws AgentUnavailableException { final Boolean result = propagateResourceEvent(hostId, ResourceState.Event.AdminAskMaintenace); @@ -1350,7 +1365,23 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, return CollectionUtils.isEmpty(failedMigrations) ? setHostIntoMaintenance(host) : setHostIntoErrorInMaintenance(host, failedMigrations); + } else if (retryHostMaintenance.containsKey(host.getId())) { + Integer retriesLeft = retryHostMaintenance.get(host.getId()); + if (retriesLeft != null) { + if (retriesLeft <= 0) { + retryHostMaintenance.remove(host.getId()); + s_logger.debug(String.format("No retries left while preparing KVM host %s (%s) for Maintenance, " + + "please investigate this connection.", + host.getId(), host.getName())); + return setHostIntoErrorInMaintenance(host, failedMigrations); + } + retriesLeft--; + retryHostMaintenance.put(host.getId(), retriesLeft); + s_logger.debug(String.format("Retries left preparing KVM host %s (%s) for Maintenance: %s", + host.getId(), host.getName(), retriesLeft)); + } } + return false; } @@ -2316,6 +2347,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, try { resourceStateTransitTo(host, ResourceState.Event.AdminCancelMaintenance, _nodeId); _agentMgr.pullAgentOutMaintenance(hostId); + retryHostMaintenance.remove(hostId); // for kvm, need to log into kvm host, restart cloudstack-agent if ((host.getHypervisorType() == HypervisorType.KVM && !vms_migrating) || host.getHypervisorType() == HypervisorType.LXC) { @@ -2924,4 +2956,13 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, return super.start(); } + @Override + public String getConfigComponentName() { + return ResourceManagerImpl.class.getSimpleName(); + } + + @Override + public ConfigKey<?>[] getConfigKeys() { + return new ConfigKey<?>[] {HostMaintenanceRetries}; + } } diff --git a/server/test/com/cloud/resource/MockResourceManagerImpl.java b/server/test/com/cloud/resource/MockResourceManagerImpl.java index e3e46de..82a1e92 100755 --- a/server/test/com/cloud/resource/MockResourceManagerImpl.java +++ b/server/test/com/cloud/resource/MockResourceManagerImpl.java @@ -56,6 +56,7 @@ import com.cloud.org.Cluster; import com.cloud.resource.ResourceState.Event; import com.cloud.utils.component.ManagerBase; import com.cloud.utils.fsm.NoTransitionException; +import org.apache.cloudstack.framework.config.ConfigKey; public class MockResourceManagerImpl extends ManagerBase implements ResourceManager { @@ -625,4 +626,14 @@ public class MockResourceManagerImpl extends ManagerBase implements ResourceMana // TODO Auto-generated method stub return false; } + + @Override + public String getConfigComponentName() { + return null; + } + + @Override + public ConfigKey<?>[] getConfigKeys() { + return new ConfigKey[0]; + } } diff --git a/server/test/com/cloud/resource/ResourceManagerImplTest.java b/server/test/com/cloud/resource/ResourceManagerImplTest.java index 525ccd0..6d14410 100644 --- a/server/test/com/cloud/resource/ResourceManagerImplTest.java +++ b/server/test/com/cloud/resource/ResourceManagerImplTest.java @@ -118,6 +118,7 @@ public class ResourceManagerImplTest { when(host.getId()).thenReturn(hostId); when(host.getResourceState()).thenReturn(ResourceState.Enabled); when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.VMware); + when(host.getClusterId()).thenReturn(1L); when(hostDao.findById(hostId)).thenReturn(host); when(vm1.getId()).thenReturn(vm1Id); when(vm2.getId()).thenReturn(vm2Id); @@ -188,4 +189,21 @@ public class ResourceManagerImplTest { verify(userVmDetailsDao).addDetail(eq(vm2Id), eq("kvm.vnc.port"), eq(String.valueOf(vm2VncPort)), anyBoolean()); verify(agentManager).pullAgentToMaintenance(hostId); } + + @Test + public void testCheckAndMaintainErrorInMaintenanceRetries() throws NoTransitionException { + resourceManager.setHostMaintenanceRetries(host); + + List<VMInstanceVO> failedMigrations = Arrays.asList(vm1, vm2); + when(vmInstanceDao.listByHostId(host.getId())).thenReturn(failedMigrations); + when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(host.getId())).thenReturn(failedMigrations); + + Integer retries = ResourceManager.HostMaintenanceRetries.valueIn(host.getClusterId()); + for (int i = 0; i <= retries; i++) { + resourceManager.checkAndMaintain(host.getId()); + } + + verify(resourceManager, times(retries + 1)).isHostInMaintenance(host, failedMigrations, new ArrayList<>(), failedMigrations); + verify(resourceManager).setHostIntoErrorInMaintenance(host, failedMigrations); + } }