rhtyd closed pull request #3095: Prevent corner case for infinite 
PrepareForMaintenance
URL: https://github.com/apache/cloudstack/pull/3095
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/engine/components-api/src/com/cloud/resource/ResourceManager.java 
b/engine/components-api/src/com/cloud/resource/ResourceManager.java
index 1f7d3cbea54..720a980f4e7 100755
--- a/engine/components-api/src/com/cloud/resource/ResourceManager.java
+++ b/engine/components-api/src/com/cloud/resource/ResourceManager.java
@@ -38,12 +38,20 @@
 import com.cloud.hypervisor.Hypervisor.HypervisorType;
 import com.cloud.resource.ResourceState.Event;
 import com.cloud.utils.fsm.NoTransitionException;
+import org.apache.cloudstack.framework.config.ConfigKey;
+import org.apache.cloudstack.framework.config.Configurable;
 
 /**
  * ResourceManager manages how physical resources are organized within the
  * CloudStack. It also manages the life cycle of the physical resources.
  */
-public interface ResourceManager extends ResourceService {
+public interface ResourceManager extends ResourceService, Configurable {
+
+    ConfigKey<Integer> HostMaintenanceRetries = new ConfigKey<>("Advanced", 
Integer.class,
+            "host.maintenance.retries","20",
+            "Number of retries when preparing a host into Maintenance Mode is 
faulty before failing",
+            true, ConfigKey.Scope.Cluster);
+
     /**
      * Register a listener for different types of resource life cycle events.
      * There can only be one type of listener per type of host.
diff --git a/server/src/com/cloud/resource/ResourceManagerImpl.java 
b/server/src/com/cloud/resource/ResourceManagerImpl.java
index ba3c157cf9f..59a7fa85d7a 100755
--- a/server/src/com/cloud/resource/ResourceManagerImpl.java
+++ b/server/src/com/cloud/resource/ResourceManagerImpl.java
@@ -26,11 +26,13 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
+import java.util.concurrent.ConcurrentHashMap;
 
 import javax.inject.Inject;
 import javax.naming.ConfigurationException;
 
 import com.cloud.vm.dao.UserVmDetailsDao;
+import org.apache.cloudstack.framework.config.ConfigKey;
 import org.apache.commons.lang.ObjectUtils;
 import org.apache.log4j.Logger;
 import org.springframework.stereotype.Component;
@@ -271,6 +273,8 @@ public void setDiscoverers(final List<? extends Discoverer> 
discoverers) {
 
     private SearchBuilder<HostGpuGroupsVO> _gpuAvailability;
 
+    private Map<Long,Integer> retryHostMaintenance = new ConcurrentHashMap<>();
+
     private void insertListener(final Integer event, final ResourceListener 
listener) {
         List<ResourceListener> lst = _lifeCycleListeners.get(event);
         if (lst == null) {
@@ -1224,6 +1228,7 @@ private boolean doMaintain(final long hostId) {
 
         
ActionEventUtils.onStartedActionEvent(CallContext.current().getCallingUserId(), 
CallContext.current().getCallingAccountId(), 
EventTypes.EVENT_MAINTENANCE_PREPARE, "starting maintenance for host " + 
hostId, true, 0);
         _agentMgr.pullAgentToMaintenance(hostId);
+        setHostMaintenanceRetries(host);
 
         /* TODO: move below to listener */
         if (host.getType() == Host.Type.Routing) {
@@ -1251,6 +1256,16 @@ private boolean doMaintain(final long hostId) {
         return true;
     }
 
+    /**
+     * Set retries for transiting the host into Maintenance
+     */
+    protected void setHostMaintenanceRetries(HostVO host) {
+        Integer retries = HostMaintenanceRetries.valueIn(host.getClusterId());
+        retryHostMaintenance.put(host.getId(), retries);
+        s_logger.debug(String.format("Setting the host %s (%s) retries for 
Maintenance mode: %s",
+                host.getId(), host.getName(), retries));
+    }
+
     @Override
     public boolean maintain(final long hostId) throws 
AgentUnavailableException {
         final Boolean result = propagateResourceEvent(hostId, 
ResourceState.Event.AdminAskMaintenace);
@@ -1350,7 +1365,23 @@ protected boolean isHostInMaintenance(HostVO host, 
List<VMInstanceVO> runningVms
             return CollectionUtils.isEmpty(failedMigrations) ?
                     setHostIntoMaintenance(host) :
                     setHostIntoErrorInMaintenance(host, failedMigrations);
+        } else if (retryHostMaintenance.containsKey(host.getId())) {
+            Integer retriesLeft = retryHostMaintenance.get(host.getId());
+            if (retriesLeft != null) {
+                if (retriesLeft <= 0) {
+                    retryHostMaintenance.remove(host.getId());
+                    s_logger.debug(String.format("No retries left while 
preparing KVM host %s (%s) for Maintenance, " +
+                                    "please investigate this connection.",
+                            host.getId(), host.getName()));
+                    return setHostIntoErrorInMaintenance(host, 
failedMigrations);
+                }
+                retriesLeft--;
+                retryHostMaintenance.put(host.getId(), retriesLeft);
+                s_logger.debug(String.format("Retries left preparing KVM host 
%s (%s) for Maintenance: %s",
+                        host.getId(), host.getName(), retriesLeft));
+            }
         }
+
         return false;
     }
 
@@ -2316,6 +2347,7 @@ private boolean doCancelMaintenance(final long hostId) {
         try {
             resourceStateTransitTo(host, 
ResourceState.Event.AdminCancelMaintenance, _nodeId);
             _agentMgr.pullAgentOutMaintenance(hostId);
+            retryHostMaintenance.remove(hostId);
 
             // for kvm, need to log into kvm host, restart cloudstack-agent
             if ((host.getHypervisorType() == HypervisorType.KVM && 
!vms_migrating) || host.getHypervisorType() == HypervisorType.LXC) {
@@ -2924,4 +2956,13 @@ public boolean start() {
         return super.start();
     }
 
+    @Override
+    public String getConfigComponentName() {
+        return ResourceManagerImpl.class.getSimpleName();
+    }
+
+    @Override
+    public ConfigKey<?>[] getConfigKeys() {
+        return new ConfigKey<?>[] {HostMaintenanceRetries};
+    }
 }
diff --git a/server/test/com/cloud/resource/MockResourceManagerImpl.java 
b/server/test/com/cloud/resource/MockResourceManagerImpl.java
index e3e46de7ac1..82a1e923cb1 100755
--- a/server/test/com/cloud/resource/MockResourceManagerImpl.java
+++ b/server/test/com/cloud/resource/MockResourceManagerImpl.java
@@ -56,6 +56,7 @@
 import com.cloud.resource.ResourceState.Event;
 import com.cloud.utils.component.ManagerBase;
 import com.cloud.utils.fsm.NoTransitionException;
+import org.apache.cloudstack.framework.config.ConfigKey;
 
 public class MockResourceManagerImpl extends ManagerBase implements 
ResourceManager {
 
@@ -625,4 +626,14 @@ public boolean isHostGpuEnabled(final long hostId) {
         // TODO Auto-generated method stub
         return false;
     }
+
+    @Override
+    public String getConfigComponentName() {
+        return null;
+    }
+
+    @Override
+    public ConfigKey<?>[] getConfigKeys() {
+        return new ConfigKey[0];
+    }
 }
diff --git a/server/test/com/cloud/resource/ResourceManagerImplTest.java 
b/server/test/com/cloud/resource/ResourceManagerImplTest.java
index 525ccd0c071..6d14410fc1c 100644
--- a/server/test/com/cloud/resource/ResourceManagerImplTest.java
+++ b/server/test/com/cloud/resource/ResourceManagerImplTest.java
@@ -118,6 +118,7 @@ public void setup() throws Exception {
         when(host.getId()).thenReturn(hostId);
         when(host.getResourceState()).thenReturn(ResourceState.Enabled);
         
when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.VMware);
+        when(host.getClusterId()).thenReturn(1L);
         when(hostDao.findById(hostId)).thenReturn(host);
         when(vm1.getId()).thenReturn(vm1Id);
         when(vm2.getId()).thenReturn(vm2Id);
@@ -188,4 +189,21 @@ public void 
testConfigureVncAccessForKVMHostFailedMigrations() {
         verify(userVmDetailsDao).addDetail(eq(vm2Id), eq("kvm.vnc.port"), 
eq(String.valueOf(vm2VncPort)), anyBoolean());
         verify(agentManager).pullAgentToMaintenance(hostId);
     }
+
+    @Test
+    public void testCheckAndMaintainErrorInMaintenanceRetries() throws 
NoTransitionException {
+        resourceManager.setHostMaintenanceRetries(host);
+
+        List<VMInstanceVO> failedMigrations = Arrays.asList(vm1, vm2);
+        
when(vmInstanceDao.listByHostId(host.getId())).thenReturn(failedMigrations);
+        
when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(host.getId())).thenReturn(failedMigrations);
+
+        Integer retries = 
ResourceManager.HostMaintenanceRetries.valueIn(host.getClusterId());
+        for (int i = 0; i <= retries; i++) {
+            resourceManager.checkAndMaintain(host.getId());
+        }
+
+        verify(resourceManager, times(retries + 1)).isHostInMaintenance(host, 
failedMigrations, new ArrayList<>(), failedMigrations);
+        verify(resourceManager).setHostIntoErrorInMaintenance(host, 
failedMigrations);
+    }
 }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to