This is an automated email from the ASF dual-hosted git repository.

rohit pushed a commit to branch 4.11
in repository https://gitbox.apache.org/repos/asf/cloudstack.git


The following commit(s) were added to refs/heads/4.11 by this push:
     new 13c81a8  server: Prevent corner case for infinite 
PrepareForMaintenance (#3095)
13c81a8 is described below

commit 13c81a8ee441e239fb4afe98b5b3da4455fe6459
Author: Nicolas Vazquez <nicovazque...@gmail.com>
AuthorDate: Fri Dec 28 06:44:16 2018 -0300

    server: Prevent corner case for infinite PrepareForMaintenance (#3095)
    
    A corner case was found on 4.11.2 for #2493 leading to an infinite loop in 
state PrepareForMaintenance
    
    To prevent such cases, in which failed migrations are detected but still 
running on the host, this feature adds a new cluster setting 
host.maintenance.retries which is the number of retries before marking the host 
as ErrorInMaintenance if migration errors persist.
    
    How Has This Been Tested?
    - 2 KVM hosts, pick one which has running VMs as H
    - Block migrations ports on H to simulate failures on migrations:
    iptables -I OUTPUT -j REJECT -m state --state NEW -m tcp -p tcp --dport 
49152:49215 -m comment --comment 'test block migrations' iptables -I OUTPUT -j 
REJECT -m state --state NEW -m tcp -p tcp --dport 16509 -m comment --comment 
'test block migrations
    - Put host H in Maintenance
    - Observe that host is indefinitely in PrepareForMaintenance state (after 
this fix it goes into ErrorInMaintenance after retrying 
host.maintenance.retries times)
---
 .../src/com/cloud/resource/ResourceManager.java    | 10 +++++-
 .../com/cloud/resource/ResourceManagerImpl.java    | 41 ++++++++++++++++++++++
 .../cloud/resource/MockResourceManagerImpl.java    | 11 ++++++
 .../cloud/resource/ResourceManagerImplTest.java    | 18 ++++++++++
 4 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/engine/components-api/src/com/cloud/resource/ResourceManager.java 
b/engine/components-api/src/com/cloud/resource/ResourceManager.java
index 1f7d3cb..720a980 100755
--- a/engine/components-api/src/com/cloud/resource/ResourceManager.java
+++ b/engine/components-api/src/com/cloud/resource/ResourceManager.java
@@ -38,12 +38,20 @@ import com.cloud.host.Status;
 import com.cloud.hypervisor.Hypervisor.HypervisorType;
 import com.cloud.resource.ResourceState.Event;
 import com.cloud.utils.fsm.NoTransitionException;
+import org.apache.cloudstack.framework.config.ConfigKey;
+import org.apache.cloudstack.framework.config.Configurable;
 
 /**
  * ResourceManager manages how physical resources are organized within the
  * CloudStack. It also manages the life cycle of the physical resources.
  */
-public interface ResourceManager extends ResourceService {
+public interface ResourceManager extends ResourceService, Configurable {
+
+    ConfigKey<Integer> HostMaintenanceRetries = new ConfigKey<>("Advanced", 
Integer.class,
+            "host.maintenance.retries","20",
+            "Number of retries when preparing a host into Maintenance Mode is 
faulty before failing",
+            true, ConfigKey.Scope.Cluster);
+
     /**
      * Register a listener for different types of resource life cycle events.
      * There can only be one type of listener per type of host.
diff --git a/server/src/com/cloud/resource/ResourceManagerImpl.java 
b/server/src/com/cloud/resource/ResourceManagerImpl.java
index ba3c157..59a7fa8 100755
--- a/server/src/com/cloud/resource/ResourceManagerImpl.java
+++ b/server/src/com/cloud/resource/ResourceManagerImpl.java
@@ -26,11 +26,13 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
+import java.util.concurrent.ConcurrentHashMap;
 
 import javax.inject.Inject;
 import javax.naming.ConfigurationException;
 
 import com.cloud.vm.dao.UserVmDetailsDao;
+import org.apache.cloudstack.framework.config.ConfigKey;
 import org.apache.commons.lang.ObjectUtils;
 import org.apache.log4j.Logger;
 import org.springframework.stereotype.Component;
@@ -271,6 +273,8 @@ public class ResourceManagerImpl extends ManagerBase 
implements ResourceManager,
 
     private SearchBuilder<HostGpuGroupsVO> _gpuAvailability;
 
+    private Map<Long,Integer> retryHostMaintenance = new ConcurrentHashMap<>();
+
     private void insertListener(final Integer event, final ResourceListener 
listener) {
         List<ResourceListener> lst = _lifeCycleListeners.get(event);
         if (lst == null) {
@@ -1224,6 +1228,7 @@ public class ResourceManagerImpl extends ManagerBase 
implements ResourceManager,
 
         
ActionEventUtils.onStartedActionEvent(CallContext.current().getCallingUserId(), 
CallContext.current().getCallingAccountId(), 
EventTypes.EVENT_MAINTENANCE_PREPARE, "starting maintenance for host " + 
hostId, true, 0);
         _agentMgr.pullAgentToMaintenance(hostId);
+        setHostMaintenanceRetries(host);
 
         /* TODO: move below to listener */
         if (host.getType() == Host.Type.Routing) {
@@ -1251,6 +1256,16 @@ public class ResourceManagerImpl extends ManagerBase 
implements ResourceManager,
         return true;
     }
 
+    /**
+     * Set retries for transiting the host into Maintenance
+     */
+    protected void setHostMaintenanceRetries(HostVO host) {
+        Integer retries = HostMaintenanceRetries.valueIn(host.getClusterId());
+        retryHostMaintenance.put(host.getId(), retries);
+        s_logger.debug(String.format("Setting the host %s (%s) retries for 
Maintenance mode: %s",
+                host.getId(), host.getName(), retries));
+    }
+
     @Override
     public boolean maintain(final long hostId) throws 
AgentUnavailableException {
         final Boolean result = propagateResourceEvent(hostId, 
ResourceState.Event.AdminAskMaintenace);
@@ -1350,7 +1365,23 @@ public class ResourceManagerImpl extends ManagerBase 
implements ResourceManager,
             return CollectionUtils.isEmpty(failedMigrations) ?
                     setHostIntoMaintenance(host) :
                     setHostIntoErrorInMaintenance(host, failedMigrations);
+        } else if (retryHostMaintenance.containsKey(host.getId())) {
+            Integer retriesLeft = retryHostMaintenance.get(host.getId());
+            if (retriesLeft != null) {
+                if (retriesLeft <= 0) {
+                    retryHostMaintenance.remove(host.getId());
+                    s_logger.debug(String.format("No retries left while 
preparing KVM host %s (%s) for Maintenance, " +
+                                    "please investigate this connection.",
+                            host.getId(), host.getName()));
+                    return setHostIntoErrorInMaintenance(host, 
failedMigrations);
+                }
+                retriesLeft--;
+                retryHostMaintenance.put(host.getId(), retriesLeft);
+                s_logger.debug(String.format("Retries left preparing KVM host 
%s (%s) for Maintenance: %s",
+                        host.getId(), host.getName(), retriesLeft));
+            }
         }
+
         return false;
     }
 
@@ -2316,6 +2347,7 @@ public class ResourceManagerImpl extends ManagerBase 
implements ResourceManager,
         try {
             resourceStateTransitTo(host, 
ResourceState.Event.AdminCancelMaintenance, _nodeId);
             _agentMgr.pullAgentOutMaintenance(hostId);
+            retryHostMaintenance.remove(hostId);
 
             // for kvm, need to log into kvm host, restart cloudstack-agent
             if ((host.getHypervisorType() == HypervisorType.KVM && 
!vms_migrating) || host.getHypervisorType() == HypervisorType.LXC) {
@@ -2924,4 +2956,13 @@ public class ResourceManagerImpl extends ManagerBase 
implements ResourceManager,
         return super.start();
     }
 
+    @Override
+    public String getConfigComponentName() {
+        return ResourceManagerImpl.class.getSimpleName();
+    }
+
+    @Override
+    public ConfigKey<?>[] getConfigKeys() {
+        return new ConfigKey<?>[] {HostMaintenanceRetries};
+    }
 }
diff --git a/server/test/com/cloud/resource/MockResourceManagerImpl.java 
b/server/test/com/cloud/resource/MockResourceManagerImpl.java
index e3e46de..82a1e92 100755
--- a/server/test/com/cloud/resource/MockResourceManagerImpl.java
+++ b/server/test/com/cloud/resource/MockResourceManagerImpl.java
@@ -56,6 +56,7 @@ import com.cloud.org.Cluster;
 import com.cloud.resource.ResourceState.Event;
 import com.cloud.utils.component.ManagerBase;
 import com.cloud.utils.fsm.NoTransitionException;
+import org.apache.cloudstack.framework.config.ConfigKey;
 
 public class MockResourceManagerImpl extends ManagerBase implements 
ResourceManager {
 
@@ -625,4 +626,14 @@ public class MockResourceManagerImpl extends ManagerBase 
implements ResourceMana
         // TODO Auto-generated method stub
         return false;
     }
+
+    @Override
+    public String getConfigComponentName() {
+        return null;
+    }
+
+    @Override
+    public ConfigKey<?>[] getConfigKeys() {
+        return new ConfigKey[0];
+    }
 }
diff --git a/server/test/com/cloud/resource/ResourceManagerImplTest.java 
b/server/test/com/cloud/resource/ResourceManagerImplTest.java
index 525ccd0..6d14410 100644
--- a/server/test/com/cloud/resource/ResourceManagerImplTest.java
+++ b/server/test/com/cloud/resource/ResourceManagerImplTest.java
@@ -118,6 +118,7 @@ public class ResourceManagerImplTest {
         when(host.getId()).thenReturn(hostId);
         when(host.getResourceState()).thenReturn(ResourceState.Enabled);
         
when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.VMware);
+        when(host.getClusterId()).thenReturn(1L);
         when(hostDao.findById(hostId)).thenReturn(host);
         when(vm1.getId()).thenReturn(vm1Id);
         when(vm2.getId()).thenReturn(vm2Id);
@@ -188,4 +189,21 @@ public class ResourceManagerImplTest {
         verify(userVmDetailsDao).addDetail(eq(vm2Id), eq("kvm.vnc.port"), 
eq(String.valueOf(vm2VncPort)), anyBoolean());
         verify(agentManager).pullAgentToMaintenance(hostId);
     }
+
+    @Test
+    public void testCheckAndMaintainErrorInMaintenanceRetries() throws 
NoTransitionException {
+        resourceManager.setHostMaintenanceRetries(host);
+
+        List<VMInstanceVO> failedMigrations = Arrays.asList(vm1, vm2);
+        
when(vmInstanceDao.listByHostId(host.getId())).thenReturn(failedMigrations);
+        
when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(host.getId())).thenReturn(failedMigrations);
+
+        Integer retries = 
ResourceManager.HostMaintenanceRetries.valueIn(host.getClusterId());
+        for (int i = 0; i <= retries; i++) {
+            resourceManager.checkAndMaintain(host.getId());
+        }
+
+        verify(resourceManager, times(retries + 1)).isHostInMaintenance(host, 
failedMigrations, new ArrayList<>(), failedMigrations);
+        verify(resourceManager).setHostIntoErrorInMaintenance(host, 
failedMigrations);
+    }
 }

Reply via email to