Arik Hadas has uploaded a new change for review.

Change subject: core: run HA VMs that went down right before engine stopped
......................................................................

core: run HA VMs that went down right before engine stopped

This patch solves a known, pre-existing problem: when HA VMs went down
and the engine (VdsUpdateRunTimeInfo) detected it (and updated the DB)
but didn't manage to run the VMs before it stopped, we did not try to
run those VMs when the engine started again.

The problem is solved by fetching from the DB, when the
AutoStartVmsRunner job is initialized, all the HA VMs which are down
with exit status ERROR. Those VMs are set as the first VMs the job will
try to run on its first iteration.

Change-Id: I2d5876f196819b2a69be0b71287c5325a8ff9dc9
Signed-off-by: Arik Hadas <[email protected]>
---
M 
backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java
M backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAO.java
M 
backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAODbFacadeImpl.java
M packaging/dbscripts/vms_sp.sql
4 files changed, 47 insertions(+), 8 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/02/19502/1

diff --git 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java
 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java
index 4261fa0..a481445 100644
--- 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java
+++ 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java
@@ -2,12 +2,14 @@
 
 import java.util.Collections;
 import java.util.LinkedList;
+import java.util.List;
 import java.util.concurrent.CopyOnWriteArraySet;
 
 import org.ovirt.engine.core.bll.job.ExecutionHandler;
 import org.ovirt.engine.core.common.AuditLogType;
 import org.ovirt.engine.core.common.action.RunVmParams;
 import org.ovirt.engine.core.common.action.VdcActionType;
+import org.ovirt.engine.core.common.businessentities.VM;
 import org.ovirt.engine.core.common.businessentities.VMStatus;
 import org.ovirt.engine.core.common.config.Config;
 import org.ovirt.engine.core.common.config.ConfigValues;
@@ -19,6 +21,7 @@
 import org.ovirt.engine.core.dal.dbbroker.DbFacade;
 import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogDirector;
 import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogableBase;
+import org.ovirt.engine.core.dao.VmDAO;
 import org.ovirt.engine.core.dao.VmDynamicDAO;
 import org.ovirt.engine.core.utils.lock.EngineLock;
 import org.ovirt.engine.core.utils.lock.LockManager;
@@ -45,6 +48,15 @@
     }
 
     private AutoStartVmsRunner() {
+        // There might be HA VMs which went down just before the engine 
stopped, we detected
+        // the failure and updated the DB but didn't manage to rerun the VM. 
So here we'll
+        // take all the HA VMs which are down because of an error and add them 
to the set
+        DateTime now = DateTime.getNow();
+        List<Pair<Guid, DateTime>> initialFailedVms = new LinkedList<>();
+        for (VM vm: getVmDao().getAllFailedAutoStartVms()) {
+            initialFailedVms.add(new Pair<>(vm.getId(), now));
+        }
+        autoStartVmsToRun.addAll(initialFailedVms);
     }
 
     @OnTimerMethodAnnotation("startFailedAutoStartVms")
@@ -54,6 +66,7 @@
         DateTime nextTimeOfRetryToRun = 
now.AddSeconds(RETRY_TO_RUN_HA_VM_INTERVAL);
 
         for(Pair<Guid, DateTime> vmIdAndDateTime: autoStartVmsToRun) {
+            // if it is not the time to rerun this VM yet, skip for now
             if (now.compareTo(vmIdAndDateTime.getSecond()) < 0) {
                 continue;
             }
@@ -75,8 +88,8 @@
                 continue;
             }
 
-            // the VM reached WaitForLunch, so we can remove it from the set,
-            // from now on errors will be detected by VdsUpdateRuntimeInfo
+            // the VM reached status which is different from Down, therefore 
we can remove it from
+            // the set and from now on errors will be detected by 
VdsUpdateRuntimeInfo
             idsToRemove.add(vmIdAndDateTime);
         }
 
@@ -93,10 +106,6 @@
                 null);
     }
 
-    protected VmDynamicDAO getVmDynamicDao() {
-        return DbFacade.getInstance().getVmDynamicDao();
-    }
-
     protected LockManager getLockManager() {
         return LockManagerFactory.getLockManager();
     }
@@ -106,11 +115,17 @@
     }
 
     private boolean runVm(Guid vmId, EngineLock lock) {
-        boolean succeeded = Backend.getInstance().runInternalAction(
+        return Backend.getInstance().runInternalAction(
                 VdcActionType.RunVm,
                 new RunVmParams(vmId),
                 
ExecutionHandler.createInternalJobContext(lock)).getSucceeded();
+    }
 
-        return succeeded;
+    protected VmDynamicDAO getVmDynamicDao() {
+        return DbFacade.getInstance().getVmDynamicDao();
+    }
+
+    protected VmDAO getVmDao() {
+        return DbFacade.getInstance().getVmDao();
     }
 }
diff --git 
a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAO.java
 
b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAO.java
index 3d7d35b..1e18092 100644
--- 
a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAO.java
+++ 
b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAO.java
@@ -256,4 +256,11 @@
      * @return the list of VMs
      */
     List<VM> getAllForVnicProfile(Guid vnicProfileId);
+
+    /**
+     * Retrieves all auto started VMs that went down unintentionally
+     *
+     * @return the list of VMs
+     */
+    List<VM> getAllFailedAutoStartVms();
 }
diff --git 
a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAODbFacadeImpl.java
 
b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAODbFacadeImpl.java
index 23928db..c04de12 100644
--- 
a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAODbFacadeImpl.java
+++ 
b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAODbFacadeImpl.java
@@ -237,6 +237,13 @@
     }
 
     @Override
+    public List<VM> getAllFailedAutoStartVms() {
+        return getCallsHandler().executeReadList("GetFailedAutoStartVms",
+                VMRowMapper.instance,
+                getCustomMapSqlParameterSource());
+    }
+
+    @Override
     public List<VM> getAllMigratingToHost(Guid vdsId) {
         return getCallsHandler().executeReadList("GetVmsMigratingToVds",
                 VMRowMapper.instance,
diff --git a/packaging/dbscripts/vms_sp.sql b/packaging/dbscripts/vms_sp.sql
index c97f60e..fb2d871 100644
--- a/packaging/dbscripts/vms_sp.sql
+++ b/packaging/dbscripts/vms_sp.sql
@@ -1090,3 +1090,13 @@
 END; $procedure$
 LANGUAGE plpgsql;
 
+
+Create or replace FUNCTION GetFailedAutoStartVms() RETURNS SETOF vms STABLE
+   AS $procedure$
+BEGIN
+      RETURN QUERY SELECT vms.*
+      FROM vms
+      WHERE auto_startup = TRUE and status = 0 and exit_status = 1;
+END; $procedure$
+LANGUAGE plpgsql;
+


-- 
To view, visit http://gerrit.ovirt.org/19502
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2d5876f196819b2a69be0b71287c5325a8ff9dc9
Gerrit-PatchSet: 1
Gerrit-Project: ovirt-engine
Gerrit-Branch: master
Gerrit-Owner: Arik Hadas <[email protected]>
_______________________________________________
Engine-patches mailing list
[email protected]
http://lists.ovirt.org/mailman/listinfo/engine-patches

Reply via email to