Liron Ar has uploaded a new change for review. Change subject: core: introducing host immediate domain recovery mechanism ......................................................................
core: introducing host immediate domain recovery mechanism oVirt engine allows hosts to be activated even if they can't access some of the data center's storage domains, provided that those domains are marked as "inactive", which means that all the hosts that are already in status Up reported them as problematic (therefore there's no need to prevent "new" hosts from being activated). If we failed to connect a host to the storage server of an inactive domain, the host won't have the link for that domain and won't be able to produce it (as the mount was possibly unavailable when we attempted to connect to the storage server). If the connectivity to that domain returns, a host that was already active before might report that it has access to the domain, which will cause the engine to change that domain's status to "active". The issue is that hosts that were activated after the connectivity was lost would move to non-operational (causing VM migration, etc.) as they possibly won't have a connection to the domain (it's a race between the domain status being changed to Active and the domain auto-recovery mechanism) and won't have the needed links of that domain. The implemented solution attempts to prevent hosts from moving to non-operational status, to avoid its related effects. A new quartz job is set to run every 30 seconds; that job will inspect all host reports that were gathered since its last run. The motivation for that implementation is to aggregate the operations on the different hosts together, to avoid long wait times and to avoid blocking other "pool" operations. If any host has a "new" report on a domain that is active or unknown that it can't access for a "storage" reason, that host will be reconnected to the storage servers of the active/unknown domains and will refresh its storage pool metadata. 
the engine will attempt to "recover" each host only once for each problematic report to avoid flooding the system with recovery attempts, if the host would still have problem accessing the domain it'll be moved to non operational as usual. Change-Id: Idb7b2fe8c87805986aaf25cd0f24f605d67d4186 Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1093924 Signed-off-by: Liron Aravot <[email protected]> --- M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/MaintenanceVdsCommand.java M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/eventqueue/EventType.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/locks/LockingGroup.java M backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java M backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/storage/StoragePoolDomainHelper.java M packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql 9 files changed, 298 insertions(+), 45 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/23/27523/1 diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/MaintenanceVdsCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/MaintenanceVdsCommand.java index 2421f9c..a11eaf2 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/MaintenanceVdsCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/MaintenanceVdsCommand.java @@ -33,6 +33,8 @@ import org.ovirt.engine.core.common.eventqueue.EventType; import org.ovirt.engine.core.common.job.Step; import 
org.ovirt.engine.core.common.job.StepEnum; +import org.ovirt.engine.core.common.locks.LockingGroup; +import org.ovirt.engine.core.common.utils.Pair; import org.ovirt.engine.core.common.vdscommands.DisconnectStoragePoolVDSCommandParameters; import org.ovirt.engine.core.common.vdscommands.SetHaMaintenanceModeVDSCommandParameters; import org.ovirt.engine.core.common.vdscommands.SetVdsStatusVDSCommandParameters; @@ -43,6 +45,8 @@ import org.ovirt.engine.core.utils.ejb.BeanProxyType; import org.ovirt.engine.core.utils.ejb.BeanType; import org.ovirt.engine.core.utils.ejb.EjbUtils; +import org.ovirt.engine.core.utils.lock.EngineLock; +import org.ovirt.engine.core.utils.lock.LockManagerFactory; import org.ovirt.engine.core.vdsbroker.irsbroker.IrsBrokerCommand; @NonTransactiveCommandAttribute @@ -222,21 +226,29 @@ // Clear the problematic timers since the VDS is in maintenance so it doesn't make sense to check it // anymore. if (!Guid.Empty.equals(vds.getStoragePoolId())) { - clearDomainCache(vds); + EngineLock lock = new EngineLock(Collections.singletonMap(vds.getId().toString(), + new Pair<>(LockingGroup.VDS_POOL_AND_STORAGE_CONNECTIONS.toString(), + VdcBllMessages.ACTION_TYPE_FAILED_OBJECT_LOCKED.toString())), null); + try { + LockManagerFactory.getLockManager().acquireLockWait(lock); + clearDomainCache(vds); - StoragePool storage_pool = DbFacade.getInstance() - .getStoragePoolDao() - .get(vds.getStoragePoolId()); - if (StoragePoolStatus.Uninitialized != storage_pool - .getStatus()) { - Backend.getInstance().getResourceManager() - .RunVdsCommand( - VDSCommandType.DisconnectStoragePool, - new DisconnectStoragePoolVDSCommandParameters(vds.getId(), - vds.getStoragePoolId(), vds.getVdsSpmId())); - HostStoragePoolParametersBase params = - new HostStoragePoolParametersBase(storage_pool, vds); - Backend.getInstance().runInternalAction(VdcActionType.DisconnectHostFromStoragePoolServers, params); + StoragePool storage_pool = DbFacade.getInstance() + .getStoragePoolDao() + 
.get(vds.getStoragePoolId()); + if (StoragePoolStatus.Uninitialized != storage_pool + .getStatus()) { + Backend.getInstance().getResourceManager() + .RunVdsCommand( + VDSCommandType.DisconnectStoragePool, + new DisconnectStoragePoolVDSCommandParameters(vds.getId(), + vds.getStoragePoolId(), vds.getVdsSpmId())); + HostStoragePoolParametersBase params = + new HostStoragePoolParametersBase(storage_pool, vds); + Backend.getInstance().runInternalAction(VdcActionType.DisconnectHostFromStoragePoolServers, params); + } + } finally { + LockManagerFactory.getLockManager().releaseLock(lock); } } } diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java index 73c6a15..659c830 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java @@ -18,6 +18,7 @@ import org.ovirt.engine.core.bll.storage.StoragePoolStatusHandler; import org.ovirt.engine.core.common.AuditLogType; import org.ovirt.engine.core.common.action.AddVmFromScratchParameters; +import org.ovirt.engine.core.common.action.ConnectHostToStoragePoolServersParameters; import org.ovirt.engine.core.common.action.FenceVdsActionParameters; import org.ovirt.engine.core.common.action.HostStoragePoolParametersBase; import org.ovirt.engine.core.common.action.MaintenanceNumberOfVdssParameters; @@ -197,6 +198,13 @@ return isSucceeded; } + @Override + public boolean connectHostToDomainsInActiveOrUnknownStatus(VDS vds) { + ConnectHostToStoragePoolServersParameters params = new ConnectHostToStoragePoolServersParameters(vds, false); + return Backend.getInstance().runInternalAction(VdcActionType.ConnectHostToStoragePoolServers, params).getSucceeded(); + } + + private List<VdcActionParametersBase> createMigrateVmToServerParametersList(List<VmStatic> vmsToMigrate, 
final VDS vds) { return LinqUtils.foreach(vmsToMigrate, new Function<VmStatic, VdcActionParametersBase>() { diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java index b80987a..9481fa3 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java +++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java @@ -27,6 +27,8 @@ boolean vdsUpEvent(VDS vds); + boolean connectHostToDomainsInActiveOrUnknownStatus(VDS vds); + void processOnClientIpChange(VDS vds, Guid vmId); void processOnCpuFlagsChange(Guid vdsId); diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java index 96bc571..2131290 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java +++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java @@ -481,6 +481,9 @@ @TypeConverterAttribute(Integer.class) @DefaultValueAttribute("3") StoragePoolRefreshTimeInSeconds, + @TypeConverterAttribute(Integer.class) + @DefaultValueAttribute("30") + HostStorageConnectionAndPoolRefreshTimeInSeconds, @Reloadable @TypeConverterAttribute(Integer.class) @DefaultValueAttribute("3") diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/eventqueue/EventType.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/eventqueue/EventType.java index 38b6fa3..c56ab26 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/eventqueue/EventType.java +++ 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/eventqueue/EventType.java @@ -8,5 +8,6 @@ VDSSTOARGEPROBLEMS, DOMAINMONITORING, VDSCLEARCACHE, - VDSCONNECTTOPOOL; + VDSCONNECTTOPOOL, + VDSSPOOLREFRESH; } diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/locks/LockingGroup.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/locks/LockingGroup.java index 6539a6f..f9bedf0 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/locks/LockingGroup.java +++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/locks/LockingGroup.java @@ -24,6 +24,7 @@ REMOTE_VM, OVF_UPDATE, /** This group is used for indication that an operation is executed using the specified host */ - VDS_EXECUTION; + VDS_EXECUTION, + VDS_POOL_AND_STORAGE_CONNECTIONS; } diff --git a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java index 69262b3..e58d8f7 100644 --- a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java +++ b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java @@ -42,10 +42,12 @@ import org.ovirt.engine.core.common.config.ConfigValues; import org.ovirt.engine.core.common.errors.VDSError; import org.ovirt.engine.core.common.errors.VdcBllErrors; +import org.ovirt.engine.core.common.errors.VdcBllMessages; import org.ovirt.engine.core.common.eventqueue.Event; import org.ovirt.engine.core.common.eventqueue.EventQueue; import org.ovirt.engine.core.common.eventqueue.EventResult; import org.ovirt.engine.core.common.eventqueue.EventType; +import org.ovirt.engine.core.common.locks.LockingGroup; import org.ovirt.engine.core.common.utils.Pair; import 
org.ovirt.engine.core.common.vdscommands.ConnectStoragePoolVDSCommandParameters; import org.ovirt.engine.core.common.vdscommands.DisconnectStoragePoolVDSCommandParameters; @@ -67,6 +69,8 @@ import org.ovirt.engine.core.utils.ejb.BeanProxyType; import org.ovirt.engine.core.utils.ejb.BeanType; import org.ovirt.engine.core.utils.ejb.EjbUtils; +import org.ovirt.engine.core.utils.lock.EngineLock; +import org.ovirt.engine.core.utils.lock.LockManagerFactory; import org.ovirt.engine.core.utils.log.Log; import org.ovirt.engine.core.utils.log.LogFactory; import org.ovirt.engine.core.utils.log.Logged; @@ -143,6 +147,7 @@ public Object syncObj = new Object(); private final String storagePoolRefreshJobId; + private final String domainRecoverOnHostJobId; private final HashSet<Guid> mTriedVdssList = new HashSet<Guid>(); private Guid mCurrentVdsId; @@ -181,6 +186,14 @@ storagePoolRefreshJobId = SchedulerUtilQuartzImpl.getInstance().scheduleAFixedDelayJob(this, "_updatingTimer_Elapsed", new Class[0], new Object[0], storagePoolRefreshTime, storagePoolRefreshTime, TimeUnit.SECONDS); + domainRecoverOnHostJobId = + SchedulerUtilQuartzImpl.getInstance().scheduleAFixedDelayJob(this, + "hostsStorageConnectionsAndPoolMetadataRefresh", + new Class[0], + new Object[0], + Config.<Integer> getValue(ConfigValues.HostStorageConnectionAndPoolRefreshTimeInSeconds), + storagePoolRefreshTime, + TimeUnit.SECONDS); } @OnTimerMethodAnnotation("_updatingTimer_Elapsed") @@ -1059,12 +1072,13 @@ private final Map<Guid, HashSet<Guid>> _domainsInProblem = new ConcurrentHashMap<Guid, HashSet<Guid>>(); private final Map<Guid, HashSet<Guid>> _domainsInMaintenance = new ConcurrentHashMap<Guid, HashSet<Guid>>(); + private final Map<Guid, Guid> vdsReportsOnUnseenDomain = new ConcurrentHashMap<Guid, Guid>(); + private final Map<Guid, Guid> vdsHandeledReportsOnUnseenDomains = new ConcurrentHashMap<Guid, Guid>(); private final Map<Guid, String> _timers = new HashMap<Guid, String>(); public void 
updateVdsDomainsData(final Guid vdsId, final String vdsName, final ArrayList<VDSDomainsData> data) { - Set<Guid> domainsInProblems = null; Set<Guid> domainsInMaintenance = null; StoragePool storagePool = DbFacade.getInstance().getStoragePoolDao().get(_storagePoolId); @@ -1072,6 +1086,7 @@ && (storagePool.getStatus() == StoragePoolStatus.Up || storagePool.getStatus() == StoragePoolStatus.NonResponsive)) { try { + Map<Guid, DomainMonitoringResult> domainsProblematicReportInfo = new HashMap<>(); // build a list of all domains in pool // which are in status Active or Unknown Set<Guid> domainsInPool = new HashSet<Guid>( @@ -1087,14 +1102,13 @@ // build a list of all the domains in // pool (domainsInPool) that are not // visible by the host. - List<Guid> domainsInPoolThatNonVisibleByVds = new ArrayList<Guid>(); Set<Guid> dataDomainIds = new HashSet<Guid>(); for (VDSDomainsData tempData : data) { dataDomainIds.add(tempData.getDomainId()); } for (Guid tempDomainId : domainsInPool) { if (!dataDomainIds.contains(tempDomainId)) { - domainsInPoolThatNonVisibleByVds.add(tempDomainId); + domainsProblematicReportInfo.put(tempDomainId, DomainMonitoringResult.NOT_REPORTED); } } @@ -1104,16 +1118,16 @@ // ConfigValues.MaxStorageVdsTimeoutCheckSec) // and are contained in the Active or // Unknown domains in pool - List<Guid> domainsSeenByVdsInProblem = new ArrayList<Guid>(); for (VDSDomainsData tempData : data) { if (domainsInPool.contains(tempData.getDomainId())) { - if (isDomainReportedAsProblematic(tempData, false)) { - domainsSeenByVdsInProblem.add(tempData.getDomainId()); + DomainMonitoringResult domainMonitoringResult = analyzeDomainReport(tempData, false); + if (domainMonitoringResult.isInvalid()) { + domainsProblematicReportInfo.put(tempData.getDomainId(), domainMonitoringResult); } else if (tempData.getDelay() > Config.<Double> getValue(ConfigValues.MaxStorageVdsDelayCheckSec)) { logDelayedDomain(vdsId, tempData); } } else if 
(inActiveDomainsInPool.contains(tempData.getDomainId()) - && !isDomainReportedAsProblematic(tempData, false)) { + && analyzeDomainReport(tempData, false).isValid()) { log.warnFormat("Storage Domain {0} was reported by Host {1} as Active in Pool {2}, moving to active status", getDomainIdTuple(tempData.getDomainId()), vdsName, @@ -1140,35 +1154,26 @@ } } - // build a list of all potential domains - // in problem - domainsInProblems = new HashSet<Guid>(); - domainsInProblems.addAll(domainsInPoolThatNonVisibleByVds); - domainsInProblems.addAll(domainsSeenByVdsInProblem); - + updateDomainInProblem(vdsId, vdsName, domainsProblematicReportInfo, domainsInMaintenance); } catch (RuntimeException ex) { log.error("error in updateVdsDomainsData", ex); } - } - updateDomainInProblem(vdsId, vdsName, domainsInProblems, domainsInMaintenance); } - private void updateDomainInProblem(final Guid vdsId, final String vdsName, final Set<Guid> domainsInProblems, + private void updateDomainInProblem(final Guid vdsId, final String vdsName, final Map<Guid, DomainMonitoringResult> domainsInProblem, final Set<Guid> domainsInMaintenance) { - if (domainsInProblems != null) { ((EventQueue) EjbUtils.findBean(BeanType.EVENTQUEUE_MANAGER, BeanProxyType.LOCAL)).submitEventSync(new Event(_storagePoolId, null, vdsId, EventType.DOMAINMONITORING, ""), new Callable<EventResult>() { @Override public EventResult call() { EventResult result = new EventResult(true, EventType.DOMAINMONITORING); - updateProblematicVdsData(vdsId, vdsName, domainsInProblems); + updateProblematicVdsData(vdsId, vdsName, domainsInProblem); updateMaintenanceVdsData(vdsId, vdsName, domainsInMaintenance); return result; } }); - } } private void logDelayedDomain(final Guid vdsId, VDSDomainsData tempData) { @@ -1191,7 +1196,7 @@ List<Guid> domainWhichWereSeen = new ArrayList<Guid>(); for (VDSDomainsData vdsDomainData : vdsDomainsData) { if (domainsInPool.contains(vdsDomainData.getDomainId())) { - if 
(isDomainReportedAsProblematic(vdsDomainData, true)) { + if (analyzeDomainReport(vdsDomainData, true).isInvalid()) { domainsInProblem.add(vdsDomainData.getDomainId()); } domainWhichWereSeen.add(vdsDomainData.getDomainId()); @@ -1207,14 +1212,38 @@ return domainsInProblem; } - private boolean isDomainReportedAsProblematic(VDSDomainsData tempData, boolean isLog) { + private enum DomainMonitoringResult { + PROBLEMATIC(false), STORAGE_ACCCESS_ERROR(false), OK(true), NOT_REPORTED(false); + + private boolean valid; + + private DomainMonitoringResult(boolean valid) { + this.valid = valid; + } + + public boolean isValid() { + return valid; + } + + public boolean isInvalid() { + return !valid; + } + } + + private DomainMonitoringResult analyzeDomainReport(VDSDomainsData tempData, boolean isLog) { if (tempData.getCode() != 0) { if (isLog) { log.errorFormat("Domain {0} was reported with error code {1}", getDomainIdTuple(tempData.getDomainId()), tempData.getCode()); } - return true; + + if (tempData.getCode() == VdcBllErrors.StorageDomainDoesNotExist.getValue() + || tempData.getCode() == VdcBllErrors.StorageException.getValue()) { + return DomainMonitoringResult.STORAGE_ACCCESS_ERROR; + } + + return DomainMonitoringResult.PROBLEMATIC; } if (tempData.getLastCheck() > Config .<Double> getValue(ConfigValues.MaxStorageVdsTimeoutCheckSec)) { @@ -1223,9 +1252,10 @@ getDomainIdTuple(tempData.getDomainId()), tempData.getLastCheck()); } - return true; + return DomainMonitoringResult.PROBLEMATIC; } - return false; + + return DomainMonitoringResult.OK; } private void updateMaintenanceVdsData(final Guid vdsId, final String vdsName, Set<Guid> domainsInMaintenance) { @@ -1252,23 +1282,48 @@ } } - private void updateProblematicVdsData(final Guid vdsId, final String vdsName, Set<Guid> domainsInProblems) { + private void clearVdsReportInfoOnUnseenDomain(Guid vdsId) { + vdsReportsOnUnseenDomain.remove(vdsId); + } + + private void updateProblematicVdsData(final Guid vdsId, final String vdsName, 
Map<Guid, DomainMonitoringResult> domainsInProblem) { // for all problematic domains // update cache of _domainsInProblem // and _vdssInProblem and add a new // timer for new domains in problem - Set<Guid> domainsInProblemKeySet = _domainsInProblem.keySet(); - for (Guid domainId : domainsInProblems) { - if (domainsInProblemKeySet.contains(domainId)) { + boolean newDomainUnreachableByHost = false; + int domainsUnreachableByHost = 0; + for (Map.Entry<Guid, DomainMonitoringResult> entry : domainsInProblem.entrySet()) { + Guid domainId = entry.getKey(); + DomainMonitoringResult domainMonitoringResult = entry.getValue(); + HashSet<Guid> hostsReportedDomainAsProblematic = _domainsInProblem.get(domainId); + boolean domainNotFound = domainMonitoringResult == DomainMonitoringResult.STORAGE_ACCCESS_ERROR; + if (domainNotFound) { + domainsUnreachableByHost++; + } + if (hostsReportedDomainAsProblematic != null) { + if (!hostsReportedDomainAsProblematic.contains(vdsId) && domainNotFound) { + newDomainUnreachableByHost = true; + } // existing domains in problem updateDomainInProblemData(domainId, vdsId, vdsName); } else { + if (domainNotFound) { + newDomainUnreachableByHost = true; + } // new domains in problems addDomainInProblemData(domainId, vdsId, vdsName); } } + + if (domainsUnreachableByHost == 0) { + clearVdsReportInfoOnUnseenDomain(vdsId); + } else if (newDomainUnreachableByHost) { + vdsReportsOnUnseenDomain.put(vdsId, Guid.newGuid()); + } + Set<Guid> notReportedDomainsByHost = new HashSet<Guid>(_domainsInProblem.keySet()); - notReportedDomainsByHost.removeAll(domainsInProblems); + notReportedDomainsByHost.removeAll(domainsInProblem.keySet()); for (Guid domainId : notReportedDomainsByHost) { Set<Guid> vdsForDomain = _domainsInProblem.get(domainId); if (vdsForDomain != null && vdsForDomain.contains(vdsId)) { @@ -1317,6 +1372,107 @@ return result; } }); + } + + @OnTimerMethodAnnotation("hostsStorageConnectionsAndPoolMetadataRefresh") + public void 
hostsStorageConnectionsAndPoolMetadataRefresh() { + if (vdsReportsOnUnseenDomain.isEmpty()) { + if (!vdsHandeledReportsOnUnseenDomains.isEmpty()) { + vdsHandeledReportsOnUnseenDomains.clear(); + } + + return; + } + + Map<Guid, Guid> reportsToHandle = new HashMap<>(); + reportsToHandle.putAll(vdsReportsOnUnseenDomain); + + for (Map.Entry<Guid, Guid> entry : vdsHandeledReportsOnUnseenDomains.entrySet()) { + Guid vdsId = entry.getKey(); + Guid currentReportId = reportsToHandle.get(vdsId); + if (currentReportId == null) { + vdsHandeledReportsOnUnseenDomains.remove(vdsId); + } else { + Guid handledReportId = entry.getValue(); + if (currentReportId.equals(handledReportId)) { + reportsToHandle.remove(vdsId); + } + } + } + + if (reportsToHandle.isEmpty()) { + return; + } + + List<Callable<Void>> connectStorageTasks = new ArrayList<>(); + final List<Callable<Void>> refreshStoragePoolMetadata = new ArrayList<>(); + final StoragePool storagePool = DbFacade.getInstance().getStoragePoolDao().get(_storagePoolId); + final Guid masterDomainId = + DbFacade.getInstance().getStorageDomainDao().getMasterStorageDomainIdForPool(_storagePoolId); + + Map<String, Pair<String, String>> acquiredLocks = new HashMap<>(); + try { + for (Map.Entry<Guid, Guid> entry : reportsToHandle.entrySet()) { + Guid vdsId = entry.getKey(); + Guid currentReportId = entry.getValue(); + + vdsHandeledReportsOnUnseenDomains.put(vdsId, currentReportId); + Map<String, Pair<String, String>> lockMap = Collections.singletonMap(vdsId.toString(), + new Pair<>(LockingGroup.VDS_POOL_AND_STORAGE_CONNECTIONS.toString(), + VdcBllMessages.ACTION_TYPE_FAILED_OBJECT_LOCKED.toString())); + if (!LockManagerFactory.getLockManager() + .acquireLock(new EngineLock(lockMap, null)) + .getFirst()) { + continue; + } + + acquiredLocks.putAll(lockMap); + // this check is to verify after the lock is taken that the host wasn't moved to maintenance to + // avoid connecting it to the storage servers and to the pool when it's on maintenance. 
+ if (!vdsReportsOnUnseenDomain.containsKey(vdsId)) { + continue; + } + + final VDS vds = DbFacade.getInstance().getVdsDao().get(entry.getKey()); + connectStorageTasks.add(new Callable<Void>() { + + @Override + public Void call() { + ResourceManager.getInstance() + .getEventListener().connectHostToDomainsInActiveOrUnknownStatus(vds); + return null; + } + }); + + refreshStoragePoolMetadata.add(new Callable<Void>() { + + @Override + public Void call() { + StoragePoolDomainHelper.refreshHostPoolMetadata(vds, storagePool, masterDomainId); + return null; + } + }); + } + + ThreadPoolUtil.invokeAll(connectStorageTasks); + + ((EventQueue) EjbUtils.findBean(BeanType.EVENTQUEUE_MANAGER, BeanProxyType.LOCAL)).submitEventSync(new Event(_storagePoolId, + null, + null, + EventType.VDSSPOOLREFRESH, + ""), + new Callable<EventResult>() { + @Override + public EventResult call() { + ThreadPoolUtil.invokeAll(refreshStoragePoolMetadata); + return new EventResult(true, EventType.VDSSPOOLREFRESH); + } + }); + } finally { + if (!acquiredLocks.isEmpty()) { + LockManagerFactory.getLockManager().releaseLock(new EngineLock(acquiredLocks, null)); + } + } } private void updateDomainInProblemData(Guid domainId, Guid vdsId, String vdsName) { @@ -1457,6 +1613,7 @@ } removeVdsAsProblematic(nonOpVdss); removeVdsFromDomainMaintenance(nonOpVdss); + removeVdsFromUnseenDomainsReport(nonOpVdss); } private void removeVdsAsProblematic(List<Guid> nonOpVdss) { @@ -1483,6 +1640,13 @@ if (entry.getValue().isEmpty()) { iterDomainsInProblem.remove(); } + } + } + + private void removeVdsFromUnseenDomainsReport(List<Guid> nonOpVdss) { + log.infoFormat("Removing host(s) {0} from hosts unseen domain report cache", nonOpVdss); + for(Guid id : nonOpVdss) { + clearVdsReportInfoOnUnseenDomain(id); } } @@ -1530,6 +1694,7 @@ log.info("IrsProxyData::disposing"); resetIrs(); SchedulerUtilQuartzImpl.getInstance().deleteJob(storagePoolRefreshJobId); + SchedulerUtilQuartzImpl.getInstance().deleteJob(domainRecoverOnHostJobId); 
_disposed = true; } } diff --git a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/storage/StoragePoolDomainHelper.java b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/storage/StoragePoolDomainHelper.java index bde2488..f6abe84 100644 --- a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/storage/StoragePoolDomainHelper.java +++ b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/storage/StoragePoolDomainHelper.java @@ -8,11 +8,21 @@ import java.util.Set; import org.ovirt.engine.core.common.businessentities.StorageDomainStatus; +import org.ovirt.engine.core.common.businessentities.StoragePool; import org.ovirt.engine.core.common.businessentities.StoragePoolIsoMap; +import org.ovirt.engine.core.common.businessentities.VDS; +import org.ovirt.engine.core.common.errors.VDSError; +import org.ovirt.engine.core.common.errors.VdcBLLException; +import org.ovirt.engine.core.common.errors.VdcBllErrors; +import org.ovirt.engine.core.common.vdscommands.ConnectStoragePoolVDSCommandParameters; +import org.ovirt.engine.core.common.vdscommands.RefreshStoragePoolVDSCommandParameters; +import org.ovirt.engine.core.common.vdscommands.VDSCommandType; +import org.ovirt.engine.core.common.vdscommands.VDSReturnValue; import org.ovirt.engine.core.compat.Guid; import org.ovirt.engine.core.dal.dbbroker.DbFacade; import org.ovirt.engine.core.utils.log.Log; import org.ovirt.engine.core.utils.log.LogFactory; +import org.ovirt.engine.core.vdsbroker.ResourceManager; public class StoragePoolDomainHelper { @@ -38,6 +48,56 @@ return storageDomains; } + /** + * Refreshes the given vds pool metadata, if the host isn't connected to the pool it'll be connected. 
+ * + * @return boolean indicating whether the host pool metadata was "refreshed" succesfully (either by refresh or + * connect) + */ + public static boolean refreshHostPoolMetadata(VDS vds, StoragePool storagePool, Guid masterDomainId) { + try { + ResourceManager.getInstance().runVdsCommand( + VDSCommandType.RefreshStoragePool, + new RefreshStoragePoolVDSCommandParameters(vds.getId(), + storagePool.getId(), + masterDomainId, + storagePool.getmaster_domain_version())); + } catch (VdcBLLException ex) { + VDSError error = ex.getVdsError(); + if (error.getCode() != VdcBllErrors.StoragePoolUnknown) { + log.infoFormat("Failed to refresh host {0} pool {1} metadata with error {2} (message: {3})", + vds.getName(), + storagePool.getId(), error.getCode(), error.getMessage()); + return false; + } + + error = null; + + try { + VDSReturnValue vdsReturnValue = ResourceManager.getInstance().runVdsCommand( + VDSCommandType.ConnectStoragePool, + new ConnectStoragePoolVDSCommandParameters(vds.getId(), + storagePool.getId(), vds.getVdsSpmId(), + masterDomainId, storagePool + .getmaster_domain_version())); + if (!vdsReturnValue.getSucceeded()) { + error = vdsReturnValue.getVdsError(); + } + } catch (VdcBLLException e) { + error = e.getVdsError(); + } + + if (error != null) { + log.infoFormat("Failed to connect host {0} to pool {1} with error {2} (message: {3})", + vds.getName(), + storagePool.getId(), error.getCode(), error.getMessage()); + return false; + } + } + + return true; + } + public static void updateApplicablePoolDomainsStatuses(Guid storagePoolId, Set<StorageDomainStatus> applicableStatusesForUpdate, StorageDomainStatus newStatus, String reason) { diff --git a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql index 155f22f..ade2e86 100644 --- a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql +++ b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql @@ -496,6 +496,7 @@ select 
fn_db_add_config_value('StoragePoolNameSizeLimit','40','general'); select fn_db_add_config_value('StoragePoolNonOperationalResetTimeoutInMin','3','general'); select fn_db_add_config_value('StoragePoolRefreshTimeInSeconds','10','general'); +select fn_db_add_config_value('HostStorageConnectionAndPoolRefreshTimeInSeconds','30','general'); select fn_db_add_config_value('SucceededJobCleanupTimeInMinutes','10','general'); select fn_db_add_config_value('SupportedClusterLevels','3.0','general'); select fn_db_add_config_value('SupportedStorageFormats','0,2','3.0'); -- To view, visit http://gerrit.ovirt.org/27523 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Idb7b2fe8c87805986aaf25cd0f24f605d67d4186 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: master Gerrit-Owner: Liron Ar <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
