Michael Kublin has uploaded a new change for review. Change subject: core: Auto-Recovery should check whether getVdsStats returns 'lastCheck<60' before it proclaims host as up (#844438) ......................................................................
core: Auto-Recovery should check whether getVdsStats returns 'lastCheck<60' before it proclaims host as up (#844438) https://bugzilla.redhat.com/844438 The described bug is a special case of the regular activate-host operation. The problem is that the condition for moving a host to NonOperational differs from the condition for moving it to the Active state. The solution is to perform getVdsStats during InitVdsOnUpCommand in order to get info about the storage domains that the host can see: if all Active domains in the cluster are seen by the host, it will be moved to Active; otherwise it will be left as NonOperational with reason: STORAGE_DOMAIN_UNREACHABLE Change-Id: Id88ef24829d5fae61dddd34f0265ae3132672783 Signed-off-by: Michael Kublin <[email protected]> --- M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/InitVdsOnUpCommand.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/vdscommands/VDSCommandType.java M backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AuditLogDirector.java M backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties M backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java 6 files changed, 82 insertions(+), 7 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/31/8131/1 diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/InitVdsOnUpCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/InitVdsOnUpCommand.java index 06d56bb..448d028 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/InitVdsOnUpCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/InitVdsOnUpCommand.java @@ -21,9 +21,11 @@ import org.ovirt.engine.core.common.businessentities.VdsSpmStatus; import 
org.ovirt.engine.core.common.businessentities.storage_pool; import org.ovirt.engine.core.common.businessentities.gluster.GlusterServerInfo; +import org.ovirt.engine.core.common.errors.VdcBLLException; import org.ovirt.engine.core.common.vdscommands.ConnectStoragePoolVDSCommandParameters; import org.ovirt.engine.core.common.vdscommands.VDSCommandType; import org.ovirt.engine.core.common.vdscommands.VDSReturnValue; +import org.ovirt.engine.core.common.vdscommands.VdsIdAndVdsVDSCommandParametersBase; import org.ovirt.engine.core.common.vdscommands.VdsIdVDSCommandParametersBase; import org.ovirt.engine.core.common.vdscommands.gluster.GlusterHostAddVDSParameters; import org.ovirt.engine.core.compat.Guid; @@ -33,6 +35,7 @@ import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogDirector; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogableBase; import org.ovirt.engine.core.dao.InterfaceDAO; +import org.ovirt.engine.core.vdsbroker.irsbroker.IrsBrokerCommand; /** * Initialize Vds on its loading. For storages: First connect all storage @@ -40,6 +43,7 @@ * * After server initialized - its will be moved to Up status. 
*/ +@SuppressWarnings("serial") @NonTransactiveCommandAttribute public class InitVdsOnUpCommand<T extends StoragePoolParametersBase> extends StorageHandlingCommandBase<T> { private boolean _fencingSucceeded = true; @@ -130,10 +134,7 @@ _connectStorageSucceeded = true; try { setStoragePool(null); - returnValue = _connectPoolSucceeded = Backend - .getInstance() - .getResourceManager() - .RunVdsCommand( + returnValue = _connectPoolSucceeded = runVdsCommand( VDSCommandType.ConnectStoragePool, new ConnectStoragePoolVDSCommandParameters(getVds().getId(), getVds() .getstorage_pool_id(), getVds().getvds_spm_id(), getMasterDomainIdFromDb(), @@ -143,6 +144,13 @@ .getname()); returnValue = false; } + if(returnValue) { + returnValue = proceedVdsStats(); + if(!returnValue) { + AuditLogDirector.log(new AuditLogableBase(getVdsId()), + AuditLogType.VDS_STORAGE_VDS_STATS_FAILED); + } + } // if couldn't connect check if this is the only vds // return true if connect succeeded or it's the only vds if (!returnValue && suppressCheck) { @@ -155,6 +163,26 @@ return returnValue; } + protected boolean proceedVdsStats() { + boolean returnValue = true; + try { + runVdsCommand(VDSCommandType.GetStats, new VdsIdAndVdsVDSCommandParametersBase(getVds())); + if (IrsBrokerCommand.isDomainsReportedAsProblematic(getVds().getstorage_pool_id(), getVds().getDomains())) { + log.errorFormat("One of the domains of host {0} in pool {1} is problematic", + getVds().getvds_name(), + getStoragePool() + .getname()); + returnValue = false; + } + } catch (VdcBLLException e) { + log.errorFormat("Could not get vds stats for vds {0} because of error {1}", + getVds().getvds_name(), + e); + returnValue = false; + } + return returnValue; + } + @Override public AuditLogType getAuditLogTypeValue() { AuditLogType type = AuditLogType.UNASSIGNED; diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java index 14556b5..883124b 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java +++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/AuditLogType.java @@ -42,6 +42,7 @@ VDS_CPU_LOWER_THAN_CLUSTER(515), VDS_CPU_RETRIEVE_FAILED(516), VDS_STORAGE_CONNECTION_FAILED_BUT_LAST_VDS(533), + VDS_STORAGE_VDS_STATS_FAILED(534), VDS_SET_NONOPERATIONAL(517, AuditLogTimeInterval.MINUTE.getValue()), VDS_SET_NONOPERATIONAL_FAILED(518, AuditLogTimeInterval.MINUTE.getValue()), VDS_SET_NONOPERATIONAL_NETWORK(519, AuditLogTimeInterval.MINUTE.getValue()), diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/vdscommands/VDSCommandType.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/vdscommands/VDSCommandType.java index 7d0d18a6..8f3ec23 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/vdscommands/VDSCommandType.java +++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/vdscommands/VDSCommandType.java @@ -5,6 +5,7 @@ RemoveVds("org.ovirt.engine.core.vdsbroker"), ActivateVds("org.ovirt.engine.core.vdsbroker"), FenceVds("org.ovirt.engine.core.vdsbroker.vdsbroker"), + GetStats("org.ovirt.engine.core.vdsbroker.vdsbroker"), CreateVm("org.ovirt.engine.core.vdsbroker"), DestroyVm("org.ovirt.engine.core.vdsbroker"), Pause("org.ovirt.engine.core.vdsbroker.vdsbroker"), diff --git a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AuditLogDirector.java b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AuditLogDirector.java index 3ccb016..590e079 100644 --- a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AuditLogDirector.java +++ 
b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dal/dbbroker/auditloghandling/AuditLogDirector.java @@ -263,6 +263,7 @@ mSeverities.put(AuditLogType.VDS_ALERT_FENCING_NO_PROXY_HOST, AuditLogSeverity.NORMAL); mSeverities.put(AuditLogType.VDS_LOW_MEM, AuditLogSeverity.WARNING); mSeverities.put(AuditLogType.VDS_STORAGE_CONNECTION_FAILED_BUT_LAST_VDS, AuditLogSeverity.ERROR); + mSeverities.put(AuditLogType.VDS_STORAGE_VDS_STATS_FAILED, AuditLogSeverity.ERROR); mSeverities.put(AuditLogType.VDS_LOW_DISK_SPACE, AuditLogSeverity.WARNING); mSeverities.put(AuditLogType.VDS_LOW_DISK_SPACE_ERROR, AuditLogSeverity.ERROR); mSeverities.put(AuditLogType.VDS_ACTIVATE_ASYNC, AuditLogSeverity.NORMAL); diff --git a/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties b/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties index e861513..e3d958c 100644 --- a/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties +++ b/backend/manager/modules/dal/src/main/resources/bundles/AuditLogMessages.properties @@ -358,6 +358,7 @@ CPU_FLAGS_NX_IS_MISSING=Host ${VdsName} is missing the NX cpu flag. This flag can be enabled via the host BIOS. Please set Disable Execute (XD) for an Intel host, or No Execute (NX) for AMD. Please make sure to completely power off the host for this change to take effect. VDS_CPU_RETRIEVE_FAILED=Failed to determine Host ${VdsName} CPU level - could not retrieve CPU flags. VDS_STORAGE_CONNECTION_FAILED_BUT_LAST_VDS=Failed to connect Host ${VdsName} to Data Center, due to connectivity errors with the Storage. Host ${VdsName} will remain in Up state (but inactive), as it is the last Host in the Data Center, to enable manual intervention by the Administrator. +VDS_STORAGE_VDS_STATS_FAILED=Host ${VdsName} reports about one of the active domains as problematic. VDS_SET_NONOPERATIONAL=Host ${VdsName} moved to Non-Operational state. 
VDS_SET_NONOPERATIONAL_FAILED=Failed to move Host ${VdsName} to Non-Operational state. VDS_FENCE_STATUS=Host ${VdsName} power management was verified successfully. diff --git a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java index 07cce29..87ac3b6 100644 --- a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java +++ b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java @@ -87,6 +87,14 @@ } } + public static boolean isDomainsReportedAsProblematic(Guid storagePoolId, List<VDSDomainsData> vdsDomainsData) { + IrsProxyData proxy = _irsProxyData.get(storagePoolId); + if (proxy != null) { + return proxy.isDomainsReportedAsProblematic(vdsDomainsData); + } + return false; + } + public static void lockDbSave(Guid storagePoolId) { IrsProxyData proxy = _irsProxyData.get(storagePoolId); if (proxy != null) { @@ -1098,13 +1106,13 @@ List<Guid> domainsSeenByVdsInProblem = new ArrayList<Guid>(); for (VDSDomainsData tempData : data) { if (domainsInPool.contains(tempData.getDomainId())) { - if (isDomainReportedAsProblematic(tempData)) { + if (isDomainReportedAsProblematic(tempData, false)) { domainsSeenByVdsInProblem.add(tempData.getDomainId()); } else if (tempData.getDelay() > Config.<Double> GetValue(ConfigValues.MaxStorageVdsDelayCheckSec)) { logDelayedDomain(vdsId, tempData); } } else if (inActiveDomainsInPool.contains(tempData.getDomainId()) - && !isDomainReportedAsProblematic(tempData)) { + && !isDomainReportedAsProblematic(tempData, false)) { log.warnFormat("Storage {0} was reported by vds {1} as active in pool {2}, moving to active status", tempData.getDomainId(), vdsName, @@ -1154,12 +1162,47 @@ AuditLogType.VDS_DOMAIN_DELAY_INTERVAL); } - private boolean 
isDomainReportedAsProblematic(VDSDomainsData tempData) { + public boolean isDomainsReportedAsProblematic(List<VDSDomainsData> vdsDomainsData) { + Set<Guid> domainsInPool = new HashSet<Guid>( + DbFacade.getInstance().getStorageDomainStaticDAO().getAllIds( + _storagePoolId, StorageDomainStatus.Active)); + domainsInPool.addAll(DbFacade.getInstance().getStorageDomainStaticDAO().getAllIds( + _storagePoolId, StorageDomainStatus.Unknown)); + List<Guid> domainWhicWereSeen = new ArrayList<Guid>(); + for (VDSDomainsData vdsDomainData : vdsDomainsData) { + if (domainsInPool.contains(vdsDomainData.getDomainId())) { + if (isDomainReportedAsProblematic(vdsDomainData, true)) { + return true; + } + domainWhicWereSeen.add(vdsDomainData.getDomainId()); + } + } + domainsInPool.removeAll(domainWhicWereSeen); + if (domainsInPool.size() > 0) { + for (Guid domainId : domainsInPool) { + log.errorFormat("Domain {0} was not seen by host", domainId); + } + return true; + } + return false; + } + + private boolean isDomainReportedAsProblematic(VDSDomainsData tempData, boolean isLog) { if (tempData.getCode() != 0) { + if (isLog) { + log.errorFormat("Domain {0} was reported with error code {1}", + tempData.getDomainId(), + tempData.getCode()); + } return true; } if (tempData.getLastCheck() > Config .<Double> GetValue(ConfigValues.MaxStorageVdsTimeoutCheckSec)) { + if (isLog) { + log.errorFormat("Domain {0} was reported with too big lastCheck {1}", + tempData.getDomainId(), + tempData.getLastCheck()); + } return true; } return false; -- To view, visit http://gerrit.ovirt.org/8131 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Id88ef24829d5fae61dddd34f0265ae3132672783 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: master Gerrit-Owner: Michael Kublin <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
