Martin Peřina has uploaded a new change for review. Change subject: core: Adds SSH soft fencing capability ......................................................................
core: Adds SSH soft fencing capability Adds a step into standard host not responding treatment process that tries to restart VDSM on the host using SSH connection prior to standard fencing process. If the VDSM restart doesn't help, the standard fencing process will be executed. Change-Id: I8002b6ac00a1e2e543b5cc8d1affdd42b994d5f7 Bug-Url: https://bugzilla.redhat.com/967328 Signed-off-by: Martin Perina <[email protected]> --- M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsNotRespondingTreatmentCommand.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/action/FenceVdsActionParameters.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java M backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java 5 files changed, 171 insertions(+), 12 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/98/15798/1 diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java index c8c0ee3..18422a7 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java @@ -115,7 +115,7 @@ } @Override - public void vdsNotResponding(final VDS vds) { + public void vdsNotResponding(final VDS vds, final boolean executeSshSoftFencing) { ExecutionHandler.updateSpecificActionJobCompleted(vds.getId(), VdcActionType.MaintenanceVds, false); ThreadPoolUtil.execute(new Runnable() { @Override @@ -124,7 +124,7 @@ vds.getId(), vds.getHostName()); Backend.getInstance().runInternalAction(VdcActionType.VdsNotRespondingTreatment, - new FenceVdsActionParameters(vds.getId(), 
FenceActionType.Restart), + new FenceVdsActionParameters(vds.getId(), FenceActionType.Restart, executeSshSoftFencing), ExecutionHandler.createInternalJobContext()); } }); diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsNotRespondingTreatmentCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsNotRespondingTreatmentCommand.java index 2d070fb..77ce9f3 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsNotRespondingTreatmentCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsNotRespondingTreatmentCommand.java @@ -1,7 +1,17 @@ package org.ovirt.engine.core.bll; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.security.KeyPair; +import java.security.KeyStore; +import java.security.KeyStoreException; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; +import java.util.concurrent.TimeUnit; import org.ovirt.engine.core.common.AuditLogType; import org.ovirt.engine.core.common.VdcObjectType; @@ -12,10 +22,14 @@ import org.ovirt.engine.core.common.businessentities.VM; import org.ovirt.engine.core.common.businessentities.VMStatus; import org.ovirt.engine.core.common.businessentities.VdsSpmStatus; +import org.ovirt.engine.core.common.config.Config; +import org.ovirt.engine.core.common.config.ConfigValues; import org.ovirt.engine.core.common.vdscommands.SetVmStatusVDSCommandParameters; import org.ovirt.engine.core.common.vdscommands.VDSCommandType; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogDirector; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogableBase; +import org.ovirt.engine.core.utils.EngineLocalConfig; +import org.ovirt.engine.core.utils.ssh.SSHClient; @NonTransactiveCommandAttribute public class VdsNotRespondingTreatmentCommand<T extends 
FenceVdsActionParameters> extends RestartVdsCommand<T> { @@ -48,7 +62,17 @@ protected void executeCommand() { setVds(null); if (getVds() != null && shouldVdsBeFenced()) { - super.executeCommand(); + boolean sshVdsmRestartSuccess = false; + if (getParameters().isExecuteSshVdsmRestart()) { + sshVdsmRestartSuccess = + executeRestartVDSMUsingSsh(getVds().getHostName(), getVds().getVdsGroupCompatibilityVersion() + .toString()); + } + + // VDSM restart using SSH was not successful, execute standard fencing + if (!sshVdsmRestartSuccess) { + super.executeCommand(); + } } else { setCommandShouldBeLogged(false); log.infoFormat("Host {0}({1}) not fenced since it's status is ok, or it doesn't exist anymore.", @@ -129,4 +153,110 @@ } return jobProperties; } + + /** + * Executes VDSM restart command using SSH connection + * @param host host to restart VDSM on + * @returns {@code true} if restart command has been executed successfully, {@code false} otherwise + */ + private boolean executeRestartVDSMUsingSsh(String host, String version) { + boolean result = true; + SSHClient sshClient = null; + + try { + sshClient = getSshClient(host); + sshClient.connect(); + sshClient.authenticate(); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + sshClient.executeCommand(Config.<String> GetValue(ConfigValues.RestartVdsmBySshCommand, version), + null, + bos, + null); + log.info("VDSM restart executed on host " + host); + log.debug("VDSM restart command output " + bos.toString()); + } catch (Exception ex) { + log.error("VDSM restart failed on host " + host, ex); + result = false; + } finally { + closeSshConnection(sshClient); + } + return result; + } + + /** + * Tries to close SSH client connection + * @param sshClient SSH client + */ + private void closeSshConnection(SSHClient sshClient) { + if (sshClient != null) { + try { + sshClient.disconnect(); + } catch (Exception ex) { + log.error("Error disconnecting SSH connection", ex); + } + } + } + + /** + * Initializes SSH client 
instance + * + * @param host + * host to connect to + * @return initialized SSH client instance + * @throws KeyStoreException + * if engine SSH key pair cannot be loaded + */ + private SSHClient getSshClient(String host) throws KeyStoreException { + SSHClient sshClient = new SSHClient(); + sshClient.setHardTimeout(TimeUnit.SECONDS.toMillis( + Config.<Integer> GetValue(ConfigValues.SSHInactivityHardTimoutSeconds))); + sshClient.setSoftTimeout(TimeUnit.SECONDS.toMillis( + Config.<Integer> GetValue(ConfigValues.SSHInactivityTimoutSeconds))); + sshClient.setHost(host); + sshClient.setUser("root"); + loadEngineSshKeyPair(sshClient); + return sshClient; + } + + /** + * Loads engine SSH key pair into specified SSH client instance. + * + * @param sshClient + * SSH client instance + */ + private void loadEngineSshKeyPair(SSHClient sshClient) throws KeyStoreException { + EngineLocalConfig config = EngineLocalConfig.getInstance(); + final File p12 = config.getPKIEngineStore(); + final char[] password = config.getPKIEngineStorePassword().toCharArray(); + final String alias = config.getPKIEngineStoreAlias(); + + KeyStore.PrivateKeyEntry entry; + InputStream in = null; + try { + in = new FileInputStream(p12); + KeyStore ks = KeyStore.getInstance("PKCS12"); + ks.load(in, password); + + entry = (KeyStore.PrivateKeyEntry) ks.getEntry(alias, new KeyStore.PasswordProtection(password)); + } catch (Exception e) { + throw new KeyStoreException(String.format("Failed to get certificate entry from key store: %1$s/%2$s", + p12, + alias), e); + } finally { + Arrays.fill(password, '*'); + if (in != null) { + try { + in.close(); + } catch (IOException e) { + log.error("Cannot close key store", e); + } + } + } + + if (entry == null) { + throw new KeyStoreException( + String.format("Bad key store: %1$s/%2$s", p12, alias)); + } + sshClient.setKeyPair(new KeyPair(entry.getCertificate().getPublicKey(), entry.getPrivateKey())); + } } diff --git 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/action/FenceVdsActionParameters.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/action/FenceVdsActionParameters.java index 532d4d0..db9127c5 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/action/FenceVdsActionParameters.java +++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/action/FenceVdsActionParameters.java @@ -6,17 +6,32 @@ public class FenceVdsActionParameters extends VdsActionParameters { private static final long serialVersionUID = 6174371941176548263L; - public FenceVdsActionParameters(Guid vdsId, FenceActionType action) { - super(vdsId); - _action = action; + private FenceActionType _action = FenceActionType.forValue(0); + + /** + * Indicator to execute VDSM restart using SSH + */ + private final boolean executeSshVdsmRestart; + + public FenceVdsActionParameters() { + this.executeSshVdsmRestart = false; } - private FenceActionType _action = FenceActionType.forValue(0); + public FenceVdsActionParameters(Guid vdsId, FenceActionType action) { + this(vdsId, action, false); + } + + public FenceVdsActionParameters(Guid vdsId, FenceActionType action, boolean executeSshVdsmRestart) { + super(vdsId); + _action = action; + this.executeSshVdsmRestart = executeSshVdsmRestart; + } public FenceActionType getAction() { return _action; } - public FenceVdsActionParameters() { + public boolean isExecuteSshVdsmRestart() { + return executeSshVdsmRestart; } } diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java index 22318ae..f0b97b6 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java +++ 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java @@ -8,7 +8,7 @@ import org.ovirt.engine.core.compat.TransactionScopeOption; public interface IVdsEventListener { - void vdsNotResponding(VDS vds); // BLL + void vdsNotResponding(VDS vds, boolean executeSshSoftFencing); // BLL void vdsNonOperational(Guid vdsId, NonOperationalReason type, boolean logCommand, boolean saveToDb, Guid domainId); // BLL diff --git a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java index e2db5f5..343a520 100644 --- a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java +++ b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java @@ -94,6 +94,7 @@ private final AtomicInteger mFailedToRunVmAttempts; private final AtomicInteger mUnrespondedAttempts; + private final AtomicBoolean sshSoftFencingExecuted; private static final int VDS_DURING_FAILURE_TIMEOUT_IN_MINUTES = Config .<Integer> GetValue(ConfigValues.TimeToReduceFailedRunOnVdsInMinutes); @@ -137,6 +138,7 @@ monitoringStrategy = MonitoringStrategyFactory.getMonitoringStrategyForVds(vds); mUnrespondedAttempts = new AtomicInteger(); mFailedToRunVmAttempts = new AtomicInteger(); + sshSoftFencingExecuted = new AtomicBoolean(false); monitoringLock = new EngineLock(Collections.singletonMap(_vdsId.toString(), new Pair<String, String>(LockingGroup.VDS_INIT.name(), "")), null); @@ -521,6 +523,7 @@ */ public void SuccededToRunVm(Guid vmId) { mUnrespondedAttempts.set(0); + sshSoftFencingExecuted.set(false); ResourceManager.getInstance().SuccededToRunVm(vmId, _vds.getId()); } @@ -600,11 +603,18 @@ if (spmStatus != VdsSpmStatus.None) { spmIndicator = 1; } - return TimeUnit.SECONDS.toMillis((int)( + int secToFence = (int)( // delay time can be fracture number, 
casting it to int should be enough Config.<Integer> GetValue(ConfigValues.TimeoutToResetVdsInSeconds) + (Config.<Double> GetValue(ConfigValues.DelayResetForSpmInSeconds) * spmIndicator) + - (Config.<Double> GetValue(ConfigValues.DelayResetPerVmInSeconds) * vmCount))); + (Config.<Double> GetValue(ConfigValues.DelayResetPerVmInSeconds) * vmCount)); + + if (sshSoftFencingExecuted.get()) { + // VDSM restart by SSH has been executed, wait more to see if host is OK + secToFence = 2 * secToFence; + } + + return TimeUnit.SECONDS.toMillis(secToFence); } /** * Handle network exception, return true if save vdsDynamic to DB is needed. @@ -640,7 +650,11 @@ AuditLogableBase logable = new AuditLogableBase(vds.getId()); AuditLogDirector.log(logable, AuditLogType.VDS_FAILURE); - ResourceManager.getInstance().getEventListener().vdsNotResponding(vds); + boolean executeSshSoftFencing = false; + if (!sshSoftFencingExecuted.getAndSet(true)) { + executeSshSoftFencing = true; + } + ResourceManager.getInstance().getEventListener().vdsNotResponding(vds, executeSshSoftFencing); } return true; } -- To view, visit http://gerrit.ovirt.org/15798 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I8002b6ac00a1e2e543b5cc8d1affdd42b994d5f7 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: master Gerrit-Owner: Martin Peřina <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
