This is an automated email from the ASF dual-hosted git repository.

andrijapanicsb pushed a commit to branch 4.22.1.0-ha-fence-hotfix
in repository https://gitbox.apache.org/repos/asf/cloudstack.git

commit 4408a3346a9af0916bcb516205cbac523b7e9571
Author: Andrija Panic <[email protected]>
AuthorDate: Mon Jun 8 18:19:03 2026 +0200

    KVM HA: fence by confirming host power state, not the power-off result
    
    KVMHAProvider.fence() declared a host fenced only when the out-of-band 
power-off
    command reported success. Against an already-off chassis the BMC rejects the
    power-off (e.g. Redfish returns HTTP 409), so fence() failed and the host 
stayed
    stuck in the Fencing HA state, which maps to Disconnected (not Down). VM-HA
    therefore never restarted the VMs until the dead host was powered back on.
    
    Fencing now succeeds based on the actual chassis power state:
     - if the host is already powered off (OOBM STATUS == Off), treat it as 
fenced;
     - otherwise issue a best-effort power-off and confirm via OOBM STATUS;
     - only a confirmed Off state counts as success; if the state cannot be 
confirmed
       (e.g. unreachable BMC) the fence fails and is retried, to avoid 
split-brain.
    
    Also map Redfish PowerOperation.OFF to ForceOff (hard power-off) instead of
    GracefulShutdown, consistent with the ipmitool driver and appropriate for 
fencing
    an unresponsive host (SOFT remains the graceful ACPI shutdown).
    
    Fixes #13376
---
 .../apache/cloudstack/kvm/ha/KVMHAProvider.java    | 58 ++++++++++++++---
 .../apache/cloudstack/kvm/ha/KVMHostHATest.java    | 76 ++++++++++++++++++++++
 .../driver/redfish/RedfishWrapper.java             |  6 +-
 3 files changed, 129 insertions(+), 11 deletions(-)

diff --git 
a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java
 
b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java
index f0b5cfc337d..352c1b14724 100644
--- 
a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java
+++ 
b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java
@@ -33,6 +33,7 @@ import org.apache.cloudstack.ha.provider.HAProvider;
 import org.apache.cloudstack.ha.provider.HARecoveryException;
 import org.apache.cloudstack.ha.provider.host.HAAbstractHostProvider;
 import 
org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerOperation;
+import 
org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerState;
 import org.apache.cloudstack.outofbandmanagement.OutOfBandManagementService;
 import org.joda.time.DateTime;
 
@@ -85,18 +86,57 @@ public final class KVMHAProvider extends 
HAAbstractHostProvider implements HAPro
 
     @Override
     public boolean fence(Host r) throws HAFenceException {
+        if (!outOfBandManagementService.isOutOfBandManagementEnabled(r)) {
+            logger.warn("Out-of-band management is not enabled/configured for 
host {}; cannot fence it.", r);
+            return false;
+        }
 
+        // Fencing must guarantee the host is powered off, and the only 
reliable signal for that is
+        // the actual chassis power state - not the return code of the 
power-off command. An already-off
+        // (or just-turned-off) host can make the power-off command report an 
error/conflict even though
+        // the host is down; conversely, an unreachable BMC must NOT be 
treated as a successful fence, to
+        // avoid split-brain. Therefore: (1) if already off, consider it 
fenced; (2) otherwise issue a
+        // best-effort power-off; (3) declare the fence successful only if the 
power state is confirmed Off.
         try {
-            if (outOfBandManagementService.isOutOfBandManagementEnabled(r)){
-                final OutOfBandManagementResponse resp = 
outOfBandManagementService.executePowerOperation(r, PowerOperation.OFF, null);
-                return resp.getSuccess();
-            } else {
-                logger.warn("OOBM fence operation failed for this host {}", r);
-                return false;
+            if (isHostPoweredOff(r)) {
+                logger.info("Host {} is already powered off; considering it 
fenced.", r);
+                return true;
             }
-        } catch (Exception e){
-            logger.warn("OOBM service is not configured or enabled for this 
host {} error is {}", r, e.getMessage());
-            throw new HAFenceException(String.format("OBM service is not 
configured or enabled for this host %s", r.getName()), e);
+
+            try {
+                outOfBandManagementService.executePowerOperation(r, 
PowerOperation.OFF, null);
+            } catch (Exception e) {
+                // The power-off may legitimately fail (e.g. the chassis is 
already off or the command
+                // conflicts with the current power state). Do not fail here - 
verify the actual power
+                // state below instead of trusting the command result.
+                logger.warn("Out-of-band power-off command for host {} did not 
complete successfully ({}); verifying the actual power state.", r, 
e.getMessage());
+            }
+
+            if (isHostPoweredOff(r)) {
+                logger.info("Confirmed host {} is powered off; fencing 
successful.", r);
+                return true;
+            }
+
+            logger.warn("Could not confirm host {} is powered off after the 
fence power-off; fencing will be retried.", r);
+            return false;
+        } catch (Exception e) {
+            logger.warn("Out-of-band fence operation failed for host {}: {}", 
r, e.getMessage());
+            throw new HAFenceException(String.format("Out-of-band fence 
operation failed for host %s", r.getName()), e);
+        }
+    }
+
+    /**
+     * Returns {@code true} only when the host's chassis power is positively 
confirmed to be Off via
+     * out-of-band management. Any failure to determine the power state (e.g. 
an unreachable BMC) returns
+     * {@code false}, so that a host whose state cannot be confirmed is never 
treated as fenced.
+     */
+    protected boolean isHostPoweredOff(Host r) {
+        try {
+            final OutOfBandManagementResponse statusResponse = 
outOfBandManagementService.executePowerOperation(r, PowerOperation.STATUS, 
null);
+            return statusResponse != null && 
PowerState.Off.equals(statusResponse.getPowerState());
+        } catch (Exception e) {
+            logger.warn("Unable to determine the out-of-band power state of 
host {}: {}", r, e.getMessage());
+            return false;
         }
     }
 
diff --git 
a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java
 
b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java
index a94fb01475a..8b7cfc854c0 100644
--- 
a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java
+++ 
b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java
@@ -20,10 +20,20 @@ package org.apache.cloudstack.kvm.ha;
 
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.ArgumentMatchers.isNull;
 import static org.mockito.Mockito.lenient;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 
+import org.apache.cloudstack.api.response.OutOfBandManagementResponse;
 import org.apache.cloudstack.ha.provider.HACheckerException;
+import org.apache.cloudstack.ha.provider.HAFenceException;
+import 
org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerOperation;
+import 
org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerState;
+import org.apache.cloudstack.outofbandmanagement.OutOfBandManagementService;
 import org.joda.time.DateTime;
 import org.junit.Before;
 import org.junit.Test;
@@ -34,6 +44,7 @@ import org.mockito.junit.MockitoJUnitRunner;
 import com.cloud.exception.StorageUnavailableException;
 import com.cloud.host.Host;
 import com.cloud.hypervisor.Hypervisor.HypervisorType;
+import com.cloud.utils.exception.CloudRuntimeException;
 
 @RunWith(MockitoJUnitRunner.class)
 public class KVMHostHATest {
@@ -42,12 +53,21 @@ public class KVMHostHATest {
     private Host host;
     @Mock
     private KVMHostActivityChecker kvmHostActivityChecker;
+    @Mock
+    private OutOfBandManagementService outOfBandManagementService;
     private KVMHAProvider kvmHAProvider;
 
     @Before
     public void setup() {
         kvmHAProvider = new KVMHAProvider();
         kvmHAProvider.hostActivityChecker = kvmHostActivityChecker;
+        kvmHAProvider.outOfBandManagementService = outOfBandManagementService;
+    }
+
+    private OutOfBandManagementResponse powerStatusResponse(PowerState state) {
+        OutOfBandManagementResponse response = 
mock(OutOfBandManagementResponse.class);
+        lenient().when(response.getPowerState()).thenReturn(state);
+        return response;
     }
 
     @Test
@@ -80,4 +100,60 @@ public class KVMHostHATest {
         assertFalse(kvmHAProvider.hasActivity(host, dt));
     }
 
+    @Test
+    public void testFenceWhenOutOfBandManagementNotEnabled() throws 
HAFenceException {
+        
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(false);
+        assertFalse(kvmHAProvider.fence(host));
+        verify(outOfBandManagementService, 
never()).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull());
+    }
+
+    @Test
+    public void testFenceWhenHostAlreadyPoweredOff() throws HAFenceException {
+        OutOfBandManagementResponse offStatus = 
powerStatusResponse(PowerState.Off);
+        
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
+        when(outOfBandManagementService.executePowerOperation(eq(host), 
eq(PowerOperation.STATUS), isNull())).thenReturn(offStatus);
+        assertTrue(kvmHAProvider.fence(host));
+        // The host is already off, so no power-off command should be issued.
+        verify(outOfBandManagementService, 
never()).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull());
+    }
+
+    @Test
+    public void testFenceConfirmedOffAfterPowerOff() throws HAFenceException {
+        OutOfBandManagementResponse onStatus = 
powerStatusResponse(PowerState.On);
+        OutOfBandManagementResponse offStatus = 
powerStatusResponse(PowerState.Off);
+        
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
+        // First STATUS (pre-check) returns On, second STATUS (post power-off) 
returns Off.
+        when(outOfBandManagementService.executePowerOperation(eq(host), 
eq(PowerOperation.STATUS), isNull()))
+                .thenReturn(onStatus, offStatus);
+        assertTrue(kvmHAProvider.fence(host));
+        verify(outOfBandManagementService).executePowerOperation(eq(host), 
eq(PowerOperation.OFF), isNull());
+    }
+
+    @Test
+    public void testFencePowerOffCommandFailsButHostConfirmedOff() throws 
HAFenceException {
+        // Regression: the power-off command fails (e.g. the chassis is 
already off and the BMC returns
+        // HTTP 409), but the host is actually off. Fencing must succeed 
because the confirmed power state -
+        // not the power-off command's result - is authoritative.
+        OutOfBandManagementResponse onStatus = 
powerStatusResponse(PowerState.On);
+        OutOfBandManagementResponse offStatus = 
powerStatusResponse(PowerState.Off);
+        
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
+        when(outOfBandManagementService.executePowerOperation(eq(host), 
eq(PowerOperation.STATUS), isNull()))
+                .thenReturn(onStatus, offStatus);
+        when(outOfBandManagementService.executePowerOperation(eq(host), 
eq(PowerOperation.OFF), isNull()))
+                .thenThrow(new CloudRuntimeException("power-off failed: HTTP 
409"));
+        assertTrue(kvmHAProvider.fence(host));
+    }
+
+    @Test
+    public void testFenceFailsWhenPowerStateCannotBeConfirmedOff() throws 
HAFenceException {
+        // The host cannot be confirmed off (e.g. an unreachable BMC). Fencing 
must NOT succeed, to avoid
+        // restarting VMs while the host may still be running (split-brain).
+        
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
+        when(outOfBandManagementService.executePowerOperation(eq(host), 
eq(PowerOperation.STATUS), isNull()))
+                .thenThrow(new CloudRuntimeException("BMC unreachable"));
+        when(outOfBandManagementService.executePowerOperation(eq(host), 
eq(PowerOperation.OFF), isNull()))
+                .thenThrow(new CloudRuntimeException("BMC unreachable"));
+        assertFalse(kvmHAProvider.fence(host));
+    }
+
 }
diff --git 
a/plugins/outofbandmanagement-drivers/redfish/src/main/java/org/apache/cloudstack/outofbandmanagement/driver/redfish/RedfishWrapper.java
 
b/plugins/outofbandmanagement-drivers/redfish/src/main/java/org/apache/cloudstack/outofbandmanagement/driver/redfish/RedfishWrapper.java
index 09cee3b21ae..17199de1c71 100644
--- 
a/plugins/outofbandmanagement-drivers/redfish/src/main/java/org/apache/cloudstack/outofbandmanagement/driver/redfish/RedfishWrapper.java
+++ 
b/plugins/outofbandmanagement-drivers/redfish/src/main/java/org/apache/cloudstack/outofbandmanagement/driver/redfish/RedfishWrapper.java
@@ -31,7 +31,9 @@ public class RedfishWrapper {
         case ON:
             return RedfishClient.RedfishResetCmd.On;
         case OFF:
-            return RedfishClient.RedfishResetCmd.GracefulShutdown;
+            // OFF is a hard power-off (consistent with the ipmitool driver 
and required for HA
+            // fencing of an unresponsive host); SOFT performs the graceful 
ACPI shutdown.
+            return RedfishClient.RedfishResetCmd.ForceOff;
         case CYCLE:
             return RedfishClient.RedfishResetCmd.PowerCycle;
         case RESET:
@@ -39,7 +41,7 @@ public class RedfishWrapper {
         case SOFT:
             return RedfishClient.RedfishResetCmd.GracefulShutdown;
         case STATUS:
-            throw new IllegalStateException(String.format("%s is not a valid 
Redfish Reset command [%s]", operation));
+            throw new IllegalStateException(String.format("%s is not a valid 
Redfish Reset command", operation));
         default:
             throw new IllegalStateException(String.format("Redfish does not 
support operation [%s]", operation));
         }

Reply via email to