Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.apache.cloudstack.ha.provider.HARecoveryException;
import org.apache.cloudstack.ha.provider.host.HAAbstractHostProvider;
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerOperation;
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerState;
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagementService;
import org.joda.time.DateTime;

Expand Down Expand Up @@ -85,18 +86,57 @@ public boolean recover(Host r) throws HARecoveryException {

@Override
public boolean fence(Host r) throws HAFenceException {
if (!outOfBandManagementService.isOutOfBandManagementEnabled(r)) {
logger.warn("Out-of-band management is not enabled/configured for host {}; cannot fence it.", r);
return false;
}

// Fencing must guarantee the host is powered off, and the only reliable signal for that is
// the actual chassis power state - not the return code of the power-off command. An already-off
// (or just-turned-off) host can make the power-off command report an error/conflict even though
// the host is down; conversely, an unreachable BMC must NOT be treated as a successful fence, to
// avoid split-brain. Therefore: (1) if already off, consider it fenced; (2) otherwise issue a
// best-effort power-off; (3) declare the fence successful only if the power state is confirmed Off.
try {
if (outOfBandManagementService.isOutOfBandManagementEnabled(r)){
final OutOfBandManagementResponse resp = outOfBandManagementService.executePowerOperation(r, PowerOperation.OFF, null);
return resp.getSuccess();
} else {
logger.warn("OOBM fence operation failed for this host {}", r);
return false;
if (isHostPoweredOff(r)) {
logger.info("Host {} is already powered off; considering it fenced.", r);
return true;
}
} catch (Exception e){
logger.warn("OOBM service is not configured or enabled for this host {} error is {}", r, e.getMessage());
throw new HAFenceException(String.format("OBM service is not configured or enabled for this host %s", r.getName()), e);

try {
outOfBandManagementService.executePowerOperation(r, PowerOperation.OFF, null);
} catch (Exception e) {
// The power-off may legitimately fail (e.g. the chassis is already off or the command
// conflicts with the current power state). Do not fail here - verify the actual power
// state below instead of trusting the command result.
logger.warn("Out-of-band power-off command for host {} did not complete successfully ({}); verifying the actual power state.", r, e.getMessage());
}

if (isHostPoweredOff(r)) {
logger.info("Confirmed host {} is powered off; fencing successful.", r);
return true;
}

logger.warn("Could not confirm host {} is powered off after the fence power-off; fencing will be retried.", r);
return false;
} catch (Exception e) {
logger.warn("Out-of-band fence operation failed for host {}: {}", r, e.getMessage());
throw new HAFenceException(String.format("Out-of-band fence operation failed for host %s", r.getName()), e);
}
}

/**
* Returns {@code true} only when the host's chassis power is positively confirmed to be Off via
* out-of-band management. Any failure to determine the power state (e.g. an unreachable BMC) returns
* {@code false}, so that a host whose state cannot be confirmed is never treated as fenced.
*/
protected boolean isHostPoweredOff(Host r) {
try {
final OutOfBandManagementResponse statusResponse = outOfBandManagementService.executePowerOperation(r, PowerOperation.STATUS, null);
return statusResponse != null && PowerState.Off.equals(statusResponse.getPowerState());
} catch (Exception e) {
logger.warn("Unable to determine the out-of-band power state of host {}: {}", r, e.getMessage());
return false;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,20 @@

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.ArgumentMatchers.isNull;
import static org.mockito.Mockito.lenient;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import org.apache.cloudstack.api.response.OutOfBandManagementResponse;
import org.apache.cloudstack.ha.provider.HACheckerException;
import org.apache.cloudstack.ha.provider.HAFenceException;
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerOperation;
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerState;
import org.apache.cloudstack.outofbandmanagement.OutOfBandManagementService;
import org.joda.time.DateTime;
import org.junit.Before;
import org.junit.Test;
Expand All @@ -34,6 +44,7 @@
import com.cloud.exception.StorageUnavailableException;
import com.cloud.host.Host;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.utils.exception.CloudRuntimeException;

@RunWith(MockitoJUnitRunner.class)
public class KVMHostHATest {
Expand All @@ -42,12 +53,21 @@ public class KVMHostHATest {
private Host host;
@Mock
private KVMHostActivityChecker kvmHostActivityChecker;
@Mock
private OutOfBandManagementService outOfBandManagementService;
private KVMHAProvider kvmHAProvider;

@Before
public void setup() {
kvmHAProvider = new KVMHAProvider();
kvmHAProvider.hostActivityChecker = kvmHostActivityChecker;
kvmHAProvider.outOfBandManagementService = outOfBandManagementService;
}

private OutOfBandManagementResponse powerStatusResponse(PowerState state) {
OutOfBandManagementResponse response = mock(OutOfBandManagementResponse.class);
lenient().when(response.getPowerState()).thenReturn(state);
return response;
}

@Test
Expand Down Expand Up @@ -80,4 +100,60 @@ public void testHostActivityForDownHost() throws HACheckerException, StorageUnav
assertFalse(kvmHAProvider.hasActivity(host, dt));
}

@Test
public void testFenceWhenOutOfBandManagementNotEnabled() throws HAFenceException {
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(false);
assertFalse(kvmHAProvider.fence(host));
verify(outOfBandManagementService, never()).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull());
}

@Test
public void testFenceWhenHostAlreadyPoweredOff() throws HAFenceException {
OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off);
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull())).thenReturn(offStatus);
assertTrue(kvmHAProvider.fence(host));
// The host is already off, so no power-off command should be issued.
verify(outOfBandManagementService, never()).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull());
}

@Test
public void testFenceConfirmedOffAfterPowerOff() throws HAFenceException {
OutOfBandManagementResponse onStatus = powerStatusResponse(PowerState.On);
OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off);
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
// First STATUS (pre-check) returns On, second STATUS (post power-off) returns Off.
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull()))
.thenReturn(onStatus, offStatus);
assertTrue(kvmHAProvider.fence(host));
verify(outOfBandManagementService).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull());
}

@Test
public void testFencePowerOffCommandFailsButHostConfirmedOff() throws HAFenceException {
// Regression: the power-off command fails (e.g. the chassis is already off and the BMC returns
// HTTP 409), but the host is actually off. Fencing must succeed because the confirmed power state -
// not the power-off command's result - is authoritative.
OutOfBandManagementResponse onStatus = powerStatusResponse(PowerState.On);
OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off);
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull()))
.thenReturn(onStatus, offStatus);
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull()))
.thenThrow(new CloudRuntimeException("power-off failed: HTTP 409"));
assertTrue(kvmHAProvider.fence(host));
}

@Test
public void testFenceFailsWhenPowerStateCannotBeConfirmedOff() throws HAFenceException {
// The host cannot be confirmed off (e.g. an unreachable BMC). Fencing must NOT succeed, to avoid
// restarting VMs while the host may still be running (split-brain).
when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true);
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull()))
.thenThrow(new CloudRuntimeException("BMC unreachable"));
when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull()))
.thenThrow(new CloudRuntimeException("BMC unreachable"));
assertFalse(kvmHAProvider.fence(host));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,17 @@ public RedfishClient.RedfishResetCmd parsePowerCommand(OutOfBandManagement.Power
case ON:
return RedfishClient.RedfishResetCmd.On;
case OFF:
return RedfishClient.RedfishResetCmd.GracefulShutdown;
// OFF is a hard power-off (consistent with the ipmitool driver and required for HA
// fencing of an unresponsive host); SOFT performs the graceful ACPI shutdown.
return RedfishClient.RedfishResetCmd.ForceOff;
case CYCLE:
return RedfishClient.RedfishResetCmd.PowerCycle;
case RESET:
return RedfishClient.RedfishResetCmd.ForceRestart;
case SOFT:
return RedfishClient.RedfishResetCmd.GracefulShutdown;
case STATUS:
throw new IllegalStateException(String.format("%s is not a valid Redfish Reset command [%s]", operation));
throw new IllegalStateException(String.format("%s is not a valid Redfish Reset command", operation));
default:
throw new IllegalStateException(String.format("Redfish does not support operation [%s]", operation));
}
Expand Down
Loading