This is an automated email from the ASF dual-hosted git repository.
msingh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 676610e HDDS-3493. Refactor Failures in MiniOzoneChaosCluster into
pluggable model. (#874)
676610e is described below
commit 676610ef40b0f8701d950b347fb916d28268f1da
Author: Mukul Kumar Singh <[email protected]>
AuthorDate: Tue May 19 10:01:31 2020 +0530
HDDS-3493. Refactor Failures in MiniOzoneChaosCluster into pluggable model.
(#874)
---
.../apache/hadoop/ozone/MiniOzoneChaosCluster.java | 309 +++++++++------------
.../ozone/MiniOzoneDatanodeChaosCluster.java | 57 ----
.../hadoop/ozone/MiniOzoneOMChaosCluster.java | 132 ---------
.../hadoop/ozone/TestMiniChaosOzoneCluster.java | 34 ++-
.../hadoop/ozone/failure/FailureManager.java | 99 +++++++
.../org/apache/hadoop/ozone/failure/Failures.java | 147 ++++++++++
.../apache/hadoop/ozone/failure/package-info.java | 19 ++
.../src/test/resources/log4j.properties | 1 +
.../hadoop/ozone/MiniOzoneHAClusterImpl.java | 11 +-
9 files changed, 422 insertions(+), 387 deletions(-)
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
index 9357768..dc7c26c 100644
---
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
@@ -19,33 +19,37 @@
package org.apache.hadoop.ozone;
import java.util.Arrays;
-import java.util.concurrent.ExecutionException;
+import java.util.List;
+import java.util.HashSet;
+import java.util.ArrayList;
+import java.util.Set;
+import java.util.Collections;
import java.util.concurrent.TimeoutException;
import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.conf.StorageUnit;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.ozone.failure.FailureManager;
+import org.apache.hadoop.ozone.failure.Failures;
+import org.apache.hadoop.ozone.om.OMConfigKeys;
import org.apache.hadoop.ozone.om.OzoneManager;
import
org.apache.hadoop.security.authentication.client.AuthenticationException;
+import org.apache.hadoop.test.GenericTestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
-import java.util.List;
-
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
-import java.util.concurrent.Executors;
/**
* This class causes random failures in the chaos cluster.
*/
-public abstract class MiniOzoneChaosCluster extends MiniOzoneHAClusterImpl {
+public class MiniOzoneChaosCluster extends MiniOzoneHAClusterImpl {
static final Logger LOG =
LoggerFactory.getLogger(MiniOzoneChaosCluster.class);
@@ -53,21 +57,12 @@ public abstract class MiniOzoneChaosCluster extends
MiniOzoneHAClusterImpl {
private final int numDatanodes;
private final int numOzoneManagers;
- // Number of Nodes of the service (Datanode or OM) on which chaos will be
- // unleashed
- private int numNodes;
-
- private FailureService failureService;
- private long failureIntervalInMS;
+ private final FailureManager failureManager;
- private final ScheduledExecutorService executorService;
+ private final int waitForClusterToBeReadyTimeout = 120000; // 2 min
- private ScheduledFuture scheduledFuture;
-
- private enum FailureMode {
- NODES_RESTART,
- NODES_SHUTDOWN
- }
+ private final Set<OzoneManager> failedOmSet;
+ private final Set<DatanodeDetails> failedDnSet;
// The service on which chaos will be unleashed.
enum FailureService {
@@ -96,166 +91,30 @@ public abstract class MiniOzoneChaosCluster extends
MiniOzoneHAClusterImpl {
public MiniOzoneChaosCluster(OzoneConfiguration conf,
List<OzoneManager> ozoneManagers, StorageContainerManager scm,
List<HddsDatanodeService> hddsDatanodes, String omServiceID,
- FailureService service) {
+ List<Class<? extends Failures>> clazzes) {
super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID);
-
- this.executorService = Executors.newSingleThreadScheduledExecutor();
this.numDatanodes = getHddsDatanodes().size();
this.numOzoneManagers = ozoneManagers.size();
- this.failureService = service;
- LOG.info("Starting MiniOzoneChaosCluster with {} OzoneManagers and {} " +
- "Datanodes, chaos on service: {}",
- numOzoneManagers, numDatanodes, failureService);
- }
-
- protected int getNumNodes() {
- return numNodes;
- }
-
- protected void setNumNodes(int numOfNodes) {
- this.numNodes = numOfNodes;
- }
-
- protected long getFailureIntervalInMS() {
- return failureIntervalInMS;
- }
-
- /**
- * Is the cluster ready for chaos.
- */
- protected boolean isClusterReady() {
- return true;
- }
-
- protected void getClusterReady() {
- // Do nothing
- }
-
- // Get the number of nodes to fail in the cluster.
- protected int getNumberOfNodesToFail() {
- return RandomUtils.nextBoolean() ? 1 : 2;
- }
-
- // Should the failed node wait for SCM to register even before
- // restart, i.e fast restart or not.
- protected boolean isFastRestart() {
- return RandomUtils.nextBoolean();
- }
-
- // Should the selected node be stopped or started.
- protected boolean shouldStop() {
- return RandomUtils.nextBoolean();
- }
-
- // Get the node index of the node to fail.
- private int getNodeToFail() {
- return RandomUtils.nextInt() % numNodes;
- }
-
- protected abstract void restartNode(int failedNodeIndex,
- boolean waitForNodeRestart)
- throws TimeoutException, InterruptedException, IOException;
-
- protected abstract void shutdownNode(int failedNodeIndex)
- throws ExecutionException, InterruptedException;
-
- protected abstract String getFailedNodeID(int failedNodeIndex);
-
- private void restartNodes() {
- final int numNodesToFail = getNumberOfNodesToFail();
- LOG.info("Will restart {} nodes to simulate failure", numNodesToFail);
- for (int i = 0; i < numNodesToFail; i++) {
- boolean failureMode = isFastRestart();
- int failedNodeIndex = getNodeToFail();
- String failString = failureMode ? "Fast" : "Slow";
- String failedNodeID = getFailedNodeID(failedNodeIndex);
- try {
- LOG.info("{} Restarting {}: {}", failString, failureService,
- failedNodeID);
- restartNode(failedNodeIndex, failureMode);
- LOG.info("{} Completed restarting {}: {}", failString, failureService,
- failedNodeID);
- } catch (Exception e) {
- LOG.error("Failed to restartNodes {}: {}", failedNodeID,
- failureService, e);
- }
- }
- }
-
- private void shutdownNodes() {
- final int numNodesToFail = getNumberOfNodesToFail();
- LOG.info("Will shutdown {} nodes to simulate failure", numNodesToFail);
- for (int i = 0; i < numNodesToFail; i++) {
- boolean shouldStop = shouldStop();
- int failedNodeIndex = getNodeToFail();
- String stopString = shouldStop ? "Stopping" : "Restarting";
- String failedNodeID = getFailedNodeID(failedNodeIndex);
- try {
- LOG.info("{} {} {}", stopString, failureService, failedNodeID);
- if (shouldStop) {
- shutdownNode(failedNodeIndex);
- } else {
- restartNode(failedNodeIndex, false);
- }
- LOG.info("Completed {} {} {}", stopString, failureService,
- failedNodeID);
- } catch (Exception e) {
- LOG.error("Failed {} {} {}", stopString, failureService,
- failedNodeID, e);
- }
- }
- }
- private FailureMode getFailureMode() {
- return FailureMode.
- values()[RandomUtils.nextInt() % FailureMode.values().length];
- }
+ this.failedOmSet = new HashSet<>();
+ this.failedDnSet = new HashSet<>();
- // Fail nodes randomly at configured timeout period.
- private void fail() {
- if (isClusterReady()) {
- FailureMode mode = getFailureMode();
- switch (mode) {
- case NODES_RESTART:
- restartNodes();
- break;
- case NODES_SHUTDOWN:
- shutdownNodes();
- break;
-
- default:
- LOG.error("invalid failure mode:{}", mode);
- break;
- }
- } else {
- // Cluster is not ready for failure yet. Skip failing this time and get
- // the cluster ready by restarting any OM that is not running.
- LOG.info("Cluster is not ready for failure.");
- getClusterReady();
- }
+ this.failureManager = new FailureManager(this, conf, clazzes);
+ LOG.info("Starting MiniOzoneChaosCluster with {} OzoneManagers and {} " +
+ "Datanodes", numOzoneManagers, numDatanodes);
+ clazzes.forEach(c -> LOG.info("added failure:{}", c.getSimpleName()));
}
void startChaos(long initialDelay, long period, TimeUnit timeUnit) {
LOG.info("Starting Chaos with failure period:{} unit:{} numDataNodes:{} " +
"numOzoneManagers:{}", period, timeUnit, numDatanodes,
numOzoneManagers);
- this.failureIntervalInMS = TimeUnit.MILLISECONDS.convert(period, timeUnit);
- scheduledFuture = executorService.scheduleAtFixedRate(this::fail,
- initialDelay, period, timeUnit);
- }
-
- void stopChaos() throws Exception {
- if (scheduledFuture != null) {
- scheduledFuture.cancel(false);
- scheduledFuture.get();
- }
+ failureManager.start(initialDelay, period, timeUnit);
}
public void shutdown() {
try {
- stopChaos();
- executorService.shutdown();
- executorService.awaitTermination(1, TimeUnit.DAYS);
+ failureManager.stop();
//this should be called after stopChaos to be sure that the
//datanode collection is not modified during the shutdown
super.shutdown();
@@ -265,11 +124,30 @@ public abstract class MiniOzoneChaosCluster extends
MiniOzoneHAClusterImpl {
}
/**
+ * Wait until the cluster is ready: first the checks of the parent cluster,
+ * then until every OzoneManager reports that it is running again. The OM
+ * wait is bounded by waitForClusterToBeReadyTimeout.
+ */
+ @Override
+ public void waitForClusterToBeReady()
+ throws TimeoutException, InterruptedException {
+ super.waitForClusterToBeReady();
+ GenericTestUtils.waitFor(() -> {
+ for (OzoneManager om : getOzoneManagersList()) {
+ if (!om.isRunning()) {
+ return false;
+ }
+ }
+ return true;
+ }, 1000, waitForClusterToBeReadyTimeout);
+ }
+
+ /**
* Builder for configuring the MiniOzoneChaosCluster to run.
*/
public static class Builder extends MiniOzoneHAClusterImpl.Builder {
- private FailureService failureService;
+ private final List<Class<? extends Failures>> clazzes = new ArrayList<>();
/**
* Creates a new Builder.
@@ -311,8 +189,8 @@ public abstract class MiniOzoneChaosCluster extends
MiniOzoneHAClusterImpl {
return this;
}
- public Builder setFailureService(String serviceName) {
- this.failureService = FailureService.of(serviceName);
+ public Builder addFailures(Class<? extends Failures> clazz) {
+ this.clazzes.add(clazz);
return this;
}
@@ -358,6 +236,10 @@ public abstract class MiniOzoneChaosCluster extends
MiniOzoneHAClusterImpl {
conf.setInt("hdds.scm.replication.event.timeout", 20 * 1000);
conf.setInt(OzoneConfigKeys.DFS_RATIS_SNAPSHOT_THRESHOLD_KEY, 100);
conf.setInt(OzoneConfigKeys.DFS_CONTAINER_RATIS_LOG_PURGE_GAP, 100);
+
+ conf.setInt(OMConfigKeys.
+ OZONE_OM_RATIS_SNAPSHOT_AUTO_TRIGGER_THRESHOLD_KEY, 100);
+ conf.setInt(OMConfigKeys.OZONE_OM_RATIS_LOG_PURGE_GAP, 100);
}
/**
@@ -375,12 +257,6 @@ public abstract class MiniOzoneChaosCluster extends
MiniOzoneHAClusterImpl {
@Override
public MiniOzoneChaosCluster build() throws IOException {
- if (failureService == FailureService.OZONE_MANAGER && numOfOMs < 3) {
- throw new IllegalArgumentException("Not enough number of " +
- "OzoneManagers to test chaos on OzoneManagers. Set number of " +
- "OzoneManagers to at least 3");
- }
-
DefaultMetricsSystem.setMiniClusterMode(true);
initializeConfiguration();
if (numOfOMs > 1) {
@@ -406,14 +282,9 @@ public abstract class MiniOzoneChaosCluster extends
MiniOzoneHAClusterImpl {
final List<HddsDatanodeService> hddsDatanodes = createHddsDatanodes(
scm, null);
- MiniOzoneChaosCluster cluster;
- if (failureService == FailureService.DATANODE) {
- cluster = new MiniOzoneDatanodeChaosCluster(conf, omList, scm,
- hddsDatanodes, omServiceId);
- } else {
- cluster = new MiniOzoneOMChaosCluster(conf, omList, scm,
- hddsDatanodes, omServiceId);
- }
+ MiniOzoneChaosCluster cluster =
+ new MiniOzoneChaosCluster(conf, omList, scm, hddsDatanodes,
+ omServiceId, clazzes);
if (startDataNodes) {
cluster.startHddsDatanodes();
@@ -421,4 +292,78 @@ public abstract class MiniOzoneChaosCluster extends
MiniOzoneHAClusterImpl {
return cluster;
}
}
+
+ // OzoneManager specific
+ public static int getNumberOfOmToFail() {
+ return 1;
+ }
+
+ public Set<OzoneManager> omToFail() {
+ int numNodesToFail = getNumberOfOmToFail();
+ if (failedOmSet.size() >= numOzoneManagers/2) {
+ return Collections.emptySet();
+ }
+
+ int numOms = getOzoneManagersList().size();
+ Set<OzoneManager> oms = new HashSet<>();
+ for (int i = 0; i < numNodesToFail; i++) {
+ int failedNodeIndex = FailureManager.getBoundedRandomIndex(numOms);
+ oms.add(getOzoneManager(failedNodeIndex));
+ }
+ return oms;
+ }
+
+ public void shutdownOzoneManager(OzoneManager om) {
+ super.shutdownOzoneManager(om);
+ failedOmSet.add(om);
+ }
+
+ public void restartOzoneManager(OzoneManager om, boolean waitForOM)
+ throws IOException, TimeoutException, InterruptedException {
+ super.restartOzoneManager(om, waitForOM);
+ failedOmSet.remove(om);
+ }
+
+ // Should the selected node be stopped or started.
+ public boolean shouldStop() {
+ if (failedOmSet.size() >= numOzoneManagers/2) {
+ return false;
+ }
+ return RandomUtils.nextBoolean();
+ }
+
+ // Datanode specific
+ private int getNumberOfDnToFail() {
+ return RandomUtils.nextBoolean() ? 1 : 2;
+ }
+
+ public Set<DatanodeDetails> dnToFail() {
+ int numNodesToFail = getNumberOfDnToFail();
+ int numDns = getHddsDatanodes().size();
+ Set<DatanodeDetails> dns = new HashSet<>();
+ for (int i = 0; i < numNodesToFail; i++) {
+ int failedNodeIndex = FailureManager.getBoundedRandomIndex(numDns);
+ dns.add(getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails());
+ }
+ return dns;
+ }
+
+ @Override
+ public void restartHddsDatanode(DatanodeDetails dn, boolean waitForDatanode)
+ throws InterruptedException, TimeoutException, IOException {
+ failedDnSet.add(dn);
+ super.restartHddsDatanode(dn, waitForDatanode);
+ failedDnSet.remove(dn);
+ }
+
+ @Override
+ public void shutdownHddsDatanode(DatanodeDetails dn) throws IOException {
+ failedDnSet.add(dn);
+ super.shutdownHddsDatanode(dn);
+ }
+
+ // Should the selected node be stopped or started.
+ public boolean shouldStop(DatanodeDetails dn) {
+ return !failedDnSet.contains(dn);
+ }
}
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java
deleted file mode 100644
index f402831..0000000
---
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.ozone;
-
-import java.util.List;
-import java.util.concurrent.TimeoutException;
-import org.apache.hadoop.hdds.conf.OzoneConfiguration;
-import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
-import org.apache.hadoop.ozone.om.OzoneManager;
-
-/**
- * This class causes random failures in Datanodes in the chaos cluster.
- */
-public class MiniOzoneDatanodeChaosCluster extends MiniOzoneChaosCluster {
-
- public MiniOzoneDatanodeChaosCluster(OzoneConfiguration conf,
- List<OzoneManager> ozoneManagers,
- StorageContainerManager scm,
- List<HddsDatanodeService> hddsDatanodes,
- String omServiceID) {
- super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID,
- FailureService.DATANODE);
- setNumNodes(hddsDatanodes.size());
- }
-
- @Override
- protected void restartNode(int failedNodeIndex, boolean waitForNodeRestart)
- throws TimeoutException, InterruptedException {
- restartHddsDatanode(failedNodeIndex, waitForNodeRestart);
- }
-
- @Override
- protected void shutdownNode(int failedNodeIndex) {
- shutdownHddsDatanode(failedNodeIndex);
- }
-
- @Override
- protected String getFailedNodeID(int failedNodeIndex) {
- return getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails()
- .getUuidString();
- }
-}
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java
deleted file mode 100644
index 2b2a4d7..0000000
---
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java
+++ /dev/null
@@ -1,132 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.ozone;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
-import java.util.concurrent.atomic.AtomicBoolean;
-
-import org.apache.commons.lang3.RandomUtils;
-import org.apache.hadoop.hdds.conf.OzoneConfiguration;
-import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
-import org.apache.hadoop.ozone.om.OzoneManager;
-
-/**
- * This class causes random failures in OMs in the chaos cluster.
- */
-public class MiniOzoneOMChaosCluster extends MiniOzoneChaosCluster {
-
- // Cluster is deemed ready for chaos when all the OMs are up and running.
- private AtomicBoolean isClusterReady = new AtomicBoolean(true);
-
- // The maximum number of nodes failures which can be tolerated without
- // losing quorum. This should be equal to (Num of OMs - 1)/2.
- private int numOfOMNodeFailuresTolerated;
-
- MiniOzoneOMChaosCluster(OzoneConfiguration conf,
- List<OzoneManager> ozoneManagers,
- StorageContainerManager scm,
- List<HddsDatanodeService> hddsDatanodes,
- String omServiceID) {
- super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID,
- FailureService.OZONE_MANAGER);
- setNumNodes(ozoneManagers.size());
- numOfOMNodeFailuresTolerated = (getNumNodes() - 1) / 2;
- }
-
- /**
- * Check if cluster is ready for a restart or shutdown of an OM node. If
- * yes, then set isClusterReady to false so that another thread cannot
- * restart/ shutdown OM till all OMs are up again.
- */
- protected boolean isClusterReady() {
- return isClusterReady.compareAndSet(true, false);
- }
-
- /**
- * If any OM node is not running, restart it.
- */
- @Override
- protected void getClusterReady() {
- boolean clusterReady = true;
- for (OzoneManager om : getOzoneManagersList()) {
- if (!om.isRunning()) {
- try {
- restartOzoneManager(om, true);
- } catch (Exception e) {
- clusterReady = false;
- LOG.error("Cluster not ready for chaos. Failed to restart OM {}: {}",
- om.getOMNodeId(), e);
- }
- }
- }
- if (clusterReady) {
- isClusterReady.set(true);
- }
- }
-
- @Override
- protected int getNumberOfNodesToFail() {
- return RandomUtils.nextInt(1, numOfOMNodeFailuresTolerated + 1);
- }
-
- @Override
- protected void restartNode(int failedNodeIndex, boolean waitForNodeRestart)
- throws IOException, TimeoutException, InterruptedException {
- shutdownOzoneManager(failedNodeIndex);
- restartOzoneManager(failedNodeIndex, waitForNodeRestart);
- getClusterReady();
- }
-
- /**
- * For OM chaos, a shutdown node should eventually be restarted before the
- * next failure.
- */
- @Override
- protected void shutdownNode(int failedNodeIndex)
- throws ExecutionException, InterruptedException {
- shutdownOzoneManager(failedNodeIndex);
-
- // Restart the OM after FailureInterval / 2 duration.
- Executors.newSingleThreadScheduledExecutor().schedule(
- this::getClusterReady, getFailureIntervalInMS() / 2,
- TimeUnit.MILLISECONDS).get();
- }
-
- @Override
- protected String getFailedNodeID(int failedNodeIndex) {
- return getOzoneManager(failedNodeIndex).getOMNodeId();
- }
-
- /**
- * When restarting OM, always wait for it to catch up with Leader OM.
- */
- @Override
- protected boolean isFastRestart() {
- return true;
- }
-
- @Override
- protected boolean shouldStop() {
- return true;
- }
-}
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
index ea7fe69..9ad2a16 100644
---
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
@@ -23,6 +23,7 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.ozone.client.ObjectStore;
import org.apache.hadoop.ozone.client.OzoneVolume;
import org.apache.hadoop.ozone.MiniOzoneChaosCluster.FailureService;
+import org.apache.hadoop.ozone.failure.Failures;
import org.apache.hadoop.ozone.loadgenerators.RandomLoadGenerator;
import org.apache.hadoop.ozone.loadgenerators.ReadOnlyLoadGenerator;
import org.apache.hadoop.ozone.loadgenerators.FilesystemLoadGenerator;
@@ -92,17 +93,36 @@ public class TestMiniChaosOzoneCluster extends GenericCli {
@BeforeClass
public static void init() throws Exception {
OzoneConfiguration configuration = new OzoneConfiguration();
- String omServiceID =
- FailureService.of(failureService) == FailureService.OZONE_MANAGER ?
- OM_SERVICE_ID : null;
+ FailureService service = FailureService.of(failureService);
+ String omServiceID;
+
+ MiniOzoneChaosCluster.Builder builder =
+ new MiniOzoneChaosCluster.Builder(configuration);
+
+ switch (service) {
+ case DATANODE:
+ omServiceID = null;
+ builder
+ .addFailures(Failures.DatanodeRestartFailure.class)
+ .addFailures(Failures.DatanodeStartStopFailure.class);
+ break;
+ case OZONE_MANAGER:
+ omServiceID = OM_SERVICE_ID;
+ builder
+ .addFailures(Failures.OzoneManagerStartStopFailure.class)
+ .addFailures(Failures.OzoneManagerRestartFailure.class);
+ break;
+ default:
+ throw new IllegalArgumentException();
+ }
- cluster = new MiniOzoneChaosCluster.Builder(configuration)
+ builder
.setNumDatanodes(numDatanodes)
.setNumOzoneManagers(numOzoneManagers)
- .setFailureService(failureService)
.setOMServiceID(omServiceID)
- .setNumDataVolumes(numDataVolumes)
- .build();
+ .setNumDataVolumes(numDataVolumes);
+
+ cluster = builder.build();
cluster.waitForClusterToBeReady();
String volumeName = RandomStringUtils.randomAlphabetic(10).toLowerCase();
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/FailureManager.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/FailureManager.java
new file mode 100644
index 0000000..15aa7f0
--- /dev/null
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/FailureManager.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.failure;
+
+import org.apache.commons.lang3.RandomUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.ozone.MiniOzoneChaosCluster;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Manages all the failures in the MiniOzoneChaosCluster.
+ */
+public class FailureManager {
+
+ static final Logger LOG =
+ LoggerFactory.getLogger(Failures.class);
+
+ private final MiniOzoneChaosCluster cluster;
+ private final List<Failures> failures;
+ private ScheduledFuture scheduledFuture;
+ private final ScheduledExecutorService executorService;
+ public FailureManager(MiniOzoneChaosCluster cluster,
+ Configuration conf,
+ List<Class<? extends Failures>> clazzes) {
+ this.cluster = cluster;
+ this.executorService = Executors.newSingleThreadScheduledExecutor();
+
+ failures = new ArrayList<>();
+ for (Class<? extends Failures> clazz : clazzes) {
+ Failures f = ReflectionUtils.newInstance(clazz, conf);
+ f.validateFailure(cluster);
+ failures.add(f);
+ }
+
+ }
+
+ // Fail nodes randomly at configured timeout period.
+ private void fail() {
+ Failures f = failures.get(getBoundedRandomIndex(failures.size()));
+ try {
+ LOG.info("time failure with {}", f.getName());
+ f.fail(cluster);
+ } catch (Throwable t) {
+ LOG.info("Caught exception while inducing failure:{}", f.getName(), t);
+ System.exit(-2);
+ }
+
+ }
+
+ public void start(long initialDelay, long period, TimeUnit timeUnit) {
+ LOG.info("starting failure manager {} {} {}", initialDelay,
+ period, timeUnit);
+ scheduledFuture = executorService.scheduleAtFixedRate(this::fail,
+ initialDelay, period, timeUnit);
+ }
+
+ public void stop() throws Exception {
+ if (scheduledFuture != null) {
+ scheduledFuture.cancel(false);
+ scheduledFuture.get();
+ }
+
+ executorService.shutdown();
+ executorService.awaitTermination(1, TimeUnit.MINUTES);
+ }
+
+ public static boolean isFastRestart() {
+ return RandomUtils.nextBoolean();
+ }
+
+ public static int getBoundedRandomIndex(int size) {
+ return RandomUtils.nextInt(0, size);
+ }
+}
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/Failures.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/Failures.java
new file mode 100644
index 0000000..6d226ca
--- /dev/null
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/Failures.java
@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.failure;
+
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.ozone.MiniOzoneChaosCluster;
+import org.apache.hadoop.ozone.om.OzoneManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Set;
+
+/**
+ * Implementation of all the failures.
+ */
+public abstract class Failures {
+ static final Logger LOG =
+ LoggerFactory.getLogger(Failures.class);
+
+ public String getName() {
+ return this.getClass().getSimpleName();
+ }
+
+ public abstract void fail(MiniOzoneChaosCluster cluster);
+
+ public abstract void validateFailure(MiniOzoneChaosCluster cluster);
+
+ /**
+ * Ozone Manager failures.
+ */
+ public abstract static class OzoneFailures extends Failures {
+ @Override
+ public void validateFailure(MiniOzoneChaosCluster cluster) {
+ if (cluster.getOzoneManagersList().size() < 3) {
+ throw new IllegalArgumentException("Not enough number of " +
+ "OzoneManagers to test chaos on OzoneManagers. Set number of " +
+ "OzoneManagers to at least 3");
+ }
+ }
+ }
+
+ /**
+ * Restart Ozone Manager to induce failure.
+ */
+ public static class OzoneManagerRestartFailure extends OzoneFailures {
+ public void fail(MiniOzoneChaosCluster cluster) {
+ boolean failureMode = FailureManager.isFastRestart();
+ Set<OzoneManager> oms = cluster.omToFail();
+ oms.parallelStream().forEach(om -> {
+ try {
+ cluster.shutdownOzoneManager(om);
+ cluster.restartOzoneManager(om, failureMode);
+ cluster.waitForClusterToBeReady();
+ } catch (Throwable t) {
+ LOG.error("Failed to restartNodes OM {}", om, t);
+ }
+ });
+ }
+ }
+
+ /**
+ * Start/Stop Ozone Manager to induce failure.
+ */
+ public static class OzoneManagerStartStopFailure extends OzoneFailures {
+ public void fail(MiniOzoneChaosCluster cluster) {
+ // Decide whether the selected OzoneManagers are stopped or restarted.
+ boolean shouldStop = cluster.shouldStop();
+ Set<OzoneManager> oms = cluster.omToFail();
+ oms.parallelStream().forEach(om -> {
+ try {
+ if (shouldStop) {
+ // stop the selected OM; a later failure round may restart it.
+ cluster.shutdownOzoneManager(om);
+ } else {
+ cluster.restartOzoneManager(om, true);
+ }
+ } catch (Throwable t) {
+ LOG.error("Failed to shutdown OM {}", om, t);
+ }
+ });
+ }
+ }
+
+ /**
+ * Datanode failures.
+ */
+ public abstract static class DatanodeFailures extends Failures {
+ @Override
+ public void validateFailure(MiniOzoneChaosCluster cluster) {
+ // Nothing to do here.
+ }
+ }
+
+ /**
+ * Restart Datanodes to induce failure.
+ */
+ public static class DatanodeRestartFailure extends DatanodeFailures {
+ public void fail(MiniOzoneChaosCluster cluster) {
+ boolean failureMode = FailureManager.isFastRestart();
+ Set<DatanodeDetails> dns = cluster.dnToFail();
+ dns.parallelStream().forEach(dn -> {
+ try {
+ cluster.restartHddsDatanode(dn, failureMode);
+ } catch (Throwable t) {
+ LOG.error("Failed to restartNodes Datanode {}", dn.getUuid(), t);
+ }
+ });
+ }
+ }
+
+ /**
+ * Start/Stop Datanodes to induce failure.
+ */
+ public static class DatanodeStartStopFailure extends DatanodeFailures {
+ public void fail(MiniOzoneChaosCluster cluster) {
+ // Get the set of datanodes to fail in the cluster.
+ Set<DatanodeDetails> dns = cluster.dnToFail();
+ dns.parallelStream().forEach(dn -> {
+ try {
+ if (cluster.shouldStop(dn)) {
+ cluster.shutdownHddsDatanode(dn);
+ } else {
+ cluster.restartHddsDatanode(dn, true);
+ }
+ } catch (Throwable t) {
+ LOG.error("Failed to shutdown Datanode {}", dn.getUuid(), t);
+ }
+ });
+ }
+ }
+}
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/package-info.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/package-info.java
new file mode 100644
index 0000000..e93958a
--- /dev/null
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/package-info.java
@@ -0,0 +1,19 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.failure;
\ No newline at end of file
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/resources/log4j.properties
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/resources/log4j.properties
index c3a1cec..aabb0b1 100644
---
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/resources/log4j.properties
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/resources/log4j.properties
@@ -25,6 +25,7 @@
log4j.logger.org.apache.ratis.grpc.client.GrpcClientProtocolClient=WARN
log4j.logger.org.apache.hadoop.ozone.utils=DEBUG,stdout,CHAOS
log4j.logger.org.apache.hadoop.ozone.loadgenerators=DEBUG,stdout,CHAOS
+log4j.logger.org.apache.hadoop.ozone.failure=INFO, CHAOS
log4j.appender.CHAOS.File=${chaoslogfilename}
log4j.appender.CHAOS=org.apache.log4j.FileAppender
log4j.appender.CHAOS.layout=org.apache.log4j.PatternLayout
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
index 4548717..6953594 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
@@ -206,20 +206,13 @@ public class MiniOzoneHAClusterImpl extends
MiniOzoneClusterImpl {
}
}
- void shutdownOzoneManager(int omNodeIndex) {
- OzoneManager ozoneManager = ozoneManagers.get(omNodeIndex);
+ public void shutdownOzoneManager(OzoneManager ozoneManager) {
LOG.info("Shutting down OzoneManager " + ozoneManager.getOMNodeId());
ozoneManager.stop();
}
- void restartOzoneManager(int omNodeIndex, boolean waitForOM)
- throws IOException, TimeoutException, InterruptedException {
- OzoneManager ozoneManager = ozoneManagers.get(omNodeIndex);
- restartOzoneManager(ozoneManager, waitForOM);
- }
-
- void restartOzoneManager(OzoneManager ozoneManager, boolean waitForOM)
+ public void restartOzoneManager(OzoneManager ozoneManager, boolean waitForOM)
throws IOException, TimeoutException, InterruptedException {
LOG.info("Restarting OzoneManager " + ozoneManager.getOMNodeId());
ozoneManager.restart();
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]