This is an automated email from the ASF dual-hosted git repository.

msingh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 676610e  HDDS-3493. Refactor Failures in MiniOzoneChaosCluster into 
pluggable model. (#874)
676610e is described below

commit 676610ef40b0f8701d950b347fb916d28268f1da
Author: Mukul Kumar Singh <[email protected]>
AuthorDate: Tue May 19 10:01:31 2020 +0530

    HDDS-3493. Refactor Failures in MiniOzoneChaosCluster into pluggable model. 
(#874)
---
 .../apache/hadoop/ozone/MiniOzoneChaosCluster.java | 309 +++++++++------------
 .../ozone/MiniOzoneDatanodeChaosCluster.java       |  57 ----
 .../hadoop/ozone/MiniOzoneOMChaosCluster.java      | 132 ---------
 .../hadoop/ozone/TestMiniChaosOzoneCluster.java    |  34 ++-
 .../hadoop/ozone/failure/FailureManager.java       |  99 +++++++
 .../org/apache/hadoop/ozone/failure/Failures.java  | 147 ++++++++++
 .../apache/hadoop/ozone/failure/package-info.java  |  19 ++
 .../src/test/resources/log4j.properties            |   1 +
 .../hadoop/ozone/MiniOzoneHAClusterImpl.java       |  11 +-
 9 files changed, 422 insertions(+), 387 deletions(-)

diff --git 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
index 9357768..dc7c26c 100644
--- 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
+++ 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
@@ -19,33 +19,37 @@
 package org.apache.hadoop.ozone;
 
 import java.util.Arrays;
-import java.util.concurrent.ExecutionException;
+import java.util.List;
+import java.util.HashSet;
+import java.util.ArrayList;
+import java.util.Set;
+import java.util.Collections;
 import java.util.concurrent.TimeoutException;
 
 import org.apache.commons.lang3.RandomUtils;
 import org.apache.hadoop.conf.StorageUnit;
 import org.apache.hadoop.hdds.HddsConfigKeys;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
 import org.apache.hadoop.hdds.scm.ScmConfigKeys;
 import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.ozone.failure.FailureManager;
+import org.apache.hadoop.ozone.failure.Failures;
+import org.apache.hadoop.ozone.om.OMConfigKeys;
 import org.apache.hadoop.ozone.om.OzoneManager;
 import 
org.apache.hadoop.security.authentication.client.AuthenticationException;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
-import java.util.List;
-
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.ScheduledFuture;
 import java.util.concurrent.TimeUnit;
-import java.util.concurrent.Executors;
 
 /**
  * This class causes random failures in the chaos cluster.
  */
-public abstract class MiniOzoneChaosCluster extends MiniOzoneHAClusterImpl {
+public class MiniOzoneChaosCluster extends MiniOzoneHAClusterImpl {
 
   static final Logger LOG =
       LoggerFactory.getLogger(MiniOzoneChaosCluster.class);
@@ -53,21 +57,12 @@ public abstract class MiniOzoneChaosCluster extends 
MiniOzoneHAClusterImpl {
   private final int numDatanodes;
   private final int numOzoneManagers;
 
-  // Number of Nodes of the service (Datanode or OM) on which chaos will be
-  // unleashed
-  private int numNodes;
-
-  private FailureService failureService;
-  private long failureIntervalInMS;
+  private final FailureManager failureManager;
 
-  private final ScheduledExecutorService executorService;
+  private final int waitForClusterToBeReadyTimeout = 120000; // 2 min
 
-  private ScheduledFuture scheduledFuture;
-
-  private enum FailureMode {
-    NODES_RESTART,
-    NODES_SHUTDOWN
-  }
+  private final Set<OzoneManager> failedOmSet;
+  private final Set<DatanodeDetails> failedDnSet;
 
   // The service on which chaos will be unleashed.
   enum FailureService {
@@ -96,166 +91,30 @@ public abstract class MiniOzoneChaosCluster extends 
MiniOzoneHAClusterImpl {
   public MiniOzoneChaosCluster(OzoneConfiguration conf,
       List<OzoneManager> ozoneManagers, StorageContainerManager scm,
       List<HddsDatanodeService> hddsDatanodes, String omServiceID,
-      FailureService service) {
+      List<Class<? extends Failures>> clazzes) {
     super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID);
-
-    this.executorService =  Executors.newSingleThreadScheduledExecutor();
     this.numDatanodes = getHddsDatanodes().size();
     this.numOzoneManagers = ozoneManagers.size();
-    this.failureService = service;
-    LOG.info("Starting MiniOzoneChaosCluster with {} OzoneManagers and {} " +
-        "Datanodes, chaos on service: {}",
-        numOzoneManagers, numDatanodes, failureService);
-  }
-
-  protected int getNumNodes() {
-    return numNodes;
-  }
-
-  protected void setNumNodes(int numOfNodes) {
-    this.numNodes = numOfNodes;
-  }
-
-  protected long getFailureIntervalInMS() {
-    return failureIntervalInMS;
-  }
-
-  /**
-   * Is the cluster ready for chaos.
-   */
-  protected boolean isClusterReady() {
-    return true;
-  }
-
-  protected void getClusterReady() {
-    // Do nothing
-  }
-
-  // Get the number of nodes to fail in the cluster.
-  protected int getNumberOfNodesToFail() {
-    return RandomUtils.nextBoolean() ? 1 : 2;
-  }
-
-  // Should the failed node wait for SCM to register even before
-  // restart, i.e fast restart or not.
-  protected boolean isFastRestart() {
-    return RandomUtils.nextBoolean();
-  }
-
-  // Should the selected node be stopped or started.
-  protected boolean shouldStop() {
-    return RandomUtils.nextBoolean();
-  }
-
-  // Get the node index of the node to fail.
-  private int getNodeToFail() {
-    return RandomUtils.nextInt() % numNodes;
-  }
-
-  protected abstract void restartNode(int failedNodeIndex,
-      boolean waitForNodeRestart)
-      throws TimeoutException, InterruptedException, IOException;
-
-  protected abstract void shutdownNode(int failedNodeIndex)
-      throws ExecutionException, InterruptedException;
-
-  protected abstract String getFailedNodeID(int failedNodeIndex);
-
-  private void restartNodes() {
-    final int numNodesToFail = getNumberOfNodesToFail();
-    LOG.info("Will restart {} nodes to simulate failure", numNodesToFail);
-    for (int i = 0; i < numNodesToFail; i++) {
-      boolean failureMode = isFastRestart();
-      int failedNodeIndex = getNodeToFail();
-      String failString = failureMode ? "Fast" : "Slow";
-      String failedNodeID = getFailedNodeID(failedNodeIndex);
-      try {
-        LOG.info("{} Restarting {}: {}", failString, failureService,
-            failedNodeID);
-        restartNode(failedNodeIndex, failureMode);
-        LOG.info("{} Completed restarting {}: {}", failString, failureService,
-            failedNodeID);
-      } catch (Exception e) {
-        LOG.error("Failed to restartNodes {}: {}", failedNodeID,
-            failureService, e);
-      }
-    }
-  }
-
-  private void shutdownNodes() {
-    final int numNodesToFail = getNumberOfNodesToFail();
-    LOG.info("Will shutdown {} nodes to simulate failure", numNodesToFail);
-    for (int i = 0; i < numNodesToFail; i++) {
-      boolean shouldStop = shouldStop();
-      int failedNodeIndex = getNodeToFail();
-      String stopString = shouldStop ? "Stopping" : "Restarting";
-      String failedNodeID = getFailedNodeID(failedNodeIndex);
-      try {
-        LOG.info("{} {} {}", stopString, failureService, failedNodeID);
-        if (shouldStop) {
-          shutdownNode(failedNodeIndex);
-        } else {
-          restartNode(failedNodeIndex, false);
-        }
-        LOG.info("Completed {} {} {}", stopString, failureService,
-            failedNodeID);
-      } catch (Exception e) {
-        LOG.error("Failed {} {} {}", stopString, failureService,
-            failedNodeID, e);
-      }
-    }
-  }
 
-  private FailureMode getFailureMode() {
-    return FailureMode.
-        values()[RandomUtils.nextInt() % FailureMode.values().length];
-  }
+    this.failedOmSet = new HashSet<>();
+    this.failedDnSet = new HashSet<>();
 
-  // Fail nodes randomly at configured timeout period.
-  private void fail() {
-    if (isClusterReady()) {
-      FailureMode mode = getFailureMode();
-      switch (mode) {
-      case NODES_RESTART:
-        restartNodes();
-        break;
-      case NODES_SHUTDOWN:
-        shutdownNodes();
-        break;
-
-      default:
-        LOG.error("invalid failure mode:{}", mode);
-        break;
-      }
-    } else {
-      // Cluster is not ready for failure yet. Skip failing this time and get
-      // the cluster ready by restarting any OM that is not running.
-      LOG.info("Cluster is not ready for failure.");
-      getClusterReady();
-    }
+    this.failureManager = new FailureManager(this, conf, clazzes);
+    LOG.info("Starting MiniOzoneChaosCluster with {} OzoneManagers and {} " +
+        "Datanodes", numOzoneManagers, numDatanodes);
+    clazzes.forEach(c -> LOG.info("added failure:{}", c.getSimpleName()));
   }
 
   void startChaos(long initialDelay, long period, TimeUnit timeUnit) {
     LOG.info("Starting Chaos with failure period:{} unit:{} numDataNodes:{} " +
             "numOzoneManagers:{}", period, timeUnit, numDatanodes,
         numOzoneManagers);
-    this.failureIntervalInMS = TimeUnit.MILLISECONDS.convert(period, timeUnit);
-    scheduledFuture = executorService.scheduleAtFixedRate(this::fail,
-        initialDelay, period, timeUnit);
-  }
-
-  void stopChaos() throws Exception {
-    if (scheduledFuture != null) {
-      scheduledFuture.cancel(false);
-      scheduledFuture.get();
-    }
+    failureManager.start(initialDelay, period, timeUnit);
   }
 
   public void shutdown() {
     try {
-      stopChaos();
-      executorService.shutdown();
-      executorService.awaitTermination(1, TimeUnit.DAYS);
+      failureManager.stop();
       //this should be called after stopChaos to be sure that the
       //datanode collection is not modified during the shutdown
       super.shutdown();
@@ -265,11 +124,30 @@ public abstract class MiniOzoneChaosCluster extends 
MiniOzoneHAClusterImpl {
   }
 
   /**
+   * Waits for the base cluster to become ready and then polls once per
+   * second, for at most waitForClusterToBeReadyTimeout milliseconds, until
+   * every OzoneManager in the cluster reports that it is running.
+   */
+  @Override
+  public void waitForClusterToBeReady()
+      throws TimeoutException, InterruptedException {
+    super.waitForClusterToBeReady();
+    GenericTestUtils.waitFor(() -> {
+      for (OzoneManager om : getOzoneManagersList()) {
+        if (!om.isRunning()) {
+          return false;
+        }
+      }
+      return true;
+    }, 1000, waitForClusterToBeReadyTimeout);
+  }
+
+  /**
    * Builder for configuring the MiniOzoneChaosCluster to run.
    */
   public static class Builder extends MiniOzoneHAClusterImpl.Builder {
 
-    private FailureService failureService;
+    private final List<Class<? extends Failures>> clazzes = new ArrayList<>();
 
     /**
      * Creates a new Builder.
@@ -311,8 +189,8 @@ public abstract class MiniOzoneChaosCluster extends 
MiniOzoneHAClusterImpl {
       return this;
     }
 
-    public Builder setFailureService(String serviceName) {
-      this.failureService = FailureService.of(serviceName);
+    public Builder addFailures(Class<? extends Failures> clazz) {
+      this.clazzes.add(clazz);
       return this;
     }
 
@@ -358,6 +236,10 @@ public abstract class MiniOzoneChaosCluster extends 
MiniOzoneHAClusterImpl {
       conf.setInt("hdds.scm.replication.event.timeout", 20 * 1000);
       conf.setInt(OzoneConfigKeys.DFS_RATIS_SNAPSHOT_THRESHOLD_KEY, 100);
       conf.setInt(OzoneConfigKeys.DFS_CONTAINER_RATIS_LOG_PURGE_GAP, 100);
+
+      conf.setInt(OMConfigKeys.
+              OZONE_OM_RATIS_SNAPSHOT_AUTO_TRIGGER_THRESHOLD_KEY, 100);
+      conf.setInt(OMConfigKeys.OZONE_OM_RATIS_LOG_PURGE_GAP, 100);
     }
 
     /**
@@ -375,12 +257,6 @@ public abstract class MiniOzoneChaosCluster extends 
MiniOzoneHAClusterImpl {
     @Override
     public MiniOzoneChaosCluster build() throws IOException {
 
-      if (failureService == FailureService.OZONE_MANAGER && numOfOMs < 3) {
-        throw new IllegalArgumentException("Not enough number of " +
-            "OzoneManagers to test chaos on OzoneManagers. Set number of " +
-            "OzoneManagers to at least 3");
-      }
-
       DefaultMetricsSystem.setMiniClusterMode(true);
       initializeConfiguration();
       if (numOfOMs > 1) {
@@ -406,14 +282,9 @@ public abstract class MiniOzoneChaosCluster extends 
MiniOzoneHAClusterImpl {
       final List<HddsDatanodeService> hddsDatanodes = createHddsDatanodes(
           scm, null);
 
-      MiniOzoneChaosCluster cluster;
-      if (failureService == FailureService.DATANODE) {
-        cluster = new MiniOzoneDatanodeChaosCluster(conf, omList, scm,
-            hddsDatanodes, omServiceId);
-      } else {
-        cluster = new MiniOzoneOMChaosCluster(conf, omList, scm,
-            hddsDatanodes, omServiceId);
-      }
+      MiniOzoneChaosCluster cluster =
+          new MiniOzoneChaosCluster(conf, omList, scm, hddsDatanodes,
+              omServiceId, clazzes);
 
       if (startDataNodes) {
         cluster.startHddsDatanodes();
@@ -421,4 +292,78 @@ public abstract class MiniOzoneChaosCluster extends 
MiniOzoneHAClusterImpl {
       return cluster;
     }
   }
+
+  // OzoneManager specific
+  public static int getNumberOfOmToFail() {
+    return 1;
+  }
+
+  public Set<OzoneManager> omToFail() {
+    int numNodesToFail = getNumberOfOmToFail();
+    if (failedOmSet.size() >= numOzoneManagers/2) {
+      return Collections.emptySet();
+    }
+
+    int numOms = getOzoneManagersList().size();
+    Set<OzoneManager> oms = new HashSet<>();
+    for (int i = 0; i < numNodesToFail; i++) {
+      int failedNodeIndex = FailureManager.getBoundedRandomIndex(numOms);
+      oms.add(getOzoneManager(failedNodeIndex));
+    }
+    return oms;
+  }
+
+  public void shutdownOzoneManager(OzoneManager om) {
+    super.shutdownOzoneManager(om);
+    failedOmSet.add(om);
+  }
+
+  public void restartOzoneManager(OzoneManager om, boolean waitForOM)
+      throws IOException, TimeoutException, InterruptedException {
+    super.restartOzoneManager(om, waitForOM);
+    failedOmSet.remove(om);
+  }
+
+  // Should the selected node be stopped or started.
+  public boolean shouldStop() {
+    if (failedOmSet.size() >= numOzoneManagers/2) {
+      return false;
+    }
+    return RandomUtils.nextBoolean();
+  }
+
+  // Datanode specific
+  private int getNumberOfDnToFail() {
+    return RandomUtils.nextBoolean() ? 1 : 2;
+  }
+
+  public Set<DatanodeDetails> dnToFail() {
+    int numNodesToFail = getNumberOfDnToFail();
+    int numDns = getHddsDatanodes().size();
+    Set<DatanodeDetails> dns = new HashSet<>();
+    for (int i = 0; i < numNodesToFail; i++) {
+      int failedNodeIndex = FailureManager.getBoundedRandomIndex(numDns);
+      dns.add(getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails());
+    }
+    return dns;
+  }
+  
+  @Override
+  public void restartHddsDatanode(DatanodeDetails dn, boolean waitForDatanode)
+      throws InterruptedException, TimeoutException, IOException {
+    failedDnSet.add(dn);
+    super.restartHddsDatanode(dn, waitForDatanode);
+    failedDnSet.remove(dn);
+  }
+
+  @Override
+  public void shutdownHddsDatanode(DatanodeDetails dn) throws IOException {
+    failedDnSet.add(dn);
+    super.shutdownHddsDatanode(dn);
+  }
+
+  // Should the selected node be stopped or started.
+  public boolean shouldStop(DatanodeDetails dn) {
+    return !failedDnSet.contains(dn);
+  }
 }
diff --git 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java
 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java
deleted file mode 100644
index f402831..0000000
--- 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.ozone;
-
-import java.util.List;
-import java.util.concurrent.TimeoutException;
-import org.apache.hadoop.hdds.conf.OzoneConfiguration;
-import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
-import org.apache.hadoop.ozone.om.OzoneManager;
-
-/**
- * This class causes random failures in Datanodes in the chaos cluster.
- */
-public class MiniOzoneDatanodeChaosCluster extends MiniOzoneChaosCluster {
-
-  public MiniOzoneDatanodeChaosCluster(OzoneConfiguration conf,
-      List<OzoneManager> ozoneManagers,
-      StorageContainerManager scm,
-      List<HddsDatanodeService> hddsDatanodes,
-      String omServiceID) {
-    super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID,
-        FailureService.DATANODE);
-    setNumNodes(hddsDatanodes.size());
-  }
-
-  @Override
-  protected void restartNode(int failedNodeIndex, boolean waitForNodeRestart)
-      throws TimeoutException, InterruptedException {
-    restartHddsDatanode(failedNodeIndex, waitForNodeRestart);
-  }
-
-  @Override
-  protected void shutdownNode(int failedNodeIndex) {
-    shutdownHddsDatanode(failedNodeIndex);
-  }
-
-  @Override
-  protected String getFailedNodeID(int failedNodeIndex) {
-    return getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails()
-        .getUuidString();
-  }
-}
diff --git 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java
 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java
deleted file mode 100644
index 2b2a4d7..0000000
--- 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java
+++ /dev/null
@@ -1,132 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.ozone;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
-import java.util.concurrent.atomic.AtomicBoolean;
-
-import org.apache.commons.lang3.RandomUtils;
-import org.apache.hadoop.hdds.conf.OzoneConfiguration;
-import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
-import org.apache.hadoop.ozone.om.OzoneManager;
-
-/**
- * This class causes random failures in OMs in the chaos cluster.
- */
-public class MiniOzoneOMChaosCluster extends MiniOzoneChaosCluster {
-
-  // Cluster is deemed ready for chaos when all the OMs are up and running.
-  private AtomicBoolean isClusterReady = new AtomicBoolean(true);
-
-  // The maximum number of nodes failures which can be tolerated without
-  // losing quorum. This should be equal to (Num of OMs - 1)/2.
-  private int numOfOMNodeFailuresTolerated;
-
-  MiniOzoneOMChaosCluster(OzoneConfiguration conf,
-      List<OzoneManager> ozoneManagers,
-      StorageContainerManager scm,
-      List<HddsDatanodeService> hddsDatanodes,
-      String omServiceID) {
-    super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID,
-        FailureService.OZONE_MANAGER);
-    setNumNodes(ozoneManagers.size());
-    numOfOMNodeFailuresTolerated = (getNumNodes() - 1) / 2;
-  }
-
-  /**
-   * Check if cluster is ready for a restart or shutdown of an OM node. If
-   * yes, then set isClusterReady to false so that another thread cannot
-   * restart/ shutdown OM till all OMs are up again.
-   */
-  protected boolean isClusterReady() {
-    return isClusterReady.compareAndSet(true, false);
-  }
-
-  /**
-   * If any OM node is not running, restart it.
-   */
-  @Override
-  protected void getClusterReady()  {
-    boolean clusterReady = true;
-    for (OzoneManager om : getOzoneManagersList()) {
-      if (!om.isRunning()) {
-        try {
-          restartOzoneManager(om, true);
-        } catch (Exception e) {
-          clusterReady = false;
-          LOG.error("Cluster not ready for chaos. Failed to restart OM {}: {}",
-              om.getOMNodeId(), e);
-        }
-      }
-    }
-    if (clusterReady) {
-      isClusterReady.set(true);
-    }
-  }
-
-  @Override
-  protected int getNumberOfNodesToFail() {
-    return RandomUtils.nextInt(1, numOfOMNodeFailuresTolerated + 1);
-  }
-
-  @Override
-  protected void restartNode(int failedNodeIndex, boolean waitForNodeRestart)
-      throws IOException, TimeoutException, InterruptedException {
-    shutdownOzoneManager(failedNodeIndex);
-    restartOzoneManager(failedNodeIndex, waitForNodeRestart);
-    getClusterReady();
-  }
-
-  /**
-   * For OM chaos, a shutdown node should eventually be restarted before the
-   * next failure.
-   */
-  @Override
-  protected void shutdownNode(int failedNodeIndex)
-      throws ExecutionException, InterruptedException {
-    shutdownOzoneManager(failedNodeIndex);
-
-    // Restart the OM after FailureInterval / 2 duration.
-    Executors.newSingleThreadScheduledExecutor().schedule(
-        this::getClusterReady, getFailureIntervalInMS() / 2,
-        TimeUnit.MILLISECONDS).get();
-  }
-
-  @Override
-  protected String getFailedNodeID(int failedNodeIndex) {
-    return getOzoneManager(failedNodeIndex).getOMNodeId();
-  }
-
-  /**
-   * When restarting OM, always wait for it to catch up with Leader OM.
-   */
-  @Override
-  protected boolean isFastRestart() {
-    return true;
-  }
-
-  @Override
-  protected boolean shouldStop() {
-    return true;
-  }
-}
diff --git 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
index ea7fe69..9ad2a16 100644
--- 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
+++ 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
@@ -23,6 +23,7 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.ozone.client.ObjectStore;
 import org.apache.hadoop.ozone.client.OzoneVolume;
 import org.apache.hadoop.ozone.MiniOzoneChaosCluster.FailureService;
+import org.apache.hadoop.ozone.failure.Failures;
 import org.apache.hadoop.ozone.loadgenerators.RandomLoadGenerator;
 import org.apache.hadoop.ozone.loadgenerators.ReadOnlyLoadGenerator;
 import org.apache.hadoop.ozone.loadgenerators.FilesystemLoadGenerator;
@@ -92,17 +93,36 @@ public class TestMiniChaosOzoneCluster extends GenericCli {
   @BeforeClass
   public static void init() throws Exception {
     OzoneConfiguration configuration = new OzoneConfiguration();
-    String omServiceID =
-        FailureService.of(failureService) == FailureService.OZONE_MANAGER ?
-            OM_SERVICE_ID : null;
+    FailureService service = FailureService.of(failureService);
+    String omServiceID;
+
+    MiniOzoneChaosCluster.Builder builder =
+        new MiniOzoneChaosCluster.Builder(configuration);
+
+    switch (service) {
+    case DATANODE:
+      omServiceID = null;
+      builder
+          .addFailures(Failures.DatanodeRestartFailure.class)
+          .addFailures(Failures.DatanodeStartStopFailure.class);
+      break;
+    case OZONE_MANAGER:
+      omServiceID = OM_SERVICE_ID;
+      builder
+          .addFailures(Failures.OzoneManagerStartStopFailure.class)
+          .addFailures(Failures.OzoneManagerRestartFailure.class);
+      break;
+    default:
+      throw new IllegalArgumentException();
+    }
 
-    cluster = new MiniOzoneChaosCluster.Builder(configuration)
+    builder
         .setNumDatanodes(numDatanodes)
         .setNumOzoneManagers(numOzoneManagers)
-        .setFailureService(failureService)
         .setOMServiceID(omServiceID)
-        .setNumDataVolumes(numDataVolumes)
-        .build();
+        .setNumDataVolumes(numDataVolumes);
+
+    cluster = builder.build();
     cluster.waitForClusterToBeReady();
 
     String volumeName = RandomStringUtils.randomAlphabetic(10).toLowerCase();
diff --git 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/FailureManager.java
 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/FailureManager.java
new file mode 100644
index 0000000..15aa7f0
--- /dev/null
+++ 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/FailureManager.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.failure;
+
+import org.apache.commons.lang3.RandomUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.ozone.MiniOzoneChaosCluster;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Manages all the failures in the MiniOzoneChaosCluster.
+ */
+public class FailureManager {
+
+  static final Logger LOG =
+      LoggerFactory.getLogger(Failures.class);
+
+  private final MiniOzoneChaosCluster cluster;
+  private final List<Failures> failures;
+  private ScheduledFuture scheduledFuture;
+  private final ScheduledExecutorService executorService;
+  public FailureManager(MiniOzoneChaosCluster cluster,
+                        Configuration conf,
+                        List<Class<? extends Failures>> clazzes) {
+    this.cluster = cluster;
+    this.executorService = Executors.newSingleThreadScheduledExecutor();
+
+    failures = new ArrayList<>();
+    for (Class<? extends Failures> clazz : clazzes) {
+      Failures f = ReflectionUtils.newInstance(clazz, conf);
+      f.validateFailure(cluster);
+      failures.add(f);
+    }
+
+  }
+
+  // Fail nodes randomly at configured timeout period.
+  private void fail() {
+    Failures f = failures.get(getBoundedRandomIndex(failures.size()));
+    try {
+      LOG.info("time failure with {}", f.getName());
+      f.fail(cluster);
+    } catch (Throwable t) {
+      LOG.info("Caught exception while inducing failure:{}", f.getName(), t);
+      System.exit(-2);
+    }
+
+  }
+
+  public void start(long initialDelay, long period, TimeUnit timeUnit) {
+    LOG.info("starting failure manager {} {} {}", initialDelay,
+        period, timeUnit);
+    scheduledFuture = executorService.scheduleAtFixedRate(this::fail,
+        initialDelay, period, timeUnit);
+  }
+
+  public void stop() throws Exception {
+    if (scheduledFuture != null) {
+      scheduledFuture.cancel(false);
+      scheduledFuture.get();
+    }
+
+    executorService.shutdown();
+    executorService.awaitTermination(1, TimeUnit.MINUTES);
+  }
+
+  public static boolean isFastRestart() {
+    return RandomUtils.nextBoolean();
+  }
+
+  public static int getBoundedRandomIndex(int size) {
+    return RandomUtils.nextInt(0, size);
+  }
+}
diff --git 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/Failures.java
 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/Failures.java
new file mode 100644
index 0000000..6d226ca
--- /dev/null
+++ 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/Failures.java
@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.failure;
+
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.ozone.MiniOzoneChaosCluster;
+import org.apache.hadoop.ozone.om.OzoneManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Set;
+
+/**
+ * Implementation of all the failures.
+ */
+public abstract class Failures {
+  static final Logger LOG =
+      LoggerFactory.getLogger(Failures.class);
+
+  public String getName() {
+    return this.getClass().getSimpleName();
+  }
+
+  public abstract void fail(MiniOzoneChaosCluster cluster);
+
+  public abstract void validateFailure(MiniOzoneChaosCluster cluster);
+
+  /**
+   * Ozone Manager failures.
+   */
+  public abstract static class OzoneFailures extends Failures {
+    @Override
+    public void validateFailure(MiniOzoneChaosCluster cluster) {
+      if (cluster.getOzoneManagersList().size() < 3) {
+        throw new IllegalArgumentException("Not enough number of " +
+            "OzoneManagers to test chaos on OzoneManagers. Set number of " +
+            "OzoneManagers to at least 3");
+      }
+    }
+  }
+
+  /**
+   * Restart Ozone Manager to induce failure.
+   */
+  public static class OzoneManagerRestartFailure extends OzoneFailures {
+    public void fail(MiniOzoneChaosCluster cluster) {
+      boolean failureMode = FailureManager.isFastRestart();
+      Set<OzoneManager> oms = cluster.omToFail();
+      oms.parallelStream().forEach(om -> {
+        try {
+          cluster.shutdownOzoneManager(om);
+          cluster.restartOzoneManager(om, failureMode);
+          cluster.waitForClusterToBeReady();
+        } catch (Throwable t) {
+          LOG.error("Failed to restartNodes OM {}", om, t);
+        }
+      });
+    }
+  }
+
+  /**
+   * Start/Stop Ozone Manager to induce failure.
+   */
+  public static class OzoneManagerStartStopFailure extends OzoneFailures {
+    public void fail(MiniOzoneChaosCluster cluster) {
+      // Get the number of OzoneManager to fail in the cluster.
+      boolean shouldStop = cluster.shouldStop();
+      Set<OzoneManager> oms = cluster.omToFail();
+      oms.parallelStream().forEach(om -> {
+        try {
+          if (shouldStop) {
+            // start another OM before failing the next one.
+            cluster.shutdownOzoneManager(om);
+          } else {
+            cluster.restartOzoneManager(om, true);
+          }
+        } catch (Throwable t) {
+          LOG.error("Failed to shutdown OM {}", om, t);
+        }
+      });
+    }
+  }
+
+  /**
+   * Datanode failures.
+   */
+  public abstract static class DatanodeFailures extends Failures {
+    @Override
+    public void validateFailure(MiniOzoneChaosCluster cluster) {
+      // Nothing to do here.
+    }
+  }
+
+  /**
+   * Restart Datanodes to induce failure.
+   */
+  public static class DatanodeRestartFailure extends DatanodeFailures {
+    public void fail(MiniOzoneChaosCluster cluster) {
+      boolean failureMode = FailureManager.isFastRestart();
+      Set<DatanodeDetails> dns = cluster.dnToFail();
+      dns.parallelStream().forEach(dn -> {
+        try {
+          cluster.restartHddsDatanode(dn, failureMode);
+        } catch (Throwable t) {
+          LOG.error("Failed to restartNodes Datanode {}", dn.getUuid(), t);
+        }
+      });
+    }
+  }
+
+  /**
+   * Start/Stop Datanodes to induce failure.
+   */
+  public static class DatanodeStartStopFailure extends DatanodeFailures {
+    public void fail(MiniOzoneChaosCluster cluster) {
+      // Get the number of datanodes to fail in the cluster.
+      Set<DatanodeDetails> dns = cluster.dnToFail();
+      dns.parallelStream().forEach(dn -> {
+        try {
+          if (cluster.shouldStop(dn)) {
+            cluster.shutdownHddsDatanode(dn);
+          } else {
+            cluster.restartHddsDatanode(dn, true);
+          }
+        } catch (Throwable t) {
+          LOG.error("Failed to shutdown Datanode {}", dn.getUuid(), t);
+        }
+      });
+    }
+  }
+}
diff --git 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/package-info.java
 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/package-info.java
new file mode 100644
index 0000000..e93958a
--- /dev/null
+++ 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/failure/package-info.java
@@ -0,0 +1,19 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.failure;
\ No newline at end of file
diff --git 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/resources/log4j.properties
 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/resources/log4j.properties
index c3a1cec..aabb0b1 100644
--- 
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/resources/log4j.properties
+++ 
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/resources/log4j.properties
@@ -25,6 +25,7 @@ 
log4j.logger.org.apache.ratis.grpc.client.GrpcClientProtocolClient=WARN
 
 log4j.logger.org.apache.hadoop.ozone.utils=DEBUG,stdout,CHAOS
 log4j.logger.org.apache.hadoop.ozone.loadgenerators=DEBUG,stdout,CHAOS
+log4j.logger.org.apache.hadoop.ozone.failure=INFO, CHAOS
 log4j.appender.CHAOS.File=${chaoslogfilename}
 log4j.appender.CHAOS=org.apache.log4j.FileAppender
 log4j.appender.CHAOS.layout=org.apache.log4j.PatternLayout
diff --git 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
index 4548717..6953594 100644
--- 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
+++ 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
@@ -206,20 +206,13 @@ public class MiniOzoneHAClusterImpl extends 
MiniOzoneClusterImpl {
     }
   }
 
-  void shutdownOzoneManager(int omNodeIndex) {
-    OzoneManager ozoneManager = ozoneManagers.get(omNodeIndex);
+  public void shutdownOzoneManager(OzoneManager ozoneManager) {
     LOG.info("Shutting down OzoneManager " + ozoneManager.getOMNodeId());
 
     ozoneManager.stop();
   }
 
-  void restartOzoneManager(int omNodeIndex, boolean waitForOM)
-      throws IOException, TimeoutException, InterruptedException {
-    OzoneManager ozoneManager = ozoneManagers.get(omNodeIndex);
-    restartOzoneManager(ozoneManager, waitForOM);
-  }
-
-  void restartOzoneManager(OzoneManager ozoneManager, boolean waitForOM)
+  public void restartOzoneManager(OzoneManager ozoneManager, boolean waitForOM)
       throws IOException, TimeoutException, InterruptedException {
     LOG.info("Restarting OzoneManager " + ozoneManager.getOMNodeId());
     ozoneManager.restart();


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to