This is an automated email from the ASF dual-hosted git repository.

msingh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 410e4d3  HDDS-2339. Add OzoneManager to MiniOzoneChaosCluster (#643)
410e4d3 is described below

commit 410e4d3f5416f45f4bc6e85c1a2c46a8dcae2d39
Author: Hanisha Koneru <[email protected]>
AuthorDate: Tue Mar 31 10:42:12 2020 -0700

    HDDS-2339. Add OzoneManager to MiniOzoneChaosCluster (#643)
---
 .../apache/hadoop/ozone/MiniOzoneChaosCluster.java | 240 ++++++++++++++++-----
 .../ozone/MiniOzoneDatanodeChaosCluster.java       |  57 +++++
 .../hadoop/ozone/MiniOzoneLoadGenerator.java       |   9 +-
 .../hadoop/ozone/MiniOzoneOMChaosCluster.java      | 132 ++++++++++++
 .../hadoop/ozone/TestMiniChaosOzoneCluster.java    |  26 ++-
 .../org/apache/hadoop/ozone/utils/LoadBucket.java  |  17 +-
 .../hadoop/ozone/MiniOzoneHAClusterImpl.java       | 133 +++++++++---
 .../org/apache/hadoop/ozone/om/OzoneManager.java   |  34 ++-
 8 files changed, 540 insertions(+), 108 deletions(-)

diff --git a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
index 22cb3b4..65eb86d 100644
--- a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
+++ b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
@@ -18,11 +18,14 @@
 
 package org.apache.hadoop.ozone;
 
+import java.util.Arrays;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
 import org.apache.commons.lang3.RandomUtils;
 import org.apache.hadoop.conf.StorageUnit;
 import org.apache.hadoop.hdds.HddsConfigKeys;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
-import org.apache.hadoop.hdds.protocol.DatanodeDetails;
 import org.apache.hadoop.hdds.scm.ScmConfigKeys;
 import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -42,12 +45,21 @@ import java.util.concurrent.Executors;
 /**
  * This class causes random failures in the chaos cluster.
  */
-public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
+public abstract class MiniOzoneChaosCluster extends MiniOzoneHAClusterImpl {
 
   static final Logger LOG =
       LoggerFactory.getLogger(MiniOzoneChaosCluster.class);
 
   private final int numDatanodes;
+  private final int numOzoneManagers;
+
+  // Number of Nodes of the service (Datanode or OM) on which chaos will be
+  // unleashed
+  private int numNodes;
+
+  private FailureService failureService;
+  private long failureIntervalInMS;
+
   private final ScheduledExecutorService executorService;
 
   private ScheduledFuture scheduledFuture;
@@ -57,38 +69,98 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
     NODES_SHUTDOWN
   }
 
+  // The service on which chaos will be unleashed.
+  enum FailureService {
+    DATANODE,
+    OZONE_MANAGER;
+
+    public String toString() {
+      if (this == DATANODE) {
+        return "Datanode";
+      } else {
+        return "OzoneManager";
+      }
+    }
+
+    public static FailureService of(String serviceName) {
+      if (serviceName.equalsIgnoreCase("Datanode")) {
+        return DATANODE;
+      } else if (serviceName.equalsIgnoreCase("OzoneManager")) {
+        return OZONE_MANAGER;
+      }
+      throw new IllegalArgumentException("Unrecognized value for " +
+          "FailureService enum: " + serviceName);
+    }
+  }
+
   public MiniOzoneChaosCluster(OzoneConfiguration conf,
-                               OzoneManager ozoneManager,
-                       StorageContainerManager scm,
-                       List<HddsDatanodeService> hddsDatanodes) {
-    super(conf, ozoneManager, scm, hddsDatanodes);
+      List<OzoneManager> ozoneManagers, StorageContainerManager scm,
+      List<HddsDatanodeService> hddsDatanodes, String omServiceID,
+      FailureService service) {
+    super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID);
 
     this.executorService =  Executors.newSingleThreadScheduledExecutor();
     this.numDatanodes = getHddsDatanodes().size();
-    LOG.info("Starting MiniOzoneChaosCluster with {} datanodes", numDatanodes);
+    this.numOzoneManagers = ozoneManagers.size();
+    this.failureService = service;
+    LOG.info("Starting MiniOzoneChaosCluster with {} OzoneManagers and {} " +
+        "Datanodes, chaos on service: {}",
+        numOzoneManagers, numDatanodes, failureService);
+  }
+
+  protected int getNumNodes() {
+    return numNodes;
+  }
+
+  protected void setNumNodes(int numOfNodes) {
+    this.numNodes = numOfNodes;
+  }
+
+  protected long getFailureIntervalInMS() {
+    return failureIntervalInMS;
+  }
+
+  /**
+   * Is the cluster ready for chaos.
+   */
+  protected boolean isClusterReady() {
+    return true;
   }
 
-  // Get the number of datanodes to fail in the cluster.
-  private int getNumberOfNodesToFail() {
+  protected void getClusterReady() {
+    // Do nothing
+  }
+
+  // Get the number of nodes to fail in the cluster.
+  protected int getNumberOfNodesToFail() {
     return RandomUtils.nextBoolean() ? 1 : 2;
   }
 
-  // Should the failed node wait for SCM to register the even before
+  // Should the failed node wait for SCM to register even before
   // restart, i.e fast restart or not.
-  private boolean isFastRestart() {
+  protected boolean isFastRestart() {
     return RandomUtils.nextBoolean();
   }
 
   // Should the selected node be stopped or started.
-  private boolean shouldStop() {
+  protected boolean shouldStop() {
     return RandomUtils.nextBoolean();
   }
 
-  // Get the datanode index of the datanode to fail.
+  // Get the node index of the node to fail.
   private int getNodeToFail() {
-    return RandomUtils.nextInt() % numDatanodes;
+    return RandomUtils.nextInt() % numNodes;
   }
 
+  protected abstract void restartNode(int failedNodeIndex,
+      boolean waitForNodeRestart)
+      throws TimeoutException, InterruptedException, IOException;
+
+  protected abstract void shutdownNode(int failedNodeIndex)
+      throws ExecutionException, InterruptedException;
+
+  protected abstract String getFailedNodeID(int failedNodeIndex);
+
   private void restartNodes() {
     final int numNodesToFail = getNumberOfNodesToFail();
     LOG.info("Will restart {} nodes to simulate failure", numNodesToFail);
@@ -96,15 +168,16 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
       boolean failureMode = isFastRestart();
       int failedNodeIndex = getNodeToFail();
       String failString = failureMode ? "Fast" : "Slow";
-      DatanodeDetails dn =
-          getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails();
+      String failedNodeID = getFailedNodeID(failedNodeIndex);
       try {
-        LOG.info("{} Restarting DataNode: {}", failString, dn.getUuid());
-        restartHddsDatanode(failedNodeIndex, failureMode);
-        LOG.info("{} Completed restarting Datanode: {}", failString,
-            dn.getUuid());
+        LOG.info("{} Restarting {}: {}", failString, failureService,
+            failedNodeID);
+        restartNode(failedNodeIndex, failureMode);
+        LOG.info("{} Completed restarting {}: {}", failString, failureService,
+            failedNodeID);
       } catch (Exception e) {
-        LOG.error("Failed to restartNodes Datanode {}", dn.getUuid(), e);
+        LOG.error("Failed to restartNodes {}: {}", failedNodeID,
+            failureService, e);
       }
     }
   }
@@ -116,20 +189,19 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
       boolean shouldStop = shouldStop();
       int failedNodeIndex = getNodeToFail();
       String stopString = shouldStop ? "Stopping" : "Restarting";
-      DatanodeDetails dn =
-          getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails();
+      String failedNodeID = getFailedNodeID(failedNodeIndex);
       try {
-        LOG.info("{} DataNode {}", stopString, dn.getUuid());
-
+        LOG.info("{} {} {}", stopString, failureService, failedNodeID);
         if (shouldStop) {
-          shutdownHddsDatanode(failedNodeIndex);
+          shutdownNode(failedNodeIndex);
         } else {
-          restartHddsDatanode(failedNodeIndex, true);
+          restartNode(failedNodeIndex, false);
         }
-        LOG.info("Completed {} DataNode {}", stopString, dn.getUuid());
-
+        LOG.info("Completed {} {} {}", stopString, failureService,
+            failedNodeID);
       } catch (Exception e) {
-        LOG.error("Failed {} Datanode {}", stopString, dn.getUuid(), e);
+        LOG.error("Failed {} {} {}", stopString, failureService,
+            failedNodeID, e);
       }
     }
   }
@@ -141,24 +213,33 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
 
   // Fail nodes randomly at configured timeout period.
   private void fail() {
-    FailureMode mode = getFailureMode();
-    switch (mode) {
-    case NODES_RESTART:
-      restartNodes();
-      break;
-    case NODES_SHUTDOWN:
-      shutdownNodes();
-      break;
-
-    default:
-      LOG.error("invalid failure mode:{}", mode);
-      break;
+    if (isClusterReady()) {
+      FailureMode mode = getFailureMode();
+      switch (mode) {
+      case NODES_RESTART:
+        restartNodes();
+        break;
+      case NODES_SHUTDOWN:
+        shutdownNodes();
+        break;
+
+      default:
+        LOG.error("invalid failure mode:{}", mode);
+        break;
+      }
+    } else {
+      // Cluster is not ready for failure yet. Skip failing this time and get
+      // the cluster ready by restarting any OM that is not running.
+      LOG.info("Cluster is not ready for failure.");
+      getClusterReady();
     }
   }
 
   void startChaos(long initialDelay, long period, TimeUnit timeUnit) {
-    LOG.info("Starting Chaos with failure period:{} unit:{} numDataNodes:{}",
-        period, timeUnit, numDatanodes);
+    LOG.info("Starting Chaos with failure period:{} unit:{} numDataNodes:{} " +
+            "numOzoneManagers:{}", period, timeUnit, numDatanodes,
+        numOzoneManagers);
+    this.failureIntervalInMS = TimeUnit.MILLISECONDS.convert(period, timeUnit);
     scheduledFuture = executorService.scheduleAtFixedRate(this::fail,
         initialDelay, period, timeUnit);
   }
@@ -186,7 +267,9 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
   /**
    * Builder for configuring the MiniOzoneChaosCluster to run.
    */
-  public static class Builder extends MiniOzoneClusterImpl.Builder {
+  public static class Builder extends MiniOzoneHAClusterImpl.Builder {
+
+    private FailureService failureService;
 
     /**
      * Creates a new Builder.
@@ -200,9 +283,7 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
     /**
      * Sets the number of HddsDatanodes to be started as part of
      * MiniOzoneChaosCluster.
-     *
      * @param val number of datanodes
-     *
      * @return MiniOzoneChaosCluster.Builder
      */
     public Builder setNumDatanodes(int val) {
@@ -210,7 +291,31 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
       return this;
     }
 
-    @Override
+    /**
+     * Sets the number of OzoneManagers to be started as part of
+     * MiniOzoneChaosCluster.
+     * @param val number of OzoneManagers
+     * @return MiniOzoneChaosCluster.Builder
+     */
+    public Builder setNumOzoneManagers(int val) {
+      super.setNumOfOzoneManagers(val);
+      super.setNumOfActiveOMs(val);
+      return this;
+    }
+
+    /**
+     * Sets OM Service ID.
+     */
+    public Builder setOMServiceID(String omServiceID) {
+      super.setOMServiceId(omServiceID);
+      return this;
+    }
+
+    public Builder setFailureService(String serviceName) {
+      this.failureService = FailureService.of(serviceName);
+      return this;
+    }
+
     protected void initializeConfiguration() throws IOException {
       super.initializeConfiguration();
       conf.setStorageSize(ScmConfigKeys.OZONE_SCM_CHUNK_SIZE_KEY,
@@ -257,26 +362,47 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
 
     @Override
     public MiniOzoneChaosCluster build() throws IOException {
+
+      if (failureService == FailureService.OZONE_MANAGER && numOfOMs < 3) {
+        throw new IllegalArgumentException("Not enough number of " +
+            "OzoneManagers to test chaos on OzoneManagers. Set number of " +
+            "OzoneManagers to at least 3");
+      }
+
       DefaultMetricsSystem.setMiniClusterMode(true);
       initializeConfiguration();
+      if (numOfOMs > 1) {
+        initOMRatisConf();
+      }
+
       StorageContainerManager scm;
-      OzoneManager om;
+      List<OzoneManager> omList;
       try {
         scm = createSCM();
         scm.start();
-        om = createOM();
-        if(certClient != null) {
-          om.setCertClient(certClient);
+        if (numOfOMs > 1) {
+          omList = createOMService();
+        } else {
+          OzoneManager om = createOM();
+          om.start();
+          omList = Arrays.asList(om);
         }
       } catch (AuthenticationException ex) {
         throw new IOException("Unable to build MiniOzoneCluster. ", ex);
       }
 
-      om.start();
-      final List<HddsDatanodeService> hddsDatanodes =
-          createHddsDatanodes(scm, null);
-      MiniOzoneChaosCluster cluster =
-          new MiniOzoneChaosCluster(conf, om, scm, hddsDatanodes);
+      final List<HddsDatanodeService> hddsDatanodes = createHddsDatanodes(
+          scm, null);
+
+      MiniOzoneChaosCluster cluster;
+      if (failureService == FailureService.DATANODE) {
+        cluster = new MiniOzoneDatanodeChaosCluster(conf, omList, scm,
+            hddsDatanodes, omServiceId);
+      } else {
+        cluster = new MiniOzoneOMChaosCluster(conf, omList, scm,
+            hddsDatanodes, omServiceId);
+      }
+
       if (startDataNodes) {
         cluster.startHddsDatanodes();
       }
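
As a point of reference, here is a minimal sketch of how the extended Builder above might be driven from a test in the org.apache.hadoop.ozone package (the concrete sizes, service id and startChaos() parameters are illustrative assumptions, not values mandated by this patch; exception handling is omitted):

    OzoneConfiguration conf = new OzoneConfiguration();
    MiniOzoneChaosCluster cluster = new MiniOzoneChaosCluster.Builder(conf)
        .setNumDatanodes(20)
        .setNumOzoneManagers(3)            // OM chaos needs at least 3 OMs
        .setFailureService("ozoneManager") // or "datanode"
        .setOMServiceID("ozoneChaosTest")  // may be null for datanode chaos
        .build();
    cluster.waitForClusterToBeReady();
    // fail nodes every 300 seconds, starting after a 5 second delay
    cluster.startChaos(5, 300, TimeUnit.SECONDS);

Note that build() rejects the OzoneManager failure service when fewer than three OMs are configured, as enforced above.
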
diff --git a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java
new file mode 100644
index 0000000..f402831
--- /dev/null
+++ b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ozone;
+
+import java.util.List;
+import java.util.concurrent.TimeoutException;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
+import org.apache.hadoop.ozone.om.OzoneManager;
+
+/**
+ * This class causes random failures in Datanodes in the chaos cluster.
+ */
+public class MiniOzoneDatanodeChaosCluster extends MiniOzoneChaosCluster {
+
+  public MiniOzoneDatanodeChaosCluster(OzoneConfiguration conf,
+      List<OzoneManager> ozoneManagers,
+      StorageContainerManager scm,
+      List<HddsDatanodeService> hddsDatanodes,
+      String omServiceID) {
+    super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID,
+        FailureService.DATANODE);
+    setNumNodes(hddsDatanodes.size());
+  }
+
+  @Override
+  protected void restartNode(int failedNodeIndex, boolean waitForNodeRestart)
+      throws TimeoutException, InterruptedException {
+    restartHddsDatanode(failedNodeIndex, waitForNodeRestart);
+  }
+
+  @Override
+  protected void shutdownNode(int failedNodeIndex) {
+    shutdownHddsDatanode(failedNodeIndex);
+  }
+
+  @Override
+  protected String getFailedNodeID(int failedNodeIndex) {
+    return getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails()
+        .getUuidString();
+  }
+}
diff --git a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneLoadGenerator.java b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneLoadGenerator.java
index d1256b1..62a1db2 100644
--- a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneLoadGenerator.java
+++ b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneLoadGenerator.java
@@ -47,14 +47,16 @@ public class MiniOzoneLoadGenerator {
 
   private final OzoneVolume volume;
   private final OzoneConfiguration conf;
+  private final String omServiceID;
 
   MiniOzoneLoadGenerator(OzoneVolume volume, int numClients, int numThreads,
-                         int numBuffers, OzoneConfiguration conf)
+      int numBuffers, OzoneConfiguration conf, String omServiceId)
       throws Exception {
     DataBuffer buffer = new DataBuffer(numBuffers);
     loadExecutors = new ArrayList<>();
     this.volume = volume;
     this.conf = conf;
+    this.omServiceID = omServiceId;
 
     // Random Load
     String mixBucketName = RandomStringUtils.randomAlphabetic(10).toLowerCase();
@@ -62,7 +64,7 @@ public class MiniOzoneLoadGenerator {
     List<LoadBucket> ozoneBuckets = new ArrayList<>(numClients);
     for (int i = 0; i < numClients; i++) {
       ozoneBuckets.add(new LoadBucket(volume.getBucket(mixBucketName),
-          conf));
+          conf, omServiceId));
     }
     RandomLoadGenerator loadGenerator =
         new RandomLoadGenerator(buffer, ozoneBuckets);
@@ -82,7 +84,8 @@ public class MiniOzoneLoadGenerator {
       throws Exception {
     String bucketName = RandomStringUtils.randomAlphabetic(10).toLowerCase();
     volume.createBucket(bucketName);
-    LoadBucket bucket = new LoadBucket(volume.getBucket(bucketName), conf);
+    LoadBucket bucket = new LoadBucket(volume.getBucket(bucketName), conf,
+        omServiceID);
     LoadGenerator loadGenerator = function.apply(bucket);
     loadExecutors.add(new LoadExecutors(numThreads, loadGenerator));
   }
diff --git a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java
new file mode 100644
index 0000000..2b2a4d7
--- /dev/null
+++ b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ozone;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.apache.commons.lang3.RandomUtils;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
+import org.apache.hadoop.ozone.om.OzoneManager;
+
+/**
+ * This class causes random failures in OMs in the chaos cluster.
+ */
+public class MiniOzoneOMChaosCluster extends MiniOzoneChaosCluster {
+
+  // Cluster is deemed ready for chaos when all the OMs are up and running.
+  private AtomicBoolean isClusterReady = new AtomicBoolean(true);
+
+  // The maximum number of nodes failures which can be tolerated without
+  // losing quorum. This should be equal to (Num of OMs - 1)/2.
+  private int numOfOMNodeFailuresTolerated;
+
+  MiniOzoneOMChaosCluster(OzoneConfiguration conf,
+      List<OzoneManager> ozoneManagers,
+      StorageContainerManager scm,
+      List<HddsDatanodeService> hddsDatanodes,
+      String omServiceID) {
+    super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID,
+        FailureService.OZONE_MANAGER);
+    setNumNodes(ozoneManagers.size());
+    numOfOMNodeFailuresTolerated = (getNumNodes() - 1) / 2;
+  }
+
+  /**
+   * Check if cluster is ready for a restart or shutdown of an OM node. If
+   * yes, then set isClusterReady to false so that another thread cannot
+   * restart/ shutdown OM till all OMs are up again.
+   */
+  protected boolean isClusterReady() {
+    return isClusterReady.compareAndSet(true, false);
+  }
+
+  /**
+   * If any OM node is not running, restart it.
+   */
+  @Override
+  protected void getClusterReady()  {
+    boolean clusterReady = true;
+    for (OzoneManager om : getOzoneManagersList()) {
+      if (!om.isRunning()) {
+        try {
+          restartOzoneManager(om, true);
+        } catch (Exception e) {
+          clusterReady = false;
+          LOG.error("Cluster not ready for chaos. Failed to restart OM {}: {}",
+              om.getOMNodeId(), e);
+        }
+      }
+    }
+    if (clusterReady) {
+      isClusterReady.set(true);
+    }
+  }
+
+  @Override
+  protected int getNumberOfNodesToFail() {
+    return RandomUtils.nextInt(1, numOfOMNodeFailuresTolerated + 1);
+  }
+
+  @Override
+  protected void restartNode(int failedNodeIndex, boolean waitForNodeRestart)
+      throws IOException, TimeoutException, InterruptedException {
+    shutdownOzoneManager(failedNodeIndex);
+    restartOzoneManager(failedNodeIndex, waitForNodeRestart);
+    getClusterReady();
+  }
+
+  /**
+   * For OM chaos, a shutdown node should eventually be restarted before the
+   * next failure.
+   */
+  @Override
+  protected void shutdownNode(int failedNodeIndex)
+      throws ExecutionException, InterruptedException {
+    shutdownOzoneManager(failedNodeIndex);
+
+    // Restart the OM after FailureInterval / 2 duration.
+    Executors.newSingleThreadScheduledExecutor().schedule(
+        this::getClusterReady, getFailureIntervalInMS() / 2,
+        TimeUnit.MILLISECONDS).get();
+  }
+
+  @Override
+  protected String getFailedNodeID(int failedNodeIndex) {
+    return getOzoneManager(failedNodeIndex).getOMNodeId();
+  }
+
+  /**
+   * When restarting OM, always wait for it to catch up with Leader OM.
+   */
+  @Override
+  protected boolean isFastRestart() {
+    return true;
+  }
+
+  @Override
+  protected boolean shouldStop() {
+    return true;
+  }
+}
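
The failure budget computed above is the usual Ratis majority rule: a ring of n OMs keeps quorum as long as at most (n - 1) / 2 of them are down. A small illustration with example cluster sizes (not defaults from this patch):

    int numNodes = 5;                                // example: 5 OMs
    int tolerated = (numNodes - 1) / 2;              // = 2 (1 when n = 3)
    // commons-lang3 nextInt(start, end) is end-exclusive, so this yields 1..2
    int nodesToFail = RandomUtils.nextInt(1, tolerated + 1);

With the minimum of 3 OMs required by the Builder, exactly one OM is failed per chaos event.
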
diff --git a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
index 0fa9a14..53718e4 100644
--- a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
+++ b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
@@ -21,6 +21,7 @@ import org.apache.commons.lang3.RandomStringUtils;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.ozone.client.ObjectStore;
 import org.apache.hadoop.ozone.client.OzoneVolume;
+import org.apache.hadoop.ozone.MiniOzoneChaosCluster.FailureService;
 import org.junit.BeforeClass;
 import org.junit.AfterClass;
 import org.junit.Ignore;
@@ -43,6 +44,15 @@ public class TestMiniChaosOzoneCluster implements Runnable {
       description = "num of datanodes")
   private static int numDatanodes = 20;
 
+  @Option(names = {"-o", "--numOzoneManager"},
+      description = "num of ozoneManagers")
+  private static int numOzoneManagers = 1;
+
+  @Option(names = {"-s", "--failureService"},
+      description = "service (datanode or ozoneManager) to test chaos on",
+      defaultValue = "datanode")
+  private static String failureService = "datanode";
+
   @Option(names = {"-t", "--numThreads"},
       description = "num of IO threads")
   private static int numThreads = 5;
@@ -61,16 +71,26 @@ public class TestMiniChaosOzoneCluster implements Runnable {
 
   @Option(names = {"-i", "--failureInterval"},
       description = "time between failure events in seconds")
-  private static int failureInterval = 300; // 5 second period between failures.
+  private static int failureInterval = 300; // 5 minute period between failures.
 
   private static MiniOzoneChaosCluster cluster;
   private static MiniOzoneLoadGenerator loadGenerator;
 
+  private static final String OM_SERVICE_ID = "ozoneChaosTest";
+
   @BeforeClass
   public static void init() throws Exception {
     OzoneConfiguration configuration = new OzoneConfiguration();
+    String omServiceID =
+        FailureService.of(failureService) == FailureService.OZONE_MANAGER ?
+            OM_SERVICE_ID : null;
+
     cluster = new MiniOzoneChaosCluster.Builder(configuration)
-        .setNumDatanodes(numDatanodes).build();
+        .setNumDatanodes(numDatanodes)
+        .setNumOzoneManagers(numOzoneManagers)
+        .setFailureService(failureService)
+        .setOMServiceID(omServiceID)
+        .build();
     cluster.waitForClusterToBeReady();
 
     String volumeName = RandomStringUtils.randomAlphabetic(10).toLowerCase();
@@ -80,7 +100,7 @@ public class TestMiniChaosOzoneCluster implements Runnable {
 
     loadGenerator =
         new MiniOzoneLoadGenerator(volume, numClients, numThreads,
-            numBuffers, configuration);
+            numBuffers, configuration, omServiceID);
   }
 
   /**
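
The new picocli options feed straight into the Builder calls in init() above. A hypothetical invocation targeting OM chaos would therefore be parameterized roughly like:

    -d 10 -o 3 -s ozoneManager -t 5 -i 300

i.e. 10 datanodes, 3 OzoneManagers, chaos unleashed on the OzoneManager service, 5 IO threads, and one failure event every 300 seconds. (How the class is launched is outside this diff; the flags shown are only those declared by the @Option annotations.)
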
diff --git a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/utils/LoadBucket.java b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/utils/LoadBucket.java
index 2fb92d1..8e1ef31 100644
--- a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/utils/LoadBucket.java
+++ b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/utils/LoadBucket.java
@@ -49,10 +49,15 @@ public class LoadBucket {
   private final OzoneBucket bucket;
   private final OzoneFileSystem fs;
 
-  public LoadBucket(OzoneBucket bucket, OzoneConfiguration conf)
-      throws Exception {
+  public LoadBucket(OzoneBucket bucket, OzoneConfiguration conf,
+      String omServiceID) throws Exception {
     this.bucket = bucket;
-    this.fs = (OzoneFileSystem)FileSystem.get(getFSUri(bucket), conf);
+    if (omServiceID == null) {
+      this.fs = (OzoneFileSystem) FileSystem.get(getFSUri(bucket), conf);
+    } else {
+      this.fs = (OzoneFileSystem) FileSystem.get(getFSUri(bucket, omServiceID),
+          conf);
+    }
   }
 
   private boolean isFsOp() {
@@ -97,6 +102,12 @@ public class LoadBucket {
       bucket.getName(), bucket.getVolumeName()));
   }
 
+  private static URI getFSUri(OzoneBucket bucket, String omServiceID)
+      throws URISyntaxException {
+    return new URI(String.format("%s://%s.%s.%s/", OzoneConsts.OZONE_URI_SCHEME,
+        bucket.getName(), bucket.getVolumeName(), omServiceID));
+  }
+
   abstract class Op {
     private final boolean fsOp;
     private final String opName;
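
The two getFSUri() variants above differ only in whether the OM service id is appended to the authority. Assuming OzoneConsts.OZONE_URI_SCHEME resolves to the o3fs scheme, the generated file system URIs look like (bucket, volume and service id names are examples):

    o3fs://bucket1.vol1/                  // non-HA: bucket.volume
    o3fs://bucket1.vol1.ozoneChaosTest/   // HA: bucket.volume.omServiceId

which is why LoadBucket now needs the service id to open an OzoneFileSystem against an OM HA cluster.
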
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
index 8a74a19..56c9be1 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
@@ -18,6 +18,10 @@
 
 package org.apache.hadoop.ozone;
 
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -29,16 +33,17 @@ import org.apache.hadoop.ozone.om.OMStorage;
 import org.apache.hadoop.ozone.om.OzoneManager;
 import org.apache.hadoop.ozone.recon.ReconServer;
 import org.apache.hadoop.security.authentication.client.AuthenticationException;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.net.BindException;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.TimeUnit;
 
 import static org.apache.hadoop.hdds.HddsConfigKeys.OZONE_METADATA_DIRS;
@@ -48,7 +53,7 @@ import static org.apache.hadoop.hdds.HddsConfigKeys.OZONE_METADATA_DIRS;
  * with OM HA suitable for running tests.  The cluster consists of a set of
  * OzoneManagers, StorageContainerManager and multiple DataNodes.
  */
-public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
+public class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
 
   private static final Logger LOG =
       LoggerFactory.getLogger(MiniOzoneHAClusterImpl.class);
@@ -61,20 +66,21 @@ public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
   private List<OzoneManager> activeOMs;
   private List<OzoneManager> inactiveOMs;
 
+  private int waitForOMToBeReadyTimeout = 120000; // 2 min
+
   private static final Random RANDOM = new Random();
   private static final int RATIS_LEADER_ELECTION_TIMEOUT = 1000; // 1 seconds
 
   public static final int NODE_FAILURE_TIMEOUT = 2000; // 2 seconds
 
   /**
-   * Creates a new MiniOzoneCluster with OM HA.
+   * Creates a new MiniOzoneCluster.
    *
    * @throws IOException if there is an I/O error
    */
   @SuppressWarnings("checkstyle:ParameterNumber")
   private MiniOzoneHAClusterImpl(
       OzoneConfiguration conf,
-      Map<String, OzoneManager> omMap,
       List<OzoneManager> activeOMList,
       List<OzoneManager> inactiveOMList,
       StorageContainerManager scm,
@@ -82,11 +88,40 @@ public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
       String omServiceId,
       ReconServer reconServer) {
     super(conf, scm, hddsDatanodes, reconServer);
-    this.ozoneManagerMap = omMap;
-    this.ozoneManagers = new ArrayList<>(omMap.values());
+
+    this.ozoneManagerMap = Maps.newHashMap();
+    if (activeOMList != null) {
+      for (OzoneManager om : activeOMList) {
+        this.ozoneManagerMap.put(om.getOMNodeId(), om);
+      }
+    }
+    if (inactiveOMList != null) {
+      for (OzoneManager om : inactiveOMList) {
+        this.ozoneManagerMap.put(om.getOMNodeId(), om);
+      }
+    }
+    this.ozoneManagers = new ArrayList<>(ozoneManagerMap.values());
     this.activeOMs = activeOMList;
     this.inactiveOMs = inactiveOMList;
     this.omServiceId = omServiceId;
+
+    // If the serviceID is null, then this should be a non-HA cluster.
+    if (omServiceId == null) {
+      Preconditions.checkArgument(ozoneManagers.size() <= 1);
+    }
+  }
+
+  /**
+   * Creates a new MiniOzoneCluster with all OMs active.
+   * This is used by MiniOzoneChaosCluster.
+   */
+  protected MiniOzoneHAClusterImpl(
+      OzoneConfiguration conf,
+      List<OzoneManager> omList,
+      StorageContainerManager scm,
+      List<HddsDatanodeService> hddsDatanodes,
+      String omServiceId) {
+    this(conf, omList, null, scm, hddsDatanodes, omServiceId, null);
   }
 
   @Override
@@ -105,7 +140,13 @@ public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
 
   @Override
   public OzoneClient getRpcClient() throws IOException {
-    return OzoneClientFactory.getRpcClient(getServiceId(), getConf());
+    if (omServiceId == null) {
+      // Non-HA cluster.
+      return OzoneClientFactory.getRpcClient(getConf());
+    } else {
+      // HA cluster
+      return OzoneClientFactory.getRpcClient(omServiceId, getConf());
+    }
   }
 
   public boolean isOMActive(String omNodeId) {
@@ -120,6 +161,10 @@ public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
     return this.ozoneManagerMap.get(omNodeId);
   }
 
+  public List<OzoneManager> getOzoneManagersList() {
+    return ozoneManagers;
+  }
+
   /**
    * Get OzoneManager leader object.
    * @return OzoneManager object, null if there isn't one or more than one
@@ -162,6 +207,30 @@ public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
     }
   }
 
+  void shutdownOzoneManager(int omNodeIndex) {
+    OzoneManager ozoneManager = ozoneManagers.get(omNodeIndex);
+    LOG.info("Shutting down OzoneManager " + ozoneManager.getOMNodeId());
+
+    ozoneManager.stop();
+  }
+
+  void restartOzoneManager(int omNodeIndex, boolean waitForOM)
+      throws IOException, TimeoutException, InterruptedException {
+    OzoneManager ozoneManager = ozoneManagers.get(omNodeIndex);
+    restartOzoneManager(ozoneManager, waitForOM);
+  }
+
+  void restartOzoneManager(OzoneManager ozoneManager, boolean waitForOM)
+      throws IOException, TimeoutException, InterruptedException {
+    LOG.info("Restarting OzoneManager " + ozoneManager.getOMNodeId());
+    ozoneManager.restart();
+
+    if (waitForOM) {
+      GenericTestUtils.waitFor(ozoneManager::isRunning,
+          1000, waitForOMToBeReadyTimeout);
+    }
+  }
+
   @Override
   public void stop() {
     for (OzoneManager ozoneManager : ozoneManagers) {
@@ -211,15 +280,16 @@ public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
       if (numOfActiveOMs == ACTIVE_OMS_NOT_SET) {
         numOfActiveOMs = numOfOMs;
       }
+
       DefaultMetricsSystem.setMiniClusterMode(true);
       initializeConfiguration();
+      initOMRatisConf();
       StorageContainerManager scm;
-      Map<String, OzoneManager> omMap;
       ReconServer reconServer = null;
       try {
         scm = createSCM();
         scm.start();
-        omMap = createOMService();
+        createOMService();
         if (includeRecon) {
           configureRecon();
           reconServer = new ReconServer();
@@ -229,26 +299,25 @@ public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
         throw new IOException("Unable to build MiniOzoneCluster. ", ex);
       }
 
-      final List<HddsDatanodeService> hddsDatanodes = createHddsDatanodes(scm,
-          reconServer);
-      MiniOzoneHAClusterImpl cluster = new MiniOzoneHAClusterImpl(
-          conf, omMap, activeOMs, inactiveOMs, scm, hddsDatanodes,
-          omServiceId, reconServer);
+      final List<HddsDatanodeService> hddsDatanodes = createHddsDatanodes(
+          scm, reconServer);
+
+      MiniOzoneHAClusterImpl cluster = new MiniOzoneHAClusterImpl(conf,
+          activeOMs, inactiveOMs, scm, hddsDatanodes, omServiceId, reconServer);
+
       if (startDataNodes) {
         cluster.startHddsDatanodes();
       }
       return cluster;
     }
 
-    /**
-     * Initialize OM configurations.
-     * @throws IOException
-     */
-    @Override
-    protected void initializeConfiguration() throws IOException {
-      super.initializeConfiguration();
+    protected void initOMRatisConf() {
       conf.setBoolean(OMConfigKeys.OZONE_OM_RATIS_ENABLE_KEY, true);
       conf.setInt(OMConfigKeys.OZONE_OM_HANDLER_COUNT_KEY, numOfOmHandlers);
+      conf.setLong(
+          OMConfigKeys.OZONE_OM_RATIS_SNAPSHOT_AUTO_TRIGGER_THRESHOLD_KEY,
+          100L);
+      conf.setLong(OMConfigKeys.OZONE_OM_RATIS_LOG_PURGE_GAP, 200L);
       conf.setTimeDuration(
           OMConfigKeys.OZONE_OM_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_KEY,
           RATIS_LEADER_ELECTION_TIMEOUT, TimeUnit.MILLISECONDS);
@@ -259,14 +328,11 @@ public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
 
     /**
      * Start OM service with multiple OMs.
-     * @return list of OzoneManagers
-     * @throws IOException
-     * @throws AuthenticationException
      */
-    private Map<String, OzoneManager> createOMService() throws IOException,
+    protected List<OzoneManager> createOMService() throws IOException,
         AuthenticationException {
 
-      Map<String, OzoneManager> omMap = new HashMap<>();
+      List<OzoneManager> omList = Lists.newArrayList();
 
       int retryCount = 0;
       int basePort = 10000;
@@ -293,8 +359,10 @@ public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
             initializeOmStorage(omStore);
 
             OzoneManager om = OzoneManager.createOm(config);
-            om.setCertClient(certClient);
-            omMap.put(nodeId, om);
+            if (certClient != null) {
+              om.setCertClient(certClient);
+            }
+            omList.add(om);
 
             if (i <= numOfActiveOMs) {
               om.start();
@@ -311,24 +379,23 @@ public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
           // Set default OM address to point to the first OM. Clients would
           // try connecting to this address by default
           conf.set(OMConfigKeys.OZONE_OM_ADDRESS_KEY,
-              NetUtils.getHostPortString(omMap.get(nodeIdBaseStr + 1)
-                  .getOmRpcServerAddr()));
+              NetUtils.getHostPortString(omList.get(0).getOmRpcServerAddr()));
 
           break;
         } catch (BindException e) {
-          for (OzoneManager om : omMap.values()) {
+          for (OzoneManager om : omList) {
             om.stop();
             om.join();
             LOG.info("Stopping OzoneManager server at {}",
                 om.getOmRpcServerAddr());
           }
-          omMap.clear();
+          omList.clear();
           ++retryCount;
           LOG.info("MiniOzoneHACluster port conflicts, retried {} times",
                   retryCount);
         }
       }
-      return omMap;
+      return omList;
     }
 
     /**
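
The shutdown/restart helpers added above are what MiniOzoneOMChaosCluster drives. A hedged sketch of the same pattern from a test in the org.apache.hadoop.ozone package, assuming the usual MiniOzoneCluster.newHABuilder entry point (values illustrative, exceptions not handled):

    MiniOzoneHAClusterImpl haCluster = (MiniOzoneHAClusterImpl)
        MiniOzoneCluster.newHABuilder(new OzoneConfiguration())
            .setOMServiceId("omServiceId1")
            .setNumOfOzoneManagers(3)
            .build();
    haCluster.waitForClusterToBeReady();

    // Bounce OM #0; with waitForOM = true, restartOzoneManager polls
    // OzoneManager#isRunning via GenericTestUtils.waitFor until the OM is
    // back, timing out after two minutes.
    haCluster.shutdownOzoneManager(0);
    haCluster.restartOzoneManager(0, true);
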
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
index 0c036bc..627be1f 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
@@ -304,6 +304,14 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
 
   private boolean isNativeAuthorizerEnabled;
 
+  private enum State {
+    INITIALIZED,
+    RUNNING,
+    STOPPED
+  }
+  // Used in MiniOzoneCluster testing
+  private State omState;
+
   private OzoneManager(OzoneConfiguration conf) throws IOException,
       AuthenticationException {
     super(OzoneVersionInfo.OZONE_VERSION_INFO);
@@ -322,10 +330,10 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
     // In case of single OM Node Service there will be no OM Node ID
     // specified, set it to value from om storage
     if (this.omNodeDetails.getOMNodeId() == null) {
-      this.omNodeDetails =
-          OMHANodeDetails.getOMNodeDetails(conf, omNodeDetails.getOMServiceId(),
-              omStorage.getOmId(), omNodeDetails.getRpcAddress(),
-              omNodeDetails.getRatisPort());
+      this.omNodeDetails = OMHANodeDetails.getOMNodeDetails(conf,
+          omNodeDetails.getOMServiceId(),
+          omStorage.getOmId(), omNodeDetails.getRpcAddress(),
+          omNodeDetails.getRatisPort());
     }
 
     loginOMUserIfSecurityEnabled(conf);
@@ -413,7 +421,6 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
 
     this.omRatisSnapshotInfo = new OMRatisSnapshotInfo(
         omStorage.getCurrentDir());
-
     initializeRatisServer();
 
     if (isRatisEnabled) {
@@ -451,6 +458,7 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
     };
     ShutdownHookManager.get().addShutdownHook(shutdownHook,
         SHUTDOWN_HOOK_PRIORITY);
+    omState = State.INITIALIZED;
   }
 
   /**
@@ -1135,6 +1143,7 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
     }
     registerMXBean();
     setStartTime();
+    omState = State.RUNNING;
   }
 
   /**
@@ -1168,15 +1177,15 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
     metricsTimer = new Timer();
     metricsTimer.schedule(scheduleOMMetricsWriteTask, 0, period);
 
-    omRpcServer = getRpcServer(configuration);
-    omRpcServer.start();
-    isOmRpcServerRunning = true;
-
     initializeRatisServer();
     if (omRatisServer != null) {
       omRatisServer.start();
     }
 
+    omRpcServer = getRpcServer(configuration);
+    omRpcServer.start();
+    isOmRpcServerRunning = true;
+
     try {
       httpServer = new OzoneManagerHttpServer(configuration, this);
       httpServer.start();
@@ -1191,6 +1200,7 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
     jvmPauseMonitor.init(configuration);
     jvmPauseMonitor.start();
     setStartTime();
+    omState = State.RUNNING;
   }
 
   /**
@@ -1291,6 +1301,7 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
       if (jvmPauseMonitor != null) {
         jvmPauseMonitor.stop();
       }
+      omState = State.STOPPED;
     } catch (Exception e) {
       LOG.error("OzoneManager stop failed.", e);
     }
@@ -3367,4 +3378,9 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
   public boolean isNativeAuthorizerEnabled() {
     return isNativeAuthorizerEnabled;
   }
+
+  @VisibleForTesting
+  public boolean isRunning() {
+    return omState == State.RUNNING;
+  }
 }
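
Read together, the omState assignments in this file give tests a cheap liveness probe. The intended lifecycle, as a sketch (configuration setup and exception handling omitted):

    OzoneManager om = OzoneManager.createOm(conf); // omState = INITIALIZED
    om.start();                                    // omState = RUNNING
    om.isRunning();                                // true only while RUNNING
    om.stop();                                     // omState = STOPPED
    om.restart();                                  // RUNNING again

MiniOzoneHAClusterImpl#restartOzoneManager waits on exactly this isRunning() flag when bringing a failed OM back.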


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
