This is an automated email from the ASF dual-hosted git repository.
msingh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 410e4d3 HDDS-2339. Add OzoneManager to MiniOzoneChaosCluster (#643)
410e4d3 is described below
commit 410e4d3f5416f45f4bc6e85c1a2c46a8dcae2d39
Author: Hanisha Koneru <[email protected]>
AuthorDate: Tue Mar 31 10:42:12 2020 -0700
HDDS-2339. Add OzoneManager to MiniOzoneChaosCluster (#643)
---
.../apache/hadoop/ozone/MiniOzoneChaosCluster.java | 240 ++++++++++++++++-----
.../ozone/MiniOzoneDatanodeChaosCluster.java | 57 +++++
.../hadoop/ozone/MiniOzoneLoadGenerator.java | 9 +-
.../hadoop/ozone/MiniOzoneOMChaosCluster.java | 132 ++++++++++++
.../hadoop/ozone/TestMiniChaosOzoneCluster.java | 26 ++-
.../org/apache/hadoop/ozone/utils/LoadBucket.java | 17 +-
.../hadoop/ozone/MiniOzoneHAClusterImpl.java | 133 +++++++++---
.../org/apache/hadoop/ozone/om/OzoneManager.java | 34 ++-
8 files changed, 540 insertions(+), 108 deletions(-)
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
index 22cb3b4..65eb86d 100644
---
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java
@@ -18,11 +18,14 @@
package org.apache.hadoop.ozone;
+import java.util.Arrays;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.conf.StorageUnit;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
-import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -42,12 +45,21 @@ import java.util.concurrent.Executors;
/**
* This class causes random failures in the chaos cluster.
*/
-public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
+public abstract class MiniOzoneChaosCluster extends MiniOzoneHAClusterImpl {
static final Logger LOG =
LoggerFactory.getLogger(MiniOzoneChaosCluster.class);
private final int numDatanodes;
+ private final int numOzoneManagers;
+
+ // Number of Nodes of the service (Datanode or OM) on which chaos will be
+ // unleashed
+ private int numNodes;
+
+ private FailureService failureService;
+ private long failureIntervalInMS;
+
private final ScheduledExecutorService executorService;
private ScheduledFuture scheduledFuture;
@@ -57,38 +69,98 @@ public class MiniOzoneChaosCluster extends
MiniOzoneClusterImpl {
NODES_SHUTDOWN
}
+ /**
+ * The service on which chaos will be unleashed.
+ */
+ enum FailureService {
+ DATANODE,
+ OZONE_MANAGER;
+
+ /**
+ * Returns the display name of the service ("Datanode" or
+ * "OzoneManager"); {@link #of(String)} accepts the same names.
+ */
+ @Override
+ public String toString() {
+ if (this == DATANODE) {
+ return "Datanode";
+ } else {
+ return "OzoneManager";
+ }
+ }
+
+ /**
+ * Parses a service name, ignoring case.
+ * @param serviceName "Datanode" or "OzoneManager"
+ * @return the matching FailureService
+ * @throws IllegalArgumentException if serviceName is null or unrecognized
+ */
+ public static FailureService of(String serviceName) {
+ // Compare constant-first so that a null name is reported through the
+ // same IllegalArgumentException as any other unrecognized value
+ // instead of failing with a NullPointerException.
+ if ("Datanode".equalsIgnoreCase(serviceName)) {
+ return DATANODE;
+ } else if ("OzoneManager".equalsIgnoreCase(serviceName)) {
+ return OZONE_MANAGER;
+ }
+ throw new IllegalArgumentException("Unrecognized value for " +
+ "FailureService enum: " + serviceName);
+ }
+ }
+
+ /**
+ * Creates the chaos cluster on top of an already built set of services.
+ * The node count used for picking failure targets is not set here;
+ * subclasses register it through setNumNodes().
+ *
+ * @param conf cluster configuration
+ * @param ozoneManagers OzoneManagers of the cluster
+ * @param scm the StorageContainerManager
+ * @param hddsDatanodes datanodes of the cluster
+ * @param omServiceID OM service ID, passed through to the parent cluster
+ * @param service the service (Datanode or OM) on which failures are injected
+ */
public MiniOzoneChaosCluster(OzoneConfiguration conf,
- OzoneManager ozoneManager,
- StorageContainerManager scm,
- List<HddsDatanodeService> hddsDatanodes) {
- super(conf, ozoneManager, scm, hddsDatanodes);
+ List<OzoneManager> ozoneManagers, StorageContainerManager scm,
+ List<HddsDatanodeService> hddsDatanodes, String omServiceID,
+ FailureService service) {
+ super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID);
this.executorService = Executors.newSingleThreadScheduledExecutor();
this.numDatanodes = getHddsDatanodes().size();
- LOG.info("Starting MiniOzoneChaosCluster with {} datanodes", numDatanodes);
+ this.numOzoneManagers = ozoneManagers.size();
+ this.failureService = service;
+ LOG.info("Starting MiniOzoneChaosCluster with {} OzoneManagers and {} " +
+ "Datanodes, chaos on service: {}",
+ numOzoneManagers, numDatanodes, failureService);
+ }
+
+ /**
+ * @return the number of nodes of the service (Datanode or OM) on which
+ * chaos is unleashed, as registered through setNumNodes()
+ */
+ protected int getNumNodes() {
+ return numNodes;
+ }
+
+ /**
+ * Registers the number of nodes of the service on which chaos is
+ * unleashed; getNodeToFail() picks its index modulo this value.
+ * @param numOfNodes node count of the targeted service
+ */
+ protected void setNumNodes(int numOfNodes) {
+ this.numNodes = numOfNodes;
+ }
+
+ /**
+ * @return the failure period in milliseconds as recorded by startChaos();
+ * the field is not assigned before startChaos() has been called
+ */
+ protected long getFailureIntervalInMS() {
+ return failureIntervalInMS;
+ }
+
+ /**
+ * Is the cluster ready for chaos. fail() checks this before injecting a
+ * failure and calls getClusterReady() instead when it returns false.
+ * This base implementation always reports the cluster as ready.
+ */
+ protected boolean isClusterReady() {
+ return true;
}
- // Get the number of datanodes to fail in the cluster.
- private int getNumberOfNodesToFail() {
+ /**
+ * Hook invoked by fail() when isClusterReady() returns false, giving the
+ * cluster a chance to recover before the next failure round. The base
+ * implementation has nothing to recover.
+ */
+ protected void getClusterReady() {
+ // Do nothing
+ }
+
+ // Get the number of nodes to fail in the cluster: 1 or 2, chosen at
+ // random. Subclasses may override to bound the number of failures.
+ protected int getNumberOfNodesToFail() {
return RandomUtils.nextBoolean() ? 1 : 2;
}
- // Should the failed node wait for SCM to register the even before
+ // Should the failed node wait for SCM to register the event before
// restart, i.e fast restart or not.
+ // restartNodes() passes the result to restartNode() as waitForNodeRestart.
- private boolean isFastRestart() {
+ protected boolean isFastRestart() {
return RandomUtils.nextBoolean();
}
// Should the selected node be stopped or started.
+ // Consulted by the shutdown failure mode: true calls shutdownNode(),
+ // false calls restartNode() for the selected node.
- private boolean shouldStop() {
+ protected boolean shouldStop() {
return RandomUtils.nextBoolean();
}
- // Get the datanode index of the datanode to fail.
+ // Get the node index of the node to fail. The index is a random int taken
+ // modulo numNodes, the count registered through setNumNodes().
+ // NOTE(review): assumes RandomUtils.nextInt() never returns a negative
+ // value and that numNodes is non-zero - confirm.
private int getNodeToFail() {
- return RandomUtils.nextInt() % numDatanodes;
+ return RandomUtils.nextInt() % numNodes;
}
+ /**
+ * Restarts the node at the given index of the service under chaos.
+ * @param failedNodeIndex index of the node to restart
+ * @param waitForNodeRestart forwarded by the subclass to its restart
+ * call; restartNodes() passes the value of isFastRestart() here
+ */
+ protected abstract void restartNode(int failedNodeIndex,
+ boolean waitForNodeRestart)
+ throws TimeoutException, InterruptedException, IOException;
+
+ /**
+ * Shuts down the node at the given index of the service under chaos.
+ * @param failedNodeIndex index of the node to shut down
+ */
+ protected abstract void shutdownNode(int failedNodeIndex)
+ throws ExecutionException, InterruptedException;
+
+ /**
+ * @param failedNodeIndex index of the selected node
+ * @return an identifier of that node, used in the chaos log messages
+ */
+ protected abstract String getFailedNodeID(int failedNodeIndex);
+
+ // Restarts a random number of randomly chosen nodes of the service under
+ // chaos. A failure to restart one node is logged and does not stop the
+ // remaining restarts.
private void restartNodes() {
final int numNodesToFail = getNumberOfNodesToFail();
LOG.info("Will restart {} nodes to simulate failure", numNodesToFail);
@@ -96,15 +168,16 @@ public class MiniOzoneChaosCluster extends
MiniOzoneClusterImpl {
boolean failureMode = isFastRestart();
int failedNodeIndex = getNodeToFail();
String failString = failureMode ? "Fast" : "Slow";
- DatanodeDetails dn =
- getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails();
+ String failedNodeID = getFailedNodeID(failedNodeIndex);
try {
- LOG.info("{} Restarting DataNode: {}", failString, dn.getUuid());
- restartHddsDatanode(failedNodeIndex, failureMode);
- LOG.info("{} Completed restarting Datanode: {}", failString,
- dn.getUuid());
+ LOG.info("{} Restarting {}: {}", failString, failureService,
+ failedNodeID);
+ restartNode(failedNodeIndex, failureMode);
+ LOG.info("{} Completed restarting {}: {}", failString, failureService,
+ failedNodeID);
} catch (Exception e) {
- LOG.error("Failed to restartNodes Datanode {}", dn.getUuid(), e);
+ // Service first, node ID second - the same order as the messages above.
+ LOG.error("Failed to restartNodes {}: {}", failureService,
+ failedNodeID, e);
}
}
}
@@ -116,20 +189,19 @@ public class MiniOzoneChaosCluster extends
MiniOzoneClusterImpl {
boolean shouldStop = shouldStop();
int failedNodeIndex = getNodeToFail();
String stopString = shouldStop ? "Stopping" : "Restarting";
- DatanodeDetails dn =
- getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails();
+ String failedNodeID = getFailedNodeID(failedNodeIndex);
try {
- LOG.info("{} DataNode {}", stopString, dn.getUuid());
-
+ LOG.info("{} {} {}", stopString, failureService, failedNodeID);
if (shouldStop) {
- shutdownHddsDatanode(failedNodeIndex);
+ shutdownNode(failedNodeIndex);
} else {
- restartHddsDatanode(failedNodeIndex, true);
+ restartNode(failedNodeIndex, false);
}
- LOG.info("Completed {} DataNode {}", stopString, dn.getUuid());
-
+ LOG.info("Completed {} {} {}", stopString, failureService,
+ failedNodeID);
} catch (Exception e) {
- LOG.error("Failed {} Datanode {}", stopString, dn.getUuid(), e);
+ LOG.error("Failed {} {} {}", stopString, failureService,
+ failedNodeID, e);
}
}
}
@@ -141,24 +213,33 @@ public class MiniOzoneChaosCluster extends
MiniOzoneClusterImpl {
// Fail nodes randomly at configured timeout period.
private void fail() {
- FailureMode mode = getFailureMode();
- switch (mode) {
- case NODES_RESTART:
- restartNodes();
- break;
- case NODES_SHUTDOWN:
- shutdownNodes();
- break;
-
- default:
- LOG.error("invalid failure mode:{}", mode);
- break;
+ // Ask once per round whether the cluster can take a failure. Subclasses
+ // may override isClusterReady() with a check that has side effects
+ // (the OM chaos cluster flips an AtomicBoolean in it), so the result
+ // must not be re-queried within the same round.
+ if (isClusterReady()) {
+ FailureMode mode = getFailureMode();
+ switch (mode) {
+ case NODES_RESTART:
+ restartNodes();
+ break;
+ case NODES_SHUTDOWN:
+ shutdownNodes();
+ break;
+
+ default:
+ LOG.error("invalid failure mode:{}", mode);
+ break;
+ }
+ } else {
+ // Cluster is not ready for failure yet. Skip failing this time and get
+ // the cluster ready by restarting any OM that is not running.
+ LOG.info("Cluster is not ready for failure.");
+ getClusterReady();
}
}
+ /**
+ * Schedules fail() at a fixed rate on the cluster's single-threaded
+ * scheduler.
+ * @param initialDelay delay before the first failure round
+ * @param period time between the starts of successive failure rounds
+ * @param timeUnit unit of initialDelay and period
+ */
void startChaos(long initialDelay, long period, TimeUnit timeUnit) {
- LOG.info("Starting Chaos with failure period:{} unit:{} numDataNodes:{}",
- period, timeUnit, numDatanodes);
+ LOG.info("Starting Chaos with failure period:{} unit:{} numDataNodes:{} " +
+ "numOzoneManagers:{}", period, timeUnit, numDatanodes,
+ numOzoneManagers);
+ // Remember the period in milliseconds; subclasses read it through
+ // getFailureIntervalInMS().
+ this.failureIntervalInMS = TimeUnit.MILLISECONDS.convert(period, timeUnit);
scheduledFuture = executorService.scheduleAtFixedRate(this::fail,
initialDelay, period, timeUnit);
}
@@ -186,7 +267,9 @@ public class MiniOzoneChaosCluster extends
MiniOzoneClusterImpl {
/**
* Builder for configuring the MiniOzoneChaosCluster to run.
*/
- public static class Builder extends MiniOzoneClusterImpl.Builder {
+ public static class Builder extends MiniOzoneHAClusterImpl.Builder {
+
+ private FailureService failureService;
/**
* Creates a new Builder.
@@ -200,9 +283,7 @@ public class MiniOzoneChaosCluster extends
MiniOzoneClusterImpl {
/**
* Sets the number of HddsDatanodes to be started as part of
* MiniOzoneChaosCluster.
- *
* @param val number of datanodes
- *
* @return MiniOzoneChaosCluster.Builder
*/
public Builder setNumDatanodes(int val) {
@@ -210,7 +291,31 @@ public class MiniOzoneChaosCluster extends
MiniOzoneClusterImpl {
return this;
}
- @Override
+ /**
+ * Sets the number of OzoneManagers to be started as part of
+ * MiniOzoneChaosCluster. The number of active OMs is set to the same
+ * value.
+ * @param val number of OzoneManagers
+ * @return MiniOzoneChaosCluster.Builder
+ */
+ public Builder setNumOzoneManagers(int val) {
+ super.setNumOfOzoneManagers(val);
+ super.setNumOfActiveOMs(val);
+ return this;
+ }
+
+ /**
+ * Sets OM Service ID.
+ * @param omServiceID OM service ID, forwarded to the parent builder
+ * @return MiniOzoneChaosCluster.Builder
+ */
+ public Builder setOMServiceID(String omServiceID) {
+ super.setOMServiceId(omServiceID);
+ return this;
+ }
+
+ /**
+ * Sets the service on which chaos will be unleashed.
+ * @param serviceName service name as understood by FailureService.of(),
+ * i.e. "Datanode" or "OzoneManager" in any letter case
+ * @return MiniOzoneChaosCluster.Builder
+ * @throws IllegalArgumentException if the name is not recognized
+ */
+ public Builder setFailureService(String serviceName) {
+ this.failureService = FailureService.of(serviceName);
+ return this;
+ }
+
protected void initializeConfiguration() throws IOException {
super.initializeConfiguration();
conf.setStorageSize(ScmConfigKeys.OZONE_SCM_CHUNK_SIZE_KEY,
@@ -257,26 +362,47 @@ public class MiniOzoneChaosCluster extends
MiniOzoneClusterImpl {
@Override
public MiniOzoneChaosCluster build() throws IOException {
+
+ if (failureService == FailureService.OZONE_MANAGER && numOfOMs < 3) {
+ throw new IllegalArgumentException("Not enough number of " +
+ "OzoneManagers to test chaos on OzoneManagers. Set number of " +
+ "OzoneManagers to at least 3");
+ }
+
DefaultMetricsSystem.setMiniClusterMode(true);
initializeConfiguration();
+ if (numOfOMs > 1) {
+ initOMRatisConf();
+ }
+
StorageContainerManager scm;
- OzoneManager om;
+ List<OzoneManager> omList;
try {
scm = createSCM();
scm.start();
- om = createOM();
- if(certClient != null) {
- om.setCertClient(certClient);
+ if (numOfOMs > 1) {
+ omList = createOMService();
+ } else {
+ OzoneManager om = createOM();
+ om.start();
+ omList = Arrays.asList(om);
}
} catch (AuthenticationException ex) {
throw new IOException("Unable to build MiniOzoneCluster. ", ex);
}
- om.start();
- final List<HddsDatanodeService> hddsDatanodes =
- createHddsDatanodes(scm, null);
- MiniOzoneChaosCluster cluster =
- new MiniOzoneChaosCluster(conf, om, scm, hddsDatanodes);
+ final List<HddsDatanodeService> hddsDatanodes = createHddsDatanodes(
+ scm, null);
+
+ MiniOzoneChaosCluster cluster;
+ if (failureService == FailureService.DATANODE) {
+ cluster = new MiniOzoneDatanodeChaosCluster(conf, omList, scm,
+ hddsDatanodes, omServiceId);
+ } else {
+ cluster = new MiniOzoneOMChaosCluster(conf, omList, scm,
+ hddsDatanodes, omServiceId);
+ }
+
if (startDataNodes) {
cluster.startHddsDatanodes();
}
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java
new file mode 100644
index 0000000..f402831
--- /dev/null
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneDatanodeChaosCluster.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ozone;
+
+import java.util.List;
+import java.util.concurrent.TimeoutException;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
+import org.apache.hadoop.ozone.om.OzoneManager;
+
+/**
+ * Chaos cluster variant whose injected failures target the Datanodes.
+ */
+public class MiniOzoneDatanodeChaosCluster extends MiniOzoneChaosCluster {
+
+ /**
+ * Builds the cluster with Datanodes as the failure service and registers
+ * the number of datanodes as the number of nodes eligible for failure.
+ */
+ public MiniOzoneDatanodeChaosCluster(OzoneConfiguration conf,
+ List<OzoneManager> ozoneManagers,
+ StorageContainerManager scm,
+ List<HddsDatanodeService> hddsDatanodes,
+ String omServiceID) {
+ super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID,
+ FailureService.DATANODE);
+ final int datanodeCount = hddsDatanodes.size();
+ setNumNodes(datanodeCount);
+ }
+
+ /**
+ * Restarts the datanode at the given index.
+ */
+ @Override
+ protected void restartNode(int failedNodeIndex, boolean waitForNodeRestart)
+ throws TimeoutException, InterruptedException {
+ restartHddsDatanode(failedNodeIndex, waitForNodeRestart);
+ }
+
+ /**
+ * Shuts down the datanode at the given index.
+ */
+ @Override
+ protected void shutdownNode(int failedNodeIndex) {
+ shutdownHddsDatanode(failedNodeIndex);
+ }
+
+ /**
+ * Returns the UUID string of the datanode at the given index.
+ */
+ @Override
+ protected String getFailedNodeID(int failedNodeIndex) {
+ final HddsDatanodeService datanode =
+ getHddsDatanodes().get(failedNodeIndex);
+ return datanode.getDatanodeDetails().getUuidString();
+ }
+}
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneLoadGenerator.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneLoadGenerator.java
index d1256b1..62a1db2 100644
---
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneLoadGenerator.java
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneLoadGenerator.java
@@ -47,14 +47,16 @@ public class MiniOzoneLoadGenerator {
private final OzoneVolume volume;
private final OzoneConfiguration conf;
+ private final String omServiceID;
MiniOzoneLoadGenerator(OzoneVolume volume, int numClients, int numThreads,
- int numBuffers, OzoneConfiguration conf)
+ int numBuffers, OzoneConfiguration conf, String omServiceId)
throws Exception {
DataBuffer buffer = new DataBuffer(numBuffers);
loadExecutors = new ArrayList<>();
this.volume = volume;
this.conf = conf;
+ this.omServiceID = omServiceId;
// Random Load
String mixBucketName =
RandomStringUtils.randomAlphabetic(10).toLowerCase();
@@ -62,7 +64,7 @@ public class MiniOzoneLoadGenerator {
List<LoadBucket> ozoneBuckets = new ArrayList<>(numClients);
for (int i = 0; i < numClients; i++) {
ozoneBuckets.add(new LoadBucket(volume.getBucket(mixBucketName),
- conf));
+ conf, omServiceId));
}
RandomLoadGenerator loadGenerator =
new RandomLoadGenerator(buffer, ozoneBuckets);
@@ -82,7 +84,8 @@ public class MiniOzoneLoadGenerator {
throws Exception {
String bucketName = RandomStringUtils.randomAlphabetic(10).toLowerCase();
volume.createBucket(bucketName);
- LoadBucket bucket = new LoadBucket(volume.getBucket(bucketName), conf);
+ LoadBucket bucket = new LoadBucket(volume.getBucket(bucketName), conf,
+ omServiceID);
LoadGenerator loadGenerator = function.apply(bucket);
loadExecutors.add(new LoadExecutors(numThreads, loadGenerator));
}
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java
new file mode 100644
index 0000000..2b2a4d7
--- /dev/null
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ozone;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.apache.commons.lang3.RandomUtils;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
+import org.apache.hadoop.ozone.om.OzoneManager;
+
+/**
+ * This class causes random failures in OMs in the chaos cluster.
+ */
+public class MiniOzoneOMChaosCluster extends MiniOzoneChaosCluster {
+
+ // Cluster is deemed ready for chaos when all the OMs are up and running.
+ private final AtomicBoolean isClusterReady = new AtomicBoolean(true);
+
+ // The maximum number of nodes failures which can be tolerated without
+ // losing quorum. This should be equal to (Num of OMs - 1)/2.
+ private final int numOfOMNodeFailuresTolerated;
+
+ /**
+ * Builds the cluster with OzoneManagers as the failure service and
+ * registers the number of OMs as the number of nodes eligible for failure.
+ */
+ MiniOzoneOMChaosCluster(OzoneConfiguration conf,
+ List<OzoneManager> ozoneManagers,
+ StorageContainerManager scm,
+ List<HddsDatanodeService> hddsDatanodes,
+ String omServiceID) {
+ super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID,
+ FailureService.OZONE_MANAGER);
+ setNumNodes(ozoneManagers.size());
+ numOfOMNodeFailuresTolerated = (getNumNodes() - 1) / 2;
+ }
+
+ /**
+ * Check if cluster is ready for a restart or shutdown of an OM node. If
+ * yes, then set isClusterReady to false so that another thread cannot
+ * restart/ shutdown OM till all OMs are up again.
+ */
+ @Override
+ protected boolean isClusterReady() {
+ return isClusterReady.compareAndSet(true, false);
+ }
+
+ /**
+ * If any OM node is not running, restart it. The cluster is marked ready
+ * again only when no restart attempt failed.
+ */
+ @Override
+ protected void getClusterReady() {
+ boolean clusterReady = true;
+ for (OzoneManager om : getOzoneManagersList()) {
+ if (!om.isRunning()) {
+ try {
+ restartOzoneManager(om, true);
+ } catch (Exception e) {
+ clusterReady = false;
+ // The exception is the trailing argument and must not have a
+ // placeholder of its own, so that its stack trace is logged.
+ LOG.error("Cluster not ready for chaos. Failed to restart OM {}",
+ om.getOMNodeId(), e);
+ }
+ }
+ }
+ if (clusterReady) {
+ isClusterReady.set(true);
+ }
+ }
+
+ /**
+ * Picks between 1 and numOfOMNodeFailuresTolerated nodes to fail, so that
+ * a failure round stays within the tolerated number of OM failures.
+ */
+ @Override
+ protected int getNumberOfNodesToFail() {
+ return RandomUtils.nextInt(1, numOfOMNodeFailuresTolerated + 1);
+ }
+
+ /**
+ * Stops and restarts the OM at the given index, then restarts any other
+ * OM that is not running.
+ */
+ @Override
+ protected void restartNode(int failedNodeIndex, boolean waitForNodeRestart)
+ throws IOException, TimeoutException, InterruptedException {
+ shutdownOzoneManager(failedNodeIndex);
+ restartOzoneManager(failedNodeIndex, waitForNodeRestart);
+ getClusterReady();
+ }
+
+ /**
+ * For OM chaos, a shutdown node should eventually be restarted before the
+ * next failure. This call blocks until the delayed restart has run.
+ */
+ @Override
+ protected void shutdownNode(int failedNodeIndex)
+ throws ExecutionException, InterruptedException {
+ shutdownOzoneManager(failedNodeIndex);
+
+ // Restart the OM after FailureInterval / 2 duration.
+ ScheduledExecutorService restartExecutor =
+ Executors.newSingleThreadScheduledExecutor();
+ try {
+ restartExecutor.schedule(this::getClusterReady,
+ getFailureIntervalInMS() / 2, TimeUnit.MILLISECONDS).get();
+ } finally {
+ // The executor is used for this one task only. Without a shutdown its
+ // worker thread would stay alive, leaking one thread per OM shutdown.
+ restartExecutor.shutdown();
+ }
+ }
+
+ /**
+ * Returns the OM node ID of the OzoneManager at the given index.
+ */
+ @Override
+ protected String getFailedNodeID(int failedNodeIndex) {
+ return getOzoneManager(failedNodeIndex).getOMNodeId();
+ }
+
+ /**
+ * When restarting OM, always wait for the restarted OM to be running
+ * again before continuing.
+ */
+ @Override
+ protected boolean isFastRestart() {
+ return true;
+ }
+
+ /**
+ * In the shutdown failure mode always stop the selected OM; shutdownNode()
+ * takes care of bringing it back.
+ */
+ @Override
+ protected boolean shouldStop() {
+ return true;
+ }
+}
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
index 0fa9a14..53718e4 100644
---
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/TestMiniChaosOzoneCluster.java
@@ -21,6 +21,7 @@ import org.apache.commons.lang3.RandomStringUtils;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.ozone.client.ObjectStore;
import org.apache.hadoop.ozone.client.OzoneVolume;
+import org.apache.hadoop.ozone.MiniOzoneChaosCluster.FailureService;
import org.junit.BeforeClass;
import org.junit.AfterClass;
import org.junit.Ignore;
@@ -43,6 +44,15 @@ public class TestMiniChaosOzoneCluster implements Runnable {
description = "num of datanodes")
private static int numDatanodes = 20;
+ @Option(names = {"-o", "--numOzoneManager"},
+ description = "num of ozoneManagers")
+ private static int numOzoneManagers = 1;
+
+ @Option(names = {"-s", "--failureService"},
+ description = "service (datanode or ozoneManager) to test chaos on",
+ defaultValue = "datanode")
+ private static String failureService = "datanode";
+
@Option(names = {"-t", "--numThreads"},
description = "num of IO threads")
private static int numThreads = 5;
@@ -61,16 +71,26 @@ public class TestMiniChaosOzoneCluster implements Runnable {
@Option(names = {"-i", "--failureInterval"},
description = "time between failure events in seconds")
- private static int failureInterval = 300; // 5 second period between
failures.
+ private static int failureInterval = 300; // 5 minute period between
failures.
private static MiniOzoneChaosCluster cluster;
private static MiniOzoneLoadGenerator loadGenerator;
+ private static final String OM_SERVICE_ID = "ozoneChaosTest";
+
@BeforeClass
public static void init() throws Exception {
OzoneConfiguration configuration = new OzoneConfiguration();
+ String omServiceID =
+ FailureService.of(failureService) == FailureService.OZONE_MANAGER ?
+ OM_SERVICE_ID : null;
+
cluster = new MiniOzoneChaosCluster.Builder(configuration)
- .setNumDatanodes(numDatanodes).build();
+ .setNumDatanodes(numDatanodes)
+ .setNumOzoneManagers(numOzoneManagers)
+ .setFailureService(failureService)
+ .setOMServiceID(omServiceID)
+ .build();
cluster.waitForClusterToBeReady();
String volumeName = RandomStringUtils.randomAlphabetic(10).toLowerCase();
@@ -80,7 +100,7 @@ public class TestMiniChaosOzoneCluster implements Runnable {
loadGenerator =
new MiniOzoneLoadGenerator(volume, numClients, numThreads,
- numBuffers, configuration);
+ numBuffers, configuration, omServiceID);
}
/**
diff --git
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/utils/LoadBucket.java
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/utils/LoadBucket.java
index 2fb92d1..8e1ef31 100644
---
a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/utils/LoadBucket.java
+++
b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/utils/LoadBucket.java
@@ -49,10 +49,15 @@ public class LoadBucket {
private final OzoneBucket bucket;
private final OzoneFileSystem fs;
+ /**
+ * Wraps a bucket together with a file system handle on the same bucket.
+ * @param bucket the bucket to generate load on
+ * @param conf configuration used to obtain the file system
+ * @param omServiceID OM service ID to include in the file system URI, or
+ * null to use the URI without a service ID
+ */
- public LoadBucket(OzoneBucket bucket, OzoneConfiguration conf)
- throws Exception {
+ public LoadBucket(OzoneBucket bucket, OzoneConfiguration conf,
+ String omServiceID) throws Exception {
this.bucket = bucket;
- this.fs = (OzoneFileSystem)FileSystem.get(getFSUri(bucket), conf);
+ if (omServiceID == null) {
+ this.fs = (OzoneFileSystem) FileSystem.get(getFSUri(bucket), conf);
+ } else {
+ this.fs = (OzoneFileSystem) FileSystem.get(getFSUri(bucket, omServiceID),
+ conf);
+ }
}
private boolean isFsOp() {
@@ -97,6 +102,12 @@ public class LoadBucket {
bucket.getName(), bucket.getVolumeName()));
}
+ /**
+ * Builds the file system URI of a bucket when an OM service ID is in use:
+ * scheme://bucketName.volumeName.omServiceID/
+ */
+ private static URI getFSUri(OzoneBucket bucket, String omServiceID)
+ throws URISyntaxException {
+ final String authority = String.join(".", bucket.getName(),
+ bucket.getVolumeName(), omServiceID);
+ return new URI(OzoneConsts.OZONE_URI_SCHEME + "://" + authority + "/");
+ }
+
abstract class Op {
private final boolean fsOp;
private final String opName;
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
index 8a74a19..56c9be1 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java
@@ -18,6 +18,10 @@
package org.apache.hadoop.ozone;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -29,16 +33,17 @@ import org.apache.hadoop.ozone.om.OMStorage;
import org.apache.hadoop.ozone.om.OzoneManager;
import org.apache.hadoop.ozone.recon.ReconServer;
import
org.apache.hadoop.security.authentication.client.AuthenticationException;
+import org.apache.hadoop.test.GenericTestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.BindException;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
+import java.util.concurrent.TimeoutException;
import java.util.concurrent.TimeUnit;
import static org.apache.hadoop.hdds.HddsConfigKeys.OZONE_METADATA_DIRS;
@@ -48,7 +53,7 @@ import static
org.apache.hadoop.hdds.HddsConfigKeys.OZONE_METADATA_DIRS;
* with OM HA suitable for running tests. The cluster consists of a set of
* OzoneManagers, StorageContainerManager and multiple DataNodes.
*/
-public final class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
+public class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl {
private static final Logger LOG =
LoggerFactory.getLogger(MiniOzoneHAClusterImpl.class);
@@ -61,20 +66,21 @@ public final class MiniOzoneHAClusterImpl extends
MiniOzoneClusterImpl {
private List<OzoneManager> activeOMs;
private List<OzoneManager> inactiveOMs;
+ private int waitForOMToBeReadyTimeout = 120000; // 2 min
+
private static final Random RANDOM = new Random();
private static final int RATIS_LEADER_ELECTION_TIMEOUT = 1000; // 1 seconds
public static final int NODE_FAILURE_TIMEOUT = 2000; // 2 seconds
/**
- * Creates a new MiniOzoneCluster with OM HA.
+ * Creates a new MiniOzoneCluster.
*
* @throws IOException if there is an I/O error
*/
@SuppressWarnings("checkstyle:ParameterNumber")
private MiniOzoneHAClusterImpl(
OzoneConfiguration conf,
- Map<String, OzoneManager> omMap,
List<OzoneManager> activeOMList,
List<OzoneManager> inactiveOMList,
StorageContainerManager scm,
@@ -82,11 +88,40 @@ public final class MiniOzoneHAClusterImpl extends
MiniOzoneClusterImpl {
String omServiceId,
ReconServer reconServer) {
super(conf, scm, hddsDatanodes, reconServer);
- this.ozoneManagerMap = omMap;
- this.ozoneManagers = new ArrayList<>(omMap.values());
+
+ this.ozoneManagerMap = Maps.newHashMap();
+ if (activeOMList != null) {
+ for (OzoneManager om : activeOMList) {
+ this.ozoneManagerMap.put(om.getOMNodeId(), om);
+ }
+ }
+ if (inactiveOMList != null) {
+ for (OzoneManager om : inactiveOMList) {
+ this.ozoneManagerMap.put(om.getOMNodeId(), om);
+ }
+ }
+ this.ozoneManagers = new ArrayList<>(ozoneManagerMap.values());
this.activeOMs = activeOMList;
this.inactiveOMs = inactiveOMList;
this.omServiceId = omServiceId;
+
+ // If the serviceID is null, then this should be a non-HA cluster.
+ if (omServiceId == null) {
+ Preconditions.checkArgument(ozoneManagers.size() <= 1);
+ }
+ }
+
+ /**
+ * Creates a new MiniOzoneCluster with all OMs active.
+ * This is used by MiniOzoneChaosCluster.
+ * The given OMs are passed on as the active OM list; the inactive OM
+ * list and the Recon server are passed as null.
+ */
+ protected MiniOzoneHAClusterImpl(
+ OzoneConfiguration conf,
+ List<OzoneManager> omList,
+ StorageContainerManager scm,
+ List<HddsDatanodeService> hddsDatanodes,
+ String omServiceId) {
+ this(conf, omList, null, scm, hddsDatanodes, omServiceId, null);
}
@Override
@@ -105,7 +140,13 @@ public final class MiniOzoneHAClusterImpl extends
MiniOzoneClusterImpl {
+ /**
+ * Returns an RPC client for this cluster. Without an OM service ID the
+ * client is created from the configuration alone; otherwise it is
+ * created for that service ID.
+ */
@Override
public OzoneClient getRpcClient() throws IOException {
- return OzoneClientFactory.getRpcClient(getServiceId(), getConf());
+ if (omServiceId == null) {
+ // Non-HA cluster.
+ return OzoneClientFactory.getRpcClient(getConf());
+ } else {
+ // HA cluster
+ return OzoneClientFactory.getRpcClient(omServiceId, getConf());
+ }
}
public boolean isOMActive(String omNodeId) {
@@ -120,6 +161,10 @@ public final class MiniOzoneHAClusterImpl extends
MiniOzoneClusterImpl {
return this.ozoneManagerMap.get(omNodeId);
}
+ /**
+ * @return the list of all OzoneManagers of this cluster. This is the
+ * cluster's own list, not a copy.
+ */
+ public List<OzoneManager> getOzoneManagersList() {
+ return ozoneManagers;
+ }
+
/**
* Get OzoneManager leader object.
* @return OzoneManager object, null if there isn't one or more than one
@@ -162,6 +207,30 @@ public final class MiniOzoneHAClusterImpl extends
MiniOzoneClusterImpl {
}
}
+ /**
+ * Stops the OzoneManager at the given index of the OM list.
+ * @param omNodeIndex index into the cluster's list of OzoneManagers
+ */
+ void shutdownOzoneManager(int omNodeIndex) {
+ OzoneManager ozoneManager = ozoneManagers.get(omNodeIndex);
+ // Parameterized message, consistent with the other SLF4J calls.
+ LOG.info("Shutting down OzoneManager {}", ozoneManager.getOMNodeId());
+
+ ozoneManager.stop();
+ }
+
+ /**
+ * Restarts the OzoneManager found at the given index of the OM list.
+ * @param omNodeIndex index into the cluster's list of OzoneManagers
+ * @param waitForOM whether to wait until the OM reports it is running
+ */
+ void restartOzoneManager(int omNodeIndex, boolean waitForOM)
+ throws IOException, TimeoutException, InterruptedException {
+ restartOzoneManager(ozoneManagers.get(omNodeIndex), waitForOM);
+ }
+
+ /**
+ * Restarts the given OzoneManager.
+ * @param ozoneManager the OM to restart
+ * @param waitForOM if true, poll every second until the OM reports that
+ * it is running, for at most waitForOMToBeReadyTimeout milliseconds
+ */
+ void restartOzoneManager(OzoneManager ozoneManager, boolean waitForOM)
+ throws IOException, TimeoutException, InterruptedException {
+ // Parameterized message, consistent with the other SLF4J calls.
+ LOG.info("Restarting OzoneManager {}", ozoneManager.getOMNodeId());
+ ozoneManager.restart();
+
+ if (waitForOM) {
+ GenericTestUtils.waitFor(ozoneManager::isRunning,
+ 1000, waitForOMToBeReadyTimeout);
+ }
+ }
+
@Override
public void stop() {
for (OzoneManager ozoneManager : ozoneManagers) {
@@ -211,15 +280,16 @@ public final class MiniOzoneHAClusterImpl extends
MiniOzoneClusterImpl {
if (numOfActiveOMs == ACTIVE_OMS_NOT_SET) {
numOfActiveOMs = numOfOMs;
}
+
DefaultMetricsSystem.setMiniClusterMode(true);
initializeConfiguration();
+ initOMRatisConf();
StorageContainerManager scm;
- Map<String, OzoneManager> omMap;
ReconServer reconServer = null;
try {
scm = createSCM();
scm.start();
- omMap = createOMService();
+ createOMService();
if (includeRecon) {
configureRecon();
reconServer = new ReconServer();
@@ -229,26 +299,25 @@ public final class MiniOzoneHAClusterImpl extends
MiniOzoneClusterImpl {
throw new IOException("Unable to build MiniOzoneCluster. ", ex);
}
- final List<HddsDatanodeService> hddsDatanodes = createHddsDatanodes(scm,
- reconServer);
- MiniOzoneHAClusterImpl cluster = new MiniOzoneHAClusterImpl(
- conf, omMap, activeOMs, inactiveOMs, scm, hddsDatanodes,
- omServiceId, reconServer);
+ final List<HddsDatanodeService> hddsDatanodes = createHddsDatanodes(
+ scm, reconServer);
+
+ MiniOzoneHAClusterImpl cluster = new MiniOzoneHAClusterImpl(conf,
+ activeOMs, inactiveOMs, scm, hddsDatanodes, omServiceId,
reconServer);
+
if (startDataNodes) {
cluster.startHddsDatanodes();
}
return cluster;
}
- /**
- * Initialize OM configurations.
- * @throws IOException
- */
- @Override
- protected void initializeConfiguration() throws IOException {
- super.initializeConfiguration();
+ protected void initOMRatisConf() {
conf.setBoolean(OMConfigKeys.OZONE_OM_RATIS_ENABLE_KEY, true);
conf.setInt(OMConfigKeys.OZONE_OM_HANDLER_COUNT_KEY, numOfOmHandlers);
+ conf.setLong(
+ OMConfigKeys.OZONE_OM_RATIS_SNAPSHOT_AUTO_TRIGGER_THRESHOLD_KEY,
+ 100L);
+ conf.setLong(OMConfigKeys.OZONE_OM_RATIS_LOG_PURGE_GAP, 200L);
conf.setTimeDuration(
OMConfigKeys.OZONE_OM_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_KEY,
RATIS_LEADER_ELECTION_TIMEOUT, TimeUnit.MILLISECONDS);
@@ -259,14 +328,11 @@ public final class MiniOzoneHAClusterImpl extends
MiniOzoneClusterImpl {
/**
* Start OM service with multiple OMs.
- * @return list of OzoneManagers
- * @throws IOException
- * @throws AuthenticationException
*/
- private Map<String, OzoneManager> createOMService() throws IOException,
+ protected List<OzoneManager> createOMService() throws IOException,
AuthenticationException {
- Map<String, OzoneManager> omMap = new HashMap<>();
+ List<OzoneManager> omList = Lists.newArrayList();
int retryCount = 0;
int basePort = 10000;
@@ -293,8 +359,10 @@ public final class MiniOzoneHAClusterImpl extends
MiniOzoneClusterImpl {
initializeOmStorage(omStore);
OzoneManager om = OzoneManager.createOm(config);
- om.setCertClient(certClient);
- omMap.put(nodeId, om);
+ if (certClient != null) {
+ om.setCertClient(certClient);
+ }
+ omList.add(om);
if (i <= numOfActiveOMs) {
om.start();
@@ -311,24 +379,23 @@ public final class MiniOzoneHAClusterImpl extends
MiniOzoneClusterImpl {
// Set default OM address to point to the first OM. Clients would
// try connecting to this address by default
conf.set(OMConfigKeys.OZONE_OM_ADDRESS_KEY,
- NetUtils.getHostPortString(omMap.get(nodeIdBaseStr + 1)
- .getOmRpcServerAddr()));
+ NetUtils.getHostPortString(omList.get(0).getOmRpcServerAddr()));
break;
} catch (BindException e) {
- for (OzoneManager om : omMap.values()) {
+ for (OzoneManager om : omList) {
om.stop();
om.join();
LOG.info("Stopping OzoneManager server at {}",
om.getOmRpcServerAddr());
}
- omMap.clear();
+ omList.clear();
++retryCount;
LOG.info("MiniOzoneHACluster port conflicts, retried {} times",
retryCount);
}
}
- return omMap;
+ return omList;
}
/**
diff --git
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
index 0c036bc..627be1f 100644
---
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
+++
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
@@ -304,6 +304,14 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
private boolean isNativeAuthorizerEnabled;
+ private enum State {
+ INITIALIZED, // assigned right after the shutdown hook is registered
+ RUNNING, // assigned immediately after setStartTime()
+ STOPPED // assigned last inside stop's try block; skipped if it throws
+ }
+ // Used in MiniOzoneCluster testing; read through isRunning().
+ private State omState;
+
private OzoneManager(OzoneConfiguration conf) throws IOException,
AuthenticationException {
super(OzoneVersionInfo.OZONE_VERSION_INFO);
@@ -322,10 +330,10 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
// In case of single OM Node Service there will be no OM Node ID
// specified, set it to value from om storage
if (this.omNodeDetails.getOMNodeId() == null) {
- this.omNodeDetails =
- OMHANodeDetails.getOMNodeDetails(conf,
omNodeDetails.getOMServiceId(),
- omStorage.getOmId(), omNodeDetails.getRpcAddress(),
- omNodeDetails.getRatisPort());
+ this.omNodeDetails = OMHANodeDetails.getOMNodeDetails(conf,
+ omNodeDetails.getOMServiceId(),
+ omStorage.getOmId(), omNodeDetails.getRpcAddress(),
+ omNodeDetails.getRatisPort());
}
loginOMUserIfSecurityEnabled(conf);
@@ -413,7 +421,6 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
this.omRatisSnapshotInfo = new OMRatisSnapshotInfo(
omStorage.getCurrentDir());
-
initializeRatisServer();
if (isRatisEnabled) {
@@ -451,6 +458,7 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
};
ShutdownHookManager.get().addShutdownHook(shutdownHook,
SHUTDOWN_HOOK_PRIORITY);
+ omState = State.INITIALIZED;
}
/**
@@ -1135,6 +1143,7 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
}
registerMXBean();
setStartTime();
+ omState = State.RUNNING;
}
/**
@@ -1168,15 +1177,15 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
metricsTimer = new Timer();
metricsTimer.schedule(scheduleOMMetricsWriteTask, 0, period);
- omRpcServer = getRpcServer(configuration);
- omRpcServer.start();
- isOmRpcServerRunning = true;
-
initializeRatisServer();
if (omRatisServer != null) {
omRatisServer.start();
}
+ omRpcServer = getRpcServer(configuration);
+ omRpcServer.start();
+ isOmRpcServerRunning = true;
+
try {
httpServer = new OzoneManagerHttpServer(configuration, this);
httpServer.start();
@@ -1191,6 +1200,7 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
jvmPauseMonitor.init(configuration);
jvmPauseMonitor.start();
setStartTime();
+ omState = State.RUNNING;
}
/**
@@ -1291,6 +1301,7 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
if (jvmPauseMonitor != null) {
jvmPauseMonitor.stop();
}
+ omState = State.STOPPED;
} catch (Exception e) {
LOG.error("OzoneManager stop failed.", e);
}
@@ -3367,4 +3378,9 @@ public final class OzoneManager extends
ServiceRuntimeInfoImpl
public boolean isNativeAuthorizerEnabled() {
return isNativeAuthorizerEnabled;
}
+
+ /**
+  * Reports whether this OzoneManager's recorded state is RUNNING.
+  *
+  * @return true only when {@code omState} is {@code State.RUNNING};
+  *     false for every other value, including an unset state
+  */
+ @VisibleForTesting
+ public boolean isRunning() {
+   return State.RUNNING == omState;
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]