This is an automated email from the ASF dual-hosted git repository.
nanda pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 403caf0361 HDDS-9022. DiskChecker incorrectly reporting errors. (#5086)
403caf0361 is described below
commit 403caf03615722545d9e6fcf784b78deebc3425d
Author: Ethan Rose <[email protected]>
AuthorDate: Sat Jul 22 19:16:22 2023 -0700
HDDS-9022. DiskChecker incorrectly reporting errors. (#5086)
---
.../common/statemachine/DatanodeConfiguration.java | 4 +-
.../container/common/utils/DiskCheckUtil.java | 18 +-
.../container/common/volume/MetadataVolume.java | 15 +
.../container/common/volume/MutableVolumeSet.java | 15 +-
.../container/common/volume/StorageVolume.java | 5 +-
.../common/volume/StorageVolumeChecker.java | 47 ++-
.../ozone/container/ozoneimpl/OzoneContainer.java | 19 +-
.../container/common/volume/TestHddsVolume.java | 27 ++
.../container/common/volume/TestStorageVolume.java | 268 ----------------
.../volume/TestStorageVolumeHealthChecks.java | 346 +++++++++++++++++++++
.../common/volume/TestVolumeSetDiskChecks.java | 8 +
.../TestDatanodeHddsVolumeFailureToleration.java | 30 +-
12 files changed, 477 insertions(+), 325 deletions(-)
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeConfiguration.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeConfiguration.java
index 164af7f31c..e5f0046c05 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeConfiguration.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeConfiguration.java
@@ -43,7 +43,7 @@ public class DatanodeConfiguration {
static final String PERIODIC_DISK_CHECK_INTERVAL_MINUTES_KEY =
"hdds.datanode.periodic.disk.check.interval.minutes";
public static final String DISK_CHECK_FILE_SIZE_KEY =
- "hdds.datanode.disk.check.file.size";
+ "hdds.datanode.disk.check.io.file.size";
public static final String DISK_CHECK_IO_TEST_COUNT_KEY =
"hdds.datanode.disk.check.io.test.count";
public static final String DISK_CHECK_IO_FAILURES_TOLERATED_KEY =
@@ -301,7 +301,7 @@ public class DatanodeConfiguration {
private int volumeIOFailureTolerance =
DISK_CHECK_IO_FAILURES_TOLERATED_DEFAULT;
- @Config(key = "disk.check.file.size",
+ @Config(key = "disk.check.io.file.size",
defaultValue = "100B",
type = ConfigType.SIZE,
tags = { DATANODE },
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/DiskCheckUtil.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/DiskCheckUtil.java
index b267b1d479..b567841c02 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/DiskCheckUtil.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/DiskCheckUtil.java
@@ -139,15 +139,15 @@ public final class DiskCheckUtil {
fos.getFD().sync();
} catch (FileNotFoundException notFoundEx) {
logError(storageDir, String.format("Could not find file %s for " +
- "volume check.", testFile), notFoundEx);
+ "volume check.", testFile.getAbsolutePath()), notFoundEx);
return false;
} catch (SyncFailedException syncEx) {
logError(storageDir, String.format("Could sync file %s to disk.",
- testFile), syncEx);
+ testFile.getAbsolutePath()), syncEx);
return false;
} catch (IOException ioEx) {
logError(storageDir, String.format("Could not write file %s " +
- "for volume check.", testFile), ioEx);
+ "for volume check.", testFile.getAbsolutePath()), ioEx);
return false;
}
@@ -157,17 +157,17 @@ public final class DiskCheckUtil {
int numBytesRead = fis.read(readBytes);
if (numBytesRead != numBytesToWrite) {
logError(storageDir, String.format("%d bytes written to file %s " +
- "but %d bytes were read back.", numBytesToWrite, testFile,
- numBytesRead));
+ "but %d bytes were read back.", numBytesToWrite,
+ testFile.getAbsolutePath(), numBytesRead));
return false;
}
} catch (FileNotFoundException notFoundEx) {
logError(storageDir, String.format("Could not find file %s " +
- "for volume check.", testFile), notFoundEx);
+ "for volume check.", testFile.getAbsolutePath()), notFoundEx);
return false;
} catch (IOException ioEx) {
logError(storageDir, String.format("Could not read file %s " +
- "for volume check.", testFile), ioEx);
+ "for volume check.", testFile.getAbsolutePath()), ioEx);
return false;
}
@@ -175,14 +175,14 @@ public final class DiskCheckUtil {
if (!Arrays.equals(writtenBytes, readBytes)) {
logError(storageDir, String.format("%d Bytes read from file " +
"%s do not match the %d bytes that were written.",
- writtenBytes.length, testFile, readBytes.length));
+ writtenBytes.length, testFile.getAbsolutePath(), readBytes.length));
return false;
}
// Delete the file.
if (!testFile.delete()) {
logError(storageDir, String.format("Could not delete file %s " +
- "for volume check.", testFile));
+ "for volume check.", testFile.getAbsolutePath()));
return false;
}
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MetadataVolume.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MetadataVolume.java
index c5b399b662..d7f22b42be 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MetadataVolume.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MetadataVolume.java
@@ -31,12 +31,27 @@ public class MetadataVolume extends StorageVolume {
protected MetadataVolume(Builder b) throws IOException {
super(b);
+ // Tmp directory on Metadata Volume uses the volume root as the working
+ // directory. It is not dependent on the cluster ID from SCM.
+ super.createTmpDirs("");
}
public VolumeType getType() {
return type;
}
+ @Override
+ public void format(String cid) throws IOException {
+ // No-op for Metadata volumes.
+ }
+
+ @Override
+ public void createTmpDirs(String workingDirName) {
+ // No-op for metadata volumes.
+ // Tmp directory is created on construction since it is not dependent on
+ // getting the cluster ID from SCM.
+ }
+
/**
* Builder class for MetadataVolume.
*/
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java
index f4a5d0b9ad..985ddea8de 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/MutableVolumeSet.java
@@ -89,7 +89,6 @@ public class MutableVolumeSet implements VolumeSet {
private final StorageVolumeFactory volumeFactory;
private final StorageVolume.VolumeType volumeType;
private int maxVolumeFailuresTolerated;
- private boolean initialized;
public MutableVolumeSet(String dnUuid, ConfigurationSource conf,
StateContext context, StorageVolume.VolumeType volumeType,
@@ -101,7 +100,6 @@ public class MutableVolumeSet implements VolumeSet {
ConfigurationSource conf, StateContext context,
StorageVolume.VolumeType volumeType, StorageVolumeChecker volumeChecker
) throws IOException {
- this.initialized = false;
this.context = context;
this.datanodeUuid = dnUuid;
this.clusterID = clusterID;
@@ -198,9 +196,6 @@ public class MutableVolumeSet implements VolumeSet {
if (volumeMap.size() == 0) {
throw new DiskOutOfSpaceException("No storage locations configured");
}
-
- checkAllVolumes();
- initialized = true;
}
/**
@@ -253,15 +248,7 @@ public class MutableVolumeSet implements VolumeSet {
// check failed volume tolerated
if (!hasEnoughVolumes()) {
- // on startup, we could not try to stop uninitialized services
- if (!initialized) {
- throw new IOException("Don't have enough good volumes on startup,"
- + " bad volumes detected: " + failedVolumes.size()
- + " max tolerated: " + maxVolumeFailuresTolerated);
- }
- if (context != null) {
- context.getParent().handleFatalVolumeFailures();
- }
+ context.getParent().handleFatalVolumeFailures();
}
} finally {
this.writeUnlock();
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
index 95d1b2c2de..dd48f0bb17 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
@@ -653,13 +653,14 @@ public abstract class StorageVolume
// The failure counts can be left as is.
if (currentIOFailureCount.get() > ioFailureTolerance) {
LOG.info("Failed IO test for volume {}: the last {} runs " +
- "encountered {}/{} tolerated failures.", this,
+ "encountered {} out of {} tolerated failures.", this,
ioTestSlidingWindow.size(), currentIOFailureCount,
ioFailureTolerance);
return VolumeCheckResult.FAILED;
} else if (LOG.isDebugEnabled()) {
LOG.debug("IO test results for volume {}: the last {} runs encountered " +
- "{}/{} tolerated failures", this, ioTestSlidingWindow.size(),
+ "{} out of {} tolerated failures", this,
+ ioTestSlidingWindow.size(),
currentIOFailureCount, ioFailureTolerance);
}
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolumeChecker.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolumeChecker.java
index d9869894b2..bcb11fddf0 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolumeChecker.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolumeChecker.java
@@ -34,6 +34,7 @@ import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import com.google.common.util.concurrent.MoreExecutors;
@@ -91,13 +92,17 @@ public class StorageVolumeChecker {
private final ExecutorService checkVolumeResultHandlerExecutorService;
+ private final DatanodeConfiguration dnConf;
+
/**
* An executor for periodic disk checks.
*/
private final ScheduledExecutorService diskCheckerservice;
- private final ScheduledFuture<?> periodicDiskChecker;
+ private ScheduledFuture<?> periodicDiskChecker;
private final List<VolumeSet> registeredVolumeSets;
+ private final AtomicBoolean started;
+
/**
* @param conf Configuration object.
* @param timer {@link Timer} object used for throttling checks.
@@ -106,7 +111,7 @@ public class StorageVolumeChecker {
this.timer = timer;
- DatanodeConfiguration dnConf = conf.getObject(DatanodeConfiguration.class);
+ dnConf = conf.getObject(DatanodeConfiguration.class);
maxAllowedTimeForCheckMs = dnConf.getDiskCheckTimeout().toMillis();
@@ -137,12 +142,18 @@ public class StorageVolumeChecker {
return t;
});
- long periodicDiskCheckIntervalMinutes =
- dnConf.getPeriodicDiskCheckIntervalMinutes();
- this.periodicDiskChecker =
- diskCheckerservice.scheduleWithFixedDelay(this::checkAllVolumeSets,
- periodicDiskCheckIntervalMinutes, periodicDiskCheckIntervalMinutes,
- TimeUnit.MINUTES);
+ started = new AtomicBoolean(false);
+ }
+
+ public void start() {
+ if (started.compareAndSet(false, true)) {
+ long periodicDiskCheckIntervalMinutes =
+ dnConf.getPeriodicDiskCheckIntervalMinutes();
+ periodicDiskChecker =
+ diskCheckerservice.scheduleWithFixedDelay(this::checkAllVolumeSets,
+ periodicDiskCheckIntervalMinutes,
+ periodicDiskCheckIntervalMinutes, TimeUnit.MINUTES);
+ }
}
public synchronized void registerVolumeSet(VolumeSet volumeSet) {
@@ -384,15 +395,17 @@ public class StorageVolumeChecker {
* of the parameters.
*/
public void shutdownAndWait(int gracePeriod, TimeUnit timeUnit) {
- periodicDiskChecker.cancel(true);
- diskCheckerservice.shutdownNow();
- checkVolumeResultHandlerExecutorService.shutdownNow();
- try {
- delegateChecker.shutdownAndWait(gracePeriod, timeUnit);
- } catch (InterruptedException e) {
- LOG.warn("{} interrupted during shutdown.",
- this.getClass().getSimpleName());
- Thread.currentThread().interrupt();
+ if (started.compareAndSet(true, false)) {
+ periodicDiskChecker.cancel(true);
+ diskCheckerservice.shutdownNow();
+ checkVolumeResultHandlerExecutorService.shutdownNow();
+ try {
+ delegateChecker.shutdownAndWait(gracePeriod, timeUnit);
+ } catch (InterruptedException e) {
+ LOG.warn("{} interrupted during shutdown.",
+ this.getClass().getSimpleName());
+ Thread.currentThread().interrupt();
+ }
}
}
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
index 48fd1b909d..3a3d1dcf6d 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
@@ -141,7 +141,7 @@ public class OzoneContainer {
config = conf;
this.datanodeDetails = datanodeDetails;
this.context = context;
- this.volumeChecker = getVolumeChecker(conf);
+ this.volumeChecker = new StorageVolumeChecker(conf, new Timer());
volumeSet = new MutableVolumeSet(datanodeDetails.getUuidString(), conf,
context, VolumeType.DATA_VOLUME, volumeChecker);
@@ -416,6 +416,18 @@ public class OzoneContainer {
return;
}
+ // Start background volume checks, which will begin after the configured
+ // delay.
+ volumeChecker.start();
+ // Do an immediate check of all volumes to ensure datanode health before
+ // proceeding.
+ volumeSet.checkAllVolumes();
+ metaVolumeSet.checkAllVolumes();
+ // DB volume set may be null if dedicated DB volumes are not used.
+ if (dbVolumeSet != null) {
+ dbVolumeSet.checkAllVolumes();
+ }
+
LOG.info("Attempting to start container services.");
startContainerScrub();
@@ -538,11 +550,6 @@ public class OzoneContainer {
return dbVolumeSet;
}
- @VisibleForTesting
- StorageVolumeChecker getVolumeChecker(ConfigurationSource conf) {
- return new StorageVolumeChecker(conf, new Timer());
- }
-
public ContainerMetrics getMetrics() {
return metrics;
}
diff --git
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestHddsVolume.java
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestHddsVolume.java
index d02b5733d5..898b44da7b 100644
---
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestHddsVolume.java
+++
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestHddsVolume.java
@@ -21,6 +21,7 @@ import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.time.Duration;
+import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicLong;
@@ -46,6 +47,7 @@ import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import org.apache.hadoop.ozone.container.common.ContainerTestUtils;
+import org.apache.hadoop.ozone.container.common.helpers.DatanodeVersionFile;
import org.apache.hadoop.ozone.container.common.utils.DatanodeStoreCache;
import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
import org.junit.Before;
@@ -81,6 +83,31 @@ public class TestHddsVolume {
versionFile = StorageVolumeUtil.getVersionFile(rootDir);
}
+ @Test
+ public void testReadPropertiesFromVersionFile() throws Exception {
+ StorageVolume volume = volumeBuilder.build();
+
+ volume.format(CLUSTER_ID);
+
+ Properties properties = DatanodeVersionFile.readFrom(versionFile);
+
+ String storageID = StorageVolumeUtil.getStorageID(properties, versionFile);
+ String clusterID = StorageVolumeUtil.getClusterID(
+ properties, versionFile, CLUSTER_ID);
+ String datanodeUuid = StorageVolumeUtil.getDatanodeUUID(
+ properties, versionFile, DATANODE_UUID);
+ long cTime = StorageVolumeUtil.getCreationTime(
+ properties, versionFile);
+ int layoutVersion = StorageVolumeUtil.getLayOutVersion(
+ properties, versionFile);
+
+ assertEquals(volume.getStorageID(), storageID);
+ assertEquals(volume.getClusterID(), clusterID);
+ assertEquals(volume.getDatanodeUuid(), datanodeUuid);
+ assertEquals(volume.getCTime(), cTime);
+ assertEquals(volume.getLayoutVersion(), layoutVersion);
+ }
+
@Test
public void testHddsVolumeInitialization() throws Exception {
HddsVolume volume = volumeBuilder.build();
diff --git
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolume.java
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolume.java
deleted file mode 100644
index 74469c78b5..0000000000
---
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolume.java
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with this
- * work for additional information regarding copyright ownership. The ASF
- * licenses this file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * <p>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package org.apache.hadoop.ozone.container.common.volume;
-
-import org.apache.hadoop.hdfs.server.datanode.checker.VolumeCheckResult;
-import org.apache.hadoop.hdds.conf.OzoneConfiguration;
-import org.apache.hadoop.hdds.fs.MockSpaceUsageCheckFactory;
-import org.apache.hadoop.ozone.container.common.helpers.DatanodeVersionFile;
-import org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration;
-import org.apache.hadoop.ozone.container.common.utils.DiskCheckUtil;
-import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-
-import java.io.File;
-import java.util.Properties;
-import java.util.UUID;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-/**
- * Test for StorageVolume.
- */
-public class TestStorageVolume {
-
- private static final String DATANODE_UUID = UUID.randomUUID().toString();
- private static final String CLUSTER_ID = UUID.randomUUID().toString();
- private static final OzoneConfiguration CONF = new OzoneConfiguration();
-
- @Rule
- public TemporaryFolder folder = new TemporaryFolder();
-
- private HddsVolume.Builder volumeBuilder;
- private File versionFile;
-
- private static final DiskCheckUtil.DiskChecks IO_FAILURE =
- new DiskCheckUtil.DiskChecks() {
- @Override
- public boolean checkReadWrite(File storageDir, File testFileDir,
- int numBytesToWrite) {
- return false;
- }
- };
-
- @Before
- public void setup() throws Exception {
- File rootDir = new File(folder.getRoot(), HddsVolume.HDDS_VOLUME_DIR);
- volumeBuilder = new HddsVolume.Builder(folder.getRoot().getPath())
- .datanodeUuid(DATANODE_UUID)
- .conf(CONF)
- .usageCheckFactory(MockSpaceUsageCheckFactory.NONE);
- versionFile = StorageVolumeUtil.getVersionFile(rootDir);
- DiskCheckUtil.clearTestImpl();
- }
-
- @Test
- public void testReadPropertiesFromVersionFile() throws Exception {
- HddsVolume volume = volumeBuilder.build();
-
- volume.format(CLUSTER_ID);
-
- Properties properties = DatanodeVersionFile.readFrom(versionFile);
-
- String storageID = StorageVolumeUtil.getStorageID(properties, versionFile);
- String clusterID = StorageVolumeUtil.getClusterID(
- properties, versionFile, CLUSTER_ID);
- String datanodeUuid = StorageVolumeUtil.getDatanodeUUID(
- properties, versionFile, DATANODE_UUID);
- long cTime = StorageVolumeUtil.getCreationTime(
- properties, versionFile);
- int layoutVersion = StorageVolumeUtil.getLayOutVersion(
- properties, versionFile);
-
- assertEquals(volume.getStorageID(), storageID);
- assertEquals(volume.getClusterID(), clusterID);
- assertEquals(volume.getDatanodeUuid(), datanodeUuid);
- assertEquals(volume.getCTime(), cTime);
- assertEquals(volume.getLayoutVersion(), layoutVersion);
- }
-
- @Test
- public void testCheckExistence() throws Exception {
- HddsVolume volume = volumeBuilder.build();
- volume.format(CLUSTER_ID);
-
- VolumeCheckResult result = volume.check(false);
- assertEquals(VolumeCheckResult.HEALTHY, result);
-
- final DiskCheckUtil.DiskChecks doesNotExist =
- new DiskCheckUtil.DiskChecks() {
- @Override
- public boolean checkExistence(File storageDir) {
- return false;
- }
- };
-
- DiskCheckUtil.setTestImpl(doesNotExist);
- result = volume.check(false);
- assertEquals(VolumeCheckResult.FAILED, result);
- }
-
- @Test
- public void testCheckPermissions() throws Exception {
- HddsVolume volume = volumeBuilder.build();
- volume.format(CLUSTER_ID);
-
- VolumeCheckResult result = volume.check(false);
- assertEquals(VolumeCheckResult.HEALTHY, result);
-
- final DiskCheckUtil.DiskChecks noPermissions =
- new DiskCheckUtil.DiskChecks() {
- @Override
- public boolean checkPermissions(File storageDir) {
- return false;
- }
- };
-
- DiskCheckUtil.setTestImpl(noPermissions);
- result = volume.check(false);
- assertEquals(VolumeCheckResult.FAILED, result);
- }
-
- /**
- * Setting test count to 0 should disable IO tests.
- */
- @Test
- public void testCheckIODisabled() throws Exception {
- DatanodeConfiguration dnConf = CONF.getObject(DatanodeConfiguration.class);
- dnConf.setVolumeIOTestCount(0);
- CONF.setFromObject(dnConf);
- volumeBuilder.conf(CONF);
- HddsVolume volume = volumeBuilder.build();
- volume.format(CLUSTER_ID);
-
- DiskCheckUtil.setTestImpl(IO_FAILURE);
- assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
- }
-
- @Test
- public void testCheckIODefaultConfigs() {
- CONF.clear();
- DatanodeConfiguration dnConf = CONF.getObject(DatanodeConfiguration.class);
- // Make sure default values are not invalid.
- assertTrue(dnConf.getVolumeIOFailureTolerance() <
- dnConf.getVolumeIOTestCount());
- }
-
- @Test
- public void testCheckIOInvalidConfig() throws Exception {
- HddsVolume volume = volumeBuilder.build();
- volume.format(CLUSTER_ID);
- DatanodeConfiguration dnConf = CONF.getObject(DatanodeConfiguration.class);
-
- // When failure tolerance is above test count, default values should be
- // used.
- dnConf.setVolumeIOTestCount(3);
- dnConf.setVolumeIOFailureTolerance(4);
- CONF.setFromObject(dnConf);
- dnConf = CONF.getObject(DatanodeConfiguration.class);
- assertEquals(dnConf.getVolumeIOTestCount(),
- DatanodeConfiguration.DISK_CHECK_IO_TEST_COUNT_DEFAULT);
- assertEquals(dnConf.getVolumeIOFailureTolerance(),
- DatanodeConfiguration.DISK_CHECK_IO_FAILURES_TOLERATED_DEFAULT);
-
- // When test count and failure tolerance are set to the same value,
- // Default values should be used.
- dnConf.setVolumeIOTestCount(2);
- dnConf.setVolumeIOFailureTolerance(2);
- CONF.setFromObject(dnConf);
- dnConf = CONF.getObject(DatanodeConfiguration.class);
- assertEquals(DatanodeConfiguration.DISK_CHECK_IO_TEST_COUNT_DEFAULT,
- dnConf.getVolumeIOTestCount());
- assertEquals(DatanodeConfiguration.DISK_CHECK_IO_FAILURES_TOLERATED_DEFAULT,
- dnConf.getVolumeIOFailureTolerance());
-
- // Negative test count should reset to default value.
- dnConf.setVolumeIOTestCount(-1);
- CONF.setFromObject(dnConf);
- dnConf = CONF.getObject(DatanodeConfiguration .class);
- assertEquals(DatanodeConfiguration.DISK_CHECK_IO_TEST_COUNT_DEFAULT,
- dnConf.getVolumeIOTestCount());
-
- // Negative failure tolerance should reset to default value.
- dnConf.setVolumeIOFailureTolerance(-1);
- CONF.setFromObject(dnConf);
- dnConf = CONF.getObject(DatanodeConfiguration .class);
- assertEquals(DatanodeConfiguration.DISK_CHECK_IO_FAILURES_TOLERATED_DEFAULT,
- dnConf.getVolumeIOFailureTolerance());
- }
-
- @Test
- public void testCheckIOInitiallyPassing() throws Exception {
- testCheckIOUntilFailure(3, 1, true, true, true, false, true, false);
- }
-
- @Test
- public void testCheckIOEarlyFailure() throws Exception {
- testCheckIOUntilFailure(3, 1, false, false);
- }
-
- @Test
- public void testCheckIOFailuresDiscarded() throws Exception {
- testCheckIOUntilFailure(3, 1, false, true, true, true, false, false);
- }
-
- @Test
- public void testCheckIOAlternatingFailures() throws Exception {
- testCheckIOUntilFailure(3, 1, true, false, true, false);
- }
-
- /**
- * Helper method to test the sliding window of IO checks before volume
- * failure.
- *
- * @param ioTestCount The number of most recent tests whose results should
- * be considered.
- * @param ioFailureTolerance The number of IO failures tolerated out of the
- * last {@param ioTestCount} tests.
- * @param checkResults The result of the IO check for each run. Volume
- * should fail after the last IO check is completed.
- */
- private void testCheckIOUntilFailure(int ioTestCount, int ioFailureTolerance,
- boolean... checkResults) throws Exception {
- DatanodeConfiguration dnConf = CONF.getObject(DatanodeConfiguration.class);
- dnConf.setVolumeIOTestCount(ioTestCount);
- dnConf.setVolumeIOFailureTolerance(ioFailureTolerance);
- CONF.setFromObject(dnConf);
- volumeBuilder.conf(CONF);
- HddsVolume volume = volumeBuilder.build();
- volume.format(CLUSTER_ID);
-
- for (int i = 0; i < checkResults.length; i++) {
- final boolean result = checkResults[i];
- final DiskCheckUtil.DiskChecks ioResult = new DiskCheckUtil.DiskChecks() {
- @Override
- public boolean checkReadWrite(File storageDir, File testDir,
- int numBytesToWrite) {
- return result;
- }
- };
- DiskCheckUtil.setTestImpl(ioResult);
- if (i < checkResults.length - 1) {
- assertEquals("Unexpected IO failure in run " + i,
- VolumeCheckResult.HEALTHY, volume.check(false));
- } else {
- assertEquals("Unexpected IO success in run " + i,
- VolumeCheckResult.FAILED, volume.check(false));
- }
- }
- }
-}
diff --git
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java
new file mode 100644
index 0000000000..e2fdbdc937
--- /dev/null
+++
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java
@@ -0,0 +1,346 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.hadoop.ozone.container.common.volume;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.hdfs.server.datanode.checker.VolumeCheckResult;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.fs.MockSpaceUsageCheckFactory;
+import org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration;
+import org.apache.hadoop.ozone.container.common.utils.DiskCheckUtil;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Named;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import java.io.File;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.UUID;
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests for StorageVolume health checks using real volume instances with
+ * mocked checkers to simulate failures.
+ */
+public class TestStorageVolumeHealthChecks {
+
+ private static final String DATANODE_UUID = UUID.randomUUID().toString();
+ private static final String CLUSTER_ID = UUID.randomUUID().toString();
+ private static final OzoneConfiguration CONF = new OzoneConfiguration();
+
+ @TempDir
+ private static Path volumePath;
+
+  /**
+   * Argument provider for the parameterized tests in this class.
+   *
+   * @return one named builder per volume type (HDDS, metadata and DB), each
+   *     rooted at the shared temporary directory and set up with the shared
+   *     datanode UUID, the shared configuration and
+   *     {@code MockSpaceUsageCheckFactory.NONE}.
+   */
+  public static Stream<Arguments> volumeBuilders() {
+    final String volumeRoot = volumePath.toString();
+
+    HddsVolume.Builder hdds = new HddsVolume.Builder(volumeRoot)
+        .datanodeUuid(DATANODE_UUID)
+        .conf(CONF)
+        .usageCheckFactory(MockSpaceUsageCheckFactory.NONE);
+
+    MetadataVolume.Builder metadata = new MetadataVolume.Builder(volumeRoot)
+        .datanodeUuid(DATANODE_UUID)
+        .conf(CONF)
+        .usageCheckFactory(MockSpaceUsageCheckFactory.NONE);
+
+    DbVolume.Builder db = new DbVolume.Builder(volumeRoot)
+        .datanodeUuid(DATANODE_UUID)
+        .conf(CONF)
+        .usageCheckFactory(MockSpaceUsageCheckFactory.NONE);
+
+    return Stream.of(
+        Arguments.of(Named.of("HDDS Volume", hdds)),
+        Arguments.of(Named.of("Metadata Volume", metadata)),
+        Arguments.of(Named.of("DB Volume", db))
+    );
+  }
+
+  /**
+   * Resets state shared between tests: deletes the volume directory and
+   * clears the test implementation registered with {@code DiskCheckUtil}.
+   */
+  @BeforeEach
+  public void setup() throws Exception {
+    // The volume path has to be static so the argument provider can build
+    // volumes from it, so its contents must be removed before every test.
+    final File volumeRoot = volumePath.toFile();
+    FileUtils.deleteDirectory(volumeRoot);
+    DiskCheckUtil.clearTestImpl();
+  }
+
+  /**
+   * A formatted volume is healthy until the existence check reports that
+   * its storage directory is missing, after which the volume check fails.
+   */
+  @ParameterizedTest
+  @MethodSource("volumeBuilders")
+  public void testCheckExistence(StorageVolume.Builder<?> builder)
+      throws Exception {
+    StorageVolume volume = builder.build();
+    volume.format(CLUSTER_ID);
+    volume.createTmpDirs(CLUSTER_ID);
+
+    // No test implementation is installed yet, so the volume passes.
+    assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+
+    // Simulate a storage directory that has disappeared.
+    DiskCheckUtil.setTestImpl(new DiskCheckUtil.DiskChecks() {
+      @Override
+      public boolean checkExistence(File storageDir) {
+        return false;
+      }
+    });
+
+    assertEquals(VolumeCheckResult.FAILED, volume.check(false));
+  }
+
+  /**
+   * A formatted volume is healthy until the permission check on its storage
+   * directory reports a failure, after which the volume check fails.
+   */
+  @ParameterizedTest
+  @MethodSource("volumeBuilders")
+  public void testCheckPermissions(StorageVolume.Builder<?> builder)
+      throws Exception {
+    StorageVolume volume = builder.build();
+    volume.format(CLUSTER_ID);
+    volume.createTmpDirs(CLUSTER_ID);
+
+    // No test implementation is installed yet, so the volume passes.
+    assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+
+    // Simulate a storage directory with insufficient permissions.
+    DiskCheckUtil.setTestImpl(new DiskCheckUtil.DiskChecks() {
+      @Override
+      public boolean checkPermissions(File storageDir) {
+        return false;
+      }
+    });
+
+    assertEquals(VolumeCheckResult.FAILED, volume.check(false));
+  }
+
+  /**
+   * Setting the IO test count to 0 should disable IO tests, so a failing
+   * read/write check must leave the volume healthy.
+   */
+  @ParameterizedTest
+  @MethodSource("volumeBuilders")
+  public void testCheckIODisabled(StorageVolume.Builder<?> builder)
+      throws Exception {
+    DatanodeConfiguration ioChecksOff =
+        CONF.getObject(DatanodeConfiguration.class);
+    ioChecksOff.setVolumeIOTestCount(0);
+    CONF.setFromObject(ioChecksOff);
+
+    builder.conf(CONF);
+    StorageVolume volume = builder.build();
+    volume.format(CLUSTER_ID);
+    volume.createTmpDirs(CLUSTER_ID);
+
+    // Every read/write check now reports a failure.
+    DiskCheckUtil.setTestImpl(new DiskCheckUtil.DiskChecks() {
+      @Override
+      public boolean checkReadWrite(File storageDir, File testFileDir,
+          int numBytesToWrite) {
+        return false;
+      }
+    });
+
+    assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+  }
+
+  /**
+   * The default failure tolerance must be strictly below the default IO
+   * test count, otherwise the defaults themselves would be invalid.
+   */
+  @Test
+  public void testCheckIODefaultConfigs() {
+    CONF.clear();
+    DatanodeConfiguration dnConf = CONF.getObject(DatanodeConfiguration.class);
+    // Make sure default values are not invalid.
+    final boolean toleranceBelowTestCount =
+        dnConf.getVolumeIOFailureTolerance() < dnConf.getVolumeIOTestCount();
+    assertTrue(toleranceBelowTestCount);
+  }
+
+  /**
+   * Invalid IO check settings should be replaced by the default values once
+   * the configuration object is written to the configuration and read back.
+   * All assertions use the (expected, actual) argument order so that failure
+   * messages report the default as the expected value.
+   */
+  @Test
+  public void testCheckIOInvalidConfig() {
+    DatanodeConfiguration dnConf = CONF.getObject(DatanodeConfiguration.class);
+
+    // When failure tolerance is above test count, default values should be
+    // used.
+    dnConf.setVolumeIOTestCount(3);
+    dnConf.setVolumeIOFailureTolerance(4);
+    CONF.setFromObject(dnConf);
+    dnConf = CONF.getObject(DatanodeConfiguration.class);
+    assertEquals(DatanodeConfiguration.DISK_CHECK_IO_TEST_COUNT_DEFAULT,
+        dnConf.getVolumeIOTestCount());
+    assertEquals(DatanodeConfiguration.DISK_CHECK_IO_FAILURES_TOLERATED_DEFAULT,
+        dnConf.getVolumeIOFailureTolerance());
+
+    // When test count and failure tolerance are set to the same value,
+    // default values should be used.
+    dnConf.setVolumeIOTestCount(2);
+    dnConf.setVolumeIOFailureTolerance(2);
+    CONF.setFromObject(dnConf);
+    dnConf = CONF.getObject(DatanodeConfiguration.class);
+    assertEquals(DatanodeConfiguration.DISK_CHECK_IO_TEST_COUNT_DEFAULT,
+        dnConf.getVolumeIOTestCount());
+    assertEquals(DatanodeConfiguration.DISK_CHECK_IO_FAILURES_TOLERATED_DEFAULT,
+        dnConf.getVolumeIOFailureTolerance());
+
+    // Negative test count should reset to default value.
+    dnConf.setVolumeIOTestCount(-1);
+    CONF.setFromObject(dnConf);
+    dnConf = CONF.getObject(DatanodeConfiguration.class);
+    assertEquals(DatanodeConfiguration.DISK_CHECK_IO_TEST_COUNT_DEFAULT,
+        dnConf.getVolumeIOTestCount());
+
+    // Negative failure tolerance should reset to default value.
+    dnConf.setVolumeIOFailureTolerance(-1);
+    CONF.setFromObject(dnConf);
+    dnConf = CONF.getObject(DatanodeConfiguration.class);
+    assertEquals(DatanodeConfiguration.DISK_CHECK_IO_FAILURES_TOLERATED_DEFAULT,
+        dnConf.getVolumeIOFailureTolerance());
+  }
+
+  /**
+   * With a window of 3 checks and 1 tolerated failure, a volume whose first
+   * checks pass is expected to stay healthy through a single failure and to
+   * fail only on the final check, when two of the last three checks failed.
+   */
+  @ParameterizedTest
+  @MethodSource("volumeBuilders")
+  public void testCheckIOInitiallyPassing(StorageVolume.Builder<?> builder)
+      throws Exception {
+    final boolean[] ioResults = {true, true, true, false, true, false};
+    testCheckIOUntilFailure(builder, 3, 1, ioResults);
+  }
+
+  /**
+   * With a window of 3 checks and 1 tolerated failure, two failures in a
+   * row are expected to fail the volume on the second check, before three
+   * checks have been run.
+   */
+  @ParameterizedTest
+  @MethodSource("volumeBuilders")
+  public void testCheckIOEarlyFailure(StorageVolume.Builder<?> builder)
+      throws Exception {
+    final boolean[] ioResults = {false, false};
+    testCheckIOUntilFailure(builder, 3, 1, ioResults);
+  }
+
+  /**
+   * With a window of 3 checks and 1 tolerated failure, an initial failure
+   * followed by three passes should no longer count by the time a later
+   * failure happens; the volume is expected to fail only on the final
+   * check, the second of two consecutive failures.
+   */
+  @ParameterizedTest
+  @MethodSource("volumeBuilders")
+  public void testCheckIOFailuresDiscarded(StorageVolume.Builder<?> builder)
+      throws Exception {
+    final boolean[] ioResults = {false, true, true, true, false, false};
+    testCheckIOUntilFailure(builder, 3, 1, ioResults);
+  }
+
+  /**
+   * With a window of 3 checks and 1 tolerated failure, alternating passes
+   * and failures are expected to fail the volume on the fourth check, when
+   * two of the last three checks failed.
+   */
+  @ParameterizedTest
+  @MethodSource("volumeBuilders")
+  public void testCheckIOAlternatingFailures(StorageVolume.Builder<?> builder)
+      throws Exception {
+    final boolean[] ioResults = {true, false, true, false};
+    testCheckIOUntilFailure(builder, 3, 1, ioResults);
+  }
+
+  /**
+   * Helper method to test the sliding window of IO checks before volume
+   * failure. Builds and formats a volume from {@code builder}, then runs one
+   * volume check per entry of {@code checkResults}, with the read/write
+   * check stubbed to return that entry.
+   *
+   * @param builder Builder for the volume under test. The shared
+   *     configuration is applied to it before the volume is built.
+   * @param ioTestCount The number of most recent tests whose results should
+   *     be considered.
+   * @param ioFailureTolerance The number of IO failures tolerated out of the
+   *     last {@code ioTestCount} tests.
+   * @param checkResults The result of the IO check for each run. The volume
+   *     must be healthy after every run except the last one, and failed
+   *     after the last one.
+   */
+  private void testCheckIOUntilFailure(StorageVolume.Builder<?> builder,
+      int ioTestCount, int ioFailureTolerance, boolean... checkResults)
+      throws Exception {
+    DatanodeConfiguration dnConf = CONF.getObject(DatanodeConfiguration.class);
+    dnConf.setVolumeIOTestCount(ioTestCount);
+    dnConf.setVolumeIOFailureTolerance(ioFailureTolerance);
+    CONF.setFromObject(dnConf);
+    builder.conf(CONF);
+    StorageVolume volume = builder.build();
+    volume.format(CLUSTER_ID);
+    volume.createTmpDirs(CLUSTER_ID);
+
+    final int lastRun = checkResults.length - 1;
+    for (int run = 0; run <= lastRun; run++) {
+      final boolean ioCheckPasses = checkResults[run];
+      // Stub only the read/write check; it reports this run's result.
+      DiskCheckUtil.setTestImpl(new DiskCheckUtil.DiskChecks() {
+        @Override
+        public boolean checkReadWrite(File storageDir, File testDir,
+            int numBytesToWrite) {
+          return ioCheckPasses;
+        }
+      });
+
+      if (run == lastRun) {
+        assertEquals(VolumeCheckResult.FAILED, volume.check(false),
+            "Unexpected IO success in run " + run);
+      } else {
+        assertEquals(VolumeCheckResult.HEALTHY, volume.check(false),
+            "Unexpected IO failure in run " + run);
+      }
+    }
+  }
+
+  /**
+   * Runs a volume check with {@code DirectoryCheck} installed, which asserts
+   * that each disk check receives the directories expected for the volume
+   * type. Since every check in {@code DirectoryCheck} reports success, the
+   * overall result is also asserted to be healthy rather than discarded.
+   */
+  @ParameterizedTest
+  @MethodSource("volumeBuilders")
+  public void testCorrectDirectoryChecked(StorageVolume.Builder<?> builder)
+      throws Exception {
+    StorageVolume volume = builder.build();
+    // Install the asserting checks before the volume is formatted.
+    DiskCheckUtil.setTestImpl(new DirectoryCheck(volume));
+    volume.format(CLUSTER_ID);
+    volume.createTmpDirs(CLUSTER_ID);
+    assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+  }
+
+  /**
+   * Disk check implementation that asserts the checks are invoked on the
+   * correct directories for each volume type. Every check reports success
+   * after its assertions pass.
+   */
+  private static final class DirectoryCheck implements
+      DiskCheckUtil.DiskChecks {
+    private final StorageVolume volume;
+
+    DirectoryCheck(StorageVolume volume) {
+      this.volume = volume;
+    }
+
+    @Override
+    public boolean checkExistence(File storageDir) {
+      assertEquals(volume.getStorageDir(), storageDir);
+      return true;
+    }
+
+    @Override
+    public boolean checkPermissions(File storageDir) {
+      assertEquals(volume.getStorageDir(), storageDir);
+      return true;
+    }
+
+    @Override
+    public boolean checkReadWrite(File storageDir, File testFileDir,
+        int numBytesToWrite) {
+      assertEquals(volume.getStorageDir(), storageDir);
+
+      final File expectedDiskCheckDir = expectedDiskCheckDir();
+      assertEquals(expectedDiskCheckDir, volume.getDiskCheckDir());
+      assertEquals(expectedDiskCheckDir, testFileDir);
+      return true;
+    }
+
+    /**
+     * Builds the disk check directory expected for the volume: the tmp disk
+     * check directory directly under the storage directory for a metadata
+     * volume, or under the cluster ID directory for any other volume.
+     */
+    private File expectedDiskCheckDir() {
+      final String storageRoot = volume.getStorageDir().getAbsolutePath();
+      if (volume instanceof MetadataVolume) {
+        return Paths.get(
+            storageRoot,
+            StorageVolume.TMP_DIR_NAME,
+            StorageVolume.TMP_DISK_CHECK_DIR_NAME).toFile();
+      }
+      return Paths.get(
+          storageRoot,
+          volume.getClusterID(),
+          StorageVolume.TMP_DIR_NAME,
+          StorageVolume.TMP_DISK_CHECK_DIR_NAME).toFile();
+    }
+  }
+}
diff --git
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
index bb281f1eb5..10a1b43f90 100644
---
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
+++
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
@@ -154,14 +154,19 @@ public class TestVolumeSetDiskChecks {
StorageVolume.VolumeType.DB_VOLUME,
dummyChecker);
+ volumeSet.checkAllVolumes();
Assert.assertEquals(volumeSet.getFailedVolumesList().size(),
numBadVolumes);
Assert.assertEquals(volumeSet.getVolumesList().size(),
numVolumes - numBadVolumes);
+
+ metaVolumeSet.checkAllVolumes();
Assert.assertEquals(metaVolumeSet.getFailedVolumesList().size(),
numBadVolumes);
Assert.assertEquals(metaVolumeSet.getVolumesList().size(),
numVolumes - numBadVolumes);
+
+ dbVolumeSet.checkAllVolumes();
Assert.assertEquals(dbVolumeSet.getFailedVolumesList().size(),
numBadVolumes);
Assert.assertEquals(dbVolumeSet.getVolumesList().size(),
@@ -197,10 +202,13 @@ public class TestVolumeSetDiskChecks {
StorageVolume.VolumeType.DB_VOLUME,
dummyChecker);
+ volumeSet.checkAllVolumes();
assertEquals(volumeSet.getFailedVolumesList().size(), numVolumes);
assertEquals(volumeSet.getVolumesList().size(), 0);
+ metaVolumeSet.checkAllVolumes();
assertEquals(metaVolumeSet.getFailedVolumesList().size(), numVolumes);
assertEquals(metaVolumeSet.getVolumesList().size(), 0);
+ dbVolumeSet.checkAllVolumes();
assertEquals(dbVolumeSet.getFailedVolumesList().size(), numVolumes);
assertEquals(dbVolumeSet.getVolumesList().size(), 0);
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/volume/TestDatanodeHddsVolumeFailureToleration.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/volume/TestDatanodeHddsVolumeFailureToleration.java
index 25c0bac0e6..c0c181dc36 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/volume/TestDatanodeHddsVolumeFailureToleration.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/volume/TestDatanodeHddsVolumeFailureToleration.java
@@ -25,16 +25,21 @@ import org.apache.hadoop.hdds.conf.StorageUnit;
import org.apache.hadoop.ozone.HddsDatanodeService;
import org.apache.hadoop.ozone.MiniOzoneCluster;
import org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration;
+import org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine;
import org.apache.hadoop.ozone.container.common.volume.MutableVolumeSet;
import org.apache.hadoop.ozone.container.common.volume.StorageVolume;
import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer;
import org.apache.hadoop.ozone.dn.DatanodeTestUtils;
+import org.apache.hadoop.util.ExitUtil;
+import org.apache.ozone.test.GenericTestUtils;
+import org.apache.ozone.test.GenericTestUtils.LogCapturer;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.Timeout;
+import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
@@ -124,13 +129,24 @@ public class TestDatanodeHddsVolumeFailureToleration {
DatanodeTestUtils.simulateBadRootDir(volRootDir1);
// restart datanode to test
- try {
- cluster.restartHddsDatanode(0, true);
- Assert.fail();
- } catch (RuntimeException e) {
- Assert.assertTrue(e.getMessage()
- .contains("Can't start the HDDS datanode plugin"));
- }
+ // Make datanode throw an exception instead of exiting the jvm when too
+ // many volumes fail so that the test keeps running.
+ ExitUtil.disableSystemExit();
+ // Since the exception will not be thrown from the main thread, the
+ // datanode will not actually exit. Use log messages to determine that
+ // the ExitUtil was invoked which would terminate the process in a normal
+ // deployment.
+ LogCapturer dsmCapturer = LogCapturer.captureLogs(
+ LoggerFactory.getLogger(DatanodeStateMachine.class));
+ LogCapturer exitCapturer = LogCapturer.captureLogs(
+ LoggerFactory.getLogger(ExitUtil.class.getName()));
+ cluster.restartHddsDatanode(0, false);
+ // Give the datanode time to restart. This may be slow in a mini ozone
+ // cluster.
+ GenericTestUtils.waitFor(() -> exitCapturer.getOutput()
+ .contains("Exiting with status 1: ExitException"), 500, 60000);
+ Assert.assertTrue(dsmCapturer.getOutput()
+ .contains("DatanodeStateMachine Shutdown due to too many bad volumes"));
// restore bad volumes
DatanodeTestUtils.restoreBadRootDir(volRootDir0);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]