This is an automated email from the ASF dual-hosted git repository.
sammichen pushed a commit to branch HDDS-5713
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/HDDS-5713 by this push:
new 65a0cd0475b HDDS-14279. Double check selected container state before
move process starts (#9575)
65a0cd0475b is described below
commit 65a0cd0475b2d74aeb857d799e2a207d1264be40
Author: Gargi Jaiswal <[email protected]>
AuthorDate: Mon Jan 5 11:02:37 2026 +0530
HDDS-14279. Double check selected container state before move process
starts (#9575)
---
.../diskbalancer/DiskBalancerService.java | 21 +++++-
.../policy/DefaultContainerChoosingPolicy.java | 5 ++
.../diskbalancer/TestDiskBalancerTask.java | 74 ++++++++++++++++++++++
3 files changed, 97 insertions(+), 3 deletions(-)
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java
index 7490c5bb3ea..957f187491f 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java
@@ -45,7 +45,7 @@
import org.apache.hadoop.hdds.conf.StorageUnit;
import org.apache.hadoop.hdds.fs.SpaceUsageSource;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
-import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
+import
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import
org.apache.hadoop.hdds.protocol.proto.HddsProtos.DiskBalancerRunningStatus;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
@@ -70,6 +70,7 @@
import
org.apache.hadoop.ozone.container.common.volume.VolumeChoosingPolicyFactory;
import
org.apache.hadoop.ozone.container.diskbalancer.DiskBalancerVolumeCalculation.VolumeFixedUsage;
import
org.apache.hadoop.ozone.container.diskbalancer.policy.ContainerChoosingPolicy;
+import
org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy;
import
org.apache.hadoop.ozone.container.diskbalancer.policy.DiskBalancerVolumeChoosingPolicy;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
import
org.apache.hadoop.ozone.container.keyvalue.helpers.KeyValueContainerLocationUtil;
@@ -457,6 +458,20 @@ public BackgroundTaskResult call() {
postCall(false, startTime);
return BackgroundTaskResult.EmptyTaskResult.newResult();
}
+
+ // Double check container state before acquiring lock to start move
process.
+ // Container state may have changed after selection. Only CLOSED
containers can be moved.
+ // QUASI_CLOSED is allowed when test mode is enabled, this is done to
test in production
+ // these containers are rejected.
+ State containerState = container.getContainerData().getState();
+ boolean isTestMode = DefaultContainerChoosingPolicy.isTest();
+ if (containerState != State.CLOSED && !(isTestMode && containerState ==
State.QUASI_CLOSED)) {
+ LOG.warn("Container {} is in {} state, skipping move process. Only
CLOSED containers can be moved.",
+ containerId, containerState);
+ postCall(false, startTime);
+ return BackgroundTaskResult.EmptyTaskResult.newResult();
+ }
+
// hold read lock on the container first, to avoid other threads to
update the container state,
// such as block deletion.
container.readLock();
@@ -477,8 +492,8 @@ public BackgroundTaskResult call() {
// Before move the container directory to final place, the destination
dir is empty and doesn't have
// a metadata directory. Writing the .container file will fail as the
metadata dir doesn't exist.
// So we instead save the container file to the diskBalancerTmpDir.
- ContainerProtos.ContainerDataProto.State originalState =
tempContainerData.getState();
-
tempContainerData.setState(ContainerProtos.ContainerDataProto.State.RECOVERING);
+ State originalState = tempContainerData.getState();
+ tempContainerData.setState(State.RECOVERING);
// update tempContainerData volume to point to destVolume
tempContainerData.setVolume(destVolume);
// overwrite the .container file with the new state.
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/policy/DefaultContainerChoosingPolicy.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/policy/DefaultContainerChoosingPolicy.java
index 1f55a976548..f964f6f519e 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/policy/DefaultContainerChoosingPolicy.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/policy/DefaultContainerChoosingPolicy.java
@@ -97,4 +97,9 @@ public ContainerData chooseContainer(OzoneContainer
ozoneContainer,
public static void setTest(boolean isTest) {
test = isTest;
}
+
+ @VisibleForTesting
+ public static boolean isTest() {
+ return test;
+ }
}
diff --git
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java
index 76a3992658b..5e2df4eb392 100644
---
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java
+++
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java
@@ -82,6 +82,7 @@
import org.apache.hadoop.ozone.container.ozoneimpl.ContainerController;
import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer;
import org.apache.ozone.test.GenericTestUtils;
+import org.apache.ozone.test.GenericTestUtils.LogCapturer;
import org.assertj.core.api.Fail;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
@@ -602,6 +603,79 @@ public void
testOldReplicaDelayedDeletion(ContainerTestVersionInfo versionInfo)
assertFalse(oldContainerDir.exists());
}
+ /**
+ * Testing that invalid states (including QUASI_CLOSED in production mode)
are correctly rejected.
+ * Here, with QUASI_CLOSED state, we ensure that the test runs in production
mode
+ * where QUASI_CLOSED is not allowed for move.
+ */
+ @ParameterizedTest
+ @EnumSource(names = {"OPEN", "CLOSING", "QUASI_CLOSED", "UNHEALTHY",
"INVALID", "DELETED", "RECOVERING"})
+ public void testMoveSkippedWhenContainerStateChanged(State invalidState)
+ throws IOException, InterruptedException, TimeoutException {
+ LogCapturer serviceLog =
LogCapturer.captureLogs(DiskBalancerService.class);
+
+ // Create a CLOSED container which will be selected by
DefaultContainerChoosingPolicy
+ Container container = createContainer(CONTAINER_ID, sourceVolume,
State.CLOSED);
+ long initialSourceUsed = sourceVolume.getCurrentUsage().getUsedSpace();
+ long initialDestUsed = destVolume.getCurrentUsage().getUsedSpace();
+ long initialDestCommitted = destVolume.getCommittedBytes();
+ long initialSourceDelta =
diskBalancerService.getDeltaSizes().get(sourceVolume) == null ?
+ 0L : diskBalancerService.getDeltaSizes().get(sourceVolume);
+ String oldContainerPath = container.getContainerData().getContainerPath();
+
+ // Verify temp container directory doesn't exist before task execution
+ Path tempContainerDir = destVolume.getTmpDir().toPath()
+ .resolve(DISK_BALANCER_DIR).resolve(String.valueOf(CONTAINER_ID));
+ assertFalse(Files.exists(tempContainerDir));
+
+ // Get the task (container is selected as CLOSED)
+ DiskBalancerService.DiskBalancerTask task = getTask();
+ assertNotNull(task);
+
+ // Change container state to invalid state (OPEN or DELETED) before move
process starts
+ KeyValueContainerData containerData = (KeyValueContainerData)
container.getContainerData();
+ containerData.setState(invalidState);
+
+ // Execute the task - it should skip the move due to invalid state
+ task.call();
+
+ // Verify that move process was skipped
+ GenericTestUtils.waitFor(() ->
+ serviceLog.getOutput().contains("skipping move process") &&
+ serviceLog.getOutput().contains(String.valueOf(CONTAINER_ID)) &&
+ serviceLog.getOutput().contains(invalidState.toString()),
+ 100, 5000);
+
+ // Verify container is still in the original location
+ Container originalContainer = containerSet.getContainer(CONTAINER_ID);
+ assertNotNull(originalContainer);
+ assertEquals(container, originalContainer);
+ assertEquals(invalidState, originalContainer.getContainerState());
+ assertEquals(sourceVolume,
originalContainer.getContainerData().getVolume());
+ assertTrue(new File(oldContainerPath).exists(), "Container should still
exist in original location");
+
+ // Verify no temp directory was created
+ assertFalse(Files.exists(tempContainerDir), "Temp container directory
should not be created");
+
+ // Verify volume usage is unchanged
+ assertEquals(initialSourceUsed,
sourceVolume.getCurrentUsage().getUsedSpace());
+ assertEquals(initialDestUsed, destVolume.getCurrentUsage().getUsedSpace());
+
+ // Verify metrics show failure (since move was skipped)
+ assertEquals(1, diskBalancerService.getMetrics().getFailureCount());
+ assertEquals(0, diskBalancerService.getMetrics().getSuccessCount());
+ assertEquals(0, diskBalancerService.getMetrics().getSuccessBytes());
+
+ // Verify committed bytes are released
+ assertEquals(initialDestCommitted, destVolume.getCommittedBytes());
+
+ // Verify container is removed from in-progress set
+
assertFalse(diskBalancerService.getInProgressContainers().contains(ContainerID.valueOf(CONTAINER_ID)));
+
+ // Verify delta sizes are restored
+ assertEquals(initialSourceDelta,
diskBalancerService.getDeltaSizes().get(sourceVolume));
+ }
+
private KeyValueContainer createContainer(long containerId, HddsVolume vol,
State state)
throws IOException {
KeyValueContainerData containerData = new KeyValueContainerData(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]