This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 9d46a3cd18 HDDS-10227. Intermittent timeout in
TestReconAndAdminContainerCLI.compareRMReportToReconResponse (#6112)
9d46a3cd18 is described below
commit 9d46a3cd189104db5fbb1af162939d0a07dbea09
Author: Doroszlai, Attila <[email protected]>
AuthorDate: Mon Jan 29 12:19:50 2024 +0100
HDDS-10227. Intermittent timeout in
TestReconAndAdminContainerCLI.compareRMReportToReconResponse (#6112)
---
.../ozone/recon/TestReconAndAdminContainerCLI.java | 132 +++++++++++----------
.../hadoop/ozone/recon/TestReconEndpointUtil.java | 3 +-
2 files changed, 70 insertions(+), 65 deletions(-)
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconAndAdminContainerCLI.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconAndAdminContainerCLI.java
index 8429269b24..7691704d92 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconAndAdminContainerCLI.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconAndAdminContainerCLI.java
@@ -16,7 +16,6 @@
*/
package org.apache.hadoop.ozone.recon;
-import com.google.common.base.Strings;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.client.RatisReplicationConfig;
import org.apache.hadoop.hdds.client.ReplicationConfig;
@@ -36,7 +35,6 @@ import
org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
-import org.apache.hadoop.hdds.scm.pipeline.PipelineNotFoundException;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.ozone.MiniOzoneCluster;
import org.apache.hadoop.ozone.OzoneConfigKeys;
@@ -59,13 +57,14 @@ import org.apache.ozone.test.GenericTestUtils;
import org.apache.ozone.test.LambdaTestUtils;
import
org.hadoop.ozone.recon.schema.ContainerSchemaDefinition.UnHealthyContainerStates;
import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Timeout;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.slf4j.event.Level;
import java.io.IOException;
@@ -94,13 +93,19 @@ import static
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERV
import static
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL;
import static
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
import static
org.apache.hadoop.ozone.recon.ReconServerConfigKeys.OZONE_RECON_OM_SNAPSHOT_TASK_INTERVAL_DELAY;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
/**
* Integration tests for ensuring Recon's consistency
* with the "ozone admin container" CLI.
*/
@Timeout(300)
-public class TestReconAndAdminContainerCLI {
+class TestReconAndAdminContainerCLI {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(TestReconAndAdminContainerCLI.class);
private static final OzoneConfiguration CONF = new OzoneConfiguration();
private static ScmClient scmClient;
@@ -121,7 +126,7 @@ public class TestReconAndAdminContainerCLI {
}
@BeforeAll
- public static void init() throws Exception {
+ static void init() throws Exception {
setupConfigKeys();
cluster = MiniOzoneCluster.newBuilder(CONF)
.setNumDatanodes(5)
@@ -147,18 +152,15 @@ public class TestReconAndAdminContainerCLI {
// Verify that Recon has all the pipelines from SCM.
scmPipelineManager.getPipelines().forEach(p -> {
- try {
- Assertions.assertNotNull(reconPipelineManager.getPipeline(p.getId()));
- } catch (PipelineNotFoundException e) {
- Assertions.fail();
- }
+ Pipeline pipeline = assertDoesNotThrow(() ->
reconPipelineManager.getPipeline(p.getId()));
+ assertNotNull(pipeline);
});
- Assertions.assertTrue(scmContainerManager.getContainers().isEmpty());
+ assertThat(scmContainerManager.getContainers()).isEmpty();
// Verify that all nodes are registered with Recon.
NodeManager reconNodeManager = reconScm.getScmNodeManager();
- Assertions.assertEquals(scmNodeManager.getAllNodes().size(),
+ assertEquals(scmNodeManager.getAllNodes().size(),
reconNodeManager.getAllNodes().size());
OzoneClient client = cluster.newClient();
@@ -174,7 +176,7 @@ public class TestReconAndAdminContainerCLI {
}
@AfterAll
- public static void shutdown() {
+ static void shutdown() {
if (cluster != null) {
cluster.shutdown();
}
@@ -185,7 +187,7 @@ public class TestReconAndAdminContainerCLI {
* but it's easier to test with Ratis ONE.
*/
@Test
- public void testMissingContainer() throws Exception {
+ void testMissingContainer() throws Exception {
String keyNameR1 = "key2";
long containerID = setupRatisKey(keyNameR1,
HddsProtos.ReplicationFactor.ONE);
@@ -209,7 +211,7 @@ public class TestReconAndAdminContainerCLI {
UnHealthyContainerStates containerStateForTesting =
UnHealthyContainerStates.MISSING;
- compareRMReportToReconResponse(containerStateForTesting.toString());
+ compareRMReportToReconResponse(containerStateForTesting);
for (DatanodeDetails details : pipeline.getNodes()) {
cluster.restartHddsDatanode(details, false);
@@ -219,7 +221,7 @@ public class TestReconAndAdminContainerCLI {
@ParameterizedTest
@MethodSource("outOfServiceNodeStateArgs")
- public void testNodesInDecommissionOrMaintenance(
+ void testNodesInDecommissionOrMaintenance(
NodeOperationalState initialState, NodeOperationalState finalState,
boolean isMaintenance) throws Exception {
Pipeline pipeline =
@@ -250,8 +252,8 @@ public class TestReconAndAdminContainerCLI {
TestNodeUtil.waitForDnToReachOpState(scmNodeManager,
nodeToGoOffline1, initialState);
- compareRMReportToReconResponse(underReplicatedState.toString());
- compareRMReportToReconResponse(overReplicatedState.toString());
+ compareRMReportToReconResponse(underReplicatedState);
+ compareRMReportToReconResponse(overReplicatedState);
TestNodeUtil.waitForDnToReachOpState(scmNodeManager,
nodeToGoOffline1, finalState);
@@ -262,8 +264,8 @@ public class TestReconAndAdminContainerCLI {
TestHelper.waitForReplicaCount(containerIdR3, 4, cluster);
}
- compareRMReportToReconResponse(underReplicatedState.toString());
- compareRMReportToReconResponse(overReplicatedState.toString());
+ compareRMReportToReconResponse(underReplicatedState);
+ compareRMReportToReconResponse(overReplicatedState);
// Second node goes offline.
if (isMaintenance) {
@@ -277,8 +279,8 @@ public class TestReconAndAdminContainerCLI {
TestNodeUtil.waitForDnToReachOpState(scmNodeManager,
nodeToGoOffline2, initialState);
- compareRMReportToReconResponse(underReplicatedState.toString());
- compareRMReportToReconResponse(overReplicatedState.toString());
+ compareRMReportToReconResponse(underReplicatedState);
+ compareRMReportToReconResponse(overReplicatedState);
TestNodeUtil.waitForDnToReachOpState(scmNodeManager,
nodeToGoOffline2, finalState);
@@ -288,8 +290,8 @@ public class TestReconAndAdminContainerCLI {
int expectedReplicaNum = isMaintenance ? 4 : 5;
TestHelper.waitForReplicaCount(containerIdR3, expectedReplicaNum, cluster);
- compareRMReportToReconResponse(underReplicatedState.toString());
- compareRMReportToReconResponse(overReplicatedState.toString());
+ compareRMReportToReconResponse(underReplicatedState);
+ compareRMReportToReconResponse(overReplicatedState);
scmClient.recommissionNodes(Arrays.asList(
TestNodeUtil.getDNHostAndPort(nodeToGoOffline1),
@@ -303,8 +305,8 @@ public class TestReconAndAdminContainerCLI {
TestNodeUtil.waitForDnToReachPersistedOpState(nodeToGoOffline1,
IN_SERVICE);
TestNodeUtil.waitForDnToReachPersistedOpState(nodeToGoOffline2,
IN_SERVICE);
- compareRMReportToReconResponse(underReplicatedState.toString());
- compareRMReportToReconResponse(overReplicatedState.toString());
+ compareRMReportToReconResponse(underReplicatedState);
+ compareRMReportToReconResponse(overReplicatedState);
}
/**
@@ -312,53 +314,53 @@ public class TestReconAndAdminContainerCLI {
* but to make sure that they are consistent between
* Recon and the ReplicationManager.
*/
- private static void compareRMReportToReconResponse(String containerState)
+ private static void compareRMReportToReconResponse(UnHealthyContainerStates
containerState)
throws Exception {
- Assertions.assertFalse(Strings.isNullOrEmpty(containerState));
-
- ReplicationManagerReport rmReport =
scmClient.getReplicationManagerReport();
- UnhealthyContainersResponse reconResponse =
- TestReconEndpointUtil
- .getUnhealthyContainersFromRecon(CONF, containerState);
-
- long rmMissingCounter = rmReport.getStat(
- ReplicationManagerReport.HealthState.MISSING);
- long rmUnderReplCounter = rmReport.getStat(
- ReplicationManagerReport.HealthState.UNDER_REPLICATED);
- long rmOverReplCounter = rmReport.getStat(
- ReplicationManagerReport.HealthState.OVER_REPLICATED);
- long rmMisReplCounter = rmReport.getStat(
- ReplicationManagerReport.HealthState.MIS_REPLICATED);
+ assertNotNull(containerState);
// Both threads are running every 1 second.
// Wait until all values are equal.
- GenericTestUtils.waitFor(
- () -> rmMissingCounter == reconResponse.getMissingCount() &&
- rmUnderReplCounter ==
reconResponse.getUnderReplicatedCount() &&
- rmOverReplCounter ==
reconResponse.getOverReplicatedCount() &&
- rmMisReplCounter == reconResponse.getMisReplicatedCount(),
- 1000, 40000);
+ GenericTestUtils.waitFor(() -> assertReportsMatch(containerState),
+ 1000, 40000);
+ }
+
+ private static boolean assertReportsMatch(UnHealthyContainerStates state) {
+ ReplicationManagerReport rmReport;
+ UnhealthyContainersResponse reconResponse;
+
+ try {
+ rmReport = scmClient.getReplicationManagerReport();
+ reconResponse = TestReconEndpointUtil
+ .getUnhealthyContainersFromRecon(CONF, state);
+
+ assertEquals(rmReport.getStat(HealthState.MISSING),
reconResponse.getMissingCount());
+ assertEquals(rmReport.getStat(HealthState.UNDER_REPLICATED),
reconResponse.getUnderReplicatedCount());
+ assertEquals(rmReport.getStat(HealthState.OVER_REPLICATED),
reconResponse.getOverReplicatedCount());
+ assertEquals(rmReport.getStat(HealthState.MIS_REPLICATED),
reconResponse.getMisReplicatedCount());
+ } catch (IOException e) {
+ LOG.info("Error getting report", e);
+ return false;
+ } catch (AssertionError e) {
+ LOG.info("Reports do not match (yet): {}", e.getMessage());
+ return false;
+ }
// Recon's UnhealthyContainerResponse contains a list of containers
// for a particular state. Check if RMs sample of containers can be
// found in Recon's list of containers for a particular state.
HealthState rmState = HealthState.UNHEALTHY;
- if (UnHealthyContainerStates.valueOf(containerState)
- .equals(UnHealthyContainerStates.MISSING) &&
- rmMissingCounter > 0) {
+ if (state.equals(UnHealthyContainerStates.MISSING) &&
+ reconResponse.getMissingCount() > 0) {
rmState = HealthState.MISSING;
- } else if (UnHealthyContainerStates.valueOf(containerState)
- .equals(UnHealthyContainerStates.UNDER_REPLICATED) &&
- rmUnderReplCounter > 0) {
+ } else if (state.equals(UnHealthyContainerStates.UNDER_REPLICATED) &&
+ reconResponse.getUnderReplicatedCount() > 0) {
rmState = HealthState.UNDER_REPLICATED;
- } else if (UnHealthyContainerStates.valueOf(containerState)
- .equals(UnHealthyContainerStates.OVER_REPLICATED) &&
- rmOverReplCounter > 0) {
+ } else if (state.equals(UnHealthyContainerStates.OVER_REPLICATED) &&
+ reconResponse.getOverReplicatedCount() > 0) {
rmState = HealthState.OVER_REPLICATED;
- } else if (UnHealthyContainerStates.valueOf(containerState)
- .equals(UnHealthyContainerStates.MIS_REPLICATED) &&
- rmMisReplCounter > 0) {
+ } else if (state.equals(UnHealthyContainerStates.MIS_REPLICATED) &&
+ reconResponse.getMisReplicatedCount() > 0) {
rmState = HealthState.MIS_REPLICATED;
}
@@ -372,7 +374,9 @@ public class TestReconAndAdminContainerCLI {
.stream()
.map(UnhealthyContainerMetadata::getContainerID)
.collect(Collectors.toList());
- Assertions.assertTrue(reconContainerIDs.containsAll(rmIDsToLong));
+ assertThat(reconContainerIDs).containsAll(rmIDsToLong);
+
+ return true;
}
private static long setupRatisKey(String keyName,
@@ -385,11 +389,11 @@ public class TestReconAndAdminContainerCLI {
List<Long> containerIDs = getContainerIdsForKey(omKeyInfo);
// The list has only 1 containerID.
- Assertions.assertEquals(1, containerIDs.size());
+ assertEquals(1, containerIDs.size());
long containerID = containerIDs.get(0);
// Verify Recon picked up the new container.
- Assertions.assertEquals(scmContainerManager.getContainers(),
+ assertEquals(scmContainerManager.getContainers(),
reconContainerManager.getContainers());
ReconContainerMetadataManager reconContainerMetadataManager =
@@ -427,7 +431,7 @@ public class TestReconAndAdminContainerCLI {
}
private static List<Long> getContainerIdsForKey(OmKeyInfo omKeyInfo) {
- Assertions.assertNotNull(omKeyInfo.getLatestVersionLocations());
+ assertNotNull(omKeyInfo.getLatestVersionLocations());
List<OmKeyLocationInfo> locations =
omKeyInfo.getLatestVersionLocations().getLocationList();
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconEndpointUtil.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconEndpointUtil.java
index 5f5f8b2351..002de94cb0 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconEndpointUtil.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/recon/TestReconEndpointUtil.java
@@ -24,6 +24,7 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.server.http.HttpConfig;
import org.apache.hadoop.hdfs.web.URLConnectionFactory;
import org.apache.hadoop.ozone.recon.api.types.UnhealthyContainersResponse;
+import org.hadoop.ozone.recon.schema.ContainerSchemaDefinition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -80,7 +81,7 @@ public final class TestReconEndpointUtil {
}
public static UnhealthyContainersResponse getUnhealthyContainersFromRecon(
- OzoneConfiguration conf, String containerState)
+ OzoneConfiguration conf,
ContainerSchemaDefinition.UnHealthyContainerStates containerState)
throws JsonProcessingException {
StringBuilder urlBuilder = new StringBuilder();
urlBuilder.append(getReconWebAddress(conf))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]