This is an automated email from the ASF dual-hosted git repository.

ritesh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new ef480765d8 HDDS-7654. EC: ReplicationManager - merge mis-rep queue 
into under replicated queue (#4099)
ef480765d8 is described below

commit ef480765d8eb74fdcdfc9f6a47200c541c60b06a
Author: Stephen O'Donnell <[email protected]>
AuthorDate: Mon Dec 19 17:29:13 2022 +0000

    HDDS-7654. EC: ReplicationManager - merge mis-rep queue into under 
replicated queue (#4099)
    
    Co-authored-by: S O'Donnell <[email protected]>
---
 .../replication/ContainerHealthResult.java         | 35 ++++++++++++++++------
 .../container/replication/ReplicationManager.java  | 17 +++++++++--
 .../container/replication/ReplicationQueue.java    | 17 -----------
 .../replication/TestReplicationManager.java        | 15 ++++++++++
 .../health/TestECReplicationCheckHandler.java      |  6 +---
 .../health/TestRatisReplicationCheckHandler.java   |  5 +---
 6 files changed, 58 insertions(+), 37 deletions(-)

diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
index 5d80268ef4..6393f0349d 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
@@ -113,7 +113,15 @@ public class ContainerHealthResult {
     public UnderReplicatedHealthResult(ContainerInfo containerInfo,
         int remainingRedundancy, boolean dueToDecommission,
         boolean replicatedOkWithPending, boolean unrecoverable) {
-      super(containerInfo, HealthState.UNDER_REPLICATED);
+      this(containerInfo, remainingRedundancy, dueToDecommission,
+          replicatedOkWithPending, unrecoverable, 
HealthState.UNDER_REPLICATED);
+    }
+
+    protected UnderReplicatedHealthResult(ContainerInfo containerInfo,
+        int remainingRedundancy, boolean dueToDecommission,
+        boolean replicatedOkWithPending, boolean unrecoverable,
+        HealthState healthState) {
+      super(containerInfo, healthState);
       this.remainingRedundancy = remainingRedundancy;
       this.dueToDecommission = dueToDecommission;
       this.sufficientlyReplicatedAfterPending = replicatedOkWithPending;
@@ -148,7 +156,7 @@ public class ContainerHealthResult {
       if (dueToDecommission) {
         result += DECOMMISSION_REDUNDANCY;
       } else {
-        result += remainingRedundancy;
+        result += getRemainingRedundancy();
       }
       return result;
     }
@@ -207,19 +215,28 @@ public class ContainerHealthResult {
    * containers are not spread across enough racks.
    */
   public static class MisReplicatedHealthResult
-      extends ContainerHealthResult {
+      extends UnderReplicatedHealthResult {
 
-    private final boolean replicatedOkAfterPending;
+    /**
+     * In UnderReplicatedHealthState, DECOMMISSION_REDUNDANCY is defined as
+     * 5 so that containers which are really under replicated get fixed as a
+     * priority over decommissioning hosts. We have defined that a container
+     * can only be mis replicated if it is not over or under replicated. Fixing
+     * mis replication is arguably less important than competing a 
decommission.
+     * So as a lot of mis replicated container do not block decommission, we
+     * set the redundancy of mis replicated containers to 6 so they sort after
+     * under / over replicated and decommissioning replicas in the under
+     * replication queue.
+     */
+    private static final int MIS_REP_REDUNDANCY = 6;
 
     public MisReplicatedHealthResult(ContainerInfo containerInfo,
         boolean replicatedOkAfterPending) {
-      super(containerInfo, HealthState.MIS_REPLICATED);
-      this.replicatedOkAfterPending = replicatedOkAfterPending;
+      super(containerInfo, MIS_REP_REDUNDANCY, false,
+          replicatedOkAfterPending, false,
+          HealthState.MIS_REPLICATED);
     }
 
-    public boolean isReplicatedOkAfterPending() {
-      return replicatedOkAfterPending;
-    }
   }
 
   /**
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
index 351eec7178..5fd0ad2ea2 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
@@ -159,6 +159,7 @@ public class ReplicationManager implements SCMService {
   private ReplicationQueue replicationQueue;
   private final ECUnderReplicationHandler ecUnderReplicationHandler;
   private final ECOverReplicationHandler ecOverReplicationHandler;
+  private final ECMisReplicationHandler ecMisReplicationHandler;
   private final RatisUnderReplicationHandler ratisUnderReplicationHandler;
   private final RatisOverReplicationHandler ratisOverReplicationHandler;
   private final int maintenanceRedundancy;
@@ -223,6 +224,8 @@ public class ReplicationManager implements SCMService {
         ecContainerPlacement, conf, nodeManager, this);
     ecOverReplicationHandler =
         new ECOverReplicationHandler(ecContainerPlacement, nodeManager);
+    ecMisReplicationHandler = new ECMisReplicationHandler(ecContainerPlacement,
+        conf, nodeManager);
     ratisUnderReplicationHandler = new RatisUnderReplicationHandler(
         ratisContainerPlacement, conf, nodeManager);
     ratisOverReplicationHandler =
@@ -525,8 +528,18 @@ public class ReplicationManager implements SCMService {
     List<ContainerReplicaOp> pendingOps =
         containerReplicaPendingOps.getPendingOps(containerID);
     if (result.getContainerInfo().getReplicationType() == EC) {
-      return ecUnderReplicationHandler.processAndCreateCommands(replicas,
-          pendingOps, result, maintenanceRedundancy);
+      if (result.getHealthState()
+          == ContainerHealthResult.HealthState.UNDER_REPLICATED) {
+        return ecUnderReplicationHandler.processAndCreateCommands(replicas,
+            pendingOps, result, maintenanceRedundancy);
+      } else if (result.getHealthState()
+          == ContainerHealthResult.HealthState.MIS_REPLICATED) {
+        return ecMisReplicationHandler.processAndCreateCommands(replicas,
+            pendingOps, result, maintenanceRedundancy);
+      } else {
+        throw new IllegalArgumentException("Unexpected health state: "
+            + result.getHealthState());
+      }
     }
     return ratisUnderReplicationHandler.processAndCreateCommands(replicas,
         pendingOps, result, ratisMaintenanceMinReplicas);
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationQueue.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationQueue.java
index a14d21c76c..d27c1d9c61 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationQueue.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationQueue.java
@@ -32,8 +32,6 @@ public class ReplicationQueue {
       underRepQueue;
   private final Queue<ContainerHealthResult.OverReplicatedHealthResult>
       overRepQueue;
-  private final Queue<ContainerHealthResult.MisReplicatedHealthResult>
-      misRepQueue;
 
   public ReplicationQueue() {
     underRepQueue = new PriorityQueue<>(
@@ -42,7 +40,6 @@ public class ReplicationQueue {
         .thenComparing(ContainerHealthResult
             .UnderReplicatedHealthResult::getRequeueCount));
     overRepQueue = new LinkedList<>();
-    misRepQueue = new LinkedList<>();
   }
 
   public void enqueue(ContainerHealthResult.UnderReplicatedHealthResult
@@ -55,11 +52,6 @@ public class ReplicationQueue {
     overRepQueue.add(overReplicatedHealthResult);
   }
 
-  public void enqueue(ContainerHealthResult.MisReplicatedHealthResult
-      misReplicatedHealthResult) {
-    misRepQueue.add(misReplicatedHealthResult);
-  }
-
   public ContainerHealthResult.UnderReplicatedHealthResult
       dequeueUnderReplicatedContainer() {
     return underRepQueue.poll();
@@ -70,11 +62,6 @@ public class ReplicationQueue {
     return overRepQueue.poll();
   }
 
-  public ContainerHealthResult.MisReplicatedHealthResult
-      dequeueMisReplicatedContainer() {
-    return misRepQueue.poll();
-  }
-
   public int underReplicatedQueueSize() {
     return underRepQueue.size();
   }
@@ -83,8 +70,4 @@ public class ReplicationQueue {
     return overRepQueue.size();
   }
 
-  public int misReplicatedQueueSize() {
-    return misRepQueue.size();
-  }
-
 }
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
index f537ec5628..dd4a9e385c 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
@@ -390,6 +390,13 @@ public class TestReplicationManager {
 
   @Test
   public void testUnderReplicationQueuePopulated() {
+    // Make it always return mis-replicated. Only a perfectly replicated
+    // container should make it the mis-replicated state as under / over
+    // replicated take precedence.
+    Mockito.when(ecPlacementPolicy.validateContainerPlacement(
+            anyList(), anyInt()))
+        .thenReturn(new ContainerPlacementStatusDefault(1, 2, 3));
+
     ContainerInfo decomContainer = createContainerInfo(repConfig, 1,
         HddsProtos.LifeCycleState.CLOSED);
     addReplicas(decomContainer, ContainerReplicaProto.State.CLOSED,
@@ -404,6 +411,10 @@ public class TestReplicationManager {
         HddsProtos.LifeCycleState.CLOSED);
     addReplicas(underRep0, ContainerReplicaProto.State.CLOSED, 1, 2, 3);
 
+    ContainerInfo misRep = createContainerInfo(repConfig, 4,
+        HddsProtos.LifeCycleState.CLOSED);
+    addReplicas(misRep, ContainerReplicaProto.State.CLOSED, 1, 2, 3, 4, 5);
+
     enableProcessAll();
     replicationManager.processAll();
 
@@ -438,6 +449,10 @@ public class TestReplicationManager {
     res = replicationManager.dequeueUnderReplicatedContainer();
     Assert.assertEquals(underRep0, res.getContainerInfo());
 
+    // Next is the mis-rep container, which has a remaining redundancy of 6.
+    res = replicationManager.dequeueUnderReplicatedContainer();
+    Assert.assertEquals(misRep, res.getContainerInfo());
+
     res = replicationManager.dequeueUnderReplicatedContainer();
     Assert.assertNull(res);
   }
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
index 06f68ee91c..8b7aa665c8 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
@@ -481,9 +481,8 @@ public class TestECReplicationCheckHandler {
     Assert.assertEquals(HealthState.MIS_REPLICATED, result.getHealthState());
 
     Assert.assertTrue(healthCheck.handle(request));
-    Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
+    Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
     Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
-    Assert.assertEquals(1, repQueue.misReplicatedQueueSize());
     Assert.assertEquals(0, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
     Assert.assertEquals(0, report.getStat(
@@ -531,7 +530,6 @@ public class TestECReplicationCheckHandler {
     Assert.assertTrue(healthCheck.handle(request));
     Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
     Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
-    Assert.assertEquals(0, repQueue.misReplicatedQueueSize());
     Assert.assertEquals(0, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
     Assert.assertEquals(0, report.getStat(
@@ -567,7 +565,6 @@ public class TestECReplicationCheckHandler {
     Assert.assertTrue(healthCheck.handle(request));
     Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
     Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
-    Assert.assertEquals(0, repQueue.misReplicatedQueueSize());
     Assert.assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
     Assert.assertEquals(0, report.getStat(
@@ -604,7 +601,6 @@ public class TestECReplicationCheckHandler {
     Assert.assertTrue(healthCheck.handle(request));
     Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
     Assert.assertEquals(1, repQueue.overReplicatedQueueSize());
-    Assert.assertEquals(0, repQueue.misReplicatedQueueSize());
     Assert.assertEquals(0, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
     Assert.assertEquals(1, report.getStat(
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
index f6493b7817..9cc2c6263d 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
@@ -423,7 +423,6 @@ public class TestRatisReplicationCheckHandler {
     Assert.assertTrue(healthCheck.handle(requestBuilder.build()));
     Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
     Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
-    Assert.assertEquals(0, repQueue.misReplicatedQueueSize());
     Assert.assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
     Assert.assertEquals(0, report.getStat(
@@ -468,7 +467,6 @@ public class TestRatisReplicationCheckHandler {
     Assert.assertTrue(healthCheck.handle(requestBuilder.build()));
     Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
     Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
-    Assert.assertEquals(0, repQueue.misReplicatedQueueSize());
     Assert.assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
     Assert.assertEquals(0, report.getStat(
@@ -494,9 +492,8 @@ public class TestRatisReplicationCheckHandler {
     Assert.assertFalse(result.isReplicatedOkAfterPending());
 
     Assert.assertTrue(healthCheck.handle(requestBuilder.build()));
-    Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
+    Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
     Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
-    Assert.assertEquals(1, repQueue.misReplicatedQueueSize());
     Assert.assertEquals(0, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
     Assert.assertEquals(1, report.getStat(


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to