JacksonYao287 commented on a change in pull request #2349:
URL: https://github.com/apache/ozone/pull/2349#discussion_r663914240
##########
File path:
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
##########
@@ -471,6 +549,127 @@ private void updateInflightAction(final ContainerInfo
container,
}
}
+ /**
+ * add a move action for a given container.
+ *
+ * @param cid Container to move
+ * @param srcDn datanode to move from
+ * @param targetDn datanode to move to
+ */
+ public Optional<CompletableFuture<MoveResult>> move(ContainerID cid,
+ DatanodeDetails srcDn, DatanodeDetails targetDn)
+ throws ContainerNotFoundException, NodeNotFoundException {
+ LOG.info("receive a move requset about container {} , from {} to {}",
+ cid, srcDn.getUuid(), targetDn.getUuid());
+ Optional<CompletableFuture<MoveResult>> ret = Optional.empty();
+ if (!isRunning()) {
+ LOG.info("Replication Manager in not running. please start it first");
+ return ret;
+ }
+
+ /*
+ * make sure the flowing conditions are met:
+ * 1 the given two datanodes are in healthy state
+ * 2 the given container exists on the given source datanode
+ * 3 the given container does not exist on the given target datanode
+ * 4 the given container is in closed state
+ * 5 the giver container is not taking any inflight action
+ * 6 the given two datanodes are in IN_SERVICE state
+ *
+ * move is a combination of two steps : replication and deletion.
+ * if the conditions above are all met, then we take a conservative
+ * strategy here : replication can always be executed, but the execution
+ * of deletion always depends on placement policy
+ */
+
+ NodeStatus currentNodeStat = nodeManager.getNodeStatus(srcDn);
+ NodeState healthStat = currentNodeStat.getHealth();
+ NodeOperationalState operationalState =
+ currentNodeStat.getOperationalState();
+ if (healthStat != NodeState.HEALTHY) {
+ LOG.info("given source datanode is in {} state, " +
+ "not in HEALTHY state", healthStat);
+ return ret;
+ }
+ if (operationalState != NodeOperationalState.IN_SERVICE) {
+ LOG.info("given source datanode is in {} state, " +
+ "not in IN_SERVICE state", operationalState);
+ return ret;
+ }
+
+ currentNodeStat = nodeManager.getNodeStatus(targetDn);
+ healthStat = currentNodeStat.getHealth();
+ operationalState = currentNodeStat.getOperationalState();
+ if (healthStat != NodeState.HEALTHY) {
+ LOG.info("given target datanode is in {} state, " +
+ "not in HEALTHY state", healthStat);
+ return ret;
+ }
+ if (operationalState != NodeOperationalState.IN_SERVICE) {
+ LOG.info("given target datanode is in {} state, " +
+ "not in IN_SERVICE state", operationalState);
+ return ret;
+ }
+
+ // we need to synchronize on ContainerInfo, since it is
+ // shared by ICR/FCR handler and this.processContainer
+ // TODO: use a Read lock after introducing a RW lock into ContainerInfo
+ ContainerInfo cif = containerManager.getContainer(cid);
+ synchronized (cif) {
+ final Set<DatanodeDetails> replicas = containerManager
+ .getContainerReplicas(cid).stream()
+ .map(ContainerReplica::getDatanodeDetails)
+ .collect(Collectors.toSet());
+ if (replicas.contains(targetDn)) {
+ LOG.info("given container exists in the target Datanode");
+ return ret;
+ }
+ if (!replicas.contains(srcDn)) {
+ LOG.info("given container does not exist in the source Datanode");
+ return ret;
+ }
+
+ /*
+ * the reason why the given container should not be taking any inflight
+ * action is that: if the given container is being replicated or deleted,
+ * the num of its replica is not deterministic, so move operation issued
+ * by balancer may cause a nondeterministic result, so we should drop
+ * this option for this time.
+ * */
+
+ if (inflightReplication.containsKey(cid)) {
+ LOG.info("given container is in inflight replication");
+ return ret;
+ }
+ if (inflightDeletion.containsKey(cid)) {
+ LOG.info("given container is in inflight deletion");
+ return ret;
+ }
+
+ /*
+ * here, no need to see whether cid is in inflightMove, because
+ * these three map are all synchronized on ContainerInfo, if cid
+ * is in infligtMove , it must now being replicated or deleted,
+ * so it must be in inflightReplication or in infligthDeletion.
+ * thus, if we can not find cid in both of them , this cid must
+ * not be in inflightMove.
+ */
+
Review comment:
yea, it makes sense, i will change this
##########
File path:
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
##########
@@ -862,43 +1153,128 @@ private void handleOverReplicatedContainer(final
ContainerInfo container,
break;
}
}
- // After removing all unhealthy replicas, if the container is still over
- // replicated then we need to check if it is already mis-replicated.
- // If it is, we do no harm by removing excess replicas. However, if it is
- // not mis-replicated, then we can only remove replicas if they don't
- // make the container become mis-replicated.
- if (excess > 0) {
- eligibleReplicas.removeAll(unhealthyReplicas);
- Set<ContainerReplica> eligibleSet = new HashSet<>(eligibleReplicas);
- ContainerPlacementStatus ps =
- getPlacementStatus(eligibleSet, replicationFactor);
- for (ContainerReplica r : eligibleReplicas) {
- if (excess <= 0) {
+
+ eligibleReplicas.removeAll(unhealthyReplicas);
+ boolean isInMove = inflightMove.containsKey(id);
+ boolean isSourceDnInReplicaSet = false;
+ boolean isTargetDnInReplicaSet = false;
+
+ if (isInMove) {
+ Pair<DatanodeDetails, DatanodeDetails> movePair =
+ inflightMove.get(id);
+ final DatanodeDetails sourceDN = movePair.getKey();
+ isSourceDnInReplicaSet = eligibleReplicas.stream()
+ .anyMatch(r -> r.getDatanodeDetails().equals(sourceDN));
+ isTargetDnInReplicaSet = eligibleReplicas.stream()
+ .anyMatch(r -> r.getDatanodeDetails()
+ .equals(movePair.getValue()));
+ int sourceDnPos = 0;
+ for (int i = 0; i < eligibleReplicas.size(); i++) {
+ if (eligibleReplicas.get(i).getDatanodeDetails()
+ .equals(sourceDN)) {
+ sourceDnPos = i;
break;
}
- // First remove the replica we are working on from the set, and then
- // check if the set is now mis-replicated.
- eligibleSet.remove(r);
- ContainerPlacementStatus nowPS =
- getPlacementStatus(eligibleSet, replicationFactor);
- if ((!ps.isPolicySatisfied()
- && nowPS.actualPlacementCount() == ps.actualPlacementCount())
- || (ps.isPolicySatisfied() && nowPS.isPolicySatisfied())) {
- // Remove the replica if the container was already unsatisfied
- // and losing this replica keep actual placement count unchanged.
- // OR if losing this replica still keep satisfied
- sendDeleteCommand(container, r.getDatanodeDetails(), true);
- excess -= 1;
- continue;
+ }
+ if (isTargetDnInReplicaSet) {
+ if (isSourceDnInReplicaSet) {
+ // if the target is present, and source disappears somehow,
+ // we can consider move is successful.
+ inflightMoveFuture.get(id).complete(MoveResult.COMPLETED);
+ inflightMove.remove(id);
+ inflightMoveFuture.remove(id);
+ } else {
+ // if the container is in inflightMove and target datanode is
+ // included in the replicas, then swap the source datanode to
+ // first of the replica list if exists, so the source datanode
+ // will be first removed if possible.
+ eligibleReplicas.add(0, eligibleReplicas.remove(sourceDnPos));
+ }
Review comment:
yea, thanks @siddhantsangwan for pointing out this! I explained the
logic in the comment, which is just what you said, but i made a mistake in the
code. i will fix this , thanks!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]