This is an automated email from the ASF dual-hosted git repository.
sodonnell pushed a commit to branch HDDS-1880-Decom
in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git
The following commit(s) were added to refs/heads/HDDS-1880-Decom by this push:
new f66c40c  HDDS-2728. Remove methods of internal representation from DatanodeAdminMonitor interface (#355)
f66c40c is described below
commit f66c40cce184dfaf099c6f1ffd3f3ec4d306d923
Author: Elek, Márton <[email protected]>
AuthorDate: Fri Dec 13 21:17:34 2019 +0100
    HDDS-2728. Remove methods of internal representation from DatanodeAdminMonitor interface (#355)
---
.../hadoop/hdds/scm/node/DatanodeAdminMonitor.java | 382 +--------------------
...nMonitor.java => DatanodeAdminMonitorImpl.java} | 102 +++---
.../scm/node/DatanodeAdminMonitorInterface.java | 43 ---
.../hdds/scm/node/NodeDecommissionManager.java | 17 +-
.../hdds/scm/server/StorageContainerManager.java | 2 +-
.../hdds/scm/node/TestDatanodeAdminMonitor.java | 8 +-
.../scm/node/TestDatanodeAdminNodeDetails.java | 2 -
.../hdds/scm/node/TestNodeDecommissionManager.java | 2 +-
8 files changed, 61 insertions(+), 497 deletions(-)
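In short, this commit collapses the setter-heavy DatanodeAdminMonitorInterface into a two-method DatanodeAdminMonitor interface and moves the monitor logic into a new DatanodeAdminMonitorImpl. For orientation, the surviving interface, reconstructed from the diff below, is just:

    package org.apache.hadoop.hdds.scm.node;

    import org.apache.hadoop.hdds.protocol.DatanodeDetails;

    /**
     * Interface used by the DatanodeAdminMonitor, which can be used to
     * decommission or recommission nodes and take them in and out of
     * maintenance.
     */
    public interface DatanodeAdminMonitor extends Runnable {
      // Queue a node for decommission or maintenance; endInHours only
      // applies to maintenance and is ignored for decommissioning nodes.
      void startMonitoring(DatanodeDetails dn, int endInHours);
      // Queue a node to be returned to service.
      void stopMonitoring(DatanodeDetails dn);
    }

The set* wiring methods and the internal counters (getPendingCount, getCancelledCount, getTrackedNodeCount) leave the interface; the counters remain as public methods on the implementation.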
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitor.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitor.java
index 6aabd62..9ead66f 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitor.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitor.java
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
- * <p>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p>
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,380 +17,16 @@
*/
package org.apache.hadoop.hdds.scm.node;
-import com.google.common.annotations.VisibleForTesting;
-import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
-import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
-import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState;
-import org.apache.hadoop.hdds.scm.container.ContainerID;
-import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
-import org.apache.hadoop.hdds.scm.container.ContainerReplicaCount;
-import org.apache.hadoop.hdds.scm.container.ReplicationManager;
-import org.apache.hadoop.hdds.scm.events.SCMEvents;
-import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
-import org.apache.hadoop.hdds.scm.pipeline.PipelineID;
-import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
-import org.apache.hadoop.hdds.server.events.EventPublisher;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.*;
/**
- * Monitor thread which watches for nodes to be decommissioned, recommissioned
- * or placed into maintenance. Newly added nodes are queued in pendingNodes
- * and recommissioned nodes are queued in cancelled nodes. On each monitor
- * 'tick', the cancelled nodes are processed and removed from the monitor.
- * Then any pending nodes are added to the trackedNodes set, where they stay
- * until decommission or maintenance has ended.
- *
- * Once a node is placed into tracked nodes, it goes through a workflow where
- * the following happens:
- *
- * 1. First an event is fired to close any pipelines on the node, which will
- * also close any containers.
- * 2. Next the containers on the node are obtained and checked to see if new
- * replicas are needed. If so, the new replicas are scheduled.
- * 3. After scheduling replication, the node remains pending until replication
- * has completed.
- * 4. At this stage the node will complete decommission or enter maintenance.
- * 5. Maintenance nodes will remain tracked by this monitor until maintenance
- * is manually ended, or the maintenance window expires.
+ * Interface used by the DatanodeAdminMonitor, which can be used to
+ * decommission or recommission nodes and take them in and out of maintenance.
*/
-public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
-
- private OzoneConfiguration conf;
- private EventPublisher eventQueue;
- private NodeManager nodeManager;
- private PipelineManager pipelineManager;
- private ReplicationManager replicationManager;
- private Queue<DatanodeAdminNodeDetails> pendingNodes = new ArrayDeque();
- private Queue<DatanodeAdminNodeDetails> cancelledNodes = new ArrayDeque();
- private Set<DatanodeAdminNodeDetails> trackedNodes = new HashSet<>();
-
- private static final Logger LOG =
- LoggerFactory.getLogger(DatanodeAdminMonitor.class);
-
- public DatanodeAdminMonitor(OzoneConfiguration config) {
- conf = config;
- }
-
- @Override
- public void setConf(OzoneConfiguration config) {
- conf = config;
- }
-
- @Override
- public void setEventQueue(EventPublisher eventQueue) {
- this.eventQueue = eventQueue;
- }
-
- @Override
- public void setNodeManager(NodeManager nm) {
- nodeManager = nm;
- }
-
- @Override
- public void setPipelineManager(PipelineManager pm) {
- pipelineManager = pm;
- }
-
- @Override
- public void setReplicationManager(ReplicationManager rm) {
- replicationManager = rm;
- }
-
- /**
- * Add a node to the decommission or maintenance workflow. The node will be
- * queued and added to the workflow after a defined interval.
- *
- * @param dn The datanode to move into an admin state
- * @param endInHours For nodes going into maintenance, the number of hours
- * from now for maintenance to automatically end. Ignored
- * for decommissioning nodes.
- */
- @Override
-  public synchronized void startMonitoring(DatanodeDetails dn, int endInHours) {
- DatanodeAdminNodeDetails nodeDetails =
- new DatanodeAdminNodeDetails(dn, endInHours);
- cancelledNodes.remove(nodeDetails);
- pendingNodes.add(nodeDetails);
- }
-
- /**
- * Remove a node from the decommission or maintenance workflow, and return it
- * to service. The node will be queued and removed from decommission or
- * maintenance after a defined interval.
- * @param dn The datanode for which to stop decommission or maintenance.
- */
- @Override
- public synchronized void stopMonitoring(DatanodeDetails dn) {
- DatanodeAdminNodeDetails nodeDetails = new DatanodeAdminNodeDetails(dn, 0);
- pendingNodes.remove(nodeDetails);
- cancelledNodes.add(nodeDetails);
- }
-
- /**
- * Run an iteration of the monitor. This is the main run loop, and performs
- * the following checks:
- *
- * 1. Check for any cancelled nodes and process them
- * 2. Check for any newly added nodes and add them to the workflow
- * 3. Perform checks on the transitioning nodes and move them through the
- * workflow until they have completed decommission or maintenance
- */
- @Override
- public void run() {
- try {
- synchronized (this) {
- processCancelledNodes();
- processPendingNodes();
- }
- processTransitioningNodes();
- if (trackedNodes.size() > 0 || pendingNodes.size() > 0) {
- LOG.info("There are {} nodes tracked for decommission and "+
- "maintenance. {} pending nodes.",
- trackedNodes.size(), pendingNodes.size());
- }
- } catch (Exception e) {
- LOG.error("Caught an error in the DatanodeAdminMonitor", e);
- // Intentionally do not re-throw, as if we do the monitor thread
- // will not get rescheduled.
- }
- }
-
- @Override
- public int getPendingCount() {
- return pendingNodes.size();
- }
-
- @Override
- public int getCancelledCount() {
- return cancelledNodes.size();
- }
-
- @Override
- public int getTrackedNodeCount() {
- return trackedNodes.size();
- }
-
- @VisibleForTesting
- public Set<DatanodeAdminNodeDetails> getTrackedNodes() {
- return trackedNodes;
- }
-
- private void processCancelledNodes() {
- while(!cancelledNodes.isEmpty()) {
- DatanodeAdminNodeDetails dn = cancelledNodes.poll();
- try {
- stopTrackingNode(dn);
- putNodeBackInService(dn);
- LOG.info("Recommissioned node {}", dn.getDatanodeDetails());
- } catch (NodeNotFoundException e) {
- LOG.warn("Failed processing the cancel admin request for {}",
- dn.getDatanodeDetails(), e);
- }
- // TODO - fire event to bring node back into service?
- }
- }
-
- private void processPendingNodes() {
- while(!pendingNodes.isEmpty()) {
- startTrackingNode(pendingNodes.poll());
- }
- }
-
- private void processTransitioningNodes() {
- Iterator<DatanodeAdminNodeDetails> iterator = trackedNodes.iterator();
- while (iterator.hasNext()) {
- DatanodeAdminNodeDetails dn = iterator.next();
- try {
- NodeStatus status = getNodeStatus(dn.getDatanodeDetails());
-
- if (!shouldContinueWorkflow(dn, status)) {
- abortWorkflow(dn);
- iterator.remove();
- continue;
- }
-
- if (status.isMaintenance()) {
- if (dn.shouldMaintenanceEnd()) {
- completeMaintenance(dn);
- iterator.remove();
- continue;
- }
- }
-
- if (status.isDecommissioning() || status.isEnteringMaintenance()) {
- if (checkPipelinesClosedOnNode(dn)
- && checkContainersReplicatedOnNode(dn)) {
- // CheckContainersReplicatedOnNode may take a short time to run
- // so after it completes, re-get the nodestatus to check the health
- // and ensure the state is still good to continue
- status = getNodeStatus(dn.getDatanodeDetails());
- if (status.isDead()) {
- LOG.warn("Datanode {} is dead and the admin workflow cannot "+
- "continue. The node will be put back to IN_SERVICE and "+
- "handled as a dead node", dn);
- putNodeBackInService(dn);
- iterator.remove();
- } else if (status.isDecommissioning()) {
- completeDecommission(dn);
- iterator.remove();
- } else if (status.isEnteringMaintenance()) {
- putIntoMaintenance(dn);
- }
- }
- }
-
- } catch (NodeNotFoundException e) {
- LOG.error("An unexpected error occurred processing datanode {}. " +
- "Aborting the admin workflow", dn.getDatanodeDetails(), e);
- abortWorkflow(dn);
- iterator.remove();
- }
- }
- }
-
- /**
- * Checks if a node is in an unexpected state or has gone dead while
- * decommissioning or entering maintenance. If the node is not in a valid
- * state to continue the admin workflow, return false, otherwise return true.
- * @param dn The Datanode for which to check the current state
- * @param nodeStatus The current NodeStatus for the datanode
- * @return True if admin can continue, false otherwise
- */
- private boolean shouldContinueWorkflow(DatanodeAdminNodeDetails dn,
- NodeStatus nodeStatus) {
- if (!nodeStatus.isDecommission() && !nodeStatus.isMaintenance()) {
- LOG.warn("Datanode {} has an operational state of {} when it should "+
- "be undergoing decommission or maintenance. Aborting admin for "+
- "this node.",
- dn.getDatanodeDetails(), nodeStatus.getOperationalState());
- return false;
- }
- if (nodeStatus.isDead() && !nodeStatus.isInMaintenance()) {
- LOG.error("Datanode {} is dead but is not IN_MAINTENANCE. Aborting the "+
- "admin workflow for this node", dn.getDatanodeDetails());
- return false;
- }
- return true;
- }
-
- private boolean checkPipelinesClosedOnNode(DatanodeAdminNodeDetails dn) {
- DatanodeDetails dnd = dn.getDatanodeDetails();
- Set<PipelineID> pipelines = nodeManager.getPipelines(dnd);
- if (pipelines == null || pipelines.size() == 0
- || dn.shouldMaintenanceEnd()) {
- return true;
- } else {
- LOG.info("Waiting for pipelines to close for {}. There are {} "+
- "pipelines", dnd, pipelines.size());
- return false;
- }
- }
-
- private boolean checkContainersReplicatedOnNode(DatanodeAdminNodeDetails dn)
- throws NodeNotFoundException {
- int sufficientlyReplicated = 0;
- int underReplicated = 0;
- int unhealthy = 0;
- Set<ContainerID> containers =
- nodeManager.getContainers(dn.getDatanodeDetails());
- for(ContainerID cid : containers) {
- try {
- ContainerReplicaCount replicaSet =
- replicationManager.getContainerReplicaCount(cid);
- if (replicaSet.isSufficientlyReplicated()) {
- sufficientlyReplicated++;
- } else {
- underReplicated++;
- }
- if (!replicaSet.isHealthy()) {
- unhealthy++;
- }
- } catch (ContainerNotFoundException e) {
- LOG.warn("ContainerID {} present in node list for {} but not found "+
- "in containerManager", cid, dn.getDatanodeDetails());
- }
- }
- dn.setSufficientlyReplicatedContainers(sufficientlyReplicated);
- dn.setUnderReplicatedContainers(underReplicated);
- dn.setUnHealthyContainers(unhealthy);
-
- return underReplicated == 0 && unhealthy == 0;
- }
-
- private void completeDecommission(DatanodeAdminNodeDetails dn)
- throws NodeNotFoundException{
- setNodeOpState(dn, NodeOperationalState.DECOMMISSIONED);
- LOG.info("Datanode {} has completed the admin workflow. The operational "+
- "state has been set to {}", dn.getDatanodeDetails(),
- NodeOperationalState.DECOMMISSIONED);
- }
-
- private void putIntoMaintenance(DatanodeAdminNodeDetails dn)
- throws NodeNotFoundException {
- LOG.info("Datanode {} has entered maintenance", dn.getDatanodeDetails());
- setNodeOpState(dn, NodeOperationalState.IN_MAINTENANCE);
- }
-
- private void completeMaintenance(DatanodeAdminNodeDetails dn)
- throws NodeNotFoundException {
- // The end state of Maintenance is to put the node back IN_SERVICE, whether
- // it is dead or not.
- // TODO - if the node is dead do we trigger a dead node event here or leave
- // it to the heartbeat manager?
- LOG.info("Datanode {} has ended maintenance automatically",
- dn.getDatanodeDetails());
- putNodeBackInService(dn);
- }
-
- private void startTrackingNode(DatanodeAdminNodeDetails dn) {
- eventQueue.fireEvent(SCMEvents.START_ADMIN_ON_NODE,
- dn.getDatanodeDetails());
- trackedNodes.add(dn);
- }
-
- private void stopTrackingNode(DatanodeAdminNodeDetails dn) {
- trackedNodes.remove(dn);
- }
-
- /**
- * If we encounter an unexpected condition in maintenance, we must abort the
- * workflow by setting the node operationalState back to IN_SERVICE and then
- * remove the node from tracking.
- * @param dn The datanode for which to abort tracking
- */
- private void abortWorkflow(DatanodeAdminNodeDetails dn) {
- try {
- putNodeBackInService(dn);
- } catch (NodeNotFoundException e) {
- LOG.error("Unable to set the node OperationalState for {} while "+
- "aborting the datanode admin workflow", dn.getDatanodeDetails());
- }
- }
-
- private void putNodeBackInService(DatanodeAdminNodeDetails dn)
- throws NodeNotFoundException {
- setNodeOpState(dn, NodeOperationalState.IN_SERVICE);
- }
+public interface DatanodeAdminMonitor extends Runnable {
- private void setNodeOpState(DatanodeAdminNodeDetails dn,
- HddsProtos.NodeOperationalState state) throws NodeNotFoundException {
- nodeManager.setNodeOperationalState(dn.getDatanodeDetails(), state);
- }
+ void startMonitoring(DatanodeDetails dn, int endInHours);
- // TODO - The nodeManager.getNodeStatus call should really throw
- // NodeNotFoundException rather than having to handle it here as all
- // registered nodes must have a status.
- private NodeStatus getNodeStatus(DatanodeDetails dnd)
- throws NodeNotFoundException {
- NodeStatus nodeStatus = nodeManager.getNodeStatus(dnd);
- if (nodeStatus == null) {
- throw new NodeNotFoundException("Unable to retrieve the nodeStatus");
- }
- return nodeStatus;
- }
+ void stopMonitoring(DatanodeDetails dn);
-}
\ No newline at end of file
+}
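For readers skimming the patch, the operational-state transitions the monitor drives are unchanged by this refactor; a comment-only summary, reconstructed from the setNodeOpState(...) calls retained in the implementation below:

    // DECOMMISSIONING          -> DECOMMISSIONED  (completeDecommission)
    // ENTERING_MAINTENANCE     -> IN_MAINTENANCE  (putIntoMaintenance)
    // IN_MAINTENANCE           -> IN_SERVICE      (completeMaintenance)
    // dead or unexpected state -> IN_SERVICE      (abortWorkflow)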
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitor.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
similarity index 89%
copy from hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitor.java
copy to hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
index 6aabd62..931a45e 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitor.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.hdds.scm.node;
import com.google.common.annotations.VisibleForTesting;
+
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
@@ -29,8 +30,8 @@ import org.apache.hadoop.hdds.scm.container.ReplicationManager;
import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
import org.apache.hadoop.hdds.scm.pipeline.PipelineID;
-import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.server.events.EventPublisher;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -43,68 +44,49 @@ import java.util.*;
* 'tick', the cancelled nodes are processed and removed from the monitor.
* Then any pending nodes are added to the trackedNodes set, where they stay
* until decommission or maintenance has ended.
- *
+ * <p>
* Once a node is placed into tracked nodes, it goes through a workflow where
* the following happens:
- *
+ * <p>
* 1. First an event is fired to close any pipelines on the node, which will
- * also close any containers.
+ * also close any containers.
* 2. Next the containers on the node are obtained and checked to see if new
- * replicas are needed. If so, the new replicas are scheduled.
+ * replicas are needed. If so, the new replicas are scheduled.
* 3. After scheduling replication, the node remains pending until replication
- * has completed.
+ * has completed.
* 4. At this stage the node will complete decommission or enter maintenance.
* 5. Maintenance nodes will remain tracked by this monitor until maintenance
- * is manually ended, or the maintenance window expires.
+ * is manually ended, or the maintenance window expires.
*/
-public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
+public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
private OzoneConfiguration conf;
private EventPublisher eventQueue;
private NodeManager nodeManager;
- private PipelineManager pipelineManager;
private ReplicationManager replicationManager;
private Queue<DatanodeAdminNodeDetails> pendingNodes = new ArrayDeque();
private Queue<DatanodeAdminNodeDetails> cancelledNodes = new ArrayDeque();
private Set<DatanodeAdminNodeDetails> trackedNodes = new HashSet<>();
private static final Logger LOG =
- LoggerFactory.getLogger(DatanodeAdminMonitor.class);
-
- public DatanodeAdminMonitor(OzoneConfiguration config) {
- conf = config;
- }
-
- @Override
- public void setConf(OzoneConfiguration config) {
- conf = config;
- }
-
- @Override
- public void setEventQueue(EventPublisher eventQueue) {
+ LoggerFactory.getLogger(DatanodeAdminMonitorImpl.class);
+
+ public DatanodeAdminMonitorImpl(
+ OzoneConfiguration conf,
+ EventPublisher eventQueue,
+ NodeManager nodeManager,
+ ReplicationManager replicationManager) {
+ this.conf = conf;
this.eventQueue = eventQueue;
- }
-
- @Override
- public void setNodeManager(NodeManager nm) {
- nodeManager = nm;
- }
-
- @Override
- public void setPipelineManager(PipelineManager pm) {
- pipelineManager = pm;
- }
-
- @Override
- public void setReplicationManager(ReplicationManager rm) {
- replicationManager = rm;
+ this.nodeManager = nodeManager;
+ this.replicationManager = replicationManager;
}
/**
* Add a node to the decommission or maintenance workflow. The node will be
* queued and added to the workflow after a defined interval.
*
- * @param dn The datanode to move into an admin state
+ * @param dn The datanode to move into an admin state
* @param endInHours For nodes going into maintenance, the number of hours
* from now for maintenance to automatically end. Ignored
* for decommissioning nodes.
@@ -121,6 +103,7 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
* Remove a node from the decommission or maintenance workflow, and return it
* to service. The node will be queued and removed from decommission or
* maintenance after a defined interval.
+ *
* @param dn The datanode for which to stop decommission or maintenance.
*/
@Override
@@ -133,11 +116,11 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
/**
* Run an iteration of the monitor. This is the main run loop, and performs
* the following checks:
- *
+ * <p>
* 1. Check for any cancelled nodes and process them
* 2. Check for any newly added nodes and add them to the workflow
* 3. Perform checks on the transitioning nodes and move them through the
- * workflow until they have completed decommission or maintenance
+ * workflow until they have completed decommission or maintenance
*/
@Override
public void run() {
@@ -148,8 +131,8 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
}
processTransitioningNodes();
if (trackedNodes.size() > 0 || pendingNodes.size() > 0) {
- LOG.info("There are {} nodes tracked for decommission and "+
- "maintenance. {} pending nodes.",
+ LOG.info("There are {} nodes tracked for decommission and " +
+ "maintenance. {} pending nodes.",
trackedNodes.size(), pendingNodes.size());
}
} catch (Exception e) {
@@ -159,17 +142,14 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
}
}
- @Override
public int getPendingCount() {
return pendingNodes.size();
}
- @Override
public int getCancelledCount() {
return cancelledNodes.size();
}
- @Override
public int getTrackedNodeCount() {
return trackedNodes.size();
}
@@ -180,7 +160,7 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
}
private void processCancelledNodes() {
- while(!cancelledNodes.isEmpty()) {
+ while (!cancelledNodes.isEmpty()) {
DatanodeAdminNodeDetails dn = cancelledNodes.poll();
try {
stopTrackingNode(dn);
@@ -195,7 +175,7 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
}
private void processPendingNodes() {
- while(!pendingNodes.isEmpty()) {
+ while (!pendingNodes.isEmpty()) {
startTrackingNode(pendingNodes.poll());
}
}
@@ -229,8 +209,8 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
// and ensure the state is still good to continue
status = getNodeStatus(dn.getDatanodeDetails());
if (status.isDead()) {
- LOG.warn("Datanode {} is dead and the admin workflow cannot "+
- "continue. The node will be put back to IN_SERVICE and "+
+ LOG.warn("Datanode {} is dead and the admin workflow cannot " +
+ "continue. The node will be put back to IN_SERVICE and " +
"handled as a dead node", dn);
putNodeBackInService(dn);
iterator.remove();
@@ -256,21 +236,22 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
* Checks if a node is in an unexpected state or has gone dead while
* decommissioning or entering maintenance. If the node is not in a valid
* state to continue the admin workflow, return false, otherwise return true.
- * @param dn The Datanode for which to check the current state
+ *
+ * @param dn The Datanode for which to check the current state
* @param nodeStatus The current NodeStatus for the datanode
* @return True if admin can continue, false otherwise
*/
private boolean shouldContinueWorkflow(DatanodeAdminNodeDetails dn,
NodeStatus nodeStatus) {
if (!nodeStatus.isDecommission() && !nodeStatus.isMaintenance()) {
- LOG.warn("Datanode {} has an operational state of {} when it should "+
- "be undergoing decommission or maintenance. Aborting admin for "+
+ LOG.warn("Datanode {} has an operational state of {} when it should " +
+          "be undergoing decommission or maintenance. Aborting admin for " +
          "this node.",
dn.getDatanodeDetails(), nodeStatus.getOperationalState());
return false;
}
if (nodeStatus.isDead() && !nodeStatus.isInMaintenance()) {
- LOG.error("Datanode {} is dead but is not IN_MAINTENANCE. Aborting the "+
+      LOG.error("Datanode {} is dead but is not IN_MAINTENANCE. Aborting the " +
          "admin workflow for this node", dn.getDatanodeDetails());
return false;
}
@@ -284,7 +265,7 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
|| dn.shouldMaintenanceEnd()) {
return true;
} else {
- LOG.info("Waiting for pipelines to close for {}. There are {} "+
+ LOG.info("Waiting for pipelines to close for {}. There are {} " +
"pipelines", dnd, pipelines.size());
return false;
}
@@ -297,7 +278,7 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
int unhealthy = 0;
Set<ContainerID> containers =
nodeManager.getContainers(dn.getDatanodeDetails());
- for(ContainerID cid : containers) {
+ for (ContainerID cid : containers) {
try {
ContainerReplicaCount replicaSet =
replicationManager.getContainerReplicaCount(cid);
@@ -310,7 +291,7 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
unhealthy++;
}
} catch (ContainerNotFoundException e) {
- LOG.warn("ContainerID {} present in node list for {} but not found "+
+ LOG.warn("ContainerID {} present in node list for {} but not found " +
"in containerManager", cid, dn.getDatanodeDetails());
}
}
@@ -322,10 +303,10 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
}
private void completeDecommission(DatanodeAdminNodeDetails dn)
- throws NodeNotFoundException{
+ throws NodeNotFoundException {
setNodeOpState(dn, NodeOperationalState.DECOMMISSIONED);
- LOG.info("Datanode {} has completed the admin workflow. The operational "+
- "state has been set to {}", dn.getDatanodeDetails(),
+ LOG.info("Datanode {} has completed the admin workflow. The operational " +
+ "state has been set to {}", dn.getDatanodeDetails(),
NodeOperationalState.DECOMMISSIONED);
}
@@ -360,13 +341,14 @@ public class DatanodeAdminMonitor implements DatanodeAdminMonitorInterface {
* If we encounter an unexpected condition in maintenance, we must abort the
* workflow by setting the node operationalState back to IN_SERVICE and then
* remove the node from tracking.
+ *
* @param dn The datanode for which to abort tracking
*/
private void abortWorkflow(DatanodeAdminNodeDetails dn) {
try {
putNodeBackInService(dn);
} catch (NodeNotFoundException e) {
- LOG.error("Unable to set the node OperationalState for {} while "+
+ LOG.error("Unable to set the node OperationalState for {} while " +
"aborting the datanode admin workflow", dn.getDatanodeDetails());
}
}
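With the setters gone, DatanodeAdminMonitorImpl takes all of its collaborators up front, and the unused PipelineManager dependency disappears entirely. A minimal sketch of the new wiring, using the constructor signature and scheduling call from this patch (local variable names as in NodeDecommissionManager):

    // Constructor injection replaces the old setConf/setEventQueue/
    // setNodeManager/setPipelineManager/setReplicationManager chain.
    DatanodeAdminMonitor monitor = new DatanodeAdminMonitorImpl(
        conf,                 // OzoneConfiguration
        eventQueue,           // EventPublisher, fires SCMEvents.START_ADMIN_ON_NODE
        nodeManager,          // NodeManager: node status, pipelines, containers
        replicationManager);  // ReplicationManager: container replica counts

    // The interface still extends Runnable, so scheduling is unchanged:
    executor.scheduleAtFixedRate(monitor, monitorInterval, monitorInterval,
        TimeUnit.SECONDS);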
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorInterface.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorInterface.java
deleted file mode 100644
index cfbbffb..0000000
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorInterface.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hdds.scm.node;
-
-import org.apache.hadoop.hdds.conf.OzoneConfiguration;
-import org.apache.hadoop.hdds.protocol.DatanodeDetails;
-import org.apache.hadoop.hdds.scm.container.ReplicationManager;
-import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
-import org.apache.hadoop.hdds.server.events.EventPublisher;
-
-/**
- * Interface used by the DatanodeAdminMonitor, which can be used to
- * decommission or recommission nodes and take them in and out of maintenance.
- */
-public interface DatanodeAdminMonitorInterface extends Runnable {
-
- void setConf(OzoneConfiguration conf);
- void setEventQueue(EventPublisher scm);
- void setNodeManager(NodeManager nm);
- void setPipelineManager(PipelineManager pm);
- void setReplicationManager(ReplicationManager rm);
- void startMonitoring(DatanodeDetails dn, int endInHours);
- void stopMonitoring(DatanodeDetails dn);
-
- int getPendingCount();
- int getCancelledCount();
- int getTrackedNodeCount();
-}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
index f76929e..4c9bf9c 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
@@ -25,7 +25,6 @@ import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.container.ContainerManager;
import org.apache.hadoop.hdds.scm.container.ReplicationManager;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
-import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.server.events.EventPublisher;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.slf4j.Logger;
@@ -47,10 +46,9 @@ import java.util.concurrent.TimeUnit;
public class NodeDecommissionManager {
private ScheduledExecutorService executor;
- private DatanodeAdminMonitorInterface monitor;
+ private DatanodeAdminMonitor monitor;
private NodeManager nodeManager;
- private PipelineManager pipelineManager;
//private ContainerManager containerManager;
private EventPublisher eventQueue;
private ReplicationManager replicationManager;
@@ -168,11 +166,10 @@ public class NodeDecommissionManager {
}
public NodeDecommissionManager(OzoneConfiguration config, NodeManager nm,
- PipelineManager pm, ContainerManager containerManager,
+ ContainerManager containerManager,
EventPublisher eventQueue, ReplicationManager rm) {
this.nodeManager = nm;
conf = config;
- this.pipelineManager = pm;
//this.containerManager = containerManager;
this.eventQueue = eventQueue;
this.replicationManager = rm;
@@ -201,19 +198,15 @@ public class NodeDecommissionManager {
TimeUnit.SECONDS);
}
- monitor = new DatanodeAdminMonitor(conf);
- monitor.setConf(conf);
- monitor.setEventQueue(this.eventQueue);
- monitor.setNodeManager(nodeManager);
- monitor.setPipelineManager(pipelineManager);
- monitor.setReplicationManager(replicationManager);
+ monitor = new DatanodeAdminMonitorImpl(conf, eventQueue, nodeManager,
+ replicationManager);
executor.scheduleAtFixedRate(monitor, monitorInterval, monitorInterval,
TimeUnit.SECONDS);
}
@VisibleForTesting
- public DatanodeAdminMonitorInterface getMonitor() {
+ public DatanodeAdminMonitor getMonitor() {
return monitor;
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
index 651f259..a7a75eb 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
@@ -340,7 +340,7 @@ public final class StorageContainerManager extends ServiceRuntimeInfoImpl
pipelineManager);
scmDecommissionManager = new NodeDecommissionManager(conf, scmNodeManager,
- pipelineManager, containerManager, eventQueue, replicationManager);
+ containerManager, eventQueue, replicationManager);
eventQueue.addHandler(SCMEvents.DATANODE_COMMAND, scmNodeManager);
eventQueue.addHandler(SCMEvents.RETRIABLE_DATANODE_COMMAND,
scmNodeManager);
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
index 98ad037..5b74750 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
@@ -52,7 +52,7 @@ public class TestDatanodeAdminMonitor {
private SimpleMockNodeManager nodeManager;
private OzoneConfiguration conf;
- private DatanodeAdminMonitor monitor;
+ private DatanodeAdminMonitorImpl monitor;
private DatanodeAdminHandler startAdminHandler;
private ReplicationManager repManager;
private EventQueue eventQueue;
@@ -69,10 +69,8 @@ public class TestDatanodeAdminMonitor {
repManager = Mockito.mock(ReplicationManager.class);
- monitor = new DatanodeAdminMonitor(conf);
- monitor.setEventQueue(eventQueue);
- monitor.setNodeManager(nodeManager);
- monitor.setReplicationManager(repManager);
+ monitor =
+        new DatanodeAdminMonitorImpl(conf, eventQueue, nodeManager, repManager);
}
@After
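Because the counters are no longer on the interface, the test keeps a reference to the concrete DatanodeAdminMonitorImpl, as above. A hedged sketch of driving one monitor tick from a test (method names from this patch; the DatanodeDetails dn and the mocks are assumed to be set up as in the test class):

    DatanodeAdminMonitorImpl monitor =
        new DatanodeAdminMonitorImpl(conf, eventQueue, nodeManager, repManager);
    monitor.startMonitoring(dn, 0); // endInHours is ignored for decommission
    monitor.run();                  // one synchronous monitor tick
    int pending = monitor.getPendingCount();      // drained by the tick
    int tracked = monitor.getTrackedNodeCount();  // node now tracked, if its
                                                  // status allows the workflow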
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminNodeDetails.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminNodeDetails.java
index 916e383..c5310b9 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminNodeDetails.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminNodeDetails.java
@@ -35,12 +35,10 @@ import static org.junit.Assert.assertNotEquals;
public class TestDatanodeAdminNodeDetails {
private OzoneConfiguration conf;
- private DatanodeAdminMonitor monitor;
@Before
public void setup() {
conf = new OzoneConfiguration();
- monitor = new DatanodeAdminMonitor(conf);
}
@After
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java
index 2da7a61..47055f5 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java
@@ -56,7 +56,7 @@ public class TestNodeDecommissionManager {
conf.set(HddsConfigKeys.OZONE_METADATA_DIRS, storageDir);
nodeManager = createNodeManager(conf);
decom = new NodeDecommissionManager(
- conf, nodeManager, null, null, null, null);
+ conf, nodeManager, null, null, null);
}
@Test
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]