This is an automated email from the ASF dual-hosted git repository. elek pushed a commit to branch HDDS-2196 in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git
commit edda9a6dead848cb16d4691300243d8748b33051 Author: S O'Donnell <sodonn...@cloudera.com> AuthorDate: Mon Sep 23 18:23:47 2019 +0100 Create admin commands and protobuf messages to allow decommission / recommission and maintenance commands to be sent from the CLI and update the node status in a skeleton decommission manager --- .../hdds/scm/client/ContainerOperationClient.java | 16 ++ .../apache/hadoop/hdds/scm/client/ScmClient.java | 29 +++ .../protocol/StorageContainerLocationProtocol.java | 7 + ...inerLocationProtocolClientSideTranslatorPB.java | 58 +++++ .../proto/StorageContainerLocationProtocol.proto | 45 +++- .../hdds/scm/node/InvalidHostStringException.java | 34 +++ .../hdds/scm/node/NodeDecommissionManager.java | 286 +++++++++++++++++++++ .../apache/hadoop/hdds/scm/node/NodeManager.java | 14 +- .../hadoop/hdds/scm/node/NodeStateManager.java | 15 ++ .../hadoop/hdds/scm/node/SCMNodeManager.java | 19 +- ...inerLocationProtocolServerSideTranslatorPB.java | 54 ++++ .../hdds/scm/server/SCMClientProtocolServer.java | 37 +++ .../hdds/scm/server/StorageContainerManager.java | 15 ++ .../hadoop/hdds/scm/container/MockNodeManager.java | 11 +- .../hdds/scm/node/TestNodeDecommissionManager.java | 253 ++++++++++++++++++ .../hadoop/hdds/scm/node/TestNodeStateManager.java | 17 ++ .../testutils/ReplicationNodeManagerMock.java | 34 ++- .../org/apache/hadoop/hdds/scm/cli/SCMCLI.java | 4 +- .../hdds/scm/cli/node/DatanodeAdminCommands.java | 55 ++++ .../node/DatanodeAdminDecommissionSubCommand.java | 58 +++++ .../node/DatanodeAdminMaintenanceSubCommand.java | 63 +++++ .../node/DatanodeAdminRecommissionSubCommand.java | 58 +++++ .../hadoop/hdds/scm/cli/node/package-info.java | 23 ++ .../scm/node/TestDecommissionAndMaintenance.java | 137 ++++++++++ 24 files changed, 1322 insertions(+), 20 deletions(-) diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/client/ContainerOperationClient.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/client/ContainerOperationClient.java index c97354f..15fc09b 100644 --- a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/client/ContainerOperationClient.java +++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/client/ContainerOperationClient.java @@ -210,6 +210,22 @@ public class ContainerOperationClient implements ScmClient { poolName); } + @Override + public void decommissionNodes(List<String> hosts) throws IOException { + storageContainerLocationClient.decommissionNodes(hosts); + } + + @Override + public void recommissionNodes(List<String> hosts) throws IOException { + storageContainerLocationClient.recommissionNodes(hosts); + } + + @Override + public void startMaintenanceNodes(List<String> hosts, int endHours) + throws IOException { + storageContainerLocationClient.startMaintenanceNodes(hosts, endHours); + } + /** * Creates a specified replication pipeline. */ diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java index 226ceda..8238e84 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java @@ -162,6 +162,35 @@ public interface ScmClient extends Closeable { HddsProtos.QueryScope queryScope, String poolName) throws IOException; /** + * Allows a list of hosts to be decommissioned. The hosts are identified + * by their hostname and optionally port in the format foo.com:port. + * @param hosts A list of hostnames, optionally with port + * @throws IOException + */ + void decommissionNodes(List<String> hosts) throws IOException; + + /** + * Allows a list of hosts in maintenance or decommission states to be placed + * back in service. The hosts are identified by their hostname and optionally + * port in the format foo.com:port. + * @param hosts A list of hostnames, optionally with port + * @throws IOException + */ + void recommissionNodes(List<String> hosts) throws IOException; + + /** + * Place the list of datanodes into maintenance mode. If a non-null endDtm + * is passed, the hosts will automatically exit maintenance mode after the + * given time has passed. The hosts are identified by their hostname and + * optionally port in the format foo.com:port. + * @param hosts A list of hostnames, optionally with port + * @param endHours The number of hours from now which maintenance will end + * @throws IOException + */ + void startMaintenanceNodes(List<String> hosts, int endHours) + throws IOException; + + /** * Creates a specified replication pipeline. * @param type - Type * @param factor - Replication factor diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java index 88db820..91cd40d 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java @@ -112,6 +112,13 @@ public interface StorageContainerLocationProtocol extends Closeable { List<HddsProtos.Node> queryNode(HddsProtos.NodeState state, HddsProtos.QueryScope queryScope, String poolName) throws IOException; + void decommissionNodes(List<String> nodes) throws IOException; + + void recommissionNodes(List<String> nodes) throws IOException; + + void startMaintenanceNodes(List<String> nodes, int endInHours) + throws IOException; + /** * Notify from client when begin or finish creating objects like pipeline * or containers on datanodes. diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java index 01db597..99941d1 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java @@ -53,6 +53,9 @@ import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolPro import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ScmContainerLocationResponse; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartReplicationManagerRequestProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StopReplicationManagerRequestProto; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartMaintenanceNodesRequestProto; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionNodesRequestProto; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.RecommissionNodesRequestProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type; import org.apache.hadoop.hdds.scm.ScmInfo; import org.apache.hadoop.hdds.scm.container.ContainerInfo; @@ -257,6 +260,61 @@ public final class StorageContainerLocationProtocolClientSideTranslatorPB } /** + * Attempts to decommission the list of nodes. + * @param nodes The list of hostnames or hostname:ports to decommission + * @throws IOException + */ + @Override + public void decommissionNodes(List<String> nodes) throws IOException { + Preconditions.checkNotNull(nodes); + DecommissionNodesRequestProto request = + DecommissionNodesRequestProto.newBuilder() + .addAllHosts(nodes) + .build(); + submitRequest(Type.DecommissionNodes, + builder -> builder.setDecommissionNodesRequest(request)); + } + + /** + * Attempts to recommission the list of nodes. + * @param nodes The list of hostnames or hostname:ports to recommission + * @throws IOException + */ + @Override + public void recommissionNodes(List<String> nodes) throws IOException { + Preconditions.checkNotNull(nodes); + RecommissionNodesRequestProto request = + RecommissionNodesRequestProto.newBuilder() + .addAllHosts(nodes) + .build(); + submitRequest(Type.RecommissionNodes, + builder -> builder.setRecommissionNodesRequest(request)); + } + + /** + * Attempts to put the list of nodes into maintenance mode. + * + * @param nodes The list of hostnames or hostname:ports to put into + * maintenance + * @param endInHours A number of hours from now where the nodes will be taken + * out of maintenance automatically. Passing zero will + * allow the nodes to stay in maintenance indefinitely + * @throws IOException + */ + @Override + public void startMaintenanceNodes(List<String> nodes, int endInHours) + throws IOException { + Preconditions.checkNotNull(nodes); + StartMaintenanceNodesRequestProto request = + StartMaintenanceNodesRequestProto.newBuilder() + .addAllHosts(nodes) + .setEndInHours(endInHours) + .build(); + submitRequest(Type.StartMaintenanceNodes, + builder -> builder.setStartMaintenanceNodesRequest(request)); + } + + /** * Notify from client that creates object on datanodes. * * @param type object type diff --git a/hadoop-hdds/common/src/main/proto/StorageContainerLocationProtocol.proto b/hadoop-hdds/common/src/main/proto/StorageContainerLocationProtocol.proto index 8ea72b6..00e58c0 100644 --- a/hadoop-hdds/common/src/main/proto/StorageContainerLocationProtocol.proto +++ b/hadoop-hdds/common/src/main/proto/StorageContainerLocationProtocol.proto @@ -59,7 +59,9 @@ message ScmContainerLocationRequest { optional StartReplicationManagerRequestProto startReplicationManagerRequest = 21; optional StopReplicationManagerRequestProto stopReplicationManagerRequest = 22; optional ReplicationManagerStatusRequestProto seplicationManagerStatusRequest = 23; - + optional DecommissionNodesRequestProto decommissionNodesRequest = 24; + optional RecommissionNodesRequestProto recommissionNodesRequest = 25; + optional StartMaintenanceNodesRequestProto startMaintenanceNodesRequest = 26; } message ScmContainerLocationResponse { @@ -91,6 +93,9 @@ message ScmContainerLocationResponse { optional StartReplicationManagerResponseProto startReplicationManagerResponse = 21; optional StopReplicationManagerResponseProto stopReplicationManagerResponse = 22; optional ReplicationManagerStatusResponseProto replicationManagerStatusResponse = 23; + optional DecommissionNodesResponseProto decommissionNodesResponse = 24; + optional RecommissionNodesResponseProto recommissionNodesResponse = 25; + optional StartMaintenanceNodesResponseProto startMaintenanceNodesResponse = 26; enum Status { OK = 1; CONTAINER_ALREADY_EXISTS = 2; @@ -118,6 +123,9 @@ enum Type { StartReplicationManager = 16; StopReplicationManager = 17; GetReplicationManagerStatus = 18; + DecommissionNodes = 19; + RecommissionNodes = 20; + StartMaintenanceNodes = 21; } /** @@ -225,6 +233,40 @@ message NodeQueryResponseProto { repeated Node datanodes = 1; } +/* + Decommission a list of hosts +*/ +message DecommissionNodesRequestProto { + repeated string hosts = 1; +} + +message DecommissionNodesResponseProto { + // empty response +} + +/* + Recommission a list of hosts in maintenance or decommission states +*/ +message RecommissionNodesRequestProto { + repeated string hosts = 1; +} + +message RecommissionNodesResponseProto { + // empty response +} + +/* + Place a list of hosts into maintenance mode +*/ +message StartMaintenanceNodesRequestProto { + repeated string hosts = 1; + optional int64 endInHours = 2; +} + +message StartMaintenanceNodesResponseProto { + // empty response +} + /** Request to create a replication pipeline. */ @@ -326,5 +368,4 @@ message ReplicationManagerStatusResponseProto { */ service StorageContainerLocationProtocolService { rpc submitRequest (ScmContainerLocationRequest) returns (ScmContainerLocationResponse); - } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/InvalidHostStringException.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/InvalidHostStringException.java new file mode 100644 index 0000000..c4046c1 --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/InvalidHostStringException.java @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hdds.scm.node; + +import java.io.IOException; + +/** + * Exception thrown by the NodeDecommissionManager when it encounters + * host strings it does not expect or understand. + */ + +public class InvalidHostStringException extends IOException { + public InvalidHostStringException(String msg) { + super(msg); + } + + public InvalidHostStringException(String msg, Exception e) { + super(msg, e); + } +} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java new file mode 100644 index 0000000..60813dd --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java @@ -0,0 +1,286 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.hdds.scm.node; + +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState; +import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; +import org.apache.hadoop.hdds.scm.pipeline.PipelineManager; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeAdminManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.InetAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.util.LinkedList; +import java.util.List; + +/** + * Class used to manage datanodes scheduled for maintenance or decommission. + */ +public class NodeDecommissionManager { + + private NodeManager nodeManager; + private PipelineManager pipeLineManager; + private ContainerManager containerManager; + private OzoneConfiguration conf; + private boolean useHostnames; + + private List<DatanodeDetails> pendingNodes = new LinkedList<>(); + + private static final Logger LOG = + LoggerFactory.getLogger(DatanodeAdminManager.class); + + + static class HostDefinition { + private String rawHostname; + private String hostname; + private int port; + + HostDefinition(String hostname) throws InvalidHostStringException { + this.rawHostname = hostname; + parseHostname(); + } + + public String getRawHostname() { + return rawHostname; + } + + public String getHostname() { + return hostname; + } + + public int getPort() { + return port; + } + + private void parseHostname() throws InvalidHostStringException{ + try { + // A URI *must* have a scheme, so just create a fake one + URI uri = new URI("my://"+rawHostname.trim()); + this.hostname = uri.getHost(); + this.port = uri.getPort(); + + if (this.hostname == null) { + throw new InvalidHostStringException("The string "+rawHostname+ + " does not contain a value hostname or hostname:port definition"); + } + } catch (URISyntaxException e) { + throw new InvalidHostStringException( + "Unable to parse the hoststring "+rawHostname, e); + } + } + } + + private List<DatanodeDetails> mapHostnamesToDatanodes(List<String> hosts) + throws InvalidHostStringException { + List<DatanodeDetails> results = new LinkedList<>(); + for (String hostString : hosts) { + HostDefinition host = new HostDefinition(hostString); + InetAddress addr; + try { + addr = InetAddress.getByName(host.getHostname()); + } catch (UnknownHostException e) { + throw new InvalidHostStringException("Unable to resolve the host " + +host.getRawHostname(), e); + } + String dnsName; + if (useHostnames) { + dnsName = addr.getHostName(); + } else { + dnsName = addr.getHostAddress(); + } + List<DatanodeDetails> found = nodeManager.getNodesByAddress(dnsName); + if (found.size() == 0) { + throw new InvalidHostStringException("The string " + + host.getRawHostname()+" resolved to "+dnsName + + " is not found in SCM"); + } else if (found.size() == 1) { + if (host.getPort() != -1 && + !validateDNPortMatch(host.getPort(), found.get(0))) { + throw new InvalidHostStringException("The string "+ + host.getRawHostname()+" matched a single datanode, but the "+ + "given port is not used by that Datanode"); + } + results.add(found.get(0)); + } else if (found.size() > 1) { + DatanodeDetails match = null; + for(DatanodeDetails dn : found) { + if (validateDNPortMatch(host.getPort(), dn)) { + match = dn; + break; + } + } + if (match == null) { + throw new InvalidHostStringException("The string " + + host.getRawHostname()+ "matched multiple Datanodes, but no "+ + "datanode port matched the given port"); + } + results.add(match); + } + } + return results; + } + + /** + * Check if the passed port is used by the given DatanodeDetails object. If + * it is, return true, otherwise return false. + * @param port Port number to check if it is used by the datanode + * @param dn Datanode to check if it is using the given port + * @return True if port is used by the datanode. False otherwise. + */ + private boolean validateDNPortMatch(int port, DatanodeDetails dn) { + for (DatanodeDetails.Port p : dn.getPorts()) { + if (p.getValue() == port) { + return true; + } + } + return false; + } + + public NodeDecommissionManager(OzoneConfiguration conf, + NodeManager nodeManager, PipelineManager pipelineManager, + ContainerManager containerManager) { + this.conf = conf; + this.nodeManager = nodeManager; + this.pipeLineManager = pipelineManager; + this.containerManager = containerManager; + + useHostnames = conf.getBoolean( + DFSConfigKeys.DFS_DATANODE_USE_DN_HOSTNAME, + DFSConfigKeys.DFS_DATANODE_USE_DN_HOSTNAME_DEFAULT); + } + + public synchronized void decommissionNodes(List nodes) + throws InvalidHostStringException { + List<DatanodeDetails> dns = mapHostnamesToDatanodes(nodes); + for (DatanodeDetails dn : dns) { + try { + startDecommission(dn); + } catch (NodeNotFoundException e) { + // We already validated the host strings and retrieved the DnDetails + // object from the node manager. Therefore we should never get a + // NodeNotFoundException here expect if the node is remove in the + // very short window between validation and starting decom. Therefore + // log a warning and ignore the exception + LOG.warn("The host {} was not found in SCM. Ignoring the request to "+ + "decommission it", dn.getHostName()); + } + } + } + + public synchronized void startDecommission(DatanodeDetails dn) + throws NodeNotFoundException { + NodeStatus nodeStatus = getNodeStatus(dn); + NodeOperationalState opState = nodeStatus.getOperationalState(); + LOG.info("In decommission the op state is {}", opState); + if (opState != NodeOperationalState.DECOMMISSIONING + && opState != NodeOperationalState.DECOMMISSIONED) { + LOG.info("Starting Decommission for node {}", dn); + nodeManager.setNodeOperationalState( + dn, NodeOperationalState.DECOMMISSIONING); + pendingNodes.add(dn); + } else { + LOG.info("Start Decommission called on node {} in state {}. Nothing to "+ + "do.", dn, opState); + } + } + + public synchronized void recommissionNodes(List nodes) + throws InvalidHostStringException { + List<DatanodeDetails> dns = mapHostnamesToDatanodes(nodes); + for (DatanodeDetails dn : dns) { + try { + recommission(dn); + } catch (NodeNotFoundException e) { + // We already validated the host strings and retrieved the DnDetails + // object from the node manager. Therefore we should never get a + // NodeNotFoundException here expect if the node is remove in the + // very short window between validation and starting decom. Therefore + // log a warning and ignore the exception + LOG.warn("The host {} was not found in SCM. Ignoring the request to "+ + "recommission it", dn.getHostName()); + } + } + } + + public synchronized void recommission(DatanodeDetails dn) + throws NodeNotFoundException{ + NodeStatus nodeStatus = getNodeStatus(dn); + NodeOperationalState opState = nodeStatus.getOperationalState(); + if (opState != NodeOperationalState.IN_SERVICE) { + nodeManager.setNodeOperationalState( + dn, NodeOperationalState.IN_SERVICE); + pendingNodes.remove(dn); + LOG.info("Recommissioned node {}", dn); + } else { + LOG.info("Recommission called on node {} with state {}. "+ + "Nothing to do.", dn, opState); + } + } + + public synchronized void startMaintenanceNodes(List nodes, int endInHours) + throws InvalidHostStringException { + List<DatanodeDetails> dns = mapHostnamesToDatanodes(nodes); + for (DatanodeDetails dn : dns) { + try { + startMaintenance(dn, endInHours); + } catch (NodeNotFoundException e) { + // We already validated the host strings and retrieved the DnDetails + // object from the node manager. Therefore we should never get a + // NodeNotFoundException here expect if the node is remove in the + // very short window between validation and starting decom. Therefore + // log a warning and ignore the exception + LOG.warn("The host {} was not found in SCM. Ignoring the request to "+ + "start maintenance on it", dn.getHostName()); + } + } + } + + // TODO - If startMaintenance is called on a host already in maintenance, + // then we should update the end time? + public synchronized void startMaintenance(DatanodeDetails dn, int endInHours) + throws NodeNotFoundException { + NodeStatus nodeStatus = getNodeStatus(dn); + NodeOperationalState opState = nodeStatus.getOperationalState(); + if (opState != NodeOperationalState.ENTERING_MAINTENANCE && + opState != NodeOperationalState.IN_MAINTENANCE) { + nodeManager.setNodeOperationalState( + dn, NodeOperationalState.ENTERING_MAINTENANCE); + pendingNodes.add(dn); + LOG.info("Starting Maintenance for node {}", dn); + } else { + LOG.info("Starting Maintenance called on node {} with state {}. "+ + "Nothing to do.", dn, opState); + } + } + + private NodeStatus getNodeStatus(DatanodeDetails dn) + throws NodeNotFoundException { + NodeStatus nodeStatus = nodeManager.getNodeStatus(dn); + if (nodeStatus == null) { + throw new NodeNotFoundException(); + } + return nodeStatus; + } + +} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeManager.java index 205f2e1..252c38f 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeManager.java @@ -125,11 +125,19 @@ public interface NodeManager extends StorageContainerNodeProtocol, SCMNodeMetric getNodeStat(DatanodeDetails datanodeDetails); /** - * Returns the node state of a specific node. + * Returns the node status of a specific node. * @param datanodeDetails DatanodeDetails - * @return Healthy/Stale/Dead. + * @return NodeStatus for the node */ - NodeState getNodeState(DatanodeDetails datanodeDetails); + NodeStatus getNodeStatus(DatanodeDetails datanodeDetails); + + /** + * Set the operation state of a node. + * @param datanodeDetails The datanode to set the new state for + * @param newState The new operational state for the node + */ + void setNodeOperationalState(DatanodeDetails datanodeDetails, + NodeOperationalState newState) throws NodeNotFoundException; /** * Get set of pipelines a datanode is part of. diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java index 1e1a50c..4177b63 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStateManager.java @@ -395,6 +395,21 @@ public class NodeStateManager implements Runnable, Closeable { } /** + * Sets the operational state of the given node. Intended to be called when + * a node is being decommissioned etc. + * + * @param dn The datanode having its state set + * @param newState The new operational State of the node. + */ + public void setNodeOperationalState(DatanodeDetails dn, + NodeOperationalState newState) throws NodeNotFoundException { + DatanodeInfo dni = nodeStateMap.getNodeInfo(dn.getUuid()); + if (dni.getNodeStatus().getOperationalState() != newState) { + nodeStateMap.updateNodeOperationalState(dn.getUuid(), newState); + } + } + + /** * Gets set of pipelineID a datanode belongs to. * @param dnId - Datanode ID * @return Set of PipelineID diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java index e48eda1..ccf1d0e 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/SCMNodeManager.java @@ -213,15 +213,15 @@ public class SCMNodeManager implements NodeManager { } /** - * Returns the node state of a specific node. + * Returns the node status of a specific node. * * @param datanodeDetails Datanode Details - * @return Healthy/Stale/Dead/Unknown. + * @return NodeStatus for the node */ @Override - public NodeState getNodeState(DatanodeDetails datanodeDetails) { + public NodeStatus getNodeStatus(DatanodeDetails datanodeDetails) { try { - return nodeStateManager.getNodeStatus(datanodeDetails).getHealth(); + return nodeStateManager.getNodeStatus(datanodeDetails); } catch (NodeNotFoundException e) { // TODO: should we throw NodeNotFoundException? return null; @@ -229,6 +229,17 @@ public class SCMNodeManager implements NodeManager { } /** + * Set the operation state of a node. + * @param datanodeDetails The datanode to set the new state for + * @param newState The new operational state for the node + */ + @Override + public void setNodeOperationalState(DatanodeDetails datanodeDetails, + NodeOperationalState newState) throws NodeNotFoundException{ + nodeStateManager.setNodeOperationalState(datanodeDetails, newState); + } + + /** * Closes this stream and releases any system resources associated with it. If * the stream is already closed then invoking this method has no effect. * diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java index 0d2f470..53bd95d 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java @@ -57,6 +57,18 @@ import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolPro import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartReplicationManagerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StopReplicationManagerRequestProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StopReplicationManagerResponseProto; +import org.apache.hadoop.hdds.protocol.proto. + StorageContainerLocationProtocolProtos.DecommissionNodesRequestProto; +import org.apache.hadoop.hdds.protocol.proto. + StorageContainerLocationProtocolProtos.DecommissionNodesResponseProto; +import org.apache.hadoop.hdds.protocol.proto. + StorageContainerLocationProtocolProtos.RecommissionNodesRequestProto; +import org.apache.hadoop.hdds.protocol.proto. + StorageContainerLocationProtocolProtos.RecommissionNodesResponseProto; +import org.apache.hadoop.hdds.protocol.proto. + StorageContainerLocationProtocolProtos.StartMaintenanceNodesRequestProto; +import org.apache.hadoop.hdds.protocol.proto. + StorageContainerLocationProtocolProtos.StartMaintenanceNodesResponseProto; import org.apache.hadoop.hdds.scm.ScmInfo; import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.common.helpers.ContainerWithPipeline; @@ -214,6 +226,27 @@ public final class StorageContainerLocationProtocolServerSideTranslatorPB .setReplicationManagerStatusResponse(getReplicationManagerStatus( request.getSeplicationManagerStatusRequest())) .build(); + case DecommissionNodes: + return ScmContainerLocationResponse.newBuilder() + .setCmdType(request.getCmdType()) + .setStatus(Status.OK) + .setDecommissionNodesResponse(decommissionNodes( + request.getDecommissionNodesRequest())) + .build(); + case RecommissionNodes: + return ScmContainerLocationResponse.newBuilder() + .setCmdType(request.getCmdType()) + .setStatus(Status.OK) + .setRecommissionNodesResponse(recommissionNodes( + request.getRecommissionNodesRequest())) + .build(); + case StartMaintenanceNodes: + return ScmContainerLocationResponse.newBuilder() + .setCmdType(request.getCmdType()) + .setStatus(Status.OK) + .setStartMaintenanceNodesResponse(startMaintenanceNodes( + request.getStartMaintenanceNodesRequest())) + .build(); default: throw new IllegalArgumentException( "Unknown command type: " + request.getCmdType()); @@ -390,4 +423,25 @@ public final class StorageContainerLocationProtocolServerSideTranslatorPB .setIsRunning(impl.getReplicationManagerStatus()).build(); } + public DecommissionNodesResponseProto decommissionNodes( + DecommissionNodesRequestProto request) throws IOException { + impl.decommissionNodes(request.getHostsList()); + return DecommissionNodesResponseProto.newBuilder() + .build(); + } + + public RecommissionNodesResponseProto recommissionNodes( + RecommissionNodesRequestProto request) throws IOException { + impl.recommissionNodes(request.getHostsList()); + return RecommissionNodesResponseProto.newBuilder().build(); + } + + public StartMaintenanceNodesResponseProto startMaintenanceNodes( + StartMaintenanceNodesRequestProto request) throws IOException { + impl.startMaintenanceNodes(request.getHostsList(), + (int)request.getEndInHours()); + return StartMaintenanceNodesResponseProto.newBuilder() + .build(); + } + } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index d982507..38d846f 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -366,6 +366,43 @@ public class SCMClientProtocolServer implements } @Override + public void decommissionNodes(List<String> nodes) throws IOException { + String remoteUser = getRpcRemoteUsername(); + try { + getScm().checkAdminAccess(remoteUser); + scm.getScmDecommissionManager().decommissionNodes(nodes); + } catch (Exception ex) { + LOG.error("Failed to decommission nodes", ex); + throw ex; + } + } + + @Override + public void recommissionNodes(List<String> nodes) throws IOException { + String remoteUser = getRpcRemoteUsername(); + try { + getScm().checkAdminAccess(remoteUser); + scm.getScmDecommissionManager().recommissionNodes(nodes); + } catch (Exception ex) { + LOG.error("Failed to recommission nodes", ex); + throw ex; + } + } + + @Override + public void startMaintenanceNodes(List<String> nodes, int endInHours) + throws IOException { + String remoteUser = getRpcRemoteUsername(); + try { + getScm().checkAdminAccess(remoteUser); + scm.getScmDecommissionManager().startMaintenanceNodes(nodes, endInHours); + } catch (Exception ex) { + LOG.error("Failed to place nodes into maintenance mode", ex); + throw ex; + } + } + + @Override public void notifyObjectStageChange(StorageContainerLocationProtocolProtos .ObjectStageChangeRequestProto.Type type, long id, StorageContainerLocationProtocolProtos.ObjectStageChangeRequestProto.Op diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index 702102b..2cc1cde 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -74,6 +74,7 @@ import org.apache.hadoop.hdds.scm.node.NodeManager; import org.apache.hadoop.hdds.scm.node.NodeReportHandler; import org.apache.hadoop.hdds.scm.node.SCMNodeManager; import org.apache.hadoop.hdds.scm.node.StaleNodeHandler; +import org.apache.hadoop.hdds.scm.node.NodeDecommissionManager; import org.apache.hadoop.hdds.scm.pipeline.PipelineActionHandler; import org.apache.hadoop.hdds.scm.pipeline.PipelineManager; import org.apache.hadoop.hdds.scm.pipeline.PipelineReportHandler; @@ -160,6 +161,7 @@ public final class StorageContainerManager extends ServiceRuntimeInfoImpl private ContainerManager containerManager; private BlockManager scmBlockManager; private final SCMStorageConfig scmStorageConfig; + private NodeDecommissionManager scmDecommissionManager; private SCMMetadataStore scmMetadataStore; @@ -335,6 +337,9 @@ public final class StorageContainerManager extends ServiceRuntimeInfoImpl clientProtocolServer, scmBlockManager, replicationManager, pipelineManager); + scmDecommissionManager = new NodeDecommissionManager(conf, scmNodeManager, + pipelineManager, containerManager); + eventQueue.addHandler(SCMEvents.DATANODE_COMMAND, scmNodeManager); eventQueue.addHandler(SCMEvents.RETRIABLE_DATANODE_COMMAND, scmNodeManager); eventQueue.addHandler(SCMEvents.NODE_REPORT, nodeReportHandler); @@ -928,6 +933,16 @@ public final class StorageContainerManager extends ServiceRuntimeInfoImpl } /** + * Returns the node decommission manager. + * + * @return NodeDecommissionManager The decommission manger for the used by + * scm + */ + public NodeDecommissionManager getScmDecommissionManager() { + return scmDecommissionManager; + } + + /** * Returns SCM container manager. */ @VisibleForTesting diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/MockNodeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/MockNodeManager.java index 20a8b74..25cf504 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/MockNodeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/MockNodeManager.java @@ -262,11 +262,20 @@ public class MockNodeManager implements NodeManager { * @return Healthy/Stale/Dead. */ @Override - public HddsProtos.NodeState getNodeState(DatanodeDetails dd) { + public NodeStatus getNodeStatus(DatanodeDetails dd) { return null; } /** + * Set the operation state of a node. + * @param datanodeDetails The datanode to set the new state for + * @param newState The new operational state for the node + */ + public void setNodeOperationalState(DatanodeDetails datanodeDetails, + HddsProtos.NodeOperationalState newState) throws NodeNotFoundException { + } + + /** * Get set of pipelines a datanode is part of. * @param dnId - datanodeID * @return Set of PipelineID diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java new file mode 100644 index 0000000..4492a6e --- /dev/null +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java @@ -0,0 +1,253 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdds.scm.node; + +import org.apache.hadoop.hdds.HddsConfigKeys; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.scm.HddsTestUtils; +import org.apache.hadoop.hdds.scm.TestUtils; +import org.apache.hadoop.hdds.scm.server.StorageContainerManager; +import org.apache.hadoop.security.authentication.client.AuthenticationException; +import org.apache.hadoop.test.GenericTestUtils; +import org.junit.Before; +import org.junit.Test; +import java.io.IOException; +import java.util.List; +import java.util.UUID; +import java.util.Arrays; +import java.util.ArrayList; +import static junit.framework.TestCase.assertEquals; +import static org.assertj.core.api.Fail.fail; + +/** + * Unit tests for the decommision manager. + */ + +public class TestNodeDecommissionManager { + + private NodeDecommissionManager decom; + private StorageContainerManager scm; + private NodeManager nodeManager; + private OzoneConfiguration conf; + private String storageDir; + + @Before + public void setup() throws Exception { + conf = new OzoneConfiguration(); + storageDir = GenericTestUtils.getTempPath( + TestDeadNodeHandler.class.getSimpleName() + UUID.randomUUID()); + conf.set(HddsConfigKeys.OZONE_METADATA_DIRS, storageDir); + nodeManager = createNodeManager(conf); + decom = new NodeDecommissionManager(conf, nodeManager, null, null); + } + + @Test + public void testHostStringsParseCorrectly() + throws InvalidHostStringException { + NodeDecommissionManager.HostDefinition def = + new NodeDecommissionManager.HostDefinition("foobar"); + assertEquals("foobar", def.getHostname()); + assertEquals(-1, def.getPort()); + + def = new NodeDecommissionManager.HostDefinition(" foobar "); + assertEquals("foobar", def.getHostname()); + assertEquals(-1, def.getPort()); + + def = new NodeDecommissionManager.HostDefinition("foobar:1234"); + assertEquals("foobar", def.getHostname()); + assertEquals(1234, def.getPort()); + + def = new NodeDecommissionManager.HostDefinition( + "foobar.mycompany.com:1234"); + assertEquals("foobar.mycompany.com", def.getHostname()); + assertEquals(1234, def.getPort()); + + try { + def = new NodeDecommissionManager.HostDefinition("foobar:abcd"); + fail("InvalidHostStringException should have been thrown"); + } catch (InvalidHostStringException e) { + } + } + + @Test + public void testAnyInvalidHostThrowsException() + throws InvalidHostStringException{ + List<DatanodeDetails> dns = generateDatanodes(); + + // Try to decommission a host that does exist, but give incorrect port + try { + decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress()+":10")); + fail("InvalidHostStringException expected"); + } catch (InvalidHostStringException e) { + } + + // Try to decommission a host that does not exist + try { + decom.decommissionNodes(Arrays.asList("123.123.123.123")); + fail("InvalidHostStringException expected"); + } catch (InvalidHostStringException e) { + } + + // Try to decommission a host that does exist and a host that does not + try { + decom.decommissionNodes(Arrays.asList( + dns.get(1).getIpAddress(), "123,123,123,123")); + fail("InvalidHostStringException expected"); + } catch (InvalidHostStringException e) { + } + + // Try to decommission a host with many DNs on the address with no port + try { + decom.decommissionNodes(Arrays.asList( + dns.get(0).getIpAddress())); + fail("InvalidHostStringException expected"); + } catch (InvalidHostStringException e) { + } + + // Try to decommission a host with many DNs on the address with a port + // that does not exist + try { + decom.decommissionNodes(Arrays.asList( + dns.get(0).getIpAddress()+":10")); + fail("InvalidHostStringException expected"); + } catch (InvalidHostStringException e) { + } + } + + @Test + public void testNodesCanBeDecommissionedAndRecommissioned() + throws InvalidHostStringException { + List<DatanodeDetails> dns = generateDatanodes(); + + // Decommission 2 valid nodes + decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress(), + dns.get(2).getIpAddress())); + assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONING, + nodeManager.getNodeStatus(dns.get(1)).getOperationalState()); + assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONING, + nodeManager.getNodeStatus(dns.get(2)).getOperationalState()); + + // Running the command again gives no error - nodes already decommissioning + // are silently ignored. + decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress(), + dns.get(2).getIpAddress())); + + // Attempt to decommission dn(10) which has multiple hosts on the same IP + // and we hardcoded ports to 3456, 4567, 5678 + DatanodeDetails multiDn = dns.get(10); + String multiAddr = + multiDn.getIpAddress()+":"+multiDn.getPorts().get(0).getValue(); + decom.decommissionNodes(Arrays.asList(multiAddr)); + assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONING, + nodeManager.getNodeStatus(multiDn).getOperationalState()); + + // Recommission all 3 hosts + decom.recommissionNodes(Arrays.asList( + multiAddr, dns.get(1).getIpAddress(), dns.get(2).getIpAddress())); + assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, + nodeManager.getNodeStatus(dns.get(1)).getOperationalState()); + assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, + nodeManager.getNodeStatus(dns.get(2)).getOperationalState()); + assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, + nodeManager.getNodeStatus(dns.get(10)).getOperationalState()); + } + + @Test + public void testNodesCanBePutIntoMaintenanceAndRecommissioned() + throws InvalidHostStringException { + List<DatanodeDetails> dns = generateDatanodes(); + + // Put 2 valid nodes into maintenance + decom.startMaintenanceNodes(Arrays.asList(dns.get(1).getIpAddress(), + dns.get(2).getIpAddress()), 100); + assertEquals(HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE, + nodeManager.getNodeStatus(dns.get(1)).getOperationalState()); + assertEquals(HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE, + nodeManager.getNodeStatus(dns.get(2)).getOperationalState()); + + // Running the command again gives no error - nodes already decommissioning + // are silently ignored. + decom.startMaintenanceNodes(Arrays.asList(dns.get(1).getIpAddress(), + dns.get(2).getIpAddress()), 100); + + // Attempt to decommission dn(10) which has multiple hosts on the same IP + // and we hardcoded ports to 3456, 4567, 5678 + DatanodeDetails multiDn = dns.get(10); + String multiAddr = + multiDn.getIpAddress()+":"+multiDn.getPorts().get(0).getValue(); + decom.startMaintenanceNodes(Arrays.asList(multiAddr), 100); + assertEquals(HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE, + nodeManager.getNodeStatus(multiDn).getOperationalState()); + + // Recommission all 3 hosts + decom.recommissionNodes(Arrays.asList( + multiAddr, dns.get(1).getIpAddress(), dns.get(2).getIpAddress())); + assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, + nodeManager.getNodeStatus(dns.get(1)).getOperationalState()); + assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, + nodeManager.getNodeStatus(dns.get(2)).getOperationalState()); + assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, + nodeManager.getNodeStatus(dns.get(10)).getOperationalState()); + } + + private SCMNodeManager createNodeManager(OzoneConfiguration config) + throws IOException, AuthenticationException { + scm = HddsTestUtils.getScm(config); + return (SCMNodeManager) scm.getScmNodeManager(); + } + + /** + * Generate a list of random DNs and return the list. A total of 11 DNs will + * be generated and registered with the node manager. Index 0 and 10 will + * have the same IP and host and the rest will have unique IPs and Hosts. + * The DN at index 10, has 3 hard coded ports of 3456, 4567, 5678. All other + * DNs will have ports set to 0. + * @return The list of DatanodeDetails Generated + */ + private List<DatanodeDetails> generateDatanodes() { + List<DatanodeDetails> dns = new ArrayList<>(); + for (int i=0; i<10; i++) { + DatanodeDetails dn = TestUtils.randomDatanodeDetails(); + dns.add(dn); + nodeManager.register(dn, null, null); + } + // We have 10 random DNs, we want to create another one that is on the same + // host as some of the others. + DatanodeDetails multiDn = dns.get(0); + + DatanodeDetails.Builder builder = DatanodeDetails.newBuilder(); + builder.setUuid(UUID.randomUUID().toString()) + .setHostName(multiDn.getHostName()) + .setIpAddress(multiDn.getIpAddress()) + .addPort(DatanodeDetails.newPort( + DatanodeDetails.Port.Name.STANDALONE, 3456)) + .addPort(DatanodeDetails.newPort( + DatanodeDetails.Port.Name.RATIS, 4567)) + .addPort(DatanodeDetails.newPort( + DatanodeDetails.Port.Name.REST, 5678)) + .setNetworkLocation(multiDn.getNetworkLocation()); + + DatanodeDetails dn = builder.build(); + nodeManager.register(dn, null, null); + dns.add(dn); + return dns; + } + +} diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeStateManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeStateManager.java index bc28a43..9fbf251 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeStateManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeStateManager.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hdds.scm.node; import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState; import org.apache.hadoop.hdds.scm.HddsServerUtil; import org.apache.hadoop.hdds.scm.node.states.NodeAlreadyExistsException; @@ -185,6 +186,22 @@ public class TestNodeStateManager { eventPublisher.getLastEvent().getName()); } + @Test + public void testNodeOpStateCanBeSet() + throws NodeAlreadyExistsException, NodeNotFoundException { + DatanodeDetails dn = generateDatanode(); + nsm.addNode(dn); + + nsm.setNodeOperationalState(dn, + HddsProtos.NodeOperationalState.DECOMMISSIONED); + + NodeStatus newStatus = nsm.getNodeStatus(dn); + assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONED, + newStatus.getOperationalState()); + assertEquals(NodeState.HEALTHY, + newStatus.getHealth()); + } + private DatanodeDetails generateDatanode() { String uuid = UUID.randomUUID().toString(); return DatanodeDetails.newBuilder().setUuid(uuid).build(); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/ozone/container/testutils/ReplicationNodeManagerMock.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/ozone/container/testutils/ReplicationNodeManagerMock.java index a48b2a0..5a2e3c2 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/ozone/container/testutils/ReplicationNodeManagerMock.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/ozone/container/testutils/ReplicationNodeManagerMock.java @@ -52,17 +52,17 @@ import java.util.LinkedList; * A Node Manager to test replication. */ public class ReplicationNodeManagerMock implements NodeManager { - private final Map<DatanodeDetails, NodeState> nodeStateMap; + private final Map<DatanodeDetails, NodeStatus> nodeStateMap; private final CommandQueue commandQueue; /** * A list of Datanodes and current states. - * @param nodeState A node state map. + * @param nodeStatus A node state map. */ - public ReplicationNodeManagerMock(Map<DatanodeDetails, NodeState> nodeState, + public ReplicationNodeManagerMock(Map<DatanodeDetails, NodeStatus> nodeStatus, CommandQueue commandQueue) { - Preconditions.checkNotNull(nodeState); - this.nodeStateMap = nodeState; + Preconditions.checkNotNull(nodeStatus); + this.nodeStateMap = nodeStatus; this.commandQueue = commandQueue; } @@ -179,11 +179,27 @@ public class ReplicationNodeManagerMock implements NodeManager { * @return Healthy/Stale/Dead. */ @Override - public NodeState getNodeState(DatanodeDetails dd) { + public NodeStatus getNodeStatus(DatanodeDetails dd) { return nodeStateMap.get(dd); } /** + * Set the operation state of a node. + * @param dd The datanode to set the new state for + * @param newState The new operational state for the node + */ + @Override + public void setNodeOperationalState(DatanodeDetails dd, + HddsProtos.NodeOperationalState newState) throws NodeNotFoundException { + NodeStatus currentStatus = nodeStateMap.get(dd); + if (currentStatus != null) { + nodeStateMap.put(dd, new NodeStatus(newState, currentStatus.getHealth())); + } else { + throw new NodeNotFoundException(); + } + } + + /** * Get set of pipelines a datanode is part of. * @param dnId - datanodeID * @return Set of PipelineID @@ -313,10 +329,10 @@ public class ReplicationNodeManagerMock implements NodeManager { * Adds a node to the existing Node manager. This is used only for test * purposes. * @param id DatanodeDetails - * @param state State you want to put that node to. + * @param status State you want to put that node to. */ - public void addNode(DatanodeDetails id, NodeState state) { - nodeStateMap.put(id, state); + public void addNode(DatanodeDetails id, NodeStatus status) { + nodeStateMap.put(id, status); } @Override diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/SCMCLI.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/SCMCLI.java index ff30eca..b095cd1 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/SCMCLI.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/SCMCLI.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hdds.scm.ScmConfigKeys; import org.apache.hadoop.hdds.scm.XceiverClientManager; import org.apache.hadoop.hdds.scm.cli.container.ContainerCommands; import org.apache.hadoop.hdds.scm.cli.pipeline.PipelineCommands; +import org.apache.hadoop.hdds.scm.cli.node.DatanodeAdminCommands; import org.apache.hadoop.hdds.scm.client.ContainerOperationClient; import org.apache.hadoop.hdds.scm.client.ScmClient; import org.apache.hadoop.hdds.scm.container.ContainerInfo; @@ -76,7 +77,8 @@ import picocli.CommandLine.Option; ContainerCommands.class, PipelineCommands.class, TopologySubcommand.class, - ReplicationManagerCommands.class + ReplicationManagerCommands.class, + DatanodeAdminCommands.class }, mixinStandardHelpOptions = true) public class SCMCLI extends GenericCli { diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminCommands.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminCommands.java new file mode 100644 index 0000000..dc7a6f4 --- /dev/null +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminCommands.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdds.scm.cli.node; + +import org.apache.hadoop.hdds.cli.HddsVersionProvider; +import org.apache.hadoop.hdds.cli.MissingSubcommandException; +import picocli.CommandLine.Command; +import picocli.CommandLine.ParentCommand; +import org.apache.hadoop.hdds.scm.cli.SCMCLI; + +import java.util.concurrent.Callable; + +/** + * Subcommand to group datanode admin related operations. + */ +@Command( + name = "dnadmin", + description = "Datanode Administration specific operations", + mixinStandardHelpOptions = true, + versionProvider = HddsVersionProvider.class, + subcommands = { + DatanodeAdminDecommissionSubCommand.class, + DatanodeAdminMaintenanceSubCommand.class, + DatanodeAdminRecommissionSubCommand.class + }) +public class DatanodeAdminCommands implements Callable<Void> { + + @ParentCommand + private SCMCLI parent; + + public SCMCLI getParent() { + return parent; + } + + @Override + public Void call() throws Exception { + throw new MissingSubcommandException( + this.parent.getCmd().getSubcommands().get("nodeadmin")); + } +} \ No newline at end of file diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminDecommissionSubCommand.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminDecommissionSubCommand.java new file mode 100644 index 0000000..1406603 --- /dev/null +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminDecommissionSubCommand.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdds.scm.cli.node; + +import org.apache.hadoop.hdds.cli.HddsVersionProvider; +import org.apache.hadoop.hdds.scm.client.ScmClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import picocli.CommandLine; +import picocli.CommandLine.Command; +import picocli.CommandLine.ParentCommand; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; + +/** + * Decommission one or more datanodes. + */ +@Command( + name = "decommission", + description = "Decommission a datanode", + mixinStandardHelpOptions = true, + versionProvider = HddsVersionProvider.class) +public class DatanodeAdminDecommissionSubCommand implements Callable<Void> { + + private static final Logger LOG = + LoggerFactory.getLogger(DatanodeAdminDecommissionSubCommand.class); + + @CommandLine.Parameters(description = "List of fully qualified host names") + private List<String> hosts = new ArrayList<String>(); + + @ParentCommand + private DatanodeAdminCommands parent; + + @Override + public Void call() throws Exception { + try (ScmClient scmClient = parent.getParent().createScmClient()) { + scmClient.decommissionNodes(hosts); + return null; + } + } +} diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminMaintenanceSubCommand.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminMaintenanceSubCommand.java new file mode 100644 index 0000000..2e4b2b5 --- /dev/null +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminMaintenanceSubCommand.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdds.scm.cli.node; + +import org.apache.hadoop.hdds.cli.HddsVersionProvider; +import org.apache.hadoop.hdds.scm.client.ScmClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import picocli.CommandLine; +import picocli.CommandLine.Command; +import picocli.CommandLine.ParentCommand; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; + +/** + * Place one or more datanodes into Maintenance Mode. + */ +@Command( + name = "maintenance", + description = "Put a datanode into Maintenance Mode", + mixinStandardHelpOptions = true, + versionProvider = HddsVersionProvider.class) +public class DatanodeAdminMaintenanceSubCommand implements Callable<Void> { + + private static final Logger LOG = + LoggerFactory.getLogger(DatanodeAdminMaintenanceSubCommand.class); + + @CommandLine.Parameters(description = "List of fully qualified host names") + private List<String> hosts = new ArrayList<String>(); + + @CommandLine.Option(names = {"--end"}, + description = "Automatically end maintenance after the given hours. "+ + "By default, maintenance must be ended manually.") + private int endInHours = 0; + + @ParentCommand + private DatanodeAdminCommands parent; + + @Override + public Void call() throws Exception { + try (ScmClient scmClient = parent.getParent().createScmClient()) { + scmClient.startMaintenanceNodes(hosts, endInHours); + return null; + } + } +} \ No newline at end of file diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminRecommissionSubCommand.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminRecommissionSubCommand.java new file mode 100644 index 0000000..eaa1280 --- /dev/null +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/DatanodeAdminRecommissionSubCommand.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdds.scm.cli.node; + +import org.apache.hadoop.hdds.cli.HddsVersionProvider; +import org.apache.hadoop.hdds.scm.client.ScmClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import picocli.CommandLine; +import picocli.CommandLine.Command; +import picocli.CommandLine.ParentCommand; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; + +/** + * Place decommissioned or maintenance nodes back into service. + */ +@Command( + name = "recommission", + description = "Return a datanode to service", + mixinStandardHelpOptions = true, + versionProvider = HddsVersionProvider.class) +public class DatanodeAdminRecommissionSubCommand implements Callable<Void> { + + private static final Logger LOG = + LoggerFactory.getLogger(DatanodeAdminRecommissionSubCommand.class); + + @CommandLine.Parameters(description = "List of fully qualified host names") + private List<String> hosts = new ArrayList<String>(); + + @ParentCommand + private DatanodeAdminCommands parent; + + @Override + public Void call() throws Exception { + try (ScmClient scmClient = parent.getParent().createScmClient()) { + scmClient.recommissionNodes(hosts); + return null; + } + } +} \ No newline at end of file diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/package-info.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/package-info.java new file mode 100644 index 0000000..dfb04b8 --- /dev/null +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/node/package-info.java @@ -0,0 +1,23 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * <p> + * SCM related cli tools. + */ +/** + * SCM related cli tools for Datanode Admin. + */ +package org.apache.hadoop.hdds.scm.cli.node; diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestDecommissionAndMaintenance.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestDecommissionAndMaintenance.java new file mode 100644 index 0000000..bada595 --- /dev/null +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestDecommissionAndMaintenance.java @@ -0,0 +1,137 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ozone.scm.node; + +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.scm.XceiverClientManager; +import org.apache.hadoop.hdds.scm.client.ContainerOperationClient; +import org.apache.hadoop.hdds.scm.node.NodeManager; +import org.apache.hadoop.ozone.MiniOzoneCluster; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static java.util.concurrent.TimeUnit.SECONDS; +import static junit.framework.TestCase.assertEquals; +import static org.apache.hadoop.hdds.HddsConfigKeys.*; +import static org.apache.hadoop.hdds.scm.ScmConfigKeys.*; + +/** + * Test from the scmclient for decommission and maintenance. + */ + +public class TestDecommissionAndMaintenance { + private static final Logger LOG = + LoggerFactory.getLogger(TestDecommissionAndMaintenance.class); + + private static int numOfDatanodes = 5; + private MiniOzoneCluster cluster; + + private ContainerOperationClient scmClient; + + @Before + public void setUp() throws Exception { + OzoneConfiguration conf = new OzoneConfiguration(); + final int interval = 100; + + conf.setTimeDuration(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL, + interval, TimeUnit.MILLISECONDS); + conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL, 1, SECONDS); + conf.setTimeDuration(HDDS_PIPELINE_REPORT_INTERVAL, 1, SECONDS); + conf.setTimeDuration(HDDS_COMMAND_STATUS_REPORT_INTERVAL, 1, SECONDS); + conf.setTimeDuration(HDDS_CONTAINER_REPORT_INTERVAL, 1, SECONDS); + conf.setTimeDuration(HDDS_NODE_REPORT_INTERVAL, 1, SECONDS); + conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 3, SECONDS); + conf.setTimeDuration(OZONE_SCM_DEADNODE_INTERVAL, 6, SECONDS); + + cluster = MiniOzoneCluster.newBuilder(conf) + .setNumDatanodes(numOfDatanodes) + .build(); + cluster.waitForClusterToBeReady(); + scmClient = new ContainerOperationClient(cluster + .getStorageContainerLocationClient(), + new XceiverClientManager(conf)); + } + + @After + public void tearDown() throws Exception { + if (cluster != null) { + cluster.shutdown(); + } + } + + @Test + public void testNodeCanBeDecommMaintAndRecommissioned() + throws IOException { + NodeManager nm = cluster.getStorageContainerManager().getScmNodeManager(); + + List<DatanodeDetails> dns = nm.getAllNodes(); + scmClient.decommissionNodes(Arrays.asList(getDNHostAndPort(dns.get(0)))); + + // Ensure one node is decommissioning + List<DatanodeDetails> decomNodes = nm.getNodes( + HddsProtos.NodeOperationalState.DECOMMISSIONING, + HddsProtos.NodeState.HEALTHY); + assertEquals(1, decomNodes.size()); + + scmClient.recommissionNodes(Arrays.asList(getDNHostAndPort(dns.get(0)))); + + // Ensure zero nodes are now decommissioning + decomNodes = nm.getNodes( + HddsProtos.NodeOperationalState.DECOMMISSIONING, + HddsProtos.NodeState.HEALTHY); + assertEquals(0, decomNodes.size()); + + scmClient.startMaintenanceNodes(Arrays.asList( + getDNHostAndPort(dns.get(0))), 10); + + // None are decommissioning + decomNodes = nm.getNodes( + HddsProtos.NodeOperationalState.DECOMMISSIONING, + HddsProtos.NodeState.HEALTHY); + assertEquals(0, decomNodes.size()); + + // One is in Maintenance + decomNodes = nm.getNodes( + HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE, + HddsProtos.NodeState.HEALTHY); + assertEquals(1, decomNodes.size()); + + scmClient.recommissionNodes(Arrays.asList(getDNHostAndPort(dns.get(0)))); + + // None are in maintenance + decomNodes = nm.getNodes( + HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE, + HddsProtos.NodeState.HEALTHY); + assertEquals(0, decomNodes.size()); + } + + private String getDNHostAndPort(DatanodeDetails dn) { + return dn.getHostName()+":"+dn.getPorts().get(0).getValue(); + } + +} --------------------------------------------------------------------- To unsubscribe, e-mail: hdfs-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: hdfs-commits-h...@hadoop.apache.org