adoroszlai commented on code in PR #2686:
URL: https://github.com/apache/ozone/pull/2686#discussion_r861080466


##########
hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/admin/failover/FailoverCommand.java:
##########
@@ -0,0 +1,414 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.hadoop.ozone.admin.failover;
+
+import org.apache.commons.lang3.EnumUtils;
+import org.apache.hadoop.hdds.NodeDetails;
+import org.apache.hadoop.hdds.cli.GenericCli;
+import org.apache.hadoop.hdds.cli.HddsVersionProvider;
+import org.apache.hadoop.hdds.cli.OzoneAdmin;
+import org.apache.hadoop.hdds.cli.SubcommandWithParent;
+import org.apache.hadoop.hdds.conf.ConfigurationException;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.ratis.RatisHelper;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
+import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
+import org.apache.hadoop.hdds.security.x509.SecurityConfig;
+import org.apache.hadoop.ozone.OmUtils;
+import org.apache.hadoop.ozone.OzoneSecurityUtil;
+import org.apache.hadoop.ozone.client.OzoneClientFactory;
+import org.apache.hadoop.ozone.client.protocol.ClientProtocol;
+import org.apache.hadoop.ozone.ha.ConfUtils;
+import org.apache.hadoop.ozone.om.OMConfigKeys;
+import org.apache.hadoop.ozone.om.ha.OMHANodeDetails;
+import org.apache.hadoop.ozone.om.helpers.ServiceInfoEx;
+import org.apache.ratis.client.RaftClient;
+import org.apache.ratis.client.RaftClientConfigKeys;
+import org.apache.ratis.client.api.GroupManagementApi;
+import org.apache.ratis.conf.Parameters;
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.grpc.GrpcConfigKeys;
+import org.apache.ratis.grpc.GrpcTlsConfig;
+import org.apache.ratis.proto.RaftProtos;
+import org.apache.ratis.protocol.*;
+import org.apache.ratis.retry.ExponentialBackoffRetry;
+import org.apache.ratis.util.TimeDuration;
+import org.kohsuke.MetaInfServices;
+import picocli.CommandLine;
+
+import java.io.IOException;
+import java.security.cert.X509Certificate;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.*;
+
+/**
+ * Subcommand for admin operations related to failover actions.
+ */
+@CommandLine.Command(
+    name = "failover",
+    description = "manually transfer leadership of raft group to target node",
+    mixinStandardHelpOptions = true,
+    versionProvider = HddsVersionProvider.class
+)
+@MetaInfServices(SubcommandWithParent.class)
+public class FailoverCommand extends GenericCli
+    implements SubcommandWithParent {
+
+  public static final RaftGroupId PSEUDO_RAFT_GROUP_ID = 
RaftGroupId.randomId();
+  public static final int TRANSFER_LEADER_WAIT_MS = 120_000;
+  public static final String RANDOM = "RANDOM";
+
+  enum Domain {
+    OM,
+    SCM,
+  }
+
+  @CommandLine.ParentCommand
+  private OzoneAdmin parent;
+
+  @CommandLine.Parameters(
+      description = "['om'/'scm']"
+  )
+  private String domain;

Review Comment:
   We already have `ozone admin om` and `ozone admin scm` commands.  I think it 
would make sense to add `failover` as a subcommand under each of those, instead 
of being a direct subcommand under `ozone admin` and requiring an extra 
parameter.  This would be more consistent with e.g. `roles`, which also exists 
under those two parents.
   
   The generic part of `FailoverCommand` could be in an abstract parent class, 
with OM/SCM-specific details in concrete subclasses.



##########
hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/admin/failover/FailoverCommand.java:
##########
@@ -0,0 +1,414 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.hadoop.ozone.admin.failover;
+
+import org.apache.commons.lang3.EnumUtils;
+import org.apache.hadoop.hdds.NodeDetails;
+import org.apache.hadoop.hdds.cli.GenericCli;
+import org.apache.hadoop.hdds.cli.HddsVersionProvider;
+import org.apache.hadoop.hdds.cli.OzoneAdmin;
+import org.apache.hadoop.hdds.cli.SubcommandWithParent;
+import org.apache.hadoop.hdds.conf.ConfigurationException;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.ratis.RatisHelper;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
+import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
+import org.apache.hadoop.hdds.security.x509.SecurityConfig;
+import org.apache.hadoop.ozone.OmUtils;
+import org.apache.hadoop.ozone.OzoneSecurityUtil;
+import org.apache.hadoop.ozone.client.OzoneClientFactory;
+import org.apache.hadoop.ozone.client.protocol.ClientProtocol;
+import org.apache.hadoop.ozone.ha.ConfUtils;
+import org.apache.hadoop.ozone.om.OMConfigKeys;
+import org.apache.hadoop.ozone.om.ha.OMHANodeDetails;
+import org.apache.hadoop.ozone.om.helpers.ServiceInfoEx;
+import org.apache.ratis.client.RaftClient;
+import org.apache.ratis.client.RaftClientConfigKeys;
+import org.apache.ratis.client.api.GroupManagementApi;
+import org.apache.ratis.conf.Parameters;
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.grpc.GrpcConfigKeys;
+import org.apache.ratis.grpc.GrpcTlsConfig;
+import org.apache.ratis.proto.RaftProtos;
+import org.apache.ratis.protocol.*;
+import org.apache.ratis.retry.ExponentialBackoffRetry;
+import org.apache.ratis.util.TimeDuration;
+import org.kohsuke.MetaInfServices;
+import picocli.CommandLine;
+
+import java.io.IOException;
+import java.security.cert.X509Certificate;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.*;
+
+/**
+ * Subcommand for admin operations related to failover actions.
+ */
+@CommandLine.Command(
+    name = "failover",
+    description = "manually transfer leadership of raft group to target node",
+    mixinStandardHelpOptions = true,
+    versionProvider = HddsVersionProvider.class
+)
+@MetaInfServices(SubcommandWithParent.class)
+public class FailoverCommand extends GenericCli
+    implements SubcommandWithParent {
+
+  public static final RaftGroupId PSEUDO_RAFT_GROUP_ID = 
RaftGroupId.randomId();
+  public static final int TRANSFER_LEADER_WAIT_MS = 120_000;
+  public static final String RANDOM = "RANDOM";
+
+  enum Domain {
+    OM,
+    SCM,
+  }
+
+  @CommandLine.ParentCommand
+  private OzoneAdmin parent;
+
+  @CommandLine.Parameters(
+      description = "['om'/'scm']"
+  )
+  private String domain;
+
+  @CommandLine.Parameters(
+      description = "[host/host:port/'random'] ratis rpc address of " +
+      "target domain if 'random' then choose arbitrary follower node "
+  )
+  private String tgtAddress;
+
+  @CommandLine.Option(
+      names = {"-id", "--service-id"},
+      description = "Ozone Manager Service ID, if domain is om, " +
+          "this option is prerequisite"
+  )
+  private String omServiceId;
+
+  @CommandLine.Option(
+          names = {"--hostPortList"},
+          description = "if need to test on given nodes, " +
+                  "pattern like this 'host:port,host:port,host:port'"
+  )
+  private String ratisAddresses;
+
+  public OzoneAdmin getParent() {
+    return parent;
+  }
+
+  @Override
+  public Void call() throws Exception {
+    OzoneConfiguration configuration;
+    if (parent.getOzoneConf() == null) {
+      configuration = new OzoneConfiguration();
+    } else{
+      configuration = parent.getOzoneConf();
+    }
+    try {
+      transferLeadership(configuration);
+    } catch (Exception ex) {
+      ex.printStackTrace();
+      throw ex;
+    }
+    return null;
+  }
+
+  @Override
+  public Class<?> getParentType() {
+    return OzoneAdmin.class;
+  }
+
+  /**
+   * Create ratis client.
+   *
+   * @param raftGroupId the raft group id
+   * @param peers       the peers
+   * @return the raft client
+   */
+  public static RaftClient createRatisClient(RaftGroupId raftGroupId,
+                                             Collection<RaftPeer> peers,
+                                             GrpcTlsConfig tlsConfig) {
+    RaftProperties properties = new RaftProperties();
+    Parameters parameters = new Parameters();
+    RaftClient.Builder builder = RaftClient.newBuilder();
+    RaftClientConfigKeys.Rpc.setRequestTimeout(properties,
+        TimeDuration.valueOf(15, TimeUnit.SECONDS));
+    ExponentialBackoffRetry retryPolicy = ExponentialBackoffRetry.newBuilder()
+        .setBaseSleepTime(
+                TimeDuration.valueOf(100, TimeUnit.MILLISECONDS))
+        .setMaxAttempts(10)
+        .setMaxSleepTime(
+                TimeDuration.valueOf(100000, TimeUnit.MILLISECONDS))
+        .build();
+    // currently only Grpc supported
+    if (tlsConfig != null) {
+      GrpcConfigKeys.Client.setTlsConf(parameters, tlsConfig);
+    }
+    return builder
+            .setClientId(ClientId.randomId())
+            .setLeaderId(null)
+            .setProperties(properties)
+            .setParameters(parameters)
+            .setRetryPolicy(retryPolicy)
+            .setRaftGroup(RaftGroup.valueOf(raftGroupId, peers))
+            .build();
+  }
+
+  /**
+   * The whole procedure is as following.
+   *
+   *   Pseudo raft client -> get real raft group info -> check address
+   *   whether in raft peers -> real raft client -> set priority -> trigger
+   *   transferleadership -> new raft leader takes office
+   *
+   * @throws IOException IOException
+   */
+  private void transferLeadership(OzoneConfiguration conf) throws IOException {
+    List<String> ratisAddressList;
+    if (ratisAddresses != null) {
+      ratisAddressList = Arrays.asList(ratisAddresses.split(","));
+    } else if (domain.equalsIgnoreCase(Domain.SCM.toString())) {
+      ratisAddressList = getSCMRatisAddressList(conf);
+    } else if (domain.equalsIgnoreCase(Domain.OM.toString())) {
+      ratisAddressList = getOMRatisAddressList(conf);
+    } else {
+      throw new IllegalArgumentException("Invalid domain, should be one of " +
+              Arrays.toString(Domain.values()));
+    }
+    assert ratisAddressList.size() > 0;
+    List<RaftPeer> peerList = ratisAddressList.stream().map(addr-> RaftPeer
+            .newBuilder()
+            .setId(RaftPeerId.valueOf(addr))
+            .setAddress(addr)
+            .build())
+            .collect(Collectors.toList());
+
+    final GrpcTlsConfig tlsConfig = RatisHelper.createTlsClientConfig(new
+            SecurityConfig(conf), getCACertificates(conf));
+    // Pseudo client for inquiry
+    RaftClient raftClient = createRatisClient(PSEUDO_RAFT_GROUP_ID, peerList,
+            tlsConfig);
+    RaftGroupId remoteGroupId;
+    GroupManagementApi groupManagementApi = raftClient.getGroupManagementApi(
+            peerList.get(0).getId());
+    List<RaftGroupId> groupIds = groupManagementApi.list().getGroupIds();

Review Comment:
   `groupManagementApi.list()` fails in a secure cluster with:
   
   ```
   $ kinit -kt /etc/security/keytabs/testuser.keytab testuser/[email protected]
   $ ozone admin failover --service-id id1 om 172.25.0.112
   ...
   org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: UNKNOWN
     ...
     at org.apache.ratis.grpc.client.GrpcClientProtocolClient.groupList(GrpcClientProtocolClient.java:187)
     at org.apache.ratis.grpc.client.GrpcClientRpc.sendRequest(GrpcClientRpc.java:104)
     at org.apache.ratis.client.impl.BlockingImpl.sendRequest(BlockingImpl.java:130)
     at org.apache.ratis.client.impl.GroupManagementImpl.list(GroupManagementImpl.java:67)
     at org.apache.hadoop.ozone.admin.failover.FailoverCommand.transferLeadership(FailoverCommand.java:211)
     at org.apache.hadoop.ozone.admin.failover.FailoverCommand.call(FailoverCommand.java:125)
     at org.apache.hadoop.ozone.admin.failover.FailoverCommand.call(FailoverCommand.java:65)
   Caused by: org.apache.ratis.thirdparty.io.netty.handler.codec.DecoderException: javax.net.ssl.SSLException: error:1000045c:SSL routines:OPENSSL_internal:TLSV1_ALERT_CERTIFICATE_REQUIRED
   ```



##########
hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/admin/failover/FailoverCommand.java:
##########
@@ -0,0 +1,414 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.hadoop.ozone.admin.failover;
+
+import org.apache.commons.lang3.EnumUtils;
+import org.apache.hadoop.hdds.NodeDetails;
+import org.apache.hadoop.hdds.cli.GenericCli;
+import org.apache.hadoop.hdds.cli.HddsVersionProvider;
+import org.apache.hadoop.hdds.cli.OzoneAdmin;
+import org.apache.hadoop.hdds.cli.SubcommandWithParent;
+import org.apache.hadoop.hdds.conf.ConfigurationException;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.ratis.RatisHelper;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
+import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
+import org.apache.hadoop.hdds.security.x509.SecurityConfig;
+import org.apache.hadoop.ozone.OmUtils;
+import org.apache.hadoop.ozone.OzoneSecurityUtil;
+import org.apache.hadoop.ozone.client.OzoneClientFactory;
+import org.apache.hadoop.ozone.client.protocol.ClientProtocol;
+import org.apache.hadoop.ozone.ha.ConfUtils;
+import org.apache.hadoop.ozone.om.OMConfigKeys;
+import org.apache.hadoop.ozone.om.ha.OMHANodeDetails;
+import org.apache.hadoop.ozone.om.helpers.ServiceInfoEx;
+import org.apache.ratis.client.RaftClient;
+import org.apache.ratis.client.RaftClientConfigKeys;
+import org.apache.ratis.client.api.GroupManagementApi;
+import org.apache.ratis.conf.Parameters;
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.grpc.GrpcConfigKeys;
+import org.apache.ratis.grpc.GrpcTlsConfig;
+import org.apache.ratis.proto.RaftProtos;
+import org.apache.ratis.protocol.*;
+import org.apache.ratis.retry.ExponentialBackoffRetry;
+import org.apache.ratis.util.TimeDuration;
+import org.kohsuke.MetaInfServices;
+import picocli.CommandLine;
+
+import java.io.IOException;
+import java.security.cert.X509Certificate;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.*;
+
+/**
+ * Subcommand for admin operations related to failover actions.
+ */
+@CommandLine.Command(
+    name = "failover",
+    description = "manually transfer leadership of raft group to target node",
+    mixinStandardHelpOptions = true,
+    versionProvider = HddsVersionProvider.class
+)
+@MetaInfServices(SubcommandWithParent.class)
+public class FailoverCommand extends GenericCli
+    implements SubcommandWithParent {
+
+  public static final RaftGroupId PSEUDO_RAFT_GROUP_ID = 
RaftGroupId.randomId();
+  public static final int TRANSFER_LEADER_WAIT_MS = 120_000;
+  public static final String RANDOM = "RANDOM";
+
+  enum Domain {
+    OM,
+    SCM,
+  }
+
+  @CommandLine.ParentCommand
+  private OzoneAdmin parent;
+
+  @CommandLine.Parameters(
+      description = "['om'/'scm']"
+  )
+  private String domain;
+
+  @CommandLine.Parameters(
+      description = "[host/host:port/'random'] ratis rpc address of " +
+      "target domain if 'random' then choose arbitrary follower node "
+  )
+  private String tgtAddress;
+
+  @CommandLine.Option(
+      names = {"-id", "--service-id"},
+      description = "Ozone Manager Service ID, if domain is om, " +
+          "this option is prerequisite"
+  )
+  private String omServiceId;
+
+  @CommandLine.Option(
+          names = {"--hostPortList"},
+          description = "if need to test on given nodes, " +
+                  "pattern like this 'host:port,host:port,host:port'"
+  )
+  private String ratisAddresses;
+
+  public OzoneAdmin getParent() {
+    return parent;
+  }
+
+  @Override
+  public Void call() throws Exception {
+    OzoneConfiguration configuration;
+    if (parent.getOzoneConf() == null) {
+      configuration = new OzoneConfiguration();
+    } else{
+      configuration = parent.getOzoneConf();
+    }
+    try {
+      transferLeadership(configuration);
+    } catch (Exception ex) {
+      ex.printStackTrace();
+      throw ex;
+    }
+    return null;
+  }
+
+  @Override
+  public Class<?> getParentType() {
+    return OzoneAdmin.class;
+  }
+
+  /**
+   * Create ratis client.
+   *
+   * @param raftGroupId the raft group id
+   * @param peers       the peers
+   * @return the raft client
+   */
+  public static RaftClient createRatisClient(RaftGroupId raftGroupId,
+                                             Collection<RaftPeer> peers,
+                                             GrpcTlsConfig tlsConfig) {
+    RaftProperties properties = new RaftProperties();
+    Parameters parameters = new Parameters();
+    RaftClient.Builder builder = RaftClient.newBuilder();
+    RaftClientConfigKeys.Rpc.setRequestTimeout(properties,
+        TimeDuration.valueOf(15, TimeUnit.SECONDS));
+    ExponentialBackoffRetry retryPolicy = ExponentialBackoffRetry.newBuilder()
+        .setBaseSleepTime(
+                TimeDuration.valueOf(100, TimeUnit.MILLISECONDS))
+        .setMaxAttempts(10)
+        .setMaxSleepTime(
+                TimeDuration.valueOf(100000, TimeUnit.MILLISECONDS))
+        .build();
+    // currently only Grpc supported
+    if (tlsConfig != null) {
+      GrpcConfigKeys.Client.setTlsConf(parameters, tlsConfig);
+    }
+    return builder
+            .setClientId(ClientId.randomId())
+            .setLeaderId(null)
+            .setProperties(properties)
+            .setParameters(parameters)
+            .setRetryPolicy(retryPolicy)
+            .setRaftGroup(RaftGroup.valueOf(raftGroupId, peers))
+            .build();
+  }
+
+  /**
+   * The whole procedure is as following.
+   *
+   *   Pseudo raft client -> get real raft group info -> check address
+   *   whether in raft peers -> real raft client -> set priority -> trigger
+   *   transferleadership -> new raft leader takes office
+   *
+   * @throws IOException IOException
+   */
+  private void transferLeadership(OzoneConfiguration conf) throws IOException {
+    List<String> ratisAddressList;
+    if (ratisAddresses != null) {
+      ratisAddressList = Arrays.asList(ratisAddresses.split(","));
+    } else if (domain.equalsIgnoreCase(Domain.SCM.toString())) {
+      ratisAddressList = getSCMRatisAddressList(conf);
+    } else if (domain.equalsIgnoreCase(Domain.OM.toString())) {
+      ratisAddressList = getOMRatisAddressList(conf);
+    } else {
+      throw new IllegalArgumentException("Invalid domain, should be one of " +
+              Arrays.toString(Domain.values()));
+    }
+    assert ratisAddressList.size() > 0;
+    List<RaftPeer> peerList = ratisAddressList.stream().map(addr-> RaftPeer
+            .newBuilder()
+            .setId(RaftPeerId.valueOf(addr))
+            .setAddress(addr)
+            .build())
+            .collect(Collectors.toList());
+
+    final GrpcTlsConfig tlsConfig = RatisHelper.createTlsClientConfig(new
+            SecurityConfig(conf), getCACertificates(conf));
+    // Pseudo client for inquiry
+    RaftClient raftClient = createRatisClient(PSEUDO_RAFT_GROUP_ID, peerList,
+            tlsConfig);
+    RaftGroupId remoteGroupId;
+    GroupManagementApi groupManagementApi = raftClient.getGroupManagementApi(
+            peerList.get(0).getId());
+    List<RaftGroupId> groupIds = groupManagementApi.list().getGroupIds();
+    if (groupIds.size() == 1) {
+      remoteGroupId = groupIds.get(0);
+    } else {
+      throw new IOException("There are more than one raft groups.");
+    }
+    GroupInfoReply groupInfoReply = groupManagementApi.info(remoteGroupId);
+    RaftGroup raftGroup = groupInfoReply.getGroup();
+    raftClient = createRatisClient(raftGroup.getGroupId(),
+            raftGroup.getPeers(), tlsConfig);
+    System.out.println("RaftGroup from raft server: " + raftGroup);
+
+    if (tgtAddress.equalsIgnoreCase(RANDOM)) {
+      tgtAddress = getRandomAddress(groupInfoReply);
+    }
+    getRatisPortHost(conf);
+    // check address passed whether belongs to the peers
+    if (raftGroup.getPeers().stream().noneMatch(raftPeer ->
+        raftPeer.getAddress().contains(tgtAddress))) {
+      throw new IOException(String.format("%s is not part of the " +
+          "quorum %s.", tgtAddress, raftGroup.getPeers().stream().
+              map(RaftPeer::getAddress).collect(Collectors.toList())));
+    }
+    System.out.printf("Trying to transfer to new leader %s.%n", tgtAddress);
+
+    List<RaftPeer> peersWithNewPriorities = new ArrayList<>();
+    for (RaftPeer peer : raftGroup.getPeers()) {
+      peersWithNewPriorities.add(
+          RaftPeer.newBuilder(peer)
+                  .setPriority(peer.getAddress()
+                          .equalsIgnoreCase(tgtAddress) ? 2 : 1)
+                  .build()
+      );
+    }
+    RaftClientReply reply;
+    reply = raftClient.admin().setConfiguration(peersWithNewPriorities);
+    if (reply.isSuccess()) {
+      System.out.printf("Successfully set new priority for division: %s.%n",
+          peersWithNewPriorities);
+    } else {
+      System.out.printf("Failed to set new priority for division: %s." +
+          " Ratis reply: %s.%n", peersWithNewPriorities, reply);
+      throw new IOException(reply.getException());
+    }
+
+    RaftPeerId newLeaderPeerId = raftGroup.getPeers().stream().
+        filter(peer -> peer.getAddress().equalsIgnoreCase(tgtAddress))
+            .findAny().get().getId();
+    reply = raftClient.admin().transferLeadership(
+        newLeaderPeerId, TRANSFER_LEADER_WAIT_MS);
+    if (reply.isSuccess()) {
+      System.out.printf("Successfully transferred leadership: %s.%n",
+              tgtAddress);
+    } else {
+      System.out.printf("Failed to transfer leadership: %s." +
+          " Ratis reply: %s.%n", tgtAddress, reply);
+      throw new IOException(reply.getException());
+    }
+  }
+
+  /**
+   * Check OM HA and get ratis address list.
+   *
+   * @param conf the conf
+   * @return the om ratis address list
+   * @throws IOException the io exception
+   */
+  List<String> getOMRatisAddressList(OzoneConfiguration conf)
+          throws IOException {
+    if (OmUtils.isOmHAServiceId(conf, omServiceId)) {
+      OMHANodeDetails omhaNodeDetails = OMHANodeDetails.loadOMHAConfig(conf);

Review Comment:
   Transferring OM leadership failed when executed on an SCM host:
   
   (172.25.0.112 is om2)
   
   ```
   $ ozone admin failover --service-id id1 om 172.25.0.112
   ServiceID for OzoneManager is id1
   Configuration has no ozone.om.address address that matches local node's address.
   org.apache.hadoop.ozone.OzoneIllegalArgumentException: Configuration has no ozone.om.address address that matches local node's address.
        at org.apache.hadoop.ozone.om.ha.OMHANodeDetails.throwConfException(OMHANodeDetails.java:307)
        at org.apache.hadoop.ozone.om.ha.OMHANodeDetails.loadOMHAConfig(OMHANodeDetails.java:225)
        at org.apache.hadoop.ozone.admin.failover.FailoverCommand.getOMRatisAddressList(FailoverCommand.java:281)
        at org.apache.hadoop.ozone.admin.failover.FailoverCommand.transferLeadership(FailoverCommand.java:190)
        at org.apache.hadoop.ozone.admin.failover.FailoverCommand.call(FailoverCommand.java:125)
        at org.apache.hadoop.ozone.admin.failover.FailoverCommand.call(FailoverCommand.java:65)
   ```
   
   It would be nice to make the command work on any host in the cluster.



##########
hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/admin/om/GetCertificatesSubcommand.java:
##########
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.admin.om;
+
+import org.apache.hadoop.hdds.cli.HddsVersionProvider;
+import org.apache.hadoop.ozone.OzoneSecurityUtil;
+import org.apache.hadoop.ozone.om.helpers.ServiceInfoEx;
+import org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol;
+import picocli.CommandLine;
+
+import java.io.IOException;
+import java.security.cert.X509Certificate;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+/**
+ * Handler of om roles command.
+ */
+@CommandLine.Command(
+    name = "certificates", aliases = "certs",
+    description = "List all CA certificates from OM",
+    mixinStandardHelpOptions = true,
+    versionProvider = HddsVersionProvider.class)
+public class GetCertificatesSubcommand implements Callable<Void> {

Review Comment:
   This subcommand does not seem to be necessary for the failover.



##########
hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/admin/om/GetCertificatesSubcommand.java:
##########
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.admin.om;
+
+import org.apache.hadoop.hdds.cli.HddsVersionProvider;
+import org.apache.hadoop.ozone.OzoneSecurityUtil;
+import org.apache.hadoop.ozone.om.helpers.ServiceInfoEx;
+import org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol;
+import picocli.CommandLine;
+
+import java.io.IOException;
+import java.security.cert.X509Certificate;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+/**
+ * Handler of om roles command.
+ */
+@CommandLine.Command(
+    name = "certificates", aliases = "certs",
+    description = "List all CA certificates from OM",
+    mixinStandardHelpOptions = true,
+    versionProvider = HddsVersionProvider.class)
+public class GetCertificatesSubcommand implements Callable<Void> {
+
+  @CommandLine.ParentCommand
+  private OMAdmin parent;
+
+  @CommandLine.Option(names = {"-id", "--service-id"},
+      description = "OM Service ID, for HA mode."
+  )
+  private String omServiceId;
+
+  @CommandLine.Option(
+      names = {"-host", "--service-host"},
+      description = "Ozone Manager Host, for non-HA mode."
+  )
+  private String omHost;
+
+  @CommandLine.Option(
+      names = {"--show", "-show"},
+      required = true
+  )
+  private boolean action;

Review Comment:
   This option is not used.  What is the purpose?



##########
hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/admin/failover/FailoverCommand.java:
##########
@@ -0,0 +1,414 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.hadoop.ozone.admin.failover;
+
+import org.apache.commons.lang3.EnumUtils;
+import org.apache.hadoop.hdds.NodeDetails;
+import org.apache.hadoop.hdds.cli.GenericCli;
+import org.apache.hadoop.hdds.cli.HddsVersionProvider;
+import org.apache.hadoop.hdds.cli.OzoneAdmin;
+import org.apache.hadoop.hdds.cli.SubcommandWithParent;
+import org.apache.hadoop.hdds.conf.ConfigurationException;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.ratis.RatisHelper;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
+import org.apache.hadoop.hdds.scm.ha.SCMHAUtils;
+import org.apache.hadoop.hdds.security.x509.SecurityConfig;
+import org.apache.hadoop.ozone.OmUtils;
+import org.apache.hadoop.ozone.OzoneSecurityUtil;
+import org.apache.hadoop.ozone.client.OzoneClientFactory;
+import org.apache.hadoop.ozone.client.protocol.ClientProtocol;
+import org.apache.hadoop.ozone.ha.ConfUtils;
+import org.apache.hadoop.ozone.om.OMConfigKeys;
+import org.apache.hadoop.ozone.om.ha.OMHANodeDetails;
+import org.apache.hadoop.ozone.om.helpers.ServiceInfoEx;
+import org.apache.ratis.client.RaftClient;
+import org.apache.ratis.client.RaftClientConfigKeys;
+import org.apache.ratis.client.api.GroupManagementApi;
+import org.apache.ratis.conf.Parameters;
+import org.apache.ratis.conf.RaftProperties;
+import org.apache.ratis.grpc.GrpcConfigKeys;
+import org.apache.ratis.grpc.GrpcTlsConfig;
+import org.apache.ratis.proto.RaftProtos;
+import org.apache.ratis.protocol.*;
+import org.apache.ratis.retry.ExponentialBackoffRetry;
+import org.apache.ratis.util.TimeDuration;
+import org.kohsuke.MetaInfServices;
+import picocli.CommandLine;
+
+import java.io.IOException;
+import java.security.cert.X509Certificate;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.*;
+
+/**
+ * Subcommand for admin operations related to failover actions.
+ */
+@CommandLine.Command(
+    name = "failover",
+    description = "manually transfer leadership of raft group to target node",
+    mixinStandardHelpOptions = true,
+    versionProvider = HddsVersionProvider.class
+)
+@MetaInfServices(SubcommandWithParent.class)
+public class FailoverCommand extends GenericCli
+    implements SubcommandWithParent {
+
+  public static final RaftGroupId PSEUDO_RAFT_GROUP_ID = RaftGroupId.randomId();
+  public static final int TRANSFER_LEADER_WAIT_MS = 120_000;
+  public static final String RANDOM = "RANDOM";
+
+  enum Domain {
+    OM,
+    SCM,
+  }
+
+  @CommandLine.ParentCommand
+  private OzoneAdmin parent;
+
+  @CommandLine.Parameters(
+      description = "['om'/'scm']"
+  )
+  private String domain;
+
+  @CommandLine.Parameters(
+      description = "[host/host:port/'random'] ratis rpc address of " +
+      "target domain if 'random' then choose arbitrary follower node "
+  )
+  private String tgtAddress;
+
+  @CommandLine.Option(
+      names = {"-id", "--service-id"},
+      description = "Ozone Manager Service ID, if domain is om, " +
+          "this option is prerequisite"
+  )
+  private String omServiceId;
+
+  @CommandLine.Option(
+          names = {"--hostPortList"},
+          description = "if need to test on given nodes, " +
+                  "pattern like this 'host:port,host:port,host:port'"
+  )
+  private String ratisAddresses;
+
+  public OzoneAdmin getParent() {
+    return parent;
+  }
+
+  @Override
+  public Void call() throws Exception {
+    OzoneConfiguration configuration;
+    if (parent.getOzoneConf() == null) {
+      configuration = new OzoneConfiguration();
+    } else{
+      configuration = parent.getOzoneConf();
+    }
+    try {
+      transferLeadership(configuration);
+    } catch (Exception ex) {
+      ex.printStackTrace();
+      throw ex;
+    }

Review Comment:
   The Ozone CLI handles uncaught exceptions and prints either a message or a
stack trace, depending on this global option:
   
   ```
   --verbose   More verbose output. Show the stack trace of the errors.
   ```
   
   So the `printStackTrace()` call is unnecessary here.  (And then the
`try-catch` can be omitted, too.)
   
   ```suggestion
       transferLeadership(configuration);
   ```



##########
hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/admin/om/GetCertificatesSubcommand.java:
##########
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.admin.om;
+
+import org.apache.hadoop.hdds.cli.HddsVersionProvider;
+import org.apache.hadoop.ozone.OzoneSecurityUtil;
+import org.apache.hadoop.ozone.om.helpers.ServiceInfoEx;
+import org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol;
+import picocli.CommandLine;
+
+import java.io.IOException;
+import java.security.cert.X509Certificate;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+/**
+ * Handler of om certificates command.
+ */
+@CommandLine.Command(
+    name = "certificates", aliases = "certs",
+    description = "List all CA certificates from OM",
+    mixinStandardHelpOptions = true,
+    versionProvider = HddsVersionProvider.class)
+public class GetCertificatesSubcommand implements Callable<Void> {
+
+  @CommandLine.ParentCommand
+  private OMAdmin parent;
+
+  @CommandLine.Option(names = {"-id", "--service-id"},
+      description = "OM Service ID, for HA mode."
+  )
+  private String omServiceId;
+
+  @CommandLine.Option(
+      names = {"-host", "--service-host"},

Review Comment:
   Single-dash options should be single-char (e.g. `-h`), not longer (`-host`).
 If the char is already used for another option, consider a capital letter
(`-H`), another char, or omitting the single-dash option completely.
   
   Rationale: several single-dash options can be combined in a single
argument, so `-host` is expected to mean the same as `-h -o -s -t`.
   
   This applies to all options, not just to this specific one.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to