Hi!

On Thu, 23 Apr 2015 at 11:29 Hrvoje Ribicic <[email protected]> wrote:

> On Fri, Apr 17, 2015 at 4:37 PM, 'Helga Velroyen' via ganeti-devel <
> [email protected]> wrote:
>
>> When removing an SSH key, RemoveNodeSshKey so far did only
>> try to contact each node once. This made the entire
>> procedure not very resilient towards temporary network
>> failures. This patch adds a couple of retries for contacting
>> each node and only gives up after those are exhausted.
>>
>> Signed-off-by: Helga Velroyen <[email protected]>
>> ---
>>  lib/backend.py                     | 101
>> ++++++++++++++++++++++++-------------
>>  lib/cmdlib/node.py                 |  21 ++++++--
>>  test/py/ganeti.backend_unittest.py |  65 ++++++++++++++++++++++++
>>  3 files changed, 148 insertions(+), 39 deletions(-)
>>
>> diff --git a/lib/backend.py b/lib/backend.py
>> index 8ec22d9..c3190d2 100644
>> --- a/lib/backend.py
>> +++ b/lib/backend.py
>> @@ -1660,12 +1660,15 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>    @returns: list of feedback messages
>>
>>    """
>> -  result_msgs = []
>> +  # Non-disruptive error messages, mapped from node name to message
>> +  result_msgs = {}
>> +  # maximum number of retries for SSH connections
>> +  max_retries = 3
>>
>>    # Make sure at least one of these flags is true.
>>    if not (from_authorized_keys or from_public_keys or
>> clear_authorized_keys
>>            or clear_public_keys):
>> -    result_msgs.append("No removal from any key file was requested.")
>> +    raise errors.SshUpdateError("No removal from any key file was
>> requested.")
>>
>>    if not ssconf_store:
>>      ssconf_store = ssconf.SimpleStore()
>> @@ -1717,8 +1720,11 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>
>>        all_nodes = ssconf_store.GetNodeList()
>>        online_nodes = ssconf_store.GetOnlineNodeList()
>> +      logging.debug("Removing key of node '%s' from all nodes but itself
>> and"
>> +                    " master.", node_name)
>>        for node in all_nodes:
>>          if node == master_node:
>> +          logging.debug("Skipping master node '%s'.", master_node)
>>            continue
>>          if node not in online_nodes:
>>            logging.debug("Skipping offline node '%s'.", node)
>> @@ -1729,24 +1735,48 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>                                     " node '%s', map: %s." %
>>                                     (node, ssh_port_map))
>>          if node in potential_master_candidates:
>> -          try:
>> -            run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>> -                       ssh_port, pot_mc_data,
>> -                       debug=False, verbose=False, use_cluster_key=False,
>> -                       ask_key=False, strict_host_check=False)
>> -          except errors.OpExecError as e:
>> -            result_msgs.append("Warning: the SSH setup of node '%s'
>> could not"
>> -                               " be adjusted." % node)
>> -        else:
>> -          if from_authorized_keys:
>> +          error_msg_try = ("The SSH setup of node '%s' could not"
>> +                           " be adjusted in try no %s. Error: %s.")
>> +          error_msg_final = ("When removing the key of node '%s',
>> updating the"
>> +                             " SSH key files of node '%s' failed after
>> %s"
>> +                             " retries. Not trying again. Last error
>> was: %s.")
>>
>
> Looking at the code, it seems this has to go before the if as it is used
> in the else as well.
>

Ack.


>
>
>> +          last_exception = None
>> +          for i in range(max_retries):
>>              try:
>>                run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>> -                         ssh_port, base_data,
>> +                         ssh_port, pot_mc_data,
>>                           debug=False, verbose=False,
>> use_cluster_key=False,
>>                           ask_key=False, strict_host_check=False)
>> +              break
>>              except errors.OpExecError as e:
>> -              result_msgs.append("Warning: the SSH setup of node '%s'
>> could"
>> -                                 " not be adjusted." % node)
>> +              logging.error(error_msg_try, node, i, e)
>> +              last_exception = e
>> +          else:
>> +            if last_exception:
>> +              error_msg = error_msg_final % (node_name, node,
>> max_retries,
>> +                                             last_exception)
>> +              result_msgs[node] = error_msg
>> +              logging.error(error_msg)
>> +
>> +        else:
>> +          last_exception = None
>> +          if from_authorized_keys:
>> +            for i in range(max_retries):
>> +              try:
>> +                run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>> +                           ssh_port, base_data,
>> +                           debug=False, verbose=False,
>> use_cluster_key=False,
>> +                           ask_key=False, strict_host_check=False)
>> +                break
>> +              except errors.OpExecError as e:
>> +                logging.error(error_msg_try, node, i, e)
>> +                last_exception = e
>> +          else:
>> +            if last_exception:
>> +              error_msg = error_msg_final % (node_name, node,
>> max_retries,
>> +                                             last_exception)
>> +              result_msgs[node] = error_msg
>> +              logging.error(error_msg)
>
>
>>    if clear_authorized_keys or from_public_keys or clear_public_keys:
>>      data = {}
>> @@ -1790,9 +1820,10 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>                   debug=False, verbose=False, use_cluster_key=False,
>>                   ask_key=False, strict_host_check=False)
>>      except errors.OpExecError as e:
>> -      result_msgs.append("Removing SSH keys from node '%s' failed. This
>> can"
>> -                         " happen when the node is already unreachable."
>> -                         " Error: %s" % (node_name, e))
>> +      result_msgs[node_name] = \
>> +          ("Removing SSH keys from node '%s' failed. This can"
>> +           " happen when the node is already unreachable."
>> +           " Error: %s" % (node_name, e))
>>
>>    return result_msgs
>>
>> @@ -1980,15 +2011,14 @@ def RenewSshKeys(node_uuids, node_names,
>> ssh_port_map,
>>          # and that would terminate all communication from the master to
>> the
>>          # node.
>>          logging.debug("Removing SSH key of node '%s'.", node_name)
>> -        RemoveNodeSshKey(node_uuid, node_name,
>> -                         master_candidate_uuids,
>> -                         potential_master_candidates,
>> -                         ssh_port_map,
>> -                         master_uuid=master_node_uuid,
>> -                         from_authorized_keys=master_candidate,
>> -                         from_public_keys=False,
>> -                         clear_authorized_keys=False,
>> -                         clear_public_keys=False)
>> +        node_errors = RemoveNodeSshKey(
>> +           node_uuid, node_name, master_candidate_uuids,
>> +           potential_master_candidates, ssh_port_map,
>> +           master_uuid=master_node_uuid,
>> from_authorized_keys=master_candidate,
>> +           from_public_keys=False, clear_authorized_keys=False,
>> +           clear_public_keys=False)
>> +        if node_errors:
>> +          all_node_errors = all_node_errors + [node_errors.items()]
>>        else:
>>          logging.debug("Old key of node '%s' is the same as the current
>> master"
>>                        " key. Not deleting that key on the node.",
>> node_name)
>> @@ -2070,15 +2100,14 @@ def RenewSshKeys(node_uuids, node_names,
>> ssh_port_map,
>>
>>    # Remove the old key from all node's authorized keys file
>>    logging.debug("Remove the old master key from all nodes.")
>> -  RemoveNodeSshKey(master_node_uuid, master_node_name,
>> -                   master_candidate_uuids,
>> -                   potential_master_candidates,
>> -                   ssh_port_map,
>> -                   keys_to_remove=old_master_keys_by_uuid,
>> -                   from_authorized_keys=True,
>> -                   from_public_keys=False,
>> -                   clear_authorized_keys=False,
>> -                   clear_public_keys=False)
>> +  node_errors = RemoveNodeSshKey(
>> +      master_node_uuid, master_node_name, master_candidate_uuids,
>> +      potential_master_candidates, ssh_port_map,
>> +      keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
>> +      from_public_keys=False, clear_authorized_keys=False,
>> +      clear_public_keys=False)
>> +  if node_errors:
>> +    all_node_errors = all_node_errors + node_errors.items()
>>
>>    return all_node_errors
>>
>> diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py
>> index 6a13d8a..8453b33 100644
>> --- a/lib/cmdlib/node.py
>> +++ b/lib/cmdlib/node.py
>> @@ -366,6 +366,12 @@ class LUNodeAdd(LogicalUnit):
>>        remove_result[master_node].Raise(
>>          "Could not remove SSH keys of node %s before readding,"
>>          " (UUID: %s)." % (new_node_name, new_node_uuid))
>>
>
> As with previous patches, please insert space, use helper function, alter
> message.
>

I added a helper function and used it here. I don't really get the request
for spaces. It is calling the rpc, raising serious exceptions, then
evaluating the other output. I think that pretty much belongs together as a
block.


>
>
>> +      result_msgs = remove_result[master_node].payload
>> +      if result_msgs:
>> +        feedback_fn("Removing old SSH key of node '%s' failed on some
>> nodes."
>> +                    % new_node_name)
>> +        for node, error_msg in result_msgs.items():
>> +          feedback_fn("%s: %s" % (node, error_msg))
>>
>>      result = rpcrunner.call_node_ssh_key_add(
>>        [master_node], new_node_uuid, new_node_name,
>> @@ -885,12 +891,15 @@ class LUNodeSetParams(LogicalUnit):
>>              False, # currently, all nodes are potential master candidates
>>              False, # do not clear node's 'authorized_keys'
>>              False) # do not clear node's 'ganeti_pub_keys'
>> -          if not ssh_result[master_node].fail_msg:
>> -            for message in ssh_result[master_node].payload:
>> -              feedback_fn(message)
>>            ssh_result[master_node].Raise(
>>              "Could not adjust the SSH setup after demoting node '%s'"
>>              " (UUID: %s)." % (node.name, node.uuid))
>>
>
> As above.
>

See above.


>
>
>> +          if not ssh_result[master_node].fail_msg:
>> +            if ssh_result[master_node].payload:
>> +              feedback_fn("When demoting node '%s', updating SSH keys on"
>> +                          " some nodes failed." % (node.name))
>> +            for node, message in ssh_result[master_node].payload:
>> +              feedback_fn("%s: %s" % (node, message))
>>
>>          if self.new_role == self._ROLE_CANDIDATE:
>>            ssh_result = self.rpc.call_node_ssh_key_add(
>> @@ -1603,6 +1612,12 @@ class LUNodeRemove(LogicalUnit):
>>        result[master_node].Raise(
>>          "Could not remove the SSH key of node '%s' (UUID: %s)." %
>>          (self.op.node_name, self.node.uuid))
>> +      result_msgs = result[master_node].payload
>> +      if result_msgs:
>> +        feedback_fn("Removing old SSH key of node '%s' failed on some
>> nodes."
>> +                    % self.op.node_name)
>> +        for node, error_msg in result_msgs.items():
>> +          feedback_fn("%s: %s" % (node, error_msg))
>>
>>      # Promote nodes to master candidate as needed
>>      AdjustCandidatePool(self, [self.node.uuid])
>> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
>> ganeti.backend_unittest.py
>> index e05580d..3c207f8 100755
>> --- a/test/py/ganeti.backend_unittest.py
>> +++ b/test/py/ganeti.backend_unittest.py
>> @@ -1563,6 +1563,71 @@ class
>> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>>
>>      self.assertTrue(node_errors[other_node_name])
>>
>> +  def testRemoveKeySuccessfullyWithRetriesOnOtherNode(self):
>> +    """Test removing keys even if one of the old nodes needs retries.
>> +
>> +    This tests checks whether a key can be removed successfully even
>> +    when one of the other nodes needs to be contacted with several
>> +    retries.
>> +
>> +    """
>> +    all_master_candidates =
>> self._ssh_file_manager.GetAllMasterCandidates()
>> +    node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0]
>> +    other_node_name, _, _, _, _, _ = all_master_candidates[1]
>> +    assert node_name != self._master_node
>> +    assert other_node_name != self._master_node
>> +    assert node_name != other_node_name
>> +    self._ssh_file_manager.SetMaxRetries(other_node_name, 3)
>> +
>> +    backend.RemoveNodeSshKey(node_uuid, node_name,
>> +                             self._master_candidate_uuids,
>> +                             self._potential_master_candidates,
>> +                             self._ssh_port_map,
>> +                             from_authorized_keys=True,
>> +                             from_public_keys=True,
>> +                             clear_authorized_keys=True,
>> +                             clear_public_keys=True,
>> +                             pub_key_file=self._pub_key_file,
>> +                             ssconf_store=self._ssconf_mock,
>> +                             noded_cert_file=self.noded_cert_file,
>> +                             run_cmd_fn=self._run_cmd_mock)
>> +
>> +    self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
>> +        node_uuid, node_key))
>> +
>> self.assertTrue(self._ssh_file_manager.NoNodeHasAuthorizedKey(node_key))
>> +
>> +  def testRemoveKeyFailedWithRetriesOnOtherNode(self):
>> +    """Test removing keys even if one of the old nodes fails even with
>> retries.
>> +
>> +    This tests checks whether the removal of a key finishes properly,
>> even if
>> +    the update of the key files on one of the other nodes fails despite
>> several
>> +    retries.
>> +
>> +    """
>> +    all_master_candidates =
>> self._ssh_file_manager.GetAllMasterCandidates()
>> +    node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0]
>> +    other_node_name, _, _, _, _, _ = all_master_candidates[1]
>> +    assert node_name != self._master_node
>> +    assert other_node_name != self._master_node
>> +    assert node_name != other_node_name
>> +    self._ssh_file_manager.SetMaxRetries(other_node_name, 4)
>> +
>> +    error_msgs = backend.RemoveNodeSshKey(
>> +        node_uuid, node_name, self._master_candidate_uuids,
>> +        self._potential_master_candidates, self._ssh_port_map,
>> +        from_authorized_keys=True, from_public_keys=True,
>> +        clear_authorized_keys=True, clear_public_keys=True,
>> +        pub_key_file=self._pub_key_file, ssconf_store=self._ssconf_mock,
>> +        noded_cert_file=self.noded_cert_file,
>> run_cmd_fn=self._run_cmd_mock)
>> +
>> +    for node in self._all_nodes:
>> +      if node == other_node_name:
>> +        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>> +            node, node_key))
>> +      else:
>> +        self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
>> +            node, node_key))
>> +    self.assertTrue(error_msgs[other_node_name])
>>
>>  class TestVerifySshSetup(testutils.GanetiTestCase):
>>
>> --
>> 2.2.0.rc0.207.ga3a616c
>>
>>
Interdiff:

commit cdd4f00f57d5610dd467ba1c3562d3d89ff6eca0
Author: Helga Velroyen <[email protected]>
Date:   Thu Apr 23 12:29:06 2015 +0200

    interdiff

diff --git a/lib/backend.py b/lib/backend.py
index 87c8c77..bdc7d3e 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1659,8 +1659,8 @@ def RemoveNodeSshKey(node_uuid, node_name,
   @returns: list of feedback messages

   """
-  # Non-disruptive error messages, mapped from node name to message
-  result_msgs = {}
+  # Non-disruptive error messages, list of (node, msg) pairs
+  result_msgs = []

   # Make sure at least one of these flags is true.
   if not (from_authorized_keys or from_public_keys or clear_authorized_keys
@@ -1731,13 +1731,12 @@ def RemoveNodeSshKey(node_uuid, node_name,
           raise errors.OpExecError("No SSH port information available for"
                                    " node '%s', map: %s." %
                                    (node, ssh_port_map))
+        error_msg_try = ("The SSH setup of node '%s' could not"
+                         " be adjusted in try no %s. Error: %s.")
+        error_msg_final = ("When removing the key of node '%s', updating
the"
+                           " SSH key files of node '%s' failed after %s"
+                           " retries. Not trying again. Last error was:
%s.")
         if node in potential_master_candidates:
-          error_msg_try = ("The SSH setup of node '%s' could not"
-                           " be adjusted in try no %s. Error: %s.")
-          error_msg_final = ("When removing the key of node '%s', updating
the"
-                             " SSH key files of node '%s' failed after %s"
-                             " retries. Not trying again. Last error was:
%s.")
-          last_exception = None
           for i in range(constants.SSHS_MAX_RETRIES):
             try:
               run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
@@ -1752,7 +1751,7 @@ def RemoveNodeSshKey(node_uuid, node_name,
             if last_exception:
               error_msg = error_msg_final % (
                   node_name, node, constants.SSHS_MAX_RETRIES,
last_exception)
-              result_msgs[node] = error_msg
+              result_msgs.append((node, error_msg))
               logging.error(error_msg)

         else:
@@ -1772,7 +1771,7 @@ def RemoveNodeSshKey(node_uuid, node_name,
             if last_exception:
               error_msg = error_msg_final % (
                   node_name, node, constants.SSHS_MAX_RETRIES,
last_exception)
-              result_msgs[node] = error_msg
+              result_msgs.append((node, error_msg))
               logging.error(error_msg)

   if clear_authorized_keys or from_public_keys or clear_public_keys:
@@ -2015,7 +2014,7 @@ def RenewSshKeys(node_uuids, node_names, ssh_port_map,
            from_public_keys=False, clear_authorized_keys=False,
            clear_public_keys=False)
         if node_errors:
-          all_node_errors = all_node_errors + [node_errors.items()]
+          all_node_errors = all_node_errors + node_errors
       else:
         logging.debug("Old key of node '%s' is the same as the current
master"
                       " key. Not deleting that key on the node.",
node_name)
@@ -2104,7 +2103,7 @@ def RenewSshKeys(node_uuids, node_names, ssh_port_map,
       from_public_keys=False, clear_authorized_keys=False,
       clear_public_keys=False)
   if node_errors:
-    all_node_errors = all_node_errors + node_errors.items()
+    all_node_errors = all_node_errors + node_errors

   return all_node_errors

diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py
index 337290e..41451b3 100644
--- a/lib/cmdlib/node.py
+++ b/lib/cmdlib/node.py
@@ -366,12 +366,7 @@ class LUNodeAdd(LogicalUnit):
       remove_result[master_node].Raise(
         "Could not remove SSH keys of node %s before readding,"
         " (UUID: %s)." % (new_node_name, new_node_uuid))
-      result_msgs = remove_result[master_node].payload
-      if result_msgs:
-        feedback_fn("Removing old SSH key of node '%s' failed on some
nodes."
-                    % new_node_name)
-        for node, error_msg in result_msgs.items():
-          feedback_fn("%s: %s" % (node, error_msg))
+      EvaluateSshUpdateRPC(remove_result, master_node, feedback_fn)

     result = rpcrunner.call_node_ssh_key_add(
       [master_node], new_node_uuid, new_node_name,
@@ -891,12 +886,7 @@ class LUNodeSetParams(LogicalUnit):
           ssh_result[master_node].Raise(
             "Could not adjust the SSH setup after demoting node '%s'"
             " (UUID: %s)." % (node.name, node.uuid))
-          if not ssh_result[master_node].fail_msg:
-            if ssh_result[master_node].payload:
-              feedback_fn("When demoting node '%s', updating SSH keys on"
-                          " some nodes failed." % (node.name))
-            for node, message in ssh_result[master_node].payload:
-              feedback_fn("%s: %s" % (node, message))
+          EvaluateSshUpdateRPC(ssh_result, master_node, feedback_fn)

         if self.new_role == self._ROLE_CANDIDATE:
           ssh_result = self.rpc.call_node_ssh_key_add(
@@ -1605,12 +1595,7 @@ class LUNodeRemove(LogicalUnit):
       result[master_node].Raise(
         "Could not remove the SSH key of node '%s' (UUID: %s)." %
         (self.op.node_name, self.node.uuid))
-      result_msgs = result[master_node].payload
-      if result_msgs:
-        feedback_fn("Removing old SSH key of node '%s' failed on some
nodes."
-                    % self.op.node_name)
-        for node, error_msg in result_msgs.items():
-          feedback_fn("%s: %s" % (node, error_msg))
+      EvaluateSshUpdateRPC(result, master_node, feedback_fn)

     # Promote nodes to master candidate as needed
     AdjustCandidatePool(self, [self.node.uuid])
diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
ganeti.backend_unittest.py
index 3e54c38..181f495 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -1634,7 +1634,8 @@ class
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
       else:
         self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
             node, node_key))
-    self.assertTrue(error_msgs[other_node_name])
+    self.assertTrue([error_msg for (node, error_msg) in error_msgs
+                     if node == other_node_name])

 class TestVerifySshSetup(testutils.GanetiTestCase):



>
> Hrvoje Ribicic
> Ganeti Engineering
> Google Germany GmbH
> Dienerstr. 12, 80331, München
>
> Registergericht und -nummer: Hamburg, HRB 86891
> Sitz der Gesellschaft: Hamburg
> Geschäftsführer: Graham Law, Christine Elizabeth Flores
> Steuernummer: 48/725/00206
> Umsatzsteueridentifikationsnummer: DE813741370
>

Reply via email to