Hi!

On Tue, 28 Apr 2015 at 15:42 Hrvoje Ribicic <[email protected]> wrote:

> On Thu, Apr 23, 2015 at 5:30 PM, 'Helga Velroyen' via ganeti-devel <
> [email protected]> wrote:
>
>> This patch adds the RetryByNumberOfTimes utility function
>> which wraps around a given function and calls that one
>> up to a maximum number of times before raising a final
>> exception.
>>
>> This function is used in the SSH key updating code in
>> backend.py.
>>
>> Signed-off-by: Helga Velroyen <[email protected]>
>> ---
>>  lib/backend.py     | 178
>> +++++++++++++++++++++++------------------------------
>>  lib/utils/retry.py |  28 +++++++++
>>  2 files changed, 106 insertions(+), 100 deletions(-)
>>
>> diff --git a/lib/backend.py b/lib/backend.py
>> index 41f8a37..5b2b558 100644
>> --- a/lib/backend.py
>> +++ b/lib/backend.py
>> @@ -1529,26 +1529,19 @@ def AddNodeSshKey(node_uuid, node_name,
>>      node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
>>        (constants.SSHS_OVERRIDE, all_keys)
>>
>> -    last_exception = None
>> -    for _ in range(constants.SSHS_MAX_RETRIES):
>> -      try:
>> -        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
>> -                   ssh_port_map.get(node_name), node_data,
>> -                   debug=False, verbose=False, use_cluster_key=False,
>> -                   ask_key=False, strict_host_check=False)
>> -        break
>> -      except errors.OpExecError as e:
>> -        logging.error("Updating SSH key files of node '%s' failed."
>> -                      " Error: %s", node_name, e)
>> -        last_exception = e
>> -    else:
>> -      if last_exception:
>> -        # Clean up the master's public key file if adding key fails
>> -        if to_public_keys:
>> -          ssh.RemovePublicKey(node_uuid)
>> -        raise errors.SshUpdateError(
>> -          "Could not update the SSH setup of node '%s' itself. Error:
>> %s."
>> -          % (node_name, last_exception))
>> +    try:
>> +      utils.RetryByNumberOfTimes(
>> +          constants.SSHS_MAX_RETRIES,
>> +          errors.SshUpdateError,
>> +          run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE,
>> +          ssh_port_map.get(node_name), node_data,
>> +          debug=False, verbose=False, use_cluster_key=False,
>> +          ask_key=False, strict_host_check=False)
>> +    except errors.SshUpdateError as e:
>> +      # Clean up the master's public key file if adding key fails
>> +      if to_public_keys:
>> +        ssh.RemovePublicKey(node_uuid)
>> +      raise e
>>
>>    # Update all nodes except master and the target node
>>    if to_authorized_keys:
>> @@ -1573,29 +1566,25 @@ def AddNodeSshKey(node_uuid, node_name,
>>        logging.debug("Skipping offline node '%s'.", node)
>>        continue
>>      if node in potential_master_candidates:
>> -      logging.debug("Updating SSH key files of node '%s'.")
>> -      for i in range(constants.SSHS_MAX_RETRIES):
>> -        try:
>> -          run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>> -                     ssh_port_map.get(node), pot_mc_data,
>> -                     debug=False, verbose=False, use_cluster_key=False,
>> -                     ask_key=False, strict_host_check=False)
>> -          break
>> -        except errors.OpExecError as e:
>> -          logging.error("Updating SSH key files of node '%s' failed"
>> -                        " at try no %s. Error: %s.", node, i, e)
>> -          last_exception = e
>> -      else:
>> -        if last_exception:
>> -          error_msg = ("When adding the key of node '%s', updating SSH
>> key"
>> -                       " files of node '%s' failed after %s retries."
>> -                       " Not trying again. Last error was: %s." %
>> -                       (node, node_name, constants.SSHS_MAX_RETRIES,
>> -                        last_exception))
>> -          node_errors.append((node, error_msg))
>> -          # We only log the error and don't throw an exception, because
>> -          # one unreachable node shall not abort the entire procedure.
>> -          logging.error(error_msg)
>> +      logging.debug("Updating SSH key files of node '%s'.", node)
>> +      try:
>> +        utils.RetryByNumberOfTimes(
>> +            constants.SSHS_MAX_RETRIES,
>> +            errors.SshUpdateError,
>> +            run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
>> +            ssh_port_map.get(node), pot_mc_data,
>> +            debug=False, verbose=False, use_cluster_key=False,
>> +            ask_key=False, strict_host_check=False)
>> +      except errors.SshUpdateError as last_exception:
>> +        error_msg = ("When adding the key of node '%s', updating SSH key"
>> +                     " files of node '%s' failed after %s retries."
>> +                     " Not trying again. Last errors was: %s." %
>>
>
> s/errors/error/
>

ACK


>
>
>> +                     (node, node_name, constants.SSHS_MAX_RETRIES,
>> +                      last_exception))
>> +        node_errors.append((node, error_msg))
>> +        # We only log the error and don't throw an exception, because
>> +        # one unreachable node shall not abort the entire procedure.
>> +        logging.error(error_msg)
>>
>>      else:
>>        if to_authorized_keys:
>> @@ -1731,48 +1720,41 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>            raise errors.OpExecError("No SSH port information available
>> for"
>>                                     " node '%s', map: %s." %
>>                                     (node, ssh_port_map))
>> -        error_msg_try = ("The SSH setup of node '%s' could not"
>> -                         " be adjusted in try no %s. Error: %s.")
>>          error_msg_final = ("When removing the key of node '%s', updating
>> the"
>> -                           " SSH key files of node '%s' failed after %s"
>> -                           " retries. Not trying again. Last error was:
>> %s.")
>> +                           " SSH key files of node '%s' failed. Last
>> error"
>> +                           " was: %s.")
>>          if node in potential_master_candidates:
>> -          for i in range(constants.SSHS_MAX_RETRIES):
>> -            try:
>> -              run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>> -                         ssh_port, pot_mc_data,
>> -                         debug=False, verbose=False,
>> use_cluster_key=False,
>> -                         ask_key=False, strict_host_check=False)
>> -              break
>> -            except errors.OpExecError as e:
>> -              logging.error(error_msg_try, node, i, e)
>> -              last_exception = e
>> -          else:
>> -            if last_exception:
>> -              error_msg = error_msg_final % (
>> -                  node_name, node, constants.SSHS_MAX_RETRIES,
>> last_exception)
>> -              result_msgs.append((node, error_msg))
>> -              logging.error(error_msg)
>> +          logging.debug("Updating key setup of potential master
>> candidate node"
>> +                        " %s.", node)
>> +          try:
>> +            utils.RetryByNumberOfTimes(
>> +                constants.SSHS_MAX_RETRIES,
>> +                errors.SshUpdateError,
>> +                run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
>> +                ssh_port, pot_mc_data,
>> +                debug=False, verbose=False, use_cluster_key=False,
>> +                ask_key=False, strict_host_check=False)
>> +          except errors.SshUpdateError as last_exception:
>> +            error_msg = error_msg_final % (
>> +                node_name, node, last_exception)
>> +            result_msgs.append((node, error_msg))
>> +            logging.error(error_msg)
>>
>>          else:
>> -          last_exception = None
>> -          if from_authorized_keys:
>>
>
> This line appears to have gotten lost in the refactoring.
>

Good catch, thanks!


>
>
>> -            for i in range(constants.SSHS_MAX_RETRIES):
>> -              try:
>> -                run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>> -                           ssh_port, base_data,
>> -                           debug=False, verbose=False,
>> use_cluster_key=False,
>> -                           ask_key=False, strict_host_check=False)
>> -                break
>> -              except errors.OpExecError as e:
>> -                logging.error(error_msg_try, node, i, e)
>> -                last_exception = e
>> -          else:
>> -            if last_exception:
>> -              error_msg = error_msg_final % (
>> -                  node_name, node, constants.SSHS_MAX_RETRIES,
>> last_exception)
>> -              result_msgs.append((node, error_msg))
>> -              logging.error(error_msg)
>> +          logging.debug("Updating key setup of normal node %s.", node)
>> +          try:
>> +            utils.RetryByNumberOfTimes(
>> +                constants.SSHS_MAX_RETRIES,
>> +                errors.SshUpdateError,
>> +                run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
>> +                ssh_port, base_data,
>> +                debug=False, verbose=False, use_cluster_key=False,
>> +                ask_key=False, strict_host_check=False)
>> +          except errors.SshUpdateError as last_exception:
>> +            error_msg = error_msg_final % (
>> +                node_name, node, last_exception)
>> +            result_msgs.append((node, error_msg))
>> +            logging.error(error_msg)
>>
>>    if clear_authorized_keys or from_public_keys or clear_public_keys:
>>      data = {}
>> @@ -1810,25 +1792,21 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>              constants.SSHS_SSH_AUTHORIZED_KEYS in data):
>>        return
>>
>> -    last_exception = None
>> -    for i in range(constants.SSHS_MAX_RETRIES):
>> -      try:
>> -        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
>> -                   ssh_port, data,
>> -                   debug=False, verbose=False, use_cluster_key=False,
>> -                   ask_key=False, strict_host_check=False)
>> -      except errors.OpExecError as e:
>> -        logging.error("Updating the SSH key files of node '%s', whose
>> key"
>> -                      " itself is being removed failed with try no %s."
>> -                      " Error: %s", node_name, i, e)
>> -        last_exception = e
>> -    else:
>> -      if last_exception:
>> -        result_msgs.append(
>> -            (node_name,
>> -             ("Removing SSH keys from node '%s' failed after %s retries."
>> -              " This can happen when the node is already unreachable."
>> -              " Error: %s" % (node_name, constants.SSHS_MAX_RETRIES,
>> e))))
>> +    logging.debug("Updating SSH key setup of target node '%s'.",
>> node_name)
>> +    try:
>> +      utils.RetryByNumberOfTimes(
>> +          constants.SSHS_MAX_RETRIES,
>> +          errors.SshUpdateError,
>> +          run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE,
>> +          ssh_port, data,
>> +          debug=False, verbose=False, use_cluster_key=False,
>> +          ask_key=False, strict_host_check=False)
>> +    except errors.SshUpdateError as last_exception:
>> +      result_msgs.append(
>> +          (node_name,
>> +           ("Removing SSH keys from node '%s' failed."
>> +            " This can happen when the node is already unreachable."
>> +            " Error: %s" % (node_name, last_exception))))
>>
>>    return result_msgs
>>
>> diff --git a/lib/utils/retry.py b/lib/utils/retry.py
>> index c7567fe..382d0cb 100644
>> --- a/lib/utils/retry.py
>> +++ b/lib/utils/retry.py
>> @@ -32,6 +32,7 @@
>>  """
>>
>>
>> +import logging
>>  import time
>>
>>  from ganeti import errors
>> @@ -230,3 +231,30 @@ def SimpleRetry(expected, fn, delay, timeout,
>> args=None, wait_fn=time.sleep,
>>      assert "result" in rdict
>>      result = rdict["result"]
>>    return result
>> +
>> +
>> +def RetryByNumberOfTimes(max_retries, exception_class, fn, *args,
>> **kwargs):
>> +  """Retries calling a function up to the specified number of times.
>> +
>> +  @type max_retries: integer
>> +  @param max_retries: maximum number of retries.
>> +  @type exception_class: class
>> +  @param exception_class: exception class which is used for throwing the
>> +                          final exception.
>> +  @type fn: callable
>> +  @param fn: function to be called (up to the specified maximum number of
>> +             retries.
>>
>
> Very minor nitpick: doclint capitalization in this comment does not match
> the rest of the file. Could you fix that up?
>

ACK


>
>
>> +
>> +  """
>> +  last_exception = None
>> +  for i in range(max_retries):
>> +    try:
>> +      fn(*args, **kwargs)
>> +      break
>> +    except errors.OpExecError as e:
>> +      logging.error("Error after retry no. %s: %s.", i, e)
>> +      last_exception = e
>> +  else:
>> +    if last_exception:
>> +      raise exception_class("Error after %s retries. Last exception: %s."
>> +                            % (max_retries, last_exception))
>> --
>> 2.2.0.rc0.207.ga3a616c
>>
>>
> As discussed offline, overall it would still be good to have delays. The
> retries currently assume that the operations performed take enough time
> that whatever transient problems are present might fix themselves.
> Please make an issue and refactor later.
>

Filed https://code.google.com/p/ganeti/issues/detail?id=1078

Interdiff:
diff --git a/lib/backend.py b/lib/backend.py
index 5b2b558..52eee65 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1578,7 +1578,7 @@ def AddNodeSshKey(node_uuid, node_name,
       except errors.SshUpdateError as last_exception:
         error_msg = ("When adding the key of node '%s', updating SSH key"
                      " files of node '%s' failed after %s retries."
-                     " Not trying again. Last errors was: %s." %
+                     " Not trying again. Last error was: %s." %
                      (node, node_name, constants.SSHS_MAX_RETRIES,
                       last_exception))
         node_errors.append((node, error_msg))
@@ -1741,20 +1741,21 @@ def RemoveNodeSshKey(node_uuid, node_name,
             logging.error(error_msg)

         else:
-          logging.debug("Updating key setup of normal node %s.", node)
-          try:
-            utils.RetryByNumberOfTimes(
-                constants.SSHS_MAX_RETRIES,
-                errors.SshUpdateError,
-                run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
-                ssh_port, base_data,
-                debug=False, verbose=False, use_cluster_key=False,
-                ask_key=False, strict_host_check=False)
-          except errors.SshUpdateError as last_exception:
-            error_msg = error_msg_final % (
-                node_name, node, last_exception)
-            result_msgs.append((node, error_msg))
-            logging.error(error_msg)
+          if from_authorized_keys:
+            logging.debug("Updating key setup of normal node %s.", node)
+            try:
+              utils.RetryByNumberOfTimes(
+                  constants.SSHS_MAX_RETRIES,
+                  errors.SshUpdateError,
+                  run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
+                  ssh_port, base_data,
+                  debug=False, verbose=False, use_cluster_key=False,
+                  ask_key=False, strict_host_check=False)
+            except errors.SshUpdateError as last_exception:
+              error_msg = error_msg_final % (
+                  node_name, node, last_exception)
+              result_msgs.append((node, error_msg))
+              logging.error(error_msg)

   if clear_authorized_keys or from_public_keys or clear_public_keys:
     data = {}
diff --git a/lib/utils/retry.py b/lib/utils/retry.py
index 382d0cb..fda3fcd 100644
--- a/lib/utils/retry.py
+++ b/lib/utils/retry.py
@@ -237,12 +237,12 @@ def RetryByNumberOfTimes(max_retries,
exception_class, fn, *args, **kwargs):
   """Retries calling a function up to the specified number of times.

   @type max_retries: integer
-  @param max_retries: maximum number of retries.
+  @param max_retries: Maximum number of retries.
   @type exception_class: class
-  @param exception_class: exception class which is used for throwing the
+  @param exception_class: Exception class which is used for throwing the
                           final exception.
   @type fn: callable
-  @param fn: function to be called (up to the specified maximum number of
+  @param fn: Function to be called (up to the specified maximum number of
              retries.

   """



>
>
> Hrvoje Ribicic
> Ganeti Engineering
> Google Germany GmbH
> Dienerstr. 12, 80331, München
>
> Registergericht und -nummer: Hamburg, HRB 86891
> Sitz der Gesellschaft: Hamburg
> Geschäftsführer: Graham Law, Christine Elizabeth Flores
> Steuernummer: 48/725/00206
> Umsatzsteueridentifikationsnummer: DE813741370
>

Reply via email to