Hi Rafael,

I have one question on this patch below. /Thanks HansN

-----Original Message-----
From: Lennart Lund [mailto:[email protected]] 
Sent: den 25 september 2017 13:44
To: Rafael Odzakow <[email protected]>
Cc: [email protected]
Subject: Re: [devel] [PATCH 1/1] smf: try to wait for opensafd status before 
reboot [#2464]

Ack

Thanks
Lennart

> -----Original Message-----
> From: Rafael Odzakow
> Sent: den 14 september 2017 13:34
> To: Lennart Lund <[email protected]>
> Cc: [email protected]; Rafael Odzakow 
> <[email protected]>
> Subject: [PATCH 1/1] smf: try to wait for opensafd status before 
> reboot [#2464]
> 
> There are cases when opensafd startup is still ongoing and SMF will 
> send out a reboot command for a node. Because opensafd has taken a 
> lock the reboot command will not be able to call opensafd stop. It is 
> suggested that SMF tries to wait for the release of the lock with 
> "opensafd status". The waiting time is short and SMF continues with 
> reboot even if the lock is not released.
> 
> - Change smf remote command to have two versions, one that logs errors of
>   the endpoint command and one without error logging.
> ---
>  src/smf/smfd/SmfUpgradeStep.cc |  24 ++++++++++
>  src/smf/smfd/smfd_smfnd.c      | 100 ++++++++++++++++++++++++++------
> ---------
>  src/smf/smfd/smfd_smfnd.h      |   8 +++-
>  3 files changed, 95 insertions(+), 37 deletions(-)
> 
> diff --git a/src/smf/smfd/SmfUpgradeStep.cc 
> b/src/smf/smfd/SmfUpgradeStep.cc index ddd96a0..d40fac4 100644
> --- a/src/smf/smfd/SmfUpgradeStep.cc
> +++ b/src/smf/smfd/SmfUpgradeStep.cc
> @@ -54,6 +54,7 @@
>  #include "smf/smfd/SmfRollback.h"
>  #include "smf/smfd/SmfUtils.h"
>  #include "osaf/immutil/immutil.h"
> +#include "osaf/configmake.h"
>  #include "smf/smfd/smfd_smfnd.h"
>  #include "smfd.h"
>  #include "base/osaf_time.h"
> @@ -2316,6 +2317,29 @@ bool SmfUpgradeStep::nodeReboot() {
>        goto done;
>      }
> 
> +    // Wait for a good opensafd status. If retrying fails give up and reboot
> +    // anyway.
> +    int counter = 0;
> +    while (counter < 5) {
> +      TRACE("check status of opensafd");
> +      std::string command = LSBINITDIR;
> +      command += "/opensafd status";
> +      cmdrc = smfnd_exec_remote_cmdnolog(command.c_str(), &nodeDest,
> +                                         cliTimeout, localTimeout);
[HansN]  is the below code correct, shouldn't it be the following?:
      if ((cmdrc  & 0xffff0000) == SMFSV_CMD_RESULT_CODE &&
          (cmdrc & 0xffff) == 0) {
        LOG_NO("opensafd status OK");
        break;
      } else {
         // The status is not OK, try again
          LOG_WA("opensafd status, retcode[%u] retry in 2 seconds",
                 cmdrc & 0xffff);
          struct timespec time = {2, 0};
          osaf_nanosleep(&time);
          counter += 1;
      }
Instead of:
> +      if ((cmdrc  & 0xffff0000) == SMFSV_CMD_RESULT_CODE &&
> +          (cmdrc & 0xffff) != 0) {
> +          // The status is not OK, try again
> +          LOG_WA("opensafd status, retcode[%u] retry in 2 seconds",
> +                 cmdrc & 0xffff);
> +          struct timespec time = {2, 0};
> +          osaf_nanosleep(&time);
> +          counter += 1;
> +      } else {
> +        LOG_NO("opensafd status OK");
> +        break;
> +      }
> +    }
> +
>      /* When executing a reboot command on a node the command will 
> never return
>         so we want a short local timeout. Since the smfnd is handling the
>         cli timeout we want that to be much longer so that the reboot 
> command diff --git a/src/smf/smfd/smfd_smfnd.c 
> b/src/smf/smfd/smfd_smfnd.c index 7aadaf9..c48adb2 100644
> --- a/src/smf/smfd/smfd_smfnd.c
> +++ b/src/smf/smfd/smfd_smfnd.c
> @@ -230,6 +230,44 @@ uint32_t smfnd_down(SaClmNodeIdT i_node_id)
>       return NCSCC_RC_FAILURE;
>  }
> 
> +/*
> +   Log errors from smfnd_main_remote_command
> + * @param i_cmd Name of remote command that was executed
> + * @param i_timeout Max time out for the remote command in 10 ms
> + * @param i_result Result code from smfnd_main_remote_cmd */ void 
> +log_rsp_errors(const char *i_cmd, uint32_t i_timeout, uint32_t
> i_result) {
> +  if (i_result != 0) { /* 0 = cmd OK */
> +    switch (i_result & 0xffff0000) {
> +      case SMFSV_CMD_EXEC_FAILED: {
> +        LOG_ER("Command %s failed to start (%u)",
> +               i_cmd, i_result & 0xffff);
> +        break;
> +      }
> +      case SMFSV_CMD_TIMEOUT: {
> +        LOG_ER("Command %s timed out (timeout %u ms)",
> +               i_cmd, i_timeout * 10);
> +        break;
> +      }
> +      case SMFSV_CMD_RESULT_CODE: {
> +        LOG_ER("Command %s returned error %u",
> +               i_cmd, i_result & 0xffff);
> +        break;
> +      }
> +      case SMFSV_CMD_SIGNAL_TERM: {
> +        LOG_ER("Command %s terminated by signal %u",
> +               i_cmd, i_result & 0xffff);
> +        break;
> +      }
> +      default: {
> +        LOG_ER("Command %s failed by unknown reason %x",
> +               i_cmd, i_result);
> +        break;
> +      }
> +    } // switch
> +  } // if
> +}
> +
>  /**
>   * smfnd_exec_remote_cmd
>   * @param i_cmd Remote command to be executed @@ -237,9 +275,11 @@ 
> uint32_t smfnd_down(SaClmNodeIdT i_node_id)
>   *                     the command
>   * @param i_timeout Max time the command may take in 10 ms
>   */
> -uint32_t smfnd_exec_remote_cmd(const char *i_cmd, const SmfndNodeDest 
> *i_smfnd,
> -                            uint32_t i_timeout, uint32_t i_localTimeout)
> -{
> +uint32_t smfnd_main_remote_cmd(const char *i_cmd,
> +                               const SmfndNodeDest *i_smfnd,
> +                               uint32_t i_timeout,
> +                               uint32_t i_localTimeout,
> +                               bool error_log) {
>       SMFSV_EVT cmd_req_asynch;
>       SMFSV_EVT *cmd_rsp = 0;
>       uint32_t rc;
> @@ -297,44 +337,32 @@ uint32_t smfnd_exec_remote_cmd(const char 
> *i_cmd, const SmfndNodeDest *i_smfnd,
>               return SMFSV_CMD_EXEC_FAILED;
>       }
> 
> -     if (cmd_rsp->info.smfd.event.cmd_rsp.result != 0) { /* 0 = cmd OK */
> -             switch (cmd_rsp->info.smfd.event.cmd_rsp.result &
> 0xffff0000) {
> -             case SMFSV_CMD_EXEC_FAILED: {
> -                     LOG_ER("Command %s failed to start (%u)", i_cmd,
> -                            cmd_rsp->info.smfd.event.cmd_rsp.result &
> -                                0xffff);
> -                     break;
> -             }
> -             case SMFSV_CMD_TIMEOUT: {
> -                     LOG_ER("Command %s timed out (timeout %u ms)",
> i_cmd,
> -                            i_timeout * 10);
> -                     break;
> -             }
> -             case SMFSV_CMD_RESULT_CODE: {
> -                     LOG_ER("Command %s returned error %u", i_cmd,
> -                            cmd_rsp->info.smfd.event.cmd_rsp.result &
> -                                0xffff);
> -                     break;
> -             }
> -             case SMFSV_CMD_SIGNAL_TERM: {
> -                     LOG_ER("Command %s terminated by signal %u",
> i_cmd,
> -                            cmd_rsp->info.smfd.event.cmd_rsp.result &
> -                                0xffff);
> -                     break;
> -             }
> -             default: {
> -                     LOG_ER("Command %s failed by unknown reason
> %x", i_cmd,
> -                            cmd_rsp->info.smfd.event.cmd_rsp.result);
> -                     break;
> -             }
> -             }
> -     }
> -
>       rc = cmd_rsp->info.smfd.event.cmd_rsp.result;
> +
> +        if (error_log) {
> +          log_rsp_errors(i_cmd, i_timeout, rc);
> +        }
> +
>       free(cmd_rsp);
>       return rc;
>  }
> 
> +uint32_t smfnd_exec_remote_cmd(const char *i_cmd,
> +                               const SmfndNodeDest *i_smfnd,
> +                            uint32_t i_timeout,
> +                               uint32_t i_localTimeout) {
> +  return smfnd_main_remote_cmd(i_cmd, i_smfnd,
> +                               i_timeout, i_localTimeout, true); }
> +
> +uint32_t smfnd_exec_remote_cmdnolog(const char *i_cmd,
> +                                    const SmfndNodeDest *i_smfnd,
> +                                    uint32_t i_timeout,
> +                                    uint32_t i_localTimeout) {
> +  return smfnd_main_remote_cmd(i_cmd, i_smfnd,
> +                               i_timeout, i_localTimeout, false); }
> +
>  /**
>   * smfnd_remote_cmd
>   * @param i_cmd Remote command to be executed diff --git 
> a/src/smf/smfd/smfd_smfnd.h b/src/smf/smfd/smfd_smfnd.h index 
> 6c227a5..fed706f 100644
> --- a/src/smf/smfd/smfd_smfnd.h
> +++ b/src/smf/smfd/smfd_smfnd.h
> @@ -80,7 +80,13 @@ uint32_t smfnd_down(SaClmNodeIdT node_id);  bool 
> smfnd_for_name(const char *i_nodeName, SmfndNodeDest *o_nodeDest);  
> uint32_t smfnd_exec_remote_cmd(const char *i_cmd, const SmfndNodeDest 
> *i_smfnd,
>                                 uint32_t i_timeout, uint32_t 
> i_localTimeout);
> -
> +/*
> + * Execute remote command without logging the errors of the remote
> command.
> +*/
> +uint32_t smfnd_exec_remote_cmdnolog(const char *i_cmd,
> +                                    const SmfndNodeDest *i_smfnd,
> +                                    uint32_t i_timeout,
> +                                    uint32_t i_localTimeout);
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 2.7.4


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most engaging tech 
sites, Slashdot.org! http://sdm.link/slashdot 
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to