Hi Rafael

Ack. See the comment tagged [Lennart] below.

Thanks
Lennart

> -----Original Message-----
> From: Rafael Odzakow
> Sent: den 19 maj 2017 14:17
> To: Lennart Lund <[email protected]>;
> [email protected]
> Cc: [email protected]; Rafael Odzakow
> <[email protected]>
> Subject: [PATCH 1/1] smf: try to wait for opensafd status before executing
> reboot [#2464]
> 
> There are cases when opensafd startup is still ongoing and SMF will send
> out a reboot command for a node. Because opensafd has taken a lock the
> reboot command will not be able to call opensafd stop. It is suggested
> that SMF tries to wait for the release of the lock with "opensafd
> status". The waiting time is short and SMF continues with reboot even if
> the lock is not released.
> 
> ticket #2459 allows SMF to query the status of opensafd.
> 
> - Refactor smf remote command to have two versions, one that logs errors
> of
>   the endpoint command and one without error logging.
> ---
>  src/smf/smfd/SmfUpgradeStep.cc |  23 ++++++++++
>  src/smf/smfd/smfd_smfnd.c      | 102 +++++++++++++++++++++++++-------
> ---------
>  src/smf/smfd/smfd_smfnd.h      |   4 ++
>  3 files changed, 90 insertions(+), 39 deletions(-)
> 
> diff --git a/src/smf/smfd/SmfUpgradeStep.cc
> b/src/smf/smfd/SmfUpgradeStep.cc
> index 2ffeab1..fc54019 100644
> --- a/src/smf/smfd/SmfUpgradeStep.cc
> +++ b/src/smf/smfd/SmfUpgradeStep.cc
> @@ -54,6 +54,7 @@
>  #include "smf/smfd/SmfRollback.h"
>  #include "smf/smfd/SmfUtils.h"
>  #include "osaf/immutil/immutil.h"
> +#include "osaf/configmake.h"
>  #include "smf/smfd/smfd_smfnd.h"
>  #include "smfd.h"
>  #include "base/osaf_time.h"
> @@ -2299,6 +2300,28 @@ bool SmfUpgradeStep::nodeReboot() {
>        goto done;
>      }
> 
> +    // Try to make sure opensafd is not in startup phase otherwise reboot 
> will
> +    // not trigger opensafd stop.
[Lennart] Instead of using a counter, a timed loop could be used. A possible
future improvement would be to make the timeout configurable via the SMF
configuration object.

> +    int counter = 0;
> +    while (counter < 5) {
> +      TRACE("checking status of opensafd");
> +      std::string command = LSBINITDIR;
> +      command += "/opensafd status";
> +      cmdrc = smfnd_remote_cmd(command.c_str(), &nodeDest,
> +                               cliTimeout, localTimeout);
> +      if ((cmdrc  & 0xffff0000) == SMFSV_CMD_RESULT_CODE &&
> +          (cmdrc & 0xffff) == 150) {
> +          // The lockfile is taken, try again
> +          LOG_WA("opensafd status, retcode[%u] retry in 2 seconds",
> +                 cmdrc & 0xffff);
> +          struct timespec time = {2, 0};
> +          osaf_nanosleep(&time);
> +          counter += 1;
> +      } else {
> +        break;
> +      }
> +    }
> +
>      /* When executing a reboot command on a node the command will never
> return
>         so we want a short local timeout. Since the smfnd is handling the
>         cli timeout we want that to be much longer so that the reboot command
> diff --git a/src/smf/smfd/smfd_smfnd.c b/src/smf/smfd/smfd_smfnd.c
> index 23770ef..7384637 100644
> --- a/src/smf/smfd/smfd_smfnd.c
> +++ b/src/smf/smfd/smfd_smfnd.c
> @@ -55,8 +55,10 @@ static SaVersionT clmVersion = {'B', 1, 1};
> 
>  static pthread_mutex_t smfnd_list_lock = PTHREAD_MUTEX_INITIALIZER;
> 
> -static uint32_t smfnd_remote_cmd(const char *i_cmd, MDS_DEST
> i_smfnd_dest,
> -                              uint32_t i_timeout);
> +static uint32_t smfnd_legacy_remote_cmd(const char *i_cmd,
> +                                        MDS_DEST i_smfnd_dest,
> +                                        uint32_t i_timeout);
> +void log_rsp_errors(const char *i_cmd, uint32_t i_timeout, uint32_t
> i_result);
> 
>  /*
> ==========================================================
> ==============
>   *   FUNCTION PROTOTYPES
> @@ -240,6 +242,23 @@ uint32_t smfnd_down(SaClmNodeIdT i_node_id)
>  uint32_t smfnd_exec_remote_cmd(const char *i_cmd, const
> SmfndNodeDest *i_smfnd,
>                              uint32_t i_timeout, uint32_t i_localTimeout)
>  {
> +        uint32_t result = smfnd_remote_cmd(i_cmd, i_smfnd, i_timeout,
> +                                           i_localTimeout);
> +        log_rsp_errors(i_cmd, i_timeout, result);
> +        return result;
> +}
> +
> +/**
> + * smfnd_remote_cmd
> + * @param i_cmd Remote command to be executed
> + * @param i_smfnd Info about the smfnd node where to execute
> + *                     the command
> + * @param i_timeout Max time the command may take in 10 ms
> + */
> +uint32_t smfnd_remote_cmd(const char *i_cmd, const SmfndNodeDest
> *i_smfnd,
> +                          uint32_t i_timeout, uint32_t i_localTimeout)
> +
> +{
>       SMFSV_EVT cmd_req_asynch;
>       SMFSV_EVT *cmd_rsp = 0;
>       uint32_t rc;
> @@ -270,7 +289,7 @@ uint32_t smfnd_exec_remote_cmd(const char
> *i_cmd, const SmfndNodeDest *i_smfnd,
>       if (i_smfnd->rem_svc_pvt_ver == 1) {
>               /* This addressed smfnd can only handle the old cmd req
> message
>                * format */
> -             return smfnd_remote_cmd(i_cmd, i_smfnd->dest,
> i_timeout);
> +             return smfnd_legacy_remote_cmd(i_cmd, i_smfnd->dest,
> i_timeout);
>       }
> 
>       /* A new smfnd can handle the asynch message */
> @@ -297,53 +316,58 @@ uint32_t smfnd_exec_remote_cmd(const char
> *i_cmd, const SmfndNodeDest *i_smfnd,
>               return SMFSV_CMD_EXEC_FAILED;
>       }
> 
> -     if (cmd_rsp->info.smfd.event.cmd_rsp.result != 0) { /* 0 = cmd OK */
> -             switch (cmd_rsp->info.smfd.event.cmd_rsp.result &
> 0xffff0000) {
> -             case SMFSV_CMD_EXEC_FAILED: {
> -                     LOG_ER("Command %s failed to start (%u)", i_cmd,
> -                            cmd_rsp->info.smfd.event.cmd_rsp.result &
> -                                0xffff);
> -                     break;
> -             }
> -             case SMFSV_CMD_TIMEOUT: {
> -                     LOG_ER("Command %s timed out (timeout %u ms)",
> i_cmd,
> -                            i_timeout * 10);
> -                     break;
> -             }
> -             case SMFSV_CMD_RESULT_CODE: {
> -                     LOG_ER("Command %s returned error %u", i_cmd,
> -                            cmd_rsp->info.smfd.event.cmd_rsp.result &
> -                                0xffff);
> -                     break;
> -             }
> -             case SMFSV_CMD_SIGNAL_TERM: {
> -                     LOG_ER("Command %s terminated by signal %u",
> i_cmd,
> -                            cmd_rsp->info.smfd.event.cmd_rsp.result &
> -                                0xffff);
> -                     break;
> -             }
> -             default: {
> -                     LOG_ER("Command %s failed by unknown reason
> %x", i_cmd,
> -                            cmd_rsp->info.smfd.event.cmd_rsp.result);
> -                     break;
> -             }
> -             }
> -     }
> -
>       rc = cmd_rsp->info.smfd.event.cmd_rsp.result;
>       free(cmd_rsp);
>       return rc;
>  }
> 
>  /**
> - * smfnd_remote_cmd
> + * @param i_cmd Name of remote command that was executed
> + * @param i_timeout Max time out for the remote command in 10 ms
> + * @param i_result Result code from smfnd_remote_cmd
> + */
> +void log_rsp_errors(const char *i_cmd, uint32_t i_timeout, uint32_t
> i_result) {
> +       if (i_result != 0) { /* 0 = cmd OK */
> +               switch (i_result & 0xffff0000) {
> +               case SMFSV_CMD_EXEC_FAILED: {
> +                       LOG_ER("Command %s failed to start (%u)",
> +                               i_cmd, i_result & 0xffff);
> +                       break;
> +               }
> +               case SMFSV_CMD_TIMEOUT: {
> +                       LOG_ER("Command %s timed out (timeout %u ms)",
> +                               i_cmd, i_timeout * 10);
> +                       break;
> +               }
> +               case SMFSV_CMD_RESULT_CODE: {
> +                       LOG_ER("Command %s returned error %u",
> +                               i_cmd, i_result & 0xffff);
> +                       break;
> +               }
> +               case SMFSV_CMD_SIGNAL_TERM: {
> +                       LOG_ER("Command %s terminated by signal %u",
> +                               i_cmd, i_result & 0xffff);
> +                       break;
> +               }
> +               default: {
> +                       LOG_ER("Command %s failed by unknown reason %x",
> +                               i_cmd, i_result);
> +                       break;
> +               }
> +               } // switch
> +       } // if
> +}
> +
> +
> +/**
> + * smfnd_legacy_remote_cmd
>   * @param i_cmd Remote command to be executed
>   * @param i_smfnd_dest Destination to the node where to execute
>   *                     the command
>   * @param i_timeout Max time the command may take
>   */
> -uint32_t smfnd_remote_cmd(const char *i_cmd, MDS_DEST i_smfnd_dest,
> -                       uint32_t i_timeout)
> +uint32_t smfnd_legacy_remote_cmd(const char *i_cmd, MDS_DEST
> i_smfnd_dest,
> +                              uint32_t i_timeout)
>  {
>       SMFSV_EVT cmd_req;
>       SMFSV_EVT *cmd_rsp = NULL;
> diff --git a/src/smf/smfd/smfd_smfnd.h b/src/smf/smfd/smfd_smfnd.h
> index 46892f8..2a8c2e9 100644
> --- a/src/smf/smfd/smfd_smfnd.h
> +++ b/src/smf/smfd/smfd_smfnd.h
> @@ -80,6 +80,10 @@ uint32_t smfnd_down(SaClmNodeIdT node_id);
>  bool smfnd_for_name(const char *i_nodeName, SmfndNodeDest
> *o_nodeDest);
>  uint32_t smfnd_exec_remote_cmd(const char *i_cmd, const
> SmfndNodeDest *i_smfnd,
>                                 uint32_t i_timeout, uint32_t i_localTimeout);
> +// Remote command without error logging for endpoint exit codes
> +uint32_t smfnd_remote_cmd(const char *i_cmd, const SmfndNodeDest
> *i_smfnd,
> +                          uint32_t i_timeout, uint32_t i_localTimeout);
> +
> 
>  #ifdef __cplusplus
>  }
> --
> 2.7.4


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to