Hi Rafael, Ack. See the comment tagged [Lennart].
Thanks Lennart > -----Original Message----- > From: Rafael Odzakow > Sent: den 19 maj 2017 14:17 > To: Lennart Lund <[email protected]>; > [email protected] > Cc: [email protected]; Rafael Odzakow > <[email protected]> > Subject: [PATCH 1/1] smf: try to wait for opensafd status before executing > reboot [#2464] > > There are cases when opensafd startup is still ongoing and SMF will send > out a reboot command for a node. Because opensafd has taken a lock the > reboot command will not be able to call opensafd stop. It is suggested > that SMF tries to wait for the release of the lock with "opensafd > status". The waiting time is short and SMF continues with reboot even if > the lock is not released. > > ticket #2459 allows SMF to query the status of opensafd. > > - Refactor smf remote command to have two versions, one that logs errors > of > the endpoint command and one without error logging. > --- > src/smf/smfd/SmfUpgradeStep.cc | 23 ++++++++++ > src/smf/smfd/smfd_smfnd.c | 102 +++++++++++++++++++++++++------- > --------- > src/smf/smfd/smfd_smfnd.h | 4 ++ > 3 files changed, 90 insertions(+), 39 deletions(-) > > diff --git a/src/smf/smfd/SmfUpgradeStep.cc > b/src/smf/smfd/SmfUpgradeStep.cc > index 2ffeab1..fc54019 100644 > --- a/src/smf/smfd/SmfUpgradeStep.cc > +++ b/src/smf/smfd/SmfUpgradeStep.cc > @@ -54,6 +54,7 @@ > #include "smf/smfd/SmfRollback.h" > #include "smf/smfd/SmfUtils.h" > #include "osaf/immutil/immutil.h" > +#include "osaf/configmake.h" > #include "smf/smfd/smfd_smfnd.h" > #include "smfd.h" > #include "base/osaf_time.h" > @@ -2299,6 +2300,28 @@ bool SmfUpgradeStep::nodeReboot() { > goto done; > } > > + // Try to make sure opensafd is not in startup phase otherwise reboot > will > + // not trigger opensafd stop. [Lennart] Instead of using a counter a timed loop could be used. 
A possible future improvement could be to make the timeout configurable via an SMF configuration object. > + int counter = 0; > + while (counter < 5) { > + TRACE("checking status of opensafd"); > + std::string command = LSBINITDIR; > + command += "/opensafd status"; > + cmdrc = smfnd_remote_cmd(command.c_str(), &nodeDest, > + cliTimeout, localTimeout); > + if ((cmdrc & 0xffff0000) == SMFSV_CMD_RESULT_CODE && > + (cmdrc & 0xffff) == 150) { > + // The lockfile is taken, try again > + LOG_WA("opensafd status, retcode[%u] retry in 2 seconds", > + cmdrc & 0xffff); > + struct timespec time = {2, 0}; > + osaf_nanosleep(&time); > + counter += 1; > + } else { > + break; > + } > + } > + > /* When executing a reboot command on a node the command will never > return > so we want a short local timeout. Since the smfnd is handling the > cli timeout we want that to be much longer so that the reboot command > diff --git a/src/smf/smfd/smfd_smfnd.c b/src/smf/smfd/smfd_smfnd.c > index 23770ef..7384637 100644 > --- a/src/smf/smfd/smfd_smfnd.c > +++ b/src/smf/smfd/smfd_smfnd.c > @@ -55,8 +55,10 @@ static SaVersionT clmVersion = {'B', 1, 1}; > > static pthread_mutex_t smfnd_list_lock = PTHREAD_MUTEX_INITIALIZER; > > -static uint32_t smfnd_remote_cmd(const char *i_cmd, MDS_DEST > i_smfnd_dest, > - uint32_t i_timeout); > +static uint32_t smfnd_legacy_remote_cmd(const char *i_cmd, > + MDS_DEST i_smfnd_dest, > + uint32_t i_timeout); > +void log_rsp_errors(const char *i_cmd, uint32_t i_timeout, uint32_t > i_result); > > /* > ========================================================== > ============== > * FUNCTION PROTOTYPES > @@ -240,6 +242,23 @@ uint32_t smfnd_down(SaClmNodeIdT i_node_id) > uint32_t smfnd_exec_remote_cmd(const char *i_cmd, const > SmfndNodeDest *i_smfnd, > uint32_t i_timeout, uint32_t i_localTimeout) > { > + uint32_t result = smfnd_remote_cmd(i_cmd, i_smfnd, i_timeout, > + i_localTimeout); > + log_rsp_errors(i_cmd, i_timeout, result); > + return result; > +} > + > +/** > + 
* smfnd_remote_cmd > + * @param i_cmd Remote command to be executed > + * @param i_smfnd Info about the smfnd node where to execute > + * the command > + * @param i_timeout Max time the command may take in 10 ms > + */ > +uint32_t smfnd_remote_cmd(const char *i_cmd, const SmfndNodeDest > *i_smfnd, > + uint32_t i_timeout, uint32_t i_localTimeout) > + > +{ > SMFSV_EVT cmd_req_asynch; > SMFSV_EVT *cmd_rsp = 0; > uint32_t rc; > @@ -270,7 +289,7 @@ uint32_t smfnd_exec_remote_cmd(const char > *i_cmd, const SmfndNodeDest *i_smfnd, > if (i_smfnd->rem_svc_pvt_ver == 1) { > /* This addressed smfnd can only handle the old cmd req > message > * format */ > - return smfnd_remote_cmd(i_cmd, i_smfnd->dest, > i_timeout); > + return smfnd_legacy_remote_cmd(i_cmd, i_smfnd->dest, > i_timeout); > } > > /* A new smfnd can handle the asynch message */ > @@ -297,53 +316,58 @@ uint32_t smfnd_exec_remote_cmd(const char > *i_cmd, const SmfndNodeDest *i_smfnd, > return SMFSV_CMD_EXEC_FAILED; > } > > - if (cmd_rsp->info.smfd.event.cmd_rsp.result != 0) { /* 0 = cmd OK */ > - switch (cmd_rsp->info.smfd.event.cmd_rsp.result & > 0xffff0000) { > - case SMFSV_CMD_EXEC_FAILED: { > - LOG_ER("Command %s failed to start (%u)", i_cmd, > - cmd_rsp->info.smfd.event.cmd_rsp.result & > - 0xffff); > - break; > - } > - case SMFSV_CMD_TIMEOUT: { > - LOG_ER("Command %s timed out (timeout %u ms)", > i_cmd, > - i_timeout * 10); > - break; > - } > - case SMFSV_CMD_RESULT_CODE: { > - LOG_ER("Command %s returned error %u", i_cmd, > - cmd_rsp->info.smfd.event.cmd_rsp.result & > - 0xffff); > - break; > - } > - case SMFSV_CMD_SIGNAL_TERM: { > - LOG_ER("Command %s terminated by signal %u", > i_cmd, > - cmd_rsp->info.smfd.event.cmd_rsp.result & > - 0xffff); > - break; > - } > - default: { > - LOG_ER("Command %s failed by unknown reason > %x", i_cmd, > - cmd_rsp->info.smfd.event.cmd_rsp.result); > - break; > - } > - } > - } > - > rc = cmd_rsp->info.smfd.event.cmd_rsp.result; > free(cmd_rsp); > return rc; > } > > /** > - * 
smfnd_remote_cmd > + * @param i_cmd Name of remote command that was executed > + * @param i_timeout Max time out for the remote command in 10 ms > + * @param i_result Result code from smfnd_remote_cmd > + */ > +void log_rsp_errors(const char *i_cmd, uint32_t i_timeout, uint32_t > i_result) { > + if (i_result != 0) { /* 0 = cmd OK */ > + switch (i_result & 0xffff0000) { > + case SMFSV_CMD_EXEC_FAILED: { > + LOG_ER("Command %s failed to start (%u)", > + i_cmd, i_result & 0xffff); > + break; > + } > + case SMFSV_CMD_TIMEOUT: { > + LOG_ER("Command %s timed out (timeout %u ms)", > + i_cmd, i_timeout * 10); > + break; > + } > + case SMFSV_CMD_RESULT_CODE: { > + LOG_ER("Command %s returned error %u", > + i_cmd, i_result & 0xffff); > + break; > + } > + case SMFSV_CMD_SIGNAL_TERM: { > + LOG_ER("Command %s terminated by signal %u", > + i_cmd, i_result & 0xffff); > + break; > + } > + default: { > + LOG_ER("Command %s failed by unknown reason %x", > + i_cmd, i_result); > + break; > + } > + } // switch > + } // if > +} > + > + > +/** > + * smfnd_legacy_remote_cmd > * @param i_cmd Remote command to be executed > * @param i_smfnd_dest Destination to the node where to execute > * the command > * @param i_timeout Max time the command may take > */ > -uint32_t smfnd_remote_cmd(const char *i_cmd, MDS_DEST i_smfnd_dest, > - uint32_t i_timeout) > +uint32_t smfnd_legacy_remote_cmd(const char *i_cmd, MDS_DEST > i_smfnd_dest, > + uint32_t i_timeout) > { > SMFSV_EVT cmd_req; > SMFSV_EVT *cmd_rsp = NULL; > diff --git a/src/smf/smfd/smfd_smfnd.h b/src/smf/smfd/smfd_smfnd.h > index 46892f8..2a8c2e9 100644 > --- a/src/smf/smfd/smfd_smfnd.h > +++ b/src/smf/smfd/smfd_smfnd.h > @@ -80,6 +80,10 @@ uint32_t smfnd_down(SaClmNodeIdT node_id); > bool smfnd_for_name(const char *i_nodeName, SmfndNodeDest > *o_nodeDest); > uint32_t smfnd_exec_remote_cmd(const char *i_cmd, const > SmfndNodeDest *i_smfnd, > uint32_t i_timeout, uint32_t i_localTimeout); > +// Remote command without error logging for 
endpoint exit codes > +uint32_t smfnd_remote_cmd(const char *i_cmd, const SmfndNodeDest > *i_smfnd, > + uint32_t i_timeout, uint32_t i_localTimeout); > + > > #ifdef __cplusplus > } > -- > 2.7.4 ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
