Hi Thuan

I have some comments, see below snippet from the code in function 
getNodeDestination() in SmfUtils.cc
I have not tested


  // Try to get the node director for a while. If the nodes reboot very fast
  // after a cluster reboot, it could happend the rebooted nodes comes up before
  // the last one is rebooted. This could make the campaign to fail when the
  // campaign continue.
  if (strcmp(className, "SaClmNode") == 0) {
    int timeout = 10 * ONE_SECOND;
    while (smfnd_for_name(i_node.c_str(), o_nodeDest) == false) {
      if (timeout <= 0) {
        LOG_NO("Failed to get node dest for clm node %s", i_node.c_str());
        return false;
      }
      struct timespec time = {2 * ONE_SECOND, 0};
      osaf_nanosleep(&time);
      // [Lennart] This will in practice change the timeout to 5 seconds and
      // to do it like this is confusing and requires a reader to not miss this 
code line
      // It is likely that a reader will believe it is still 10 seconds
      // It is better to change the timeout setting above to 5 * ONE_SECOND
      // 
      timeout -= 2;
      // [Lennart] Even if the following code line is not changed by you but is 
not
      // a good way to write code. First elapsedTime is not a boolean and
      // secondly no if statements should be written as one liner like this
      // Should be changed to:
      // if (elapsedTime != 0) {
      //   *elapsedTime = *elapsedTime + 2 * ONE_SECOND;
      // }
      // It is good to fix things like this if it is found in a place where 
other
      // changes are done
      if (elapsedTime) *elapsedTime = *elapsedTime + 2 * ONE_SECOND;
      if (maxWaitTime != -1) {
        // [Lennart] The same problem as above with elapsedTime as above
        // Change to (elapsedTime != 0)
        if ((elapsedTime) && (*elapsedTime >= maxWaitTime)) {
          LOG_NO("Failed to get node dest for clm node %s", i_node.c_str());
          return false;
        }
      }
    }
    LOG_NO("%s: className '%s'", __FUNCTION__, className);
    return true;

Thanks
Lennart

> -----Original Message-----
> From: thuan.tran <[email protected]>
> Sent: den 25 september 2018 09:04
> To: Lennart Lund <[email protected]>; Gary Lee
> <[email protected]>
> Cc: [email protected]; Thuan Tran
> <[email protected]>
> Subject: [PATCH 1/1] smf: campaign is executing forever until cluster reset
> [#1353]
> 
> The function getNodeDestination() reset elapsedTime to zero cause
> the node reboot timeout at waitForNodeDestination() never reach.
> If scenario that node reboot cannot come back then campaign is stuck
> in executing forever until cluster reset.
> ---
>  src/smf/smfd/SmfUpgradeStep.cc |  1 +
>  src/smf/smfd/SmfUtils.cc       | 11 ++++-------
>  2 files changed, 5 insertions(+), 7 deletions(-)
> 
> diff --git a/src/smf/smfd/SmfUpgradeStep.cc
> b/src/smf/smfd/SmfUpgradeStep.cc
> index 4c0ddd192..80da668de 100644
> --- a/src/smf/smfd/SmfUpgradeStep.cc
> +++ b/src/smf/smfd/SmfUpgradeStep.cc
> @@ -2399,6 +2399,7 @@ bool SmfUpgradeStep::nodeReboot() {
>        "SmfUpgradeStep::nodeReboot: Waiting to get node destination with
> increased UP counter");
> 
>    while (true) {
> +    elapsedTime = 0;
>      for (nodeIt = rebootedNodeList.begin(); nodeIt !=
> rebootedNodeList.end();) {
>        if (getNodeDestination((*nodeIt).node_name, &nodeDest,
> &elapsedTime,
>                               -1)) {
> diff --git a/src/smf/smfd/SmfUtils.cc b/src/smf/smfd/SmfUtils.cc
> index 915c086a5..4ac5af163 100644
> --- a/src/smf/smfd/SmfUtils.cc
> +++ b/src/smf/smfd/SmfUtils.cc
> @@ -95,9 +95,6 @@ bool getNodeDestination(const std::string &i_node,
> SmfndNodeDest *o_nodeDest,
> 
>    TRACE("Find destination for node '%s'", i_node.c_str());
> 
> -  if (elapsedTime)  // Initialize elapsedTime to zero.
> -    *elapsedTime = 0;
> -
>    /* It seems SaAmfNode objects can be stored, but the code
>     * indicates that SaClmNode's are expected. Anyway an attempt
>     * to go for it is probably faster that examining IMM classes
> @@ -133,10 +130,10 @@ bool getNodeDestination(const std::string
> &i_node, SmfndNodeDest *o_nodeDest,
>        }
>        struct timespec time = {2 * ONE_SECOND, 0};
>        osaf_nanosleep(&time);
> -      timeout--;
> +      timeout -= 2;
>        if (elapsedTime) *elapsedTime = *elapsedTime + 2 * ONE_SECOND;
>        if (maxWaitTime != -1) {
> -        if (*elapsedTime >= maxWaitTime) {
> +        if ((elapsedTime) && (*elapsedTime >= maxWaitTime)) {
>            LOG_NO("Failed to get node dest for clm node %s", i_node.c_str());
>            return false;
>          }
> @@ -165,11 +162,11 @@ bool getNodeDestination(const std::string
> &i_node, SmfndNodeDest *o_nodeDest,
>        }
>        struct timespec time = {2 * ONE_SECOND, 0};
>        osaf_nanosleep(&time);
> -      timeout--;
> +      timeout -= 2;
>        if (elapsedTime) *elapsedTime = *elapsedTime + 2 * ONE_SECOND;
> 
>        if (maxWaitTime != -1) {
> -        if (*elapsedTime >= maxWaitTime) {
> +        if ((elapsedTime) && (*elapsedTime >= maxWaitTime)) {
>            LOG_NO("Failed to get node dest for clm node %s", i_node.c_str());
>            free(nodeName);
>            return false;
> --
> 2.18.0



_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to