Hi Thuan
I have some comments, see below snippet from the code in function
getNodeDestination() in SmfUtils.cc
I have not tested
// Try to get the node director for a while. If the nodes reboot very fast
// after a cluster reboot, it could happend the rebooted nodes comes up before
// the last one is rebooted. This could make the campaign to fail when the
// campaign continue.
if (strcmp(className, "SaClmNode") == 0) {
int timeout = 10 * ONE_SECOND;
while (smfnd_for_name(i_node.c_str(), o_nodeDest) == false) {
if (timeout <= 0) {
LOG_NO("Failed to get node dest for clm node %s", i_node.c_str());
return false;
}
struct timespec time = {2 * ONE_SECOND, 0};
osaf_nanosleep(&time);
// [Lennart] This will in practice change the timeout to 5 seconds and
// to do it like this is confusing and requires a reader to not miss this
code line
// It is likely that a reader will believe it is still 10 seconds
// It is better to change the timeout setting above to 5 * ONE_SECOND
//
timeout -= 2;
// [Lennart] Even if the following code line is not changed by you but is
not
// a good way to write code. First elapsedTime is not a boolean and
// secondly no if statements should be written as one liner like this
// Should be changed to:
// if (elapsedTime != 0) {
// *elapsedTime = *elapsedTime + 2 * ONE_SECOND;
// }
// It is good to fix things like this if it is found in a place where
other
// changes are done
if (elapsedTime) *elapsedTime = *elapsedTime + 2 * ONE_SECOND;
if (maxWaitTime != -1) {
// [Lennart] The same problem as above with elapsedTime as above
// Change to (elapsedTime != 0)
if ((elapsedTime) && (*elapsedTime >= maxWaitTime)) {
LOG_NO("Failed to get node dest for clm node %s", i_node.c_str());
return false;
}
}
}
LOG_NO("%s: className '%s'", __FUNCTION__, className);
return true;
Thanks
Lennart
> -----Original Message-----
> From: thuan.tran <[email protected]>
> Sent: den 25 september 2018 09:04
> To: Lennart Lund <[email protected]>; Gary Lee
> <[email protected]>
> Cc: [email protected]; Thuan Tran
> <[email protected]>
> Subject: [PATCH 1/1] smf: campaign is executing forever until cluster reset
> [#1353]
>
> The function getNodeDestination() reset elapsedTime to zero cause
> the node reboot timeout at waitForNodeDestination() never reach.
> If scenario that node reboot cannot come back then campaign is stuck
> in executing forever until cluster reset.
> ---
> src/smf/smfd/SmfUpgradeStep.cc | 1 +
> src/smf/smfd/SmfUtils.cc | 11 ++++-------
> 2 files changed, 5 insertions(+), 7 deletions(-)
>
> diff --git a/src/smf/smfd/SmfUpgradeStep.cc
> b/src/smf/smfd/SmfUpgradeStep.cc
> index 4c0ddd192..80da668de 100644
> --- a/src/smf/smfd/SmfUpgradeStep.cc
> +++ b/src/smf/smfd/SmfUpgradeStep.cc
> @@ -2399,6 +2399,7 @@ bool SmfUpgradeStep::nodeReboot() {
> "SmfUpgradeStep::nodeReboot: Waiting to get node destination with
> increased UP counter");
>
> while (true) {
> + elapsedTime = 0;
> for (nodeIt = rebootedNodeList.begin(); nodeIt !=
> rebootedNodeList.end();) {
> if (getNodeDestination((*nodeIt).node_name, &nodeDest,
> &elapsedTime,
> -1)) {
> diff --git a/src/smf/smfd/SmfUtils.cc b/src/smf/smfd/SmfUtils.cc
> index 915c086a5..4ac5af163 100644
> --- a/src/smf/smfd/SmfUtils.cc
> +++ b/src/smf/smfd/SmfUtils.cc
> @@ -95,9 +95,6 @@ bool getNodeDestination(const std::string &i_node,
> SmfndNodeDest *o_nodeDest,
>
> TRACE("Find destination for node '%s'", i_node.c_str());
>
> - if (elapsedTime) // Initialize elapsedTime to zero.
> - *elapsedTime = 0;
> -
> /* It seems SaAmfNode objects can be stored, but the code
> * indicates that SaClmNode's are expected. Anyway an attempt
> * to go for it is probably faster that examining IMM classes
> @@ -133,10 +130,10 @@ bool getNodeDestination(const std::string
> &i_node, SmfndNodeDest *o_nodeDest,
> }
> struct timespec time = {2 * ONE_SECOND, 0};
> osaf_nanosleep(&time);
> - timeout--;
> + timeout -= 2;
> if (elapsedTime) *elapsedTime = *elapsedTime + 2 * ONE_SECOND;
> if (maxWaitTime != -1) {
> - if (*elapsedTime >= maxWaitTime) {
> + if ((elapsedTime) && (*elapsedTime >= maxWaitTime)) {
> LOG_NO("Failed to get node dest for clm node %s", i_node.c_str());
> return false;
> }
> @@ -165,11 +162,11 @@ bool getNodeDestination(const std::string
> &i_node, SmfndNodeDest *o_nodeDest,
> }
> struct timespec time = {2 * ONE_SECOND, 0};
> osaf_nanosleep(&time);
> - timeout--;
> + timeout -= 2;
> if (elapsedTime) *elapsedTime = *elapsedTime + 2 * ONE_SECOND;
>
> if (maxWaitTime != -1) {
> - if (*elapsedTime >= maxWaitTime) {
> + if ((elapsedTime) && (*elapsedTime >= maxWaitTime)) {
> LOG_NO("Failed to get node dest for clm node %s", i_node.c_str());
> free(nodeName);
> return false;
> --
> 2.18.0
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel