Hi Vu, Ack, code review only
Thanks, Ravi -----Original Message----- From: Vu Minh Nguyen [mailto:[email protected]] Sent: Tuesday, January 09, 2018 6:54 PM To: [email protected]; [email protected] Cc: [email protected]; Vu Minh Nguyen <[email protected]> Subject: [PATCH 1/1] imm: immnd asserts at veterans due to mismatched data during sync [#2748] Don't allow to make any changes to IMM data model after the SYNC abort has been sent to active IMMD but the abort response not yet arrived at the coord yet. Otherwise, all veteran nodes will be restarted at sync finalize due to data mismatch. --- src/imm/immnd/ImmModel.cc | 74 ++++++++++++++++++++++++++++++++++++++--- src/imm/immnd/ImmModel.h | 2 ++ src/imm/immnd/immnd_cb.h | 1 - src/imm/immnd/immnd_evt.c | 82 +++++++--------------------------------------- src/imm/immnd/immnd_init.h | 5 +++ src/imm/immnd/immnd_proc.c | 14 ++++---- 6 files changed, 95 insertions(+), 83 deletions(-) diff --git a/src/imm/immnd/ImmModel.cc b/src/imm/immnd/ImmModel.cc index fcd354e..4d875f4 100644 --- a/src/imm/immnd/ImmModel.cc +++ b/src/imm/immnd/ImmModel.cc @@ -2260,26 +2260,90 @@ void immModel_implementerDelete(IMMND_CB *cb, const char *implementerName) { ImmModel::instance(&cb->immModel)->implementerDelete(implementerName); } +void immModel_sendSyncAbortAt(IMMND_CB *cb, struct timespec time) { + ImmModel::instance(&cb->immModel)->SendSyncAbortAt(time); +} + +void immModel_getSyncAbortRsp(IMMND_CB *cb) { + ImmModel::instance(&cb->immModel)->GetSyncAbortRsp(); +} + /*====================================================================*/ ImmModel::ImmModel() : loaderPid(-1) {} +//> +// When sSyncAbortingAt != {0,0}, it means the SYNC has been aborted // +by coord and that abort message not yet broadcasted to IMMNDs. +// Therefore, NODE STATE at IMMND coord (IMM_NODE_FULLY_AVAILABLE) is +// different with veteran node(s) (IMM_NODE_R_AVAILABLE) at the moment. +// So, any change to IMM data model at the coord will result data +mismatchs // comparing with veterans, and lead to veterans restarted at +sync finalize // later on. +// +// In addition, we check the time here to avoid the worst cases // such +as the abort message arrived at active IMMD, but later on // it fails +to send the response (e.g: hang IMMD, or reboot IMMD, etc.) // For that +reason, if we don't receive the response within 6 seconds, // consider +sending abort failed. +// +//< + +// Store the start time of sync abort sent static struct timespec +sSyncAbortSentAt; void ImmModel::SendSyncAbortAt(timespec& time) { + sSyncAbortSentAt = time; +} + +void ImmModel::GetSyncAbortRsp() { + sSyncAbortSentAt.tv_sec = 0; + sSyncAbortSentAt.tv_nsec = 0; +} + +static bool is_sync_aborting() { + bool unset = (sSyncAbortSentAt.tv_nsec == 0 && + sSyncAbortSentAt.tv_sec == 0); + + if (unset) return false; + + time_t duration_in_second = 0xffffffff; struct timespec now, + duration; osaf_clock_gettime(CLOCK_MONOTONIC, &now); + osaf_timespec_subtract(&now, &sSyncAbortSentAt, &duration); + duration_in_second = duration.tv_sec; if (duration_in_second > + DEFAULT_TIMEOUT_SEC) { + sSyncAbortSentAt.tv_sec = 0; + sSyncAbortSentAt.tv_nsec = 0; + } + + return (duration_in_second <= DEFAULT_TIMEOUT_SEC); } + bool ImmModel::immNotWritable() { + bool notwritable = true; switch (sImmNodeState) { case IMM_NODE_R_AVAILABLE: case IMM_NODE_UNKNOWN: case IMM_NODE_ISOLATED: - return true; + break; case IMM_NODE_W_AVAILABLE: case IMM_NODE_FULLY_AVAILABLE: - case IMM_NODE_LOADING: - return false; + case IMM_NODE_LOADING: { + notwritable = false; + break; + } - default: + default: { LOG_ER("Impossible node state, will terminate"); + abort(); + } } - abort(); + + // When the sync abort has been sent by coord but the abort sync rsp + not yet // arrived at the coord yet, we will not allow to make any + change to IMM data // model to avoid data mismatch at finalize sync. + return (notwritable || is_sync_aborting()); } /* immNotPbeWritable returning true means: diff --git a/src/imm/immnd/ImmModel.h b/src/imm/immnd/ImmModel.h index 9e4c54a..6bdd1b9 100644 --- a/src/imm/immnd/ImmModel.h +++ b/src/imm/immnd/ImmModel.h @@ -391,6 +391,8 @@ class ImmModel { void getNonCriticalCcbs(IdVector& cv); void getOldCriticalCcbs(IdVector& cv, SaUint32T* pbeConn, unsigned int* pbeNodeId, SaUint32T* pbeId); + void SendSyncAbortAt(timespec& time); void GetSyncAbortRsp(); bool immNotWritable(); bool immNotPbeWritable(bool isPrtoClient = true); void* getPbeOi(SaUint32T* pbeConn, unsigned int* pbeNode, diff --git a/src/imm/immnd/immnd_cb.h b/src/imm/immnd/immnd_cb.h index 1a20ac3..7614d27 100644 --- a/src/imm/immnd/immnd_cb.h +++ b/src/imm/immnd/immnd_cb.h @@ -129,7 +129,6 @@ typedef struct immnd_cb_tag { uint8_t mIntroduced; // Ack received on introduce message uint8_t mSyncRequested; // true=> I am coord, other req sync uint8_t mPendSync; // 1=>sync announced but not received. - struct timespec mSyncAbortSentAt; // Store the start time of sync abort sent uint8_t mSyncFinalizing; // 1=>finalizeSync sent but not received. uint8_t mSync; // true => this node is being synced (client). uint8_t mCanBeCoord; // If!=0 then SC, 2 => 2pbe arbitration, 4 => diff --git a/src/imm/immnd/immnd_evt.c b/src/imm/immnd/immnd_evt.c index 9807741..b95dcdc 100644 --- a/src/imm/immnd/immnd_evt.c +++ b/src/imm/immnd/immnd_evt.c @@ -10056,12 +10056,10 @@ uint32_t immnd_evt_proc_abort_sync(IMMND_CB *cb, IMMND_EVT *evt, osafassert(cb->mRulingEpoch <= evt->info.ctrl.rulingEpoch); cb->mRulingEpoch = evt->info.ctrl.rulingEpoch; - cb->mSyncAbortSentAt.tv_sec = 0; - cb->mSyncAbortSentAt.tv_nsec = 0; - LOG_WA("Global ABORT SYNC received for epoch %u", cb->mRulingEpoch); if (cb->mIsCoord) { /* coord should already be up to date. */ + immModel_getSyncAbortRsp(cb); osafassert(cb->mMyEpoch == cb->mRulingEpoch); osafassert(!(cb->mSyncFinalizing)); } else { /* Noncoord IMMNDs */ @@ -10952,49 +10950,6 @@ static void immnd_evt_proc_discard_node(IMMND_CB *cb, IMMND_EVT *evt, } /**************************************************************************** - * Name : immnd_is_sync_aborting - * - * Description : to check if sync abort is ongoing or not. - * - * When mSyncAbortingAt != {0,0}, it means the SYNC has been aborted - * by coord and that abort message not yet broadcasted to IMMNDs. - * Therefore, NODE STATE at IMMND coord (IMM_NODE_FULLY_AVAILABLE) is - * different with veteran node(s) (IMM_NODE_R_AVAILABLE) at the moment. - * So, IMMND_EVT_D2ND_ADMINIT or IMMND_EVT_D2ND_CCBINIT msg comes to - * IMMND coord has to be rejected to synchronize with the result - * at veterans. - * - * In addition, we check the time here to avoid the worst cases - * such as the abort message arrived at active IMMD, but later on - * it fails to send the response (e.g: hang IMMD, or reboot IMMD, etc.) - * For that reason, if we don't receive the response within 6 seconds, - * consider sending abort failed. - * - * Arguments : IMMND_CB *cb - IMMND CB pointer - */ -static bool immnd_is_sync_aborting(IMMND_CB* cb) -{ - bool unset = (cb->mSyncAbortSentAt.tv_nsec == 0 && - cb->mSyncAbortSentAt.tv_sec == 0); - - if (unset) return false; - - /* Inherit from DEFAULT_TIMEOUT_SEC */ - const unsigned kTimeout = 6; - time_t duration_in_second = 0xffffffff; - struct timespec now, duration; - osaf_clock_gettime(CLOCK_MONOTONIC, &now); - osaf_timespec_subtract(&now, &cb->mSyncAbortSentAt, &duration); - duration_in_second = duration.tv_sec; - if (duration_in_second > kTimeout) { - cb->mSyncAbortSentAt.tv_sec = 0; - cb->mSyncAbortSentAt.tv_nsec = 0; - } - - return (duration_in_second <= kTimeout); -} - -/**************************************************************************** * Name : immnd_evt_proc_adminit_rsp * * Description : Function to process adminit response from Director @@ -11028,18 +10983,11 @@ static void immnd_evt_proc_adminit_rsp(IMMND_CB *cb, IMMND_EVT *evt, nodeId = m_IMMSV_UNPACK_HANDLE_LOW(clnt_hdl); ownerId = evt->info.adminitGlobal.globalOwnerId; - - if (cb->mIsCoord && immnd_is_sync_aborting(cb)) { - LOG_NO("The SYNC abort is not yet completed." - " Return TRY_AGAIN to user."); - err = SA_AIS_ERR_TRY_AGAIN; - } else { - err = immModel_adminOwnerCreate(cb, - &(evt->info.adminitGlobal.i), - ownerId, - (originatedAtThisNd) ? conn : 0, - nodeId); - } + err = immModel_adminOwnerCreate(cb, + &(evt->info.adminitGlobal.i), + ownerId, + (originatedAtThisNd) ? conn : 0, + nodeId); if (originatedAtThisNd) { /*Send reply to client from this ND. */ immnd_client_node_get(cb, clnt_hdl, &cl_node); @@ -12066,18 +12014,12 @@ static void immnd_evt_proc_ccbinit_rsp(IMMND_CB *cb, IMMND_EVT *evt, /* Remember latest ccb_id for IMMD recovery. */ cb->mLatestCcbId = evt->info.ccbinitGlobal.globalCcbId; - if (cb->mIsCoord && immnd_is_sync_aborting(cb)) { - LOG_NO("The SYNC abort is not yet completed." - " Return TRY_AGAIN to user."); - err = SA_AIS_ERR_TRY_AGAIN; - } else { - err = immModel_ccbCreate(cb, - evt->info.ccbinitGlobal.i.adminOwnerId, - evt->info.ccbinitGlobal.i.ccbFlags, - ccbId, - nodeId, - (originatedAtThisNd) ? conn : 0); - } + err = immModel_ccbCreate(cb, + evt->info.ccbinitGlobal.i.adminOwnerId, + evt->info.ccbinitGlobal.i.ccbFlags, + ccbId, + nodeId, + (originatedAtThisNd) ? conn : 0); if (originatedAtThisNd) { /*Send reply to client from this ND. */ immnd_client_node_get(cb, clnt_hdl, &cl_node); diff --git a/src/imm/immnd/immnd_init.h b/src/imm/immnd/immnd_init.h index 1791236..81f2f2c 100644 --- a/src/imm/immnd/immnd_init.h +++ b/src/imm/immnd/immnd_init.h @@ -31,6 +31,7 @@ #define IMM_IMMND_IMMND_INIT_H_ #include <saClm.h> +#include <time.h> #include "imm/common/immsv_evt_model.h" #include "imm/common/immsv_api.h" @@ -480,6 +481,10 @@ void immmModel_getLocalImplementers(IMMND_CB *cb, SaUint32T *arrSize, void immModel_implementerDelete(IMMND_CB *cb, const char *implementerName); +void immModel_sendSyncAbortAt(IMMND_CB *cb, struct timespec time); + +void immModel_getSyncAbortRsp(IMMND_CB *cb); + #ifdef __cplusplus } #endif diff --git a/src/imm/immnd/immnd_proc.c b/src/imm/immnd/immnd_proc.c index 4575095..54201a4 100644 --- a/src/imm/immnd/immnd_proc.c +++ b/src/imm/immnd/immnd_proc.c @@ -1134,14 +1134,14 @@ void immnd_abortSync(IMMND_CB *cb) } /* - * This mark is to say sync abort has been sent to active IMMD - * and not yet broadcasted to all IMMNDs. In other words, the NODE STATE - * at IMMND coord (IMM_NODE_FULLY_AVAILABLE) is different with veterans - * (IMM_NODE_R_AVAILABLE) at the moment. So, based on this information, - * IMMND_EVT_D2ND_ADMINIT or IMMND_EVT_D2ND_CCBINIT msg comes to IMMND - * coord has to be rejected to synchronize with the result at veterans. + * Save the time of sending SYNC ABORT. The time will be reset when + * the abort sync rsp arrives at the coord. Any messages that will make + * changes to IMM data model at coord while the abort message not yet + * been broadcasted by active IMMD will be rejected with TRY_AGAIN code. */ - osaf_clock_gettime(CLOCK_MONOTONIC, &cb->mSyncAbortSentAt); + struct timespec time; + osaf_clock_gettime(CLOCK_MONOTONIC, &time); + immModel_sendSyncAbortAt(cb, time); TRACE_LEAVE(); } -- 1.9.1 ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
