Re: [devel] [PATCH 3/5] osaf: allow active SC to be preferred during network split [#2996]
ack, review only. Thanks/Minh On 21/1/19 2:52 pm, Gary Lee wrote: Add FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE option to allow active SC to be preferred during a network split. The default behavior is to prefer the larger partition to maintain existing behaviour. Add configuration support for FMS_RELAXED_NODE_PROMOTION. --- src/osaf/consensus/consensus.cc | 39 --- src/osaf/consensus/consensus.h | 9 +++-- src/osaf/consensus/key_value.cc | 8 ++-- 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/src/osaf/consensus/consensus.cc b/src/osaf/consensus/consensus.cc index 112af7d..5304c4f 100644 --- a/src/osaf/consensus/consensus.cc +++ b/src/osaf/consensus/consensus.cc @@ -64,6 +64,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool graceful_takeover, cluster_size); if (rc != SA_AIS_OK) { LOG_WA("Takeover request failed (%d)", rc); + rc = SA_AIS_ERR_EXIST; return rc; } take_over_request_created = true; @@ -99,7 +100,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool graceful_takeover, if (rc == SA_AIS_OK) { LOG_NO("Active controller set to %s", base::Conf::NodeName().c_str()); } else { -LOG_ER("Failed to promote this node (%u)", rc); +LOG_WA("Failed to promote this node (%u)", rc); } return rc; @@ -197,6 +198,10 @@ bool Consensus::IsWritable() const { bool Consensus::IsRemoteFencingEnabled() const { return use_remote_fencing_; } +bool Consensus::IsRelaxedNodePromotionEnabled() const { + return relaxed_node_promotion_; +} + std::string Consensus::CurrentActive() const { TRACE_ENTER(); if (use_consensus_ == false) { @@ -228,6 +233,10 @@ Consensus::Consensus() { uint32_t split_brain_enable = base::GetEnv("FMS_SPLIT_BRAIN_PREVENTION", 0); std::string kv_store_cmd = base::GetEnv("FMS_KEYVALUE_STORE_PLUGIN_CMD", ""); uint32_t use_remote_fencing = base::GetEnv("FMS_USE_REMOTE_FENCING", 0); + uint32_t prioritise_partition_size = +base::GetEnv("FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE", 1); + uint32_t relaxed_node_promotion = 
+base::GetEnv("FMS_RELAXED_NODE_PROMOTION", 0); // if not specified in fmd.conf, // takeover requests are valid for 20 seconds @@ -246,6 +255,14 @@ Consensus::Consensus() { use_remote_fencing_ = true; } + if (prioritise_partition_size == 1) { +prioritise_partition_size_ = true; + } + + if (use_consensus_ == true && relaxed_node_promotion == 1) { +relaxed_node_promotion_ = true; + } + // needed for base::Conf::NodeName() later base::Conf::InitNodeName(); } @@ -373,6 +390,10 @@ SaAisErrorT Consensus::CreateTakeoverRequest(const std::string& current_owner, return CreateTakeoverRequest(current_owner, proposed_owner, cluster_size); } + if (rc != SA_AIS_OK) { + return rc; + } + // wait up to max_takeover_retry seconds for request to be answered retries = 0; while (retries < max_takeover_retry) { @@ -546,9 +567,21 @@ Consensus::TakeoverState Consensus::HandleTakeoverRequest( LOG_NO("Other network size: %" PRIu64 ", our network size: %" PRIu64, proposed_cluster_size, cluster_size); + const std::string state_str = +tokens[static_cast<std::uint8_t>(TakeoverElements::STATE)]; + TakeoverState result; - if (proposed_cluster_size > cluster_size) { -result = TakeoverState::ACCEPTED; + if (state_str != +TakeoverStateStr[static_cast<std::uint8_t>(TakeoverState::NEW)]) { +return TakeoverState::UNDEFINED; + } + + if (prioritise_partition_size_ == true) { +if (proposed_cluster_size > cluster_size) { + result = TakeoverState::ACCEPTED; +} else { + result = TakeoverState::REJECTED; +} } else { result = TakeoverState::REJECTED; } diff --git a/src/osaf/consensus/consensus.h b/src/osaf/consensus/consensus.h index 6421c7c..2fbd3bd 100644 --- a/src/osaf/consensus/consensus.h +++ b/src/osaf/consensus/consensus.h @@ -57,6 +57,9 @@ class Consensus { // Is remote fencing enabled? bool IsRemoteFencingEnabled() const; + // Is relaxed node promotion enabled? 
+ bool IsRelaxedNodePromotionEnabled() const; + Consensus(); virtual ~Consensus(); @@ -66,7 +69,7 @@ class Consensus { UNDEFINED = 0, NEW = 1, ACCEPTED = 2, -REJECTED = 3, +REJECTED = 3 }; enum class TakeoverElements : std::uint8_t { @@ -85,13 +88,15 @@ class Consensus { private: bool use_consensus_ = false; bool use_remote_fencing_ = false; + bool prioritise_partition_size_ = false; + bool relaxed_node_promotion_ = false; uint32_t takeover_valid_time; uint32_t max_takeover_retry; const std::string kTestKeyname = "opensaf_write_test"; const std::chrono::milliseconds kSleepInterval = std::chrono::milliseconds(1000); // in ms static constexpr uint32_t kLockTimeout = 0; // lock is persistent
[devel] [PATCH 3/5] osaf: allow active SC to be preferred during network split [#2996]
Add FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE option to allow active SC to be preferred during a network split. The default behavior is to prefer the larger partition to maintain existing behaviour. Add configuration support for FMS_RELAXED_NODE_PROMOTION. --- src/osaf/consensus/consensus.cc | 39 --- src/osaf/consensus/consensus.h | 9 +++-- src/osaf/consensus/key_value.cc | 8 ++-- 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/src/osaf/consensus/consensus.cc b/src/osaf/consensus/consensus.cc index 112af7d..5304c4f 100644 --- a/src/osaf/consensus/consensus.cc +++ b/src/osaf/consensus/consensus.cc @@ -64,6 +64,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool graceful_takeover, cluster_size); if (rc != SA_AIS_OK) { LOG_WA("Takeover request failed (%d)", rc); + rc = SA_AIS_ERR_EXIST; return rc; } take_over_request_created = true; @@ -99,7 +100,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool graceful_takeover, if (rc == SA_AIS_OK) { LOG_NO("Active controller set to %s", base::Conf::NodeName().c_str()); } else { -LOG_ER("Failed to promote this node (%u)", rc); +LOG_WA("Failed to promote this node (%u)", rc); } return rc; @@ -197,6 +198,10 @@ bool Consensus::IsWritable() const { bool Consensus::IsRemoteFencingEnabled() const { return use_remote_fencing_; } +bool Consensus::IsRelaxedNodePromotionEnabled() const { + return relaxed_node_promotion_; +} + std::string Consensus::CurrentActive() const { TRACE_ENTER(); if (use_consensus_ == false) { @@ -228,6 +233,10 @@ Consensus::Consensus() { uint32_t split_brain_enable = base::GetEnv("FMS_SPLIT_BRAIN_PREVENTION", 0); std::string kv_store_cmd = base::GetEnv("FMS_KEYVALUE_STORE_PLUGIN_CMD", ""); uint32_t use_remote_fencing = base::GetEnv("FMS_USE_REMOTE_FENCING", 0); + uint32_t prioritise_partition_size = +base::GetEnv("FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE", 1); + uint32_t relaxed_node_promotion = +base::GetEnv("FMS_RELAXED_NODE_PROMOTION", 0); // if not specified in fmd.conf, // takeover requests are 
valid for 20 seconds @@ -246,6 +255,14 @@ Consensus::Consensus() { use_remote_fencing_ = true; } + if (prioritise_partition_size == 1) { +prioritise_partition_size_ = true; + } + + if (use_consensus_ == true && relaxed_node_promotion == 1) { +relaxed_node_promotion_ = true; + } + // needed for base::Conf::NodeName() later base::Conf::InitNodeName(); } @@ -373,6 +390,10 @@ SaAisErrorT Consensus::CreateTakeoverRequest(const std::string& current_owner, return CreateTakeoverRequest(current_owner, proposed_owner, cluster_size); } + if (rc != SA_AIS_OK) { + return rc; + } + // wait up to max_takeover_retry seconds for request to be answered retries = 0; while (retries < max_takeover_retry) { @@ -546,9 +567,21 @@ Consensus::TakeoverState Consensus::HandleTakeoverRequest( LOG_NO("Other network size: %" PRIu64 ", our network size: %" PRIu64, proposed_cluster_size, cluster_size); + const std::string state_str = +tokens[static_cast<std::uint8_t>(TakeoverElements::STATE)]; + TakeoverState result; - if (proposed_cluster_size > cluster_size) { -result = TakeoverState::ACCEPTED; + if (state_str != +TakeoverStateStr[static_cast<std::uint8_t>(TakeoverState::NEW)]) { +return TakeoverState::UNDEFINED; + } + + if (prioritise_partition_size_ == true) { +if (proposed_cluster_size > cluster_size) { + result = TakeoverState::ACCEPTED; +} else { + result = TakeoverState::REJECTED; +} } else { result = TakeoverState::REJECTED; } diff --git a/src/osaf/consensus/consensus.h b/src/osaf/consensus/consensus.h index 6421c7c..2fbd3bd 100644 --- a/src/osaf/consensus/consensus.h +++ b/src/osaf/consensus/consensus.h @@ -57,6 +57,9 @@ class Consensus { // Is remote fencing enabled? bool IsRemoteFencingEnabled() const; + // Is relaxed node promotion enabled? 
+ bool IsRelaxedNodePromotionEnabled() const; + Consensus(); virtual ~Consensus(); @@ -66,7 +69,7 @@ class Consensus { UNDEFINED = 0, NEW = 1, ACCEPTED = 2, -REJECTED = 3, +REJECTED = 3 }; enum class TakeoverElements : std::uint8_t { @@ -85,13 +88,15 @@ class Consensus { private: bool use_consensus_ = false; bool use_remote_fencing_ = false; + bool prioritise_partition_size_ = false; + bool relaxed_node_promotion_ = false; uint32_t takeover_valid_time; uint32_t max_takeover_retry; const std::string kTestKeyname = "opensaf_write_test"; const std::chrono::milliseconds kSleepInterval = std::chrono::milliseconds(1000); // in ms static constexpr uint32_t kLockTimeout = 0; // lock is persistent by default - static constexpr uint32_t kMaxRetry = 30; + static constexpr uint32_t kMaxRetry = 3; void CheckForExistingTakeoverRequest();