Re: [devel] [PATCH 3/5] osaf: allow active SC to be preferred during network split [#2996]

2019-01-21 Thread Minh Hon Chau

ack, review only. Thanks/Minh

On 21/1/19 2:52 pm, Gary Lee wrote:

Add the FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE option to allow the
active SC to be preferred during a network split. The default
behaviour is to prefer the larger partition, which maintains the
existing behaviour.

Add configuration support for FMS_RELAXED_NODE_PROMOTION.
---
  src/osaf/consensus/consensus.cc | 39 ++++++++++++++++++++++++++++++++++++---
  src/osaf/consensus/consensus.h  |  9 +++--
  src/osaf/consensus/key_value.cc |  8 ++--
  3 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/src/osaf/consensus/consensus.cc b/src/osaf/consensus/consensus.cc
index 112af7d..5304c4f 100644
--- a/src/osaf/consensus/consensus.cc
+++ b/src/osaf/consensus/consensus.cc
@@ -64,6 +64,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool graceful_takeover,
 cluster_size);
  if (rc != SA_AIS_OK) {
LOG_WA("Takeover request failed (%d)", rc);
+  rc = SA_AIS_ERR_EXIST;
return rc;
  }
  take_over_request_created = true;
@@ -99,7 +100,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool graceful_takeover,
if (rc == SA_AIS_OK) {
  LOG_NO("Active controller set to %s", base::Conf::NodeName().c_str());
} else {
-LOG_ER("Failed to promote this node (%u)", rc);
+LOG_WA("Failed to promote this node (%u)", rc);
}
  
return rc;

@@ -197,6 +198,10 @@ bool Consensus::IsWritable() const {
  
  bool Consensus::IsRemoteFencingEnabled() const { return use_remote_fencing_; }
  
+bool Consensus::IsRelaxedNodePromotionEnabled() const {

+  return relaxed_node_promotion_;
+}
+
  std::string Consensus::CurrentActive() const {
TRACE_ENTER();
if (use_consensus_ == false) {
@@ -228,6 +233,10 @@ Consensus::Consensus() {
uint32_t split_brain_enable = base::GetEnv("FMS_SPLIT_BRAIN_PREVENTION", 0);
std::string kv_store_cmd = base::GetEnv("FMS_KEYVALUE_STORE_PLUGIN_CMD", "");
uint32_t use_remote_fencing = base::GetEnv("FMS_USE_REMOTE_FENCING", 0);
+  uint32_t prioritise_partition_size =
+base::GetEnv("FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE", 1);
+  uint32_t relaxed_node_promotion =
+base::GetEnv("FMS_RELAXED_NODE_PROMOTION", 0);
  
// if not specified in fmd.conf,

// takeover requests are valid for 20 seconds
@@ -246,6 +255,14 @@ Consensus::Consensus() {
  use_remote_fencing_ = true;
}
  
+  if (prioritise_partition_size == 1) {

+prioritise_partition_size_ = true;
+  }
+
+  if (use_consensus_ == true && relaxed_node_promotion == 1) {
+relaxed_node_promotion_ = true;
+  }
+
// needed for base::Conf::NodeName() later
base::Conf::InitNodeName();
  }
@@ -373,6 +390,10 @@ SaAisErrorT Consensus::CreateTakeoverRequest(const std::string& current_owner,
  return CreateTakeoverRequest(current_owner, proposed_owner, cluster_size);
}
  
+  if (rc != SA_AIS_OK) {

+ return rc;
+  }
+
// wait up to max_takeover_retry seconds for request to be answered
retries = 0;
while (retries < max_takeover_retry) {
@@ -546,9 +567,21 @@ Consensus::TakeoverState Consensus::HandleTakeoverRequest(
LOG_NO("Other network size: %" PRIu64 ", our network size: %" PRIu64,
   proposed_cluster_size, cluster_size);
  
+  const std::string state_str =

+tokens[static_cast<std::uint8_t>(TakeoverElements::STATE)];
+
TakeoverState result;
-  if (proposed_cluster_size > cluster_size) {
-result = TakeoverState::ACCEPTED;
+  if (state_str !=
+TakeoverStateStr[static_cast<std::uint8_t>(TakeoverState::NEW)]) {
+return TakeoverState::UNDEFINED;
+  }
+
+  if (prioritise_partition_size_ == true) {
+if (proposed_cluster_size > cluster_size) {
+  result = TakeoverState::ACCEPTED;
+} else {
+  result = TakeoverState::REJECTED;
+}
} else {
  result = TakeoverState::REJECTED;
}
diff --git a/src/osaf/consensus/consensus.h b/src/osaf/consensus/consensus.h
index 6421c7c..2fbd3bd 100644
--- a/src/osaf/consensus/consensus.h
+++ b/src/osaf/consensus/consensus.h
@@ -57,6 +57,9 @@ class Consensus {
// Is remote fencing enabled?
bool IsRemoteFencingEnabled() const;
  
+  // Is relaxed node promotion enabled?

+  bool IsRelaxedNodePromotionEnabled() const;
+
Consensus();
virtual ~Consensus();
  
@@ -66,7 +69,7 @@ class Consensus {

  UNDEFINED = 0,
  NEW = 1,
  ACCEPTED = 2,
-REJECTED = 3,
+REJECTED = 3
};
  
enum class TakeoverElements : std::uint8_t {

@@ -85,13 +88,15 @@ class Consensus {
   private:
bool use_consensus_ = false;
bool use_remote_fencing_ = false;
+  bool prioritise_partition_size_ = false;
+  bool relaxed_node_promotion_ = false;
uint32_t takeover_valid_time;
uint32_t max_takeover_retry;
const std::string kTestKeyname = "opensaf_write_test";
const std::chrono::milliseconds kSleepInterval =
std::chrono::milliseconds(1000);  // in ms
static constexpr uint32_t kLockTimeout = 0;  // lock is persistent 

[devel] [PATCH 3/5] osaf: allow active SC to be preferred during network split [#2996]

2019-01-20 Thread Gary Lee
Add the FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE option to allow the
active SC to be preferred during a network split. The default
behaviour is to prefer the larger partition, which maintains the
existing behaviour.

Add configuration support for FMS_RELAXED_NODE_PROMOTION.
---
 src/osaf/consensus/consensus.cc | 39 ++++++++++++++++++++++++++++++++++++---
 src/osaf/consensus/consensus.h  |  9 +++--
 src/osaf/consensus/key_value.cc |  8 ++--
 3 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/src/osaf/consensus/consensus.cc b/src/osaf/consensus/consensus.cc
index 112af7d..5304c4f 100644
--- a/src/osaf/consensus/consensus.cc
+++ b/src/osaf/consensus/consensus.cc
@@ -64,6 +64,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool graceful_takeover,
cluster_size);
 if (rc != SA_AIS_OK) {
   LOG_WA("Takeover request failed (%d)", rc);
+  rc = SA_AIS_ERR_EXIST;
   return rc;
 }
 take_over_request_created = true;
@@ -99,7 +100,7 @@ SaAisErrorT Consensus::PromoteThisNode(const bool graceful_takeover,
   if (rc == SA_AIS_OK) {
 LOG_NO("Active controller set to %s", base::Conf::NodeName().c_str());
   } else {
-LOG_ER("Failed to promote this node (%u)", rc);
+LOG_WA("Failed to promote this node (%u)", rc);
   }
 
   return rc;
@@ -197,6 +198,10 @@ bool Consensus::IsWritable() const {
 
 bool Consensus::IsRemoteFencingEnabled() const { return use_remote_fencing_; }
 
+bool Consensus::IsRelaxedNodePromotionEnabled() const {
+  return relaxed_node_promotion_;
+}
+
 std::string Consensus::CurrentActive() const {
   TRACE_ENTER();
   if (use_consensus_ == false) {
@@ -228,6 +233,10 @@ Consensus::Consensus() {
   uint32_t split_brain_enable = base::GetEnv("FMS_SPLIT_BRAIN_PREVENTION", 0);
   std::string kv_store_cmd = base::GetEnv("FMS_KEYVALUE_STORE_PLUGIN_CMD", "");
   uint32_t use_remote_fencing = base::GetEnv("FMS_USE_REMOTE_FENCING", 0);
+  uint32_t prioritise_partition_size =
+base::GetEnv("FMS_TAKEOVER_PRIORITISE_PARTITION_SIZE", 1);
+  uint32_t relaxed_node_promotion =
+base::GetEnv("FMS_RELAXED_NODE_PROMOTION", 0);
 
   // if not specified in fmd.conf,
   // takeover requests are valid for 20 seconds
@@ -246,6 +255,14 @@ Consensus::Consensus() {
 use_remote_fencing_ = true;
   }
 
+  if (prioritise_partition_size == 1) {
+prioritise_partition_size_ = true;
+  }
+
+  if (use_consensus_ == true && relaxed_node_promotion == 1) {
+relaxed_node_promotion_ = true;
+  }
+
   // needed for base::Conf::NodeName() later
   base::Conf::InitNodeName();
 }
@@ -373,6 +390,10 @@ SaAisErrorT Consensus::CreateTakeoverRequest(const std::string& current_owner,
 return CreateTakeoverRequest(current_owner, proposed_owner, cluster_size);
   }
 
+  if (rc != SA_AIS_OK) {
+ return rc;
+  }
+
   // wait up to max_takeover_retry seconds for request to be answered
   retries = 0;
   while (retries < max_takeover_retry) {
@@ -546,9 +567,21 @@ Consensus::TakeoverState Consensus::HandleTakeoverRequest(
   LOG_NO("Other network size: %" PRIu64 ", our network size: %" PRIu64,
  proposed_cluster_size, cluster_size);
 
+  const std::string state_str =
+tokens[static_cast<std::uint8_t>(TakeoverElements::STATE)];
+
   TakeoverState result;
-  if (proposed_cluster_size > cluster_size) {
-result = TakeoverState::ACCEPTED;
+  if (state_str !=
+TakeoverStateStr[static_cast<std::uint8_t>(TakeoverState::NEW)]) {
+return TakeoverState::UNDEFINED;
+  }
+
+  if (prioritise_partition_size_ == true) {
+if (proposed_cluster_size > cluster_size) {
+  result = TakeoverState::ACCEPTED;
+} else {
+  result = TakeoverState::REJECTED;
+}
   } else {
 result = TakeoverState::REJECTED;
   }
diff --git a/src/osaf/consensus/consensus.h b/src/osaf/consensus/consensus.h
index 6421c7c..2fbd3bd 100644
--- a/src/osaf/consensus/consensus.h
+++ b/src/osaf/consensus/consensus.h
@@ -57,6 +57,9 @@ class Consensus {
   // Is remote fencing enabled?
   bool IsRemoteFencingEnabled() const;
 
+  // Is relaxed node promotion enabled?
+  bool IsRelaxedNodePromotionEnabled() const;
+
   Consensus();
   virtual ~Consensus();
 
@@ -66,7 +69,7 @@ class Consensus {
 UNDEFINED = 0,
 NEW = 1,
 ACCEPTED = 2,
-REJECTED = 3,
+REJECTED = 3
   };
 
   enum class TakeoverElements : std::uint8_t {
@@ -85,13 +88,15 @@ class Consensus {
  private:
   bool use_consensus_ = false;
   bool use_remote_fencing_ = false;
+  bool prioritise_partition_size_ = false;
+  bool relaxed_node_promotion_ = false;
   uint32_t takeover_valid_time;
   uint32_t max_takeover_retry;
   const std::string kTestKeyname = "opensaf_write_test";
   const std::chrono::milliseconds kSleepInterval =
   std::chrono::milliseconds(1000);  // in ms
   static constexpr uint32_t kLockTimeout = 0;  // lock is persistent by default
-  static constexpr uint32_t kMaxRetry = 30;
+  static constexpr uint32_t kMaxRetry = 3;
 
   void CheckForExistingTakeoverRequest();