The following pull request was submitted through GitHub.
It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/7373

This e-mail was sent by the LXC bot; direct replies will not reach the author
unless they happen to be subscribed to this list.

=== Description (from pull-request) ===

From 3aec38499274068bf1862e0c0e4c60ae0fe3fee1 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanay...@canonical.com>
Date: Sat, 16 May 2020 18:15:40 +0100
Subject: [PATCH 1/2] Attempt to demote only offline nodes that are stand-by

Signed-off-by: Free Ekanayaka <free.ekanay...@canonical.com>
---
 lxd/cluster/membership.go | 48 +++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/lxd/cluster/membership.go b/lxd/cluster/membership.go
index 0013310b6b..a70833ae32 100644
--- a/lxd/cluster/membership.go
+++ b/lxd/cluster/membership.go
@@ -533,34 +533,32 @@ func Rebalance(state *state.State, gateway *Gateway) (string, []db.RaftNode, err
        candidates := make([]string, 0)
        for i, info := range currentRaftNodes {
                node := nodesByAddress[info.Address]
-               if node.IsOffline(offlineThreshold) && info.Role != db.RaftSpare {
-                       // Even the heartbeat timestamp is not recent
-                       // enough, let's try to connect to the node,
-                       // just in case the heartbeat is lagging behind
-                       // for some reason and the node is actually up.
-                       client, err := Connect(node.Address, gateway.cert, true)
-                       if err == nil {
-                               _, _, err = client.GetServer()
-                       }
-                       if err != nil {
-                               client, err := gateway.getClient()
-                               if err != nil {
-                                       return "", nil, errors.Wrap(err, "Failed to connect to local dqlite node")
-                               }
-                               defer client.Close()
-                               ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
-                               defer cancel()
-                               err = client.Assign(ctx, info.ID, db.RaftSpare)
-                               if err != nil {
-                                       return "", nil, errors.Wrap(err, "Failed to demote offline node")
+               if node.IsOffline(offlineThreshold) {
+                       if info.Role == db.RaftStandBy {
+                               // Even if the heartbeat timestamp is not recent
+                               // enough, let's try to connect to the node,
+                               // just in case the heartbeat is lagging behind
+                               // for some reason and the node is actually up.
+                               client, err := Connect(node.Address, gateway.cert, true)
+                               if err == nil {
+                                       _, _, err = client.GetServer()
                                }
-                               err = state.Cluster.Transaction(func(tx *db.ClusterTx) error {
-                                       return tx.RemoveNodeRole(node.ID, db.ClusterRoleDatabase)
-                               })
                                if err != nil {
-                                       return "", nil, errors.Wrap(err, "Failed to update node role")
+                                       client, err := gateway.getClient()
+                                       if err != nil {
+                                               return "", nil, errors.Wrap(err, "Failed to connect to local dqlite node")
+                                       }
+                                       defer client.Close()
+                                       ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+                                       defer cancel()
+                                       logger.Infof(
+                                               "Demote offline stand-by node %s (%s) to spare", node.Name, node.Address)
+                                       err = client.Assign(ctx, info.ID, db.RaftSpare)
+                                       if err != nil {
+                                               return "", nil, errors.Wrap(err, "Failed to demote offline node")
+                                       }
+                                       currentRaftNodes[i].Role = db.RaftSpare
                                }
-                               currentRaftNodes[i].Role = db.RaftSpare
                                continue
                        }
                }

From b64f7858c5028e40945b8955fddd28cd16299535 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanay...@canonical.com>
Date: Sat, 16 May 2020 18:17:12 +0100
Subject: [PATCH 2/2] When demoting a voter to spare, transition to stand-by
 first

This will let the node know that it's not a voter anymore, and
avoid disrupting the cluster.

Signed-off-by: Free Ekanayaka <free.ekanay...@canonical.com>
---
 lxd/api_cluster.go        |  2 +-
 lxd/cluster/membership.go | 42 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/lxd/api_cluster.go b/lxd/api_cluster.go
index dd0095b9e8..94eac84708 100644
--- a/lxd/api_cluster.go
+++ b/lxd/api_cluster.go
@@ -634,7 +634,7 @@ func clusterPutJoin(d *Daemon, req api.ClusterPut) response.Response {
                // role changes.
                _, _, err = client.RawQuery("POST", "/internal/cluster/rebalance", nil, "")
                if err != nil {
-                       return errors.Wrap(err, "Failed cluster rebalance request")
+                       logger.Warnf("Failed to trigger cluster rebalance: %v", err)
                }
 
                return nil
diff --git a/lxd/cluster/membership.go b/lxd/cluster/membership.go
index a70833ae32..b6f30cdc86 100644
--- a/lxd/cluster/membership.go
+++ b/lxd/cluster/membership.go
@@ -590,7 +590,7 @@ func Rebalance(state *state.State, gateway *Gateway) (string, []db.RaftNode, err
        address := ""
        for _, candidate := range candidates {
                node := nodesByAddress[candidate]
-               logger.Debugf(
+               logger.Infof(
                        "Found spare node %s (%s) to be promoted to %s", node.Name, node.Address, role)
                address = node.Address
                break
@@ -613,8 +613,6 @@ func Rebalance(state *state.State, gateway *Gateway) (string, []db.RaftNode, err
 
 // Assign a new role to the local dqlite node.
 func Assign(state *state.State, gateway *Gateway, nodes []db.RaftNode) error {
-       logger.Info("Assign new role to dqlite node")
-
        // Figure out our own address.
        address := ""
        err := state.Cluster.Transaction(func(tx *db.ClusterTx) error {
@@ -715,6 +713,44 @@ assign:
        }
        defer client.Close()
 
+       // If we're stepping back to spare, let's first transition to stand-by
+       // and wait for the configuration change to be notified to us. This
+       // prevents us from thinking we're still voters and potentially
+       // disrupting the cluster.
+       if info.Role == db.RaftSpare {
+               err = client.Assign(ctx, info.ID, db.RaftStandBy)
+               if err != nil {
+                       return errors.Wrap(err, "Failed to step back to stand-by")
+               }
+               local, err := gateway.getClient()
+               if err != nil {
+                       return errors.Wrap(err, "Failed to get local dqlite client")
+               }
+               notified := false
+               for i := 0; i < 10; i++ {
+                       time.Sleep(500 * time.Millisecond)
+                       servers, err := local.Cluster(context.Background())
+                       if err != nil {
+                               return errors.Wrap(err, "Failed to get current cluster")
+                       }
+                       for _, server := range servers {
+                               if server.Address != info.Address {
+                                       continue
+                               }
+                               if server.Role == db.RaftStandBy {
+                                       notified = true
+                                       break
+                               }
+                       }
+                       if notified {
+                               break
+                       }
+               }
+               if !notified {
+                       return fmt.Errorf("Timeout waiting for configuration change notification")
+               }
+       }
+
        err = client.Assign(ctx, info.ID, info.Role)
        if err != nil {
                return errors.Wrap(err, "Failed to assign role")
_______________________________________________
lxc-devel mailing list
lxc-devel@lists.linuxcontainers.org
http://lists.linuxcontainers.org/listinfo/lxc-devel

Reply via email to