The following pull request was submitted through GitHub. It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/7373
This e-mail was sent by the LXC bot; direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) ===
From 3aec38499274068bf1862e0c0e4c60ae0fe3fee1 Mon Sep 17 00:00:00 2001 From: Free Ekanayaka <free.ekanay...@canonical.com> Date: Sat, 16 May 2020 18:15:40 +0100 Subject: [PATCH 1/2] Attempt to demote only offline nodes that are stand-by Signed-off-by: Free Ekanayaka <free.ekanay...@canonical.com> --- lxd/cluster/membership.go | 48 +++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/lxd/cluster/membership.go b/lxd/cluster/membership.go index 0013310b6b..a70833ae32 100644 --- a/lxd/cluster/membership.go +++ b/lxd/cluster/membership.go @@ -533,34 +533,32 @@ func Rebalance(state *state.State, gateway *Gateway) (string, []db.RaftNode, err candidates := make([]string, 0) for i, info := range currentRaftNodes { node := nodesByAddress[info.Address] - if node.IsOffline(offlineThreshold) && info.Role != db.RaftSpare { - // Even the heartbeat timestamp is not recent - // enough, let's try to connect to the node, - // just in case the heartbeat is lagging behind - // for some reason and the node is actually up. - client, err := Connect(node.Address, gateway.cert, true) - if err == nil { - _, _, err = client.GetServer() - } - if err != nil { - client, err := gateway.getClient() - if err != nil { - return "", nil, errors.Wrap(err, "Failed to connect to local dqlite node") - } - defer client.Close() - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - err = client.Assign(ctx, info.ID, db.RaftSpare) - if err != nil { - return "", nil, errors.Wrap(err, "Failed to demote offline node") + if node.IsOffline(offlineThreshold) { + if info.Role == db.RaftStandBy { + // Even the heartbeat timestamp is not recent + // enough, let's try to connect to the node, + // just in case the heartbeat is lagging behind + // for some reason and the node is actually up. 
+ client, err := Connect(node.Address, gateway.cert, true) + if err == nil { + _, _, err = client.GetServer() } - err = state.Cluster.Transaction(func(tx *db.ClusterTx) error { - return tx.RemoveNodeRole(node.ID, db.ClusterRoleDatabase) - }) if err != nil { - return "", nil, errors.Wrap(err, "Failed to update node role") + client, err := gateway.getClient() + if err != nil { + return "", nil, errors.Wrap(err, "Failed to connect to local dqlite node") + } + defer client.Close() + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + logger.Infof( + "Demote offline stand-by node %s (%s) to spare", node.Name, node.Address) + err = client.Assign(ctx, info.ID, db.RaftSpare) + if err != nil { + return "", nil, errors.Wrap(err, "Failed to demote offline node") + } + currentRaftNodes[i].Role = db.RaftSpare } - currentRaftNodes[i].Role = db.RaftSpare continue } } From b64f7858c5028e40945b8955fddd28cd16299535 Mon Sep 17 00:00:00 2001 From: Free Ekanayaka <free.ekanay...@canonical.com> Date: Sat, 16 May 2020 18:17:12 +0100 Subject: [PATCH 2/2] When demoting a voter to spare, transition to stand-by first This will let the node know that it's not a voter anymore, and avoid disrupting the cluster. Signed-off-by: Free Ekanayaka <free.ekanay...@canonical.com> --- lxd/api_cluster.go | 2 +- lxd/cluster/membership.go | 42 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/lxd/api_cluster.go b/lxd/api_cluster.go index dd0095b9e8..94eac84708 100644 --- a/lxd/api_cluster.go +++ b/lxd/api_cluster.go @@ -634,7 +634,7 @@ func clusterPutJoin(d *Daemon, req api.ClusterPut) response.Response { // role changes. 
_, _, err = client.RawQuery("POST", "/internal/cluster/rebalance", nil, "") if err != nil { - return errors.Wrap(err, "Failed cluster rebalance request") + logger.Warnf("Failed to trigger cluster rebalance: %v", err) } return nil diff --git a/lxd/cluster/membership.go b/lxd/cluster/membership.go index a70833ae32..b6f30cdc86 100644 --- a/lxd/cluster/membership.go +++ b/lxd/cluster/membership.go @@ -590,7 +590,7 @@ func Rebalance(state *state.State, gateway *Gateway) (string, []db.RaftNode, err address := "" for _, candidate := range candidates { node := nodesByAddress[candidate] - logger.Debugf( + logger.Infof( "Found spare node %s (%s) to be promoted to %s", node.Name, node.Address, role) address = node.Address break @@ -613,8 +613,6 @@ func Rebalance(state *state.State, gateway *Gateway) (string, []db.RaftNode, err // Assign a new role to the local dqlite node. func Assign(state *state.State, gateway *Gateway, nodes []db.RaftNode) error { - logger.Info("Assign new role to dqlite node") - // Figure out our own address. address := "" err := state.Cluster.Transaction(func(tx *db.ClusterTx) error { @@ -715,6 +713,44 @@ assign: } defer client.Close() + // If we're stepping back to spare, let's first transition to stand-by + // and wait for the configuration change to be notified to us. This + // prevent us from thinking we're still voters and potentially disrupt + // the cluster. 
+ if info.Role == db.RaftSpare { + err = client.Assign(ctx, info.ID, db.RaftStandBy) + if err != nil { + return errors.Wrap(err, "Failed to step back to stand-by") + } + local, err := gateway.getClient() + if err != nil { + return errors.Wrap(err, "Failed to get local dqlite client") + } + notified := false + for i := 0; i < 10; i++ { + time.Sleep(500 * time.Millisecond) + servers, err := local.Cluster(context.Background()) + if err != nil { + return errors.Wrap(err, "Failed to get current cluster") + } + for _, server := range servers { + if server.Address != info.Address { + continue + } + if server.Role == db.RaftStandBy { + notified = true + break + } + } + if notified { + break + } + } + if !notified { + return fmt.Errorf("Timeout waiting for configuration change notification") + } + } + err = client.Assign(ctx, info.ID, info.Role) if err != nil { return errors.Wrap(err, "Failed to assign role")
_______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel