The following pull request was submitted through Github.
It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/7138

This e-mail was sent by the LXC bot, direct replies will not reach the author
unless they happen to be subscribed to this list.

=== Description (from pull-request) ===
Should fix #7133.
From 98894fe289541249e9c5775df114a6664ec485c2 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanay...@canonical.com>
Date: Mon, 6 Apr 2020 10:56:00 +0100
Subject: [PATCH 1/5] lxd/cluster: add RemoveRaftNode() to force removing a
 raft node

Signed-off-by: Free Ekanayaka <free.ekanay...@canonical.com>
---
 lxd/cluster/recover.go | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/lxd/cluster/recover.go b/lxd/cluster/recover.go
index 780feb5a1c..b41eb3cf62 100644
--- a/lxd/cluster/recover.go
+++ b/lxd/cluster/recover.go
@@ -1,10 +1,13 @@
 package cluster
 
 import (
+       "context"
        "fmt"
        "path/filepath"
+       "time"
 
        dqlite "github.com/canonical/go-dqlite"
+       client "github.com/canonical/go-dqlite/client"
        "github.com/lxc/lxd/lxd/db"
        "github.com/lxc/lxd/lxd/node"
        "github.com/pkg/errors"
@@ -87,3 +90,38 @@ func Recover(database *db.Node) error {
 
        return nil
 }
+
+// RemoveRaftNode removes a raft node from the raft configuration.
+func RemoveRaftNode(gateway *Gateway, address string) error {
+       nodes, err := gateway.currentRaftNodes()
+       if err != nil {
+               return errors.Wrap(err, "Failed to get current raft nodes")
+       }
+       var id uint64
+       for _, node := range nodes {
+               if node.Address == address {
+                       id = node.ID
+                       break
+               }
+       }
+       if id == 0 {
+               return fmt.Errorf("No raft node with address %q", address)
+       }
+
+       ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+       defer cancel()
+       client, err := client.FindLeader(
+               ctx, gateway.NodeStore(),
+               client.WithDialFunc(gateway.raftDial()),
+               client.WithLogFunc(DqliteLog),
+       )
+       if err != nil {
+               return errors.Wrap(err, "Failed to connect to cluster leader")
+       }
+       defer client.Close()
+       err = client.Remove(ctx, id)
+       if err != nil {
+               return errors.Wrap(err, "Failed to remove node")
+       }
+       return nil
+}

From 85db63806a239ca011ede24d6d81641ad46a27d6 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanay...@canonical.com>
Date: Mon, 6 Apr 2020 10:56:23 +0100
Subject: [PATCH 2/5] api: Add "DELETE /internal/cluster/raft/<address>"
 endpoint

Signed-off-by: Free Ekanayaka <free.ekanay...@canonical.com>
---
 lxd/api_cluster.go  | 17 +++++++++++++++++
 lxd/api_internal.go |  1 +
 2 files changed, 18 insertions(+)

diff --git a/lxd/api_cluster.go b/lxd/api_cluster.go
index f9db3dc5da..5a812856e5 100644
--- a/lxd/api_cluster.go
+++ b/lxd/api_cluster.go
@@ -76,6 +76,12 @@ var internalClusterHandoverCmd = APIEndpoint{
        Post: APIEndpointAction{Handler: internalClusterPostHandover},
 }
 
+var internalClusterRaftNodeCmd = APIEndpoint{
+       Path: "cluster/raft-node/{address}",
+
+       Delete: APIEndpointAction{Handler: internalClusterRaftNodeDelete},
+}
+
 // Return information about the cluster.
 func clusterGet(d *Daemon, r *http.Request) response.Response {
        name := ""
@@ -1491,3 +1497,14 @@ func clusterCheckNetworksMatch(cluster *db.Cluster, reqNetworks []api.Network) e
        }
        return nil
 }
+
+// Used as low-level recovering helper.
+func internalClusterRaftNodeDelete(d *Daemon, r *http.Request) 
response.Response {
+       address := mux.Vars(r)["address"]
+       err := cluster.RemoveRaftNode(d.gateway, address)
+       if err != nil {
+               return response.SmartError(err)
+       }
+
+       return response.SyncResponse(true, nil)
+}
diff --git a/lxd/api_internal.go b/lxd/api_internal.go
index 46873aeb52..c5f1def35d 100644
--- a/lxd/api_internal.go
+++ b/lxd/api_internal.go
@@ -49,6 +49,7 @@ var apiInternal = []APIEndpoint{
        internalGarbageCollectorCmd,
        internalRAFTSnapshotCmd,
        internalClusterHandoverCmd,
+       internalClusterRaftNodeCmd,
 }
 
 var internalShutdownCmd = APIEndpoint{
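
For reference, the new endpoint can also be exercised directly over the daemon's
local unix socket. A minimal sketch (the socket path and member address below are
illustrative; the intended entry point is the CLI command added in the next patch):

```bash
# Hypothetical socket path and address; adjust to the deployment
# (e.g. the snap uses /var/snap/lxd/common/lxd/unix.socket).
curl --unix-socket /var/lib/lxd/unix.socket \
  -X DELETE "lxd/internal/cluster/raft-node/10.1.1.102:8443"
```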

From a2ec082e11f788dbd6ce91ebd4836e68099edba8 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanay...@canonical.com>
Date: Mon, 6 Apr 2020 10:57:17 +0100
Subject: [PATCH 3/5] lxd: Add "lxd cluster remove-raft-node" recovery command

Signed-off-by: Free Ekanayaka <free.ekanay...@canonical.com>
---
 lxd/main_cluster.go | 71 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 70 insertions(+), 1 deletion(-)

diff --git a/lxd/main_cluster.go b/lxd/main_cluster.go
index 228b144a98..f9341b10bd 100644
--- a/lxd/main_cluster.go
+++ b/lxd/main_cluster.go
@@ -12,6 +12,7 @@ import (
        "github.com/lxc/lxd/lxd/cluster"
        "github.com/lxc/lxd/lxd/db"
        "github.com/lxc/lxd/lxd/sys"
+       "github.com/lxc/lxd/lxd/util"
        "github.com/lxc/lxd/shared"
        "github.com/pkg/errors"
        "github.com/spf13/cobra"
@@ -36,6 +37,10 @@ func (c *cmdCluster) Command() *cobra.Command {
        recover := cmdClusterRecoverFromQuorumLoss{global: c.global}
        cmd.AddCommand(recover.Command())
 
+       // Remove a raft node.
+       removeRaftNode := cmdClusterRemoveRaftNode{global: c.global}
+       cmd.AddCommand(removeRaftNode.Command())
+
        return cmd
 }
 
@@ -102,7 +107,7 @@ func (c *cmdClusterRecoverFromQuorumLoss) Run(cmd *cobra.Command, args []string)
                return fmt.Errorf("The LXD daemon is running, please stop it first.")
        }
 
-       // Prompt for confiromation unless --quiet was passed.
+       // Prompt for confirmation unless --quiet was passed.
        if !c.flagNonInteractive {
                err := c.promptConfirmation()
                if err != nil {
@@ -147,3 +152,67 @@ Do you want to proceed? (yes/no): `)
        }
        return nil
 }
+
+type cmdClusterRemoveRaftNode struct {
+       global             *cmdGlobal
+       flagNonInteractive bool
+}
+
+func (c *cmdClusterRemoveRaftNode) Command() *cobra.Command {
+       cmd := &cobra.Command{}
+       cmd.Use = "remove-raft-node <address>"
+       cmd.Aliases = []string{"ls"}
+       cmd.Short = "Remove a raft node from the raft configuration"
+
+       cmd.RunE = c.Run
+
+       cmd.Flags().BoolVarP(&c.flagNonInteractive, "quiet", "q", false, "Don't require user confirmation")
+
+       return cmd
+}
+
+func (c *cmdClusterRemoveRaftNode) Run(cmd *cobra.Command, args []string) error {
+       if len(args) != 1 {
+               cmd.Help()
+               return fmt.Errorf("Missing required arguments")
+       }
+
+       address := util.CanonicalNetworkAddress(args[0])
+
+       // Prompt for confirmation unless --quiet was passed.
+       if !c.flagNonInteractive {
+               err := c.promptConfirmation()
+               if err != nil {
+                       return err
+               }
+       }
+
+       client, err := lxd.ConnectLXDUnix("", nil)
+       if err != nil {
+               return errors.Wrapf(err, "Failed to connect to LXD daemon")
+       }
+
+       endpoint := fmt.Sprintf("/internal/cluster/raft-node/%s", address)
+       _, _, err = client.RawQuery("DELETE", endpoint, nil, "")
+       if err != nil {
+               return err
+       }
+
+       return nil
+}
+
+func (c *cmdClusterRemoveRaftNode) promptConfirmation() error {
+       reader := bufio.NewReader(os.Stdin)
+       fmt.Printf(`You should run this command only if you ended up in an
+inconsistent state where a node has been uncleanly removed (i.e. it doesn't show
+up in "lxc cluster list" but it's still in the raft configuration).
+
+Do you want to proceed? (yes/no): `)
+       input, _ := reader.ReadString('\n')
+       input = strings.TrimSuffix(input, "\n")
+
+       if !shared.StringInSlice(strings.ToLower(input), []string{"yes"}) {
+               return fmt.Errorf("Remove raft node operation aborted")
+       }
+       return nil
+}
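
Typical usage of the new command is sketched below (the address is illustrative).
The `-q` flag skips the confirmation prompt, and unlike the quorum-loss recovery
command above, the daemon must be running, since the command talks to it over the
local unix socket:

```bash
# Hypothetical stale member address; use the address:port listed in raft_nodes.
lxd cluster remove-raft-node -q 10.1.1.102:8443
```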

From ee4329a2c70447def9265eefc6ef5b999351dfd0 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanay...@canonical.com>
Date: Mon, 6 Apr 2020 11:29:12 +0100
Subject: [PATCH 4/5] doc: Add paragraph about "lxd cluster remove-raft-node"

Signed-off-by: Free Ekanayaka <free.ekanay...@canonical.com>
---
 doc/clustering.md | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/doc/clustering.md b/doc/clustering.md
index e9ee9d5605..cb90f4ed55 100644
--- a/doc/clustering.md
+++ b/doc/clustering.md
@@ -217,7 +217,7 @@ transition to the Blocked state, until you upgrade the very last
 one. At that point the blocked nodes will notice that there is no
 out-of-date node left and will become operational again.
 
-### Disaster recovery
+### Recover from quorum loss
 
 Every LXD cluster has up to 3 members that serve as database nodes. If you
 permanently lose a majority of the cluster members that are serving as database
@@ -294,6 +294,24 @@ lxc delete bionic
 lxc pull file bionic/etc/hosts .
 ```
 
+### Manually altering Raft membership
+
+There might be situations in which you need to manually alter the Raft
+membership configuration of the cluster because some unexpected behavior
+occurred.
+
+For example, if you have a cluster member that was removed uncleanly, it might not
+show up in `lxc cluster list` but still be part of the Raft configuration (you
+can see that with `lxd sql local "SELECT * FROM raft_nodes"`).
+
+In that case you can run:
+
+```bash
+lxd cluster remove-raft-node <address>
+```
+
+to remove the leftover node.
+
 ## Images
 
 By default, LXD will replicate images on as many cluster members as you
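
Putting the documented procedure together, a recovery session might look roughly
like the following sketch (run on a surviving member; the address is illustrative):

```bash
# The stale member no longer appears in the cluster listing...
lxc cluster list

# ...but is still present in the local raft configuration.
lxd sql local "SELECT * FROM raft_nodes"

# Remove the leftover raft entry (-q skips the interactive confirmation).
lxd cluster remove-raft-node -q 10.1.1.102:8443
```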

From b1df3fb8e5402b4f2a8f7f3e07b71a538152e8f0 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanay...@canonical.com>
Date: Mon, 6 Apr 2020 11:29:36 +0100
Subject: [PATCH 5/5] test: Add test exercising "lxd cluster remove-raft-node"

Signed-off-by: Free Ekanayaka <free.ekanay...@canonical.com>
---
 test/suites/clustering.sh | 106 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git a/test/suites/clustering.sh b/test/suites/clustering.sh
index ab4e35714e..801cdc33a2 100644
--- a/test/suites/clustering.sh
+++ b/test/suites/clustering.sh
@@ -1695,3 +1695,109 @@ test_clustering_rebalance() {
   kill_lxd "${LXD_THREE_DIR}"
   kill_lxd "${LXD_FOUR_DIR}"
 }
+
+# Recover a cluster where a raft node was removed from the nodes table but not
+# from the raft configuration.
+test_clustering_remove_raft_node() {
+  # shellcheck disable=2039
+  local LXD_DIR
+
+  setup_clustering_bridge
+  prefix="lxd$$"
+  bridge="${prefix}"
+
+  setup_clustering_netns 1
+  LXD_ONE_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_ONE_DIR}"
+  ns1="${prefix}1"
+  spawn_lxd_and_bootstrap_cluster "${ns1}" "${bridge}" "${LXD_ONE_DIR}"
+
+  # Add a newline at the end of each line. YAML has weird rules.
+  cert=$(sed ':a;N;$!ba;s/\n/\n\n/g' "${LXD_ONE_DIR}/server.crt")
+
+  # Spawn a second node
+  setup_clustering_netns 2
+  LXD_TWO_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_TWO_DIR}"
+  ns2="${prefix}2"
+  spawn_lxd_and_join_cluster "${ns2}" "${bridge}" "${cert}" 2 1 "${LXD_TWO_DIR}"
+
+  # Configuration keys can be changed on any node.
+  LXD_DIR="${LXD_TWO_DIR}" lxc config set cluster.offline_threshold 40
+  LXD_DIR="${LXD_ONE_DIR}" lxc info | grep -q 'cluster.offline_threshold: "40"'
+  LXD_DIR="${LXD_TWO_DIR}" lxc info | grep -q 'cluster.offline_threshold: "40"'
+
+  # The preseeded network bridge exists on all nodes.
+  ns1_pid="$(cat "${TEST_DIR}/ns/${ns1}/PID")"
+  ns2_pid="$(cat "${TEST_DIR}/ns/${ns2}/PID")"
+  nsenter -m -n -t "${ns1_pid}" -- ip link show "${bridge}" > /dev/null
+  nsenter -m -n -t "${ns2_pid}" -- ip link show "${bridge}" > /dev/null
+
+  # Create a pending network and pool, to show that they are not
+  # considered when checking if the joining node has all the required
+  # networks and pools.
+  LXD_DIR="${LXD_TWO_DIR}" lxc storage create pool1 dir --target node1
+  LXD_DIR="${LXD_ONE_DIR}" lxc network create net1 --target node2
+
+  # Spawn a third node, using the non-leader node2 as join target.
+  setup_clustering_netns 3
+  LXD_THREE_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_THREE_DIR}"
+  ns3="${prefix}3"
+  spawn_lxd_and_join_cluster "${ns3}" "${bridge}" "${cert}" 3 2 "${LXD_THREE_DIR}"
+
+  # Spawn a fourth node, this will be a non-database node.
+  setup_clustering_netns 4
+  LXD_FOUR_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_FOUR_DIR}"
+  ns4="${prefix}4"
+  spawn_lxd_and_join_cluster "${ns4}" "${bridge}" "${cert}" 4 1 "${LXD_FOUR_DIR}"
+
+  # Kill the second node, to prevent it from transferring its database role at shutdown.
+  kill -9 "$(cat "${LXD_TWO_DIR}/lxd.pid")"
+
+  # Remove the second node from the database but not from the raft configuration.
+  LXD_DIR="${LXD_ONE_DIR}" lxd sql global "DELETE FROM nodes WHERE address = '10.1.1.102:8443'"
+
+  # The node does not appear anymore in the cluster list.
+  ! LXD_DIR="${LXD_ONE_DIR}" lxc cluster list | grep -q "node2" || false
+
+  # There are only 2 database nodes.
+  LXD_DIR="${LXD_ONE_DIR}" lxc cluster list | grep "node1" | grep -q "YES"
+  LXD_DIR="${LXD_ONE_DIR}" lxc cluster list | grep "node3" | grep -q "YES"
+  LXD_DIR="${LXD_ONE_DIR}" lxc cluster list | grep "node4" | grep -q "NO"
+
+  # The second node is still in the raft_nodes table.
+  LXD_DIR="${LXD_ONE_DIR}" lxd sql local "SELECT * FROM raft_nodes" | grep -q "10.1.1.102"
+
+  # Force removing the raft node.
+  LXD_DIR="${LXD_ONE_DIR}" lxd cluster remove-raft-node -q "10.1.1.102"
+
+  # Wait for a heartbeat to propagate.
+  sleep 15
+
+  # We're back to 3 database nodes.
+  LXD_DIR="${LXD_ONE_DIR}" lxc cluster list | grep "node1" | grep -q "YES"
+  LXD_DIR="${LXD_ONE_DIR}" lxc cluster list | grep "node3" | grep -q "YES"
+  LXD_DIR="${LXD_ONE_DIR}" lxc cluster list | grep "node4" | grep -q "YES"
+
+  # The second node is gone from the raft_nodes table.
+  ! LXD_DIR="${LXD_ONE_DIR}" lxd sql local "SELECT * FROM raft_nodes" | grep -q "10.1.1.102" || false
+
+  LXD_DIR="${LXD_ONE_DIR}" lxd shutdown
+  LXD_DIR="${LXD_THREE_DIR}" lxd shutdown
+  LXD_DIR="${LXD_FOUR_DIR}" lxd shutdown
+  sleep 0.5
+  rm -f "${LXD_ONE_DIR}/unix.socket"
+  rm -f "${LXD_TWO_DIR}/unix.socket"
+  rm -f "${LXD_THREE_DIR}/unix.socket"
+  rm -f "${LXD_FOUR_DIR}/unix.socket"
+
+  teardown_clustering_netns
+  teardown_clustering_bridge
+
+  kill_lxd "${LXD_ONE_DIR}"
+  kill_lxd "${LXD_TWO_DIR}"
+  kill_lxd "${LXD_THREE_DIR}"
+  kill_lxd "${LXD_FOUR_DIR}"
+}