The following pull request was submitted through GitHub. It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/4518
This e-mail was sent by the LXC bot; direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) === This should mitigate/solve #4485 and similar issues. Signed-off-by: Free Ekanayaka <[email protected]>
From 270516df3c0e82adef0b86854d440da740069dd6 Mon Sep 17 00:00:00 2001 From: Free Ekanayaka <[email protected]> Date: Wed, 2 May 2018 08:37:40 +0000 Subject: [PATCH] Take raft snapshots more frequently and at shutdown This should mitigate/solve #4485 and similar issues. Signed-off-by: Free Ekanayaka <[email protected]> --- lxd/cluster/raft.go | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/lxd/cluster/raft.go b/lxd/cluster/raft.go index fa540ce35..6a8df0547 100644 --- a/lxd/cluster/raft.go +++ b/lxd/cluster/raft.go @@ -13,8 +13,6 @@ import ( "strings" "time" - "golang.org/x/net/context" - "github.com/CanonicalLtd/dqlite" "github.com/CanonicalLtd/raft-http" "github.com/CanonicalLtd/raft-membership" @@ -270,11 +268,32 @@ func (i *raftInstance) MembershipChanger() raftmembership.Changer { func (i *raftInstance) Shutdown() error { logger.Info("Stop raft instance") - // Stop raft asynchronously to allow for a timeout. - errCh := make(chan error) + // Invoke raft APIs asynchronously to allow for a timeout. timeout := 10 * time.Second - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() + + // FIXME/TODO: We take a snapshot before when shutting down the daemon + // so there will be no uncompacted raft logs at the next + // startup. This is a workaround for slow log replay when + // the LXD daemon starts (see #4485). A more proper fix + // should be probably implemented in dqlite. + errCh := make(chan error) + timer := time.After(timeout) + go func() { + errCh <- i.raft.Snapshot().Error() + }() + // In case of error we just log a warning, since this is not really + // fatal. 
+ select { + case err := <-errCh: + if err != nil && err != raft.ErrNothingNewToSnapshot { + logger.Warnf("Failed to take raft snapshot: %v", err) + } + case <-timer: + logger.Warnf("Timeout waiting for raft to take a snapshot") + } + + errCh = make(chan error) + timer = time.After(timeout) go func() { errCh <- i.raft.Shutdown().Error() }() @@ -283,7 +302,7 @@ func (i *raftInstance) Shutdown() error { if err != nil { return errors.Wrap(err, "failed to shutdown raft") } - case <-ctx.Done(): + case <-timer: logger.Debug("Timeout waiting for raft to shutdown") return fmt.Errorf("raft did not shutdown within %s", timeout) @@ -381,6 +400,14 @@ func raftConfig(latency float64) *raft.Config { for _, duration := range durations { scale(duration) } + + // FIXME/TODO: We increase the frequency of snapshots here to keep the + // number of uncompacted raft logs low, and workaround slow + // log replay when the LXD daemon starts (see #4485). A more + // proper fix should be probably implemented in dqlite. + config.SnapshotThreshold = 64 + config.TrailingLogs = 128 + return config }
_______________________________________________ lxc-devel mailing list [email protected] http://lists.linuxcontainers.org/listinfo/lxc-devel
