The following pull request was submitted through Github.
It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/4518

This e-mail was sent by the LXC bot, direct replies will not reach the author
unless they happen to be subscribed to this list.

=== Description (from pull-request) ===
This should mitigate/solve #4485 and similar issues.

Signed-off-by: Free Ekanayaka <[email protected]>
From 270516df3c0e82adef0b86854d440da740069dd6 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <[email protected]>
Date: Wed, 2 May 2018 08:37:40 +0000
Subject: [PATCH] Take raft snapshots more frequently and at shutdown

This should mitigate/solve #4485 and similar issues.

Signed-off-by: Free Ekanayaka <[email protected]>
---
 lxd/cluster/raft.go | 41 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/lxd/cluster/raft.go b/lxd/cluster/raft.go
index fa540ce35..6a8df0547 100644
--- a/lxd/cluster/raft.go
+++ b/lxd/cluster/raft.go
@@ -13,8 +13,6 @@ import (
        "strings"
        "time"
 
-       "golang.org/x/net/context"
-
        "github.com/CanonicalLtd/dqlite"
        "github.com/CanonicalLtd/raft-http"
        "github.com/CanonicalLtd/raft-membership"
@@ -270,11 +268,32 @@ func (i *raftInstance) MembershipChanger() raftmembership.Changer {
 func (i *raftInstance) Shutdown() error {
        logger.Info("Stop raft instance")
 
-       // Stop raft asynchronously to allow for a timeout.
-       errCh := make(chan error)
+       // Invoke raft APIs asynchronously to allow for a timeout.
        timeout := 10 * time.Second
-       ctx, cancel := context.WithTimeout(context.Background(), timeout)
-       defer cancel()
+
+       // FIXME/TODO: We take a snapshot right before shutting down the
+       //             daemon so there will be no uncompacted raft logs at the
+       //             next startup. This is a workaround for slow log replay
+       //             when the LXD daemon starts (see #4485). A proper fix
+       //             should probably be implemented in dqlite.
+       errCh := make(chan error)
+       timer := time.After(timeout)
+       go func() {
+               errCh <- i.raft.Snapshot().Error()
+       }()
+       // In case of error we just log a warning, since this is not really
+       // fatal.
+       select {
+       case err := <-errCh:
+               if err != nil && err != raft.ErrNothingNewToSnapshot {
+                       logger.Warnf("Failed to take raft snapshot: %v", err)
+               }
+       case <-timer:
+               logger.Warnf("Timeout waiting for raft to take a snapshot")
+       }
+
+       errCh = make(chan error)
+       timer = time.After(timeout)
        go func() {
                errCh <- i.raft.Shutdown().Error()
        }()
@@ -283,7 +302,7 @@ func (i *raftInstance) Shutdown() error {
                if err != nil {
                        return errors.Wrap(err, "failed to shutdown raft")
                }
-       case <-ctx.Done():
+       case <-timer:
                logger.Debug("Timeout waiting for raft to shutdown")
                return fmt.Errorf("raft did not shutdown within %s", timeout)
 
@@ -381,6 +400,14 @@ func raftConfig(latency float64) *raft.Config {
        for _, duration := range durations {
                scale(duration)
        }
+
+       // FIXME/TODO: We increase the frequency of snapshots here to keep the
+       //             number of uncompacted raft logs low, and to work around
+       //             slow log replay when the LXD daemon starts (see #4485).
+       //             A proper fix should probably be implemented in dqlite.
+       config.SnapshotThreshold = 64
+       config.TrailingLogs = 128
+
        return config
 }
 
_______________________________________________
lxc-devel mailing list
[email protected]
http://lists.linuxcontainers.org/listinfo/lxc-devel

Reply via email to