The following pull request was submitted through Github.
It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/2345

This e-mail was sent by the LXC bot, direct replies will not reach the author
unless they happen to be subscribed to this list.

=== Description (from pull-request) ===

From efd6bbf0d3632843c45fc4838796b4373e2e7738 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Graber?= <stgra...@ubuntu.com>
Date: Mon, 5 Sep 2016 18:43:26 -0400
Subject: [PATCH] Rework container operation locking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This should fix a number of race conditions around start, stop and shutdown.

Closes #2297

Signed-off-by: Stéphane Graber <stgra...@ubuntu.com>
---
 lxd/container_lxc.go | 216 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 139 insertions(+), 77 deletions(-)

diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index ed9ef60..f28c99a 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -28,9 +28,55 @@ import (
        log "gopkg.in/inconshreveable/log15.v2"
 )
 
-// Global variables
-var lxcStoppingContainersLock sync.Mutex
-var lxcStoppingContainers map[int]*sync.WaitGroup = 
make(map[int]*sync.WaitGroup)
+// Operation locking
+type lxcContainerOperation struct {
+       action   string
+       chanDone chan error
+       err      error
+       id       int
+       timeout  int
+}
+
+func (op *lxcContainerOperation) Create(id int, action string, timeout int) 
*lxcContainerOperation {
+       op.id = id
+       op.action = action
+       op.timeout = timeout
+       op.chanDone = make(chan error, 0)
+
+       if timeout > 1 {
+               go func(op *lxcContainerOperation) {
+                       time.Sleep(time.Second * time.Duration(op.timeout))
+                       op.Done(fmt.Errorf("Container %s operation timed out 
after %d seconds", op.action, op.timeout))
+               }(op)
+       }
+
+       return op
+}
+
+func (op *lxcContainerOperation) Wait() error {
+       <-op.chanDone
+
+       return op.err
+}
+
+func (op *lxcContainerOperation) Done(err error) {
+       lxcContainerOperationsLock.Lock()
+       defer lxcContainerOperationsLock.Unlock()
+
+       // Check if already done
+       runningOp, ok := lxcContainerOperations[op.id]
+       if !ok || runningOp != op {
+               return
+       }
+
+       op.err = err
+       close(op.chanDone)
+
+       delete(lxcContainerOperations, op.id)
+}
+
+var lxcContainerOperationsLock sync.Mutex
+var lxcContainerOperations map[int]*lxcContainerOperation = 
make(map[int]*lxcContainerOperation)
 
 // Helper functions
 func lxcSetConfigItem(c *lxc.Container, key string, value string) error {
@@ -248,6 +294,51 @@ type containerLXC struct {
        storage  storage
 }
 
+func (c *containerLXC) createOperation(action string, timeout int) 
(*lxcContainerOperation, error) {
+       op, _ := c.getOperation("")
+       if op != nil {
+               return nil, fmt.Errorf("Container is already running a %s 
operation", op.action)
+       }
+
+       lxcContainerOperationsLock.Lock()
+       defer lxcContainerOperationsLock.Unlock()
+
+       op = &lxcContainerOperation{}
+       op.Create(c.id, action, timeout)
+       lxcContainerOperations[c.id] = op
+
+       return lxcContainerOperations[c.id], nil
+}
+
+func (c *containerLXC) getOperation(action string) (*lxcContainerOperation, 
error) {
+       lxcContainerOperationsLock.Lock()
+       defer lxcContainerOperationsLock.Unlock()
+
+       op := lxcContainerOperations[c.id]
+
+       if op == nil {
+               return nil, fmt.Errorf("No running %s container operation", 
action)
+       }
+
+       if action != "" && op.action != action {
+               return nil, fmt.Errorf("Container is running a %s operation, 
not a %s operation", op.action, action)
+       }
+
+       return op, nil
+}
+
+func (c *containerLXC) waitOperation() error {
+       op, _ := c.getOperation("")
+       if op != nil {
+               err := op.Wait()
+               if err != nil {
+                       return err
+               }
+       }
+
+       return nil
+}
+
 func (c *containerLXC) init() error {
        // Compute the expanded config and device list
        err := c.expandConfig()
@@ -1262,15 +1353,15 @@ func (c *containerLXC) startCommon() (string, error) {
 }
 
 func (c *containerLXC) Start(stateful bool) error {
-       // Wait for container tear down to finish
-       lxcStoppingContainersLock.Lock()
-       wgStopping, stopping := lxcStoppingContainers[c.id]
-       lxcStoppingContainersLock.Unlock()
-       if stopping {
-               wgStopping.Wait()
+       // Setup a new operation
+       op, err := c.createOperation("start", 30)
+       if err != nil {
+               return err
        }
+       defer op.Done(nil)
 
-       if err := setupSharedMounts(); err != nil {
+       err = setupSharedMounts()
+       if err != nil {
                return fmt.Errorf("Daemon failed to setup shared mounts base: 
%s.\nDoes security.nesting need to be turned on?", err)
        }
 
@@ -1441,35 +1532,14 @@ func (c *containerLXC) OnStart() error {
        return nil
 }
 
-// Container shutdown locking
-func (c *containerLXC) setupStopping() *sync.WaitGroup {
-       // Handle locking
-       lxcStoppingContainersLock.Lock()
-       defer lxcStoppingContainersLock.Unlock()
-
-       // Existing entry
-       wg, stopping := lxcStoppingContainers[c.id]
-       if stopping {
-               return wg
-       }
-
-       // Setup new entry
-       lxcStoppingContainers[c.id] = &sync.WaitGroup{}
-
-       go func(wg *sync.WaitGroup, id int) {
-               wg.Wait()
-
-               lxcStoppingContainersLock.Lock()
-               defer lxcStoppingContainersLock.Unlock()
-
-               delete(lxcStoppingContainers, id)
-       }(lxcStoppingContainers[c.id], c.id)
-
-       return lxcStoppingContainers[c.id]
-}
-
 // Stop functions
 func (c *containerLXC) Stop(stateful bool) error {
+       // Setup a new operation
+       op, err := c.createOperation("stop", 30)
+       if err != nil {
+               return err
+       }
+
        // Handle stateful stop
        if stateful {
                // Cleanup any existing state
@@ -1478,86 +1548,75 @@ func (c *containerLXC) Stop(stateful bool) error {
 
                err := os.MkdirAll(stateDir, 0700)
                if err != nil {
+                       op.Done(err)
                        return err
                }
 
                // Checkpoint
                err = c.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", true, 
false)
                if err != nil {
+                       op.Done(err)
                        return err
                }
 
                c.stateful = true
                err = dbContainerSetStateful(c.daemon.db, c.id, true)
                if err != nil {
+                       op.Done(err)
                        return err
                }
 
+               op.Done(nil)
                return nil
        }
 
        // Load the go-lxc struct
-       err := c.initLXC()
+       err = c.initLXC()
        if err != nil {
+               op.Done(err)
                return err
        }
 
        // Attempt to freeze the container first, helps massively with fork 
bombs
        c.Freeze()
 
-       // Handle locking
-       wg := c.setupStopping()
-
-       // Stop the container
-       wg.Add(1)
        if err := c.c.Stop(); err != nil {
-               wg.Done()
+               op.Done(err)
                return err
        }
 
-       // Mark ourselves as done
-       wg.Done()
-
-       // Wait for any other teardown routines to finish
-       wg.Wait()
-
-       return nil
+       return op.Wait()
 }
 
 func (c *containerLXC) Shutdown(timeout time.Duration) error {
-       // Load the go-lxc struct
-       err := c.initLXC()
+       // Setup a new operation
+       op, err := c.createOperation("shutdown", 30)
        if err != nil {
                return err
        }
 
-       // Handle locking
-       wg := c.setupStopping()
+       // Load the go-lxc struct
+       err = c.initLXC()
+       if err != nil {
+               op.Done(err)
+               return err
+       }
 
-       // Shutdown the container
-       wg.Add(1)
        if err := c.c.Shutdown(timeout); err != nil {
-               wg.Done()
+               op.Done(err)
                return err
        }
 
-       // Mark ourselves as done
-       wg.Done()
-
-       // Wait for any other teardown routines to finish
-       wg.Wait()
-
-       return nil
+       return op.Wait()
 }
 
 func (c *containerLXC) OnStop(target string) error {
-       // Get locking
-       lxcStoppingContainersLock.Lock()
-       wg, stopping := lxcStoppingContainers[c.id]
-       lxcStoppingContainersLock.Unlock()
-       if wg != nil {
-               wg.Add(1)
+       // Get operation
+       op, _ := c.getOperation("")
+       if op != nil && !shared.StringInSlice(op.action, []string{"stop", 
"shutdown"}) {
+               return fmt.Errorf("Container is already running a %s 
operation", op.action)
        }
+       shared.Debugf("stgraber: op = %s", op)
 
        // Make sure we can't call go-lxc functions by mistake
        c.fromHook = true
@@ -1565,7 +1624,10 @@ func (c *containerLXC) OnStop(target string) error {
        // Stop the storage for this container
        err := c.StorageStop()
        if err != nil {
-               wg.Done()
+               if op != nil {
+                       op.Done(err)
+               }
+
                return err
        }
 
@@ -1573,15 +1635,15 @@ func (c *containerLXC) OnStop(target string) error {
        AAUnloadProfile(c)
 
        // FIXME: The go routine can go away once we can rely on LXC_TARGET
-       go func(c *containerLXC, target string, wg *sync.WaitGroup) {
+       go func(c *containerLXC, target string, op *lxcContainerOperation) {
                c.fromHook = false
 
                // Unlock on return
-               if wg != nil {
-                       defer wg.Done()
+               if op != nil {
+                       defer op.Done(nil)
                }
 
-               if target == "unknown" && stopping {
+               if target == "unknown" && op != nil {
                        target = "stop"
                }
 
@@ -1644,7 +1706,7 @@ func (c *containerLXC) OnStop(target string) error {
                if c.ephemeral {
                        c.Delete()
                }
-       }(c, target, wg)
+       }(c, target, op)
 
        return nil
 }
_______________________________________________
lxc-devel mailing list
lxc-devel@lists.linuxcontainers.org
http://lists.linuxcontainers.org/listinfo/lxc-devel

Reply via email to