The following pull request was submitted through GitHub.
It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/1754

This e-mail was sent by the LXC bot; direct replies will not reach the author
unless they happen to be subscribed to this list.

=== Description (from pull-request) ===
If a container monitor is unresponsive, we may wait forever for
responses over the lxc command socket.

An easy way to reproduce this is to choose a container monitor
process (look for a process like:
	[lxc monitor] /var/lib/lxc/containers containername
) and suspend it with 'kill -STOP'.

So put a one-second timeout around calls to the go-lxc State()
function.  This leads to lxc list feedback like:

0 ✓ serge@sl ~ $ lxc list
+------+-------+------+------+------------+-----------+
| NAME | STATE | IPV4 | IPV6 |    TYPE    | SNAPSHOTS |
+------+-------+------+------+------------+-----------+
| x1   | ERROR |      |      | PERSISTENT | 0         |
+------+-------+------+------+------------+-----------+
0 ✓ serge@sl ~ $ lxc info x1
error: Monitor is hung
1 ✗ serge@sl ~ $ lxc stop x1
error: Monitor is hung

If there were thousands of containers with hung monitors, the 1s
timeout for each would add up, but this is supposed to be a mitigation
for a rare case.  If we end up with a lot of hung monitors we should
figure out why and prevent it.

Closes #1752

Signed-off-by: Serge Hallyn <serge.hal...@ubuntu.com>
From 51ec4f40bb5a3cb92674c4a5895b7cd9fb4f2bf5 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serge.hal...@ubuntu.com>
Date: Mon, 14 Mar 2016 18:52:01 -0700
Subject: [PATCH] Handle unresponsive container monitors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a container monitor is unresponsive, we may wait forever for
responses over the lxc command socket.

An easy way to reproduce this is to choose a container monitor
process (look for a process like:
        [lxc monitor] /var/lib/lxc/containers containername
) and suspend it with 'kill -STOP'.

So put a one-second timeout around calls to the go-lxc State()
function.  This leads to lxc list feedback like:

0 ✓ serge@sl ~ $ lxc list
+------+-------+------+------+------------+-----------+
| NAME | STATE | IPV4 | IPV6 |    TYPE    | SNAPSHOTS |
+------+-------+------+------+------------+-----------+
| x1   | ERROR |      |      | PERSISTENT | 0         |
+------+-------+------+------+------------+-----------+
0 ✓ serge@sl ~ $ lxc info x1
error: Monitor is hung
1 ✗ serge@sl ~ $ lxc stop x1
error: Monitor is hung

If there were thousands of containers with hung monitors the 1s
each would add up, but this is supposed to be a mitigation for a
rare case.  If we end up with a lot of hung monitors we should
figure out why and prevent it.

Closes #1752

Signed-off-by: Serge Hallyn <serge.hal...@ubuntu.com>
---
 lxc/list.go           | 19 +++++++++++++++----
 lxd/container_lxc.go  | 38 +++++++++++++++++++++++++++++++++++---
 lxd/containers_get.go |  8 +++++++-
 shared/status.go      |  3 +++
 4 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/lxc/list.go b/lxc/list.go
index 8b24d4b..b0a7f79 100644
--- a/lxc/list.go
+++ b/lxc/list.go
@@ -230,7 +230,7 @@ func (c *listCmd) listContainers(d *lxd.Client, cinfos []shared.ContainerInfo, f
                }
 
                for _, column := range columns {
-                       if column.NeedsState && cInfo.StatusCode != shared.Stopped {
+                       if column.NeedsState && cIsActive(cInfo) {
                                _, ok := cStates[cInfo.Name]
                                if ok {
                                        continue
@@ -367,8 +367,19 @@ func (c *listCmd) statusColumnData(cInfo shared.ContainerInfo, cState *shared.Co
        return strings.ToUpper(cInfo.Status)
 }
 
+func cIsActive(cInfo shared.ContainerInfo) bool {
+       switch cInfo.StatusCode {
+       case shared.Stopped:
+               return false
+       case shared.Error:
+               return false
+       default:
+               return true
+       }
+}
+
 func (c *listCmd) IP4ColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string {
-       if cInfo.StatusCode != shared.Stopped {
+       if cIsActive(cInfo) {
                ipv4s := []string{}
                for netName, net := range cState.Network {
                        if net.Type == "loopback" {
@@ -392,7 +403,7 @@ func (c *listCmd) IP4ColumnData(cInfo shared.ContainerInfo, cState *shared.Conta
 }
 
 func (c *listCmd) IP6ColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string {
-       if cInfo.StatusCode != shared.Stopped {
+       if cIsActive(cInfo) {
                ipv6s := []string{}
                for netName, net := range cState.Network {
                        if net.Type == "loopback" {
@@ -428,7 +439,7 @@ func (c *listCmd) numberSnapshotsColumnData(cInfo shared.ContainerInfo, cState *
 }
 
 func (c *listCmd) PIDColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string {
-       if cInfo.StatusCode != shared.Stopped {
+       if cIsActive(cInfo) {
                return fmt.Sprintf("%d", cState.Pid)
        }
 
diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index 3e3ca99..e63413c 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -1482,6 +1482,25 @@ func (c *containerLXC) Unfreeze() error {
        return c.c.Unfreeze()
 }
 
+var LxcMonitorStateError = fmt.Errorf("Monitor is hung")
+
+// Get lxc container state, with 1 second timeout
+// If we don't get a reply, assume the lxc monitor is hung
+func (c *containerLXC) GetLxcState() (lxc.State, error) {
+       monitor := make(chan lxc.State, 1)
+
+       go func(c *lxc.Container) {
+               monitor <- c.State()
+       }(c.c)
+
+       select {
+       case state := <-monitor:
+               return state, nil
+       case <-time.After(time.Second):
+               return lxc.StateMap["FROZEN"], LxcMonitorStateError
+       }
+}
+
 func (c *containerLXC) Render() (interface{}, error) {
        // Load the go-lxc struct
        err := c.initLXC()
@@ -1507,7 +1526,11 @@ func (c *containerLXC) Render() (interface{}, error) {
                }, nil
        } else {
                // FIXME: Render shouldn't directly access the go-lxc struct
-               statusCode := shared.FromLXCState(int(c.c.State()))
+               cState, err := c.GetLxcState()
+               if err != nil {
+                       return nil, err
+               }
+               statusCode := shared.FromLXCState(int(cState))
 
                return &shared.ContainerInfo{
                        Architecture:    architectureName,
@@ -1534,7 +1557,11 @@ func (c *containerLXC) RenderState() (*shared.ContainerState, error) {
        }
 
        // FIXME: RenderState shouldn't directly access the go-lxc struct
-       statusCode := shared.FromLXCState(int(c.c.State()))
+       cState, err := c.GetLxcState()
+       if err != nil {
+               return nil, err
+       }
+       statusCode := shared.FromLXCState(int(cState))
        status := shared.ContainerState{
                Status:     statusCode.String(),
                StatusCode: statusCode,
@@ -4245,7 +4272,12 @@ func (c *containerLXC) State() string {
                return "BROKEN"
        }
 
-       return c.c.State().String()
+       cString := "Error"
+       state, err := c.GetLxcState()
+       if err == nil {
+               cString = state.String()
+       }
+       return cString
 }
 
 // Various container paths
diff --git a/lxd/containers_get.go b/lxd/containers_get.go
index fb00956..082979d 100644
--- a/lxd/containers_get.go
+++ b/lxd/containers_get.go
@@ -67,7 +67,13 @@ func doContainerGet(d *Daemon, cname string) (*shared.ContainerInfo, Response) {
        }
 
        cts, err := c.Render()
-       if err != nil {
+       if err == LxcMonitorStateError {
+               return &shared.ContainerInfo{
+                       Name:       cname,
+                       Status:     "Error",
+                       StatusCode: 112,
+               }, nil
+       } else if err != nil {
                return nil, SmartError(err)
        }
 
diff --git a/shared/status.go b/shared/status.go
index 35bff80..96010c4 100644
--- a/shared/status.go
+++ b/shared/status.go
@@ -15,6 +15,7 @@ const (
        Freezing         StatusCode = 109
        Frozen           StatusCode = 110
        Thawed           StatusCode = 111
+       Error            StatusCode = 112
 
        Success StatusCode = 200
 
@@ -39,6 +40,7 @@ func (o StatusCode) String() string {
                Freezing:         "Freezing",
                Frozen:           "Frozen",
                Thawed:           "Thawed",
+               Error:            "Error",
        }[o]
 }
 
@@ -61,5 +63,6 @@ func FromLXCState(state int) StatusCode {
                6: Freezing,
                7: Frozen,
                8: Thawed,
+               9: Error,
        }[state]
 }
_______________________________________________
lxc-devel mailing list
lxc-devel@lists.linuxcontainers.org
http://lists.linuxcontainers.org/listinfo/lxc-devel

Reply via email to