The following pull request was submitted through GitHub. It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/1754
This e-mail was sent by the LXC bot; direct replies will not reach the author unless they happen to be subscribed to this list.

=== Description (from pull-request) ===

If a container monitor is unresponsive, we may wait forever for responses over the lxc command socket. An easy way to reproduce this is to choose a container monitor process (look for a process like:

    [lxc monitor] /var/lib/lxc/containers containername

) and suspend it with 'kill -STOP'.

So put a one-second timeout around calls to the go-lxc State() function. This leads to lxc list feedback like:

    0 ✓ serge@sl ~ $ lxc list
    +------+-------+------+------+------------+-----------+
    | NAME | STATE | IPV4 | IPV6 |    TYPE    | SNAPSHOTS |
    +------+-------+------+------+------------+-----------+
    | x1   | ERROR |      |      | PERSISTENT | 0         |
    +------+-------+------+------+------------+-----------+
    0 ✓ serge@sl ~ $ lxc info x1
    error: Monitor is hung
    1 ✗ serge@sl ~ $ lxc stop x1
    error: Monitor is hung

If there were thousands of containers with hung monitors, the 1s timeout for each would add up, but this is supposed to be a mitigation for a rare case. If we end up with a lot of hung monitors, we should figure out why and prevent it.

Closes #1752

Signed-off-by: Serge Hallyn <serge.hal...@ubuntu.com>
From 51ec4f40bb5a3cb92674c4a5895b7cd9fb4f2bf5 Mon Sep 17 00:00:00 2001 From: Serge Hallyn <serge.hal...@ubuntu.com> Date: Mon, 14 Mar 2016 18:52:01 -0700 Subject: [PATCH] Handle unresponsive container monitors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a container monitor is unresponsive, we may wait forever for responses over the lxc command socket. An easy way to reproduce this is to choose a container monitor process (look for a process like: [lxc monitor] /var/lib/lxc/containers containername ) and suspend it with 'kill -STOP'. So put a one-second timeout around calls to the go-lxc State() function. This leads to lxc list feedback like: 0 ✓ serge@sl ~ $ lxc list +------+-------+------+------+------------+-----------+ | NAME | STATE | IPV4 | IPV6 | TYPE | SNAPSHOTS | +------+-------+------+------+------------+-----------+ | x1 | ERROR | | | PERSISTENT | 0 | +------+-------+------+------+------------+-----------+ 0 ✓ serge@sl ~ $ lxc info x1 error: Monitor is hung 1 ✗ serge@sl ~ $ lxc stop x1 error: Monitor is hung If there were thousands of containers with hung monitors the 1s each would add up, but this is supposed to be a mitigation for a rare case. If we end up with a lot of hung monitors we should figure out why and prevent it. 
Closes #1752 Signed-off-by: Serge Hallyn <serge.hal...@ubuntu.com> --- lxc/list.go | 19 +++++++++++++++---- lxd/container_lxc.go | 38 +++++++++++++++++++++++++++++++++++--- lxd/containers_get.go | 8 +++++++- shared/status.go | 3 +++ 4 files changed, 60 insertions(+), 8 deletions(-) diff --git a/lxc/list.go b/lxc/list.go index 8b24d4b..b0a7f79 100644 --- a/lxc/list.go +++ b/lxc/list.go @@ -230,7 +230,7 @@ func (c *listCmd) listContainers(d *lxd.Client, cinfos []shared.ContainerInfo, f } for _, column := range columns { - if column.NeedsState && cInfo.StatusCode != shared.Stopped { + if column.NeedsState && cIsActive(cInfo) { _, ok := cStates[cInfo.Name] if ok { continue @@ -367,8 +367,19 @@ func (c *listCmd) statusColumnData(cInfo shared.ContainerInfo, cState *shared.Co return strings.ToUpper(cInfo.Status) } +func cIsActive(cInfo shared.ContainerInfo) bool { + switch cInfo.StatusCode { + case shared.Stopped: + return false + case shared.Error: + return false + default: + return true + } +} + func (c *listCmd) IP4ColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string { - if cInfo.StatusCode != shared.Stopped { + if cIsActive(cInfo) { ipv4s := []string{} for netName, net := range cState.Network { if net.Type == "loopback" { @@ -392,7 +403,7 @@ func (c *listCmd) IP4ColumnData(cInfo shared.ContainerInfo, cState *shared.Conta } func (c *listCmd) IP6ColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string { - if cInfo.StatusCode != shared.Stopped { + if cIsActive(cInfo) { ipv6s := []string{} for netName, net := range cState.Network { if net.Type == "loopback" { @@ -428,7 +439,7 @@ func (c *listCmd) numberSnapshotsColumnData(cInfo shared.ContainerInfo, cState * } func (c *listCmd) PIDColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string { - if cInfo.StatusCode != shared.Stopped { + if cIsActive(cInfo) { return 
fmt.Sprintf("%d", cState.Pid) } diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go index 3e3ca99..e63413c 100644 --- a/lxd/container_lxc.go +++ b/lxd/container_lxc.go @@ -1482,6 +1482,25 @@ func (c *containerLXC) Unfreeze() error { return c.c.Unfreeze() } +var LxcMonitorStateError = fmt.Errorf("Monitor is hung") + +// Get lxc container state, with 1 second timeout +// If we don't get a reply, assume the lxc monitor is hung +func (c *containerLXC) GetLxcState() (lxc.State, error) { + monitor := make(chan lxc.State, 1) + + go func(c *lxc.Container) { + monitor <- c.State() + }(c.c) + + select { + case state := <-monitor: + return state, nil + case <-time.After(time.Second): + return lxc.StateMap["FROZEN"], LxcMonitorStateError + } +} + func (c *containerLXC) Render() (interface{}, error) { // Load the go-lxc struct err := c.initLXC() @@ -1507,7 +1526,11 @@ func (c *containerLXC) Render() (interface{}, error) { }, nil } else { // FIXME: Render shouldn't directly access the go-lxc struct - statusCode := shared.FromLXCState(int(c.c.State())) + cState, err := c.GetLxcState() + if err != nil { + return nil, err + } + statusCode := shared.FromLXCState(int(cState)) return &shared.ContainerInfo{ Architecture: architectureName, @@ -1534,7 +1557,11 @@ func (c *containerLXC) RenderState() (*shared.ContainerState, error) { } // FIXME: RenderState shouldn't directly access the go-lxc struct - statusCode := shared.FromLXCState(int(c.c.State())) + cState, err := c.GetLxcState() + if err != nil { + return nil, err + } + statusCode := shared.FromLXCState(int(cState)) status := shared.ContainerState{ Status: statusCode.String(), StatusCode: statusCode, @@ -4245,7 +4272,12 @@ func (c *containerLXC) State() string { return "BROKEN" } - return c.c.State().String() + cString := "Error" + state, err := c.GetLxcState() + if err == nil { + cString = state.String() + } + return cString } // Various container paths diff --git a/lxd/containers_get.go b/lxd/containers_get.go index 
fb00956..082979d 100644 --- a/lxd/containers_get.go +++ b/lxd/containers_get.go @@ -67,7 +67,13 @@ func doContainerGet(d *Daemon, cname string) (*shared.ContainerInfo, Response) { } cts, err := c.Render() - if err != nil { + if err == LxcMonitorStateError { + return &shared.ContainerInfo{ + Name: cname, + Status: "Error", + StatusCode: 112, + }, nil + } else if err != nil { return nil, SmartError(err) } diff --git a/shared/status.go b/shared/status.go index 35bff80..96010c4 100644 --- a/shared/status.go +++ b/shared/status.go @@ -15,6 +15,7 @@ const ( Freezing StatusCode = 109 Frozen StatusCode = 110 Thawed StatusCode = 111 + Error StatusCode = 112 Success StatusCode = 200 @@ -39,6 +40,7 @@ func (o StatusCode) String() string { Freezing: "Freezing", Frozen: "Frozen", Thawed: "Thawed", + Error: "Error", }[o] } @@ -61,5 +63,6 @@ func FromLXCState(state int) StatusCode { 6: Freezing, 7: Frozen, 8: Thawed, + 9: Error, }[state] }
_______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel