The following pull request was submitted through GitHub. It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/1678
This e-mail was sent by the LXC bot; direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) === This makes it possible to have the container save its state at stop time, then restore its state on restart. The feature is mostly interesting as a way to do a "suspend to disk" kind of equivalent where there is a guarantee that no work will be done after the tasks are dumped to disk. Expected use of the feature is to stop containers when more important containers need the memory resources as well as a way to do a quick host reboot without losing running state. This branch requires the client to specifically ask for state to be saved and restored at both stop and start time. The command line client is set so that state isn't captured on stop by default (requires --stateful) but is restored automatically on start (unless --stateless is passed). Once checkpoint/restore has proved to be reliable, we should probably introduce a server option, or a container option to have this be used on host reboot in place of a standard container shutdown. Closes #1558 Signed-off-by: Stéphane Graber <stgra...@ubuntu.com>
From 14cbb2cd34d2dd27d8c0d9784720d311fab90ad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= <stgra...@ubuntu.com> Date: Sat, 27 Feb 2016 01:30:02 -0500 Subject: [PATCH] Implement stateful container stop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes it possible to have the container save its state at stop time, then restore its state on restart. The feature is mostly interesting as a way to do a "suspend to disk" kind of equivalent where there is a guarantee that no work will be done after the tasks are dumped to disk. Expected use of the feature is to stop containers when more important containers need the memory resources as well as a way to do a quick host reboot without losing running state. This branch requires the client to specifically ask for state to be saved and restored at both stop and start time. The command line client is set so that state isn't captured on stop by default (requires --stateful) but is restored automatically on start (unless --stateless is passed). Once checkpoint/restore has proved to be reliable, we should probably introduce a server option, or a container option to have this be used on host reboot in place of a standard container shutdown. 
Closes #1558 Signed-off-by: Stéphane Graber <stgra...@ubuntu.com> --- client.go | 14 +++++++++--- lxc/action.go | 18 ++++++++++++++- lxc/delete.go | 2 +- lxc/launch.go | 2 +- lxc/main.go | 8 +++---- lxc/publish.go | 4 ++-- lxd/container.go | 4 ++-- lxd/container_lxc.go | 61 +++++++++++++++++++++++++++++++++++++++++++++----- lxd/container_state.go | 38 ++++++++++++++++++++++--------- lxd/containers.go | 4 ++-- specs/rest-api.md | 3 ++- 11 files changed, 125 insertions(+), 33 deletions(-) diff --git a/client.go b/client.go index c5b1468..664efde 100644 --- a/client.go +++ b/client.go @@ -1435,15 +1435,23 @@ func (c *Client) Exec(name string, cmd []string, env map[string]string, return op.Metadata.GetInt("return") } -func (c *Client) Action(name string, action shared.ContainerAction, timeout int, force bool) (*Response, error) { +func (c *Client) Action(name string, action shared.ContainerAction, timeout int, force bool, stateful bool) (*Response, error) { + body := shared.Jmap{ + "action": action, + "timeout": timeout, + "force": force} + if action == "start" { current, err := c.ContainerState(name) if err == nil && current.StatusCode == shared.Frozen { - action = "unfreeze" + body["action"] = "unfreeze" } } - body := shared.Jmap{"action": action, "timeout": timeout, "force": force} + if shared.StringInSlice(string(action), []string{"start", "stop"}) { + body["stateful"] = stateful + } + return c.put(fmt.Sprintf("containers/%s/state", name), body, Async) } diff --git a/lxc/action.go b/lxc/action.go index f358da4..4d35266 100644 --- a/lxc/action.go +++ b/lxc/action.go @@ -16,6 +16,8 @@ type actionCmd struct { name string timeout int force bool + stateful bool + stateless bool } func (c *actionCmd) showByDefault() bool { @@ -33,6 +35,8 @@ func (c *actionCmd) flags() { if c.hasTimeout { gnuflag.IntVar(&c.timeout, "timeout", -1, i18n.G("Time to wait for the container before killing it.")) gnuflag.BoolVar(&c.force, "force", false, i18n.G("Force the container to 
shutdown.")) + gnuflag.BoolVar(&c.stateful, "stateful", false, i18n.G("Store the container state (only for stop).")) + gnuflag.BoolVar(&c.stateless, "stateless", false, i18n.G("Ignore the container state (only forstart).")) } } @@ -41,6 +45,18 @@ func (c *actionCmd) run(config *lxd.Config, args []string) error { return errArgs } + state := false + + // Never store state unless asked to + if c.action == "start" && !c.stateless { + state = true + } + + // Always restore state (if present) unless asked not to + if c.action == "stop" && c.stateful { + state = true + } + for _, nameArg := range args { remote, name := config.ParseRemoteAndContainer(nameArg) d, err := lxd.NewClient(config, remote) @@ -48,7 +64,7 @@ func (c *actionCmd) run(config *lxd.Config, args []string) error { return err } - resp, err := d.Action(name, c.action, c.timeout, c.force) + resp, err := d.Action(name, c.action, c.timeout, c.force, state) if err != nil { return err } diff --git a/lxc/delete.go b/lxc/delete.go index 716832b..afa3d7d 100644 --- a/lxc/delete.go +++ b/lxc/delete.go @@ -92,7 +92,7 @@ func (c *deleteCmd) run(config *lxd.Config, args []string) error { return fmt.Errorf(i18n.G("The container is currently running, stop it first or pass --force.")) } - resp, err := d.Action(name, shared.Stop, -1, true) + resp, err := d.Action(name, shared.Stop, -1, true, false) if err != nil { return err } diff --git a/lxc/launch.go b/lxc/launch.go index e2c9bd6..c065872 100644 --- a/lxc/launch.go +++ b/lxc/launch.go @@ -120,7 +120,7 @@ func (c *launchCmd) run(config *lxd.Config, args []string) error { } fmt.Printf(i18n.G("Starting %s")+"\n", name) - resp, err = d.Action(name, shared.Start, -1, false) + resp, err = d.Action(name, shared.Start, -1, false, false) if err != nil { return err } diff --git a/lxc/main.go b/lxc/main.go index 88845f5..a25cafe 100644 --- a/lxc/main.go +++ b/lxc/main.go @@ -182,15 +182,15 @@ var commands = map[string]command{ "list": &listCmd{}, "monitor": &monitorCmd{}, "move": 
&moveCmd{}, - "pause": &actionCmd{shared.Freeze, false, false, "pause", -1, false}, + "pause": &actionCmd{shared.Freeze, false, false, "pause", -1, false, false, false}, "profile": &profileCmd{}, "publish": &publishCmd{}, "remote": &remoteCmd{}, - "restart": &actionCmd{shared.Restart, true, true, "restart", -1, false}, + "restart": &actionCmd{shared.Restart, true, true, "restart", -1, false, false, false}, "restore": &restoreCmd{}, "snapshot": &snapshotCmd{}, - "start": &actionCmd{shared.Start, false, true, "start", -1, false}, - "stop": &actionCmd{shared.Stop, true, true, "stop", -1, false}, + "start": &actionCmd{shared.Start, false, true, "start", -1, false, false, false}, + "stop": &actionCmd{shared.Stop, true, true, "stop", -1, false, false, false}, "version": &versionCmd{}, } diff --git a/lxc/publish.go b/lxc/publish.go index 8ccd663..690dfdf 100644 --- a/lxc/publish.go +++ b/lxc/publish.go @@ -97,7 +97,7 @@ func (c *publishCmd) run(config *lxd.Config, args []string) error { } } - resp, err := s.Action(cName, shared.Stop, -1, true) + resp, err := s.Action(cName, shared.Stop, -1, true, false) if err != nil { return err } @@ -110,7 +110,7 @@ func (c *publishCmd) run(config *lxd.Config, args []string) error { if op.StatusCode == shared.Failure { return fmt.Errorf(i18n.G("Stopping container failed!")) } - defer s.Action(cName, shared.Start, -1, true) + defer s.Action(cName, shared.Start, -1, true, false) if wasEphemeral { ct.Ephemeral = true diff --git a/lxd/container.go b/lxd/container.go index 0abfbd6..e5ab8ec 100644 --- a/lxd/container.go +++ b/lxd/container.go @@ -311,8 +311,8 @@ type container interface { // Container actions Freeze() error Shutdown(timeout time.Duration) error - Start() error - Stop() error + Start(stateful bool) error + Stop(stateful bool) error Unfreeze() error // Snapshots & migration diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go index b52738c..b5b8438 100644 --- a/lxd/container_lxc.go +++ b/lxd/container_lxc.go @@ -1070,7 
+1070,7 @@ func (c *containerLXC) startCommon() (string, error) { return configPath, nil } -func (c *containerLXC) Start() error { +func (c *containerLXC) Start(stateful bool) error { // Wait for container tear down to finish wgStopping, stopping := lxcStoppingContainers[c.id] if stopping { @@ -1083,6 +1083,25 @@ func (c *containerLXC) Start() error { return err } + // If stateful, restore now + if stateful && shared.PathExists(c.StatePath()) { + err := c.c.Restore(lxc.RestoreOptions{ + Directory: c.StatePath(), + Verbose: true, + }) + + err2 := os.RemoveAll(c.StatePath()) + if err2 != nil { + return err2 + } + + if err != nil { + return err + } + + return nil + } + // Start the LXC container out, err := exec.Command( c.daemon.execPath, @@ -1232,7 +1251,33 @@ func (c *containerLXC) setupStopping() *sync.WaitGroup { } // Stop functions -func (c *containerLXC) Stop() error { +func (c *containerLXC) Stop(stateful bool) error { + // Handle stateful stop + if stateful { + // Cleanup any existing state + stateDir := c.StatePath() + os.RemoveAll(stateDir) + + err := os.MkdirAll(stateDir, 0700) + if err != nil { + return err + } + + // Checkpoint + opts := lxc.CheckpointOptions{Directory: stateDir, Stop: true, Verbose: true} + err = c.Checkpoint(opts) + err2 := CollectCRIULogFile(c, stateDir, "snapshot", "dump") + if err2 != nil { + shared.Log.Warn("failed to collect criu log file", log.Ctx{"error": err2}) + } + + if err != nil { + return err + } + + return nil + } + // Load the go-lxc struct err := c.initLXC() if err != nil { @@ -1351,7 +1396,7 @@ func (c *containerLXC) OnStop(target string) error { // Reboot the container if target == "reboot" { - c.Start() + c.Start(false) return } @@ -1474,7 +1519,7 @@ func (c *containerLXC) Restore(sourceContainer container) error { wasRunning := false if c.IsRunning() { wasRunning = true - if err := c.Stop(); err != nil { + if err := c.Stop(false); err != nil { shared.Log.Error( "Could not stop container", log.Ctx{ @@ -1528,12 
+1573,16 @@ func (c *containerLXC) Restore(sourceContainer container) error { shared.Log.Error("failed to delete snapshot state", "path", c.StatePath(), "err", err2) } - return err + if err != nil { + return err + } + + return nil } // Restart the container if wasRunning { - return c.Start() + return c.Start(false) } return nil diff --git a/lxd/container_state.go b/lxd/container_state.go index 9446617..5e4ced7 100644 --- a/lxd/container_state.go +++ b/lxd/container_state.go @@ -7,13 +7,15 @@ import ( "time" "github.com/gorilla/mux" + "github.com/lxc/lxd/shared" ) type containerStatePutReq struct { - Action string `json:"action"` - Timeout int `json:"timeout"` - Force bool `json:"force"` + Action string `json:"action"` + Timeout int `json:"timeout"` + Force bool `json:"force"` + Stateful bool `json:"stateful"` } func containerState(d *Daemon, r *http.Request) Response { @@ -53,15 +55,25 @@ func containerStatePut(d *Daemon, r *http.Request) Response { switch shared.ContainerAction(raw.Action) { case shared.Start: do = func(op *operation) error { - if err = c.Start(); err != nil { + if err = c.Start(raw.Stateful); err != nil { return err } return nil } case shared.Stop: - if raw.Timeout == 0 || raw.Force { + if raw.Stateful { do = func(op *operation) error { - if err = c.Stop(); err != nil { + err := c.Stop(raw.Stateful) + if err != nil { + return err + } + + return nil + } + } else if raw.Timeout == 0 || raw.Force { + do = func(op *operation) error { + err = c.Stop(false) + if err != nil { return err } @@ -73,30 +85,36 @@ func containerStatePut(d *Daemon, r *http.Request) Response { } } else { do = func(op *operation) error { - if err = c.Shutdown(time.Duration(raw.Timeout) * time.Second); err != nil { + err = c.Shutdown(time.Duration(raw.Timeout) * time.Second) + if err != nil { return err } if c.IsEphemeral() { c.Delete() } + return nil } } case shared.Restart: do = func(op *operation) error { if raw.Timeout == 0 || raw.Force { - if err = c.Stop(); err != nil { + 
err = c.Stop(false) + if err != nil { return err } } else { - if err = c.Shutdown(time.Duration(raw.Timeout) * time.Second); err != nil { + err = c.Shutdown(time.Duration(raw.Timeout) * time.Second) + if err != nil { return err } } - if err = c.Start(); err != nil { + err = c.Start(false) + if err != nil { return err } + return nil } case shared.Freeze: diff --git a/lxd/containers.go b/lxd/containers.go index 520bb58..6a02273 100644 --- a/lxd/containers.go +++ b/lxd/containers.go @@ -114,7 +114,7 @@ func containersRestart(d *Daemon) error { continue } - c.Start() + c.Start(false) autoStartDelayInt, err := strconv.Atoi(autoStartDelay) if err == nil { @@ -155,7 +155,7 @@ func containersShutdown(d *Daemon) error { wg.Add(1) go func() { c.Shutdown(time.Second * 30) - c.Stop() + c.Stop(false) wg.Done() }() } diff --git a/specs/rest-api.md b/specs/rest-api.md index 09fca34..9dbacf9 100644 --- a/specs/rest-api.md +++ b/specs/rest-api.md @@ -754,7 +754,8 @@ Input: { "action": "stop", # State change action (stop, start, restart, freeze or unfreeze) "timeout": 30, # A timeout after which the state change is considered as failed - "force": true # Force the state change (currently only valid for stop and restart where it means killing the container) + "force": true, # Force the state change (currently only valid for stop and restart where it means killing the container) + "stateful": true # Whether to store or restore runtime state before stopping or startiong (only valid for stop and start, defaults to false) } ## /1.0/containers/\<name\>/files
_______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel