1. commonize waitpid users to use a single helper.  We frequently want
to run something in a clean namespace, or fork off a script.  This
lets us keep the functions doing fork(2)/exec(3)/waitpid(2) simpler.

2. start a blockdev backend implementation.  This will be used for
mounting, copying, and snapshotting container filesystems.

3. implement lvm, directory, and overlayfs backends.

4. For overlayfs, support a new lxc.rootfs format of
'bdevtype:<extra>'.  This means you can now use overlayfs-based
containers without using lxc-start-ephemeral, by using
lxc.rootfs = overlayfs:/readonly-dir:writeable-dir

5. add a set of simple clone testcases

The testcase shows how to use this.  There are two types of clones: copy
and snapshot.  Right now you can create a copy clone from lvm->lvm and dir->dir,
and a snapshot clone from lvm->lvm, dir->overlayfs and overlayfs->overlayfs.
Note that this means you can now use the api to do incremental image
development, as is done manually by docker:

        // create original container, directory based
        c1 = lxc_container_new("c1", NULL);
        c1->save_config(c1, NULL);
        c1->createl(c1, "ubuntu", NULL);
        c1->load_config(c1, NULL);

        // start it, log in and make some changes
        c1->want_daemonize(c1);
        c1->startl(c1, 0, NULL, NULL);
        // log in and do stuff, shut it down

        // create overlayfs clone
        c2 = c1->clone(c1, "c2", NULL, LXC_CLONE_SNAPSHOT, "overlayfs",
                NULL, 0);
        lxc_container_put(c1);
        // start it, log in make some changes
        c2->want_daemonize(c2);
        c2->startl(c2, 0, NULL, NULL);
        // log in and do stuff, shut it down

        c3 = c2->clone(c2, "c3", NULL, LXC_CLONE_SNAPSHOT, "overlayfs",
                NULL, 0);
        lxc_container_put(c2);
        // etc

c2 mounts c1's rootfs overlayed with /var/lib/lxc/c2/delta0.  When c3 is
created, c2's delta0 is rsync'ed to c3's, and c3 mounts c1's rootfs
overlayed with its rsynced /var/lib/lxc/c3/delta0.

Once Stéphane implements :) python bindings, lxc-clone can be switched
to python using this implementation.

Still to do (there's more, but off the top of my head):

1. support btrfs, zfs, aufs
2. have clone handle other mount entries (right now it only clones
the rootfs)
3. bindings
4. re-write lxc-clone
5. add lxc.numsnapshots - in the above case, c1 should not be
destroyable until all its clones are gone
6. Move bdev to its own directory (src/bdev) with one backing store
per file
7. Consider using fewer execs (for lvcreate etc)

Signed-off-by: Serge Hallyn <serge.hal...@ubuntu.com>
---
 src/lxc/Makefile.am    |   2 +
 src/lxc/bdev.c         | 945 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/lxc/bdev.h         |  50 +++
 src/lxc/conf.c         |  11 +-
 src/lxc/lxccontainer.c | 551 +++++++++++++++++++++++++---
 src/lxc/lxccontainer.h |  38 ++
 src/lxc/utils.c        |  20 ++
 src/lxc/utils.h        |   5 +
 src/tests/Makefile.am  |   4 +-
 src/tests/clonetest.c  | 178 ++++++++++
 10 files changed, 1756 insertions(+), 48 deletions(-)
 create mode 100644 src/lxc/bdev.c
 create mode 100644 src/lxc/bdev.h
 create mode 100644 src/tests/clonetest.c

diff --git a/src/lxc/Makefile.am b/src/lxc/Makefile.am
index ebeca466..cc2f163 100644
--- a/src/lxc/Makefile.am
+++ b/src/lxc/Makefile.am
@@ -16,6 +16,7 @@ pkginclude_HEADERS = \
                attach.h \
                lxccontainer.h \
                lxclock.h \
+               bdev.h \
                version.h
 
 if IS_BIONIC
@@ -36,6 +37,7 @@ so_PROGRAMS = liblxc.so
 
 liblxc_so_SOURCES = \
        arguments.c arguments.h \
+       bdev.c bdev.h \
        commands.c commands.h \
        start.c start.h \
        stop.c \
diff --git a/src/lxc/bdev.c b/src/lxc/bdev.c
new file mode 100644
index 0000000..7f109af
--- /dev/null
+++ b/src/lxc/bdev.c
@@ -0,0 +1,945 @@
+/*
+ * lxc: linux Container library
+ *
+ * (C) Copyright IBM Corp. 2007, 2008
+ *
+ * Authors:
+ * Daniel Lezcano <daniel.lezcano at free.fr>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * this is all just a first shot for experiment.  If we go this route, much
+ * should change.  bdev should be a directory with per-bdev file.  Things which
+ * I'm doing by calling out to userspace should sometimes be done through
+ * libraries like liblvm2
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sched.h>
+#include <sys/mount.h>
+#include <sys/wait.h>
+#include "lxc.h"
+#include "config.h"
+#include "conf.h"
+#include "bdev.h"
+#include "log.h"
+#include "error.h"
+#include "utils.h"
+#include "namespace.h"
+#include "parse.h"
+
+lxc_log_define(bdev, lxc);
+
+/* Define unshare() if missing from the C library (HAVE_UNSHARE unset) */
+/* this is also in attach.c and lxccontainer.c: commonize it in utils.c */
+#ifndef HAVE_UNSHARE
+static int unshare(int flags)
+{
+#ifdef __NR_unshare
+return syscall(__NR_unshare, flags); /* invoke the raw syscall directly */
+#else
+errno = ENOSYS; /* no syscall number known: report "not implemented" */
+return -1;
+#endif /* __NR_unshare */
+}
+#endif /* !HAVE_UNSHARE */
+
+static int do_rsync(const char *src, const char *dest)
+{
+       // Fork an "rsync -a" child copying src into dest; returns wait_for_pid()'s result.
+       pid_t pid;
+       char *s;
+       size_t l;
+
+       pid = fork();
+       if (pid < 0)
+               return -1;
+       if (pid > 0)
+               return wait_for_pid(pid);
+       // child: append '/' so rsync copies the contents of src, not src itself
+       l = strlen(src) + 2;
+       s = malloc(l);
+       if (!s)
+               exit(1);
+       snprintf(s, l, "%s/", src);
+       execlp("rsync", "rsync", "-a", s, dest, (char *)NULL);
+       // execlp() returns only on failure: exit so the child never resumes the caller
+       exit(1);
+}
+
+static int blk_getsize(const char *path, unsigned long *size)
+{
+       int fd, ret;
+       // Store the byte size of block device path in *size; returns the ioctl result.
+       fd = open(path, O_RDONLY);
+       if (fd < 0) // open() fails with -1; 0 is a valid descriptor
+               return -1;
+       ret = ioctl(fd, BLKGETSIZE64, size);
+       close(fd);
+       return ret;
+}
+
+/*
+ * These are copied from conf.c.  However as conf.c will be moved to using
+ * the callback system, they can be pulled from there eventually, so we
+ * don't need to pollute utils.c with these low level functions
+ */
+static int find_fstype_cb(char* buffer, void *data)
+{
+       struct cbarg { /* same layout as the cbarg built in mount_unknow_fs() */
+               const char *rootfs;
+               const char *target;
+               int mntopt;
+       } *cbarg = data;
+       /* buffer is one line of a filesystems list; the type it names is tried below */
+       char *fstype;
+
+       /* we don't try 'nodev' entries */
+       if (strstr(buffer, "nodev"))
+               return 0;
+       /* NOTE(review): the *_gc helpers presumably trim surrounding whitespace - confirm */
+       fstype = buffer;
+       fstype += lxc_char_left_gc(fstype, strlen(fstype));
+       fstype[lxc_char_right_gc(fstype, strlen(fstype))] = '\0';
+
+       DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
+             cbarg->rootfs, cbarg->target, fstype);
+
+       if (mount(cbarg->rootfs, cbarg->target, fstype, cbarg->mntopt, NULL)) {
+               DEBUG("mount failed with error: %s", strerror(errno));
+               return 0;
+       }
+       /* mounted: return 1 (0 above means "this type did not work") */
+       INFO("mounted '%s' on '%s', with fstype '%s'",
+            cbarg->rootfs, cbarg->target, fstype);
+
+       return 1;
+}
+
+static int mount_unknow_fs(const char *rootfs, const char *target, int mntopt)
+{
+       /*
+        * Brute-force the filesystem type: consult /etc/filesystems first
+        * (in case the modules are auto-loaded), then fall back to the
+        * filesystems the kernel supports, listed in /proc/filesystems.
+        */
+       char *candidates[] = {
+               "/etc/filesystems",
+               "/proc/filesystems",
+       };
+
+       struct cbarg {
+               const char *rootfs;
+               const char *target;
+               int mntopt;
+       } args = {
+               .rootfs = rootfs,
+               .target = target,
+               .mntopt = mntopt,
+       };
+       int idx = 0;
+
+       while (idx < sizeof(candidates)/sizeof(candidates[0])) {
+               char *list = candidates[idx++];
+               int rc;
+
+               if (access(list, F_OK) != 0)
+                       continue;
+
+               rc = lxc_file_for_each_line(list, find_fstype_cb, &args);
+               if (rc < 0) {
+                       ERROR("failed to parse '%s'", list);
+                       return -1;
+               }
+
+               /* non-zero: presumably find_fstype_cb() reported a successful mount */
+               if (rc != 0)
+                       return 0;
+       }
+
+       ERROR("failed to determine fs type for '%s'", rootfs);
+       return -1;
+}
+
+static int do_mkfs(const char *path, const char *fstype)
+{
+       pid_t pid;
+       // Run "mkfs -t <fstype> <path>" in a forked child and wait for it.
+       if ((pid = fork()) < 0) {
+               ERROR("error forking");
+               return -1;
+       }
+       if (pid > 0)
+               return wait_for_pid(pid);
+       execlp("mkfs", "mkfs", "-t", fstype, path, (char *)NULL);
+       exit(1); // exec failed: the child must not return into the caller
+}
+
+static char *linkderef(char *path, char *dest)
+{
+       struct stat sbuf;
+       ssize_t ret;
+       /* Returns path itself when it is not a symlink, else the link target stored in dest; NULL on error. */
+       ret = stat(path, &sbuf); /* NOTE(review): stat() follows links, so S_ISLNK below looks unreachable; lstat() intended? */
+       if (ret < 0)
+               return NULL;
+       if (!S_ISLNK(sbuf.st_mode))
+               return path;
+       ret = readlink(path, dest, MAXPATHLEN); /* dest must hold MAXPATHLEN bytes */
+       if (ret < 0) {
+               SYSERROR("error reading link %s", path);
+               return NULL;
+       } else if (ret >= MAXPATHLEN) {
+               ERROR("link in %s too long", path);
+               return NULL;
+       }
+       dest[ret] = '\0'; /* readlink() does not NUL-terminate */
+       return dest;
+}
+
+/*
+ * Detect the fstype of a (presumably blockdev-based) bdev: a forked child
+ * mounts it in a private mntns and pipes back the type from /proc/self/mounts.
+ * @bdev: bdev to investigate (src and dest must be set)
+ * @type: preallocated char* in which to write the fstype
+ * @len: length of passed in char*
+ * Returns length of fstype, or -1 on error
+ */
+static int detect_fs(struct bdev *bdev, char *type, int len)
+{
+       int  p[2], ret;
+       size_t linelen;
+       pid_t pid;
+       FILE *f;
+       char *sp1, *sp2, *sp3, *line = NULL;
+
+       if (!bdev || !bdev->src || !bdev->dest)
+               return -1;
+
+       if (pipe(p) < 0)
+               return -1;
+       if ((pid = fork()) < 0) {
+               close(p[0]); // don't leak the pipe when fork fails
+               close(p[1]);
+               return -1;
+       }
+       if (pid > 0) {
+               close(p[1]);
+               memset(type, 0, len);
+               ret = read(p[0], type, len-1);
+               close(p[0]);
+               if (ret < 0)
+                       SYSERROR("error reading from pipe");
+               else if (ret == 0)
+                       ERROR("child exited early - fstype not found");
+               waitpid(pid, NULL, 0); // reap exactly the child forked above
+               if (ret <= 0)
+                       return -1;
+               type[len-1] = '\0';
+               INFO("detected fstype %s for %s", type, bdev->src);
+               return ret;
+       }
+
+       close(p[0]); // the child only writes to the pipe
+       if (unshare(CLONE_NEWNS) < 0)
+               exit(1);
+
+       ret = mount_unknow_fs(bdev->src, bdev->dest, 0);
+       if (ret < 0) {
+               ERROR("failed mounting %s onto %s to detect fstype", bdev->src, bdev->dest);
+               exit(1);
+       }
+       // if symlink, get the real dev name
+       char devpath[MAXPATHLEN];
+       char *l = linkderef(bdev->src, devpath);
+       if (!l)
+               exit(1);
+       f = fopen("/proc/self/mounts", "r");
+       if (!f)
+               exit(1);
+       while (getline(&line, &linelen, f) != -1) {
+               sp1 = index(line, ' ');
+               if (!sp1)
+                       exit(1);
+               *sp1 = '\0';
+               if (strcmp(line, l))
+                       continue;
+               sp2 = index(sp1+1, ' ');
+               if (!sp2)
+                       exit(1);
+               *sp2 = '\0';
+               sp3 = index(sp2+1, ' ');
+               if (!sp3)
+                       exit(1);
+               *sp3 = '\0';
+               sp2++;
+               if (write(p[1], sp2, strlen(sp2)) != strlen(sp2))
+                       exit(1);
+               exit(0);
+       }
+       exit(1);
+}
+
+struct bdev_type {
+       char *name; /* backend name, compared against the requested type in bdev_get() */
+       struct bdev_ops *ops; /* operations implementing that backend */
+};
+
+static int is_dir(const char *path)
+{
+       struct stat sb;
+       // 1 when path resolves (symlinks followed) to a directory, else 0
+       if (stat(path, &sb) != 0)
+               return 0;
+       return S_ISDIR(sb.st_mode) ? 1 : 0;
+}
+
+static int dir_detect(const char *path)
+{
+       // an explicit "dir:" prefix is trusted without looking at the path
+       if (!strncmp(path, "dir:", 4))
+               return 1;
+       // otherwise accept anything that stats as a directory
+       return is_dir(path) ? 1 : 0;
+}
+
+//
+// XXXXXXX plain directory bind mount ops
+//
+int dir_mount(struct bdev *bdev)
+{
+       // Recursively bind-mount bdev->src onto bdev->dest ("dir" bdevs only).
+       if (strcmp(bdev->type, "dir") != 0 || !bdev->src || !bdev->dest)
+               return -EINVAL;
+       // otherwise mount(2)'s result is returned unchanged
+       return mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC, NULL);
+}
+
+int dir_umount(struct bdev *bdev)
+{
+       // Unmount bdev->dest ("dir" bdevs only); -EINVAL if the bdev is unsuitable.
+       if (strcmp(bdev->type, "dir") != 0 || !bdev->src || !bdev->dest)
+               return -EINVAL;
+       // otherwise umount(2)'s result is returned unchanged
+       return umount(bdev->dest);
+}
+
+/* Return a malloc'ed copy of src with a leading oldpath replaced by lxcpath
+ * and each later oldname replaced by name; NULL on allocation failure.
+ * (the bulk of this needs to become a common helper) */
+static char *dir_new_path(char *src, const char *oldname, const char *name,
+                       const char *oldpath, const char *lxcpath)
+{
+       char *ret, *p, *p2;
+       int l1, l2, nlen;
+
+       nlen = strlen(src) + 1;
+       l1 = strlen(oldpath);
+       p = src;
+       /* if src starts with oldpath, look for oldname only after
+        * that path */
+       if (strncmp(src, oldpath, l1) == 0) {
+               p += l1;
+               nlen += (strlen(lxcpath) - l1);
+       }
+       l2 = strlen(oldname);
+       /* l2 > 0: an empty oldname matches everywhere and would never advance */
+       while (l2 > 0 && (p = strstr(p, oldname)) != NULL) {
+               p += l2;
+               nlen += strlen(name) - l2;
+       }
+       ret = malloc(nlen);
+       if (!ret)
+               return NULL;
+       p = ret;
+       if (strncmp(src, oldpath, l1) == 0) {
+               p += sprintf(p, "%s", lxcpath);
+               src += l1;
+       }
+       while (l2 > 0 && (p2 = strstr(src, oldname)) != NULL) {
+               memcpy(p, src, p2-src); // copy text up to oldname
+               p += p2-src; // move target pointer (p)
+               p += sprintf(p, "%s", name); // print new name in place of oldname
+               src = p2 + l2;  // move src to end of oldname
+       }
+       sprintf(p, "%s", src);  // copy the rest of src
+       return ret;
+}
+
+/*
+ * for a simple directory bind mount, we substitute the old container
+ * name and paths for the new
+ */
+static int dir_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
+               const char *cname, const char *oldpath, const char *lxcpath, int snap,
+               unsigned long newsize)
+{
+       if (snap) {
+               ERROR("directories cannot be snapshotted.  Try overlayfs.");
+               return -1;
+       }
+       if (!orig->dest || !orig->src)
+               return -1;
+       if (orig->data) {
+               new->data = strdup(orig->data);
+               if (!new->data)
+                       return -1;
+       }
+       // derive both new paths by substituting the container name and lxcpath
+       new->dest = dir_new_path(orig->dest, oldname, cname, oldpath, lxcpath);
+       if (!new->dest)
+               return -1;
+       new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath);
+       if (!new->src)
+               return -1;
+       // newsize is not used by this backend
+       return 0;
+}
+
+struct bdev_ops dir_ops = { // operations for plain directory bind mounts
+       .clone_paths = dir_clonepaths,
+       .umount = dir_umount,
+       .mount = dir_mount,
+       .detect = dir_detect,
+};
+
+//
+// LVM ops
+//
+
+/*
+ * Look at /sys/dev/block/maj:min/dm/uuid.  If it contains the hardcoded LVM
+ * prefix "LVM-", then this is an lvm2 LV
+ */
+static int lvm_detect(const char *path)
+{
+       struct stat sb;
+       char uuidpath[MAXPATHLEN], prefix[4];
+       size_t nread;
+       FILE *fp;
+       int n;
+
+       // an explicit "lvm:" prefix is accepted without inspecting the device
+       if (!strncmp(path, "lvm:", 4))
+               return 1;
+
+       // anything that is not an existing block device is rejected
+       if (stat(path, &sb) != 0 || !S_ISBLK(sb.st_mode))
+               return 0;
+
+       n = snprintf(uuidpath, MAXPATHLEN, "/sys/dev/block/%d:%d/dm/uuid",
+                       major(sb.st_rdev), minor(sb.st_rdev));
+       if (n < 0 || n >= MAXPATHLEN) {
+               ERROR("lvm uuid pathname too long");
+               return 0;
+       }
+       fp = fopen(uuidpath, "r");
+       if (!fp)
+               return 0;
+       nread = fread(prefix, 1, 4, fp);
+       fclose(fp);
+
+       // the uuid of an lvm2 LV starts with the hardcoded prefix "LVM-"
+       return (nread == 4 && strncmp(prefix, "LVM-", 4) == 0) ? 1 : 0;
+}
+
+static int lvm_mount(struct bdev *bdev)
+{
+       // Mount the LV bdev->src on bdev->dest, letting mount_unknow_fs() find the fs type.
+       // Returns -EINVAL for a non-lvm bdev or when either path is unset.
+       if (strcmp(bdev->type, "lvm") != 0 || !bdev->src || !bdev->dest)
+               return -EINVAL;
+       /* if we might pass in data sometime, then we'll have to enrich
+        * mount_unknow_fs */
+       return mount_unknow_fs(bdev->src, bdev->dest, 0);
+}
+
+static int lvm_umount(struct bdev *bdev)
+{
+       // Unmount bdev->dest ("lvm" bdevs only); -EINVAL if the bdev is unsuitable.
+       if (strcmp(bdev->type, "lvm") != 0 || !bdev->src || !bdev->dest)
+               return -EINVAL;
+       // otherwise umount(2)'s result is returned unchanged
+       return umount(bdev->dest);
+}
+
+/*
+ * path must be '/dev/$vg/$lv', $vg must be an existing VG, and $lv must
+ * not yet exist.  This function will attempt to create /dev/$vg/$lv of
+ * size $size.
+ */
+static int lvm_create(const char *path, unsigned long size)
+{
+       int ret, pid;
+       char sz[24], *pathdup, *vg, *lv;
+
+       if ((pid = fork()) < 0) {
+               SYSERROR("failed fork");
+               return -1;
+       }
+       if (pid > 0)
+               return wait_for_pid(pid);
+       // child from here on: build the lvcreate arguments and exec it
+       // lvcreate default size is in M, not bytes.
+       ret = snprintf(sz, 24, "%lu", size/1000000);
+       if (ret < 0 || ret >= 24)
+               exit(1);
+
+       pathdup = strdup(path);
+       if (!pathdup)
+               exit(1);
+       lv = rindex(pathdup, '/'); // last path component is the LV name
+       if (!lv) {
+               free(pathdup);
+               exit(1);
+       }
+       *lv = '\0';
+       lv++;
+       vg = rindex(pathdup, '/'); // the component before it is the VG name
+       if (!vg)
+               exit(1);
+       vg++;
+       execlp("lvcreate", "lvcreate", "-L", sz, vg, "-n", lv, (char *)NULL);
+       // execlp() returns only on failure: exit so the child never resumes the caller
+       exit(1);
+}
+
+static int lvm_snapshot(const char *orig, const char *path, unsigned long size)
+{
+       int ret, pid;
+       char sz[24], *pathdup, *lv;
+
+       if ((pid = fork()) < 0) {
+               SYSERROR("failed fork");
+               return -1;
+       }
+       if (pid > 0)
+               return wait_for_pid(pid);
+       // lvcreate default size is in M, not bytes.
+       ret = snprintf(sz, 24, "%lu", size/1000000);
+       if (ret < 0 || ret >= 24)
+               exit(1);
+
+       pathdup = strdup(path);
+       if (!pathdup)
+               exit(1);
+       lv = rindex(pathdup, '/');
+       if (!lv) {
+               free(pathdup);
+               exit(1);
+       }
+       *lv = '\0';
+       lv++;
+       // runs in the forked child: snapshot LV orig under the name taken from path
+       execlp("lvcreate", "lvcreate", "-s", "-L", sz, "-n", lv, orig, (char *)NULL);
+       // execlp() returns only on failure: exit so the child never resumes the caller
+       exit(1);
+}
+
+static int lvm_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
+               const char *cname, const char *oldpath, const char *lxcpath, int snap,
+               unsigned long newsize)
+{
+       char fstype[100];
+       unsigned long size = newsize;
+
+       if (!orig->src || !orig->dest)
+               return -1;
+
+       if (orig->data) {
+               new->data = strdup(orig->data);
+               if (!new->data)
+                       return -1;
+       }
+       new->dest = dir_new_path(orig->dest, oldname, cname, oldpath, lxcpath);
+       if (!new->dest)
+               return -1;
+       if (mkdir_p(new->dest, 0755) < 0)
+               return -1;
+
+       // the new LV path is the old one with the container name substituted
+       new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath);
+       if (!new->src)
+               return -1;
+       // newsize == 0 means: use the size of the original device
+       if (!newsize && blk_getsize(orig->src, &size) < 0) {
+               ERROR("Error getting size of %s", orig->src);
+               return -1;
+       }
+       if (snap) {
+               if (lvm_snapshot(orig->src, new->src, size) < 0) {
+                       ERROR("could not create %s snapshot of %s", new->src, orig->src);
+                       return -1;
+               }
+       } else {
+               if (lvm_create(new->src, size) < 0) {
+                       ERROR("Error creating new lvm blockdev");
+                       return -1;
+               }
+               if (detect_fs(orig, fstype, 100) < 0) {
+                       ERROR("could not find fstype for %s", orig->src);
+                       return -1;
+               }
+               if (do_mkfs(new->src, fstype) < 0) {
+                       ERROR("Error creating filesystem type %s on %s", fstype,
+                               new->src);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+struct bdev_ops lvm_ops = { // operations for LVM logical volumes
+       .clone_paths = lvm_clonepaths,
+       .umount = lvm_umount,
+       .mount = lvm_mount,
+       .detect = lvm_detect,
+};
+
+//
+// overlayfs ops
+//
+
+static int overlayfs_detect(const char *path)
+{
+       // overlayfs is never probed for: only the explicit "overlayfs:"
+       // prefix selects it, and then the caller's word is taken for it
+       return strncmp(path, "overlayfs:", 10) == 0 ? 1 : 0;
+}
+
+// Mount the overlay described by bdev->src ("overlayfs:<lower>:<upper>")
+// on bdev->dest.  Returns mount(2)'s result, -EINVAL for an unsuitable
+// bdev or malformed src, and -1 if the option string cannot be built.
+int overlayfs_mount(struct bdev *bdev)
+{
+       char *options, *dup, *lower, *upper;
+       int len;
+       int ret;
+
+       if (strcmp(bdev->type, "overlayfs"))
+               return -EINVAL;
+       if (!bdev->src || !bdev->dest)
+               return -EINVAL;
+
+       //  separately mount it first
+       //  mount -t overlayfs -oupperdir=${upper},lowerdir=${lower} lower dest
+       dup = strdupa(bdev->src);
+       if (!(lower = index(dup, ':')))
+               return -EINVAL;
+       if (!(upper = index(++lower, ':')))
+               return -EINVAL;
+       *upper = '\0';
+       upper++;
+
+       // TODO We should check whether bdev->src is a blockdev, and if so
+       // but for now, only support overlays of a basic directory
+
+       len = strlen(lower) + strlen(upper) + strlen("upperdir=,lowerdir=") + 1;
+       options = alloca(len);
+       ret = snprintf(options, len, "upperdir=%s,lowerdir=%s", upper, lower);
+       if (ret < 0 || ret >= len)
+               return -1;
+       ret = mount(lower, bdev->dest, "overlayfs", MS_MGC_VAL, options);
+       if (ret < 0)
+               SYSERROR("overlayfs: error mounting %s onto %s options %s",
+                       lower, bdev->dest, options);
+       else
+               INFO("overlayfs: mounted %s onto %s options %s",
+                       lower, bdev->dest, options);
+       return ret;
+}
+
+int overlayfs_umount(struct bdev *bdev)
+{
+       // Unmount bdev->dest ("overlayfs" bdevs only); -EINVAL if the bdev is unsuitable.
+       if (strcmp(bdev->type, "overlayfs") != 0 || !bdev->src || !bdev->dest)
+               return -EINVAL;
+       // otherwise umount(2)'s result is returned unchanged
+       return umount(bdev->dest);
+}
+
+static int overlayfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
+               const char *cname, const char *oldpath, const char *lxcpath, int snap,
+               unsigned long newsize)
+{
+       // Fill in new->dest and new->src ("overlayfs:<lower>:<upper>") for a snapshot of orig.
+       if (!snap) {
+               ERROR("overlayfs is only for snapshot clones");
+               return -EINVAL;
+       }
+       if (!orig->src || !orig->dest)
+               return -1;
+
+       new->dest = dir_new_path(orig->dest, oldname, cname, oldpath, lxcpath);
+       if (!new->dest)
+               return -1;
+       if (mkdir_p(new->dest, 0755) < 0)
+               return -1;
+
+       if (strcmp(orig->type, "dir") == 0) {
+               char *delta;
+               int ret, len;
+               // if we have /var/lib/lxc/c2/rootfs, then delta will be
+               //            /var/lib/lxc/c2/delta0
+               delta = strdup(new->dest);
+               if (!delta) {
+                       return -1;
+               }
+               if (strlen(delta) < 6) {
+                       free(delta);
+                       return -EINVAL;
+               }
+               strcpy(&delta[strlen(delta)-6], "delta0");
+               if ((ret = mkdir(delta, 0755)) < 0) {
+                       SYSERROR("error: mkdir %s", delta);
+                       free(delta);
+                       return -1;
+               }
+
+               // the src will be 'overlayfs:lowerdir:upperdir'
+               len = strlen(delta) + strlen(orig->src) + 12;
+               new->src = malloc(len);
+               if (!new->src) {
+                       free(delta);
+                       return -ENOMEM;
+               }
+               ret = snprintf(new->src, len, "overlayfs:%s:%s", orig->src, delta);
+               free(delta);
+               if (ret < 0 || ret >= len)
+                       return -ENOMEM;
+       } else if (strcmp(orig->type, "lvm") == 0) {
+               ERROR("overlayfs clone of lvm container is not yet supported");
+               return -1;
+       } else if (strcmp(orig->type, "overlayfs") == 0) {
+               // What exactly do we want to do here?
+               // I think we want to use the original lowerdir, with a
+               // private delta which is originally rsynced from the
+               // original delta
+               char *osrc, *odelta, *nsrc, *ndelta;
+               int len, ret;
+               if (!(osrc = strdup(orig->src)))
+                       return -ENOMEM;
+               nsrc = index(osrc, ':'); // must be the ':' ending the "overlayfs" prefix
+               if (!nsrc || ++nsrc != osrc + 10 || (odelta = index(nsrc, ':')) == NULL) {
+                       free(osrc);
+                       return -EINVAL;
+               }
+               *odelta = '\0';
+               odelta++;
+               ndelta = dir_new_path(odelta, oldname, cname, oldpath, lxcpath);
+               if (!ndelta) {
+                       free(osrc);
+                       return -ENOMEM;
+               }
+               if (do_rsync(odelta, ndelta) < 0) {
+                       ERROR("copying overlayfs delta");
+                       free(osrc);
+                       free(ndelta);
+                       return -1;
+               }
+               len = strlen(nsrc) + strlen(ndelta) + 12;
+               new->src = malloc(len);
+               if (!new->src) {
+                       free(osrc);
+                       free(ndelta);
+                       return -ENOMEM;
+               }
+               ret = snprintf(new->src, len, "overlayfs:%s:%s", nsrc, ndelta);
+               free(osrc);
+               free(ndelta);
+               if (ret < 0 || ret >= len)
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
+struct bdev_ops overlayfs_ops = { // operations for overlayfs-backed rootfs
+       .clone_paths = overlayfs_clonepaths,
+       .umount = overlayfs_umount,
+       .mount = overlayfs_mount,
+       .detect = overlayfs_detect,
+};
+
+struct bdev_type bdevs[] = { // bdev_init() probes in this order; first detect() hit wins
+       {.name = "lvm", .ops = &lvm_ops,},
+       {.name = "dir", .ops = &dir_ops,},
+       {.name = "overlayfs", .ops = &overlayfs_ops,},
+};
+// number of entries in bdevs[]
+static const size_t numbdevs = sizeof(bdevs) / sizeof(struct bdev_type);
+
+void bdev_put(struct bdev *bdev)
+{
+       // Release a bdev and the strings it owns; a NULL bdev is ignored.
+       if (!bdev)
+               return;
+       free(bdev->data); // free(NULL) is a no-op, so no per-member checks
+       free(bdev->src);
+       free(bdev->dest);
+       free(bdev); // type is not freed: it points into the bdevs[] table
+}
+
+struct bdev *bdev_get(const char *type)
+{
+       struct bdev *fresh;
+       int idx = 0;
+
+       // look up the backend whose name equals the requested type
+       while (idx < numbdevs && strcmp(bdevs[idx].name, type) != 0)
+               idx++;
+       if (idx == numbdevs)
+               return NULL;
+
+       // start from a zeroed bdev: src, dest and data are left NULL
+       fresh = calloc(1, sizeof(struct bdev));
+       if (!fresh)
+               return NULL;
+       fresh->type = bdevs[idx].name;
+       fresh->ops = bdevs[idx].ops;
+       return fresh;
+}
+
+struct bdev *bdev_init(const char *src, const char *dst, const char *data)
+{
+       // Detect the backend for src and return a new bdev owning copies of src, dst and data; NULL on failure.
+       int i;
+       struct bdev *bdev;
+       if (!src) // every detect() callback dereferences the path
+               return NULL;
+       for (i=0; i<numbdevs; i++)
+               if (bdevs[i].ops->detect(src))
+                       break;
+       if (i == numbdevs)
+               return NULL;
+       bdev = calloc(1, sizeof(struct bdev));
+       if (!bdev)
+               return NULL;
+       bdev->ops = bdevs[i].ops;
+       bdev->type = bdevs[i].name;
+       bdev->src = strdup(src);
+       if (data)
+               bdev->data = strdup(data);
+       if (dst)
+               bdev->dest = strdup(dst);
+       if (!bdev->src || (data && !bdev->data) || (dst && !bdev->dest)) {
+               bdev_put(bdev); // a failed strdup() must not yield a half-initialized bdev
+               return NULL;
+       }
+       return bdev;
+}
+
+/*
+ * If we're not snapshotting, then bdev_copy becomes a simple case of mount
+ * the original, mount the new, and rsync the contents (bdevdata is unused).
+ */
+struct bdev *bdev_copy(const char *src, const char *oldname, const char *cname,
+                       const char *oldpath, const char *lxcpath, const char *bdevtype,
+                       int snap, const char *bdevdata, unsigned long newsize)
+{
+       struct bdev *orig, *new;
+       pid_t pid;
+
+       /* if the container name doesn't show up in the rootfs path, then
+        * we don't know how to come up with a new name
+        */
+       if (strstr(src, oldname) == NULL) {
+               ERROR("original rootfs path %s doesn't include container name %s",
+                       src, oldname);
+               return NULL;
+       }
+
+       orig = bdev_init(src, NULL, NULL);
+       if (!orig) {
+               ERROR("failed to detect blockdev type for %s\n", src);
+               return NULL;
+       }
+
+       if (!orig->dest) {
+               int ret;
+               orig->dest = malloc(MAXPATHLEN);
+               if (!orig->dest) {
+                       ERROR("out of memory");
+                       bdev_put(orig);
+                       return NULL;
+               }
+               ret = snprintf(orig->dest, MAXPATHLEN, "%s/%s/rootfs", oldpath, oldname);
+               if (ret < 0 || ret >= MAXPATHLEN) {
+                       ERROR("rootfs path too long");
+                       bdev_put(orig);
+                       return NULL;
+               }
+       }
+
+       new = bdev_get(bdevtype ? bdevtype : orig->type);
+       if (!new) {
+               ERROR("no such block device type: %s", bdevtype ? bdevtype : orig->type);
+               bdev_put(orig);
+               return NULL;
+       }
+
+       if (new->ops->clone_paths(orig, new, oldname, cname, oldpath, lxcpath, snap, newsize) < 0) {
+               ERROR("failed getting pathnames for cloned storage: %s\n", src);
+               bdev_put(orig);
+               bdev_put(new);
+               return NULL;
+       }
+
+       pid = fork();
+       if (pid < 0) {
+               SYSERROR("fork");
+               bdev_put(orig);
+               bdev_put(new);
+               return NULL;
+       }
+       // parent: wait for the child, then hand the new bdev to the caller
+       if (pid > 0) {
+               int ret = wait_for_pid(pid);
+               bdev_put(orig);
+               if (ret < 0) {
+                       bdev_put(new);
+                       return NULL;
+               }
+               return new;
+       }
+       // child: work in a private mount namespace and always leave via exit()
+       if (unshare(CLONE_NEWNS) < 0) {
+               SYSERROR("unshare CLONE_NEWNS");
+               exit(1);
+       }
+       if (snap)
+               exit(0);
+
+       // If not a snapshot, copy the fs.
+       if (orig->ops->mount(orig) < 0) {
+               ERROR("failed mounting %s onto %s\n", src, orig->dest);
+               exit(1);
+       }
+       if (new->ops->mount(new) < 0) {
+               ERROR("failed mounting %s onto %s\n", new->src, new->dest);
+               exit(1);
+       }
+       if (do_rsync(orig->dest, new->dest) < 0) {
+               ERROR("rsyncing %s to %s\n", orig->src, new->src);
+               exit(1);
+       }
+       // don't bother umounting, ns exit will do that
+
+       exit(0);
+}
diff --git a/src/lxc/bdev.h b/src/lxc/bdev.h
new file mode 100644
index 0000000..131f158
--- /dev/null
+++ b/src/lxc/bdev.h
@@ -0,0 +1,50 @@
+#ifndef __LXC_BDEV_H
+#define __LXC_BDEV_H
+/* blockdev operations for:
+ * dir, raw, btrfs, overlayfs, aufs, lvm, loop, zfs
+ * someday: qemu-nbd, qcow2, qed
+ */
+
+#include "config.h"
+#include "lxccontainer.h"
+
+struct bdev;
+
+/*
+ * Operations a blockdev backend provides.  A struct bdev points at one of
+ * these tables through its ops member.
+ */
+struct bdev_ops {
+       /* detect whether path is of this bdev type */
+       int (*detect)(const char *path);
+       // mount requires src and dest to be set.
+       int (*mount)(struct bdev *bdev);
+       int (*umount)(struct bdev *bdev);
+       /* given original mount, rename the paths for cloned container */
+       int (*clone_paths)(struct bdev *orig, struct bdev *new, const char *oldname,
+                       const char *cname, const char *oldpath, const char *lxcpath,
+                       int snap, unsigned long newsize);
+};
+
+struct bdev {
+       struct bdev_ops *ops;   /* backend operations, e.g. ops->mount(bdev) */
+       char *type;             /* backend type name; bdev_copy() hands it to bdev_get() */
+       char *src;              /* source of the storage; must be set before mount */
+       char *dest;             /* where the storage gets mounted; must be set before mount */
+       char *data;             /* NOTE(review): presumably the optional data given to bdev_init() - confirm */
+};
+
+/*
+ * Instantiate a bdev object.  The src is used to determine which blockdev
+ * type this should be.  The dst and data are optional, and will be used
+ * in case of mount/umount.
+ *
+ * Optionally, src can be 'dir:/var/lib/lxc/c1' or 'lvm:/dev/lxc/c1'.  For
+ * other backing stores, this will allow additional options.  In particular,
+ * "overlayfs:/var/lib/lxc/canonical/rootfs:/var/lib/lxc/c1/delta" will mean
+ * use /var/lib/lxc/canonical/rootfs as lower dir, and /var/lib/lxc/c1/delta
+ * as the upper, writeable layer.
+ */
+struct bdev *bdev_init(const char *src, const char *dst, const char *data);
+
+/*
+ * Create the storage for a cloned container from the storage at src.
+ *
+ * The new bdev is of type bdevtype, or of the original's type when bdevtype
+ * is NULL.  The backend's clone_paths() works out the new pathnames from
+ * oldname/cname and oldpath/lxcpath.  Unless snap is set, the original's
+ * contents are then rsync'ed into the new storage.
+ *
+ * Returns the new bdev, or NULL on failure.
+ */
+struct bdev *bdev_copy(const char *src, const char *oldname, const char *cname,
+                       const char *oldpath, const char *lxcpath, const char *bdevtype,
+                       int snap, const char *bdevdata, unsigned long newsize);
+
+/* Release a bdev once the caller is done with it. */
+void bdev_put(struct bdev *bdev);
+#endif
diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index cf97eef..67b1c7f 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -64,6 +64,7 @@
 #include "log.h"
 #include "lxc.h"       /* for lxc_cgroup_set() */
 #include "caps.h"       /* for lxc_caps_last_cap() */
+#include "bdev.h"
 
 #if HAVE_APPARMOR
 #include <apparmor.h>
@@ -590,8 +591,8 @@ int pin_rootfs(const char *rootfs)
                return -2;
 
        if (!realpath(rootfs, absrootfs)) {
-               SYSERROR("failed to get real path for '%s'", rootfs);
-               return -1;
+               INFO("failed to get real path for '%s', not pinning", rootfs);
+               return -2;
        }
 
        if (access(absrootfs, F_OK)) {
@@ -1163,6 +1164,12 @@ static int setup_rootfs(struct lxc_conf *conf)
                }
        }
 
+       // First try mounting rootfs using a bdev
+       struct bdev *bdev = bdev_init(rootfs->path, rootfs->mount, NULL);
+       if (bdev && bdev->ops->mount(bdev) == 0) {
+               DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount);
+               return 0;
+       }
        if (mount_rootfs(rootfs->path, rootfs->mount)) {
                ERROR("failed to mount rootfs");
                return -1;
diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c
index 1df6a98..40c1c3c 100644
--- a/src/lxc/lxccontainer.c
+++ b/src/lxc/lxccontainer.c
@@ -17,22 +17,41 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <sched.h>
+#include "config.h"
 #include "lxc.h"
 #include "state.h"
 #include "lxccontainer.h"
 #include "conf.h"
-#include "config.h"
 #include "confile.h"
 #include "cgroup.h"
 #include "commands.h"
 #include "version.h"
 #include "log.h"
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <errno.h>
+#include "bdev.h"
 #include <lxc/utils.h>
 
+/* Define unshare() if missing from the C library */
+/* NOTE(review): this note used to read "this is also in attach.c and
+ * lxccontainer.c", but this hunk is itself in lxccontainer.c, so the list
+ * of duplicates needs confirming.  Either way: commonize it in utils.c */
+#ifndef HAVE_UNSHARE
+static int unshare(int flags)
+{
+#ifdef __NR_unshare
+       return syscall(__NR_unshare, flags);
+#else /* no syscall number for unshare: report it as unsupported */
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+#else /* the C library has unshare(): only declare it */
+int unshare(int);
+#endif
+
 lxc_log_define(lxc_container, lxc);
 
 /* LOCKING
@@ -534,10 +553,8 @@ static bool lxcapi_create(struct lxc_container *c, char *t, char *const argv[])
 {
        bool bret = false;
        pid_t pid;
-       int ret, status;
-       char *tpath = NULL;
-       int len, nargs = 0;
-       char **newargv;
+       char *tpath = NULL, **newargv;
+       int ret, len, nargs = 0;
 
        if (!c)
                return false;
@@ -564,7 +581,7 @@ static bool lxcapi_create(struct lxc_container *c, char *t, char *const argv[])
                goto out;
 
        /* we're going to fork.  but since we'll wait for our child, we
-          don't need to lxc_container_get */
+        * don't need to lxc_container_get */
 
        if (lxclock(c->slock, 0)) {
                ERROR("failed to grab global container lock for %s\n", c->name);
@@ -635,26 +652,8 @@ static bool lxcapi_create(struct lxc_container *c, char *t, char *const argv[])
                exit(1);
        }
 
-again:
-       ret = waitpid(pid, &status, 0);
-       if (ret == -1) {
-               if (errno == -EINTR)
-                       goto again;
-               SYSERROR("waitpid failed");
-               goto out_unlock;
-       }
-       if (ret != pid)
-               goto again;
-       if (!WIFEXITED(status))  { // did not exit normally
-               // we could set an error code and string inside the
-               // container_struct here if we like
-               ERROR("container creation template exited abnormally\n");
-               goto out_unlock;
-       }
-
-       if (WEXITSTATUS(status) != 0) {
-               ERROR("container creation template for %s exited with %d\n",
-                     c->name, WEXITSTATUS(status));
+       if (wait_for_pid(pid) != 0) {
+               ERROR("container creation template for %s failed\n", c->name);
                goto out_unlock;
        }
 
@@ -820,7 +819,6 @@ static bool lxcapi_save_config(struct lxc_container *c, const char *alt_file)
 static bool lxcapi_destroy(struct lxc_container *c)
 {
        pid_t pid;
-       int ret, status;
 
        if (!c)
                return false;
@@ -838,23 +836,12 @@ static bool lxcapi_destroy(struct lxc_container *c)
                exit(1);
        }
 
-again:
-       ret = waitpid(pid, &status, 0);
-       if (ret == -1) {
-               if (errno == -EINTR)
-                       goto again;
-               perror("waitpid");
-               return false;
-       }
-       if (ret != pid)
-               goto again;
-       if (!WIFEXITED(status))  { // did not exit normally
-               // we could set an error code and string inside the
-               // container_struct here if we like
+       if (wait_for_pid(pid) < 0) {
+               ERROR("Error destroying container %s", c->name);
                return false;
        }
 
-       return WEXITSTATUS(status) == 0;
+       return true;
 }
 
 static bool lxcapi_set_config_item(struct lxc_container *c, const char *key, 
const char *v)
@@ -1025,6 +1012,479 @@ const char *lxc_get_version(void)
        return lxc_version();
 }
 
+/*
+ * Copy the file old to new.
+ *
+ * new must not exist yet.  The copy receives the permission bits of old;
+ * owner and group are not carried over.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int copy_file(char *old, char *new)
+{
+       int in, out;
+       ssize_t len, ret;
+       char buf[8096];
+       struct stat sbuf;
+
+       if (file_exists(new)) {
+               ERROR("copy destination %s exists", new);
+               return -1;
+       }
+
+       in = open(old, O_RDONLY);
+       if (in < 0) {
+               SYSERROR("opening original file %s", old);
+               return -1;
+       }
+       // stat the open descriptor, so mode and data come from the same file
+       if (fstat(in, &sbuf) < 0) {
+               SYSERROR("stat'ing %s", old);
+               close(in);
+               return -1;
+       }
+       out = open(new, O_CREAT | O_EXCL | O_WRONLY, 0644);
+       if (out < 0) {
+               SYSERROR("opening new file %s", new);
+               close(in);
+               return -1;
+       }
+
+       while (1) {
+               char *p = buf;
+
+               len = read(in, buf, sizeof(buf));
+               if (len < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       SYSERROR("reading old file %s", old);
+                       goto err;
+               }
+               if (len == 0)
+                       break;
+               // write() may take fewer bytes than offered: loop until
+               // the whole chunk has been written
+               while (len > 0) {
+                       ret = write(out, p, len);
+                       if (ret < 0 && errno == EINTR)
+                               continue;
+                       if (ret <= 0) {
+                               SYSERROR("writing to new file %s", new);
+                               goto err;
+                       }
+                       p += ret;
+                       len -= ret;
+               }
+       }
+
+       // we set mode, but not owner/group
+       if (fchmod(out, sbuf.st_mode) < 0) {
+               SYSERROR("setting mode on %s", new);
+               goto err;
+       }
+
+       close(in);
+       // a failing close() can be the only report of a failed write
+       if (close(out) < 0) {
+               SYSERROR("closing new file %s", new);
+               return -1;
+       }
+
+       return 0;
+
+err:
+       close(in);
+       close(out);
+       return -1;
+}
+
+/*
+ * Helper for scanning a buffer for two different needles: p1 and p2 are
+ * the results of two strstr() calls on the same buffer.
+ *
+ * Returns whichever match comes first in the buffer.  If only one of them
+ * is non-NULL that one is returned, and if neither matched the result is
+ * NULL.
+ */
+static const char *lowest_nonnull(const char *p1, const char *p2)
+{
+       if (p1 && p2)
+               return p2 < p1 ? p2 : p1;
+       return p1 ? p1 : p2;
+}
+
+/*
+ * Rewrite the file at path in place, replacing each occurrence of the old
+ * container's config path with newpath and each occurrence of its name
+ * with newname.  Where both match at the same position, the name
+ * replacement is used.
+ *
+ * The whole file is read into memory, then the file is reopened for
+ * writing (which truncates it) and written back piece by piece.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int update_name_and_paths(const char *path, struct lxc_container *oldc,
+               const char *newname, const char *newpath)
+{
+       FILE *f;
+       long fsize;
+       size_t flen, n;
+       int ret = -1;
+       char *contents = NULL;
+       const char *p0, *p1, *p2, *end;
+       const char *oldpath = oldc->get_config_path(oldc);
+       const char *oldname = oldc->name;
+
+       f = fopen(path, "r");
+       if (!f) {
+               SYSERROR("opening old config");
+               return -1;
+       }
+       if (fseek(f, 0, SEEK_END) < 0) {
+               SYSERROR("seeking to end of old config");
+               goto out;
+       }
+       // ftell() returns a signed long: keep it signed so -1 is detectable
+       fsize = ftell(f);
+       if (fsize < 0) {
+               SYSERROR("telling size of old config");
+               goto out;
+       }
+       flen = fsize;
+       if (fseek(f, 0, SEEK_SET) < 0) {
+               SYSERROR("rewinding old config");
+               goto out;
+       }
+       // one extra byte: strstr() below needs a NUL-terminated buffer
+       contents = malloc(flen + 1);
+       if (!contents) {
+               SYSERROR("out of memory");
+               goto out;
+       }
+       if (fread(contents, 1, flen, f) != flen) {
+               SYSERROR("reading old config");
+               goto out;
+       }
+       contents[flen] = '\0';
+       if (fclose(f) < 0) {
+               SYSERROR("closing old config");
+               f = NULL;
+               goto out;
+       }
+
+       f = fopen(path, "w");
+       if (!f) {
+               SYSERROR("reopening config");
+               goto out;
+       }
+
+       p0 = contents;
+       end = contents + flen;
+       while (1) {
+               const char *p, *new;
+
+               // an empty or missing needle would match everywhere and
+               // never advance, so treat it as "not found"
+               p1 = (oldpath && *oldpath) ? strstr(p0, oldpath) : NULL;
+               p2 = (oldname && *oldname) ? strstr(p0, oldname) : NULL;
+               if (!p1 && !p2)
+                       break;
+
+               p = lowest_nonnull(p1, p2);
+               new = (p == p2) ? newname : newpath;
+               // copy everything up to the match unchanged
+               n = p - p0;
+               if (fwrite(p0, 1, n, f) != n) {
+                       SYSERROR("writing new config");
+                       goto out;
+               }
+               // now write the newpath or newname
+               n = strlen(new);
+               if (fwrite(new, 1, n, f) != n) {
+                       SYSERROR("writing new name or path in new config");
+                       goto out;
+               }
+               // continue scanning after the text that was replaced
+               p0 = p + ((p == p2) ? strlen(oldname) : strlen(oldpath));
+       }
+
+       // no more matches: write the rest and be done
+       n = end - p0;
+       if (fwrite(p0, 1, n, f) != n) {
+               SYSERROR("writing new config");
+               goto out;
+       }
+       ret = 0;
+
+out:
+       if (f && fclose(f) < 0 && ret == 0) {
+               SYSERROR("closing new config");
+               ret = -1;
+       }
+       free(contents);
+       return ret;
+}
+
+/*
+ * Copy the hook scripts listed in c's configuration into c's container
+ * directory, point the configuration entries at the copies, replace the
+ * old container's name and path inside each copied script, and save c's
+ * configuration.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int copyhooks(struct lxc_container *oldc, struct lxc_container *c)
+{
+       int i;
+       int ret;
+       struct lxc_list *it;
+
+       for (i=0; i<NUM_LXC_HOOKS; i++) {
+               lxc_list_for_each(it, &c->lxc_conf->hooks[i]) {
+                       char *hookname = it->elem;
+                       char *fname = rindex(hookname, '/');
+                       char *newhook;
+                       char tmppath[MAXPATHLEN];
+
+                       // relative path - we don't support, but maybe we should
+                       // NOTE(review): this returns success without handling the
+                       // remaining hooks or saving the config - confirm intended
+                       if (!fname)
+                               return 0;
+                       // copy the script, and change the entry in confile
+                       ret = snprintf(tmppath, MAXPATHLEN, "%s/%s/%s",
+                                       c->config_path, c->name, fname+1);
+                       if (ret < 0 || ret >= MAXPATHLEN)
+                               return -1;
+                       ret = copy_file(it->elem, tmppath);
+                       if (ret < 0)
+                               return -1;
+                       // allocate first, so a failed strdup leaves the list intact
+                       newhook = strdup(tmppath);
+                       if (!newhook) {
+                               ERROR("out of memory copying hook path");
+                               return -1;
+                       }
+                       free(it->elem);
+                       it->elem = newhook;
+                       if (update_name_and_paths(newhook, oldc, c->name, c->get_config_path(c)) < 0) {
+                               ERROR("error updating name and paths in hook %s", newhook);
+                               return -1;
+                       }
+               }
+       }
+
+       if (!c->save_config(c, NULL)) {
+               ERROR("error saving config after copying hooks");
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Write a random MAC address of the form "00:16:3e:xx:xx:xx" into hwaddr,
+ * which must have room for 18 bytes (17 characters plus the NUL).
+ * rand() is used as-is; nothing in this function seeds it.
+ */
+static void new_hwaddr(char *hwaddr)
+{
+       // % 256, so each random octet can take every value, 0xff included
+       snprintf(hwaddr, 18, "00:16:3e:%02x:%02x:%02x",
+                       rand() % 256, rand() % 256, rand() % 256);
+}
+
+/*
+ * Walk the network list of c and generate a new hardware address for
+ * every device that has one configured.  Devices without a hardware
+ * address are left untouched.
+ */
+static void network_new_hwaddrs(struct lxc_container *c)
+{
+       struct lxc_list *iter;
+
+       lxc_list_for_each(iter, &c->lxc_conf->network) {
+               struct lxc_netdev *netdev = iter->elem;
+
+               if (!netdev->hwaddr)
+                       continue;
+               new_hwaddr(netdev->hwaddr);
+       }
+}
+
+/*
+ * Copy the fstab file of oldc, if it has one, into the directory of c
+ * (config_path/name/<basename>) and make c's configuration refer to the
+ * copy.
+ *
+ * Returns 0 on success or when oldc has no fstab, -1 on error.
+ */
+static int copy_fstab(struct lxc_container *oldc, struct lxc_container *c)
+{
+       char dst[MAXPATHLEN];
+       char *src = oldc->lxc_conf->fstab;
+       char *base;
+       int n;
+
+       // nothing to copy
+       if (!src)
+               return 0;
+
+       // base keeps its leading '/', so the format below adds none
+       base = rindex(src, '/');
+       if (!base)
+               return -1;
+
+       n = snprintf(dst, MAXPATHLEN, "%s/%s%s", c->config_path, c->name, base);
+       if (n < 0 || n >= MAXPATHLEN) {
+               ERROR("error printing new path for %s", src);
+               return -1;
+       }
+
+       if (file_exists(dst)) {
+               ERROR("error: fstab file %s exists", dst);
+               return -1;
+       }
+
+       if (copy_file(src, dst) < 0) {
+               ERROR("error: copying %s to %s", src, dst);
+               return -1;
+       }
+
+       free(c->lxc_conf->fstab);
+       c->lxc_conf->fstab = strdup(dst);
+       if (c->lxc_conf->fstab)
+               return 0;
+
+       ERROR("error: allocating pathname");
+       return -1;
+}
+
+/*
+ * Copy or snapshot the rootfs of c0 for the new container c, and record
+ * the new storage's source in c's lxc.rootfs path.
+ *
+ * newtype, bdevdata and newsize are passed through to bdev_copy().  A
+ * snapshot is requested when LXC_CLONE_SNAPSHOT is set in flags.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int copy_storage(struct lxc_container *c0, struct lxc_container *c,
+               const char *newtype, int flags, const char *bdevdata, unsigned long newsize)
+{
+       struct bdev *bdev;
+       char *newroot;
+
+       bdev = bdev_copy(c0->lxc_conf->rootfs.path, c0->name, c->name,
+                       c0->config_path, c->config_path, newtype, !!(flags & LXC_CLONE_SNAPSHOT),
+                       bdevdata, newsize);
+       if (!bdev) {
+               ERROR("error copying storage");
+               return -1;
+       }
+       // duplicate before freeing, so a failed strdup does not leave
+       // rootfs.path pointing at freed memory
+       newroot = strdup(bdev->src);
+       bdev_put(bdev);
+       if (!newroot)
+               return -1;
+       free(c->lxc_conf->rootfs.path);
+       c->lxc_conf->rootfs.path = newroot;
+       // here we could also update all lxc.mount.entries or even
+       // items in the lxc.mount fstab list.  As discussed on m-l,
+       // we could do either any source paths starting with the
+       // lxcpath/oldname, or simply anything which is not a virtual
+       // fs or a bind mount.
+       return 0;
+}
+
+/*
+ * Write c's name into etc/hostname inside its rootfs, unless
+ * LXC_CLONE_KEEPNAME is set in flags.
+ *
+ * The rootfs is mounted in a forked child that has unshared its mount
+ * namespace, so no explicit unmount is done here.  The parent returns the
+ * result of wait_for_pid() on that child.
+ *
+ * Returns 0 on success or when there is nothing to do, negative on error.
+ * Failing to open the hostname file is deliberately not an error.
+ */
+static int clone_update_rootfs(struct lxc_container *c, int flags)
+{
+       int ret;
+       char path[MAXPATHLEN];
+       struct bdev *bdev;
+       FILE *fout;
+       pid_t pid;
+
+       if (flags & LXC_CLONE_KEEPNAME)
+               return 0;
+
+       /* update hostname in rootfs */
+       /* we're going to mount, so run in a clean namespace to simplify cleanup */
+
+       pid = fork();
+       if (pid < 0)
+               return -1;
+       if (pid > 0)
+               return wait_for_pid(pid);
+
+       /* child: everything below ends in exit(), never in return */
+       if (unshare(CLONE_NEWNS) < 0) {
+               ERROR("error unsharing mounts");
+               exit(1);
+       }
+       bdev = bdev_init(c->lxc_conf->rootfs.path, c->lxc_conf->rootfs.mount, NULL);
+       if (!bdev)
+               exit(1);
+       if (bdev->ops->mount(bdev) < 0)
+               exit(1);
+       ret = snprintf(path, MAXPATHLEN, "%s/etc/hostname", bdev->dest);
+       if (ret < 0 || ret >= MAXPATHLEN)
+               exit(1);
+       if (!(fout = fopen(path, "w"))) {
+               SYSERROR("unable to open %s: ignoring\n", path);
+               exit(0);
+       }
+       if (fprintf(fout, "%s", c->name) < 0)
+               exit(1);
+       if (fclose(fout) < 0)
+               exit(1);
+       exit(0);
+}
+
+/*
+ * We want to support:
+sudo lxc-clone -o o1 -n n1 -s -L|-fssize fssize -v|--vgname vgname \
+        -p|--lvprefix lvprefix -t|--fstype fstype  -B backingstore
+
+-s [ implies overlayfs]
+-s -B overlayfs
+-s -B aufs
+
+only rootfs gets converted (copied/snapshotted) on clone.
+*/
+
+/*
+ * Create the directory that is to hold the file path: everything before
+ * the last '/', one level only, mode 0755.  path is cut at that '/' while
+ * mkdir() runs and restored before returning.
+ *
+ * Returns 0 if the directory was created or mkdir() reported EEXIST, -1
+ * otherwise (including when path contains no '/').
+ */
+static int create_file_dirname(char *path)
+{
+       char *p = rindex(path, '/');
+       int ret;
+
+       if (!p)
+               return -1;
+       *p = '\0';
+       ret = mkdir(path, 0755);
+       if (ret < 0) {
+               // an existing directory is fine: do not report it as a failure
+               if (errno == EEXIST)
+                       ret = 0;
+               else
+                       SYSERROR("creating container path %s\n", path);
+       }
+       *p = '/';
+       return ret;
+}
+
+/*
+ * Clone container c.
+ *
+ * newname is the clone's name (c's own name when NULL) and lxcpath the
+ * lxcpath to create it in (c's config path when NULL).  The combination
+ * must not name an existing container config.  flags is a set of
+ * LXC_CLONE_* flags; bdevtype, bdevdata and newsize are handed to the
+ * storage copy.
+ *
+ * Steps: write c's configuration to the new location and replace the old
+ * name and path in it, create the rootfs directory, instantiate the new
+ * container, optionally copy hooks, copy the fstab, optionally generate new
+ * mac addresses, copy or snapshot the rootfs, save the new configuration
+ * and update the hostname in the new rootfs.
+ *
+ * c's privlock is held for the duration.  c must not be running.
+ *
+ * Returns the new container, or NULL on error.
+ */
+struct lxc_container *lxcapi_clone(struct lxc_container *c, const char *newname,
+               const char *lxcpath, enum lxc_clone_flags flags,
+               const char *bdevtype, const char *bdevdata, unsigned long newsize)
+{
+       struct lxc_container *c2 = NULL;
+       char newpath[MAXPATHLEN];
+       int ret;
+       const char *n, *l;
+       FILE *fout;
+
+       if (!c || !c->is_defined(c))
+               return NULL;
+
+       if (lxclock(c->privlock, 0))
+               return NULL;
+
+       if (c->is_running(c)) {
+               ERROR("error: Original container (%s) is running", c->name);
+               goto out;
+       }
+
+       // Make sure the container doesn't yet exist.
+       n = newname ? newname : c->name;
+       l = lxcpath ? lxcpath : c->get_config_path(c);
+       ret = snprintf(newpath, MAXPATHLEN, "%s/%s/config", l, n);
+       if (ret < 0  || ret >= MAXPATHLEN) {
+               SYSERROR("clone: failed making config pathname");
+               goto out;
+       }
+       if (file_exists(newpath)) {
+               ERROR("error: clone: %s exists", newpath);
+               goto out;
+       }
+
+       if (create_file_dirname(newpath) < 0) {
+               ERROR("Error creating container dir for %s", newpath);
+               goto out;
+       }
+
+       // copy the configuration, tweak it as needed,
+       fout = fopen(newpath, "w");
+       if (!fout) {
+               SYSERROR("open %s", newpath);
+               goto out;
+       }
+       write_config(fout, c->lxc_conf);
+       fclose(fout);
+
+       if (update_name_and_paths(newpath, c, n, l) < 0) {
+               ERROR("Error updating name in cloned config");
+               goto out;
+       }
+
+       ret = snprintf(newpath, MAXPATHLEN, "%s/%s/rootfs", l, n);
+       if (ret < 0 || ret >= MAXPATHLEN) {
+               ERROR("clone: failed making rootfs pathname");
+               goto out;
+       }
+       if (mkdir(newpath, 0755) < 0) {
+               SYSERROR("error creating %s", newpath);
+               goto out;
+       }
+
+       c2 = lxc_container_new(n, l);
+       if (!c2) {
+               ERROR("clone: failed to create new container (%s %s)", n, l);
+               goto out;
+       }
+
+       // copy hooks if requested
+       if (flags & LXC_CLONE_COPYHOOKS) {
+               ret = copyhooks(c, c2);
+               if (ret < 0) {
+                       ERROR("error copying hooks");
+                       goto out_destroy;
+               }
+       }
+
+       if (copy_fstab(c, c2) < 0) {
+               ERROR("error copying fstab");
+               goto out_destroy;
+       }
+
+       // update macaddrs
+       if (!(flags & LXC_CLONE_KEEPMACADDR))
+               network_new_hwaddrs(c2);
+
+       // copy/snapshot rootfs's
+       ret = copy_storage(c, c2, bdevtype, flags, bdevdata, newsize);
+       if (ret < 0)
+               goto out_destroy;
+
+       if (!c2->save_config(c2, NULL))
+               goto out_destroy;
+
+       // as in the original, the clone is not destroyed when only the
+       // hostname update fails; the reference is still dropped
+       if (clone_update_rootfs(c2, flags) < 0)
+               goto out;
+
+       // TODO: update c's lxc.snapshot = count
+       lxcunlock(c->privlock);
+       return c2;
+
+out_destroy:
+       c2->destroy(c2);
+out:
+       lxcunlock(c->privlock);
+       // the only place the reference to c2 is dropped on failure
+       if (c2)
+               lxc_container_put(c2);
+
+       return NULL;
+}
+
 struct lxc_container *lxc_container_new(const char *name, const char 
*configpath)
 {
        struct lxc_container *c;
@@ -1101,6 +1561,7 @@ struct lxc_container *lxc_container_new(const char *name, const char *configpath
        c->set_cgroup_item = lxcapi_set_cgroup_item;
        c->get_config_path = lxcapi_get_config_path;
        c->set_config_path = lxcapi_set_config_path;
+       c->clone = lxcapi_clone;
 
        /* we'll allow the caller to update these later */
        if (lxc_log_init(NULL, "none", NULL, "lxc_container", 0)) {
diff --git a/src/lxc/lxccontainer.h b/src/lxc/lxccontainer.h
index de9854c..3bebdf3 100644
--- a/src/lxc/lxccontainer.h
+++ b/src/lxc/lxccontainer.h
@@ -1,9 +1,19 @@
+#ifndef __LXC_CONTAINER_H
+#define __LXC_CONTAINER_H
 #include "lxclock.h"
 #include <stdlib.h>
 #include <malloc.h>
 
 #include <stdbool.h>
 
+/*
+ * Flags for lxc_container->clone().  They are tested with '&', so each
+ * flag needs a bit of its own.
+ */
+enum lxc_clone_flags {
+       LXC_CLONE_KEEPNAME      = 1 << 0, /* don't edit the rootfs to change the hostname */
+       LXC_CLONE_COPYHOOKS     = 1 << 1, /* copy all hooks into the container dir */
+       LXC_CLONE_KEEPMACADDR   = 1 << 2, /* don't change the mac address on network interfaces */
+       LXC_CLONE_SNAPSHOT      = 1 << 3, /* snapshot the original filesystem(s) */
+       LXC_CLONE_MAXFLAGS      = 1 << 4, /* first bit not used by a flag */
+};
+
 struct lxc_container {
        // private fields
        char *name;
@@ -72,6 +82,33 @@ struct lxc_container {
        const char *(*get_config_path)(struct lxc_container *c);
        bool (*set_config_path)(struct lxc_container *c, const char *path);
 
+       /*
+        * Clone a container.
+        *
+        * @c: the original container
+        * @newname: new name for the container.  If NULL, the same name is
+        *  used, and a new lxcpath MUST be specified.
+        * @lxcpath: lxcpath in which to create the new container.  If NULL,
+        *  then the original container's lxcpath will be used.  (Should we
+        *  use the default instead?)
+        * @flags: additional flags to modify cloning behavior.
+        *  LXC_CLONE_KEEPNAME: don't edit the rootfs to change the hostname.
+        *  LXC_CLONE_COPYHOOKS: copy all hooks into the container dir
+        *  LXC_CLONE_KEEPMACADDR: don't change the mac address on network
+        *   interfaces.
+        *  LXC_CLONE_SNAPSHOT: snapshot the original filesystem(s).  If
+        *   @bdevtype was not specified, then do so with the native bdevtype
+        *   if possible, else use an overlayfs.
+        * @bdevtype: optionally force the cloned bdevtype to a specified
+        *  plugin.  By default the original is used (subject to snapshot
+        *  requirements).
+        * @bdevdata: information about how to create the new storage (i.e.
+        *  fstype and fsdata)
+        * @newsize: in case of a block device backing store, an optional
+        *  size.  If 0, then the original backing store's size will be used
+        *  if possible.  Note this only applies to the rootfs.  For any other
+        *  filesystems, the original size will be duplicated.
+        *
+        * Returns the new container, or NULL on error.
+        */
+       struct lxc_container *(*clone)(struct lxc_container *c, const char *newname,
+               const char *lxcpath, enum lxc_clone_flags flags, const char *bdevtype,
+               const char *bdevdata, unsigned long newsize);
+
+
 #if 0
        bool (*commit_cgroups)(struct lxc_container *c);
        bool (*reread_cgroups)(struct lxc_container *c);
@@ -93,3 +130,4 @@ const char *lxc_get_version(void);
 char ** lxc_get_valid_keys();
 char ** lxc_get_valid_values(char *key);
 #endif
+#endif
diff --git a/src/lxc/utils.c b/src/lxc/utils.c
index e07ca7b..c4cd6a2 100644
--- a/src/lxc/utils.c
+++ b/src/lxc/utils.c
@@ -34,6 +34,8 @@
 #include <dirent.h>
 #include <fcntl.h>
 #include <libgen.h>
+#include <sys/types.h>
+#include <sys/wait.h>
 
 #include "log.h"
 
@@ -188,3 +190,21 @@ out:
                fclose(fin);
        return default_lxcpath;
 }
+
+/*
+ * Wait for the child pid to terminate.
+ *
+ * Returns 0 if the child exited normally with status 0.  Returns -1 if
+ * waitpid() failed, or if the child exited with a non-zero status or did
+ * not exit normally.
+ */
+int wait_for_pid(pid_t pid)
+{
+       int status, ret;
+
+again:
+       ret = waitpid(pid, &status, 0);
+       if (ret == -1) {
+               // errno values are positive: test EINTR, not -EINTR, so an
+               // interrupted wait is actually retried
+               if (errno == EINTR)
+                       goto again;
+               return -1;
+       }
+       if (ret != pid)
+               goto again;
+       if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+               return -1;
+       return 0;
+}
diff --git a/src/lxc/utils.h b/src/lxc/utils.h
index 8954503..0a27903 100644
--- a/src/lxc/utils.h
+++ b/src/lxc/utils.h
@@ -32,4 +32,9 @@ extern int mkdir_p(const char *dir, mode_t mode);
  */
 extern const char *default_lxc_path(void);
 
+/*
+ * wait on a child we forked
+ */
+extern int wait_for_pid(pid_t pid);
+
 #endif
diff --git a/src/tests/Makefile.am b/src/tests/Makefile.am
index 4cbeeb3..c0ce648 100644
--- a/src/tests/Makefile.am
+++ b/src/tests/Makefile.am
@@ -13,6 +13,7 @@ lxc_test_get_item_SOURCES = get_item.c
 lxc_test_getkeys_SOURCES = getkeys.c
 lxc_test_lxcpath_SOURCES = lxcpath.c
 lxc_test_cgpath_SOURCES = cgpath.c
+lxc_test_clonetest_SOURCES = clonetest.c
 
 AM_CFLAGS=-I$(top_srcdir)/src \
        -DLXCROOTFSMOUNT=\"$(LXCROOTFSMOUNT)\" \
@@ -23,7 +24,7 @@ AM_CFLAGS=-I$(top_srcdir)/src \
 bin_PROGRAMS = lxc-test-containertests lxc-test-locktests lxc-test-startone \
        lxc-test-destroytest lxc-test-saveconfig lxc-test-createtest \
        lxc-test-shutdowntest lxc-test-get_item lxc-test-getkeys 
lxc-test-lxcpath \
-       lxc-test-cgpath
+       lxc-test-cgpath lxc-test-clonetest
 
 endif
 
@@ -38,4 +39,5 @@ EXTRA_DIST = \
        lxcpath.c \
        saveconfig.c \
        shutdowntest.c \
+       clonetest.c \
        startone.c
diff --git a/src/tests/clonetest.c b/src/tests/clonetest.c
new file mode 100644
index 0000000..fcb5ea6
--- /dev/null
+++ b/src/tests/clonetest.c
@@ -0,0 +1,178 @@
+/* liblxcapi
+ *
+ * Copyright © 2012 Serge Hallyn <serge.hal...@ubuntu.com>.
+ * Copyright © 2012 Canonical Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include "../lxc/lxccontainer.h"
+
+#include <unistd.h>
+#include <signal.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#define MYNAME "clonetest1"
+#define MYNAME2 "clonetest2"
+
+/* Exercise the clone API: dir copy, lvm copy and snapshot (only if the
+ * user pre-created clonetestlvm1), then overlayfs snapshots.  Exits 0 on
+ * success or when clonetestlvm1 is not defined, 1 on any failure. */
+int main(int argc, char *argv[])
+{
+       struct lxc_container *c = NULL, *c2 = NULL, *c3 = NULL;
+       int ret = 1;
+
+       c = lxc_container_new(MYNAME, NULL);
+       c2 = lxc_container_new(MYNAME2, NULL);
+       if (c) {
+               c->destroy(c); /* remove leftovers of an earlier run */
+               lxc_container_put(c);
+               c = NULL;
+       }
+       if (c2) {
+               c2->destroy(c2);
+               lxc_container_put(c2);
+               c2 = NULL;
+       }
+
+       if ((c = lxc_container_new(MYNAME, NULL)) == NULL) {
+               fprintf(stderr, "%d: error opening lxc_container %s\n", __LINE__, MYNAME);
+               goto out;
+       }
+       c->save_config(c, NULL);
+       if (!c->createl(c, "ubuntu", NULL)) {
+               fprintf(stderr, "%d: failed to create a container\n", __LINE__);
+               goto out;
+       }
+       c->load_config(c, NULL);
+
+       if (!c->is_defined(c)) {
+               fprintf(stderr, "%d: %s thought it was not defined\n", __LINE__, MYNAME);
+               goto out;
+       }
+
+       c2 = c->clone(c, MYNAME2, NULL, 0, NULL, NULL, 0);
+       if (!c2) {
+               fprintf(stderr, "%d: %s clone returned NULL\n", __LINE__, MYNAME2);
+               goto out;
+       }
+
+       if (!c2->is_defined(c2)) { /* query the clone itself, not its parent */
+               fprintf(stderr, "%d: %s not defined after clone\n", __LINE__, MYNAME2);
+               goto out;
+       }
+
+       fprintf(stderr, "directory backing store tests passed\n");
+
+       // Now test with lvm.  Only done if clonetestlvm1 exists - the user
+       // has to set this up in advance.  The dir-backed containers are not
+       // destroyed here: MYNAME is reused below for the overlayfs tests.
+       lxc_container_put(c2);
+       lxc_container_put(c);
+       c = NULL;
+
+       c2 = lxc_container_new("clonetestlvm2", NULL);
+       if (c2) {
+               if (c2->is_defined(c2))
+                       c2->destroy(c2);
+               lxc_container_put(c2);
+       }
+       c2 = lxc_container_new("clonetest-o1", NULL);
+       if (c2) {
+               if (c2->is_defined(c2))
+                       c2->destroy(c2);
+               lxc_container_put(c2);
+       }
+       c2 = lxc_container_new("clonetest-o2", NULL);
+       if (c2) {
+               if (c2->is_defined(c2))
+                       c2->destroy(c2);
+               lxc_container_put(c2);
+       }
+       c2 = NULL;
+
+       // lvm-copied
+       c = lxc_container_new("clonetestlvm1", NULL);
+       if (!c) {
+               fprintf(stderr, "failed loading clonetestlvm1\n");
+               goto out;
+       }
+       if (!c->is_defined(c)) {
+               fprintf(stderr, "clonetestlvm1 does not exist, skipping lvm tests\n");
+               ret = 0; /* NOTE(review): this also skips the overlayfs tests below */
+               goto out;
+       }
+
+       if ((c2 = c->clone(c, "clonetestlvm2", NULL, 0, NULL, NULL, 0)) == NULL) {
+               fprintf(stderr, "lvm clone failed\n");
+               goto out;
+       }
+
+       lxc_container_put(c2);
+
+       // lvm-snapshot
+       c2 = lxc_container_new("clonetestlvm3", NULL);
+       if (c2) {
+               if (c2->is_defined(c2))
+                       c2->destroy(c2);
+               lxc_container_put(c2);
+               c2 = NULL;
+       }
+
+       if ((c2 = c->clone(c, "clonetestlvm3", NULL, LXC_CLONE_SNAPSHOT, NULL, NULL, 0)) == NULL) {
+               fprintf(stderr, "lvm clone failed\n");
+               goto out;
+       }
+       lxc_container_put(c2);
+       lxc_container_put(c);
+       c = c2 = NULL;
+
+       if ((c = lxc_container_new(MYNAME, NULL)) == NULL) {
+               fprintf(stderr, "error opening original container for overlay test\n");
+               goto out;
+       }
+
+       // Now create an overlayfs clone of a dir-backed container
+       if ((c2 = c->clone(c, "clonetest-o1", NULL, LXC_CLONE_SNAPSHOT, "overlayfs", NULL, 0)) == NULL) {
+               fprintf(stderr, "overlayfs clone of dir failed\n");
+               goto out;
+       }
+
+       // Now create an overlayfs clone of the overlayfs clone
+       if ((c3 = c2->clone(c2, "clonetest-o2", NULL, LXC_CLONE_SNAPSHOT, "overlayfs", NULL, 0)) == NULL) {
+               fprintf(stderr, "overlayfs clone of overlayfs failed\n");
+               goto out;
+       }
+
+       fprintf(stderr, "all clone tests passed for %s\n", c->name);
+       ret = 0;
+
+out:
+       if (c3) {
+               lxc_container_put(c3);
+       }
+       if (c2) {
+               //c2->destroy(c2); // keep around to verify manually
+               lxc_container_put(c2);
+       }
+       if (c) {
+               //c->destroy(c);
+               lxc_container_put(c);
+       }
+       exit(ret);
+}
-- 
1.8.1.2


------------------------------------------------------------------------------
Precog is a next-generation analytics platform capable of advanced
analytics on semi-structured data. The platform includes APIs for building
apps and a phenomenal toolset for data science. Developers can use
our toolset for easy data analysis & visualization. Get a free account!
http://www2.precog.com/precogplatform/slashdotnewsletter
_______________________________________________
Lxc-devel mailing list
Lxc-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/lxc-devel

Reply via email to