Support checkpoint and restore of both private and shared
hugepage-backed mappings established via mmap(MAP_HUGETLB).  Introduce
APIs for checkpointing and restoring individual huge pages, to be used
by the sysv SHM_HUGETLB c/r code.

Signed-off-by: Nathan Lynch <[email protected]>
---
 include/linux/checkpoint.h     |    4 +-
 include/linux/checkpoint_hdr.h |   16 +++
 include/linux/hugetlb.h        |   11 ++
 mm/checkpoint.c                |   13 ++
 mm/hugetlb.c                   |  257 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 300 insertions(+), 1 deletions(-)
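
For reference, a minimal userspace sketch of the kind of mapping this
patch makes checkpointable, assuming a 2MB default hstate and huge pages
already reserved (e.g. via /proc/sys/vm/nr_hugepages); illustrative only,
not part of the patch:

#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL << 20;	/* one default-size huge page (assumed 2MB) */
	char *p;

	/* private anonymous hugepage mapping; MAP_SHARED works the same way */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED)
		return EXIT_FAILURE;

	/* fault the huge page in so the checkpoint has contents to dump */
	memset(p, 0xaa, len);

	/* a checkpoint of this task taken here now includes the mapping */

	munmap(p, len);
	return EXIT_SUCCESS;
}

Shared mappings of the same hugetlbfs inode have their contents dumped
only once: the first VMA seen for an inode is written as CKPT_VMA_HUGETLB
with contents, later ones as CKPT_VMA_HUGETLB_SKIP.  The vaddr/index
union in ckpt_hdr_hpage is meant to let the same record identify a page
either by virtual address (the mmap path here) or by file index (the
planned sysv SHM_HUGETLB user).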

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 4e25042..d9a65a7 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -299,12 +299,14 @@ extern unsigned long generic_vma_restore(struct mm_struct *mm,
 extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
                               struct file *file, struct ckpt_hdr_vma *h);
 
+extern int hugetlb_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+                          struct ckpt_hdr_vma *hdr);
+
 extern int checkpoint_memory_contents(struct ckpt_ctx *ctx,
                                      struct vm_area_struct *vma,
                                      struct inode *inode);
 extern int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode);
 
-
 #define CKPT_VMA_NOT_SUPPORTED                                         \
        (VM_IO | VM_HUGETLB | VM_NONLINEAR | VM_PFNMAP |                \
         VM_RESERVED | VM_HUGETLB | VM_NONLINEAR |      \
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index f4f9577..bda5d74 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -151,6 +151,8 @@ enum {
 #define CKPT_HDR_VMA CKPT_HDR_VMA
        CKPT_HDR_PGARR,
 #define CKPT_HDR_PGARR CKPT_HDR_PGARR
+       CKPT_HDR_HPAGE,
+#define CKPT_HDR_HPAGE CKPT_HDR_HPAGE
        CKPT_HDR_MM_CONTEXT,
 #define CKPT_HDR_MM_CONTEXT CKPT_HDR_MM_CONTEXT
 
@@ -881,6 +883,10 @@ enum vma_type {
 #define CKPT_VMA_SHM_IPC CKPT_VMA_SHM_IPC
        CKPT_VMA_SHM_IPC_SKIP,  /* shared sysvipc (skip contents) */
 #define CKPT_VMA_SHM_IPC_SKIP CKPT_VMA_SHM_IPC_SKIP
+       CKPT_VMA_HUGETLB,
+#define CKPT_VMA_HUGETLB CKPT_VMA_HUGETLB
+       CKPT_VMA_HUGETLB_SKIP,
+#define CKPT_VMA_HUGETLB_SKIP CKPT_VMA_HUGETLB_SKIP
        CKPT_VMA_MAX,
 #define CKPT_VMA_MAX CKPT_VMA_MAX
 };
@@ -907,6 +913,16 @@ struct ckpt_hdr_pgarr {
        __u64 nr_pages;         /* number of pages to saved */
 } __attribute__((aligned(8)));
 
+/* huge page */
+struct ckpt_hdr_hpage {
+       struct ckpt_hdr h;
+       union {
+               __u64 vaddr;
+               __u64 index;
+       };
+       __u16 shift;
+} __attribute__((aligned(8)));
+
 /* signals */
 struct ckpt_sigset {
        __u8 sigset[CKPT_ARCH_NSIG / 8];
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 78b4bc6..3808c04 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -47,6 +47,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
                                                struct vm_area_struct *vma,
                                                int acctflags);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
+int hugetlb_restore_page(struct ckpt_ctx *ctx, struct page *page);
+int hugetlb_checkpoint_page(struct ckpt_ctx *ctx, struct page *page);
 
 extern unsigned long hugepages_treat_as_movable;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -323,6 +325,15 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
 {
        return 1;
 }
+
+static inline int hugetlb_restore_page(struct ckpt_ctx *ctx, struct page *page)
+{
+       return -ENOSYS;
+}
+static inline int hugetlb_checkpoint_page(struct ckpt_ctx *ctx, struct page *page)
+{
+       return -ENOSYS;
+}
 #endif
 
 #endif /* _LINUX_HUGETLB_H */
diff --git a/mm/checkpoint.c b/mm/checkpoint.c
index 70300e8..8d9a168 100644
--- a/mm/checkpoint.c
+++ b/mm/checkpoint.c
@@ -1021,6 +1021,8 @@ static unsigned long calc_map_flags_bits(unsigned long orig_vm_flags)
                vm_flags |= MAP_PRIVATE;
        if (orig_vm_flags & VM_NORESERVE)
                vm_flags |= MAP_NORESERVE;
+       if (orig_vm_flags & VM_HUGETLB)
+               vm_flags |= MAP_HUGETLB;
 
        return vm_flags;
 }
@@ -1180,6 +1182,17 @@ static struct restore_vma_ops restore_vma_ops[] = {
                .vma_type = CKPT_VMA_SHM_IPC_SKIP,
                .restore = ipcshm_restore,
        },
+       /* hugetlb */
+       {
+               .vma_name = "HUGETLB",
+               .vma_type = CKPT_VMA_HUGETLB,
+               .restore = hugetlb_restore,
+       },
+       {
+               .vma_name = "HUGETLB (SKIP)",
+               .vma_type = CKPT_VMA_HUGETLB_SKIP,
+               .restore = hugetlb_restore,
+       },
 };
 
 /**
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6034dc9..3b5942c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -8,7 +8,10 @@
 #include <linux/mm.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
+#include <linux/checkpoint.h>
+#include <linux/file.h>
 #include <linux/highmem.h>
+#include <linux/mman.h>
 #include <linux/mmu_notifier.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
@@ -2057,10 +2060,264 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return 0;
 }
 
+#define ckpt_debug_hpage_hdr(hdr) \
+       ckpt_debug("vaddr=%#llx shift=%hu\n", (hdr)->vaddr, (hdr)->shift)
+
+static void ckpt_hdr_hpage_init(struct ckpt_hdr_hpage *hdr, unsigned long shift)
+{
+       hdr->h.type = CKPT_HDR_HPAGE;
+       hdr->h.len = sizeof(struct ckpt_hdr_hpage);
+       hdr->shift = shift;
+       hdr->vaddr = 0; /* to be filled in by user */
+}
+
+int hugetlb_checkpoint_page(struct ckpt_ctx *ctx, struct page *head)
+{
+       unsigned int nr_pages;
+       struct page *page;
+       int ret = 0;
+       int i;
+
+       nr_pages = pages_per_huge_page(page_hstate(head));
+       page = head;
+
+       for (i = 0; i < nr_pages; i++) {
+               void *ptr;
+
+               cond_resched();
+
+               ptr = kmap_atomic(page, KM_USER1);
+               copy_page(ctx->scratch_page, ptr);
+               kunmap_atomic(ptr, KM_USER1);
+               ret = ckpt_kwrite(ctx, ctx->scratch_page, PAGE_SIZE);
+               if (ret < 0)
+                       break;
+
+               page = mem_map_next(page, head, i + 1);
+       }
+
+       return ret;
+}
+
+#define CKPT_HDR_HPAGE_LAST ~(0UL)
+static bool ckpt_hdr_hpage_last(const struct ckpt_hdr_hpage *hdr)
+{
+       return hdr->vaddr == CKPT_HDR_HPAGE_LAST;
+}
+
+static int hugetlb_dump_contents(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
+{
+       struct ckpt_hdr_hpage hdr;
+       unsigned long pageshift;
+       unsigned long pagesize;
+       unsigned long addr;
+       int ret;
+
+       pageshift = huge_page_shift(hstate_vma(vma));
+       pagesize = vma_kernel_pagesize(vma);
+
+       ckpt_hdr_hpage_init(&hdr, pageshift);
+
+       for (addr = vma->vm_start; addr < vma->vm_end; addr += pagesize) {
+               struct page *page = NULL;
+
+               down_read(&vma->vm_mm->mmap_sem);
+               ret = __get_user_pages(ctx->tsk, vma->vm_mm,
+                                      addr, 1, FOLL_DUMP | FOLL_GET,
+                                      &page, NULL);
+               /* FOLL_DUMP gives -EFAULT for holes */
+               if (ret == -EFAULT)
+                       ret = 0;
+               up_read(&vma->vm_mm->mmap_sem);
+
+               if (ret < 0)
+                       goto release;
+               if (!page)
+                       continue;
+
+               hdr.vaddr = addr;
+
+               ckpt_debug_hpage_hdr(&hdr);
+
+               ret = ckpt_write_obj(ctx, &hdr.h);
+               if (ret < 0)
+                       goto release;
+
+               ret = hugetlb_checkpoint_page(ctx, page);
+release:
+               if (page)
+                       page_cache_release(page);
+               if (ret < 0)
+                       break;
+       }
+
+       if (ret < 0)
+               goto err;
+       hdr.vaddr = CKPT_HDR_HPAGE_LAST;
+       ret = ckpt_write_obj(ctx, &hdr.h);
+err:
+       return ret;
+}
+
+static int hugetlb_vm_op_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
+{
+       enum vma_type vma_type;
+       int ino_objref;
+       int ret, first;
+
+       BUG_ON(!(vma->vm_flags & VM_HUGETLB));
+       BUG_ON(!vma->vm_file);
+
+       ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
+       if (ret < 0)
+               return ret;
+
+       ino_objref = ckpt_obj_lookup_add(ctx, vma->vm_file->f_dentry->d_inode,
+                                        CKPT_OBJ_INODE, &first);
+       if (ino_objref < 0)
+               return ino_objref;
+
+       vma_type = first ? CKPT_VMA_HUGETLB : CKPT_VMA_HUGETLB_SKIP;
+
+       ret = generic_vma_checkpoint(ctx, vma, vma_type, 0, ino_objref);
+       if (ret)
+               return ret;
+
+       if (vma_type == CKPT_VMA_HUGETLB)
+               ret = hugetlb_dump_contents(ctx, vma);
+
+       return ret;
+}
+
+int hugetlb_restore_page(struct ckpt_ctx *ctx, struct page *head)
+{
+       unsigned int nr_pages;
+       struct page *page;
+       int ret = 0;
+       int i;
+
+       nr_pages = pages_per_huge_page(page_hstate(head));
+       page = head;
+
+       for (i = 0; i < nr_pages; i++) {
+               void *ptr;
+
+               ret = ckpt_kread(ctx, ctx->scratch_page, PAGE_SIZE);
+               if (ret < 0)
+                       break;
+
+               cond_resched();
+
+               ptr = kmap_atomic(page, KM_USER1);
+               copy_page(ptr, ctx->scratch_page);
+               kunmap_atomic(ptr, KM_USER1);
+
+               page = mem_map_next(page, head, i + 1);
+       }
+
+       return ret;
+}
+
+static int hugetlb_restore_contents(struct ckpt_ctx *ctx)
+{
+       int ret = 0;
+
+       while (1) {
+               struct ckpt_hdr_hpage *hdr;
+               unsigned long addr;
+               struct page *page;
+               bool last;
+
+               hdr = ckpt_read_obj_type(ctx, sizeof(*hdr), CKPT_HDR_HPAGE);
+               if (IS_ERR(hdr)) {
+                       ret = PTR_ERR(hdr);
+                       break;
+               }
+
+               ckpt_debug_hpage_hdr(hdr);
+               last = ckpt_hdr_hpage_last(hdr);
+               addr = (unsigned long)hdr->vaddr;
+
+               ckpt_hdr_put(ctx, hdr);
+
+               if (last)
+                       break;
+
+               down_read(&current->mm->mmap_sem);
+               ret = get_user_pages(current, current->mm, addr, 1, 1, 1,
+                                    &page, NULL);
+               up_read(&current->mm->mmap_sem);
+
+               if (ret < 0)
+                       break;
+
+               ret = hugetlb_restore_page(ctx, page);
+
+               page_cache_release(page);
+
+               if (ret < 0)
+                       break;
+       }
+
+       return ret;
+}
+
+int hugetlb_restore(struct ckpt_ctx *ctx, struct mm_struct *mm, struct ckpt_hdr_vma *hdr)
+{
+       unsigned long addr;
+       struct file *file;
+       int ret = 0;
+
+       if (!(hdr->vm_flags & (VM_HUGETLB)))
+               return -EINVAL;
+
+       file = ckpt_obj_try_fetch(ctx, hdr->ino_objref, CKPT_OBJ_FILE);
+       if (PTR_ERR(file) == -EINVAL)
+               file = NULL;
+       if (IS_ERR(file))
+               return PTR_ERR(file);
+
+       /* To do: don't assume same default_hstate on source and destination */
+       if (!file) {
+               struct user_struct *user = NULL;
+               unsigned long len;
+
+               if (hdr->vma_type != CKPT_VMA_HUGETLB)
+                       return -EINVAL;
+
+               /* see sys_mmap_pgoff */
+               len = hdr->vm_end - hdr->vm_start;
+               len = ALIGN(len, huge_page_size(&default_hstate));
+               file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+                                         &user, HUGETLB_ANONHUGE_INODE);
+               if (IS_ERR(file))
+                       return PTR_ERR(file);
+               ret = ckpt_obj_insert(ctx, file, hdr->ino_objref, CKPT_OBJ_FILE);
+               if (ret < 0)
+                       goto out;
+       } else {
+               if (hdr->vma_type != CKPT_VMA_HUGETLB_SKIP)
+                       return -EINVAL;
+               get_file(file);
+       }
+
+       addr = generic_vma_restore(mm, file, hdr);
+       if (IS_ERR((void *)addr))
+               ret = PTR_ERR((void *)addr);
+       else if (hdr->vma_type == CKPT_VMA_HUGETLB)
+               ret = hugetlb_restore_contents(ctx);
+out:
+       fput(file);
+       return ret;
+}
+
 const struct vm_operations_struct hugetlb_vm_ops = {
        .fault = hugetlb_vm_op_fault,
        .open = hugetlb_vm_op_open,
        .close = hugetlb_vm_op_close,
+#ifdef CONFIG_CHECKPOINT
+       .checkpoint = hugetlb_vm_op_checkpoint,
+#endif
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
-- 
1.7.2.2
