On 09/14/2010 04:02 PM, Nathan Lynch wrote:
> Large page-backed shm regions require special handling, especially
> during restart.  The association of a large page with a shm region's
> inode can occur only in the context of a process causing a fault with
> the region mapped into its mm.  In order to restore that association,
> temporarily shmat-attach the restored SHM_HUGETLB region to the
> restarting process's mm, using the just-restored ipc namespace
> instead of the current one (the nsproxy switch hasn't occurred yet).
> 
> Since the temporary shmat of the region during restart causes some of
> the shm attributes to be updated, re-restore them from the ipc_shm
> checkpoint header after unmapping.

Would it work to just move the original call to load_ipc_shm_hdr()
further down in restore_ipc_shm(), especially since the mutex is
no longer needed? That way you wouldn't need to re-restore them.

I'm not too familiar with the HUGETLB code otherwise, so I'm hoping that
others will review those parts while I find time to study it ...

Thanks,

Oren.

> 
> Signed-off-by: Nathan Lynch <[email protected]>
> ---
>  ipc/checkpoint_shm.c |  154 ++++++++++++++++++++++++++++++++++++++++++++++----
>  1 files changed, 142 insertions(+), 12 deletions(-)
> 
> diff --git a/ipc/checkpoint_shm.c b/ipc/checkpoint_shm.c
> index 69ba35a..7f9d701 100644
> --- a/ipc/checkpoint_shm.c
> +++ b/ipc/checkpoint_shm.c
> @@ -32,6 +32,69 @@
>   * ipc checkpoint
>   */
>  
> +#define CKPT_HDR_HPAGE_LAST ~(0UL)
> +static bool ckpt_hdr_hpage_last(const struct ckpt_hdr_hpage *hdr)
> +{
> +     return hdr->index == CKPT_HDR_HPAGE_LAST;
> +}
> +
> +static void ckpt_hdr_hpage_init(struct ckpt_hdr_hpage *hdr, unsigned long 
> shift)
> +{
> +     hdr->h.type = CKPT_HDR_HPAGE;
> +     hdr->h.len = sizeof(struct ckpt_hdr_hpage);
> +     hdr->shift = shift;
> +     hdr->index = 0; /* to be filled in by user */
> +}
> +
> +static int shm_hugetlb_checkpoint_contents(struct ckpt_ctx *ctx, struct file 
> *filp)
> +{
> +     struct hstate *h = hstate_file(filp);
> +     struct address_space *mapping = filp->f_mapping;
> +     struct inode *inode = mapping->host;
> +     struct ckpt_hdr_hpage hdr;
> +     unsigned long end_index;
> +     unsigned long index;
> +     ssize_t retval = 0;
> +     loff_t isize;
> +
> +     isize = i_size_read(inode);
> +     if (isize == 0)
> +             goto out;
> +
> +     end_index = (isize - 1) >> huge_page_shift(h);
> +
> +     ckpt_hdr_hpage_init(&hdr, huge_page_shift(h));
> +
> +     for (index = 0; index < end_index + 1; index++) {
> +             struct page *page;
> +
> +             page = find_get_page(mapping, index);
> +
> +             /* skip holes */
> +             if (!page)
> +                     continue;
> +
> +             hdr.index = index;
> +
> +             retval = ckpt_write_obj(ctx, &hdr.h);
> +             if (retval < 0)
> +                     goto release;
> +
> +             retval = hugetlb_checkpoint_page(ctx, page);
> +release:
> +             page_cache_release(page);
> +             if (retval < 0)
> +                     break;
> +     }
> +
> +     if (retval < 0)
> +             goto out;
> +     hdr.index = CKPT_HDR_HPAGE_LAST;
> +     retval = ckpt_write_obj(ctx, &hdr.h);
> +out:
> +     return retval;
> +}
> +
>  /* called with the msgids->rw_mutex is read-held */
>  static int fill_ipc_shm_hdr(struct ckpt_ctx *ctx,
>                           struct ckpt_hdr_ipc_shm *h,
> @@ -59,10 +122,8 @@ static int fill_ipc_shm_hdr(struct ckpt_ctx *ctx,
>  
>       h->flags = 0;
>  
> -     /* check if shm was setup with SHM_HUGETLB (unsupported yet) */
>       if (is_file_hugepages(shp->shm_file)) {
> -             pr_warning("c/r: unsupported SHM_HUGETLB\n");
> -             ret = -ENOSYS;
> +             h->flags |= SHM_HUGETLB;
>       } else {
>               struct shmem_inode_info *info;
>  
> @@ -117,7 +178,10 @@ int checkpoint_ipc_shm(int id, void *p, void *data)
>       if (ret < 0)
>               goto out;
>  
> -     ret = checkpoint_memory_contents(ctx, NULL, inode);
> +     if (is_file_hugepages(shp->shm_file))
> +             ret = shm_hugetlb_checkpoint_contents(ctx, shp->shm_file);
> +     else
> +             ret = checkpoint_memory_contents(ctx, NULL, inode);
>   out:
>       ckpt_hdr_put(ctx, h);
>       return ret;
> @@ -149,6 +213,75 @@ struct dq_ipcshm_del {
>       int id;
>  };
>  
> +static void __load_ipc_shm_hdr(const struct ckpt_hdr_ipc_shm *h, struct 
> shmid_kernel *shp)
> +{
> +     shp->shm_atim = h->shm_atim;
> +     shp->shm_dtim = h->shm_dtim;
> +     shp->shm_ctim = h->shm_ctim;
> +     shp->shm_cprid = h->shm_cprid;
> +     shp->shm_lprid = h->shm_lprid;
> +}
> +
> +static int shm_hugetlb_restore_contents(struct ckpt_ctx *ctx, struct 
> ipc_namespace *ipcns, struct shmid_kernel *shp, const struct ckpt_hdr_ipc_shm 
> *hdr)
> +{
> +     unsigned long start;
> +     int ret;
> +
> +     ret = do_shmat_ns_pgoff(ipcns, shp->shm_perm.id, (char __user *)0,
> +                             0, &start, 0, 0);
> +     if (ret != 0)
> +             return ret;
> +
> +     ckpt_debug("temporarily using %#lx for huge shm restore\n", start);
> +
> +     while (1) {
> +             struct ckpt_hdr_hpage *hdr;
> +             unsigned long hpagesize;
> +             unsigned long index;
> +             unsigned long addr;
> +             struct page *page;
> +             bool last;
> +
> +             hdr = ckpt_read_obj_type(ctx, sizeof(*hdr), CKPT_HDR_HPAGE);
> +             if (IS_ERR(hdr)) {
> +                     ret = PTR_ERR(hdr);
> +                     break;
> +             }
> +
> +             last = ckpt_hdr_hpage_last(hdr);
> +             index = (unsigned long)hdr->index;
> +             hpagesize = 1UL << hdr->shift;
> +
> +             ckpt_hdr_put(ctx, hdr);
> +
> +             if (last)
> +                     break;
> +
> +             addr = start + (hpagesize * index);
> +
> +             down_read(&current->mm->mmap_sem);
> +             ret = get_user_pages(current, current->mm, addr, 1, 1, 1,
> +                                  &page, NULL);
> +             up_read(&current->mm->mmap_sem);
> +
> +             if (ret < 0)
> +                     break;
> +
> +             ret = hugetlb_restore_page(ctx, page);
> +
> +             page_cache_release(page);
> +
> +             if (ret < 0)
> +                     break;
> +     }
> +
> +     sys_shmdt((void __user *)start);
> +
> +     __load_ipc_shm_hdr(hdr, shp);
> +
> +     return ret;
> +}
> +
>  static int _ipc_shm_delete(struct ipc_namespace *ns, int id)
>  {
>       mm_segment_t old_fs;
> @@ -190,11 +323,7 @@ static int load_ipc_shm_hdr(struct ckpt_ctx *ctx,
>       if (h->shm_cprid < 0 || h->shm_lprid < 0)
>               return -EINVAL;
>  
> -     shp->shm_atim = h->shm_atim;
> -     shp->shm_dtim = h->shm_dtim;
> -     shp->shm_ctim = h->shm_ctim;
> -     shp->shm_cprid = h->shm_cprid;
> -     shp->shm_lprid = h->shm_lprid;
> +     __load_ipc_shm_hdr(h, shp);
>  
>       return 0;
>  }
> @@ -224,8 +353,6 @@ int restore_ipc_shm(struct ckpt_ctx *ctx, struct 
> ipc_namespace *ns)
>       ret = -ENOSYS;
>       if (h->mlock_uid != (unsigned int) -1)  /* FIXME: support SHM_LOCK */
>               goto out;
> -     if (h->flags & SHM_HUGETLB)     /* FIXME: support SHM_HUGETLB */
> -             goto out;
>  
>       shmflag = h->flags | h->perms.mode | IPC_CREAT | IPC_EXCL;
>       ckpt_debug("shm: do_shmget size %lld flag %#x id %d\n",
> @@ -294,7 +421,10 @@ int restore_ipc_shm(struct ckpt_ctx *ctx, struct 
> ipc_namespace *ns)
>       ret = ckpt_obj_insert(ctx, file, h->objref, CKPT_OBJ_FILE);
>       if (ret < 0)
>               goto fput;
> -     ret = restore_memory_contents(ctx, file->f_dentry->d_inode);
> +     if (is_file_hugepages(file))
> +             ret = shm_hugetlb_restore_contents(ctx, ns, shp, h);
> +     else
> +             ret = restore_memory_contents(ctx, file->f_dentry->d_inode);
>  fput:
>       fput(file);
>  
_______________________________________________
Containers mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to