Matt Helsley wrote:
> Save/restore epoll items during checkpoint/restart respectively.
> kmalloc failures should be dealt with more kindly than just error-out
> because epoll is made to poll many thousands of file descriptors.
> Subsequent patches will change epoll c/r to "chunk" its output/input
> respectively.

[...]

>  
> @@ -311,9 +313,11 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
>       }
>  
>       ret = deferqueue_run(ctx->files_deferq);
> -     ckpt_debug("files_deferq ran %d entries\n", ret);
> -     if (ret > 0)
> +     if (ret > 0) {
> +             ckpt_debug("file checkpoint deferred %d work items\n", ret);
>               ret = 0;
> +     }
> +

With your permission, I'll do this hunk as a separate patch (and the
restore counterpart too), so you can drop it from your patch.

>   out:
>       kfree(fdtable);
>       return ret;
> @@ -604,6 +608,13 @@ static struct restore_file_ops restore_file_ops[] = {
>               .file_type = CKPT_FILE_TTY,
>               .restore = tty_file_restore,
>       },
> +#ifdef CONFIG_EPOLL
> +     {
> +             .file_name = "EPOLL",
> +             .file_type = CKPT_FILE_EPOLL,
> +             .restore = ep_file_restore,
> +     },
> +#endif
>  };
>  
>  static struct file *do_restore_file(struct ckpt_ctx *ctx)
> @@ -731,9 +742,11 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
>       }
>  
>       ret = deferqueue_run(ctx->files_deferq);
> -     ckpt_debug("files_deferq ran %d entries\n", ret);
> -     if (ret > 0)
> +     if (ret > 0) {
> +             ckpt_debug("file restore deferred %d work items\n", ret);
>               ret = 0;
> +     }
> +

(This part too).

>   out:
>       ckpt_hdr_put(ctx, h);
>       if (!ret) {

[...]

>  
> +#ifdef CONFIG_CHECKPOINT
> +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
> +{
> +     struct rb_node *rbp;
> +     struct eventpoll *ep;
> +     int ret = 0;
> +
> +     ep = file->private_data;
> +     mutex_lock(&ep->mtx);
> +     for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> +             struct epitem *epi;
> +
> +             epi = rb_entry(rbp, struct epitem, rbn);
> +             ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);

s/ckpt_obj_collect/ckpt_collect_file/
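
For example (assuming ckpt_collect_file() takes the usual (ctx, file)
pair):

		epi = rb_entry(rbp, struct epitem, rbn);
		ret = ckpt_collect_file(ctx, epi->ffd.file);
		if (ret < 0)
			break;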

> +             if (ret < 0)
> +                     break;
> +     }
> +     mutex_unlock(&ep->mtx);
> +     return ret;
> +}
> +
> +struct epoll_deferq_entry {
> +     struct ckpt_ctx *ctx;
> +     struct file *epfile;
> +};
> +
> +static int ep_items_checkpoint(void *data)
> +{
> +     struct epoll_deferq_entry *ep_dq_entry = data;
> +     struct ckpt_ctx *ctx;
> +     struct file *file;
> +     struct ckpt_hdr_eventpoll_items *h;
> +     struct rb_node *rbp;
> +     struct eventpoll *ep;
> +     __s32 epfile_objref;
> +     int i, ret;
> +
> +     file = ep_dq_entry->epfile;
> +     ctx = ep_dq_entry->ctx;
> +
> +     epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
> +     BUG_ON(epfile_objref <= 0);
> +
> +

Nit: extraneous newline :p

> +     ep = file->private_data;
> +     mutex_lock(&ep->mtx);
> +     for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
> +     mutex_unlock(&ep->mtx);
> +
> +     h = ckpt_hdr_get_type(ctx, sizeof(*h) + i*sizeof(h->items[0]),
> +                           CKPT_HDR_EPOLL_ITEMS);
> +     if (!h)
> +             return -ENOMEM;
> +
> +     h->num_items = i;
> +     h->epfile_objref = epfile_objref;
> +
> +     ret = 0;
> +     mutex_lock(&ep->mtx);
> +     for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
> +             struct epitem *epi;
> +             int objref;
> +
> +             epi = rb_entry(rbp, struct epitem, rbn);
> +             objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> +             if (objref <= 0) {

(Should never return 0)

> +                     ret = -EBUSY; /* checkpoint obj leak */
> +                     break;
> +             }
> +             h->items[i].fd = epi->ffd.fd;
> +             h->items[i].file_objref = objref;
> +             h->items[i].events = epi->event.events;
> +             h->items[i].data = epi->event.data;
> +     }
> +     mutex_unlock(&ep->mtx);
> +     if (!ret && (i != h->num_items))
> +             /*
> +              * We raced with another thread between our first and second

Not a thread (it cannot be a thread, because we checkpoint whole
thread groups): any process that shares access to this file pointer.

> +              * walks of the rbtree such that there weren't the same number
> +              * of items. This means there is a checkpoint "leak".
> +              */
> +             ret = -EBUSY;
> +     if (ret == -EBUSY)
> +             ckpt_write_err(ctx, "ep_items_checkpoint(): checkpoint leak detected.\n", "");

It would be useful (for the user) to know which fd/item caused the
leak, or whether the leak was because i != h->num_items. (You could
use two ckpt_write_err() calls, one inside the mutex and one here).

Also, the prototype of ckpt_write_err() has changed, see the code.
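
Something along these lines (illustrative only -- adjust the arguments
to match the current ckpt_write_err() prototype):

	/* inside the mutex, when the per-item lookup fails: */
	if (objref <= 0) {
		ckpt_write_err(ctx, "epoll: leaked file pointer, fd %d\n",
			       epi->ffd.fd);
		ret = -EBUSY;
		break;
	}

	/* after the mutex, when the two rbtree walks disagree: */
	if (!ret && i != h->num_items) {
		ckpt_write_err(ctx, "epoll: items added/removed under us\n");
		ret = -EBUSY;
	}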

> +     else if (!ret)
> +             ret = ckpt_write_obj(ctx, &h->h);

What other value could @ret have?
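
That is, @ret can only be 0 or -EBUSY at this point, so a plain else
would do:

	if (ret == -EBUSY)
		ckpt_write_err(ctx, ...);
	else
		ret = ckpt_write_obj(ctx, &h->h);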

> +     ckpt_hdr_put(ctx, &h->h);
> +     return ret;
> +}
> +
> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> +     struct ckpt_hdr_file *h;
> +     struct epoll_deferq_entry ep_dq_entry;
> +     int ret = -ENOMEM;
> +
> +     h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> +     if (!h)
> +             return -ENOMEM;
> +     h->f_type = CKPT_FILE_EPOLL;
> +     ret = checkpoint_file_common(ctx, file, h);
> +     if (ret < 0)
> +             goto out;
> +     ret = ckpt_write_obj(ctx, &h->h);
> +     if (ret < 0)
> +             goto out;
> +
> +     /*
> +      * Defer saving the epoll items until all of the ffd.file pointers
> +      * have an objref; after the file table has been checkpointed.
> +      */
> +     ep_dq_entry.ctx = ctx;
> +     ep_dq_entry.epfile = file;
> +     ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
> +                          sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
> +out:
> +     ckpt_hdr_put(ctx, h);
> +     return ret;
> +}
> +
> +static int ep_items_restore(void *data)
> +{
> +     struct ckpt_ctx *ctx = deferqueue_data_ptr(data);
> +     struct ckpt_hdr_eventpoll_items *h;
> +     struct eventpoll *ep;
> +     struct file *epfile = NULL;
> +     int ret, i = 0, remaining_watches;
> +
> +     h = ckpt_read_obj(ctx, 0,
> +                       sizeof(*h) + max_user_watches*sizeof(h->items[0]));

s/ckpt_read_obj/ckpt_read_buf_type/
Saves you the @type and @len test below.
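
That is (assuming ckpt_read_buf_type() takes (ctx, max_len, type) like
the other typed readers):

	h = ckpt_read_buf_type(ctx, sizeof(*h) +
			       max_user_watches*sizeof(h->items[0]),
			       CKPT_HDR_EPOLL_ITEMS);
	if (IS_ERR(h))
		return PTR_ERR(h);

and the explicit @type and @len tests below go away (the num_items
consistency check stays, of course).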

> +     if (IS_ERR(h))
> +             return PTR_ERR(h);
> +
> +     ret = -EINVAL;
> +     if ((h->h.type != CKPT_HDR_EPOLL_ITEMS) ||
> +         (h->h.len < sizeof(*h)))
> +             goto out;
> +
> +     /* Make sure the items match the size we expect */
> +     if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
> +             goto out;
> +
> +     epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
> +     BUG_ON(IS_ERR(epfile));
> +     BUG_ON(!is_file_epoll(epfile));
> +
> +     /* Make sure there are enough watches left. */
> +     ret = -ENOSPC;
> +     ep = epfile->private_data;
> +     remaining_watches = (max_user_watches -
> +                          atomic_read(&ep->user->epoll_watches));
> +     if (h->num_items > remaining_watches)
> +             goto out;

I'm unsure if this is necessary: running out of watches will be
detected anyway later on. I wouldn't worry about early detection.

> +
> +     ret = 0;
> +     /* Restore the epoll items/watches */
> +     for (i = 0; !ret && i < h->num_items; i++) {
> +             struct epoll_event epev;
> +             struct file *tfile;
> +
> +             /* Get the file* for the target file */
> +             if (h->items[i].file_objref <= 0) {
> +                     ret = -EINVAL;
> +                     break;
> +             }

I should probably change the code elsewhere too, but this test
is unnecessary because ckpt_obj_fetch() will complain anyway.
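
So the loop body could start directly with:

	tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
			       CKPT_OBJ_FILE);
	if (IS_ERR(tfile)) {
		ret = PTR_ERR(tfile);
		break;
	}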

> +             tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
> +                                    CKPT_OBJ_FILE);
> +             if (IS_ERR(tfile)) {
> +                     ret = PTR_ERR(tfile);
> +                     break;
> +             }
> +
> +             epev.events = h->items[i].events;
> +             epev.data = h->items[i].data;
> +
> +             ret = do_epoll_ctl(EPOLL_CTL_ADD, h->items[i].fd,
> +                                epfile, tfile, &epev);
> +     }
> +out:
> +     ckpt_hdr_put(ctx, h);
> +     return ret;
> +}
> +
> +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> +                          struct ckpt_hdr_file *h)
> +{
> +     struct file *epfile;
> +     int epfd, ret;
> +
> +     if (h->h.type != CKPT_HDR_FILE ||
> +         h->h.len  != sizeof(*h) ||
> +         h->f_type != CKPT_FILE_EPOLL)
> +             return ERR_PTR(-EINVAL);
> +
> +     epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
> +     if (epfd < 0)
> +             return ERR_PTR(epfd);
> +     epfile = fget(epfd);
> +     BUG_ON(!epfile);
> +
> +     /*
> +      * Needed before we can properly restore the watches and enforce the
> +      * limit on watch numbers.
> +      */
> +     ret = restore_file_common(ctx, epfile, h);
> +     if (ret < 0)
> +             goto fput_out;
> +
> +     /*
> +      * Defer restoring the epoll items until the file table is
> +      * fully restored. Ensures that valid file objrefs will resolve.
> +      */
> +     ret = deferqueue_add_ptr(ctx->files_deferq, ctx, ep_items_restore, NULL);
> +     if (ret < 0) {
> +fput_out:
> +             fput(epfile);
> +             epfile = ERR_PTR(ret);
> +     }
> +     sys_close(epfd); /* harmless even if an error occurred */

Nit: you can move this up to right after fget().
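
That is:

	epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
	if (epfd < 0)
		return ERR_PTR(epfd);
	epfile = fget(epfd);
	BUG_ON(!epfile);
	sys_close(epfd);	/* the fget() above keeps the file alive */

and the error path no longer needs to reach a sys_close() at the
bottom.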

[...]

Oren.

