Quoting Matt Helsley (matth...@us.ibm.com):
> Currently we allocate memory to output all of the epoll items in one
> big chunk. At 20 bytes per item, and since epoll was designed to
> support on the order of 10,000 items, we may find ourselves kmalloc'ing
> 200,000 bytes. That's an order-6 allocation, whereas the heuristic for
> difficult allocations, PAGE_ALLOC_COSTLY_ORDER, is 3.
> 
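For reference, the arithmetic: 200,000 bytes rounds up to a 256 KiB
slab, i.e. 64 pages, which is order 6 with 4 KiB pages. A
back-of-envelope check (plain userspace C; the 20-byte item size and
the 4 KiB page size are just the figures quoted above):

	#include <stdio.h>

	int main(void)
	{
		size_t bytes = 10000 * 20;	/* 200,000 bytes of items */
		size_t page_size = 4096;	/* assuming 4 KiB pages */
		int order = 0;

		/* smallest order with page_size << order >= bytes */
		while ((page_size << order) < bytes)
			order++;
		printf("order %d\n", order);	/* prints "order 6" */
		return 0;
	}
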
> Instead, output the epoll header and items separately, and chunk the
> items much like the pid array gets chunked. This ensures that
> checkpointing large epoll sets succeeds even when only order-0
> (single-page) allocations are available. A subsequent patch will do
> the same for the restore path.
> 
> Signed-off-by: Matt Helsley <matth...@us.ibm.com>

Feels a bit auto-tune-magic-happy :) but looks good

Acked-by: Serge Hallyn <se...@us.ibm.com>

> ---
>  fs/eventpoll.c |   71 ++++++++++++++++++++++++++++++++++++-------------------
>  1 files changed, 46 insertions(+), 25 deletions(-)
> 
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 4706ec5..2506b40 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -1480,7 +1480,7 @@ static int ep_items_checkpoint(void *data)
>       struct rb_node *rbp;
>       struct eventpoll *ep;
>       __s32 epfile_objref;
> -     int i, num_items, ret;
> +     int num_items = 0, nchunk, ret;
> 
>       ctx = dq_entry->ctx;
> 
> @@ -1489,9 +1489,8 @@ static int ep_items_checkpoint(void *data)
> 
>       ep = dq_entry->epfile->private_data;
>       mutex_lock(&ep->mtx);
> -     for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
> +     for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), num_items++) {}
>       mutex_unlock(&ep->mtx);
> -     num_items = i;
> 
>       h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_EPOLL_ITEMS);
>       if (!h)
> @@ -1503,36 +1502,58 @@ static int ep_items_checkpoint(void *data)
>       if (ret || !num_items)
>               return ret;
> 
> -     items = kzalloc(sizeof(*items)*num_items, GFP_KERNEL);
> +     ret = ckpt_write_obj_type(ctx, NULL, sizeof(*items)*num_items,
> +                               CKPT_HDR_BUFFER);
> +     if (ret < 0)
> +             return ret;
> +
> +     nchunk = num_items;
> +     do {
> +             items = kzalloc(sizeof(*items)*nchunk, GFP_KERNEL);
> +             if (items)
> +                     break;
> +             nchunk = nchunk >> 1;
> +     } while (nchunk > 0);
>       if (!items)
>               return -ENOMEM;
> +
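The halve-until-it-fits allocation above is a nice, reusable pattern.
Stripped of the kernel specifics it amounts to something like this
(a userspace sketch; buf_alloc_chunk() is a made-up name, not anything
in the tree):

	#include <stdlib.h>

	/*
	 * Try progressively smaller chunks until an allocation succeeds.
	 * On success, *nchunk holds the element count actually allocated;
	 * returns NULL only if even a single element won't fit.
	 */
	static void *buf_alloc_chunk(size_t item_size, int *nchunk)
	{
		void *buf = NULL;

		while (*nchunk > 0) {
			buf = calloc(*nchunk, item_size);
			if (buf)
				break;
			*nchunk >>= 1;
		}
		return buf;
	}
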
> +     /*
> +      * Walk the rbtree copying items into the chunk of memory and then
> +      * writing them to the checkpoint image
> +      */
>       ret = 0;
> -     i = 0;
>       mutex_lock(&ep->mtx);
> -     for (rbp = rb_first(&ep->rbr); i < num_items && rbp; rbp = rb_next(rbp),
> -          i++) {
> -             struct epitem *epi;
> -             int objref;
> -
> -             epi = rb_entry(rbp, struct epitem, rbn);
> -             items[i].fd = epi->ffd.fd;
> -             items[i].events = epi->event.events;
> -             items[i].data = epi->event.data;
> -             objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> -             if (objref <= 0) {
> -                     ret = -EBUSY; /* missing item -- checkpoint obj leak */
> -                     break;
> +     rbp = rb_first(&ep->rbr);
> +     while ((num_items > 0) && rbp) {
> +             int n = min(num_items, nchunk);
> +             int j;
> +
> +             for (j = 0; rbp && j < n; j++, rbp = rb_next(rbp)) {
> +                     struct epitem *epi;
> +                     int objref;
> +
> +                     epi = rb_entry(rbp, struct epitem, rbn);
> +                     items[j].fd = epi->ffd.fd;
> +                     items[j].events = epi->event.events;
> +                     items[j].data = epi->event.data;
> +                     objref = ckpt_obj_lookup(ctx, epi->ffd.file,
> +                                              CKPT_OBJ_FILE);
> +                     if (objref <= 0)
> +                             goto unlock;
> +                     items[j].file_objref = objref;
>               }
> -             items[i].file_objref = objref;
> +             ret = ckpt_kwrite(ctx, items, n*sizeof(*items));
> +             if (ret < 0)
> +                     break;
> +             num_items -= n;
>       }
> +unlock:
>       mutex_unlock(&ep->mtx);
> -     if (i == num_items && rbp)
> -             ret = -EBUSY; /* extra item(s) -- checkpoint obj leak */
> -     if (!ret)
> -             ret = ckpt_write_buffer(ctx, items, sizeof(*items)*num_items);
> -     else
> -             ckpt_write_err(ctx, "E", "checkpoint leak detected.\n", ret);
>       kfree(items);
> +     if (!ret && (num_items != 0 || rbp))
> +             ret = -EBUSY; /* missing or extra item(s) -- checkpoint obj leak */
> +     if (ret)
> +             ckpt_write_err(ctx, "E", " checkpointing epoll items.\n", ret);
>       return ret;
>  }
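Presumably the restore side in the follow-up patch will mirror this
write loop: read back the CKPT_HDR_BUFFER length, allocate a chunk
with the same halving fallback, and drain the items over several
reads. Roughly what I'd expect (just a sketch, assuming a ckpt_kread()
counterpart to ckpt_kwrite(); not the actual follow-up code):

	/* items/nchunk come from the same halving fallback as above */
	while (num_items > 0) {
		int n = min(num_items, nchunk);

		ret = ckpt_kread(ctx, items, n * sizeof(*items));
		if (ret < 0)
			break;
		/* ... re-add each of the n items to the epoll set ... */
		num_items -= n;
	}
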
> 
> -- 
> 1.5.6.3