Re: [Qemu-devel] [RESEND PATCH v3 kernel 4/7] virtio-balloon: speed up inflate/deflate process

2016-10-25 Thread Li, Liang Z
> > +static inline void init_pfn_range(struct virtio_balloon *vb) {
> > +   vb->min_pfn = ULONG_MAX;
> > +   vb->max_pfn = 0;
> > +}
> > +
> > +static inline void update_pfn_range(struct virtio_balloon *vb,
> > +struct page *page)
> > +{
> > +   unsigned long balloon_pfn = page_to_balloon_pfn(page);
> > +
> > +   if (balloon_pfn < vb->min_pfn)
> > +   vb->min_pfn = balloon_pfn;
> > +   if (balloon_pfn > vb->max_pfn)
> > +   vb->max_pfn = balloon_pfn;
> > +}
> > +
> 
> rename to hint these are all bitmap related.

Will change in v4.

> 
> 
> >  static void tell_host(struct virtio_balloon *vb, struct virtqueue
> > *vq)  {
> > -   struct scatterlist sg;
> > -   unsigned int len;
> > +   struct scatterlist sg, sg2[BALLOON_BMAP_COUNT + 1];
> > +   unsigned int len, i;
> > +
> > +   if (virtio_has_feature(vb->vdev,
> VIRTIO_BALLOON_F_PAGE_BITMAP)) {
> > +   struct balloon_bmap_hdr *hdr = vb->bmap_hdr;
> > +   unsigned long bmap_len;
> > +   int nr_pfn, nr_used_bmap, nr_buf;
> > +
> > +   nr_pfn = vb->end_pfn - vb->start_pfn + 1;
> > +   nr_pfn = roundup(nr_pfn, BITS_PER_LONG);
> > +   nr_used_bmap = nr_pfn / PFNS_PER_BMAP;
> > +   bmap_len = nr_pfn / BITS_PER_BYTE;
> > +   nr_buf = nr_used_bmap + 1;
> > +
> > +   /* cmd, reserved and req_id are init to 0, unused here */
> > +   hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT);
> > +   hdr->start_pfn = cpu_to_virtio64(vb->vdev, vb->start_pfn);
> > +   hdr->bmap_len = cpu_to_virtio64(vb->vdev, bmap_len);
> > +   sg_init_table(sg2, nr_buf);
> > +   sg_set_buf(&sg2[0], hdr, sizeof(struct balloon_bmap_hdr));
> > +   for (i = 0; i < nr_used_bmap; i++) {
> > +   unsigned int  buf_len = BALLOON_BMAP_SIZE;
> > +
> > +   if (i + 1 == nr_used_bmap)
> > +   buf_len = bmap_len - BALLOON_BMAP_SIZE
> * i;
> > +   sg_set_buf(&sg2[i + 1], vb->page_bitmap[i],
> buf_len);
> > +   }
> >
> > -   sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
> > +   while (vq->num_free < nr_buf)
> > +   msleep(2);
> 
> 
> What's going on here? Who is expected to update num_free?
> 

I just want to wait until the vq has enough space to write the bitmap; I 
thought the qemu
side would update vq->num_free, is that wrong?

> 
> 
> > +   if (virtqueue_add_outbuf(vq, sg2, nr_buf, vb, GFP_KERNEL)
> == 0)
> > +   virtqueue_kick(vq);
> >
> > -   /* We should always be able to add one buffer to an empty queue.
> */
> > -   virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
> > -   virtqueue_kick(vq);
> > +   } else {
> > +   sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb-
> >num_pfns);
> > +
> > +   /* We should always be able to add one buffer to an empty
> > +* queue. */
> 
> Pls use a multiple comment style consistent with kernel coding style.

Will change in next version.

> 
> > +   virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
> > +   virtqueue_kick(vq);
> > +   }
> >
> > /* When host has read buffer, this completes via balloon_ack */
> > wait_event(vb->acked, virtqueue_get_buf(vq, &len)); @@ -138,13
> > +199,93 @@ static void set_page_pfns(struct virtio_balloon *vb,
> >   page_to_balloon_pfn(page) + i);  }
> >
> > -static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
> > +static void extend_page_bitmap(struct virtio_balloon *vb) {
> > +   int i;
> > +   unsigned long bmap_len, bmap_count;
> > +
> > +   bmap_len = ALIGN(get_max_pfn(), BITS_PER_LONG) /
> BITS_PER_BYTE;
> > +   bmap_count = bmap_len / BALLOON_BMAP_SIZE;
> > +   if (bmap_len % BALLOON_BMAP_SIZE)
> > +   bmap_count++;
> > +   if (bmap_count > BALLOON_BMAP_COUNT)
> > +   bmap_count = BALLOON_BMAP_COUNT;
> > +
> 
> This is doing simple things in tricky ways.
> Please use macros such as ALIGN and max instead of if.
> 

Will change.

> 
> > +   for (i = 1; i < bmap_count; i++) {
> 
> why 1?

In probe stage, already allocated one bitmap.

> 
> > +   vb->page_bitmap[i] = kmalloc(BALLOON_BMAP_SIZE,
> GFP_ATOMIC);
> 
> why GFP_ATOMIC?

Yes, GFP_ATOMIC is not necessary.

> and what will free the previous buffer?

The previous buffer will not be freed.

> 
> 
> > +   if (vb->page_bitmap[i])
> > +   vb->nr_page_bmap++;
> > +   else
> > +   break;
> 
> and what will happen then?

I plan to use the previous allocated buffer to save the bitmap, need more code 
for kmalloc failure?

> > -static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
> > +static unsigned int leak_balloon(struct virtio_balloon *vb, size_t num,
> > +   bool use_bmap)
> 
> this is just a feature bit - why not get it internally?

Indeed.

> > @@ -218,8 +374,14 @@ static unsigned 

Re: [Qemu-devel] [RESEND PATCH v3 kernel 4/7] virtio-balloon: speed up inflate/deflate process

2016-10-25 Thread Michael S. Tsirkin
On Fri, Oct 21, 2016 at 02:24:37PM +0800, Liang Li wrote:
> The implementation of the current virtio-balloon is not very
> efficient; the time spent on the different stages of inflating
> the balloon to 7GB of an 8GB idle guest:
> 
> a. allocating pages (6.5%)
> b. sending PFNs to host (68.3%)
> c. address translation (6.1%)
> d. madvise (19%)
> 
> It takes about 4126ms for the inflating process to complete.
> Debugging shows that the bottlenecks are stages b and d.
> 
> If using a bitmap to send the page info instead of the PFNs, we
> can reduce the overhead in stage b quite a lot. Furthermore, we
> can do the address translation and call madvise() with a bulk of
> RAM pages, instead of the current page per page way, the overhead
> of stage c and stage d can also be reduced a lot.
> 
> This patch is the kernel side implementation which is intended to
> speed up the inflating & deflating process by adding a new feature
> to the virtio-balloon device. With this new feature, inflating the
> balloon to 7GB of a 8GB idle guest only takes 590ms, the
> performance improvement is about 85%.
> 
> TODO: optimize stage a by allocating/freeing a chunk of pages
> instead of a single page at a time.
> 
> Signed-off-by: Liang Li 
> Suggested-by: Michael S. Tsirkin 
> Cc: Michael S. Tsirkin 
> Cc: Paolo Bonzini 
> Cc: Cornelia Huck 
> Cc: Amit Shah 
> ---
>  drivers/virtio/virtio_balloon.c | 233 
> +++-
>  1 file changed, 209 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 59ffe5a..c31839c 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -42,6 +42,10 @@
>  #define OOM_VBALLOON_DEFAULT_PAGES 256
>  #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
>  
> +#define BALLOON_BMAP_SIZE(8 * PAGE_SIZE)
> +#define PFNS_PER_BMAP(BALLOON_BMAP_SIZE * BITS_PER_BYTE)
> +#define BALLOON_BMAP_COUNT   32
> +
>  static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
>  module_param(oom_pages, int, S_IRUSR | S_IWUSR);
>  MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
> @@ -67,6 +71,13 @@ struct virtio_balloon {
>  
>   /* Number of balloon pages we've told the Host we're not using. */
>   unsigned int num_pages;
> + /* Pointer of the bitmap header. */
> + void *bmap_hdr;
> + /* Bitmap and bitmap count used to tell the host the pages */
> + unsigned long *page_bitmap[BALLOON_BMAP_COUNT];
> + unsigned int nr_page_bmap;
> + /* Used to record the processed pfn range */
> + unsigned long min_pfn, max_pfn, start_pfn, end_pfn;
>   /*
>* The pages we've told the Host we're not using are enqueued
>* at vb_dev_info->pages list.
> @@ -110,16 +121,66 @@ static void balloon_ack(struct virtqueue *vq)
> wake_up(&vb->acked);
>  }
>  
> +static inline void init_pfn_range(struct virtio_balloon *vb)
> +{
> + vb->min_pfn = ULONG_MAX;
> + vb->max_pfn = 0;
> +}
> +
> +static inline void update_pfn_range(struct virtio_balloon *vb,
> +  struct page *page)
> +{
> + unsigned long balloon_pfn = page_to_balloon_pfn(page);
> +
> + if (balloon_pfn < vb->min_pfn)
> + vb->min_pfn = balloon_pfn;
> + if (balloon_pfn > vb->max_pfn)
> + vb->max_pfn = balloon_pfn;
> +}
> +

rename to hint these are all bitmap related.


>  static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
>  {
> - struct scatterlist sg;
> - unsigned int len;
> + struct scatterlist sg, sg2[BALLOON_BMAP_COUNT + 1];
> + unsigned int len, i;
> +
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP)) {
> + struct balloon_bmap_hdr *hdr = vb->bmap_hdr;
> + unsigned long bmap_len;
> + int nr_pfn, nr_used_bmap, nr_buf;
> +
> + nr_pfn = vb->end_pfn - vb->start_pfn + 1;
> + nr_pfn = roundup(nr_pfn, BITS_PER_LONG);
> + nr_used_bmap = nr_pfn / PFNS_PER_BMAP;
> + bmap_len = nr_pfn / BITS_PER_BYTE;
> + nr_buf = nr_used_bmap + 1;
> +
> + /* cmd, reserved and req_id are init to 0, unused here */
> + hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT);
> + hdr->start_pfn = cpu_to_virtio64(vb->vdev, vb->start_pfn);
> + hdr->bmap_len = cpu_to_virtio64(vb->vdev, bmap_len);
> + sg_init_table(sg2, nr_buf);
> + sg_set_buf(&sg2[0], hdr, sizeof(struct balloon_bmap_hdr));
> + for (i = 0; i < nr_used_bmap; i++) {
> + unsigned int  buf_len = BALLOON_BMAP_SIZE;
> +
> + if (i + 1 == nr_used_bmap)
> + buf_len = bmap_len - BALLOON_BMAP_SIZE * i;
> + sg_set_buf(&sg2[i + 1], vb->page_bitmap[i], buf_len);
> +

[Qemu-devel] [RESEND PATCH v3 kernel 4/7] virtio-balloon: speed up inflate/deflate process

2016-10-21 Thread Liang Li
The implementation of the current virtio-balloon is not very
efficient; the time spent on the different stages of inflating
the balloon to 7GB of an 8GB idle guest:

a. allocating pages (6.5%)
b. sending PFNs to host (68.3%)
c. address translation (6.1%)
d. madvise (19%)

It takes about 4126ms for the inflating process to complete.
Debugging shows that the bottlenecks are stages b and d.

If using a bitmap to send the page info instead of the PFNs, we
can reduce the overhead in stage b quite a lot. Furthermore, we
can do the address translation and call madvise() with a bulk of
RAM pages, instead of the current page per page way, the overhead
of stage c and stage d can also be reduced a lot.

This patch is the kernel side implementation which is intended to
speed up the inflating & deflating process by adding a new feature
to the virtio-balloon device. With this new feature, inflating the
balloon to 7GB of a 8GB idle guest only takes 590ms, the
performance improvement is about 85%.

TODO: optimize stage a by allocating/freeing a chunk of pages
instead of a single page at a time.

Signed-off-by: Liang Li 
Suggested-by: Michael S. Tsirkin 
Cc: Michael S. Tsirkin 
Cc: Paolo Bonzini 
Cc: Cornelia Huck 
Cc: Amit Shah 
---
 drivers/virtio/virtio_balloon.c | 233 +++-
 1 file changed, 209 insertions(+), 24 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 59ffe5a..c31839c 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -42,6 +42,10 @@
 #define OOM_VBALLOON_DEFAULT_PAGES 256
 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
 
+#define BALLOON_BMAP_SIZE  (8 * PAGE_SIZE)
+#define PFNS_PER_BMAP  (BALLOON_BMAP_SIZE * BITS_PER_BYTE)
+#define BALLOON_BMAP_COUNT 32
+
 static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
@@ -67,6 +71,13 @@ struct virtio_balloon {
 
/* Number of balloon pages we've told the Host we're not using. */
unsigned int num_pages;
+   /* Pointer of the bitmap header. */
+   void *bmap_hdr;
+   /* Bitmap and bitmap count used to tell the host the pages */
+   unsigned long *page_bitmap[BALLOON_BMAP_COUNT];
+   unsigned int nr_page_bmap;
+   /* Used to record the processed pfn range */
+   unsigned long min_pfn, max_pfn, start_pfn, end_pfn;
/*
 * The pages we've told the Host we're not using are enqueued
 * at vb_dev_info->pages list.
@@ -110,16 +121,66 @@ static void balloon_ack(struct virtqueue *vq)
wake_up(&vb->acked);
 }
 
+static inline void init_pfn_range(struct virtio_balloon *vb)
+{
+   vb->min_pfn = ULONG_MAX;
+   vb->max_pfn = 0;
+}
+
+static inline void update_pfn_range(struct virtio_balloon *vb,
+struct page *page)
+{
+   unsigned long balloon_pfn = page_to_balloon_pfn(page);
+
+   if (balloon_pfn < vb->min_pfn)
+   vb->min_pfn = balloon_pfn;
+   if (balloon_pfn > vb->max_pfn)
+   vb->max_pfn = balloon_pfn;
+}
+
 static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 {
-   struct scatterlist sg;
-   unsigned int len;
+   struct scatterlist sg, sg2[BALLOON_BMAP_COUNT + 1];
+   unsigned int len, i;
+
+   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP)) {
+   struct balloon_bmap_hdr *hdr = vb->bmap_hdr;
+   unsigned long bmap_len;
+   int nr_pfn, nr_used_bmap, nr_buf;
+
+   nr_pfn = vb->end_pfn - vb->start_pfn + 1;
+   nr_pfn = roundup(nr_pfn, BITS_PER_LONG);
+   nr_used_bmap = nr_pfn / PFNS_PER_BMAP;
+   bmap_len = nr_pfn / BITS_PER_BYTE;
+   nr_buf = nr_used_bmap + 1;
+
+   /* cmd, reserved and req_id are init to 0, unused here */
+   hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT);
+   hdr->start_pfn = cpu_to_virtio64(vb->vdev, vb->start_pfn);
+   hdr->bmap_len = cpu_to_virtio64(vb->vdev, bmap_len);
+   sg_init_table(sg2, nr_buf);
+   sg_set_buf(&sg2[0], hdr, sizeof(struct balloon_bmap_hdr));
+   for (i = 0; i < nr_used_bmap; i++) {
+   unsigned int  buf_len = BALLOON_BMAP_SIZE;
+
+   if (i + 1 == nr_used_bmap)
+   buf_len = bmap_len - BALLOON_BMAP_SIZE * i;
+   sg_set_buf(&sg2[i + 1], vb->page_bitmap[i], buf_len);
+   }
 
-   sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+   while (vq->num_free < nr_buf)
+   msleep(2);
+   if (virtqueue_add_outbuf(vq, sg2, nr_buf, vb, GFP_KERNEL) == 0)
+