Re: [Qemu-devel] [QEMU 1/7] balloon: speed up inflating & deflating process

2016-06-19 Thread Li, Liang Z
> >
> >  virtqueue_push(vq, elem, offset); @@ -374,6 +489,7 @@ static
> > uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
> >  VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
> >  f |= dev->host_features;
> >  virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
> > +virtio_add_feature(&f, VIRTIO_BALLOON_F_PAGE_BITMAP);
> >  return f;
> >  }
> >
> 
> Pls add features to virtio_balloon_properties.
> You also need to handle compatibility by disabling for old machine types.
> 

I forgot that, will add in next version.

> > --- a/include/standard-headers/linux/virtio_balloon.h
> > +++ b/include/standard-headers/linux/virtio_balloon.h
> > @@ -34,6 +34,7 @@
> >  #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before
> reclaiming pages */
> >  #define VIRTIO_BALLOON_F_STATS_VQ  1 /* Memory Stats virtqueue
> */
> >  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon
> on OOM */
> > +#define VIRTIO_BALLOON_F_PAGE_BITMAP  3 /* Use page bitmap to
> send
> > +page info */
> >
> >  /* Size of a PFN in the balloon interface. */  #define
> > VIRTIO_BALLOON_PFN_SHIFT 12
> 
> We want to keep this in sync with Linux.
> Let's get a minimal patch to extend this header merged in linux, then update
> this one.

OK. Can this be independent of the virtio-balloon SPEC? As I understand it,
 it will not get merged before the SPEC is set?

Thanks!
Liang



Re: [Qemu-devel] [QEMU 1/7] balloon: speed up inflating & deflating process

2016-06-18 Thread Michael S. Tsirkin
On Mon, Jun 13, 2016 at 06:16:43PM +0800, Liang Li wrote:
> The implementation of the current virtio-balloon is not very efficient,
> Bellow is test result of time spends on inflating the balloon to 3GB of
> a 4GB idle guest:
> 
> a. allocating pages (6.5%, 103ms)
> b. sending PFNs to host (68.3%, 787ms)
> c. address translation (6.1%, 96ms)
> d. madvise (19%, 300ms)
> 
> It takes about 1577ms for the whole inflating process to complete. The
> test shows that the bottle neck is the stage b and stage d.
> 
> If using a bitmap to send the page info instead of the PFNs, we can
> reduce the overhead spends on stage b quite a lot. Furthermore, it's
> possible to do the address translation and do the madvise with a bulk
> of pages, instead of the current page per page way, so the overhead of
> stage c and stage d can also be reduced a lot.
> 
> This patch is the QEMU side implementation which is intended to speed
> up the inflating & deflating process by adding a new feature to the
> virtio-balloon device. And now, inflating the balloon to 3GB of a 4GB
> idle guest only takes 210ms, it's about 8 times as fast as before.
> 
> TODO: optimize stage a by allocating/freeing a chunk of pages instead
> of a single page at a time.
> 
> Signed-off-by: Liang Li 
> ---
>  hw/virtio/virtio-balloon.c  | 159 
> 
>  include/standard-headers/linux/virtio_balloon.h |   1 +
>  2 files changed, 139 insertions(+), 21 deletions(-)
> 
> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
> index 8c15e09..8cf74c2 100644
> --- a/hw/virtio/virtio-balloon.c
> +++ b/hw/virtio/virtio-balloon.c
> @@ -47,6 +47,76 @@ static void balloon_page(void *addr, int deflate)
>  #endif
>  }
>  
> +static void do_balloon_bulk_pages(ram_addr_t base_pfn, int page_shift,
> +  unsigned long len, bool deflate)
> +{
> +ram_addr_t size, processed, chunk, base;
> +void *addr;
> +MemoryRegionSection section = {.mr = NULL};
> +
> +size = (len << page_shift);
> +base = (base_pfn << page_shift);
> +
> +for (processed = 0; processed < size; processed += chunk) {
> +chunk = size - processed;
> +while (chunk >= TARGET_PAGE_SIZE) {
> +section = memory_region_find(get_system_memory(),
> + base + processed, chunk);
> +if (!section.mr) {
> +chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE);
> +} else {
> +break;
> +}
> +}
> +
> +if (section.mr &&
> +(int128_nz(section.size) && memory_region_is_ram(section.mr))) {
> +addr = section.offset_within_region +
> +   memory_region_get_ram_ptr(section.mr);
> +qemu_madvise(addr, chunk,
> + deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
> +} else {
> +fprintf(stderr, "can't find the chunk, skip\n");
> +chunk = TARGET_PAGE_SIZE;
> +}
> +}
> +}
> +
> +static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long *bitmap,
> +   unsigned long len, int page_shift, bool 
> deflate)
> +{
> +#if defined(__linux__)
> +unsigned long end  = len * 8;
> +unsigned long current = 0;
> +
> +if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
> + kvm_has_sync_mmu())) {
> +while (current < end) {
> +unsigned long one = find_next_bit(bitmap, end, current);
> +
> +if (one < end) {
> +unsigned long zero = find_next_zero_bit(bitmap, end, one + 
> 1);
> +unsigned long page_length;
> +
> +if (zero >= end) {
> +page_length = end - one;
> +} else {
> +page_length = zero - one;
> +}
> +
> +if (page_length) {
> +do_balloon_bulk_pages(base_pfn + one, page_shift,
> +  page_length, deflate);
> +}
> +current = one + page_length;
> +} else {
> +current = one;
> +}
> +}
> +}
> +#endif
> +}
> +
>  static const char *balloon_stat_names[] = {
> [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
> [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
> @@ -78,6 +148,12 @@ static bool balloon_stats_supported(const VirtIOBalloon 
> *s)
>  return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
>  }
>  
> +static bool balloon_page_bitmap_supported(const VirtIOBalloon *s)
> +{
> +VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
> +}
> +
>  static bool balloon_stats_enabled(const VirtIOBalloon *s)
>  {
>  return s->stats_poll_interval > 0;
> @@ -224,27 +300,66 @@ static void 

Re: [Qemu-devel] [QEMU 1/7] balloon: speed up inflating & deflating process

2016-06-16 Thread Li, Liang Z
>  +chunk = TARGET_PAGE_SIZE;
>  +}
>  +}
>  +}
>  +
>  +static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long
> >>> *bitmap,
>  +   unsigned long len, int page_shift,
>  +bool deflate) { #if defined(__linux__)
> >>>
> >>> Why do you need this #if here?
> >>>
> >> Ooh,  it is wrong to add the '#if' here, will remove.
> >>
> > No, it is needed, just follow the code in balloon_page().
> > only Linux support the madvise().
> 
> I think it is not needed anymore today and the #if in balloon_page could be
> removed, too: As far as I can see, the #if there is from the early days, when
> there was no wrapper around madvise() yet. But nowadays, we've got the
> qemu_madvise() wrapper which takes care of either using madvise(),
> posix_madvise() or doing nothing, so the virtio-balloon code should be able
> to work without the #if now.
> 
>  Thomas

You are right! I will remove both of them.

Thanks!
Liang





Re: [Qemu-devel] [QEMU 1/7] balloon: speed up inflating & deflating process

2016-06-14 Thread Thomas Huth
On 14.06.2016 16:41, Li, Liang Z wrote:
>>> On 13.06.2016 12:16, Liang Li wrote:
 The implementation of the current virtio-balloon is not very
 efficient, Bellow is test result of time spends on inflating the
 balloon to 3GB of a 4GB idle guest:

 a. allocating pages (6.5%, 103ms)
 b. sending PFNs to host (68.3%, 787ms) c. address translation (6.1%,
 96ms) d. madvise (19%, 300ms)

 It takes about 1577ms for the whole inflating process to complete.
 The test shows that the bottle neck is the stage b and stage d.

 If using a bitmap to send the page info instead of the PFNs, we can
 reduce the overhead spends on stage b quite a lot. Furthermore, it's
 possible to do the address translation and do the madvise with a
 bulk of pages, instead of the current page per page way, so the
 overhead of stage c and stage d can also be reduced a lot.

 This patch is the QEMU side implementation which is intended to
 speed up the inflating & deflating process by adding a new feature
 to the virtio-balloon device. And now, inflating the balloon to 3GB
 of a 4GB idle guest only takes 210ms, it's about 8 times as fast as before.
[...]
 +chunk = TARGET_PAGE_SIZE;
 +}
 +}
 +}
 +
 +static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long
>>> *bitmap,
 +   unsigned long len, int page_shift,
 +bool deflate) { #if defined(__linux__)
>>>
>>> Why do you need this #if here?
>>>
>> Ooh,  it is wrong to add the '#if' here, will remove.
>>
> No, it is needed, just follow the code in balloon_page().
> only Linux support the madvise().

I think it is not needed anymore today and the #if in balloon_page could
be removed, too: As far as I can see, the #if there is from the early
days, when there was no wrapper around madvise() yet. But nowadays,
we've got the qemu_madvise() wrapper which takes care of either using
madvise(), posix_madvise() or doing nothing, so the virtio-balloon code
should be able to work without the #if now.

 Thomas




Re: [Qemu-devel] [QEMU 1/7] balloon: speed up inflating & deflating process

2016-06-14 Thread Li, Liang Z
> > On 13.06.2016 12:16, Liang Li wrote:
> > > The implementation of the current virtio-balloon is not very
> > > efficient, Bellow is test result of time spends on inflating the
> > > balloon to 3GB of a 4GB idle guest:
> > >
> > > a. allocating pages (6.5%, 103ms)
> > > b. sending PFNs to host (68.3%, 787ms) c. address translation (6.1%,
> > > 96ms) d. madvise (19%, 300ms)
> > >
> > > It takes about 1577ms for the whole inflating process to complete.
> > > The test shows that the bottle neck is the stage b and stage d.
> > >
> > > If using a bitmap to send the page info instead of the PFNs, we can
> > > reduce the overhead spends on stage b quite a lot. Furthermore, it's
> > > possible to do the address translation and do the madvise with a
> > > bulk of pages, instead of the current page per page way, so the
> > > overhead of stage c and stage d can also be reduced a lot.
> > >
> > > This patch is the QEMU side implementation which is intended to
> > > speed up the inflating & deflating process by adding a new feature
> > > to the virtio-balloon device. And now, inflating the balloon to 3GB
> > > of a 4GB idle guest only takes 210ms, it's about 8 times as fast as 
> > > before.
> > >
> > > TODO: optimize stage a by allocating/freeing a chunk of pages
> > > instead of a single page at a time.
> > >
> > > Signed-off-by: Liang Li 
> > > ---
> > >  hw/virtio/virtio-balloon.c  | 159 
> > > 
> > >  include/standard-headers/linux/virtio_balloon.h |   1 +
> > >  2 files changed, 139 insertions(+), 21 deletions(-)
> > >
> > > diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
> > > index 8c15e09..8cf74c2 100644
> > > --- a/hw/virtio/virtio-balloon.c
> > > +++ b/hw/virtio/virtio-balloon.c
> > > @@ -47,6 +47,76 @@ static void balloon_page(void *addr, int deflate)
> > > #endif  }
> > >
> > > +static void do_balloon_bulk_pages(ram_addr_t base_pfn, int
> page_shift,
> > > +  unsigned long len, bool deflate) {
> > > +ram_addr_t size, processed, chunk, base;
> > > +void *addr;
> > > +MemoryRegionSection section = {.mr = NULL};
> > > +
> > > +size = (len << page_shift);
> > > +base = (base_pfn << page_shift);
> > > +
> > > +for (processed = 0; processed < size; processed += chunk) {
> > > +chunk = size - processed;
> > > +while (chunk >= TARGET_PAGE_SIZE) {
> > > +section = memory_region_find(get_system_memory(),
> > > + base + processed, chunk);
> > > +if (!section.mr) {
> > > +chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE);
> > > +} else {
> > > +break;
> > > +}
> > > +}
> > > +
> > > +if (section.mr &&
> > > +(int128_nz(section.size) && 
> > > memory_region_is_ram(section.mr)))
> {
> > > +addr = section.offset_within_region +
> > > +   memory_region_get_ram_ptr(section.mr);
> > > +qemu_madvise(addr, chunk,
> > > + deflate ? QEMU_MADV_WILLNEED :
> > QEMU_MADV_DONTNEED);
> > > +} else {
> > > +fprintf(stderr, "can't find the chunk, skip\n");
> >
> > Please try to avoid new fprintf(stderr, ...) in the QEMU sources.
> > Use error_report(...) or in this case maybe rather
> > qemu_log_mask(LOG_GUEST_ERROR, ...) instead, and try to use a more
> > reasonable error message (e.g. that it is clear that the error
> > happened in the balloon code).
> >
> 
> Indeed, the error message is no good, will change in next version.
> 
> > > +chunk = TARGET_PAGE_SIZE;
> > > +}
> > > +}
> > > +}
> > > +
> > > +static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long
> > *bitmap,
> > > +   unsigned long len, int page_shift,
> > > +bool deflate) { #if defined(__linux__)
> >
> > Why do you need this #if here?
> >
> 
> Ooh,  it is wrong to add the '#if' here, will remove.
No, it is needed; it just follows the code in balloon_page().
Only Linux supports madvise().

Liang



Re: [Qemu-devel] [QEMU 1/7] balloon: speed up inflating & deflating process

2016-06-14 Thread Li, Liang Z
> Subject: Re: [QEMU 1/7] balloon: speed up inflating & deflating process
> 
> On 13.06.2016 12:16, Liang Li wrote:
> > The implementation of the current virtio-balloon is not very
> > efficient, Bellow is test result of time spends on inflating the
> > balloon to 3GB of a 4GB idle guest:
> >
> > a. allocating pages (6.5%, 103ms)
> > b. sending PFNs to host (68.3%, 787ms) c. address translation (6.1%,
> > 96ms) d. madvise (19%, 300ms)
> >
> > It takes about 1577ms for the whole inflating process to complete. The
> > test shows that the bottle neck is the stage b and stage d.
> >
> > If using a bitmap to send the page info instead of the PFNs, we can
> > reduce the overhead spends on stage b quite a lot. Furthermore, it's
> > possible to do the address translation and do the madvise with a bulk
> > of pages, instead of the current page per page way, so the overhead of
> > stage c and stage d can also be reduced a lot.
> >
> > This patch is the QEMU side implementation which is intended to speed
> > up the inflating & deflating process by adding a new feature to the
> > virtio-balloon device. And now, inflating the balloon to 3GB of a 4GB
> > idle guest only takes 210ms, it's about 8 times as fast as before.
> >
> > TODO: optimize stage a by allocating/freeing a chunk of pages instead
> > of a single page at a time.
> >
> > Signed-off-by: Liang Li 
> > ---
> >  hw/virtio/virtio-balloon.c  | 159 
> > 
> >  include/standard-headers/linux/virtio_balloon.h |   1 +
> >  2 files changed, 139 insertions(+), 21 deletions(-)
> >
> > diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
> > index 8c15e09..8cf74c2 100644
> > --- a/hw/virtio/virtio-balloon.c
> > +++ b/hw/virtio/virtio-balloon.c
> > @@ -47,6 +47,76 @@ static void balloon_page(void *addr, int deflate)
> > #endif  }
> >
> > +static void do_balloon_bulk_pages(ram_addr_t base_pfn, int page_shift,
> > +  unsigned long len, bool deflate) {
> > +ram_addr_t size, processed, chunk, base;
> > +void *addr;
> > +MemoryRegionSection section = {.mr = NULL};
> > +
> > +size = (len << page_shift);
> > +base = (base_pfn << page_shift);
> > +
> > +for (processed = 0; processed < size; processed += chunk) {
> > +chunk = size - processed;
> > +while (chunk >= TARGET_PAGE_SIZE) {
> > +section = memory_region_find(get_system_memory(),
> > + base + processed, chunk);
> > +if (!section.mr) {
> > +chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE);
> > +} else {
> > +break;
> > +}
> > +}
> > +
> > +if (section.mr &&
> > +(int128_nz(section.size) && memory_region_is_ram(section.mr))) 
> > {
> > +addr = section.offset_within_region +
> > +   memory_region_get_ram_ptr(section.mr);
> > +qemu_madvise(addr, chunk,
> > + deflate ? QEMU_MADV_WILLNEED :
> QEMU_MADV_DONTNEED);
> > +} else {
> > +fprintf(stderr, "can't find the chunk, skip\n");
> 
> Please try to avoid new fprintf(stderr, ...) in the QEMU sources.
> Use error_report(...) or in this case maybe rather
> qemu_log_mask(LOG_GUEST_ERROR, ...) instead, and try to use a more
> reasonable error message (e.g. that it is clear that the error happened in the
> balloon code).
> 

Indeed, the error message is no good, will change in next version.

> > +chunk = TARGET_PAGE_SIZE;
> > +}
> > +}
> > +}
> > +
> > +static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long
> *bitmap,
> > +   unsigned long len, int page_shift,
> > +bool deflate) { #if defined(__linux__)
> 
> Why do you need this #if here?
> 

Ooh,  it is wrong to add the '#if' here, will remove.

Thanks a lot!

Liang


Re: [Qemu-devel] [QEMU 1/7] balloon: speed up inflating & deflating process

2016-06-14 Thread Thomas Huth
On 13.06.2016 12:16, Liang Li wrote:
> The implementation of the current virtio-balloon is not very efficient,
> Bellow is test result of time spends on inflating the balloon to 3GB of
> a 4GB idle guest:
> 
> a. allocating pages (6.5%, 103ms)
> b. sending PFNs to host (68.3%, 787ms)
> c. address translation (6.1%, 96ms)
> d. madvise (19%, 300ms)
> 
> It takes about 1577ms for the whole inflating process to complete. The
> test shows that the bottle neck is the stage b and stage d.
> 
> If using a bitmap to send the page info instead of the PFNs, we can
> reduce the overhead spends on stage b quite a lot. Furthermore, it's
> possible to do the address translation and do the madvise with a bulk
> of pages, instead of the current page per page way, so the overhead of
> stage c and stage d can also be reduced a lot.
> 
> This patch is the QEMU side implementation which is intended to speed
> up the inflating & deflating process by adding a new feature to the
> virtio-balloon device. And now, inflating the balloon to 3GB of a 4GB
> idle guest only takes 210ms, it's about 8 times as fast as before.
> 
> TODO: optimize stage a by allocating/freeing a chunk of pages instead
> of a single page at a time.
> 
> Signed-off-by: Liang Li 
> ---
>  hw/virtio/virtio-balloon.c  | 159 
> 
>  include/standard-headers/linux/virtio_balloon.h |   1 +
>  2 files changed, 139 insertions(+), 21 deletions(-)
> 
> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
> index 8c15e09..8cf74c2 100644
> --- a/hw/virtio/virtio-balloon.c
> +++ b/hw/virtio/virtio-balloon.c
> @@ -47,6 +47,76 @@ static void balloon_page(void *addr, int deflate)
>  #endif
>  }
>  
> +static void do_balloon_bulk_pages(ram_addr_t base_pfn, int page_shift,
> +  unsigned long len, bool deflate)
> +{
> +ram_addr_t size, processed, chunk, base;
> +void *addr;
> +MemoryRegionSection section = {.mr = NULL};
> +
> +size = (len << page_shift);
> +base = (base_pfn << page_shift);
> +
> +for (processed = 0; processed < size; processed += chunk) {
> +chunk = size - processed;
> +while (chunk >= TARGET_PAGE_SIZE) {
> +section = memory_region_find(get_system_memory(),
> + base + processed, chunk);
> +if (!section.mr) {
> +chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE);
> +} else {
> +break;
> +}
> +}
> +
> +if (section.mr &&
> +(int128_nz(section.size) && memory_region_is_ram(section.mr))) {
> +addr = section.offset_within_region +
> +   memory_region_get_ram_ptr(section.mr);
> +qemu_madvise(addr, chunk,
> + deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
> +} else {
> +fprintf(stderr, "can't find the chunk, skip\n");

Please try to avoid new fprintf(stderr, ...) in the QEMU sources.
Use error_report(...) or in this case maybe rather
qemu_log_mask(LOG_GUEST_ERROR, ...) instead, and try to use a more
reasonable error message (e.g. that it is clear that the error happened
in the balloon code).

> +chunk = TARGET_PAGE_SIZE;
> +}
> +}
> +}
> +
> +static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long *bitmap,
> +   unsigned long len, int page_shift, bool 
> deflate)
> +{
> +#if defined(__linux__)

Why do you need this #if here?

> +unsigned long end  = len * 8;
> +unsigned long current = 0;
> +
> +if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
> + kvm_has_sync_mmu())) {
> +while (current < end) {
> +unsigned long one = find_next_bit(bitmap, end, current);
> +
> +if (one < end) {
> +unsigned long zero = find_next_zero_bit(bitmap, end, one + 
> 1);
> +unsigned long page_length;
> +
> +if (zero >= end) {
> +page_length = end - one;
> +} else {
> +page_length = zero - one;
> +}
> +
> +if (page_length) {
> +do_balloon_bulk_pages(base_pfn + one, page_shift,
> +  page_length, deflate);
> +}
> +current = one + page_length;
> +} else {
> +current = one;
> +}
> +}
> +}
> +#endif
> +}

 Thomas




[Qemu-devel] [QEMU 1/7] balloon: speed up inflating & deflating process

2016-06-13 Thread Liang Li
The implementation of the current virtio-balloon is not very efficient.
Below is the test result of the time spent on inflating the balloon to
3GB of a 4GB idle guest:

a. allocating pages (6.5%, 103ms)
b. sending PFNs to host (68.3%, 787ms)
c. address translation (6.1%, 96ms)
d. madvise (19%, 300ms)

It takes about 1577ms for the whole inflating process to complete. The
test shows that the bottlenecks are stage b and stage d.

If we use a bitmap to send the page info instead of the PFNs, we can
reduce the overhead spent on stage b quite a lot. Furthermore, it's
possible to do the address translation and the madvise on a bulk of
pages, instead of the current page-by-page way, so the overhead of
stage c and stage d can also be reduced a lot.

This patch is the QEMU side implementation, which is intended to speed
up the inflating & deflating process by adding a new feature to the
virtio-balloon device. With it, inflating the balloon to 3GB of a 4GB
idle guest takes only 210ms, which is about 8 times as fast as before.

TODO: optimize stage a by allocating/freeing a chunk of pages instead
of a single page at a time.

Signed-off-by: Liang Li 
---
 hw/virtio/virtio-balloon.c  | 159 
 include/standard-headers/linux/virtio_balloon.h |   1 +
 2 files changed, 139 insertions(+), 21 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 8c15e09..8cf74c2 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -47,6 +47,76 @@ static void balloon_page(void *addr, int deflate)
 #endif
 }
 
+static void do_balloon_bulk_pages(ram_addr_t base_pfn, int page_shift,
+  unsigned long len, bool deflate)
+{
+ram_addr_t size, processed, chunk, base;
+void *addr;
+MemoryRegionSection section = {.mr = NULL};
+
+size = (len << page_shift);
+base = (base_pfn << page_shift);
+
+for (processed = 0; processed < size; processed += chunk) {
+chunk = size - processed;
+while (chunk >= TARGET_PAGE_SIZE) {
+section = memory_region_find(get_system_memory(),
+ base + processed, chunk);
+if (!section.mr) {
+chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE);
+} else {
+break;
+}
+}
+
+if (section.mr &&
+(int128_nz(section.size) && memory_region_is_ram(section.mr))) {
+addr = section.offset_within_region +
+   memory_region_get_ram_ptr(section.mr);
+qemu_madvise(addr, chunk,
+ deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
+} else {
+fprintf(stderr, "can't find the chunk, skip\n");
+chunk = TARGET_PAGE_SIZE;
+}
+}
+}
+
+static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long *bitmap,
+   unsigned long len, int page_shift, bool deflate)
+{
+#if defined(__linux__)
+unsigned long end  = len * 8;
+unsigned long current = 0;
+
+if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
+ kvm_has_sync_mmu())) {
+while (current < end) {
+unsigned long one = find_next_bit(bitmap, end, current);
+
+if (one < end) {
+unsigned long zero = find_next_zero_bit(bitmap, end, one + 1);
+unsigned long page_length;
+
+if (zero >= end) {
+page_length = end - one;
+} else {
+page_length = zero - one;
+}
+
+if (page_length) {
+do_balloon_bulk_pages(base_pfn + one, page_shift,
+  page_length, deflate);
+}
+current = one + page_length;
+} else {
+current = one;
+}
+}
+}
+#endif
+}
+
 static const char *balloon_stat_names[] = {
[VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
[VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
@@ -78,6 +148,12 @@ static bool balloon_stats_supported(const VirtIOBalloon *s)
 return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
 }
 
+static bool balloon_page_bitmap_supported(const VirtIOBalloon *s)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
+return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
+}
+
 static bool balloon_stats_enabled(const VirtIOBalloon *s)
 {
 return s->stats_poll_interval > 0;
@@ -224,27 +300,66 @@ static void virtio_balloon_handle_output(VirtIODevice 
*vdev, VirtQueue *vq)
 return;
 }
 
-while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
-ram_addr_t pa;
-ram_addr_t addr;
-int p = virtio_ldl_p(vdev, &pfn);
-
-pa = (ram_addr_t) p << VIRTIO_BALLOON_PFN_SHIFT;