[PATCH v14 4/5] mm: support reporting free page blocks

2017-08-16 Thread Wei Wang
This patch adds support to walk through the free page blocks in the
system and report them via a callback function. Some page blocks may
leave the free list after zone->lock is released, so it is the caller's
responsibility to either detect or prevent the use of such pages.

Signed-off-by: Wei Wang 
Signed-off-by: Liang Li 
Cc: Michal Hocko 
Cc: Michael S. Tsirkin 
---
 include/linux/mm.h |  6 ++
 mm/page_alloc.c| 44 
 2 files changed, 50 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46b9ac5..cd29b9f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1835,6 +1835,12 @@ extern void free_area_init_node(int nid, unsigned long * 
zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
 extern void free_initmem(void);
 
+extern void walk_free_mem_block(void *opaque1,
+   unsigned int min_order,
+   void (*visit)(void *opaque2,
+ unsigned long pfn,
+ unsigned long nr_pages));
+
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
  * into the buddy system. The freed pages will be poisoned with pattern
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d00f74..a721a35 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4762,6 +4762,50 @@ void show_free_areas(unsigned int filter, nodemask_t 
*nodemask)
show_swap_cache_info();
 }
 
+/**
+ * walk_free_mem_block - Walk through the free page blocks in the system
+ * @opaque1: the context passed from the caller
+ * @min_order: the minimum order of free lists to check
+ * @visit: the callback function given by the caller
+ *
+ * The function is used to walk through the free page blocks in the system,
+ * and each free page block is reported to the caller via the @visit callback.
+ * Please note:
+ * 1) The function is used to report hints of free pages, so the caller should
+ * not use those reported pages after the callback returns.
+ * 2) The callback is invoked with the zone->lock being held, so it should not
+ * block and should finish as soon as possible.
+ */
+void walk_free_mem_block(void *opaque1,
+unsigned int min_order,
+void (*visit)(void *opaque2,
+  unsigned long pfn,
+  unsigned long nr_pages))
+{
+   struct zone *zone;
+   struct page *page;
+   struct list_head *list;
+   unsigned int order;
+   enum migratetype mt;
+   unsigned long pfn, flags;
+
+   for_each_populated_zone(zone) {
+   for (order = MAX_ORDER - 1;
+order < MAX_ORDER && order >= min_order; order--) {
+   for (mt = 0; mt < MIGRATE_TYPES; mt++) {
+   spin_lock_irqsave(>lock, flags);
+   list = >free_area[order].free_list[mt];
+   list_for_each_entry(page, list, lru) {
+   pfn = page_to_pfn(page);
+   visit(opaque1, pfn, 1 << order);
+   }
+   spin_unlock_irqrestore(>lock, flags);
+   }
+   }
+   }
+}
+EXPORT_SYMBOL_GPL(walk_free_mem_block);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
zoneref->zone = zone;
-- 
2.7.4

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v14 5/5] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

2017-08-16 Thread Wei Wang
Add a new vq to report hints of guest free pages to the host.

Signed-off-by: Wei Wang 
Signed-off-by: Liang Li 
---
 drivers/virtio/virtio_balloon.c | 167 +++-
 include/uapi/linux/virtio_balloon.h |   1 +
 2 files changed, 147 insertions(+), 21 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 72041b4..e6755bc 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -54,11 +54,12 @@ static struct vfsmount *balloon_mnt;
 
 struct virtio_balloon {
struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
 
/* The balloon servicing is delegated to a freezable workqueue. */
struct work_struct update_balloon_stats_work;
struct work_struct update_balloon_size_work;
+   struct work_struct report_free_page_work;
 
/* Prevent updating balloon when it is being canceled. */
spinlock_t stop_update_lock;
@@ -90,6 +91,13 @@ struct virtio_balloon {
/* Memory statistics */
struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
 
+   /*
+* Used by the device and driver to signal each other.
+* device->driver: start the free page report.
+* driver->device: end the free page report.
+*/
+   __virtio32 report_free_page_signal;
+
/* To register callback in oom notifier call chain */
struct notifier_block nb;
 };
@@ -174,6 +182,17 @@ static void send_balloon_page_sg(struct virtio_balloon *vb,
} while (unlikely(ret == -ENOSPC));
 }
 
+static void send_free_page_sg(struct virtqueue *vq, void *addr, uint32_t size)
+{
+   unsigned int len;
+
+   add_one_sg(vq, addr, size);
+   virtqueue_kick(vq);
+   /* Release entries, if there are any */
+   while (virtqueue_get_buf(vq, ))
+   ;
+}
+
 /*
  * Send balloon pages in sgs to host. The balloon pages are recorded in the
  * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
@@ -511,42 +530,143 @@ static void update_balloon_size_func(struct work_struct 
*work)
queue_work(system_freezable_wq, work);
 }
 
+static void virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
+  unsigned long nr_pages)
+{
+   struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
+   void *addr = (void *)pfn_to_kaddr(pfn);
+   uint32_t len = nr_pages << PAGE_SHIFT;
+
+   send_free_page_sg(vb->free_page_vq, addr, len);
+}
+
+static void report_free_page_completion(struct virtio_balloon *vb)
+{
+   struct virtqueue *vq = vb->free_page_vq;
+   struct scatterlist sg;
+   unsigned int len;
+   int ret;
+
+   sg_init_one(, >report_free_page_signal, sizeof(__virtio32));
+retry:
+   ret = virtqueue_add_outbuf(vq, , 1, vb, GFP_KERNEL);
+   virtqueue_kick(vq);
+   if (unlikely(ret == -ENOSPC)) {
+   wait_event(vb->acked, virtqueue_get_buf(vq, ));
+   goto retry;
+   }
+}
+
+static void report_free_page(struct work_struct *work)
+{
+   struct virtio_balloon *vb;
+
+   vb = container_of(work, struct virtio_balloon, report_free_page_work);
+   walk_free_mem_block(vb, 0, _balloon_send_free_pages);
+   report_free_page_completion(vb);
+}
+
+static void free_page_request(struct virtqueue *vq)
+{
+   struct virtio_balloon *vb = vq->vdev->priv;
+
+   queue_work(system_freezable_wq, >report_free_page_work);
+}
+
 static int init_vqs(struct virtio_balloon *vb)
 {
-   struct virtqueue *vqs[3];
-   vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request 
};
-   static const char * const names[] = { "inflate", "deflate", "stats" };
-   int err, nvqs;
+   struct virtqueue **vqs;
+   vq_callback_t **callbacks;
+   const char **names;
+   struct scatterlist sg;
+   int i, nvqs, err = -ENOMEM;
+
+   /* Inflateq and deflateq are used unconditionally */
+   nvqs = 2;
+   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ))
+   nvqs++;
+   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ))
+   nvqs++;
+
+   /* Allocate space for find_vqs parameters */
+   vqs = kcalloc(nvqs, sizeof(*vqs), GFP_KERNEL);
+   if (!vqs)
+   goto err_vq;
+   callbacks = kmalloc_array(nvqs, sizeof(*callbacks), GFP_KERNEL);
+   if (!callbacks)
+   goto err_callback;
+   names = kmalloc_array(nvqs, sizeof(*names), GFP_KERNEL);
+   if (!names)
+   goto err_names;
+
+   callbacks[0] = balloon_ack;
+   names[0] = "inflate";
+   callbacks[1] = balloon_ack;
+   names[1] = "deflate";
+
+   i = 2;
+   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+  

[PATCH v14 0/5] Virtio-balloon Enhancement

2017-08-16 Thread Wei Wang
This patch series enhances the existing virtio-balloon with the following
new features:
1) fast ballooning: transfer ballooned pages between the guest and host in
chunks using sgs, instead of one by one; and
2) free_page_vq: a new virtqueue to report guest free pages to the host.

The second feature can be used to accelerate live migration of VMs. Here
are some details:

Live migration needs to transfer the VM's memory from the source machine
to the destination round by round. For the 1st round, all the VM's memory
is transferred. From the 2nd round, only the pieces of memory that were
written by the guest (after the 1st round) are transferred. One method
that is popularly used by the hypervisor to track which part of memory is
written is to write-protect all the guest memory.

The second feature enables the optimization of the 1st round memory
transfer - the hypervisor can skip the transfer of guest free pages in the
1st round. It is not a concern if the memory pages are used after they
are given to the hypervisor as a hint of the free pages, because they will
be tracked by the hypervisor and transferred in the next round if they are
used and written.

Change Log:
v13->v14:
1) xbitmap: move the code from lib/radix-tree.c to lib/xbitmap.c.
2) xbitmap: consolidate the implementation of xb_bit_set/clear/test into
one xb_bit_ops.
3) xbitmap: add documents for the exported APIs.
4) mm: rewrite the function to walk through free page blocks.
5) virtio-balloon: when reporting a free page block to the device, if the
vq is full (less likely to happen in practice), just skip reporting this
block, instead of busy-waiting till an entry gets released.
6) virtio-balloon: fail the probe function if adding the signal buf in
init_vqs fails.

v12->v13:
1) mm: use a callback function to handle the free page blocks from the
report function. This avoids exposing the zone internal to a kernel module.
2) virtio-balloon: send balloon pages or a free page block using a single sg
each time. This has the benefits of simpler implementation with no new APIs.
3) virtio-balloon: the free_page_vq is used to report free pages only (no
multiple usages interleaving)
4) virtio-balloon: Balloon pages and free page blocks are sent via input sgs,
and the completion signal to the host is sent via an output sg.

v11->v12:
1) xbitmap: use the xbitmap from Matthew Wilcox to record ballooned pages.
2) virtio-ring: enable the driver to build up a desc chain using vring desc.
3) virtio-ring: Add locking to the existing START_USE() and END_USE() macro
to lock/unlock the vq when a vq operation starts/ends.
4) virtio-ring: add virtqueue_kick_sync() and virtqueue_kick_async()
5) virtio-balloon: describe chunks of ballooned pages and free pages blocks
directly using one or more chains of desc from the vq.

v10->v11:
1) virtio_balloon: use vring_desc to describe a chunk;
2) virtio_ring: support to add an indirect desc table to virtqueue;
3) virtio_balloon: use cmdq to report guest memory statistics.

v9->v10:
1) mm: put report_unused_page_block() under CONFIG_VIRTIO_BALLOON;
2) virtio-balloon: add virtballoon_validate();
3) virtio-balloon: msg format change;
4) virtio-balloon: move miscq handling to a task on system_freezable_wq;
5) virtio-balloon: code cleanup.

v8->v9:
1) Split the two new features, VIRTIO_BALLOON_F_BALLOON_CHUNKS and
VIRTIO_BALLOON_F_MISC_VQ, which were mixed together in the previous
implementation;
2) Simpler function to get the free page block.

v7->v8:
1) Use only one chunk format, instead of two.
2) re-write the virtio-balloon implementation patch.
3) commit changes
4) patch re-org

Matthew Wilcox (1):
  lib/xbitmap: Introduce xbitmap

Wei Wang (4):
  lib/xbitmap: add xb_find_next_bit() and xb_zero()
  virtio-balloon: VIRTIO_BALLOON_F_SG
  mm: support reporting free page blocks
  virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

 drivers/virtio/virtio_balloon.c | 324 +++-
 include/linux/mm.h  |   6 +
 include/linux/radix-tree.h  |   3 +
 include/linux/xbitmap.h |  64 +++
 include/uapi/linux/virtio_balloon.h |   2 +
 lib/Makefile|   2 +-
 lib/radix-tree.c|  22 ++-
 lib/xbitmap.c   | 215 
 mm/page_alloc.c |  44 +
 9 files changed, 640 insertions(+), 42 deletions(-)
 create mode 100644 include/linux/xbitmap.h
 create mode 100644 lib/xbitmap.c

-- 
2.7.4

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v14 1/5] lib/xbitmap: Introduce xbitmap

2017-08-16 Thread Wei Wang
From: Matthew Wilcox 

The eXtensible Bitmap is a sparse bitmap representation which is
efficient for set bits which tend to cluster.  It supports up to
'unsigned long' worth of bits, and this commit adds the bare bones --
xb_set_bit(), xb_clear_bit() and xb_test_bit().

Signed-off-by: Matthew Wilcox 
Signed-off-by: Wei Wang 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Michael S. Tsirkin 
---
 include/linux/radix-tree.h |   3 +
 include/linux/xbitmap.h|  61 
 lib/Makefile   |   2 +-
 lib/radix-tree.c   |  22 +-
 lib/xbitmap.c  | 176 +
 5 files changed, 260 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/xbitmap.h
 create mode 100644 lib/xbitmap.c

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 3e57350..e1203b1 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -309,6 +309,8 @@ void radix_tree_iter_replace(struct radix_tree_root *,
const struct radix_tree_iter *, void __rcu **slot, void *entry);
 void radix_tree_replace_slot(struct radix_tree_root *,
 void __rcu **slot, void *entry);
+bool __radix_tree_delete(struct radix_tree_root *root,
+struct radix_tree_node *node, void __rcu **slot);
 void __radix_tree_delete_node(struct radix_tree_root *,
  struct radix_tree_node *,
  radix_tree_update_node_t update_node,
@@ -325,6 +327,7 @@ unsigned int radix_tree_gang_lookup(const struct 
radix_tree_root *,
 unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *,
void __rcu ***results, unsigned long *indices,
unsigned long first_index, unsigned int max_items);
+int __radix_tree_preload(gfp_t gfp_mask, unsigned int nr);
 int radix_tree_preload(gfp_t gfp_mask);
 int radix_tree_maybe_preload(gfp_t gfp_mask);
 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
diff --git a/include/linux/xbitmap.h b/include/linux/xbitmap.h
new file mode 100644
index 000..5edbf84
--- /dev/null
+++ b/include/linux/xbitmap.h
@@ -0,0 +1,61 @@
+/*
+ * eXtensible Bitmaps
+ * Copyright (c) 2017 Microsoft Corporation 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * eXtensible Bitmaps provide an unlimited-size sparse bitmap facility.
+ * All bits are initially zero.
+ */
+
+#ifndef __XBITMAP_H__
+#define __XBITMAP_H__
+
+#include 
+
+struct xb {
+   struct radix_tree_root xbrt;
+};
+
+#define XB_INIT {  \
+   .xbrt = RADIX_TREE_INIT(IDR_RT_MARKER | GFP_NOWAIT),\
+}
+#define DEFINE_XB(name)struct xb name = XB_INIT
+
+static inline void xb_init(struct xb *xb)
+{
+   INIT_RADIX_TREE(>xbrt, IDR_RT_MARKER | GFP_NOWAIT);
+}
+
+int xb_set_bit(struct xb *xb, unsigned long bit);
+bool xb_test_bit(const struct xb *xb, unsigned long bit);
+void xb_clear_bit(struct xb *xb, unsigned long bit);
+
+/* Check if the xb tree is empty */
+static inline bool xb_is_empty(const struct xb *xb)
+{
+   return radix_tree_empty(>xbrt);
+}
+
+void xb_preload(gfp_t gfp);
+
+/**
+ * xb_preload_end - end preload section started with xb_preload()
+ *
+ * Each xb_preload() should be matched with an invocation of this
+ * function. See xb_preload() for details.
+ */
+static inline void xb_preload_end(void)
+{
+   preempt_enable();
+}
+
+#endif
diff --git a/lib/Makefile b/lib/Makefile
index 40c1837..ea50496 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -18,7 +18,7 @@ KCOV_INSTRUMENT_dynamic_debug.o := n
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
-idr.o int_sqrt.o extable.o \
+idr.o xbitmap.o int_sqrt.o extable.o \
 sha1.o chacha20.o irq_regs.o argv_split.o \
 flex_proportions.o ratelimit.o show_mem.o \
 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 898e879..ee72e2c 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -463,7 +463,7 @@ radix_tree_node_free(struct radix_tree_node *node)
  * To make use of this facility, the radix tree must be initialised without
  * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
  

[PATCH v14 3/5] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-08-16 Thread Wei Wang
Add a new feature, VIRTIO_BALLOON_F_SG, which enables the transfer
of balloon (i.e. inflated/deflated) pages using scatter-gather lists
to the host.

The implementation of the previous virtio-balloon is not very
efficient, because the balloon pages are transferred to the
host one by one. Here is the breakdown of the time in percentage
spent on each step of the balloon inflating process (inflating
7GB of an 8GB idle guest).

1) allocating pages (6.5%)
2) sending PFNs to host (68.3%)
3) address translation (6.1%)
4) madvise (19%)

It takes about 4126ms for the inflating process to complete.
The above profiling shows that the bottlenecks are stage 2)
and stage 4).

This patch optimizes step 2) by transferring pages to the host in
sgs. An sg describes a chunk of physically contiguous guest pages.
With this mechanism, step 4) can also be optimized by doing address
translation and madvise() in chunks rather than page by page.

With this new feature, the above ballooning process takes ~541ms
resulting in an improvement of ~87%.

TODO: optimize stage 1) by allocating/freeing a chunk of pages
instead of a single page each time.

Signed-off-by: Wei Wang 
Signed-off-by: Liang Li 
Suggested-by: Michael S. Tsirkin 
---
 drivers/virtio/virtio_balloon.c | 157 
 include/uapi/linux/virtio_balloon.h |   1 +
 2 files changed, 141 insertions(+), 17 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index f0b3a0b..72041b4 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
@@ -79,6 +80,9 @@ struct virtio_balloon {
/* Synchronize access/update to this struct virtio_balloon elements */
struct mutex balloon_lock;
 
+   /* The xbitmap used to record ballooned pages */
+   struct xb page_xb;
+
/* The array of pfns we tell the Host about. */
unsigned int num_pfns;
__virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
@@ -141,13 +145,98 @@ static void set_page_pfns(struct virtio_balloon *vb,
  page_to_balloon_pfn(page) + i);
 }
 
+static int add_one_sg(struct virtqueue *vq, void *addr, uint32_t size)
+{
+   struct scatterlist sg;
+
+   sg_init_one(, addr, size);
+   return virtqueue_add_inbuf(vq, , 1, vq, GFP_KERNEL);
+}
+
+static void send_balloon_page_sg(struct virtio_balloon *vb,
+struct virtqueue *vq,
+void *addr,
+uint32_t size)
+{
+   unsigned int len;
+   int ret;
+
+   do {
+   ret = add_one_sg(vq, addr, size);
+   virtqueue_kick(vq);
+   wait_event(vb->acked, virtqueue_get_buf(vq, ));
+   /*
+* It is uncommon to see the vq is full, because the sg is sent
+* one by one and the device is able to handle it in time. But
+* if that happens, we go back to retry after an entry gets
+* released.
+*/
+   } while (unlikely(ret == -ENOSPC));
+}
+
+/*
+ * Send balloon pages in sgs to host. The balloon pages are recorded in the
+ * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
+ * The page xbitmap is searched for continuous "1" bits, which correspond
+ * to continuous pages, to chunk into sgs.
+ *
+ * @page_xb_start and @page_xb_end form the range of bits in the xbitmap that
+ * need to be searched.
+ */
+static void tell_host_sgs(struct virtio_balloon *vb,
+ struct virtqueue *vq,
+ unsigned long page_xb_start,
+ unsigned long page_xb_end)
+{
+   unsigned long sg_pfn_start, sg_pfn_end;
+   void *sg_addr;
+   uint32_t sg_len, sg_max_len = round_down(UINT_MAX, PAGE_SIZE);
+
+   sg_pfn_start = page_xb_start;
+   while (sg_pfn_start < page_xb_end) {
+   sg_pfn_start = xb_find_next_bit(>page_xb, sg_pfn_start,
+   page_xb_end, 1);
+   if (sg_pfn_start == page_xb_end + 1)
+   break;
+   sg_pfn_end = xb_find_next_bit(>page_xb, sg_pfn_start + 1,
+ page_xb_end, 0);
+   sg_addr = (void *)pfn_to_kaddr(sg_pfn_start);
+   sg_len = (sg_pfn_end - sg_pfn_start) << PAGE_SHIFT;
+   while (sg_len > sg_max_len) {
+   send_balloon_page_sg(vb, vq, sg_addr, sg_max_len);
+   sg_addr += sg_max_len;
+   sg_len -= sg_max_len;
+   }
+   send_balloon_page_sg(vb, vq, sg_addr, sg_len);
+   xb_zero(>page_xb, sg_pfn_start, sg_pfn_end);
+ 

[PATCH v14 2/5] lib/xbitmap: add xb_find_next_bit() and xb_zero()

2017-08-16 Thread Wei Wang
xb_find_next_bit() is used to find the next "1" or "0" bit in the
given range. xb_zero() is used to zero the given range of bits.

Signed-off-by: Wei Wang 
Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Michal Hocko 
Cc: Michael S. Tsirkin 
---
 include/linux/xbitmap.h |  3 +++
 lib/xbitmap.c   | 39 +++
 2 files changed, 42 insertions(+)

diff --git a/include/linux/xbitmap.h b/include/linux/xbitmap.h
index 5edbf84..739d08c 100644
--- a/include/linux/xbitmap.h
+++ b/include/linux/xbitmap.h
@@ -38,6 +38,9 @@ static inline void xb_init(struct xb *xb)
 int xb_set_bit(struct xb *xb, unsigned long bit);
 bool xb_test_bit(const struct xb *xb, unsigned long bit);
 void xb_clear_bit(struct xb *xb, unsigned long bit);
+void xb_zero(struct xb *xb, unsigned long start, unsigned long end);
+unsigned long xb_find_next_bit(struct xb *xb, unsigned long start,
+  unsigned long end, bool set);
 
 /* Check if the xb tree is empty */
 static inline bool xb_is_empty(const struct xb *xb)
diff --git a/lib/xbitmap.c b/lib/xbitmap.c
index cc766d9..2267ac2 100644
--- a/lib/xbitmap.c
+++ b/lib/xbitmap.c
@@ -174,3 +174,42 @@ void xb_preload(gfp_t gfp)
}
 }
 EXPORT_SYMBOL(xb_preload);
+
+/**
+ *  xb_zero - zero a range of bits in the xbitmap
+ *  @xb: the xbitmap that the bits reside in
+ *  @start: the start of the range, inclusive
+ *  @end: the end of the range, inclusive
+ */
+void xb_zero(struct xb *xb, unsigned long start, unsigned long end)
+{
+   unsigned long i;
+
+   for (i = start; i <= end; i++)
+   xb_clear_bit(xb, i);
+}
+EXPORT_SYMBOL(xb_zero);
+
+/**
+ * xb_find_next_bit - find next 1 or 0 in the given range of bits
+ * @xb: the xbitmap that the bits reside in
+ * @start: the start of the range, inclusive
+ * @end: the end of the range, inclusive
+ * @set: the polarity (1 or 0) of the next bit to find
+ *
+ * Return the index of the found bit in the xbitmap. If the returned index
+ * exceeds @end, it indicates that no such bit is found in the given range.
+ */
+unsigned long xb_find_next_bit(struct xb *xb, unsigned long start,
+  unsigned long end, bool set)
+{
+   unsigned long i;
+
+   for (i = start; i <= end; i++) {
+   if (xb_test_bit(xb, i) == set)
+   break;
+   }
+
+   return i;
+}
+EXPORT_SYMBOL(xb_find_next_bit);
-- 
2.7.4

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [patch net-next 0/3] net/sched: Improve getting objects by indexes

2017-08-16 Thread Chris Wilson
Quoting Christian König (2017-08-16 08:49:07)
> Am 16.08.2017 um 04:12 schrieb Chris Mi:
> > Using current TC code, it is very slow to insert a lot of rules.
> >
> > In order to improve the rules update rate in TC,
> > we introduced the following two changes:
> >  1) changed cls_flower to use IDR to manage the filters.
> >  2) changed all act_xxx modules to use IDR instead of
> > a small hash table
> >
> > But IDR has a limitation that it uses int. TC handle uses u32.
> > To make sure there is no regression, we also changed IDR to use
> > unsigned long. All clients of IDR are changed to use new IDR API.
> 
> WOW, wait a second. The idr change is touching a lot of drivers and to 
> be honest doesn't looks correct at all.
> 
> Just look at the first chunk of your modification:
> > @@ -998,8 +999,9 @@ int bsg_register_queue(struct request_queue *q, struct 
> > device *parent,
> >   
> >   mutex_lock(_mutex);
> >   
> > - ret = idr_alloc(_minor_idr, bcd, 0, BSG_MAX_DEVS, GFP_KERNEL);
> > - if (ret < 0) {
> > + ret = idr_alloc(_minor_idr, bcd, _index, 0, BSG_MAX_DEVS,
> > + GFP_KERNEL);
> > + if (ret) {
> >   if (ret == -ENOSPC) {
> >   printk(KERN_ERR "bsg: too many bsg devices\n");
> >   ret = -EINVAL;
> The condition "if (ret)" will now always be true after the first 
> allocation and so we always run into the error handling after that.

ret is now purely the error code, so it doesn't look that suspicious.

> I've never read the bsg code before, but that's certainly not correct. 
> And that incorrect pattern repeats over and over again in this code.
> 
> Apart from that why the heck do you want to allocate more than 1<<31 
> handles?

And more to the point, arbitrarily changing the maximum to ULONG_MAX
where the ABI only supports U32_MAX is dangerous. Unless you do the
analysis otherwise, you have to replace all the end=0 with end=INT_MAX
to maintain existing behaviour.
-Chris
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [virtio-dev] [PATCH v13 0/5] Virtio-balloon Enhancement

2017-08-16 Thread Wei Wang

On 08/16/2017 01:57 PM, Adam Tao wrote:

On Thu, Aug 03, 2017 at 02:38:14PM +0800, Wei Wang wrote:

This patch series enhances the existing virtio-balloon with the following
new features:
1) fast ballooning: transfer ballooned pages between the guest and host in
chunks using sgs, instead of one by one; and
2) free_page_vq: a new virtqueue to report guest free pages to the host.


Hi wei,
The reason we add the new vq for the migration feature is based on
what(original design based on inflate and deflate vq)?
I am wondering: if we add a new feature in the future, do we still need to
add a new type of vq?
Do we need to add one command queue for the common purpose(including
different type of requests except the in/deflate ones)?
Thanks
Adam


Hi Adam,

The free_page_vq is added to report free pages to the hypervisor.
Neither inflate nor deflate vq was for this purpose.

Based on the current implementation, a vq dedicated to one usage (i.e.
reporting free pages) is better, since mixing it with other usages, e.g. a
command vq to handle multiple commands at the same time, would have some
issues (e.g. one being delayed by another due to some resource control),
and it also results in more complex interfaces between the driver and
device.

For future usages which are still unknown at present, I think we can discuss
them case by case in the future.

Best,
Wei


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization