Re: [PATCH v3 0/2] shrink virtio balloon on OOM in guest
On Wed, Oct 15, 2014 at 07:47:42PM +0400, Denis V. Lunev wrote:
> Excessive virtio_balloon inflation can cause invocation of the OOM-killer
> when Linux is under severe memory pressure. Various mechanisms are
> responsible for correct virtio_balloon memory management. Nevertheless it
> is often the case that these control tools do not have enough time to
> react to a fast-changing memory load. As a result the OS runs out of
> memory and invokes the OOM-killer. The balancing of memory by use of the
> virtio balloon should not cause the termination of processes while there
> are pages in the balloon. Currently there is no way for the virtio
> balloon driver to free memory at the last moment before some process
> gets killed by the OOM-killer.
>
> This does not open a security hole, as the balloon itself runs inside
> the guest OS and works in cooperation with the host. Thus such
> improvements from the guest side should be considered normal.
>
> To solve the problem, introduce a virtio_balloon callback which is
> expected to be called from the oom notifier call chain in the
> out_of_memory() function. If the virtio balloon can release some memory,
> it will make the system return and retry the allocation that forced the
> out-of-memory killer to run.
>
> Patch 1 of this series adds support for implementing the virtio_balloon
> callback, so leak_balloon() now returns the number of freed pages.
> Patch 2 implements the virtio_balloon callback itself.
>
> Changes from v2:
> - added a feature bit to control OOM balloon behavior from the host
> Changes from v1:
> - minor cosmetic tweaks suggested by rusty@
>
> Signed-off-by: Raushaniya Maksudova
> Signed-off-by: Denis V. Lunev
> CC: Rusty Russell
> CC: Michael S. Tsirkin

With the feature bit, I think it's fine.

Acked-by: Michael S. Tsirkin
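[Editor's note: for readers new to the mechanism the cover letter relies on, the kernel exposes an OOM notifier chain that out_of_memory() walks before killing anything. A minimal standalone sketch of that hook follows; the names and the freed-page accounting here are illustrative, not the series' code.]

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/oom.h>

/* Hypothetical callback: the OOM killer passes a pointer to an unsigned
 * long through @parm; the notifier adds the number of pages it freed. */
static int example_oom_notify(struct notifier_block *self,
			      unsigned long dummy, void *parm)
{
	unsigned long *freed = parm;

	/* A real driver would release memory here and bump *freed. */
	*freed += 0;
	return NOTIFY_OK;
}

static struct notifier_block example_oom_nb = {
	.notifier_call = example_oom_notify,
};

static int __init example_init(void)
{
	return register_oom_notifier(&example_oom_nb);
}

static void __exit example_exit(void)
{
	unregister_oom_notifier(&example_oom_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

If any registered notifier reports freed pages, out_of_memory() returns without killing, and the allocation that triggered it is retried.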
[PATCH RFC v3 1/3] virtio_net: enable tx interrupt
On newer hosts that support delayed tx interrupts, we probably don't have
much to gain from orphaning packets early.

Based on patch by Jason Wang.

Note: this might degrade performance for hosts without event idx support.
Should be addressed by the next patch.

Signed-off-by: Michael S. Tsirkin
---
 drivers/net/virtio_net.c | 133 +++
 1 file changed, 89 insertions(+), 44 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 13d0a8b..14f4cda 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -72,6 +72,8 @@ struct send_queue {
 	/* Name of the send queue: output.$index */
 	char name[40];
+
+	struct napi_struct napi;
 };

 /* Internal representation of a receive virtqueue */
@@ -217,15 +219,41 @@ static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 	return p;
 }

+static unsigned int free_old_xmit_skbs(struct netdev_queue *txq,
+				       struct send_queue *sq, int budget)
+{
+	struct sk_buff *skb;
+	unsigned int len;
+	struct virtnet_info *vi = sq->vq->vdev->priv;
+	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
+	unsigned int packets = 0;
+
+	while (packets < budget &&
+	       (skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
+		pr_debug("Sent skb %p\n", skb);
+
+		u64_stats_update_begin(&stats->tx_syncp);
+		stats->tx_bytes += skb->len;
+		stats->tx_packets++;
+		u64_stats_update_end(&stats->tx_syncp);
+
+		dev_kfree_skb_any(skb);
+		packets++;
+	}
+
+	if (sq->vq->num_free >= 2+MAX_SKB_FRAGS)
+		netif_tx_start_queue(txq);
+
+	return packets;
+}
+
 static void skb_xmit_done(struct virtqueue *vq)
 {
 	struct virtnet_info *vi = vq->vdev->priv;
+	struct send_queue *sq = &vi->sq[vq2txq(vq)];

-	/* Suppress further interrupts. */
-	virtqueue_disable_cb(vq);
-
-	/* We were probably waiting for more output buffers. */
-	netif_wake_subqueue(vi->dev, vq2txq(vq));
+	virtqueue_disable_cb(sq->vq);
+	napi_schedule(&sq->napi);
 }

 static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
@@ -774,6 +802,31 @@ again:
 	return received;
 }

+static int virtnet_poll_tx(struct napi_struct *napi, int budget)
+{
+	struct send_queue *sq =
+		container_of(napi, struct send_queue, napi);
+	struct virtnet_info *vi = sq->vq->vdev->priv;
+	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));
+	unsigned int sent;
+
+	__netif_tx_lock(txq, smp_processor_id());
+	sent = free_old_xmit_skbs(txq, sq, budget);
+	if (sent < budget) {
+		napi_complete(napi);
+		/* Note: we must enable cb *after* napi_complete, because
+		 * napi_schedule calls from callbacks that trigger before
+		 * napi_complete are ignored.
+		 */
+		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
+			virtqueue_disable_cb(sq->vq);
+			napi_schedule(&sq->napi);
+		}
+	}
+	__netif_tx_unlock(txq);
+	return sent;
+}
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 /* must be called with local_bh_disable()d */
 static int virtnet_busy_poll(struct napi_struct *napi)
@@ -822,30 +875,12 @@ static int virtnet_open(struct net_device *dev)
 		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
 			schedule_delayed_work(&vi->refill, 0);
 		virtnet_napi_enable(&vi->rq[i]);
+		napi_enable(&vi->sq[i].napi);
 	}

 	return 0;
 }

-static void free_old_xmit_skbs(struct send_queue *sq)
-{
-	struct sk_buff *skb;
-	unsigned int len;
-	struct virtnet_info *vi = sq->vq->vdev->priv;
-	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
-
-	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
-		pr_debug("Sent skb %p\n", skb);
-
-		u64_stats_update_begin(&stats->tx_syncp);
-		stats->tx_bytes += skb->len;
-		stats->tx_packets++;
-		u64_stats_update_end(&stats->tx_syncp);
-
-		dev_kfree_skb_any(skb);
-	}
-}
-
 static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 {
 	struct skb_vnet_hdr *hdr;
@@ -911,7 +946,9 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 		sg_set_buf(sq->sg, hdr, hdr_len);
 		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
 	}
-	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
+
+	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb,
+				    GFP_ATOMIC);
 }

 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
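[Editor's note: the quoted diff breaks off here, before the probe-side wiring of the new tx NAPI instance. For orientation only, hooking a tx NAPI context up conventionally looks like the sketch below; the call site and weight are assumptions, not the literal patch text.]

	/* Assumed setup-side wiring (hypothetical placement, e.g. where the
	 * driver allocates its queues); virtnet_poll_tx is the poll function
	 * added above, napi_weight the driver's existing NAPI weight. */
	netif_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx, napi_weight);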
[PATCH RFC v3 2/3] virtio_net: bql
Improve tx batching using byte queue limits. Should be especially
effective for MQ.

Signed-off-by: Michael S. Tsirkin
---
 drivers/net/virtio_net.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 14f4cda..b83d39d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -227,6 +227,7 @@ static unsigned int free_old_xmit_skbs(struct netdev_queue *txq,
 	struct virtnet_info *vi = sq->vq->vdev->priv;
 	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 	unsigned int packets = 0;
+	unsigned int bytes = 0;

 	while (packets < budget &&
 	       (skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
@@ -234,6 +235,7 @@ static unsigned int free_old_xmit_skbs(struct netdev_queue *txq,
 		u64_stats_update_begin(&stats->tx_syncp);
 		stats->tx_bytes += skb->len;
+		bytes += skb->len;
 		stats->tx_packets++;
 		u64_stats_update_end(&stats->tx_syncp);
@@ -241,6 +243,8 @@ static unsigned int free_old_xmit_skbs(struct netdev_queue *txq,
 		packets++;
 	}

+	netdev_tx_completed_queue(txq, packets, bytes);
+
 	if (sq->vq->num_free >= 2+MAX_SKB_FRAGS)
 		netif_tx_start_queue(txq);
@@ -959,6 +963,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	int err;
 	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
 	bool kick = !skb->xmit_more;
+	unsigned int bytes = skb->len;

 	virtqueue_disable_cb(sq->vq);
@@ -976,6 +981,8 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 		return NETDEV_TX_OK;
 	}

+	netdev_tx_sent_queue(txq, bytes);
+
 	/* Apparently nice girls don't return TX_BUSY; stop the queue
 	 * before it gets out of hand.  Naturally, this wastes entries. */
 	if (sq->vq->num_free < 2+MAX_SKB_FRAGS)
--
MST
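[Editor's note: as background to the patch above, byte queue limits live in the generic stack, and a driver's only obligation is to keep two counters paired. A minimal sketch of that contract with generic names, not virtio_net code:]

#include <linux/netdevice.h>

/* Every byte reported to netdev_tx_sent_queue() on the xmit path must
 * eventually be reported back via netdev_tx_completed_queue(). */
static void example_xmit_account(struct netdev_queue *txq, unsigned int len)
{
	/* Called as the skb is queued to the device. May stop the queue
	 * internally once the dynamic in-flight byte limit is reached. */
	netdev_tx_sent_queue(txq, len);
}

static void example_tx_complete(struct netdev_queue *txq,
				unsigned int packets, unsigned int bytes)
{
	/* Called from the completion path; may re-wake a queue that BQL
	 * stopped, and feeds the limit-estimation algorithm. */
	netdev_tx_completed_queue(txq, packets, bytes);
}

A driver that resets its tx ring must also call netdev_tx_reset_queue() so the two counters stay in sync.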
[PATCH RFC v3 3/3] virtio-net: optimize free_old_xmit_skbs stats
From: Jason Wang

We already have counters for sent packets and sent bytes. Use them to
reduce the number of u64_stats_update_begin/end().

Take care not to bother with stats update when called speculatively.

Based on a patch by Jason Wang.

Cc: Rusty Russell
Signed-off-by: Jason Wang
Signed-off-by: Michael S. Tsirkin
---
 drivers/net/virtio_net.c | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b83d39d..c2b69f8 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -233,16 +233,22 @@ static unsigned int free_old_xmit_skbs(struct netdev_queue *txq,
 	       (skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
 		pr_debug("Sent skb %p\n", skb);

-		u64_stats_update_begin(&stats->tx_syncp);
-		stats->tx_bytes += skb->len;
 		bytes += skb->len;
-		stats->tx_packets++;
-		u64_stats_update_end(&stats->tx_syncp);
+		packets++;

 		dev_kfree_skb_any(skb);
-		packets++;
 	}

+	/* Avoid overhead when no packets have been processed
+	 * happens when called speculatively from start_xmit. */
+	if (!packets)
+		return 0;
+
+	u64_stats_update_begin(&stats->tx_syncp);
+	stats->tx_bytes += bytes;
+	stats->tx_packets += packets;
+	u64_stats_update_end(&stats->tx_syncp);
+
 	netdev_tx_completed_queue(txq, packets, bytes);

 	if (sq->vq->num_free >= 2+MAX_SKB_FRAGS)
--
MST
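[Editor's note: the win here comes from the u64_stats seqcount protocol — each begin/end pair is a writer-side critical section, so batching turns one pair per packet into one per poll. A standalone sketch of the pattern with illustrative names:]

#include <linux/types.h>
#include <linux/u64_stats_sync.h>

struct example_stats {
	u64 tx_packets;
	u64 tx_bytes;
	struct u64_stats_sync syncp;
};

/* One critical section for the whole batch: on 64-bit the begin/end
 * pair compiles away entirely; on 32-bit it bumps a seqcount that
 * readers use to obtain a consistent 64-bit snapshot. */
static void example_stats_add(struct example_stats *stats,
			      unsigned int packets, unsigned int bytes)
{
	u64_stats_update_begin(&stats->syncp);
	stats->tx_packets += packets;
	stats->tx_bytes += bytes;
	u64_stats_update_end(&stats->syncp);
}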
Re: [PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.
On Mon, Oct 20, 2014 at 10:42:11AM +1030, Rusty Russell wrote:
> Amos Kong writes:
> > We got a warning in the boot stage when the above set_current_rng() is
> > executed; it can be fixed by initializing rng->ref in hwrng_init().
> >
> > @@ -166,6 +169,8 @@ static inline int hwrng_init(struct hwrng *rng)
> >  	if (current_quality > 0 && !hwrng_fill)
> >  		start_khwrngd();
> >
> > +	kref_init(&rng->ref);
> > +
> >  	return 0;
> >  }
>
> OK, I folded this fix on.

Thanks.

Reviewed-by: Amos Kong

> Thanks,
> Rusty.
>
> hw_random: use reference counts on each struct hwrng.
>
> current_rng holds one reference, and we bump it every time we want
> to do a read from it.
>
> This means we only hold the rng_mutex to grab or drop a reference,
> so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
> block on read of /dev/hwrng.
>
> Using a kref is overkill (we're always under the rng_mutex), but
> a standard pattern.
>
> This also solves the problem that the hwrng_fillfn thread was
> accessing current_rng without a lock, which could change (eg. to NULL)
> underneath it.
>
> v3: initialize kref (thanks Amos Kong)
> v2: fix missing put_rng() on exit path (thanks Amos Kong)
>
> Signed-off-by: Rusty Russell
>
> diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
> index a0905c818e13..0ecac38da954 100644
> --- a/drivers/char/hw_random/core.c
> +++ b/drivers/char/hw_random/core.c
Re: [PATCH 2/2] virtio_balloon: free some memory from balloon on OOM
"Denis V. Lunev" writes: > From: Raushaniya Maksudova > > Excessive virtio_balloon inflation can cause invocation of OOM-killer, > when Linux is under severe memory pressure. Various mechanisms are > responsible for correct virtio_balloon memory management. Nevertheless > it is often the case that these control tools does not have enough time > to react on fast changing memory load. As a result OS runs out of memory > and invokes OOM-killer. The balancing of memory by use of the virtio > balloon should not cause the termination of processes while there are > pages in the balloon. Now there is no way for virtio balloon driver to > free some memory at the last moment before some process will be get > killed by OOM-killer. > > This does not provide a security breach as balloon itself is running > inside guest OS and is working in the cooperation with the host. Thus > some improvements from guest side should be considered as normal. > > To solve the problem, introduce a virtio_balloon callback which is > expected to be called from the oom notifier call chain in out_of_memory() > function. If virtio balloon could release some memory, it will make > the system to return and retry the allocation that forced the out of > memory killer to run. > > Allocate virtio feature bit for this: it is not set by default, > the the guest will not deflate virtio balloon on OOM without explicit > permission from host. > > Signed-off-by: Raushaniya Maksudova > Signed-off-by: Denis V. Lunev > CC: Rusty Russell > CC: Michael S. Tsirkin > --- > drivers/virtio/virtio_balloon.c | 52 > + > include/uapi/linux/virtio_balloon.h | 1 + > 2 files changed, 53 insertions(+) > > diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c > index 66cac10..88d73a0 100644 > --- a/drivers/virtio/virtio_balloon.c > +++ b/drivers/virtio/virtio_balloon.c > @@ -28,6 +28,7 @@ > #include > #include > #include > +#include > > /* > * Balloon device works in 4K page units. So each page is pointed to by > @@ -36,6 +37,12 @@ > */ > #define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> > VIRTIO_BALLOON_PFN_SHIFT) > #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256 > +#define OOM_VBALLOON_DEFAULT_PAGES 256 > +#define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80 > + > +static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES; > +module_param(oom_pages, int, S_IRUSR | S_IWUSR); > +MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); > > struct virtio_balloon > { > @@ -71,6 +78,9 @@ struct virtio_balloon > /* Memory statistics */ > int need_stats_update; > struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; > + > + /* To register callback in oom notifier call chain */ > + struct notifier_block nb; > }; > > static struct virtio_device_id id_table[] = { > @@ -290,6 +300,38 @@ static void update_balloon_size(struct virtio_balloon > *vb) > &actual); > } > > +/* > + * virtballoon_oom_notify - release pages when system is under severe > + * memory pressure (called from out_of_memory()) > + * @self : notifier block struct > + * @dummy: not used > + * @parm : returned - number of freed pages > + * > + * The balancing of memory by use of the virtio balloon should not cause > + * the termination of processes while there are pages in the balloon. > + * If virtio balloon manages to release some memory, it will make the > + * system return and retry the allocation that forced the OOM killer > + * to run. 
> + */ > +static int virtballoon_oom_notify(struct notifier_block *self, > + unsigned long dummy, void *parm) > +{ > + struct virtio_balloon *vb; > + unsigned long *freed; > + unsigned num_freed_pages; > + > + vb = container_of(self, struct virtio_balloon, nb); > + if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) > + return NOTIFY_OK; > + > + freed = parm; > + num_freed_pages = leak_balloon(vb, oom_pages); > + update_balloon_size(vb); > + *freed += num_freed_pages; > + > + return NOTIFY_OK; > +} > + > static int balloon(void *_vballoon) > { > struct virtio_balloon *vb = _vballoon; OK, I applied these. I thought it a bit weird that you're checking the feature in the callback, rather than only registering the callback if the feature exists. But it's clear enough. Thanks, Rusty. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
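[Editor's note: for comparison, the alternative Rusty mentions — registering the notifier only when the feature was negotiated — would look roughly like the sketch below. The helper name and placement are hypothetical, not the applied patch; it reuses virtballoon_oom_notify and VIRTBALLOON_OOM_NOTIFY_PRIORITY from the patch above.]

/* Hypothetical probe-path helper: skip registration entirely when the
 * host did not offer VIRTIO_BALLOON_F_DEFLATE_ON_OOM, instead of
 * checking the feature inside the callback on every OOM event. */
static int virtballoon_register_oom(struct virtio_balloon *vb)
{
	if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
		return 0;

	vb->nb.notifier_call = virtballoon_oom_notify;
	vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
	return register_oom_notifier(&vb->nb);
}

The trade-off: the applied version keeps registration unconditional and therefore has one teardown path, at the cost of a feature check per notification.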
Re: [PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.
Amos Kong writes:
> From: Rusty Russell
>
> current_rng holds one reference, and we bump it every time we want
> to do a read from it.
>
> This means we only hold the rng_mutex to grab or drop a reference,
> so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
> block on read of /dev/hwrng.
>
> Using a kref is overkill (we're always under the rng_mutex), but
> a standard pattern.
>
> This also solves the problem that the hwrng_fillfn thread was
> accessing current_rng without a lock, which could change (eg. to NULL)
> underneath it.
>
> V2: reduce reference count in signal_pending path

OK, I changed it to do the put_rng before the check, so instead of:

> @@ -208,17 +256,19 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf,
>
>  		if (signal_pending(current)) {
>  			err = -ERESTARTSYS;
> +			put_rng(rng);
>  			goto out;
>  		}
> +
> +		put_rng(rng);
>  	}
>  out:
>  	return ret ? : err;
> -out_unlock:
> -	mutex_unlock(&rng_mutex);
> -	goto out;
> +
>  out_unlock_reading:
>  	mutex_unlock(&reading_mutex);
> -	goto out_unlock;
> +	put_rng(rng);
> +	goto out;
>  }

We have:

		mutex_unlock(&reading_mutex);
		put_rng(rng);

		if (need_resched())
			schedule_timeout_interruptible(1);

		if (signal_pending(current)) {
			err = -ERESTARTSYS;
			goto out;
		}
	}
out:
	return ret ? : err;

out_unlock_reading:
	mutex_unlock(&reading_mutex);
	put_rng(rng);
	goto out;
}

Cheers,
Rusty.
Re: [PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.
Amos Kong writes:
> We got a warning in the boot stage when the above set_current_rng() is
> executed; it can be fixed by initializing rng->ref in hwrng_init().
>
> @@ -166,6 +169,8 @@ static inline int hwrng_init(struct hwrng *rng)
>  	if (current_quality > 0 && !hwrng_fill)
>  		start_khwrngd();
>
> +	kref_init(&rng->ref);
> +
>  	return 0;
>  }

OK, I folded this fix on.

Thanks,
Rusty.

hw_random: use reference counts on each struct hwrng.

current_rng holds one reference, and we bump it every time we want
to do a read from it.

This means we only hold the rng_mutex to grab or drop a reference,
so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
block on read of /dev/hwrng.

Using a kref is overkill (we're always under the rng_mutex), but
a standard pattern.

This also solves the problem that the hwrng_fillfn thread was
accessing current_rng without a lock, which could change (eg. to NULL)
underneath it.

v3: initialize kref (thanks Amos Kong)
v2: fix missing put_rng() on exit path (thanks Amos Kong)

Signed-off-by: Rusty Russell

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index a0905c818e13..0ecac38da954 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -42,6 +42,7 @@
 #include
 #include
 #include
+#include
 #include

@@ -91,6 +92,59 @@ static void add_early_randomness(struct hwrng *rng)
 	add_device_randomness(bytes, bytes_read);
 }

+static inline void cleanup_rng(struct kref *kref)
+{
+	struct hwrng *rng = container_of(kref, struct hwrng, ref);
+
+	if (rng->cleanup)
+		rng->cleanup(rng);
+}
+
+static void set_current_rng(struct hwrng *rng)
+{
+	BUG_ON(!mutex_is_locked(&rng_mutex));
+	kref_get(&rng->ref);
+	current_rng = rng;
+}
+
+static void drop_current_rng(void)
+{
+	BUG_ON(!mutex_is_locked(&rng_mutex));
+	if (!current_rng)
+		return;
+
+	kref_put(&current_rng->ref, cleanup_rng);
+	current_rng = NULL;
+}
+
+/* Returns ERR_PTR(), NULL or refcounted hwrng */
+static struct hwrng *get_current_rng(void)
+{
+	struct hwrng *rng;
+
+	if (mutex_lock_interruptible(&rng_mutex))
+		return ERR_PTR(-ERESTARTSYS);
+
+	rng = current_rng;
+	if (rng)
+		kref_get(&rng->ref);
+
+	mutex_unlock(&rng_mutex);
+	return rng;
+}
+
+static void put_rng(struct hwrng *rng)
+{
+	/*
+	 * Hold rng_mutex here so we serialize in case they set_current_rng
+	 * on rng again immediately.
+	 */
+	mutex_lock(&rng_mutex);
+	if (rng)
+		kref_put(&rng->ref, cleanup_rng);
+	mutex_unlock(&rng_mutex);
+}
+
 static inline int hwrng_init(struct hwrng *rng)
 {
 	if (rng->init) {
@@ -110,13 +164,9 @@ static inline int hwrng_init(struct hwrng *rng)
 	if (current_quality > 0 && !hwrng_fill)
 		start_khwrngd();

-	return 0;
-}
+	kref_init(&rng->ref);

-static inline void hwrng_cleanup(struct hwrng *rng)
-{
-	if (rng && rng->cleanup)
-		rng->cleanup(rng);
+	return 0;
 }

 static int rng_dev_open(struct inode *inode, struct file *filp)
@@ -154,21 +204,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf,
 	ssize_t ret = 0;
 	int err = 0;
 	int bytes_read, len;
+	struct hwrng *rng;

 	while (size) {
-		if (mutex_lock_interruptible(&rng_mutex)) {
-			err = -ERESTARTSYS;
+		rng = get_current_rng();
+		if (IS_ERR(rng)) {
+			err = PTR_ERR(rng);
 			goto out;
 		}
-
-		if (!current_rng) {
+		if (!rng) {
 			err = -ENODEV;
-			goto out_unlock;
+			goto out;
 		}

 		mutex_lock(&reading_mutex);
 		if (!data_avail) {
-			bytes_read = rng_get_data(current_rng, rng_buffer,
+			bytes_read = rng_get_data(rng, rng_buffer,
 				rng_buffer_size(),
 				!(filp->f_flags & O_NONBLOCK));
 			if (bytes_read < 0) {
@@ -200,8 +251,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf,
 			ret += len;
 		}

-		mutex_unlock(&rng_mutex);
 		mutex_unlock(&reading_mutex);
+		put_rng(rng);

 		if (need_resched())
 			schedule_timeout_interruptible(1);
@@ -213,12 +264,11 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf,
 	}
 out:
 	return ret ? : err;
-out_unlock:
-	mutex_unlock(&rng_mutex);
-	goto out;
+
 out_unlock_reading:
 	mutex_unlock(&reading_mutex);
-	goto out_unlock;
+	put_rng(rng);
+	goto out;
 }

@@ -257,8 +307
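[Editor's note: the kref pattern the patch builds on, reduced to its generic shape — illustrative names, not the hw_random code.]

#include <linux/kref.h>
#include <linux/slab.h>

struct example_obj {
	struct kref ref;
	/* ... payload ... */
};

static void example_release(struct kref *kref)
{
	struct example_obj *obj = container_of(kref, struct example_obj, ref);

	kfree(obj);
}

static struct example_obj *example_new(void)
{
	struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (obj)
		kref_init(&obj->ref);	/* refcount starts at 1 */
	return obj;
}

/* Each user takes a reference before touching the object... */
static struct example_obj *example_get(struct example_obj *obj)
{
	kref_get(&obj->ref);
	return obj;
}

/* ...and drops it when done; the final put runs the release function,
 * so the object lives exactly as long as someone still holds it. */
static void example_put(struct example_obj *obj)
{
	kref_put(&obj->ref, example_release);
}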