[PATCH v5 REPOST 6/6] hw_random: don't init list element we're about to add to list.
From: Rusty Russell ru...@rustcorp.com.au Another interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a9286bf..4d13ac5 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -489,7 +489,6 @@ int hwrng_register(struct hwrng *rng) goto out_unlock; } } - INIT_LIST_HEAD(rng-list); list_add_tail(rng-list, rng_list); if (old_rng !rng-init) { -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 REPOST 3/6] hw_random: use reference counts on each struct hwrng.
From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. v5: drop redundant kref_init() v4: decrease last reference for triggering the cleanup v3: initialize kref (thanks Amos Kong) v2: fix missing put_rng() on exit path (thanks Amos Kong) Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 135 -- include/linux/hw_random.h | 2 + 2 files changed, 94 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c8..83516cb 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -42,6 +42,7 @@ #include linux/delay.h #include linux/slab.h #include linux/random.h +#include linux/err.h #include asm/uaccess.h @@ -91,6 +92,60 @@ static void add_early_randomness(struct hwrng *rng) add_device_randomness(bytes, bytes_read); } +static inline void cleanup_rng(struct kref *kref) +{ + struct hwrng *rng = container_of(kref, struct hwrng, ref); + + if (rng-cleanup) + rng-cleanup(rng); +} + +static void set_current_rng(struct hwrng *rng) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + kref_get(rng-ref); + current_rng = rng; +} + +static void drop_current_rng(void) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + /* decrease last reference for triggering the cleanup */ + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; +} + +/* Returns ERR_PTR(), NULL or refcounted hwrng */ +static struct hwrng *get_current_rng(void) +{ + struct hwrng *rng; + + if (mutex_lock_interruptible(rng_mutex)) + return ERR_PTR(-ERESTARTSYS); + + rng = current_rng; + if (rng) + kref_get(rng-ref); + + mutex_unlock(rng_mutex); + return rng; +} + +static void put_rng(struct hwrng *rng) +{ + /* +* Hold rng_mutex here so we serialize in case they set_current_rng +* on rng again immediately. +*/ + mutex_lock(rng_mutex); + if (rng) + kref_put(rng-ref, cleanup_rng); + mutex_unlock(rng_mutex); +} + static inline int hwrng_init(struct hwrng *rng) { if (rng-init) { @@ -113,12 +168,6 @@ static inline int hwrng_init(struct hwrng *rng) return 0; } -static inline void hwrng_cleanup(struct hwrng *rng) -{ - if (rng rng-cleanup) - rng-cleanup(rng); -} - static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ @@ -154,21 +203,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,8 +250,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); + put_rng(rng); if (need_resched()) schedule_timeout_interruptible(1); @@ -213,12 +263,11 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex); - goto out_unlock; + put_rng(rng); + goto out; } @@ -257,8 +306,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev
[PATCH v5 REPOST 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock
In next patch, we use reference counting for each struct hwrng, changing reference count also needs to take mutex_lock. Before releasing the lock, if we try to stop a kthread that waits to take the lock to reduce the referencing count, deadlock will occur. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index b1b6042..a0905c8 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng) } } if (list_empty(rng_list)) { + mutex_unlock(rng_mutex); unregister_miscdev(); if (hwrng_fill) kthread_stop(hwrng_fill); - } - - mutex_unlock(rng_mutex); + } else + mutex_unlock(rng_mutex); } EXPORT_SYMBOL_GPL(hwrng_unregister); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 REPOST 1/6] hw_random: place mutex around read functions and buffers.
From: Rusty Russell ru...@rustcorp.com.au There's currently a big lock around everything, and it means that we can't query sysfs (eg /sys/devices/virtual/misc/hw_random/rng_current) while the rng is reading. This is a real problem when the rng is slow, or blocked (eg. virtio_rng with qemu's default /dev/random backend) This doesn't help (it leaves the current lock untouched), just adds a lock to protect the read function and the static buffers, in preparation for transition. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..b1b6042 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -53,7 +53,10 @@ static struct hwrng *current_rng; static struct task_struct *hwrng_fill; static LIST_HEAD(rng_list); +/* Protects rng_list and current_rng */ static DEFINE_MUTEX(rng_mutex); +/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */ +static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; static unsigned short current_quality; @@ -81,7 +84,9 @@ static void add_early_randomness(struct hwrng *rng) unsigned char bytes[16]; int bytes_read; + mutex_lock(reading_mutex); bytes_read = rng_get_data(rng, bytes, sizeof(bytes), 1); + mutex_unlock(reading_mutex); if (bytes_read 0) add_device_randomness(bytes, bytes_read); } @@ -128,6 +133,7 @@ static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size, int wait) { int present; + BUG_ON(!mutex_is_locked(reading_mutex)); if (rng-read) return rng-read(rng, (void *)buffer, size, wait); @@ -160,13 +166,14 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, goto out_unlock; } + mutex_lock(reading_mutex); if (!data_avail) { bytes_read = rng_get_data(current_rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { err = bytes_read; - goto out_unlock; + goto out_unlock_reading; } data_avail = bytes_read; } @@ -174,7 +181,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (!data_avail) { if (filp-f_flags O_NONBLOCK) { err = -EAGAIN; - goto out_unlock; + goto out_unlock_reading; } } else { len = data_avail; @@ -186,7 +193,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (copy_to_user(buf + ret, rng_buffer + data_avail, len)) { err = -EFAULT; - goto out_unlock; + goto out_unlock_reading; } size -= len; @@ -194,6 +201,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + mutex_unlock(reading_mutex); if (need_resched()) schedule_timeout_interruptible(1); @@ -208,6 +216,9 @@ out: out_unlock: mutex_unlock(rng_mutex); goto out; +out_unlock_reading: + mutex_unlock(reading_mutex); + goto out_unlock; } @@ -348,13 +359,16 @@ static int hwrng_fillfn(void *unused) while (!kthread_should_stop()) { if (!current_rng) break; + mutex_lock(reading_mutex); rc = rng_get_data(current_rng, rng_fillbuf, rng_buffer_size(), 1); + mutex_unlock(reading_mutex); if (rc = 0) { pr_warn(hwrng: no data available\n); msleep_interruptible(1); continue; } + /* Outside lock, sure, but y'know: randomness. */ add_hwgenerator_randomness((void *)rng_fillbuf, rc, rc * current_quality * 8 10); } -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 REPOST 0/6] fix hw_random stuck
When I hotunplug a busy virtio-rng device or try to access hwrng attributes in non-smp guest, it gets stuck. My hotplug tests: | test 0: | hotunplug rng device from qemu monitor | | test 1: | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 2: | guest) # dd if=/dev/random of=/dev/null | hotunplug rng device from qemu monitor | | test 4: | guest) # dd if=/dev/hwrng of=/dev/null | cat /sys/devices/virtual/misc/hw_random/rng_* | | test 5: | guest) # dd if=/dev/hwrng of=/dev/null | cancel dd process after 10 seconds | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 6: | use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo V5: reset cleanup_done flag, drop redundant init of reference count, use compiler barrier to prevent recording. V4: update patch 4 to fix corrupt, decrease last reference for triggering the cleanup, fix unregister race pointed by Herbert V3: initialize kref to 1 V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference counting issue Amos Kong (1): hw_random: move some code out mutex_lock for avoiding underlying deadlock Rusty Russell (5): hw_random: place mutex around read functions and buffers. hw_random: use reference counts on each struct hwrng. hw_random: fix unregister race. hw_random: don't double-check old_rng. hw_random: don't init list element we're about to add to list. drivers/char/hw_random/core.c | 173 ++ include/linux/hw_random.h | 3 + 2 files changed, 126 insertions(+), 50 deletions(-) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 REPOST 5/6] hw_random: don't double-check old_rng.
From: Rusty Russell ru...@rustcorp.com.au Interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 067270b..a9286bf 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -476,14 +476,13 @@ int hwrng_register(struct hwrng *rng) } old_rng = current_rng; + err = 0; if (!old_rng) { err = hwrng_init(rng); if (err) goto out_unlock; set_current_rng(rng); - } - err = 0; - if (!old_rng) { + err = register_miscdev(); if (err) { drop_current_rng(); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 REPOST 4/6] hw_random: fix unregister race.
From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. v5: reset cleanup_done flag, use compiler barrier to prevent recording. v4: add cleanup_done flag to insure that cleanup is done Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 12 include/linux/hw_random.h | 1 + 2 files changed, 13 insertions(+) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 83516cb..067270b 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; +static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to off */ @@ -98,6 +99,11 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + + /* cleanup_done should be updated after cleanup finishes */ + smp_wmb(); + rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -498,6 +504,8 @@ int hwrng_register(struct hwrng *rng) add_early_randomness(rng); } + rng-cleanup_done = false; + out_unlock: mutex_unlock(rng_mutex); out: @@ -529,6 +537,10 @@ void hwrng_unregister(struct hwrng *rng) kthread_stop(hwrng_fill); } else mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); } EXPORT_SYMBOL_GPL(hwrng_unregister); diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index c212e71..7832e50 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -46,6 +46,7 @@ struct hwrng { /* internal. */ struct list_head list; struct kref ref; + bool cleanup_done; }; /** Register a new Hardware Random Number Generator driver. */ -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 4/6] hw_random: fix unregister race.
On Wed, Nov 12, 2014 at 02:33:00PM +1030, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. v4: add cleanup_done flag to insure that cleanup is done That's a bit weird. The usual pattern would be to hold a reference until we're actually finished, but this reference is a bit weird. We hold the mutex across cleanup, so we could grab that but we have to take care sleeping inside wait_event, otherwise Peter will have to fix my code again :) AFAICT the wake_woken() stuff isn't merged yet, so your patch will have to do for now. @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -536,6 +539,11 @@ void hwrng_unregister(struct hwrng *rng) kthread_stop(hwrng_fill); } else mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); + The atomic_read() isn't necessary here. At least, we need it to convert refcount from atomic_t to int. Otherwise, we will touch this error: error: invalid operands to binary == (have 'atomic_t' and 'int') However, you should probably init cleanup_done in hwrng_register(). (Probably noone does unregister then register, but let's be clear). Thanks, Rusty. -- Amos. signature.asc Description: Digital signature
[PATCH v5 4/6] hw_random: fix unregister race.
From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. v5: reset cleanup_done flag, use compiler barrier to prevent recording. v4: add cleanup_done flag to insure that cleanup is done Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 12 include/linux/hw_random.h | 1 + 2 files changed, 13 insertions(+) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 83516cb..067270b 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; +static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to off */ @@ -98,6 +99,11 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + + /* cleanup_done should be updated after cleanup finishes */ + smp_wmb(); + rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -498,6 +504,8 @@ int hwrng_register(struct hwrng *rng) add_early_randomness(rng); } + rng-cleanup_done = false; + out_unlock: mutex_unlock(rng_mutex); out: @@ -529,6 +537,10 @@ void hwrng_unregister(struct hwrng *rng) kthread_stop(hwrng_fill); } else mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); } EXPORT_SYMBOL_GPL(hwrng_unregister); diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index c212e71..7832e50 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -46,6 +46,7 @@ struct hwrng { /* internal. */ struct list_head list; struct kref ref; + bool cleanup_done; }; /** Register a new Hardware Random Number Generator driver. */ -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 6/6] hw_random: don't init list element we're about to add to list.
From: Rusty Russell ru...@rustcorp.com.au Another interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a9286bf..4d13ac5 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -489,7 +489,6 @@ int hwrng_register(struct hwrng *rng) goto out_unlock; } } - INIT_LIST_HEAD(rng-list); list_add_tail(rng-list, rng_list); if (old_rng !rng-init) { -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 5/6] hw_random: don't double-check old_rng.
From: Rusty Russell ru...@rustcorp.com.au Interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 067270b..a9286bf 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -476,14 +476,13 @@ int hwrng_register(struct hwrng *rng) } old_rng = current_rng; + err = 0; if (!old_rng) { err = hwrng_init(rng); if (err) goto out_unlock; set_current_rng(rng); - } - err = 0; - if (!old_rng) { + err = register_miscdev(); if (err) { drop_current_rng(); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 3/6] hw_random: use reference counts on each struct hwrng.
From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. v5: drop redundant kref_init() v4: decrease last reference for triggering the cleanup v3: initialize kref (thanks Amos Kong) v2: fix missing put_rng() on exit path (thanks Amos Kong) Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 135 -- include/linux/hw_random.h | 2 + 2 files changed, 94 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c8..83516cb 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -42,6 +42,7 @@ #include linux/delay.h #include linux/slab.h #include linux/random.h +#include linux/err.h #include asm/uaccess.h @@ -91,6 +92,60 @@ static void add_early_randomness(struct hwrng *rng) add_device_randomness(bytes, bytes_read); } +static inline void cleanup_rng(struct kref *kref) +{ + struct hwrng *rng = container_of(kref, struct hwrng, ref); + + if (rng-cleanup) + rng-cleanup(rng); +} + +static void set_current_rng(struct hwrng *rng) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + kref_get(rng-ref); + current_rng = rng; +} + +static void drop_current_rng(void) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + /* decrease last reference for triggering the cleanup */ + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; +} + +/* Returns ERR_PTR(), NULL or refcounted hwrng */ +static struct hwrng *get_current_rng(void) +{ + struct hwrng *rng; + + if (mutex_lock_interruptible(rng_mutex)) + return ERR_PTR(-ERESTARTSYS); + + rng = current_rng; + if (rng) + kref_get(rng-ref); + + mutex_unlock(rng_mutex); + return rng; +} + +static void put_rng(struct hwrng *rng) +{ + /* +* Hold rng_mutex here so we serialize in case they set_current_rng +* on rng again immediately. +*/ + mutex_lock(rng_mutex); + if (rng) + kref_put(rng-ref, cleanup_rng); + mutex_unlock(rng_mutex); +} + static inline int hwrng_init(struct hwrng *rng) { if (rng-init) { @@ -113,12 +168,6 @@ static inline int hwrng_init(struct hwrng *rng) return 0; } -static inline void hwrng_cleanup(struct hwrng *rng) -{ - if (rng rng-cleanup) - rng-cleanup(rng); -} - static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ @@ -154,21 +203,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,8 +250,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); + put_rng(rng); if (need_resched()) schedule_timeout_interruptible(1); @@ -213,12 +263,11 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex); - goto out_unlock; + put_rng(rng); + goto out; } @@ -257,8 +306,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev
[PATCH v5 0/6] fix hw_random stuck
When I hotunplug a busy virtio-rng device or try to access hwrng attributes in non-smp guest, it gets stuck. My hotplug tests: | test 0: | hotunplug rng device from qemu monitor | | test 1: | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 2: | guest) # dd if=/dev/random of=/dev/null | hotunplug rng device from qemu monitor | | test 4: | guest) # dd if=/dev/hwrng of=/dev/null | cat /sys/devices/virtual/misc/hw_random/rng_* | | test 5: | guest) # dd if=/dev/hwrng of=/dev/null | cancel dd process after 10 seconds | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 6: | use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo V5: reset cleanup_done flag, drop redundant init of reference count, use compiler barrier to prevent recording. V4: update patch 4 to fix corrupt, decrease last reference for triggering the cleanup, fix unregister race pointed by Herbert V3: initialize kref to 1 V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference counting issue Amos Kong (1): hw_random: move some code out mutex_lock for avoiding underlying deadlock Rusty Russell (5): hw_random: place mutex around read functions and buffers. hw_random: use reference counts on each struct hwrng. hw_random: fix unregister race. hw_random: don't double-check old_rng. hw_random: don't init list element we're about to add to list. drivers/char/hw_random/core.c | 173 ++ include/linux/hw_random.h | 3 + 2 files changed, 126 insertions(+), 50 deletions(-) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock
In next patch, we use reference counting for each struct hwrng, changing reference count also needs to take mutex_lock. Before releasing the lock, if we try to stop a kthread that waits to take the lock to reduce the referencing count, deadlock will occur. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index b1b6042..a0905c8 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng) } } if (list_empty(rng_list)) { + mutex_unlock(rng_mutex); unregister_miscdev(); if (hwrng_fill) kthread_stop(hwrng_fill); - } - - mutex_unlock(rng_mutex); + } else + mutex_unlock(rng_mutex); } EXPORT_SYMBOL_GPL(hwrng_unregister); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 1/6] hw_random: place mutex around read functions and buffers.
From: Rusty Russell ru...@rustcorp.com.au There's currently a big lock around everything, and it means that we can't query sysfs (eg /sys/devices/virtual/misc/hw_random/rng_current) while the rng is reading. This is a real problem when the rng is slow, or blocked (eg. virtio_rng with qemu's default /dev/random backend) This doesn't help (it leaves the current lock untouched), just adds a lock to protect the read function and the static buffers, in preparation for transition. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..b1b6042 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -53,7 +53,10 @@ static struct hwrng *current_rng; static struct task_struct *hwrng_fill; static LIST_HEAD(rng_list); +/* Protects rng_list and current_rng */ static DEFINE_MUTEX(rng_mutex); +/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */ +static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; static unsigned short current_quality; @@ -81,7 +84,9 @@ static void add_early_randomness(struct hwrng *rng) unsigned char bytes[16]; int bytes_read; + mutex_lock(reading_mutex); bytes_read = rng_get_data(rng, bytes, sizeof(bytes), 1); + mutex_unlock(reading_mutex); if (bytes_read 0) add_device_randomness(bytes, bytes_read); } @@ -128,6 +133,7 @@ static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size, int wait) { int present; + BUG_ON(!mutex_is_locked(reading_mutex)); if (rng-read) return rng-read(rng, (void *)buffer, size, wait); @@ -160,13 +166,14 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, goto out_unlock; } + mutex_lock(reading_mutex); if (!data_avail) { bytes_read = rng_get_data(current_rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { err = bytes_read; - goto out_unlock; + goto out_unlock_reading; } data_avail = bytes_read; } @@ -174,7 +181,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (!data_avail) { if (filp-f_flags O_NONBLOCK) { err = -EAGAIN; - goto out_unlock; + goto out_unlock_reading; } } else { len = data_avail; @@ -186,7 +193,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (copy_to_user(buf + ret, rng_buffer + data_avail, len)) { err = -EFAULT; - goto out_unlock; + goto out_unlock_reading; } size -= len; @@ -194,6 +201,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + mutex_unlock(reading_mutex); if (need_resched()) schedule_timeout_interruptible(1); @@ -208,6 +216,9 @@ out: out_unlock: mutex_unlock(rng_mutex); goto out; +out_unlock_reading: + mutex_unlock(reading_mutex); + goto out_unlock; } @@ -348,13 +359,16 @@ static int hwrng_fillfn(void *unused) while (!kthread_should_stop()) { if (!current_rng) break; + mutex_lock(reading_mutex); rc = rng_get_data(current_rng, rng_fillbuf, rng_buffer_size(), 1); + mutex_unlock(reading_mutex); if (rc = 0) { pr_warn(hwrng: no data available\n); msleep_interruptible(1); continue; } + /* Outside lock, sure, but y'know: randomness. */ add_hwgenerator_randomness((void *)rng_fillbuf, rc, rc * current_quality * 8 10); } -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 4/6] hw_random: fix unregister race.
On Wed, Nov 12, 2014 at 02:33:00PM +1030, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. v4: add cleanup_done flag to insure that cleanup is done That's a bit weird. The usual pattern would be to hold a reference until we're actually finished, but this reference is a bit weird. The cleanup function is a callback function of kref_put(), we can't use the same reference count inside cleanup function. We hold the mutex across cleanup, so we could grab that but we have to take care sleeping inside wait_event, otherwise Peter will have to fix my code again :) We didn't hold rng_mutex inside cleanup_rng(), am I missing something? AFAICT the wake_woken() stuff isn't merged yet, so your patch will have to do for now. Can you provide some patches/mail link here? I searched nothing about wake_woken. @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -536,6 +539,11 @@ void hwrng_unregister(struct hwrng *rng) kthread_stop(hwrng_fill); } else mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); + The atomic_read() isn't necessary here. However, you should probably init cleanup_done in hwrng_register(). (Probably noone does unregister then register, but let's be clear). Got it. Thanks, Rusty. } EXPORT_SYMBOL_GPL(hwrng_unregister); diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index c212e71..7832e50 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -46,6 +46,7 @@ struct hwrng { /* internal. */ struct list_head list; struct kref ref; + bool cleanup_done; }; /** Register a new Hardware Random Number Generator driver. */ -- 1.9.3 -- Amos. signature.asc Description: Digital signature
Re: [PATCH v4 3/6] hw_random: use reference counts on each struct hwrng.
On Wed, Nov 12, 2014 at 02:11:23PM +1030, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. v4: decrease last reference for triggering the cleanup This doesn't make any sense: +static void drop_current_rng(void) +{ + struct hwrng *rng = current_rng; + + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + /* release current_rng reference */ + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; + + /* decrease last reference for triggering the cleanup */ + kref_put(rng-ref, cleanup_rng); +} Why would it drop the refcount twice? This doesn't make sense. Hmm, because you added kref_init, which initializes the reference count to 1, you created this bug. I saw some kernel code uses kref_* helper functions, the reference conter is initialized to 1. Some code didn't use the helper functions to increase/decrease the reference counter. So I will drop kref_init() and the second kref_put(). Leave out the kref_init, and let it naturally be 0 (until, and if, it becomes current_rng). Add a comment if you want. OK, thanks. Thanks, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- Amos. signature.asc Description: Digital signature
Re: [PATCH 3/5] hw_random: fix unregister race.
On Tue, Oct 21, 2014 at 10:15:23PM +0800, Herbert Xu wrote: On Thu, Sep 18, 2014 at 12:18:24PM +0930, Rusty Russell wrote: The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index dc9092a1075d..b4a21e9521cf 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; +static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to off */ @@ -98,6 +99,7 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -529,6 +531,9 @@ void hwrng_unregister(struct hwrng *rng) } mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, atomic_read(rng-ref.refcount) == 0); Hi Rusty, After initializing (kref_init()), the refcount is 1, so we need one more kref_put() after each drop_current_rng() to release last reference count, then cleanup function will be called. While it's obviously better than what we have now, I don't believe this is 100% safe as the cleanup function might still be running even after the ref count hits zero. Once we return from this function the module may be unloaded so we need to ensure that nothing is running at this point. I found wait_event() can still pass and finish unregister even cleanup function isn't called (wake_up_all() isn't called). So I added a flag cleanup_done to indicate that the rng device is cleaned up. + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); I will post the new v4 later. Cheers, -- Email: Herbert Xu herb...@gondor.apana.org.au Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 6/6] hw_random: don't init list element we're about to add to list.
From: Rusty Russell ru...@rustcorp.com.au Another interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index fc5de7d..b2cc8a1 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -493,7 +493,6 @@ int hwrng_register(struct hwrng *rng) goto out_unlock; } } - INIT_LIST_HEAD(rng-list); list_add_tail(rng-list, rng_list); if (old_rng !rng-init) { -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock
In next patch, we use reference counting for each struct hwrng, changing reference count also needs to take mutex_lock. Before releasing the lock, if we try to stop a kthread that waits to take the lock to reduce the referencing count, deadlock will occur. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index b1b6042..a0905c8 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng) } } if (list_empty(rng_list)) { + mutex_unlock(rng_mutex); unregister_miscdev(); if (hwrng_fill) kthread_stop(hwrng_fill); - } - - mutex_unlock(rng_mutex); + } else + mutex_unlock(rng_mutex); } EXPORT_SYMBOL_GPL(hwrng_unregister); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 0/6] fix hw_random stuck
When I hotunplug a busy virtio-rng device or try to access hwrng attributes in non-smp guest, it gets stuck. My hotplug tests: | test 0: | hotunplug rng device from qemu monitor | | test 1: | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 2: | guest) # dd if=/dev/random of=/dev/null | hotunplug rng device from qemu monitor | | test 4: | guest) # dd if=/dev/hwrng of=/dev/null | cat /sys/devices/virtual/misc/hw_random/rng_* | | test 5: | guest) # dd if=/dev/hwrng of=/dev/null | cancel dd process after 10 seconds | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 6: | use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo V4: update patch 4 to fix corrupt, decrease last reference for triggering the cleanup, fix unregister race pointed by Herbert V3: initialize kref to 1 V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference counting issue Amos Kong (1): hw_random: move some code out mutex_lock for avoiding underlying deadlock Rusty Russell (5): hw_random: place mutex around read functions and buffers. hw_random: use reference counts on each struct hwrng. hw_random: fix unregister race. hw_random: don't double-check old_rng. hw_random: don't init list element we're about to add to list. drivers/char/hw_random/core.c | 176 ++ include/linux/hw_random.h | 3 + 2 files changed, 129 insertions(+), 50 deletions(-) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 3/6] hw_random: use reference counts on each struct hwrng.
From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. v4: decrease last reference for triggering the cleanup v3: initialize kref (thanks Amos Kong) v2: fix missing put_rng() on exit path (thanks Amos Kong) Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 142 +- include/linux/hw_random.h | 2 + 2 files changed, 101 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c8..27ad6b4 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -42,6 +42,7 @@ #include linux/delay.h #include linux/slab.h #include linux/random.h +#include linux/err.h #include asm/uaccess.h @@ -91,6 +92,65 @@ static void add_early_randomness(struct hwrng *rng) add_device_randomness(bytes, bytes_read); } +static inline void cleanup_rng(struct kref *kref) +{ + struct hwrng *rng = container_of(kref, struct hwrng, ref); + + if (rng-cleanup) + rng-cleanup(rng); +} + +static void set_current_rng(struct hwrng *rng) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + kref_get(rng-ref); + current_rng = rng; +} + +static void drop_current_rng(void) +{ + struct hwrng *rng = current_rng; + + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + /* release current_rng reference */ + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; + + /* decrease last reference for triggering the cleanup */ + kref_put(rng-ref, cleanup_rng); +} + +/* Returns ERR_PTR(), NULL or refcounted hwrng */ +static struct hwrng *get_current_rng(void) +{ + struct hwrng *rng; + + if (mutex_lock_interruptible(rng_mutex)) + return ERR_PTR(-ERESTARTSYS); + + rng = current_rng; + if (rng) + kref_get(rng-ref); + + mutex_unlock(rng_mutex); + return rng; +} + +static void put_rng(struct hwrng *rng) +{ + /* +* Hold rng_mutex here so we serialize in case they set_current_rng +* on rng again immediately. +*/ + mutex_lock(rng_mutex); + if (rng) + kref_put(rng-ref, cleanup_rng); + mutex_unlock(rng_mutex); +} + static inline int hwrng_init(struct hwrng *rng) { if (rng-init) { @@ -110,13 +170,9 @@ static inline int hwrng_init(struct hwrng *rng) if (current_quality 0 !hwrng_fill) start_khwrngd(); - return 0; -} + kref_init(rng-ref); -static inline void hwrng_cleanup(struct hwrng *rng) -{ - if (rng rng-cleanup) - rng-cleanup(rng); + return 0; } static int rng_dev_open(struct inode *inode, struct file *filp) @@ -154,21 +210,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,8 +257,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); + put_rng(rng); if (need_resched()) schedule_timeout_interruptible(1); @@ -213,12 +270,11 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex
[PATCH v4 1/6] hw_random: place mutex around read functions and buffers.
From: Rusty Russell ru...@rustcorp.com.au There's currently a big lock around everything, and it means that we can't query sysfs (eg /sys/devices/virtual/misc/hw_random/rng_current) while the rng is reading. This is a real problem when the rng is slow, or blocked (eg. virtio_rng with qemu's default /dev/random backend) This doesn't help (it leaves the current lock untouched), just adds a lock to protect the read function and the static buffers, in preparation for transition. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..b1b6042 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -53,7 +53,10 @@ static struct hwrng *current_rng; static struct task_struct *hwrng_fill; static LIST_HEAD(rng_list); +/* Protects rng_list and current_rng */ static DEFINE_MUTEX(rng_mutex); +/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */ +static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; static unsigned short current_quality; @@ -81,7 +84,9 @@ static void add_early_randomness(struct hwrng *rng) unsigned char bytes[16]; int bytes_read; + mutex_lock(reading_mutex); bytes_read = rng_get_data(rng, bytes, sizeof(bytes), 1); + mutex_unlock(reading_mutex); if (bytes_read 0) add_device_randomness(bytes, bytes_read); } @@ -128,6 +133,7 @@ static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size, int wait) { int present; + BUG_ON(!mutex_is_locked(reading_mutex)); if (rng-read) return rng-read(rng, (void *)buffer, size, wait); @@ -160,13 +166,14 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, goto out_unlock; } + mutex_lock(reading_mutex); if (!data_avail) { bytes_read = rng_get_data(current_rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { err = bytes_read; - goto out_unlock; + goto out_unlock_reading; } data_avail = bytes_read; } @@ -174,7 +181,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (!data_avail) { if (filp-f_flags O_NONBLOCK) { err = -EAGAIN; - goto out_unlock; + goto out_unlock_reading; } } else { len = data_avail; @@ -186,7 +193,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (copy_to_user(buf + ret, rng_buffer + data_avail, len)) { err = -EFAULT; - goto out_unlock; + goto out_unlock_reading; } size -= len; @@ -194,6 +201,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + mutex_unlock(reading_mutex); if (need_resched()) schedule_timeout_interruptible(1); @@ -208,6 +216,9 @@ out: out_unlock: mutex_unlock(rng_mutex); goto out; +out_unlock_reading: + mutex_unlock(reading_mutex); + goto out_unlock; } @@ -348,13 +359,16 @@ static int hwrng_fillfn(void *unused) while (!kthread_should_stop()) { if (!current_rng) break; + mutex_lock(reading_mutex); rc = rng_get_data(current_rng, rng_fillbuf, rng_buffer_size(), 1); + mutex_unlock(reading_mutex); if (rc = 0) { pr_warn(hwrng: no data available\n); msleep_interruptible(1); continue; } + /* Outside lock, sure, but y'know: randomness. */ add_hwgenerator_randomness((void *)rng_fillbuf, rc, rc * current_quality * 8 10); } -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 4/6] hw_random: fix unregister race.
From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. v4: add cleanup_done flag to insure that cleanup is done Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 8 include/linux/hw_random.h | 1 + 2 files changed, 9 insertions(+) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 27ad6b4..c31bf91 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; +static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to off */ @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -536,6 +539,11 @@ void hwrng_unregister(struct hwrng *rng) kthread_stop(hwrng_fill); } else mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); + } EXPORT_SYMBOL_GPL(hwrng_unregister); diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index c212e71..7832e50 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -46,6 +46,7 @@ struct hwrng { /* internal. */ struct list_head list; struct kref ref; + bool cleanup_done; }; /** Register a new Hardware Random Number Generator driver. */ -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 5/6] hw_random: don't double-check old_rng.
From: Rusty Russell ru...@rustcorp.com.au Interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c31bf91..fc5de7d 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -480,14 +480,13 @@ int hwrng_register(struct hwrng *rng) } old_rng = current_rng; + err = 0; if (!old_rng) { err = hwrng_init(rng); if (err) goto out_unlock; set_current_rng(rng); - } - err = 0; - if (!old_rng) { + err = register_miscdev(); if (err) { drop_current_rng(); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 4/6] hw_random: fix unregister race.
On Fri, Oct 31, 2014 at 03:23:21PM +0800, Herbert Xu wrote: On Fri, Oct 31, 2014 at 10:28:00AM +1030, Rusty Russell wrote: Herbert Xu herb...@gondor.apana.org.au writes: On Thu, Sep 18, 2014 at 08:37:45PM +0800, Amos Kong wrote: From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. Signed-off-by: Rusty Russell ru...@rustcorp.com.au You totally corrupted Rusty's patch. If you're going to repost his series you better make sure that you've got the right patches. Just as well though as it made me think a little more about this patch :) OK Amos, can you please repost the complete series? Please fix the unregister race I identified first. Ok, I redownload the patches from https://patchwork.kernel.org/project/LKML/list/?submitter=5 and rebase fixes of me and rusty on it. I will post a V4 later. Thanks. -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 4/6] hw_random: fix unregister race.
On Sun, Nov 02, 2014 at 11:08:09PM +0800, Herbert Xu wrote: On Sun, Nov 02, 2014 at 11:06:13PM +0800, Amos Kong wrote: On Fri, Oct 31, 2014 at 03:23:21PM +0800, Herbert Xu wrote: On Fri, Oct 31, 2014 at 10:28:00AM +1030, Rusty Russell wrote: Herbert Xu herb...@gondor.apana.org.au writes: On Thu, Sep 18, 2014 at 08:37:45PM +0800, Amos Kong wrote: From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. Signed-off-by: Rusty Russell ru...@rustcorp.com.au You totally corrupted Rusty's patch. If you're going to repost his series you better make sure that you've got the right patches. Just as well though as it made me think a little more about this patch :) OK Amos, can you please repost the complete series? Please fix the unregister race I identified first. Ok, I redownload the patches from https://patchwork.kernel.org/project/LKML/list/?submitter=5 and rebase fixes of me and rusty on it. I will post a V4 later. Thanks. Please fix the unregister race I pointed out before doing a v4. Thanks for the remind, I got it right now :-) -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.
On Mon, Oct 20, 2014 at 10:42:11AM +1030, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: We got a warning in boot stage when above set_current_rng() is executed, it can be fixed by init rng-ref in hwrng_init(). @@ -166,6 +169,8 @@ static inline int hwrng_init(struct hwrng *rng) if (current_quality 0 !hwrng_fill) start_khwrngd(); + kref_init(rng-ref); + return 0; } OK, I folded this fix on. Thanks. Reviewed-by: Amos Kong ak...@redhat.com Thanks, Rusty. hw_random: use reference counts on each struct hwrng. current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. v3: initialize kref (thanks Amos Kong) v2: fix missing put_rng() on exit path (thanks Amos Kong) Signed-off-by: Rusty Russell ru...@rustcorp.com.au diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c818e13..0ecac38da954 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] hw_random: use reference counts on each struct hwrng.
On Thu, Sep 18, 2014 at 12:18:23PM +0930, Rusty Russell wrote: current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. Hi Rusty, Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 135 -- include/linux/hw_random.h | 2 + 2 files changed, 94 insertions(+), 43 deletions(-) ... static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ @@ -154,21 +202,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,7 +249,6 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); if (need_resched()) @@ -210,15 +258,16 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, err = -ERESTARTSYS; We need put_rng() in this error path. Otherwise, unhotplug will hang in the end of hwrng_unregister() |/* Just in case rng is reading right now, wait. */ |wait_event(rng_done, atomic_read(rng-ref.refcount) == 0); Steps to reproduce the hang: guest) # dd if=/dev/hwrng of=/dev/null cancel dd process after 10 seconds guest) # dd if=/dev/hwrng of=/dev/null hotunplug rng device from qemu monitor result: device can't be removed (still can find in QEMU monitor) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 96fa067..4e22d70 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -258,6 +258,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (signal_pending(current)) { err = -ERESTARTSYS; + put_rng(rng); goto out; } goto out; } + + put_rng(rng); } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex); - goto out_unlock; + put_rng(rng); + goto out; } @@ -257,8 +306,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev, err = hwrng_init(rng); if (err) break; - hwrng_cleanup(current_rng); - current_rng = rng; + drop_current_rng(); + set_current_rng(rng); err = 0; break; } @@ -272,17 +321,15 @@ static ssize_t hwrng_attr_current_show(struct device *dev, struct device_attribute *attr, char *buf) { - int err; ssize_t ret; - const char *name = none; + struct hwrng *rng; - err = mutex_lock_interruptible(rng_mutex); - if (err) - return -ERESTARTSYS; - if (current_rng) - name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); - mutex_unlock(rng_mutex); + rng = get_current_rng(); + if (IS_ERR(rng)) + return PTR_ERR(rng); + + ret = snprintf(buf, PAGE_SIZE, %s\n, rng ? rng-name : none); + put_rng(rng); return ret; } @@ -357,12 +404,16 @@ static int hwrng_fillfn(void *unused) long rc; while (!kthread_should_stop()) { -
[PATCH v2 6/6] hw_random: don't init list element we're about to add to list.
From: Rusty Russell ru...@rustcorp.com.au Another interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 6420409..12187dd 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -486,7 +486,6 @@ int hwrng_register(struct hwrng *rng) goto out_unlock; } } - INIT_LIST_HEAD(rng-list); list_add_tail(rng-list, rng_list); if (old_rng !rng-init) { -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock
In next patch, we use reference counting for each struct hwrng, changing reference count also needs to take mutex_lock. Before releasing the lock, if we try to stop a kthread that waits to take the lock to reduce the referencing count, deadlock will occur. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index b1b6042..a0905c8 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng) } } if (list_empty(rng_list)) { + mutex_unlock(rng_mutex); unregister_miscdev(); if (hwrng_fill) kthread_stop(hwrng_fill); - } - - mutex_unlock(rng_mutex); + } else + mutex_unlock(rng_mutex); } EXPORT_SYMBOL_GPL(hwrng_unregister); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 4/6] hw_random: fix unregister race.
From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index ee9e504..9f6f5bb 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; +static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to off */ @@ -98,6 +99,7 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 5/6] hw_random: don't double-check old_rng.
From: Rusty Russell ru...@rustcorp.com.au Interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 9f6f5bb..6420409 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -473,14 +473,13 @@ int hwrng_register(struct hwrng *rng) } old_rng = current_rng; + err = 0; if (!old_rng) { err = hwrng_init(rng); if (err) goto out_unlock; set_current_rng(rng); - } - err = 0; - if (!old_rng) { + err = register_miscdev(); if (err) { drop_current_rng(); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.
From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. V2: reduce reference count in signal_pending path Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 136 +- include/linux/hw_random.h | 2 + 2 files changed, 95 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c8..ee9e504 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -42,6 +42,7 @@ #include linux/delay.h #include linux/slab.h #include linux/random.h +#include linux/err.h #include asm/uaccess.h @@ -91,6 +92,59 @@ static void add_early_randomness(struct hwrng *rng) add_device_randomness(bytes, bytes_read); } +static inline void cleanup_rng(struct kref *kref) +{ + struct hwrng *rng = container_of(kref, struct hwrng, ref); + + if (rng-cleanup) + rng-cleanup(rng); +} + +static void set_current_rng(struct hwrng *rng) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + kref_get(rng-ref); + current_rng = rng; +} + +static void drop_current_rng(void) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; +} + +/* Returns ERR_PTR(), NULL or refcounted hwrng */ +static struct hwrng *get_current_rng(void) +{ + struct hwrng *rng; + + if (mutex_lock_interruptible(rng_mutex)) + return ERR_PTR(-ERESTARTSYS); + + rng = current_rng; + if (rng) + kref_get(rng-ref); + + mutex_unlock(rng_mutex); + return rng; +} + +static void put_rng(struct hwrng *rng) +{ + /* +* Hold rng_mutex here so we serialize in case they set_current_rng +* on rng again immediately. +*/ + mutex_lock(rng_mutex); + if (rng) + kref_put(rng-ref, cleanup_rng); + mutex_unlock(rng_mutex); +} + static inline int hwrng_init(struct hwrng *rng) { if (rng-init) { @@ -113,12 +167,6 @@ static inline int hwrng_init(struct hwrng *rng) return 0; } -static inline void hwrng_cleanup(struct hwrng *rng) -{ - if (rng rng-cleanup) - rng-cleanup(rng); -} - static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ @@ -154,21 +202,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,7 +249,6 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); if (need_resched()) @@ -208,17 +256,19 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (signal_pending(current)) { err = -ERESTARTSYS; + put_rng(rng); goto out; } + + put_rng(rng); } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex); - goto out_unlock; + put_rng(rng); + goto out; } @@ -257,8 +307,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev, err = hwrng_init(rng); if (err
[PATCH v2 0/6] fix hw_random stuck
When I hotunplug a busy virtio-rng device or try to access hwrng attributes in non-smp guest, it gets stuck. My original was pain, Rusty posted a real fix. This patchset fixed two issue in v1, and tested by my 6+ cases. | test 0: | hotunplug rng device from qemu monitor | | test 1: | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 2: | guest) # dd if=/dev/random of=/dev/null | hotunplug rng device from qemu monitor | | test 4: | guest) # dd if=/dev/hwrng of=/dev/null | cat /sys/devices/virtual/misc/hw_random/rng_* | | test 5: | guest) # dd if=/dev/hwrng of=/dev/null | cancel dd process after 10 seconds | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 6: | use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference counting issue Amos Kong (1): hw_random: move some code out mutex_lock for avoiding underlying deadlock Rusty Russell (5): hw_random: place mutex around read functions and buffers. hw_random: use reference counts on each struct hwrng. hw_random: fix unregister race. hw_random: don't double-check old_rng. hw_random: don't init list element we're about to add to list. drivers/char/hw_random/core.c | 166 +- include/linux/hw_random.h | 2 + 2 files changed, 117 insertions(+), 51 deletions(-) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 1/6] hw_random: place mutex around read functions and buffers.
From: Rusty Russell ru...@rustcorp.com.au There's currently a big lock around everything, and it means that we can't query sysfs (eg /sys/devices/virtual/misc/hw_random/rng_current) while the rng is reading. This is a real problem when the rng is slow, or blocked (eg. virtio_rng with qemu's default /dev/random backend) This doesn't help (it leaves the current lock untouched), just adds a lock to protect the read function and the static buffers, in preparation for transition. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..b1b6042 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -53,7 +53,10 @@ static struct hwrng *current_rng; static struct task_struct *hwrng_fill; static LIST_HEAD(rng_list); +/* Protects rng_list and current_rng */ static DEFINE_MUTEX(rng_mutex); +/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */ +static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; static unsigned short current_quality; @@ -81,7 +84,9 @@ static void add_early_randomness(struct hwrng *rng) unsigned char bytes[16]; int bytes_read; + mutex_lock(reading_mutex); bytes_read = rng_get_data(rng, bytes, sizeof(bytes), 1); + mutex_unlock(reading_mutex); if (bytes_read 0) add_device_randomness(bytes, bytes_read); } @@ -128,6 +133,7 @@ static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size, int wait) { int present; + BUG_ON(!mutex_is_locked(reading_mutex)); if (rng-read) return rng-read(rng, (void *)buffer, size, wait); @@ -160,13 +166,14 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, goto out_unlock; } + mutex_lock(reading_mutex); if (!data_avail) { bytes_read = rng_get_data(current_rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { err = bytes_read; - goto out_unlock; + goto out_unlock_reading; } data_avail = bytes_read; } @@ -174,7 +181,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (!data_avail) { if (filp-f_flags O_NONBLOCK) { err = -EAGAIN; - goto out_unlock; + goto out_unlock_reading; } } else { len = data_avail; @@ -186,7 +193,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (copy_to_user(buf + ret, rng_buffer + data_avail, len)) { err = -EFAULT; - goto out_unlock; + goto out_unlock_reading; } size -= len; @@ -194,6 +201,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + mutex_unlock(reading_mutex); if (need_resched()) schedule_timeout_interruptible(1); @@ -208,6 +216,9 @@ out: out_unlock: mutex_unlock(rng_mutex); goto out; +out_unlock_reading: + mutex_unlock(reading_mutex); + goto out_unlock; } @@ -348,13 +359,16 @@ static int hwrng_fillfn(void *unused) while (!kthread_should_stop()) { if (!current_rng) break; + mutex_lock(reading_mutex); rc = rng_get_data(current_rng, rng_fillbuf, rng_buffer_size(), 1); + mutex_unlock(reading_mutex); if (rc = 0) { pr_warn(hwrng: no data available\n); msleep_interruptible(1); continue; } + /* Outside lock, sure, but y'know: randomness. */ add_hwgenerator_randomness((void *)rng_fillbuf, rc, rc * current_quality * 8 10); } -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 2/3] hw_random: fix stuck in catting hwrng attributes
On Thu, Sep 18, 2014 at 12:13:08PM +0930, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: I started a QEMU (non-smp) guest with one virtio-rng device, and read random data from /dev/hwrng by dd: # dd if=/dev/hwrng of=/dev/null In the same time, if I check hwrng attributes from sysfs by cat: # cat /sys/class/misc/hw_random/rng_* The cat process always gets stuck with slow backend (5 k/s), if we use a quick backend (1.2 M/s), the cat process will cost 1 to 2 minutes. The stuck doesn't exist for smp guest. Reading syscall enters kernel and call rng_dev_read(), it's user context. We used need_resched() to check if other tasks need to be run, but it almost always return false, and re-hold the mutex lock. The attributes accessing process always fails to hold the lock, so the cat gets stuck. User context doesn't allow other user contexts run on that CPU, unless the kernel code sleeps for some reason. This is why the need_reshed() always return false here. This patch removed need_resched() and always schedule other tasks then other tasks can have chance to hold the lock and execute protected code. Hi Rusty, OK, this is going to be a rant. Your explanation doesn't make sense at all. Worse, your solution breaks the advice of Kernighan Plaugher: Don't patch bad code - rewrite it.. But worst of all, this detailed explanation might have convinced me you understood the problem better than I did, and applied your patch. I'm sorry about the misleading. I did some tests. For me, as expected, the process spends its time inside the virtio rng read function, holding the mutex and thus blocking sysfs access; it's not a failure of this code at all. Got it now. The catting hang bug was found when I try to fix unhotplug issue, the unhotplug issue can't be reproduced if I try to debug by gdb or printk. So I forgot to debug cat hang ... but spend time to misunderstand schedle code :( Your schedule_timeout() fix probably just helps by letting the host refresh entropy, so we spend less time waiting in the read fn. I will post a series, which unfortunately is only lightly tested, then I'm going to have some beer to begin my holiday. That may help me forget my disappointment at seeing respected fellow developers monkey-patching random code they don't understand. I just posted a V2 with two additional fixes, hotunplugging works well now :) Grrr Enjoy your holiday! Amos Rusty. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..263a370 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(1); if (signal_pending(current)) { err = -ERESTARTSYS; -- 1.9.3 -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.
On Thu, Sep 18, 2014 at 08:37:44PM +0800, Amos Kong wrote: From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. V2: reduce reference count in signal_pending path Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com Reply myself. --- drivers/char/hw_random/core.c | 136 +- include/linux/hw_random.h | 2 + 2 files changed, 95 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c8..ee9e504 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -42,6 +42,7 @@ #include linux/delay.h #include linux/slab.h #include linux/random.h +#include linux/err.h #include asm/uaccess.h @@ -91,6 +92,59 @@ static void add_early_randomness(struct hwrng *rng) add_device_randomness(bytes, bytes_read); } +static inline void cleanup_rng(struct kref *kref) +{ + struct hwrng *rng = container_of(kref, struct hwrng, ref); + + if (rng-cleanup) + rng-cleanup(rng); +} + +static void set_current_rng(struct hwrng *rng) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + kref_get(rng-ref); + current_rng = rng; +} + +static void drop_current_rng(void) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; +} + +/* Returns ERR_PTR(), NULL or refcounted hwrng */ +static struct hwrng *get_current_rng(void) +{ + struct hwrng *rng; + + if (mutex_lock_interruptible(rng_mutex)) + return ERR_PTR(-ERESTARTSYS); + + rng = current_rng; + if (rng) + kref_get(rng-ref); + + mutex_unlock(rng_mutex); + return rng; +} + +static void put_rng(struct hwrng *rng) +{ + /* + * Hold rng_mutex here so we serialize in case they set_current_rng + * on rng again immediately. + */ + mutex_lock(rng_mutex); + if (rng) + kref_put(rng-ref, cleanup_rng); + mutex_unlock(rng_mutex); +} + static inline int hwrng_init(struct hwrng *rng) { if (rng-init) { @@ -113,12 +167,6 @@ static inline int hwrng_init(struct hwrng *rng) return 0; } -static inline void hwrng_cleanup(struct hwrng *rng) -{ - if (rng rng-cleanup) - rng-cleanup(rng); -} - static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ @@ -154,21 +202,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,7 +249,6 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); if (need_resched()) @@ -208,17 +256,19 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (signal_pending(current)) { err = -ERESTARTSYS; + put_rng(rng); goto out; } + + put_rng(rng); } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex); - goto out_unlock; + put_rng(rng); + goto out; } @@ -257,8 +307,8 @@ static ssize_t hwrng_attr_current_store(struct
[PATCH v2 1/3] virtio-rng cleanup: move some code out of mutex protection
It doesn't save too much cpu time as expected, just a cleanup. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..c591d7e 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -270,8 +270,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); return ret; } @@ -284,19 +284,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); return ret; } -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 3/3] hw_random: increase schedule timeout in rng_dev_read()
This patch increases the schedule timeout to 10 jiffies, it's more appropriate, then other takes can easy to hold the mutex lock. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 263a370..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,7 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); if (signal_pending(current)) { err = -ERESTARTSYS; -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 0/3] fix stuck in accessing hwrng attributes
If we read hwrng by long-running dd process, it takes too much cpu time and almost hold the mutex lock. When we check hwrng attributes from sysfs by cat, it gets stuck in waiting the lock releaseing. The problem can only be reproduced with non-smp guest with slow backend. This patchset resolves the issue by changing rng_dev_read() to always schedule 10 jiffies after release mutex lock, then cat process can have chance to get the lock and execute protected code without stuck. Thanks. V2: update commitlog to describe PATCH 2, split second patch. Amos Kong (3): virtio-rng cleanup: move some code out of mutex protection hw_random: fix stuck in catting hwrng attributes hw_random: increase schedule timeout in rng_dev_read() drivers/char/hw_random/core.c | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 2/3] hw_random: fix stuck in catting hwrng attributes
I started a QEMU (non-smp) guest with one virtio-rng device, and read random data from /dev/hwrng by dd: # dd if=/dev/hwrng of=/dev/null In the same time, if I check hwrng attributes from sysfs by cat: # cat /sys/class/misc/hw_random/rng_* The cat process always gets stuck with slow backend (5 k/s), if we use a quick backend (1.2 M/s), the cat process will cost 1 to 2 minutes. The stuck doesn't exist for smp guest. Reading syscall enters kernel and call rng_dev_read(), it's user context. We used need_resched() to check if other tasks need to be run, but it almost always return false, and re-hold the mutex lock. The attributes accessing process always fails to hold the lock, so the cat gets stuck. User context doesn't allow other user contexts run on that CPU, unless the kernel code sleeps for some reason. This is why the need_reshed() always return false here. This patch removed need_resched() and always schedule other tasks then other tasks can have chance to hold the lock and execute protected code. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..263a370 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(1); if (signal_pending(current)) { err = -ERESTARTSYS; -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 3/3] hw_random: increase schedule timeout in rng_dev_read()
On Mon, Sep 15, 2014 at 06:13:31PM +0200, Michael Büsch wrote: On Tue, 16 Sep 2014 00:02:29 +0800 Amos Kong ak...@redhat.com wrote: This patch increases the schedule timeout to 10 jiffies, it's more appropriate, then other takes can easy to hold the mutex lock. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 263a370..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,7 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); if (signal_pending(current)) { err = -ERESTARTSYS; Does a schedule of 1 ms or 10 ms decrease the throughput? In my test environment, 1 jiffe always works (100%), as suggested by Amit 10 jiffes is more appropriate. After applied current 3 patches, there is a throughput regression. 1.2 M/s - 6 K/s We can only schedule in the end of loop (size == 0), and only for non-smp guest. So smp guest won't be effected. | if (!size num_online_cpus() == 1) | schedule_timeout_interruptible(timeout); Set timeout to 1: non-smp guest with quick backend (1.2M/s) - about 49K/s) Set timeout to 10: non-smp guest with quick backend (1.2M/s) - about 490K/s) We might need other benchmark to test the performance, but we can see the bug clearly caused a regression. As we discussed in other thread, need_resched() should work in this case, so those patches might be wrong fixing. I think we need some benchmarks. -- Michael -- Amos. pgpBH8wzlVvPd.pgp Description: PGP signature
Re: [PATCH v2 1/3] virtio-rng cleanup: move some code out of mutex protection
On Mon, Sep 15, 2014 at 06:13:20PM +0200, Michael Büsch wrote: On Tue, 16 Sep 2014 00:02:27 +0800 Amos Kong ak...@redhat.com wrote: It doesn't save too much cpu time as expected, just a cleanup. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..c591d7e 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -270,8 +270,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); I'm not sure this is safe. Name is just a pointer. What if the hwrng gets unregistered after unlock and just before the snprintf? Oh, it points to protected current_rng-name, I will drop this cleanup. Thanks. return ret; } @@ -284,19 +284,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); return ret; } This looks ok. -- Michael -- Amos. pgpFpxTwIJswL.pgp Description: PGP signature
Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
CC linux-kernel Original thread: http://comments.gmane.org/gmane.linux.kernel.virtualization/22775 On Mon, Sep 15, 2014 at 06:48:46PM +0200, Radim Krčmář wrote: 2014-09-14 10:25+0800, Amos Kong: On Sun, Sep 14, 2014 at 09:12:08AM +0800, Amos Kong wrote: ... diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); If cond_resched() does not work, it is a bug elsewehere. Thanks for your reply, Jason also told me the TIF_NEED_RESCHED should be set in this case, then need_resched() returns true. I will investigate the issue and reply you later. Problem only occurred in non-smp guest, we can improve it to: if(!is_smp()) schedule_timeout_interruptible(10); is_smp() is only available for arm arch, we need a general one. (It is num_online_cpus() 1.) -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
On Thu, Sep 11, 2014 at 09:08:03PM +0930, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: When I check hwrng attributes in sysfs, cat process always gets stuck if guest has only 1 vcpu and uses a slow rng backend. Currently we check if there is any tasks waiting to be run on current cpu in rng_dev_read() by need_resched(). But need_resched() doesn't work because rng_dev_read() is executing in user context. I don't understand this explanation? I'd expect the sysfs process to be woken by the mutex_unlock(). But actually sysfs process's not woken always, this is they the process gets stuck. If we're really high priority (vs. the sysfs process) then I can see why we'd need schedule_timeout_interruptible() instead of just schedule(), and in that case, need_resched() would be false too. You could argue that's intended behaviour, but I can't see how it happens in the normal case anyway. What am I missing? Thanks, Rusty. This patch removed need_resched() and increase delay to 10 jiffies, then other tasks can have chance to execute protected code. Delaying 1 jiffy also works, but 10 jiffies is safer. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); if (signal_pending(current)) { err = -ERESTARTSYS; -- 1.9.3 -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
On Sun, Sep 14, 2014 at 01:12:58AM +0800, Amos Kong wrote: On Thu, Sep 11, 2014 at 09:08:03PM +0930, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: When I check hwrng attributes in sysfs, cat process always gets stuck if guest has only 1 vcpu and uses a slow rng backend. Currently we check if there is any tasks waiting to be run on current cpu in rng_dev_read() by need_resched(). But need_resched() doesn't work because rng_dev_read() is executing in user context. I don't understand this explanation? I'd expect the sysfs process to be woken by the mutex_unlock(). But actually sysfs process's not woken always, this is they the process gets stuck. %s/they/why/ Hi Rusty, Reference: http://www.linuxgrill.com/anonymous/fire/netfilter/kernel-hacking-HOWTO-2.html read() syscall of /dev/hwrng will enter into kernel, the read operation is rng_dev_read(), it's userspace context (not interrupt context). Userspace context doesn't allow other user contexts run on that CPU, unless the kernel code sleeps for some reason. In this case, the need_resched() doesn't work. My solution is removing need_resched() and use an appropriate delay by schedule_timeout_interruptible(10). Thanks, Amos If we're really high priority (vs. the sysfs process) then I can see why we'd need schedule_timeout_interruptible() instead of just schedule(), and in that case, need_resched() would be false too. You could argue that's intended behaviour, but I can't see how it happens in the normal case anyway. What am I missing? Thanks, Rusty. This patch removed need_resched() and increase delay to 10 jiffies, then other tasks can have chance to execute protected code. Delaying 1 jiffy also works, but 10 jiffies is safer. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); if (signal_pending(current)) { err = -ERESTARTSYS; -- -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
On Thu, Sep 11, 2014 at 11:38:38AM +0530, Amit Shah wrote: On (Wed) 10 Sep 2014 [17:07:07], Amos Kong wrote: When I check hwrng attributes in sysfs, cat process always gets stuck if guest has only 1 vcpu and uses a slow rng backend. Currently we check if there is any tasks waiting to be run on current cpu in rng_dev_read() by need_resched(). But need_resched() doesn't work because rng_dev_read() is executing in user context. This patch removed need_resched() and increase delay to 10 jiffies, then other tasks can have chance to execute protected code. Delaying 1 jiffy also works, but 10 jiffies is safer. Hi Amit, I'd prefer two patches for this one: one to remove the need_resched() check, and the other to increase the timeout. If Rusty agrees with this fix, I will respin to update the commitlog with clear description and split the patches to 3. Thanks for the review. Anyway, Reviewed-by: Amit Shah amit.s...@redhat.com -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
On Sun, Sep 14, 2014 at 09:12:08AM +0800, Amos Kong wrote: ... diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); Problem only occurred in non-smp guest, we can improve it to: if(!is_smp()) schedule_timeout_interruptible(10); is_smp() is only available for arm arch, we need a general one. if (signal_pending(current)) { err = -ERESTARTSYS; -- -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 0/2] virtio-rng: fix hotunplug
RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1121540 When we try to hot-unplugging a busy virtio-rng device, the device can't be removed. And the reading process in guest gets stuck. Those two patches fixed this issue by completing have_data completion and preventing invalid reading. Thanks for the help of Amit. Cc: sta...@vger.kernel.org V2: reset data_avail (Amit) adjust unregister order V3: split patch, update commitlog Amos Kong (2): virtio-rng: fix stuck of hot-unplugging busy device virtio-rng: skip reading when we start to remove the device drivers/char/hw_random/virtio-rng.c | 7 +++ 1 file changed, 7 insertions(+) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 1/2] virtio-rng: fix stuck of hot-unplugging busy device
When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch exits the waiting by completing have_data completion before unregistering, resets data_avail to avoid the hwrng core use wrong buffer bytes. Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- drivers/char/hw_random/virtio-rng.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..849b228 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -137,6 +137,8 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-data_avail = 0; + complete(vi-have_data); vdev-config-reset(vdev); vi-busy = false; if (vi-hwrng_register_done) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 2/2] virtio-rng: skip reading when we start to remove the device
Before we really unregister the hwrng device, reading will get stuck if the virtio device is reset. We should return error for reading when we start to remove the device. Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- drivers/char/hw_random/virtio-rng.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 849b228..132c9cc 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -36,6 +36,7 @@ struct virtrng_info { int index; bool busy; bool hwrng_register_done; + bool hwrng_removed; }; @@ -68,6 +69,9 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) int ret; struct virtrng_info *vi = (struct virtrng_info *)rng-priv; + if (vi-hwrng_removed) + return -ENODEV; + if (!vi-busy) { vi-busy = true; init_completion(vi-have_data); @@ -137,6 +141,7 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-hwrng_removed = true; vi-data_avail = 0; complete(vi-have_data); vdev-config-reset(vdev); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] virtio-rng: fix stuck of hot-unplugging busy device
On Wed, Sep 10, 2014 at 11:04:54AM +0530, Amit Shah wrote: Hi Amos, On (Tue) 09 Sep 2014 [19:14:02], Amos Kong wrote: When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch exits the waiting by completing have_data completion before unregistering, resets data_avail to avoid the hwrng core use wrong buffer bytes. Before real unregistering we should return -ENODEV for reading. When we hot-unplug the device, dd process in guest will exit. $ dd if=/dev/hwrng of=/dev/null dd: error reading ‘/dev/hwrng’: No such device Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- V2: reset data_avail (Amit) adjust unregister order Thanks, this looks good. Can you please split into two parts, the complete() in one, and the hwrng_register_done stuff into another? I just posted the V3, split to two patches, and updated the commitlog to describe why we need to return ENODEV for reading. Thanks, Amit -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: RFC virtio-rng: fail to read sysfs of a busy device
On Wed, Sep 10, 2014 at 11:22:12AM +0530, Amit Shah wrote: On (Tue) 09 Sep 2014 [23:23:07], Amos Kong wrote: (Resend to fix the subject) Hi Amit, Rusty RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062 steps: - Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest - check sysfs files in the same time, 'cat /sys/class/misc/hw_random/rng_*' Result: cat process will get stuck, it will return if we kill dd process. How common is it going to be to have a long-running 'dd' process on /dev/hwrng? Not a common usage, but we have this strict testing. Also, with the new khwrng thread, reading from /dev/hwrng isn't required -- just use /dev/random? Yes. (This doesn't mean we shouldn't fix the issue here...) Completely agree :-) We have some static variables (eg, current_rng, data_avail, etc) in hw_random/core.c, they are protected by rng_mutex. I try to workaround this issue by undelay(100) after mutex_unlock() in rng_dev_read(). This gives chance for hwrng_attr_*_show() to get mutex. This patch also contains some cleanup, moving some code out of mutex protection. Do you have some suggestion? Thanks. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..fa69020 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + udelay(100); We have a need_resched() right below. Why doesn't that work? need_resched() is giving chance for userspace to if (need_resched()) It never success in my debugging. If we remove this check and always call schedule_timeout_interruptible(1), problem also disappears. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..263a370 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(1); if (signal_pending(current)) { err = -ERESTARTSYS; schedule_timeout_interruptible(1); @@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct device *dev, int err; struct hwrng *rng; The following hunk doesn't work: + err = -ENODEV; err = mutex_lock_interruptible(rng_mutex); err is being set to another value in the next line! if (err) return -ERESTARTSYS; - err = -ENODEV; And all usage of err below now won't have -ENODEV but some other value. Oops! list_for_each_entry(rng, rng_list, list) { if (strcmp(rng-name, buf) == 0) { if (rng == current_rng) { @@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); This looks OK... return ret; } @@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); But this isn't resulting in savings; the majority of the time is being spent in the for loop, and that writes to the buffer. Right BTW I don't expect strcat'ing to the buf in each of these scenarios is a long operation, so this reworking doesn't strike to me as something we should pursue. Amit -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: RFC virtio-rng: fail to read sysfs of a busy device
On Wed, Sep 10, 2014 at 02:49:38PM +0800, Amos Kong wrote: On Wed, Sep 10, 2014 at 11:22:12AM +0530, Amit Shah wrote: On (Tue) 09 Sep 2014 [23:23:07], Amos Kong wrote: (Resend to fix the subject) Hi Amit, Rusty RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062 steps: - Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest - check sysfs files in the same time, 'cat /sys/class/misc/hw_random/rng_*' Result: cat process will get stuck, it will return if we kill dd process. How common is it going to be to have a long-running 'dd' process on /dev/hwrng? Not a common usage, but we have this strict testing. For -smp 1: It's easy to reproduce with slow backend (/dev/random). cat can return most of time with some delay if we use quick backend (/dev/urandom). But for -smp 2: I didn't touch this problem even with slow backend. Also, with the new khwrng thread, reading from /dev/hwrng isn't required -- just use /dev/random? Yes. (This doesn't mean we shouldn't fix the issue here...) Completely agree :-) We have some static variables (eg, current_rng, data_avail, etc) in hw_random/core.c, they are protected by rng_mutex. I try to workaround this issue by undelay(100) after mutex_unlock() in rng_dev_read(). This gives chance for hwrng_attr_*_show() to get mutex. This patch also contains some cleanup, moving some code out of mutex protection. Do you have some suggestion? Thanks. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..fa69020 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + udelay(100); We have a need_resched() right below. Why doesn't that work? [smp 1] Why need_resched() always return zero? what's the original purpose of it ? if (need_resched()) It never success in my debugging. If we remove this check and always call schedule_timeout_interruptible(1), problem also disappears. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..263a370 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(1); if (signal_pending(current)) { err = -ERESTARTSYS; schedule_timeout_interruptible(1); @@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct device *dev, int err; struct hwrng *rng; The following hunk doesn't work: + err = -ENODEV; err = mutex_lock_interruptible(rng_mutex); err is being set to another value in the next line! if (err) return -ERESTARTSYS; - err = -ENODEV; And all usage of err below now won't have -ENODEV but some other value. Oops! list_for_each_entry(rng, rng_list, list) { if (strcmp(rng-name, buf) == 0) { if (rng == current_rng) { @@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); This looks OK... return ret; } @@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); But this isn't resulting in savings; the majority of the time is being spent in the for loop, and that writes to the buffer. Right BTW I don't expect strcat'ing to the buf in each of these scenarios is a long operation, so this reworking doesn't strike to me as something we should pursue. Amit -- Amos. -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo
[PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
When I check hwrng attributes in sysfs, cat process always gets stuck if guest has only 1 vcpu and uses a slow rng backend. Currently we check if there is any tasks waiting to be run on current cpu in rng_dev_read() by need_resched(). But need_resched() doesn't work because rng_dev_read() is executing in user context. This patch removed need_resched() and increase delay to 10 jiffies, then other tasks can have chance to execute protected code. Delaying 1 jiffy also works, but 10 jiffies is safer. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); if (signal_pending(current)) { err = -ERESTARTSYS; -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] virtio-rng cleanup: move some code out of mutex protection
It doesn't save too much cpu time as expected, just a cleanup. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..c591d7e 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -270,8 +270,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); return ret; } @@ -284,19 +284,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); return ret; } -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/2] fix stuck in catting hwrng attributes
If we read hwrng by long-running dd process, it takes too much cpu time. When we check hwrng attributes from sysfs by cat, it gets stuck. The problem can only be reproduced with non-smp guest with slow backend. This patchset changed hwrng core to always delay 10 jiffies, cat process have chance to execute protected code, the problem is resolved. Thanks. Amos Kong (2): virtio-rng cleanup: move some code out of mutex protection virtio-rng: fix stuck in catting hwrng attributes drivers/char/hw_random/core.c | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] virtio-rng: complete have_data completion in removing device
On Mon, Sep 08, 2014 at 11:29:51PM +0800, Amos Kong wrote: On Wed, Aug 06, 2014 at 01:55:29PM +0530, Amit Shah wrote: On (Wed) 06 Aug 2014 [16:05:41], Amos Kong wrote: On Wed, Aug 06, 2014 at 01:35:15AM +0800, Amos Kong wrote: When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch fixed the hang by completing have_data completion before unregistering a virtio-rng device. Hi Amit, Before applying this patch, it's blocking insider wait_for_completion_killable() Applied this patch, wait_for_completion_killable() returns 0, and vi-data_avail becomes 0, then rng_get_date() will return 0. Thanks for checking this. Is it expected result? Hi Amit I think what will happen is vi-data_avail will be set to whatever it was set last. In case of a previous successful read request, the data_avail will be set to whatever number of bytes the host gave. On doing a hot-unplug on the succeeding wait, the value in data_avail will be re-used, and the hwrng core will wrongly take some bytes in the buffer as input from the host. So, I think we need to set vi-data_avail = 0; before calling wait_event_completion_killable(). I finally understand content, we need to set vi-data_avail to 0 before returning virtio_read(), it might enter wait_event_completion_killable() when we try to remove the device. We call complete() in remove_common(), then wait_event_completion_killable() will exit, but virtio_read() will be re-entered if the device still isn't unregistered, then re-stuck inside wait_event_completion_killable(). I tested some complex condition (both quick/slow backend), I found some problem in below two patches. I will post another fix later. The test result is expected. 1. Hotplug remove virtio-rng0, dd process will exit with an error: dd: error reading ‘/dev/hwrng’: Operation not permitted virtio-rng0 disappear from 'info pci' 2. Re-read by dd, hotplug virtio-rng1, dd process exit with same error, virtio-rng1 disappear Thanks, Amos Amit In my latest debugging, I found the hang is caused by unexpected reading when we started to remove the device. I have two draft fix, 1) is skip unexpected reading by checking a remove flag. 2) is unregistering device at the beginning of remove_common(). I think second patch is better if it won't cause new problem. The original patch (complete in remove_common()) is still necessary. Test results: hotplug issue disappeared (dd process will quit). diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..028797c 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -35,6 +35,7 @@ struct virtrng_info { unsigned int data_avail; int index; bool busy; +bool remove; bool hwrng_register_done; }; @@ -68,6 +69,9 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) int ret; struct virtrng_info *vi = (struct virtrng_info *)rng-priv; +if (vi-remove) + return 0; + if (!vi-busy) { vi-busy = true; init_completion(vi-have_data); @@ -137,6 +141,8 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-remove = true; + complete(vi-have_data); vdev-config-reset(vdev); vi-busy = false; if (vi-hwrng_register_done) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..9b8c2ce 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -137,10 +137,11 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; - vdev-config-reset(vdev); - vi-busy = false; if (vi-hwrng_register_done) hwrng_unregister(vi-hwrng); + complete(vi-have_data); + vdev-config-reset(vdev); + vi-busy = false; vdev-config-del_vqs(vdev); ida_simple_remove(rng_index_ida, vi-index); kfree(vi); -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] virtio-rng: fix stuck of hot-unplugging busy device
When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch exits the waiting by completing have_data completion before unregistering, resets data_avail to avoid the hwrng core use wrong buffer bytes. Before real unregistering we should return -ENODEV for reading. When we hot-unplug the device, dd process in guest will exit. $ dd if=/dev/hwrng of=/dev/null dd: error reading ‘/dev/hwrng’: No such device Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- V2: reset data_avail (Amit) adjust unregister order --- drivers/char/hw_random/virtio-rng.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..e76433b 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -68,6 +68,10 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) int ret; struct virtrng_info *vi = (struct virtrng_info *)rng-priv; + if (!vi-hwrng_register_done) { + return -ENODEV; + } + if (!vi-busy) { vi-busy = true; init_completion(vi-have_data); @@ -137,10 +141,14 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-data_avail = 0; + complete(vi-have_data); vdev-config-reset(vdev); vi-busy = false; - if (vi-hwrng_register_done) + if (vi-hwrng_register_done) { + vi-hwrng_register_done = false; hwrng_unregister(vi-hwrng); + } vdev-config-del_vqs(vdev); ida_simple_remove(rng_index_ida, vi-index); kfree(vi); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
mutex
Hi Amit, Rusty RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062 steps: - Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest - check sysfs files in the same time, 'cat /sys/class/misc/hw_random/rng_*' Result: cat process will get stuck, it will return if we kill dd process. We have some static variables (eg, current_rng, data_avail, etc) in hw_random/core.c, they are protected by rng_mutex. I try to workaround this issue by undelay(100) after mutex_unlock() in rng_dev_read(). This gives chance for hwrng_attr_*_show() to get mutex. This patch also contains some cleanup, moving some code out of mutex protection. Do you have some suggestion? Thanks. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..fa69020 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + udelay(100); if (need_resched()) schedule_timeout_interruptible(1); @@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct device *dev, int err; struct hwrng *rng; + err = -ENODEV; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - err = -ENODEV; list_for_each_entry(rng, rng_list, list) { if (strcmp(rng-name, buf) == 0) { if (rng == current_rng) { @@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); return ret; } @@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); return ret; } -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RFC virtio-rng: fail to read sysfs of a busy device
(Resend to fix the subject) Hi Amit, Rusty RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062 steps: - Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest - check sysfs files in the same time, 'cat /sys/class/misc/hw_random/rng_*' Result: cat process will get stuck, it will return if we kill dd process. We have some static variables (eg, current_rng, data_avail, etc) in hw_random/core.c, they are protected by rng_mutex. I try to workaround this issue by undelay(100) after mutex_unlock() in rng_dev_read(). This gives chance for hwrng_attr_*_show() to get mutex. This patch also contains some cleanup, moving some code out of mutex protection. Do you have some suggestion? Thanks. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..fa69020 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + udelay(100); if (need_resched()) schedule_timeout_interruptible(1); @@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct device *dev, int err; struct hwrng *rng; + err = -ENODEV; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - err = -ENODEV; list_for_each_entry(rng, rng_list, list) { if (strcmp(rng-name, buf) == 0) { if (rng == current_rng) { @@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); return ret; } @@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); return ret; } -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] virtio-rng: complete have_data completion in removing device
On Wed, Aug 06, 2014 at 01:55:29PM +0530, Amit Shah wrote: On (Wed) 06 Aug 2014 [16:05:41], Amos Kong wrote: On Wed, Aug 06, 2014 at 01:35:15AM +0800, Amos Kong wrote: When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch fixed the hang by completing have_data completion before unregistering a virtio-rng device. Hi Amit, Before applying this patch, it's blocking insider wait_for_completion_killable() Applied this patch, wait_for_completion_killable() returns 0, and vi-data_avail becomes 0, then rng_get_date() will return 0. Thanks for checking this. Is it expected result? I think what will happen is vi-data_avail will be set to whatever it was set last. In case of a previous successful read request, the data_avail will be set to whatever number of bytes the host gave. On doing a hot-unplug on the succeeding wait, the value in data_avail will be re-used, and the hwrng core will wrongly take some bytes in the buffer as input from the host. So, I think we need to set vi-data_avail = 0; before calling wait_event_completion_killable(). Amit In my latest debugging, I found the hang is caused by unexpected reading when we started to remove the device. I have two draft fix, 1) is skip unexpected reading by checking a remove flag. 2) is unregistering device at the beginning of remove_common(). I think second patch is better if it won't cause new problem. The original patch (complete in remove_common()) is still necessary. Test results: hotplug issue disappeared (dd process will quit). diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..028797c 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -35,6 +35,7 @@ struct virtrng_info { unsigned int data_avail; int index; bool busy; +bool remove; bool hwrng_register_done; }; @@ -68,6 +69,9 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) int ret; struct virtrng_info *vi = (struct virtrng_info *)rng-priv; +if (vi-remove) + return 0; + if (!vi-busy) { vi-busy = true; init_completion(vi-have_data); @@ -137,6 +141,8 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-remove = true; + complete(vi-have_data); vdev-config-reset(vdev); vi-busy = false; if (vi-hwrng_register_done) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..9b8c2ce 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -137,10 +137,11 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; - vdev-config-reset(vdev); - vi-busy = false; if (vi-hwrng_register_done) hwrng_unregister(vi-hwrng); + complete(vi-have_data); + vdev-config-reset(vdev); + vi-busy = false; vdev-config-del_vqs(vdev); ida_simple_remove(rng_index_ida, vi-index); kfree(vi); -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] virtio-pci: share config interrupt between virtio devices
On Mon, Sep 01, 2014 at 09:37:30AM +0300, Michael S. Tsirkin wrote: Hi Michael, On Mon, Sep 01, 2014 at 01:41:54PM +0800, Amos Kong wrote: One VM only has 128 msix interrupt, virtio-config interrupt has less workload. This patch shares one normal interrupt Thanks for your quick reply. normal == INT#x? Please don't call it normal. The proper name is legacy INT#x. OK So you are trying to use legacy INT#x at the same time with MSI-X? This does not work: the PCI spec says: While enabled for MSI or MSI-X operation, a function is prohibited from using its INTx# pin (if implemented) to request service (MSI, MSI-X, and INTx# are mutually exclusive). It means we can't use interrupt and triggered-interrupt together for one PCI device. I will study this problem. does the patch work for you? If it does it might be a (minor) spec violation in kvm. I did some basic testing (multiple nics, scp, ping, etc), it works. Besides, INT#x really leads to terrible performance because sharing is forced even if there aren't many devices. Why do we need INT#x? How about setting IRQF_SHARED for the config interrupt while using MSI-X? You'd have to read ISR to check that the interrupt was intended for your device. I have a draft patch to share one MSI-X for all virtio-config, it has some problem in hotplugging devices. I will continue this way. for configuration between virtio devices. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/virtio/virtio_pci.c | 41 - 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 3d1463c..b1263b3 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -52,6 +52,7 @@ struct virtio_pci_device /* Name strings for interrupts. This size should be enough, * and I'm too lazy to allocate each name separately. */ char (*msix_names)[256]; + char config_msix_name[256]; /* Number of available vectors */ unsigned msix_vectors; /* Vectors allocated, excluding per-vq vectors if any */ @@ -282,12 +283,6 @@ static void vp_free_vectors(struct virtio_device *vdev) free_cpumask_var(vp_dev-msix_affinity_masks[i]); if (vp_dev-msix_enabled) { - /* Disable the vector used for configuration */ - iowrite16(VIRTIO_MSI_NO_VECTOR, - vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Flush the write out to device */ - ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - pci_disable_msix(vp_dev-pci_dev); vp_dev-msix_enabled = 0; } @@ -339,24 +334,18 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, goto error; vp_dev-msix_enabled = 1; - /* Set the vector used for configuration */ - v = vp_dev-msix_used_vectors; - snprintf(vp_dev-msix_names[v], sizeof *vp_dev-msix_names, + /* Set shared IRQ for configuration */ + snprintf(vp_dev-config_msix_name, sizeof(*vp_dev-msix_names), %s-config, name); - err = request_irq(vp_dev-msix_entries[v].vector, - vp_config_changed, 0, vp_dev-msix_names[v], + err = request_irq(vp_dev-pci_dev-irq, + vp_config_changed, + IRQF_SHARED, + vp_dev-config_msix_name, vp_dev); - if (err) - goto error; - ++vp_dev-msix_used_vectors; - - iowrite16(v, vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Verify we had enough resources to assign the vector */ - v = ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - if (v == VIRTIO_MSI_NO_VECTOR) { - err = -EBUSY; + if (!err) + vp_dev-intx_enabled = 1; + else goto error; - } if (!per_vq_vectors) { /* Shared vector for all VQs */ @@ -535,14 +524,16 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, goto error_request; } else { if (per_vq_vectors) { - /* Best option: one for change interrupt, one per vq. */ - nvectors = 1; + /* Best option: one normal interrupt for change, + one msix per vq. */ + nvectors = 0; for (i = 0; i nvqs; ++i) if (callbacks[i]) ++nvectors; } else { - /* Second best: one for change, shared for all vqs. */ - nvectors = 2; + /* Second best: one normal interrupt for + change, share one msix for all vqs. */ + nvectors = 1
[PATCH RFC] virtio-pci: share config interrupt between virtio devices
One VM only has 128 msix interrupt, virtio-config interrupt has less workload. This patch shares one normal interrupt for configuration between virtio devices. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/virtio/virtio_pci.c | 41 - 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 3d1463c..b1263b3 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -52,6 +52,7 @@ struct virtio_pci_device /* Name strings for interrupts. This size should be enough, * and I'm too lazy to allocate each name separately. */ char (*msix_names)[256]; + char config_msix_name[256]; /* Number of available vectors */ unsigned msix_vectors; /* Vectors allocated, excluding per-vq vectors if any */ @@ -282,12 +283,6 @@ static void vp_free_vectors(struct virtio_device *vdev) free_cpumask_var(vp_dev-msix_affinity_masks[i]); if (vp_dev-msix_enabled) { - /* Disable the vector used for configuration */ - iowrite16(VIRTIO_MSI_NO_VECTOR, - vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Flush the write out to device */ - ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - pci_disable_msix(vp_dev-pci_dev); vp_dev-msix_enabled = 0; } @@ -339,24 +334,18 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, goto error; vp_dev-msix_enabled = 1; - /* Set the vector used for configuration */ - v = vp_dev-msix_used_vectors; - snprintf(vp_dev-msix_names[v], sizeof *vp_dev-msix_names, + /* Set shared IRQ for configuration */ + snprintf(vp_dev-config_msix_name, sizeof(*vp_dev-msix_names), %s-config, name); - err = request_irq(vp_dev-msix_entries[v].vector, - vp_config_changed, 0, vp_dev-msix_names[v], + err = request_irq(vp_dev-pci_dev-irq, + vp_config_changed, + IRQF_SHARED, + vp_dev-config_msix_name, vp_dev); - if (err) - goto error; - ++vp_dev-msix_used_vectors; - - iowrite16(v, vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Verify we had enough resources to assign the vector */ - v = ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - if (v == VIRTIO_MSI_NO_VECTOR) { - err = -EBUSY; + if (!err) + vp_dev-intx_enabled = 1; + else goto error; - } if (!per_vq_vectors) { /* Shared vector for all VQs */ @@ -535,14 +524,16 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, goto error_request; } else { if (per_vq_vectors) { - /* Best option: one for change interrupt, one per vq. */ - nvectors = 1; + /* Best option: one normal interrupt for change, + one msix per vq. */ + nvectors = 0; for (i = 0; i nvqs; ++i) if (callbacks[i]) ++nvectors; } else { - /* Second best: one for change, shared for all vqs. */ - nvectors = 2; + /* Second best: one normal interrupt for + change, share one msix for all vqs. */ + nvectors = 1; } err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] virtio-rng: complete have_data completion in removing device
On Wed, Aug 06, 2014 at 01:35:15AM +0800, Amos Kong wrote: When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch fixed the hang by completing have_data completion before unregistering a virtio-rng device. Hi Amit, Before applying this patch, it's blocking insider wait_for_completion_killable() Applied this patch, wait_for_completion_killable() returns 0, and vi-data_avail becomes 0, then rng_get_date() will return 0. Is it expected result? Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- drivers/char/hw_random/virtio-rng.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 0027137..416b15c 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -137,6 +137,7 @@ static void remove_common(struct virtio_device *vdev) struct virtrng_info *vi = vdev-priv; vdev-config-reset(vdev); + complete(vi-have_data); vi-busy = false; if (vi-hwrng_register_done) hwrng_unregister(vi-hwrng); -- 1.9.3 ___ Virtualization mailing list virtualizat...@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: When I boot two virtio-rng devices, guest will hang
3.16 (guest hangs with two rng devices) 3.16 + quick fix (can startup with two rng devices) (hotplug issue 1 + hotplug issue 2 exist) lates torvalds/linux.git + amit 4 patches (can startup with two rng devices) (only hotplug issue 2 exists) However, the 4 patches also fixed the hang issue, the hotplug issue was fixed a little. The hotplug issue is effected by the backend, or maybe it's not a real issue, because the rng device can be hot-removed after dd process is killed. Hotplug issue 1: 1. boot up guest with two rng device (rng0 uses /dev/urandom, rng1 uses /dev/random) 2. read data by dd in guest 3 (option 1). hot-remove rng0, then hot-remove rng1 - result: _only rng1_ can't be removed until dd process is killed 3 (option 2). hot-remove rng1, then hot-remove rng0 - result: two devices can be removed successfully, dd process will exit automatically. If we use /dev/urandom for rng0 and rng1, _rng0 rng1_ can be removed, dd process will exit automatically. Hotplug issue 2: If we use /dev/random for rng0 and rng1, _rng0 rng1_ can't be removed until dd process is killed. Hotplug issue 3: If we use /dev/random for rng0 and rng1, _only rng1_ can't be removed until dd process is killed. (The difference between /dev/random and /dev/urandom is the speed.) Thanks, Amos -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] virtio-rng: complete have_data completion in removing device
When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch fixed the hang by completing have_data completion before unregistering a virtio-rng device. Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- drivers/char/hw_random/virtio-rng.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 0027137..416b15c 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -137,6 +137,7 @@ static void remove_common(struct virtio_device *vdev) struct virtrng_info *vi = vdev-priv; vdev-config-reset(vdev); + complete(vi-have_data); vi-busy = false; if (vi-hwrng_register_done) hwrng_unregister(vi-hwrng); -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: When I boot two virtio-rng devices, guest will hang
On Tue, Aug 05, 2014 at 06:28:54PM +0800, Amos Kong wrote: 3.16 (guest hangs with two rng devices) 3.16 + quick fix (can startup with two rng devices) (hotplug issue 1 + hotplug issue 2 exist) lates torvalds/linux.git + amit 4 patches (can startup with two rng devices) (only hotplug issue 2 exists) However, the 4 patches also fixed the hang issue, the hotplug issue was fixed a little. The hotplug issue is effected by the backend, or maybe it's not a real issue, because the rng device can be hot-removed after dd process is killed. Hotplug issue 1: 1. boot up guest with two rng device (rng0 uses /dev/urandom, rng1 uses /dev/random) 2. read data by dd in guest 3 (option 1). hot-remove rng0, then hot-remove rng1 - result: _only rng1_ can't be removed until dd process is killed 3 (option 2). hot-remove rng1, then hot-remove rng0 - result: two devices can be removed successfully, dd process will exit automatically. If we use /dev/urandom for rng0 and rng1, _rng0 rng1_ can be removed, dd process will exit automatically. Hotplug issue 2: If we use /dev/random for rng0 and rng1, _rng0 rng1_ can't be removed until dd process is killed. Hotplug issue 3: If we use /dev/random for rng0 and rng1, _only rng1_ can't be removed until dd process is killed. Hi Amit, I finally found the root problem and posted a fix to upstream: http://lists.linuxfoundation.org/pipermail/virtualization/2014-August/027049.html It can help to fix the hotplug issues on 3.16 latest kernel, so stable kernel is CCed. (The difference between /dev/random and /dev/urandom is the speed.) Thanks, Amos -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
When I boot two virtio-rng devices, guest will hang
QEMU commandline: ./x86_64-softmmu/qemu-system-x86_64 --enable-kvm -m 2000 -drive file=/images/nolvm.qcow2 --kernel /home/devel/linux/arch/x86/boot/bzImage -append ro root=/dev/sda1 console=ttyS0,115200 -monitor unix:/tmp/m,nowait,server -device virtio-net-pci,netdev=h0,vectors=17,mq=on,id=n0 -netdev tap,id=h0,queues=8 -device virtio-net-pci,netdev=h1,vectors=0,mq=on,id=n1 -netdev tap,id=h1,queues=8 -vnc :0 -mon chardev=qmp,mode=control,pretty=on -chardev socket,id=qmp,host=localhost,port=1234,server,nowait -serial stdio -object rng-random,filename=/dev/urandom,id=rng0 -device virtio-rng-pci,rng=rng0,id=h0 -object rng-random,filename=/dev/urandom,id=rng1 -device virtio-rng-pci,rng=rng1,id=h1 It works when I only add one virtio-rng device. Did you touch this problem? I'm using latest net-next/master (ac3d2e5a9ef2f4d8f57c50070c4883ecb7cec29f) [0.00] Initializing cgroup subsys cpuset [0.00] Initializing cgroup subsys cpu [0.00] Initializing cgroup subsys cpuacct [0.00] Linux version 3.16.0-rc6+ (amos@z) (gcc version 4.8.3 20140624 (Red Hat 4.8.3-1) (GCC) ) #363 SMP Mon Jul 28 15:05:44 CST 2014 [0.00] Command line: ro root=/dev/sda1 console=ttyS0,115200 [0.00] e820: BIOS-provided physical RAM map: [0.00] BIOS-e820: [mem 0x-0x0009fbff] usable [0.00] BIOS-e820: [mem 0x0009fc00-0x0009] reserved [0.00] BIOS-e820: [mem 0x000f-0x000f] reserved [0.00] BIOS-e820: [mem 0x0010-0x7cffdfff] usable [0.00] BIOS-e820: [mem 0x7cffe000-0x7cff] reserved [0.00] BIOS-e820: [mem 0xfeffc000-0xfeff] reserved [0.00] BIOS-e820: [mem 0xfffc-0x] reserved [0.00] NX (Execute Disable) protection: active [0.00] SMBIOS 2.8 present. [0.00] Hypervisor detected: KVM [0.00] AGP: No AGP bridge found [0.00] e820: last_pfn = 0x7cffe max_arch_pfn = 0x4 [0.00] PAT not supported by CPU. [0.00] found SMP MP-table at [mem 0x000f0e90-0x000f0e9f] mapped at [880f0e90] [0.00] init_memory_mapping: [mem 0x-0x000f] [0.00] init_memory_mapping: [mem 0x7cc0-0x7cdf] [0.00] init_memory_mapping: [mem 0x7c00-0x7cbf] [0.00] init_memory_mapping: [mem 0x0010-0x7bff] [0.00] init_memory_mapping: [mem 0x7ce0-0x7cffdfff] [0.00] ACPI: Early table checksum verification disabled [0.00] ACPI: RSDP 0x000F0C90 14 (v00 BOCHS ) [0.00] ACPI: RSDT 0x7CFFFEF0 34 (v01 BOCHS BXPCRSDT 0001 BXPC 0001) [0.00] ACPI: FACP 0x7CFFF1D3 74 (v01 BOCHS BXPCFACP 0001 BXPC 0001) [0.00] ACPI: DSDT 0x7CFFE040 001193 (v01 BOCHS BXPCDSDT 0001 BXPC 0001) [0.00] ACPI: FACS 0x7CFFE000 40 [0.00] ACPI: SSDT 0x7CFFF247 000BF9 (v01 BOCHS BXPCSSDT 0001 BXPC 0001) [0.00] ACPI: APIC 0x7CFFFE40 78 (v01 BOCHS BXPCAPIC 0001 BXPC 0001) [0.00] ACPI: HPET 0x7CFFFEB8 38 (v01 BOCHS BXPCHPET 0001 BXPC 0001) [0.00] No NUMA configuration found [0.00] Faking a node at [mem 0x-0x7cffdfff] [0.00] Initmem setup node 0 [mem 0x-0x7cffdfff] [0.00] NODE_DATA [mem 0x7cfea000-0x7cffdfff] [0.00] kvm-clock: Using msrs 4b564d01 and 4b564d00 [0.00] kvm-clock: cpu 0, msr 0:7cfe8001, primary cpu clock [0.00] Zone ranges: [0.00] DMA [mem 0x1000-0x00ff] [0.00] DMA32[mem 0x0100-0x] [0.00] Normal empty [0.00] Movable zone start for each node [0.00] Early memory node ranges [0.00] node 0: [mem 0x1000-0x0009efff] [0.00] node 0: [mem 0x0010-0x7cffdfff] [0.00] ACPI: PM-Timer IO Port: 0x608 [0.00] ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled) [0.00] ACPI: LAPIC_NMI (acpi_id[0xff] dfl dfl lint[0x1]) [0.00] ACPI: IOAPIC (id[0x00] address[0xfec0] gsi_base[0]) [0.00] IOAPIC[0]: apic_id 0, version 17, address 0xfec0, GSI 0-23 [0.00] ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl) [0.00] ACPI: INT_SRC_OVR (bus 0 bus_irq 5 global_irq 5 high level) [0.00] ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level) [0.00] ACPI: INT_SRC_OVR (bus 0 bus_irq 10 global_irq 10 high level) [0.00] ACPI: INT_SRC_OVR (bus 0 bus_irq 11 global_irq 11 high level) [0.00] Using ACPI (MADT) for SMP configuration information [0.00] ACPI: HPET id: 0x8086a201 base: 0xfed0 [0.00] smpboot: Allowing 1 CPUs, 0 hotplug CPUs [0.00] PM: Registered nosave memory: [mem 0x0009f000-0x0009] [0.00] PM:
Re: When I boot two virtio-rng devices, guest will hang
On Mon, Jul 28, 2014 at 01:25:14PM +0530, Amit Shah wrote: On (Mon) 28 Jul 2014 [15:32:42], Amos Kong wrote: QEMU commandline: ./x86_64-softmmu/qemu-system-x86_64 --enable-kvm -m 2000 -drive file=/images/nolvm.qcow2 --kernel /home/devel/linux/arch/x86/boot/bzImage -append ro root=/dev/sda1 console=ttyS0,115200 -monitor unix:/tmp/m,nowait,server -device virtio-net-pci,netdev=h0,vectors=17,mq=on,id=n0 -netdev tap,id=h0,queues=8 -device virtio-net-pci,netdev=h1,vectors=0,mq=on,id=n1 -netdev tap,id=h1,queues=8 -vnc :0 -mon chardev=qmp,mode=control,pretty=on -chardev socket,id=qmp,host=localhost,port=1234,server,nowait -serial stdio -object rng-random,filename=/dev/urandom,id=rng0 -device virtio-rng-pci,rng=rng0,id=h0 -object rng-random,filename=/dev/urandom,id=rng1 -device virtio-rng-pci,rng=rng1,id=h1 It works when I only add one virtio-rng device. Did you touch this problem? I'm using latest net-next/master (ac3d2e5a9ef2f4d8f57c50070c4883ecb7cec29f) Hi Amit, snip [0.223503] Non-volatile memory driver v1.3 [1.172293] tsc: Refined TSC clocksource calibration: 2893.436 MHz qemu: terminating on signal 2 -- (I have to cancel QEMU process by Ctrl + C) This looks similar to what I saw when driver asks for randomness from the host before probe is completed. Does the following patch help? This patch was already inclued in latest net-next/master patch commit: e052dbf554610e2104c5a7518c4d8374bed701bb While the driver is setup (DRIVER_OK is set), the vqs aren't marked usable before the probe for the other device finishes as well. That's not a scenario I considered. We can try to put this patch in 3.16, but it won't be applicable for 3.17, where this is handled the proper way. Also, 3.15 isn't affected, since that doesn't have multi-device support. Also attaching a patch that enables traces in qemu so it's easier to see what's going on. diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index e9b15bc..124cac5 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -34,12 +34,11 @@ struct virtrng_info { unsigned int data_avail; struct completion have_data; bool busy; + bool probe_done; char name[25]; int index; }; -static bool probe_done; - static void random_recv_done(struct virtqueue *vq) { struct virtrng_info *vi = vq-vdev-priv; @@ -73,7 +72,7 @@ static int virtio_read(struct hwrng *rng, void *buf, -size_t size, bool wait) * Don't ask host for data till we're setup. This call can * happen during hwrng_register(), after commit d9e7972619. */ - if (unlikely(!probe_done)) + if (unlikely(!vi-probe_done)) return 0; if (!vi-busy) { @@ -146,7 +145,7 @@ static int probe_common(struct virtio_device *vdev) return err; } - probe_done = true; + vi-probe_done = true; return 0; } Amit From ec1aa555b67628beefa0ac6902baa2cc2e156f58 Mon Sep 17 00:00:00 2001 Message-Id: ec1aa555b67628beefa0ac6902baa2cc2e156f58.1406533638.git.amit.s...@redhat.com From: Amit Shah amit.s...@redhat.com Date: Mon, 21 Jul 2014 14:46:28 +0530 Subject: [PATCH 1/1] virtio-rng: add some trace events Add some trace events to virtio-rng for easier debugging Signed-off-by: Amit Shah amit.s...@redhat.com --- hw/virtio/virtio-rng.c | 7 +++ trace-events | 5 + 2 files changed, 12 insertions(+) diff --git a/hw/virtio/virtio-rng.c b/hw/virtio/virtio-rng.c index 7c5a675..4a6472b 100644 --- a/hw/virtio/virtio-rng.c +++ b/hw/virtio/virtio-rng.c @@ -16,6 +16,7 @@ #include hw/virtio/virtio-rng.h #include sysemu/rng.h #include qom/object_interfaces.h +#include trace.h static bool is_guest_ready(VirtIORNG *vrng) { @@ -24,6 +25,7 @@ static bool is_guest_ready(VirtIORNG *vrng) (vdev-status VIRTIO_CONFIG_S_DRIVER_OK)) { return true; } +trace_virtio_rng_guest_not_ready(vrng); return false; } @@ -62,6 +64,7 @@ static void chr_read(void *opaque, const void *buf, size_t size) offset += len; virtqueue_push(vrng-vq, elem, len); +trace_virtio_rng_pushed(vrng, len); } virtio_notify(vdev, vrng-vq); } @@ -81,7 +84,11 @@ static void virtio_rng_process(VirtIORNG *vrng) quota = MIN((uint64_t)vrng-quota_remaining, (uint64_t)UINT32_MAX); } size = get_request_size(vrng-vq, quota); + +trace_virtio_rng_request(vrng, size, quota); + size = MIN(vrng-quota_remaining, size); + if (size) { rng_backend_request_entropy(vrng-rng, size, chr_read, vrng); } diff --git a/trace-events b/trace-events index 11a17a8..99f39ac 100644 --- a/trace-events +++ b/trace-events @@ -41,6
Re: When I boot two virtio-rng devices, guest will hang
On Mon, Jul 28, 2014 at 02:42:13PM +0530, Amit Shah wrote: On (Mon) 28 Jul 2014 [16:49:20], Amos Kong wrote: On Mon, Jul 28, 2014 at 01:25:14PM +0530, Amit Shah wrote: On (Mon) 28 Jul 2014 [15:32:42], Amos Kong wrote: QEMU commandline: ./x86_64-softmmu/qemu-system-x86_64 --enable-kvm -m 2000 -drive file=/images/nolvm.qcow2 --kernel /home/devel/linux/arch/x86/boot/bzImage -append ro root=/dev/sda1 console=ttyS0,115200 -monitor unix:/tmp/m,nowait,server -device virtio-net-pci,netdev=h0,vectors=17,mq=on,id=n0 -netdev tap,id=h0,queues=8 -device virtio-net-pci,netdev=h1,vectors=0,mq=on,id=n1 -netdev tap,id=h1,queues=8 -vnc :0 -mon chardev=qmp,mode=control,pretty=on -chardev socket,id=qmp,host=localhost,port=1234,server,nowait -serial stdio -object rng-random,filename=/dev/urandom,id=rng0 -device virtio-rng-pci,rng=rng0,id=h0 -object rng-random,filename=/dev/urandom,id=rng1 -device virtio-rng-pci,rng=rng1,id=h1 It works when I only add one virtio-rng device. Did you touch this problem? I'm using latest net-next/master (ac3d2e5a9ef2f4d8f57c50070c4883ecb7cec29f) Hi Amit, snip [0.223503] Non-volatile memory driver v1.3 [1.172293] tsc: Refined TSC clocksource calibration: 2893.436 MHz qemu: terminating on signal 2 -- (I have to cancel QEMU process by Ctrl + C) This looks similar to what I saw when driver asks for randomness from the host before probe is completed. Does the following patch help? This patch was already inclued in latest net-next/master patch commit: e052dbf554610e2104c5a7518c4d8374bed701bb No, it's a different one, goes on top of the commit you referenced. Thanks. This patch fixed the problem. -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: RFC: sharing config interrupt between virtio devices for saving MSI
On Sat, Apr 19, 2014 at 12:08:15PM +0800, Amos Kong wrote: Hi all, I'm working on this item in Upstream Networking todolist: | - Sharing config interrupts |Support more devices by sharing a single msi vector |between multiple virtio devices. |(Applies to virtio-blk too). I have this solution here, and only have draft patches of Solution 1 2, let's discuss if solution 3 is feasible. * We should not introduce perf regression in this change * little effect is acceptable because we are _sharing_ interrupt Solution 1: == share one LSI interrupt for configuration change of all virtio devices. Draft patch: share_lsi_for_all_config.patch Solution 2: == share one MSI interrupt for configuration change of all virtio devices. Draft patch: share_msix_for_all_config.patch Solution 3: == dynamic adjust interrupt policy when device is added or removed Current: --- - try to allocate a MSI to device's config - try to allocate a MSI for each vq - share one MSI for all vqs of one device - share one LSI for config vqs of one device additional dynamic policies: --- - if there is no enough MSI, adjust interrupt allocation for returning some MSI - try to find a device that allocated one MSI for each vq, change it to share one MSI for saving the MSI - try to share one MSI for config of all virtio_pci devices - try to share one LSI for config of all devices - if device is removed, try to re-allocate freed MSI to existed devices, if they aren't in best status (one MSI for config, one MSI for each vq) - if config of all devices is sharing one LSI, try to upgrade it to MSI - if config of all devices is sharing one MSI, try to allocate one MSI for configuration change of each device - try to find a device that is sharing one MSI for all vqs, try to allocate one MSI for each vq Hi Michael, Alex Any thoughts about this ? :) Thanks, Amos diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 101db3f..5ba348d 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -302,6 +302,8 @@ static void vp_free_vectors(struct virtio_device *vdev) vp_dev-msix_affinity_masks = NULL; } +//static msix_entry *config_msix_entry; +static char config_msix_name[100]; static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, bool per_vq_vectors) { @@ -341,14 +343,13 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, /* Set the vector used for configuration */ v = vp_dev-msix_used_vectors; - snprintf(vp_dev-msix_names[v], sizeof *vp_dev-msix_names, + snprintf(config_msix_name, sizeof *vp_dev-msix_names, %s-config, name); - err = request_irq(vp_dev-msix_entries[v].vector, - vp_config_changed, 0, vp_dev-msix_names[v], - vp_dev); + err = request_irq(vp_dev-pci_dev-irq, vp_config_changed, IRQF_SHARED, config_msix_name, vp_dev); if (err) goto error; - ++vp_dev-msix_used_vectors; + +//++vp_dev-msix_used_vectors; iowrite16(v, vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); /* Verify we had enough resources to assign the vector */ @@ -380,7 +381,7 @@ static int vp_request_intx(struct virtio_device *vdev) { int err; struct virtio_pci_device *vp_dev = to_vp_device(vdev); - + printk(KERN_INFO share irq: %d\n, vp_dev-pci_dev-irq); err = request_irq(vp_dev-pci_dev-irq, vp_interrupt, IRQF_SHARED, dev_name(vdev-dev), vp_dev); if (!err) @@ -536,13 +537,13 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, } else { if (per_vq_vectors) { /* Best option: one for change interrupt, one per vq. */ - nvectors = 1; + nvectors = 0; for (i = 0; i nvqs; ++i) if (callbacks[i]) ++nvectors; } else { /* Second best: one for change, shared for all vqs. */ - nvectors = 2; + nvectors = 1; } err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors); diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 101db3f..5ae1844 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -62,6 +62,8 @@ struct virtio_pci_device /* Whether we have vector per vq */ bool per_vq_vectors; + + char config_msix_name[256]; }; /* Constants for MSI-X */ @@ -302,6 +304,8 @@ static void vp_free_vectors(struct virtio_device *vdev) vp_dev-msix_affinity_masks = NULL; } +static
Re: RFC: sharing config interrupt between virtio devices for saving MSI
On Sun, Apr 27, 2014 at 06:03:04PM +0300, Michael S. Tsirkin wrote: On Sun, Apr 27, 2014 at 10:35:41PM +0800, Amos Kong wrote: On Sat, Apr 19, 2014 at 12:08:15PM +0800, Amos Kong wrote: Hi all, I'm working on this item in Upstream Networking todolist: | - Sharing config interrupts |Support more devices by sharing a single msi vector |between multiple virtio devices. |(Applies to virtio-blk too). I have this solution here, and only have draft patches of Solution 1 2, let's discuss if solution 3 is feasible. * We should not introduce perf regression in this change * little effect is acceptable because we are _sharing_ interrupt Solution 1: == share one LSI interrupt for configuration change of all virtio devices. Draft patch: share_lsi_for_all_config.patch Solution 2: == share one MSI interrupt for configuration change of all virtio devices. Draft patch: share_msix_for_all_config.patch Solution 3: == dynamic adjust interrupt policy when device is added or removed Current: --- - try to allocate a MSI to device's config - try to allocate a MSI for each vq - share one MSI for all vqs of one device - share one LSI for config vqs of one device additional dynamic policies: --- - if there is no enough MSI, adjust interrupt allocation for returning some MSI - try to find a device that allocated one MSI for each vq, change it to share one MSI for saving the MSI - try to share one MSI for config of all virtio_pci devices - try to share one LSI for config of all devices - if device is removed, try to re-allocate freed MSI to existed devices, if they aren't in best status (one MSI for config, one MSI for each vq) - if config of all devices is sharing one LSI, try to upgrade it to MSI - if config of all devices is sharing one MSI, try to allocate one MSI for configuration change of each device - try to find a device that is sharing one MSI for all vqs, try to allocate one MSI for each vq Hi Michael, Alex Any thoughts about this ? :) Thanks, Amos I don't think we need to worry about dynamic. Dynamic policy can always get good balance. If we use a fixed policy, devices might share IRQ with some unused msix, or sometimes it's not fair enough for all devices. Questions: just by using IRQF_SHARED, do we always end up with a shared interrupt? Or only if system is running out of vectors? In solution1: always share one LSI for config of all virtio devices In solution2: always share one MSI for config of all virtio devices Did you test that interrupt is actually shared and all handlers are called? Yes. I can find many devices (virtio.0, virtio.1,...) share same interrupt in guest's /proc/interrupts. And nics work well. We don't stop after the 1st handler that returned OK? When we set IRQF_SHARED flag, dev_id is necessary. In irq_wake_thread() it's used to identify the device for which the irq_thread should be woke. So when we share one IRQ, one single interrupt will only wake one handler once. -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm: add detail error message when fail to add ioeventfd
On Thu, May 23, 2013 at 09:46:07AM +0200, Stefan Hajnoczi wrote: On Wed, May 22, 2013 at 09:48:21PM +0800, Amos Kong wrote: On Wed, May 22, 2013 at 11:32:27AM +0200, Stefan Hajnoczi wrote: On Wed, May 22, 2013 at 12:57:35PM +0800, Amos Kong wrote: I try to hotplug 28 * 8 multiple-function devices to guest with old host kernel, ioeventfds in host kernel will be exhausted, then qemu fails to allocate ioeventfds for blk/nic devices. It's better to add detail error here. Signed-off-by: Amos Kong ak...@redhat.com --- kvm-all.c |4 1 files changed, 4 insertions(+), 0 deletions(-) It would be nice to make kvm bus scalable so that the hardcoded in-kernel I/O device limit can be lifted. I had increased kernel NR_IOBUS_DEVS to 1000 (a limitation is needed for security) in last Mar, and make resizing kvm_io_range array dynamical. The maximum should not be hardcoded. File descriptor, maximum memory, etc are all controlled by rlimits. And since ioeventfds are file descriptors they are already limited by the maximum number of file descriptors. For implement the dynamically resize the kvm_io_range array, I re-allocate new array (with new size) and free old array when the array flexes. The array is only resized when add/remove ioeventfds. It will not effect the perf. Why is there a need to impose a hardcoded limit? I will send a patch to fix it. Stefan -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm: exclude ioeventfd from counting kvm_io_range limit
We can easily reach the 1000 limit by start VM with a couple hundred I/O devices (multifunction=on). The hardcode limit already been adjusted 3 times (6 ~ 200 ~ 300 ~ 1000). In userspace, we already have maximum file descriptor to limit ioeventfd count. But kvm_io_bus devices also are used for pit, pic, ioapic, coalesced_mmio. They couldn't be limited by maximum file descriptor. Currently only ioeventfds take too much kvm_io_bus devices, so just exclude it from counting kvm_io_range limit. Also fixed one indent issue in kvm_host.h Signed-off-by: Amos Kong ak...@redhat.com --- include/linux/kvm_host.h | 3 ++- virt/kvm/eventfd.c | 2 ++ virt/kvm/kvm_main.c | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f0eea07..ef261ab 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -144,7 +144,8 @@ struct kvm_io_range { #define NR_IOBUS_DEVS 1000 struct kvm_io_bus { - int dev_count; + int dev_count; + int ioeventfd_count; struct kvm_io_range range[]; }; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 64ee720..1550637 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -753,6 +753,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) if (ret 0) goto unlock_fail; + kvm-buses[bus_idx]-ioeventfd_count++; list_add_tail(p-list, kvm-ioeventfds); mutex_unlock(kvm-slots_lock); @@ -798,6 +799,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) continue; kvm_io_bus_unregister_dev(kvm, bus_idx, p-dev); + kvm-buses[bus_idx]-ioeventfd_count--; ioeventfd_release(p); ret = 0; break; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 302681c..c6d9baf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2926,7 +2926,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, struct kvm_io_bus *new_bus, *bus; bus = kvm-buses[bus_idx]; - if (bus-dev_count NR_IOBUS_DEVS - 1) + /* exclude ioeventfd which is limited by maximum fd */ + if (bus-dev_count - bus-ioeventfd_count NR_IOBUS_DEVS - 1) return -ENOSPC; new_bus = kzalloc(sizeof(*bus) + ((bus-dev_count + 1) * -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm: add detail error message when fail to add ioeventfd
On Wed, May 22, 2013 at 11:32:27AM +0200, Stefan Hajnoczi wrote: On Wed, May 22, 2013 at 12:57:35PM +0800, Amos Kong wrote: I try to hotplug 28 * 8 multiple-function devices to guest with old host kernel, ioeventfds in host kernel will be exhausted, then qemu fails to allocate ioeventfds for blk/nic devices. It's better to add detail error here. Signed-off-by: Amos Kong ak...@redhat.com --- kvm-all.c |4 1 files changed, 4 insertions(+), 0 deletions(-) It would be nice to make kvm bus scalable so that the hardcoded in-kernel I/O device limit can be lifted. I had increased kernel NR_IOBUS_DEVS to 1000 (a limitation is needed for security) in last Mar, and make resizing kvm_io_range array dynamical. Reviewed-by: Stefan Hajnoczi stefa...@redhat.com -- Amos. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm: add detail error message when fail to add ioeventfd
I try to hotplug 28 * 8 multiple-function devices to guest with old host kernel, ioeventfds in host kernel will be exhausted, then qemu fails to allocate ioeventfds for blk/nic devices. It's better to add detail error here. Signed-off-by: Amos Kong ak...@redhat.com --- kvm-all.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 8222729..3d5f7b7 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -813,6 +813,8 @@ static void kvm_mem_ioeventfd_add(MemoryListener *listener, r = kvm_set_ioeventfd_mmio(fd, section-offset_within_address_space, data, true, section-size, match_data); if (r 0) { +fprintf(stderr, %s: error adding ioeventfd: %s\n, +__func__, strerror(-r)); abort(); } } @@ -843,6 +845,8 @@ static void kvm_io_ioeventfd_add(MemoryListener *listener, r = kvm_set_ioeventfd_pio(fd, section-offset_within_address_space, data, true, section-size, match_data); if (r 0) { +fprintf(stderr, %s: error adding ioeventfd: %s\n, +__func__, strerror(-r)); abort(); } } -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [QEMU PATCH v4 2/3] virtio-net: introduce a new macaddr control
On Mon, Jan 21, 2013 at 05:08:26PM +0100, Stefan Hajnoczi wrote: On Sat, Jan 19, 2013 at 09:54:27AM +0800, ak...@redhat.com wrote: @@ -350,6 +351,18 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, struct virtio_net_ctrl_mac mac_data; size_t s; +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) { +if (iov_size(iov, iov_cnt) != ETH_ALEN) { +return VIRTIO_NET_ERR; +} +s = iov_to_buf(iov, iov_cnt, 0, n-mac, sizeof(n-mac)); +if (s != sizeof(n-mac)) { +return VIRTIO_NET_ERR; +} Since iov_size() was checked before iov_to_buf(), we never hit this error. And if we did n-mac would be trashed (i.e. error handling is not complete). You are right. iov_size() computes the size by accounting iov[].iov_lens, the first check is enough. I think assert(s == sizeof(n-mac)) is more appropriate appropriate. Also, please change ETH_ALEN to sizeof(n-mac) to make the relationship between the check and the copy clear. Will update this patch. if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) { if (iov_size(iov, iov_cnt) != sizeof(n-mac)) { return VIRTIO_NET_ERR; } s = iov_to_buf(iov, iov_cnt, 0, n-mac, sizeof(n-mac)); assert(s == sizeof(n-mac)); qemu_format_nic_info_str(n-nic-nc, n-mac); return VIRTIO_NET_OK; } Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [QEMU PATCH v4 1/3] virtio-net: remove layout assumptions for ctrl vq
On Mon, Jan 21, 2013 at 05:03:30PM +0100, Stefan Hajnoczi wrote: On Sat, Jan 19, 2013 at 09:54:26AM +0800, ak...@redhat.com wrote: From: Michael S. Tsirkin m...@redhat.com Virtio-net code makes assumption about virtqueue descriptor layout (e.g. sg[0] is the header, sg[1] is the data buffer). This patch makes code not rely on the layout of descriptors. Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Amos Kong ak...@redhat.com --- hw/virtio-net.c | 128 1 file changed, 74 insertions(+), 54 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 3bb01b1..113e194 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -315,44 +315,44 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) } static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd, - VirtQueueElement *elem) + struct iovec *iov, unsigned int iov_cnt) { uint8_t on; +size_t s; -if (elem-out_num != 2 || elem-out_sg[1].iov_len != sizeof(on)) { -error_report(virtio-net ctrl invalid rx mode command); -exit(1); +s = iov_to_buf(iov, iov_cnt, 0, on, sizeof(on)); +if (s != sizeof(on)) { +return VIRTIO_NET_ERR; } -on = ldub_p(elem-out_sg[1].iov_base); - -if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) +if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) { n-promisc = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) { n-allmulti = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) { n-alluni = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) { n-nomulti = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) { n-nouni = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) { n-nobcast = on; -else +} else { return VIRTIO_NET_ERR; +} return VIRTIO_NET_OK; } static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, - VirtQueueElement *elem) + struct iovec *iov, unsigned int iov_cnt) { struct virtio_net_ctrl_mac mac_data; +size_t s; -if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET || elem-out_num != 3 || -elem-out_sg[1].iov_len sizeof(mac_data) || -elem-out_sg[2].iov_len sizeof(mac_data)) +if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) { return VIRTIO_NET_ERR; +} n-mac_table.in_use = 0; n-mac_table.first_multi = 0; @@ -360,54 +360,71 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, n-mac_table.multi_overflow = 0; memset(n-mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN); -mac_data.entries = ldl_p(elem-out_sg[1].iov_base); +s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries, + sizeof(mac_data.entries)); Hi Stefan, can we adjust the endianness after each iov_to_buf() copy? diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 72d7857..0088d6c 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -321,6 +321,7 @@ static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd, size_t s; s = iov_to_buf(iov, iov_cnt, 0, on, sizeof(on)); +on = ldub_p(on); if (s != sizeof(on)) { return VIRTIO_NET_ERR; } @@ -362,7 +363,7 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries, sizeof(mac_data.entries)); - +mac_data.entries = ldl_p(mac_data.entries); if (s != sizeof(mac_data.entries)) { return VIRTIO_NET_ERR; } @@ -389,7 +390,7 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries, sizeof(mac_data.entries)); - +mac_data.entries = ldl_p(mac_data.entries); if (s != sizeof(mac_data.entries)) { return VIRTIO_NET_ERR; } @@ -421,6 +422,7 @@ static int virtio_net_handle_vlan_table(VirtIONet *n, uint8_t cmd, size_t s; s = iov_to_buf(iov, iov_cnt, 0, vid, sizeof(vid)); +vid = lduw_p(vid); if (s != sizeof(vid)) { return VIRTIO_NET_ERR; } @@ -458,6 +460,8 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) iov = elem.out_sg; iov_cnt = elem.out_num; s = iov_to_buf(iov, iov_cnt, 0, ctrl, sizeof(ctrl)); +ctrl.class = ldub_p(ctrl.class); +ctrl.cmd = ldub_p
[QEMU PATCH v5 3/3] virtio-net: rename ctrl rx commands
This patch makes rx commands consistent with specification. Signed-off-by: Amos Kong ak...@redhat.com --- hw/virtio-net.c | 14 +++--- hw/virtio-net.h | 14 +++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index acef5a5..ac4434e 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -326,17 +326,17 @@ static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd, return VIRTIO_NET_ERR; } -if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) { +if (cmd == VIRTIO_NET_CTRL_RX_PROMISC) { n-promisc = on; -} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) { +} else if (cmd == VIRTIO_NET_CTRL_RX_ALLMULTI) { n-allmulti = on; -} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) { +} else if (cmd == VIRTIO_NET_CTRL_RX_ALLUNI) { n-alluni = on; -} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) { +} else if (cmd == VIRTIO_NET_CTRL_RX_NOMULTI) { n-nomulti = on; -} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) { +} else if (cmd == VIRTIO_NET_CTRL_RX_NOUNI) { n-nouni = on; -} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) { +} else if (cmd == VIRTIO_NET_CTRL_RX_NOBCAST) { n-nobcast = on; } else { return VIRTIO_NET_ERR; @@ -473,7 +473,7 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) iov_discard_front(iov, iov_cnt, sizeof(ctrl)); if (s != sizeof(ctrl)) { status = VIRTIO_NET_ERR; -} else if (ctrl.class == VIRTIO_NET_CTRL_RX_MODE) { +} else if (ctrl.class == VIRTIO_NET_CTRL_RX) { status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, iov_cnt); } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) { status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt); diff --git a/hw/virtio-net.h b/hw/virtio-net.h index 1ec632f..c0bb284 100644 --- a/hw/virtio-net.h +++ b/hw/virtio-net.h @@ -99,13 +99,13 @@ typedef uint8_t virtio_net_ctrl_ack; * 0 and 1 are supported with the VIRTIO_NET_F_CTRL_RX feature. * Commands 2-5 are added with VIRTIO_NET_F_CTRL_RX_EXTRA. */ -#define VIRTIO_NET_CTRL_RX_MODE0 - #define VIRTIO_NET_CTRL_RX_MODE_PROMISC 0 - #define VIRTIO_NET_CTRL_RX_MODE_ALLMULTI 1 - #define VIRTIO_NET_CTRL_RX_MODE_ALLUNI 2 - #define VIRTIO_NET_CTRL_RX_MODE_NOMULTI 3 - #define VIRTIO_NET_CTRL_RX_MODE_NOUNI4 - #define VIRTIO_NET_CTRL_RX_MODE_NOBCAST 5 +#define VIRTIO_NET_CTRL_RX0 + #define VIRTIO_NET_CTRL_RX_PROMISC 0 + #define VIRTIO_NET_CTRL_RX_ALLMULTI 1 + #define VIRTIO_NET_CTRL_RX_ALLUNI 2 + #define VIRTIO_NET_CTRL_RX_NOMULTI 3 + #define VIRTIO_NET_CTRL_RX_NOUNI4 + #define VIRTIO_NET_CTRL_RX_NOBCAST 5 /* * Control the MAC -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[QEMU PATCH v5 1/3] virtio-net: remove layout assumptions for ctrl vq
From: Michael S. Tsirkin m...@redhat.com Virtio-net code makes assumption about virtqueue descriptor layout (e.g. sg[0] is the header, sg[1] is the data buffer). This patch makes code not rely on the layout of descriptors. Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Amos Kong ak...@redhat.com --- hw/virtio-net.c | 129 --- 1 files changed, 75 insertions(+), 54 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 3bb01b1..af1f3a1 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -315,44 +315,44 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) } static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd, - VirtQueueElement *elem) + struct iovec *iov, unsigned int iov_cnt) { uint8_t on; +size_t s; -if (elem-out_num != 2 || elem-out_sg[1].iov_len != sizeof(on)) { -error_report(virtio-net ctrl invalid rx mode command); -exit(1); +s = iov_to_buf(iov, iov_cnt, 0, on, sizeof(on)); +if (s != sizeof(on)) { +return VIRTIO_NET_ERR; } -on = ldub_p(elem-out_sg[1].iov_base); - -if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) +if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) { n-promisc = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) { n-allmulti = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) { n-alluni = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) { n-nomulti = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) { n-nouni = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) { n-nobcast = on; -else +} else { return VIRTIO_NET_ERR; +} return VIRTIO_NET_OK; } static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, - VirtQueueElement *elem) + struct iovec *iov, unsigned int iov_cnt) { struct virtio_net_ctrl_mac mac_data; +size_t s; -if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET || elem-out_num != 3 || -elem-out_sg[1].iov_len sizeof(mac_data) || -elem-out_sg[2].iov_len sizeof(mac_data)) +if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) { return VIRTIO_NET_ERR; +} n-mac_table.in_use = 0; n-mac_table.first_multi = 0; @@ -360,54 +360,72 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, n-mac_table.multi_overflow = 0; memset(n-mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN); -mac_data.entries = ldl_p(elem-out_sg[1].iov_base); +s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries, + sizeof(mac_data.entries)); +mac_data.entries = ldl_p(mac_data.entries); +if (s != sizeof(mac_data.entries)) { +return VIRTIO_NET_ERR; +} +iov_discard_front(iov, iov_cnt, s); -if (sizeof(mac_data.entries) + -(mac_data.entries * ETH_ALEN) elem-out_sg[1].iov_len) +if (mac_data.entries * ETH_ALEN iov_size(iov, iov_cnt)) { return VIRTIO_NET_ERR; +} if (mac_data.entries = MAC_TABLE_ENTRIES) { -memcpy(n-mac_table.macs, elem-out_sg[1].iov_base + sizeof(mac_data), - mac_data.entries * ETH_ALEN); +s = iov_to_buf(iov, iov_cnt, 0, n-mac_table.macs, + mac_data.entries * ETH_ALEN); +if (s != mac_data.entries * ETH_ALEN) { +return VIRTIO_NET_ERR; +} n-mac_table.in_use += mac_data.entries; } else { n-mac_table.uni_overflow = 1; } +iov_discard_front(iov, iov_cnt, mac_data.entries * ETH_ALEN); + n-mac_table.first_multi = n-mac_table.in_use; -mac_data.entries = ldl_p(elem-out_sg[2].iov_base); +s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries, + sizeof(mac_data.entries)); +mac_data.entries = ldl_p(mac_data.entries); +if (s != sizeof(mac_data.entries)) { +return VIRTIO_NET_ERR; +} + +iov_discard_front(iov, iov_cnt, s); -if (sizeof(mac_data.entries) + -(mac_data.entries * ETH_ALEN) elem-out_sg[2].iov_len) +if (mac_data.entries * ETH_ALEN != iov_size(iov, iov_cnt)) { return VIRTIO_NET_ERR; +} -if (mac_data.entries) { -if (n-mac_table.in_use + mac_data.entries = MAC_TABLE_ENTRIES) { -memcpy(n-mac_table.macs + (n-mac_table.in_use * ETH_ALEN), - elem-out_sg[2].iov_base + sizeof(mac_data), - mac_data.entries * ETH_ALEN); -n-mac_table.in_use += mac_data.entries; -} else { -n
[QEMU PATCH v5 0/3] virtio-net: fix of ctrl commands
Currently virtio-net code relys on the layout of descriptor, this patchset removed the assumptions and introduced a control command to set mac address. Last patch is a trivial renaming. V2: check guest's iov_len V3: fix of migration compatibility make mac field in config space read-only when new feature is acked V4: add fix of descriptor layout assumptions, trivial rename V5: fix endianness after iov_to_buf copy Amos Kong (2): virtio-net: introduce a new macaddr control virtio-net: rename ctrl rx commands Michael S. Tsirkin (1): virtio-net: remove layout assumptions for ctrl vq hw/pc_piix.c|4 ++ hw/virtio-net.c | 142 +- hw/virtio-net.h | 26 +++ 3 files changed, 108 insertions(+), 64 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 0/3] make mac programming for virtio net more robust
Currenly mac is programmed byte by byte. This means that we have an intermediate step where mac is wrong. Third patch introduced a new vq control command to set mac address, it's atomic. V2: check return of sending command, delay eth_mac_addr() V3: restore software address when fail to set hardware address V4: split eth_mac_addr, fix error handle V5: rebase patches to net-next tree Amos Kong (2): move virtnet_send_command() above virtnet_set_mac_address() virtio-net: introduce a new control to set macaddr Stefan Hajnoczi (1): net: split eth_mac_addr for better error handling drivers/net/virtio_net.c| 110 ++- include/linux/etherdevice.h |2 + include/uapi/linux/virtio_net.h |8 +++- net/ethernet/eth.c | 41 -- 4 files changed, 106 insertions(+), 55 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 1/3] move virtnet_send_command() above virtnet_set_mac_address()
We want to send vq command to set mac address in virtnet_set_mac_address(), so do this function moving. Fixed a little issue of coding style. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c | 89 ++--- 1 files changed, 44 insertions(+), 45 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a6fcf15..395ab4f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -753,6 +753,50 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } +/* + * Send command via the control virtqueue and check status. Commands + * supported by the hypervisor, as indicated by feature bits, should + * never fail unless improperly formated. + */ +static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, +struct scatterlist *data, int out, int in) +{ + struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; + struct virtio_net_ctrl_hdr ctrl; + virtio_net_ctrl_ack status = ~0; + unsigned int tmp; + int i; + + /* Caller should know better */ + BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) || + (out + in VIRTNET_SEND_COMMAND_SG_MAX)); + + out++; /* Add header */ + in++; /* Add return status */ + + ctrl.class = class; + ctrl.cmd = cmd; + + sg_init_table(sg, out + in); + + sg_set_buf(sg[0], ctrl, sizeof(ctrl)); + for_each_sg(data, s, out + in - 2, i) + sg_set_buf(sg[i + 1], sg_virt(s), s-length); + sg_set_buf(sg[out + in - 1], status, sizeof(status)); + + BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC) 0); + + virtqueue_kick(vi-cvq); + + /* Spin for a response, the kick causes an ioport write, trapping +* into the hypervisor, so the request should be handled immediately. +*/ + while (!virtqueue_get_buf(vi-cvq, tmp)) + cpu_relax(); + + return status == VIRTIO_NET_OK; +} + static int virtnet_set_mac_address(struct net_device *dev, void *p) { struct virtnet_info *vi = netdev_priv(dev); @@ -819,51 +863,6 @@ static void virtnet_netpoll(struct net_device *dev) } #endif -/* - * Send command via the control virtqueue and check status. Commands - * supported by the hypervisor, as indicated by feature bits, should - * never fail unless improperly formated. - */ -static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, -struct scatterlist *data, int out, int in) -{ - struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; - struct virtio_net_ctrl_hdr ctrl; - virtio_net_ctrl_ack status = ~0; - unsigned int tmp; - int i; - - /* Caller should know better */ - BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) || - (out + in VIRTNET_SEND_COMMAND_SG_MAX)); - - out++; /* Add header */ - in++; /* Add return status */ - - ctrl.class = class; - ctrl.cmd = cmd; - - sg_init_table(sg, out + in); - - sg_set_buf(sg[0], ctrl, sizeof(ctrl)); - for_each_sg(data, s, out + in - 2, i) - sg_set_buf(sg[i + 1], sg_virt(s), s-length); - sg_set_buf(sg[out + in - 1], status, sizeof(status)); - - BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC) 0); - - virtqueue_kick(vi-cvq); - - /* -* Spin for a response, the kick causes an ioport write, trapping -* into the hypervisor, so the request should be handled immediately. -*/ - while (!virtqueue_get_buf(vi-cvq, tmp)) - cpu_relax(); - - return status == VIRTIO_NET_OK; -} - static void virtnet_ack_link_announce(struct virtnet_info *vi) { rtnl_lock(); -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 2/3] net: split eth_mac_addr for better error handling
From: Stefan Hajnoczi stefa...@gmail.com When we set mac address, software mac address in system and hardware mac address all need to be updated. Current eth_mac_addr() doesn't allow callers to implement error handling nicely. This patch split eth_mac_addr() to prepare part and real commit part, then we can prepare first, and try to change hardware address, then do the real commit if hardware address is set successfully. Signed-off-by: Stefan Hajnoczi stefa...@gmail.com Signed-off-by: Amos Kong ak...@redhat.com --- include/linux/etherdevice.h |2 ++ net/ethernet/eth.c | 41 +++-- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 1a43e1b..c623861 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -40,6 +40,8 @@ extern int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, extern void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); +extern int eth_prepare_mac_addr_change(struct net_device *dev, void *p); +extern void eth_commit_mac_addr_change(struct net_device *dev, void *p); extern int eth_mac_addr(struct net_device *dev, void *p); extern int eth_change_mtu(struct net_device *dev, int new_mtu); extern int eth_validate_addr(struct net_device *dev); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index bc39c8c..a36c85e 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -272,6 +272,36 @@ void eth_header_cache_update(struct hh_cache *hh, EXPORT_SYMBOL(eth_header_cache_update); /** + * eth_prepare_mac_addr_change - prepare for mac change + * @dev: network device + * @p: socket address + */ +int eth_prepare_mac_addr_change(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + if (!(dev-priv_flags IFF_LIVE_ADDR_CHANGE) netif_running(dev)) + return -EBUSY; + if (!is_valid_ether_addr(addr-sa_data)) + return -EADDRNOTAVAIL; + return 0; +} +EXPORT_SYMBOL(eth_prepare_mac_addr_change); + +/** + * eth_commit_mac_addr_change - commit mac change + * @dev: network device + * @p: socket address + */ +void eth_commit_mac_addr_change(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + memcpy(dev-dev_addr, addr-sa_data, ETH_ALEN); +} +EXPORT_SYMBOL(eth_commit_mac_addr_change); + +/** * eth_mac_addr - set new Ethernet hardware address * @dev: network device * @p: socket address @@ -283,13 +313,12 @@ EXPORT_SYMBOL(eth_header_cache_update); */ int eth_mac_addr(struct net_device *dev, void *p) { - struct sockaddr *addr = p; + int ret; - if (!(dev-priv_flags IFF_LIVE_ADDR_CHANGE) netif_running(dev)) - return -EBUSY; - if (!is_valid_ether_addr(addr-sa_data)) - return -EADDRNOTAVAIL; - memcpy(dev-dev_addr, addr-sa_data, ETH_ALEN); + ret = eth_prepare_mac_addr_change(dev, p); + if (ret 0) + return ret; + eth_commit_mac_addr_change(dev, p); return 0; } EXPORT_SYMBOL(eth_mac_addr); -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5 3/3] virtio-net: introduce a new control to set macaddr
Currently we write MAC address to pci config space byte by byte, this means that we have an intermediate step where mac is wrong. This patch introduced a new control command to set MAC address, it's atomic. VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c| 21 ++--- include/uapi/linux/virtio_net.h |8 +++- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 395ab4f..701408a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -802,14 +802,28 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) struct virtnet_info *vi = netdev_priv(dev); struct virtio_device *vdev = vi-vdev; int ret; + struct sockaddr *addr = p; + struct scatterlist sg; - ret = eth_mac_addr(dev, p); + ret = eth_prepare_mac_addr_change(dev, p); if (ret) return ret; - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + sg_init_one(sg, addr-sa_data, dev-addr_len); + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, + VIRTIO_NET_CTRL_MAC_ADDR_SET, + sg, 1, 0)) { + dev_warn(vdev-dev, +Failed to set mac address by vq command.\n); + return -EINVAL; + } + } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { vdev-config-set(vdev, offsetof(struct virtio_net_config, mac), - dev-dev_addr, dev-addr_len); + addr-sa_data, dev-addr_len); + } + + eth_commit_mac_addr_change(dev, p); return 0; } @@ -1627,6 +1641,7 @@ static unsigned int features[] = { VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, + VIRTIO_NET_F_CTRL_MAC_ADDR, }; static struct virtio_driver virtio_net_driver = { diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 848e358..a5a8c88 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -53,6 +53,7 @@ * network */ #define VIRTIO_NET_F_MQ22 /* Device supports Receive Flow * Steering */ +#define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ #define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */ @@ -127,7 +128,7 @@ typedef __u8 virtio_net_ctrl_ack; #define VIRTIO_NET_CTRL_RX_NOBCAST 5 /* - * Control the MAC filter table. + * Control the MAC * * The MAC filter table is managed by the hypervisor, the guest should * assume the size is infinite. Filtering should be considered @@ -140,6 +141,10 @@ typedef __u8 virtio_net_ctrl_ack; * first sg list contains unicast addresses, the second is for multicast. * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature * is available. + * + * The ADDR_SET command requests one out scatterlist, it contains a + * 6 bytes MAC address. This functionality is present if the + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available. */ struct virtio_net_ctrl_mac { __u32 entries; @@ -148,6 +153,7 @@ struct virtio_net_ctrl_mac { #define VIRTIO_NET_CTRL_MAC1 #define VIRTIO_NET_CTRL_MAC_TABLE_SET0 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 /* * Control VLAN filtering -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 2/2] virtio-net: introduce a new control to set macaddr
On Fri, Jan 18, 2013 at 12:00:42PM +0100, Stefan Hajnoczi wrote: On Thu, Jan 17, 2013 at 06:40:12PM +0800, ak...@redhat.com wrote: diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 395ab4f..837c978 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -802,14 +802,32 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) struct virtnet_info *vi = netdev_priv(dev); struct virtio_device *vdev = vi-vdev; int ret; + struct scatterlist sg; + char save_addr[ETH_ALEN]; + unsigned char save_aatype; + + memcpy(save_addr, dev-dev_addr, ETH_ALEN); + save_aatype = dev-addr_assign_type; ret = eth_mac_addr(dev, p); if (ret) return ret; - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + sg_init_one(sg, dev-dev_addr, dev-addr_len); + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, + VIRTIO_NET_CTRL_MAC_ADDR_SET, + sg, 1, 0)) { + dev_warn(vdev-dev, +Failed to set mac address by vq command.\n); + memcpy(dev-dev_addr, save_addr, ETH_ALEN); + dev-addr_assign_type = save_aatype; + return -EINVAL; + } eth_mac_addr() doesn't allow callers to implement error handling nicely. Although you didn't duplicate it's code directly, this patch still leaks internals of eth_mac_addr(). How about splitting eth_mac_addr() in a separate patch: Agree, then we can have nice error handling. I will send a V4 and include this patch. Thanks. int eth_prepare_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; if (!(dev-priv_flags IFF_LIVE_ADDR_CHANGE) netif_running(dev)) return -EBUSY; if (!is_valid_ether_addr(addr-sa_data)) return -EADDRNOTAVAIL; return 0; } ... Now virtio_net.c does: ret = eth_prepare_mac_addr_change(dev, p); if (ret 0) return ret; if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { sg_init_one(sg, dev-dev_addr, dev-addr_len); trivial issue, %s/dev-dev_addr/addr-sa_data if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_ADDR_SET, sg, 1, 0)) { dev_warn(vdev-dev, Failed to set mac address by vq command.\n); return -EINVAL; } } ... eth_commit_mac_addr_change(dev, p); return 0; Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [QEMU PATCH v4 1/3] virtio-net: remove layout assumptions for ctrl vq
On Sat, Jan 19, 2013 at 09:54:26AM +0800, ak...@redhat.com wrote: From: Michael S. Tsirkin m...@redhat.com Virtio-net code makes assumption about virtqueue descriptor layout (e.g. sg[0] is the header, sg[1] is the data buffer). This patch makes code not rely on the layout of descriptors. Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Amos Kong ak...@redhat.com --- hw/virtio-net.c | 128 1 file changed, 74 insertions(+), 54 deletions(-) Had tested this patch with following scenarios: VIRTIO_NET_CTRL_RX_MODE 1) ip link eth0 promisc on/off 2) ip link set eth0 allmulticast on/off 3) ip link set eth0 multicast on/off VIRTIO_NET_CTRL_MAC 4) ifconfig eth0 hw ether 52:54:00:12:34:57 5) ping guest after joined guest into multicast group (225.0.0.1 ~ 225.0.0.10) VIRTIO_NET_CTRL_VLAN 6) vconfig add eth0 2; vconfig rem eth0.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [QEMU PATCH v2] virtio-net: introduce a new macaddr control
On Thu, Jan 17, 2013 at 01:45:11PM +0800, Amos Kong wrote: On Thu, Jan 17, 2013 at 11:49:20AM +1030, Rusty Russell wrote: ak...@redhat.com writes: @@ -349,6 +351,14 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, { struct virtio_net_ctrl_mac mac_data; +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET elem-out_num == 2 +elem-out_sg[1].iov_len == ETH_ALEN) { +/* Set MAC address */ +memcpy(n-mac, elem-out_sg[1].iov_base, elem-out_sg[1].iov_len); +qemu_format_nic_info_str(n-nic-nc, n-mac); +return VIRTIO_NET_OK; +} Does the rest of the net device still rely on the layout of descriptors? No, only info string of net client relies on n-mac I misunderstood. There is no clear limitation of how much descriptor are used for each vq command, but many commands rely on the layout of descriptiors. eg: virtio-net: VIRTIO_NET_CTRL_RX_PROMISC VIRTIO_NET_CTRL_RX_ALLMULTI VIRTIO_NET_CTRL_MAC_TABLE_SET etc If so, OK, we'll fix them all together. If not, this introduces a new one. Cheers, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [QEMU PATCH v2] virtio-net: introduce a new macaddr control
On Thu, Jan 17, 2013 at 02:13:08PM +0200, Michael S. Tsirkin wrote: On Thu, Jan 17, 2013 at 11:49:20AM +1030, Rusty Russell wrote: ak...@redhat.com writes: @@ -349,6 +351,14 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, { struct virtio_net_ctrl_mac mac_data; +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET elem-out_num == 2 +elem-out_sg[1].iov_len == ETH_ALEN) { +/* Set MAC address */ +memcpy(n-mac, elem-out_sg[1].iov_base, elem-out_sg[1].iov_len); +qemu_format_nic_info_str(n-nic-nc, n-mac); +return VIRTIO_NET_OK; +} Does the rest of the net device still rely on the layout of descriptors? If so, OK, we'll fix them all together. If not, this introduces a new one. Cheers, Rusty. The following fixes all existing users. Got to deal with some urgent stuff so did not test yet - Amos, would you like to include this in your patchset and build on it, test it all together? No problem. If not I'll get to it next week. --- virtio-net: remove layout assumptions for ctrl vq Signed-off-by: Michael S. Tsirkin m...@redhat.com ... -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 2/2] virtio-net: introduce a new control to set macaddr
On Wed, Jan 16, 2013 at 02:20:39PM +0800, Jason Wang wrote: On Wednesday, January 16, 2013 01:57:01 PM ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com Currently we write MAC address to pci config space byte by byte, this means that we have an intermediate step where mac is wrong. This patch introduced a new control command to set MAC address in one time. VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c| 24 +--- include/uapi/linux/virtio_net.h | 8 +++- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 395ab4f..c8901b6 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -802,16 +802,25 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) struct virtnet_info *vi = netdev_priv(dev); struct virtio_device *vdev = vi-vdev; int ret; + struct sockaddr *addr = p; + struct scatterlist sg; - ret = eth_mac_addr(dev, p); - if (ret) - return ret; - - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + sg_init_one(sg, addr-sa_data, dev-addr_len); + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, + VIRTIO_NET_CTRL_MAC_ADDR_SET, + sg, 1, 0)) { + dev_warn(vdev-dev, +Failed to set mac address by vq command.\n); + return -EINVAL; + } + } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { vdev-config-set(vdev, offsetof(struct virtio_net_config, mac), - dev-dev_addr, dev-addr_len); + addr-sa_data, dev-addr_len); + } + ret = eth_mac_addr(dev, p); The you will the validity check in eth_mac_addr which may result a wrong mac address to be set in the hardware (or is there any check in qemu) and a inconsistency bettween what kernel assumes and qemu has. You can take a look at netvsc driver that calls eth_mac_addr() first and restore the software mac address when fail to enforce it to hardware. Thanks for the catching, I will move eth_mac_addr() back to above, just restore addr if fail to send command. I will also use DEFINE_PROP_BIT to fix migration issue, thanks. Thanks - return 0; + return ret; } -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [QEMU PATCH v2] virtio-net: introduce a new macaddr control
On Thu, Jan 17, 2013 at 11:49:20AM +1030, Rusty Russell wrote: ak...@redhat.com writes: @@ -349,6 +351,14 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, { struct virtio_net_ctrl_mac mac_data; +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET elem-out_num == 2 +elem-out_sg[1].iov_len == ETH_ALEN) { +/* Set MAC address */ +memcpy(n-mac, elem-out_sg[1].iov_base, elem-out_sg[1].iov_len); +qemu_format_nic_info_str(n-nic-nc, n-mac); +return VIRTIO_NET_OK; +} Does the rest of the net device still rely on the layout of descriptors? No, only info string of net client relies on n-mac If so, OK, we'll fix them all together. If not, this introduces a new one. Cheers, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] virtio-spec: set mac address by a new vq command
On Wed, Jan 16, 2013 at 11:22:23AM +0200, Michael S. Tsirkin wrote: On Wed, Jan 16, 2013 at 10:13:28AM +0100, Stefan Hajnoczi wrote: On Wed, Jan 16, 2013 at 03:33:24PM +0800, ak...@redhat.com wrote: +\change_inserted -1930653948 1358320004 +The command VIRTIO_NET_CTRL_MAC_ADDR_SET is used to set +\begin_inset Quotes eld +\end_inset + +physical +\begin_inset Quotes erd +\end_inset + + address of the network card. The physical address of the network card? That term is not defined anywhere in the specification. I will replace it with 'the default MAC address' Perhaps it's best to explain that the config space mac field and VIRTIO_NET_CTRL_MAC_ADDR_SET both set the default MAC address which rx filtering accepts. (The MAC table is an additional set of MAC addresses which rx filtering accepts.) It would also be worth explaining that VIRTIO_NET_CTRL_MAC_ADDR_SET is atomic whereas the config space mac field is not. Therefore, VIRTIO_NET_CTRL_MAC_ADDR_SET is preferred, especially while the NIC is up. Ok, will send a v2, thanks. Stefan It's probably best to simply make the config space field read-only if the feature bit is acked. It's reasonable. diff --git a/hw/virtio-net.c b/hw/virtio-net.c index c18e276..54c5eae 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -93,7 +93,8 @@ static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config) memcpy(netcfg, config, sizeof(netcfg)); -if (memcmp(netcfg.mac, n-mac, ETH_ALEN)) { +if (!(n-vdev.guest_features VIRTIO_NET_F_CTRL_MAC_ADDR 1) +memcmp(netcfg.mac, n-mac, ETH_ALEN)) { memcpy(n-mac, netcfg.mac, ETH_ALEN); qemu_format_nic_info_str(n-nic-nc, n-mac); } -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC PATCH 2/2] virtio-net: introduce a new control to set macaddr
On Thu, Jan 10, 2013 at 10:57:05PM +0800, Jason Wang wrote: On 01/10/2013 10:45 PM, ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com Currently we write MAC address to pci config space byte by byte, this means that we have an intermediate step where mac is wrong. This patch introduced a new control command to set MAC address in one time. VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c| 16 +++- include/uapi/linux/virtio_net.h | 8 +++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 395ab4f..ff22bcd 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -803,13 +803,26 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) struct virtio_device *vdev = vi-vdev; int ret; + struct scatterlist sg; + ret = eth_mac_addr(dev, p); if (ret) return ret; - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + /* Set MAC address by sending vq command */ + sg_init_one(sg, dev-dev_addr, dev-addr_len); + virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, +VIRTIO_NET_CTRL_MAC_ADDR_SET, +sg, 1, 0); + return 0; + } + Better to check the return of virtnet_send_command() and give a warn like other command. Btw, need we fail back to try the old way then? Yes, it's necessary to check the return value of virtnet_send_command(). In fail case, I like to return -EINVAL to userspace, because we don't only want to set mac successfully, we also want to resolve the addr inconsistent issue by this feature (vq cmd). -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/2] make mac programming for virtio net more robust
On Thu, Jan 10, 2013 at 10:45:39PM +0800, ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com Currenly mac is programmed byte by byte. This means that we have an intermediate step where mac is wrong. Second patch introduced a new vq control command to set mac address in one time. I just posted RFC patches (kernel qemu) out, specifiction update will be sent when the solution is accepted. MST has another solution: 1. add a feature to disable mac address in pci space 2. introduce a new vq control cmd to add new mac address to mac filter table. (no long check n-mac in receiver_filter()) Welcome to give comments, thanks! Amos -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: Fix SREGS documentation reference
On Tue, Dec 11, 2012 at 9:38 PM, Mihai Caraman mihai.cara...@freescale.com wrote: Reflect the uapi folder change in SREGS API documentation. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- Documentation/virtual/kvm/api.txt |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a4df553..9cf591d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -345,7 +345,7 @@ struct kvm_sregs { __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; -/* ppc -- see arch/powerpc/include/asm/kvm.h */ +/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */ Trivial fix. Reviewed-by: Amos Kong kongjian...@gmail.com interrupt_bitmap is a bitmap of pending external interrupts. At most one bit may be set. This interrupt has been acknowledged by the APIC -- 1.7.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: Fix SREGS documentation reference
On Tue, Dec 11, 2012 at 9:38 PM, Mihai Caraman mihai.cara...@freescale.com wrote: Reflect the uapi folder change in SREGS API documentation. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- Documentation/virtual/kvm/api.txt |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a4df553..9cf591d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -345,7 +345,7 @@ struct kvm_sregs { __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; -/* ppc -- see arch/powerpc/include/asm/kvm.h */ +/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */ Trivial fix. Reviewed-by: Amos Kong kongjian...@gmail.com interrupt_bitmap is a bitmap of pending external interrupts. At most one bit may be set. This interrupt has been acknowledged by the APIC -- 1.7.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: qemu-kvm: remove boot=on|off drive parameter compatibility
- Original Message - Option is deprecated and warning has been in place for one year. Signed-off-by: Marcelo Tosatti mtosa...@redhat.com diff --git a/blockdev.c b/blockdev.c index 4a5266e..7c83baa 100644 --- a/blockdev.c +++ b/blockdev.c @@ -432,12 +432,6 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) return NULL; } -if (qemu_opt_get(opts, boot) != NULL) { -fprintf(stderr, qemu-kvm: boot=on|off is deprecated and will be -ignored. Future versions will reject this parameter. Please -update your scripts.\n); -} - on_write_error = BLOCK_ERR_STOP_ENOSPC; if ((buf = qemu_opt_get(opts, werror)) != NULL) { if (type != IF_IDE type != IF_SCSI type != IF_VIRTIO type != IF_NONE) { diff --git a/qemu-config.c b/qemu-config.c index 3eaee48..eba977e 100644 --- a/qemu-config.c +++ b/qemu-config.c @@ -114,10 +114,6 @@ static QemuOptsList qemu_drive_opts = { .name = copy-on-read, .type = QEMU_OPT_BOOL, .help = copy read data from backing file into image file, -},{ -.name = boot, -.type = QEMU_OPT_BOOL, -.help = (deprecated, ignored), }, { /* end of list */ } }, Reviewed-by: Amos Kong ak...@redhat.com -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/1] kvmclock: fix guest stop notification
- Original Message - On Thu, Sep 20, 2012 at 09:46:41AM -0300, Marcelo Tosatti wrote: On Thu, Sep 20, 2012 at 01:55:20PM +0530, Amit Shah wrote: Commit f349c12c0434e29c79ecde89029320c4002f7253 added the guest stop In commitlog of f349c12c0434e29c79ecde89029320c4002f7253: ## This patch uses the qemu Notifier system to tell the guest it _is about to be_ stopped notification, but it did it in a way that the stop notification would never reach the kernel. The kvm_vm_state_changed() function gets a value of 0 for the 'running' parameter when the VM is stopped, making all the code added previously dead code. This patch reworks the code so that it's called when 'running' is 0, which indicates the VM was stopped. Amit, did you touch any real issue? guest gets call trace with current code? which kind of context? Someone told me he got call trace when shutdown guest by 'init 0', I didn't verify this issue. CC: Eric B Munson emun...@mgebm.net CC: Raghavendra K T raghavendra...@linux.vnet.ibm.com CC: Andreas Färber afaer...@suse.de CC: Marcelo Tosatti mtosa...@redhat.com CC: Paolo Bonzini pbonz...@redhat.com CC: Laszlo Ersek ler...@redhat.com Signed-off-by: Amit Shah amit.s...@redhat.com --- hw/kvm/clock.c | 21 +++-- 1 files changed, 11 insertions(+), 10 deletions(-) diff --git a/hw/kvm/clock.c b/hw/kvm/clock.c index 824b978..f3427eb 100644 --- a/hw/kvm/clock.c +++ b/hw/kvm/clock.c @@ -71,18 +71,19 @@ static void kvmclock_vm_state_change(void *opaque, int running, I found this function is only called when resume vm (here running is 1, it means vm is already resumed? we don't call that ioctl _before_ resume). kvmclock_vm_state_change() is not called when I stop vm through qemu monitor command. if (running) { s-clock_valid = false; +return; +} -if (!cap_clock_ctrl) { -return; -} -for (penv = first_cpu; penv != NULL; penv = penv-next_cpu) { -ret = kvm_vcpu_ioctl(penv, KVM_KVMCLOCK_CTRL, 0); -if (ret) { -if (ret != -EINVAL) { -fprintf(stderr, %s: %s\n, __func__, strerror(-ret)); -} -return; +if (!cap_clock_ctrl) { +return; +} +for (penv = first_cpu; penv != NULL; penv = penv-next_cpu) { +ret = kvm_vcpu_ioctl(penv, KVM_KVMCLOCK_CTRL, 0); +if (ret) { +if (ret != -EINVAL) { +fprintf(stderr, %s: %s\n, __func__, strerror(-ret)); } +return; } } } -- 1.7.7.6 ACK Avi, please merge through uq/master. NACK, guest should be notified when the VM is starting, not when stopping. # from api.txt ioctl (KVM_CAP_KVMCLOCK_CTRL) can be called any time _after_ pausing the vcpu, but _before_ it is resumed. Thanks, Amos -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html