[PATCH v5 REPOST 3/6] hw_random: use reference counts on each struct hwrng.
From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. v5: drop redundant kref_init() v4: decrease last reference for triggering the cleanup v3: initialize kref (thanks Amos Kong) v2: fix missing put_rng() on exit path (thanks Amos Kong) Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 135 -- include/linux/hw_random.h | 2 + 2 files changed, 94 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c8..83516cb 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -42,6 +42,7 @@ #include linux/delay.h #include linux/slab.h #include linux/random.h +#include linux/err.h #include asm/uaccess.h @@ -91,6 +92,60 @@ static void add_early_randomness(struct hwrng *rng) add_device_randomness(bytes, bytes_read); } +static inline void cleanup_rng(struct kref *kref) +{ + struct hwrng *rng = container_of(kref, struct hwrng, ref); + + if (rng-cleanup) + rng-cleanup(rng); +} + +static void set_current_rng(struct hwrng *rng) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + kref_get(rng-ref); + current_rng = rng; +} + +static void drop_current_rng(void) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + /* decrease last reference for triggering the cleanup */ + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; +} + +/* Returns ERR_PTR(), NULL or refcounted hwrng */ +static struct hwrng *get_current_rng(void) +{ + struct hwrng *rng; + + if (mutex_lock_interruptible(rng_mutex)) + return ERR_PTR(-ERESTARTSYS); + + rng = current_rng; + if (rng) + kref_get(rng-ref); + + mutex_unlock(rng_mutex); + return rng; +} + +static void put_rng(struct hwrng *rng) +{ + /* +* Hold rng_mutex here so we serialize in case they set_current_rng +* on rng again immediately. +*/ + mutex_lock(rng_mutex); + if (rng) + kref_put(rng-ref, cleanup_rng); + mutex_unlock(rng_mutex); +} + static inline int hwrng_init(struct hwrng *rng) { if (rng-init) { @@ -113,12 +168,6 @@ static inline int hwrng_init(struct hwrng *rng) return 0; } -static inline void hwrng_cleanup(struct hwrng *rng) -{ - if (rng rng-cleanup) - rng-cleanup(rng); -} - static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ @@ -154,21 +203,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,8 +250,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); + put_rng(rng); if (need_resched()) schedule_timeout_interruptible(1); @@ -213,12 +263,11 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex); - goto out_unlock; + put_rng(rng); + goto out; } @@ -257,8 +306,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev
[PATCH v5 REPOST 5/6] hw_random: don't double-check old_rng.
From: Rusty Russell ru...@rustcorp.com.au Interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 067270b..a9286bf 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -476,14 +476,13 @@ int hwrng_register(struct hwrng *rng) } old_rng = current_rng; + err = 0; if (!old_rng) { err = hwrng_init(rng); if (err) goto out_unlock; set_current_rng(rng); - } - err = 0; - if (!old_rng) { + err = register_miscdev(); if (err) { drop_current_rng(); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 REPOST 0/6] fix hw_random stuck
When I hotunplug a busy virtio-rng device or try to access hwrng attributes in non-smp guest, it gets stuck. My hotplug tests: | test 0: | hotunplug rng device from qemu monitor | | test 1: | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 2: | guest) # dd if=/dev/random of=/dev/null | hotunplug rng device from qemu monitor | | test 4: | guest) # dd if=/dev/hwrng of=/dev/null | cat /sys/devices/virtual/misc/hw_random/rng_* | | test 5: | guest) # dd if=/dev/hwrng of=/dev/null | cancel dd process after 10 seconds | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 6: | use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo V5: reset cleanup_done flag, drop redundant init of reference count, use compiler barrier to prevent recording. V4: update patch 4 to fix corrupt, decrease last reference for triggering the cleanup, fix unregister race pointed by Herbert V3: initialize kref to 1 V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference counting issue Amos Kong (1): hw_random: move some code out mutex_lock for avoiding underlying deadlock Rusty Russell (5): hw_random: place mutex around read functions and buffers. hw_random: use reference counts on each struct hwrng. hw_random: fix unregister race. hw_random: don't double-check old_rng. hw_random: don't init list element we're about to add to list. drivers/char/hw_random/core.c | 173 ++ include/linux/hw_random.h | 3 + 2 files changed, 126 insertions(+), 50 deletions(-) -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 REPOST 1/6] hw_random: place mutex around read functions and buffers.
From: Rusty Russell ru...@rustcorp.com.au There's currently a big lock around everything, and it means that we can't query sysfs (eg /sys/devices/virtual/misc/hw_random/rng_current) while the rng is reading. This is a real problem when the rng is slow, or blocked (eg. virtio_rng with qemu's default /dev/random backend) This doesn't help (it leaves the current lock untouched), just adds a lock to protect the read function and the static buffers, in preparation for transition. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..b1b6042 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -53,7 +53,10 @@ static struct hwrng *current_rng; static struct task_struct *hwrng_fill; static LIST_HEAD(rng_list); +/* Protects rng_list and current_rng */ static DEFINE_MUTEX(rng_mutex); +/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */ +static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; static unsigned short current_quality; @@ -81,7 +84,9 @@ static void add_early_randomness(struct hwrng *rng) unsigned char bytes[16]; int bytes_read; + mutex_lock(reading_mutex); bytes_read = rng_get_data(rng, bytes, sizeof(bytes), 1); + mutex_unlock(reading_mutex); if (bytes_read 0) add_device_randomness(bytes, bytes_read); } @@ -128,6 +133,7 @@ static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size, int wait) { int present; + BUG_ON(!mutex_is_locked(reading_mutex)); if (rng-read) return rng-read(rng, (void *)buffer, size, wait); @@ -160,13 +166,14 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, goto out_unlock; } + mutex_lock(reading_mutex); if (!data_avail) { bytes_read = rng_get_data(current_rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { err = bytes_read; - goto out_unlock; + goto out_unlock_reading; } data_avail = bytes_read; } @@ -174,7 +181,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (!data_avail) { if (filp-f_flags O_NONBLOCK) { err = -EAGAIN; - goto out_unlock; + goto out_unlock_reading; } } else { len = data_avail; @@ -186,7 +193,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (copy_to_user(buf + ret, rng_buffer + data_avail, len)) { err = -EFAULT; - goto out_unlock; + goto out_unlock_reading; } size -= len; @@ -194,6 +201,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + mutex_unlock(reading_mutex); if (need_resched()) schedule_timeout_interruptible(1); @@ -208,6 +216,9 @@ out: out_unlock: mutex_unlock(rng_mutex); goto out; +out_unlock_reading: + mutex_unlock(reading_mutex); + goto out_unlock; } @@ -348,13 +359,16 @@ static int hwrng_fillfn(void *unused) while (!kthread_should_stop()) { if (!current_rng) break; + mutex_lock(reading_mutex); rc = rng_get_data(current_rng, rng_fillbuf, rng_buffer_size(), 1); + mutex_unlock(reading_mutex); if (rc = 0) { pr_warn(hwrng: no data available\n); msleep_interruptible(1); continue; } + /* Outside lock, sure, but y'know: randomness. */ add_hwgenerator_randomness((void *)rng_fillbuf, rc, rc * current_quality * 8 10); } -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 REPOST 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock
In next patch, we use reference counting for each struct hwrng, changing reference count also needs to take mutex_lock. Before releasing the lock, if we try to stop a kthread that waits to take the lock to reduce the referencing count, deadlock will occur. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index b1b6042..a0905c8 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng) } } if (list_empty(rng_list)) { + mutex_unlock(rng_mutex); unregister_miscdev(); if (hwrng_fill) kthread_stop(hwrng_fill); - } - - mutex_unlock(rng_mutex); + } else + mutex_unlock(rng_mutex); } EXPORT_SYMBOL_GPL(hwrng_unregister); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 REPOST 4/6] hw_random: fix unregister race.
From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. v5: reset cleanup_done flag, use compiler barrier to prevent recording. v4: add cleanup_done flag to insure that cleanup is done Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 12 include/linux/hw_random.h | 1 + 2 files changed, 13 insertions(+) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 83516cb..067270b 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; +static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to off */ @@ -98,6 +99,11 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + + /* cleanup_done should be updated after cleanup finishes */ + smp_wmb(); + rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -498,6 +504,8 @@ int hwrng_register(struct hwrng *rng) add_early_randomness(rng); } + rng-cleanup_done = false; + out_unlock: mutex_unlock(rng_mutex); out: @@ -529,6 +537,10 @@ void hwrng_unregister(struct hwrng *rng) kthread_stop(hwrng_fill); } else mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); } EXPORT_SYMBOL_GPL(hwrng_unregister); diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index c212e71..7832e50 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -46,6 +46,7 @@ struct hwrng { /* internal. */ struct list_head list; struct kref ref; + bool cleanup_done; }; /** Register a new Hardware Random Number Generator driver. */ -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 REPOST 6/6] hw_random: don't init list element we're about to add to list.
From: Rusty Russell ru...@rustcorp.com.au Another interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a9286bf..4d13ac5 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -489,7 +489,6 @@ int hwrng_register(struct hwrng *rng) goto out_unlock; } } - INIT_LIST_HEAD(rng-list); list_add_tail(rng-list, rng_list); if (old_rng !rng-init) { -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v4 4/6] hw_random: fix unregister race.
On Wed, Nov 12, 2014 at 02:33:00PM +1030, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. v4: add cleanup_done flag to insure that cleanup is done That's a bit weird. The usual pattern would be to hold a reference until we're actually finished, but this reference is a bit weird. We hold the mutex across cleanup, so we could grab that but we have to take care sleeping inside wait_event, otherwise Peter will have to fix my code again :) AFAICT the wake_woken() stuff isn't merged yet, so your patch will have to do for now. @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -536,6 +539,11 @@ void hwrng_unregister(struct hwrng *rng) kthread_stop(hwrng_fill); } else mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); + The atomic_read() isn't necessary here. At least, we need it to convert refcount from atomic_t to int. Otherwise, we will touch this error: error: invalid operands to binary == (have 'atomic_t' and 'int') However, you should probably init cleanup_done in hwrng_register(). (Probably noone does unregister then register, but let's be clear). Thanks, Rusty. -- Amos. signature.asc Description: Digital signature ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock
In next patch, we use reference counting for each struct hwrng, changing reference count also needs to take mutex_lock. Before releasing the lock, if we try to stop a kthread that waits to take the lock to reduce the referencing count, deadlock will occur. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index b1b6042..a0905c8 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng) } } if (list_empty(rng_list)) { + mutex_unlock(rng_mutex); unregister_miscdev(); if (hwrng_fill) kthread_stop(hwrng_fill); - } - - mutex_unlock(rng_mutex); + } else + mutex_unlock(rng_mutex); } EXPORT_SYMBOL_GPL(hwrng_unregister); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 1/6] hw_random: place mutex around read functions and buffers.
From: Rusty Russell ru...@rustcorp.com.au There's currently a big lock around everything, and it means that we can't query sysfs (eg /sys/devices/virtual/misc/hw_random/rng_current) while the rng is reading. This is a real problem when the rng is slow, or blocked (eg. virtio_rng with qemu's default /dev/random backend) This doesn't help (it leaves the current lock untouched), just adds a lock to protect the read function and the static buffers, in preparation for transition. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..b1b6042 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -53,7 +53,10 @@ static struct hwrng *current_rng; static struct task_struct *hwrng_fill; static LIST_HEAD(rng_list); +/* Protects rng_list and current_rng */ static DEFINE_MUTEX(rng_mutex); +/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */ +static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; static unsigned short current_quality; @@ -81,7 +84,9 @@ static void add_early_randomness(struct hwrng *rng) unsigned char bytes[16]; int bytes_read; + mutex_lock(reading_mutex); bytes_read = rng_get_data(rng, bytes, sizeof(bytes), 1); + mutex_unlock(reading_mutex); if (bytes_read 0) add_device_randomness(bytes, bytes_read); } @@ -128,6 +133,7 @@ static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size, int wait) { int present; + BUG_ON(!mutex_is_locked(reading_mutex)); if (rng-read) return rng-read(rng, (void *)buffer, size, wait); @@ -160,13 +166,14 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, goto out_unlock; } + mutex_lock(reading_mutex); if (!data_avail) { bytes_read = rng_get_data(current_rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { err = bytes_read; - goto out_unlock; + goto out_unlock_reading; } data_avail = bytes_read; } @@ -174,7 +181,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (!data_avail) { if (filp-f_flags O_NONBLOCK) { err = -EAGAIN; - goto out_unlock; + goto out_unlock_reading; } } else { len = data_avail; @@ -186,7 +193,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (copy_to_user(buf + ret, rng_buffer + data_avail, len)) { err = -EFAULT; - goto out_unlock; + goto out_unlock_reading; } size -= len; @@ -194,6 +201,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + mutex_unlock(reading_mutex); if (need_resched()) schedule_timeout_interruptible(1); @@ -208,6 +216,9 @@ out: out_unlock: mutex_unlock(rng_mutex); goto out; +out_unlock_reading: + mutex_unlock(reading_mutex); + goto out_unlock; } @@ -348,13 +359,16 @@ static int hwrng_fillfn(void *unused) while (!kthread_should_stop()) { if (!current_rng) break; + mutex_lock(reading_mutex); rc = rng_get_data(current_rng, rng_fillbuf, rng_buffer_size(), 1); + mutex_unlock(reading_mutex); if (rc = 0) { pr_warn(hwrng: no data available\n); msleep_interruptible(1); continue; } + /* Outside lock, sure, but y'know: randomness. */ add_hwgenerator_randomness((void *)rng_fillbuf, rc, rc * current_quality * 8 10); } -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 3/6] hw_random: use reference counts on each struct hwrng.
From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. v5: drop redundant kref_init() v4: decrease last reference for triggering the cleanup v3: initialize kref (thanks Amos Kong) v2: fix missing put_rng() on exit path (thanks Amos Kong) Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 135 -- include/linux/hw_random.h | 2 + 2 files changed, 94 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c8..83516cb 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -42,6 +42,7 @@ #include linux/delay.h #include linux/slab.h #include linux/random.h +#include linux/err.h #include asm/uaccess.h @@ -91,6 +92,60 @@ static void add_early_randomness(struct hwrng *rng) add_device_randomness(bytes, bytes_read); } +static inline void cleanup_rng(struct kref *kref) +{ + struct hwrng *rng = container_of(kref, struct hwrng, ref); + + if (rng-cleanup) + rng-cleanup(rng); +} + +static void set_current_rng(struct hwrng *rng) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + kref_get(rng-ref); + current_rng = rng; +} + +static void drop_current_rng(void) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + /* decrease last reference for triggering the cleanup */ + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; +} + +/* Returns ERR_PTR(), NULL or refcounted hwrng */ +static struct hwrng *get_current_rng(void) +{ + struct hwrng *rng; + + if (mutex_lock_interruptible(rng_mutex)) + return ERR_PTR(-ERESTARTSYS); + + rng = current_rng; + if (rng) + kref_get(rng-ref); + + mutex_unlock(rng_mutex); + return rng; +} + +static void put_rng(struct hwrng *rng) +{ + /* +* Hold rng_mutex here so we serialize in case they set_current_rng +* on rng again immediately. +*/ + mutex_lock(rng_mutex); + if (rng) + kref_put(rng-ref, cleanup_rng); + mutex_unlock(rng_mutex); +} + static inline int hwrng_init(struct hwrng *rng) { if (rng-init) { @@ -113,12 +168,6 @@ static inline int hwrng_init(struct hwrng *rng) return 0; } -static inline void hwrng_cleanup(struct hwrng *rng) -{ - if (rng rng-cleanup) - rng-cleanup(rng); -} - static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ @@ -154,21 +203,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,8 +250,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); + put_rng(rng); if (need_resched()) schedule_timeout_interruptible(1); @@ -213,12 +263,11 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex); - goto out_unlock; + put_rng(rng); + goto out; } @@ -257,8 +306,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev
[PATCH v5 0/6] fix hw_random stuck
When I hotunplug a busy virtio-rng device or try to access hwrng attributes in non-smp guest, it gets stuck. My hotplug tests: | test 0: | hotunplug rng device from qemu monitor | | test 1: | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 2: | guest) # dd if=/dev/random of=/dev/null | hotunplug rng device from qemu monitor | | test 4: | guest) # dd if=/dev/hwrng of=/dev/null | cat /sys/devices/virtual/misc/hw_random/rng_* | | test 5: | guest) # dd if=/dev/hwrng of=/dev/null | cancel dd process after 10 seconds | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 6: | use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo V5: reset cleanup_done flag, drop redundant init of reference count, use compiler barrier to prevent recording. V4: update patch 4 to fix corrupt, decrease last reference for triggering the cleanup, fix unregister race pointed by Herbert V3: initialize kref to 1 V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference counting issue Amos Kong (1): hw_random: move some code out mutex_lock for avoiding underlying deadlock Rusty Russell (5): hw_random: place mutex around read functions and buffers. hw_random: use reference counts on each struct hwrng. hw_random: fix unregister race. hw_random: don't double-check old_rng. hw_random: don't init list element we're about to add to list. drivers/char/hw_random/core.c | 173 ++ include/linux/hw_random.h | 3 + 2 files changed, 126 insertions(+), 50 deletions(-) -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 4/6] hw_random: fix unregister race.
From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. v5: reset cleanup_done flag, use compiler barrier to prevent recording. v4: add cleanup_done flag to insure that cleanup is done Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 12 include/linux/hw_random.h | 1 + 2 files changed, 13 insertions(+) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 83516cb..067270b 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; +static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to off */ @@ -98,6 +99,11 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + + /* cleanup_done should be updated after cleanup finishes */ + smp_wmb(); + rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -498,6 +504,8 @@ int hwrng_register(struct hwrng *rng) add_early_randomness(rng); } + rng-cleanup_done = false; + out_unlock: mutex_unlock(rng_mutex); out: @@ -529,6 +537,10 @@ void hwrng_unregister(struct hwrng *rng) kthread_stop(hwrng_fill); } else mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); } EXPORT_SYMBOL_GPL(hwrng_unregister); diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index c212e71..7832e50 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -46,6 +46,7 @@ struct hwrng { /* internal. */ struct list_head list; struct kref ref; + bool cleanup_done; }; /** Register a new Hardware Random Number Generator driver. */ -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 6/6] hw_random: don't init list element we're about to add to list.
From: Rusty Russell ru...@rustcorp.com.au Another interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a9286bf..4d13ac5 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -489,7 +489,6 @@ int hwrng_register(struct hwrng *rng) goto out_unlock; } } - INIT_LIST_HEAD(rng-list); list_add_tail(rng-list, rng_list); if (old_rng !rng-init) { -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 5/6] hw_random: don't double-check old_rng.
From: Rusty Russell ru...@rustcorp.com.au Interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 067270b..a9286bf 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -476,14 +476,13 @@ int hwrng_register(struct hwrng *rng) } old_rng = current_rng; + err = 0; if (!old_rng) { err = hwrng_init(rng); if (err) goto out_unlock; set_current_rng(rng); - } - err = 0; - if (!old_rng) { + err = register_miscdev(); if (err) { drop_current_rng(); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v4 4/6] hw_random: fix unregister race.
On Wed, Nov 12, 2014 at 02:33:00PM +1030, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. v4: add cleanup_done flag to insure that cleanup is done That's a bit weird. The usual pattern would be to hold a reference until we're actually finished, but this reference is a bit weird. The cleanup function is a callback function of kref_put(), we can't use the same reference count inside cleanup function. We hold the mutex across cleanup, so we could grab that but we have to take care sleeping inside wait_event, otherwise Peter will have to fix my code again :) We didn't hold rng_mutex inside cleanup_rng(), am I missing something? AFAICT the wake_woken() stuff isn't merged yet, so your patch will have to do for now. Can you provide some patches/mail link here? I searched nothing about wake_woken. @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -536,6 +539,11 @@ void hwrng_unregister(struct hwrng *rng) kthread_stop(hwrng_fill); } else mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); + The atomic_read() isn't necessary here. However, you should probably init cleanup_done in hwrng_register(). (Probably noone does unregister then register, but let's be clear). Got it. Thanks, Rusty. } EXPORT_SYMBOL_GPL(hwrng_unregister); diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index c212e71..7832e50 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -46,6 +46,7 @@ struct hwrng { /* internal. */ struct list_head list; struct kref ref; + bool cleanup_done; }; /** Register a new Hardware Random Number Generator driver. */ -- 1.9.3 -- Amos. signature.asc Description: Digital signature ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v4 3/6] hw_random: use reference counts on each struct hwrng.
On Wed, Nov 12, 2014 at 02:11:23PM +1030, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. v4: decrease last reference for triggering the cleanup This doesn't make any sense: +static void drop_current_rng(void) +{ + struct hwrng *rng = current_rng; + + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + /* release current_rng reference */ + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; + + /* decrease last reference for triggering the cleanup */ + kref_put(rng-ref, cleanup_rng); +} Why would it drop the refcount twice? This doesn't make sense. Hmm, because you added kref_init, which initializes the reference count to 1, you created this bug. I saw some kernel code uses kref_* helper functions, the reference conter is initialized to 1. Some code didn't use the helper functions to increase/decrease the reference counter. So I will drop kref_init() and the second kref_put(). Leave out the kref_init, and let it naturally be 0 (until, and if, it becomes current_rng). Add a comment if you want. OK, thanks. Thanks, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- Amos. signature.asc Description: Digital signature ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [3.16 stable PATCH 0/2] virtio-rng: two backports to fix stuck
On Wed, Nov 05, 2014 at 04:02:49PM +, Luis Henriques wrote: Hi Amos On Tue, Nov 04, 2014 at 12:32:27PM +0800, Amos Kong wrote: On Sat, Oct 11, 2014 at 06:51:47AM +0800, Amos Kong wrote: I received two mails about faile to apply patches to 3.16-stable tree: FAILED: patch [PATCH] virtio-rng: skip reading when we start to remove the device failed to apply to 3.16-stable tree FAILED: patch [PATCH] virtio-rng: fix stuck of hot-unplugging busy device failed to apply to 3.16-stable tree Amit already backported two patches for 3.16-stable, then cherry-pick of my two patches works. Thanks. Ping Greg, thanks. I'm now doing the extended stable maintenance of the 3.16 kernel, as Greg has EOL'ed it. Anyway, I will be queuing these 2 patches for the extended 3.16 kernel. Thank you! Thanks a lot! Cheers, -- Luís Amos Kong (2): virtio-rng: fix stuck of hot-unplugging busy device virtio-rng: skip reading when we start to remove the device drivers/char/hw_random/virtio-rng.c | 7 +++ 1 file changed, 7 insertions(+) -- Amos. signature.asc Description: Digital signature ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH 3/5] hw_random: fix unregister race.
On Tue, Oct 21, 2014 at 10:15:23PM +0800, Herbert Xu wrote: On Thu, Sep 18, 2014 at 12:18:24PM +0930, Rusty Russell wrote: The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index dc9092a1075d..b4a21e9521cf 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; +static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to off */ @@ -98,6 +99,7 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -529,6 +531,9 @@ void hwrng_unregister(struct hwrng *rng) } mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, atomic_read(rng-ref.refcount) == 0); Hi Rusty, After initializing (kref_init()), the refcount is 1, so we need one more kref_put() after each drop_current_rng() to release last reference count, then cleanup function will be called. While it's obviously better than what we have now, I don't believe this is 100% safe as the cleanup function might still be running even after the ref count hits zero. Once we return from this function the module may be unloaded so we need to ensure that nothing is running at this point. I found wait_event() can still pass and finish unregister even cleanup function isn't called (wake_up_all() isn't called). So I added a flag cleanup_done to indicate that the rng device is cleaned up. + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); I will post the new v4 later. Cheers, -- Email: Herbert Xu herb...@gondor.apana.org.au Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v4 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock
In next patch, we use reference counting for each struct hwrng, changing reference count also needs to take mutex_lock. Before releasing the lock, if we try to stop a kthread that waits to take the lock to reduce the referencing count, deadlock will occur. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index b1b6042..a0905c8 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng) } } if (list_empty(rng_list)) { + mutex_unlock(rng_mutex); unregister_miscdev(); if (hwrng_fill) kthread_stop(hwrng_fill); - } - - mutex_unlock(rng_mutex); + } else + mutex_unlock(rng_mutex); } EXPORT_SYMBOL_GPL(hwrng_unregister); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v4 0/6] fix hw_random stuck
When I hotunplug a busy virtio-rng device or try to access hwrng attributes in non-smp guest, it gets stuck. My hotplug tests: | test 0: | hotunplug rng device from qemu monitor | | test 1: | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 2: | guest) # dd if=/dev/random of=/dev/null | hotunplug rng device from qemu monitor | | test 4: | guest) # dd if=/dev/hwrng of=/dev/null | cat /sys/devices/virtual/misc/hw_random/rng_* | | test 5: | guest) # dd if=/dev/hwrng of=/dev/null | cancel dd process after 10 seconds | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 6: | use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo V4: update patch 4 to fix corrupt, decrease last reference for triggering the cleanup, fix unregister race pointed by Herbert V3: initialize kref to 1 V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference counting issue Amos Kong (1): hw_random: move some code out mutex_lock for avoiding underlying deadlock Rusty Russell (5): hw_random: place mutex around read functions and buffers. hw_random: use reference counts on each struct hwrng. hw_random: fix unregister race. hw_random: don't double-check old_rng. hw_random: don't init list element we're about to add to list. drivers/char/hw_random/core.c | 176 ++ include/linux/hw_random.h | 3 + 2 files changed, 129 insertions(+), 50 deletions(-) -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v4 4/6] hw_random: fix unregister race.
From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. v4: add cleanup_done flag to insure that cleanup is done Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 8 include/linux/hw_random.h | 1 + 2 files changed, 9 insertions(+) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 27ad6b4..c31bf91 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; +static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to off */ @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + rng-cleanup_done = true; + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) @@ -536,6 +539,11 @@ void hwrng_unregister(struct hwrng *rng) kthread_stop(hwrng_fill); } else mutex_unlock(rng_mutex); + + /* Just in case rng is reading right now, wait. */ + wait_event(rng_done, rng-cleanup_done + atomic_read(rng-ref.refcount) == 0); + } EXPORT_SYMBOL_GPL(hwrng_unregister); diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index c212e71..7832e50 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -46,6 +46,7 @@ struct hwrng { /* internal. */ struct list_head list; struct kref ref; + bool cleanup_done; }; /** Register a new Hardware Random Number Generator driver. */ -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v4 3/6] hw_random: use reference counts on each struct hwrng.
From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. v4: decrease last reference for triggering the cleanup v3: initialize kref (thanks Amos Kong) v2: fix missing put_rng() on exit path (thanks Amos Kong) Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 142 +- include/linux/hw_random.h | 2 + 2 files changed, 101 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c8..27ad6b4 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -42,6 +42,7 @@ #include linux/delay.h #include linux/slab.h #include linux/random.h +#include linux/err.h #include asm/uaccess.h @@ -91,6 +92,65 @@ static void add_early_randomness(struct hwrng *rng) add_device_randomness(bytes, bytes_read); } +static inline void cleanup_rng(struct kref *kref) +{ + struct hwrng *rng = container_of(kref, struct hwrng, ref); + + if (rng-cleanup) + rng-cleanup(rng); +} + +static void set_current_rng(struct hwrng *rng) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + kref_get(rng-ref); + current_rng = rng; +} + +static void drop_current_rng(void) +{ + struct hwrng *rng = current_rng; + + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + /* release current_rng reference */ + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; + + /* decrease last reference for triggering the cleanup */ + kref_put(rng-ref, cleanup_rng); +} + +/* Returns ERR_PTR(), NULL or refcounted hwrng */ +static struct hwrng *get_current_rng(void) +{ + struct hwrng *rng; + + if (mutex_lock_interruptible(rng_mutex)) + return ERR_PTR(-ERESTARTSYS); + + rng = current_rng; + if (rng) + kref_get(rng-ref); + + mutex_unlock(rng_mutex); + return rng; +} + +static void put_rng(struct hwrng *rng) +{ + /* +* Hold rng_mutex here so we serialize in case they set_current_rng +* on rng again immediately. +*/ + mutex_lock(rng_mutex); + if (rng) + kref_put(rng-ref, cleanup_rng); + mutex_unlock(rng_mutex); +} + static inline int hwrng_init(struct hwrng *rng) { if (rng-init) { @@ -110,13 +170,9 @@ static inline int hwrng_init(struct hwrng *rng) if (current_quality 0 !hwrng_fill) start_khwrngd(); - return 0; -} + kref_init(rng-ref); -static inline void hwrng_cleanup(struct hwrng *rng) -{ - if (rng rng-cleanup) - rng-cleanup(rng); + return 0; } static int rng_dev_open(struct inode *inode, struct file *filp) @@ -154,21 +210,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,8 +257,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); + put_rng(rng); if (need_resched()) schedule_timeout_interruptible(1); @@ -213,12 +270,11 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex
[PATCH v4 5/6] hw_random: don't double-check old_rng.
From: Rusty Russell ru...@rustcorp.com.au Interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c31bf91..fc5de7d 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -480,14 +480,13 @@ int hwrng_register(struct hwrng *rng) } old_rng = current_rng; + err = 0; if (!old_rng) { err = hwrng_init(rng); if (err) goto out_unlock; set_current_rng(rng); - } - err = 0; - if (!old_rng) { + err = register_miscdev(); if (err) { drop_current_rng(); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v4 6/6] hw_random: don't init list element we're about to add to list.
From: Rusty Russell ru...@rustcorp.com.au Another interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index fc5de7d..b2cc8a1 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -493,7 +493,6 @@ int hwrng_register(struct hwrng *rng) goto out_unlock; } } - INIT_LIST_HEAD(rng-list); list_add_tail(rng-list, rng_list); if (old_rng !rng-init) { -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [3.16 stable PATCH 0/2] virtio-rng: two backports to fix stuck
On Sat, Oct 11, 2014 at 06:51:47AM +0800, Amos Kong wrote: I received two mails about faile to apply patches to 3.16-stable tree: FAILED: patch [PATCH] virtio-rng: skip reading when we start to remove the device failed to apply to 3.16-stable tree FAILED: patch [PATCH] virtio-rng: fix stuck of hot-unplugging busy device failed to apply to 3.16-stable tree Amit already backported two patches for 3.16-stable, then cherry-pick of my two patches works. Thanks. Ping Greg, thanks. Amos Kong (2): virtio-rng: fix stuck of hot-unplugging busy device virtio-rng: skip reading when we start to remove the device drivers/char/hw_random/virtio-rng.c | 7 +++ 1 file changed, 7 insertions(+) -- 1.9.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v2 4/6] hw_random: fix unregister race.
On Fri, Oct 31, 2014 at 03:23:21PM +0800, Herbert Xu wrote: On Fri, Oct 31, 2014 at 10:28:00AM +1030, Rusty Russell wrote: Herbert Xu herb...@gondor.apana.org.au writes: On Thu, Sep 18, 2014 at 08:37:45PM +0800, Amos Kong wrote: From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. Signed-off-by: Rusty Russell ru...@rustcorp.com.au You totally corrupted Rusty's patch. If you're going to repost his series you better make sure that you've got the right patches. Just as well though as it made me think a little more about this patch :) OK Amos, can you please repost the complete series? Please fix the unregister race I identified first. Ok, I redownload the patches from https://patchwork.kernel.org/project/LKML/list/?submitter=5 and rebase fixes of me and rusty on it. I will post a V4 later. Thanks. -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v2 4/6] hw_random: fix unregister race.
On Sun, Nov 02, 2014 at 11:08:09PM +0800, Herbert Xu wrote: On Sun, Nov 02, 2014 at 11:06:13PM +0800, Amos Kong wrote: On Fri, Oct 31, 2014 at 03:23:21PM +0800, Herbert Xu wrote: On Fri, Oct 31, 2014 at 10:28:00AM +1030, Rusty Russell wrote: Herbert Xu herb...@gondor.apana.org.au writes: On Thu, Sep 18, 2014 at 08:37:45PM +0800, Amos Kong wrote: From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. Signed-off-by: Rusty Russell ru...@rustcorp.com.au You totally corrupted Rusty's patch. If you're going to repost his series you better make sure that you've got the right patches. Just as well though as it made me think a little more about this patch :) OK Amos, can you please repost the complete series? Please fix the unregister race I identified first. Ok, I redownload the patches from https://patchwork.kernel.org/project/LKML/list/?submitter=5 and rebase fixes of me and rusty on it. I will post a V4 later. Thanks. Please fix the unregister race I pointed out before doing a v4. Thanks for the remind, I got it right now :-) -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.
On Mon, Oct 20, 2014 at 10:42:11AM +1030, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: We got a warning in boot stage when above set_current_rng() is executed, it can be fixed by init rng-ref in hwrng_init(). @@ -166,6 +169,8 @@ static inline int hwrng_init(struct hwrng *rng) if (current_quality 0 !hwrng_fill) start_khwrngd(); + kref_init(rng-ref); + return 0; } OK, I folded this fix on. Thanks. Reviewed-by: Amos Kong ak...@redhat.com Thanks, Rusty. hw_random: use reference counts on each struct hwrng. current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. v3: initialize kref (thanks Amos Kong) v2: fix missing put_rng() on exit path (thanks Amos Kong) Signed-off-by: Rusty Russell ru...@rustcorp.com.au diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c818e13..0ecac38da954 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[3.16 stable PATCH 2/2] virtio-rng: skip reading when we start to remove the device
Before we really unregister the hwrng device, reading will get stuck if the virtio device is reset. We should return error for reading when we start to remove the device. Signed-off-by: Amos Kong ak...@redhat.com Reviewed-by: Amit Shah amit.s...@redhat.com Cc: sta...@vger.kernel.org Signed-off-by: Rusty Russell ru...@rustcorp.com.au (cherry picked from commit f49819560f53b7f3a596a8ea2e6764dc86695b62) Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/virtio-rng.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index b50252c..cb1688a 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -37,6 +37,7 @@ struct virtrng_info { char name[25]; int index; bool hwrng_register_done; + bool hwrng_removed; }; @@ -69,6 +70,9 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) int ret; struct virtrng_info *vi = (struct virtrng_info *)rng-priv; + if (vi-hwrng_removed) + return -ENODEV; + if (!vi-busy) { vi-busy = true; init_completion(vi-have_data); @@ -137,6 +141,7 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-hwrng_removed = true; vi-data_avail = 0; complete(vi-have_data); vdev-config-reset(vdev); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[3.16 stable PATCH 1/2] virtio-rng: fix stuck of hot-unplugging busy device
When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch exits the waiting by completing have_data completion before unregistering, resets data_avail to avoid the hwrng core use wrong buffer bytes. Signed-off-by: Amos Kong ak...@redhat.com Reviewed-by: Amit Shah amit.s...@redhat.com Cc: sta...@vger.kernel.org Signed-off-by: Rusty Russell ru...@rustcorp.com.au (cherry picked from commit 3856e548372513665670ca5db60d9a74b970fe0d) Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/virtio-rng.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index f1aa13b..b50252c 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -137,6 +137,8 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-data_avail = 0; + complete(vi-have_data); vdev-config-reset(vdev); vi-busy = false; if (vi-hwrng_register_done) -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[3.16 stable PATCH 0/2] virtio-rng: two backports to fix stuck
I received two mails about faile to apply patches to 3.16-stable tree: FAILED: patch [PATCH] virtio-rng: skip reading when we start to remove the device failed to apply to 3.16-stable tree FAILED: patch [PATCH] virtio-rng: fix stuck of hot-unplugging busy device failed to apply to 3.16-stable tree Amit already backported two patches for 3.16-stable, then cherry-pick of my two patches works. Thanks. Amos Kong (2): virtio-rng: fix stuck of hot-unplugging busy device virtio-rng: skip reading when we start to remove the device drivers/char/hw_random/virtio-rng.c | 7 +++ 1 file changed, 7 insertions(+) -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH 2/5] hw_random: use reference counts on each struct hwrng.
On Thu, Sep 18, 2014 at 12:18:23PM +0930, Rusty Russell wrote: current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. Hi Rusty, Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 135 -- include/linux/hw_random.h | 2 + 2 files changed, 94 insertions(+), 43 deletions(-) ... static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ @@ -154,21 +202,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,7 +249,6 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); if (need_resched()) @@ -210,15 +258,16 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, err = -ERESTARTSYS; We need put_rng() in this error path. Otherwise, unhotplug will hang in the end of hwrng_unregister() |/* Just in case rng is reading right now, wait. */ |wait_event(rng_done, atomic_read(rng-ref.refcount) == 0); Steps to reproduce the hang: guest) # dd if=/dev/hwrng of=/dev/null cancel dd process after 10 seconds guest) # dd if=/dev/hwrng of=/dev/null hotunplug rng device from qemu monitor result: device can't be removed (still can find in QEMU monitor) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 96fa067..4e22d70 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -258,6 +258,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (signal_pending(current)) { err = -ERESTARTSYS; + put_rng(rng); goto out; } goto out; } + + put_rng(rng); } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex); - goto out_unlock; + put_rng(rng); + goto out; } @@ -257,8 +306,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev, err = hwrng_init(rng); if (err) break; - hwrng_cleanup(current_rng); - current_rng = rng; + drop_current_rng(); + set_current_rng(rng); err = 0; break; } @@ -272,17 +321,15 @@ static ssize_t hwrng_attr_current_show(struct device *dev, struct device_attribute *attr, char *buf) { - int err; ssize_t ret; - const char *name = none; + struct hwrng *rng; - err = mutex_lock_interruptible(rng_mutex); - if (err) - return -ERESTARTSYS; - if (current_rng) - name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); - mutex_unlock(rng_mutex); + rng = get_current_rng(); + if (IS_ERR(rng)) + return PTR_ERR(rng); + + ret = snprintf(buf, PAGE_SIZE, %s\n, rng ? rng-name : none); + put_rng(rng); return ret; } @@ -357,12 +404,16 @@ static int hwrng_fillfn(void *unused) long rc; while (!kthread_should_stop()) { -
[PATCH v2 1/6] hw_random: place mutex around read functions and buffers.
From: Rusty Russell ru...@rustcorp.com.au There's currently a big lock around everything, and it means that we can't query sysfs (eg /sys/devices/virtual/misc/hw_random/rng_current) while the rng is reading. This is a real problem when the rng is slow, or blocked (eg. virtio_rng with qemu's default /dev/random backend) This doesn't help (it leaves the current lock untouched), just adds a lock to protect the read function and the static buffers, in preparation for transition. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..b1b6042 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -53,7 +53,10 @@ static struct hwrng *current_rng; static struct task_struct *hwrng_fill; static LIST_HEAD(rng_list); +/* Protects rng_list and current_rng */ static DEFINE_MUTEX(rng_mutex); +/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */ +static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; static unsigned short current_quality; @@ -81,7 +84,9 @@ static void add_early_randomness(struct hwrng *rng) unsigned char bytes[16]; int bytes_read; + mutex_lock(reading_mutex); bytes_read = rng_get_data(rng, bytes, sizeof(bytes), 1); + mutex_unlock(reading_mutex); if (bytes_read 0) add_device_randomness(bytes, bytes_read); } @@ -128,6 +133,7 @@ static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size, int wait) { int present; + BUG_ON(!mutex_is_locked(reading_mutex)); if (rng-read) return rng-read(rng, (void *)buffer, size, wait); @@ -160,13 +166,14 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, goto out_unlock; } + mutex_lock(reading_mutex); if (!data_avail) { bytes_read = rng_get_data(current_rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { err = bytes_read; - goto out_unlock; + goto out_unlock_reading; } data_avail = bytes_read; } @@ -174,7 +181,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (!data_avail) { if (filp-f_flags O_NONBLOCK) { err = -EAGAIN; - goto out_unlock; + goto out_unlock_reading; } } else { len = data_avail; @@ -186,7 +193,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (copy_to_user(buf + ret, rng_buffer + data_avail, len)) { err = -EFAULT; - goto out_unlock; + goto out_unlock_reading; } size -= len; @@ -194,6 +201,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + mutex_unlock(reading_mutex); if (need_resched()) schedule_timeout_interruptible(1); @@ -208,6 +216,9 @@ out: out_unlock: mutex_unlock(rng_mutex); goto out; +out_unlock_reading: + mutex_unlock(reading_mutex); + goto out_unlock; } @@ -348,13 +359,16 @@ static int hwrng_fillfn(void *unused) while (!kthread_should_stop()) { if (!current_rng) break; + mutex_lock(reading_mutex); rc = rng_get_data(current_rng, rng_fillbuf, rng_buffer_size(), 1); + mutex_unlock(reading_mutex); if (rc = 0) { pr_warn(hwrng: no data available\n); msleep_interruptible(1); continue; } + /* Outside lock, sure, but y'know: randomness. */ add_hwgenerator_randomness((void *)rng_fillbuf, rc, rc * current_quality * 8 10); } -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v2 0/6] fix hw_random stuck
When I hotunplug a busy virtio-rng device or try to access hwrng attributes in non-smp guest, it gets stuck. My original was pain, Rusty posted a real fix. This patchset fixed two issue in v1, and tested by my 6+ cases. | test 0: | hotunplug rng device from qemu monitor | | test 1: | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 2: | guest) # dd if=/dev/random of=/dev/null | hotunplug rng device from qemu monitor | | test 4: | guest) # dd if=/dev/hwrng of=/dev/null | cat /sys/devices/virtual/misc/hw_random/rng_* | | test 5: | guest) # dd if=/dev/hwrng of=/dev/null | cancel dd process after 10 seconds | guest) # dd if=/dev/hwrng of=/dev/null | hotunplug rng device from qemu monitor | | test 6: | use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference counting issue Amos Kong (1): hw_random: move some code out mutex_lock for avoiding underlying deadlock Rusty Russell (5): hw_random: place mutex around read functions and buffers. hw_random: use reference counts on each struct hwrng. hw_random: fix unregister race. hw_random: don't double-check old_rng. hw_random: don't init list element we're about to add to list. drivers/char/hw_random/core.c | 166 +- include/linux/hw_random.h | 2 + 2 files changed, 117 insertions(+), 51 deletions(-) -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v2 5/6] hw_random: don't double-check old_rng.
From: Rusty Russell ru...@rustcorp.com.au Interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 9f6f5bb..6420409 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -473,14 +473,13 @@ int hwrng_register(struct hwrng *rng) } old_rng = current_rng; + err = 0; if (!old_rng) { err = hwrng_init(rng); if (err) goto out_unlock; set_current_rng(rng); - } - err = 0; - if (!old_rng) { + err = register_miscdev(); if (err) { drop_current_rng(); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.
From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. V2: reduce reference count in signal_pending path Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 136 +- include/linux/hw_random.h | 2 + 2 files changed, 95 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c8..ee9e504 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -42,6 +42,7 @@ #include linux/delay.h #include linux/slab.h #include linux/random.h +#include linux/err.h #include asm/uaccess.h @@ -91,6 +92,59 @@ static void add_early_randomness(struct hwrng *rng) add_device_randomness(bytes, bytes_read); } +static inline void cleanup_rng(struct kref *kref) +{ + struct hwrng *rng = container_of(kref, struct hwrng, ref); + + if (rng-cleanup) + rng-cleanup(rng); +} + +static void set_current_rng(struct hwrng *rng) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + kref_get(rng-ref); + current_rng = rng; +} + +static void drop_current_rng(void) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; +} + +/* Returns ERR_PTR(), NULL or refcounted hwrng */ +static struct hwrng *get_current_rng(void) +{ + struct hwrng *rng; + + if (mutex_lock_interruptible(rng_mutex)) + return ERR_PTR(-ERESTARTSYS); + + rng = current_rng; + if (rng) + kref_get(rng-ref); + + mutex_unlock(rng_mutex); + return rng; +} + +static void put_rng(struct hwrng *rng) +{ + /* +* Hold rng_mutex here so we serialize in case they set_current_rng +* on rng again immediately. +*/ + mutex_lock(rng_mutex); + if (rng) + kref_put(rng-ref, cleanup_rng); + mutex_unlock(rng_mutex); +} + static inline int hwrng_init(struct hwrng *rng) { if (rng-init) { @@ -113,12 +167,6 @@ static inline int hwrng_init(struct hwrng *rng) return 0; } -static inline void hwrng_cleanup(struct hwrng *rng) -{ - if (rng rng-cleanup) - rng-cleanup(rng); -} - static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ @@ -154,21 +202,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,7 +249,6 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); if (need_resched()) @@ -208,17 +256,19 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (signal_pending(current)) { err = -ERESTARTSYS; + put_rng(rng); goto out; } + + put_rng(rng); } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex); - goto out_unlock; + put_rng(rng); + goto out; } @@ -257,8 +307,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev, err = hwrng_init(rng); if (err
[PATCH v2 4/6] hw_random: fix unregister race.
From: Rusty Russell ru...@rustcorp.com.au The previous patch added one potential problem: we can still be reading from a hwrng when it's unregistered. Add a wait for zero in the hwrng_unregister path. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index ee9e504..9f6f5bb 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex); static DEFINE_MUTEX(reading_mutex); static int data_avail; static u8 *rng_buffer, *rng_fillbuf; +static DECLARE_WAIT_QUEUE_HEAD(rng_done); static unsigned short current_quality; static unsigned short default_quality; /* = 0; default to off */ @@ -98,6 +99,7 @@ static inline void cleanup_rng(struct kref *kref) if (rng-cleanup) rng-cleanup(rng); + wake_up_all(rng_done); } static void set_current_rng(struct hwrng *rng) -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v2 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock
In next patch, we use reference counting for each struct hwrng, changing reference count also needs to take mutex_lock. Before releasing the lock, if we try to stop a kthread that waits to take the lock to reduce the referencing count, deadlock will occur. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index b1b6042..a0905c8 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng) } } if (list_empty(rng_list)) { + mutex_unlock(rng_mutex); unregister_miscdev(); if (hwrng_fill) kthread_stop(hwrng_fill); - } - - mutex_unlock(rng_mutex); + } else + mutex_unlock(rng_mutex); } EXPORT_SYMBOL_GPL(hwrng_unregister); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v2 6/6] hw_random: don't init list element we're about to add to list.
From: Rusty Russell ru...@rustcorp.com.au Another interesting anti-pattern. Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/char/hw_random/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 6420409..12187dd 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -486,7 +486,6 @@ int hwrng_register(struct hwrng *rng) goto out_unlock; } } - INIT_LIST_HEAD(rng-list); list_add_tail(rng-list, rng_list); if (old_rng !rng-init) { -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.
On Thu, Sep 18, 2014 at 08:37:44PM +0800, Amos Kong wrote: From: Rusty Russell ru...@rustcorp.com.au current_rng holds one reference, and we bump it every time we want to do a read from it. This means we only hold the rng_mutex to grab or drop a reference, so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't block on read of /dev/hwrng. Using a kref is overkill (we're always under the rng_mutex), but a standard pattern. This also solves the problem that the hwrng_fillfn thread was accessing current_rng without a lock, which could change (eg. to NULL) underneath it. V2: reduce reference count in signal_pending path Signed-off-by: Rusty Russell ru...@rustcorp.com.au Signed-off-by: Amos Kong ak...@redhat.com Reply myself. --- drivers/char/hw_random/core.c | 136 +- include/linux/hw_random.h | 2 + 2 files changed, 95 insertions(+), 43 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index a0905c8..ee9e504 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -42,6 +42,7 @@ #include linux/delay.h #include linux/slab.h #include linux/random.h +#include linux/err.h #include asm/uaccess.h @@ -91,6 +92,59 @@ static void add_early_randomness(struct hwrng *rng) add_device_randomness(bytes, bytes_read); } +static inline void cleanup_rng(struct kref *kref) +{ + struct hwrng *rng = container_of(kref, struct hwrng, ref); + + if (rng-cleanup) + rng-cleanup(rng); +} + +static void set_current_rng(struct hwrng *rng) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + kref_get(rng-ref); + current_rng = rng; +} + +static void drop_current_rng(void) +{ + BUG_ON(!mutex_is_locked(rng_mutex)); + if (!current_rng) + return; + + kref_put(current_rng-ref, cleanup_rng); + current_rng = NULL; +} + +/* Returns ERR_PTR(), NULL or refcounted hwrng */ +static struct hwrng *get_current_rng(void) +{ + struct hwrng *rng; + + if (mutex_lock_interruptible(rng_mutex)) + return ERR_PTR(-ERESTARTSYS); + + rng = current_rng; + if (rng) + kref_get(rng-ref); + + mutex_unlock(rng_mutex); + return rng; +} + +static void put_rng(struct hwrng *rng) +{ + /* + * Hold rng_mutex here so we serialize in case they set_current_rng + * on rng again immediately. + */ + mutex_lock(rng_mutex); + if (rng) + kref_put(rng-ref, cleanup_rng); + mutex_unlock(rng_mutex); +} + static inline int hwrng_init(struct hwrng *rng) { if (rng-init) { @@ -113,12 +167,6 @@ static inline int hwrng_init(struct hwrng *rng) return 0; } -static inline void hwrng_cleanup(struct hwrng *rng) -{ - if (rng rng-cleanup) - rng-cleanup(rng); -} - static int rng_dev_open(struct inode *inode, struct file *filp) { /* enforce read-only access to this chrdev */ @@ -154,21 +202,22 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ssize_t ret = 0; int err = 0; int bytes_read, len; + struct hwrng *rng; while (size) { - if (mutex_lock_interruptible(rng_mutex)) { - err = -ERESTARTSYS; + rng = get_current_rng(); + if (IS_ERR(rng)) { + err = PTR_ERR(rng); goto out; } - - if (!current_rng) { + if (!rng) { err = -ENODEV; - goto out_unlock; + goto out; } mutex_lock(reading_mutex); if (!data_avail) { - bytes_read = rng_get_data(current_rng, rng_buffer, + bytes_read = rng_get_data(rng, rng_buffer, rng_buffer_size(), !(filp-f_flags O_NONBLOCK)); if (bytes_read 0) { @@ -200,7 +249,6 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, ret += len; } - mutex_unlock(rng_mutex); mutex_unlock(reading_mutex); if (need_resched()) @@ -208,17 +256,19 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, if (signal_pending(current)) { err = -ERESTARTSYS; + put_rng(rng); goto out; } + + put_rng(rng); } out: return ret ? : err; -out_unlock: - mutex_unlock(rng_mutex); - goto out; + out_unlock_reading: mutex_unlock(reading_mutex); - goto out_unlock; + put_rng(rng); + goto out; } @@ -257,8 +307,8 @@ static ssize_t hwrng_attr_current_store(struct
[PATCH v2 1/3] virtio-rng cleanup: move some code out of mutex protection
It doesn't save too much cpu time as expected, just a cleanup. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..c591d7e 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -270,8 +270,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); return ret; } @@ -284,19 +284,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); return ret; } -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v2 0/3] fix stuck in accessing hwrng attributes
If we read hwrng by long-running dd process, it takes too much cpu time and almost hold the mutex lock. When we check hwrng attributes from sysfs by cat, it gets stuck in waiting the lock releaseing. The problem can only be reproduced with non-smp guest with slow backend. This patchset resolves the issue by changing rng_dev_read() to always schedule 10 jiffies after release mutex lock, then cat process can have chance to get the lock and execute protected code without stuck. Thanks. V2: update commitlog to describe PATCH 2, split second patch. Amos Kong (3): virtio-rng cleanup: move some code out of mutex protection hw_random: fix stuck in catting hwrng attributes hw_random: increase schedule timeout in rng_dev_read() drivers/char/hw_random/core.c | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v2 2/3] hw_random: fix stuck in catting hwrng attributes
I started a QEMU (non-smp) guest with one virtio-rng device, and read random data from /dev/hwrng by dd: # dd if=/dev/hwrng of=/dev/null In the same time, if I check hwrng attributes from sysfs by cat: # cat /sys/class/misc/hw_random/rng_* The cat process always gets stuck with slow backend (5 k/s), if we use a quick backend (1.2 M/s), the cat process will cost 1 to 2 minutes. The stuck doesn't exist for smp guest. Reading syscall enters kernel and call rng_dev_read(), it's user context. We used need_resched() to check if other tasks need to be run, but it almost always return false, and re-hold the mutex lock. The attributes accessing process always fails to hold the lock, so the cat gets stuck. User context doesn't allow other user contexts run on that CPU, unless the kernel code sleeps for some reason. This is why the need_reshed() always return false here. This patch removed need_resched() and always schedule other tasks then other tasks can have chance to hold the lock and execute protected code. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..263a370 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(1); if (signal_pending(current)) { err = -ERESTARTSYS; -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v2 3/3] hw_random: increase schedule timeout in rng_dev_read()
This patch increases the schedule timeout to 10 jiffies, it's more appropriate, then other takes can easy to hold the mutex lock. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 263a370..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,7 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); if (signal_pending(current)) { err = -ERESTARTSYS; -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v2 3/3] hw_random: increase schedule timeout in rng_dev_read()
On Mon, Sep 15, 2014 at 06:13:31PM +0200, Michael Büsch wrote: On Tue, 16 Sep 2014 00:02:29 +0800 Amos Kong ak...@redhat.com wrote: This patch increases the schedule timeout to 10 jiffies, it's more appropriate, then other takes can easy to hold the mutex lock. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 263a370..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,7 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); if (signal_pending(current)) { err = -ERESTARTSYS; Does a schedule of 1 ms or 10 ms decrease the throughput? In my test environment, 1 jiffe always works (100%), as suggested by Amit 10 jiffes is more appropriate. After applied current 3 patches, there is a throughput regression. 1.2 M/s - 6 K/s We can only schedule in the end of loop (size == 0), and only for non-smp guest. So smp guest won't be effected. | if (!size num_online_cpus() == 1) | schedule_timeout_interruptible(timeout); Set timeout to 1: non-smp guest with quick backend (1.2M/s) - about 49K/s) Set timeout to 10: non-smp guest with quick backend (1.2M/s) - about 490K/s) We might need other benchmark to test the performance, but we can see the bug clearly caused a regression. As we discussed in other thread, need_resched() should work in this case, so those patches might be wrong fixing. I think we need some benchmarks. -- Michael -- Amos. pgpEioLyquXyk.pgp Description: PGP signature ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v2 1/3] virtio-rng cleanup: move some code out of mutex protection
On Mon, Sep 15, 2014 at 06:13:20PM +0200, Michael Büsch wrote: On Tue, 16 Sep 2014 00:02:27 +0800 Amos Kong ak...@redhat.com wrote: It doesn't save too much cpu time as expected, just a cleanup. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..c591d7e 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -270,8 +270,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); I'm not sure this is safe. Name is just a pointer. What if the hwrng gets unregistered after unlock and just before the snprintf? Oh, it points to protected current_rng-name, I will drop this cleanup. Thanks. return ret; } @@ -284,19 +284,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); return ret; } This looks ok. -- Michael -- Amos. pgpLebQhrkTiH.pgp Description: PGP signature ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
CC linux-kernel Original thread: http://comments.gmane.org/gmane.linux.kernel.virtualization/22775 On Mon, Sep 15, 2014 at 06:48:46PM +0200, Radim Krčmář wrote: 2014-09-14 10:25+0800, Amos Kong: On Sun, Sep 14, 2014 at 09:12:08AM +0800, Amos Kong wrote: ... diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); If cond_resched() does not work, it is a bug elsewehere. Thanks for your reply, Jason also told me the TIF_NEED_RESCHED should be set in this case, then need_resched() returns true. I will investigate the issue and reply you later. Problem only occurred in non-smp guest, we can improve it to: if(!is_smp()) schedule_timeout_interruptible(10); is_smp() is only available for arm arch, we need a general one. (It is num_online_cpus() 1.) -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
On Thu, Sep 11, 2014 at 09:08:03PM +0930, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: When I check hwrng attributes in sysfs, cat process always gets stuck if guest has only 1 vcpu and uses a slow rng backend. Currently we check if there is any tasks waiting to be run on current cpu in rng_dev_read() by need_resched(). But need_resched() doesn't work because rng_dev_read() is executing in user context. I don't understand this explanation? I'd expect the sysfs process to be woken by the mutex_unlock(). But actually sysfs process's not woken always, this is they the process gets stuck. If we're really high priority (vs. the sysfs process) then I can see why we'd need schedule_timeout_interruptible() instead of just schedule(), and in that case, need_resched() would be false too. You could argue that's intended behaviour, but I can't see how it happens in the normal case anyway. What am I missing? Thanks, Rusty. This patch removed need_resched() and increase delay to 10 jiffies, then other tasks can have chance to execute protected code. Delaying 1 jiffy also works, but 10 jiffies is safer. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); if (signal_pending(current)) { err = -ERESTARTSYS; -- 1.9.3 -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
On Sun, Sep 14, 2014 at 01:12:58AM +0800, Amos Kong wrote: On Thu, Sep 11, 2014 at 09:08:03PM +0930, Rusty Russell wrote: Amos Kong ak...@redhat.com writes: When I check hwrng attributes in sysfs, cat process always gets stuck if guest has only 1 vcpu and uses a slow rng backend. Currently we check if there is any tasks waiting to be run on current cpu in rng_dev_read() by need_resched(). But need_resched() doesn't work because rng_dev_read() is executing in user context. I don't understand this explanation? I'd expect the sysfs process to be woken by the mutex_unlock(). But actually sysfs process's not woken always, this is they the process gets stuck. %s/they/why/ Hi Rusty, Reference: http://www.linuxgrill.com/anonymous/fire/netfilter/kernel-hacking-HOWTO-2.html read() syscall of /dev/hwrng will enter into kernel, the read operation is rng_dev_read(), it's userspace context (not interrupt context). Userspace context doesn't allow other user contexts run on that CPU, unless the kernel code sleeps for some reason. In this case, the need_resched() doesn't work. My solution is removing need_resched() and use an appropriate delay by schedule_timeout_interruptible(10). Thanks, Amos If we're really high priority (vs. the sysfs process) then I can see why we'd need schedule_timeout_interruptible() instead of just schedule(), and in that case, need_resched() would be false too. You could argue that's intended behaviour, but I can't see how it happens in the normal case anyway. What am I missing? Thanks, Rusty. This patch removed need_resched() and increase delay to 10 jiffies, then other tasks can have chance to execute protected code. Delaying 1 jiffy also works, but 10 jiffies is safer. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); if (signal_pending(current)) { err = -ERESTARTSYS; -- ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
On Thu, Sep 11, 2014 at 11:38:38AM +0530, Amit Shah wrote: On (Wed) 10 Sep 2014 [17:07:07], Amos Kong wrote: When I check hwrng attributes in sysfs, cat process always gets stuck if guest has only 1 vcpu and uses a slow rng backend. Currently we check if there is any tasks waiting to be run on current cpu in rng_dev_read() by need_resched(). But need_resched() doesn't work because rng_dev_read() is executing in user context. This patch removed need_resched() and increase delay to 10 jiffies, then other tasks can have chance to execute protected code. Delaying 1 jiffy also works, but 10 jiffies is safer. Hi Amit, I'd prefer two patches for this one: one to remove the need_resched() check, and the other to increase the timeout. If Rusty agrees with this fix, I will respin to update the commitlog with clear description and split the patches to 3. Thanks for the review. Anyway, Reviewed-by: Amit Shah amit.s...@redhat.com -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
On Sun, Sep 14, 2014 at 09:12:08AM +0800, Amos Kong wrote: ... diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); Problem only occurred in non-smp guest, we can improve it to: if(!is_smp()) schedule_timeout_interruptible(10); is_smp() is only available for arm arch, we need a general one. if (signal_pending(current)) { err = -ERESTARTSYS; -- -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v3 0/2] virtio-rng: fix hotunplug
RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1121540 When we try to hot-unplugging a busy virtio-rng device, the device can't be removed. And the reading process in guest gets stuck. Those two patches fixed this issue by completing have_data completion and preventing invalid reading. Thanks for the help of Amit. Cc: sta...@vger.kernel.org V2: reset data_avail (Amit) adjust unregister order V3: split patch, update commitlog Amos Kong (2): virtio-rng: fix stuck of hot-unplugging busy device virtio-rng: skip reading when we start to remove the device drivers/char/hw_random/virtio-rng.c | 7 +++ 1 file changed, 7 insertions(+) -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v3 2/2] virtio-rng: skip reading when we start to remove the device
Before we really unregister the hwrng device, reading will get stuck if the virtio device is reset. We should return error for reading when we start to remove the device. Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- drivers/char/hw_random/virtio-rng.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 849b228..132c9cc 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -36,6 +36,7 @@ struct virtrng_info { int index; bool busy; bool hwrng_register_done; + bool hwrng_removed; }; @@ -68,6 +69,9 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) int ret; struct virtrng_info *vi = (struct virtrng_info *)rng-priv; + if (vi-hwrng_removed) + return -ENODEV; + if (!vi-busy) { vi-busy = true; init_completion(vi-have_data); @@ -137,6 +141,7 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-hwrng_removed = true; vi-data_avail = 0; complete(vi-have_data); vdev-config-reset(vdev); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v3 1/2] virtio-rng: fix stuck of hot-unplugging busy device
When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch exits the waiting by completing have_data completion before unregistering, resets data_avail to avoid the hwrng core use wrong buffer bytes. Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- drivers/char/hw_random/virtio-rng.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..849b228 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -137,6 +137,8 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-data_avail = 0; + complete(vi-have_data); vdev-config-reset(vdev); vi-busy = false; if (vi-hwrng_register_done) -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v2] virtio-rng: fix stuck of hot-unplugging busy device
On Wed, Sep 10, 2014 at 11:04:54AM +0530, Amit Shah wrote: Hi Amos, On (Tue) 09 Sep 2014 [19:14:02], Amos Kong wrote: When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch exits the waiting by completing have_data completion before unregistering, resets data_avail to avoid the hwrng core use wrong buffer bytes. Before real unregistering we should return -ENODEV for reading. When we hot-unplug the device, dd process in guest will exit. $ dd if=/dev/hwrng of=/dev/null dd: error reading ‘/dev/hwrng’: No such device Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- V2: reset data_avail (Amit) adjust unregister order Thanks, this looks good. Can you please split into two parts, the complete() in one, and the hwrng_register_done stuff into another? I just posted the V3, split to two patches, and updated the commitlog to describe why we need to return ENODEV for reading. Thanks, Amit -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: RFC virtio-rng: fail to read sysfs of a busy device
On Wed, Sep 10, 2014 at 11:22:12AM +0530, Amit Shah wrote: On (Tue) 09 Sep 2014 [23:23:07], Amos Kong wrote: (Resend to fix the subject) Hi Amit, Rusty RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062 steps: - Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest - check sysfs files in the same time, 'cat /sys/class/misc/hw_random/rng_*' Result: cat process will get stuck, it will return if we kill dd process. How common is it going to be to have a long-running 'dd' process on /dev/hwrng? Not a common usage, but we have this strict testing. Also, with the new khwrng thread, reading from /dev/hwrng isn't required -- just use /dev/random? Yes. (This doesn't mean we shouldn't fix the issue here...) Completely agree :-) We have some static variables (eg, current_rng, data_avail, etc) in hw_random/core.c, they are protected by rng_mutex. I try to workaround this issue by undelay(100) after mutex_unlock() in rng_dev_read(). This gives chance for hwrng_attr_*_show() to get mutex. This patch also contains some cleanup, moving some code out of mutex protection. Do you have some suggestion? Thanks. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..fa69020 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + udelay(100); We have a need_resched() right below. Why doesn't that work? need_resched() is giving chance for userspace to if (need_resched()) It never success in my debugging. If we remove this check and always call schedule_timeout_interruptible(1), problem also disappears. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..263a370 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(1); if (signal_pending(current)) { err = -ERESTARTSYS; schedule_timeout_interruptible(1); @@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct device *dev, int err; struct hwrng *rng; The following hunk doesn't work: + err = -ENODEV; err = mutex_lock_interruptible(rng_mutex); err is being set to another value in the next line! if (err) return -ERESTARTSYS; - err = -ENODEV; And all usage of err below now won't have -ENODEV but some other value. Oops! list_for_each_entry(rng, rng_list, list) { if (strcmp(rng-name, buf) == 0) { if (rng == current_rng) { @@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); This looks OK... return ret; } @@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); But this isn't resulting in savings; the majority of the time is being spent in the for loop, and that writes to the buffer. Right BTW I don't expect strcat'ing to the buf in each of these scenarios is a long operation, so this reworking doesn't strike to me as something we should pursue. Amit -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: RFC virtio-rng: fail to read sysfs of a busy device
On Wed, Sep 10, 2014 at 02:49:38PM +0800, Amos Kong wrote: On Wed, Sep 10, 2014 at 11:22:12AM +0530, Amit Shah wrote: On (Tue) 09 Sep 2014 [23:23:07], Amos Kong wrote: (Resend to fix the subject) Hi Amit, Rusty RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062 steps: - Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest - check sysfs files in the same time, 'cat /sys/class/misc/hw_random/rng_*' Result: cat process will get stuck, it will return if we kill dd process. How common is it going to be to have a long-running 'dd' process on /dev/hwrng? Not a common usage, but we have this strict testing. For -smp 1: It's easy to reproduce with slow backend (/dev/random). cat can return most of time with some delay if we use quick backend (/dev/urandom). But for -smp 2: I didn't touch this problem even with slow backend. Also, with the new khwrng thread, reading from /dev/hwrng isn't required -- just use /dev/random? Yes. (This doesn't mean we shouldn't fix the issue here...) Completely agree :-) We have some static variables (eg, current_rng, data_avail, etc) in hw_random/core.c, they are protected by rng_mutex. I try to workaround this issue by undelay(100) after mutex_unlock() in rng_dev_read(). This gives chance for hwrng_attr_*_show() to get mutex. This patch also contains some cleanup, moving some code out of mutex protection. Do you have some suggestion? Thanks. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..fa69020 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + udelay(100); We have a need_resched() right below. Why doesn't that work? [smp 1] Why need_resched() always return zero? what's the original purpose of it ? if (need_resched()) It never success in my debugging. If we remove this check and always call schedule_timeout_interruptible(1), problem also disappears. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..263a370 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(1); if (signal_pending(current)) { err = -ERESTARTSYS; schedule_timeout_interruptible(1); @@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct device *dev, int err; struct hwrng *rng; The following hunk doesn't work: + err = -ENODEV; err = mutex_lock_interruptible(rng_mutex); err is being set to another value in the next line! if (err) return -ERESTARTSYS; - err = -ENODEV; And all usage of err below now won't have -ENODEV but some other value. Oops! list_for_each_entry(rng, rng_list, list) { if (strcmp(rng-name, buf) == 0) { if (rng == current_rng) { @@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); This looks OK... return ret; } @@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); But this isn't resulting in savings; the majority of the time is being spent in the for loop, and that writes to the buffer. Right BTW I don't expect strcat'ing to the buf in each of these scenarios is a long operation, so this reworking doesn't strike to me as something we should pursue. Amit -- Amos. -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo
[PATCH 1/2] virtio-rng cleanup: move some code out of mutex protection
It doesn't save too much cpu time as expected, just a cleanup. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..c591d7e 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -270,8 +270,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); return ret; } @@ -284,19 +284,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); return ret; } -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes
When I check hwrng attributes in sysfs, cat process always gets stuck if guest has only 1 vcpu and uses a slow rng backend. Currently we check if there is any tasks waiting to be run on current cpu in rng_dev_read() by need_resched(). But need_resched() doesn't work because rng_dev_read() is executing in user context. This patch removed need_resched() and increase delay to 10 jiffies, then other tasks can have chance to execute protected code. Delaying 1 jiffy also works, but 10 jiffies is safer. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index c591d7e..b5d1b6f 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, mutex_unlock(rng_mutex); - if (need_resched()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(10); if (signal_pending(current)) { err = -ERESTARTSYS; -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH 0/2] fix stuck in catting hwrng attributes
If we read hwrng by long-running dd process, it takes too much cpu time. When we check hwrng attributes from sysfs by cat, it gets stuck. The problem can only be reproduced with non-smp guest with slow backend. This patchset changed hwrng core to always delay 10 jiffies, cat process have chance to execute protected code, the problem is resolved. Thanks. Amos Kong (2): virtio-rng cleanup: move some code out of mutex protection virtio-rng: fix stuck in catting hwrng attributes drivers/char/hw_random/core.c | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] virtio-rng: complete have_data completion in removing device
On Mon, Sep 08, 2014 at 11:29:51PM +0800, Amos Kong wrote: On Wed, Aug 06, 2014 at 01:55:29PM +0530, Amit Shah wrote: On (Wed) 06 Aug 2014 [16:05:41], Amos Kong wrote: On Wed, Aug 06, 2014 at 01:35:15AM +0800, Amos Kong wrote: When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch fixed the hang by completing have_data completion before unregistering a virtio-rng device. Hi Amit, Before applying this patch, it's blocking insider wait_for_completion_killable() Applied this patch, wait_for_completion_killable() returns 0, and vi-data_avail becomes 0, then rng_get_date() will return 0. Thanks for checking this. Is it expected result? Hi Amit I think what will happen is vi-data_avail will be set to whatever it was set last. In case of a previous successful read request, the data_avail will be set to whatever number of bytes the host gave. On doing a hot-unplug on the succeeding wait, the value in data_avail will be re-used, and the hwrng core will wrongly take some bytes in the buffer as input from the host. So, I think we need to set vi-data_avail = 0; before calling wait_event_completion_killable(). I finally understand content, we need to set vi-data_avail to 0 before returning virtio_read(), it might enter wait_event_completion_killable() when we try to remove the device. We call complete() in remove_common(), then wait_event_completion_killable() will exit, but virtio_read() will be re-entered if the device still isn't unregistered, then re-stuck inside wait_event_completion_killable(). I tested some complex condition (both quick/slow backend), I found some problem in below two patches. I will post another fix later. The test result is expected. 1. Hotplug remove virtio-rng0, dd process will exit with an error: dd: error reading ‘/dev/hwrng’: Operation not permitted virtio-rng0 disappear from 'info pci' 2. Re-read by dd, hotplug virtio-rng1, dd process exit with same error, virtio-rng1 disappear Thanks, Amos Amit In my latest debugging, I found the hang is caused by unexpected reading when we started to remove the device. I have two draft fix, 1) is skip unexpected reading by checking a remove flag. 2) is unregistering device at the beginning of remove_common(). I think second patch is better if it won't cause new problem. The original patch (complete in remove_common()) is still necessary. Test results: hotplug issue disappeared (dd process will quit). diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..028797c 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -35,6 +35,7 @@ struct virtrng_info { unsigned int data_avail; int index; bool busy; +bool remove; bool hwrng_register_done; }; @@ -68,6 +69,9 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) int ret; struct virtrng_info *vi = (struct virtrng_info *)rng-priv; +if (vi-remove) + return 0; + if (!vi-busy) { vi-busy = true; init_completion(vi-have_data); @@ -137,6 +141,8 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-remove = true; + complete(vi-have_data); vdev-config-reset(vdev); vi-busy = false; if (vi-hwrng_register_done) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..9b8c2ce 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -137,10 +137,11 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; - vdev-config-reset(vdev); - vi-busy = false; if (vi-hwrng_register_done) hwrng_unregister(vi-hwrng); + complete(vi-have_data); + vdev-config-reset(vdev); + vi-busy = false; vdev-config-del_vqs(vdev); ida_simple_remove(rng_index_ida, vi-index); kfree(vi); ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v2] virtio-rng: fix stuck of hot-unplugging busy device
When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch exits the waiting by completing have_data completion before unregistering, resets data_avail to avoid the hwrng core use wrong buffer bytes. Before real unregistering we should return -ENODEV for reading. When we hot-unplug the device, dd process in guest will exit. $ dd if=/dev/hwrng of=/dev/null dd: error reading ‘/dev/hwrng’: No such device Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- V2: reset data_avail (Amit) adjust unregister order --- drivers/char/hw_random/virtio-rng.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..e76433b 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -68,6 +68,10 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) int ret; struct virtrng_info *vi = (struct virtrng_info *)rng-priv; + if (!vi-hwrng_register_done) { + return -ENODEV; + } + if (!vi-busy) { vi-busy = true; init_completion(vi-have_data); @@ -137,10 +141,14 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-data_avail = 0; + complete(vi-have_data); vdev-config-reset(vdev); vi-busy = false; - if (vi-hwrng_register_done) + if (vi-hwrng_register_done) { + vi-hwrng_register_done = false; hwrng_unregister(vi-hwrng); + } vdev-config-del_vqs(vdev); ida_simple_remove(rng_index_ida, vi-index); kfree(vi); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
mutex
Hi Amit, Rusty RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062 steps: - Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest - check sysfs files in the same time, 'cat /sys/class/misc/hw_random/rng_*' Result: cat process will get stuck, it will return if we kill dd process. We have some static variables (eg, current_rng, data_avail, etc) in hw_random/core.c, they are protected by rng_mutex. I try to workaround this issue by undelay(100) after mutex_unlock() in rng_dev_read(). This gives chance for hwrng_attr_*_show() to get mutex. This patch also contains some cleanup, moving some code out of mutex protection. Do you have some suggestion? Thanks. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..fa69020 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + udelay(100); if (need_resched()) schedule_timeout_interruptible(1); @@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct device *dev, int err; struct hwrng *rng; + err = -ENODEV; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - err = -ENODEV; list_for_each_entry(rng, rng_list, list) { if (strcmp(rng-name, buf) == 0) { if (rng == current_rng) { @@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); return ret; } @@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); return ret; } ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
RFC virtio-rng: fail to read sysfs of a busy device
(Resend to fix the subject) Hi Amit, Rusty RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062 steps: - Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest - check sysfs files in the same time, 'cat /sys/class/misc/hw_random/rng_*' Result: cat process will get stuck, it will return if we kill dd process. We have some static variables (eg, current_rng, data_avail, etc) in hw_random/core.c, they are protected by rng_mutex. I try to workaround this issue by undelay(100) after mutex_unlock() in rng_dev_read(). This gives chance for hwrng_attr_*_show() to get mutex. This patch also contains some cleanup, moving some code out of mutex protection. Do you have some suggestion? Thanks. diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index aa30a25..fa69020 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf, } mutex_unlock(rng_mutex); + udelay(100); if (need_resched()) schedule_timeout_interruptible(1); @@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct device *dev, int err; struct hwrng *rng; + err = -ENODEV; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - err = -ENODEV; list_for_each_entry(rng, rng_list, list) { if (strcmp(rng-name, buf) == 0) { if (rng == current_rng) { @@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev, return -ERESTARTSYS; if (current_rng) name = current_rng-name; - ret = snprintf(buf, PAGE_SIZE, %s\n, name); mutex_unlock(rng_mutex); + ret = snprintf(buf, PAGE_SIZE, %s\n, name); return ret; } @@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct device *dev, ssize_t ret = 0; struct hwrng *rng; + buf[0] = '\0'; err = mutex_lock_interruptible(rng_mutex); if (err) return -ERESTARTSYS; - buf[0] = '\0'; list_for_each_entry(rng, rng_list, list) { strncat(buf, rng-name, PAGE_SIZE - ret - 1); ret += strlen(rng-name); strncat(buf, , PAGE_SIZE - ret - 1); ret++; } + mutex_unlock(rng_mutex); strncat(buf, \n, PAGE_SIZE - ret - 1); ret++; - mutex_unlock(rng_mutex); return ret; } -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] virtio-rng: complete have_data completion in removing device
On Wed, Aug 06, 2014 at 01:55:29PM +0530, Amit Shah wrote: On (Wed) 06 Aug 2014 [16:05:41], Amos Kong wrote: On Wed, Aug 06, 2014 at 01:35:15AM +0800, Amos Kong wrote: When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch fixed the hang by completing have_data completion before unregistering a virtio-rng device. Hi Amit, Before applying this patch, it's blocking insider wait_for_completion_killable() Applied this patch, wait_for_completion_killable() returns 0, and vi-data_avail becomes 0, then rng_get_date() will return 0. Thanks for checking this. Is it expected result? I think what will happen is vi-data_avail will be set to whatever it was set last. In case of a previous successful read request, the data_avail will be set to whatever number of bytes the host gave. On doing a hot-unplug on the succeeding wait, the value in data_avail will be re-used, and the hwrng core will wrongly take some bytes in the buffer as input from the host. So, I think we need to set vi-data_avail = 0; before calling wait_event_completion_killable(). Amit In my latest debugging, I found the hang is caused by unexpected reading when we started to remove the device. I have two draft fix, 1) is skip unexpected reading by checking a remove flag. 2) is unregistering device at the beginning of remove_common(). I think second patch is better if it won't cause new problem. The original patch (complete in remove_common()) is still necessary. Test results: hotplug issue disappeared (dd process will quit). diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..028797c 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -35,6 +35,7 @@ struct virtrng_info { unsigned int data_avail; int index; bool busy; +bool remove; bool hwrng_register_done; }; @@ -68,6 +69,9 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) int ret; struct virtrng_info *vi = (struct virtrng_info *)rng-priv; +if (vi-remove) + return 0; + if (!vi-busy) { vi-busy = true; init_completion(vi-have_data); @@ -137,6 +141,8 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; + vi-remove = true; + complete(vi-have_data); vdev-config-reset(vdev); vi-busy = false; if (vi-hwrng_register_done) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2e3139e..9b8c2ce 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -137,10 +137,11 @@ static void remove_common(struct virtio_device *vdev) { struct virtrng_info *vi = vdev-priv; - vdev-config-reset(vdev); - vi-busy = false; if (vi-hwrng_register_done) hwrng_unregister(vi-hwrng); + complete(vi-have_data); + vdev-config-reset(vdev); + vi-busy = false; vdev-config-del_vqs(vdev); ida_simple_remove(rng_index_ida, vi-index); kfree(vi); ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH RFC] virtio-pci: share config interrupt between virtio devices
On Mon, Sep 01, 2014 at 09:37:30AM +0300, Michael S. Tsirkin wrote: Hi Michael, On Mon, Sep 01, 2014 at 01:41:54PM +0800, Amos Kong wrote: One VM only has 128 msix interrupt, virtio-config interrupt has less workload. This patch shares one normal interrupt Thanks for your quick reply. normal == INT#x? Please don't call it normal. The proper name is legacy INT#x. OK So you are trying to use legacy INT#x at the same time with MSI-X? This does not work: the PCI spec says: While enabled for MSI or MSI-X operation, a function is prohibited from using its INTx# pin (if implemented) to request service (MSI, MSI-X, and INTx# are mutually exclusive). It means we can't use interrupt and triggered-interrupt together for one PCI device. I will study this problem. does the patch work for you? If it does it might be a (minor) spec violation in kvm. I did some basic testing (multiple nics, scp, ping, etc), it works. Besides, INT#x really leads to terrible performance because sharing is forced even if there aren't many devices. Why do we need INT#x? How about setting IRQF_SHARED for the config interrupt while using MSI-X? You'd have to read ISR to check that the interrupt was intended for your device. I have a draft patch to share one MSI-X for all virtio-config, it has some problem in hotplugging devices. I will continue this way. for configuration between virtio devices. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/virtio/virtio_pci.c | 41 - 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 3d1463c..b1263b3 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -52,6 +52,7 @@ struct virtio_pci_device /* Name strings for interrupts. This size should be enough, * and I'm too lazy to allocate each name separately. */ char (*msix_names)[256]; + char config_msix_name[256]; /* Number of available vectors */ unsigned msix_vectors; /* Vectors allocated, excluding per-vq vectors if any */ @@ -282,12 +283,6 @@ static void vp_free_vectors(struct virtio_device *vdev) free_cpumask_var(vp_dev-msix_affinity_masks[i]); if (vp_dev-msix_enabled) { - /* Disable the vector used for configuration */ - iowrite16(VIRTIO_MSI_NO_VECTOR, - vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Flush the write out to device */ - ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - pci_disable_msix(vp_dev-pci_dev); vp_dev-msix_enabled = 0; } @@ -339,24 +334,18 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, goto error; vp_dev-msix_enabled = 1; - /* Set the vector used for configuration */ - v = vp_dev-msix_used_vectors; - snprintf(vp_dev-msix_names[v], sizeof *vp_dev-msix_names, + /* Set shared IRQ for configuration */ + snprintf(vp_dev-config_msix_name, sizeof(*vp_dev-msix_names), %s-config, name); - err = request_irq(vp_dev-msix_entries[v].vector, - vp_config_changed, 0, vp_dev-msix_names[v], + err = request_irq(vp_dev-pci_dev-irq, + vp_config_changed, + IRQF_SHARED, + vp_dev-config_msix_name, vp_dev); - if (err) - goto error; - ++vp_dev-msix_used_vectors; - - iowrite16(v, vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Verify we had enough resources to assign the vector */ - v = ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - if (v == VIRTIO_MSI_NO_VECTOR) { - err = -EBUSY; + if (!err) + vp_dev-intx_enabled = 1; + else goto error; - } if (!per_vq_vectors) { /* Shared vector for all VQs */ @@ -535,14 +524,16 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, goto error_request; } else { if (per_vq_vectors) { - /* Best option: one for change interrupt, one per vq. */ - nvectors = 1; + /* Best option: one normal interrupt for change, + one msix per vq. */ + nvectors = 0; for (i = 0; i nvqs; ++i) if (callbacks[i]) ++nvectors; } else { - /* Second best: one for change, shared for all vqs. */ - nvectors = 2; + /* Second best: one normal interrupt for + change, share one msix for all vqs. */ + nvectors = 1
[PATCH RFC] virtio-pci: share config interrupt between virtio devices
One VM only has 128 msix interrupt, virtio-config interrupt has less workload. This patch shares one normal interrupt for configuration between virtio devices. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/virtio/virtio_pci.c | 41 - 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 3d1463c..b1263b3 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -52,6 +52,7 @@ struct virtio_pci_device /* Name strings for interrupts. This size should be enough, * and I'm too lazy to allocate each name separately. */ char (*msix_names)[256]; + char config_msix_name[256]; /* Number of available vectors */ unsigned msix_vectors; /* Vectors allocated, excluding per-vq vectors if any */ @@ -282,12 +283,6 @@ static void vp_free_vectors(struct virtio_device *vdev) free_cpumask_var(vp_dev-msix_affinity_masks[i]); if (vp_dev-msix_enabled) { - /* Disable the vector used for configuration */ - iowrite16(VIRTIO_MSI_NO_VECTOR, - vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Flush the write out to device */ - ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - pci_disable_msix(vp_dev-pci_dev); vp_dev-msix_enabled = 0; } @@ -339,24 +334,18 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, goto error; vp_dev-msix_enabled = 1; - /* Set the vector used for configuration */ - v = vp_dev-msix_used_vectors; - snprintf(vp_dev-msix_names[v], sizeof *vp_dev-msix_names, + /* Set shared IRQ for configuration */ + snprintf(vp_dev-config_msix_name, sizeof(*vp_dev-msix_names), %s-config, name); - err = request_irq(vp_dev-msix_entries[v].vector, - vp_config_changed, 0, vp_dev-msix_names[v], + err = request_irq(vp_dev-pci_dev-irq, + vp_config_changed, + IRQF_SHARED, + vp_dev-config_msix_name, vp_dev); - if (err) - goto error; - ++vp_dev-msix_used_vectors; - - iowrite16(v, vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Verify we had enough resources to assign the vector */ - v = ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - if (v == VIRTIO_MSI_NO_VECTOR) { - err = -EBUSY; + if (!err) + vp_dev-intx_enabled = 1; + else goto error; - } if (!per_vq_vectors) { /* Shared vector for all VQs */ @@ -535,14 +524,16 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, goto error_request; } else { if (per_vq_vectors) { - /* Best option: one for change interrupt, one per vq. */ - nvectors = 1; + /* Best option: one normal interrupt for change, + one msix per vq. */ + nvectors = 0; for (i = 0; i nvqs; ++i) if (callbacks[i]) ++nvectors; } else { - /* Second best: one for change, shared for all vqs. */ - nvectors = 2; + /* Second best: one normal interrupt for + change, share one msix for all vqs. */ + nvectors = 1; } err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] virtio-rng: complete have_data completion in removing device
On Wed, Aug 06, 2014 at 01:35:15AM +0800, Amos Kong wrote: When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch fixed the hang by completing have_data completion before unregistering a virtio-rng device. Hi Amit, Before applying this patch, it's blocking insider wait_for_completion_killable() Applied this patch, wait_for_completion_killable() returns 0, and vi-data_avail becomes 0, then rng_get_date() will return 0. Is it expected result? Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- drivers/char/hw_random/virtio-rng.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 0027137..416b15c 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -137,6 +137,7 @@ static void remove_common(struct virtio_device *vdev) struct virtrng_info *vi = vdev-priv; vdev-config-reset(vdev); + complete(vi-have_data); vi-busy = false; if (vi-hwrng_register_done) hwrng_unregister(vi-hwrng); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: When I boot two virtio-rng devices, guest will hang
3.16 (guest hangs with two rng devices) 3.16 + quick fix (can startup with two rng devices) (hotplug issue 1 + hotplug issue 2 exist) lates torvalds/linux.git + amit 4 patches (can startup with two rng devices) (only hotplug issue 2 exists) However, the 4 patches also fixed the hang issue, the hotplug issue was fixed a little. The hotplug issue is effected by the backend, or maybe it's not a real issue, because the rng device can be hot-removed after dd process is killed. Hotplug issue 1: 1. boot up guest with two rng device (rng0 uses /dev/urandom, rng1 uses /dev/random) 2. read data by dd in guest 3 (option 1). hot-remove rng0, then hot-remove rng1 - result: _only rng1_ can't be removed until dd process is killed 3 (option 2). hot-remove rng1, then hot-remove rng0 - result: two devices can be removed successfully, dd process will exit automatically. If we use /dev/urandom for rng0 and rng1, _rng0 rng1_ can be removed, dd process will exit automatically. Hotplug issue 2: If we use /dev/random for rng0 and rng1, _rng0 rng1_ can't be removed until dd process is killed. Hotplug issue 3: If we use /dev/random for rng0 and rng1, _only rng1_ can't be removed until dd process is killed. (The difference between /dev/random and /dev/urandom is the speed.) Thanks, Amos ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH] virtio-rng: complete have_data completion in removing device
When we try to hot-remove a busy virtio-rng device from QEMU monitor, the device can't be hot-removed. Because virtio-rng driver hangs at wait_for_completion_killable(). This patch fixed the hang by completing have_data completion before unregistering a virtio-rng device. Signed-off-by: Amos Kong ak...@redhat.com Cc: sta...@vger.kernel.org --- drivers/char/hw_random/virtio-rng.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 0027137..416b15c 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -137,6 +137,7 @@ static void remove_common(struct virtio_device *vdev) struct virtrng_info *vi = vdev-priv; vdev-config-reset(vdev); + complete(vi-have_data); vi-busy = false; if (vi-hwrng_register_done) hwrng_unregister(vi-hwrng); -- 1.9.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: When I boot two virtio-rng devices, guest will hang
On Tue, Aug 05, 2014 at 06:28:54PM +0800, Amos Kong wrote: 3.16 (guest hangs with two rng devices) 3.16 + quick fix (can startup with two rng devices) (hotplug issue 1 + hotplug issue 2 exist) lates torvalds/linux.git + amit 4 patches (can startup with two rng devices) (only hotplug issue 2 exists) However, the 4 patches also fixed the hang issue, the hotplug issue was fixed a little. The hotplug issue is effected by the backend, or maybe it's not a real issue, because the rng device can be hot-removed after dd process is killed. Hotplug issue 1: 1. boot up guest with two rng device (rng0 uses /dev/urandom, rng1 uses /dev/random) 2. read data by dd in guest 3 (option 1). hot-remove rng0, then hot-remove rng1 - result: _only rng1_ can't be removed until dd process is killed 3 (option 2). hot-remove rng1, then hot-remove rng0 - result: two devices can be removed successfully, dd process will exit automatically. If we use /dev/urandom for rng0 and rng1, _rng0 rng1_ can be removed, dd process will exit automatically. Hotplug issue 2: If we use /dev/random for rng0 and rng1, _rng0 rng1_ can't be removed until dd process is killed. Hotplug issue 3: If we use /dev/random for rng0 and rng1, _only rng1_ can't be removed until dd process is killed. Hi Amit, I finally found the root problem and posted a fix to upstream: http://lists.linuxfoundation.org/pipermail/virtualization/2014-August/027049.html It can help to fix the hotplug issues on 3.16 latest kernel, so stable kernel is CCed. (The difference between /dev/random and /dev/urandom is the speed.) Thanks, Amos -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: When I boot two virtio-rng devices, guest will hang
On Mon, Jul 28, 2014 at 01:25:14PM +0530, Amit Shah wrote: On (Mon) 28 Jul 2014 [15:32:42], Amos Kong wrote: QEMU commandline: ./x86_64-softmmu/qemu-system-x86_64 --enable-kvm -m 2000 -drive file=/images/nolvm.qcow2 --kernel /home/devel/linux/arch/x86/boot/bzImage -append ro root=/dev/sda1 console=ttyS0,115200 -monitor unix:/tmp/m,nowait,server -device virtio-net-pci,netdev=h0,vectors=17,mq=on,id=n0 -netdev tap,id=h0,queues=8 -device virtio-net-pci,netdev=h1,vectors=0,mq=on,id=n1 -netdev tap,id=h1,queues=8 -vnc :0 -mon chardev=qmp,mode=control,pretty=on -chardev socket,id=qmp,host=localhost,port=1234,server,nowait -serial stdio -object rng-random,filename=/dev/urandom,id=rng0 -device virtio-rng-pci,rng=rng0,id=h0 -object rng-random,filename=/dev/urandom,id=rng1 -device virtio-rng-pci,rng=rng1,id=h1 It works when I only add one virtio-rng device. Did you touch this problem? I'm using latest net-next/master (ac3d2e5a9ef2f4d8f57c50070c4883ecb7cec29f) Hi Amit, snip [0.223503] Non-volatile memory driver v1.3 [1.172293] tsc: Refined TSC clocksource calibration: 2893.436 MHz qemu: terminating on signal 2 -- (I have to cancel QEMU process by Ctrl + C) This looks similar to what I saw when driver asks for randomness from the host before probe is completed. Does the following patch help? This patch was already inclued in latest net-next/master patch commit: e052dbf554610e2104c5a7518c4d8374bed701bb While the driver is setup (DRIVER_OK is set), the vqs aren't marked usable before the probe for the other device finishes as well. That's not a scenario I considered. We can try to put this patch in 3.16, but it won't be applicable for 3.17, where this is handled the proper way. Also, 3.15 isn't affected, since that doesn't have multi-device support. Also attaching a patch that enables traces in qemu so it's easier to see what's going on. diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index e9b15bc..124cac5 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -34,12 +34,11 @@ struct virtrng_info { unsigned int data_avail; struct completion have_data; bool busy; + bool probe_done; char name[25]; int index; }; -static bool probe_done; - static void random_recv_done(struct virtqueue *vq) { struct virtrng_info *vi = vq-vdev-priv; @@ -73,7 +72,7 @@ static int virtio_read(struct hwrng *rng, void *buf, -size_t size, bool wait) * Don't ask host for data till we're setup. This call can * happen during hwrng_register(), after commit d9e7972619. */ - if (unlikely(!probe_done)) + if (unlikely(!vi-probe_done)) return 0; if (!vi-busy) { @@ -146,7 +145,7 @@ static int probe_common(struct virtio_device *vdev) return err; } - probe_done = true; + vi-probe_done = true; return 0; } Amit From ec1aa555b67628beefa0ac6902baa2cc2e156f58 Mon Sep 17 00:00:00 2001 Message-Id: ec1aa555b67628beefa0ac6902baa2cc2e156f58.1406533638.git.amit.s...@redhat.com From: Amit Shah amit.s...@redhat.com Date: Mon, 21 Jul 2014 14:46:28 +0530 Subject: [PATCH 1/1] virtio-rng: add some trace events Add some trace events to virtio-rng for easier debugging Signed-off-by: Amit Shah amit.s...@redhat.com --- hw/virtio/virtio-rng.c | 7 +++ trace-events | 5 + 2 files changed, 12 insertions(+) diff --git a/hw/virtio/virtio-rng.c b/hw/virtio/virtio-rng.c index 7c5a675..4a6472b 100644 --- a/hw/virtio/virtio-rng.c +++ b/hw/virtio/virtio-rng.c @@ -16,6 +16,7 @@ #include hw/virtio/virtio-rng.h #include sysemu/rng.h #include qom/object_interfaces.h +#include trace.h static bool is_guest_ready(VirtIORNG *vrng) { @@ -24,6 +25,7 @@ static bool is_guest_ready(VirtIORNG *vrng) (vdev-status VIRTIO_CONFIG_S_DRIVER_OK)) { return true; } +trace_virtio_rng_guest_not_ready(vrng); return false; } @@ -62,6 +64,7 @@ static void chr_read(void *opaque, const void *buf, size_t size) offset += len; virtqueue_push(vrng-vq, elem, len); +trace_virtio_rng_pushed(vrng, len); } virtio_notify(vdev, vrng-vq); } @@ -81,7 +84,11 @@ static void virtio_rng_process(VirtIORNG *vrng) quota = MIN((uint64_t)vrng-quota_remaining, (uint64_t)UINT32_MAX); } size = get_request_size(vrng-vq, quota); + +trace_virtio_rng_request(vrng, size, quota); + size = MIN(vrng-quota_remaining, size); + if (size) { rng_backend_request_entropy(vrng-rng, size, chr_read, vrng); } diff --git a/trace-events b/trace-events index 11a17a8..99f39ac 100644 --- a/trace-events +++ b/trace-events @@ -41,6
Re: When I boot two virtio-rng devices, guest will hang
On Mon, Jul 28, 2014 at 02:42:13PM +0530, Amit Shah wrote: On (Mon) 28 Jul 2014 [16:49:20], Amos Kong wrote: On Mon, Jul 28, 2014 at 01:25:14PM +0530, Amit Shah wrote: On (Mon) 28 Jul 2014 [15:32:42], Amos Kong wrote: QEMU commandline: ./x86_64-softmmu/qemu-system-x86_64 --enable-kvm -m 2000 -drive file=/images/nolvm.qcow2 --kernel /home/devel/linux/arch/x86/boot/bzImage -append ro root=/dev/sda1 console=ttyS0,115200 -monitor unix:/tmp/m,nowait,server -device virtio-net-pci,netdev=h0,vectors=17,mq=on,id=n0 -netdev tap,id=h0,queues=8 -device virtio-net-pci,netdev=h1,vectors=0,mq=on,id=n1 -netdev tap,id=h1,queues=8 -vnc :0 -mon chardev=qmp,mode=control,pretty=on -chardev socket,id=qmp,host=localhost,port=1234,server,nowait -serial stdio -object rng-random,filename=/dev/urandom,id=rng0 -device virtio-rng-pci,rng=rng0,id=h0 -object rng-random,filename=/dev/urandom,id=rng1 -device virtio-rng-pci,rng=rng1,id=h1 It works when I only add one virtio-rng device. Did you touch this problem? I'm using latest net-next/master (ac3d2e5a9ef2f4d8f57c50070c4883ecb7cec29f) Hi Amit, snip [0.223503] Non-volatile memory driver v1.3 [1.172293] tsc: Refined TSC clocksource calibration: 2893.436 MHz qemu: terminating on signal 2 -- (I have to cancel QEMU process by Ctrl + C) This looks similar to what I saw when driver asks for randomness from the host before probe is completed. Does the following patch help? This patch was already inclued in latest net-next/master patch commit: e052dbf554610e2104c5a7518c4d8374bed701bb No, it's a different one, goes on top of the commit you referenced. Thanks. This patch fixed the problem. -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH] virtio-rng: support multiple virtio-rng devices
Current hwrng core supports to register multiple hwrng devices, and there is only one device really works in the same time. QEMU alsu supports to have multiple virtio-rng backends. This patch changes virtio-rng driver to support multiple virtio-rng devices. ]# cat /sys/class/misc/hw_random/rng_available virtio_rng.0 virtio_rng.1 ]# cat /sys/class/misc/hw_random/rng_current virtio_rng.0 ]# echo -n virtio_rng.1 /sys/class/misc/hw_random/rng_current ]# dd if=/dev/hwrng of=/dev/null Signed-off-by: Amos Kong ak...@redhat.com --- drivers/char/hw_random/virtio-rng.c | 102 ++-- include/linux/hw_random.h | 2 +- 2 files changed, 64 insertions(+), 40 deletions(-) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 2ce0e22..12e242b 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -25,88 +25,108 @@ #include linux/virtio_rng.h #include linux/module.h -static struct virtqueue *vq; -static unsigned int data_avail; -static DECLARE_COMPLETION(have_data); -static bool busy; + +struct virtrng_info { + struct virtio_device *vdev; + struct hwrng hwrng; + struct virtqueue *vq; + unsigned int data_avail; + struct completion have_data; + bool busy; +}; static void random_recv_done(struct virtqueue *vq) { + struct virtrng_info *vi = vq-vdev-priv; + /* We can get spurious callbacks, e.g. shared IRQs + virtio_pci. */ - if (!virtqueue_get_buf(vq, data_avail)) + if (!virtqueue_get_buf(vi-vq, vi-data_avail)) return; - complete(have_data); + complete(vi-have_data); } /* The host will fill any buffer we give it with sweet, sweet randomness. */ -static void register_buffer(u8 *buf, size_t size) +static void register_buffer(struct virtrng_info *vi, u8 *buf, size_t size) { struct scatterlist sg; sg_init_one(sg, buf, size); /* There should always be room for one buffer. */ - virtqueue_add_inbuf(vq, sg, 1, buf, GFP_KERNEL); + virtqueue_add_inbuf(vi-vq, sg, 1, buf, GFP_KERNEL); - virtqueue_kick(vq); + virtqueue_kick(vi-vq); } static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait) { int ret; + struct virtrng_info *vi = (struct virtrng_info *)rng-priv; - if (!busy) { - busy = true; - init_completion(have_data); - register_buffer(buf, size); + if (!vi-busy) { + vi-busy = true; + init_completion(vi-have_data); + register_buffer(vi, buf, size); } if (!wait) return 0; - ret = wait_for_completion_killable(have_data); + ret = wait_for_completion_killable(vi-have_data); if (ret 0) return ret; - busy = false; + vi-busy = false; - return data_avail; + return vi-data_avail; } static void virtio_cleanup(struct hwrng *rng) { - if (busy) - wait_for_completion(have_data); -} - + struct virtrng_info *vi = (struct virtrng_info *)rng-priv; -static struct hwrng virtio_hwrng = { - .name = virtio, - .cleanup= virtio_cleanup, - .read = virtio_read, -}; + if (vi-busy) + wait_for_completion(vi-have_data); +} static int probe_common(struct virtio_device *vdev) { - int err; + int err, i; + struct virtrng_info *vi = NULL; + + vi = kmalloc(sizeof(struct virtrng_info), GFP_KERNEL); + vi-hwrng.name = kmalloc(40, GFP_KERNEL); + init_completion(vi-have_data); + + vi-hwrng.read = virtio_read; + vi-hwrng.cleanup = virtio_cleanup; + vi-hwrng.priv = (unsigned long)vi; + vdev-priv = vi; - if (vq) { - /* We only support one device for now */ - return -EBUSY; - } /* We expect a single virtqueue. */ - vq = virtio_find_single_vq(vdev, random_recv_done, input); - if (IS_ERR(vq)) { - err = PTR_ERR(vq); - vq = NULL; + vi-vq = virtio_find_single_vq(vdev, random_recv_done, input); + if (IS_ERR(vi-vq)) { + err = PTR_ERR(vi-vq); + kfree(vi-hwrng.name); + vi-vq = NULL; + kfree(vi); + vi = NULL; return err; } - err = hwrng_register(virtio_hwrng); + i = 0; + do { + sprintf(vi-hwrng.name, virtio_rng.%d, i++); + err = hwrng_register(vi-hwrng); + } while (err == -EEXIST); + if (err) { vdev-config-del_vqs(vdev); - vq = NULL; + kfree(vi-hwrng.name); + vi-vq = NULL; + kfree(vi); + vi = NULL; return err; } @@ -115,11 +135,15 @@ static int probe_common
Re: RFC: sharing config interrupt between virtio devices for saving MSI
On Sat, Apr 19, 2014 at 12:08:15PM +0800, Amos Kong wrote: Hi all, I'm working on this item in Upstream Networking todolist: | - Sharing config interrupts |Support more devices by sharing a single msi vector |between multiple virtio devices. |(Applies to virtio-blk too). I have this solution here, and only have draft patches of Solution 1 2, let's discuss if solution 3 is feasible. * We should not introduce perf regression in this change * little effect is acceptable because we are _sharing_ interrupt Solution 1: == share one LSI interrupt for configuration change of all virtio devices. Draft patch: share_lsi_for_all_config.patch Solution 2: == share one MSI interrupt for configuration change of all virtio devices. Draft patch: share_msix_for_all_config.patch Solution 3: == dynamic adjust interrupt policy when device is added or removed Current: --- - try to allocate a MSI to device's config - try to allocate a MSI for each vq - share one MSI for all vqs of one device - share one LSI for config vqs of one device additional dynamic policies: --- - if there is no enough MSI, adjust interrupt allocation for returning some MSI - try to find a device that allocated one MSI for each vq, change it to share one MSI for saving the MSI - try to share one MSI for config of all virtio_pci devices - try to share one LSI for config of all devices - if device is removed, try to re-allocate freed MSI to existed devices, if they aren't in best status (one MSI for config, one MSI for each vq) - if config of all devices is sharing one LSI, try to upgrade it to MSI - if config of all devices is sharing one MSI, try to allocate one MSI for configuration change of each device - try to find a device that is sharing one MSI for all vqs, try to allocate one MSI for each vq BTW, I saw we still notify all vqs even VIRTIO_PCI_ISR_CONFIG bit of isr is set, is it necessary? [Reply myself] Quote from Virtio-spec: | 2.4.3 Dealing With Configuration Changes | Some virtio PCI devices can change the device configuration state, as reflected | in the virtio header in the PCI configuration space. In this case: | | 1. If MSI-X capability is disabled: an interrupt is delivered and the sec- | ond highest bit is set in the ISR Status field to indicate that the driver | should re-examine the configuration space. | Note that a single interrupt can | indicate both that one or more virtqueue has been used and that the con- | figuration space has changed: even if the config bit is set, virtqueues must | be scanned. It seems current code is fine. diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 101db3f..176aabc 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -259,9 +259,9 @@ static irqreturn_t vp_interrupt(int irq, void *opaque) /* Configuration change? Tell driver if it wants to know. */ if (isr VIRTIO_PCI_ISR_CONFIG) - vp_config_changed(irq, opaque); - - return vp_vring_interrupt(irq, opaque); + return vp_config_changed(irq, opaque); + else + return vp_vring_interrupt(irq, opaque); } static void vp_free_vectors(struct virtio_device *vdev) -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH] virtio_net: return error when there is no virtqueue or MQ isn't support
Currently ethtool returns zero if there is no virtqueue or MQ isn't support, we should return -ENOTSUPP to notice user. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 8a852b5..eaf8266 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1053,7 +1053,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) struct net_device *dev = vi-dev; if (!vi-has_cvq || !virtio_has_feature(vi-vdev, VIRTIO_NET_F_MQ)) - return 0; + return -ENOTSUPP; s.virtqueue_pairs = queue_pairs; sg_init_one(sg, s, sizeof(s)); -- 1.9.0 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] virtio_net: return error when there is no virtqueue or MQ isn't support
On Tue, Apr 22, 2014 at 11:11:50AM +0800, Jason Wang wrote: On 04/21/2014 11:11 PM, Amos Kong wrote: Currently ethtool returns zero if there is no virtqueue or MQ isn't support, we should return -ENOTSUPP to notice user. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 8a852b5..eaf8266 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1053,7 +1053,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) struct net_device *dev = vi-dev; if (!vi-has_cvq || !virtio_has_feature(vi-vdev, VIRTIO_NET_F_MQ)) - return 0; + return -ENOTSUPP; s.virtqueue_pairs = queue_pairs; sg_init_one(sg, s, sizeof(s)); How about check the return value of virtnet_set_queues() in virtnet_restore() also? If we migrate a guest from MQ supported side to MQ non-supported side, virtnet_restore() will not return error right now. If we return error when no vq or no mq support, and check return value of virtnet_set_queues() in virtnet_restore(), migration will fail. Is it expected? -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] virtio_net: return error when there is no virtqueue or MQ isn't support
On Tue, Apr 22, 2014 at 11:53:34AM +0800, Jason Wang wrote: On 04/22/2014 11:23 AM, Amos Kong wrote: On Tue, Apr 22, 2014 at 11:11:50AM +0800, Jason Wang wrote: On 04/21/2014 11:11 PM, Amos Kong wrote: Currently ethtool returns zero if there is no virtqueue or MQ isn't support, we should return -ENOTSUPP to notice user. Signed-off-by: Amos Kong ak...@redhat.com NAK this patch. --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 8a852b5..eaf8266 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1053,7 +1053,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) struct net_device *dev = vi-dev; if (!vi-has_cvq || !virtio_has_feature(vi-vdev, VIRTIO_NET_F_MQ)) - return 0; + return -ENOTSUPP; s.virtqueue_pairs = queue_pairs; sg_init_one(sg, s, sizeof(s)); How about check the return value of virtnet_set_queues() in virtnet_restore() also? If we migrate a guest from MQ supported side to MQ non-supported side, virtnet_restore() will not return error right now. If we return error when no vq or no mq support, and check return value of virtnet_set_queues() in virtnet_restore(), migration will fail. Is it expected? I think what you mean is restore not migration here. The point is not to execute VIRTIO_NET_CTRL_MQ when multiqueue is not supported, so it's the caller responsibility to pass a valid queue_pairs to virtnet_set_queues(). This patch is not necessary, your last patch is fine enough to solve the issue. Do you agree? Yes. If mq is supported, - change queue to 0: virtnet_send_command() will return error - change queue to = 1: success If mq isn't support, current queue is 1. - change queue to 1: return 0 at mq checking point, nothing is changed, expected. * change queue to 0: we should return -INVAL (my first patch checked this in virtnet_set_channels()) * change queue to 1: we should return -INVAL (we already checked 'queue_pairs vi-max_queue_pairs' in virtnet_set_channels()) The * items are lack in current virtnet_send_command(), but can already checked it in virtnet_set_channels(), it's enough. So we can just apply my first patch [1]. [1] http://www.spinics.net/lists/netdev/msg279631.html Thanks Thanks. BTW, -ENOTSUPP isn't an invalid ERROR for virtio-net. /* Defined for the NFSv3 protocol */ #define ENOTSUPP524 /* Operation is not supported */ -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH] virtio_net: zero is an invald queue_pairs number
Execute ethtool -L eth0 combined 0 in guest, if multiqueue is enabled, virtnet_send_command() will return -EINVAL error, there is a validation in QEMU. But if multiqueue is disabled, virtnet_set_queues() will just return zero (success). We should return error for this situation. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 7b68746..8a852b5 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1285,7 +1285,7 @@ static int virtnet_set_channels(struct net_device *dev, if (channels-rx_count || channels-tx_count || channels-other_count) return -EINVAL; - if (queue_pairs vi-max_queue_pairs) + if (queue_pairs vi-max_queue_pairs || queue_pairs == 0) return -EINVAL; get_online_cpus(); -- 1.9.0 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [Qemu-devel] [RFC qemu PATCH] only writing out the last byte of MAC makes it have effect
On Mon, Mar 25, 2013 at 08:58:49AM +0200, Michael S. Tsirkin wrote: On Mon, Mar 25, 2013 at 10:23:57AM +0800, Amos Kong wrote: On Fri, Mar 22, 2013 at 10:45:09AM +1030, Rusty Russell wrote: Michael S. Tsirkin m...@redhat.com writes: On Thu, Mar 21, 2013 at 02:44:50PM +0800, Amos Kong wrote: The lengcy guests don't have mac programming command, we don't know when it's safe to use MAC. This patch changed qemu to makes MAC change effect when the last byte of MAC is written to config space. MAC address takes first 6 bytes of config space of virtio-net, the addr is 5 when the last byte is written in virtio_config_writeb(). MAC change will effect when n-mac is updated in virtio_net_set_config(). Signed-off-by: Amos Kong ak...@redhat.com Let's see what Rusty says about the spec change. Implementation notes like this belong as a footnote, eg: For older systems, it is recommended and typical that the device write byte 5 of the mac address last, so devices can use that as a trigger to commit the mac address change. Now, is this a real, or theoretical issue? Have we seen this problem in practice, or should we continue to ignore it? Hi Rusty, Michael I didn't touch any problem. MST, and you? In Linux guest, we should disable the interface before changing mac address. ifconfig eth0 down ifconfig eth0 hw ether 10:12:13:14:15:16 ifconfig eth0 up In Windows 7 guest, after changing mac address in register table, re-enabling interface to make it effect. reg add HKLM SYSTEM\CurrentControlSet\Control\Class\{4D36E972-E325-11CE-BFC1-08002BE10318}\0001] /v NetworkAddress /d 0123456789AB netsh interface set interface Local Area Connection DISABLED netsh interface set interface Local Area Connection ENABLED So when we change the mac address, guest os always disable interface to receive all packages. It seems the theoretical issue doesn't exist? Cheers, Rusty. -- Amos. Nope. Looks like no spec change is necessary. We already say it's not atomic and it looks like guests expect exactly that and disable link to prevent strange issues. I just found Jiri has a commit to allow changing mac when iface is running. It seems only the virt nics are allowned, right? But the leagcy guests don't allow to change mac when interface is running, so no spec change is necessary. [upstream kernel] commit d4fc6918f4b3cc844185f59fc518351525950449 Author: Jiri Pirko jpi...@redhat.com Date: Wed Jun 27 05:27:46 2012 + virtio_net: allow to change mac when iface is running Signed-off-by: Jiri Pirko jpi...@redhat.com Signed-off-by: David S. Miller da...@davemloft.net RH-BZ: https://bugzilla.redhat.com/show_bug.cgi?id=882868 -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [Qemu-devel] [RFC qemu PATCH] only writing out the last byte of MAC makes it have effect
On Fri, Mar 22, 2013 at 10:45:09AM +1030, Rusty Russell wrote: Michael S. Tsirkin m...@redhat.com writes: On Thu, Mar 21, 2013 at 02:44:50PM +0800, Amos Kong wrote: The lengcy guests don't have mac programming command, we don't know when it's safe to use MAC. This patch changed qemu to makes MAC change effect when the last byte of MAC is written to config space. MAC address takes first 6 bytes of config space of virtio-net, the addr is 5 when the last byte is written in virtio_config_writeb(). MAC change will effect when n-mac is updated in virtio_net_set_config(). Signed-off-by: Amos Kong ak...@redhat.com Let's see what Rusty says about the spec change. Implementation notes like this belong as a footnote, eg: For older systems, it is recommended and typical that the device write byte 5 of the mac address last, so devices can use that as a trigger to commit the mac address change. Now, is this a real, or theoretical issue? Have we seen this problem in practice, or should we continue to ignore it? Hi Rusty, Michael I didn't touch any problem. MST, and you? In Linux guest, we should disable the interface before changing mac address. ifconfig eth0 down ifconfig eth0 hw ether 10:12:13:14:15:16 ifconfig eth0 up In Windows 7 guest, after changing mac address in register table, re-enabling interface to make it effect. reg add HKLM SYSTEM\CurrentControlSet\Control\Class\{4D36E972-E325-11CE-BFC1-08002BE10318}\0001] /v NetworkAddress /d 0123456789AB netsh interface set interface Local Area Connection DISABLED netsh interface set interface Local Area Connection ENABLED So when we change the mac address, guest os always disable interface to receive all packages. It seems the theoretical issue doesn't exist? Cheers, Rusty. -- Amos. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[RFC qemu PATCH] only writing out the last byte of MAC makes it have effect
The lengcy guests don't have mac programming command, we don't know when it's safe to use MAC. This patch changed qemu to makes MAC change effect when the last byte of MAC is written to config space. MAC address takes first 6 bytes of config space of virtio-net, the addr is 5 when the last byte is written in virtio_config_writeb(). MAC change will effect when n-mac is updated in virtio_net_set_config(). Signed-off-by: Amos Kong ak...@redhat.com --- hw/virtio.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/hw/virtio.c b/hw/virtio.c index 26fbc79..2e08302 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -605,8 +605,9 @@ void virtio_config_writeb(VirtIODevice *vdev, uint32_t addr, uint32_t data) stb_p(vdev-config + addr, val); -if (vdev-set_config) +if (vdev-set_config (addr == 5 || strcmp(vdev-name, virtio-net))) { vdev-set_config(vdev, vdev-config); +} } void virtio_config_writew(VirtIODevice *vdev, uint32_t addr, uint32_t data) -- 1.7.1 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[RFC virt-spec PATCH] only writing out the last byte of MAC makes it have effect
The lengcy guests don't have mac programming command, we don't know when it's safe to use MAC. We can change QEMU to make MAC change effect when the last byte of MAC is written to config space. Signed-off-by: Amos Kong ak...@redhat.com --- virtio-spec.lyx | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/virtio-spec.lyx b/virtio-spec.lyx index dbc4ef0..bb289fb 100644 --- a/virtio-spec.lyx +++ b/virtio-spec.lyx @@ -5430,7 +5430,7 @@ T_CTRL_MAC_TABLE_SET. \begin_layout Standard -\change_inserted -1930653948 1358506710 +\change_inserted -1930653948 1363832689 The config space \begin_inset Quotes eld \end_inset @@ -5464,6 +5464,15 @@ mac Therefore, VIRTIO_NET_CTRL_MAC_ADDR_SET is preferred, especially while the NIC is up. The command-specific-data is a 6-byte MAC address. +\end_layout + +\begin_layout Standard + +\change_inserted -1930653948 1363833477 +The legacy guests don't support the new command, they still change MAC address + in original way, that's not atomic. + For more robust, QEMU only makes the MAC change effect when the last byte + of MAC address is written to config space. \change_unchanged \end_layout -- 1.8.1.4 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [QEMU PATCH v4 2/3] virtio-net: introduce a new macaddr control
On Mon, Jan 21, 2013 at 05:08:26PM +0100, Stefan Hajnoczi wrote: On Sat, Jan 19, 2013 at 09:54:27AM +0800, ak...@redhat.com wrote: @@ -350,6 +351,18 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, struct virtio_net_ctrl_mac mac_data; size_t s; +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) { +if (iov_size(iov, iov_cnt) != ETH_ALEN) { +return VIRTIO_NET_ERR; +} +s = iov_to_buf(iov, iov_cnt, 0, n-mac, sizeof(n-mac)); +if (s != sizeof(n-mac)) { +return VIRTIO_NET_ERR; +} Since iov_size() was checked before iov_to_buf(), we never hit this error. And if we did n-mac would be trashed (i.e. error handling is not complete). You are right. iov_size() computes the size by accounting iov[].iov_lens, the first check is enough. I think assert(s == sizeof(n-mac)) is more appropriate appropriate. Also, please change ETH_ALEN to sizeof(n-mac) to make the relationship between the check and the copy clear. Will update this patch. if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) { if (iov_size(iov, iov_cnt) != sizeof(n-mac)) { return VIRTIO_NET_ERR; } s = iov_to_buf(iov, iov_cnt, 0, n-mac, sizeof(n-mac)); assert(s == sizeof(n-mac)); qemu_format_nic_info_str(n-nic-nc, n-mac); return VIRTIO_NET_OK; } Stefan ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [QEMU PATCH v4 1/3] virtio-net: remove layout assumptions for ctrl vq
On Mon, Jan 21, 2013 at 05:03:30PM +0100, Stefan Hajnoczi wrote: On Sat, Jan 19, 2013 at 09:54:26AM +0800, ak...@redhat.com wrote: From: Michael S. Tsirkin m...@redhat.com Virtio-net code makes assumption about virtqueue descriptor layout (e.g. sg[0] is the header, sg[1] is the data buffer). This patch makes code not rely on the layout of descriptors. Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Amos Kong ak...@redhat.com --- hw/virtio-net.c | 128 1 file changed, 74 insertions(+), 54 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 3bb01b1..113e194 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -315,44 +315,44 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) } static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd, - VirtQueueElement *elem) + struct iovec *iov, unsigned int iov_cnt) { uint8_t on; +size_t s; -if (elem-out_num != 2 || elem-out_sg[1].iov_len != sizeof(on)) { -error_report(virtio-net ctrl invalid rx mode command); -exit(1); +s = iov_to_buf(iov, iov_cnt, 0, on, sizeof(on)); +if (s != sizeof(on)) { +return VIRTIO_NET_ERR; } -on = ldub_p(elem-out_sg[1].iov_base); - -if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) +if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) { n-promisc = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) { n-allmulti = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) { n-alluni = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) { n-nomulti = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) { n-nouni = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) { n-nobcast = on; -else +} else { return VIRTIO_NET_ERR; +} return VIRTIO_NET_OK; } static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, - VirtQueueElement *elem) + struct iovec *iov, unsigned int iov_cnt) { struct virtio_net_ctrl_mac mac_data; +size_t s; -if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET || elem-out_num != 3 || -elem-out_sg[1].iov_len sizeof(mac_data) || -elem-out_sg[2].iov_len sizeof(mac_data)) +if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) { return VIRTIO_NET_ERR; +} n-mac_table.in_use = 0; n-mac_table.first_multi = 0; @@ -360,54 +360,71 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, n-mac_table.multi_overflow = 0; memset(n-mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN); -mac_data.entries = ldl_p(elem-out_sg[1].iov_base); +s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries, + sizeof(mac_data.entries)); Hi Stefan, can we adjust the endianness after each iov_to_buf() copy? diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 72d7857..0088d6c 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -321,6 +321,7 @@ static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd, size_t s; s = iov_to_buf(iov, iov_cnt, 0, on, sizeof(on)); +on = ldub_p(on); if (s != sizeof(on)) { return VIRTIO_NET_ERR; } @@ -362,7 +363,7 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries, sizeof(mac_data.entries)); - +mac_data.entries = ldl_p(mac_data.entries); if (s != sizeof(mac_data.entries)) { return VIRTIO_NET_ERR; } @@ -389,7 +390,7 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries, sizeof(mac_data.entries)); - +mac_data.entries = ldl_p(mac_data.entries); if (s != sizeof(mac_data.entries)) { return VIRTIO_NET_ERR; } @@ -421,6 +422,7 @@ static int virtio_net_handle_vlan_table(VirtIONet *n, uint8_t cmd, size_t s; s = iov_to_buf(iov, iov_cnt, 0, vid, sizeof(vid)); +vid = lduw_p(vid); if (s != sizeof(vid)) { return VIRTIO_NET_ERR; } @@ -458,6 +460,8 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) iov = elem.out_sg; iov_cnt = elem.out_num; s = iov_to_buf(iov, iov_cnt, 0, ctrl, sizeof(ctrl)); +ctrl.class = ldub_p(ctrl.class); +ctrl.cmd = ldub_p
[QEMU PATCH v5 0/3] virtio-net: fix of ctrl commands
Currently virtio-net code relys on the layout of descriptor, this patchset removed the assumptions and introduced a control command to set mac address. Last patch is a trivial renaming. V2: check guest's iov_len V3: fix of migration compatibility make mac field in config space read-only when new feature is acked V4: add fix of descriptor layout assumptions, trivial rename V5: fix endianness after iov_to_buf copy Amos Kong (2): virtio-net: introduce a new macaddr control virtio-net: rename ctrl rx commands Michael S. Tsirkin (1): virtio-net: remove layout assumptions for ctrl vq hw/pc_piix.c|4 ++ hw/virtio-net.c | 142 +- hw/virtio-net.h | 26 +++ 3 files changed, 108 insertions(+), 64 deletions(-) ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[QEMU PATCH v5 1/3] virtio-net: remove layout assumptions for ctrl vq
From: Michael S. Tsirkin m...@redhat.com Virtio-net code makes assumption about virtqueue descriptor layout (e.g. sg[0] is the header, sg[1] is the data buffer). This patch makes code not rely on the layout of descriptors. Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Amos Kong ak...@redhat.com --- hw/virtio-net.c | 129 --- 1 files changed, 75 insertions(+), 54 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 3bb01b1..af1f3a1 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -315,44 +315,44 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) } static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd, - VirtQueueElement *elem) + struct iovec *iov, unsigned int iov_cnt) { uint8_t on; +size_t s; -if (elem-out_num != 2 || elem-out_sg[1].iov_len != sizeof(on)) { -error_report(virtio-net ctrl invalid rx mode command); -exit(1); +s = iov_to_buf(iov, iov_cnt, 0, on, sizeof(on)); +if (s != sizeof(on)) { +return VIRTIO_NET_ERR; } -on = ldub_p(elem-out_sg[1].iov_base); - -if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) +if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) { n-promisc = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) { n-allmulti = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) { n-alluni = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) { n-nomulti = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) { n-nouni = on; -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) { n-nobcast = on; -else +} else { return VIRTIO_NET_ERR; +} return VIRTIO_NET_OK; } static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, - VirtQueueElement *elem) + struct iovec *iov, unsigned int iov_cnt) { struct virtio_net_ctrl_mac mac_data; +size_t s; -if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET || elem-out_num != 3 || -elem-out_sg[1].iov_len sizeof(mac_data) || -elem-out_sg[2].iov_len sizeof(mac_data)) +if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) { return VIRTIO_NET_ERR; +} n-mac_table.in_use = 0; n-mac_table.first_multi = 0; @@ -360,54 +360,72 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, n-mac_table.multi_overflow = 0; memset(n-mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN); -mac_data.entries = ldl_p(elem-out_sg[1].iov_base); +s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries, + sizeof(mac_data.entries)); +mac_data.entries = ldl_p(mac_data.entries); +if (s != sizeof(mac_data.entries)) { +return VIRTIO_NET_ERR; +} +iov_discard_front(iov, iov_cnt, s); -if (sizeof(mac_data.entries) + -(mac_data.entries * ETH_ALEN) elem-out_sg[1].iov_len) +if (mac_data.entries * ETH_ALEN iov_size(iov, iov_cnt)) { return VIRTIO_NET_ERR; +} if (mac_data.entries = MAC_TABLE_ENTRIES) { -memcpy(n-mac_table.macs, elem-out_sg[1].iov_base + sizeof(mac_data), - mac_data.entries * ETH_ALEN); +s = iov_to_buf(iov, iov_cnt, 0, n-mac_table.macs, + mac_data.entries * ETH_ALEN); +if (s != mac_data.entries * ETH_ALEN) { +return VIRTIO_NET_ERR; +} n-mac_table.in_use += mac_data.entries; } else { n-mac_table.uni_overflow = 1; } +iov_discard_front(iov, iov_cnt, mac_data.entries * ETH_ALEN); + n-mac_table.first_multi = n-mac_table.in_use; -mac_data.entries = ldl_p(elem-out_sg[2].iov_base); +s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries, + sizeof(mac_data.entries)); +mac_data.entries = ldl_p(mac_data.entries); +if (s != sizeof(mac_data.entries)) { +return VIRTIO_NET_ERR; +} + +iov_discard_front(iov, iov_cnt, s); -if (sizeof(mac_data.entries) + -(mac_data.entries * ETH_ALEN) elem-out_sg[2].iov_len) +if (mac_data.entries * ETH_ALEN != iov_size(iov, iov_cnt)) { return VIRTIO_NET_ERR; +} -if (mac_data.entries) { -if (n-mac_table.in_use + mac_data.entries = MAC_TABLE_ENTRIES) { -memcpy(n-mac_table.macs + (n-mac_table.in_use * ETH_ALEN), - elem-out_sg[2].iov_base + sizeof(mac_data), - mac_data.entries * ETH_ALEN); -n-mac_table.in_use += mac_data.entries; -} else { -n
[QEMU PATCH v5 2/3] virtio-net: introduce a new macaddr control
In virtio-net guest driver, currently we write MAC address to pci config space byte by byte, this means that we have an intermediate step where mac is wrong. This patch introduced a new control command to set MAC address, it's atomic. VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility. mac field will be set to read-only when VIRTIO_NET_F_CTRL_MAC_ADDR is acked. Signed-off-by: Amos Kong ak...@redhat.com --- hw/pc_piix.c|4 hw/virtio-net.c | 13 - hw/virtio-net.h | 12 ++-- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/hw/pc_piix.c b/hw/pc_piix.c index 0a6923d..6218350 100644 --- a/hw/pc_piix.c +++ b/hw/pc_piix.c @@ -297,6 +297,10 @@ static QEMUMachine pc_i440fx_machine_v1_4 = { .driver = usb-tablet,\ .property = usb_version,\ .value= stringify(1),\ +},{\ +.driver = virtio-net-pci,\ +.property = ctrl_mac_addr,\ +.value= off, \ } static QEMUMachine pc_machine_v1_3 = { diff --git a/hw/virtio-net.c b/hw/virtio-net.c index af1f3a1..acef5a5 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -93,7 +93,8 @@ static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config) memcpy(netcfg, config, sizeof(netcfg)); -if (memcmp(netcfg.mac, n-mac, ETH_ALEN)) { +if (!(n-vdev.guest_features VIRTIO_NET_F_CTRL_MAC_ADDR 1) +memcmp(netcfg.mac, n-mac, ETH_ALEN)) { memcpy(n-mac, netcfg.mac, ETH_ALEN); qemu_format_nic_info_str(n-nic-nc, n-mac); } @@ -350,6 +351,16 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, struct virtio_net_ctrl_mac mac_data; size_t s; +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) { +if (iov_size(iov, iov_cnt) != sizeof(n-mac)) { +return VIRTIO_NET_ERR; +} +s = iov_to_buf(iov, iov_cnt, 0, n-mac, sizeof(n-mac)); +assert(s == sizeof(n-mac)); +qemu_format_nic_info_str(n-nic-nc, n-mac); +return VIRTIO_NET_OK; +} + if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) { return VIRTIO_NET_ERR; } diff --git a/hw/virtio-net.h b/hw/virtio-net.h index d46fb98..1ec632f 100644 --- a/hw/virtio-net.h +++ b/hw/virtio-net.h @@ -44,6 +44,8 @@ #define VIRTIO_NET_F_CTRL_VLAN 19 /* Control channel VLAN filtering */ #define VIRTIO_NET_F_CTRL_RX_EXTRA 20 /* Extra RX mode control support */ +#define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ + #define VIRTIO_NET_S_LINK_UP1 /* Link is up */ #define TX_TIMER_INTERVAL 15 /* 150 us */ @@ -106,7 +108,7 @@ typedef uint8_t virtio_net_ctrl_ack; #define VIRTIO_NET_CTRL_RX_MODE_NOBCAST 5 /* - * Control the MAC filter table. + * Control the MAC * * The MAC filter table is managed by the hypervisor, the guest should * assume the size is infinite. Filtering should be considered @@ -119,6 +121,10 @@ typedef uint8_t virtio_net_ctrl_ack; * first sg list contains unicast addresses, the second is for multicast. * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature * is available. + * + * The ADDR_SET command requests one out scatterlist, it contains a + * 6 bytes MAC address. This functionality is present if the + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available. */ struct virtio_net_ctrl_mac { uint32_t entries; @@ -126,6 +132,7 @@ struct virtio_net_ctrl_mac { }; #define VIRTIO_NET_CTRL_MAC1 #define VIRTIO_NET_CTRL_MAC_TABLE_SET0 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 /* * Control VLAN filtering @@ -158,5 +165,6 @@ struct virtio_net_ctrl_mac { DEFINE_PROP_BIT(ctrl_vq, _state, _field, VIRTIO_NET_F_CTRL_VQ, true), \ DEFINE_PROP_BIT(ctrl_rx, _state, _field, VIRTIO_NET_F_CTRL_RX, true), \ DEFINE_PROP_BIT(ctrl_vlan, _state, _field, VIRTIO_NET_F_CTRL_VLAN, true), \ -DEFINE_PROP_BIT(ctrl_rx_extra, _state, _field, VIRTIO_NET_F_CTRL_RX_EXTRA, true) +DEFINE_PROP_BIT(ctrl_rx_extra, _state, _field, VIRTIO_NET_F_CTRL_RX_EXTRA, true), \ +DEFINE_PROP_BIT(ctrl_mac_addr, _state, _field, VIRTIO_NET_F_CTRL_MAC_ADDR, true) #endif -- 1.7.1 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[QEMU PATCH v5 3/3] virtio-net: rename ctrl rx commands
This patch makes rx commands consistent with specification. Signed-off-by: Amos Kong ak...@redhat.com --- hw/virtio-net.c | 14 +++--- hw/virtio-net.h | 14 +++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index acef5a5..ac4434e 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -326,17 +326,17 @@ static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd, return VIRTIO_NET_ERR; } -if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) { +if (cmd == VIRTIO_NET_CTRL_RX_PROMISC) { n-promisc = on; -} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) { +} else if (cmd == VIRTIO_NET_CTRL_RX_ALLMULTI) { n-allmulti = on; -} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) { +} else if (cmd == VIRTIO_NET_CTRL_RX_ALLUNI) { n-alluni = on; -} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) { +} else if (cmd == VIRTIO_NET_CTRL_RX_NOMULTI) { n-nomulti = on; -} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) { +} else if (cmd == VIRTIO_NET_CTRL_RX_NOUNI) { n-nouni = on; -} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) { +} else if (cmd == VIRTIO_NET_CTRL_RX_NOBCAST) { n-nobcast = on; } else { return VIRTIO_NET_ERR; @@ -473,7 +473,7 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) iov_discard_front(iov, iov_cnt, sizeof(ctrl)); if (s != sizeof(ctrl)) { status = VIRTIO_NET_ERR; -} else if (ctrl.class == VIRTIO_NET_CTRL_RX_MODE) { +} else if (ctrl.class == VIRTIO_NET_CTRL_RX) { status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, iov_cnt); } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) { status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt); diff --git a/hw/virtio-net.h b/hw/virtio-net.h index 1ec632f..c0bb284 100644 --- a/hw/virtio-net.h +++ b/hw/virtio-net.h @@ -99,13 +99,13 @@ typedef uint8_t virtio_net_ctrl_ack; * 0 and 1 are supported with the VIRTIO_NET_F_CTRL_RX feature. * Commands 2-5 are added with VIRTIO_NET_F_CTRL_RX_EXTRA. */ -#define VIRTIO_NET_CTRL_RX_MODE0 - #define VIRTIO_NET_CTRL_RX_MODE_PROMISC 0 - #define VIRTIO_NET_CTRL_RX_MODE_ALLMULTI 1 - #define VIRTIO_NET_CTRL_RX_MODE_ALLUNI 2 - #define VIRTIO_NET_CTRL_RX_MODE_NOMULTI 3 - #define VIRTIO_NET_CTRL_RX_MODE_NOUNI4 - #define VIRTIO_NET_CTRL_RX_MODE_NOBCAST 5 +#define VIRTIO_NET_CTRL_RX0 + #define VIRTIO_NET_CTRL_RX_PROMISC 0 + #define VIRTIO_NET_CTRL_RX_ALLMULTI 1 + #define VIRTIO_NET_CTRL_RX_ALLUNI 2 + #define VIRTIO_NET_CTRL_RX_NOMULTI 3 + #define VIRTIO_NET_CTRL_RX_NOUNI4 + #define VIRTIO_NET_CTRL_RX_NOBCAST 5 /* * Control the MAC -- 1.7.1 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 0/3] make mac programming for virtio net more robust
Currenly mac is programmed byte by byte. This means that we have an intermediate step where mac is wrong. Third patch introduced a new vq control command to set mac address, it's atomic. V2: check return of sending command, delay eth_mac_addr() V3: restore software address when fail to set hardware address V4: split eth_mac_addr, fix error handle V5: rebase patches to net-next tree Amos Kong (2): move virtnet_send_command() above virtnet_set_mac_address() virtio-net: introduce a new control to set macaddr Stefan Hajnoczi (1): net: split eth_mac_addr for better error handling drivers/net/virtio_net.c| 110 ++- include/linux/etherdevice.h |2 + include/uapi/linux/virtio_net.h |8 +++- net/ethernet/eth.c | 41 -- 4 files changed, 106 insertions(+), 55 deletions(-) ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 1/3] move virtnet_send_command() above virtnet_set_mac_address()
We want to send vq command to set mac address in virtnet_set_mac_address(), so do this function moving. Fixed a little issue of coding style. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c | 89 ++--- 1 files changed, 44 insertions(+), 45 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a6fcf15..395ab4f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -753,6 +753,50 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } +/* + * Send command via the control virtqueue and check status. Commands + * supported by the hypervisor, as indicated by feature bits, should + * never fail unless improperly formated. + */ +static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, +struct scatterlist *data, int out, int in) +{ + struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; + struct virtio_net_ctrl_hdr ctrl; + virtio_net_ctrl_ack status = ~0; + unsigned int tmp; + int i; + + /* Caller should know better */ + BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) || + (out + in VIRTNET_SEND_COMMAND_SG_MAX)); + + out++; /* Add header */ + in++; /* Add return status */ + + ctrl.class = class; + ctrl.cmd = cmd; + + sg_init_table(sg, out + in); + + sg_set_buf(sg[0], ctrl, sizeof(ctrl)); + for_each_sg(data, s, out + in - 2, i) + sg_set_buf(sg[i + 1], sg_virt(s), s-length); + sg_set_buf(sg[out + in - 1], status, sizeof(status)); + + BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC) 0); + + virtqueue_kick(vi-cvq); + + /* Spin for a response, the kick causes an ioport write, trapping +* into the hypervisor, so the request should be handled immediately. +*/ + while (!virtqueue_get_buf(vi-cvq, tmp)) + cpu_relax(); + + return status == VIRTIO_NET_OK; +} + static int virtnet_set_mac_address(struct net_device *dev, void *p) { struct virtnet_info *vi = netdev_priv(dev); @@ -819,51 +863,6 @@ static void virtnet_netpoll(struct net_device *dev) } #endif -/* - * Send command via the control virtqueue and check status. Commands - * supported by the hypervisor, as indicated by feature bits, should - * never fail unless improperly formated. - */ -static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, -struct scatterlist *data, int out, int in) -{ - struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; - struct virtio_net_ctrl_hdr ctrl; - virtio_net_ctrl_ack status = ~0; - unsigned int tmp; - int i; - - /* Caller should know better */ - BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) || - (out + in VIRTNET_SEND_COMMAND_SG_MAX)); - - out++; /* Add header */ - in++; /* Add return status */ - - ctrl.class = class; - ctrl.cmd = cmd; - - sg_init_table(sg, out + in); - - sg_set_buf(sg[0], ctrl, sizeof(ctrl)); - for_each_sg(data, s, out + in - 2, i) - sg_set_buf(sg[i + 1], sg_virt(s), s-length); - sg_set_buf(sg[out + in - 1], status, sizeof(status)); - - BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC) 0); - - virtqueue_kick(vi-cvq); - - /* -* Spin for a response, the kick causes an ioport write, trapping -* into the hypervisor, so the request should be handled immediately. -*/ - while (!virtqueue_get_buf(vi-cvq, tmp)) - cpu_relax(); - - return status == VIRTIO_NET_OK; -} - static void virtnet_ack_link_announce(struct virtnet_info *vi) { rtnl_lock(); -- 1.7.1 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 2/3] net: split eth_mac_addr for better error handling
From: Stefan Hajnoczi stefa...@gmail.com When we set mac address, software mac address in system and hardware mac address all need to be updated. Current eth_mac_addr() doesn't allow callers to implement error handling nicely. This patch split eth_mac_addr() to prepare part and real commit part, then we can prepare first, and try to change hardware address, then do the real commit if hardware address is set successfully. Signed-off-by: Stefan Hajnoczi stefa...@gmail.com Signed-off-by: Amos Kong ak...@redhat.com --- include/linux/etherdevice.h |2 ++ net/ethernet/eth.c | 41 +++-- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 1a43e1b..c623861 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -40,6 +40,8 @@ extern int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, extern void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); +extern int eth_prepare_mac_addr_change(struct net_device *dev, void *p); +extern void eth_commit_mac_addr_change(struct net_device *dev, void *p); extern int eth_mac_addr(struct net_device *dev, void *p); extern int eth_change_mtu(struct net_device *dev, int new_mtu); extern int eth_validate_addr(struct net_device *dev); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index bc39c8c..a36c85e 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -272,6 +272,36 @@ void eth_header_cache_update(struct hh_cache *hh, EXPORT_SYMBOL(eth_header_cache_update); /** + * eth_prepare_mac_addr_change - prepare for mac change + * @dev: network device + * @p: socket address + */ +int eth_prepare_mac_addr_change(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + if (!(dev-priv_flags IFF_LIVE_ADDR_CHANGE) netif_running(dev)) + return -EBUSY; + if (!is_valid_ether_addr(addr-sa_data)) + return -EADDRNOTAVAIL; + return 0; +} +EXPORT_SYMBOL(eth_prepare_mac_addr_change); + +/** + * eth_commit_mac_addr_change - commit mac change + * @dev: network device + * @p: socket address + */ +void eth_commit_mac_addr_change(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + memcpy(dev-dev_addr, addr-sa_data, ETH_ALEN); +} +EXPORT_SYMBOL(eth_commit_mac_addr_change); + +/** * eth_mac_addr - set new Ethernet hardware address * @dev: network device * @p: socket address @@ -283,13 +313,12 @@ EXPORT_SYMBOL(eth_header_cache_update); */ int eth_mac_addr(struct net_device *dev, void *p) { - struct sockaddr *addr = p; + int ret; - if (!(dev-priv_flags IFF_LIVE_ADDR_CHANGE) netif_running(dev)) - return -EBUSY; - if (!is_valid_ether_addr(addr-sa_data)) - return -EADDRNOTAVAIL; - memcpy(dev-dev_addr, addr-sa_data, ETH_ALEN); + ret = eth_prepare_mac_addr_change(dev, p); + if (ret 0) + return ret; + eth_commit_mac_addr_change(dev, p); return 0; } EXPORT_SYMBOL(eth_mac_addr); -- 1.7.1 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v5 3/3] virtio-net: introduce a new control to set macaddr
Currently we write MAC address to pci config space byte by byte, this means that we have an intermediate step where mac is wrong. This patch introduced a new control command to set MAC address, it's atomic. VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c| 21 ++--- include/uapi/linux/virtio_net.h |8 +++- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 395ab4f..701408a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -802,14 +802,28 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) struct virtnet_info *vi = netdev_priv(dev); struct virtio_device *vdev = vi-vdev; int ret; + struct sockaddr *addr = p; + struct scatterlist sg; - ret = eth_mac_addr(dev, p); + ret = eth_prepare_mac_addr_change(dev, p); if (ret) return ret; - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + sg_init_one(sg, addr-sa_data, dev-addr_len); + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, + VIRTIO_NET_CTRL_MAC_ADDR_SET, + sg, 1, 0)) { + dev_warn(vdev-dev, +Failed to set mac address by vq command.\n); + return -EINVAL; + } + } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { vdev-config-set(vdev, offsetof(struct virtio_net_config, mac), - dev-dev_addr, dev-addr_len); + addr-sa_data, dev-addr_len); + } + + eth_commit_mac_addr_change(dev, p); return 0; } @@ -1627,6 +1641,7 @@ static unsigned int features[] = { VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, + VIRTIO_NET_F_CTRL_MAC_ADDR, }; static struct virtio_driver virtio_net_driver = { diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 848e358..a5a8c88 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -53,6 +53,7 @@ * network */ #define VIRTIO_NET_F_MQ22 /* Device supports Receive Flow * Steering */ +#define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ #define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */ @@ -127,7 +128,7 @@ typedef __u8 virtio_net_ctrl_ack; #define VIRTIO_NET_CTRL_RX_NOBCAST 5 /* - * Control the MAC filter table. + * Control the MAC * * The MAC filter table is managed by the hypervisor, the guest should * assume the size is infinite. Filtering should be considered @@ -140,6 +141,10 @@ typedef __u8 virtio_net_ctrl_ack; * first sg list contains unicast addresses, the second is for multicast. * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature * is available. + * + * The ADDR_SET command requests one out scatterlist, it contains a + * 6 bytes MAC address. This functionality is present if the + * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available. */ struct virtio_net_ctrl_mac { __u32 entries; @@ -148,6 +153,7 @@ struct virtio_net_ctrl_mac { #define VIRTIO_NET_CTRL_MAC1 #define VIRTIO_NET_CTRL_MAC_TABLE_SET0 + #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1 /* * Control VLAN filtering -- 1.7.1 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 2/2] virtio-net: introduce a new control to set macaddr
On Fri, Jan 18, 2013 at 12:00:42PM +0100, Stefan Hajnoczi wrote: On Thu, Jan 17, 2013 at 06:40:12PM +0800, ak...@redhat.com wrote: diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 395ab4f..837c978 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -802,14 +802,32 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) struct virtnet_info *vi = netdev_priv(dev); struct virtio_device *vdev = vi-vdev; int ret; + struct scatterlist sg; + char save_addr[ETH_ALEN]; + unsigned char save_aatype; + + memcpy(save_addr, dev-dev_addr, ETH_ALEN); + save_aatype = dev-addr_assign_type; ret = eth_mac_addr(dev, p); if (ret) return ret; - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + sg_init_one(sg, dev-dev_addr, dev-addr_len); + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, + VIRTIO_NET_CTRL_MAC_ADDR_SET, + sg, 1, 0)) { + dev_warn(vdev-dev, +Failed to set mac address by vq command.\n); + memcpy(dev-dev_addr, save_addr, ETH_ALEN); + dev-addr_assign_type = save_aatype; + return -EINVAL; + } eth_mac_addr() doesn't allow callers to implement error handling nicely. Although you didn't duplicate it's code directly, this patch still leaks internals of eth_mac_addr(). How about splitting eth_mac_addr() in a separate patch: Agree, then we can have nice error handling. I will send a V4 and include this patch. Thanks. int eth_prepare_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; if (!(dev-priv_flags IFF_LIVE_ADDR_CHANGE) netif_running(dev)) return -EBUSY; if (!is_valid_ether_addr(addr-sa_data)) return -EADDRNOTAVAIL; return 0; } ... Now virtio_net.c does: ret = eth_prepare_mac_addr_change(dev, p); if (ret 0) return ret; if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { sg_init_one(sg, dev-dev_addr, dev-addr_len); trivial issue, %s/dev-dev_addr/addr-sa_data if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_ADDR_SET, sg, 1, 0)) { dev_warn(vdev-dev, Failed to set mac address by vq command.\n); return -EINVAL; } } ... eth_commit_mac_addr_change(dev, p); return 0; Stefan ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [Qemu-devel] [QEMU PATCH v2] virtio-net: introduce a new macaddr control
On Thu, Jan 17, 2013 at 01:45:11PM +0800, Amos Kong wrote: On Thu, Jan 17, 2013 at 11:49:20AM +1030, Rusty Russell wrote: ak...@redhat.com writes: @@ -349,6 +351,14 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, { struct virtio_net_ctrl_mac mac_data; +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET elem-out_num == 2 +elem-out_sg[1].iov_len == ETH_ALEN) { +/* Set MAC address */ +memcpy(n-mac, elem-out_sg[1].iov_base, elem-out_sg[1].iov_len); +qemu_format_nic_info_str(n-nic-nc, n-mac); +return VIRTIO_NET_OK; +} Does the rest of the net device still rely on the layout of descriptors? No, only info string of net client relies on n-mac I misunderstood. There is no clear limitation of how much descriptor are used for each vq command, but many commands rely on the layout of descriptiors. eg: virtio-net: VIRTIO_NET_CTRL_RX_PROMISC VIRTIO_NET_CTRL_RX_ALLMULTI VIRTIO_NET_CTRL_MAC_TABLE_SET etc If so, OK, we'll fix them all together. If not, this introduces a new one. Cheers, Rusty. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [QEMU PATCH v2] virtio-net: introduce a new macaddr control
On Thu, Jan 17, 2013 at 02:13:08PM +0200, Michael S. Tsirkin wrote: On Thu, Jan 17, 2013 at 11:49:20AM +1030, Rusty Russell wrote: ak...@redhat.com writes: @@ -349,6 +351,14 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, { struct virtio_net_ctrl_mac mac_data; +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET elem-out_num == 2 +elem-out_sg[1].iov_len == ETH_ALEN) { +/* Set MAC address */ +memcpy(n-mac, elem-out_sg[1].iov_base, elem-out_sg[1].iov_len); +qemu_format_nic_info_str(n-nic-nc, n-mac); +return VIRTIO_NET_OK; +} Does the rest of the net device still rely on the layout of descriptors? If so, OK, we'll fix them all together. If not, this introduces a new one. Cheers, Rusty. The following fixes all existing users. Got to deal with some urgent stuff so did not test yet - Amos, would you like to include this in your patchset and build on it, test it all together? No problem. If not I'll get to it next week. --- virtio-net: remove layout assumptions for ctrl vq Signed-off-by: Michael S. Tsirkin m...@redhat.com ... ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v2 2/2] virtio-net: introduce a new control to set macaddr
On Wed, Jan 16, 2013 at 02:20:39PM +0800, Jason Wang wrote: On Wednesday, January 16, 2013 01:57:01 PM ak...@redhat.com wrote: From: Amos Kong ak...@redhat.com Currently we write MAC address to pci config space byte by byte, this means that we have an intermediate step where mac is wrong. This patch introduced a new control command to set MAC address in one time. VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility. Signed-off-by: Amos Kong ak...@redhat.com --- drivers/net/virtio_net.c| 24 +--- include/uapi/linux/virtio_net.h | 8 +++- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 395ab4f..c8901b6 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -802,16 +802,25 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) struct virtnet_info *vi = netdev_priv(dev); struct virtio_device *vdev = vi-vdev; int ret; + struct sockaddr *addr = p; + struct scatterlist sg; - ret = eth_mac_addr(dev, p); - if (ret) - return ret; - - if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { + sg_init_one(sg, addr-sa_data, dev-addr_len); + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, + VIRTIO_NET_CTRL_MAC_ADDR_SET, + sg, 1, 0)) { + dev_warn(vdev-dev, +Failed to set mac address by vq command.\n); + return -EINVAL; + } + } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { vdev-config-set(vdev, offsetof(struct virtio_net_config, mac), - dev-dev_addr, dev-addr_len); + addr-sa_data, dev-addr_len); + } + ret = eth_mac_addr(dev, p); The you will the validity check in eth_mac_addr which may result a wrong mac address to be set in the hardware (or is there any check in qemu) and a inconsistency bettween what kernel assumes and qemu has. You can take a look at netvsc driver that calls eth_mac_addr() first and restore the software mac address when fail to enforce it to hardware. Thanks for the catching, I will move eth_mac_addr() back to above, just restore addr if fail to send command. I will also use DEFINE_PROP_BIT to fix migration issue, thanks. Thanks - return 0; + return ret; } ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [Qemu-devel] [QEMU PATCH v2] virtio-net: introduce a new macaddr control
On Thu, Jan 17, 2013 at 11:49:20AM +1030, Rusty Russell wrote: ak...@redhat.com writes: @@ -349,6 +351,14 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, { struct virtio_net_ctrl_mac mac_data; +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET elem-out_num == 2 +elem-out_sg[1].iov_len == ETH_ALEN) { +/* Set MAC address */ +memcpy(n-mac, elem-out_sg[1].iov_base, elem-out_sg[1].iov_len); +qemu_format_nic_info_str(n-nic-nc, n-mac); +return VIRTIO_NET_OK; +} Does the rest of the net device still rely on the layout of descriptors? No, only info string of net client relies on n-mac If so, OK, we'll fix them all together. If not, this introduces a new one. Cheers, Rusty. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] virtio-spec: set mac address by a new vq command
On Wed, Jan 16, 2013 at 11:22:23AM +0200, Michael S. Tsirkin wrote: On Wed, Jan 16, 2013 at 10:13:28AM +0100, Stefan Hajnoczi wrote: On Wed, Jan 16, 2013 at 03:33:24PM +0800, ak...@redhat.com wrote: +\change_inserted -1930653948 1358320004 +The command VIRTIO_NET_CTRL_MAC_ADDR_SET is used to set +\begin_inset Quotes eld +\end_inset + +physical +\begin_inset Quotes erd +\end_inset + + address of the network card. The physical address of the network card? That term is not defined anywhere in the specification. I will replace it with 'the default MAC address' Perhaps it's best to explain that the config space mac field and VIRTIO_NET_CTRL_MAC_ADDR_SET both set the default MAC address which rx filtering accepts. (The MAC table is an additional set of MAC addresses which rx filtering accepts.) It would also be worth explaining that VIRTIO_NET_CTRL_MAC_ADDR_SET is atomic whereas the config space mac field is not. Therefore, VIRTIO_NET_CTRL_MAC_ADDR_SET is preferred, especially while the NIC is up. Ok, will send a v2, thanks. Stefan It's probably best to simply make the config space field read-only if the feature bit is acked. It's reasonable. diff --git a/hw/virtio-net.c b/hw/virtio-net.c index c18e276..54c5eae 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -93,7 +93,8 @@ static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config) memcpy(netcfg, config, sizeof(netcfg)); -if (memcmp(netcfg.mac, n-mac, ETH_ALEN)) { +if (!(n-vdev.guest_features VIRTIO_NET_F_CTRL_MAC_ADDR 1) +memcmp(netcfg.mac, n-mac, ETH_ALEN)) { memcpy(n-mac, netcfg.mac, ETH_ALEN); qemu_format_nic_info_str(n-nic-nc, n-mac); } ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization