[PATCH v5 REPOST 6/6] hw_random: don't init list element we're about to add to list.

2014-12-08 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

Another interesting anti-pattern.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index a9286bf..4d13ac5 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -489,7 +489,6 @@ int hwrng_register(struct hwrng *rng)
goto out_unlock;
}
}
-   INIT_LIST_HEAD(rng-list);
list_add_tail(rng-list, rng_list);
 
if (old_rng  !rng-init) {
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 REPOST 3/6] hw_random: use reference counts on each struct hwrng.

2014-12-08 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

current_rng holds one reference, and we bump it every time we want
to do a read from it.

This means we only hold the rng_mutex to grab or drop a reference,
so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
block on read of /dev/hwrng.

Using a kref is overkill (we're always under the rng_mutex), but
a standard pattern.

This also solves the problem that the hwrng_fillfn thread was
accessing current_rng without a lock, which could change (eg. to NULL)
underneath it.

v5: drop redundant kref_init()
v4: decrease last reference for triggering the cleanup
v3: initialize kref (thanks Amos Kong)
v2: fix missing put_rng() on exit path (thanks Amos Kong)
Signed-off-by: Rusty Russell ru...@rustcorp.com.au
Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 135 --
 include/linux/hw_random.h |   2 +
 2 files changed, 94 insertions(+), 43 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index a0905c8..83516cb 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -42,6 +42,7 @@
 #include linux/delay.h
 #include linux/slab.h
 #include linux/random.h
+#include linux/err.h
 #include asm/uaccess.h
 
 
@@ -91,6 +92,60 @@ static void add_early_randomness(struct hwrng *rng)
add_device_randomness(bytes, bytes_read);
 }
 
+static inline void cleanup_rng(struct kref *kref)
+{
+   struct hwrng *rng = container_of(kref, struct hwrng, ref);
+
+   if (rng-cleanup)
+   rng-cleanup(rng);
+}
+
+static void set_current_rng(struct hwrng *rng)
+{
+   BUG_ON(!mutex_is_locked(rng_mutex));
+   kref_get(rng-ref);
+   current_rng = rng;
+}
+
+static void drop_current_rng(void)
+{
+   BUG_ON(!mutex_is_locked(rng_mutex));
+   if (!current_rng)
+   return;
+
+   /* decrease last reference for triggering the cleanup */
+   kref_put(current_rng-ref, cleanup_rng);
+   current_rng = NULL;
+}
+
+/* Returns ERR_PTR(), NULL or refcounted hwrng */
+static struct hwrng *get_current_rng(void)
+{
+   struct hwrng *rng;
+
+   if (mutex_lock_interruptible(rng_mutex))
+   return ERR_PTR(-ERESTARTSYS);
+
+   rng = current_rng;
+   if (rng)
+   kref_get(rng-ref);
+
+   mutex_unlock(rng_mutex);
+   return rng;
+}
+
+static void put_rng(struct hwrng *rng)
+{
+   /*
+* Hold rng_mutex here so we serialize in case they set_current_rng
+* on rng again immediately.
+*/
+   mutex_lock(rng_mutex);
+   if (rng)
+   kref_put(rng-ref, cleanup_rng);
+   mutex_unlock(rng_mutex);
+}
+
 static inline int hwrng_init(struct hwrng *rng)
 {
if (rng-init) {
@@ -113,12 +168,6 @@ static inline int hwrng_init(struct hwrng *rng)
return 0;
 }
 
-static inline void hwrng_cleanup(struct hwrng *rng)
-{
-   if (rng  rng-cleanup)
-   rng-cleanup(rng);
-}
-
 static int rng_dev_open(struct inode *inode, struct file *filp)
 {
/* enforce read-only access to this chrdev */
@@ -154,21 +203,22 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
ssize_t ret = 0;
int err = 0;
int bytes_read, len;
+   struct hwrng *rng;
 
while (size) {
-   if (mutex_lock_interruptible(rng_mutex)) {
-   err = -ERESTARTSYS;
+   rng = get_current_rng();
+   if (IS_ERR(rng)) {
+   err = PTR_ERR(rng);
goto out;
}
-
-   if (!current_rng) {
+   if (!rng) {
err = -ENODEV;
-   goto out_unlock;
+   goto out;
}
 
mutex_lock(reading_mutex);
if (!data_avail) {
-   bytes_read = rng_get_data(current_rng, rng_buffer,
+   bytes_read = rng_get_data(rng, rng_buffer,
rng_buffer_size(),
!(filp-f_flags  O_NONBLOCK));
if (bytes_read  0) {
@@ -200,8 +250,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
ret += len;
}
 
-   mutex_unlock(rng_mutex);
mutex_unlock(reading_mutex);
+   put_rng(rng);
 
if (need_resched())
schedule_timeout_interruptible(1);
@@ -213,12 +263,11 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
}
 out:
return ret ? : err;
-out_unlock:
-   mutex_unlock(rng_mutex);
-   goto out;
+
 out_unlock_reading:
mutex_unlock(reading_mutex);
-   goto out_unlock;
+   put_rng(rng);
+   goto out;
 }
 
 
@@ -257,8 +306,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev

[PATCH v5 REPOST 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock

2014-12-08 Thread Amos Kong
In next patch, we use reference counting for each struct hwrng,
changing reference count also needs to take mutex_lock. Before
releasing the lock, if we try to stop a kthread that waits to
take the lock to reduce the referencing count, deadlock will
occur.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index b1b6042..a0905c8 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng)
}
}
if (list_empty(rng_list)) {
+   mutex_unlock(rng_mutex);
unregister_miscdev();
if (hwrng_fill)
kthread_stop(hwrng_fill);
-   }
-
-   mutex_unlock(rng_mutex);
+   } else
+   mutex_unlock(rng_mutex);
 }
 EXPORT_SYMBOL_GPL(hwrng_unregister);
 
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 REPOST 1/6] hw_random: place mutex around read functions and buffers.

2014-12-08 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

There's currently a big lock around everything, and it means that we
can't query sysfs (eg /sys/devices/virtual/misc/hw_random/rng_current)
while the rng is reading.  This is a real problem when the rng is slow,
or blocked (eg. virtio_rng with qemu's default /dev/random backend)

This doesn't help (it leaves the current lock untouched), just adds a
lock to protect the read function and the static buffers, in preparation
for transition.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index aa30a25..b1b6042 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -53,7 +53,10 @@
 static struct hwrng *current_rng;
 static struct task_struct *hwrng_fill;
 static LIST_HEAD(rng_list);
+/* Protects rng_list and current_rng */
 static DEFINE_MUTEX(rng_mutex);
+/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */
+static DEFINE_MUTEX(reading_mutex);
 static int data_avail;
 static u8 *rng_buffer, *rng_fillbuf;
 static unsigned short current_quality;
@@ -81,7 +84,9 @@ static void add_early_randomness(struct hwrng *rng)
unsigned char bytes[16];
int bytes_read;
 
+   mutex_lock(reading_mutex);
bytes_read = rng_get_data(rng, bytes, sizeof(bytes), 1);
+   mutex_unlock(reading_mutex);
if (bytes_read  0)
add_device_randomness(bytes, bytes_read);
 }
@@ -128,6 +133,7 @@ static inline int rng_get_data(struct hwrng *rng, u8 
*buffer, size_t size,
int wait) {
int present;
 
+   BUG_ON(!mutex_is_locked(reading_mutex));
if (rng-read)
return rng-read(rng, (void *)buffer, size, wait);
 
@@ -160,13 +166,14 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
goto out_unlock;
}
 
+   mutex_lock(reading_mutex);
if (!data_avail) {
bytes_read = rng_get_data(current_rng, rng_buffer,
rng_buffer_size(),
!(filp-f_flags  O_NONBLOCK));
if (bytes_read  0) {
err = bytes_read;
-   goto out_unlock;
+   goto out_unlock_reading;
}
data_avail = bytes_read;
}
@@ -174,7 +181,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
if (!data_avail) {
if (filp-f_flags  O_NONBLOCK) {
err = -EAGAIN;
-   goto out_unlock;
+   goto out_unlock_reading;
}
} else {
len = data_avail;
@@ -186,7 +193,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
if (copy_to_user(buf + ret, rng_buffer + data_avail,
len)) {
err = -EFAULT;
-   goto out_unlock;
+   goto out_unlock_reading;
}
 
size -= len;
@@ -194,6 +201,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
}
 
mutex_unlock(rng_mutex);
+   mutex_unlock(reading_mutex);
 
if (need_resched())
schedule_timeout_interruptible(1);
@@ -208,6 +216,9 @@ out:
 out_unlock:
mutex_unlock(rng_mutex);
goto out;
+out_unlock_reading:
+   mutex_unlock(reading_mutex);
+   goto out_unlock;
 }
 
 
@@ -348,13 +359,16 @@ static int hwrng_fillfn(void *unused)
while (!kthread_should_stop()) {
if (!current_rng)
break;
+   mutex_lock(reading_mutex);
rc = rng_get_data(current_rng, rng_fillbuf,
  rng_buffer_size(), 1);
+   mutex_unlock(reading_mutex);
if (rc = 0) {
pr_warn(hwrng: no data available\n);
msleep_interruptible(1);
continue;
}
+   /* Outside lock, sure, but y'know: randomness. */
add_hwgenerator_randomness((void *)rng_fillbuf, rc,
   rc * current_quality * 8  10);
}
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 REPOST 0/6] fix hw_random stuck

2014-12-08 Thread Amos Kong
When I hotunplug a busy virtio-rng device or try to access
hwrng attributes in non-smp guest, it gets stuck.

My hotplug tests:

| test 0:
|   hotunplug rng device from qemu monitor
|
| test 1:
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 2:
|   guest) # dd if=/dev/random of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 4:
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   cat /sys/devices/virtual/misc/hw_random/rng_*
|
| test 5:
|   guest) # dd if=/dev/hwrng of=/dev/null
|   cancel dd process after 10 seconds
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 6:
|   use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo

V5: reset cleanup_done flag, drop redundant init of reference count, use
compiler barrier to prevent recording.
V4: update patch 4 to fix corrupt, decrease last reference for triggering
the cleanup, fix unregister race pointed by Herbert
V3: initialize kref to 1
V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference
counting issue

Amos Kong (1):
  hw_random: move some code out mutex_lock for avoiding underlying
deadlock

Rusty Russell (5):
  hw_random: place mutex around read functions and buffers.
  hw_random: use reference counts on each struct hwrng.
  hw_random: fix unregister race.
  hw_random: don't double-check old_rng.
  hw_random: don't init list element we're about to add to list.

 drivers/char/hw_random/core.c | 173 ++
 include/linux/hw_random.h |   3 +
 2 files changed, 126 insertions(+), 50 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 REPOST 5/6] hw_random: don't double-check old_rng.

2014-12-08 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

Interesting anti-pattern.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 067270b..a9286bf 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -476,14 +476,13 @@ int hwrng_register(struct hwrng *rng)
}
 
old_rng = current_rng;
+   err = 0;
if (!old_rng) {
err = hwrng_init(rng);
if (err)
goto out_unlock;
set_current_rng(rng);
-   }
-   err = 0;
-   if (!old_rng) {
+
err = register_miscdev();
if (err) {
drop_current_rng();
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 REPOST 4/6] hw_random: fix unregister race.

2014-12-08 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

The previous patch added one potential problem: we can still be
reading from a hwrng when it's unregistered.  Add a wait for zero
in the hwrng_unregister path.

v5: reset cleanup_done flag, use compiler barrier to prevent recording.
v4: add cleanup_done flag to insure that cleanup is done

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 12 
 include/linux/hw_random.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 83516cb..067270b 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex);
 static DEFINE_MUTEX(reading_mutex);
 static int data_avail;
 static u8 *rng_buffer, *rng_fillbuf;
+static DECLARE_WAIT_QUEUE_HEAD(rng_done);
 static unsigned short current_quality;
 static unsigned short default_quality; /* = 0; default to off */
 
@@ -98,6 +99,11 @@ static inline void cleanup_rng(struct kref *kref)
 
if (rng-cleanup)
rng-cleanup(rng);
+
+   /* cleanup_done should be updated after cleanup finishes */
+   smp_wmb();
+   rng-cleanup_done = true;
+   wake_up_all(rng_done);
 }
 
 static void set_current_rng(struct hwrng *rng)
@@ -498,6 +504,8 @@ int hwrng_register(struct hwrng *rng)
add_early_randomness(rng);
}
 
+   rng-cleanup_done = false;
+
 out_unlock:
mutex_unlock(rng_mutex);
 out:
@@ -529,6 +537,10 @@ void hwrng_unregister(struct hwrng *rng)
kthread_stop(hwrng_fill);
} else
mutex_unlock(rng_mutex);
+
+   /* Just in case rng is reading right now, wait. */
+   wait_event(rng_done, rng-cleanup_done 
+  atomic_read(rng-ref.refcount) == 0);
 }
 EXPORT_SYMBOL_GPL(hwrng_unregister);
 
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index c212e71..7832e50 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -46,6 +46,7 @@ struct hwrng {
/* internal. */
struct list_head list;
struct kref ref;
+   bool cleanup_done;
 };
 
 /** Register a new Hardware Random Number Generator driver. */
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 4/6] hw_random: fix unregister race.

2014-12-05 Thread Amos Kong
On Wed, Nov 12, 2014 at 02:33:00PM +1030, Rusty Russell wrote:
 Amos Kong ak...@redhat.com writes:
  From: Rusty Russell ru...@rustcorp.com.au
 
  The previous patch added one potential problem: we can still be
  reading from a hwrng when it's unregistered.  Add a wait for zero
  in the hwrng_unregister path.
 
  v4: add cleanup_done flag to insure that cleanup is done
 
 That's a bit weird.  The usual pattern would be to hold a reference
 until we're actually finished, but this reference is a bit weird.
 
 We hold the mutex across cleanup, so we could grab that but we have to
 take care sleeping inside wait_event, otherwise Peter will have to fix
 my code again :)
 
 AFAICT the wake_woken() stuff isn't merged yet, so your patch will
 have to do for now.
 
  @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref)
   
  if (rng-cleanup)
  rng-cleanup(rng);
  +   rng-cleanup_done = true;
  +   wake_up_all(rng_done);
   }
   
   static void set_current_rng(struct hwrng *rng)
  @@ -536,6 +539,11 @@ void hwrng_unregister(struct hwrng *rng)
  kthread_stop(hwrng_fill);
  } else
  mutex_unlock(rng_mutex);
  +
  +   /* Just in case rng is reading right now, wait. */
  +   wait_event(rng_done, rng-cleanup_done 
  +  atomic_read(rng-ref.refcount) == 0);
  +
 
 The atomic_read() isn't necessary here.

At least, we need it to convert refcount from atomic_t to int.
Otherwise, we will touch this error:

  error: invalid operands to binary == (have 'atomic_t' and 'int')
 
 However, you should probably init cleanup_done in hwrng_register().
 (Probably noone does unregister then register, but let's be clear).
 
 Thanks,
 Rusty.

-- 
Amos.


signature.asc
Description: Digital signature


[PATCH v5 4/6] hw_random: fix unregister race.

2014-12-05 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

The previous patch added one potential problem: we can still be
reading from a hwrng when it's unregistered.  Add a wait for zero
in the hwrng_unregister path.

v5: reset cleanup_done flag, use compiler barrier to prevent recording.
v4: add cleanup_done flag to insure that cleanup is done

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 12 
 include/linux/hw_random.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 83516cb..067270b 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex);
 static DEFINE_MUTEX(reading_mutex);
 static int data_avail;
 static u8 *rng_buffer, *rng_fillbuf;
+static DECLARE_WAIT_QUEUE_HEAD(rng_done);
 static unsigned short current_quality;
 static unsigned short default_quality; /* = 0; default to off */
 
@@ -98,6 +99,11 @@ static inline void cleanup_rng(struct kref *kref)
 
if (rng-cleanup)
rng-cleanup(rng);
+
+   /* cleanup_done should be updated after cleanup finishes */
+   smp_wmb();
+   rng-cleanup_done = true;
+   wake_up_all(rng_done);
 }
 
 static void set_current_rng(struct hwrng *rng)
@@ -498,6 +504,8 @@ int hwrng_register(struct hwrng *rng)
add_early_randomness(rng);
}
 
+   rng-cleanup_done = false;
+
 out_unlock:
mutex_unlock(rng_mutex);
 out:
@@ -529,6 +537,10 @@ void hwrng_unregister(struct hwrng *rng)
kthread_stop(hwrng_fill);
} else
mutex_unlock(rng_mutex);
+
+   /* Just in case rng is reading right now, wait. */
+   wait_event(rng_done, rng-cleanup_done 
+  atomic_read(rng-ref.refcount) == 0);
 }
 EXPORT_SYMBOL_GPL(hwrng_unregister);
 
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index c212e71..7832e50 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -46,6 +46,7 @@ struct hwrng {
/* internal. */
struct list_head list;
struct kref ref;
+   bool cleanup_done;
 };
 
 /** Register a new Hardware Random Number Generator driver. */
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 6/6] hw_random: don't init list element we're about to add to list.

2014-12-05 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

Another interesting anti-pattern.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index a9286bf..4d13ac5 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -489,7 +489,6 @@ int hwrng_register(struct hwrng *rng)
goto out_unlock;
}
}
-   INIT_LIST_HEAD(rng-list);
list_add_tail(rng-list, rng_list);
 
if (old_rng  !rng-init) {
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 5/6] hw_random: don't double-check old_rng.

2014-12-05 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

Interesting anti-pattern.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 067270b..a9286bf 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -476,14 +476,13 @@ int hwrng_register(struct hwrng *rng)
}
 
old_rng = current_rng;
+   err = 0;
if (!old_rng) {
err = hwrng_init(rng);
if (err)
goto out_unlock;
set_current_rng(rng);
-   }
-   err = 0;
-   if (!old_rng) {
+
err = register_miscdev();
if (err) {
drop_current_rng();
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 3/6] hw_random: use reference counts on each struct hwrng.

2014-12-05 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

current_rng holds one reference, and we bump it every time we want
to do a read from it.

This means we only hold the rng_mutex to grab or drop a reference,
so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
block on read of /dev/hwrng.

Using a kref is overkill (we're always under the rng_mutex), but
a standard pattern.

This also solves the problem that the hwrng_fillfn thread was
accessing current_rng without a lock, which could change (eg. to NULL)
underneath it.

v5: drop redundant kref_init()
v4: decrease last reference for triggering the cleanup
v3: initialize kref (thanks Amos Kong)
v2: fix missing put_rng() on exit path (thanks Amos Kong)
Signed-off-by: Rusty Russell ru...@rustcorp.com.au
Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 135 --
 include/linux/hw_random.h |   2 +
 2 files changed, 94 insertions(+), 43 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index a0905c8..83516cb 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -42,6 +42,7 @@
 #include linux/delay.h
 #include linux/slab.h
 #include linux/random.h
+#include linux/err.h
 #include asm/uaccess.h
 
 
@@ -91,6 +92,60 @@ static void add_early_randomness(struct hwrng *rng)
add_device_randomness(bytes, bytes_read);
 }
 
+static inline void cleanup_rng(struct kref *kref)
+{
+   struct hwrng *rng = container_of(kref, struct hwrng, ref);
+
+   if (rng-cleanup)
+   rng-cleanup(rng);
+}
+
+static void set_current_rng(struct hwrng *rng)
+{
+   BUG_ON(!mutex_is_locked(rng_mutex));
+   kref_get(rng-ref);
+   current_rng = rng;
+}
+
+static void drop_current_rng(void)
+{
+   BUG_ON(!mutex_is_locked(rng_mutex));
+   if (!current_rng)
+   return;
+
+   /* decrease last reference for triggering the cleanup */
+   kref_put(current_rng-ref, cleanup_rng);
+   current_rng = NULL;
+}
+
+/* Returns ERR_PTR(), NULL or refcounted hwrng */
+static struct hwrng *get_current_rng(void)
+{
+   struct hwrng *rng;
+
+   if (mutex_lock_interruptible(rng_mutex))
+   return ERR_PTR(-ERESTARTSYS);
+
+   rng = current_rng;
+   if (rng)
+   kref_get(rng-ref);
+
+   mutex_unlock(rng_mutex);
+   return rng;
+}
+
+static void put_rng(struct hwrng *rng)
+{
+   /*
+* Hold rng_mutex here so we serialize in case they set_current_rng
+* on rng again immediately.
+*/
+   mutex_lock(rng_mutex);
+   if (rng)
+   kref_put(rng-ref, cleanup_rng);
+   mutex_unlock(rng_mutex);
+}
+
 static inline int hwrng_init(struct hwrng *rng)
 {
if (rng-init) {
@@ -113,12 +168,6 @@ static inline int hwrng_init(struct hwrng *rng)
return 0;
 }
 
-static inline void hwrng_cleanup(struct hwrng *rng)
-{
-   if (rng  rng-cleanup)
-   rng-cleanup(rng);
-}
-
 static int rng_dev_open(struct inode *inode, struct file *filp)
 {
/* enforce read-only access to this chrdev */
@@ -154,21 +203,22 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
ssize_t ret = 0;
int err = 0;
int bytes_read, len;
+   struct hwrng *rng;
 
while (size) {
-   if (mutex_lock_interruptible(rng_mutex)) {
-   err = -ERESTARTSYS;
+   rng = get_current_rng();
+   if (IS_ERR(rng)) {
+   err = PTR_ERR(rng);
goto out;
}
-
-   if (!current_rng) {
+   if (!rng) {
err = -ENODEV;
-   goto out_unlock;
+   goto out;
}
 
mutex_lock(reading_mutex);
if (!data_avail) {
-   bytes_read = rng_get_data(current_rng, rng_buffer,
+   bytes_read = rng_get_data(rng, rng_buffer,
rng_buffer_size(),
!(filp-f_flags  O_NONBLOCK));
if (bytes_read  0) {
@@ -200,8 +250,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
ret += len;
}
 
-   mutex_unlock(rng_mutex);
mutex_unlock(reading_mutex);
+   put_rng(rng);
 
if (need_resched())
schedule_timeout_interruptible(1);
@@ -213,12 +263,11 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
}
 out:
return ret ? : err;
-out_unlock:
-   mutex_unlock(rng_mutex);
-   goto out;
+
 out_unlock_reading:
mutex_unlock(reading_mutex);
-   goto out_unlock;
+   put_rng(rng);
+   goto out;
 }
 
 
@@ -257,8 +306,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev

[PATCH v5 0/6] fix hw_random stuck

2014-12-05 Thread Amos Kong
When I hotunplug a busy virtio-rng device or try to access
hwrng attributes in non-smp guest, it gets stuck.

My hotplug tests:

| test 0:
|   hotunplug rng device from qemu monitor
|
| test 1:
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 2:
|   guest) # dd if=/dev/random of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 4:
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   cat /sys/devices/virtual/misc/hw_random/rng_*
|
| test 5:
|   guest) # dd if=/dev/hwrng of=/dev/null
|   cancel dd process after 10 seconds
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 6:
|   use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo

V5: reset cleanup_done flag, drop redundant init of reference count, use
compiler barrier to prevent recording.
V4: update patch 4 to fix corrupt, decrease last reference for triggering
the cleanup, fix unregister race pointed by Herbert
V3: initialize kref to 1
V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference
counting issue

Amos Kong (1):
  hw_random: move some code out mutex_lock for avoiding underlying
deadlock

Rusty Russell (5):
  hw_random: place mutex around read functions and buffers.
  hw_random: use reference counts on each struct hwrng.
  hw_random: fix unregister race.
  hw_random: don't double-check old_rng.
  hw_random: don't init list element we're about to add to list.

 drivers/char/hw_random/core.c | 173 ++
 include/linux/hw_random.h |   3 +
 2 files changed, 126 insertions(+), 50 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock

2014-12-05 Thread Amos Kong
In next patch, we use reference counting for each struct hwrng,
changing reference count also needs to take mutex_lock. Before
releasing the lock, if we try to stop a kthread that waits to
take the lock to reduce the referencing count, deadlock will
occur.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index b1b6042..a0905c8 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng)
}
}
if (list_empty(rng_list)) {
+   mutex_unlock(rng_mutex);
unregister_miscdev();
if (hwrng_fill)
kthread_stop(hwrng_fill);
-   }
-
-   mutex_unlock(rng_mutex);
+   } else
+   mutex_unlock(rng_mutex);
 }
 EXPORT_SYMBOL_GPL(hwrng_unregister);
 
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 1/6] hw_random: place mutex around read functions and buffers.

2014-12-05 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

There's currently a big lock around everything, and it means that we
can't query sysfs (eg /sys/devices/virtual/misc/hw_random/rng_current)
while the rng is reading.  This is a real problem when the rng is slow,
or blocked (eg. virtio_rng with qemu's default /dev/random backend)

This doesn't help (it leaves the current lock untouched), just adds a
lock to protect the read function and the static buffers, in preparation
for transition.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index aa30a25..b1b6042 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -53,7 +53,10 @@
 static struct hwrng *current_rng;
 static struct task_struct *hwrng_fill;
 static LIST_HEAD(rng_list);
+/* Protects rng_list and current_rng */
 static DEFINE_MUTEX(rng_mutex);
+/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */
+static DEFINE_MUTEX(reading_mutex);
 static int data_avail;
 static u8 *rng_buffer, *rng_fillbuf;
 static unsigned short current_quality;
@@ -81,7 +84,9 @@ static void add_early_randomness(struct hwrng *rng)
unsigned char bytes[16];
int bytes_read;
 
+   mutex_lock(reading_mutex);
bytes_read = rng_get_data(rng, bytes, sizeof(bytes), 1);
+   mutex_unlock(reading_mutex);
if (bytes_read  0)
add_device_randomness(bytes, bytes_read);
 }
@@ -128,6 +133,7 @@ static inline int rng_get_data(struct hwrng *rng, u8 
*buffer, size_t size,
int wait) {
int present;
 
+   BUG_ON(!mutex_is_locked(reading_mutex));
if (rng-read)
return rng-read(rng, (void *)buffer, size, wait);
 
@@ -160,13 +166,14 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
goto out_unlock;
}
 
+   mutex_lock(reading_mutex);
if (!data_avail) {
bytes_read = rng_get_data(current_rng, rng_buffer,
rng_buffer_size(),
!(filp-f_flags  O_NONBLOCK));
if (bytes_read  0) {
err = bytes_read;
-   goto out_unlock;
+   goto out_unlock_reading;
}
data_avail = bytes_read;
}
@@ -174,7 +181,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
if (!data_avail) {
if (filp-f_flags  O_NONBLOCK) {
err = -EAGAIN;
-   goto out_unlock;
+   goto out_unlock_reading;
}
} else {
len = data_avail;
@@ -186,7 +193,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
if (copy_to_user(buf + ret, rng_buffer + data_avail,
len)) {
err = -EFAULT;
-   goto out_unlock;
+   goto out_unlock_reading;
}
 
size -= len;
@@ -194,6 +201,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
}
 
mutex_unlock(rng_mutex);
+   mutex_unlock(reading_mutex);
 
if (need_resched())
schedule_timeout_interruptible(1);
@@ -208,6 +216,9 @@ out:
 out_unlock:
mutex_unlock(rng_mutex);
goto out;
+out_unlock_reading:
+   mutex_unlock(reading_mutex);
+   goto out_unlock;
 }
 
 
@@ -348,13 +359,16 @@ static int hwrng_fillfn(void *unused)
while (!kthread_should_stop()) {
if (!current_rng)
break;
+   mutex_lock(reading_mutex);
rc = rng_get_data(current_rng, rng_fillbuf,
  rng_buffer_size(), 1);
+   mutex_unlock(reading_mutex);
if (rc = 0) {
pr_warn(hwrng: no data available\n);
msleep_interruptible(1);
continue;
}
+   /* Outside lock, sure, but y'know: randomness. */
add_hwgenerator_randomness((void *)rng_fillbuf, rc,
   rc * current_quality * 8  10);
}
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 4/6] hw_random: fix unregister race.

2014-11-24 Thread Amos Kong
On Wed, Nov 12, 2014 at 02:33:00PM +1030, Rusty Russell wrote:
 Amos Kong ak...@redhat.com writes:
  From: Rusty Russell ru...@rustcorp.com.au
 
  The previous patch added one potential problem: we can still be
  reading from a hwrng when it's unregistered.  Add a wait for zero
  in the hwrng_unregister path.
 
  v4: add cleanup_done flag to insure that cleanup is done
 
 That's a bit weird.  The usual pattern would be to hold a reference
 until we're actually finished, but this reference is a bit weird.

The cleanup function is a callback function of kref_put(), we can't
use the same reference count inside cleanup function.
 
 We hold the mutex across cleanup, so we could grab that but we have to
 take care sleeping inside wait_event, otherwise Peter will have to fix
 my code again :)

We didn't hold rng_mutex inside cleanup_rng(), am I missing something?
 
 AFAICT the wake_woken() stuff isn't merged yet, so your patch will
 have to do for now.

Can you provide some patches/mail link here? I searched nothing about 
wake_woken.
 
  @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref)
   
  if (rng-cleanup)
  rng-cleanup(rng);
  +   rng-cleanup_done = true;
  +   wake_up_all(rng_done);
   }
   
   static void set_current_rng(struct hwrng *rng)
  @@ -536,6 +539,11 @@ void hwrng_unregister(struct hwrng *rng)
  kthread_stop(hwrng_fill);
  } else
  mutex_unlock(rng_mutex);
  +
  +   /* Just in case rng is reading right now, wait. */
  +   wait_event(rng_done, rng-cleanup_done 
  +  atomic_read(rng-ref.refcount) == 0);
  +
 
 The atomic_read() isn't necessary here.
 
 However, you should probably init cleanup_done in hwrng_register().
 (Probably noone does unregister then register, but let's be clear).

Got it.
 
 Thanks,
 Rusty.
 
   }
   EXPORT_SYMBOL_GPL(hwrng_unregister);
   
  diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
  index c212e71..7832e50 100644
  --- a/include/linux/hw_random.h
  +++ b/include/linux/hw_random.h
  @@ -46,6 +46,7 @@ struct hwrng {
  /* internal. */
  struct list_head list;
  struct kref ref;
  +   bool cleanup_done;
   };
   
   /** Register a new Hardware Random Number Generator driver. */
  -- 
  1.9.3

-- 
Amos.


signature.asc
Description: Digital signature


Re: [PATCH v4 3/6] hw_random: use reference counts on each struct hwrng.

2014-11-17 Thread Amos Kong
On Wed, Nov 12, 2014 at 02:11:23PM +1030, Rusty Russell wrote:
 Amos Kong ak...@redhat.com writes:
  From: Rusty Russell ru...@rustcorp.com.au
 
  current_rng holds one reference, and we bump it every time we want
  to do a read from it.
 
  This means we only hold the rng_mutex to grab or drop a reference,
  so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
  block on read of /dev/hwrng.
 
  Using a kref is overkill (we're always under the rng_mutex), but
  a standard pattern.
 
  This also solves the problem that the hwrng_fillfn thread was
  accessing current_rng without a lock, which could change (eg. to NULL)
  underneath it.
 
  v4: decrease last reference for triggering the cleanup
 
 This doesn't make any sense:
 
  +static void drop_current_rng(void)
  +{
  +   struct hwrng *rng = current_rng;
  +
  +   BUG_ON(!mutex_is_locked(rng_mutex));
  +   if (!current_rng)
  +   return;
  +
  +   /* release current_rng reference */
  +   kref_put(current_rng-ref, cleanup_rng);
  +   current_rng = NULL;
  +
  +   /* decrease last reference for triggering the cleanup */
  +   kref_put(rng-ref, cleanup_rng);
  +}
 
 Why would it drop the refcount twice?  This doesn't make sense.
 
 Hmm, because you added kref_init, which initializes the reference count
 to 1, you created this bug.

I saw some kernel code uses kref_* helper functions, the reference
conter is initialized to 1. Some code didn't use the helper functions
to increase/decrease the reference counter. So I will drop kref_init()
and the second kref_put().
 
 Leave out the kref_init, and let it naturally be 0 (until, and if, it
 becomes current_rng).  Add a comment if you want.

OK, thanks.
 
 Thanks,
 Rusty.
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Amos.


signature.asc
Description: Digital signature


Re: [PATCH 3/5] hw_random: fix unregister race.

2014-11-03 Thread Amos Kong
On Tue, Oct 21, 2014 at 10:15:23PM +0800, Herbert Xu wrote:
 On Thu, Sep 18, 2014 at 12:18:24PM +0930, Rusty Russell wrote:
  The previous patch added one potential problem: we can still be
  reading from a hwrng when it's unregistered.  Add a wait for zero
  in the hwrng_unregister path.
  
  Signed-off-by: Rusty Russell ru...@rustcorp.com.au
  ---
   drivers/char/hw_random/core.c | 5 +
   1 file changed, 5 insertions(+)
  
  diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
  index dc9092a1075d..b4a21e9521cf 100644
  --- a/drivers/char/hw_random/core.c
  +++ b/drivers/char/hw_random/core.c
  @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex);
   static DEFINE_MUTEX(reading_mutex);
   static int data_avail;
   static u8 *rng_buffer, *rng_fillbuf;
  +static DECLARE_WAIT_QUEUE_HEAD(rng_done);
   static unsigned short current_quality;
   static unsigned short default_quality; /* = 0; default to off */
   
  @@ -98,6 +99,7 @@ static inline void cleanup_rng(struct kref *kref)
   
  if (rng-cleanup)
  rng-cleanup(rng);

rng-cleanup_done = true;

  +   wake_up_all(rng_done);
   }
   
   static void set_current_rng(struct hwrng *rng)
  @@ -529,6 +531,9 @@ void hwrng_unregister(struct hwrng *rng)
  }
   
  mutex_unlock(rng_mutex);
  +
  +   /* Just in case rng is reading right now, wait. */
  +   wait_event(rng_done, atomic_read(rng-ref.refcount) == 0);

Hi Rusty,
 
After initializing (kref_init()), the refcount is 1, so we need one
more kref_put() after each drop_current_rng() to release last
reference count, then cleanup function will be called.


 While it's obviously better than what we have now, I don't believe
 this is 100% safe as the cleanup function might still be running
 even after the ref count hits zero.  Once we return from this function
 the module may be unloaded so we need to ensure that nothing is
 running at this point.

I found wait_event() can still pass and finish unregister even cleanup
function isn't called (wake_up_all() isn't called). So I added a flag
cleanup_done to indicate that the rng device is cleaned up.


+   /* Just in case rng is reading right now, wait. */
+   wait_event(rng_done, rng-cleanup_done 
+  atomic_read(rng-ref.refcount) == 0);

I will post the new v4 later.
 
 Cheers,
 -- 
 Email: Herbert Xu herb...@gondor.apana.org.au
 Home Page: http://gondor.apana.org.au/~herbert/
 PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 6/6] hw_random: don't init list element we're about to add to list.

2014-11-03 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

Another interesting anti-pattern.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index fc5de7d..b2cc8a1 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -493,7 +493,6 @@ int hwrng_register(struct hwrng *rng)
goto out_unlock;
}
}
-   INIT_LIST_HEAD(rng-list);
list_add_tail(rng-list, rng_list);
 
if (old_rng  !rng-init) {
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock

2014-11-03 Thread Amos Kong
In next patch, we use reference counting for each struct hwrng,
changing reference count also needs to take mutex_lock. Before
releasing the lock, if we try to stop a kthread that waits to
take the lock to reduce the referencing count, deadlock will
occur.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index b1b6042..a0905c8 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng)
}
}
if (list_empty(rng_list)) {
+   mutex_unlock(rng_mutex);
unregister_miscdev();
if (hwrng_fill)
kthread_stop(hwrng_fill);
-   }
-
-   mutex_unlock(rng_mutex);
+   } else
+   mutex_unlock(rng_mutex);
 }
 EXPORT_SYMBOL_GPL(hwrng_unregister);
 
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 0/6] fix hw_random stuck

2014-11-03 Thread Amos Kong
When I hotunplug a busy virtio-rng device or try to access
hwrng attributes in non-smp guest, it gets stuck.

My hotplug tests:

| test 0:
|   hotunplug rng device from qemu monitor
|
| test 1:
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 2:
|   guest) # dd if=/dev/random of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 4:
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   cat /sys/devices/virtual/misc/hw_random/rng_*
|
| test 5:
|   guest) # dd if=/dev/hwrng of=/dev/null
|   cancel dd process after 10 seconds
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 6:
|   use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo


V4: update patch 4 to fix corrupt, decrease last reference for triggering
the cleanup, fix unregister race pointed by Herbert
V3: initialize kref to 1
V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference
counting issue

Amos Kong (1):
  hw_random: move some code out mutex_lock for avoiding underlying
deadlock

Rusty Russell (5):
  hw_random: place mutex around read functions and buffers.
  hw_random: use reference counts on each struct hwrng.
  hw_random: fix unregister race.
  hw_random: don't double-check old_rng.
  hw_random: don't init list element we're about to add to list.

 drivers/char/hw_random/core.c | 176 ++
 include/linux/hw_random.h |   3 +
 2 files changed, 129 insertions(+), 50 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 3/6] hw_random: use reference counts on each struct hwrng.

2014-11-03 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

current_rng holds one reference, and we bump it every time we want
to do a read from it.

This means we only hold the rng_mutex to grab or drop a reference,
so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
block on read of /dev/hwrng.

Using a kref is overkill (we're always under the rng_mutex), but
a standard pattern.

This also solves the problem that the hwrng_fillfn thread was
accessing current_rng without a lock, which could change (eg. to NULL)
underneath it.

v4: decrease last reference for triggering the cleanup
v3: initialize kref (thanks Amos Kong)
v2: fix missing put_rng() on exit path (thanks Amos Kong)
Signed-off-by: Rusty Russell ru...@rustcorp.com.au
Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 142 +-
 include/linux/hw_random.h |   2 +
 2 files changed, 101 insertions(+), 43 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index a0905c8..27ad6b4 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -42,6 +42,7 @@
 #include linux/delay.h
 #include linux/slab.h
 #include linux/random.h
+#include linux/err.h
 #include asm/uaccess.h
 
 
@@ -91,6 +92,65 @@ static void add_early_randomness(struct hwrng *rng)
add_device_randomness(bytes, bytes_read);
 }
 
+static inline void cleanup_rng(struct kref *kref)
+{
+   struct hwrng *rng = container_of(kref, struct hwrng, ref);
+
+   if (rng-cleanup)
+   rng-cleanup(rng);
+}
+
+static void set_current_rng(struct hwrng *rng)
+{
+   BUG_ON(!mutex_is_locked(rng_mutex));
+   kref_get(rng-ref);
+   current_rng = rng;
+}
+
+static void drop_current_rng(void)
+{
+   struct hwrng *rng = current_rng;
+
+   BUG_ON(!mutex_is_locked(rng_mutex));
+   if (!current_rng)
+   return;
+
+   /* release current_rng reference */
+   kref_put(current_rng-ref, cleanup_rng);
+   current_rng = NULL;
+
+   /* decrease last reference for triggering the cleanup */
+   kref_put(rng-ref, cleanup_rng);
+}
+
+/* Returns ERR_PTR(), NULL or refcounted hwrng */
+static struct hwrng *get_current_rng(void)
+{
+   struct hwrng *rng;
+
+   if (mutex_lock_interruptible(rng_mutex))
+   return ERR_PTR(-ERESTARTSYS);
+
+   rng = current_rng;
+   if (rng)
+   kref_get(rng-ref);
+
+   mutex_unlock(rng_mutex);
+   return rng;
+}
+
+static void put_rng(struct hwrng *rng)
+{
+   /*
+* Hold rng_mutex here so we serialize in case they set_current_rng
+* on rng again immediately.
+*/
+   mutex_lock(rng_mutex);
+   if (rng)
+   kref_put(rng-ref, cleanup_rng);
+   mutex_unlock(rng_mutex);
+}
+
 static inline int hwrng_init(struct hwrng *rng)
 {
if (rng-init) {
@@ -110,13 +170,9 @@ static inline int hwrng_init(struct hwrng *rng)
if (current_quality  0  !hwrng_fill)
start_khwrngd();
 
-   return 0;
-}
+   kref_init(rng-ref);
 
-static inline void hwrng_cleanup(struct hwrng *rng)
-{
-   if (rng  rng-cleanup)
-   rng-cleanup(rng);
+   return 0;
 }
 
 static int rng_dev_open(struct inode *inode, struct file *filp)
@@ -154,21 +210,22 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
ssize_t ret = 0;
int err = 0;
int bytes_read, len;
+   struct hwrng *rng;
 
while (size) {
-   if (mutex_lock_interruptible(rng_mutex)) {
-   err = -ERESTARTSYS;
+   rng = get_current_rng();
+   if (IS_ERR(rng)) {
+   err = PTR_ERR(rng);
goto out;
}
-
-   if (!current_rng) {
+   if (!rng) {
err = -ENODEV;
-   goto out_unlock;
+   goto out;
}
 
mutex_lock(reading_mutex);
if (!data_avail) {
-   bytes_read = rng_get_data(current_rng, rng_buffer,
+   bytes_read = rng_get_data(rng, rng_buffer,
rng_buffer_size(),
!(filp-f_flags  O_NONBLOCK));
if (bytes_read  0) {
@@ -200,8 +257,8 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
ret += len;
}
 
-   mutex_unlock(rng_mutex);
mutex_unlock(reading_mutex);
+   put_rng(rng);
 
if (need_resched())
schedule_timeout_interruptible(1);
@@ -213,12 +270,11 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
}
 out:
return ret ? : err;
-out_unlock:
-   mutex_unlock(rng_mutex);
-   goto out;
+
 out_unlock_reading:
mutex_unlock(reading_mutex

[PATCH v4 1/6] hw_random: place mutex around read functions and buffers.

2014-11-03 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

There's currently a big lock around everything, and it means that we
can't query sysfs (eg /sys/devices/virtual/misc/hw_random/rng_current)
while the rng is reading.  This is a real problem when the rng is slow,
or blocked (eg. virtio_rng with qemu's default /dev/random backend)

This doesn't help (it leaves the current lock untouched), just adds a
lock to protect the read function and the static buffers, in preparation
for transition.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index aa30a25..b1b6042 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -53,7 +53,10 @@
 static struct hwrng *current_rng;
 static struct task_struct *hwrng_fill;
 static LIST_HEAD(rng_list);
+/* Protects rng_list and current_rng */
 static DEFINE_MUTEX(rng_mutex);
+/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */
+static DEFINE_MUTEX(reading_mutex);
 static int data_avail;
 static u8 *rng_buffer, *rng_fillbuf;
 static unsigned short current_quality;
@@ -81,7 +84,9 @@ static void add_early_randomness(struct hwrng *rng)
unsigned char bytes[16];
int bytes_read;
 
+   mutex_lock(reading_mutex);
bytes_read = rng_get_data(rng, bytes, sizeof(bytes), 1);
+   mutex_unlock(reading_mutex);
if (bytes_read  0)
add_device_randomness(bytes, bytes_read);
 }
@@ -128,6 +133,7 @@ static inline int rng_get_data(struct hwrng *rng, u8 
*buffer, size_t size,
int wait) {
int present;
 
+   BUG_ON(!mutex_is_locked(reading_mutex));
if (rng-read)
return rng-read(rng, (void *)buffer, size, wait);
 
@@ -160,13 +166,14 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
goto out_unlock;
}
 
+   mutex_lock(reading_mutex);
if (!data_avail) {
bytes_read = rng_get_data(current_rng, rng_buffer,
rng_buffer_size(),
!(filp-f_flags  O_NONBLOCK));
if (bytes_read  0) {
err = bytes_read;
-   goto out_unlock;
+   goto out_unlock_reading;
}
data_avail = bytes_read;
}
@@ -174,7 +181,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
if (!data_avail) {
if (filp-f_flags  O_NONBLOCK) {
err = -EAGAIN;
-   goto out_unlock;
+   goto out_unlock_reading;
}
} else {
len = data_avail;
@@ -186,7 +193,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
if (copy_to_user(buf + ret, rng_buffer + data_avail,
len)) {
err = -EFAULT;
-   goto out_unlock;
+   goto out_unlock_reading;
}
 
size -= len;
@@ -194,6 +201,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
}
 
mutex_unlock(rng_mutex);
+   mutex_unlock(reading_mutex);
 
if (need_resched())
schedule_timeout_interruptible(1);
@@ -208,6 +216,9 @@ out:
 out_unlock:
mutex_unlock(rng_mutex);
goto out;
+out_unlock_reading:
+   mutex_unlock(reading_mutex);
+   goto out_unlock;
 }
 
 
@@ -348,13 +359,16 @@ static int hwrng_fillfn(void *unused)
while (!kthread_should_stop()) {
if (!current_rng)
break;
+   mutex_lock(reading_mutex);
rc = rng_get_data(current_rng, rng_fillbuf,
  rng_buffer_size(), 1);
+   mutex_unlock(reading_mutex);
if (rc = 0) {
pr_warn(hwrng: no data available\n);
msleep_interruptible(1);
continue;
}
+   /* Outside lock, sure, but y'know: randomness. */
add_hwgenerator_randomness((void *)rng_fillbuf, rc,
   rc * current_quality * 8  10);
}
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 4/6] hw_random: fix unregister race.

2014-11-03 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

The previous patch added one potential problem: we can still be
reading from a hwrng when it's unregistered.  Add a wait for zero
in the hwrng_unregister path.

v4: add cleanup_done flag to insure that cleanup is done

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 8 
 include/linux/hw_random.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 27ad6b4..c31bf91 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex);
 static DEFINE_MUTEX(reading_mutex);
 static int data_avail;
 static u8 *rng_buffer, *rng_fillbuf;
+static DECLARE_WAIT_QUEUE_HEAD(rng_done);
 static unsigned short current_quality;
 static unsigned short default_quality; /* = 0; default to off */
 
@@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref)
 
if (rng-cleanup)
rng-cleanup(rng);
+   rng-cleanup_done = true;
+   wake_up_all(rng_done);
 }
 
 static void set_current_rng(struct hwrng *rng)
@@ -536,6 +539,11 @@ void hwrng_unregister(struct hwrng *rng)
kthread_stop(hwrng_fill);
} else
mutex_unlock(rng_mutex);
+
+   /* Just in case rng is reading right now, wait. */
+   wait_event(rng_done, rng-cleanup_done 
+  atomic_read(rng-ref.refcount) == 0);
+
 }
 EXPORT_SYMBOL_GPL(hwrng_unregister);
 
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index c212e71..7832e50 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -46,6 +46,7 @@ struct hwrng {
/* internal. */
struct list_head list;
struct kref ref;
+   bool cleanup_done;
 };
 
 /** Register a new Hardware Random Number Generator driver. */
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 5/6] hw_random: don't double-check old_rng.

2014-11-03 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

Interesting anti-pattern.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index c31bf91..fc5de7d 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -480,14 +480,13 @@ int hwrng_register(struct hwrng *rng)
}
 
old_rng = current_rng;
+   err = 0;
if (!old_rng) {
err = hwrng_init(rng);
if (err)
goto out_unlock;
set_current_rng(rng);
-   }
-   err = 0;
-   if (!old_rng) {
+
err = register_miscdev();
if (err) {
drop_current_rng();
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 4/6] hw_random: fix unregister race.

2014-11-02 Thread Amos Kong
On Fri, Oct 31, 2014 at 03:23:21PM +0800, Herbert Xu wrote:
 On Fri, Oct 31, 2014 at 10:28:00AM +1030, Rusty Russell wrote:
  Herbert Xu herb...@gondor.apana.org.au writes:
   On Thu, Sep 18, 2014 at 08:37:45PM +0800, Amos Kong wrote:
   From: Rusty Russell ru...@rustcorp.com.au
   
   The previous patch added one potential problem: we can still be
   reading from a hwrng when it's unregistered.  Add a wait for zero
   in the hwrng_unregister path.
   
   Signed-off-by: Rusty Russell ru...@rustcorp.com.au
  
   You totally corrupted Rusty's patch.  If you're going to repost
   his series you better make sure that you've got the right patches.
  
   Just as well though as it made me think a little more about this
   patch :)
  
  OK Amos, can you please repost the complete series?
 
 Please fix the unregister race I identified first.

Ok, I redownload the patches from 
https://patchwork.kernel.org/project/LKML/list/?submitter=5
and rebase fixes of me and rusty on it. I will post a V4 later. Thanks.

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 4/6] hw_random: fix unregister race.

2014-11-02 Thread Amos Kong
On Sun, Nov 02, 2014 at 11:08:09PM +0800, Herbert Xu wrote:
 On Sun, Nov 02, 2014 at 11:06:13PM +0800, Amos Kong wrote:
  On Fri, Oct 31, 2014 at 03:23:21PM +0800, Herbert Xu wrote:
   On Fri, Oct 31, 2014 at 10:28:00AM +1030, Rusty Russell wrote:
Herbert Xu herb...@gondor.apana.org.au writes:
 On Thu, Sep 18, 2014 at 08:37:45PM +0800, Amos Kong wrote:
 From: Rusty Russell ru...@rustcorp.com.au
 
 The previous patch added one potential problem: we can still be
 reading from a hwrng when it's unregistered.  Add a wait for zero
 in the hwrng_unregister path.
 
 Signed-off-by: Rusty Russell ru...@rustcorp.com.au

 You totally corrupted Rusty's patch.  If you're going to repost
 his series you better make sure that you've got the right patches.

 Just as well though as it made me think a little more about this
 patch :)

OK Amos, can you please repost the complete series?
   
   Please fix the unregister race I identified first.
  
  Ok, I redownload the patches from 
  https://patchwork.kernel.org/project/LKML/list/?submitter=5
  and rebase fixes of me and rusty on it. I will post a V4 later. Thanks.
 
 Please fix the unregister race I pointed out before doing a v4.

Thanks for the remind, I got it right now  :-)

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.

2014-10-19 Thread Amos Kong
On Mon, Oct 20, 2014 at 10:42:11AM +1030, Rusty Russell wrote:
 Amos Kong ak...@redhat.com writes:
  We got a warning in boot stage when above set_current_rng() is executed,
  it can be fixed by init rng-ref in hwrng_init().
 
 
  @@ -166,6 +169,8 @@ static inline int hwrng_init(struct hwrng *rng)
  if (current_quality  0  !hwrng_fill)
  start_khwrngd();
   
  +   kref_init(rng-ref);
  +
  return 0;
   }
 
 OK, I folded this fix on.

Thanks.

Reviewed-by: Amos Kong ak...@redhat.com
 
 Thanks,
 Rusty.
 
 hw_random: use reference counts on each struct hwrng.
 
 current_rng holds one reference, and we bump it every time we want
 to do a read from it.
 
 This means we only hold the rng_mutex to grab or drop a reference,
 so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
 block on read of /dev/hwrng.
 
 Using a kref is overkill (we're always under the rng_mutex), but
 a standard pattern.
 
 This also solves the problem that the hwrng_fillfn thread was
 accessing current_rng without a lock, which could change (eg. to NULL)
 underneath it.
 
 v3: initialize kref (thanks Amos Kong)
 v2: fix missing put_rng() on exit path (thanks Amos Kong)
 Signed-off-by: Rusty Russell ru...@rustcorp.com.au
 
 diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
 index a0905c818e13..0ecac38da954 100644
 --- a/drivers/char/hw_random/core.c
 +++ b/drivers/char/hw_random/core.c

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/5] hw_random: use reference counts on each struct hwrng.

2014-09-18 Thread Amos Kong
On Thu, Sep 18, 2014 at 12:18:23PM +0930, Rusty Russell wrote:
 current_rng holds one reference, and we bump it every time we want
 to do a read from it.
 
 This means we only hold the rng_mutex to grab or drop a reference,
 so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
 block on read of /dev/hwrng.
 
 Using a kref is overkill (we're always under the rng_mutex), but
 a standard pattern.
 
 This also solves the problem that the hwrng_fillfn thread was
 accessing current_rng without a lock, which could change (eg. to NULL)
 underneath it.

Hi Rusty,
 
 Signed-off-by: Rusty Russell ru...@rustcorp.com.au
 ---
  drivers/char/hw_random/core.c | 135 
 --
  include/linux/hw_random.h |   2 +
  2 files changed, 94 insertions(+), 43 deletions(-)

...

  static int rng_dev_open(struct inode *inode, struct file *filp)
  {
   /* enforce read-only access to this chrdev */
 @@ -154,21 +202,22 @@ static ssize_t rng_dev_read(struct file *filp, char 
 __user *buf,
   ssize_t ret = 0;
   int err = 0;
   int bytes_read, len;
 + struct hwrng *rng;
  
   while (size) {
 - if (mutex_lock_interruptible(rng_mutex)) {
 - err = -ERESTARTSYS;
 + rng = get_current_rng();
 + if (IS_ERR(rng)) {
 + err = PTR_ERR(rng);
   goto out;
   }
 -
 - if (!current_rng) {
 + if (!rng) {
   err = -ENODEV;
 - goto out_unlock;
 + goto out;
   }
  
   mutex_lock(reading_mutex);
   if (!data_avail) {
 - bytes_read = rng_get_data(current_rng, rng_buffer,
 + bytes_read = rng_get_data(rng, rng_buffer,
   rng_buffer_size(),
   !(filp-f_flags  O_NONBLOCK));
   if (bytes_read  0) {
 @@ -200,7 +249,6 @@ static ssize_t rng_dev_read(struct file *filp, char 
 __user *buf,
   ret += len;
   }
  
 - mutex_unlock(rng_mutex);
   mutex_unlock(reading_mutex);
  
   if (need_resched())
 @@ -210,15 +258,16 @@ static ssize_t rng_dev_read(struct file *filp, char 
 __user *buf,
   err = -ERESTARTSYS;

We need put_rng() in this error path. Otherwise, unhotplug will hang
in the end of hwrng_unregister()

|/* Just in case rng is reading right now, wait. */
|wait_event(rng_done, atomic_read(rng-ref.refcount) == 0);

Steps to reproduce the hang:
  guest) # dd if=/dev/hwrng of=/dev/null 
  cancel dd process after 10 seconds
  guest) # dd if=/dev/hwrng of=/dev/null 
  hotunplug rng device from qemu monitor
  result: device can't be removed (still can find in QEMU monitor)


diff --git a/drivers/char/hw_random/core.c
b/drivers/char/hw_random/core.c
index 96fa067..4e22d70 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -258,6 +258,7 @@ static ssize_t rng_dev_read(struct file *filp,
char __user *buf,
 
if (signal_pending(current)) {
err = -ERESTARTSYS;
+   put_rng(rng);
goto out;
}

   goto out;
   }
 +
 + put_rng(rng);
   }
  out:
   return ret ? : err;
 -out_unlock:
 - mutex_unlock(rng_mutex);
 - goto out;
 +
  out_unlock_reading:
   mutex_unlock(reading_mutex);
 - goto out_unlock;
 + put_rng(rng);
 + goto out;
  }
  
  
 @@ -257,8 +306,8 @@ static ssize_t hwrng_attr_current_store(struct device 
 *dev,
   err = hwrng_init(rng);
   if (err)
   break;
 - hwrng_cleanup(current_rng);
 - current_rng = rng;
 + drop_current_rng();
 + set_current_rng(rng);
   err = 0;
   break;
   }
 @@ -272,17 +321,15 @@ static ssize_t hwrng_attr_current_show(struct device 
 *dev,
  struct device_attribute *attr,
  char *buf)
  {
 - int err;
   ssize_t ret;
 - const char *name = none;
 + struct hwrng *rng;
  
 - err = mutex_lock_interruptible(rng_mutex);
 - if (err)
 - return -ERESTARTSYS;
 - if (current_rng)
 - name = current_rng-name;
 - ret = snprintf(buf, PAGE_SIZE, %s\n, name);
 - mutex_unlock(rng_mutex);
 + rng = get_current_rng();
 + if (IS_ERR(rng))
 + return PTR_ERR(rng);
 +
 + ret = snprintf(buf, PAGE_SIZE, %s\n, rng ? rng-name : none);
 + put_rng(rng);
  
   return ret;
  }
 @@ -357,12 +404,16 @@ static int hwrng_fillfn(void *unused)
   long rc;
  
   while (!kthread_should_stop()) {
 -  

[PATCH v2 6/6] hw_random: don't init list element we're about to add to list.

2014-09-18 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

Another interesting anti-pattern.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 6420409..12187dd 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -486,7 +486,6 @@ int hwrng_register(struct hwrng *rng)
goto out_unlock;
}
}
-   INIT_LIST_HEAD(rng-list);
list_add_tail(rng-list, rng_list);
 
if (old_rng  !rng-init) {
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/6] hw_random: move some code out mutex_lock for avoiding underlying deadlock

2014-09-18 Thread Amos Kong
In next patch, we use reference counting for each struct hwrng,
changing reference count also needs to take mutex_lock. Before
releasing the lock, if we try to stop a kthread that waits to
take the lock to reduce the referencing count, deadlock will
occur.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index b1b6042..a0905c8 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -474,12 +474,12 @@ void hwrng_unregister(struct hwrng *rng)
}
}
if (list_empty(rng_list)) {
+   mutex_unlock(rng_mutex);
unregister_miscdev();
if (hwrng_fill)
kthread_stop(hwrng_fill);
-   }
-
-   mutex_unlock(rng_mutex);
+   } else
+   mutex_unlock(rng_mutex);
 }
 EXPORT_SYMBOL_GPL(hwrng_unregister);
 
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 4/6] hw_random: fix unregister race.

2014-09-18 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

The previous patch added one potential problem: we can still be
reading from a hwrng when it's unregistered.  Add a wait for zero
in the hwrng_unregister path.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index ee9e504..9f6f5bb 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex);
 static DEFINE_MUTEX(reading_mutex);
 static int data_avail;
 static u8 *rng_buffer, *rng_fillbuf;
+static DECLARE_WAIT_QUEUE_HEAD(rng_done);
 static unsigned short current_quality;
 static unsigned short default_quality; /* = 0; default to off */
 
@@ -98,6 +99,7 @@ static inline void cleanup_rng(struct kref *kref)
 
if (rng-cleanup)
rng-cleanup(rng);
+   wake_up_all(rng_done);
 }
 
 static void set_current_rng(struct hwrng *rng)
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 5/6] hw_random: don't double-check old_rng.

2014-09-18 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

Interesting anti-pattern.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 9f6f5bb..6420409 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -473,14 +473,13 @@ int hwrng_register(struct hwrng *rng)
}
 
old_rng = current_rng;
+   err = 0;
if (!old_rng) {
err = hwrng_init(rng);
if (err)
goto out_unlock;
set_current_rng(rng);
-   }
-   err = 0;
-   if (!old_rng) {
+
err = register_miscdev();
if (err) {
drop_current_rng();
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.

2014-09-18 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

current_rng holds one reference, and we bump it every time we want
to do a read from it.

This means we only hold the rng_mutex to grab or drop a reference,
so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
block on read of /dev/hwrng.

Using a kref is overkill (we're always under the rng_mutex), but
a standard pattern.

This also solves the problem that the hwrng_fillfn thread was
accessing current_rng without a lock, which could change (eg. to NULL)
underneath it.

V2: reduce reference count in signal_pending path

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 136 +-
 include/linux/hw_random.h |   2 +
 2 files changed, 95 insertions(+), 43 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index a0905c8..ee9e504 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -42,6 +42,7 @@
 #include linux/delay.h
 #include linux/slab.h
 #include linux/random.h
+#include linux/err.h
 #include asm/uaccess.h
 
 
@@ -91,6 +92,59 @@ static void add_early_randomness(struct hwrng *rng)
add_device_randomness(bytes, bytes_read);
 }
 
+static inline void cleanup_rng(struct kref *kref)
+{
+   struct hwrng *rng = container_of(kref, struct hwrng, ref);
+
+   if (rng-cleanup)
+   rng-cleanup(rng);
+}
+
+static void set_current_rng(struct hwrng *rng)
+{
+   BUG_ON(!mutex_is_locked(rng_mutex));
+   kref_get(rng-ref);
+   current_rng = rng;
+}
+
+static void drop_current_rng(void)
+{
+   BUG_ON(!mutex_is_locked(rng_mutex));
+   if (!current_rng)
+   return;
+
+   kref_put(current_rng-ref, cleanup_rng);
+   current_rng = NULL;
+}
+
+/* Returns ERR_PTR(), NULL or refcounted hwrng */
+static struct hwrng *get_current_rng(void)
+{
+   struct hwrng *rng;
+
+   if (mutex_lock_interruptible(rng_mutex))
+   return ERR_PTR(-ERESTARTSYS);
+
+   rng = current_rng;
+   if (rng)
+   kref_get(rng-ref);
+
+   mutex_unlock(rng_mutex);
+   return rng;
+}
+
+static void put_rng(struct hwrng *rng)
+{
+   /*
+* Hold rng_mutex here so we serialize in case they set_current_rng
+* on rng again immediately.
+*/
+   mutex_lock(rng_mutex);
+   if (rng)
+   kref_put(rng-ref, cleanup_rng);
+   mutex_unlock(rng_mutex);
+}
+
 static inline int hwrng_init(struct hwrng *rng)
 {
if (rng-init) {
@@ -113,12 +167,6 @@ static inline int hwrng_init(struct hwrng *rng)
return 0;
 }
 
-static inline void hwrng_cleanup(struct hwrng *rng)
-{
-   if (rng  rng-cleanup)
-   rng-cleanup(rng);
-}
-
 static int rng_dev_open(struct inode *inode, struct file *filp)
 {
/* enforce read-only access to this chrdev */
@@ -154,21 +202,22 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
ssize_t ret = 0;
int err = 0;
int bytes_read, len;
+   struct hwrng *rng;
 
while (size) {
-   if (mutex_lock_interruptible(rng_mutex)) {
-   err = -ERESTARTSYS;
+   rng = get_current_rng();
+   if (IS_ERR(rng)) {
+   err = PTR_ERR(rng);
goto out;
}
-
-   if (!current_rng) {
+   if (!rng) {
err = -ENODEV;
-   goto out_unlock;
+   goto out;
}
 
mutex_lock(reading_mutex);
if (!data_avail) {
-   bytes_read = rng_get_data(current_rng, rng_buffer,
+   bytes_read = rng_get_data(rng, rng_buffer,
rng_buffer_size(),
!(filp-f_flags  O_NONBLOCK));
if (bytes_read  0) {
@@ -200,7 +249,6 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
ret += len;
}
 
-   mutex_unlock(rng_mutex);
mutex_unlock(reading_mutex);
 
if (need_resched())
@@ -208,17 +256,19 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
 
if (signal_pending(current)) {
err = -ERESTARTSYS;
+   put_rng(rng);
goto out;
}
+
+   put_rng(rng);
}
 out:
return ret ? : err;
-out_unlock:
-   mutex_unlock(rng_mutex);
-   goto out;
+
 out_unlock_reading:
mutex_unlock(reading_mutex);
-   goto out_unlock;
+   put_rng(rng);
+   goto out;
 }
 
 
@@ -257,8 +307,8 @@ static ssize_t hwrng_attr_current_store(struct device *dev,
err = hwrng_init(rng);
if (err

[PATCH v2 0/6] fix hw_random stuck

2014-09-18 Thread Amos Kong
When I hotunplug a busy virtio-rng device or try to access
hwrng attributes in non-smp guest, it gets stuck.

My original was pain, Rusty posted a real fix. This patchset
fixed two issue in v1, and tested by my 6+ cases.


| test 0:
|   hotunplug rng device from qemu monitor
|
| test 1:
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 2:
|   guest) # dd if=/dev/random of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 4:
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   cat /sys/devices/virtual/misc/hw_random/rng_*
|
| test 5:
|   guest) # dd if=/dev/hwrng of=/dev/null
|   cancel dd process after 10 seconds
|   guest) # dd if=/dev/hwrng of=/dev/null 
|   hotunplug rng device from qemu monitor
|
| test 6:
|   use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo

V2: added patch 2 to fix a deadlock, update current patch 3 to fix reference
counting issue

Amos Kong (1):
  hw_random: move some code out mutex_lock for avoiding underlying
deadlock

Rusty Russell (5):
  hw_random: place mutex around read functions and buffers.
  hw_random: use reference counts on each struct hwrng.
  hw_random: fix unregister race.
  hw_random: don't double-check old_rng.
  hw_random: don't init list element we're about to add to list.

 drivers/char/hw_random/core.c | 166 +-
 include/linux/hw_random.h |   2 +
 2 files changed, 117 insertions(+), 51 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/6] hw_random: place mutex around read functions and buffers.

2014-09-18 Thread Amos Kong
From: Rusty Russell ru...@rustcorp.com.au

There's currently a big lock around everything, and it means that we
can't query sysfs (eg /sys/devices/virtual/misc/hw_random/rng_current)
while the rng is reading.  This is a real problem when the rng is slow,
or blocked (eg. virtio_rng with qemu's default /dev/random backend)

This doesn't help (it leaves the current lock untouched), just adds a
lock to protect the read function and the static buffers, in preparation
for transition.

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/char/hw_random/core.c | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index aa30a25..b1b6042 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -53,7 +53,10 @@
 static struct hwrng *current_rng;
 static struct task_struct *hwrng_fill;
 static LIST_HEAD(rng_list);
+/* Protects rng_list and current_rng */
 static DEFINE_MUTEX(rng_mutex);
+/* Protects rng read functions, data_avail, rng_buffer and rng_fillbuf */
+static DEFINE_MUTEX(reading_mutex);
 static int data_avail;
 static u8 *rng_buffer, *rng_fillbuf;
 static unsigned short current_quality;
@@ -81,7 +84,9 @@ static void add_early_randomness(struct hwrng *rng)
unsigned char bytes[16];
int bytes_read;
 
+   mutex_lock(reading_mutex);
bytes_read = rng_get_data(rng, bytes, sizeof(bytes), 1);
+   mutex_unlock(reading_mutex);
if (bytes_read  0)
add_device_randomness(bytes, bytes_read);
 }
@@ -128,6 +133,7 @@ static inline int rng_get_data(struct hwrng *rng, u8 
*buffer, size_t size,
int wait) {
int present;
 
+   BUG_ON(!mutex_is_locked(reading_mutex));
if (rng-read)
return rng-read(rng, (void *)buffer, size, wait);
 
@@ -160,13 +166,14 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
goto out_unlock;
}
 
+   mutex_lock(reading_mutex);
if (!data_avail) {
bytes_read = rng_get_data(current_rng, rng_buffer,
rng_buffer_size(),
!(filp-f_flags  O_NONBLOCK));
if (bytes_read  0) {
err = bytes_read;
-   goto out_unlock;
+   goto out_unlock_reading;
}
data_avail = bytes_read;
}
@@ -174,7 +181,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
if (!data_avail) {
if (filp-f_flags  O_NONBLOCK) {
err = -EAGAIN;
-   goto out_unlock;
+   goto out_unlock_reading;
}
} else {
len = data_avail;
@@ -186,7 +193,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
if (copy_to_user(buf + ret, rng_buffer + data_avail,
len)) {
err = -EFAULT;
-   goto out_unlock;
+   goto out_unlock_reading;
}
 
size -= len;
@@ -194,6 +201,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
}
 
mutex_unlock(rng_mutex);
+   mutex_unlock(reading_mutex);
 
if (need_resched())
schedule_timeout_interruptible(1);
@@ -208,6 +216,9 @@ out:
 out_unlock:
mutex_unlock(rng_mutex);
goto out;
+out_unlock_reading:
+   mutex_unlock(reading_mutex);
+   goto out_unlock;
 }
 
 
@@ -348,13 +359,16 @@ static int hwrng_fillfn(void *unused)
while (!kthread_should_stop()) {
if (!current_rng)
break;
+   mutex_lock(reading_mutex);
rc = rng_get_data(current_rng, rng_fillbuf,
  rng_buffer_size(), 1);
+   mutex_unlock(reading_mutex);
if (rc = 0) {
pr_warn(hwrng: no data available\n);
msleep_interruptible(1);
continue;
}
+   /* Outside lock, sure, but y'know: randomness. */
add_hwgenerator_randomness((void *)rng_fillbuf, rc,
   rc * current_quality * 8  10);
}
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 2/3] hw_random: fix stuck in catting hwrng attributes

2014-09-18 Thread Amos Kong
On Thu, Sep 18, 2014 at 12:13:08PM +0930, Rusty Russell wrote:
 Amos Kong ak...@redhat.com writes:
 
  I started a QEMU (non-smp) guest with one virtio-rng device, and read
  random data from /dev/hwrng by dd:
 
   # dd if=/dev/hwrng of=/dev/null 
 
  In the same time, if I check hwrng attributes from sysfs by cat:
 
   # cat /sys/class/misc/hw_random/rng_*
 
  The cat process always gets stuck with slow backend (5 k/s), if we
  use a quick backend (1.2 M/s), the cat process will cost 1 to 2
  minutes. The stuck doesn't exist for smp guest.
 
  Reading syscall enters kernel and call rng_dev_read(), it's user
  context. We used need_resched() to check if other tasks need to
  be run, but it almost always return false, and re-hold the mutex
  lock. The attributes accessing process always fails to hold the
  lock, so the cat gets stuck.
 
  User context doesn't allow other user contexts run on that CPU,
  unless the kernel code sleeps for some reason. This is why the
  need_reshed() always return false here.
 
  This patch removed need_resched() and always schedule other tasks
  then other tasks can have chance to hold the lock and execute
  protected code.
 
Hi Rusty,

 OK, this is going to be a rant.
 
 Your explanation doesn't make sense at all.  Worse, your solution breaks
 the advice of Kernighan  Plaugher: Don't patch bad code - rewrite
 it..
 
 But worst of all, this detailed explanation might have convinced me you
 understood the problem better than I did, and applied your patch.
 
I'm sorry about the misleading.

 I did some tests.  For me, as expected, the process spends its time
 inside the virtio rng read function, holding the mutex and thus blocking
 sysfs access; it's not a failure of this code at all.

Got it now.

The catting hang bug was found when I try to fix unhotplug issue, the
unhotplug issue can't be reproduced if I try to debug by gdb or
printk. So I forgot to debug cat hang ... but spend time to misunderstand
schedle code :(

 Your schedule_timeout() fix probably just helps by letting the host
 refresh entropy, so we spend less time waiting in the read fn.
 
 I will post a series, which unfortunately is only lightly tested, then
 I'm going to have some beer to begin my holiday.  That may help me
 forget my disappointment at seeing respected fellow developers
 monkey-patching random code they don't understand.

I just posted a V2 with two additional fixes, hotunplugging works well now :)

 Grrr

Enjoy your holiday!
Amos

 Rusty.

  Signed-off-by: Amos Kong ak...@redhat.com
  ---
   drivers/char/hw_random/core.c | 3 +--
   1 file changed, 1 insertion(+), 2 deletions(-)
 
  diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
  index c591d7e..263a370 100644
  --- a/drivers/char/hw_random/core.c
  +++ b/drivers/char/hw_random/core.c
  @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char 
  __user *buf,
   
  mutex_unlock(rng_mutex);
   
  -   if (need_resched())
  -   schedule_timeout_interruptible(1);
  +   schedule_timeout_interruptible(1);
   
  if (signal_pending(current)) {
  err = -ERESTARTSYS;
  -- 
  1.9.3

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 3/6] hw_random: use reference counts on each struct hwrng.

2014-09-18 Thread Amos Kong
On Thu, Sep 18, 2014 at 08:37:44PM +0800, Amos Kong wrote:
 From: Rusty Russell ru...@rustcorp.com.au
 
 current_rng holds one reference, and we bump it every time we want
 to do a read from it.
 
 This means we only hold the rng_mutex to grab or drop a reference,
 so accessing /sys/devices/virtual/misc/hw_random/rng_current doesn't
 block on read of /dev/hwrng.
 
 Using a kref is overkill (we're always under the rng_mutex), but
 a standard pattern.
 
 This also solves the problem that the hwrng_fillfn thread was
 accessing current_rng without a lock, which could change (eg. to NULL)
 underneath it.
 
 V2: reduce reference count in signal_pending path
 
 Signed-off-by: Rusty Russell ru...@rustcorp.com.au
 Signed-off-by: Amos Kong ak...@redhat.com

Reply myself.

 ---
  drivers/char/hw_random/core.c | 136 
 +-
  include/linux/hw_random.h |   2 +
  2 files changed, 95 insertions(+), 43 deletions(-)
 
 diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
 index a0905c8..ee9e504 100644
 --- a/drivers/char/hw_random/core.c
 +++ b/drivers/char/hw_random/core.c
 @@ -42,6 +42,7 @@
  #include linux/delay.h
  #include linux/slab.h
  #include linux/random.h
 +#include linux/err.h
  #include asm/uaccess.h
  
  
 @@ -91,6 +92,59 @@ static void add_early_randomness(struct hwrng *rng)
   add_device_randomness(bytes, bytes_read);
  }
  
 +static inline void cleanup_rng(struct kref *kref)
 +{
 + struct hwrng *rng = container_of(kref, struct hwrng, ref);
 +
 + if (rng-cleanup)
 + rng-cleanup(rng);
 +}
 +
 +static void set_current_rng(struct hwrng *rng)
 +{
 + BUG_ON(!mutex_is_locked(rng_mutex));
 + kref_get(rng-ref);
 + current_rng = rng;
 +}
 +
 +static void drop_current_rng(void)
 +{
 + BUG_ON(!mutex_is_locked(rng_mutex));
 + if (!current_rng)
 + return;
 +
 + kref_put(current_rng-ref, cleanup_rng);
 + current_rng = NULL;
 +}
 +
 +/* Returns ERR_PTR(), NULL or refcounted hwrng */
 +static struct hwrng *get_current_rng(void)
 +{
 + struct hwrng *rng;
 +
 + if (mutex_lock_interruptible(rng_mutex))
 + return ERR_PTR(-ERESTARTSYS);
 +
 + rng = current_rng;
 + if (rng)
 + kref_get(rng-ref);
 +
 + mutex_unlock(rng_mutex);
 + return rng;
 +}
 +
 +static void put_rng(struct hwrng *rng)
 +{
 + /*
 +  * Hold rng_mutex here so we serialize in case they set_current_rng
 +  * on rng again immediately.
 +  */
 + mutex_lock(rng_mutex);
 + if (rng)
 + kref_put(rng-ref, cleanup_rng);
 + mutex_unlock(rng_mutex);
 +}
 +
  static inline int hwrng_init(struct hwrng *rng)
  {
   if (rng-init) {
 @@ -113,12 +167,6 @@ static inline int hwrng_init(struct hwrng *rng)
   return 0;
  }
  
 -static inline void hwrng_cleanup(struct hwrng *rng)
 -{
 - if (rng  rng-cleanup)
 - rng-cleanup(rng);
 -}
 -
  static int rng_dev_open(struct inode *inode, struct file *filp)
  {
   /* enforce read-only access to this chrdev */
 @@ -154,21 +202,22 @@ static ssize_t rng_dev_read(struct file *filp, char 
 __user *buf,
   ssize_t ret = 0;
   int err = 0;
   int bytes_read, len;
 + struct hwrng *rng;
  
   while (size) {
 - if (mutex_lock_interruptible(rng_mutex)) {
 - err = -ERESTARTSYS;
 + rng = get_current_rng();
 + if (IS_ERR(rng)) {
 + err = PTR_ERR(rng);
   goto out;
   }
 -
 - if (!current_rng) {
 + if (!rng) {
   err = -ENODEV;
 - goto out_unlock;
 + goto out;
   }
  
   mutex_lock(reading_mutex);
   if (!data_avail) {
 - bytes_read = rng_get_data(current_rng, rng_buffer,
 + bytes_read = rng_get_data(rng, rng_buffer,
   rng_buffer_size(),
   !(filp-f_flags  O_NONBLOCK));
   if (bytes_read  0) {
 @@ -200,7 +249,6 @@ static ssize_t rng_dev_read(struct file *filp, char 
 __user *buf,
   ret += len;
   }
  
 - mutex_unlock(rng_mutex);
   mutex_unlock(reading_mutex);
  
   if (need_resched())
 @@ -208,17 +256,19 @@ static ssize_t rng_dev_read(struct file *filp, char 
 __user *buf,
  
   if (signal_pending(current)) {
   err = -ERESTARTSYS;
 + put_rng(rng);
   goto out;
   }
 +
 + put_rng(rng);
   }
  out:
   return ret ? : err;
 -out_unlock:
 - mutex_unlock(rng_mutex);
 - goto out;
 +
  out_unlock_reading:
   mutex_unlock(reading_mutex);
 - goto out_unlock;
 + put_rng(rng);
 + goto out;
  }
  
  
 @@ -257,8 +307,8 @@ static ssize_t hwrng_attr_current_store(struct

[PATCH v2 1/3] virtio-rng cleanup: move some code out of mutex protection

2014-09-15 Thread Amos Kong
It doesn't save too much cpu time as expected, just a cleanup.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index aa30a25..c591d7e 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -270,8 +270,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev,
return -ERESTARTSYS;
if (current_rng)
name = current_rng-name;
-   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
mutex_unlock(rng_mutex);
+   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
 
return ret;
 }
@@ -284,19 +284,19 @@ static ssize_t hwrng_attr_available_show(struct device 
*dev,
ssize_t ret = 0;
struct hwrng *rng;
 
+   buf[0] = '\0';
err = mutex_lock_interruptible(rng_mutex);
if (err)
return -ERESTARTSYS;
-   buf[0] = '\0';
list_for_each_entry(rng, rng_list, list) {
strncat(buf, rng-name, PAGE_SIZE - ret - 1);
ret += strlen(rng-name);
strncat(buf,  , PAGE_SIZE - ret - 1);
ret++;
}
+   mutex_unlock(rng_mutex);
strncat(buf, \n, PAGE_SIZE - ret - 1);
ret++;
-   mutex_unlock(rng_mutex);
 
return ret;
 }
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/3] hw_random: increase schedule timeout in rng_dev_read()

2014-09-15 Thread Amos Kong
This patch increases the schedule timeout to 10 jiffies, it's more
appropriate, then other takes can easy to hold the mutex lock.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 263a370..b5d1b6f 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -195,7 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
 
mutex_unlock(rng_mutex);
 
-   schedule_timeout_interruptible(1);
+   schedule_timeout_interruptible(10);
 
if (signal_pending(current)) {
err = -ERESTARTSYS;
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/3] fix stuck in accessing hwrng attributes

2014-09-15 Thread Amos Kong
If we read hwrng by long-running dd process, it takes too much cpu
time and almost hold the mutex lock. When we check hwrng attributes
from sysfs by cat, it gets stuck in waiting the lock releaseing.
The problem can only be reproduced with non-smp guest with slow backend.

This patchset resolves the issue by changing rng_dev_read() to always
schedule 10 jiffies after release mutex lock, then cat process can
have chance to get the lock and execute protected code without stuck.

Thanks.

V2: update commitlog to describe PATCH 2, split second patch.

Amos Kong (3):
  virtio-rng cleanup: move some code out of mutex protection
  hw_random: fix stuck in catting hwrng attributes
  hw_random: increase schedule timeout in rng_dev_read()

 drivers/char/hw_random/core.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/3] hw_random: fix stuck in catting hwrng attributes

2014-09-15 Thread Amos Kong
I started a QEMU (non-smp) guest with one virtio-rng device, and read
random data from /dev/hwrng by dd:

 # dd if=/dev/hwrng of=/dev/null 

In the same time, if I check hwrng attributes from sysfs by cat:

 # cat /sys/class/misc/hw_random/rng_*

The cat process always gets stuck with slow backend (5 k/s), if we
use a quick backend (1.2 M/s), the cat process will cost 1 to 2
minutes. The stuck doesn't exist for smp guest.

Reading syscall enters kernel and call rng_dev_read(), it's user
context. We used need_resched() to check if other tasks need to
be run, but it almost always return false, and re-hold the mutex
lock. The attributes accessing process always fails to hold the
lock, so the cat gets stuck.

User context doesn't allow other user contexts run on that CPU,
unless the kernel code sleeps for some reason. This is why the
need_reshed() always return false here.

This patch removed need_resched() and always schedule other tasks
then other tasks can have chance to hold the lock and execute
protected code.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index c591d7e..263a370 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
 
mutex_unlock(rng_mutex);
 
-   if (need_resched())
-   schedule_timeout_interruptible(1);
+   schedule_timeout_interruptible(1);
 
if (signal_pending(current)) {
err = -ERESTARTSYS;
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 3/3] hw_random: increase schedule timeout in rng_dev_read()

2014-09-15 Thread Amos Kong
On Mon, Sep 15, 2014 at 06:13:31PM +0200, Michael Büsch wrote:
 On Tue, 16 Sep 2014 00:02:29 +0800
 Amos Kong ak...@redhat.com wrote:
 
  This patch increases the schedule timeout to 10 jiffies, it's more
  appropriate, then other takes can easy to hold the mutex lock.
  
  Signed-off-by: Amos Kong ak...@redhat.com
  ---
   drivers/char/hw_random/core.c | 2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)
  
  diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
  index 263a370..b5d1b6f 100644
  --- a/drivers/char/hw_random/core.c
  +++ b/drivers/char/hw_random/core.c
  @@ -195,7 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char 
  __user *buf,
   
  mutex_unlock(rng_mutex);
   
  -   schedule_timeout_interruptible(1);
  +   schedule_timeout_interruptible(10);
   
  if (signal_pending(current)) {
  err = -ERESTARTSYS;
 
 Does a schedule of 1 ms or 10 ms decrease the throughput?

In my test environment, 1 jiffe always works (100%), as suggested by
Amit 10 jiffes is more appropriate.

After applied current 3 patches, there is a throughput regression.

  1.2 M/s - 6 K/s

We can only schedule in the end of loop (size == 0), and only for
non-smp guest. So smp guest won't be effected.

|   if (!size  num_online_cpus() == 1)
|   schedule_timeout_interruptible(timeout);


Set timeout to 1:
  non-smp guest with quick backend (1.2M/s) - about 49K/s)

Set timeout to 10:
  non-smp guest with quick backend (1.2M/s) - about 490K/s)

We might need other benchmark to test the performance, but we can
see the bug clearly caused a regression.

As we discussed in other thread, need_resched() should work in this
case, so those patches might be wrong fixing.

 I think we need some benchmarks.
 
 -- 
 Michael



-- 
Amos.


pgpBH8wzlVvPd.pgp
Description: PGP signature


Re: [PATCH v2 1/3] virtio-rng cleanup: move some code out of mutex protection

2014-09-15 Thread Amos Kong
On Mon, Sep 15, 2014 at 06:13:20PM +0200, Michael Büsch wrote:
 On Tue, 16 Sep 2014 00:02:27 +0800
 Amos Kong ak...@redhat.com wrote:
 
  It doesn't save too much cpu time as expected, just a cleanup.
  
  Signed-off-by: Amos Kong ak...@redhat.com
  ---
   drivers/char/hw_random/core.c | 6 +++---
   1 file changed, 3 insertions(+), 3 deletions(-)
  
  diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
  index aa30a25..c591d7e 100644
  --- a/drivers/char/hw_random/core.c
  +++ b/drivers/char/hw_random/core.c
  @@ -270,8 +270,8 @@ static ssize_t hwrng_attr_current_show(struct device 
  *dev,
  return -ERESTARTSYS;
  if (current_rng)
  name = current_rng-name;
  -   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
  mutex_unlock(rng_mutex);
  +   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
 
 I'm not sure this is safe.
 Name is just a pointer.
 What if the hwrng gets unregistered after unlock and just before the snprintf?

Oh, it points to protected current_rng-name, I will drop this
cleanup. Thanks.
 
  return ret;
   }
  @@ -284,19 +284,19 @@ static ssize_t hwrng_attr_available_show(struct 
  device *dev,
  ssize_t ret = 0;
  struct hwrng *rng;
   
  +   buf[0] = '\0';
  err = mutex_lock_interruptible(rng_mutex);
  if (err)
  return -ERESTARTSYS;
  -   buf[0] = '\0';
  list_for_each_entry(rng, rng_list, list) {
  strncat(buf, rng-name, PAGE_SIZE - ret - 1);
  ret += strlen(rng-name);
  strncat(buf,  , PAGE_SIZE - ret - 1);
  ret++;
  }
  +   mutex_unlock(rng_mutex);
  strncat(buf, \n, PAGE_SIZE - ret - 1);
  ret++;
  -   mutex_unlock(rng_mutex);
   
  return ret;
   }
 
 This looks ok.
 
 -- 
 Michael

-- 
Amos.


pgpFpxTwIJswL.pgp
Description: PGP signature


Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes

2014-09-15 Thread Amos Kong
CC linux-kernel

Original thread: 
http://comments.gmane.org/gmane.linux.kernel.virtualization/22775

On Mon, Sep 15, 2014 at 06:48:46PM +0200, Radim Krčmář wrote:
 2014-09-14 10:25+0800, Amos Kong:
  On Sun, Sep 14, 2014 at 09:12:08AM +0800, Amos Kong wrote:
  
  ...
  diff --git a/drivers/char/hw_random/core.c 
  b/drivers/char/hw_random/core.c
  index c591d7e..b5d1b6f 100644
  --- a/drivers/char/hw_random/core.c
  +++ b/drivers/char/hw_random/core.c
  @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, 
  char __user *buf,
   
  mutex_unlock(rng_mutex);
   
  -   if (need_resched())
  -   schedule_timeout_interruptible(1);
  +   schedule_timeout_interruptible(10);
 
 If cond_resched() does not work, it is a bug elsewehere.

Thanks for your reply, Jason also told me the TIF_NEED_RESCHED should
be set in this case, then need_resched() returns true.

I will investigate the issue and reply you later.
 
  Problem only occurred in non-smp guest, we can improve it to:
  
  if(!is_smp())
  schedule_timeout_interruptible(10);
  
  is_smp() is only available for arm arch, we need a general one.
 
 (It is num_online_cpus()  1.)

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes

2014-09-13 Thread Amos Kong
On Thu, Sep 11, 2014 at 09:08:03PM +0930, Rusty Russell wrote:
 Amos Kong ak...@redhat.com writes:
  When I check hwrng attributes in sysfs, cat process always gets
  stuck if guest has only 1 vcpu and uses a slow rng backend.
 
  Currently we check if there is any tasks waiting to be run on
  current cpu in rng_dev_read() by need_resched(). But need_resched()
  doesn't work because rng_dev_read() is executing in user context.
 
 I don't understand this explanation?  I'd expect the sysfs process to be
 woken by the mutex_unlock().

But actually sysfs process's not woken always, this is they the
process gets stuck.
 
 If we're really high priority (vs. the sysfs process) then I can see why
 we'd need schedule_timeout_interruptible() instead of just schedule(),
 and in that case, need_resched() would be false too.
 
 You could argue that's intended behaviour, but I can't see how it
 happens in the normal case anyway.
 
 What am I missing?
 
 Thanks,
 Rusty.
 
  This patch removed need_resched() and increase delay to 10 jiffies,
  then other tasks can have chance to execute protected code.
  Delaying 1 jiffy also works, but 10 jiffies is safer.
 
  Signed-off-by: Amos Kong ak...@redhat.com
  ---
   drivers/char/hw_random/core.c | 3 +--
   1 file changed, 1 insertion(+), 2 deletions(-)
 
  diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
  index c591d7e..b5d1b6f 100644
  --- a/drivers/char/hw_random/core.c
  +++ b/drivers/char/hw_random/core.c
  @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char 
  __user *buf,
   
  mutex_unlock(rng_mutex);
   
  -   if (need_resched())
  -   schedule_timeout_interruptible(1);
  +   schedule_timeout_interruptible(10);
   
  if (signal_pending(current)) {
  err = -ERESTARTSYS;
  -- 
  1.9.3

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes

2014-09-13 Thread Amos Kong
On Sun, Sep 14, 2014 at 01:12:58AM +0800, Amos Kong wrote:
 On Thu, Sep 11, 2014 at 09:08:03PM +0930, Rusty Russell wrote:
  Amos Kong ak...@redhat.com writes:
   When I check hwrng attributes in sysfs, cat process always gets
   stuck if guest has only 1 vcpu and uses a slow rng backend.
  
   Currently we check if there is any tasks waiting to be run on
   current cpu in rng_dev_read() by need_resched(). But need_resched()
   doesn't work because rng_dev_read() is executing in user context.
  
  I don't understand this explanation?  I'd expect the sysfs process to be
  woken by the mutex_unlock().
 
 But actually sysfs process's not woken always, this is they the
 process gets stuck.

%s/they/why/

Hi Rusty,


Reference:
http://www.linuxgrill.com/anonymous/fire/netfilter/kernel-hacking-HOWTO-2.html

read() syscall of /dev/hwrng will enter into kernel, the read operation is
rng_dev_read(), it's userspace context (not interrupt context).

Userspace context doesn't allow other user contexts run on that CPU,
unless the kernel code sleeps for some reason.


In this case, the need_resched() doesn't work.

My solution is removing need_resched() and use an appropriate delay by 
schedule_timeout_interruptible(10).

Thanks, Amos
  
  If we're really high priority (vs. the sysfs process) then I can see why
  we'd need schedule_timeout_interruptible() instead of just schedule(),
  and in that case, need_resched() would be false too.
  
  You could argue that's intended behaviour, but I can't see how it
  happens in the normal case anyway.
  
  What am I missing?

  Thanks,
  Rusty.
  
   This patch removed need_resched() and increase delay to 10 jiffies,
   then other tasks can have chance to execute protected code.
   Delaying 1 jiffy also works, but 10 jiffies is safer.
  
   Signed-off-by: Amos Kong ak...@redhat.com
   ---
drivers/char/hw_random/core.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
  
   diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
   index c591d7e..b5d1b6f 100644
   --- a/drivers/char/hw_random/core.c
   +++ b/drivers/char/hw_random/core.c
   @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char 
   __user *buf,

 mutex_unlock(rng_mutex);

   - if (need_resched())
   - schedule_timeout_interruptible(1);
   + schedule_timeout_interruptible(10);

 if (signal_pending(current)) {
 err = -ERESTARTSYS;
   -- 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes

2014-09-13 Thread Amos Kong
On Thu, Sep 11, 2014 at 11:38:38AM +0530, Amit Shah wrote:
 On (Wed) 10 Sep 2014 [17:07:07], Amos Kong wrote:
  When I check hwrng attributes in sysfs, cat process always gets
  stuck if guest has only 1 vcpu and uses a slow rng backend.
  
  Currently we check if there is any tasks waiting to be run on
  current cpu in rng_dev_read() by need_resched(). But need_resched()
  doesn't work because rng_dev_read() is executing in user context.
  
  This patch removed need_resched() and increase delay to 10 jiffies,
  then other tasks can have chance to execute protected code.
  Delaying 1 jiffy also works, but 10 jiffies is safer.

Hi Amit,
 
 I'd prefer two patches for this one: one to remove the need_resched()
 check, and the other to increase the timeout.

If Rusty agrees with this fix, I will respin to update the commitlog
with clear description and split the patches to 3.

Thanks for the review.
 
 Anyway,
 
 Reviewed-by: Amit Shah amit.s...@redhat.com

--
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes

2014-09-13 Thread Amos Kong
On Sun, Sep 14, 2014 at 09:12:08AM +0800, Amos Kong wrote:

...
diff --git a/drivers/char/hw_random/core.c 
b/drivers/char/hw_random/core.c
index c591d7e..b5d1b6f 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char 
__user *buf,
 
mutex_unlock(rng_mutex);
 
-   if (need_resched())
-   schedule_timeout_interruptible(1);
+   schedule_timeout_interruptible(10);

Problem only occurred in non-smp guest, we can improve it to:

if(!is_smp())
schedule_timeout_interruptible(10);

is_smp() is only available for arm arch, we need a general one.

 
if (signal_pending(current)) {
err = -ERESTARTSYS;
-- 

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 0/2] virtio-rng: fix hotunplug

2014-09-10 Thread Amos Kong
RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1121540

When we try to hot-unplugging a busy virtio-rng device, the device
can't be removed. And the reading process in guest gets stuck.

Those two patches fixed this issue by completing have_data completion
and preventing invalid reading.

Thanks for the help of Amit.

Cc: sta...@vger.kernel.org

V2: reset data_avail (Amit)
adjust unregister order
V3: split patch, update commitlog

Amos Kong (2):
  virtio-rng: fix stuck of hot-unplugging busy device
  virtio-rng: skip reading when we start to remove the device

 drivers/char/hw_random/virtio-rng.c | 7 +++
 1 file changed, 7 insertions(+)

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 1/2] virtio-rng: fix stuck of hot-unplugging busy device

2014-09-10 Thread Amos Kong
When we try to hot-remove a busy virtio-rng device from QEMU monitor,
the device can't be hot-removed. Because virtio-rng driver hangs at
wait_for_completion_killable().

This patch exits the waiting by completing have_data completion before
unregistering, resets data_avail to avoid the hwrng core use wrong
buffer bytes.

Signed-off-by: Amos Kong ak...@redhat.com
Cc: sta...@vger.kernel.org
---
 drivers/char/hw_random/virtio-rng.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/char/hw_random/virtio-rng.c 
b/drivers/char/hw_random/virtio-rng.c
index 2e3139e..849b228 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -137,6 +137,8 @@ static void remove_common(struct virtio_device *vdev)
 {
struct virtrng_info *vi = vdev-priv;
 
+   vi-data_avail = 0;
+   complete(vi-have_data);
vdev-config-reset(vdev);
vi-busy = false;
if (vi-hwrng_register_done)
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 2/2] virtio-rng: skip reading when we start to remove the device

2014-09-10 Thread Amos Kong
Before we really unregister the hwrng device, reading will get stuck if
the virtio device is reset. We should return error for reading when we
start to remove the device.

Signed-off-by: Amos Kong ak...@redhat.com
Cc: sta...@vger.kernel.org
---
 drivers/char/hw_random/virtio-rng.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/char/hw_random/virtio-rng.c 
b/drivers/char/hw_random/virtio-rng.c
index 849b228..132c9cc 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -36,6 +36,7 @@ struct virtrng_info {
int index;
bool busy;
bool hwrng_register_done;
+   bool hwrng_removed;
 };
 
 
@@ -68,6 +69,9 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t 
size, bool wait)
int ret;
struct virtrng_info *vi = (struct virtrng_info *)rng-priv;
 
+   if (vi-hwrng_removed)
+   return -ENODEV;
+
if (!vi-busy) {
vi-busy = true;
init_completion(vi-have_data);
@@ -137,6 +141,7 @@ static void remove_common(struct virtio_device *vdev)
 {
struct virtrng_info *vi = vdev-priv;
 
+   vi-hwrng_removed = true;
vi-data_avail = 0;
complete(vi-have_data);
vdev-config-reset(vdev);
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] virtio-rng: fix stuck of hot-unplugging busy device

2014-09-10 Thread Amos Kong
On Wed, Sep 10, 2014 at 11:04:54AM +0530, Amit Shah wrote:
 Hi Amos,
 
 On (Tue) 09 Sep 2014 [19:14:02], Amos Kong wrote:
  When we try to hot-remove a busy virtio-rng device from QEMU monitor,
  the device can't be hot-removed. Because virtio-rng driver hangs at
  wait_for_completion_killable().
  
  This patch exits the waiting by completing have_data completion before
  unregistering, resets data_avail to avoid the hwrng core use wrong
  buffer bytes. Before real unregistering we should return -ENODEV for
  reading.
  
  When we hot-unplug the device, dd process in guest will exit.
$ dd if=/dev/hwrng of=/dev/null
dd: error reading ‘/dev/hwrng’: No such device
  
  Signed-off-by: Amos Kong ak...@redhat.com
  Cc: sta...@vger.kernel.org
  ---
  V2: reset data_avail (Amit)
  adjust unregister order
 
 Thanks, this looks good.
 
 Can you please split into two parts, the complete() in one, and the
 hwrng_register_done stuff into another?

I just posted the V3, split to two patches, and updated the commitlog
to describe why we need to return ENODEV for reading.
 
 Thanks,
 
   Amit

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: RFC virtio-rng: fail to read sysfs of a busy device

2014-09-10 Thread Amos Kong
On Wed, Sep 10, 2014 at 11:22:12AM +0530, Amit Shah wrote:
 On (Tue) 09 Sep 2014 [23:23:07], Amos Kong wrote:
  (Resend to fix the subject)
  
  Hi Amit, Rusty
  
  RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062
  steps:
  - Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest
  - check sysfs files in the same time, 'cat /sys/class/misc/hw_random/rng_*'
  
  Result: cat process will get stuck, it will return if we kill dd process.
 
 How common is it going to be to have a long-running 'dd' process on
 /dev/hwrng?

Not a common usage, but we have this strict testing.
 
 Also, with the new khwrng thread, reading from /dev/hwrng isn't
 required -- just use /dev/random?

Yes.
 
 (This doesn't mean we shouldn't fix the issue here...)

Completely agree :-)
 
  We have some static variables (eg, current_rng, data_avail, etc) in 
  hw_random/core.c,
  they are protected by rng_mutex. I try to workaround this issue by 
  undelay(100)
  after mutex_unlock() in rng_dev_read(). This gives chance for 
  hwrng_attr_*_show()
  to get mutex.
  
  This patch also contains some cleanup, moving some code out of mutex
  protection.
  
  Do you have some suggestion? Thanks.
  
  
  diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
  index aa30a25..fa69020 100644
  --- a/drivers/char/hw_random/core.c
  +++ b/drivers/char/hw_random/core.c
  @@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char 
  __user *buf,
  }
   
  mutex_unlock(rng_mutex);
  +   udelay(100);
 
 We have a need_resched() right below.  Why doesn't that work?

need_resched() is giving chance for userspace to 
 
  if (need_resched())

It never success in my debugging.

If we remove this check and always call schedule_timeout_interruptible(1),
problem also disappears.

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index aa30a25..263a370 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp,
char __user *buf,
 
mutex_unlock(rng_mutex);
 
-   if (need_resched())
-   schedule_timeout_interruptible(1);
+   schedule_timeout_interruptible(1);
 
if (signal_pending(current)) {
err = -ERESTARTSYS;

  schedule_timeout_interruptible(1);
  @@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct device 
  *dev,
  int err;
  struct hwrng *rng;
 
 The following hunk doesn't work:
 
  +   err = -ENODEV;
  err = mutex_lock_interruptible(rng_mutex);
 
 err is being set to another value in the next line!
 
  if (err)
  return -ERESTARTSYS;
  -   err = -ENODEV;
 
 And all usage of err below now won't have -ENODEV but some other value.

Oops!
 
  list_for_each_entry(rng, rng_list, list) {
  if (strcmp(rng-name, buf) == 0) {
  if (rng == current_rng) {
  @@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device 
  *dev,
  return -ERESTARTSYS;
  if (current_rng)
  name = current_rng-name;
  -   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
  mutex_unlock(rng_mutex);
  +   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
 
 This looks OK...
 
   
  return ret;
   }
  @@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct 
  device *dev,
  ssize_t ret = 0;
  struct hwrng *rng;
   
  +   buf[0] = '\0';
  err = mutex_lock_interruptible(rng_mutex);
  if (err)
  return -ERESTARTSYS;
 
  -   buf[0] = '\0';
  list_for_each_entry(rng, rng_list, list) {
  strncat(buf, rng-name, PAGE_SIZE - ret - 1);
  ret += strlen(rng-name);
  strncat(buf,  , PAGE_SIZE - ret - 1);
  ret++;
  }
  +   mutex_unlock(rng_mutex);
  strncat(buf, \n, PAGE_SIZE - ret - 1);
  ret++;
  -   mutex_unlock(rng_mutex);
 
 But this isn't resulting in savings; the majority of the time is being
 spent in the for loop, and that writes to the buffer.

Right
 
 BTW I don't expect strcat'ing to the buf in each of these scenarios is
 a long operation, so this reworking doesn't strike to me as something
 we should pursue.
 
   Amit

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: RFC virtio-rng: fail to read sysfs of a busy device

2014-09-10 Thread Amos Kong
On Wed, Sep 10, 2014 at 02:49:38PM +0800, Amos Kong wrote:
 On Wed, Sep 10, 2014 at 11:22:12AM +0530, Amit Shah wrote:
  On (Tue) 09 Sep 2014 [23:23:07], Amos Kong wrote:
   (Resend to fix the subject)
   
   Hi Amit, Rusty
   
   RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062
   steps:
   - Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest
   - check sysfs files in the same time, 'cat 
   /sys/class/misc/hw_random/rng_*'
   
   Result: cat process will get stuck, it will return if we kill dd process.
  
  How common is it going to be to have a long-running 'dd' process on
  /dev/hwrng?
 
 Not a common usage, but we have this strict testing.

For -smp 1:
 It's easy to reproduce with slow backend (/dev/random). cat can return
 most of time with some delay if we use quick backend (/dev/urandom).

But for -smp 2:
 I didn't touch this problem even with slow backend.
  
  Also, with the new khwrng thread, reading from /dev/hwrng isn't
  required -- just use /dev/random?
 
 Yes.
  
  (This doesn't mean we shouldn't fix the issue here...)
 
 Completely agree :-)
  
   We have some static variables (eg, current_rng, data_avail, etc) in 
   hw_random/core.c,
   they are protected by rng_mutex. I try to workaround this issue by 
   undelay(100)
   after mutex_unlock() in rng_dev_read(). This gives chance for 
   hwrng_attr_*_show()
   to get mutex.
   
   This patch also contains some cleanup, moving some code out of mutex
   protection.
   
   Do you have some suggestion? Thanks.
   
   
   diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
   index aa30a25..fa69020 100644
   --- a/drivers/char/hw_random/core.c
   +++ b/drivers/char/hw_random/core.c
   @@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char 
   __user *buf,
 }

 mutex_unlock(rng_mutex);
   + udelay(100);
  
  We have a need_resched() right below.  Why doesn't that work?

[smp 1]
Why need_resched() always return zero? what's the original purpose of it ?

  
 if (need_resched())
 
 It never success in my debugging.
 
 If we remove this check and always call schedule_timeout_interruptible(1),
 problem also disappears.
 
 diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
 index aa30a25..263a370 100644
 --- a/drivers/char/hw_random/core.c
 +++ b/drivers/char/hw_random/core.c
 @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp,
 char __user *buf,
  
 mutex_unlock(rng_mutex);
  
 -   if (need_resched())
 -   schedule_timeout_interruptible(1);
 +   schedule_timeout_interruptible(1);
  
 if (signal_pending(current)) {
 err = -ERESTARTSYS;
 
 schedule_timeout_interruptible(1);
   @@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct 
   device *dev,
 int err;
 struct hwrng *rng;
  
  The following hunk doesn't work:
  
   + err = -ENODEV;
 err = mutex_lock_interruptible(rng_mutex);
  
  err is being set to another value in the next line!
  
 if (err)
 return -ERESTARTSYS;
   - err = -ENODEV;
  
  And all usage of err below now won't have -ENODEV but some other value.
 
 Oops!
  
 list_for_each_entry(rng, rng_list, list) {
 if (strcmp(rng-name, buf) == 0) {
 if (rng == current_rng) {
   @@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device 
   *dev,
 return -ERESTARTSYS;
 if (current_rng)
 name = current_rng-name;
   - ret = snprintf(buf, PAGE_SIZE, %s\n, name);
 mutex_unlock(rng_mutex);
   + ret = snprintf(buf, PAGE_SIZE, %s\n, name);
  
  This looks OK...
  

 return ret;
}
   @@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct 
   device *dev,
 ssize_t ret = 0;
 struct hwrng *rng;

   + buf[0] = '\0';
 err = mutex_lock_interruptible(rng_mutex);
 if (err)
 return -ERESTARTSYS;
  
   - buf[0] = '\0';
 list_for_each_entry(rng, rng_list, list) {
 strncat(buf, rng-name, PAGE_SIZE - ret - 1);
 ret += strlen(rng-name);
 strncat(buf,  , PAGE_SIZE - ret - 1);
 ret++;
 }
   + mutex_unlock(rng_mutex);
 strncat(buf, \n, PAGE_SIZE - ret - 1);
 ret++;
   - mutex_unlock(rng_mutex);
  
  But this isn't resulting in savings; the majority of the time is being
  spent in the for loop, and that writes to the buffer.
 
 Right
  
  BTW I don't expect strcat'ing to the buf in each of these scenarios is
  a long operation, so this reworking doesn't strike to me as something
  we should pursue.
  
  Amit
 
 -- 
   Amos.

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo

[PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes

2014-09-10 Thread Amos Kong
When I check hwrng attributes in sysfs, cat process always gets
stuck if guest has only 1 vcpu and uses a slow rng backend.

Currently we check if there is any tasks waiting to be run on
current cpu in rng_dev_read() by need_resched(). But need_resched()
doesn't work because rng_dev_read() is executing in user context.

This patch removed need_resched() and increase delay to 10 jiffies,
then other tasks can have chance to execute protected code.
Delaying 1 jiffy also works, but 10 jiffies is safer.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index c591d7e..b5d1b6f 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
 
mutex_unlock(rng_mutex);
 
-   if (need_resched())
-   schedule_timeout_interruptible(1);
+   schedule_timeout_interruptible(10);
 
if (signal_pending(current)) {
err = -ERESTARTSYS;
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] virtio-rng cleanup: move some code out of mutex protection

2014-09-10 Thread Amos Kong
It doesn't save too much cpu time as expected, just a cleanup.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/char/hw_random/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index aa30a25..c591d7e 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -270,8 +270,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev,
return -ERESTARTSYS;
if (current_rng)
name = current_rng-name;
-   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
mutex_unlock(rng_mutex);
+   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
 
return ret;
 }
@@ -284,19 +284,19 @@ static ssize_t hwrng_attr_available_show(struct device 
*dev,
ssize_t ret = 0;
struct hwrng *rng;
 
+   buf[0] = '\0';
err = mutex_lock_interruptible(rng_mutex);
if (err)
return -ERESTARTSYS;
-   buf[0] = '\0';
list_for_each_entry(rng, rng_list, list) {
strncat(buf, rng-name, PAGE_SIZE - ret - 1);
ret += strlen(rng-name);
strncat(buf,  , PAGE_SIZE - ret - 1);
ret++;
}
+   mutex_unlock(rng_mutex);
strncat(buf, \n, PAGE_SIZE - ret - 1);
ret++;
-   mutex_unlock(rng_mutex);
 
return ret;
 }
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] fix stuck in catting hwrng attributes

2014-09-10 Thread Amos Kong
If we read hwrng by long-running dd process, it takes too much cpu time.
When we check hwrng attributes from sysfs by cat, it gets stuck.
The problem can only be reproduced with non-smp guest with slow backend.

This patchset changed hwrng core to always delay 10 jiffies, cat process
have chance to execute protected code, the problem is resolved.

Thanks.

Amos Kong (2):
  virtio-rng cleanup: move some code out of mutex protection
  virtio-rng: fix stuck in catting hwrng attributes

 drivers/char/hw_random/core.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] virtio-rng: complete have_data completion in removing device

2014-09-09 Thread Amos Kong
On Mon, Sep 08, 2014 at 11:29:51PM +0800, Amos Kong wrote:
 On Wed, Aug 06, 2014 at 01:55:29PM +0530, Amit Shah wrote:
  On (Wed) 06 Aug 2014 [16:05:41], Amos Kong wrote:
   On Wed, Aug 06, 2014 at 01:35:15AM +0800, Amos Kong wrote:
When we try to hot-remove a busy virtio-rng device from QEMU monitor,
the device can't be hot-removed. Because virtio-rng driver hangs at
wait_for_completion_killable().

This patch fixed the hang by completing have_data completion before
unregistering a virtio-rng device.
   
   Hi Amit,
   
   Before applying this patch, it's blocking insider 
   wait_for_completion_killable() 
   Applied this patch, wait_for_completion_killable() returns 0,
   and vi-data_avail becomes 0, then rng_get_date() will return 0.
  
  Thanks for checking this.
  
   Is it expected result?
  

Hi Amit

  I think what will happen is vi-data_avail will be set to whatever it
  was set last.  In case of a previous successful read request, the
  data_avail will be set to whatever number of bytes the host gave.  On
  doing a hot-unplug on the succeeding wait, the value in data_avail
  will be re-used, and the hwrng core will wrongly take some bytes in
  the buffer as input from the host.
  
  So, I think we need to set vi-data_avail = 0; before calling
  wait_event_completion_killable().

I finally understand content, we need to set vi-data_avail to 0
before returning virtio_read(), it might enter wait_event_completion_killable()
when we try to remove the device.

We call complete() in remove_common(), then wait_event_completion_killable()
will exit, but virtio_read() will be re-entered if the device still
isn't unregistered, then re-stuck inside wait_event_completion_killable().

I tested some complex condition (both quick/slow backend), I found
some problem in below two patches.

I will post another fix later. The test result is expected.

1. Hotplug remove virtio-rng0, dd process will exit with an error:
dd: error reading ‘/dev/hwrng’: Operation not permitted
   virtio-rng0 disappear from 'info pci'
 
2. Re-read by dd, hotplug virtio-rng1, dd process exit with same
   error, virtio-rng1 disappear


Thanks, Amos

  
  Amit
 
 In my latest debugging, I found the hang is caused by unexpected reading
 when we started to remove the device.
 
 I have two draft fix, 1) is skip unexpected reading by checking a
 remove flag. 2) is unregistering device at the beginning of
 remove_common(). I think second patch is better if it won't cause
 new problem.
 
 The original patch (complete in remove_common()) is still necessary.
 
 Test results: hotplug issue disappeared (dd process will quit).
 
 
 diff --git a/drivers/char/hw_random/virtio-rng.c
 b/drivers/char/hw_random/virtio-rng.c
 index 2e3139e..028797c 100644
 --- a/drivers/char/hw_random/virtio-rng.c
 +++ b/drivers/char/hw_random/virtio-rng.c
 @@ -35,6 +35,7 @@ struct virtrng_info {
 unsigned int data_avail;
 int index;
 bool busy;
 +bool remove;
 bool hwrng_register_done;
  };
  
 @@ -68,6 +69,9 @@ static int virtio_read(struct hwrng *rng, void *buf,
 size_t size, bool wait)
 int ret;
 struct virtrng_info *vi = (struct virtrng_info *)rng-priv;
  
 +if (vi-remove)
 +   return 0;
 +
 if (!vi-busy) {
 vi-busy = true;
 init_completion(vi-have_data);
 @@ -137,6 +141,8 @@ static void remove_common(struct virtio_device
 *vdev)
  {
 struct virtrng_info *vi = vdev-priv;
  
 +   vi-remove = true;
 +   complete(vi-have_data);
 vdev-config-reset(vdev);
 vi-busy = false;
 if (vi-hwrng_register_done)
 
 
 diff --git a/drivers/char/hw_random/virtio-rng.c
 b/drivers/char/hw_random/virtio-rng.c
 index 2e3139e..9b8c2ce 100644
 --- a/drivers/char/hw_random/virtio-rng.c
 +++ b/drivers/char/hw_random/virtio-rng.c
 @@ -137,10 +137,11 @@ static void remove_common(struct virtio_device
 *vdev)
  {
 struct virtrng_info *vi = vdev-priv;
  
 -   vdev-config-reset(vdev);
 -   vi-busy = false;
 if (vi-hwrng_register_done)
 hwrng_unregister(vi-hwrng);
 +   complete(vi-have_data);
 +   vdev-config-reset(vdev);
 +   vi-busy = false;
 vdev-config-del_vqs(vdev);
 ida_simple_remove(rng_index_ida, vi-index);
 kfree(vi);
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] virtio-rng: fix stuck of hot-unplugging busy device

2014-09-09 Thread Amos Kong
When we try to hot-remove a busy virtio-rng device from QEMU monitor,
the device can't be hot-removed. Because virtio-rng driver hangs at
wait_for_completion_killable().

This patch exits the waiting by completing have_data completion before
unregistering, resets data_avail to avoid the hwrng core use wrong
buffer bytes. Before real unregistering we should return -ENODEV for
reading.

When we hot-unplug the device, dd process in guest will exit.
  $ dd if=/dev/hwrng of=/dev/null
  dd: error reading ‘/dev/hwrng’: No such device

Signed-off-by: Amos Kong ak...@redhat.com
Cc: sta...@vger.kernel.org
---
V2: reset data_avail (Amit)
adjust unregister order
---
 drivers/char/hw_random/virtio-rng.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/char/hw_random/virtio-rng.c 
b/drivers/char/hw_random/virtio-rng.c
index 2e3139e..e76433b 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -68,6 +68,10 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t 
size, bool wait)
int ret;
struct virtrng_info *vi = (struct virtrng_info *)rng-priv;
 
+   if (!vi-hwrng_register_done) {
+   return -ENODEV;
+   }
+
if (!vi-busy) {
vi-busy = true;
init_completion(vi-have_data);
@@ -137,10 +141,14 @@ static void remove_common(struct virtio_device *vdev)
 {
struct virtrng_info *vi = vdev-priv;
 
+   vi-data_avail = 0;
+   complete(vi-have_data);
vdev-config-reset(vdev);
vi-busy = false;
-   if (vi-hwrng_register_done)
+   if (vi-hwrng_register_done) {
+   vi-hwrng_register_done = false;
hwrng_unregister(vi-hwrng);
+   }
vdev-config-del_vqs(vdev);
ida_simple_remove(rng_index_ida, vi-index);
kfree(vi);
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


mutex

2014-09-09 Thread Amos Kong
Hi Amit, Rusty

RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062
steps:
- Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest
- check sysfs files in the same time, 'cat /sys/class/misc/hw_random/rng_*'

Result: cat process will get stuck, it will return if we kill dd process.

We have some static variables (eg, current_rng, data_avail, etc) in 
hw_random/core.c,
they are protected by rng_mutex. I try to workaround this issue by undelay(100)
after mutex_unlock() in rng_dev_read(). This gives chance for 
hwrng_attr_*_show()
to get mutex.

This patch also contains some cleanup, moving some code out of mutex
protection.

Do you have some suggestion? Thanks.


diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index aa30a25..fa69020 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
}
 
mutex_unlock(rng_mutex);
+   udelay(100);
 
if (need_resched())
schedule_timeout_interruptible(1);
@@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct device 
*dev,
int err;
struct hwrng *rng;
 
+   err = -ENODEV;
err = mutex_lock_interruptible(rng_mutex);
if (err)
return -ERESTARTSYS;
-   err = -ENODEV;
list_for_each_entry(rng, rng_list, list) {
if (strcmp(rng-name, buf) == 0) {
if (rng == current_rng) {
@@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev,
return -ERESTARTSYS;
if (current_rng)
name = current_rng-name;
-   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
mutex_unlock(rng_mutex);
+   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
 
return ret;
 }
@@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct device 
*dev,
ssize_t ret = 0;
struct hwrng *rng;
 
+   buf[0] = '\0';
err = mutex_lock_interruptible(rng_mutex);
if (err)
return -ERESTARTSYS;
-   buf[0] = '\0';
list_for_each_entry(rng, rng_list, list) {
strncat(buf, rng-name, PAGE_SIZE - ret - 1);
ret += strlen(rng-name);
strncat(buf,  , PAGE_SIZE - ret - 1);
ret++;
}
+   mutex_unlock(rng_mutex);
strncat(buf, \n, PAGE_SIZE - ret - 1);
ret++;
-   mutex_unlock(rng_mutex);
 
return ret;
 }
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RFC virtio-rng: fail to read sysfs of a busy device

2014-09-09 Thread Amos Kong
(Resend to fix the subject)

Hi Amit, Rusty

RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1127062
steps:
- Read random data by 'dd if=/dev/hwrng of=/dev/null' in guest
- check sysfs files in the same time, 'cat /sys/class/misc/hw_random/rng_*'

Result: cat process will get stuck, it will return if we kill dd process.

We have some static variables (eg, current_rng, data_avail, etc) in 
hw_random/core.c,
they are protected by rng_mutex. I try to workaround this issue by undelay(100)
after mutex_unlock() in rng_dev_read(). This gives chance for 
hwrng_attr_*_show()
to get mutex.

This patch also contains some cleanup, moving some code out of mutex
protection.

Do you have some suggestion? Thanks.


diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index aa30a25..fa69020 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -194,6 +194,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user 
*buf,
}
 
mutex_unlock(rng_mutex);
+   udelay(100);
 
if (need_resched())
schedule_timeout_interruptible(1);
@@ -233,10 +234,10 @@ static ssize_t hwrng_attr_current_store(struct device 
*dev,
int err;
struct hwrng *rng;
 
+   err = -ENODEV;
err = mutex_lock_interruptible(rng_mutex);
if (err)
return -ERESTARTSYS;
-   err = -ENODEV;
list_for_each_entry(rng, rng_list, list) {
if (strcmp(rng-name, buf) == 0) {
if (rng == current_rng) {
@@ -270,8 +271,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev,
return -ERESTARTSYS;
if (current_rng)
name = current_rng-name;
-   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
mutex_unlock(rng_mutex);
+   ret = snprintf(buf, PAGE_SIZE, %s\n, name);
 
return ret;
 }
@@ -284,19 +285,19 @@ static ssize_t hwrng_attr_available_show(struct device 
*dev,
ssize_t ret = 0;
struct hwrng *rng;
 
+   buf[0] = '\0';
err = mutex_lock_interruptible(rng_mutex);
if (err)
return -ERESTARTSYS;
-   buf[0] = '\0';
list_for_each_entry(rng, rng_list, list) {
strncat(buf, rng-name, PAGE_SIZE - ret - 1);
ret += strlen(rng-name);
strncat(buf,  , PAGE_SIZE - ret - 1);
ret++;
}
+   mutex_unlock(rng_mutex);
strncat(buf, \n, PAGE_SIZE - ret - 1);
ret++;
-   mutex_unlock(rng_mutex);
 
return ret;
 }

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] virtio-rng: complete have_data completion in removing device

2014-09-08 Thread Amos Kong
On Wed, Aug 06, 2014 at 01:55:29PM +0530, Amit Shah wrote:
 On (Wed) 06 Aug 2014 [16:05:41], Amos Kong wrote:
  On Wed, Aug 06, 2014 at 01:35:15AM +0800, Amos Kong wrote:
   When we try to hot-remove a busy virtio-rng device from QEMU monitor,
   the device can't be hot-removed. Because virtio-rng driver hangs at
   wait_for_completion_killable().
   
   This patch fixed the hang by completing have_data completion before
   unregistering a virtio-rng device.
  
  Hi Amit,
  
  Before applying this patch, it's blocking insider 
  wait_for_completion_killable() 
  Applied this patch, wait_for_completion_killable() returns 0,
  and vi-data_avail becomes 0, then rng_get_date() will return 0.
 
 Thanks for checking this.
 
  Is it expected result?
 
 I think what will happen is vi-data_avail will be set to whatever it
 was set last.  In case of a previous successful read request, the
 data_avail will be set to whatever number of bytes the host gave.  On
 doing a hot-unplug on the succeeding wait, the value in data_avail
 will be re-used, and the hwrng core will wrongly take some bytes in
 the buffer as input from the host.
 
 So, I think we need to set vi-data_avail = 0; before calling
 wait_event_completion_killable().
 
   Amit

In my latest debugging, I found the hang is caused by unexpected reading
when we started to remove the device.

I have two draft fix, 1) is skip unexpected reading by checking a
remove flag. 2) is unregistering device at the beginning of
remove_common(). I think second patch is better if it won't cause
new problem.

The original patch (complete in remove_common()) is still necessary.

Test results: hotplug issue disappeared (dd process will quit).


diff --git a/drivers/char/hw_random/virtio-rng.c
b/drivers/char/hw_random/virtio-rng.c
index 2e3139e..028797c 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -35,6 +35,7 @@ struct virtrng_info {
unsigned int data_avail;
int index;
bool busy;
+bool remove;
bool hwrng_register_done;
 };
 
@@ -68,6 +69,9 @@ static int virtio_read(struct hwrng *rng, void *buf,
size_t size, bool wait)
int ret;
struct virtrng_info *vi = (struct virtrng_info *)rng-priv;
 
+if (vi-remove)
+   return 0;
+
if (!vi-busy) {
vi-busy = true;
init_completion(vi-have_data);
@@ -137,6 +141,8 @@ static void remove_common(struct virtio_device
*vdev)
 {
struct virtrng_info *vi = vdev-priv;
 
+   vi-remove = true;
+   complete(vi-have_data);
vdev-config-reset(vdev);
vi-busy = false;
if (vi-hwrng_register_done)


diff --git a/drivers/char/hw_random/virtio-rng.c
b/drivers/char/hw_random/virtio-rng.c
index 2e3139e..9b8c2ce 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -137,10 +137,11 @@ static void remove_common(struct virtio_device
*vdev)
 {
struct virtrng_info *vi = vdev-priv;
 
-   vdev-config-reset(vdev);
-   vi-busy = false;
if (vi-hwrng_register_done)
hwrng_unregister(vi-hwrng);
+   complete(vi-have_data);
+   vdev-config-reset(vdev);
+   vi-busy = false;
vdev-config-del_vqs(vdev);
ida_simple_remove(rng_index_ida, vi-index);
kfree(vi);

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] virtio-pci: share config interrupt between virtio devices

2014-09-01 Thread Amos Kong
On Mon, Sep 01, 2014 at 09:37:30AM +0300, Michael S. Tsirkin wrote:


Hi Michael,

 On Mon, Sep 01, 2014 at 01:41:54PM +0800, Amos Kong wrote:
  One VM only has 128 msix interrupt, virtio-config interrupt
  has less workload. This patch shares one normal interrupt

Thanks for your quick reply.

 normal == INT#x? Please don't call it normal. The proper name is
 legacy INT#x.
 
OK

 So you are trying to use legacy INT#x at the same time
 with MSI-X?  This does not work: the PCI spec says:
   While enabled for MSI or MSI-X
   operation, a function is prohibited from using its INTx# pin (if
   implemented) to request
   service (MSI, MSI-X, and INTx# are mutually exclusive).

It means we can't use interrupt and triggered-interrupt together for
one PCI device. I will study this problem.
 
 does the patch work for you? If it does it might be a (minor) spec
 violation in kvm.

I did some basic testing (multiple nics, scp, ping, etc), it works.
 
 Besides, INT#x really leads to terrible performance because
 sharing is forced even if there aren't many devices.
 
 Why do we need INT#x?
 How about setting IRQF_SHARED for the config interrupt
 while using MSI-X?
 You'd have to read ISR to check that the interrupt was
 intended for your device.
 
I have a draft patch to share one MSI-X for all virtio-config, it has
some problem in hotplugging devices. I will continue this way.
 
  for configuration between virtio devices.
  
  Signed-off-by: Amos Kong ak...@redhat.com
  ---
   drivers/virtio/virtio_pci.c | 41 -
   1 file changed, 16 insertions(+), 25 deletions(-)
  
  diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
  index 3d1463c..b1263b3 100644
  --- a/drivers/virtio/virtio_pci.c
  +++ b/drivers/virtio/virtio_pci.c
  @@ -52,6 +52,7 @@ struct virtio_pci_device
  /* Name strings for interrupts. This size should be enough,
   * and I'm too lazy to allocate each name separately. */
  char (*msix_names)[256];
  +   char config_msix_name[256];
  /* Number of available vectors */
  unsigned msix_vectors;
  /* Vectors allocated, excluding per-vq vectors if any */
  @@ -282,12 +283,6 @@ static void vp_free_vectors(struct virtio_device *vdev)
  free_cpumask_var(vp_dev-msix_affinity_masks[i]);
   
  if (vp_dev-msix_enabled) {
  -   /* Disable the vector used for configuration */
  -   iowrite16(VIRTIO_MSI_NO_VECTOR,
  - vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
  -   /* Flush the write out to device */
  -   ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
  -
  pci_disable_msix(vp_dev-pci_dev);
  vp_dev-msix_enabled = 0;
  }
  @@ -339,24 +334,18 @@ static int vp_request_msix_vectors(struct 
  virtio_device *vdev, int nvectors,
  goto error;
  vp_dev-msix_enabled = 1;
   
  -   /* Set the vector used for configuration */
  -   v = vp_dev-msix_used_vectors;
  -   snprintf(vp_dev-msix_names[v], sizeof *vp_dev-msix_names,
  +   /* Set shared IRQ for configuration */
  +   snprintf(vp_dev-config_msix_name, sizeof(*vp_dev-msix_names),
   %s-config, name);
  -   err = request_irq(vp_dev-msix_entries[v].vector,
  - vp_config_changed, 0, vp_dev-msix_names[v],
  +   err = request_irq(vp_dev-pci_dev-irq,
  + vp_config_changed,
  + IRQF_SHARED,
  + vp_dev-config_msix_name,
vp_dev);
  -   if (err)
  -   goto error;
  -   ++vp_dev-msix_used_vectors;
  -
  -   iowrite16(v, vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
  -   /* Verify we had enough resources to assign the vector */
  -   v = ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
  -   if (v == VIRTIO_MSI_NO_VECTOR) {
  -   err = -EBUSY;
  +   if (!err)
  +   vp_dev-intx_enabled = 1;
  +   else
  goto error;
  -   }
   
  if (!per_vq_vectors) {
  /* Shared vector for all VQs */
  @@ -535,14 +524,16 @@ static int vp_try_to_find_vqs(struct virtio_device 
  *vdev, unsigned nvqs,
  goto error_request;
  } else {
  if (per_vq_vectors) {
  -   /* Best option: one for change interrupt, one per vq. */
  -   nvectors = 1;
  +   /* Best option: one normal interrupt for change,
  +  one msix per vq. */
  +   nvectors = 0;
  for (i = 0; i  nvqs; ++i)
  if (callbacks[i])
  ++nvectors;
  } else {
  -   /* Second best: one for change, shared for all vqs. */
  -   nvectors = 2;
  +   /* Second best: one normal interrupt for
  +  change, share one msix for all vqs. */
  +   nvectors = 1

[PATCH RFC] virtio-pci: share config interrupt between virtio devices

2014-08-31 Thread Amos Kong
One VM only has 128 msix interrupt, virtio-config interrupt
has less workload. This patch shares one normal interrupt
for configuration between virtio devices.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/virtio/virtio_pci.c | 41 -
 1 file changed, 16 insertions(+), 25 deletions(-)

diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 3d1463c..b1263b3 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -52,6 +52,7 @@ struct virtio_pci_device
/* Name strings for interrupts. This size should be enough,
 * and I'm too lazy to allocate each name separately. */
char (*msix_names)[256];
+   char config_msix_name[256];
/* Number of available vectors */
unsigned msix_vectors;
/* Vectors allocated, excluding per-vq vectors if any */
@@ -282,12 +283,6 @@ static void vp_free_vectors(struct virtio_device *vdev)
free_cpumask_var(vp_dev-msix_affinity_masks[i]);
 
if (vp_dev-msix_enabled) {
-   /* Disable the vector used for configuration */
-   iowrite16(VIRTIO_MSI_NO_VECTOR,
- vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
-   /* Flush the write out to device */
-   ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
-
pci_disable_msix(vp_dev-pci_dev);
vp_dev-msix_enabled = 0;
}
@@ -339,24 +334,18 @@ static int vp_request_msix_vectors(struct virtio_device 
*vdev, int nvectors,
goto error;
vp_dev-msix_enabled = 1;
 
-   /* Set the vector used for configuration */
-   v = vp_dev-msix_used_vectors;
-   snprintf(vp_dev-msix_names[v], sizeof *vp_dev-msix_names,
+   /* Set shared IRQ for configuration */
+   snprintf(vp_dev-config_msix_name, sizeof(*vp_dev-msix_names),
 %s-config, name);
-   err = request_irq(vp_dev-msix_entries[v].vector,
- vp_config_changed, 0, vp_dev-msix_names[v],
+   err = request_irq(vp_dev-pci_dev-irq,
+ vp_config_changed,
+ IRQF_SHARED,
+ vp_dev-config_msix_name,
  vp_dev);
-   if (err)
-   goto error;
-   ++vp_dev-msix_used_vectors;
-
-   iowrite16(v, vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
-   /* Verify we had enough resources to assign the vector */
-   v = ioread16(vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
-   if (v == VIRTIO_MSI_NO_VECTOR) {
-   err = -EBUSY;
+   if (!err)
+   vp_dev-intx_enabled = 1;
+   else
goto error;
-   }
 
if (!per_vq_vectors) {
/* Shared vector for all VQs */
@@ -535,14 +524,16 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, 
unsigned nvqs,
goto error_request;
} else {
if (per_vq_vectors) {
-   /* Best option: one for change interrupt, one per vq. */
-   nvectors = 1;
+   /* Best option: one normal interrupt for change,
+  one msix per vq. */
+   nvectors = 0;
for (i = 0; i  nvqs; ++i)
if (callbacks[i])
++nvectors;
} else {
-   /* Second best: one for change, shared for all vqs. */
-   nvectors = 2;
+   /* Second best: one normal interrupt for
+  change, share one msix for all vqs. */
+   nvectors = 1;
}
 
err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors);
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] virtio-rng: complete have_data completion in removing device

2014-08-06 Thread Amos Kong
On Wed, Aug 06, 2014 at 01:35:15AM +0800, Amos Kong wrote:
 When we try to hot-remove a busy virtio-rng device from QEMU monitor,
 the device can't be hot-removed. Because virtio-rng driver hangs at
 wait_for_completion_killable().
 
 This patch fixed the hang by completing have_data completion before
 unregistering a virtio-rng device.

Hi Amit,

Before applying this patch, it's blocking insider 
wait_for_completion_killable() 
Applied this patch, wait_for_completion_killable() returns 0,
and vi-data_avail becomes 0, then rng_get_date() will return 0.

Is it expected result?

 
 Signed-off-by: Amos Kong ak...@redhat.com
 Cc: sta...@vger.kernel.org
 ---
  drivers/char/hw_random/virtio-rng.c | 1 +
  1 file changed, 1 insertion(+)
 
 diff --git a/drivers/char/hw_random/virtio-rng.c 
 b/drivers/char/hw_random/virtio-rng.c
 index 0027137..416b15c 100644
 --- a/drivers/char/hw_random/virtio-rng.c
 +++ b/drivers/char/hw_random/virtio-rng.c
 @@ -137,6 +137,7 @@ static void remove_common(struct virtio_device *vdev)
   struct virtrng_info *vi = vdev-priv;
  
   vdev-config-reset(vdev);
 + complete(vi-have_data);
   vi-busy = false;
   if (vi-hwrng_register_done)
   hwrng_unregister(vi-hwrng);
 -- 
 1.9.3
 
 ___
 Virtualization mailing list
 virtualizat...@lists.linux-foundation.org
 https://lists.linuxfoundation.org/mailman/listinfo/virtualization

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: When I boot two virtio-rng devices, guest will hang

2014-08-05 Thread Amos Kong
3.16 (guest hangs with two rng devices)
3.16 + quick fix (can startup with two rng devices) (hotplug issue 1 + hotplug 
issue 2 exist)
lates torvalds/linux.git + amit 4 patches (can startup with two rng devices) 
(only hotplug issue 2 exists)

However, the 4 patches also fixed the hang issue, the hotplug issue was fixed a 
little.
The hotplug issue is effected by the backend, or maybe it's not a real issue, 
because
the rng device can be hot-removed after dd process is killed.


Hotplug issue 1:
  1. boot up guest with two rng device (rng0 uses /dev/urandom, rng1 uses 
/dev/random)
  2. read data by dd in guest
  3 (option 1). hot-remove rng0, then hot-remove rng1 - result: _only rng1_ 
can't be removed until dd process is killed
  3 (option 2). hot-remove rng1, then hot-remove rng0 - result: two devices 
can be removed successfully, dd process will exit automatically.

  If we use /dev/urandom for rng0 and rng1, _rng0  rng1_ can be removed, dd 
process will exit automatically.

Hotplug issue 2:
  If we use /dev/random for rng0 and rng1, _rng0  rng1_ can't be removed until 
dd process is killed.

Hotplug issue 3:
  If we use /dev/random for rng0 and rng1, _only rng1_ can't be removed until 
dd process is killed.


(The difference between /dev/random and /dev/urandom is the speed.)

Thanks, Amos
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] virtio-rng: complete have_data completion in removing device

2014-08-05 Thread Amos Kong
When we try to hot-remove a busy virtio-rng device from QEMU monitor,
the device can't be hot-removed. Because virtio-rng driver hangs at
wait_for_completion_killable().

This patch fixed the hang by completing have_data completion before
unregistering a virtio-rng device.

Signed-off-by: Amos Kong ak...@redhat.com
Cc: sta...@vger.kernel.org
---
 drivers/char/hw_random/virtio-rng.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/char/hw_random/virtio-rng.c 
b/drivers/char/hw_random/virtio-rng.c
index 0027137..416b15c 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -137,6 +137,7 @@ static void remove_common(struct virtio_device *vdev)
struct virtrng_info *vi = vdev-priv;
 
vdev-config-reset(vdev);
+   complete(vi-have_data);
vi-busy = false;
if (vi-hwrng_register_done)
hwrng_unregister(vi-hwrng);
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: When I boot two virtio-rng devices, guest will hang

2014-08-05 Thread Amos Kong
On Tue, Aug 05, 2014 at 06:28:54PM +0800, Amos Kong wrote:
 3.16 (guest hangs with two rng devices)
 3.16 + quick fix (can startup with two rng devices) (hotplug issue 1 + 
 hotplug issue 2 exist)
 lates torvalds/linux.git + amit 4 patches (can startup with two rng devices) 
 (only hotplug issue 2 exists)
 
 However, the 4 patches also fixed the hang issue, the hotplug issue was fixed 
 a little.
 The hotplug issue is effected by the backend, or maybe it's not a real issue, 
 because
 the rng device can be hot-removed after dd process is killed.
 
 
 Hotplug issue 1:
   1. boot up guest with two rng device (rng0 uses /dev/urandom, rng1 uses 
 /dev/random)
   2. read data by dd in guest
   3 (option 1). hot-remove rng0, then hot-remove rng1 - result: _only rng1_ 
 can't be removed until dd process is killed
   3 (option 2). hot-remove rng1, then hot-remove rng0 - result: two devices 
 can be removed successfully, dd process will exit automatically.
 
   If we use /dev/urandom for rng0 and rng1, _rng0  rng1_ can be removed, dd 
 process will exit automatically.
 
 Hotplug issue 2:
   If we use /dev/random for rng0 and rng1, _rng0  rng1_ can't be removed 
 until dd process is killed.
 
 Hotplug issue 3:
   If we use /dev/random for rng0 and rng1, _only rng1_ can't be removed until 
 dd process is killed.

Hi Amit,

I finally found the root problem and posted a fix to upstream:
  
http://lists.linuxfoundation.org/pipermail/virtualization/2014-August/027049.html

It can help to fix the hotplug issues on 3.16  latest kernel, so
stable kernel is CCed.

 (The difference between /dev/random and /dev/urandom is the speed.)
 
 Thanks, Amos

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


When I boot two virtio-rng devices, guest will hang

2014-07-28 Thread Amos Kong
QEMU commandline:

./x86_64-softmmu/qemu-system-x86_64 --enable-kvm   -m 2000 -drive 
file=/images/nolvm.qcow2 --kernel /home/devel/linux/arch/x86/boot/bzImage 
-append ro root=/dev/sda1 console=ttyS0,115200  -monitor 
unix:/tmp/m,nowait,server -device 
virtio-net-pci,netdev=h0,vectors=17,mq=on,id=n0 -netdev tap,id=h0,queues=8 
-device virtio-net-pci,netdev=h1,vectors=0,mq=on,id=n1 -netdev 
tap,id=h1,queues=8  -vnc :0 -mon chardev=qmp,mode=control,pretty=on -chardev 
socket,id=qmp,host=localhost,port=1234,server,nowait -serial stdio -object 
rng-random,filename=/dev/urandom,id=rng0   -device 
virtio-rng-pci,rng=rng0,id=h0 -object rng-random,filename=/dev/urandom,id=rng1  
 -device virtio-rng-pci,rng=rng1,id=h1

It works when I only add one virtio-rng device. Did you touch this
problem?

I'm using latest net-next/master (ac3d2e5a9ef2f4d8f57c50070c4883ecb7cec29f)

[0.00] Initializing cgroup subsys cpuset
[0.00] Initializing cgroup subsys cpu
[0.00] Initializing cgroup subsys cpuacct
[0.00] Linux version 3.16.0-rc6+ (amos@z) (gcc version 4.8.3 20140624 
(Red Hat 4.8.3-1) (GCC) ) #363 SMP Mon Jul 28 15:05:44 CST 2014
[0.00] Command line: ro root=/dev/sda1 console=ttyS0,115200
[0.00] e820: BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x-0x0009fbff] usable
[0.00] BIOS-e820: [mem 0x0009fc00-0x0009] reserved
[0.00] BIOS-e820: [mem 0x000f-0x000f] reserved
[0.00] BIOS-e820: [mem 0x0010-0x7cffdfff] usable
[0.00] BIOS-e820: [mem 0x7cffe000-0x7cff] reserved
[0.00] BIOS-e820: [mem 0xfeffc000-0xfeff] reserved
[0.00] BIOS-e820: [mem 0xfffc-0x] reserved
[0.00] NX (Execute Disable) protection: active
[0.00] SMBIOS 2.8 present.
[0.00] Hypervisor detected: KVM
[0.00] AGP: No AGP bridge found
[0.00] e820: last_pfn = 0x7cffe max_arch_pfn = 0x4
[0.00] PAT not supported by CPU.
[0.00] found SMP MP-table at [mem 0x000f0e90-0x000f0e9f] mapped at 
[880f0e90]
[0.00] init_memory_mapping: [mem 0x-0x000f]
[0.00] init_memory_mapping: [mem 0x7cc0-0x7cdf]
[0.00] init_memory_mapping: [mem 0x7c00-0x7cbf]
[0.00] init_memory_mapping: [mem 0x0010-0x7bff]
[0.00] init_memory_mapping: [mem 0x7ce0-0x7cffdfff]
[0.00] ACPI: Early table checksum verification disabled
[0.00] ACPI: RSDP 0x000F0C90 14 (v00 BOCHS )
[0.00] ACPI: RSDT 0x7CFFFEF0 34 (v01 BOCHS  BXPCRSDT 
0001 BXPC 0001)
[0.00] ACPI: FACP 0x7CFFF1D3 74 (v01 BOCHS  BXPCFACP 
0001 BXPC 0001)
[0.00] ACPI: DSDT 0x7CFFE040 001193 (v01 BOCHS  BXPCDSDT 
0001 BXPC 0001)
[0.00] ACPI: FACS 0x7CFFE000 40
[0.00] ACPI: SSDT 0x7CFFF247 000BF9 (v01 BOCHS  BXPCSSDT 
0001 BXPC 0001)
[0.00] ACPI: APIC 0x7CFFFE40 78 (v01 BOCHS  BXPCAPIC 
0001 BXPC 0001)
[0.00] ACPI: HPET 0x7CFFFEB8 38 (v01 BOCHS  BXPCHPET 
0001 BXPC 0001)
[0.00] No NUMA configuration found
[0.00] Faking a node at [mem 0x-0x7cffdfff]
[0.00] Initmem setup node 0 [mem 0x-0x7cffdfff]
[0.00]   NODE_DATA [mem 0x7cfea000-0x7cffdfff]
[0.00] kvm-clock: Using msrs 4b564d01 and 4b564d00
[0.00] kvm-clock: cpu 0, msr 0:7cfe8001, primary cpu clock
[0.00] Zone ranges:
[0.00]   DMA  [mem 0x1000-0x00ff]
[0.00]   DMA32[mem 0x0100-0x]
[0.00]   Normal   empty
[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009efff]
[0.00]   node   0: [mem 0x0010-0x7cffdfff]
[0.00] ACPI: PM-Timer IO Port: 0x608
[0.00] ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled)
[0.00] ACPI: LAPIC_NMI (acpi_id[0xff] dfl dfl lint[0x1])
[0.00] ACPI: IOAPIC (id[0x00] address[0xfec0] gsi_base[0])
[0.00] IOAPIC[0]: apic_id 0, version 17, address 0xfec0, GSI 0-23
[0.00] ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
[0.00] ACPI: INT_SRC_OVR (bus 0 bus_irq 5 global_irq 5 high level)
[0.00] ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level)
[0.00] ACPI: INT_SRC_OVR (bus 0 bus_irq 10 global_irq 10 high level)
[0.00] ACPI: INT_SRC_OVR (bus 0 bus_irq 11 global_irq 11 high level)
[0.00] Using ACPI (MADT) for SMP configuration information
[0.00] ACPI: HPET id: 0x8086a201 base: 0xfed0
[0.00] smpboot: Allowing 1 CPUs, 0 hotplug CPUs
[0.00] PM: Registered nosave memory: [mem 0x0009f000-0x0009]
[0.00] PM: 

Re: When I boot two virtio-rng devices, guest will hang

2014-07-28 Thread Amos Kong
On Mon, Jul 28, 2014 at 01:25:14PM +0530, Amit Shah wrote:
 On (Mon) 28 Jul 2014 [15:32:42], Amos Kong wrote:
  QEMU commandline:
  
  ./x86_64-softmmu/qemu-system-x86_64 --enable-kvm   -m 2000 -drive 
  file=/images/nolvm.qcow2 --kernel /home/devel/linux/arch/x86/boot/bzImage 
  -append ro root=/dev/sda1 console=ttyS0,115200  -monitor 
  unix:/tmp/m,nowait,server -device 
  virtio-net-pci,netdev=h0,vectors=17,mq=on,id=n0 -netdev tap,id=h0,queues=8 
  -device virtio-net-pci,netdev=h1,vectors=0,mq=on,id=n1 -netdev 
  tap,id=h1,queues=8  -vnc :0 -mon chardev=qmp,mode=control,pretty=on 
  -chardev socket,id=qmp,host=localhost,port=1234,server,nowait -serial stdio 
  -object rng-random,filename=/dev/urandom,id=rng0   -device 
  virtio-rng-pci,rng=rng0,id=h0 -object 
  rng-random,filename=/dev/urandom,id=rng1   -device 
  virtio-rng-pci,rng=rng1,id=h1
  
  It works when I only add one virtio-rng device. Did you touch this
  problem?
  
  I'm using latest net-next/master (ac3d2e5a9ef2f4d8f57c50070c4883ecb7cec29f)
 
Hi Amit,

 snip
  [0.223503] Non-volatile memory driver v1.3
  [1.172293] tsc: Refined TSC clocksource calibration: 2893.436 MHz
  qemu: terminating on signal 2   -- (I have to cancel QEMU 
  process by Ctrl + C)
 
 This looks similar to what I saw when driver asks for randomness from the host
 before probe is completed.
 
 Does the following patch help?

This patch was already inclued in latest net-next/master
patch commit: e052dbf554610e2104c5a7518c4d8374bed701bb

 While the driver is setup (DRIVER_OK is set), the vqs aren't marked
 usable before the probe for the other device finishes as well.  That's
 not a scenario I considered.  We can try to put this patch in 3.16,
 but it won't be applicable for 3.17, where this is handled the proper
 way.  Also, 3.15 isn't affected, since that doesn't have multi-device
 support.
 
 Also attaching a patch that enables traces in qemu so it's easier to
 see what's going on.

 diff --git a/drivers/char/hw_random/virtio-rng.c
 b/drivers/char/hw_random/virtio-rng.c
 index e9b15bc..124cac5 100644
 --- a/drivers/char/hw_random/virtio-rng.c
 +++ b/drivers/char/hw_random/virtio-rng.c
 @@ -34,12 +34,11 @@ struct virtrng_info {
 unsigned int data_avail;
 struct completion have_data;
 bool busy;
 +   bool probe_done;
 char name[25];
 int index;
  };
  
 -static bool probe_done;
 -
  static void random_recv_done(struct virtqueue *vq)
  {
 struct virtrng_info *vi = vq-vdev-priv;
 @@ -73,7 +72,7 @@ static int virtio_read(struct hwrng *rng, void *buf,
 -size_t size, bool wait)
  * Don't ask host for data till we're setup.  This call can
  * happen during hwrng_register(), after commit d9e7972619.
  */
 -   if (unlikely(!probe_done))
 +   if (unlikely(!vi-probe_done))
 return 0;
  
 if (!vi-busy) {
 @@ -146,7 +145,7 @@ static int probe_common(struct virtio_device
 *vdev)
 return err;
 }
  
 -   probe_done = true;
 +   vi-probe_done = true;
 return 0;
  }
  
 
   Amit

 From ec1aa555b67628beefa0ac6902baa2cc2e156f58 Mon Sep 17 00:00:00 2001
 Message-Id: 
 ec1aa555b67628beefa0ac6902baa2cc2e156f58.1406533638.git.amit.s...@redhat.com
 From: Amit Shah amit.s...@redhat.com
 Date: Mon, 21 Jul 2014 14:46:28 +0530
 Subject: [PATCH 1/1] virtio-rng: add some trace events
 
 Add some trace events to virtio-rng for easier debugging
 
 Signed-off-by: Amit Shah amit.s...@redhat.com
 ---
  hw/virtio/virtio-rng.c | 7 +++
  trace-events   | 5 +
  2 files changed, 12 insertions(+)
 
 diff --git a/hw/virtio/virtio-rng.c b/hw/virtio/virtio-rng.c
 index 7c5a675..4a6472b 100644
 --- a/hw/virtio/virtio-rng.c
 +++ b/hw/virtio/virtio-rng.c
 @@ -16,6 +16,7 @@
  #include hw/virtio/virtio-rng.h
  #include sysemu/rng.h
  #include qom/object_interfaces.h
 +#include trace.h
  
  static bool is_guest_ready(VirtIORNG *vrng)
  {
 @@ -24,6 +25,7 @@ static bool is_guest_ready(VirtIORNG *vrng)
   (vdev-status  VIRTIO_CONFIG_S_DRIVER_OK)) {
  return true;
  }
 +trace_virtio_rng_guest_not_ready(vrng);
  return false;
  }
  
 @@ -62,6 +64,7 @@ static void chr_read(void *opaque, const void *buf, size_t 
 size)
  offset += len;
  
  virtqueue_push(vrng-vq, elem, len);
 +trace_virtio_rng_pushed(vrng, len);
  }
  virtio_notify(vdev, vrng-vq);
  }
 @@ -81,7 +84,11 @@ static void virtio_rng_process(VirtIORNG *vrng)
  quota = MIN((uint64_t)vrng-quota_remaining, (uint64_t)UINT32_MAX);
  }
  size = get_request_size(vrng-vq, quota);
 +
 +trace_virtio_rng_request(vrng, size, quota);
 +
  size = MIN(vrng-quota_remaining, size);
 +
  if (size) {
  rng_backend_request_entropy(vrng-rng, size, chr_read, vrng);
  }
 diff --git a/trace-events b/trace-events
 index 11a17a8..99f39ac 100644
 --- a/trace-events
 +++ b/trace-events
 @@ -41,6

Re: When I boot two virtio-rng devices, guest will hang

2014-07-28 Thread Amos Kong
On Mon, Jul 28, 2014 at 02:42:13PM +0530, Amit Shah wrote:
 On (Mon) 28 Jul 2014 [16:49:20], Amos Kong wrote:
  On Mon, Jul 28, 2014 at 01:25:14PM +0530, Amit Shah wrote:
   On (Mon) 28 Jul 2014 [15:32:42], Amos Kong wrote:
QEMU commandline:

./x86_64-softmmu/qemu-system-x86_64 --enable-kvm   -m 2000 -drive 
file=/images/nolvm.qcow2 --kernel 
/home/devel/linux/arch/x86/boot/bzImage -append ro root=/dev/sda1 
console=ttyS0,115200  -monitor unix:/tmp/m,nowait,server -device 
virtio-net-pci,netdev=h0,vectors=17,mq=on,id=n0 -netdev 
tap,id=h0,queues=8 -device 
virtio-net-pci,netdev=h1,vectors=0,mq=on,id=n1 -netdev 
tap,id=h1,queues=8  -vnc :0 -mon chardev=qmp,mode=control,pretty=on 
-chardev socket,id=qmp,host=localhost,port=1234,server,nowait -serial 
stdio -object rng-random,filename=/dev/urandom,id=rng0   -device 
virtio-rng-pci,rng=rng0,id=h0 -object 
rng-random,filename=/dev/urandom,id=rng1   -device 
virtio-rng-pci,rng=rng1,id=h1

It works when I only add one virtio-rng device. Did you touch this
problem?

I'm using latest net-next/master 
(ac3d2e5a9ef2f4d8f57c50070c4883ecb7cec29f)
   
  Hi Amit,
  
   snip
[0.223503] Non-volatile memory driver v1.3
[1.172293] tsc: Refined TSC clocksource calibration: 2893.436 MHz
qemu: terminating on signal 2   -- (I have to cancel 
QEMU process by Ctrl + C)
   
   This looks similar to what I saw when driver asks for randomness from the 
   host
   before probe is completed.
   
   Does the following patch help?
  
  This patch was already inclued in latest net-next/master
  patch commit: e052dbf554610e2104c5a7518c4d8374bed701bb
 
 No, it's a different one, goes on top of the commit you referenced.
 
Thanks. This patch fixed the problem.

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: RFC: sharing config interrupt between virtio devices for saving MSI

2014-04-27 Thread Amos Kong
On Sat, Apr 19, 2014 at 12:08:15PM +0800, Amos Kong wrote:
 
 Hi all,
 
 I'm working on this item in Upstream Networking todolist:
 
 | - Sharing config interrupts
 |Support more devices by sharing a single msi vector
 |between multiple virtio devices.
 |(Applies to virtio-blk too).
 
 I have this solution here, and only have draft patches of Solution
 1  2, let's discuss if solution 3 is feasible.
 
 * We should not introduce perf regression in this change
 * little effect is acceptable because we are _sharing_ interrupt
 
 Solution 1:
 ==
 share one LSI interrupt for configuration change of all virtio devices.
 Draft patch: share_lsi_for_all_config.patch
 
 Solution 2:
 ==
 share one MSI interrupt for configuration change of all virtio devices.
 Draft patch: share_msix_for_all_config.patch
 
 Solution 3:
 ==
 dynamic adjust interrupt policy when device is added or removed
 Current:
 ---
 - try to allocate a MSI to device's config
 - try to allocate a MSI for each vq
 - share one MSI for all vqs of one device
 - share one LSI for config  vqs of one device
 
 additional dynamic policies:
 ---
 - if there is no enough MSI, adjust interrupt allocation for returning
   some MSI
- try to find a device that allocated one MSI for each vq, change
  it to share one MSI for saving the MSI
- try to share one MSI for config of all virtio_pci devices
- try to share one LSI for config of all devices
 
 - if device is removed, try to re-allocate freed MSI to existed
   devices, if they aren't in best status (one MSI for config, one
   MSI for each vq)
- if config of all devices is sharing one LSI, try to upgrade it to MSI
- if config of all devices is sharing one MSI, try to allocate one
  MSI for configuration change of each device
- try to find a device that is sharing one MSI for all vqs, try to
  allocate one MSI for each vq

Hi Michael, Alex

Any thoughts about this ? :)


Thanks, Amos


 diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
 index 101db3f..5ba348d 100644
 --- a/drivers/virtio/virtio_pci.c
 +++ b/drivers/virtio/virtio_pci.c
 @@ -302,6 +302,8 @@ static void vp_free_vectors(struct virtio_device *vdev)
   vp_dev-msix_affinity_masks = NULL;
  }
  
 +//static msix_entry *config_msix_entry;
 +static char config_msix_name[100];
  static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
  bool per_vq_vectors)
  {
 @@ -341,14 +343,13 @@ static int vp_request_msix_vectors(struct virtio_device 
 *vdev, int nvectors,
  
   /* Set the vector used for configuration */
   v = vp_dev-msix_used_vectors;
 - snprintf(vp_dev-msix_names[v], sizeof *vp_dev-msix_names,
 + snprintf(config_msix_name, sizeof *vp_dev-msix_names,
%s-config, name);
 - err = request_irq(vp_dev-msix_entries[v].vector,
 -   vp_config_changed, 0, vp_dev-msix_names[v],
 -   vp_dev);
 + err = request_irq(vp_dev-pci_dev-irq, vp_config_changed, IRQF_SHARED, 
 config_msix_name, vp_dev);
   if (err)
   goto error;
 - ++vp_dev-msix_used_vectors;
 +
 +//++vp_dev-msix_used_vectors;
  
   iowrite16(v, vp_dev-ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
   /* Verify we had enough resources to assign the vector */
 @@ -380,7 +381,7 @@ static int vp_request_intx(struct virtio_device *vdev)
  {
   int err;
   struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 -
 + printk(KERN_INFO share irq: %d\n, vp_dev-pci_dev-irq);
   err = request_irq(vp_dev-pci_dev-irq, vp_interrupt,
 IRQF_SHARED, dev_name(vdev-dev), vp_dev);
   if (!err)
 @@ -536,13 +537,13 @@ static int vp_try_to_find_vqs(struct virtio_device 
 *vdev, unsigned nvqs,
   } else {
   if (per_vq_vectors) {
   /* Best option: one for change interrupt, one per vq. */
 - nvectors = 1;
 + nvectors = 0;
   for (i = 0; i  nvqs; ++i)
   if (callbacks[i])
   ++nvectors;
   } else {
   /* Second best: one for change, shared for all vqs. */
 - nvectors = 2;
 + nvectors = 1;
   }
  
   err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors);

 diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
 index 101db3f..5ae1844 100644
 --- a/drivers/virtio/virtio_pci.c
 +++ b/drivers/virtio/virtio_pci.c
 @@ -62,6 +62,8 @@ struct virtio_pci_device
  
   /* Whether we have vector per vq */
   bool per_vq_vectors;
 +
 + char config_msix_name[256];
  };
  
  /* Constants for MSI-X */
 @@ -302,6 +304,8 @@ static void vp_free_vectors(struct virtio_device *vdev)
   vp_dev-msix_affinity_masks = NULL;
  }
  
 +static

Re: RFC: sharing config interrupt between virtio devices for saving MSI

2014-04-27 Thread Amos Kong
On Sun, Apr 27, 2014 at 06:03:04PM +0300, Michael S. Tsirkin wrote:
 On Sun, Apr 27, 2014 at 10:35:41PM +0800, Amos Kong wrote:
  On Sat, Apr 19, 2014 at 12:08:15PM +0800, Amos Kong wrote:
   
   Hi all,
   
   I'm working on this item in Upstream Networking todolist:
   
   | - Sharing config interrupts
   |Support more devices by sharing a single msi vector
   |between multiple virtio devices.
   |(Applies to virtio-blk too).
   
   I have this solution here, and only have draft patches of Solution
   1  2, let's discuss if solution 3 is feasible.
   
   * We should not introduce perf regression in this change
   * little effect is acceptable because we are _sharing_ interrupt
   
   Solution 1:
   ==
   share one LSI interrupt for configuration change of all virtio devices.
   Draft patch: share_lsi_for_all_config.patch
   
   Solution 2:
   ==
   share one MSI interrupt for configuration change of all virtio devices.
   Draft patch: share_msix_for_all_config.patch
   
   Solution 3:
   ==
   dynamic adjust interrupt policy when device is added or removed
   Current:
   ---
   - try to allocate a MSI to device's config
   - try to allocate a MSI for each vq
   - share one MSI for all vqs of one device
   - share one LSI for config  vqs of one device
   
   additional dynamic policies:
   ---
   - if there is no enough MSI, adjust interrupt allocation for returning
 some MSI
  - try to find a device that allocated one MSI for each vq, change
it to share one MSI for saving the MSI
  - try to share one MSI for config of all virtio_pci devices
  - try to share one LSI for config of all devices
   
   - if device is removed, try to re-allocate freed MSI to existed
 devices, if they aren't in best status (one MSI for config, one
 MSI for each vq)
  - if config of all devices is sharing one LSI, try to upgrade it to MSI
  - if config of all devices is sharing one MSI, try to allocate one
MSI for configuration change of each device
  - try to find a device that is sharing one MSI for all vqs, try to
allocate one MSI for each vq
  
  Hi Michael, Alex
  
  Any thoughts about this ? :)
  
  
  Thanks, Amos
 
 
 I don't think we need to worry about dynamic.
 
Dynamic policy can always get good balance. If we use a fixed policy,
devices might share IRQ with some unused msix, or sometimes it's not
fair enough for all devices.

 Questions: just by using IRQF_SHARED, do we always end
 up with a shared interrupt? Or only if system is running
 out of vectors?

In solution1: always share one LSI for config of all virtio devices
In solution2: always share one MSI for config of all virtio devices

 Did you test that interrupt is actually shared and all
 handlers are called?

Yes. I can find many devices (virtio.0, virtio.1,...) share same
interrupt in guest's /proc/interrupts. And nics work well.

 We don't stop after the 1st handler that
 returned OK?

When we set IRQF_SHARED flag, dev_id is necessary. In
irq_wake_thread() it's used to identify the device for which
the irq_thread should be woke.
So when we share one IRQ, one single interrupt will only wake one
handler once.

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: add detail error message when fail to add ioeventfd

2013-05-24 Thread Amos Kong
On Thu, May 23, 2013 at 09:46:07AM +0200, Stefan Hajnoczi wrote:
 On Wed, May 22, 2013 at 09:48:21PM +0800, Amos Kong wrote:
  On Wed, May 22, 2013 at 11:32:27AM +0200, Stefan Hajnoczi wrote:
   On Wed, May 22, 2013 at 12:57:35PM +0800, Amos Kong wrote:
I try to hotplug 28 * 8 multiple-function devices to guest with
old host kernel, ioeventfds in host kernel will be exhausted, then
qemu fails to allocate ioeventfds for blk/nic devices.

It's better to add detail error here.

Signed-off-by: Amos Kong ak...@redhat.com
---
 kvm-all.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)
   
   It would be nice to make kvm bus scalable so that the hardcoded
   in-kernel I/O device limit can be lifted.
  
  I had increased kernel NR_IOBUS_DEVS to 1000 (a limitation is needed for
  security) in last Mar, and make resizing kvm_io_range array dynamical.
 
 The maximum should not be hardcoded.  File descriptor, maximum memory,
 etc are all controlled by rlimits.  And since ioeventfds are file
 descriptors they are already limited by the maximum number of file
 descriptors.

For implement the dynamically resize the kvm_io_range array,
I re-allocate new array (with new size) and free old array
when the array flexes. The array is only resized when
add/remove ioeventfds. It will not effect the perf.
 
 Why is there a need to impose a hardcoded limit?

I will send a patch to fix it.
 
 Stefan

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm: exclude ioeventfd from counting kvm_io_range limit

2013-05-24 Thread Amos Kong
We can easily reach the 1000 limit by start VM with a couple
hundred I/O devices (multifunction=on). The hardcode limit
already been adjusted 3 times (6 ~ 200 ~ 300 ~ 1000).

In userspace, we already have maximum file descriptor to
limit ioeventfd count. But kvm_io_bus devices also are used
for pit, pic, ioapic, coalesced_mmio. They couldn't be limited
by maximum file descriptor.

Currently only ioeventfds take too much kvm_io_bus devices,
so just exclude it from counting kvm_io_range limit.

Also fixed one indent issue in kvm_host.h

Signed-off-by: Amos Kong ak...@redhat.com
---
 include/linux/kvm_host.h | 3 ++-
 virt/kvm/eventfd.c   | 2 ++
 virt/kvm/kvm_main.c  | 3 ++-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f0eea07..ef261ab 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -144,7 +144,8 @@ struct kvm_io_range {
 #define NR_IOBUS_DEVS 1000
 
 struct kvm_io_bus {
-   int   dev_count;
+   int dev_count;
+   int ioeventfd_count;
struct kvm_io_range range[];
 };
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 64ee720..1550637 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -753,6 +753,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd 
*args)
if (ret  0)
goto unlock_fail;
 
+   kvm-buses[bus_idx]-ioeventfd_count++;
list_add_tail(p-list, kvm-ioeventfds);
 
mutex_unlock(kvm-slots_lock);
@@ -798,6 +799,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct 
kvm_ioeventfd *args)
continue;
 
kvm_io_bus_unregister_dev(kvm, bus_idx, p-dev);
+   kvm-buses[bus_idx]-ioeventfd_count--;
ioeventfd_release(p);
ret = 0;
break;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 302681c..c6d9baf 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2926,7 +2926,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus 
bus_idx, gpa_t addr,
struct kvm_io_bus *new_bus, *bus;
 
bus = kvm-buses[bus_idx];
-   if (bus-dev_count  NR_IOBUS_DEVS - 1)
+   /* exclude ioeventfd which is limited by maximum fd */
+   if (bus-dev_count - bus-ioeventfd_count  NR_IOBUS_DEVS - 1)
return -ENOSPC;
 
new_bus = kzalloc(sizeof(*bus) + ((bus-dev_count + 1) *
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: add detail error message when fail to add ioeventfd

2013-05-22 Thread Amos Kong
On Wed, May 22, 2013 at 11:32:27AM +0200, Stefan Hajnoczi wrote:
 On Wed, May 22, 2013 at 12:57:35PM +0800, Amos Kong wrote:
  I try to hotplug 28 * 8 multiple-function devices to guest with
  old host kernel, ioeventfds in host kernel will be exhausted, then
  qemu fails to allocate ioeventfds for blk/nic devices.
  
  It's better to add detail error here.
  
  Signed-off-by: Amos Kong ak...@redhat.com
  ---
   kvm-all.c |4 
   1 files changed, 4 insertions(+), 0 deletions(-)
 
 It would be nice to make kvm bus scalable so that the hardcoded
 in-kernel I/O device limit can be lifted.

I had increased kernel NR_IOBUS_DEVS to 1000 (a limitation is needed for
security) in last Mar, and make resizing kvm_io_range array dynamical.

 Reviewed-by: Stefan Hajnoczi stefa...@redhat.com

-- 
Amos.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm: add detail error message when fail to add ioeventfd

2013-05-21 Thread Amos Kong
I try to hotplug 28 * 8 multiple-function devices to guest with
old host kernel, ioeventfds in host kernel will be exhausted, then
qemu fails to allocate ioeventfds for blk/nic devices.

It's better to add detail error here.

Signed-off-by: Amos Kong ak...@redhat.com
---
 kvm-all.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/kvm-all.c b/kvm-all.c
index 8222729..3d5f7b7 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -813,6 +813,8 @@ static void kvm_mem_ioeventfd_add(MemoryListener *listener,
 r = kvm_set_ioeventfd_mmio(fd, section-offset_within_address_space,
data, true, section-size, match_data);
 if (r  0) {
+fprintf(stderr, %s: error adding ioeventfd: %s\n,
+__func__, strerror(-r));
 abort();
 }
 }
@@ -843,6 +845,8 @@ static void kvm_io_ioeventfd_add(MemoryListener *listener,
 r = kvm_set_ioeventfd_pio(fd, section-offset_within_address_space,
   data, true, section-size, match_data);
 if (r  0) {
+fprintf(stderr, %s: error adding ioeventfd: %s\n,
+__func__, strerror(-r));
 abort();
 }
 }
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [QEMU PATCH v4 2/3] virtio-net: introduce a new macaddr control

2013-01-22 Thread Amos Kong
On Mon, Jan 21, 2013 at 05:08:26PM +0100, Stefan Hajnoczi wrote:
 On Sat, Jan 19, 2013 at 09:54:27AM +0800, ak...@redhat.com wrote:
  @@ -350,6 +351,18 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t 
  cmd,
   struct virtio_net_ctrl_mac mac_data;
   size_t s;
   
  +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) {
  +if (iov_size(iov, iov_cnt) != ETH_ALEN) {
  +return VIRTIO_NET_ERR;
  +}
  +s = iov_to_buf(iov, iov_cnt, 0, n-mac, sizeof(n-mac));
  +if (s != sizeof(n-mac)) {
  +return VIRTIO_NET_ERR;
  +}

 
 Since iov_size() was checked before iov_to_buf(), we never hit this
 error.  And if we did n-mac would be trashed (i.e. error handling is
 not complete).

You are right.
iov_size() computes the size by accounting iov[].iov_lens, the first
check is enough.
 
 I think assert(s == sizeof(n-mac)) is more appropriate appropriate.
 Also, please change ETH_ALEN to sizeof(n-mac) to make the relationship
 between the check and the copy clear.
 

Will update this patch.

 if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) {
 if (iov_size(iov, iov_cnt) != sizeof(n-mac)) {
 return VIRTIO_NET_ERR;
 }
 s = iov_to_buf(iov, iov_cnt, 0, n-mac, sizeof(n-mac));
 assert(s == sizeof(n-mac));
 qemu_format_nic_info_str(n-nic-nc, n-mac);
 return VIRTIO_NET_OK;
 }

 Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [QEMU PATCH v4 1/3] virtio-net: remove layout assumptions for ctrl vq

2013-01-22 Thread Amos Kong
On Mon, Jan 21, 2013 at 05:03:30PM +0100, Stefan Hajnoczi wrote:
 On Sat, Jan 19, 2013 at 09:54:26AM +0800, ak...@redhat.com wrote:
  From: Michael S. Tsirkin m...@redhat.com
  
  Virtio-net code makes assumption about virtqueue descriptor layout
  (e.g. sg[0] is the header, sg[1] is the data buffer).
  
  This patch makes code not rely on the layout of descriptors.
  
  Signed-off-by: Michael S. Tsirkin m...@redhat.com
  Signed-off-by: Amos Kong ak...@redhat.com
  ---
   hw/virtio-net.c | 128 
  
   1 file changed, 74 insertions(+), 54 deletions(-)
  
  diff --git a/hw/virtio-net.c b/hw/virtio-net.c
  index 3bb01b1..113e194 100644
  --- a/hw/virtio-net.c
  +++ b/hw/virtio-net.c
  @@ -315,44 +315,44 @@ static void virtio_net_set_features(VirtIODevice 
  *vdev, uint32_t features)
   }
   
   static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
  - VirtQueueElement *elem)
  + struct iovec *iov, unsigned int 
  iov_cnt)
   {
   uint8_t on;
  +size_t s;
   
  -if (elem-out_num != 2 || elem-out_sg[1].iov_len != sizeof(on)) {
  -error_report(virtio-net ctrl invalid rx mode command);
  -exit(1);
  +s = iov_to_buf(iov, iov_cnt, 0, on, sizeof(on));
  +if (s != sizeof(on)) {
  +return VIRTIO_NET_ERR;
   }
   
  -on = ldub_p(elem-out_sg[1].iov_base);
  -
  -if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC)
  +if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) {
   n-promisc = on;
  -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI)
  +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) {
   n-allmulti = on;
  -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI)
  +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) {
   n-alluni = on;
  -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI)
  +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) {
   n-nomulti = on;
  -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI)
  +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) {
   n-nouni = on;
  -else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST)
  +} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) {
   n-nobcast = on;
  -else
  +} else {
   return VIRTIO_NET_ERR;
  +}
   
   return VIRTIO_NET_OK;
   }
   
   static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
  - VirtQueueElement *elem)
  + struct iovec *iov, unsigned int iov_cnt)
   {
   struct virtio_net_ctrl_mac mac_data;
  +size_t s;
   
  -if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET || elem-out_num != 3 ||
  -elem-out_sg[1].iov_len  sizeof(mac_data) ||
  -elem-out_sg[2].iov_len  sizeof(mac_data))
  +if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) {
   return VIRTIO_NET_ERR;
  +}
   
   n-mac_table.in_use = 0;
   n-mac_table.first_multi = 0;
  @@ -360,54 +360,71 @@ static int virtio_net_handle_mac(VirtIONet *n, 
  uint8_t cmd,
   n-mac_table.multi_overflow = 0;
   memset(n-mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN);
   
  -mac_data.entries = ldl_p(elem-out_sg[1].iov_base);
  +s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries,
  +   sizeof(mac_data.entries));

Hi Stefan, can we adjust the endianness after each iov_to_buf() copy?


diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index 72d7857..0088d6c 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -321,6 +321,7 @@ static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t 
cmd,
 size_t s;
 
 s = iov_to_buf(iov, iov_cnt, 0, on, sizeof(on));
+on = ldub_p(on);
 if (s != sizeof(on)) {
 return VIRTIO_NET_ERR;
 }
@@ -362,7 +363,7 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
 
 s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries,
sizeof(mac_data.entries));
-
+mac_data.entries = ldl_p(mac_data.entries);
 if (s != sizeof(mac_data.entries)) {
 return VIRTIO_NET_ERR;
 }
@@ -389,7 +390,7 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
 
 s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries,
sizeof(mac_data.entries));
-
+mac_data.entries = ldl_p(mac_data.entries);
 if (s != sizeof(mac_data.entries)) {
 return VIRTIO_NET_ERR;
 }
@@ -421,6 +422,7 @@ static int virtio_net_handle_vlan_table(VirtIONet *n, 
uint8_t cmd,
 size_t s;
 
 s = iov_to_buf(iov, iov_cnt, 0, vid, sizeof(vid));
+vid = lduw_p(vid);
 if (s != sizeof(vid)) {
 return VIRTIO_NET_ERR;
 }
@@ -458,6 +460,8 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, 
VirtQueue *vq)
 iov = elem.out_sg;
 iov_cnt = elem.out_num;
 s = iov_to_buf(iov, iov_cnt, 0, ctrl, sizeof(ctrl));
+ctrl.class = ldub_p(ctrl.class);
+ctrl.cmd = ldub_p

[QEMU PATCH v5 3/3] virtio-net: rename ctrl rx commands

2013-01-22 Thread Amos Kong
This patch makes rx commands consistent with specification.

Signed-off-by: Amos Kong ak...@redhat.com
---
 hw/virtio-net.c |   14 +++---
 hw/virtio-net.h |   14 +++---
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index acef5a5..ac4434e 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -326,17 +326,17 @@ static int virtio_net_handle_rx_mode(VirtIONet *n, 
uint8_t cmd,
 return VIRTIO_NET_ERR;
 }
 
-if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) {
+if (cmd == VIRTIO_NET_CTRL_RX_PROMISC) {
 n-promisc = on;
-} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) {
+} else if (cmd == VIRTIO_NET_CTRL_RX_ALLMULTI) {
 n-allmulti = on;
-} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) {
+} else if (cmd == VIRTIO_NET_CTRL_RX_ALLUNI) {
 n-alluni = on;
-} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) {
+} else if (cmd == VIRTIO_NET_CTRL_RX_NOMULTI) {
 n-nomulti = on;
-} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) {
+} else if (cmd == VIRTIO_NET_CTRL_RX_NOUNI) {
 n-nouni = on;
-} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) {
+} else if (cmd == VIRTIO_NET_CTRL_RX_NOBCAST) {
 n-nobcast = on;
 } else {
 return VIRTIO_NET_ERR;
@@ -473,7 +473,7 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, 
VirtQueue *vq)
 iov_discard_front(iov, iov_cnt, sizeof(ctrl));
 if (s != sizeof(ctrl)) {
 status = VIRTIO_NET_ERR;
-} else if (ctrl.class == VIRTIO_NET_CTRL_RX_MODE) {
+} else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
 status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, iov_cnt);
 } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
 status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt);
diff --git a/hw/virtio-net.h b/hw/virtio-net.h
index 1ec632f..c0bb284 100644
--- a/hw/virtio-net.h
+++ b/hw/virtio-net.h
@@ -99,13 +99,13 @@ typedef uint8_t virtio_net_ctrl_ack;
  * 0 and 1 are supported with the VIRTIO_NET_F_CTRL_RX feature.
  * Commands 2-5 are added with VIRTIO_NET_F_CTRL_RX_EXTRA.
  */
-#define VIRTIO_NET_CTRL_RX_MODE0
- #define VIRTIO_NET_CTRL_RX_MODE_PROMISC  0
- #define VIRTIO_NET_CTRL_RX_MODE_ALLMULTI 1
- #define VIRTIO_NET_CTRL_RX_MODE_ALLUNI   2
- #define VIRTIO_NET_CTRL_RX_MODE_NOMULTI  3
- #define VIRTIO_NET_CTRL_RX_MODE_NOUNI4
- #define VIRTIO_NET_CTRL_RX_MODE_NOBCAST  5
+#define VIRTIO_NET_CTRL_RX0
+ #define VIRTIO_NET_CTRL_RX_PROMISC  0
+ #define VIRTIO_NET_CTRL_RX_ALLMULTI 1
+ #define VIRTIO_NET_CTRL_RX_ALLUNI   2
+ #define VIRTIO_NET_CTRL_RX_NOMULTI  3
+ #define VIRTIO_NET_CTRL_RX_NOUNI4
+ #define VIRTIO_NET_CTRL_RX_NOBCAST  5
 
 /*
  * Control the MAC
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[QEMU PATCH v5 1/3] virtio-net: remove layout assumptions for ctrl vq

2013-01-22 Thread Amos Kong
From: Michael S. Tsirkin m...@redhat.com

Virtio-net code makes assumption about virtqueue descriptor layout
(e.g. sg[0] is the header, sg[1] is the data buffer).

This patch makes code not rely on the layout of descriptors.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
Signed-off-by: Amos Kong ak...@redhat.com
---
 hw/virtio-net.c |  129 ---
 1 files changed, 75 insertions(+), 54 deletions(-)

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index 3bb01b1..af1f3a1 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -315,44 +315,44 @@ static void virtio_net_set_features(VirtIODevice *vdev, 
uint32_t features)
 }
 
 static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
- VirtQueueElement *elem)
+ struct iovec *iov, unsigned int iov_cnt)
 {
 uint8_t on;
+size_t s;
 
-if (elem-out_num != 2 || elem-out_sg[1].iov_len != sizeof(on)) {
-error_report(virtio-net ctrl invalid rx mode command);
-exit(1);
+s = iov_to_buf(iov, iov_cnt, 0, on, sizeof(on));
+if (s != sizeof(on)) {
+return VIRTIO_NET_ERR;
 }
 
-on = ldub_p(elem-out_sg[1].iov_base);
-
-if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC)
+if (cmd == VIRTIO_NET_CTRL_RX_MODE_PROMISC) {
 n-promisc = on;
-else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI)
+} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLMULTI) {
 n-allmulti = on;
-else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI)
+} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_ALLUNI) {
 n-alluni = on;
-else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI)
+} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOMULTI) {
 n-nomulti = on;
-else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI)
+} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOUNI) {
 n-nouni = on;
-else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST)
+} else if (cmd == VIRTIO_NET_CTRL_RX_MODE_NOBCAST) {
 n-nobcast = on;
-else
+} else {
 return VIRTIO_NET_ERR;
+}
 
 return VIRTIO_NET_OK;
 }
 
 static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
- VirtQueueElement *elem)
+ struct iovec *iov, unsigned int iov_cnt)
 {
 struct virtio_net_ctrl_mac mac_data;
+size_t s;
 
-if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET || elem-out_num != 3 ||
-elem-out_sg[1].iov_len  sizeof(mac_data) ||
-elem-out_sg[2].iov_len  sizeof(mac_data))
+if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) {
 return VIRTIO_NET_ERR;
+}
 
 n-mac_table.in_use = 0;
 n-mac_table.first_multi = 0;
@@ -360,54 +360,72 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t 
cmd,
 n-mac_table.multi_overflow = 0;
 memset(n-mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN);
 
-mac_data.entries = ldl_p(elem-out_sg[1].iov_base);
+s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries,
+   sizeof(mac_data.entries));
+mac_data.entries = ldl_p(mac_data.entries);
+if (s != sizeof(mac_data.entries)) {
+return VIRTIO_NET_ERR;
+}
+iov_discard_front(iov, iov_cnt, s);
 
-if (sizeof(mac_data.entries) +
-(mac_data.entries * ETH_ALEN)  elem-out_sg[1].iov_len)
+if (mac_data.entries * ETH_ALEN  iov_size(iov, iov_cnt)) {
 return VIRTIO_NET_ERR;
+}
 
 if (mac_data.entries = MAC_TABLE_ENTRIES) {
-memcpy(n-mac_table.macs, elem-out_sg[1].iov_base + sizeof(mac_data),
-   mac_data.entries * ETH_ALEN);
+s = iov_to_buf(iov, iov_cnt, 0, n-mac_table.macs,
+   mac_data.entries * ETH_ALEN);
+if (s != mac_data.entries * ETH_ALEN) {
+return VIRTIO_NET_ERR;
+}
 n-mac_table.in_use += mac_data.entries;
 } else {
 n-mac_table.uni_overflow = 1;
 }
 
+iov_discard_front(iov, iov_cnt, mac_data.entries * ETH_ALEN);
+
 n-mac_table.first_multi = n-mac_table.in_use;
 
-mac_data.entries = ldl_p(elem-out_sg[2].iov_base);
+s = iov_to_buf(iov, iov_cnt, 0, mac_data.entries,
+   sizeof(mac_data.entries));
+mac_data.entries = ldl_p(mac_data.entries);
+if (s != sizeof(mac_data.entries)) {
+return VIRTIO_NET_ERR;
+}
+
+iov_discard_front(iov, iov_cnt, s);
 
-if (sizeof(mac_data.entries) +
-(mac_data.entries * ETH_ALEN)  elem-out_sg[2].iov_len)
+if (mac_data.entries * ETH_ALEN != iov_size(iov, iov_cnt)) {
 return VIRTIO_NET_ERR;
+}
 
-if (mac_data.entries) {
-if (n-mac_table.in_use + mac_data.entries = MAC_TABLE_ENTRIES) {
-memcpy(n-mac_table.macs + (n-mac_table.in_use * ETH_ALEN),
-   elem-out_sg[2].iov_base + sizeof(mac_data),
-   mac_data.entries * ETH_ALEN);
-n-mac_table.in_use += mac_data.entries;
-} else {
-n

[QEMU PATCH v5 0/3] virtio-net: fix of ctrl commands

2013-01-22 Thread Amos Kong
Currently virtio-net code relys on the layout of descriptor,
this patchset removed the assumptions and introduced a control
command to set mac address. Last patch is a trivial renaming.

V2: check guest's iov_len
V3: fix of migration compatibility
make mac field in config space read-only when new feature is acked
V4: add fix of descriptor layout assumptions, trivial rename
V5: fix endianness after iov_to_buf copy

Amos Kong (2):
  virtio-net: introduce a new macaddr control
  virtio-net: rename ctrl rx commands

Michael S. Tsirkin (1):
  virtio-net: remove layout assumptions for ctrl vq

 hw/pc_piix.c|4 ++
 hw/virtio-net.c |  142 +-
 hw/virtio-net.h |   26 +++
 3 files changed, 108 insertions(+), 64 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 0/3] make mac programming for virtio net more robust

2013-01-21 Thread Amos Kong
Currenly mac is programmed byte by byte. This means that we
have an intermediate step where mac is wrong. 

Third patch introduced a new vq control command to set mac
address, it's atomic.

V2: check return of sending command, delay eth_mac_addr()
V3: restore software address when fail to set hardware address
V4: split eth_mac_addr, fix error handle
V5: rebase patches to net-next tree

Amos Kong (2):
  move virtnet_send_command() above virtnet_set_mac_address()
  virtio-net: introduce a new control to set macaddr

Stefan Hajnoczi (1):
  net: split eth_mac_addr for better error handling

 drivers/net/virtio_net.c|  110 ++-
 include/linux/etherdevice.h |2 +
 include/uapi/linux/virtio_net.h |8 +++-
 net/ethernet/eth.c  |   41 --
 4 files changed, 106 insertions(+), 55 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 1/3] move virtnet_send_command() above virtnet_set_mac_address()

2013-01-21 Thread Amos Kong
We want to send vq command to set mac address in
virtnet_set_mac_address(), so do this function moving.
Fixed a little issue of coding style.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/net/virtio_net.c |   89 ++---
 1 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index a6fcf15..395ab4f 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -753,6 +753,50 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct 
net_device *dev)
return NETDEV_TX_OK;
 }
 
+/*
+ * Send command via the control virtqueue and check status.  Commands
+ * supported by the hypervisor, as indicated by feature bits, should
+ * never fail unless improperly formated.
+ */
+static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
+struct scatterlist *data, int out, int in)
+{
+   struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
+   struct virtio_net_ctrl_hdr ctrl;
+   virtio_net_ctrl_ack status = ~0;
+   unsigned int tmp;
+   int i;
+
+   /* Caller should know better */
+   BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) ||
+   (out + in  VIRTNET_SEND_COMMAND_SG_MAX));
+
+   out++; /* Add header */
+   in++; /* Add return status */
+
+   ctrl.class = class;
+   ctrl.cmd = cmd;
+
+   sg_init_table(sg, out + in);
+
+   sg_set_buf(sg[0], ctrl, sizeof(ctrl));
+   for_each_sg(data, s, out + in - 2, i)
+   sg_set_buf(sg[i + 1], sg_virt(s), s-length);
+   sg_set_buf(sg[out + in - 1], status, sizeof(status));
+
+   BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC)  0);
+
+   virtqueue_kick(vi-cvq);
+
+   /* Spin for a response, the kick causes an ioport write, trapping
+* into the hypervisor, so the request should be handled immediately.
+*/
+   while (!virtqueue_get_buf(vi-cvq, tmp))
+   cpu_relax();
+
+   return status == VIRTIO_NET_OK;
+}
+
 static int virtnet_set_mac_address(struct net_device *dev, void *p)
 {
struct virtnet_info *vi = netdev_priv(dev);
@@ -819,51 +863,6 @@ static void virtnet_netpoll(struct net_device *dev)
 }
 #endif
 
-/*
- * Send command via the control virtqueue and check status.  Commands
- * supported by the hypervisor, as indicated by feature bits, should
- * never fail unless improperly formated.
- */
-static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
-struct scatterlist *data, int out, int in)
-{
-   struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
-   struct virtio_net_ctrl_hdr ctrl;
-   virtio_net_ctrl_ack status = ~0;
-   unsigned int tmp;
-   int i;
-
-   /* Caller should know better */
-   BUG_ON(!virtio_has_feature(vi-vdev, VIRTIO_NET_F_CTRL_VQ) ||
-   (out + in  VIRTNET_SEND_COMMAND_SG_MAX));
-
-   out++; /* Add header */
-   in++; /* Add return status */
-
-   ctrl.class = class;
-   ctrl.cmd = cmd;
-
-   sg_init_table(sg, out + in);
-
-   sg_set_buf(sg[0], ctrl, sizeof(ctrl));
-   for_each_sg(data, s, out + in - 2, i)
-   sg_set_buf(sg[i + 1], sg_virt(s), s-length);
-   sg_set_buf(sg[out + in - 1], status, sizeof(status));
-
-   BUG_ON(virtqueue_add_buf(vi-cvq, sg, out, in, vi, GFP_ATOMIC)  0);
-
-   virtqueue_kick(vi-cvq);
-
-   /*
-* Spin for a response, the kick causes an ioport write, trapping
-* into the hypervisor, so the request should be handled immediately.
-*/
-   while (!virtqueue_get_buf(vi-cvq, tmp))
-   cpu_relax();
-
-   return status == VIRTIO_NET_OK;
-}
-
 static void virtnet_ack_link_announce(struct virtnet_info *vi)
 {
rtnl_lock();
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 2/3] net: split eth_mac_addr for better error handling

2013-01-21 Thread Amos Kong
From: Stefan Hajnoczi stefa...@gmail.com

When we set mac address, software mac address in system and hardware mac
address all need to be updated. Current eth_mac_addr() doesn't allow
callers to implement error handling nicely.

This patch split eth_mac_addr() to prepare part and real commit part,
then we can prepare first, and try to change hardware address, then do
the real commit if hardware address is set successfully.

Signed-off-by: Stefan Hajnoczi stefa...@gmail.com
Signed-off-by: Amos Kong ak...@redhat.com
---
 include/linux/etherdevice.h |2 ++
 net/ethernet/eth.c  |   41 +++--
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 1a43e1b..c623861 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -40,6 +40,8 @@ extern int eth_header_cache(const struct neighbour *neigh, 
struct hh_cache *hh,
 extern void eth_header_cache_update(struct hh_cache *hh,
const struct net_device *dev,
const unsigned char *haddr);
+extern int eth_prepare_mac_addr_change(struct net_device *dev, void *p);
+extern void eth_commit_mac_addr_change(struct net_device *dev, void *p);
 extern int eth_mac_addr(struct net_device *dev, void *p);
 extern int eth_change_mtu(struct net_device *dev, int new_mtu);
 extern int eth_validate_addr(struct net_device *dev);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index bc39c8c..a36c85e 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -272,6 +272,36 @@ void eth_header_cache_update(struct hh_cache *hh,
 EXPORT_SYMBOL(eth_header_cache_update);
 
 /**
+ * eth_prepare_mac_addr_change - prepare for mac change
+ * @dev: network device
+ * @p: socket address
+ */
+int eth_prepare_mac_addr_change(struct net_device *dev, void *p)
+{
+   struct sockaddr *addr = p;
+
+   if (!(dev-priv_flags  IFF_LIVE_ADDR_CHANGE)  netif_running(dev))
+   return -EBUSY;
+   if (!is_valid_ether_addr(addr-sa_data))
+   return -EADDRNOTAVAIL;
+   return 0;
+}
+EXPORT_SYMBOL(eth_prepare_mac_addr_change);
+
+/**
+ * eth_commit_mac_addr_change - commit mac change
+ * @dev: network device
+ * @p: socket address
+ */
+void eth_commit_mac_addr_change(struct net_device *dev, void *p)
+{
+   struct sockaddr *addr = p;
+
+   memcpy(dev-dev_addr, addr-sa_data, ETH_ALEN);
+}
+EXPORT_SYMBOL(eth_commit_mac_addr_change);
+
+/**
  * eth_mac_addr - set new Ethernet hardware address
  * @dev: network device
  * @p: socket address
@@ -283,13 +313,12 @@ EXPORT_SYMBOL(eth_header_cache_update);
  */
 int eth_mac_addr(struct net_device *dev, void *p)
 {
-   struct sockaddr *addr = p;
+   int ret;
 
-   if (!(dev-priv_flags  IFF_LIVE_ADDR_CHANGE)  netif_running(dev))
-   return -EBUSY;
-   if (!is_valid_ether_addr(addr-sa_data))
-   return -EADDRNOTAVAIL;
-   memcpy(dev-dev_addr, addr-sa_data, ETH_ALEN);
+   ret = eth_prepare_mac_addr_change(dev, p);
+   if (ret  0)
+   return ret;
+   eth_commit_mac_addr_change(dev, p);
return 0;
 }
 EXPORT_SYMBOL(eth_mac_addr);
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 3/3] virtio-net: introduce a new control to set macaddr

2013-01-21 Thread Amos Kong
Currently we write MAC address to pci config space byte by byte,
this means that we have an intermediate step where mac is wrong.
This patch introduced a new control command to set MAC address,
it's atomic.

VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.

Signed-off-by: Amos Kong ak...@redhat.com
---
 drivers/net/virtio_net.c|   21 ++---
 include/uapi/linux/virtio_net.h |8 +++-
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 395ab4f..701408a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -802,14 +802,28 @@ static int virtnet_set_mac_address(struct net_device 
*dev, void *p)
struct virtnet_info *vi = netdev_priv(dev);
struct virtio_device *vdev = vi-vdev;
int ret;
+   struct sockaddr *addr = p;
+   struct scatterlist sg;
 
-   ret = eth_mac_addr(dev, p);
+   ret = eth_prepare_mac_addr_change(dev, p);
if (ret)
return ret;
 
-   if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
+   if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
+   sg_init_one(sg, addr-sa_data, dev-addr_len);
+   if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
+ VIRTIO_NET_CTRL_MAC_ADDR_SET,
+ sg, 1, 0)) {
+   dev_warn(vdev-dev,
+Failed to set mac address by vq command.\n);
+   return -EINVAL;
+   }
+   } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
vdev-config-set(vdev, offsetof(struct virtio_net_config, mac),
- dev-dev_addr, dev-addr_len);
+ addr-sa_data, dev-addr_len);
+   }
+
+   eth_commit_mac_addr_change(dev, p);
 
return 0;
 }
@@ -1627,6 +1641,7 @@ static unsigned int features[] = {
VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
+   VIRTIO_NET_F_CTRL_MAC_ADDR,
 };
 
 static struct virtio_driver virtio_net_driver = {
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 848e358..a5a8c88 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -53,6 +53,7 @@
 * network */
 #define VIRTIO_NET_F_MQ22  /* Device supports Receive Flow
 * Steering */
+#define VIRTIO_NET_F_CTRL_MAC_ADDR 23  /* Set MAC address */
 
 #define VIRTIO_NET_S_LINK_UP   1   /* Link is up */
 #define VIRTIO_NET_S_ANNOUNCE  2   /* Announcement is needed */
@@ -127,7 +128,7 @@ typedef __u8 virtio_net_ctrl_ack;
  #define VIRTIO_NET_CTRL_RX_NOBCAST  5
 
 /*
- * Control the MAC filter table.
+ * Control the MAC
  *
  * The MAC filter table is managed by the hypervisor, the guest should
  * assume the size is infinite.  Filtering should be considered
@@ -140,6 +141,10 @@ typedef __u8 virtio_net_ctrl_ack;
  * first sg list contains unicast addresses, the second is for multicast.
  * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
  * is available.
+ *
+ * The ADDR_SET command requests one out scatterlist, it contains a
+ * 6 bytes MAC address. This functionality is present if the
+ * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
  */
 struct virtio_net_ctrl_mac {
__u32 entries;
@@ -148,6 +153,7 @@ struct virtio_net_ctrl_mac {
 
 #define VIRTIO_NET_CTRL_MAC1
  #define VIRTIO_NET_CTRL_MAC_TABLE_SET0
+ #define VIRTIO_NET_CTRL_MAC_ADDR_SET 1
 
 /*
  * Control VLAN filtering
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 2/2] virtio-net: introduce a new control to set macaddr

2013-01-19 Thread Amos Kong
On Fri, Jan 18, 2013 at 12:00:42PM +0100, Stefan Hajnoczi wrote:
 On Thu, Jan 17, 2013 at 06:40:12PM +0800, ak...@redhat.com wrote:
  diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
  index 395ab4f..837c978 100644
  --- a/drivers/net/virtio_net.c
  +++ b/drivers/net/virtio_net.c
  @@ -802,14 +802,32 @@ static int virtnet_set_mac_address(struct net_device 
  *dev, void *p)
  struct virtnet_info *vi = netdev_priv(dev);
  struct virtio_device *vdev = vi-vdev;
  int ret;
  +   struct scatterlist sg;
  +   char save_addr[ETH_ALEN];
  +   unsigned char save_aatype;
  +
  +   memcpy(save_addr, dev-dev_addr, ETH_ALEN);
  +   save_aatype = dev-addr_assign_type;
   
  ret = eth_mac_addr(dev, p);
  if (ret)
  return ret;
   
  -   if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
  +   if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
  +   sg_init_one(sg, dev-dev_addr, dev-addr_len);
  +   if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
  + VIRTIO_NET_CTRL_MAC_ADDR_SET,
  + sg, 1, 0)) {
  +   dev_warn(vdev-dev,
  +Failed to set mac address by vq command.\n);
  +   memcpy(dev-dev_addr, save_addr, ETH_ALEN);
  +   dev-addr_assign_type = save_aatype;
  +   return -EINVAL;
  +   }
 
 eth_mac_addr() doesn't allow callers to implement error handling nicely.
 Although you didn't duplicate it's code directly, this patch still leaks
 internals of eth_mac_addr().
 
 How about splitting eth_mac_addr() in a separate patch:

Agree, then we can have nice error handling.
I will send a V4 and include this patch.
Thanks.
 
 int eth_prepare_mac_addr_change(struct net_device *dev, void *p)
 {
 struct sockaddr *addr = p;
 if (!(dev-priv_flags  IFF_LIVE_ADDR_CHANGE)  netif_running(dev))
 return -EBUSY;
 if (!is_valid_ether_addr(addr-sa_data))
 return -EADDRNOTAVAIL;
   return 0;
 }

...

 Now virtio_net.c does:
 
 ret = eth_prepare_mac_addr_change(dev, p);
 if (ret  0)
   return ret;
 
 if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
   sg_init_one(sg, dev-dev_addr, dev-addr_len);

 trivial issue, %s/dev-dev_addr/addr-sa_data

   if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
 VIRTIO_NET_CTRL_MAC_ADDR_SET,
 sg, 1, 0)) {
   dev_warn(vdev-dev,
Failed to set mac address by vq command.\n);
   return -EINVAL;
   }
 } ...
 
 eth_commit_mac_addr_change(dev, p);
 return 0;
 
 Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [QEMU PATCH v4 1/3] virtio-net: remove layout assumptions for ctrl vq

2013-01-18 Thread Amos Kong
On Sat, Jan 19, 2013 at 09:54:26AM +0800, ak...@redhat.com wrote:
 From: Michael S. Tsirkin m...@redhat.com
 
 Virtio-net code makes assumption about virtqueue descriptor layout
 (e.g. sg[0] is the header, sg[1] is the data buffer).
 
 This patch makes code not rely on the layout of descriptors.
 
 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 Signed-off-by: Amos Kong ak...@redhat.com
 ---
  hw/virtio-net.c | 128 
 
  1 file changed, 74 insertions(+), 54 deletions(-)

Had tested this patch with following scenarios:

VIRTIO_NET_CTRL_RX_MODE
1) ip link eth0 promisc on/off
2) ip link set eth0 allmulticast on/off
3) ip link set eth0 multicast on/off

VIRTIO_NET_CTRL_MAC
4) ifconfig eth0 hw ether 52:54:00:12:34:57
5) ping guest after joined guest into multicast
   group (225.0.0.1 ~ 225.0.0.10)

VIRTIO_NET_CTRL_VLAN
6) vconfig add eth0 2; vconfig rem eth0.2


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [QEMU PATCH v2] virtio-net: introduce a new macaddr control

2013-01-17 Thread Amos Kong
On Thu, Jan 17, 2013 at 01:45:11PM +0800, Amos Kong wrote:
 On Thu, Jan 17, 2013 at 11:49:20AM +1030, Rusty Russell wrote:
  ak...@redhat.com writes:
   @@ -349,6 +351,14 @@ static int virtio_net_handle_mac(VirtIONet *n, 
   uint8_t cmd,
{
struct virtio_net_ctrl_mac mac_data;

   +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET  elem-out_num == 2 
   +elem-out_sg[1].iov_len == ETH_ALEN) {
   +/* Set MAC address */
   +memcpy(n-mac, elem-out_sg[1].iov_base, 
   elem-out_sg[1].iov_len);
   +qemu_format_nic_info_str(n-nic-nc, n-mac);
   +return VIRTIO_NET_OK;
   +}
  
  Does the rest of the net device still rely on the layout of descriptors?
 
 No, only info string of net client relies on n-mac

I misunderstood. There is no clear limitation of how much descriptor are
used for each vq command, but many commands rely on the layout of
descriptiors. eg:

virtio-net:
   VIRTIO_NET_CTRL_RX_PROMISC
   VIRTIO_NET_CTRL_RX_ALLMULTI
   VIRTIO_NET_CTRL_MAC_TABLE_SET
   etc
 
  If so, OK, we'll fix them all together.  If not, this introduces a new
  one.
  
  Cheers,
  Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [QEMU PATCH v2] virtio-net: introduce a new macaddr control

2013-01-17 Thread Amos Kong
On Thu, Jan 17, 2013 at 02:13:08PM +0200, Michael S. Tsirkin wrote:
 On Thu, Jan 17, 2013 at 11:49:20AM +1030, Rusty Russell wrote:
  ak...@redhat.com writes:
   @@ -349,6 +351,14 @@ static int virtio_net_handle_mac(VirtIONet *n, 
   uint8_t cmd,
{
struct virtio_net_ctrl_mac mac_data;

   +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET  elem-out_num == 2 
   +elem-out_sg[1].iov_len == ETH_ALEN) {
   +/* Set MAC address */
   +memcpy(n-mac, elem-out_sg[1].iov_base, 
   elem-out_sg[1].iov_len);
   +qemu_format_nic_info_str(n-nic-nc, n-mac);
   +return VIRTIO_NET_OK;
   +}
  
  Does the rest of the net device still rely on the layout of descriptors?
  If so, OK, we'll fix them all together.  If not, this introduces a new
  one.
  
  Cheers,
  Rusty.
 
 The following fixes all existing users.
 Got to deal with some urgent stuff so did not test yet -
 Amos, would you like to include this in your patchset
 and build on it, test it all together?

No problem.

 If not I'll get to it next week.
 
 ---
 
 virtio-net: remove layout assumptions for ctrl vq
 
 Signed-off-by: Michael S. Tsirkin m...@redhat.com

...
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 2/2] virtio-net: introduce a new control to set macaddr

2013-01-16 Thread Amos Kong
On Wed, Jan 16, 2013 at 02:20:39PM +0800, Jason Wang wrote:
 On Wednesday, January 16, 2013 01:57:01 PM ak...@redhat.com wrote:
  From: Amos Kong ak...@redhat.com
  
  Currently we write MAC address to pci config space byte by byte,
  this means that we have an intermediate step where mac is wrong.
  This patch introduced a new control command to set MAC address
  in one time.
  
  VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.
  
  Signed-off-by: Amos Kong ak...@redhat.com
  ---
   drivers/net/virtio_net.c| 24 +---
   include/uapi/linux/virtio_net.h |  8 +++-
   2 files changed, 24 insertions(+), 8 deletions(-)
  
  diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
  index 395ab4f..c8901b6 100644
  --- a/drivers/net/virtio_net.c
  +++ b/drivers/net/virtio_net.c
  @@ -802,16 +802,25 @@ static int virtnet_set_mac_address(struct net_device
  *dev, void *p) struct virtnet_info *vi = netdev_priv(dev);
  struct virtio_device *vdev = vi-vdev;
  int ret;
  +   struct sockaddr *addr = p;
  +   struct scatterlist sg;
  
  -   ret = eth_mac_addr(dev, p);
  -   if (ret)
  -   return ret;
  -
  -   if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
  +   if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
  +   sg_init_one(sg, addr-sa_data, dev-addr_len);
  +   if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
  + VIRTIO_NET_CTRL_MAC_ADDR_SET,
  + sg, 1, 0)) {
  +   dev_warn(vdev-dev,
  +Failed to set mac address by vq command.\n);
  +   return -EINVAL;
  +   }
  +   } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
  vdev-config-set(vdev, offsetof(struct virtio_net_config, mac),
  - dev-dev_addr, dev-addr_len);
  + addr-sa_data, dev-addr_len);
  +   }
  +   ret = eth_mac_addr(dev, p);
  
 
 The you will the validity check in eth_mac_addr which may result a wrong mac 
 address to be set in the hardware (or is there any check in qemu) and a 
 inconsistency bettween what kernel assumes and qemu has.
 
 You can take a look at netvsc driver that calls eth_mac_addr() first and 
 restore the software mac address when fail to enforce it to hardware.

Thanks for the catching, I will move eth_mac_addr() back to above,
just restore addr if fail to send command.

I will also use DEFINE_PROP_BIT to fix migration issue, thanks.
 
 Thanks
  -   return 0;
  +   return ret;
   }


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [QEMU PATCH v2] virtio-net: introduce a new macaddr control

2013-01-16 Thread Amos Kong
On Thu, Jan 17, 2013 at 11:49:20AM +1030, Rusty Russell wrote:
 ak...@redhat.com writes:
  @@ -349,6 +351,14 @@ static int virtio_net_handle_mac(VirtIONet *n, uint8_t 
  cmd,
   {
   struct virtio_net_ctrl_mac mac_data;
   
  +if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET  elem-out_num == 2 
  +elem-out_sg[1].iov_len == ETH_ALEN) {
  +/* Set MAC address */
  +memcpy(n-mac, elem-out_sg[1].iov_base, elem-out_sg[1].iov_len);
  +qemu_format_nic_info_str(n-nic-nc, n-mac);
  +return VIRTIO_NET_OK;
  +}
 
 Does the rest of the net device still rely on the layout of descriptors?

No, only info string of net client relies on n-mac

 If so, OK, we'll fix them all together.  If not, this introduces a new
 one.
 
 Cheers,
 Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] virtio-spec: set mac address by a new vq command

2013-01-16 Thread Amos Kong
On Wed, Jan 16, 2013 at 11:22:23AM +0200, Michael S. Tsirkin wrote:
 On Wed, Jan 16, 2013 at 10:13:28AM +0100, Stefan Hajnoczi wrote:
  On Wed, Jan 16, 2013 at 03:33:24PM +0800, ak...@redhat.com wrote:
   +\change_inserted -1930653948 1358320004
   +The command VIRTIO_NET_CTRL_MAC_ADDR_SET is used to set 
   +\begin_inset Quotes eld
   +\end_inset
   +
   +physical
   +\begin_inset Quotes erd
   +\end_inset
   +
   + address of the network card.
  
  The physical address of the network card?  That term is not defined
  anywhere in the specification.

I will replace it with 'the default MAC address'

  Perhaps it's best to explain that the config space mac field and
  VIRTIO_NET_CTRL_MAC_ADDR_SET both set the default MAC address which rx
  filtering accepts.  (The MAC table is an additional set of MAC addresses
  which rx filtering accepts.)
  
  It would also be worth explaining that VIRTIO_NET_CTRL_MAC_ADDR_SET is
  atomic whereas the config space mac field is not.  Therefore,
  VIRTIO_NET_CTRL_MAC_ADDR_SET is preferred, especially while the NIC is
  up.

Ok, will send a v2, thanks.

  Stefan
 
 It's probably best to simply make the config space field
 read-only if the feature bit is acked.

It's reasonable.

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index c18e276..54c5eae 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -93,7 +93,8 @@ static void virtio_net_set_config(VirtIODevice
*vdev, const uint8_t *config)
 
 memcpy(netcfg, config, sizeof(netcfg));
 
-if (memcmp(netcfg.mac, n-mac, ETH_ALEN)) {
+if (!(n-vdev.guest_features  VIRTIO_NET_F_CTRL_MAC_ADDR  1) 
+memcmp(netcfg.mac, n-mac, ETH_ALEN)) {
 memcpy(n-mac, netcfg.mac, ETH_ALEN);
 qemu_format_nic_info_str(n-nic-nc, n-mac);
 }

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [RFC PATCH 2/2] virtio-net: introduce a new control to set macaddr

2013-01-15 Thread Amos Kong
On Thu, Jan 10, 2013 at 10:57:05PM +0800, Jason Wang wrote:
 On 01/10/2013 10:45 PM, ak...@redhat.com wrote:
  From: Amos Kong ak...@redhat.com
 
  Currently we write MAC address to pci config space byte by byte,
  this means that we have an intermediate step where mac is wrong.
  This patch introduced a new control command to set MAC address
  in one time.
 
  VIRTIO_NET_F_CTRL_MAC_ADDR is a new feature bit for compatibility.
 
  Signed-off-by: Amos Kong ak...@redhat.com
  ---
   drivers/net/virtio_net.c| 16 +++-
   include/uapi/linux/virtio_net.h |  8 +++-
   2 files changed, 22 insertions(+), 2 deletions(-)
 
  diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
  index 395ab4f..ff22bcd 100644
  --- a/drivers/net/virtio_net.c
  +++ b/drivers/net/virtio_net.c
  @@ -803,13 +803,26 @@ static int virtnet_set_mac_address(struct net_device 
  *dev, void *p)
  struct virtio_device *vdev = vi-vdev;
  int ret;
   
  +   struct scatterlist sg;
  +
  ret = eth_mac_addr(dev, p);
  if (ret)
  return ret;
   
  -   if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
  +   if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
  +   /* Set MAC address by sending vq command */
  +   sg_init_one(sg, dev-dev_addr, dev-addr_len);
  +   virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
  +VIRTIO_NET_CTRL_MAC_ADDR_SET,
  +sg, 1, 0);
  +   return 0;
  +   }
  +
 
 Better to check the return of virtnet_send_command() and give a warn
 like other command. Btw, need we fail back to try the old way then?

Yes, it's necessary to check the return value of
virtnet_send_command().

In fail case, I like to return -EINVAL to userspace, because we don't
only want to set mac successfully, we also want to resolve the
addr inconsistent issue by this feature (vq cmd).

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/2] make mac programming for virtio net more robust

2013-01-10 Thread Amos Kong
On Thu, Jan 10, 2013 at 10:45:39PM +0800, ak...@redhat.com wrote:
 From: Amos Kong ak...@redhat.com
 
 Currenly mac is programmed byte by byte. This means that we
 have an intermediate step where mac is wrong. 
 
 Second patch introduced a new vq control command to set mac
 address in one time.

I just posted RFC patches (kernel  qemu) out,
specifiction update will be sent when the solution
is accepted.

MST has another solution:
1. add a feature to disable mac address in pci space
2. introduce a new vq control cmd to add new mac address
   to mac filter table. (no long check n-mac in receiver_filter())

Welcome to give comments, thanks!
Amos
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: PPC: Fix SREGS documentation reference

2012-12-11 Thread Amos Kong
On Tue, Dec 11, 2012 at 9:38 PM, Mihai Caraman
mihai.cara...@freescale.com wrote:

 Reflect the uapi folder change in SREGS API documentation.

 Signed-off-by: Mihai Caraman mihai.cara...@freescale.com
 ---
  Documentation/virtual/kvm/api.txt |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)

 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index a4df553..9cf591d 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -345,7 +345,7 @@ struct kvm_sregs {
 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
  };

 -/* ppc -- see arch/powerpc/include/asm/kvm.h */
 +/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */

Trivial fix.
Reviewed-by: Amos Kong kongjian...@gmail.com


  interrupt_bitmap is a bitmap of pending external interrupts.  At most
  one bit may be set.  This interrupt has been acknowledged by the APIC
 --
 1.7.4.1


 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: PPC: Fix SREGS documentation reference

2012-12-11 Thread Amos Kong
On Tue, Dec 11, 2012 at 9:38 PM, Mihai Caraman
mihai.cara...@freescale.com wrote:

 Reflect the uapi folder change in SREGS API documentation.

 Signed-off-by: Mihai Caraman mihai.cara...@freescale.com
 ---
  Documentation/virtual/kvm/api.txt |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)

 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index a4df553..9cf591d 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -345,7 +345,7 @@ struct kvm_sregs {
 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
  };

 -/* ppc -- see arch/powerpc/include/asm/kvm.h */
 +/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */

Trivial fix.
Reviewed-by: Amos Kong kongjian...@gmail.com


  interrupt_bitmap is a bitmap of pending external interrupts.  At most
  one bit may be set.  This interrupt has been acknowledged by the APIC
 --
 1.7.4.1


 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu-kvm: remove boot=on|off drive parameter compatibility

2012-09-30 Thread Amos Kong
- Original Message -
 
 Option is deprecated and warning has been in place for one year.
 
 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
 
 diff --git a/blockdev.c b/blockdev.c
 index 4a5266e..7c83baa 100644
 --- a/blockdev.c
 +++ b/blockdev.c
 @@ -432,12 +432,6 @@ DriveInfo *drive_init(QemuOpts *opts, int
 default_to_scsi)
  return NULL;
  }
  
 -if (qemu_opt_get(opts, boot) != NULL) {
 -fprintf(stderr, qemu-kvm: boot=on|off is deprecated and
 will be 
 -ignored. Future versions will reject this
 parameter. Please 
 -update your scripts.\n);
 -}
 -
  on_write_error = BLOCK_ERR_STOP_ENOSPC;
  if ((buf = qemu_opt_get(opts, werror)) != NULL) {
  if (type != IF_IDE  type != IF_SCSI  type != IF_VIRTIO
   type != IF_NONE) {
 diff --git a/qemu-config.c b/qemu-config.c
 index 3eaee48..eba977e 100644
 --- a/qemu-config.c
 +++ b/qemu-config.c
 @@ -114,10 +114,6 @@ static QemuOptsList qemu_drive_opts = {
  .name = copy-on-read,
  .type = QEMU_OPT_BOOL,
  .help = copy read data from backing file into image
  file,
 -},{
 -.name = boot,
 -.type = QEMU_OPT_BOOL,
 -.help = (deprecated, ignored),
  },
  { /* end of list */ }
  },


Reviewed-by: Amos Kong ak...@redhat.com
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] kvmclock: fix guest stop notification

2012-09-30 Thread Amos Kong
- Original Message -
 On Thu, Sep 20, 2012 at 09:46:41AM -0300, Marcelo Tosatti wrote:
  On Thu, Sep 20, 2012 at 01:55:20PM +0530, Amit Shah wrote:
   Commit f349c12c0434e29c79ecde89029320c4002f7253 added the guest
   stop

In commitlog of f349c12c0434e29c79ecde89029320c4002f7253: 

## This patch uses the qemu Notifier system to tell the guest it _is about to 
be_ stopped


   notification, but it did it in a way that the stop notification
   would
   never reach the kernel.  The kvm_vm_state_changed() function gets
   a
   value of 0 for the 'running' parameter when the VM is stopped,
   making
   all the code added previously dead code.
   
   This patch reworks the code so that it's called when 'running' is
   0,
   which indicates the VM was stopped.

Amit, did you touch any real issue? guest gets call trace with current code?
which kind of context?

Someone told me he got call trace when shutdown guest by 'init 0', I didn't
verify this issue.

   CC: Eric B Munson emun...@mgebm.net
   CC: Raghavendra K T raghavendra...@linux.vnet.ibm.com
   CC: Andreas Färber afaer...@suse.de
   CC: Marcelo Tosatti mtosa...@redhat.com
   CC: Paolo Bonzini pbonz...@redhat.com
   CC: Laszlo Ersek ler...@redhat.com
   Signed-off-by: Amit Shah amit.s...@redhat.com
   ---
hw/kvm/clock.c |   21 +++--
1 files changed, 11 insertions(+), 10 deletions(-)
   
   diff --git a/hw/kvm/clock.c b/hw/kvm/clock.c
   index 824b978..f3427eb 100644
   --- a/hw/kvm/clock.c
   +++ b/hw/kvm/clock.c
   @@ -71,18 +71,19 @@ static void kvmclock_vm_state_change(void
   *opaque, int running,


I found this function is only called when resume vm
(here running is 1, it means vm is already resumed?
we don't call that ioctl _before_ resume).

kvmclock_vm_state_change() is not called when I stop vm
through qemu monitor command.



if (running) {
s-clock_valid = false;
   +return;
   +}

   -if (!cap_clock_ctrl) {
   -return;
   -}
   -for (penv = first_cpu; penv != NULL; penv =
   penv-next_cpu) {
   -ret = kvm_vcpu_ioctl(penv, KVM_KVMCLOCK_CTRL, 0);
   -if (ret) {
   -if (ret != -EINVAL) {
   -fprintf(stderr, %s: %s\n, __func__,
   strerror(-ret));
   -}
   -return;
   +if (!cap_clock_ctrl) {
   +return;
   +}
   +for (penv = first_cpu; penv != NULL; penv = penv-next_cpu)
   {
   +ret = kvm_vcpu_ioctl(penv, KVM_KVMCLOCK_CTRL, 0);
   +if (ret) {
   +if (ret != -EINVAL) {
   +fprintf(stderr, %s: %s\n, __func__,
   strerror(-ret));
}
   +return;
}
}
}
   --
   1.7.7.6
  
  ACK
  
  Avi, please merge through uq/master.
 
 NACK, guest should be notified when the VM is starting, not
 when stopping.

# from api.txt
ioctl (KVM_CAP_KVMCLOCK_CTRL) can be called any time _after_ pausing
the vcpu, but _before_ it is resumed.


Thanks, Amos
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   3   4   >