[PATCH] KVM-test: Add two scripts to disable services for perf tests

2011-09-07 Thread Amos Kong
System services on the guest and host consume resources unpredictably, which
affects the perf results. We can use the two scripts below to disable some
services on the host and guest.

stop_services_perf.sh is used to stop the running services.
off_services_perf.sh is used to turn off services when the host starts up.

We can use them to prepare environment for performance testcases.

Signed-off-by: Amos Kong ak...@redhat.com
---
 client/tests/kvm/scripts/off_services_perf.sh  |   27 
 client/tests/kvm/scripts/stop_services_perf.sh |   27 
 2 files changed, 54 insertions(+), 0 deletions(-)
 create mode 100755 client/tests/kvm/scripts/off_services_perf.sh
 create mode 100755 client/tests/kvm/scripts/stop_services_perf.sh

diff --git a/client/tests/kvm/scripts/off_services_perf.sh 
b/client/tests/kvm/scripts/off_services_perf.sh
new file mode 100755
index 000..6ddf168
--- /dev/null
+++ b/client/tests/kvm/scripts/off_services_perf.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+chkconfig  auditd off
+chkconfig  autofs off
+chkconfig  avahi-daemon off
+chkconfig  crond off
+chkconfig  cups off
+chkconfig  ip6tables off
+chkconfig  sendmail off
+chkconfig  smartd off
+chkconfig  xfs off
+chkconfig  acpid off
+chkconfig  atd off
+chkconfig  haldaemon off
+chkconfig  mdmonitor off
+chkconfig  netfs off
+chkconfig  rhnsd off
+chkconfig  rpcgssd off
+chkconfig  rpcidmapd off
+chkconfig  abrtd off
+chkconfig  kdump off
+chkconfig  koan off
+chkconfig  libvirt-guests off
+chkconfig  ntpdate off
+chkconfig  portreserve off
+chkconfig  postfix off
+chkconfig  rhsmcertd off
+chkconfig  tuned off
diff --git a/client/tests/kvm/scripts/stop_services_perf.sh 
b/client/tests/kvm/scripts/stop_services_perf.sh
new file mode 100755
index 000..84150b0
--- /dev/null
+++ b/client/tests/kvm/scripts/stop_services_perf.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+service auditd stop
+service avahi-daemon stop
+service anacron stop
+service qpidd stop
+service smartd stop
+service crond stop
+service haldaemon stop
+service opensmd stop
+service openibd stop
+service yum-updatesd stop
+service collectd stop
+service bluetooth stop
+service cups stop
+service cpuspeed stop
+service hidd stop
+service isdn stop
+service kudzu stop
+service lvm2-monitor stop
+service mcstrans stop
+service mdmonitor stop
+service messagebus stop
+service restorecond stop
+service rhnsd stop
+service rpcgssd stop
+service setroubleshoot stop
+service smartd stop

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/10] iommu/amd: Use bus_set_iommu instead of register_iommu

2011-09-07 Thread Joerg Roedel
Convert the AMD IOMMU driver to use the new interface for
publishing the iommu_ops.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
---
 drivers/iommu/amd_iommu.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index a14f8dc..57f6f38 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2493,7 +2493,7 @@ static unsigned device_dma_ops_init(void)
 
 void __init amd_iommu_init_api(void)
 {
-   register_iommu(amd_iommu_ops);
+   bus_set_iommu(pci_bus_type, amd_iommu_ops);
 }
 
 int __init amd_iommu_init_dma_ops(void)
-- 
1.7.4.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/10] iommu/core: Remove global iommu_ops and register_iommu

2011-09-07 Thread Joerg Roedel
With all IOMMU drivers being converted to bus_set_iommu the
global iommu_ops are no longer required. The same is true
for the deprecated register_iommu function.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
---
 drivers/iommu/iommu.c |   27 ---
 include/linux/iommu.h |1 -
 2 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index f2ced4c..82178cb 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -25,50 +25,31 @@
 #include linux/errno.h
 #include linux/iommu.h
 
-static struct iommu_ops *iommu_ops;
-
-void register_iommu(struct iommu_ops *ops)
-{
-   if (iommu_ops)
-   BUG();
-
-   iommu_ops = ops;
-}
-
 void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops)
 {
 }
 
 bool iommu_present(struct bus_type *bus)
 {
-   if (bus-iommu_ops != NULL)
-   return true;
-   else
-   return iommu_ops != NULL;
+   return bus-iommu_ops != NULL;
 }
 EXPORT_SYMBOL_GPL(iommu_present);
 
 struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
 {
struct iommu_domain *domain;
-   struct iommu_ops *ops;
int ret;
 
-   if (bus-iommu_ops)
-   ops = bus-iommu_ops;
-   else
-   ops = iommu_ops;
-
-   if (ops == NULL)
+   if (bus-iommu_ops == NULL)
return NULL;
 
domain = kmalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
return NULL;
 
-   domain-ops = ops;
+   domain-ops = bus-iommu_ops;
 
-   ret = iommu_ops-domain_init(domain);
+   ret = domain-ops-domain_init(domain);
if (ret)
goto out_free;
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index de73219..7014f40 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -54,7 +54,6 @@ struct iommu_ops {
  unsigned long cap);
 };
 
-extern void register_iommu(struct iommu_ops *ops);
 extern void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops);
 extern bool iommu_present(struct bus_type *bus);
 extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus);
-- 
1.7.4.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 4/4] qmp/hmp: add block_set_io_throttle

2011-09-07 Thread Zhi Yong Wu
The patch introduces one new command, block_set_io_throttle. For its usage
syntax, if you have a better idea, please let me know.

Signed-off-by: Zhi Yong Wu wu...@linux.vnet.ibm.com
---
 block.c |   26 +++-
 blockdev.c  |   69 +++
 blockdev.h  |2 +
 hmp-commands.hx |   15 
 qerror.c|4 +++
 qerror.h|3 ++
 qmp-commands.hx |   52 -
 7 files changed, 168 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index 8a82273..e435a79 100644
--- a/block.c
+++ b/block.c
@@ -1938,6 +1938,16 @@ static void bdrv_print_dict(QObject *obj, void *opaque)
 qdict_get_bool(qdict, ro),
 qdict_get_str(qdict, drv),
 qdict_get_bool(qdict, encrypted));
+
+monitor_printf(mon,  bps=% PRId64  bps_rd=% PRId64
+ bps_wr=% PRId64  iops=% PRId64
+ iops_rd=% PRId64  iops_wr=% PRId64,
+qdict_get_int(qdict, bps),
+qdict_get_int(qdict, bps_rd),
+qdict_get_int(qdict, bps_wr),
+qdict_get_int(qdict, iops),
+qdict_get_int(qdict, iops_rd),
+qdict_get_int(qdict, iops_wr));
 } else {
 monitor_printf(mon,  [not inserted]);
 }
@@ -1970,10 +1980,22 @@ void bdrv_info(Monitor *mon, QObject **ret_data)
 QDict *bs_dict = qobject_to_qdict(bs_obj);
 
 obj = qobject_from_jsonf({ 'file': %s, 'ro': %i, 'drv': %s, 
- 'encrypted': %i },
+ 'encrypted': %i, 
+ 'bps': % PRId64 ,
+ 'bps_rd': % PRId64 ,
+ 'bps_wr': % PRId64 ,
+ 'iops': % PRId64 ,
+ 'iops_rd': % PRId64 ,
+ 'iops_wr': % PRId64 },
  bs-filename, bs-read_only,
  bs-drv-format_name,
- bdrv_is_encrypted(bs));
+ bdrv_is_encrypted(bs),
+ bs-io_limits.bps[BLOCK_IO_LIMIT_TOTAL],
+ bs-io_limits.bps[BLOCK_IO_LIMIT_READ],
+ bs-io_limits.bps[BLOCK_IO_LIMIT_WRITE],
+ bs-io_limits.iops[BLOCK_IO_LIMIT_TOTAL],
+ bs-io_limits.iops[BLOCK_IO_LIMIT_READ],
+ bs-io_limits.iops[BLOCK_IO_LIMIT_WRITE]);
 if (bs-backing_file[0] != '\0') {
 QDict *qdict = qobject_to_qdict(obj);
 qdict_put(qdict, backing_file,
diff --git a/blockdev.c b/blockdev.c
index 619ae9f..7f5c4df 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -747,6 +747,75 @@ int do_change_block(Monitor *mon, const char *device,
 return monitor_read_bdrv_key_start(mon, bs, NULL, NULL);
 }
 
+/* throttling disk I/O limits */
+int do_block_set_io_throttle(Monitor *mon,
+   const QDict *qdict, QObject **ret_data)
+{
+const char *devname = qdict_get_str(qdict, device);
+uint64_t bps= qdict_get_try_int(qdict, bps, -1);
+uint64_t bps_rd = qdict_get_try_int(qdict, bps_rd, -1);
+uint64_t bps_wr = qdict_get_try_int(qdict, bps_wr, -1);
+uint64_t iops   = qdict_get_try_int(qdict, iops, -1);
+uint64_t iops_rd= qdict_get_try_int(qdict, iops_rd, -1);
+uint64_t iops_wr= qdict_get_try_int(qdict, iops_wr, -1);
+BlockDriverState *bs;
+
+bs = bdrv_find(devname);
+if (!bs) {
+qerror_report(QERR_DEVICE_NOT_FOUND, devname);
+return -1;
+}
+
+if ((bps == -1)  (bps_rd == -1)  (bps_wr == -1)
+ (iops == -1)  (iops_rd == -1)  (iops_wr == -1)) {
+qerror_report(QERR_MISSING_PARAMETER,
+  bps/bps_rd/bps_wr/iops/iops_rd/iops_wr);
+return -1;
+}
+
+if (((bps != -1)  ((bps_rd != -1) || (bps_wr != -1)))
+|| ((iops != -1)  ((iops_rd != -1) || (iops_wr != -1 {
+qerror_report(QERR_INVALID_PARAMETER_COMBINATION);
+return -1;
+}
+
+if (bps != -1) {
+bs-io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = bps;
+bs-io_limits.bps[BLOCK_IO_LIMIT_READ]  = 0;
+bs-io_limits.bps[BLOCK_IO_LIMIT_WRITE] = 0;
+}
+
+if ((bps_rd != -1) || (bps_wr != -1)) {
+bs-io_limits.bps[BLOCK_IO_LIMIT_READ]   =
+   (bps_rd == -1) ? bs-io_limits.bps[BLOCK_IO_LIMIT_READ] : bps_rd;
+bs-io_limits.bps[BLOCK_IO_LIMIT_WRITE]  =
+   (bps_wr == -1) ? bs-io_limits.bps[BLOCK_IO_LIMIT_WRITE] : bps_wr;
+

[PATCH v8 0/4] The intro of QEMU block I/O throttling

2011-09-07 Thread Zhi Yong Wu
The main goal of the patch is to effectively cap the disk I/O speed or counts
of one single VM. It is only a draft, so it unavoidably has some drawbacks; if
you catch them, please let me know.

The patch will mainly introduce one block I/O throttling algorithm, one timer 
and one block queue for each I/O limits enabled drive.

When a block request comes in, the throttling algorithm will check whether its
I/O rate or count exceeds the limits; if yes, the request will be enqueued in the
block queue, and the timer will handle the I/O requests in it.

Some available features follow as below:
(1) global bps limit.
   -drive bps=xxxin bytes/s
(2) only read bps limit
   -drive bps_rd=xxx in bytes/s
(3) only write bps limit
   -drive bps_wr=xxx in bytes/s
(4) global iops limit
   -drive iops=xxx   in ios/s
(5) only read iops limit
   -drive iops_rd=xxxin ios/s
(6) only write iops limit
   -drive iops_wr=xxxin ios/s
(7) the combination of some limits.
   -drive bps=xxx,iops=xxx

Known Limitations:
(1) #1 can not coexist with #2, #3
(2) #4 can not coexist with #5, #6
(3) When bps/iops limits are specified to a small value such as 511 bytes/s, 
this VM will hang up. We are considering how to handle this scenario.

Changes since code V7:
  fix the build per patch based on stefan's comments.

Zhi Yong Wu (4):
  block: add the command line support
  block: add the block queue support
  block: add block timer and throttling algorithm
  qmp/hmp: add block_set_io_throttle

 v7: Mainly simply the block queue.
 Adjust codes based on stefan's comments.

 v6: Mainly fix the aio callback issue for block queue.
 Adjust codes based on Ram Pai's comments.

 v5: add qmp/hmp support.
 Adjust the codes based on stefan's comments
 qmp/hmp: add block_set_io_throttle

 v4: fix memory leaking based on ryan's feedback.

 v3: Added the code for extending slice time, and modified the method to 
compute wait time for the timer.

 v2: The codes V2 for QEMU disk I/O limits.
 Modified the codes mainly based on stefan's comments.

 v1: Submit the codes for QEMU disk I/O limits.
 Only a code draft.


 Makefile.objs |2 +-
 block.c   |  331 +++--
 block.h   |6 +-
 block/blk-queue.c |  184 +
 block/blk-queue.h |   59 ++
 block_int.h   |   30 +
 blockdev.c|   98 
 blockdev.h|2 +
 hmp-commands.hx   |   15 +++
 qemu-config.c |   24 
 qemu-options.hx   |1 +
 qerror.c  |4 +
 qerror.h  |3 +
 qmp-commands.hx   |   52 -
 14 files changed, 796 insertions(+), 15 deletions(-)
 create mode 100644 block/blk-queue.c
 create mode 100644 block/blk-queue.h

-- 
1.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 3/5] VFIO: Base framework for new VFIO driver

2011-09-07 Thread Konrad Rzeszutek Wilk
 +static long vfio_iommu_unl_ioctl(struct file *filep,
 +  unsigned int cmd, unsigned long arg)
 +{
 + struct vfio_iommu *viommu = filep-private_data;
 + struct vfio_dma_map dm;
 + int ret = -ENOSYS;
 +
 + switch (cmd) {
 + case VFIO_IOMMU_MAP_DMA:
 + if (copy_from_user(dm, (void __user *)arg, sizeof dm))
 + return -EFAULT;
 + ret = 0; // XXX - Do something

chuckles

 + if (!ret  copy_to_user((void __user *)arg, dm, sizeof dm))
 + ret = -EFAULT;
 + break;
 +
 + case VFIO_IOMMU_UNMAP_DMA:
 + if (copy_from_user(dm, (void __user *)arg, sizeof dm))
 + return -EFAULT;
 + ret = 0; // XXX - Do something
 + if (!ret  copy_to_user((void __user *)arg, dm, sizeof dm))
 + ret = -EFAULT;
 + break;
 + }
 + return ret;
 +}
 +
 +#ifdef CONFIG_COMPAT
 +static long vfio_iommu_compat_ioctl(struct file *filep,
 + unsigned int cmd, unsigned long arg)
 +{
 + arg = (unsigned long)compat_ptr(arg);
 + return vfio_iommu_unl_ioctl(filep, cmd, arg);
 +}
 +#endif   /* CONFIG_COMPAT */
 +
 +const struct file_operations vfio_iommu_fops = {
 + .owner  = THIS_MODULE,
 + .release= vfio_iommu_release,
 + .unlocked_ioctl = vfio_iommu_unl_ioctl,
 +#ifdef CONFIG_COMPAT
 + .compat_ioctl   = vfio_iommu_compat_ioctl,
 +#endif
 +};
 diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
.. snip..
 +int vfio_group_add_dev(struct device *dev, void *data)
 +{
 + struct vfio_device_ops *ops = data;
 + struct list_head *pos;
 + struct vfio_group *vgroup = NULL;
 + struct vfio_device *vdev = NULL;
 + unsigned int group;
 + int ret = 0, new_group = 0;

'new_group' should probably be 'bool'.

 +
 + if (iommu_device_group(dev, group))
 + return 0;

-EEXIST?

 +
 + mutex_lock(vfio.group_lock);
 +
 + list_for_each(pos, vfio.group_list) {
 + vgroup = list_entry(pos, struct vfio_group, next);
 + if (vgroup-group == group)
 + break;
 + vgroup = NULL;
 + }
 +
 + if (!vgroup) {
 + int id;
 +
 + if (unlikely(idr_pre_get(vfio.idr, GFP_KERNEL) == 0)) {
 + ret = -ENOMEM;
 + goto out;
 + }
 + vgroup = kzalloc(sizeof(*vgroup), GFP_KERNEL);
 + if (!vgroup) {
 + ret = -ENOMEM;
 + goto out;
 + }
 +
 + vgroup-group = group;
 + INIT_LIST_HEAD(vgroup-device_list);
 +
 + ret = idr_get_new(vfio.idr, vgroup, id);
 + if (ret == 0  id  MINORMASK) {
 + idr_remove(vfio.idr, id);
 + kfree(vgroup);
 + ret = -ENOSPC;
 + goto out;
 + }
 +
 + vgroup-devt = MKDEV(MAJOR(vfio.devt), id);
 + list_add(vgroup-next, vfio.group_list);
 + device_create(vfio.class, NULL, vgroup-devt,
 +   vgroup, %u, group);
 +
 + new_group = 1;
 + } else {
 + list_for_each(pos, vgroup-device_list) {
 + vdev = list_entry(pos, struct vfio_device, next);
 + if (vdev-dev == dev)
 + break;
 + vdev = NULL;
 + }
 + }
 +
 + if (!vdev) {
 + /* Adding a device for a group that's already in use? */
 + /* Maybe we should attach to the domain so others can't */
 + BUG_ON(vgroup-container 
 +vgroup-container-iommu 
 +vgroup-container-iommu-refcnt);
 +
 + vdev = ops-new(dev);
 + if (IS_ERR(vdev)) {
 + /* If we just created this vgroup, tear it down */
 + if (new_group) {
 + device_destroy(vfio.class, vgroup-devt);
 + idr_remove(vfio.idr, MINOR(vgroup-devt));
 + list_del(vgroup-next);
 + kfree(vgroup);
 + }
 + ret = PTR_ERR(vdev);
 + goto out;
 + }
 + list_add(vdev-next, vgroup-device_list);
 + vdev-dev = dev;
 + vdev-ops = ops;
 + vdev-vfio = vfio;
 + }
 +out:
 + mutex_unlock(vfio.group_lock);
 + return ret;
 +}
 +
 +void vfio_group_del_dev(struct device *dev)
 +{
 + struct list_head *pos;
 + struct vfio_container *vcontainer;
 + struct vfio_group *vgroup = NULL;
 + struct vfio_device *vdev = NULL;
 + unsigned int group;
 +
 + if (iommu_device_group(dev, group))
 + return;
 +
 + 

Re: [RFC PATCH]vhost-blk: In-kernel accelerator for virtio block device

2011-09-07 Thread Liu Yuan

On 08/15/2011 12:17 PM, Badari Pulavarty wrote:

On 8/14/2011 8:20 PM, Liu Yuan wrote:

On 08/13/2011 12:12 AM, Badari Pulavarty wrote:

On 8/12/2011 4:40 AM, Liu Yuan wrote:

On 08/12/2011 04:27 PM, Liu Yuan wrote:

On 08/12/2011 12:50 PM, Badari Pulavarty wrote:

On 8/10/2011 8:19 PM, Liu Yuan wrote:

On 08/11/2011 11:01 AM, Liu Yuan wrote:


It looks like the patch wouldn't work for testing multiple 
devices.


vhost_blk_open() does
+   used_info_cachep = KMEM_CACHE(used_info, 
SLAB_HWCACHE_ALIGN |

SLAB_PANIC);



This is weird. How do you open multiple devices? I just opened 
the devices with the following command:


-drive file=/dev/sda6,if=virtio,cache=none,aio=native -drive 
file=~/data0.img,if=virtio,cache=none,aio=native -drive 
file=~/data1.img,if=virtio,cache=none,aio=native


And I didn't meet any problem.

this would tell qemu to open three devices, and pass three FDs 
to three instances of vhost_blk module.

So KMEM_CACHE() is okay in vhost_blk_open().



Oh, you are right. KMEM_CACHE() is in the wrong place. There are 
three instances of vhost worker threads created. Hmmm, but I didn't 
run into any problem when opening it and running it. So strange. 
I'll go figure it out.


When opening second device, we get panic since 
used_info_cachep is

already created. Just to make progress I moved this call to
vhost_blk_init().

I don't see any host panics now. With single block device (dd),
it seems to work fine. But when I start testing multiple block
devices I quickly run into hangs in the guest. I see following
messages in the guest from virtio_ring.c:

virtio_blk virtio2: requests: id 0 is not a head !
virtio_blk virtio1: requests: id 0 is not a head !
virtio_blk virtio4: requests: id 1 is not a head !
virtio_blk virtio3: requests: id 39 is not a head !

Thanks,
Badari




vq->data[] is initialized by the guest virtio-blk driver and 
vhost_blk is unaware of it. It looks like the used ID passed
over by vhost_blk to the guest virtio_blk is wrong, but it should 
not happen. :|


And I can't reproduce this on my laptop. :(


Finally, found the issue  :)

Culprit is:

+static struct io_event events[MAX_EVENTS];

With multiple devices, multiple threads could be executing 
handle_completion() (one for
each fd) at the same time. events array is global :( Need to 
make it one per device/fd.


For test, I changed MAX_EVENTS to 32 and moved events array to 
be local (stack)

to handle_completion(). Tests are running fine.

Your laptop must have a single processor, hence you have only one 
thread executing handle_completion() at any time.

Thanks,
Badari


Good catch, this is rather cool! Yup, I developed it mostly in a 
nested KVM environment, and the L2 host only runs a single 
processor :(


Thanks,
Yuan
By the way, MAX_EVENTS should be 128, as much as guest virtio_blk 
driver can batch-submit,

causing array overflow.
I have had turned on the debug, and had seen as much as over 100 
requests batched from guest OS.




Hmm.. I am not sure why you see over 100 outstanding events per fd.  
Max events could be as high as the number of outstanding IOs.

Anyway, instead of putting it on stack, I kmalloced it now.

Dongsu Park, Here is the complete patch.

Thanks
Badari


On a physical machine, there is a queue depth posted by the block 
device driver to limit the number of pending requests; normally it is 31. 
But the virtio driver doesn't post it in the guest OS.

So nothing prevents the OS from batch-submitting more than 31 requests.

I have noticed over 100 pending requests during guest OS 
initialization, and it is reproducible.


BTW, how is perf number for vhost-blk in your environment?


Right now I am doing dd tests to test out the functionality and 
stability.


I plan to collect FFSB benchmark results across 6-virtio-blk/vhost-blk 
disks with
all profiles - seq read, seq write, random read, random write with 
blocksizes varying

from 4k to 1MB.

I will start the test tomorrow. It will take few days to run thru all 
the scenarios.
I don't have an easy way to collect host CPU consumption - but for now 
lets

focus on throughput and latency. I will share the results in few days.

Thanks
Badari



Hi Badari,
how is test going?

Thanks,
Yuan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] pci: Rework config space blocking services

2011-09-07 Thread Brian King
Here is how I would prefer to rework ipr. 

Thanks,

Brian

-- 
Brian King
Linux on Power Virtualization
IBM Linux Technology Center



The PCI config space blocking API has changed to better
allow for multiple users. Update ipr to use the new API.

Signed-off-by: Brian King brk...@linux.vnet.ibm.com
---

 drivers/scsi/ipr.c |   66 +++--
 drivers/scsi/ipr.h |1 
 2 files changed, 60 insertions(+), 7 deletions(-)

diff -puN drivers/scsi/ipr.c~ipr_new_pci_block drivers/scsi/ipr.c
--- linux-2.6/drivers/scsi/ipr.c~ipr_new_pci_block  2011-09-06 
16:52:47.0 -0500
+++ linux-2.6-bjking1/drivers/scsi/ipr.c2011-09-07 08:34:54.0 
-0500
@@ -7639,8 +7639,12 @@ static int ipr_reset_restore_cfg_space(s
  **/
 static int ipr_reset_bist_done(struct ipr_cmnd *ipr_cmd)
 {
+   struct ipr_ioa_cfg *ioa_cfg = ipr_cmd-ioa_cfg;
+
ENTER;
-   pci_unblock_user_cfg_access(ipr_cmd-ioa_cfg-pdev);
+   if (ioa_cfg-ucfg_blocked)
+   pci_unblock_cfg_access(ioa_cfg-pdev);
+   ioa_cfg-ucfg_blocked = 0;
ipr_cmd-job_step = ipr_reset_restore_cfg_space;
LEAVE;
return IPR_RC_JOB_CONTINUE;
@@ -7661,8 +7665,6 @@ static int ipr_reset_start_bist(struct i
int rc = PCIBIOS_SUCCESSFUL;
 
ENTER;
-   pci_block_user_cfg_access(ioa_cfg-pdev);
-
if (ioa_cfg-ipr_chip-bist_method == IPR_MMIO)
writel(IPR_UPROCI_SIS64_START_BIST,
   ioa_cfg-regs.set_uproc_interrupt_reg32);
@@ -7674,7 +7676,9 @@ static int ipr_reset_start_bist(struct i
ipr_reset_start_timer(ipr_cmd, IPR_WAIT_FOR_BIST_TIMEOUT);
rc = IPR_RC_JOB_RETURN;
} else {
-   pci_unblock_user_cfg_access(ipr_cmd-ioa_cfg-pdev);
+   if (ioa_cfg-ucfg_blocked)
+   pci_unblock_cfg_access(ipr_cmd-ioa_cfg-pdev);
+   ioa_cfg-ucfg_blocked = 0;
ipr_cmd-s.ioasa.hdr.ioasc = 
cpu_to_be32(IPR_IOASC_PCI_ACCESS_ERROR);
rc = IPR_RC_JOB_CONTINUE;
}
@@ -7717,7 +7721,6 @@ static int ipr_reset_slot_reset(struct i
struct pci_dev *pdev = ioa_cfg-pdev;
 
ENTER;
-   pci_block_user_cfg_access(pdev);
pci_set_pcie_reset_state(pdev, pcie_warm_reset);
ipr_cmd-job_step = ipr_reset_slot_reset_done;
ipr_reset_start_timer(ipr_cmd, IPR_PCI_RESET_TIMEOUT);
@@ -7726,6 +7729,55 @@ static int ipr_reset_slot_reset(struct i
 }
 
 /**
+ * ipr_reset_block_config_access_wait - Wait for permission to block config 
access
+ * @ipr_cmd:   ipr command struct
+ *
+ * Description: This attempts to block config access to the IOA.
+ *
+ * Return value:
+ * IPR_RC_JOB_CONTINUE / IPR_RC_JOB_RETURN
+ **/
+static int ipr_reset_block_config_access_wait(struct ipr_cmnd *ipr_cmd)
+{
+   struct ipr_ioa_cfg *ioa_cfg = ipr_cmd-ioa_cfg;
+   int rc = IPR_RC_JOB_CONTINUE;
+
+   if (pci_block_cfg_access_in_atomic(ioa_cfg-pdev)) {
+   if (ipr_cmd-u.time_left) {
+   rc = IPR_RC_JOB_RETURN;
+   ipr_cmd-u.time_left -= IPR_CHECK_FOR_RESET_TIMEOUT;
+   ipr_reset_start_timer(ipr_cmd, 
IPR_CHECK_FOR_RESET_TIMEOUT);
+   } else {
+   ipr_cmd-job_step = ioa_cfg-reset;
+   dev_err(ioa_cfg-pdev-dev,
+   Timed out waiting to block config access. 
Resetting anyway.\n);
+   }
+   } else {
+   ioa_cfg-ucfg_blocked = 1;
+   ipr_cmd-job_step = ioa_cfg-reset;
+   }
+
+   return rc;
+}
+
+/**
+ * ipr_reset_block_config_access - Block config access to the IOA
+ * @ipr_cmd:   ipr command struct
+ *
+ * Description: This attempts to block config access to the IOA
+ *
+ * Return value:
+ * IPR_RC_JOB_CONTINUE
+ **/
+static int ipr_reset_block_config_access(struct ipr_cmnd *ipr_cmd)
+{
+   ipr_cmd-ioa_cfg-ucfg_blocked = 0;
+   ipr_cmd-job_step = ipr_reset_block_config_access_wait;
+   ipr_cmd-u.time_left = IPR_WAIT_FOR_RESET_TIMEOUT;
+   return IPR_RC_JOB_CONTINUE;
+}
+
+/**
  * ipr_reset_allowed - Query whether or not IOA can be reset
  * @ioa_cfg:   ioa config struct
  *
@@ -7764,7 +7816,7 @@ static int ipr_reset_wait_to_start_bist(
ipr_cmd-u.time_left -= IPR_CHECK_FOR_RESET_TIMEOUT;
ipr_reset_start_timer(ipr_cmd, IPR_CHECK_FOR_RESET_TIMEOUT);
} else {
-   ipr_cmd-job_step = ioa_cfg-reset;
+   ipr_cmd-job_step = ipr_reset_block_config_access;
rc = IPR_RC_JOB_CONTINUE;
}
 
@@ -7797,7 +7849,7 @@ static int ipr_reset_alert(struct ipr_cm
writel(IPR_UPROCI_RESET_ALERT, 
ioa_cfg-regs.set_uproc_interrupt_reg32);
ipr_cmd-job_step = ipr_reset_wait_to_start_bist;
} else {
-   ipr_cmd-job_step = ioa_cfg-reset;
+   ipr_cmd-job_step = 

[PATCH v8 2/4] block: add the command line support

2011-09-07 Thread Zhi Yong Wu
Signed-off-by: Zhi Yong Wu wu...@linux.vnet.ibm.com
---
 block.c |   59 +++
 block.h |5 
 block_int.h |3 ++
 blockdev.c  |   29 +++
 qemu-config.c   |   24 ++
 qemu-options.hx |1 +
 6 files changed, 121 insertions(+), 0 deletions(-)

diff --git a/block.c b/block.c
index 43742b7..cd75183 100644
--- a/block.c
+++ b/block.c
@@ -104,6 +104,57 @@ int is_windows_drive(const char *filename)
 }
 #endif
 
+/* throttling disk I/O limits */
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+bs-io_limits_enabled = false;
+
+if (bs-block_queue) {
+qemu_block_queue_flush(bs-block_queue);
+qemu_del_block_queue(bs-block_queue);
+bs-block_queue = NULL;
+}
+
+if (bs-block_timer) {
+qemu_del_timer(bs-block_timer);
+qemu_free_timer(bs-block_timer);
+bs-block_timer = NULL;
+}
+
+bs-slice_start = 0;
+
+bs-slice_end   = 0;
+}
+
+static void bdrv_block_timer(void *opaque)
+{
+BlockDriverState *bs = opaque;
+BlockQueue *queue= bs-block_queue;
+
+qemu_block_queue_flush(queue);
+}
+
+void bdrv_io_limits_enable(BlockDriverState *bs)
+{
+bs-block_queue = qemu_new_block_queue();
+bs-block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
+
+bs-slice_start = qemu_get_clock_ns(vm_clock);
+
+bs-slice_end   = bs-slice_start + BLOCK_IO_SLICE_TIME;
+}
+
+bool bdrv_io_limits_enabled(BlockDriverState *bs)
+{
+BlockIOLimit *io_limits = bs-io_limits;
+return io_limits-bps[BLOCK_IO_LIMIT_READ]
+ || io_limits-bps[BLOCK_IO_LIMIT_WRITE]
+ || io_limits-bps[BLOCK_IO_LIMIT_TOTAL]
+ || io_limits-iops[BLOCK_IO_LIMIT_READ]
+ || io_limits-iops[BLOCK_IO_LIMIT_WRITE]
+ || io_limits-iops[BLOCK_IO_LIMIT_TOTAL];
+}
+
 /* check if the path starts with protocol: */
 static int path_has_protocol(const char *path)
 {
@@ -1453,6 +1504,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs,
 *psecs = bs-secs;
 }
 
+/* throttling disk io limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+BlockIOLimit *io_limits)
+{
+bs-io_limits = *io_limits;
+bs-io_limits_enabled = bdrv_io_limits_enabled(bs);
+}
+
 /* Recognize floppy formats */
 typedef struct FDFormat {
 FDriveType drive;
diff --git a/block.h b/block.h
index 3ac0b94..a3e69db 100644
--- a/block.h
+++ b/block.h
@@ -58,6 +58,11 @@ void bdrv_info(Monitor *mon, QObject **ret_data);
 void bdrv_stats_print(Monitor *mon, const QObject *data);
 void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 
+/* disk I/O throttling */
+void bdrv_io_limits_enable(BlockDriverState *bs);
+void bdrv_io_limits_disable(BlockDriverState *bs);
+bool bdrv_io_limits_enabled(BlockDriverState *bs);
+
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
 BlockDriver *bdrv_find_protocol(const char *filename);
diff --git a/block_int.h b/block_int.h
index 201e635..368c776 100644
--- a/block_int.h
+++ b/block_int.h
@@ -257,6 +257,9 @@ void qemu_aio_release(void *p);
 
 void *qemu_blockalign(BlockDriverState *bs, size_t size);
 
+void bdrv_set_io_limits(BlockDriverState *bs,
+BlockIOLimit *io_limits);
+
 #ifdef _WIN32
 int is_windows_drive(const char *filename);
 #endif
diff --git a/blockdev.c b/blockdev.c
index 2602591..619ae9f 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -236,6 +236,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
 int on_read_error, on_write_error;
 const char *devaddr;
 DriveInfo *dinfo;
+BlockIOLimit io_limits;
 int snapshot = 0;
 int ret;
 
@@ -354,6 +355,31 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
 }
 }
 
+/* disk I/O throttling */
+io_limits.bps[BLOCK_IO_LIMIT_TOTAL]  =
+   qemu_opt_get_number(opts, bps, 0);
+io_limits.bps[BLOCK_IO_LIMIT_READ]   =
+   qemu_opt_get_number(opts, bps_rd, 0);
+io_limits.bps[BLOCK_IO_LIMIT_WRITE]  =
+   qemu_opt_get_number(opts, bps_wr, 0);
+io_limits.iops[BLOCK_IO_LIMIT_TOTAL] =
+   qemu_opt_get_number(opts, iops, 0);
+io_limits.iops[BLOCK_IO_LIMIT_READ]  =
+   qemu_opt_get_number(opts, iops_rd, 0);
+io_limits.iops[BLOCK_IO_LIMIT_WRITE] =
+   qemu_opt_get_number(opts, iops_wr, 0);
+
+if (((io_limits.bps[BLOCK_IO_LIMIT_TOTAL] != 0)
+ ((io_limits.bps[BLOCK_IO_LIMIT_READ] != 0)
+|| (io_limits.bps[BLOCK_IO_LIMIT_WRITE] != 0)))
+|| ((io_limits.iops[BLOCK_IO_LIMIT_TOTAL] != 0)
+ ((io_limits.iops[BLOCK_IO_LIMIT_READ] != 0)
+|| (io_limits.iops[BLOCK_IO_LIMIT_WRITE] != 0 {
+error_report(bps(iops) and bps_rd/bps_wr(iops_rd/iops_wr)
+ cannot be used at the same time);
+return 

[PATCH v8 3/4] block: add block timer and throttling algorithm

2011-09-07 Thread Zhi Yong Wu
Note:
 1.) When bps/iops limits are specified to a small value such as 511 
bytes/s, this VM will hang up. We are considering how to handle this scenario.
 2.) When the dd command is issued in the guest, if its option bs is set to a 
large value such as bs=1024K, the resulting speed will be slightly bigger than 
the limits.

For these problems, if you have a nice thought, please let us know. :)

Signed-off-by: Zhi Yong Wu wu...@linux.vnet.ibm.com
---
 block.c |  246 ---
 block.h |1 -
 2 files changed, 236 insertions(+), 11 deletions(-)

diff --git a/block.c b/block.c
index cd75183..8a82273 100644
--- a/block.c
+++ b/block.c
@@ -30,6 +30,9 @@
 #include qemu-objects.h
 #include qemu-coroutine.h
 
+#include qemu-timer.h
+#include block/blk-queue.h
+
 #ifdef CONFIG_BSD
 #include sys/types.h
 #include sys/stat.h
@@ -72,6 +75,13 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState 
*bs,
  QEMUIOVector *iov);
 static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+bool is_write, int64_t *wait);
+
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
 QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -745,6 +755,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, 
int flags,
 bs-change_cb(bs-change_opaque, CHANGE_MEDIA);
 }
 
+/* throttling disk I/O limits */
+if (bs-io_limits_enabled) {
+bdrv_io_limits_enable(bs);
+}
+
 return 0;
 
 unlink_and_fail:
@@ -783,6 +798,18 @@ void bdrv_close(BlockDriverState *bs)
 if (bs-change_cb)
 bs-change_cb(bs-change_opaque, CHANGE_MEDIA);
 }
+
+/* throttling disk I/O limits */
+if (bs-block_queue) {
+qemu_del_block_queue(bs-block_queue);
+bs-block_queue = NULL;
+}
+
+if (bs-block_timer) {
+qemu_del_timer(bs-block_timer);
+qemu_free_timer(bs-block_timer);
+bs-block_timer = NULL;
+}
 }
 
 void bdrv_close_all(void)
@@ -2341,16 +2368,40 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, 
int64_t sector_num,
  BlockDriverCompletionFunc *cb, void *opaque)
 {
 BlockDriver *drv = bs-drv;
+BlockDriverAIOCB *ret;
+int64_t wait_time = -1;
 
 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
-if (!drv)
-return NULL;
-if (bdrv_check_request(bs, sector_num, nb_sectors))
+if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
 return NULL;
+}
 
-return drv-bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
+/* throttling disk read I/O */
+if (bs-io_limits_enabled) {
+if (bdrv_exceed_io_limits(bs, nb_sectors, false, wait_time)) {
+ret = qemu_block_queue_enqueue(bs-block_queue, bs, bdrv_aio_readv,
+   sector_num, qiov, nb_sectors, cb, opaque);
+if (wait_time != -1) {
+qemu_mod_timer(bs-block_timer,
+   wait_time + qemu_get_clock_ns(vm_clock));
+}
+
+return ret;
+}
+}
+
+ret =  drv-bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
cb, opaque);
+if (ret) {
+if (bs-io_limits_enabled) {
+bs-io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
+  (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+bs-io_disps.ios[BLOCK_IO_LIMIT_READ]++;
+}
+}
+
+return ret;
 }
 
 typedef struct BlockCompleteData {
@@ -2396,15 +2447,14 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, 
int64_t sector_num,
 BlockDriver *drv = bs-drv;
 BlockDriverAIOCB *ret;
 BlockCompleteData *blk_cb_data;
+int64_t wait_time = -1;
 
 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
 
-if (!drv)
-return NULL;
-if (bs-read_only)
-return NULL;
-if (bdrv_check_request(bs, sector_num, nb_sectors))
+if (!drv || bs-read_only
+|| bdrv_check_request(bs, sector_num, nb_sectors)) {
 return NULL;
+}
 
 if (bs-dirty_bitmap) {
 blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
@@ -2413,13 +2463,32 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, 
int64_t sector_num,
 opaque = blk_cb_data;
 }
 
+/* throttling disk write I/O */
+if (bs-io_limits_enabled) {
+if (bdrv_exceed_io_limits(bs, nb_sectors, true, wait_time)) {
+ret = qemu_block_queue_enqueue(bs-block_queue, bs, 
bdrv_aio_writev,
+  sector_num, qiov, nb_sectors, cb, opaque);
+if 

Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-07 Thread Avi Kivity

On 09/07/2011 06:56 PM, Don Zickus wrote:


  And hope that no other NMI was generated while we're handling this
  one.  It's a little... fragile?

No.  If another NMI is generated while we are processing the current one
it should get latched.  Upon completion of the current one, the cpu should
jump right back into the nmi exception routine again.  The only downside
is when multiple NMIs come in during the processing of the current one.
Only one can be latched, so the others get dropped.


Ah, yes, I remember now.


But we are addressing
that.



May I ask how?  Detecting a back-to-back NMI?

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 1/4] block: add the block queue support

2011-09-07 Thread Zhi Yong Wu
Signed-off-by: Zhi Yong Wu wu...@linux.vnet.ibm.com
---
 Makefile.objs |2 +-
 block/blk-queue.c |  184 +
 block/blk-queue.h |   59 +
 block_int.h   |   27 
 4 files changed, 271 insertions(+), 1 deletions(-)
 create mode 100644 block/blk-queue.c
 create mode 100644 block/blk-queue.h

diff --git a/Makefile.objs b/Makefile.objs
index 26b885b..5dcf456 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -33,7 +33,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o 
dmg.o bochs.o vpc.o vv
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o 
qcow2-cache.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o 
blk-queue.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block/blk-queue.c b/block/blk-queue.c
new file mode 100644
index 000..da01fcb
--- /dev/null
+++ b/block/blk-queue.c
@@ -0,0 +1,184 @@
+/*
+ * QEMU System Emulator queue definition for block layer
+ *
+ * Copyright (c) IBM, Corp. 2011
+ *
+ * Authors:
+ *  Zhi Yong Wu  wu...@linux.vnet.ibm.com
+ *  Stefan Hajnoczi stefa...@linux.vnet.ibm.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include block_int.h
+#include block/blk-queue.h
+#include qemu-common.h
+
+/* The APIs for block request queue on qemu block layer.
+ */
+
+struct BlockQueueAIOCB {
+BlockDriverAIOCB common;
+QTAILQ_ENTRY(BlockQueueAIOCB) entry;
+BlockRequestHandler *handler;
+BlockDriverAIOCB *real_acb;
+
+int64_t sector_num;
+QEMUIOVector *qiov;
+int nb_sectors;
+};
+
+typedef struct BlockQueueAIOCB BlockQueueAIOCB;
+
+struct BlockQueue {
+QTAILQ_HEAD(requests, BlockQueueAIOCB) requests;
+bool flushing;
+};
+
+static void qemu_block_queue_dequeue(BlockQueue *queue,
+ BlockQueueAIOCB *request)
+{
+BlockQueueAIOCB *req;
+
+assert(queue);
+while (!QTAILQ_EMPTY(queue-requests)) {
+req = QTAILQ_FIRST(queue-requests);
+if (req == request) {
+QTAILQ_REMOVE(queue-requests, req, entry);
+break;
+}
+}
+}
+
+static void qemu_block_queue_cancel(BlockDriverAIOCB *acb)
+{
+BlockQueueAIOCB *request = container_of(acb, BlockQueueAIOCB, common);
+if (request-real_acb) {
+bdrv_aio_cancel(request-real_acb);
+} else {
+assert(request-common.bs-block_queue);
+qemu_block_queue_dequeue(request-common.bs-block_queue,
+ request);
+}
+
+qemu_aio_release(request);
+}
+
+static AIOPool block_queue_pool = {
+.aiocb_size = sizeof(struct BlockQueueAIOCB),
+.cancel = qemu_block_queue_cancel,
+};
+
+BlockQueue *qemu_new_block_queue(void)
+{
+BlockQueue *queue;
+
+queue = g_malloc0(sizeof(BlockQueue));
+
+QTAILQ_INIT(queue-requests);
+
+queue-flushing = false;
+
+return queue;
+}
+
+void qemu_del_block_queue(BlockQueue *queue)
+{
+BlockQueueAIOCB *request, *next;
+
+QTAILQ_FOREACH_SAFE(request, queue-requests, entry, next) {
+QTAILQ_REMOVE(queue-requests, request, entry);
+qemu_aio_release(request);
+}
+
+g_free(queue);
+}
+
+BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue,
+BlockDriverState *bs,
+BlockRequestHandler *handler,
+int64_t sector_num,
+QEMUIOVector *qiov,
+int nb_sectors,
+BlockDriverCompletionFunc *cb,
+void *opaque)
+{
+BlockDriverAIOCB *acb;
+BlockQueueAIOCB *request;
+
+if 

[PATCH 0/10] IOMMU: Make iommu_ops per-bus_type

2011-09-07 Thread Joerg Roedel
Hi,

here is the new version of the patch-set to make the iommu_ops used in
the iommu-api a bus_type property. This will allow us to move code out
of the iommu drivers into generic code, and it simplifies the
implementation of Alex's device-group property.

Greg, can you have a look at patch 2 please and tell me if you have any
objections?

With this version the patch-set is complete (unlike the first RFC post).
It converts all iommu drivers to use the new registration interface and
completely removes the register_iommu interface.

Regards,

Joerg

Diffstat:

 arch/ia64/kvm/kvm-ia64.c   |3 +-
 arch/x86/kvm/x86.c |3 +-
 drivers/base/bus.c |   16 ++
 drivers/iommu/amd_iommu.c  |2 +-
 drivers/iommu/intel-iommu.c|2 +-
 drivers/iommu/iommu.c  |   58 
 drivers/iommu/msm_iommu.c  |2 +-
 drivers/iommu/omap-iommu.c |2 +-
 drivers/media/video/omap3isp/isp.c |2 +-
 include/linux/device.h |9 +
 include/linux/iommu.h  |   21 +++--
 virt/kvm/iommu.c   |4 +-
 12 files changed, 86 insertions(+), 38 deletions(-)


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/10] iommu/core: Convert iommu_found to iommu_present

2011-09-07 Thread Joerg Roedel
With per-bus iommu_ops the iommu_found function needs to
work on a bus_type too. This patch adds a bus_type parameter
to that function and converts all call sites.
The function is also renamed to iommu_present because the
function now checks if an iommu is present for a given bus
and does not check for a global iommu anymore.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
---
 arch/ia64/kvm/kvm-ia64.c |3 ++-
 arch/x86/kvm/x86.c   |3 ++-
 drivers/iommu/iommu.c|9 ++---
 include/linux/iommu.h|4 ++--
 virt/kvm/iommu.c |2 +-
 5 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 8213efe..43f4c92 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -33,6 +33,7 @@
 #include linux/uaccess.h
 #include linux/iommu.h
 #include linux/intel-iommu.h
+#include linux/pci.h
 
 #include asm/pgtable.h
 #include asm/gcc_intrin.h
@@ -204,7 +205,7 @@ int kvm_dev_ioctl_check_extension(long ext)
r = KVM_COALESCED_MMIO_PAGE_OFFSET;
break;
case KVM_CAP_IOMMU:
-   r = iommu_found();
+   r = iommu_present(pci_bus_type);
break;
default:
r = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84a28ea..73c6a42 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -44,6 +44,7 @@
 #include linux/perf_event.h
 #include linux/uaccess.h
 #include linux/hash.h
+#include linux/pci.h
 #include trace/events/kvm.h
 
 #define CREATE_TRACE_POINTS
@@ -2095,7 +2096,7 @@ int kvm_dev_ioctl_check_extension(long ext)
r = 0;
break;
case KVM_CAP_IOMMU:
-   r = iommu_found();
+   r = iommu_present(pci_bus_type);
break;
case KVM_CAP_MCE:
r = KVM_MAX_MCE_BANKS;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index adaee9b..2270127 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -39,11 +39,14 @@ void iommu_bus_init(struct bus_type *bus, struct iommu_ops 
*ops)
 {
 }
 
-bool iommu_found(void)
+bool iommu_present(struct bus_type *bus)
 {
-   return iommu_ops != NULL;
+   if (bus-iommu_ops != NULL)
+   return true;
+   else
+   return iommu_ops != NULL;
 }
-EXPORT_SYMBOL_GPL(iommu_found);
+EXPORT_SYMBOL_GPL(iommu_present);
 
 struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3bd6892..de73219 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -56,7 +56,7 @@ struct iommu_ops {
 
 extern void register_iommu(struct iommu_ops *ops);
 extern void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops);
-extern bool iommu_found(void);
+extern bool iommu_present(struct bus_type *bus);
 extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus);
 extern void iommu_domain_free(struct iommu_domain *domain);
 extern int iommu_attach_device(struct iommu_domain *domain,
@@ -76,7 +76,7 @@ extern int iommu_domain_has_cap(struct iommu_domain *domain,
 
 struct iommu_ops {};
 
-static inline bool iommu_found(void)
+static inline bool iommu_present(struct bus_type *bus)
 {
return false;
 }
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 20115b1..d149940 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -228,7 +228,7 @@ int kvm_iommu_map_guest(struct kvm *kvm)
 {
int r;
 
-   if (!iommu_found()) {
+   if (!iommu_present(pci_bus_type)) {
printk(KERN_ERR %s: iommu not found\n, __func__);
return -ENODEV;
}
-- 
1.7.4.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/10] iommu/core: Use bus-iommu_ops in the iommu-api

2011-09-07 Thread Joerg Roedel
Use the per-bus iommu-ops in the functions of the iommu-api
instead of the global iommu_ops.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
---
 drivers/iommu/iommu.c |   34 +++---
 1 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 2270127..f2ced4c 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -83,34 +83,48 @@ EXPORT_SYMBOL_GPL(iommu_domain_alloc);
 
 void iommu_domain_free(struct iommu_domain *domain)
 {
-   iommu_ops-domain_destroy(domain);
+   if (likely(domain-ops-domain_destroy != NULL))
+   domain-ops-domain_destroy(domain);
+
kfree(domain);
 }
 EXPORT_SYMBOL_GPL(iommu_domain_free);
 
 int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
 {
-   return iommu_ops-attach_dev(domain, dev);
+   if (unlikely(domain-ops-attach_dev == NULL))
+   return -ENODEV;
+
+   return domain-ops-attach_dev(domain, dev);
 }
 EXPORT_SYMBOL_GPL(iommu_attach_device);
 
 void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
 {
-   iommu_ops-detach_dev(domain, dev);
+   if (unlikely(domain-ops-detach_dev == NULL))
+   return;
+
+   domain-ops-detach_dev(domain, dev);
 }
 EXPORT_SYMBOL_GPL(iommu_detach_device);
 
 phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
   unsigned long iova)
 {
-   return iommu_ops-iova_to_phys(domain, iova);
+   if (unlikely(domain-ops-iova_to_phys == NULL))
+   return 0;
+
+   return domain-ops-iova_to_phys(domain, iova);
 }
 EXPORT_SYMBOL_GPL(iommu_iova_to_phys);
 
 int iommu_domain_has_cap(struct iommu_domain *domain,
 unsigned long cap)
 {
-   return iommu_ops-domain_has_cap(domain, cap);
+   if (unlikely(domain-ops-domain_has_cap == NULL))
+   return 0;
+
+   return domain-ops-domain_has_cap(domain, cap);
 }
 EXPORT_SYMBOL_GPL(iommu_domain_has_cap);
 
@@ -119,11 +133,14 @@ int iommu_map(struct iommu_domain *domain, unsigned long 
iova,
 {
size_t size;
 
+   if (unlikely(domain-ops-map == NULL))
+   return -ENODEV;
+
size = PAGE_SIZE  gfp_order;
 
BUG_ON(!IS_ALIGNED(iova | paddr, size));
 
-   return iommu_ops-map(domain, iova, paddr, gfp_order, prot);
+   return domain-ops-map(domain, iova, paddr, gfp_order, prot);
 }
 EXPORT_SYMBOL_GPL(iommu_map);
 
@@ -131,10 +148,13 @@ int iommu_unmap(struct iommu_domain *domain, unsigned 
long iova, int gfp_order)
 {
size_t size;
 
+   if (unlikely(domain-ops-unmap == NULL))
+   return -ENODEV;
+
size = PAGE_SIZE  gfp_order;
 
BUG_ON(!IS_ALIGNED(iova, size));
 
-   return iommu_ops-unmap(domain, iova, gfp_order);
+   return domain-ops-unmap(domain, iova, gfp_order);
 }
 EXPORT_SYMBOL_GPL(iommu_unmap);
-- 
1.7.4.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/10] iommu/core: Add bus_type parameter to iommu_domain_alloc

2011-09-07 Thread Joerg Roedel
This is necessary to store a pointer to the bus-specific
iommu_ops in the iommu-domain structure. It will be used
later to call into bus-specific iommu-ops.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
---
 drivers/iommu/iommu.c  |   14 +-
 drivers/media/video/omap3isp/isp.c |2 +-
 include/linux/iommu.h  |6 --
 virt/kvm/iommu.c   |2 +-
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3b24a5b..adaee9b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -16,6 +16,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
+#include linux/device.h
 #include linux/kernel.h
 #include linux/bug.h
 #include linux/types.h
@@ -44,15 +45,26 @@ bool iommu_found(void)
 }
 EXPORT_SYMBOL_GPL(iommu_found);
 
-struct iommu_domain *iommu_domain_alloc(void)
+struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
 {
struct iommu_domain *domain;
+   struct iommu_ops *ops;
int ret;
 
+   if (bus-iommu_ops)
+   ops = bus-iommu_ops;
+   else
+   ops = iommu_ops;
+
+   if (ops == NULL)
+   return NULL;
+
domain = kmalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
return NULL;
 
+   domain-ops = ops;
+
ret = iommu_ops-domain_init(domain);
if (ret)
goto out_free;
diff --git a/drivers/media/video/omap3isp/isp.c 
b/drivers/media/video/omap3isp/isp.c
index a4baa61..a7ed985 100644
--- a/drivers/media/video/omap3isp/isp.c
+++ b/drivers/media/video/omap3isp/isp.c
@@ -2141,7 +2141,7 @@ static int isp_probe(struct platform_device *pdev)
/* to be removed once iommu migration is complete */
isp-iommu = to_iommu(isp-iommu_dev);
 
-   isp-domain = iommu_domain_alloc();
+   isp-domain = iommu_domain_alloc(pdev-dev.bus);
if (!isp-domain) {
dev_err(isp-dev, can't alloc iommu domain\n);
ret = -ENOMEM;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 4739e36..3bd6892 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -25,10 +25,12 @@
 #define IOMMU_WRITE(2)
 #define IOMMU_CACHE(4) /* DMA cache coherency */
 
+struct iommu_ops;
 struct bus_type;
 struct device;
 
 struct iommu_domain {
+   struct iommu_ops *ops;
void *priv;
 };
 
@@ -55,7 +57,7 @@ struct iommu_ops {
 extern void register_iommu(struct iommu_ops *ops);
 extern void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops);
 extern bool iommu_found(void);
-extern struct iommu_domain *iommu_domain_alloc(void);
+extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus);
 extern void iommu_domain_free(struct iommu_domain *domain);
 extern int iommu_attach_device(struct iommu_domain *domain,
   struct device *dev);
@@ -79,7 +81,7 @@ static inline bool iommu_found(void)
return false;
 }
 
-static inline struct iommu_domain *iommu_domain_alloc(void)
+static inline struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
 {
return NULL;
 }
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 78c80f6..20115b1 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -233,7 +233,7 @@ int kvm_iommu_map_guest(struct kvm *kvm)
return -ENODEV;
}
 
-   kvm-arch.iommu_domain = iommu_domain_alloc();
+   kvm-arch.iommu_domain = iommu_domain_alloc(pci_bus_type);
if (!kvm-arch.iommu_domain)
return -ENOMEM;
 
-- 
1.7.4.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/6] KVM: x86 emulator: simplify emulate_1op_rax_rdx()

2011-09-07 Thread Avi Kivity
emulate_1op_rax_rdx() is always called with the same parameters.  Simplify
by passing just the emulation context.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/emulate.c |   42 +-
 1 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index cb8dcb7..c636ee7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -322,9 +322,11 @@ struct gprefix {
}   \
} while (0)
 
-#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
+#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
do {\
unsigned long _tmp; \
+   ulong *rax = (ctxt)-regs[VCPU_REGS_RAX];  \
+   ulong *rdx = (ctxt)-regs[VCPU_REGS_RDX];  \
\
__asm__ __volatile__ (  \
_PRE_EFLAGS(0, 5, 1)  \
@@ -337,31 +339,27 @@ struct gprefix {
jmp 2b \n\t   \
.popsection \n\t  \
_ASM_EXTABLE(1b, 3b)\
-   : =m (_eflags), =r (_tmp), \
- +a (_rax), +d (_rdx), +qm(_ex)  \
-   : i (EFLAGS_MASK), m ((_src).val),  \
- a (_rax), d (_rdx));  \
+   : =m ((ctxt)-eflags), =r (_tmp),  \
+ +a (*rax), +d (*rdx), +qm(_ex)  \
+   : i (EFLAGS_MASK), m ((ctxt)-src.val), \
+ a (*rax), d (*rdx));  \
} while (0)
 
 /* instruction has only one source operand, destination is implicit (e.g. mul, 
div, imul, idiv) */
-#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _ex)   \
+#define emulate_1op_rax_rdx(ctxt, _op, _ex)\
do {\
-   switch((_src).bytes) {  \
+   switch((ctxt)-src.bytes) { \
case 1: \
-   __emulate_1op_rax_rdx(_op, _src, _rax, _rdx,\
- _eflags, b, _ex);   \
+   __emulate_1op_rax_rdx(ctxt, _op, b, _ex); \
break;  \
case 2: \
-   __emulate_1op_rax_rdx(_op, _src, _rax, _rdx,\
- _eflags, w, _ex);   \
+   __emulate_1op_rax_rdx(ctxt, _op, w, _ex); \
break;  \
case 4: \
-   __emulate_1op_rax_rdx(_op, _src, _rax, _rdx,\
- _eflags, l, _ex);   \
+   __emulate_1op_rax_rdx(ctxt, _op, l, _ex); \
break;  \
case 8: ON64(   \
-   __emulate_1op_rax_rdx(_op, _src, _rax, _rdx,\
- _eflags, q, _ex));  \
+   __emulate_1op_rax_rdx(ctxt, _op, q, _ex));\
break;  \
}   \
} while (0)
@@ -1667,8 +1665,6 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp3(struct x86_emulate_ctxt *ctxt)
 {
-   unsigned long *rax = ctxt-regs[VCPU_REGS_RAX];
-   unsigned long *rdx = ctxt-regs[VCPU_REGS_RDX];
u8 de = 0;
 
switch (ctxt-modrm_reg) {
@@ -1682,20 +1678,16 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt)
emulate_1op(ctxt, neg);
break;
case 4: /* mul */
-   emulate_1op_rax_rdx(mul, ctxt-src, *rax, *rdx,
-   ctxt-eflags, de);
+   emulate_1op_rax_rdx(ctxt, mul, de);
break;
case 5: /* imul */
-   emulate_1op_rax_rdx(imul, ctxt-src, *rax, *rdx,
-   ctxt-eflags, de);
+   emulate_1op_rax_rdx(ctxt, imul, de);
break;
case 6: /* div */
-   

[PATCH 4/6] KVM: x86 emulator: simplify emulate_1op()

2011-09-07 Thread Avi Kivity
emulate_1op() is always called with the same parameters.  Simplify
by passing just the emulation context.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/emulate.c |   26 +-
 1 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 70c9f11..a0dd13f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -298,7 +298,7 @@ struct gprefix {
}   \
} while (0)
 
-#define __emulate_1op(_op, _dst, _eflags, _suffix) \
+#define __emulate_1op(ctxt, _op, _suffix)  \
do {\
unsigned long _tmp; \
\
@@ -306,19 +306,19 @@ struct gprefix {
_PRE_EFLAGS(0, 3, 2)  \
_op _suffix  %1;  \
_POST_EFLAGS(0, 3, 2) \
-   : =m (_eflags), +m ((_dst).val),\
+   : =m ((ctxt)-eflags), +m ((ctxt)-dst.val), \
  =r (_tmp)  \
: i (EFLAGS_MASK));   \
} while (0)
 
 /* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags)\
+#define emulate_1op(ctxt, _op) \
do {\
-   switch ((_dst).bytes) { \
-   case 1: __emulate_1op(_op, _dst, _eflags, b); break;  \
-   case 2: __emulate_1op(_op, _dst, _eflags, w); break;  \
-   case 4: __emulate_1op(_op, _dst, _eflags, l); break;  \
-   case 8: ON64(__emulate_1op(_op, _dst, _eflags, q)); break; \
+   switch ((ctxt)-dst.bytes) {\
+   case 1: __emulate_1op(ctxt, _op, b); break;   \
+   case 2: __emulate_1op(ctxt, _op, w); break;   \
+   case 4: __emulate_1op(ctxt, _op, l); break;   \
+   case 8: ON64(__emulate_1op(ctxt, _op, q)); break; \
}   \
} while (0)
 
@@ -1715,7 +1715,7 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt)
ctxt-dst.val = ~ctxt-dst.val;
break;
case 3: /* neg */
-   emulate_1op(neg, ctxt-dst, ctxt-eflags);
+   emulate_1op(ctxt, neg);
break;
case 4: /* mul */
emulate_1op_rax_rdx(mul, ctxt-src, *rax, *rdx, ctxt-eflags);
@@ -1745,10 +1745,10 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
 
switch (ctxt-modrm_reg) {
case 0: /* inc */
-   emulate_1op(inc, ctxt-dst, ctxt-eflags);
+   emulate_1op(ctxt, inc);
break;
case 1: /* dec */
-   emulate_1op(dec, ctxt-dst, ctxt-eflags);
+   emulate_1op(ctxt, dec);
break;
case 2: /* call near abs */ {
long int old_eip;
@@ -3849,10 +3849,10 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
break;
case 0x40 ... 0x47: /* inc r16/r32 */
-   emulate_1op(inc, ctxt-dst, ctxt-eflags);
+   emulate_1op(ctxt, inc);
break;
case 0x48 ... 0x4f: /* dec r16/r32 */
-   emulate_1op(dec, ctxt-dst, ctxt-eflags);
+   emulate_1op(ctxt, dec);
break;
case 0x63:  /* movsxd */
if (ctxt-mode != X86EMUL_MODE_PROT64)
-- 
1.7.6.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] client/bin/autotest: mention control file in usage message

2011-09-07 Thread Alon Levy
Signed-off-by: Alon Levy al...@redhat.com
---
 client/bin/autotest |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/client/bin/autotest b/client/bin/autotest
index af521de..c84b771 100755
--- a/client/bin/autotest
+++ b/client/bin/autotest
@@ -21,7 +21,7 @@ os.environ['AUTODIR'] = autodir
 os.environ['AUTODIRBIN'] = autodirbin
 os.environ['PYTHONPATH'] = autodirbin
 
-parser = OptionParser()
+parser = OptionParser(usage='Usage: %prog [options] control-file')
 
 parser.add_option(-a, --args, dest='args',
 help=additional args to pass to control file)
@@ -76,6 +76,7 @@ options, args = parser.parse_args()
 
 # Check for a control file if not in prebuild mode.
 if len(args) != 1 and options.client_test_setup is None:
+print missing control file.
 usage()
 
 drop_caches = global_config.global_config.get_config_value('CLIENT',
-- 
1.7.6.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/10] iommu/omap: Use bus_set_iommu instead of register_iommu

2011-09-07 Thread Joerg Roedel
Convert the OMAP IOMMU driver on ARM to use the new
interface for publishing the iommu_ops.

Cc: Ohad Ben-Cohen o...@wizery.com
Signed-off-by: Joerg Roedel joerg.roe...@amd.com
---
 drivers/iommu/omap-iommu.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index bd5f606..16d5b76 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1250,7 +1250,7 @@ static int __init omap_iommu_init(void)
return -ENOMEM;
iopte_cachep = p;
 
-   register_iommu(omap_iommu_ops);
+   bus_set_iommu(platform_bus_type, omap_iommu_ops);
 
return platform_driver_register(omap_iommu_driver);
 }
-- 
1.7.4.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/10] Driver core: Add iommu_ops to bus_type

2011-09-07 Thread Joerg Roedel
This is the starting point to make the iommu_ops used for
the iommu-api a per-bus-type structure. It is required to
easily implement bus-specific setup in the iommu-layer.
The first user will be the iommu-group attribute in sysfs.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
---
 drivers/base/bus.c |   16 
 drivers/iommu/iommu.c  |4 
 include/linux/device.h |9 +
 include/linux/iommu.h  |2 ++
 4 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 000e7b2..34ac706 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -1028,6 +1028,22 @@ void bus_sort_breadthfirst(struct bus_type *bus,
 }
 EXPORT_SYMBOL_GPL(bus_sort_breadthfirst);
 
+#ifdef CONFIG_IOMMU_API
+int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops)
+{
+   if (bus-iommu_ops != NULL)
+   return -EBUSY;
+
+   bus-iommu_ops = ops;
+
+   /* Do IOMMU specific setup for this bus-type */
+   iommu_bus_init(bus, ops);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(bus_set_iommu);
+#endif
+
 int __init buses_init(void)
 {
bus_kset = kset_create_and_add(bus, bus_uevent_ops, NULL);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 30b0644..3b24a5b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -34,6 +34,10 @@ void register_iommu(struct iommu_ops *ops)
iommu_ops = ops;
 }
 
+void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops)
+{
+}
+
 bool iommu_found(void)
 {
return iommu_ops != NULL;
diff --git a/include/linux/device.h b/include/linux/device.h
index c20dfbf..8240b2a 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -22,6 +22,7 @@
 #include linux/types.h
 #include linux/module.h
 #include linux/pm.h
+#include linux/iommu.h
 #include linux/atomic.h
 #include asm/device.h
 
@@ -67,6 +68,9 @@ extern void bus_remove_file(struct bus_type *, struct 
bus_attribute *);
  * @resume:Called to bring a device on this bus out of sleep mode.
  * @pm:Power management operations of this bus, callback the 
specific
  * device driver's pm-ops.
+ * @iommu_ops   IOMMU specific operations for this bus, used to attach IOMMU
+ *  driver implementations to a bus and allow the driver to do
+ *  bus-specific setup
  * @p: The private data of the driver core, only the driver core can
  * touch this.
  *
@@ -96,6 +100,8 @@ struct bus_type {
 
const struct dev_pm_ops *pm;
 
+   struct iommu_ops *iommu_ops;
+
struct subsys_private *p;
 };
 
@@ -148,6 +154,9 @@ extern int bus_unregister_notifier(struct bus_type *bus,
 #define BUS_NOTIFY_UNBOUND_DRIVER  0x0006 /* driver is unbound
  from the device */
 
+/* IOMMU related bus functions */
+int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops);
+
 extern struct kset *bus_get_kset(struct bus_type *bus);
 extern struct klist *bus_get_device_klist(struct bus_type *bus);
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6470cd8..4739e36 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -25,6 +25,7 @@
 #define IOMMU_WRITE(2)
 #define IOMMU_CACHE(4) /* DMA cache coherency */
 
+struct bus_type;
 struct device;
 
 struct iommu_domain {
@@ -52,6 +53,7 @@ struct iommu_ops {
 };
 
 extern void register_iommu(struct iommu_ops *ops);
+extern void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops);
 extern bool iommu_found(void);
 extern struct iommu_domain *iommu_domain_alloc(void);
 extern void iommu_domain_free(struct iommu_domain *domain);
-- 
1.7.4.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-07 Thread Avi Kivity

On 09/07/2011 04:44 PM, Don Zickus wrote:


  Is there a way to tell whether an NMI was internally or externally
  generated?

  I don't think so, especially as two or more NMIs can be coalesced.
  So any NMI received on this first cpu has to check the NMI reason
  port?

Well we cheat and execute all the nmi handlers first.  If they come back
as handled, we skip the check for the external NMI.


And hope that no other NMI was generated while we're handling this one.  
It's a little... fragile?



But you are right, other than checking the reason port, there isn't a way
to determine if an NMI is internally or externally generated.


Ouch.




  
 But on the other hand, I don't really care if you can say that this path
 will never be called in a virtual machine.
  
  Does virtual machines support hot remove of cpus?  Probably not
  considering bare-metal barely supports it.
  

  They do.

But vcpus probably don't have the notion of a bsp cpu, so perhaps virtual
machines can get away with it easier?  (I don't know enough about the hot
cpu remove code to really explain it, just enough to know it can cause
problems and people are trying to address it).



The concept of a bsp exists in exactly the same way as on real hardware.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/6] KVM: x86 emulator: simplify emulate_2op_SrcV()

2011-09-07 Thread Avi Kivity
emulate_2op_SrcV(), and its siblings, emulate_2op_SrcV_nobyte()
and emulate_2op_SrcB(), all use the same calling conventions
and all get passed exactly the same parameters.  Simplify them
by passing just the emulation context.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/emulate.c |   90 +++
 1 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0453c07..3f6c6ca 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -205,64 +205,62 @@ struct gprefix {
 #define ON64(x)
 #endif
 
-#define emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
+#define emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype)  \
do {\
__asm__ __volatile__ (  \
_PRE_EFLAGS(0, 4, 2)  \
_op _suffix  %_x3,%1;   \
_POST_EFLAGS(0, 4, 2) \
-   : =m (_eflags), +q (*(_dsttype*)(_dst).val),\
+   : =m ((ctxt)-eflags),\
+ +q (*(_dsttype*)(ctxt)-dst.val),  \
  =r (_tmp)  \
-   : _y ((_src).val), i (EFLAGS_MASK));  \
+   : _y ((ctxt)-src.val), i (EFLAGS_MASK)); \
} while (0)
 
 
 /* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
+#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \
do {\
unsigned long _tmp; \
\
-   switch ((_dst).bytes) { \
+   switch ((ctxt)-dst.bytes) {\
case 2: \
-   emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,w,u16);\
+   emulate_2op(ctxt,_op,_wx,_wy,w,u16);  \
break;  \
case 4: \
-   emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,l,u32);\
+   emulate_2op(ctxt,_op,_lx,_ly,l,u32);  \
break;  \
case 8: \
-   
ON64(emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,q,u64)); \
+   ON64(emulate_2op(ctxt,_op,_qx,_qy,q,u64)); \
break;  \
}   \
} while (0)
 
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
+#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy)
 \
do { \
unsigned long _tmp;  \
-   switch ((_dst).bytes) {  \
+   switch ((ctxt)-dst.bytes) { \
case 1:  \
-   emulate_2op(_op,_src,_dst,_eflags,_bx,_by,b,u8); \
+   emulate_2op(ctxt,_op,_bx,_by,b,u8);\
break;   \
default: \
-   __emulate_2op_nobyte(_op, _src, _dst, _eflags,   \
+   __emulate_2op_nobyte(ctxt, _op,  \
 _wx, _wy, _lx, _ly, _qx, _qy);  \
break;   \
}\
} while (0)
 
 /* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags)  \
-   __emulate_2op(_op, _src, _dst, _eflags, \
- b, c, b, c, b, c, b, c)
+#define emulate_2op_SrcB(ctxt, _op)\
+   __emulate_2op(ctxt, _op, b, c, b, c, b, c, b, c)
 
 /* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(_op, _src, _dst, _eflags)  \
-   

[PATCH 5/6] KVM: x86 emulator: merge the two emulate_1op_rax_rdx implementations

2011-09-07 Thread Avi Kivity
We have two emulate-with-extended-accumulator implementations: one
which expects traps (_ex) and one which doesn't (plain).  Drop the
plain implementation and always use the one which expects traps;
it will simply return 0 in the _ex argument and we can happily ignore
it.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/emulate.c |   64 +++
 1 files changed, 15 insertions(+), 49 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a0dd13f..cb8dcb7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -322,21 +322,7 @@ struct gprefix {
}   \
} while (0)
 
-#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) 
\
-   do {\
-   unsigned long _tmp; \
-   \
-   __asm__ __volatile__ (  \
-   _PRE_EFLAGS(0, 4, 1)  \
-   _op _suffix  %5;  \
-   _POST_EFLAGS(0, 4, 1) \
-   : =m (_eflags), =r (_tmp), \
- +a (_rax), +d (_rdx)  \
-   : i (EFLAGS_MASK), m ((_src).val),  \
- a (_rax), d (_rdx));  \
-   } while (0)
-
-#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) 
\
+#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
do {\
unsigned long _tmp; \
\
@@ -358,46 +344,24 @@ struct gprefix {
} while (0)
 
 /* instruction has only one source operand, destination is implicit (e.g. mul, 
div, imul, idiv) */
-#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags)\
+#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _ex)   \
do {\
switch((_src).bytes) {  \
case 1: \
__emulate_1op_rax_rdx(_op, _src, _rax, _rdx,\
- _eflags, b);\
+ _eflags, b, _ex);   \
break;  \
case 2: \
__emulate_1op_rax_rdx(_op, _src, _rax, _rdx,\
- _eflags, w);\
+ _eflags, w, _ex);   \
break;  \
case 4: \
__emulate_1op_rax_rdx(_op, _src, _rax, _rdx,\
- _eflags, l);\
-   break;  \
-   case 8: \
-   ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
-  _eflags, q));  \
-   break;  \
-   }   \
-   } while (0)
-
-#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex)\
-   do {\
-   switch((_src).bytes) {  \
-   case 1: \
-   __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
-_eflags, b, _ex);\
-   break;  \
-   case 2: \
-   __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
-_eflags, w, _ex);\
-   break;  \
-   case 4: \
-   __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
-_eflags, l, _ex);\
+ _eflags, l, _ex);   \
  

[PATCH 3/6] KVM: x86 emulator: simplify emulate_2op_cl()

2011-09-07 Thread Avi Kivity
emulate_2op_cl() is always called with the same parameters.  Simplify
by passing just the emulation context.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/emulate.c |8 
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1bc9e24..70c9f11 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -263,7 +263,7 @@ struct gprefix {
__emulate_2op_nobyte(ctxt, _op, w, r, _LO32, r, , r)
 
 /* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(_op, ctxt, _suffix, _type)\
+#define __emulate_2op_cl(ctxt, _op, _suffix, _type)\
do {\
unsigned long _tmp; \
_type _clv  = (ctxt)-src2.val; \
@@ -287,13 +287,13 @@ struct gprefix {
do {\
switch ((ctxt)-dst.bytes) {\
case 2: \
-   __emulate_2op_cl(_op, ctxt, w, u16);  \
+   __emulate_2op_cl(ctxt, _op, w, u16);  \
break;  \
case 4: \
-   __emulate_2op_cl(_op, ctxt, l, u32);  \
+   __emulate_2op_cl(ctxt, _op, l, u32);  \
break;  \
case 8: \
-   ON64(__emulate_2op_cl(_op, ctxt, q, ulong));  \
+   ON64(__emulate_2op_cl(ctxt, _op, q, ulong));  \
break;  \
}   \
} while (0)
-- 
1.7.6.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-07 Thread Don Zickus
On Wed, Sep 07, 2011 at 06:11:14PM +0300, Avi Kivity wrote:
 On 09/07/2011 04:44 PM, Don Zickus wrote:
 
   Is there a way to tell whether an NMI was internally or externally
   generated?
 
   I don't think so, especially as two or more NMIs can be coalesced.
   So any NMI received on this first cpu has to check the NMI reason
   port?
 
 Well we cheat and execute all the nmi handlers first.  If they come back
 as handled, we skip the check for the external NMI.
 
 And hope that no other NMI was generated while we're handling this
 one.  It's a little... fragile?

No.  If another NMI is generated while we are processing the current one
it should get latched.  Upon completion of the current one, the cpu should
jump right back into the nmi exception routine again.  The only downside
is when multiple NMIs come in during the processing of the current one.
Only one can be latched, so the others get dropped.  But we are addressing
that.

Cheers,
Don

 
 But you are right, other than checking the reason port, there isn't a way
 to determine if an NMI is internally or externally generated.
 
 Ouch.
 
 
 
   
  But on the other hand, I don't really care if you can say that this 
  path
  will never be called in a virtual machine.
   
  Do virtual machines support hot remove of cpus?  Probably not
   considering bare-metal barely supports it.
   
 
   They do.
 
 But vcpus probably don't have the notion of a bsp cpu, so perhaps virtual
 machines can get away with it easier?  (I don't know enough about the hot
 cpu remove code to really explain it, just enough to know it can cause
 problems and people are trying to address it).
 
 
 The concept of a bsp exists in exactly the same way as on real hardware.
 
 -- 
 error compiling committee.c: too many arguments to function
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-07 Thread Don Zickus
On Wed, Sep 07, 2011 at 07:13:58AM +0300, Avi Kivity wrote:
 On 09/06/2011 09:27 PM, Don Zickus wrote:
 On Tue, Sep 06, 2011 at 11:07:26AM -0700, Jeremy Fitzhardinge wrote:
 But, erm, does that even make sense?  I'm assuming the NMI reason port
 tells the CPU why it got an NMI.  If multiple CPUs can get NMIs and
 there's only a single reason port, then doesn't that mean that either 
  1)
 they all got the NMI for the same reason, or 2) having a single port 
  is
 inherently racy?  How does the locking actually work there?
 The reason port is for an external/system NMI.  All the IPI-NMI don't 
  need
 to access this register to process their handlers, ie perf.  I think in
 general the IOAPIC is configured to deliver the external NMI to one 
  cpu,
 usually the bsp cpu.  However, there has been a slow movement to free 
  the
 bsp cpu from exceptions like this to allow one to eventually hot-swap 
  the
 bsp cpu.  The spin locks in that code were an attempt to be more 
  abstract
 about who really gets the external NMI.  Of course SGI's box is setup 
  to
 deliver an external NMI to all cpus to dump the stack when the system
 isn't behaving.
   
 This is a very low usage NMI (in fact almost all cases lead to loud
 console messages).
   
 Hope that clears up some of the confusion.
 
   Hm, not really.
 
   What does it mean if two CPUs go down that path?  Should one do some NMI
   processing while the other waits around for it to finish, and then do
   some NMI processing on its own?
 
 Well the time the second one gets to the external NMI it should have been
 cleared by the first cpu, which would of course lead to the second cpu
 causing a 'Dazed and confused' message.  But on most x86 machines only one
 cpu should be routed the external NMI.  Though there is an SGI box that is
 designed to send an external NMI to all of its cpus.
 
 Is there a way to tell whether an NMI was internally or externally
 generated?
 
 I don't think so, especially as two or more NMIs can be coalesced.
 So any NMI received on this first cpu has to check the NMI reason
 port?

Well we cheat and execute all the nmi handlers first.  If they come back
as handled, we skip the check for the external NMI.

But you are right, other than checking the reason port, there isn't a way
to determine if an NMI is internally or externally generated.

 
 
   But on the other hand, I don't really care if you can say that this path
   will never be called in a virtual machine.
 
 Do virtual machines support hot remove of cpus?  Probably not
 considering bare-metal barely supports it.
 
 
 They do.

But vcpus probably don't have the notion of a bsp cpu, so perhaps virtual
machines can get away with it easier?  (I don't know enough about the hot
cpu remove code to really explain it, just enough to know it can cause
problems and people are trying to address it).

Cheers,
Don
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/6] KVM: x86 emulator: simplify emulate_2op_cl()

2011-09-07 Thread Avi Kivity
emulate_2op_cl() is always called with the same parameters.  Simplify
by passing just the emulation context.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/emulate.c |   33 +++--
 1 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 3f6c6ca..1bc9e24 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -263,40 +263,37 @@ struct gprefix {
__emulate_2op_nobyte(ctxt, _op, w, r, _LO32, r, , r)
 
 /* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type)
\
+#define __emulate_2op_cl(_op, ctxt, _suffix, _type)\
do {\
unsigned long _tmp; \
-   _type _clv  = (_cl).val;\
-   _type _srcv = (_src).val;   \
-   _type _dstv = (_dst).val;   \
+   _type _clv  = (ctxt)-src2.val; \
+   _type _srcv = (ctxt)-src.val;  \
+   _type _dstv = (ctxt)-dst.val;  \
\
__asm__ __volatile__ (  \
_PRE_EFLAGS(0, 5, 2)  \
_op _suffix  %4,%1 \n \
_POST_EFLAGS(0, 5, 2) \
-   : =m (_eflags), +r (_dstv), =r (_tmp)\
+   : =m ((ctxt)-eflags), +r (_dstv), =r (_tmp) \
: c (_clv) , r (_srcv), i (EFLAGS_MASK)   \
);  \
\
-   (_cl).val  = (unsigned long) _clv;  \
-   (_src).val = (unsigned long) _srcv; \
-   (_dst).val = (unsigned long) _dstv; \
+   (ctxt)-src2.val  = (unsigned long) _clv;   \
+   (ctxt)-src2.val = (unsigned long) _srcv;   \
+   (ctxt)-dst.val = (unsigned long) _dstv;\
} while (0)
 
-#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags)  \
+#define emulate_2op_cl(ctxt, _op)  \
do {\
-   switch ((_dst).bytes) { \
+   switch ((ctxt)-dst.bytes) {\
case 2: \
-   __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
-w, unsigned short);  \
+   __emulate_2op_cl(_op, ctxt, w, u16);  \
break;  \
case 4: \
-   __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
-l, unsigned int);\
+   __emulate_2op_cl(_op, ctxt, l, u32);  \
break;  \
case 8: \
-   ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- q, unsigned long)); \
+   ON64(__emulate_2op_cl(_op, ctxt, q, ulong));  \
break;  \
}   \
} while (0)
@@ -4123,7 +4120,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
break;
case 0xa4: /* shld imm8, r, r/m */
case 0xa5: /* shld cl, r, r/m */
-   emulate_2op_cl(shld, ctxt-src2, ctxt-src, ctxt-dst, 
ctxt-eflags);
+   emulate_2op_cl(ctxt, shld);
break;
case 0xa8:  /* push gs */
rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
@@ -4137,7 +4134,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
break;
case 0xac: /* shrd imm8, r, r/m */
case 0xad: /* shrd cl, r, r/m */
-   emulate_2op_cl(shrd, ctxt-src2, ctxt-src, ctxt-dst, 
ctxt-eflags);
+   emulate_2op_cl(ctxt, shrd);
break;
case 0xae:  /* clflush */
break;
-- 
1.7.6.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a 

[PATCH 0/6] Some emulator cleanups

2011-09-07 Thread Avi Kivity
Some mindless emulator cleanups while waiting for autotest.

Avi Kivity (6):
  KVM: x86 emulator: simplify emulate_2op_SrcV()
  KVM: x86 emulator: simplify emulate_2op_cl()
  KVM: x86 emulator: simplify emulate_2op_cl()
  KVM: x86 emulator: simplify emulate_1op()
  KVM: x86 emulator: merge the two emulate_1op_rax_rdx implementations
  KVM: x86 emulator: simplify emulate_1op_rax_rdx()

 arch/x86/kvm/emulate.c |  225 +++-
 1 files changed, 89 insertions(+), 136 deletions(-)

-- 
1.7.6.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/10] iommu/msm: Use bus_set_iommu instead of register_iommu

2011-09-07 Thread Joerg Roedel
Convert the MSM IOMMU driver for ARM to use the new
interface for publishing the iommu_ops.

Cc: David Brown dav...@codeaurora.org
Signed-off-by: Joerg Roedel joerg.roe...@amd.com
---
 drivers/iommu/msm_iommu.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index d1733f6..5865dd2 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -728,7 +728,7 @@ static void __init setup_iommu_tex_classes(void)
 static int __init msm_iommu_init(void)
 {
setup_iommu_tex_classes();
-   register_iommu(msm_iommu_ops);
+   bus_set_iommu(platform_bus_type, msm_iommu_ops);
return 0;
 }
 
-- 
1.7.4.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/10] iommu/vt-d: Use bus_set_iommu instead of register_iommu

2011-09-07 Thread Joerg Roedel
Convert the Intel IOMMU driver to use the new interface for
publishing the iommu_ops.

Cc: David Woodhouse dw...@infradead.org
Signed-off-by: Joerg Roedel joerg.roe...@amd.com
---
 drivers/iommu/intel-iommu.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c621c98..2d53c3d 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3486,7 +3486,7 @@ int __init intel_iommu_init(void)
 
init_iommu_pm_ops();
 
-   register_iommu(intel_iommu_ops);
+   bus_set_iommu(pci_bus_type, intel_iommu_ops);
 
bus_register_notifier(pci_bus_type, device_nb);
 
-- 
1.7.4.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/10] iommu/core: Define iommu_ops and register_iommu only with CONFIG_IOMMU_API

2011-09-07 Thread Joerg Roedel
This makes it impossible to compile an iommu driver into the
kernel without selecting CONFIG_IOMMU_API.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
---
 include/linux/iommu.h |8 +++-
 1 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9940319..6470cd8 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -34,6 +34,8 @@ struct iommu_domain {
 #define IOMMU_CAP_CACHE_COHERENCY  0x1
 #define IOMMU_CAP_INTR_REMAP   0x2 /* isolates device intrs */
 
+#ifdef CONFIG_IOMMU_API
+
 struct iommu_ops {
int (*domain_init)(struct iommu_domain *domain);
void (*domain_destroy)(struct iommu_domain *domain);
@@ -49,8 +51,6 @@ struct iommu_ops {
  unsigned long cap);
 };
 
-#ifdef CONFIG_IOMMU_API
-
 extern void register_iommu(struct iommu_ops *ops);
 extern bool iommu_found(void);
 extern struct iommu_domain *iommu_domain_alloc(void);
@@ -70,9 +70,7 @@ extern int iommu_domain_has_cap(struct iommu_domain *domain,
 
 #else /* CONFIG_IOMMU_API */
 
-static inline void register_iommu(struct iommu_ops *ops)
-{
-}
+struct iommu_ops {};
 
 static inline bool iommu_found(void)
 {
-- 
1.7.4.1


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Autotest] [PATCH] client/bin/autotest: mention control file in usage message

2011-09-07 Thread Lucas Meneghel Rodrigues

On 09/07/2011 07:15 AM, Alon Levy wrote:

Signed-off-by: Alon Levyal...@redhat.com


Looks good to me, applied, thanks!

http://autotest.kernel.org/changeset/5575


---
  client/bin/autotest |3 ++-
  1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/client/bin/autotest b/client/bin/autotest
index af521de..c84b771 100755
--- a/client/bin/autotest
+++ b/client/bin/autotest
@@ -21,7 +21,7 @@ os.environ['AUTODIR'] = autodir
  os.environ['AUTODIRBIN'] = autodirbin
  os.environ['PYTHONPATH'] = autodirbin

-parser = OptionParser()
+parser = OptionParser(usage='Usage: %prog [options] <control-file>')

  parser.add_option(-a, --args, dest='args',
  help=additional args to pass to control file)
@@ -76,6 +76,7 @@ options, args = parser.parse_args()

  # Check for a control file if not in prebuild mode.
  if len(args) != 1 and options.client_test_setup is None:
+print "missing control file."
  usage()

  drop_caches = global_config.global_config.get_config_value('CLIENT',


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-07 Thread Don Zickus
On Wed, Sep 07, 2011 at 07:25:24PM +0300, Avi Kivity wrote:
 On 09/07/2011 06:56 PM, Don Zickus wrote:
 
   And hope that no other NMI was generated while we're handling this
   one.  It's a little... fragile?
 
 No.  If another NMI is generated while we are processing the current one
 it should get latched.  Upon completion of the current one, the cpu should
 jump right back into the nmi exception routine again.  The only downside
 is when multiple NMIs come in during the processing of the current one.
 Only one can be latched, so the others get dropped.
 
 Ah, yes, I remember now.
 
 But we are addressing
 that.
 
 
 May I ask how?  Detecting a back-to-back NMI?

Pretty boring actually.  Currently we execute an NMI handler until one of
them returns handled.  Then we stop.  This may cause us to miss an NMI in
the case of multiple NMIs at once.  Now we are changing it to execute
_all_ the handlers to make sure we didn't miss one.  But then the downside
here is we accidentally handle an NMI that was latched.  This would cause
a 'Dazed and confused' message as that NMI was already handled by the
previous NMI.

We are working on an algorithm to detect this condition and flag it
(nothing complicated).  But it may never be perfect.

On the other hand, what else are we going to do with an edge-triggered
shared interrupt line?

Cheers,
Don
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] KVM: emulate lapic tsc deadline timer for hvm

2011-09-07 Thread Liu, Jinsong
Avi Kivity wrote:
 
 --- a/arch/x86/include/asm/msr-index.h
 +++ b/arch/x86/include/asm/msr-index.h
 @@ -229,6 +229,8 @@
  #define MSR_IA32_APICBASE_ENABLE   (1<<11)
  #define MSR_IA32_APICBASE_BASE (0xfffff<<12)
 
 +#define MSR_IA32_TSCDEADLINE0x06e0
 +
   #define MSR_IA32_UCODE_WRITE   0x0079
   #define MSR_IA32_UCODE_REV 0x008b
 
 
 Need to add to msrs_to_save so live migration works.
 

2 questions:
1). how about add to emulated_msrs instead of msrs_to_save? msrs_to_save 
modified at runtime and depend on capability of host cpu.
2). do we need add code at qemu (kvm_get_msrs/ kvm_put_msrs) to expose 
MSR_IA32_TSCDEADLINE to userspace?

Thanks,
Jinsong--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM-test: Add two scripts to disable services for perf tests

2011-09-07 Thread Hagen Paul Pfeifer

On Wed, 07 Sep 2011 14:07:49 +0800, Amos Kong ak...@redhat.com wrote:



 System services on guest and host take uncertain resource, it effects

 the perf results. We can use the below two scripts to disable some

 services of host and guest.

 

 stop_services_perf.sh is used to stop the running services.

 off_services_perf.sh is used to turn off services when the host starts up.

 

 We can use them to prepare environment for performance testcases.



Which environment? I assume this should be secured by a distribution-

specific guard:



E.g.



if [ -f /etc/redhat-release ]; then
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: emulate lapic tsc deadline timer for hvm

2011-09-07 Thread Avi Kivity

On 09/07/2011 07:45 PM, Liu, Jinsong wrote:

Avi Kivity wrote:

  --- a/arch/x86/include/asm/msr-index.h
  +++ b/arch/x86/include/asm/msr-index.h
  @@ -229,6 +229,8 @@
  #define MSR_IA32_APICBASE_ENABLE(1<<11)
  #define MSR_IA32_APICBASE_BASE  (0xfffff<<12)

  +#define MSR_IA32_TSCDEADLINE 0x06e0
  +
#define MSR_IA32_UCODE_WRITE0x0079
#define MSR_IA32_UCODE_REV  0x008b


  Need to add to msrs_to_save so live migration works.


2 questions:
1). how about add to emulated_msrs instead of msrs_to_save? msrs_to_save 
modified at runtime and depend on capability of host cpu.


Look at kvm_init_msrs_list(), it does the checks.


2). do we need add code at qemu (kvm_get_msrs/ kvm_put_msrs) to expose 
MSR_IA32_TSCDEADLINE to userspace?



It should be automatic.  Better check it though that you can 
live-migrate a guest that uses TSC deadline.  Please add a testcase to 
kvm-unit-tests.git (there's x86/apic.c, can probably be added there easily).


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-07 Thread Avi Kivity

On 09/07/2011 07:52 PM, Don Zickus wrote:


  May I ask how?  Detecting a back-to-back NMI?

Pretty boring actually.  Currently we execute an NMI handler until one of
them returns handled.  Then we stop.  This may cause us to miss an NMI in
the case of multiple NMIs at once.  Now we are changing it to execute
_all_ the handlers to make sure we didn't miss one.


That's going to be pretty bad for kvm - those handlers become a lot more 
expensive since they involve reading MSRs.  Even worse if we start using 
NMIs as a wakeup for pv spinlocks as provided by this patchset.



But then the downside
here is we accidentally handle an NMI that was latched.  This would cause
a 'Dazed and confused' message as that NMI was already handled by the
previous NMI.

We are working on an algorithm to detect this condition and flag it
(nothing complicated).  But it may never be perfect.

On the other hand, what else are we going to do with an edge-triggered
shared interrupt line?



How about, during NMI, save %rip to a per-cpu variable.  Handle just one 
cause.  If, on the next NMI, we hit the same %rip, assume back-to-back 
NMI has occured and now handle all causes.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-07 Thread Jeremy Fitzhardinge
On 09/07/2011 10:09 AM, Avi Kivity wrote:
 On 09/07/2011 07:52 PM, Don Zickus wrote:
 
   May I ask how?  Detecting a back-to-back NMI?

 Pretty boring actually.  Currently we execute an NMI handler until
 one of
 them returns handled.  Then we stop.  This may cause us to miss an
 NMI in
 the case of multiple NMIs at once.  Now we are changing it to execute
 _all_ the handlers to make sure we didn't miss one.

 That's going to be pretty bad for kvm - those handlers become a lot
 more expensive since they involve reading MSRs.

How often are you going to get NMIs in a kvm guest?

   Even worse if we start using NMIs as a wakeup for pv spinlocks as
 provided by this patchset.

Hm, I'm interested to know what you're thinking in more detail.  Can you
leave an NMI pending before you block in the same way you can with
sti;halt with normal interrupts?

I was thinking you might want to do something with monitor/mwait to
implement the blocking/kick ops. (Handwave)

J
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Virtual drives performance

2011-09-07 Thread Stefan Hajnoczi
On Tue, Sep 6, 2011 at 11:25 PM, TooMeeK toomeek...@o2.pl wrote:
 First, I created mirrored storage in hypervisor from one 600-gig partition
 (yes, that's correct - I have only one drive currently), details:
 sudo mdadm --detail /dev/md3
 /dev/md3:
        Version : 1.2
  Creation Time : Thu Jul 28 20:07:00 2011
     Raid Level : raid1
     Array Size : 664187352 (633.42 GiB 680.13 GB)
  Used Dev Size : 664187352 (633.42 GiB 680.13 GB)
   Raid Devices : 2
  Total Devices : 1
    Persistence : Superblock is persistent

    Update Time : Thu Jul 28 22:07:10 2011
          State : clean, degraded
  Active Devices : 1
 Working Devices : 1
  Failed Devices : 0
  Spare Devices : 0

           Name : Server:3  (local to host Server)
           UUID : 87184170:2d9102b1:ca16a5d7:1f23fe2e
         Events : 3276

    Number   Major   Minor   RaidDevice State
       0       8       23        0      active sync   /dev/sdb7
       1       0        0        1      removed

 Partition type is Linux RAID autodetect and this drive can do 80MB/s write
 and 100 MB/s read seq.

How did you measure those figures?

To double-check sequential read throughput on the host:

# dd if=/dev/md3 of=/dev/null bs=64k count=16384 iflag=direct

The SMB results don't help narrow down a disk I/O problem.  To collect
comparable sequential read throughput inside the guest:

# dd if=/dev/vda of=/dev/null bs=64k count=16384 iflag=direct

 QEMU PC emulator version 0.12.5 (qemu-kvm-0.12.5)

Try qemu-kvm 0.15.

 Next, I've tried following combinations with virt-manager 0.8.4 (from XML of
 VM):
 1.on Debian VM with virtio drivers for both storage and NIC:
 disk type='block' device='disk'

cache='none'

 source dev='/dev/md3'/
 target dev='vdb' bus='virtio'/

You can enable Linux AIO, which typically performs better than the
default io=threads:

driver name=qemu type=raw io=native/

Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-07 Thread Don Zickus
On Wed, Sep 07, 2011 at 08:09:37PM +0300, Avi Kivity wrote:
 On 09/07/2011 07:52 PM, Don Zickus wrote:
 
   May I ask how?  Detecting a back-to-back NMI?
 
 Pretty boring actually.  Currently we execute an NMI handler until one of
 them returns handled.  Then we stop.  This may cause us to miss an NMI in
 the case of multiple NMIs at once.  Now we are changing it to execute
 _all_ the handlers to make sure we didn't miss one.
 
 That's going to be pretty bad for kvm - those handlers become a lot
 more expensive since they involve reading MSRs.  Even worse if we
 start using NMIs as a wakeup for pv spinlocks as provided by this
 patchset.

Oh.

 
 But then the downside
 here is we accidentally handle an NMI that was latched.  This would cause
 a 'Dazed and confused' message as that NMI was already handled by the
 previous NMI.
 
 We are working on an algorithm to detect this condition and flag it
 (nothing complicated).  But it may never be perfect.
 
 On the other hand, what else are we going to do with an edge-triggered
 shared interrupt line?
 
 
 How about, during NMI, save %rip to a per-cpu variable.  Handle just
 one cause.  If, on the next NMI, we hit the same %rip, assume
 back-to-back NMI has occured and now handle all causes.

I had a similar idea a couple of months ago while debugging a continuous
flow of back-to-back NMIs from a stress-test perf application and I
couldn't get it to work.  But let me try it again, because it does make
sense as an optimization.

Thanks,
Don
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] KVM: emulate lapic tsc deadline timer for hvm

2011-09-07 Thread Liu, Jinsong
Avi Kivity wrote:
 On 09/07/2011 07:45 PM, Liu, Jinsong wrote:
 Avi Kivity wrote:
 
  --- a/arch/x86/include/asm/msr-index.h
  +++ b/arch/x86/include/asm/msr-index.h
  @@ -229,6 +229,8 @@
 #define MSR_IA32_APICBASE_ENABLE(1<<11)
 #define MSR_IA32_APICBASE_BASE  (0xfffff<<12)
 
  +#define MSR_IA32_TSCDEADLINE 0x06e0
  +
#define MSR_IA32_UCODE_WRITE0x0079
#define MSR_IA32_UCODE_REV  0x008b
 
 
  Need to add to msrs_to_save so live migration works.
 
 
 2 questions:
 1). how about add to emulated_msrs instead of msrs_to_save?
  msrs_to_save modified at runtime and depend on capability of host
  cpu.  
 
 Look at kvm_init_msrs_list(), it does the checks.
 

Yes, what I mean is, we in fact don't need host cpu support, so it's better to 
add it to emulated_msrs.

 2). do we need add code at qemu (kvm_get_msrs/ kvm_put_msrs) to
 expose MSR_IA32_TSCDEADLINE to userspace? 
 
 
 It should be automatic.  Better check it though that you can
 live-migrate a guest that uses TSC deadline.  Please add a testcase to
 kvm-unit-tests.git (there's x86/apic.c, can probably be added there
 easily). 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-07 Thread Avi Kivity

On 09/07/2011 08:17 PM, Jeremy Fitzhardinge wrote:

On 09/07/2011 10:09 AM, Avi Kivity wrote:
  On 09/07/2011 07:52 PM, Don Zickus wrote:
  
 May I ask how?  Detecting a back-to-back NMI?

  Pretty boring actually.  Currently we execute an NMI handler until
  one of
  them returns handled.  Then we stop.  This may cause us to miss an
  NMI in
  the case of multiple NMIs at once.  Now we are changing it to execute
  _all_ the handlers to make sure we didn't miss one.

  That's going to be pretty bad for kvm - those handlers become a lot
  more expensive since they involve reading MSRs.

How often are you going to get NMIs in a kvm guest?


We'll soon have the perf-based watchdog firing every 60s worth of 
instructions or so.  But if we implement your new kick pvop using NMI 
then it can be _very_ often.




Even worse if we start using NMIs as a wakeup for pv spinlocks as
  provided by this patchset.

Hm, I'm interested to know what you're thinking in more detail.  Can you
leave an NMI pending before you block in the same way you can with
sti;halt with normal interrupts?


Nope.  But you can do

   if (regs-rip in critical section)
   regs-rip = after_halt;

and effectively emulate it.  The critical section is something like

critical_section_start:
if (woken_up)
goto critical_section_end;
hlt
critical_section_end:



I was thinking you might want to do something with monitor/mwait to
implement the blocking/kick ops. (Handwave)



monitor/mwait are incredibly expensive to virtualize since they require 
write-protecting a page, IPIs flying everywhere and flushing tlbs, not 
to mention my lovely hugepages being broken up mercilessly.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-07 Thread Avi Kivity

On 09/07/2011 08:21 PM, Don Zickus wrote:


  How about, during NMI, save %rip to a per-cpu variable.  Handle just
  one cause.  If, on the next NMI, we hit the same %rip, assume
  back-to-back NMI has occurred and now handle all causes.

I had a similar idea a couple of months ago while debugging a continuous
flow of back-to-back NMIs from a stress-test perf application and I
couldn't get it to work.  But let me try it again, because it does make
sense as an optimization.




Great, thanks.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm-tool: remove addr_type - unused but set variable

2011-09-07 Thread Hagen Paul Pfeifer
Signed-off-by: Hagen Paul Pfeifer ha...@jauu.net
Cc: Sasha Levin levinsasha...@gmail.com
Cc: Pekka Enberg penb...@kernel.org
---
 tools/kvm/builtin-run.c |2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
index 725c23c..53ab911 100644
--- a/tools/kvm/builtin-run.c
+++ b/tools/kvm/builtin-run.c
@@ -177,7 +177,6 @@ static int shmem_parser(const struct option *opt, const 
char *arg, int unset)
const uint64_t default_phys_addr = SHMEM_DEFAULT_ADDR;
const char *default_handle = SHMEM_DEFAULT_HANDLE;
struct shmem_info *si = malloc(sizeof(struct shmem_info));
-   enum { PCI, UNK } addr_type = PCI;
uint64_t phys_addr;
uint64_t size;
char *handle = NULL;
@@ -193,7 +192,6 @@ static int shmem_parser(const struct option *opt, const 
char *arg, int unset)
/* parse out optional addr family */
if (strcasestr(p, pci:)) {
p += skip_pci;
-   addr_type = PCI;
} else if (strcasestr(p, mem:)) {
die(I can't add to E820 map yet.\n);
}
-- 
1.7.5.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/10] Driver core: Add iommu_ops to bus_type

2011-09-07 Thread Greg KH
On Wed, Sep 07, 2011 at 05:41:45PM +0200, Joerg Roedel wrote:
 This is the starting point to make the iommu_ops used for
 the iommu-api a per-bus-type structure. It is required to
 easily implement bus-specific setup in the iommu-layer.
 The first user will be the iommu-group attribute in sysfs.
 
 Signed-off-by: Joerg Roedel joerg.roe...@amd.com
 ---
  drivers/base/bus.c |   16 
  drivers/iommu/iommu.c  |4 
  include/linux/device.h |9 +
  include/linux/iommu.h  |2 ++
  4 files changed, 31 insertions(+), 0 deletions(-)
 
 diff --git a/drivers/base/bus.c b/drivers/base/bus.c
 index 000e7b2..34ac706 100644
 --- a/drivers/base/bus.c
 +++ b/drivers/base/bus.c
 @@ -1028,6 +1028,22 @@ void bus_sort_breadthfirst(struct bus_type *bus,
  }
  EXPORT_SYMBOL_GPL(bus_sort_breadthfirst);
  
 +#ifdef CONFIG_IOMMU_API
 +int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops)
 +{
 + if (bus-iommu_ops != NULL)
 + return -EBUSY;

Busy?

 +
 + bus-iommu_ops = ops;
 +
 + /* Do IOMMU specific setup for this bus-type */
 + iommu_bus_init(bus, ops);
 +
 + return 0;
 +}
 +EXPORT_SYMBOL_GPL(bus_set_iommu);

I don't understand what this function is for, and who would call it.

Please provide kerneldoc that explains this.


 +#endif
 +
  int __init buses_init(void)
  {
   bus_kset = kset_create_and_add(bus, bus_uevent_ops, NULL);
 diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
 index 30b0644..3b24a5b 100644
 --- a/drivers/iommu/iommu.c
 +++ b/drivers/iommu/iommu.c
 @@ -34,6 +34,10 @@ void register_iommu(struct iommu_ops *ops)
   iommu_ops = ops;
  }
  
 +void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops)
 +{
 +}
 +
  bool iommu_found(void)
  {
   return iommu_ops != NULL;
 diff --git a/include/linux/device.h b/include/linux/device.h
 index c20dfbf..8240b2a 100644
 --- a/include/linux/device.h
 +++ b/include/linux/device.h
 @@ -22,6 +22,7 @@
  #include linux/types.h
  #include linux/module.h
  #include linux/pm.h
 +#include linux/iommu.h
  #include linux/atomic.h
  #include asm/device.h
  
 @@ -67,6 +68,9 @@ extern void bus_remove_file(struct bus_type *, struct 
 bus_attribute *);
   * @resume:  Called to bring a device on this bus out of sleep mode.
   * @pm:  Power management operations of this bus, callback the 
 specific
   *   device driver's pm-ops.
 + * @iommu_ops   IOMMU specific operations for this bus, used to attach IOMMU
 + *  driver implementations to a bus and allow the driver to do
 + *  bus-specific setup

So why is this just not set by the bus itself, making the above function
not needed at all?

confused,

greg k-h
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/10] IOMMU: Make iommu_ops per-bus_type

2011-09-07 Thread Greg KH
On Wed, Sep 07, 2011 at 05:41:43PM +0200, Joerg Roedel wrote:
 Hi,
 
 here is the new version of the patch-set to make the iommu_ops used in
 the iommu-api a bus_type property. This will allow us to move code out
 of the iommu drivers into generic code and it simplifies the
 implementation of the Alex' device-group property.
 
 Greg, can you have a look at patch 2 please and tell me if you have any
 objections?

I object, please see my comments.

 With this version the patch-set is complete (not as the first RFC post).
 It converts all iommu drivers to use the new registration interface and
 completly removes the register_iommu interface.
 
 Regards,
 
   Joerg
 
 Diffstat:
 
  arch/ia64/kvm/kvm-ia64.c   |3 +-
  arch/x86/kvm/x86.c |3 +-
  drivers/base/bus.c |   16 ++
  drivers/iommu/amd_iommu.c  |2 +-
  drivers/iommu/intel-iommu.c|2 +-
  drivers/iommu/iommu.c  |   58 
 
  drivers/iommu/msm_iommu.c  |2 +-
  drivers/iommu/omap-iommu.c |2 +-
  drivers/media/video/omap3isp/isp.c |2 +-
  include/linux/device.h |9 +
  include/linux/iommu.h  |   21 +++--
  virt/kvm/iommu.c   |4 +-
  12 files changed, 86 insertions(+), 38 deletions(-)

So the overall work here makes for more code, right?  I fail to see the
benefit, what am I missing?

greg k-h
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] iommu/core: split mapping to page sizes as supported by the hardware

2011-09-07 Thread Ohad Ben-Cohen
When mapping a memory region, split it to page sizes as supported
by the iommu hardware. Always prefer bigger pages, when possible,
in order to reduce the TLB pressure.

The logic to do that is now added to the IOMMU core, so neither the iommu
drivers themselves nor users of the IOMMU API have to duplicate it.

This allows a more lenient granularity of mappings; traditionally the
IOMMU API took 'order' (of a page) as a mapping size, and directly let
the low level iommu drivers handle the mapping, but now that the IOMMU
core can split arbitrary memory regions into pages, we can remove this
limitation, so users don't have to split those regions by themselves.

Currently the supported page sizes are advertised once and they then
remain static. That works well for OMAP (and seemingly MSM too) but
it might not fly with intel's hardware, where the page size
capabilities seem to have the potential to be different between
several DMA remapping devices.

To simplify the migration, this patch retains the existing behavior
for the x86 IOMMU drivers, by having them advertise support for
all page sizes that are an order of 4KB.

OMAP and MSM iommu drivers are migrated to advertise support
for 4KB, 64KB, 1MB and 16MB (as supported by their hardware).

Mainline users of the IOMMU API (kvm and omap-iovmm) are adopted
to send the mapping size in bytes instead of in page order.

Tested with OMAP3 and OMAP4. Compile tested on X86-64.

Signed-off-by: Ohad Ben-Cohen o...@wizery.com
Cc: David Brown dav...@codeaurora.org
Cc: David Woodhouse dw...@infradead.org
Cc: Joerg Roedel joerg.roe...@amd.com
Cc: Stepan Moskovchenko step...@codeaurora.org
Cc: Hiroshi DOYU hiroshi.d...@nokia.com
Cc: Laurent Pinchart laurent.pinch...@ideasonboard.com
Cc: kvm@vger.kernel.org
---
 drivers/iommu/amd_iommu.c   |   20 ++-
 drivers/iommu/intel-iommu.c |   20 ++-
 drivers/iommu/iommu.c   |  129 +++
 drivers/iommu/msm_iommu.c   |8 ++-
 drivers/iommu/omap-iommu.c  |6 ++-
 drivers/iommu/omap-iovmm.c  |   12 +---
 include/linux/iommu.h   |7 +-
 virt/kvm/iommu.c|4 +-
 8 files changed, 176 insertions(+), 30 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index a14f8dc..5cdfa91 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2488,12 +2488,30 @@ static unsigned device_dma_ops_init(void)
 }
 
 /*
+ * This bitmap is used to advertise the page sizes our hardware support
+ * to the IOMMU core, which will then use this information to split
+ * physically contiguous memory regions it is mapping into page sizes
+ * that we support.
+ *
+ * Traditionally the IOMMU core just handed us the mappings directly,
+ * after making sure the size is an order of a 4KB page and that the
+ * mapping has natural alignment.
+ *
+ * To retain this behavior, we currently advertise that we support
+ * all page sizes that are an order of 4KB.
+ *
+ * If at some point we'd like to utilize the IOMMU core's new behavior,
+ * we could change this to advertise the real page sizes we support.
+ */
+static unsigned long amd_iommu_pgsizes = ~0xFFFUL;
+
+/*
  * The function which clues the AMD IOMMU driver into dma_ops.
  */
 
 void __init amd_iommu_init_api(void)
 {
-   register_iommu(amd_iommu_ops);
+   register_iommu(amd_iommu_ops, amd_iommu_pgsizes, BITS_PER_LONG);
 }
 
 int __init amd_iommu_init_dma_ops(void)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c621c98..a8c91a6 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3426,6 +3426,24 @@ static struct notifier_block device_nb = {
.notifier_call = device_notifier,
 };
 
+/*
+ * This bitmap is used to advertise the page sizes our hardware support
+ * to the IOMMU core, which will then use this information to split
+ * physically contiguous memory regions it is mapping into page sizes
+ * that we support.
+ *
+ * Traditionally the IOMMU core just handed us the mappings directly,
+ * after making sure the size is an order of a 4KB page and that the
+ * mapping has natural alignment.
+ *
+ * To retain this behavior, we currently advertise that we support
+ * all page sizes that are an order of 4KB.
+ *
+ * If at some point we'd like to utilize the IOMMU core's new behavior,
+ * we could change this to advertise the real page sizes we support.
+ */
+static unsigned long intel_iommu_pgsizes = ~0xFFFUL;
+
 int __init intel_iommu_init(void)
 {
int ret = 0;
@@ -3486,7 +3504,7 @@ int __init intel_iommu_init(void)
 
init_iommu_pm_ops();
 
-   register_iommu(intel_iommu_ops);
+   register_iommu(intel_iommu_ops, intel_iommu_pgsizes, BITS_PER_LONG);
 
bus_register_notifier(pci_bus_type, device_nb);
 
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index c68ff29..e07ea03 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -16,6 +16,8 @@
  * Foundation, Inc., 59 Temple Place, 

Re: [RFC PATCH 4/5] VFIO: Add PCI device support

2011-09-07 Thread Konrad Rzeszutek Wilk
On Thu, Sep 01, 2011 at 01:50:50PM -0600, Alex Williamson wrote:
 Signed-off-by: Alex Williamson alex.william...@redhat.com
 ---
 
  drivers/vfio/Kconfig|7 ++
  drivers/vfio/Makefile   |1 
  drivers/vfio/vfio_main.c|   10 +++
  drivers/vfio/vfio_pci.c |  124 
 +++
  drivers/vfio/vfio_private.h |5 ++
  5 files changed, 147 insertions(+), 0 deletions(-)
  create mode 100644 drivers/vfio/vfio_pci.c
 
 diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
 index a150521..b17bdbd 100644
 --- a/drivers/vfio/Kconfig
 +++ b/drivers/vfio/Kconfig
 @@ -3,3 +3,10 @@ menuconfig VFIO
   depends on IOMMU_API
   help
 If you don't know what to do here, say N.
 +
 +menuconfig VFIO_PCI
 + bool VFIO support for PCI devices
 + depends on VFIO  PCI
 + default y if X86

Hahah.. And Linus is going to tear your behind for that.

Default should be 'n'
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/13] xen/pvticketlock: disable interrupts while blocking

2011-09-07 Thread Jeremy Fitzhardinge
On 09/07/2011 10:41 AM, Avi Kivity wrote:
 Hm, I'm interested to know what you're thinking in more detail.  Can you
 leave an NMI pending before you block in the same way you can with
 sti;halt with normal interrupts?


 Nope.  But you can do

if (regs-rip in critical section)
regs-rip = after_halt;

 and effectively emulate it.  The critical section is something like

 critical_section_start:
 if (woken_up)
 goto critical_section_end;
 hlt
 critical_section_end:

Hm.  It's a pity you have to deliver an actual interrupt to implement
the kick though.


 I was thinking you might want to do something with monitor/mwait to
 implement the blocking/kick ops. (Handwave)


 monitor/mwait are incredibly expensive to virtualize since they
 require write-protecting a page, IPIs flying everywhere and flushing
 tlbs, not to mention my lovely hugepages being broken up mercilessly.

Or what about a futex-like hypercall?

J

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/10] Driver core: Add iommu_ops to bus_type

2011-09-07 Thread Joerg Roedel
Hi Greg,

the bus_set_iommu() function will be called by the IOMMU driver. There
can be different drivers for the same bus, depending on the hardware. On
PCI for example, there can be the Intel or the AMD IOMMU driver that
implement the iommu-api and that register for that bus.

On Wed, Sep 07, 2011 at 11:47:50AM -0700, Greg KH wrote:
  +#ifdef CONFIG_IOMMU_API
  +int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops)
  +{
  +   if (bus-iommu_ops != NULL)
  +   return -EBUSY;
 
 Busy?

Yes, it signals to the IOMMU driver that another driver has already
registered for that bus. In the previous register_iommu() interface this
was just a BUG(), but I think returning an error to the caller is
better. It can be turned back into a BUG() if it is considered better,
though.

  +
  +   bus-iommu_ops = ops;
  +
  +   /* Do IOMMU specific setup for this bus-type */
  +   iommu_bus_init(bus, ops);
  +
  +   return 0;
  +}
  +EXPORT_SYMBOL_GPL(bus_set_iommu);
 
 I don't understand what this function is for, and who would call it.

It is called by the IOMMU driver.

 Please provide kerneldoc that explains this.

Will do.

  @@ -67,6 +68,9 @@ extern void bus_remove_file(struct bus_type *, struct 
  bus_attribute *);
* @resume:Called to bring a device on this bus out of sleep mode.
* @pm:Power management operations of this bus, callback the 
  specific
* device driver's pm-ops.
  + * @iommu_ops   IOMMU specific operations for this bus, used to attach 
  IOMMU
  + *  driver implementations to a bus and allow the driver to do
  + *  bus-specific setup
 
 So why is this just not set by the bus itself, making the above function
 not needed at all?

The IOMMUs are usually devices on the bus itself, so they are
initialized after the bus is set up and the devices on it are
populated.  So the function can not be called on bus initialization
because the IOMMU is not ready at this point.

Regards,

Joerg

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/10] IOMMU: Make iommu_ops per-bus_type

2011-09-07 Thread Joerg Roedel
On Wed, Sep 07, 2011 at 11:48:40AM -0700, Greg KH wrote:
 On Wed, Sep 07, 2011 at 05:41:43PM +0200, Joerg Roedel wrote:
  Diffstat:
  
   arch/ia64/kvm/kvm-ia64.c   |3 +-
   arch/x86/kvm/x86.c |3 +-
   drivers/base/bus.c |   16 ++
   drivers/iommu/amd_iommu.c  |2 +-
   drivers/iommu/intel-iommu.c|2 +-
   drivers/iommu/iommu.c  |   58 
  
   drivers/iommu/msm_iommu.c  |2 +-
   drivers/iommu/omap-iommu.c |2 +-
   drivers/media/video/omap3isp/isp.c |2 +-
   include/linux/device.h |9 +
   include/linux/iommu.h  |   21 +++--
   virt/kvm/iommu.c   |4 +-
   12 files changed, 86 insertions(+), 38 deletions(-)
 
 So the overall work here makes for more code, right?  I fail to see the
 benefit, what am I missing?

Yes, the code that moves code out of the IOMMU drivers is not included
here. This is only the infrastructure for future code generalization in
the IOMMU driver. That is why the iommu_bus_init() function in patch 2
is still empty :)
For example, the IOMMU drivers on x86 implement device-notifier
callbacks to know when devices are added or removed to do iommu-specific
setup. The iommu drivers also scan over the devices on a bus to do
initialization. All this device-walking code and the notifiers can be
moved to generic IOMMU code leaving only the specific setup routines in
the iommu drivers.

Regards,

Joerg

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/10] Driver core: Add iommu_ops to bus_type

2011-09-07 Thread Greg KH
On Wed, Sep 07, 2011 at 09:19:19PM +0200, Joerg Roedel wrote:
 Hi Greg,
 
 the bus_set_iommu() function will be called by the IOMMU driver. There
 can be different drivers for the same bus, depending on the hardware. On
 PCI for example, there can be the Intel or the AMD IOMMU driver that
 implement the iommu-api and that register for that bus.

Why are you pushing this down into the driver core?  What other busses
becides PCI use/need this?

If you can have a different IOMMU driver on the same bus, then wouldn't
this be a per-device thing instead of a per-bus thing?


 On Wed, Sep 07, 2011 at 11:47:50AM -0700, Greg KH wrote:
   +#ifdef CONFIG_IOMMU_API
   +int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops)
   +{
   + if (bus-iommu_ops != NULL)
   + return -EBUSY;
  
  Busy?
 
 Yes, it signals to the IOMMU driver that another driver has already
 registered for that bus. In the previous register_iommu() interface this
 was just a BUG(), but I think returning an error to the caller is
 better. It can be turned back into a BUG() if it is considered better,
 though.

Can you ever have more than one IOMMU driver per bus?  If so, this seems
wrong (see above.)

   +
   + bus-iommu_ops = ops;
   +
   + /* Do IOMMU specific setup for this bus-type */
   + iommu_bus_init(bus, ops);
   +
   + return 0;
   +}
   +EXPORT_SYMBOL_GPL(bus_set_iommu);
  
  I don't understand what this function is for, and who would call it.
 
 It is called by the IOMMU driver.
 
  Please provide kerneldoc that explains this.
 
 Will do.
 
   @@ -67,6 +68,9 @@ extern void bus_remove_file(struct bus_type *, struct 
   bus_attribute *);
 * @resume:  Called to bring a device on this bus out of sleep mode.
 * @pm:  Power management operations of this bus, callback the 
   specific
 *   device driver's pm-ops.
   + * @iommu_ops   IOMMU specific operations for this bus, used to attach 
   IOMMU
   + *  driver implementations to a bus and allow the driver to 
   do
   + *  bus-specific setup
  
  So why is this just not set by the bus itself, making the above function
  not needed at all?
 
 The IOMMUs are usually devices on the bus itself, so they are
 initialized after the bus is set up and the devices on it are
 populated.  So the function can not be called on bus initialization
 because the IOMMU is not ready at this point.

Ok, that makes more sense, please state as much in the documentation.

thanks,

greg k-h
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/5] VFIO-NG group/device/iommu framework

2011-09-07 Thread Alexander Graf

On 01.09.2011, at 21:50, Alex Williamson wrote:

 Trying to move beyond talking about how VFIO should work to
 re-writing the code.  This is pre-alpha, known broken, will
 probably crash your system but it illustrates some of how
 I see groups, devices, and iommus interacting.  This is just
 the framework, no code to actually support user space drivers
 or device assignment yet.
 
 The iommu portions are still using the FIXME PCI specific
 hooks.  Once Joerg gets some buy-in on his bus specific iommu
 patches, we can move to that.
 
 The group management is more complicated than I'd like and
 you can get groups into a bad state by killing the test program
 with devices/iommus open.  The locking is overly simplistic.
 But, it's a start.  Please make constructive comments and
 suggestions.  Patches based on v3.0.  Thanks,

Looks pretty reasonable to me so far, but I guess we only know for sure once we 
have non-PCI implemented and working with this scheme as well.
Btw I couldn't find the PCI BAR regions mmaps and general config space 
exposure. Where has that gone?


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/10] Driver core: Add iommu_ops to bus_type

2011-09-07 Thread Don Dutile

On 09/07/2011 03:44 PM, Greg KH wrote:

On Wed, Sep 07, 2011 at 09:19:19PM +0200, Joerg Roedel wrote:

Hi Greg,

the bus_set_iommu() function will be called by the IOMMU driver. There
can be different drivers for the same bus, depending on the hardware. On
PCI for example, there can be the Intel or the AMD IOMMU driver that
implement the iommu-api and that register for that bus.


Why are you pushing this down into the driver core?  What other busses
becides PCI use/need this?

If you can have a different IOMMU driver on the same bus, then wouldn't
this be a per-device thing instead of a per-bus thing?


And given the dma api takes a struct device *, it'd be more efficient
to be tied into the device structure.
Device structure would get iommu ops set by parent(bus);
if a bus (segment) doesn't provide a unique/different/layered IOMMU
then the parent bus, it inherits the parent's iommu-ops.
setting the iommu-ops in the root bus struct, seeds the iommu-ops
for the (PCI) tree.

For intel  amd IOMMUs, in early pci (bios,root?) init, you would
seed the pci root busses with appropriate IOMMU support (based on
dmar/drhd  ivrs/ivhd data structures, respectively), and
then modify the PCI code to do the inheritence (PPB code inherits
unless specific device driver for a given PPB vid-did loads a
different iommu-ops for that segment/branch).

This would enable different types of IOMMUs for different devices
(or PCI segments, or branches of PCI trees) that are designed for
different tasks -- simple IOMMUs for legacy devices; complicated, 
io-page-faulting
IOMMUs for plug-in, high-end devices on virtualizing servers for PCI (SRIOV) 
endpoints.

and as Greg indicates, is only relevant to PCI.
The catch is that dev* has to be looked at for iommu support for dma-ops.




On Wed, Sep 07, 2011 at 11:47:50AM -0700, Greg KH wrote:

+#ifdef CONFIG_IOMMU_API
+int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops)
+{
+   if (bus-iommu_ops != NULL)
+   return -EBUSY;


Busy?


Yes, it signals to the IOMMU driver that another driver has already
registered for that bus. In the previous register_iommu() interface this
was just a BUG(), but I think returning an error to the caller is
better. It can be turned back into a BUG() if it is considered better,
though.


Can you ever have more than one IOMMU driver per bus?  If so, this seems
wrong (see above.)


+
+   bus-iommu_ops = ops;
+
+   /* Do IOMMU specific setup for this bus-type */
+   iommu_bus_init(bus, ops);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(bus_set_iommu);


I don't understand what this function is for, and who would call it.


It is called by the IOMMU driver.


Please provide kerneldoc that explains this.


Will do.


@@ -67,6 +68,9 @@ extern void bus_remove_file(struct bus_type *, struct 
bus_attribute *);
   * @resume:   Called to bring a device on this bus out of sleep mode.
   * @pm:   Power management operations of this bus, callback the 
specific
   *device driver's pm-ops.
+ * @iommu_ops   IOMMU specific operations for this bus, used to attach IOMMU
+ *  driver implementations to a bus and allow the driver to do
+ *  bus-specific setup


So why is this just not set by the bus itself, making the above function
not needed at all?


The IOMMUs are usually devices on the bus itself, so they are
initialized after the bus is set up and the devices on it are
populated.  So the function can not be called on bus initialization
because the IOMMU is not ready at this point.


Ok, that makes more sense, please state as much in the documentation.

thanks,

greg k-h
___
iommu mailing list
io...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/iommu


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Virtual drives performance

2011-09-07 Thread Virtbie

On 09/07/11 00:25, TooMeeK wrote:


Next, I've tried following combinations with virt-manager 0.8.4 (from
XML of VM):
1.on Debian VM with virtio drivers for both storage and NIC:
disk type='block' device='disk'
source dev='/dev/md3'/
target dev='vdb' bus='virtio'/
partition type used in guest: EXT4
result: poor performance, 9-10MB/s sequential copy via SMB
2.on Debian VM with virtio drivers:
disk type='block' device='disk' cache='writeback'
source dev='/dev/md3'/
target dev='vdb' bus='virtio'/
partition type used: EXT4
result: poor performance, 10-15MB/s sequential copy via SMB
3.Direct attached partition to FreeBSD VM without virtio support
(e1000 NIC and SCSI disk):
disk type='block' device='disk' cache='writeback'


Shouldn't you have used cache=none when using virtio?
http://www.linux-kvm.org/page/Tuning_KVM
how is that performance with cache none?

Also note that when you don't specify it, I think the default is not 
none. Maybe it is writethrough, I don't remember.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Virtual drives performance

2011-09-07 Thread Asdo

On 09/07/11 00:25, TooMeeK wrote:


Next, I've tried following combinations with virt-manager 0.8.4 (from 
XML of VM):

1.on Debian VM with virtio drivers for both storage and NIC:
disk type='block' device='disk'
source dev='/dev/md3'/
target dev='vdb' bus='virtio'/
partition type used in guest: EXT4
result: poor performance, 9-10MB/s sequential copy via SMB
2.on Debian VM with virtio drivers:
disk type='block' device='disk' cache='writeback'
source dev='/dev/md3'/
target dev='vdb' bus='virtio'/
partition type used: EXT4
result: poor performance, 10-15MB/s sequential copy via SMB
3.Direct attached partition to FreeBSD VM without virtio support 
(e1000 NIC and SCSI disk):

disk type='block' device='disk' cache='writeback'


Shouldn't you have used cache=none when using virtio?
http://www.linux-kvm.org/page/Tuning_KVM
how is that performance with cache none?

Also note that when you don't specify it, I think the default is not 
none. Maybe it is writethrough, I don't remember.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


buildbot failure in qemu-kvm on default_i386_debian_5_0

2011-09-07 Thread qemu-kvm
The Buildbot has detected a new failure on builder default_i386_debian_5_0 
while building qemu-kvm.
Full details are available at:
 
http://buildbot.b1-systems.de/qemu-kvm/builders/default_i386_debian_5_0/builds/959

Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/

Buildslave for this Build: b1_qemu_kvm_2

Build Reason: The Nightly scheduler named 'nightly_default' triggered this build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

buildbot failure in qemu-kvm on default_x86_64_out_of_tree

2011-09-07 Thread qemu-kvm
The Buildbot has detected a new failure on builder default_x86_64_out_of_tree 
while building qemu-kvm.
Full details are available at:
 
http://buildbot.b1-systems.de/qemu-kvm/builders/default_x86_64_out_of_tree/builds/898

Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/

Buildslave for this Build: b1_qemu_kvm_1

Build Reason: The Nightly scheduler named 'nightly_default' triggered this build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

buildbot failure in qemu-kvm on default_x86_64_debian_5_0

2011-09-07 Thread qemu-kvm
The Buildbot has detected a new failure on builder default_x86_64_debian_5_0 
while building qemu-kvm.
Full details are available at:
 
http://buildbot.b1-systems.de/qemu-kvm/builders/default_x86_64_debian_5_0/builds/957

Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/

Buildslave for this Build: b1_qemu_kvm_1

Build Reason: The Nightly scheduler named 'nightly_default' triggered this build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in qemu-kvm on default_i386_out_of_tree

2011-09-07 Thread qemu-kvm
The Buildbot has detected a new failure on builder default_i386_out_of_tree 
while building qemu-kvm.
Full details are available at:
 
http://buildbot.b1-systems.de/qemu-kvm/builders/default_i386_out_of_tree/builds/896

Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/

Buildslave for this Build: b1_qemu_kvm_2

Build Reason: The Nightly scheduler named 'nightly_default' triggered this build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in qemu-kvm on disable_kvm_x86_64_debian_5_0

2011-09-07 Thread qemu-kvm
The Buildbot has detected a new failure on builder 
disable_kvm_x86_64_debian_5_0 while building qemu-kvm.
Full details are available at:
 
http://buildbot.b1-systems.de/qemu-kvm/builders/disable_kvm_x86_64_debian_5_0/builds/947

Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/

Buildslave for this Build: b1_qemu_kvm_1

Build Reason: The Nightly scheduler named 'nightly_disable_kvm' triggered this 
build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in qemu-kvm on disable_kvm_i386_debian_5_0

2011-09-07 Thread qemu-kvm
The Buildbot has detected a new failure on builder disable_kvm_i386_debian_5_0 
while building qemu-kvm.
Full details are available at:
 
http://buildbot.b1-systems.de/qemu-kvm/builders/disable_kvm_i386_debian_5_0/builds/948

Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/

Buildslave for this Build: b1_qemu_kvm_2

Build Reason: The Nightly scheduler named 'nightly_disable_kvm' triggered this 
build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

buildbot failure in qemu-kvm on disable_kvm_i386_out_of_tree

2011-09-07 Thread qemu-kvm
The Buildbot has detected a new failure on builder disable_kvm_i386_out_of_tree 
while building qemu-kvm.
Full details are available at:
 
http://buildbot.b1-systems.de/qemu-kvm/builders/disable_kvm_i386_out_of_tree/builds/896

Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/

Buildslave for this Build: b1_qemu_kvm_2

Build Reason: The Nightly scheduler named 'nightly_disable_kvm' triggered this 
build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in qemu-kvm on disable_kvm_x86_64_out_of_tree

2011-09-07 Thread qemu-kvm
The Buildbot has detected a new failure on builder 
disable_kvm_x86_64_out_of_tree while building qemu-kvm.
Full details are available at:
 
http://buildbot.b1-systems.de/qemu-kvm/builders/disable_kvm_x86_64_out_of_tree/builds/896

Buildbot URL: http://buildbot.b1-systems.de/qemu-kvm/

Buildslave for this Build: b1_qemu_kvm_1

Build Reason: The Nightly scheduler named 'nightly_disable_kvm' triggered this 
build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

buildbot failure in kvm on i386

2011-09-07 Thread kvm
The Buildbot has detected a new failure on builder i386 while building kvm.
Full details are available at:
 http://buildbot.b1-systems.de/kvm/builders/i386/builds/294

Buildbot URL: http://buildbot.b1-systems.de/kvm/

Buildslave for this Build: b1_kvm_1

Build Reason: The Nightly scheduler named 'nightly_master' triggered this build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in kvm on ppc64

2011-09-07 Thread kvm
The Buildbot has detected a new failure on builder ppc64 while building kvm.
Full details are available at:
 http://buildbot.b1-systems.de/kvm/builders/ppc64/builds/298

Buildbot URL: http://buildbot.b1-systems.de/kvm/

Buildslave for this Build: b1_kvm_1

Build Reason: The Nightly scheduler named 'nightly_master' triggered this build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

buildbot failure in kvm on s390

2011-09-07 Thread kvm
The Buildbot has detected a new failure on builder s390 while building kvm.
Full details are available at:
 http://buildbot.b1-systems.de/kvm/builders/s390/builds/294

Buildbot URL: http://buildbot.b1-systems.de/kvm/

Buildslave for this Build: b1_kvm_1

Build Reason: The Nightly scheduler named 'nightly_master' triggered this build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in kvm on x86_64

2011-09-07 Thread kvm
The Buildbot has detected a new failure on builder x86_64 while building kvm.
Full details are available at:
 http://buildbot.b1-systems.de/kvm/builders/x86_64/builds/305

Buildbot URL: http://buildbot.b1-systems.de/kvm/

Buildslave for this Build: b1_kvm_1

Build Reason: The Nightly scheduler named 'nightly_master' triggered this build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in kvm on ppc44x

2011-09-07 Thread kvm
The Buildbot has detected a new failure on builder ppc44x while building kvm.
Full details are available at:
 http://buildbot.b1-systems.de/kvm/builders/ppc44x/builds/301

Buildbot URL: http://buildbot.b1-systems.de/kvm/

Buildslave for this Build: b1_kvm_1

Build Reason: The Nightly scheduler named 'nightly_master' triggered this build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in kvm on next-x86_64

2011-09-07 Thread kvm
The Buildbot has detected a new failure on builder next-x86_64 while building 
kvm.
Full details are available at:
 http://buildbot.b1-systems.de/kvm/builders/next-x86_64/builds/283

Buildbot URL: http://buildbot.b1-systems.de/kvm/

Buildslave for this Build: b1_kvm_1

Build Reason: The Nightly scheduler named 'nightly_next' triggered this build
Build Source Stamp: [branch next] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in kvm on next-ppc44x

2011-09-07 Thread kvm
The Buildbot has detected a new failure on builder next-ppc44x while building 
kvm.
Full details are available at:
 http://buildbot.b1-systems.de/kvm/builders/next-ppc44x/builds/283

Buildbot URL: http://buildbot.b1-systems.de/kvm/

Buildslave for this Build: b1_kvm_1

Build Reason: The Nightly scheduler named 'nightly_next' triggered this build
Build Source Stamp: [branch next] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in kvm on next-s390

2011-09-07 Thread kvm
The Buildbot has detected a new failure on builder next-s390 while building kvm.
Full details are available at:
 http://buildbot.b1-systems.de/kvm/builders/next-s390/builds/285

Buildbot URL: http://buildbot.b1-systems.de/kvm/

Buildslave for this Build: b1_kvm_1

Build Reason: The Nightly scheduler named 'nightly_next' triggered this build
Build Source Stamp: [branch next] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in kvm on next-i386

2011-09-07 Thread kvm
The Buildbot has detected a new failure on builder next-i386 while building kvm.
Full details are available at:
 http://buildbot.b1-systems.de/kvm/builders/next-i386/builds/283

Buildbot URL: http://buildbot.b1-systems.de/kvm/

Buildslave for this Build: b1_kvm_1

Build Reason: The Nightly scheduler named 'nightly_next' triggered this build
Build Source Stamp: [branch next] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot



buildbot failure in kvm on next-ppc64

2011-09-07 Thread kvm
The Buildbot has detected a new failure on builder next-ppc64 while building 
kvm.
Full details are available at:
 http://buildbot.b1-systems.de/kvm/builders/next-ppc64/builds/284

Buildbot URL: http://buildbot.b1-systems.de/kvm/

Buildslave for this Build: b1_kvm_1

Build Reason: The Nightly scheduler named 'nightly_next' triggered this build
Build Source Stamp: [branch next] HEAD
Blamelist: 

BUILD FAILED: failed git

sincerely,
 -The Buildbot

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode

2011-09-07 Thread Roopa Prabhu
On 9/7/11 5:34 AM, Michael S. Tsirkin m...@redhat.com wrote:

 On Tue, Sep 06, 2011 at 03:35:40PM -0700, Roopa Prabhu wrote:
 This patch is an attempt at providing address filtering support for macvtap
 devices in PASSTHRU mode. Its still a work in progress.
 Briefly tested for basic functionality. Wanted to get some feedback on the
 direction before proceeding.
 
 
 Good work, thanks.
 

Thanks.

 I have hopefully CC'ed all concerned people.
 
 kvm crowd might also be interested.
 Try using ./scripts/get_maintainer.pl as well.
 
Thanks for the tip. Expanded CC list a bit more.

 PASSTHRU mode today sets the lowerdev in promiscuous mode. In PASSTHRU mode
 there is a 1-1 mapping between macvtap device and physical nic or VF. And all
 filtering is done in lowerdev hw. The lowerdev does not need to be in
 promiscuous mode as long as the guest filters are passed down to the lowerdev.
 This patch tries to remove the need for putting the lowerdev in promiscuous
 mode. 
 I have also referred to the thread below where TUNSETTXFILTER was mentioned
 in 
 this context: 
  http://patchwork.ozlabs.org/patch/69297/
 
 This patch basically passes the addresses got by TUNSETTXFILTER to macvlan
 lowerdev.
 
 I have looked at previous work and discussions on this for qemu-kvm
 by Michael Tsirkin, Alex Williamson and Dragos Tatulea
 http://patchwork.ozlabs.org/patch/78595/
 http://patchwork.ozlabs.org/patch/47160/
 https://patchwork.kernel.org/patch/474481/
 
 Redhat bugzilla by Michael Tsirkin:
 https://bugzilla.redhat.com/show_bug.cgi?id=655013
 
 I used Michael's qemu-kvm patch for testing the changes with KVM
 
 I would like to cover both MAC and vlan filtering in this work.
 
 Open Questions/Issues:
 - There is a need for vlan filtering to complete the patch. It will require
   a new tap ioctl cmd for vlans.
   Some ideas on this are:
 
   a) TUNSETVLANFILTER: This will entail we send the whole vlan bitmap filter
 (similar to tun_filter for addresses). Passing the vlan id's to lower
 device will mean going thru the whole list of vlans every time.
 
   OR
 
   b) TUNSETVLAN with vlan id and flag to set/unset
 
   Does option 'b' sound ok ?
 
 - In this implementation we make the macvlan address list same as the address
   list that came in the filter with TUNSETTXFILTER. This will not cover cases
   where the macvlan device needs to have other addresses that are not
   necessarily in the filter. Is this a problem ?
 
 What cases do you have in mind?
 
This patch targets only macvlan PASSTHRU mode and for PASSTHRU mode I don't
see a problem with uc/mc address list being the same in all the stacked
netdevs in the path. I called that out above to make sure I was not missing
any case in PASSTHRU mode where this might be invalid. Otherwise I don't see
a problem in the simple PASSTHRU use case this patch supports.

 - The patch currently only supports passing of IFF_PROMISC and IFF_MULTICAST
 filter flags to lowerdev
 
 This patch series implements the following
 01/3 - macvlan: Add support for unicast filtering in macvlan
 02/3 - macvlan: Add function to set addr filter on lower device in passthru
 mode
 03/3 - macvtap: Add support for TUNSETTXFILTER
 
 Please comment. Thanks.
 
 Signed-off-by: Roopa Prabhu ropra...@cisco.com
 Signed-off-by: Christian Benvenuti be...@cisco.com
 Signed-off-by: David Wang dwa...@cisco.com
 
 The security isn't lower than with promisc, so I don't see
 a problem with this as such.
 
 There are more features we'll want down the road though,
 so let's see whether the interface will be able to
 satisfy them in a backwards compatible way before we
 set it in stone. Here's what I came up with:
 
 How will the filtering table be partitioned within guests?

Since this patch supports macvlan PASSTHRU mode only, in which the lower
device has 1-1 mapping to the guest nic, it does not require any
partitioning of filtering table within guests. Unless I misunderstood
something. 

If the lower device were being shared by multiple guest network interfaces
(non PASSTHRU mode), only then we will need to maintain separate filter
tables for each guest network interface in macvlan and forward the pkt to
respective guest interface after a filter lookup. This could affect
performance too I think.

I chose to support PASSTHRU Mode only at first because its simpler and all
code additions are in control path only.

 
 A way to limit what the guest can do would also be useful.
 How can this be done? selinux?

I vaguely remember a thread on the same context.. had a suggestion to
maintain pre-approved address lists and allow guest filter registration of
only those addresses for security. This seemed reasonable. Plus the ability
to support additional address registration from guest could be made
configurable (One of your ideas again from prior work).

I am not an selinux expert, but I am thinking we can use it to only allow or
disallow access or operations to the macvtap device. (?). I will check more
on this.

 
 Any 

RE: [PATCH 5/5] KVM: PPC: booke: Improve timer register emulation

2011-09-07 Thread Liu Yu-B13201
 

 -Original Message-
 From: kvm-ppc-ow...@vger.kernel.org 
 [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf
 Sent: Wednesday, September 07, 2011 3:06 AM
 To: Liu Yu-B13201
 Cc: Wood Scott-B07421; kvm-ppc@vger.kernel.org
 Subject: Re: [PATCH 5/5] KVM: PPC: booke: Improve timer 
 register emulation
 
 
 
 Am 06.09.2011 um 10:04 schrieb Alexander Graf ag...@suse.de:
 
  
  On 06.09.2011, at 05:17, Liu Yu-B13201 wrote:
  
  
  
  -Original Message-
  From: kvm-ppc-ow...@vger.kernel.org 
  [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf
  Sent: Tuesday, September 06, 2011 6:45 AM
  To: Wood Scott-B07421
  Cc: kvm-ppc@vger.kernel.org
  Subject: Re: [PATCH 5/5] KVM: PPC: booke: Improve timer 
  register emulation
  
  
  On 27.08.2011, at 01:31, Scott Wood wrote:
  
  From: Liu Yu yu@freescale.com
  
  Decrementers are now properly driven by TCR/TSR, and the guest
  has full read/write access to these registers.
  
  The decrementer keeps ticking (and setting the TSR bit) 
  regardless of
  whether the interrupts are enabled with TCR.
  
  The decrementer stops at zero, rather than going negative.
  
  Signed-off-by: Liu Yu yu@freescale.com
  [scott: added dequeue in kvmppc_booke_irqprio_deliver, and 
  dec stop-at-zero]
  Signed-off-by: Scott Wood scottw...@freescale.com
  ---
  arch/powerpc/include/asm/kvm_host.h |2 +-
  arch/powerpc/include/asm/kvm_ppc.h  |   11 +
  arch/powerpc/kvm/book3s.c   |8 
  arch/powerpc/kvm/booke.c|   80 
  +++
  arch/powerpc/kvm/booke.h|4 ++
  arch/powerpc/kvm/booke_emulate.c|   11 -
  arch/powerpc/kvm/emulate.c  |   45 ---
  arch/powerpc/kvm/powerpc.c  |   20 +
  8 files changed, 114 insertions(+), 67 deletions(-)
  
  diff --git a/arch/powerpc/include/asm/kvm_host.h 
  b/arch/powerpc/include/asm/kvm_host.h
  index 3305af4..ea08c79 100644
  --- a/arch/powerpc/include/asm/kvm_host.h
  +++ b/arch/powerpc/include/asm/kvm_host.h
  @@ -334,7 +334,7 @@ struct kvm_vcpu_arch {
 u32 tbl;
 u32 tbu;
 u32 tcr;
  -u32 tsr;
  +ulong tsr; /* we need to perform set/clr_bits() which 
  requires ulong */
 u32 ivor[64];
 ulong ivpr;
 u32 pvr;
  diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
  b/arch/powerpc/include/asm/kvm_ppc.h
  index bdaa6c8..ddaa615 100644
  --- a/arch/powerpc/include/asm/kvm_ppc.h
  +++ b/arch/powerpc/include/asm/kvm_ppc.h
  @@ -66,6 +66,7 @@ extern int 
  kvmppc_emulate_instruction(struct kvm_run *run,
  extern int kvmppc_emulate_mmio(struct kvm_run *run, struct 
  kvm_vcpu *vcpu);
  extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
  extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
  +extern void kvmppc_decrementer_func(unsigned long data);
  
  /* Core-specific hooks */
  
  @@ -197,4 +198,14 @@ int kvm_vcpu_ioctl_config_tlb(struct 
  kvm_vcpu *vcpu,
  int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
  struct kvm_dirty_tlb *cfg);
  
  +static inline void kvmppc_wakeup_vcpu(struct kvm_vcpu *vcpu)
  +{
  +if (waitqueue_active(vcpu->wq)) {
  +wake_up_interruptible(vcpu->wq);
  +vcpu->stat.halt_wakeup++;
  +} else if (vcpu->cpu != -1) {
  +smp_send_reschedule(vcpu->cpu);
  +}
  +}
  +
  #endif /* __POWERPC_KVM_PPC_H__ */
  diff --git a/arch/powerpc/kvm/book3s.c 
 b/arch/powerpc/kvm/book3s.c
  index f68a34d..b057856 100644
  --- a/arch/powerpc/kvm/book3s.c
  +++ b/arch/powerpc/kvm/book3s.c
  @@ -514,3 +514,11 @@ out:
 mutex_unlock(kvm-slots_lock);
 return r;
  }
  +
  +void kvmppc_decrementer_func(unsigned long data)
  +{
  +struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
  +
  +kvmppc_core_queue_dec(vcpu);
  +kvmppc_wakeup_vcpu(vcpu);
  +}
  diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
  index 0ed62c1..502f9ff 100644
  --- a/arch/powerpc/kvm/booke.c
  +++ b/arch/powerpc/kvm/booke.c
  @@ -205,7 +205,8 @@ void 
  kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
  static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
unsigned int priority)
  {
  -int allowed = 0;
  +int allowed = 1;
  +int dequeue = 0;
 ulong uninitialized_var(msr_mask);
 bool update_esr = false, update_dear = false;
 ulong crit_raw = vcpu-arch.shared-critical;
  @@ -258,10 +259,15 @@ static int 
  kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 allowed = vcpu-arch.shared-msr  MSR_ME;
 msr_mask = 0;
 break;
  -case BOOKE_IRQPRIO_EXTERNAL:
 case BOOKE_IRQPRIO_DECREMENTER:
  +if (!(vcpu-arch.tcr  TCR_DIE)) {
  +allowed = 0;
  +dequeue = 1;
  +}
  +/* fall through */
  +case BOOKE_IRQPRIO_EXTERNAL:
 case BOOKE_IRQPRIO_FIT:
  -allowed = vcpu-arch.shared-msr  MSR_EE;
  +allowed = allowed  vcpu-arch.shared-msr