RE: [PATCH v3 5/8] KVM: Recalculate destination vcpu map

2013-03-20 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-03-19:
 On Mon, Mar 18, 2013 at 08:47:19PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 Update destination vcpu map when ioapic entry or apic(id, ldr, dfr) is 
 changed
 
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  virt/kvm/ioapic.c |   40 ++--
  1 files changed, 38 insertions(+), 2 deletions(-)
 diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
 index 4296116..329efe1 100644
 --- a/virt/kvm/ioapic.c
 +++ b/virt/kvm/ioapic.c
 @@ -87,6 +87,38 @@ static unsigned long ioapic_read_indirect(struct
 kvm_ioapic *ioapic,
  return result;
  }
 +#ifdef CONFIG_X86
 +static void rtc_irq_get_dest_vcpu(struct kvm_ioapic *ioapic, int irq)
 +{
 +union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
 +struct kvm_lapic_irq irqe;
 +
 +if (irq != 8 || entry->fields.mask)
 +return;
 +
 +spin_lock(&ioapic->lock);
 How does this not deadlock? This is called from kvm_scan_ioapic_entry()
 and kvm_scan_ioapic_entry() is called from ioapic_write_indirect() with
I removed the lock before calling kvm_scan_ioapic_entry() in
ioapic_write_indirect().

 the lock already taken. You should handle that the same way we handle
 eoibitmap recalculation: signal vcpu and calculate there.
Sure.

 +irqe.dest_id = entry->fields.dest_id;
 +irqe.vector = entry->fields.vector;
 +irqe.dest_mode = entry->fields.dest_mode;
 +irqe.trig_mode = entry->fields.trig_mode;
 +irqe.delivery_mode = entry->fields.delivery_mode << 8;
 +irqe.level = 1;
 +irqe.shorthand = 0;
 +
 +bitmap_zero(ioapic->rtc_status.vcpu_map, KVM_MAX_VCPUS);
 +
 +kvm_get_dest_vcpu(ioapic->kvm, &irqe, ioapic->rtc_status.vcpu_map);
 +spin_unlock(&ioapic->lock);
 +}
 +
 +#else
 +
 +static void rtc_irq_get_dest_vcpu(struct kvm_ioapic *ioapic, int irq)
 +{
 +return;
 +}
 +#endif
 +
  static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
  {
 	union kvm_ioapic_redirect_entry *pent;
 @@ -147,9 +179,13 @@ void kvm_scan_ioapic_entry(struct kvm *kvm)
  {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 -if (!kvm_apic_vid_enabled(kvm) || !ioapic)
 +if (!ioapic)
  return;
 -kvm_make_update_eoibitmap_request(kvm);
 +
 +rtc_irq_get_dest_vcpu(ioapic, 8);
 +
 +if (kvm_apic_vid_enabled(kvm))
 +kvm_make_update_eoibitmap_request(kvm);
  }
  
  static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 --
 1.7.1
 
 --
   Gleb.


Best regards,
Yang
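
For readers following the exchange above: a minimal sketch of the
request-based recalculation Gleb suggests, modeled on the eoibitmap path
quoted in the patch. The request constant and helper names below are
illustrative assumptions, not symbols from the final series:

/* Post a request instead of recomputing under ioapic->lock. */
void kvm_scan_ioapic_entry(struct kvm *kvm)
{
	/* Runs without ioapic->lock held; vcpus do the real work. */
	kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}

/* In vcpu_enter_guest(), before entering the guest: */
	if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
		kvm_vcpu_scan_ioapic(vcpu); /* recompute dest maps in vcpu context */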




[PATCH V6 5/5] virtio-scsi: reset virtqueue affinity when doing cpu hotplug

2013-03-20 Thread Wanlong Gao
Add hot cpu notifier to reset the request virtqueue affinity
when doing cpu hotplug.

Cc: linux-s...@vger.kernel.org
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
Reviewed-by: Asias He as...@redhat.com
---
 drivers/scsi/virtio_scsi.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 13d7672..0ad9017 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -110,6 +110,9 @@ struct virtio_scsi {
/* if the affinity hint is set for virtqueues */
bool affinity_hint_set;
 
+   /* CPU hotplug notifier */
+   struct notifier_block nb;
+
struct virtio_scsi_vq ctrl_vq;
struct virtio_scsi_vq event_vq;
struct virtio_scsi_vq req_vqs[];
@@ -762,6 +765,23 @@ static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
put_online_cpus();
 }
 
+static int virtscsi_cpu_callback(struct notifier_block *nfb,
+unsigned long action, void *hcpu)
+{
+   struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
+   switch(action) {
+   case CPU_ONLINE:
+   case CPU_ONLINE_FROZEN:
+   case CPU_DEAD:
+   case CPU_DEAD_FROZEN:
+   __virtscsi_set_affinity(vscsi, true);
+   break;
+   default:
+   break;
+   }
+   return NOTIFY_OK;
+}
+
 static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
 struct virtqueue *vq)
 {
@@ -884,6 +904,13 @@ static int virtscsi_probe(struct virtio_device *vdev)
if (err)
goto virtscsi_init_failed;
 
+   vscsi->nb.notifier_call = virtscsi_cpu_callback;
+   err = register_hotcpu_notifier(&vscsi->nb);
+   if (err) {
+   pr_err("registering cpu notifier failed\n");
+   goto scsi_add_host_failed;
+   }
+
	cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
	shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
	shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
@@ -921,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev)
 
scsi_remove_host(shost);
 
+   unregister_hotcpu_notifier(&vscsi->nb);
+
virtscsi_remove_vqs(vdev);
scsi_host_put(shost);
 }
-- 
1.8.2.rc2



[PATCH V6 4/5] virtio-scsi: introduce multiqueue support

2013-03-20 Thread Wanlong Gao
From: Paolo Bonzini pbonz...@redhat.com

This patch adds queue steering to virtio-scsi.  When a target is sent
multiple requests, we always drive them to the same queue so that FIFO
processing order is kept.  However, if a target was idle, we can choose
a queue arbitrarily.  In this case the queue is chosen according to the
current VCPU, so the driver expects the number of request queues to be
equal to the number of VCPUs.  This makes it easy and fast to select
the queue, and also lets the driver optimize the IRQ affinity for the
virtqueues (each virtqueue's affinity is set to the CPU that owns
the queue).

The speedup comes from improving cache locality and giving CPU affinity
to the virtqueues, which is why this scheme was selected.  Assuming that
the thread that is sending requests to the device is I/O-bound, it is
likely to be sleeping at the time the ISR is executed, and thus executing
the ISR on the same processor that sent the requests is cheap.

However, the kernel will not execute the ISR on the best processor
unless you explicitly set the affinity.  This is because in practice
you will have many such I/O-bound processes and thus many otherwise
idle processors.  Then the kernel will execute the ISR on a random
processor, rather than the one that is sending requests to the device.

The alternative to per-CPU virtqueues is per-target virtqueues.  To
achieve the same locality, we could dynamically choose the virtqueue's
affinity based on the CPU of the last task that sent a request.  This
is less appealing because we do not set the affinity directly---we only
provide a hint to the irqbalanced running in userspace.  Dynamically
changing the affinity only works if the userspace applies the hint
fast enough.
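
As an aside for readers of the archive: the steering policy described
above boils down to a queue-pick helper. The sketch below is condensed
and hedged - virtscsi_pick_vq is the name used in the V5 review
discussion elsewhere in this digest, but the body here is an
illustration of the policy, not the exact code of this patch:

static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
					       struct virtio_scsi_target_state *tgt)
{
	struct virtio_scsi_vq *vq;
	unsigned long flags;
	u32 queue_num;

	spin_lock_irqsave(&tgt->tgt_lock, flags);
	if (atomic_inc_return(&tgt->reqs) > 1) {
		/* Target already busy: reuse its queue to keep FIFO order. */
		vq = tgt->req_vq;
	} else {
		/* Target was idle: bind it to the current CPU's queue. */
		queue_num = smp_processor_id();
		while (unlikely(queue_num >= vscsi->num_queues))
			queue_num -= vscsi->num_queues;
		tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
	}
	spin_unlock_irqrestore(&tgt->tgt_lock, flags);
	return vq;
}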

Cc: linux-s...@vger.kernel.org
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
Reviewed-by: Asias He as...@redhat.com
---
 drivers/scsi/virtio_scsi.c | 282 -
 1 file changed, 254 insertions(+), 28 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index dc2daec..13d7672 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -22,12 +22,14 @@
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
 #include <linux/virtio_scsi.h>
+#include <linux/cpu.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_cmnd.h>
 
 #define VIRTIO_SCSI_MEMPOOL_SZ 64
 #define VIRTIO_SCSI_EVENT_LEN 8
+#define VIRTIO_SCSI_VQ_BASE 2
 
 /* Command queue element */
 struct virtio_scsi_cmd {
@@ -59,22 +61,58 @@ struct virtio_scsi_vq {
struct virtqueue *vq;
 };
 
-/* Per-target queue state */
+/*
+ * Per-target queue state.
+ *
+ * This struct holds the data needed by the queue steering policy.  When a
+ * target is sent multiple requests, we need to drive them to the same queue so
+ * that FIFO processing order is kept.  However, if a target was idle, we can
+ * choose a queue arbitrarily.  In this case the queue is chosen according to
+ * the current VCPU, so the driver expects the number of request queues to be
+ * equal to the number of VCPUs.  This makes it easy and fast to select the
+ * queue, and also lets the driver optimize the IRQ affinity for the virtqueues
+ * (each virtqueue's affinity is set to the CPU that owns the queue).
+ *
+ * An interesting effect of this policy is that only writes to req_vq need to
+ * take the tgt_lock.  Reads can be done outside the lock because:
+ *
+ * - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1.
+ *   In that case, no other CPU is reading req_vq: even if they were in
+ *   virtscsi_queuecommand_multi, they would be spinning on tgt_lock.
+ *
+ * - reads of req_vq only occur when the target is not idle (reqs != 0).
+ *   A CPU that enters virtscsi_queuecommand_multi will not modify req_vq.
+ *
+ * Similarly, decrements of reqs are never concurrent with writes of req_vq.
+ * Thus they can happen outside the tgt_lock, provided of course we make reqs
+ * an atomic_t.
+ */
 struct virtio_scsi_target_state {
-   /* Never held at the same time as vq_lock.  */
+   /* This spinlock is never held at the same time as vq_lock. */
spinlock_t tgt_lock;
+
+   /* Count of outstanding requests. */
+   atomic_t reqs;
+
+   /* Currently active virtqueue for requests sent to this target. */
+   struct virtio_scsi_vq *req_vq;
 };
 
 /* Driver instance state */
 struct virtio_scsi {
struct virtio_device *vdev;
 
-   struct virtio_scsi_vq ctrl_vq;
-   struct virtio_scsi_vq event_vq;
-   struct virtio_scsi_vq req_vq;
-
/* Get some buffers ready for event vq */
struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
+
+   u32 num_queues;
+
+   /* If the affinity hint is set for virtqueues */
+   bool affinity_hint_set;
+
+   struct virtio_scsi_vq ctrl_vq;
+   struct virtio_scsi_vq event_vq;
+   struct virtio_scsi_vq req_vqs[];

[PATCH V6 3/5] virtio-scsi: push vq lock/unlock into virtscsi_vq_done

2013-03-20 Thread Wanlong Gao
From: Paolo Bonzini pbonz...@redhat.com

Avoid duplicated code in all of the callers.

Cc: linux-s...@vger.kernel.org
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
Reviewed-by: Asias He as...@redhat.com
---
 drivers/scsi/virtio_scsi.c | 22 +-
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index c23560c..dc2daec 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -165,28 +165,30 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
	sc->scsi_done(sc);
 }
 
-static void virtscsi_vq_done(struct virtio_scsi *vscsi, struct virtqueue *vq,
+static void virtscsi_vq_done(struct virtio_scsi *vscsi,
+struct virtio_scsi_vq *virtscsi_vq,
 void (*fn)(struct virtio_scsi *vscsi, void *buf))
 {
void *buf;
unsigned int len;
+   unsigned long flags;
+   struct virtqueue *vq = virtscsi_vq->vq;
 
+   spin_lock_irqsave(&virtscsi_vq->vq_lock, flags);
	do {
		virtqueue_disable_cb(vq);
		while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
			fn(vscsi, buf);
	} while (!virtqueue_enable_cb(vq));
+   spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags);
 }
 
 static void virtscsi_req_done(struct virtqueue *vq)
 {
	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
-   unsigned long flags;
 
-   spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags);
-   virtscsi_vq_done(vscsi, vq, virtscsi_complete_cmd);
-   spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags);
+   virtscsi_vq_done(vscsi, &vscsi->req_vq, virtscsi_complete_cmd);
 };
 
 static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
@@ -203,11 +205,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
 {
	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
	struct virtio_scsi *vscsi = shost_priv(sh);
-   unsigned long flags;
 
-   spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags);
-   virtscsi_vq_done(vscsi, vq, virtscsi_complete_free);
-   spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags);
+   virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free);
 };
 
 static int virtscsi_kick_event(struct virtio_scsi *vscsi,
@@ -342,11 +341,8 @@ static void virtscsi_event_done(struct virtqueue *vq)
 {
	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
	struct virtio_scsi *vscsi = shost_priv(sh);
-   unsigned long flags;
 
-   spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
-   virtscsi_vq_done(vscsi, vq, virtscsi_complete_event);
-   spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
+   virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event);
 };
 
 /**
-- 
1.8.2.rc2



[PATCH V6 0/5] virtio-scsi multiqueue

2013-03-20 Thread Wanlong Gao
This series implements virtio-scsi queue steering, which gives
performance improvements of up to 50% (measured both with QEMU and
tcm_vhost backends).

This version is rebased on Rusty's virtio ring rework patches, which
have already gone into virtio-next today.
We hope this can go into virtio-next together with the virtio ring
rework patches.

V6: rework 'redo allocation of target data' (James)
fix comments (Venkatesh)
rebase to virtio-next

V5: improving the grammar of 1/5 (Paolo)
move the dropping of sg_elems to 'virtio-scsi: use virtqueue_add_sgs for 
command buffers'. (Asias)

V4: rebase on virtio ring rework patches (rusty's pending-rebases branch)

V3 can be found at http://marc.info/?l=linux-virtualization&m=136067440717154&w=2


It would probably be easier to get it in via Rusty's tree
because of the prerequisites.  James, can we get your Acked-by?


Paolo Bonzini (3):
  virtio-scsi: pass struct virtio_scsi to virtqueue completion function
  virtio-scsi: push vq lock/unlock into virtscsi_vq_done
  virtio-scsi: introduce multiqueue support

Wanlong Gao (2):
  virtio-scsi: redo allocation of target data
  virtio-scsi: reset virtqueue affinity when doing cpu hotplug

 drivers/scsi/virtio_scsi.c | 387 -
 1 file changed, 309 insertions(+), 78 deletions(-)

-- 
1.8.2.rc2



[PATCH V6 1/5] virtio-scsi: redo allocation of target data

2013-03-20 Thread Wanlong Gao
virtio_scsi_target_state is now empty.  We will find new uses for it in
the next few patches, so this patch does not drop it completely.

And as James suggested, we use entries target_alloc and target_destroy
in the host template to allocate and destroy the virtio_scsi_target_state
of each target, and attach this struct to scsi_target->hostdata. Now
we can get at it from the sdev with scsi_target(sdev)->hostdata.
No messing around with fixed size arrays and bulk memory allocation
and no need to pass in the maximum target size as a parameter because
everything should now happen dynamically.

Cc: James Bottomley james.bottom...@hansenpartnership.com
Cc: linux-s...@vger.kernel.org
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 drivers/scsi/virtio_scsi.c | 71 --
 1 file changed, 25 insertions(+), 46 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index b53ba9e..ffa03e8 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -75,8 +75,6 @@ struct virtio_scsi {
 
/* Get some buffers ready for event vq */
struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
-
-   struct virtio_scsi_target_state *tgt[];
 };
 
 static struct kmem_cache *virtscsi_cmd_cache;
@@ -530,6 +528,25 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
return virtscsi_tmf(vscsi, cmd);
 }
 
+static int virtscsi_target_alloc(struct scsi_target *starget)
+{
+   struct virtio_scsi_target_state *tgt =
+   kmalloc(sizeof(*tgt), GFP_KERNEL);
+   if (!tgt)
+   return -ENOMEM;
+
+   spin_lock_init(&tgt->tgt_lock);
+
+   starget->hostdata = tgt;
+   return 0;
+}
+
+static void virtscsi_target_destroy(struct scsi_target *starget)
+{
+   struct virtio_scsi_target_state *tgt = starget->hostdata;
+   kfree(tgt);
+}
+
 static struct scsi_host_template virtscsi_host_template = {
.module = THIS_MODULE,
	.name = "Virtio SCSI HBA",
@@ -542,6 +559,8 @@ static struct scsi_host_template virtscsi_host_template = {
.can_queue = 1024,
.dma_boundary = UINT_MAX,
.use_clustering = ENABLE_CLUSTERING,
+   .target_alloc = virtscsi_target_alloc,
+   .target_destroy = virtscsi_target_destroy,
 };
 
 #define virtscsi_config_get(vdev, fld) \
@@ -568,20 +587,6 @@ static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
	virtscsi_vq->vq = vq;
 }
 
-static struct virtio_scsi_target_state *virtscsi_alloc_tgt(
-   struct virtio_device *vdev)
-{
-   struct virtio_scsi_target_state *tgt;
-   gfp_t gfp_mask = GFP_KERNEL;
-
-   tgt = kmalloc(sizeof(*tgt), gfp_mask);
-   if (!tgt)
-   return NULL;
-
-   spin_lock_init(&tgt->tgt_lock);
-   return tgt;
-}
-
 static void virtscsi_scan(struct virtio_device *vdev)
 {
	struct Scsi_Host *shost = (struct Scsi_Host *)vdev->priv;
@@ -591,28 +596,17 @@ static void virtscsi_scan(struct virtio_device *vdev)
 
 static void virtscsi_remove_vqs(struct virtio_device *vdev)
 {
-   struct Scsi_Host *sh = virtio_scsi_host(vdev);
-   struct virtio_scsi *vscsi = shost_priv(sh);
-   u32 i, num_targets;
-
	/* Stop all the virtqueues. */
	vdev->config->reset(vdev);
 
-   num_targets = sh->max_id;
-   for (i = 0; i < num_targets; i++) {
-   kfree(vscsi->tgt[i]);
-   vscsi->tgt[i] = NULL;
-   }
-
	vdev->config->del_vqs(vdev);
 }
 
 static int virtscsi_init(struct virtio_device *vdev,
-struct virtio_scsi *vscsi, int num_targets)
+struct virtio_scsi *vscsi)
 {
int err;
struct virtqueue *vqs[3];
-   u32 i;
 
vq_callback_t *callbacks[] = {
virtscsi_ctrl_done,
@@ -640,18 +634,6 @@ static int virtscsi_init(struct virtio_device *vdev,
if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
virtscsi_kick_event_all(vscsi);
 
-   for (i = 0; i < num_targets; i++) {
-   vscsi->tgt[i] = virtscsi_alloc_tgt(vdev);
-   if (!vscsi->tgt[i]) {
-   err = -ENOMEM;
-   goto out;
-   }
-   }
-   err = 0;
-
-out:
-   if (err)
-   virtscsi_remove_vqs(vdev);
return err;
 }
 
@@ -663,12 +645,9 @@ static int virtscsi_probe(struct virtio_device *vdev)
u32 sg_elems, num_targets;
u32 cmd_per_lun;
 
-   /* Allocate memory and link the structs together.  */
num_targets = virtscsi_config_get(vdev, max_target) + 1;
-   shost = scsi_host_alloc(&virtscsi_host_template,
-   sizeof(*vscsi)
-   + num_targets * sizeof(struct virtio_scsi_target_state));
 
+   shost = scsi_host_alloc(&virtscsi_host_template, sizeof(*vscsi));
if (!shost)
return -ENOMEM;
 
@@ -678,7 +657,7 @@ static int virtscsi_probe(struct virtio_device *vdev)

[PATCH V6 2/5] virtio-scsi: pass struct virtio_scsi to virtqueue completion function

2013-03-20 Thread Wanlong Gao
From: Paolo Bonzini pbonz...@redhat.com

This will be needed soon in order to retrieve the per-target
struct.

Cc: linux-s...@vger.kernel.org
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
Reviewed-by: Asias He as...@redhat.com
---
 drivers/scsi/virtio_scsi.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index ffa03e8..c23560c 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -104,7 +104,7 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
  *
  * Called with vq_lock held.
  */
-static void virtscsi_complete_cmd(void *buf)
+static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
 {
struct virtio_scsi_cmd *cmd = buf;
	struct scsi_cmnd *sc = cmd->sc;
@@ -165,7 +165,8 @@ static void virtscsi_complete_cmd(void *buf)
	sc->scsi_done(sc);
 }
 
-static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf))
+static void virtscsi_vq_done(struct virtio_scsi *vscsi, struct virtqueue *vq,
+void (*fn)(struct virtio_scsi *vscsi, void *buf))
 {
void *buf;
unsigned int len;
@@ -173,7 +174,7 @@ static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf))
do {
virtqueue_disable_cb(vq);
while ((buf = virtqueue_get_buf(vq, len)) != NULL)
-   fn(buf);
+   fn(vscsi, buf);
} while (!virtqueue_enable_cb(vq));
 }
 
@@ -184,11 +185,11 @@ static void virtscsi_req_done(struct virtqueue *vq)
unsigned long flags;
 
	spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags);
-   virtscsi_vq_done(vq, virtscsi_complete_cmd);
+   virtscsi_vq_done(vscsi, vq, virtscsi_complete_cmd);
	spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags);
 };
 
-static void virtscsi_complete_free(void *buf)
+static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
 {
struct virtio_scsi_cmd *cmd = buf;
 
@@ -205,7 +206,7 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
unsigned long flags;
 
	spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags);
-   virtscsi_vq_done(vq, virtscsi_complete_free);
+   virtscsi_vq_done(vscsi, vq, virtscsi_complete_free);
	spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags);
 };
 
@@ -329,7 +330,7 @@ static void virtscsi_handle_event(struct work_struct *work)
virtscsi_kick_event(vscsi, event_node);
 }
 
-static void virtscsi_complete_event(void *buf)
+static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf)
 {
struct virtio_scsi_event_node *event_node = buf;
 
@@ -344,7 +345,7 @@ static void virtscsi_event_done(struct virtqueue *vq)
unsigned long flags;
 
	spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
-   virtscsi_vq_done(vq, virtscsi_complete_event);
+   virtscsi_vq_done(vscsi, vq, virtscsi_complete_event);
	spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
 };
 
-- 
1.8.2.rc2



Re: [PATCH V5 4/5] virtio-scsi: introduce multiqueue support

2013-03-20 Thread Wanlong Gao
On 03/20/2013 09:46 AM, Venkatesh Srinivas wrote:
 This looks pretty good!
 
 I rather like the (lack of) locking in I/O completion (around the req
 count vs. target/queue binding). It is unfortunate that you need to hold the 
 per-target lock in virtscsi_pick_vq() though; have any idea
 how much that lock hurts?

Paolo?

 
 Just two minor comments:
 
 (in struct virtio_scsi_target_data):
 +   /* This spinlock never help at the same time as vq_lock. */
 held?
 
 (in struct virtio_scsi):
 +   /* Does the affinity hint is set for virtqueues? */
 Could you rephrase that, please?

Thank you, fixed in V6, please review.

 
 Tested on qemu and w/ Google Compute Engine's virtio-scsi device.

Cool.

 
 Reviewed-and-tested-by: Venkatesh Srinivas venkate...@google.com

Would you mind reviewing and testing V6? Thank you.

Regards,
Wanlong Gao

 
 Thanks,
 -- vs;
 



Re: [PATCH V6 5/5] virtio-scsi: reset virtqueue affinity when doing cpu hotplug

2013-03-20 Thread Asias He
On Wed, Mar 20, 2013 at 03:01:24PM +0800, Wanlong Gao wrote:
 Add hot cpu notifier to reset the request virtqueue affinity
 when doing cpu hotplug.
 
 Cc: linux-s...@vger.kernel.org
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
 Reviewed-by: Asias He as...@redhat.com
 ---
  drivers/scsi/virtio_scsi.c | 29 +
  1 file changed, 29 insertions(+)
 
 diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
 index 13d7672..0ad9017 100644
 --- a/drivers/scsi/virtio_scsi.c
 +++ b/drivers/scsi/virtio_scsi.c
 @@ -110,6 +110,9 @@ struct virtio_scsi {
   /* if the affinity hint is set for virtqueues */
   bool affinity_hint_set;
  
 + /* CPU hotplug notifier */
 + struct notifier_block nb;
 +
   struct virtio_scsi_vq ctrl_vq;
   struct virtio_scsi_vq event_vq;
   struct virtio_scsi_vq req_vqs[];
 @@ -762,6 +765,23 @@ static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
   put_online_cpus();
  }
  
 +static int virtscsi_cpu_callback(struct notifier_block *nfb,
 +  unsigned long action, void *hcpu)
 +{
 + struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
 + switch(action) {
 + case CPU_ONLINE:
 + case CPU_ONLINE_FROZEN:
 + case CPU_DEAD:
 + case CPU_DEAD_FROZEN:
 + __virtscsi_set_affinity(vscsi, true);
 + break;
 + default:
 + break;
 + }
 + return NOTIFY_OK;
 +}
 +
  static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
struct virtqueue *vq)
  {
 @@ -884,6 +904,13 @@ static int virtscsi_probe(struct virtio_device *vdev)
   if (err)
   goto virtscsi_init_failed;
  
 + vscsi->nb.notifier_call = virtscsi_cpu_callback;
 + err = register_hotcpu_notifier(&vscsi->nb);
 + if (err) {
 + pr_err("registering cpu notifier failed\n");
 + goto scsi_add_host_failed;
 + }
 +
   cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
   shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
   shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
 @@ -921,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev)
  
   scsi_remove_host(shost);
  
 + unregister_hotcpu_notifier(&vscsi->nb);
 +
   virtscsi_remove_vqs(vdev);
   scsi_host_put(shost);
  }
 -- 
 1.8.2.rc2
 

This one does not apply on top of virtio-next + patch 1-4 in this series.

-- 
Asias


Re: [PATCH V6 5/5] virtio-scsi: reset virtqueue affinity when doing cpu hotplug

2013-03-20 Thread Wanlong Gao
On 03/20/2013 03:24 PM, Asias He wrote:
 On Wed, Mar 20, 2013 at 03:01:24PM +0800, Wanlong Gao wrote:
 Add hot cpu notifier to reset the request virtqueue affinity
 when doing cpu hotplug.

 Cc: linux-s...@vger.kernel.org
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
 Reviewed-by: Asias He as...@redhat.com
 ---
  drivers/scsi/virtio_scsi.c | 29 +
  1 file changed, 29 insertions(+)

 diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
 index 13d7672..0ad9017 100644
 --- a/drivers/scsi/virtio_scsi.c
 +++ b/drivers/scsi/virtio_scsi.c
 @@ -110,6 +110,9 @@ struct virtio_scsi {
  /* if the affinity hint is set for virtqueues */
  bool affinity_hint_set;
  
 +/* CPU hotplug notifier */
 +struct notifier_block nb;
 +
  struct virtio_scsi_vq ctrl_vq;
  struct virtio_scsi_vq event_vq;
  struct virtio_scsi_vq req_vqs[];
 @@ -762,6 +765,23 @@ static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
  put_online_cpus();
  }
  
 +static int virtscsi_cpu_callback(struct notifier_block *nfb,
 + unsigned long action, void *hcpu)
 +{
 +struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
 +switch(action) {
 +case CPU_ONLINE:
 +case CPU_ONLINE_FROZEN:
 +case CPU_DEAD:
 +case CPU_DEAD_FROZEN:
 +__virtscsi_set_affinity(vscsi, true);
 +break;
 +default:
 +break;
 +}
 +return NOTIFY_OK;
 +}
 +
  static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
   struct virtqueue *vq)
  {
 @@ -884,6 +904,13 @@ static int virtscsi_probe(struct virtio_device *vdev)
  if (err)
  goto virtscsi_init_failed;
  
 +vscsi->nb.notifier_call = virtscsi_cpu_callback;
 +err = register_hotcpu_notifier(&vscsi->nb);
 +if (err) {
 +pr_err("registering cpu notifier failed\n");
 +goto scsi_add_host_failed;
 +}
 +
  cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
  shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
  shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
 @@ -921,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev)
  
  scsi_remove_host(shost);
  
 +unregister_hotcpu_notifier(&vscsi->nb);
 +
  virtscsi_remove_vqs(vdev);
  scsi_host_put(shost);
  }
 -- 
 1.8.2.rc2

 
 This one does not apply on top of virtio-next + patch 1-4 in this series.

Oops, it's my fault. Something must have gone wrong here, but you can use a
3-way merge to apply them. I'll take more care next time.
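
For reference, a 3-way application is just git am with -3, e.g. (the mbox
file name below is illustrative):

    git am -3 0005-virtio-scsi-reset-virtqueue-affinity.patch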


Thanks,
Wanlong Gao

 



Re: [PATCH V6 1/5] virtio-scsi: redo allocation of target data

2013-03-20 Thread Asias He
On Wed, Mar 20, 2013 at 03:01:20PM +0800, Wanlong Gao wrote:
 virtio_scsi_target_state is now empty.  We will find new uses for it in
 the next few patches, so this patch does not drop it completely.
 
 And as James suggested, we use entries target_alloc and target_destroy
 in the host template to allocate and destroy the virtio_scsi_target_state
 of each target, attach this struct to scsi_target-hostdata. Now
 we can get at it from the sdev with scsi_target(sdev)-hostdata.
 No messing around with fixed size arrays and bulk memory allocation
 and no need to pass in the maximum target size as a parameter because
 everything should now happen dynamically.
 
 Cc: James Bottomley james.bottom...@hansenpartnership.com
 Cc: linux-s...@vger.kernel.org
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com

Reviewed-by: Asias He as...@redhat.com

 ---
  drivers/scsi/virtio_scsi.c | 71 --
  1 file changed, 25 insertions(+), 46 deletions(-)
 
 diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
 index b53ba9e..ffa03e8 100644
 --- a/drivers/scsi/virtio_scsi.c
 +++ b/drivers/scsi/virtio_scsi.c
 @@ -75,8 +75,6 @@ struct virtio_scsi {
  
   /* Get some buffers ready for event vq */
   struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
 -
 - struct virtio_scsi_target_state *tgt[];
  };
  
  static struct kmem_cache *virtscsi_cmd_cache;
 @@ -530,6 +528,25 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
   return virtscsi_tmf(vscsi, cmd);
  }
  
 +static int virtscsi_target_alloc(struct scsi_target *starget)
 +{
 + struct virtio_scsi_target_state *tgt =
 + kmalloc(sizeof(*tgt), GFP_KERNEL);
 + if (!tgt)
 + return -ENOMEM;
 +
 + spin_lock_init(&tgt->tgt_lock);
 +
 + starget->hostdata = tgt;
 + return 0;
 +}
 +
 +static void virtscsi_target_destroy(struct scsi_target *starget)
 +{
 + struct virtio_scsi_target_state *tgt = starget->hostdata;
 + kfree(tgt);
 +}
 +
  static struct scsi_host_template virtscsi_host_template = {
   .module = THIS_MODULE,
  .name = "Virtio SCSI HBA",
 @@ -542,6 +559,8 @@ static struct scsi_host_template virtscsi_host_template = {
   .can_queue = 1024,
   .dma_boundary = UINT_MAX,
   .use_clustering = ENABLE_CLUSTERING,
 + .target_alloc = virtscsi_target_alloc,
 + .target_destroy = virtscsi_target_destroy,
  };
  
  #define virtscsi_config_get(vdev, fld) \
 @@ -568,20 +587,6 @@ static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
  virtscsi_vq->vq = vq;
  }
  
 -static struct virtio_scsi_target_state *virtscsi_alloc_tgt(
 - struct virtio_device *vdev)
 -{
 - struct virtio_scsi_target_state *tgt;
 - gfp_t gfp_mask = GFP_KERNEL;
 -
 - tgt = kmalloc(sizeof(*tgt), gfp_mask);
 - if (!tgt)
 - return NULL;
 -
 - spin_lock_init(&tgt->tgt_lock);
 - return tgt;
 -}
 -
  static void virtscsi_scan(struct virtio_device *vdev)
  {
  struct Scsi_Host *shost = (struct Scsi_Host *)vdev->priv;
 @@ -591,28 +596,17 @@ static void virtscsi_scan(struct virtio_device *vdev)
  
  static void virtscsi_remove_vqs(struct virtio_device *vdev)
  {
 - struct Scsi_Host *sh = virtio_scsi_host(vdev);
 - struct virtio_scsi *vscsi = shost_priv(sh);
 - u32 i, num_targets;
 -
  /* Stop all the virtqueues. */
  vdev->config->reset(vdev);
  
 - num_targets = sh->max_id;
 - for (i = 0; i < num_targets; i++) {
 - kfree(vscsi->tgt[i]);
 - vscsi->tgt[i] = NULL;
 - }
 -
  vdev->config->del_vqs(vdev);
  }
  
  static int virtscsi_init(struct virtio_device *vdev,
 -  struct virtio_scsi *vscsi, int num_targets)
 +  struct virtio_scsi *vscsi)
  {
   int err;
   struct virtqueue *vqs[3];
 - u32 i;
  
   vq_callback_t *callbacks[] = {
   virtscsi_ctrl_done,
 @@ -640,18 +634,6 @@ static int virtscsi_init(struct virtio_device *vdev,
   if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
   virtscsi_kick_event_all(vscsi);
  
 - for (i = 0; i < num_targets; i++) {
 - vscsi->tgt[i] = virtscsi_alloc_tgt(vdev);
 - if (!vscsi->tgt[i]) {
 - err = -ENOMEM;
 - goto out;
 - }
 - }
 - err = 0;
 -
 -out:
 - if (err)
 - virtscsi_remove_vqs(vdev);
   return err;
  }
  
 @@ -663,12 +645,9 @@ static int virtscsi_probe(struct virtio_device *vdev)
   u32 sg_elems, num_targets;
   u32 cmd_per_lun;
  
 - /* Allocate memory and link the structs together.  */
   num_targets = virtscsi_config_get(vdev, max_target) + 1;
 - shost = scsi_host_alloc(&virtscsi_host_template,
 - sizeof(*vscsi)
 - + num_targets * sizeof(struct virtio_scsi_target_state));
  
 + shost = scsi_host_alloc(&virtscsi_host_template, sizeof(*vscsi));

Re: [PATCH] virtio-blk: Set default serial id

2013-03-20 Thread Stefan Hajnoczi
On Wed, Mar 20, 2013 at 01:56:08PM +0800, Asias He wrote:
 If user does not specify a serial id, e.g.
 
-device virtio-blk-pci,serial=serial_id
 or
-drive serial=serial_id
 
 no serial id will be assigned.
 
 Add a default serial id in this case to help identifying
 the disk in guest.
 
 Signed-off-by: Asias He as...@redhat.com
 ---
  hw/virtio-blk.c | 7 +++
  1 file changed, 7 insertions(+)

Autogenerated IDs have been proposed (for other devices?) before and I
think we should avoid them.

The serial in this patch depends on the internal counter we use for
savevm.  It is not a well-defined value that guests can depend on
remaining the same.

It can change between QEMU invocations - due to internal changes in QEMU
or because the management tool reordered -device options.

Users will be confused and their guests may stop working if they depend
on an ID like this.

The solution is to do persistent naming either by really passing -device
virtio-blk-pci,serial= or with udev inside the guest using the bus
address (PCI devfn) like the new persistent network interface naming for
Linux.
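
To illustrate the bus-address approach: a guest can already pin a mount
to the disk's PCI address via udev's by-path symlinks instead of a
serial. The address below is purely illustrative, and the exact symlink
format depends on the guest's udev version:

# /etc/fstab: reference the virtio-blk disk by its PCI devfn, not a serial
/dev/disk/by-path/pci-0000:00:04.0  /data  ext4  defaults  0 2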

Stefan


Re: [PATCH V6 5/5] virtio-scsi: reset virtqueue affinity when doing cpu hotplug

2013-03-20 Thread Wanlong Gao
On 03/20/2013 03:24 PM, Asias He wrote:
 On Wed, Mar 20, 2013 at 03:01:24PM +0800, Wanlong Gao wrote:
 Add hot cpu notifier to reset the request virtqueue affinity
 when doing cpu hotplug.

 Cc: linux-s...@vger.kernel.org
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
 Reviewed-by: Asias He as...@redhat.com
 ---
  drivers/scsi/virtio_scsi.c | 29 +
  1 file changed, 29 insertions(+)

 diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
 index 13d7672..0ad9017 100644
 --- a/drivers/scsi/virtio_scsi.c
 +++ b/drivers/scsi/virtio_scsi.c
 @@ -110,6 +110,9 @@ struct virtio_scsi {
  /* if the affinity hint is set for virtqueues */
  bool affinity_hint_set;
  
 +/* CPU hotplug notifier */
 +struct notifier_block nb;
 +
  struct virtio_scsi_vq ctrl_vq;
  struct virtio_scsi_vq event_vq;
  struct virtio_scsi_vq req_vqs[];
 @@ -762,6 +765,23 @@ static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
  put_online_cpus();
  }
  
 +static int virtscsi_cpu_callback(struct notifier_block *nfb,
 + unsigned long action, void *hcpu)
 +{
 +struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
 +switch(action) {
 +case CPU_ONLINE:
 +case CPU_ONLINE_FROZEN:
 +case CPU_DEAD:
 +case CPU_DEAD_FROZEN:
 +__virtscsi_set_affinity(vscsi, true);
 +break;
 +default:
 +break;
 +}
 +return NOTIFY_OK;
 +}
 +
  static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
   struct virtqueue *vq)
  {
 @@ -884,6 +904,13 @@ static int virtscsi_probe(struct virtio_device *vdev)
  if (err)
  goto virtscsi_init_failed;
  
 +vscsi->nb.notifier_call = virtscsi_cpu_callback;
 +err = register_hotcpu_notifier(&vscsi->nb);
 +if (err) {
 +pr_err("registering cpu notifier failed\n");
 +goto scsi_add_host_failed;
 +}
 +
  cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
  shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
  shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
 @@ -921,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev)
  
  scsi_remove_host(shost);
  
 +unregister_hotcpu_notifier(&vscsi->nb);
 +
  virtscsi_remove_vqs(vdev);
  scsi_host_put(shost);
  }
 -- 
 1.8.2.rc2

 
 This one does not apply on top of virtio-next + patch 1-4 in this series.

I'm very sorry.

This failure is because I modified 4/5 by hand, from
/* if the affinity hint is set for virtqueues */
to
/* If the affinity hint is set for virtqueues */

You can also change "if" to "If" in 5/5's context lines to apply this patch
without a 3-way merge.

Thanks,
Wanlong Gao

 



[PATCH v2 0/7] KVM: MMU: fast zap all shadow pages

2013-03-20 Thread Xiao Guangrong
Changlog:
V2:
  - do not reset n_requested_mmu_pages and n_max_mmu_pages
  - batch free root shadow pages to reduce vcpu notification and mmu-lock
contention
  - remove the first patch that introduced kvm->arch.mmu_cache, since we only
'memset zero' the hashtable rather than all mmu cache members in this
version
  - remove unnecessary kvm_reload_remote_mmus after kvm_mmu_zap_all

* Issue
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while
walking and zapping all shadow pages one by one, and it also needs to zap
every guest page's rmap and every shadow page's parent spte list. Things
become particularly bad if the guest uses more memory or vcpus. It is not
good for scalability.

* Idea
Since all shadow pages will be zapped, we can directly zap the mmu-cache
and rmap so that vcpus will fault on the new mmu-cache; after that, we can
directly free the memory used by the old mmu-cache.

The root shadow pages are a little special since they are currently in use
by vcpus and we can not directly free them. So, we zap the root shadow
pages and re-add them into the new mmu-cache.
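
In outline, the flow looks like this (a rough sketch of the idea; the
real implementation is in patch 6/7):

	spin_lock(&kvm->mmu_lock);
	/* 1. Detach root pages (still referenced by vcpus) for re-adding. */
	/* 2. Move every other shadow page onto a local invalid list.      */
	/* 3. Zero the hash table and reset rmaps: a fresh, empty cache.   */
	/* 4. Re-add the root pages; KVM_REQ_MMU_RELOAD if any roots seen. */
	spin_unlock(&kvm->mmu_lock);
	/* 5. Free the detached pages and pte_list_descs outside mmu_lock. */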

* TODO
(1): free root shadow pages by using generation-number
(2): drop unnecessary @npages from kvm_arch_create_memslot

* Performance
The testcase, which measures the time to delete / add a memslot, can be
found at:
http://www.gossamer-threads.com/lists/engine?do=post_attachment;postatt_id=54896;list=linux
While it runs, all vcpus are waiting, which means there is no mmu-lock
contention. I believe the results would be even better if other vcpus and
mmu notifications needed to hold the mmu-lock.

Guest VCPU:6, Mem:2048M

before: Run 10 times, Avg time:46078825 ns.

after: Run 10 times, Avg time:21558774 ns. (+ 113%)
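
(The percentage follows from the two averages: 46078825 / 21558774 ≈ 2.14,
i.e. (46078825 - 21558774) / 21558774 ≈ 113% faster.)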

Xiao Guangrong (7):
  KVM: MMU: introduce mmu_cache-pte_list_descs
  KVM: x86: introduce memslot_set_lpage_disallowed
  KVM: x86: introduce kvm_clear_all_gfn_page_info
  KVM: MMU: delete shadow page from hash list in
kvm_mmu_prepare_zap_page
  KVM: MMU: split kvm_mmu_prepare_zap_page
  KVM: MMU: fast zap all shadow pages
  KVM: MMU: drop unnecessary kvm_reload_remote_mmus after
kvm_mmu_zap_all

 arch/x86/include/asm/kvm_host.h |7 ++-
 arch/x86/kvm/mmu.c  |  105 ++-
 arch/x86/kvm/mmu.h  |1 +
 arch/x86/kvm/x86.c  |   87 +---
 include/linux/kvm_host.h|1 +
 5 files changed, 166 insertions(+), 35 deletions(-)

-- 
1.7.7.6



[PATCH v2 7/7] KVM: MMU: drop unnecessary kvm_reload_remote_mmus after kvm_mmu_zap_all

2013-03-20 Thread Xiao Guangrong
It is the responsibility of kvm_mmu_zap_all to keep the mmu and tlbs
consistent.

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/x86.c |1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 87d27a8..5bae962 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7047,7 +7047,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
kvm_mmu_zap_all(kvm);
-   kvm_reload_remote_mmus(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-- 
1.7.7.6



[PATCH v2 4/7] KVM: MMU: delete shadow page from hash list in kvm_mmu_prepare_zap_page

2013-03-20 Thread Xiao Guangrong
Move the deletion of the shadow page from the hash list from
kvm_mmu_commit_zap_page to kvm_mmu_prepare_zap_page, so that we can free
the shadow page outside of mmu-lock.

Also, delete the invalid shadow page from the hash list since this page can
not be reused anymore. This makes resetting the mmu-cache easier - we do
not need to care about all the hash entries after the mmu-cache reset.

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c |8 ++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dc37512..5578c91 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1472,7 +1472,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm 
*kvm, int nr)
 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
	ASSERT(is_empty_shadow_page(sp->spt));
-   hlist_del(&sp->hash_link);
+
	list_del(&sp->link);
	free_page((unsigned long)sp->spt);
	if (!sp->role.direct)
@@ -1660,7 +1660,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)		\
	for_each_gfn_sp(_kvm, _sp, _gfn)				\
-   if ((_sp)->role.direct || (_sp)->role.invalid) {} else
+   if ((_sp)->role.direct ||					\
+ ((_sp)->role.invalid && WARN_ON(1))) {} else
 
 /* @sp->gfn should be write-protected at the call site */
 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2079,6 +2080,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
	unaccount_shadowed(kvm, sp->gfn);
	if (sp->unsync)
	kvm_unlink_unsync_page(kvm, sp);
+
+   hlist_del_init(&sp->hash_link);
+
	if (!sp->root_count) {
/* Count self */
ret++;
-- 
1.7.7.6



[PATCH v2 5/7] KVM: MMU: split kvm_mmu_prepare_zap_page

2013-03-20 Thread Xiao Guangrong
The new function __kvm_mmu_prepare_zap_page only zaps the shadow page,
without raising KVM_REQ_MMU_RELOAD. Later, we will use it to batch-free
root shadow pages.

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c |   20 +++-
 1 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5578c91..e880cdd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2066,8 +2066,9 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
return zapped;
 }
 
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
-   struct list_head *invalid_list)
+static int
+__kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+  struct list_head *invalid_list)
 {
int ret;
 
@@ -2088,15 +2089,24 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
ret++;
	list_move(&sp->link, invalid_list);
	kvm_mod_used_mmu_pages(kvm, -1);
-   } else {
+   } else
	list_move(&sp->link, &kvm->arch.active_mmu_pages);
-   kvm_reload_remote_mmus(kvm);
-   }
 
	sp->role.invalid = 1;
return ret;
 }
 
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+   struct list_head *invalid_list)
+{
+   int ret = __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+
+   if (sp->root_count)
+   kvm_reload_remote_mmus(kvm);
+
+   return ret;
+}
+
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
 {
-- 
1.7.7.6



[PATCH v2 3/7] KVM: x86: introduce kvm_clear_all_gfn_page_info

2013-03-20 Thread Xiao Guangrong
This function is used to reset the rmaps and page info of all guest pages;
it will be used in a later patch.

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/x86.c   |   31 +++
 include/linux/kvm_host.h |1 +
 2 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e0a7f67..87d27a8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6902,6 +6902,37 @@ memslot_set_lpage_disallowed(struct kvm_memory_slot 
*slot,
}
 }
 
+static void clear_memslot_page_info(struct kvm_memory_slot *slot)
+{
+   int i;
+
+   for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+   int lpages;
+   int level = i + 1;
+
+   lpages = gfn_to_index(slot->base_gfn + slot->npages - 1,
+ slot->base_gfn, level) + 1;
+
+   memset(slot->arch.rmap[i], 0,
+  lpages * sizeof(*slot->arch.rmap[i]));
+
+   if (i) {
+   memset(slot->arch.lpage_info[i - 1], 0,
+  sizeof(*slot->arch.lpage_info[i - 1]));
+   memslot_set_lpage_disallowed(slot, slot->npages, i,
+lpages);
+   }
+   }
+}
+
+void kvm_clear_all_gfn_page_info(struct kvm *kvm)
+{
+   struct kvm_memory_slot *slot;
+
+   kvm_for_each_memslot(slot, kvm->memslots)
+   clear_memslot_page_info(slot);
+}
+
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 {
int i;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0f4941a..1de9b79 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -491,6 +491,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem);
 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
   struct kvm_memory_slot *dont);
+void kvm_clear_all_gfn_page_info(struct kvm *kvm);
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long 
npages);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
-- 
1.7.7.6



[PATCH v2 2/7] KVM: x86: introduce memslot_set_lpage_disallowed

2013-03-20 Thread Xiao Guangrong
It is used to mark large pages as disallowed on the specified level; it
can be used in a later patch.

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/x86.c |   53 ++-
 1 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6622ac0..e0a7f67 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6868,12 +6868,45 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
}
 }
 
+static void
+memslot_set_lpage_disallowed(struct kvm_memory_slot *slot,
+unsigned long npages, int lpage_size,
+int lpages)
+{
+   struct kvm_lpage_info *lpage_info;
+   unsigned long ugfn;
+   int level = lpage_size + 1;
+
+   WARN_ON(!lpage_size);
+
+   lpage_info = slot-arch.lpage_info[lpage_size - 1];
+
+   if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+   lpage_info[0].write_count = 1;
+
+   if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+   lpage_info[lpages - 1].write_count = 1;
+
+   ugfn = slot->userspace_addr >> PAGE_SHIFT;
+   /*
+* If the gfn and userspace address are not aligned wrt each
+* other, or if explicitly asked to, disable large page
+* support for this slot
+*/
+   if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+ !kvm_largepages_enabled()) {
+   unsigned long j;
+
+   for (j = 0; j < lpages; ++j)
+   lpage_info[j].write_count = 1;
+   }
+}
+
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 {
int i;
 
	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-   unsigned long ugfn;
int lpages;
int level = i + 1;
 
@@ -6892,23 +6925,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot 
*slot, unsigned long npages)
if (!slot-arch.lpage_info[i - 1])
goto out_free;
 
-   if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-   slot->arch.lpage_info[i - 1][0].write_count = 1;
-   if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-   slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
-   ugfn = slot->userspace_addr >> PAGE_SHIFT;
-   /*
-* If the gfn and userspace address are not aligned wrt each
-* other, or if explicitly asked to, disable large page
-* support for this slot
-*/
-   if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
-   !kvm_largepages_enabled()) {
-   unsigned long j;
-
-   for (j = 0; j < lpages; ++j)
-   slot->arch.lpage_info[i - 1][j].write_count = 1;
-   }
+   memslot_set_lpage_disallowed(slot, npages, i, lpages);
}
 
return 0;
-- 
1.7.7.6



[PATCH v2 1/7] KVM: MMU: introduce mmu_cache-pte_list_descs

2013-03-20 Thread Xiao Guangrong
This list is used to link all the pte_list_descs used by the mmu cache, so
that we can easily free the memory used by the gfn's rmap and the parent
spte lists.

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/include/asm/kvm_host.h |7 ---
 arch/x86/kvm/mmu.c  |   14 +-
 arch/x86/kvm/mmu.h  |1 +
 arch/x86/kvm/x86.c  |2 +-
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3f205c6c..c8899c6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -526,15 +526,16 @@ struct kvm_apic_map {
 };
 
 struct kvm_arch {
+   /* MMU cache members. */
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
unsigned int n_max_mmu_pages;
unsigned int indirect_shadow_pages;
+   /* Hash table of struct kvm_mmu_page. */
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
-   /*
-* Hash table of struct kvm_mmu_page.
-*/
struct list_head active_mmu_pages;
+   struct list_head pte_list_descs;
+
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
int iommu_flags;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c1a9b7b..dc37512 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -156,6 +156,7 @@ module_param(dbg, bool, 0644);
 struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
+   struct list_head list;
 };
 
 struct kvm_shadow_walk_iterator {
@@ -704,11 +705,16 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 
 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 {
-   return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
+   struct pte_list_desc *desc;
+
+   desc = mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
+   list_add(&desc->list, &vcpu->kvm->arch.pte_list_descs);
+   return desc;
 }
 
 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
 {
+   list_del(&pte_list_desc->list);
kmem_cache_free(pte_list_desc_cache, pte_list_desc);
 }
 
@@ -4337,6 +4343,12 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, 
u64 addr, u64 sptes[4])
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
+void kvm_mmu_cache_init(struct kvm *kvm)
+{
+   INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+   INIT_LIST_HEAD(&kvm->arch.pte_list_descs);
+}
+
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 {
ASSERT(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3b1ad00..2923fd2 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -50,6 +50,7 @@
 #define PFERR_RSVD_MASK (1U  3)
 #define PFERR_FETCH_MASK (1U  4)
 
+void kvm_mmu_cache_init(struct kvm *kvm);
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool 
direct);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d3c4787..6622ac0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6778,7 +6778,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (type)
return -EINVAL;
 
-   INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+   kvm_mmu_cache_init(kvm);
	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
-- 
1.7.7.6



[PATCH v2 6/7] KVM: MMU: fast zap all shadow pages

2013-03-20 Thread Xiao Guangrong
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while
walking and zapping all shadow pages one by one, and it also needs to zap
every guest page's rmap and every shadow page's parent spte list. Things
become particularly bad if the guest uses more memory or vcpus. It is not
good for scalability.

Since all shadow pages will be zapped, we can directly zap the mmu-cache
and rmap so that vcpus will fault on the new mmu-cache; after that, we can
directly free the memory used by the old mmu-cache.

The root shadow pages are a little special since they are currently in use
by vcpus and we can not directly free them. So, we zap the root shadow
pages and re-add them into the new mmu-cache.

After this patch, kvm_mmu_zap_all is about 113% faster than before.

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c |   63 ---
 1 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e880cdd..72e5bdb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4196,17 +4196,72 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 
int slot)
 
 void kvm_mmu_zap_all(struct kvm *kvm)
 {
-   struct kvm_mmu_page *sp, *node;
+   LIST_HEAD(root_mmu_pages);
LIST_HEAD(invalid_list);
+   struct list_head pte_list_descs;
+   struct kvm_mmu_page *sp, *node;
+   struct pte_list_desc *desc, *ndesc;
+   int root_sp = 0;
 
	spin_lock(&kvm->mmu_lock);
+
 restart:
+   /*
+* The root shadow pages are being used by vcpus and can not be
+* directly removed; we filter them out and re-add them to the
+* new mmu cache.
+*/
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-   if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-   goto restart;
+   if (sp->root_count) {
+   int ret;
+
+   root_sp++;
+   ret = __kvm_mmu_prepare_zap_page(kvm, sp,
+&invalid_list);
+   list_move(&sp->link, &root_mmu_pages);
+   if (ret)
+   goto restart;
+   }
+
+   list_splice(&kvm->arch.active_mmu_pages, &invalid_list);
+   list_replace(&kvm->arch.pte_list_descs, &pte_list_descs);
+
+   /*
+* Reset the mmu cache so that later vcpu will fault on the new
+* mmu cache.
+*/
+   kvm->arch.indirect_shadow_pages = 0;
+   /* Root shadow pages will be added to the new mmu cache. */
+   kvm_mod_used_mmu_pages(kvm, -(kvm->arch.n_used_mmu_pages - root_sp));
+   memset(kvm->arch.mmu_page_hash, 0, sizeof(kvm->arch.mmu_page_hash));
+   kvm_mmu_cache_init(kvm);
+
+   /*
+* Now, the mmu cache has been reset, we can re-add the root shadow
+* pages into the cache.
+*/
+   list_replace(&root_mmu_pages, &kvm->arch.active_mmu_pages);
+
+   /* Reset gfn's rmap and lpage info. */
+   kvm_clear_all_gfn_page_info(kvm);
+
+   /*
+* Notify all vcpus to reload and flush TLB if root shadow pages
+* were zapped (KVM_REQ_MMU_RELOAD forces TLB to be flushed).
+*
+* The TLB need not be flushed if no root shadow page was found
+* since no vcpu uses shadow page.
+*/
+   if (root_sp)
+   kvm_reload_remote_mmus(kvm);
 
-   kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
+
+   list_for_each_entry_safe(sp, node, &invalid_list, link)
+   kvm_mmu_free_page(sp);
+
+   list_for_each_entry_safe(desc, ndesc, &pte_list_descs, list)
+   mmu_free_pte_list_desc(desc);
 }
 
 void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
-- 
1.7.7.6



Re: [PATCH] virtio-blk: Set default serial id

2013-03-20 Thread Asias He
On Wed, Mar 20, 2013 at 08:52:57AM +0100, Stefan Hajnoczi wrote:
 On Wed, Mar 20, 2013 at 01:56:08PM +0800, Asias He wrote:
  If user does not specify a serial id, e.g.
  
 -device virtio-blk-pci,serial=serial_id
  or
 -drive serial=serial_id
  
  no serial id will be assigned.
  
  Add a default serial id in this case to help identifying
  the disk in guest.
  
  Signed-off-by: Asias He as...@redhat.com
  ---
   hw/virtio-blk.c | 7 +++
   1 file changed, 7 insertions(+)
 
 Autogenerated IDs have been proposed (for other devices?) before and I
 think we should avoid them.
 
 The serial in this patch depends on the internal counter we use for
 savevm.  It is not a well-defined value that guests can depend on
 remaining the same.
 
 It can change between QEMU invocations - due to internal changes in QEMU
 or because the management tool reordered -device options.
 
 Users will be confused and their guests may stop working if they depend
 on an ID like this.

So, having no serial id forces the user not to depend on one. This is how
it works now.

Okay, let's leave it as it is: either a persistent id specified by the user,
or no id at all. This is not perfect, but at least it does not break anything.

 The solution is to do persistent naming either by really passing -device
 virtio-blk-pci,serial= or with udev inside the guest using the bus
 address (PCI devfn) like the new persistent network interface naming for
 Linux.

A serial specified by the user with '-device virtio-blk-pci,serial=' would
be persistent, but the PCI id might change as well.

 Stefan

-- 
Asias


Re: [PATCH V3 WIP 3/3] disable vhost_verify_ring_mappings check

2013-03-20 Thread Michael S. Tsirkin
On Tue, Mar 19, 2013 at 06:57:08PM -0700, Nicholas A. Bellinger wrote:
 On Tue, 2013-03-19 at 09:40 +0100, Stefan Hajnoczi wrote:
  On Tue, Mar 19, 2013 at 08:34:45AM +0800, Asias He wrote:
   ---
hw/vhost.c | 2 ++
1 file changed, 2 insertions(+)
   
   diff --git a/hw/vhost.c b/hw/vhost.c
   index 4d6aee3..0c52ec4 100644
   --- a/hw/vhost.c
   +++ b/hw/vhost.c
    @@ -421,10 +421,12 @@ static void vhost_set_memory(MemoryListener *listener,
return;
}

   +#if 0
     if (dev->started) {
         r = vhost_verify_ring_mappings(dev, start_addr, size);
         assert(r >= 0);
}
   +#endif
  
  Please add a comment to explain why.
  
 
 Btw, the output that Asias added in the failure case at the behest of
 MST is here:
 
 http://www.spinics.net/lists/target-devel/msg04077.html

Yes, I suspected we could get l < ring_size, but this is
not the case here.

 MST seemed to think it may be a bug in cpu_physical_memory_map, but as
 this worked with the original vhost-scsi code it would seem to indicate
 something else at fault..
 
 I'll be comparing what the original code did vs. vhost-scsi-pci to track
 this down, but any extra ideas to track it down are appreciated.  ;)
 
 --nab


Re: [PATCH V5 4/5] virtio-scsi: introduce multiqueue support

2013-03-20 Thread Paolo Bonzini
On 20/03/2013 02:46, Venkatesh Srinivas wrote:
 This looks pretty good!
 
 I rather like the (lack of) locking in I/O completion (around the req
 count vs. target/queue binding). It is unfortunate that you need to hold
 the per-target lock in virtscsi_pick_vq() though; have any idea
 how much that lock hurts?

It doesn't hurt, the lock is mostly uncontended.

- if you have lots of I/O, it's held for a very small period of time; if
you have little I/O, it's uncontended anyway.

- the SCSI layer will serialize on the host lock anyway before calling
into the LLD.  Locks are pipelined so that in the end the host lock
will be a bigger bottleneck than the others.

Most of the time it only costs 2 extra atomic operations, which should
be half a microsecond or less.
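
For reference, the critical section is tiny. Roughly, from memory (a
simplified sketch of the pattern used in the patch, not the exact code;
tgt_lock, reqs and req_vq follow the naming of this series):

	static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
					struct virtio_scsi_target_state *tgt)
	{
		struct virtio_scsi_vq *vq;
		unsigned long flags;
		u32 queue_num;

		spin_lock_irqsave(&tgt->tgt_lock, flags);

		/*
		 * If requests are already in flight, keep the queue they
		 * picked; otherwise rebind the target to this CPU's queue.
		 */
		if (atomic_inc_return(&tgt->reqs) > 1)
			vq = tgt->req_vq;
		else {
			queue_num = smp_processor_id();
			while (unlikely(queue_num >= vscsi->num_queues))
				queue_num -= vscsi->num_queues;
			tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
		}

		spin_unlock_irqrestore(&tgt->tgt_lock, flags);
		return vq;
	}

Only the reqs/req_vq pair is guarded; everything expensive happens
outside the lock.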

Paolo

 Just two minor comments:
 
 (in struct virtio_scsi_target_data):
 +   /* This spinlock never help at the same time as vq_lock. */
 held?
 
 (in struct virtio_scsi):
 +   /* Does the affinity hint is set for virtqueues? */
 Could you rephrase that, please?
 
 Tested on qemu and w/ Google Compute Engine's virtio-scsi device.
 
 Reviewed-and-tested-by: Venkatesh Srinivas venkate...@google.com
 
 Thanks,
 -- vs;



Re: [PATCH] x86: let userspace inject interrupts into the local APIC

2013-03-20 Thread Gleb Natapov
On Tue, Mar 19, 2013 at 09:22:33PM +0100, Paolo Bonzini wrote:
 On 19/03/2013 19:50, Gleb Natapov wrote:
  On Tue, Mar 19, 2013 at 07:39:24PM +0100, Paolo Bonzini wrote:
  On 19/03/2013 19:13, Gleb Natapov wrote:
  There is no way for userspace to inject interrupts into a VCPU's
  local APIC, which is important in order to inject INITs coming from
  the chipset.  KVM_INTERRUPT is currently disabled when the in-kernel
  local APIC is used, so we can repurpose it.  The shorthand destination
  field must contain APIC_DEST_SELF, which has a double effect: first,
  the ICR2 register is not used and the 32-bit field of KVM_INTERRUPT is
  enough; second, it ensures that the valid range of the irq field is
  distinct in the userspace-APIC and kernel-APIC cases.
 
  Init coming from triggering INIT# line should not be modeled as INIT 
  coming from
  APIC.
 
  Then Jan's patch was wrong, and INIT should not have been an apic event
  (perhaps SIPI should).
 
  If it goes through APIC it is.
 
 Ok, I'll extract KVM_APIC_INIT handling into a separate function and
 call it synchronously from KVM_INTERRUPT, with irq = -1
 (KVM_INTERRUPT_INIT, similar to PPC's values of irq).
 KVM_INTERRUPT_INIT will be accessible even with in-kernel irqchip.
 
Why should it be accessible with in-kernel irqchip? The only valid value
for mp_state is RUNNING with userspace irqchip. We even validate it in
kvm_arch_vcpu_ioctl_set_mpstate() now.

  In fact INIT cannot be sent using the SELF shorthand.
 
  Where does the SDM say that?
 
  Table 10-3.
 
 Yeah, table 10-6 and 10-7 here.
 
Hmm, somebody needs to update SDM. Mine is from January 2013.

--
Gleb.


[PATCH] [RFC] bookehv: Handle debug exception on guest exit

2013-03-20 Thread Bharat Bhushan
EPCR.DUVD controls whether the debug events can come in
hypervisor mode or not. When the KVM guest is using the debug
resources then we do not want debug events to be captured
in the guest entry/exit path. So we set EPCR.DUVD when entering
and clear EPCR.DUVD when exiting from the guest.

"Debug instruction complete" is a post-completion debug
exception, but the debug event gets posted on the basis of MSR
before the instruction is executed. Now if the instruction
switches the context from guest mode (MSR.GS = 1) to hypervisor
mode (MSR.GS = 0), then xSRR0 points to the first instruction of
the KVM handler and xSRR1 shows that MSR.GS is clear (hypervisor
context). As xSRR1.GS is used to decide whether the KVM handler
or the host kernel debug handler will be invoked to handle the
exception, this leads to the host kernel debug handler handling
an exception which should instead be handled by KVM.

This is tested on e500mc in 32-bit mode.

Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
---
 arch/powerpc/kernel/exceptions-64e.S |   54 ++
 arch/powerpc/kernel/head_booke.h |   35 ++
 2 files changed, 89 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 4684e33..56882a0 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -450,6 +450,33 @@ interrupt_end_book3e:
andis.  r15,r14,DBSR_IC@h
beq+1f
 
+#ifdef CONFIG_KVM_BOOKE_HV
+   /*
+	 * EPCR.DUVD controls whether the debug events can come in
+	 * hypervisor mode or not. When the KVM guest is using the debug
+	 * resources then we do not want debug events to be captured
+	 * in the guest entry/exit path. So we set EPCR.DUVD when entering
+	 * and clear EPCR.DUVD when exiting from the guest.
+	 * "Debug instruction complete" is a post-completion debug
+	 * exception, but the debug event gets posted on the basis of MSR
+	 * before the instruction is executed. Now if the instruction
+	 * switches the context from guest mode (MSR.GS = 1) to hypervisor
+	 * mode (MSR.GS = 0), then xSRR0 points to the first instruction of
+	 * the KVM handler and xSRR1 shows that MSR.GS is clear
+	 * (hypervisor context). As xSRR1.GS is used to decide whether
+	 * the KVM handler or the host kernel debug handler will be
+	 * invoked to handle the exception, this leads to the host
+	 * kernel debug handler handling an exception which should
+	 * instead be handled by KVM.
+*/
+   mfspr   r10, SPRN_EPCR
+   andis.  r10,r10,SPRN_EPCR_DUVD@h
+   beq+2f
+
+   andis.  r10,r9,MSR_GS@h
+   beq+1f
+2:
+#endif
LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e)
cmpld   cr0,r10,r14
@@ -516,6 +543,33 @@ kernel_dbg_exc:
andis.  r15,r14,DBSR_IC@h
beq+1f
 
+#ifdef CONFIG_KVM_BOOKE_HV
+   /*
+	 * EPCR.DUVD controls whether the debug events can come in
+	 * hypervisor mode or not. When the KVM guest is using the debug
+	 * resources then we do not want debug events to be captured
+	 * in the guest entry/exit path. So we set EPCR.DUVD when entering
+	 * and clear EPCR.DUVD when exiting from the guest.
+	 * "Debug instruction complete" is a post-completion debug
+	 * exception, but the debug event gets posted on the basis of MSR
+	 * before the instruction is executed. Now if the instruction
+	 * switches the context from guest mode (MSR.GS = 1) to hypervisor
+	 * mode (MSR.GS = 0), then xSRR0 points to the first instruction of
+	 * the KVM handler and xSRR1 shows that MSR.GS is clear
+	 * (hypervisor context). As xSRR1.GS is used to decide whether
+	 * the KVM handler or the host kernel debug handler will be
+	 * invoked to handle the exception, this leads to the host
+	 * kernel debug handler handling an exception which should
+	 * instead be handled by KVM.
+*/
+   mfspr   r10, SPRN_EPCR
+   andis.  r10,r10,SPRN_EPCR_DUVD@h
+   beq+2f
+
+   andis.  r10,r9,MSR_GS@h
+   beq+1f
+2:
+#endif
LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e)
cmpld   cr0,r10,r14
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 5f051ee..040b0a3 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -285,7 +285,33 @@ label:
mfspr   r10,SPRN_DBSR;  /* check single-step/branch taken */  \
andis.  r10,r10,(DBSR_IC|DBSR_BT)@h;  \
beq+2f;   \
+#ifdef CONFIG_KVM_BOOKE_HV

[PATCH v4 0/7] Use eoi to track RTC interrupt delivery status

2013-03-20 Thread Yang Zhang
From: Yang Zhang yang.z.zh...@intel.com

The current interrupt coalescing logic, which is only used by RTC,
conflicts with posted interrupts.

This patch introduces a new mechanism that uses EOI to track interrupts:
when delivering an interrupt to a vcpu, need_eoi is set to the number of
vcpus that received the interrupt, and it is decremented as each vcpu
writes EOI. No subsequent RTC interrupt can be delivered to a vcpu until
all vcpus have written EOI.
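
In rough pseudo-C, the bookkeeping looks like this (a simplified sketch of
the idea only, not the actual patch; the rtc_status names follow the later
patches in this series):

	/* On delivery of the RTC irq (pin 8): arm the tracker. */
	rtc_status.need_eoi = <number of destination vcpus>;
	bitmap_copy(rtc_status.expected_eoi_bitmap,
		    rtc_status.vcpu_map, KVM_MAX_VCPUS);

	/* On EOI from a vcpu: retire one expected ack. */
	if (test_and_clear_bit(vcpu->vcpu_id,
			       rtc_status.expected_eoi_bitmap))
		--rtc_status.need_eoi;

	/* On a new RTC irq: report it as coalesced while acks remain. */
	if (rtc_status.need_eoi > 0)
		return 0; /* coalesced */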

Changes from v3 to v4:
* Call kvm_apic_match_dest() to check the destination vcpu.
* Update the RTC interrupt's destination vcpu map when the ioapic entry of
  RTC or an apic register (id, ldr, dfr) is changed.

Changes from v2 to v3:
* Remove unused variable irq_ack_notifier.
* Acquire ioapic->lock before calculating the destination vcpu map.
* Copy vcpu_map to expected_eoi_bitmap on each RTC irq and clear it on EOI.
 

Yang Zhang (7):
  KVM: Call kvm_apic_match_dest() to check destination vcpu
  KVM: Call common update function when ioapic entry changed.
  KVM: Add vcpu info to ioapic_update_eoi()
  KVM: Introduce struct rtc_status
  KVM: Recalculate destination vcpu map
  KVM: Add reset/restore rtc_status support
  KVM: Use eoi to track RTC interrupt delivery status

 arch/ia64/kvm/lapic.h|6 --
 arch/x86/kvm/lapic.c |   59 +++---
 arch/x86/kvm/lapic.h |4 +-
 arch/x86/kvm/vmx.c   |3 +
 arch/x86/kvm/x86.c   |   11 ++-
 include/linux/kvm_host.h |4 +-
 virt/kvm/ioapic.c|  152 --
 virt/kvm/ioapic.h|   19 --
 virt/kvm/irq_comm.c  |4 +-
 virt/kvm/kvm_main.c  |4 +-
 10 files changed, 175 insertions(+), 91 deletions(-)



[PATCH v4 1/7] KVM: Call kvm_apic_match_dest() to check destination vcpu

2013-03-20 Thread Yang Zhang
From: Yang Zhang yang.z.zh...@intel.com

For a given vcpu, kvm_apic_match_dest() will quickly tell you whether
the vcpu is in the destination list. Drop kvm_calculate_eoi_exitmap()
and use kvm_apic_match_dest() instead.

Signed-off-by: Yang Zhang yang.z.zh...@intel.com
---
 arch/x86/kvm/lapic.c |   47 ---
 arch/x86/kvm/lapic.h |4 
 virt/kvm/ioapic.c|7 +--
 3 files changed, 5 insertions(+), 53 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 02b51dd..ee14f94 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -145,53 +145,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
 	return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
 }
 
-void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
-   struct kvm_lapic_irq *irq,
-   u64 *eoi_exit_bitmap)
-{
-   struct kvm_lapic **dst;
-   struct kvm_apic_map *map;
-   unsigned long bitmap = 1;
-   int i;
-
-   rcu_read_lock();
-	map = rcu_dereference(vcpu->kvm->arch.apic_map);
-
-	if (unlikely(!map)) {
-		__set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
-		goto out;
-	}
-
-	if (irq->dest_mode == 0) { /* physical mode */
-		if (irq->delivery_mode == APIC_DM_LOWEST ||
-		    irq->dest_id == 0xff) {
-			__set_bit(irq->vector,
-				  (unsigned long *)eoi_exit_bitmap);
-			goto out;
-		}
-		dst = &map->phys_map[irq->dest_id & 0xff];
-	} else {
-		u32 mda = irq->dest_id << (32 - map->ldr_bits);
-
-		dst = map->logical_map[apic_cluster_id(map, mda)];
-
-		bitmap = apic_logical_id(map, mda);
-	}
-
-	for_each_set_bit(i, &bitmap, 16) {
-		if (!dst[i])
-			continue;
-		if (dst[i]->vcpu == vcpu) {
-   __set_bit(irq-vector,
- (unsigned long *)eoi_exit_bitmap);
-   break;
-   }
-   }
-
-out:
-   rcu_read_unlock();
-}
-
 static void recalculate_apic_map(struct kvm *kvm)
 {
struct kvm_apic_map *new, *old = NULL;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1676d34..5806ef8 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -154,8 +154,4 @@ static inline u16 apic_logical_id(struct kvm_apic_map *map, 
u32 ldr)
 	return ldr & map->lid_mask;
 }
 
-void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
-   struct kvm_lapic_irq *irq,
-   u64 *eoi_bitmap);
-
 #endif
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ce82b94..4443075 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -135,8 +135,11 @@ void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
 		irqe.dest_id = e->fields.dest_id;
 		irqe.vector = e->fields.vector;
 		irqe.dest_mode = e->fields.dest_mode;
-		irqe.delivery_mode = e->fields.delivery_mode << 8;
-		kvm_calculate_eoi_exitmap(vcpu, &irqe, eoi_exit_bitmap);
+
+		if (kvm_apic_match_dest(vcpu, NULL, irqe.shorthand,
+					irqe.dest_id, irqe.dest_mode))
+			__set_bit(irqe.vector,
+				  (unsigned long *)eoi_exit_bitmap);
 		}
 	}
 	spin_unlock(&ioapic->lock);
-- 
1.7.1



[PATCH v4 2/7] KVM: Call common update function when ioapic entry changed.

2013-03-20 Thread Yang Zhang
From: Yang Zhang yang.z.zh...@intel.com

Both the RTC irq status and the EOI exit bitmap need to be updated when
the ioapic or a vcpu's id/ldr/dfr is changed. So use a common function
instead of the EOI-exit-bitmap-specific one.

Signed-off-by: Yang Zhang yang.z.zh...@intel.com
---
 arch/ia64/kvm/lapic.h|6 --
 arch/x86/kvm/lapic.c |2 +-
 arch/x86/kvm/vmx.c   |3 +++
 arch/x86/kvm/x86.c   |   11 +++
 include/linux/kvm_host.h |4 ++--
 virt/kvm/ioapic.c|   27 ---
 virt/kvm/ioapic.h|7 +++
 virt/kvm/irq_comm.c  |4 ++--
 virt/kvm/kvm_main.c  |4 ++--
 9 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index c3e2935..c5f92a9 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -27,10 +27,4 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct 
kvm_lapic_irq *irq);
 #define kvm_apic_present(x) (true)
 #define kvm_lapic_enabled(x) (true)
 
-static inline bool kvm_apic_vid_enabled(void)
-{
-   /* IA64 has no apicv supporting, do nothing here */
-   return false;
-}
-
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ee14f94..1210866 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -209,7 +209,7 @@ out:
if (old)
kfree_rcu(old, rcu);
 
-   kvm_ioapic_make_eoibitmap_request(kvm);
+   kvm_vcpu_scan_ioapic(kvm);
 }
 
 static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c1b3041..6b25e17 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6362,6 +6362,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
+	if (!vmx_vm_has_apicv(vcpu->kvm))
+		return;
+
vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c5bb6f..a959d81 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5631,13 +5631,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
+static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
u64 eoi_exit_bitmap[4];
 
+   if (!kvm_lapic_enabled(vcpu))
+   return;
+
memset(eoi_exit_bitmap, 0, 32);
 
-   kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
+   kvm_ioapic_scan_entry(vcpu, (unsigned long *)eoi_exit_bitmap);
 	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }
 
@@ -5694,8 +5697,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_handle_pmu_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_deliver_pmi(vcpu);
-   if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
-   update_eoi_exitmap(vcpu);
+   if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
+   vcpu_scan_ioapic(vcpu);
}
 
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 722cae7..9fe6433 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -123,7 +123,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_MASTERCLOCK_UPDATE 19
 #define KVM_REQ_MCLOCK_INPROGRESS 20
 #define KVM_REQ_EPR_EXIT  21
-#define KVM_REQ_EOIBITMAP 22
+#define KVM_REQ_SCAN_IOAPIC   22
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID   1
@@ -538,7 +538,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 void kvm_make_mclock_inprogress_request(struct kvm *kvm);
-void kvm_make_update_eoibitmap_request(struct kvm *kvm);
+void kvm_make_scan_ioapic_request(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 4443075..e7c460c 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -116,8 +116,8 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic)
smp_wmb();
 }
 
-void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
-   u64 *eoi_exit_bitmap)
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
+   unsigned long *eoi_exit_bitmap)
 {
 	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
 	union kvm_ioapic_redirect_entry *e;
@@ -125,7 +125,6 @@ void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
 	int index;
 
 	spin_lock(&ioapic->lock);
-   /* traverse 

[PATCH v4 7/7] KVM: Use eoi to track RTC interrupt delivery status

2013-03-20 Thread Yang Zhang
From: Yang Zhang yang.z.zh...@intel.com

The current interrupt coalescing logic, which is only used by RTC,
conflicts with posted interrupts.
This patch introduces a new mechanism that uses EOI to track interrupts:
when delivering an interrupt to a vcpu, need_eoi is set to the number of
vcpus that received the interrupt, and it is decremented as each vcpu
writes EOI. No subsequent RTC interrupt can be delivered to a vcpu until
all vcpus have written EOI.

Signed-off-by: Yang Zhang yang.z.zh...@intel.com
---
 virt/kvm/ioapic.c |   69 +
 1 files changed, 69 insertions(+), 0 deletions(-)

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 2b35123..2165f5a 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -180,6 +180,50 @@ static void rtc_irq_restore(struct kvm_ioapic *ioapic)
}
 	ioapic->rtc_status.need_eoi = need_eoi;
 }
+
+static void rtc_irq_set_eoi(struct kvm_ioapic *ioapic, int irq)
+{
+	union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+
+	if (irq != 8)
+		return;
+
+	if (likely(!bitmap_empty(ioapic->rtc_status.vcpu_map, KVM_MAX_VCPUS))) {
+		if (entry->fields.delivery_mode == APIC_DM_LOWEST)
+			ioapic->rtc_status.need_eoi = 1;
+		else {
+			int weight;
+			weight = bitmap_weight(ioapic->rtc_status.vcpu_map,
+					sizeof(ioapic->rtc_status.vcpu_map));
+			ioapic->rtc_status.need_eoi = weight;
+		}
+		bitmap_copy(ioapic->rtc_status.expected_eoi_bitmap,
+				ioapic->rtc_status.vcpu_map, KVM_MAX_VCPUS);
+	}
+}
+
+static void rtc_irq_ack_eoi(struct kvm_vcpu *vcpu,
+			struct rtc_status *rtc_status, int irq)
+{
+	if (irq != 8)
+		return;
+
+	if (test_and_clear_bit(vcpu->vcpu_id, rtc_status->expected_eoi_bitmap))
+		--rtc_status->need_eoi;
+
+	WARN_ON(rtc_status->need_eoi < 0);
+}
+
+static bool rtc_irq_check(struct kvm_ioapic *ioapic, int irq)
+{
+	if (irq != 8)
+		return false;
+
+	if (ioapic->rtc_status.need_eoi > 0)
+   return true; /* coalesced */
+
+   return false;
+}
 #else
 void kvm_vcpu_scan_ioapic(struct kvm *kvm)
 {
@@ -195,6 +239,22 @@ static void rtc_irq_restore(struct kvm_ioapic *ioapic)
 {
return;
 }
+
+static void rtc_irq_set_eoi(struct kvm_ioapic *ioapic, int irq)
+{
+   return;
+}
+
+static void rtc_irq_ack_eoi(struct kvm_vcpu *vcpu,
+   struct rtc_status *rtc_status, int irq)
+{
+   return;
+}
+
+static bool rtc_irq_check(struct kvm_ioapic *ioapic, int irq)
+{
+   return false;
+}
 #endif
 
 static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
@@ -262,6 +322,8 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int 
irq)
irqe.level = 1;
irqe.shorthand = 0;
 
+   rtc_irq_set_eoi(ioapic, irq);
+
return kvm_irq_delivery_to_apic(ioapic-kvm, NULL, irqe);
 }
 
@@ -286,6 +348,11 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
 		ret = 1;
 	} else {
 		int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+
+		if (rtc_irq_check(ioapic, irq)) {
+			ret = 0; /* coalesced */
+			goto out;
+		}
 		ioapic->irr |= mask;
 		if ((edge && old_irr != ioapic->irr) ||
 		    (!edge && !entry.fields.remote_irr))
@@ -293,6 +360,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
 		else
 			ret = 0; /* report coalesced interrupt */
 	}
+out:
 	trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
 	spin_unlock(&ioapic->lock);
 
@@ -320,6 +388,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 		if (ent->fields.vector != vector)
 			continue;
 
+		rtc_irq_ack_eoi(vcpu, &ioapic->rtc_status, i);
/*
 * We are dropping lock while calling ack notifiers because ack
 * notifier callbacks for assigned devices call into IOAPIC
-- 
1.7.1



[PATCH v4 6/7] KVM: Add reset/restore rtc_status support

2013-03-20 Thread Yang Zhang
From: Yang Zhang yang.z.zh...@intel.com

reset/restore rtc_status when ioapic reset/restore.

Signed-off-by: Yang Zhang yang.z.zh...@intel.com
---
 arch/x86/kvm/lapic.c |8 
 arch/x86/kvm/lapic.h |2 ++
 virt/kvm/ioapic.c|   34 ++
 3 files changed, 44 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ed80244..e95312e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -94,6 +94,14 @@ static inline int apic_test_vector(int vec, void *bitmap)
return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 }
 
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	return apic_test_vector(vector, apic->regs + APIC_ISR) ||
+		apic_test_vector(vector, apic->regs + APIC_IRR);
+}
+
 static inline void apic_set_vector(int vec, void *bitmap)
 {
set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 5806ef8..38815b0 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -154,4 +154,6 @@ static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
 	return ldr & map->lid_mask;
 }
 
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+
 #endif
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 91b4c08..2b35123 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -158,11 +158,43 @@ void kvm_vcpu_scan_ioapic(struct kvm *kvm)
return;
kvm_make_scan_ioapic_request(kvm);
 }
+
+static void rtc_irq_reset(struct kvm_ioapic *ioapic)
+{
+	ioapic->rtc_status.need_eoi = 0;
+	bitmap_zero(ioapic->rtc_status.vcpu_map, KVM_MAX_VCPUS);
+}
+
+static void rtc_irq_restore(struct kvm_ioapic *ioapic)
+{
+   struct kvm_vcpu *vcpu;
+   int vector, i, need_eoi = 0, rtc_pin = 8;
+
+	vector = ioapic->redirtbl[rtc_pin].fields.vector;
+	kvm_for_each_vcpu(i, vcpu, ioapic->kvm) {
+		if (kvm_apic_pending_eoi(vcpu, vector)) {
+			need_eoi++;
+			set_bit(vcpu->vcpu_id,
+				ioapic->rtc_status.expected_eoi_bitmap);
+		}
+	}
+	ioapic->rtc_status.need_eoi = need_eoi;
+}
 #else
 void kvm_vcpu_scan_ioapic(struct kvm *kvm)
 {
return;
 }
+
+static void rtc_irq_reset(struct kvm_ioapic *ioapic)
+{
+   return;
+}
+
+static void rtc_irq_restore(struct kvm_ioapic *ioapic)
+{
+   return;
+}
 #endif
 
 static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
@@ -441,6 +473,7 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
 	ioapic->ioregsel = 0;
 	ioapic->irr = 0;
 	ioapic->id = 0;
+   rtc_irq_reset(ioapic);
update_handled_vectors(ioapic);
 }
 
@@ -506,6 +539,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 	spin_lock(&ioapic->lock);
 	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
 	update_handled_vectors(ioapic);
+	rtc_irq_restore(ioapic);
 	kvm_vcpu_scan_ioapic(kvm);
 	spin_unlock(&ioapic->lock);
return 0;
-- 
1.7.1



[PATCH v4 5/7] KVM: Recalculate destination vcpu map

2013-03-20 Thread Yang Zhang
From: Yang Zhang yang.z.zh...@intel.com

Update the RTC interrupt's destination vcpu map when the ioapic entry of
RTC or an apic register (id, ldr, dfr) is changed.

Signed-off-by: Yang Zhang yang.z.zh...@intel.com
---
 virt/kvm/ioapic.c |9 +++--
 1 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ddf9414..91b4c08 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -121,6 +121,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
 {
 	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
 	union kvm_ioapic_redirect_entry *e;
+	unsigned long *rtc_map = ioapic->rtc_status.vcpu_map;
struct kvm_lapic_irq irqe;
int index;
 
@@ -130,15 +131,19 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
 		if (!e->fields.mask &&
 		    (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
 		     kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC,
-				index))) {
+				index) || index == 8)) {
 			irqe.dest_id = e->fields.dest_id;
 			irqe.vector = e->fields.vector;
 			irqe.dest_mode = e->fields.dest_mode;
 			irqe.shorthand = 0;
 
 			if (kvm_apic_match_dest(vcpu, NULL, irqe.shorthand,
-				irqe.dest_id, irqe.dest_mode))
+				irqe.dest_id, irqe.dest_mode)) {
 				__set_bit(irqe.vector, eoi_exit_bitmap);
+				if (index == 8)
+					__set_bit(vcpu->vcpu_id, rtc_map);
+			} else if (index == 8)
+				__clear_bit(vcpu->vcpu_id, rtc_map);
 		}
 	}
 	spin_unlock(&ioapic->lock);
-- 
1.7.1



[PATCH v4 4/7] KVM: Introduce struct rtc_status

2013-03-20 Thread Yang Zhang
From: Yang Zhang yang.z.zh...@intel.com

Signed-off-by: Yang Zhang yang.z.zh...@intel.com
---
 virt/kvm/ioapic.h |9 +
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 1f16088..b727657 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -34,6 +34,12 @@ struct kvm_vcpu;
 #define	IOAPIC_INIT	0x5
 #define	IOAPIC_EXTINT	0x7
 
+struct rtc_status {
+   int need_eoi;
+   DECLARE_BITMAP(vcpu_map, KVM_MAX_VCPUS);
+   DECLARE_BITMAP(expected_eoi_bitmap, KVM_MAX_VCPUS);
+};
+
 struct kvm_ioapic {
u64 base_address;
u32 ioregsel;
@@ -47,6 +53,9 @@ struct kvm_ioapic {
void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
DECLARE_BITMAP(handled_vectors, 256);
+#ifdef CONFIG_X86
+   struct rtc_status rtc_status;
+#endif
 };
 
 #ifdef DEBUG
-- 
1.7.1



[PATCH v4 3/7] KVM: Add vcpu info to ioapic_update_eoi()

2013-03-20 Thread Yang Zhang
From: Yang Zhang yang.z.zh...@intel.com

Add vcpu info to ioapic_update_eoi, so we can know which vcpu
issued this EOI.

Signed-off-by: Yang Zhang yang.z.zh...@intel.com
---
 arch/x86/kvm/lapic.c |2 +-
 virt/kvm/ioapic.c|   12 ++--
 virt/kvm/ioapic.h|3 ++-
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1210866..ed80244 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -735,7 +735,7 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
 		trigger_mode = IOAPIC_LEVEL_TRIG;
 	else
 		trigger_mode = IOAPIC_EDGE_TRIG;
-	kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+	kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
}
 }
 
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index e7c460c..ddf9414 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -272,8 +272,8 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int 
irq_source_id)
spin_unlock(ioapic-lock);
 }
 
-static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
-int trigger_mode)
+static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
+   struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
int i;
 
@@ -312,12 +312,12 @@ bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
 	return test_bit(vector, ioapic->handled_vectors);
 }
 
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
 {
-	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
 
 	spin_lock(&ioapic->lock);
-	__kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
+	__kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode);
 	spin_unlock(&ioapic->lock);
 }
 
@@ -415,7 +415,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
break;
 #ifdef CONFIG_IA64
case IOAPIC_REG_EOI:
-   __kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG);
+   __kvm_ioapic_update_eoi(NULL, ioapic, data, IOAPIC_LEVEL_TRIG);
break;
 #endif
 
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index e3538ba..1f16088 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -70,7 +70,8 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
+   int trigger_mode);
 bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_destroy(struct kvm *kvm);
-- 
1.7.1



RE: [PATCH v2 6/8] KVM: Add reset/restore rtc_status support

2013-03-20 Thread Zhang, Yang Z
I have sent out the latest patch (v4). Please comment on the latest one,
because some issues you pointed out may no longer exist in the latest
patch. If an issue still exists, please point it out again.

Zhang, Yang Z wrote on 2013-03-20:
 Marcelo Tosatti wrote on 2013-03-20:
 On Mon, Mar 18, 2013 at 03:24:37PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 reset/restore rtc_status when ioapic reset/restore.
 
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  arch/x86/kvm/lapic.c |8 
  arch/x86/kvm/lapic.h |1 +
  virt/kvm/ioapic.c|   33 +
  3 files changed, 42 insertions(+), 0 deletions(-)
 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 6fb22e3..a223170 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -94,6 +94,14 @@ static inline int apic_test_vector(int vec, void *bitmap)
 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
  }
 +bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
 +{
 +   struct kvm_lapic *apic = vcpu->arch.apic;
 +
 +   return apic_test_vector(vector, apic->regs + APIC_ISR) ||
 +           apic_test_vector(vector, apic->regs + APIC_IRR);
 +}
 +
 
 Should hook into kvm_lapic_reset and kvm_vcpu_ioctl_set_lapic to
 generate updates.
rtc_irq_restore will be called after the lapic is restored. What's the
problem if we don't hook into those two functions?
 
  static inline void apic_set_vector(int vec, void *bitmap)
  {
 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
  diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
  index 3a0f9d8..e2a03d1 100644
  --- a/arch/x86/kvm/lapic.h
  +++ b/arch/x86/kvm/lapic.h
  @@ -160,5 +160,6 @@ void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
 
  void kvm_get_dest_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
 unsigned long *vcpu_map);
 +bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
  #endif
 diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
 index 659511d..6266d1f 100644
 --- a/virt/kvm/ioapic.c
 +++ b/virt/kvm/ioapic.c
  @@ -88,6 +88,27 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
  }
  
  #ifdef CONFIG_X86
 +static void rtc_irq_reset(struct kvm_ioapic *ioapic)
 +{
 +   ioapic->rtc_status.need_eoi = 0;
 +   bitmap_zero(ioapic->rtc_status.vcpu_map, KVM_MAX_VCPUS);
 +}
 +
 +static void rtc_irq_restore(struct kvm_ioapic *ioapic)
 +{
 +   struct kvm_vcpu *vcpu;
 +   int vector, i, need_eoi = 0, rtc_pin = 8;
 +
 +   vector = ioapic->redirtbl[rtc_pin].fields.vector;
 +   kvm_for_each_vcpu(i, vcpu, ioapic->kvm) {
 +   if (kvm_apic_pending_eoi(vcpu, vector)) {
 +   need_eoi++;
 +   set_bit(vcpu->vcpu_id, ioapic->rtc_status.vcpu_map);
 
 Why set bit on vcpu_map here?
 We will set need_eoi here. And if the target vcpu is not in vcpu_map, then
 it will not update need_eoi on EOI.
 
 Best regards,
 Yang


Best regards,
Yang



RE: [PATCH v6 5/5] KVM : VMX: Use posted interrupt to deliver virtual interrupt

2013-03-20 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-03-20:
 On Tue, Mar 19, 2013 at 12:27:38PM -0300, Marcelo Tosatti wrote:
 On Tue, Mar 19, 2013 at 12:19:55PM -0300, Marcelo Tosatti wrote:
 See the previous argument: should never enter guest mode with PIR ON bit
 set. With logic above:
 
 context1context2  context3
 set_bit(PIR-1)
 r = pi_test_and_set_on()
 set_bit(PIR-40)
 set_bit(KVM_REQ_EVENT)
 if (kvm_check_request(KVM_REQ_EVENT)
  if (test_and_clear_bit(on))
kvm_apic_update_irr()r =
 pi_test_and_set_on()
 
 guest entry with PIR ON=1
 
 
 That's the reason for unconditional clearing on guest entry: it is easy
 to verify it's correct. I understand and agree the callback (and VMWRITE)
 is not nice.
 
 Re: KVM_REQ_EVENT setting after set_bit(KVM_REQ_EVENT) assures no guest
 entry with PIR ON=1.
 
 Might be, would have to verify. It's trickier though. Maybe add a FIXME:
 to the callback and remove it later.
 We have time still. RTC series is not ready yet. I'll think hard and try
 to poke holes in the logic in this patch and you do the same for what I
 propose.
Any thoughts? As far as I see, the two solutions are both ok. It's hard to
say which is better. But clearing the ON bit in sync_pir_irr should be
clearer and closer to the hardware's behavior.

Best regards,
Yang




Re: [PATCH v6 5/5] KVM : VMX: Use posted interrupt to deliver virtual interrupt

2013-03-20 Thread Gleb Natapov
On Wed, Mar 20, 2013 at 11:47:49AM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-03-20:
  On Tue, Mar 19, 2013 at 12:27:38PM -0300, Marcelo Tosatti wrote:
  On Tue, Mar 19, 2013 at 12:19:55PM -0300, Marcelo Tosatti wrote:
  See the previous argument: should never enter guest mode with PIR ON bit
  set. With logic above:
  
  context1  context2  context3
set_bit(PIR-1)
r = pi_test_and_set_on()
  set_bit(PIR-40)
set_bit(KVM_REQ_EVENT)
  if (kvm_check_request(KVM_REQ_EVENT)
   if (test_and_clear_bit(on))
 kvm_apic_update_irr()  r =
  pi_test_and_set_on()
  
  guest entry with PIR ON=1
  
  
  That's the reason for unconditional clearing on guest entry: it is easy
  to verify it's correct. I understand and agree the callback (and VMWRITE)
  is not nice.
  
  Re: KVM_REQ_EVENT setting after set_bit(KVM_REQ_EVENT) assures no guest
  entry with PIR ON=1.
  
  Might be, would have to verify. It's trickier though. Maybe add a FIXME:
  to the callback and remove it later.
  We have time still. RTC series is not ready yet. I'll think hard and try
  to poke holes in the logic in this patch and you do the same for what I
  propose.
 Any thoughts? As far as I see, the two solutions are both ok. It's hard to
 say which is better. But clearing the ON bit in sync_pir_irr should be
 clearer and closer to the hardware's behavior.
 
Let's go with it unless we see why it will not work.

--
Gleb.


RE: [PATCH v6 5/5] KVM : VMX: Use posted interrupt to deliver virtual interrupt

2013-03-20 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-03-20:
 On Wed, Mar 20, 2013 at 11:47:49AM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-03-20:
 On Tue, Mar 19, 2013 at 12:27:38PM -0300, Marcelo Tosatti wrote:
 On Tue, Mar 19, 2013 at 12:19:55PM -0300, Marcelo Tosatti wrote:
 See the previous argument: should never enter guest mode with PIR ON
 bit set. With logic above:
 
 context1  context2  context3
   set_bit(PIR-1)
   r = pi_test_and_set_on()
 set_bit(PIR-40)
   set_bit(KVM_REQ_EVENT)
 if (kvm_check_request(KVM_REQ_EVENT)
  if (test_and_clear_bit(on))
kvm_apic_update_irr()  r =
 pi_test_and_set_on()
 
 guest entry with PIR ON=1
 
 
 That's the reason for unconditional clearing on guest entry: it is easy
 to verify it's correct. I understand and agree the callback (and VMWRITE)
 is not nice.
 
 Re: KVM_REQ_EVENT setting after set_bit(KVM_REQ_EVENT) assures no
 guest entry with PIR ON=1.
 
 Might be, would have to verify. It's trickier though. Maybe add a FIXME:
 to the callback and remove it later.
 We have time still. RTC series is not ready yet. I'll think hard and try
 to poke holes in the logic in this patch and you do the same for what I
 propose.
 Any thoughts? As far as I see, the two solutions are both ok. It's hard to
 say which is better. But clearing the ON bit in sync_pir_irr should be
 clearer and closer to the hardware's behavior.
 
 Let's go with it unless we see why it will not work.
Sure.

Best regards,
Yang




Re: [PATCH v4 1/7] KVM: Call kvm_apic_match_dest() to check destination vcpu

2013-03-20 Thread Gleb Natapov
On Wed, Mar 20, 2013 at 07:36:13PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 For a given vcpu, kvm_apic_match_dest() will quickly tell you whether
 the vcpu is in the destination list. Drop kvm_calculate_eoi_exitmap()
 and use kvm_apic_match_dest() instead.
 
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  arch/x86/kvm/lapic.c |   47 ---
  arch/x86/kvm/lapic.h |4 
  virt/kvm/ioapic.c|7 +--
  3 files changed, 5 insertions(+), 53 deletions(-)
 
 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 02b51dd..ee14f94 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -145,53 +145,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
  	return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
  }
  
 -void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
 - struct kvm_lapic_irq *irq,
 - u64 *eoi_exit_bitmap)
 -{
 - struct kvm_lapic **dst;
 - struct kvm_apic_map *map;
 - unsigned long bitmap = 1;
 - int i;
 -
 - rcu_read_lock();
 - map = rcu_dereference(vcpu->kvm->arch.apic_map);
 -
 - if (unlikely(!map)) {
 - __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
 - goto out;
 - }
 -
 - if (irq->dest_mode == 0) { /* physical mode */
 - if (irq->delivery_mode == APIC_DM_LOWEST ||
 - irq->dest_id == 0xff) {
 - __set_bit(irq->vector,
 -   (unsigned long *)eoi_exit_bitmap);
 - goto out;
 - }
 - dst = &map->phys_map[irq->dest_id & 0xff];
 - } else {
 - u32 mda = irq->dest_id << (32 - map->ldr_bits);
 -
 - dst = map->logical_map[apic_cluster_id(map, mda)];
 -
 - bitmap = apic_logical_id(map, mda);
 - }
 -
 - for_each_set_bit(i, &bitmap, 16) {
 - if (!dst[i])
 - continue;
 - if (dst[i]->vcpu == vcpu) {
 - __set_bit(irq->vector,
 -   (unsigned long *)eoi_exit_bitmap);
 - break;
 - }
 - }
 -
 -out:
 - rcu_read_unlock();
 -}
 -
  static void recalculate_apic_map(struct kvm *kvm)
  {
   struct kvm_apic_map *new, *old = NULL;
 diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
 index 1676d34..5806ef8 100644
 --- a/arch/x86/kvm/lapic.h
 +++ b/arch/x86/kvm/lapic.h
 @@ -154,8 +154,4 @@ static inline u16 apic_logical_id(struct kvm_apic_map 
 *map, u32 ldr)
  	return ldr & map->lid_mask;
  }
  
 -void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
 - struct kvm_lapic_irq *irq,
 - u64 *eoi_bitmap);
 -
  #endif
 diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
 index ce82b94..4443075 100644
 --- a/virt/kvm/ioapic.c
 +++ b/virt/kvm/ioapic.c
 @@ -135,8 +135,11 @@ void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
  	irqe.dest_id = e->fields.dest_id;
  	irqe.vector = e->fields.vector;
  	irqe.dest_mode = e->fields.dest_mode;
 -	irqe.delivery_mode = e->fields.delivery_mode << 8;
 -	kvm_calculate_eoi_exitmap(vcpu, &irqe, eoi_exit_bitmap);
 +
 +	if (kvm_apic_match_dest(vcpu, NULL, irqe.shorthand,
 +				irqe.dest_id, irqe.dest_mode))
 +		__set_bit(irqe.vector,
 +			  (unsigned long *)eoi_exit_bitmap);
Nice! But you do not need irqe any more.

   }
   }
   spin_unlock(ioapic-lock);
 -- 
 1.7.1

--
Gleb.


Re: [PATCH v4 5/7] KVM: Recalculate destination vcpu map

2013-03-20 Thread Gleb Natapov
On Wed, Mar 20, 2013 at 07:36:17PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 Update the RTC interrupt's destination vcpu map when the ioapic entry of
 RTC or an apic register (id, ldr, dfr) is changed.
 
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  virt/kvm/ioapic.c |9 +++--
  1 files changed, 7 insertions(+), 2 deletions(-)
 
 diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
 index ddf9414..91b4c08 100644
 --- a/virt/kvm/ioapic.c
 +++ b/virt/kvm/ioapic.c
 @@ -121,6 +121,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
  {
  	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
  	union kvm_ioapic_redirect_entry *e;
 +	unsigned long *rtc_map = ioapic->rtc_status.vcpu_map;
   struct kvm_lapic_irq irqe;
   int index;
  
 @@ -130,15 +131,19 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
  	if (!e->fields.mask &&
  	    (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
  	     kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC,
 -			index))) {
 +			index) || index == 8)) {
  		irqe.dest_id = e->fields.dest_id;
  		irqe.vector = e->fields.vector;
  		irqe.dest_mode = e->fields.dest_mode;
  		irqe.shorthand = 0;
 
  		if (kvm_apic_match_dest(vcpu, NULL, irqe.shorthand,
 -			irqe.dest_id, irqe.dest_mode))
 +			irqe.dest_id, irqe.dest_mode)) {
  			__set_bit(irqe.vector, eoi_exit_bitmap);
 +			if (index == 8)
 +				__set_bit(vcpu->vcpu_id, rtc_map);
 +		} else if (index == 8)
 +			__clear_bit(vcpu->vcpu_id, rtc_map);
rtc_map bitmap is accessed from different vcpus simultaneously so access
has to be atomic. We also have a race:

vcpu0   iothread
ioapic config changes
request scan ioapic
 inject rtc interrupt
 use old vcpu mask
scan_ioapic()
recalculate vcpu mask

So this approach (suggested by me :() will not work.

Need to think about it some more. May be your idea of building a bitmap
while injecting the interrupt is the way to go indeed: pass a pointer to
a bitmap to kvm_irq_delivery_to_apic() and build it there. Pass NULL
pointer if caller does not need to track vcpus.
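
I.e. something like this (a sketch only -- lowest-priority handling is
omitted and the exact signature is up for discussion, so treat the names
as illustrative):

	int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
				     struct kvm_lapic_irq *irq,
				     unsigned long *dest_map)
	{
		int i, r = -1;
		struct kvm_vcpu *vcpu;

		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
						 irq->dest_id, irq->dest_mode))
				continue;

			/* Record the real destinations while delivering. */
			if (dest_map)
				__set_bit(vcpu->vcpu_id, dest_map);

			if (r < 0)
				r = 0;
			r += kvm_apic_set_irq(vcpu, irq);
		}

		return r;
	}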

--
Gleb.


[PATCH] bookehv: Handle debug exception on guest exit

2013-03-20 Thread Bharat Bhushan
EPCR.DUVD controls whether the debug events can come in
hypervisor mode or not. When the KVM guest is using the debug
resources then we do not want debug events to be captured
in the guest entry/exit path. So we set EPCR.DUVD when entering
and clear EPCR.DUVD when exiting from the guest.

"Debug instruction complete" is a post-completion debug
exception, but the debug event gets posted on the basis of MSR
before the instruction is executed. Now if the instruction
switches the context from guest mode (MSR.GS = 1) to hypervisor
mode (MSR.GS = 0), then xSRR0 points to the first instruction of
the KVM handler and xSRR1 shows that MSR.GS is clear (hypervisor
context). As xSRR1.GS is used to decide whether the KVM handler
or the host kernel debug handler will be invoked to handle the
exception, this leads to the host kernel debug handler handling
an exception which should instead be handled by KVM.

This is tested on e500mc in 32-bit mode.

Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
---
v0:
 - Do not apply this change for debug_crit, as we do not know whether those
   chips have the issue or not.
 - Corrected 64-bit case branching

 arch/powerpc/kernel/exceptions-64e.S |   29 -
 arch/powerpc/kernel/head_booke.h |   26 ++
 2 files changed, 54 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 4684e33..8b26294 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -516,6 +516,33 @@ kernel_dbg_exc:
andis.  r15,r14,DBSR_IC@h
beq+1f
 
+#ifdef CONFIG_KVM_BOOKE_HV
+   /*
+	 * EPCR.DUVD controls whether the debug events can come in
+	 * hypervisor mode or not. When the KVM guest is using the debug
+	 * resources then we do not want debug events to be captured
+	 * in the guest entry/exit path. So we set EPCR.DUVD when entering
+	 * and clear EPCR.DUVD when exiting from the guest.
+	 * "Debug instruction complete" is a post-completion debug
+	 * exception, but the debug event gets posted on the basis of MSR
+	 * before the instruction is executed. Now if the instruction
+	 * switches the context from guest mode (MSR.GS = 1) to hypervisor
+	 * mode (MSR.GS = 0), then xSRR0 points to the first instruction of
+	 * the KVM handler and xSRR1 shows that MSR.GS is clear
+	 * (hypervisor context). As xSRR1.GS is used to decide whether
+	 * the KVM handler or the host kernel debug handler will be
+	 * invoked to handle the exception, this leads to the host
+	 * kernel debug handler handling an exception which should
+	 * instead be handled by KVM.
+*/
+   mfspr   r10, SPRN_EPCR
+   andis.  r10,r10,SPRN_EPCR_DUVD@h
+   beq+2f
+
+   andis.  r10,r9,MSR_GS@h
+   beq+3f
+2:
+#endif
LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e)
cmpld   cr0,r10,r14
@@ -523,7 +550,7 @@ kernel_dbg_exc:
blt+cr0,1f
bge+cr1,1f
 
-   /* here it looks like we got an inappropriate debug exception. */
+3: /* here it looks like we got an inappropriate debug exception. */
lis r14,DBSR_IC@h   /* clear the IC event */
rlwinm  r11,r11,0,~MSR_DE   /* clear DE in the DSRR1 value */
mtspr   SPRN_DBSR,r14
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 5f051ee..edc6a3b 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -285,7 +285,33 @@ label:
mfspr   r10,SPRN_DBSR;  /* check single-step/branch taken */  \
andis.  r10,r10,(DBSR_IC|DBSR_BT)@h;  \
beq+2f;   \
+#ifdef CONFIG_KVM_BOOKE_HV   \
+   /*\
+	 * EPCR.DUVD controls whether the debug events can come in	      \
+	 * hypervisor mode or not. When the KVM guest is using the debug     \
+	 * resources then we do not want debug events to be captured	      \
+	 * in the guest entry/exit path. So we set EPCR.DUVD when entering   \
+	 * and clear EPCR.DUVD when exiting from the guest.		      \
+	 * "Debug instruction complete" is a post-completion debug	      \
+	 * exception, but the debug event gets posted on the basis of MSR    \
+	 * before the instruction is executed. Now if the instruction	      \
+	 * switches the context from guest mode (MSR.GS = 1) to hypervisor   \
+	 * mode (MSR.GS = 0), then xSRR0 points to the first instruction of  \
+	 * the KVM handler and xSRR1 shows that MSR.GS is clear	      \
+

Re: [PATCH 4/5] KVM: MMU: store generation-number into mmio spte

2013-03-20 Thread Marcelo Tosatti
On Mon, Mar 18, 2013 at 08:42:09PM +0800, Xiao Guangrong wrote:
 On 03/18/2013 07:19 PM, Paolo Bonzini wrote:
  On 15/03/2013 16:29, Xiao Guangrong wrote:
  +/*
  + * spte bits of bit 3 ~ bit 11 are used as low 9 bits of
  + * generation, the bits of bits 52 ~ bit 61 are used as
  + * high 12 bits of generation.
  + */
  
  High 10 bits.
 
 Yes, i forgot to update the comments! Thank you, Paolo!
 
  
  How often does the generation number change?  Especially with Takuya's
  patches, using just 10 bits for the generation might be enough and
  simplifies the code.
 
 I guess it's only updated when the guest is initializing or doing hotplug.
 My original thought was that if the number is big enough that it could not
 wrap around in almost all hotplug cases, the changes introduced by Takuya
 are not necessary. But if you guys want to keep them, 10 bits should
 be enough. :)

Since an overflow can happen in normal scenarios (cirrus emulation can
delete/add memory slot on legacy mode, with syslinux, for example), it
would be good to handle the overflow scenario. It also means bits for
generation number can be reduced.

Can walk the GPA range being added/moved instead of walking
active_mmu_pages (should be much faster since usually it's a small slot
being added/deleted).
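
The fault-path check can then stay cheap. Conceptually (a sketch only,
assuming helpers that pack/unpack the generation bits as described above;
the names are illustrative):

	static bool check_mmio_spte(struct kvm *kvm, u64 spte)
	{
		/*
		 * A stale generation means the memslots changed (or the
		 * counter wrapped) after this spte was created: treat the
		 * spte as invalid and take the slow path, which re-walks
		 * the current memslots.
		 */
		return get_mmio_spte_generation(spte) ==
		       kvm_current_mmio_generation(kvm);
	}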



Re: [PATCH kvm-unit-tests 2/5] x86/README: Drop it

2013-03-20 Thread Marcelo Tosatti
On Fri, Mar 15, 2013 at 08:09:06PM -0400, Cole Robinson wrote:
 Was out of date, and not particularly useful to begin with.
 ---
  x86/README | 16 
  1 file changed, 16 deletions(-)
  delete mode 100644 x86/README
 
 diff --git a/x86/README b/x86/README
 deleted file mode 100644
 index d644abd..000
 --- a/x86/README
 +++ /dev/null
 @@ -1,16 +0,0 @@
 -Tests in this directory and what they do:
 -
 -access: lots of page table related access (pte/pde) (read/write)
 -apic: enable x2apic, self ipi, ioapic intr, ioapic simultaneous
 -emulator: move to/from regs, cmps, push, pop, to/from cr8, smsw and lmsw
 -hypercall: intel and amd hypercall insn
 -msr: write to msr (only KERNEL_GS_BASE for now)
 -port80: lots of out to port 80
 -realmode: goes back to realmode, shld, push/pop, mov immediate, cmp 
 immediate, add immediate,
 - io, eflags instructions (clc, cli, etc.), jcc short, jcc near, 
 call, long jmp, xchg
 -sieve: heavy memory access with no paging and with paging static and with 
 paging vmalloc'ed
 -smptest: run smp_id() on every cpu and compares return value to number
 -tsc: write to tsc(0) and write to tsc(1000) and read it back
 -vmexit: long loops for each: cpuid, vmcall, mov_from_cr8, mov_to_cr8, 
 inl_pmtimer, ipi, ipi+halt
 -kvmclock_test: test of wallclock, monotonic cycle and performance of kvmclock
 -pcid: basic functionality test of PCID/INVPCID feature
 \ No newline at end of file
 -- 
 1.8.1.4

The summary can be useful (would rather update it than remove it).



Re: [PATCH kvm-unit-tests 5/5] x86-run: Pull extra arguments from unittests.cfg

2013-03-20 Thread Marcelo Tosatti
On Sun, Mar 17, 2013 at 07:58:56PM -0400, Cole Robinson wrote:
 On 03/17/2013 11:25 AM, Gleb Natapov wrote:
  On Fri, Mar 15, 2013 at 08:09:09PM -0400, Cole Robinson wrote:
  Some tests want extra arguments as enumerated in unittests.cfg,
  use them.
 
  unittests.cfg also has a few sections about invoking certains tests
  with different combinations of options, but x86-run doesn't do
  anything with that.
  With this it will not be possible to use x86-run outside of autotest,
  no?
  
 
 Not true, x86-run is still meant to be the standalone helper script for
 running unittests. autotest doesn't care about x86-run, and ConfigParser is a
 standard python module.
 
 x86/unittests.cfg already exists in the kvm-unit-tests repo, I assumed it was
 encoding required test options but maybe I'm wrong about that. It's still
 useful to build off of if there's value in running some tests with different
 combinations of parameters.

I fail to see what the point is here.

unittests.cfg has been originally (and continues to be, AFAIK), intended
for autotest:
http://kerneltrap.org/mailarchive/linux-kvm/2010/6/24/6264146

Please don't remove manual execution from README.




Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command

2013-03-20 Thread Benjamin Herrenschmidt
On Wed, 2013-03-20 at 12:48 -0600, Alex Williamson wrote:
 Perhaps my problem is that I don't have a clear picture of where
 you're
 going with this like I do for AER.  For AER we're starting with
 notification of an error, from that we build into how to retrieve the
 error information, and finally how to perform corrective action.  Each
 of these will be done through vifo-pci.
 
 Here we're starting by registering a mapping that's really only useful
 to the vfio accelerator path, but we don't even have a hint of what
 the non-accelerated path is and how vfio is involved with it.  Thanks,

I'm surprised that you are building so much policy around AER ... can't
you just pass the raw stuff down to the guest and let the guest do its
own corrective actions?

As for EEH, I will let Gavin describe in more details what he is doing,
though I wouldn't be surprised if so far he doesn't have a
non-accelerated path :-) Which indeed makes things oddball, granted ...
at least for now. I *think* what Gavin's doing right now is a
pass-through to the host EEH directly in the kernel, so without a slow
path...

Gavin, it really boils down to that. In-kernel EEH for guests is a
KVMism that ends up not involving VFIO in any other way than
establishing the mapping, then arguably it could be done via a VM ioctl.

If there's more going through VFIO and shared state, then it should
probably go through VFIO-PCI.

Note (FYI) that EEH somewhat encompasses AER... the EEH logic triggers on
AER errors as well, and the error reports generated by the firmware
contain the AER register dump in addition to the bridge-internal stuff.
I.e., EEH covers pretty much every sort of error, correctable or not,
that happens on PCI. It adds a mechanism for isolating a domain on the
first error (by blocking MMIOs, DMAs and MSIs), which helps prevent
propagation of bad data, plus various recovery schemes.

Cheers,
Ben.




Re: [PATCH] vmxcap: Update according to SDM of January 2013

2013-03-20 Thread Marcelo Tosatti
On Sun, Mar 17, 2013 at 11:45:50AM +0100, Jan Kiszka wrote:
 From: Jan Kiszka jan.kis...@siemens.com
 
 This adds reporting of VMCS shadowing, #VE, IA32_SMBASE, unrestricted
 VMWRITE and fixes the range of the MSEG revision ID.
 
 Signed-off-by: Jan Kiszka jan.kis...@siemens.com

Applied, thanks.



Re: [PATCH] KVM: x86: Avoid busy loops over uninjectable pending APIC timers

2013-03-20 Thread Marcelo Tosatti
On Sun, Mar 17, 2013 at 12:47:17PM +0200, Gleb Natapov wrote:
 On Sun, Mar 17, 2013 at 11:45:34AM +0100, Jan Kiszka wrote:
  On 2013-03-17 09:47, Gleb Natapov wrote:
   On Sat, Mar 16, 2013 at 09:49:07PM +0100, Jan Kiszka wrote:
   From: Jan Kiszka jan.kis...@siemens.com
  
   If the guest didn't take the last APIC timer interrupt yet and generates
   another one on top, e.g. via periodic mode, we do not block the VCPU
   even if the guest state is halted. The reason is that
   apic_has_pending_timer continues to return a non-zero value.
  
   Fix this busy loop by taking the IRR content for the LVT vector in
   apic_has_pending_timer into account.
  
    Just drop coalescing tracking for the lapic interrupt. After posted interrupts
    are merged, __apic_accept_irq() will no longer return coalescing
    information, so the code will be dead anyway.
  
  That requires the RTC decoalescing series to go first to avoid a
  regression, no? Then let's postpone this topic for now.
  
 Yes, but decoalescing will work only for RTC :(

Are you proposing to drop LAPIC interrupt reinjection? 
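
For reference, the fix Jan describes amounts to checking the IRR before
reporting a pending timer. A rough sketch paraphrasing the commit message,
not the actual patch (helper names assumed):

/*
 * Sketch only: if the LVT timer vector is still pending in the IRR,
 * the guest has not taken the previous interrupt, so do not report
 * the timer as pending again; a halted vcpu can then really block
 * instead of busy-looping.
 */
int apic_has_pending_timer(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	u32 vector = kvm_apic_get_reg(apic, APIC_LVTT) & APIC_VECTOR_MASK;

	if (apic_test_vector(vector, apic->regs + APIC_IRR))
		return 0;	/* last injection not yet consumed */

	return atomic_read(&apic->lapic_timer.pending);
}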



Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command

2013-03-20 Thread Alex Williamson
On Wed, 2013-03-20 at 20:31 +0100, Benjamin Herrenschmidt wrote:
 On Wed, 2013-03-20 at 12:48 -0600, Alex Williamson wrote:
  Perhaps my problem is that I don't have a clear picture of where
  you're
  going with this like I do for AER.  For AER we're starting with
  notification of an error, from that we build into how to retrieve the
  error information, and finally how to perform corrective action.  Each
  of these will be done through vfio-pci.  Thanks,
  
  Here we're starting by registering a mapping that's really only useful
  to the vfio accelerator path, but we don't even have a hint of what
  the non-accelerated path is and how vfio is involved with it.  Thanks,
 
 I'm surprised that you are building so much policy around AER ... can't
 you just pass the raw stuff down to the guest and let the guest do its
 own corrective actions ?

How does the guest get the raw stuff?  We need to get the AER interrupt
out to the guest so it can be injected into the virtual PCIe port, then
we need to be able to retrieve the physical device log and pass it to
the qemu to mangle to match the guest topology.  We don't have existing
firmware interfaces for the guest to do that, so it's all being routed
through vfio-pci.

 As for EEH, I will let Gavin describe in more details what he is doing,
 though I wouldn't be surprised if so far he doesn't have a
 non-accelerated path :-) Which indeed makes things oddball, granted ...
 at least for now. I *think* what Gavin's doing right now is a
 pass-through to the host EEH directly in the kernel, so without a slow
 path...
 
 Gavin, it really boils down to that. In-kernel EEH for guests is a
 KVMism that ends up not involving VFIO in any other way than
 establishing the mapping, then arguably it could be done via a VM ioctl.
 
 If there's more going through VFIO and shared state, then it should
 probably go through VFIO-PCI.

Exactly my thinking.  Thanks,

Alex



Re: [PATCH] KVM: x86: Avoid busy loops over uninjectable pending APIC timers

2013-03-20 Thread Marcelo Tosatti
On Wed, Mar 20, 2013 at 04:30:33PM -0300, Marcelo Tosatti wrote:
 On Sun, Mar 17, 2013 at 12:47:17PM +0200, Gleb Natapov wrote:
  On Sun, Mar 17, 2013 at 11:45:34AM +0100, Jan Kiszka wrote:
   On 2013-03-17 09:47, Gleb Natapov wrote:
On Sat, Mar 16, 2013 at 09:49:07PM +0100, Jan Kiszka wrote:
From: Jan Kiszka jan.kis...@siemens.com
   
If the guest didn't take the last APIC timer interrupt yet and 
generates
another one on top, e.g. via periodic mode, we do not block the VCPU
even if the guest state is halted. The reason is that
apic_has_pending_timer continues to return a non-zero value.
   
Fix this busy loop by taking the IRR content for the LVT vector in
apic_has_pending_timer into account.
   
 Just drop coalescing tracking for the lapic interrupt. After posted interrupts
 are merged, __apic_accept_irq() will no longer return coalescing
 information, so the code will be dead anyway.
   
   That requires the RTC decoalescing series to go first to avoid a
   regression, no? Then let's postpone this topic for now.
   
  Yes, but decoalescing will work only for RTC :(
 
 Are you proposing to drop LAPIC interrupt reinjection? 

Since timer handling and injection is VCPU-local for LAPIC,
__apic_accept_irq can (and must) return coalesced information (cannot
drop LAPIC interrupt reinjection).




Re: [PATCH 16/29] arm64: KVM: HYP mode world switch implementation

2013-03-20 Thread Christopher Covington
Hi Marc,

On 03/13/2013 03:59 PM, Christopher Covington wrote:

[...]

 Alternatively, you could consider storing the host registers in a
 slimmed-down vcpu structure for hosts, rather than on the stack.

One potential argument for storing the host in the same sort of vcpu structure
as the guest rather than on the hypervisor stack is that snapshot and
migration support initially intended for guests might more easily be extended
to work for hosts as well.

Regards,
Christopher

-- 
Employee of Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by
the Linux Foundation


Re: [kvmarm] [PATCH 14/29] arm64: KVM: guest one-reg interface

2013-03-20 Thread Christopher Covington
Hi Marc, Peter,

On 03/14/2013 04:57 AM, Peter Maydell wrote:
 On 13 March 2013 20:34, Christopher Covington c...@codeaurora.org wrote:
 My guess at the goal of the code cited above in this email is that it's 
 trying
 to sanity check that virtualization will work. Rather than taking a default
 deny approach with a hand-maintained white list of virtualization-supporting
 machine identifiers, why not check that EL2 is implemented on the current
 system and if it's not implied by that, that the timer and interrupt
 controller are suitable as well?

[...]

 ...you need to implement emulation code for the imp-def registers for a
 guest CPU.

[...]

This is reasonable. In this light the code I was picking out above is simply
converting MIDRs to KVM_ARM_TARGET_* constants. Because the mapping isn't
one-to-one, the hand-maintained list is an acceptable approach.

In the long term, I wonder if some kind of KVM_TARGET_CURRENT_CPU might be 
handy.

Thanks,
Christopher

-- 
Employee of Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by
the Linux Foundation


KVM: MMU: improve n_max_mmu_pages calculation with TDP

2013-03-20 Thread Marcelo Tosatti

kvm_mmu_calculate_mmu_pages currently computes:

maximum number of shadow pages = 2% of mapped guest pages

This does not make sense for TDP guests, where mapping all of guest
memory with 4k pages cannot exceed mapped guest pages / 512
(not counting root pages).

Allow that maximum for TDP, forcing the guest to recycle otherwise.

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 956ca35..a9694a8d7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4293,7 +4293,7 @@ nomem:
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 {
unsigned int nr_mmu_pages;
-   unsigned int  nr_pages = 0;
+   unsigned int i, nr_pages = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
 
@@ -4302,7 +4302,19 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
kvm_for_each_memslot(memslot, slots)
	nr_pages += memslot->npages;
 
-   nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+   if (tdp_enabled) {
+   /* one root page */
+   nr_mmu_pages = 1;
+	/* nr_pages / (512^i) per level, due to
+	 * guest RAM map being linear */
+	for (i = 1; i < 4; i++) {
+		int nr_pages_round = nr_pages + (1 << (9*i));
+		nr_mmu_pages += nr_pages_round >> (9*i);
+   }
+   } else {
+   nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+   }
+
nr_mmu_pages = max(nr_mmu_pages,
(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
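
For illustration (not part of the patch): with a 4GB guest the new TDP
bound works out to roughly 8MB of shadow pages, versus ~82MB under the
old 2% rule. A sketch of the arithmetic:

/* Worked example of the loop above for nr_pages = 1 << 20
 * (4GB of guest RAM in 4K pages). */
unsigned int tdp_mmu_pages_example(void)
{
	unsigned int nr_pages = 1 << 20;
	unsigned int nr_mmu_pages = 1;	/* one root page */
	unsigned int i;

	for (i = 1; i < 4; i++)
		nr_mmu_pages += (nr_pages + (1 << (9 * i))) >> (9 * i);

	/* 1 + 2049 + 5 + 1 = 2056 pages, ~8MB; the old rule gives
	 * nr_pages * 20 / 1000 = 20971 pages, ~82MB. */
	return nr_mmu_pages;
}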
 


Re: [PATCH] vfio powerpc: implement IOMMU driver for VFIO

2013-03-20 Thread Alex Williamson
On Tue, 2013-03-19 at 18:08 +1100, Alexey Kardashevskiy wrote:
 VFIO implements platform independent stuff such as
 a PCI driver, BAR access (via read/write on a file descriptor
 or direct mapping when possible) and IRQ signaling.
 
 The platform dependent part includes IOMMU initialization
 and handling. This patch implements an IOMMU driver for VFIO
 which does mapping/unmapping pages for the guest IO and
 provides information about DMA window (required by a POWERPC
 guest).
 
 The counterpart in QEMU is required to support this functionality.
 
 Changelog:
 * documentation updated
 * container enable/disable ioctls added
 * request_module(spapr_iommu) added
 * various locks fixed
 
 Cc: David Gibson da...@gibson.dropbear.id.au
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---


Looking pretty good.  There's one problem with the detach_group,
otherwise just some trivial comments below.  What's the status of the
tce code that this depends on?  Thanks,

Alex

  Documentation/vfio.txt  |   63 ++
  drivers/vfio/Kconfig|6 +
  drivers/vfio/Makefile   |1 +
  drivers/vfio/vfio.c |1 +
  drivers/vfio/vfio_iommu_spapr_tce.c |  365 +++
  include/uapi/linux/vfio.h   |   38 
  6 files changed, 474 insertions(+)
  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
 
 diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
 index 8eda363..351f7c9 100644
 --- a/Documentation/vfio.txt
 +++ b/Documentation/vfio.txt
 @@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls.  The 
 read/write/mmap
  interfaces implement the device region access defined by the device's
  own VFIO_DEVICE_GET_REGION_INFO ioctl.
  
 +
 +PPC64 sPAPR implementation note
 +---
 +
 +This implementation has some specifics:
 +
 +1) Only one IOMMU group per container is supported, as an IOMMU group
 +represents the minimal entity for which isolation can be guaranteed, and
 +groups are allocated statically, one per Partitionable Endpoint (PE)
 +(a PE is often a PCI domain, but not always).
 +
 +2) The hardware supports so-called DMA windows - the PCI address range
 +within which DMA transfer is allowed; any attempt to access address space
 +outside the window leads to isolation of the whole PE.
 +
 +3) PPC64 guests are paravirtualized but not fully emulated. There is an API
 +to map/unmap pages for DMA, and it normally maps 1..32 pages per call and
 +currently there is no way to reduce the number of calls. In order to make 
 things
 +faster, the map/unmap handling has been implented in real mode which provides

s/implented/implemented/

 +an excellent performance which has limitations such as inability to do
 +locked pages accounting in real time.
 +
 +So 3 additional ioctls have been added:
 +
 + VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
 + of the DMA window on the PCI bus.
 +
 + VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting
 + is done at this point. This lets the user first find out what
 + the DMA window is and adjust rlimit before doing any real work.
 +
 + VFIO_IOMMU_DISABLE - disables the container.
 +
 +
 +The code flow from the example above should be slightly changed:
 +
 + .
 + /* Add the group to the container */
 + ioctl(group, VFIO_GROUP_SET_CONTAINER, container);
 +
 + /* Enable the IOMMU model we want */
 + ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU)
 +
 + /* Get additional sPAPR IOMMU info */
 + vfio_iommu_spapr_tce_info spapr_iommu_info;
 + ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, spapr_iommu_info);
 +
 + if (ioctl(container, VFIO_IOMMU_ENABLE))
 + /* Cannot enable container, may be low rlimit */
 +
 + /* Allocate some space and setup a DMA mapping */
 + dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
 +  MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
 +
 + dma_map.size = 1024 * 1024;
 + dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
 + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 +
 + /* Check here if .iova/.size are within the DMA window from spapr_iommu_info */
 +
 + ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map);
 + .
 +

Awesome, thanks for the docs update

  
 ---
  
  [1] VFIO was originally an acronym for Virtual Function I/O in its
 diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
 index 7cd5dec..b464687 100644
 --- a/drivers/vfio/Kconfig
 +++ b/drivers/vfio/Kconfig
 @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
   depends on VFIO
   default n
  
 +config VFIO_IOMMU_SPAPR_TCE
 + tristate
 + depends on VFIO && SPAPR_TCE_IOMMU
 + default n
 +
  menuconfig VFIO
   tristate 

Re: [PATCH V6 4/5] virtio-scsi: introduce multiqueue support

2013-03-20 Thread Venkatesh Srinivas

On Wed, Mar 20, 2013 at 03:01:23PM +0800, Wanlong Gao wrote:

From: Paolo Bonzini pbonz...@redhat.com

This patch adds queue steering to virtio-scsi.  When a target is sent
multiple requests, we always drive them to the same queue so that FIFO
processing order is kept.  However, if a target was idle, we can choose
a queue arbitrarily.  In this case the queue is chosen according to the
current VCPU, so the driver expects the number of request queues to be
equal to the number of VCPUs.  This makes it easy and fast to select
the queue, and also lets the driver optimize the IRQ affinity for the
virtqueues (each virtqueue's affinity is set to the CPU that owns
the queue).

The speedup comes from improving cache locality and giving CPU affinity
to the virtqueues, which is why this scheme was selected.  Assuming that
the thread that is sending requests to the device is I/O-bound, it is
likely to be sleeping at the time the ISR is executed, and thus executing
the ISR on the same processor that sent the requests is cheap.

However, the kernel will not execute the ISR on the best processor
unless you explicitly set the affinity.  This is because in practice
you will have many such I/O-bound processes and thus many otherwise
idle processors.  Then the kernel will execute the ISR on a random
processor, rather than the one that is sending requests to the device.

The alternative to per-CPU virtqueues is per-target virtqueues.  To
achieve the same locality, we could dynamically choose the virtqueue's
affinity based on the CPU of the last task that sent a request.  This
is less appealing because we do not set the affinity directly---we only
provide a hint to the irqbalanced running in userspace.  Dynamically
changing the affinity only works if the userspace applies the hint
fast enough.


Looks good! Tested as V5.

Tested-by: Venkatesh Srinivas venkate...@google.com

-- vs;
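
For readers skimming the thread, the steering rule described in the
commit message boils down to roughly the following (a sketch with
assumed field and helper names, not the actual driver code):

/* If the target already has requests in flight, reuse its queue so
 * FIFO order is kept; otherwise pick the current CPU's queue. */
static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
				struct virtio_scsi_target_state *tgt)
{
	struct virtio_scsi_vq *vq;
	unsigned long flags;
	u32 queue_num;

	spin_lock_irqsave(&tgt->tgt_lock, flags);
	if (atomic_inc_return(&tgt->reqs) > 1) {
		vq = tgt->req_vq;	/* target busy: keep ordering */
	} else {
		queue_num = smp_processor_id();
		while (unlikely(queue_num >= vscsi->num_queues))
			queue_num -= vscsi->num_queues;
		tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
	}
	spin_unlock_irqrestore(&tgt->tgt_lock, flags);
	return vq;
}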


Re: [PATCH] KVM: x86: Avoid busy loops over uninjectable pending APIC timers

2013-03-20 Thread Gleb Natapov
On Wed, Mar 20, 2013 at 05:03:19PM -0300, Marcelo Tosatti wrote:
 On Wed, Mar 20, 2013 at 04:30:33PM -0300, Marcelo Tosatti wrote:
  On Sun, Mar 17, 2013 at 12:47:17PM +0200, Gleb Natapov wrote:
   On Sun, Mar 17, 2013 at 11:45:34AM +0100, Jan Kiszka wrote:
On 2013-03-17 09:47, Gleb Natapov wrote:
 On Sat, Mar 16, 2013 at 09:49:07PM +0100, Jan Kiszka wrote:
 From: Jan Kiszka jan.kis...@siemens.com

 If the guest didn't take the last APIC timer interrupt yet and 
 generates
 another one on top, e.g. via periodic mode, we do not block the VCPU
 even if the guest state is halted. The reason is that
 apic_has_pending_timer continues to return a non-zero value.

 Fix this busy loop by taking the IRR content for the LVT vector in
 apic_has_pending_timer into account.

 Just drop coalescing tracking for the lapic interrupt. After posted
 interrupts are merged, __apic_accept_irq() will no longer return
 coalescing information, so the code will be dead anyway.

That requires the RTC decoalescing series to go first to avoid a
regression, no? Then let's postpone this topic for now.

   Yes, but decoalescing will work only for RTC :(
  
  Are you proposing to drop LAPIC interrupt reinjection? 
 
 Since timer handling and injection is VCPU-local for LAPIC,
 __apic_accept_irq can (and must) return coalesced information (cannot
 drop LAPIC interrupt reinjection).
 
Why can't we drop LAPIC interrupt reinjection? Proposed posted interrupt
patches do not properly check for interrupt coalescing even for
VCPU-local injection.

--
Gleb.


Virtualbox svga card in KVM

2013-03-20 Thread Sriram Murthy


Hi,
 I am planning to bring the VirtualBox SVGA card into kvm as a new
svga card type (vbox, probably?) so that we can load the VirtualBox SVGA
card drivers in the guest.

 Is this even feasible? Any ideas on where I should start looking?

Regards,
Sriram


Re: [PATCH] KVM: x86: Avoid busy loops over uninjectable pending APIC timers

2013-03-20 Thread Marcelo Tosatti
On Wed, Mar 20, 2013 at 11:32:38PM +0200, Gleb Natapov wrote:
 On Wed, Mar 20, 2013 at 05:03:19PM -0300, Marcelo Tosatti wrote:
  On Wed, Mar 20, 2013 at 04:30:33PM -0300, Marcelo Tosatti wrote:
   On Sun, Mar 17, 2013 at 12:47:17PM +0200, Gleb Natapov wrote:
On Sun, Mar 17, 2013 at 11:45:34AM +0100, Jan Kiszka wrote:
 On 2013-03-17 09:47, Gleb Natapov wrote:
  On Sat, Mar 16, 2013 at 09:49:07PM +0100, Jan Kiszka wrote:
  From: Jan Kiszka jan.kis...@siemens.com
 
  If the guest didn't take the last APIC timer interrupt yet and 
  generates
  another one on top, e.g. via periodic mode, we do not block the 
  VCPU
  even if the guest state is halted. The reason is that
  apic_has_pending_timer continues to return a non-zero value.
 
  Fix this busy loop by taking the IRR content for the LVT vector in
  apic_has_pending_timer into account.
 
  Just drop coalescing tracking for the lapic interrupt. After posted
  interrupts are merged, __apic_accept_irq() will no longer return
  coalescing information, so the code will be dead anyway.
 
 That requires the RTC decoalescing series to go first to avoid a
 regression, no? Then let's postpone this topic for now.
 
Yes, but decoalescing will work only for RTC :(
   
   Are you proposing to drop LAPIC interrupt reinjection? 
  
  Since timer handling and injection is VCPU-local for LAPIC,
  __apic_accept_irq can (and must) return coalesced information (cannot
  drop LAPIC interrupt reinjection).
  
 Why can't we drop LAPIC interrupt reinjection? Proposed posted interrupt
 patches do not properly check for interrupt coalescing even for
 VCPU-local injection.
 
 --
   Gleb.

Because older Linux guests depend on reinjection for proper timekeeping.



Re: [PATCH] x86: correctly initialize the CS base on reset

2013-03-20 Thread Marcelo Tosatti
On Tue, Mar 19, 2013 at 04:30:26PM +0100, Paolo Bonzini wrote:
 The CS base was initialized to 0 on VMX (wrong, but usually overridden
 by userspace before starting) or 0xf0000 on SVM.  The correct value is
 0xffff0000, and VMX is able to emulate it now, so use it.
 
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  arch/x86/kvm/svm.c | 8 +---
  arch/x86/kvm/vmx.c | 1 +
  2 files changed, 2 insertions(+), 7 deletions(-)

Applied, thanks.



Re: [PATCH] KVM: allow host header to be included even for !CONFIG_KVM

2013-03-20 Thread Scott Wood

On 03/14/2013 07:13:46 PM, Kevin Hilman wrote:

The new context tracking subsystem unconditionally includes kvm_host.h
headers for the guest enter/exit macros.  This causes a compile
failure when KVM is not enabled.

Fix by adding an IS_ENABLED(CONFIG_KVM) check to kvm_host so it can
be included/compiled even when KVM is not enabled.

Cc: Frederic Weisbecker fweis...@gmail.com
Signed-off-by: Kevin Hilman khil...@linaro.org
---
Applies on v3.9-rc2

 include/linux/kvm_host.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)


This broke the PPC non-KVM build, which was relying on stub functions  
in kvm_ppc.h, which relies on struct vcpu in kvm_host.h.


Why can't the entirety of kvm_host.h be included regardless of CONFIG_KVM,  
just like most other feature-specific headers?  Why can't the if/else  
just go around the functions that you want to stub out for non-KVM  
builds?


-Scott
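
A minimal sketch of the alternative Scott describes, with illustrative
names (the real guest enter/exit helpers may differ):

#if IS_ENABLED(CONFIG_KVM)
void kvm_guest_enter(void);
void kvm_guest_exit(void);
#else
/* Header stays includable everywhere; only the KVM-only
 * functions get stubbed out. */
static inline void kvm_guest_enter(void) { }
static inline void kvm_guest_exit(void) { }
#endif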


Re: [PATCH] vfio powerpc: implement IOMMU driver for VFIO

2013-03-20 Thread Alexey Kardashevskiy

On 21/03/13 07:45, Alex Williamson wrote:

On Tue, 2013-03-19 at 18:08 +1100, Alexey Kardashevskiy wrote:

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.

Changelog:
* documentation updated
* container enable/disable ioctls added
* request_module(spapr_iommu) added
* various locks fixed

Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---



Looking pretty good.  There's one problem with the detach_group,
otherwise just some trivial comments below.  What's the status of the
tce code that this depends on?  Thanks,



It is done; I am just waiting until the other patches of the series are
reviewed (by you guys) and fixed (by me), and then I'll post everything again.


[skipped]


+static int tce_iommu_enable(struct tce_container *container)
+{
+	int ret = 0;
+	unsigned long locked, lock_limit, npages;
+	struct iommu_table *tbl = container->tbl;
+
+	if (!container->tbl)
+		return -ENXIO;
+
+	if (!current->mm)
+		return -ESRCH; /* process exited */
+
+	mutex_lock(&container->lock);
+	if (container->enabled) {
+		mutex_unlock(&container->lock);
+		return -EBUSY;
+	}
+
+   /*
+* Accounting for locked pages.
+*
+* On sPAPR platform, IOMMU translation table contains
+* an entry per 4K page. Every map/unmap request is sent
+* by the guest via hypercall and supposed to be handled
+* quickly, ecpesially in real mode (if such option is


s/ecpesially/especially/



I replaced the whole text by the one written by Ben and David.

[skipped]


+   }
+
+   return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	int ret;
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		mutex_unlock(&container->lock);
+		return -EBUSY;
+	}
+
+	ret = iommu_take_ownership(tbl);
+	if (!ret)
+		container->tbl = tbl;
+
+	mutex_unlock(&container->lock);
+
+	return ret;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else if (container->enabled) {
+		pr_err("tce_vfio: detaching group #%u from enabled container\n",
+				iommu_group_id(tbl->it_group));


Hmm, something more than a pr_err needs to happen here.  Wouldn't this
imply a disable and going back to an unprivileged container?


Ah. It does not return error code... Then something like that?

...
	} else if (container->enabled) {
		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
				iommu_group_id(tbl->it_group));
		tce_iommu_disable(container);
...

--
Alexey


Re: [PATCH 3/3] VFIO: Direct access config reg without capability

2013-03-20 Thread Alex Williamson
On Fri, 2013-03-15 at 15:26 +0800, Gavin Shan wrote:
 The config registers in [0, 0x40] are supported by VFIO. Apart
 from that, the other config registers should be covered by a PCI or
 PCIe capability. However, there are some PCI devices (be2net)
 that have config registers (0x7c) outside [0, 0x40] without a
 corresponding PCI or PCIe capability. VFIO returns 0x0 when reading
 those registers, and writes are dropped. This causes the be2net driver
 to fail to load, because 0x0 is returned from its config register 0x7c.
 
 The patch changes the behaviour so that config registers outside
 [0, 0x40] that don't have a corresponding PCI or PCIe capability
 are accessed directly.
 
 Signed-off-by: Gavin Shan sha...@linux.vnet.ibm.com
 ---

Hi Gavin,

I'm onboard with making this change now, but this patch isn't
sufficient.  The config space map uses a byte per dword to index the
capability since both standard and extended capabilities are dword
aligned.  We currently have a bug, which this patch exposes: we round
the length down, e.g. a 14-byte MSI capability becomes 12 bytes, leaving
the message data exposed and writable with this patch.  That bug can
be fixed by aligning the length so the capability fills the dword, but
notice that 0x7c on the be2net is filling one of these gaps.  So fixing
that bug attaches that gap to the previous capability instead of
allowing direct access.

So, before we can make this change we need to fix the config map to have
byte granularity.  Thanks,

Alex

  drivers/vfio/pci/vfio_pci_config.c |   31 ---
  1 files changed, 20 insertions(+), 11 deletions(-)
 
 diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
 index 964ff22..5ea3afb 100644
 --- a/drivers/vfio/pci/vfio_pci_config.c
 +++ b/drivers/vfio/pci/vfio_pci_config.c
 @@ -1471,18 +1471,27 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
  
  	cap_id = vdev->pci_config_map[*ppos / 4];
  
 +	/*
 +	 * Some PCI device config registers might not be covered by a
 +	 * capability and yet still be useful. We will enable direct
 +	 * access to those registers.
 +	 */
  	if (cap_id == PCI_CAP_ID_INVALID) {
 -		if (iswrite)
 -			return ret; /* drop */
 -
 -		/*
 -		 * Per PCI spec 3.0, section 6.1, reads from reserved and
 -		 * unimplemented registers return 0
 -		 */
 -		if (copy_to_user(buf, &val, count))
 -			return -EFAULT;
 -
 -		return ret;
 +		if (iswrite) {
 +			if (copy_from_user(&val, buf, count))
 +				return -EFAULT;
 +			ret = vfio_user_config_write(vdev->pdev, (int)(*ppos),
 +						     val, count);
 +			return ret ? ret : count;
 +		} else {
 +			ret = vfio_user_config_read(vdev->pdev, (int)(*ppos),
 +						    &val, count);
 +			if (ret)
 +				return ret;
 +			if (copy_to_user(buf, &val, count))
 +				return -EFAULT;
 +			return count;
 +		}
  	}
  
   /*
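
To make the gap concrete: with a byte-per-dword config map, a 14-byte MSI
capability at 0x50 covers 0x50..0x5d, but a length rounded down to 12 tags
only dwords 0x50/0x54/0x58, so the message data at 0x5c..0x5d looks
unowned (and, with the patch above, becomes directly writable). A sketch
of the interim alignment fix Alex suggests, with assumed variables:

static void map_capability(u8 *map, int pos, int len, u8 cap_id)
{
	int i;

	len = ALIGN(len, 4);	/* e.g. 14 -> 16: dword 0x5c is now tagged */
	for (i = 0; i < len; i += 4)
		map[(pos + i) / 4] = cap_id;
}

Byte granularity, as Alex concludes, would instead index the map by config
offset directly, so 0x7c-style gaps stay distinguishable from capability
tails.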





Re: [PATCH] vfio powerpc: implement IOMMU driver for VFIO

2013-03-20 Thread Alex Williamson
On Thu, 2013-03-21 at 11:57 +1100, Alexey Kardashevskiy wrote:
 On 21/03/13 07:45, Alex Williamson wrote:
  On Tue, 2013-03-19 at 18:08 +1100, Alexey Kardashevskiy wrote:
  VFIO implements platform independent stuff such as
  a PCI driver, BAR access (via read/write on a file descriptor
  or direct mapping when possible) and IRQ signaling.
 
  The platform dependent part includes IOMMU initialization
  and handling. This patch implements an IOMMU driver for VFIO
  which does mapping/unmapping pages for the guest IO and
  provides information about DMA window (required by a POWERPC
  guest).
 
  The counterpart in QEMU is required to support this functionality.
 
  Changelog:
  * documentation updated
  * container enable/disable ioctls added
  * request_module(spapr_iommu) added
  * various locks fixed
 
  Cc: David Gibson da...@gibson.dropbear.id.au
  Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
  ---
 
 
  Looking pretty good.  There's one problem with the detach_group,
  otherwise just some trivial comments below.  What's the status of the
  tce code that this depends on?  Thanks,
 
 
 It is done, I am just waiting till other patches of the series will be 
 reviewed (by guys) and fixed (by me) and then I'll post everything again.
 
 [skipped]
 
  +static int tce_iommu_enable(struct tce_container *container)
  +{
  +  int ret = 0;
  +  unsigned long locked, lock_limit, npages;
  +  struct iommu_table *tbl = container-tbl;
  +
  +  if (!container-tbl)
  +  return -ENXIO;
  +
  +  if (!current-mm)
  +  return -ESRCH; /* process exited */
  +
  +  mutex_lock(container-lock);
  +  if (container-enabled) {
  +  mutex_unlock(container-lock);
  +  return -EBUSY;
  +  }
  +
  +  /*
  +   * Accounting for locked pages.
  +   *
  +   * On sPAPR platform, IOMMU translation table contains
  +   * an entry per 4K page. Every map/unmap request is sent
  +   * by the guest via hypercall and supposed to be handled
  +   * quickly, ecpesially in real mode (if such option is
 
  s/ecpesially/especially/
 
 
 I replaced the whole text by the one written by Ben and David.
 
 [skipped]
 
  +  }
  +
  +  return -ENOTTY;
  +}
  +
  +static int tce_iommu_attach_group(void *iommu_data,
  +  struct iommu_group *iommu_group)
  +{
  +  int ret;
  +  struct tce_container *container = iommu_data;
  +  struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
  +
  +  BUG_ON(!tbl);
  +  mutex_lock(container-lock);
  +  pr_debug(tce_vfio: Attaching group #%u to iommu %p\n,
  +  iommu_group_id(iommu_group), iommu_group);
  +  if (container-tbl) {
  +  pr_warn(tce_vfio: Only one group per IOMMU container is 
  allowed, existing id=%d, attaching id=%d\n,
  +  iommu_group_id(container-tbl-it_group),
  +  iommu_group_id(iommu_group));
  +  mutex_unlock(container-lock);
  +  return -EBUSY;
  +  }
  +
  +  ret = iommu_take_ownership(tbl);
  +  if (!ret)
  +  container-tbl = tbl;
  +
  +  mutex_unlock(container-lock);
  +
  +  return ret;
  +}
  +
  +static void tce_iommu_detach_group(void *iommu_data,
  +  struct iommu_group *iommu_group)
  +{
  +  struct tce_container *container = iommu_data;
  +  struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
  +
  +  BUG_ON(!tbl);
  +  mutex_lock(container-lock);
  +  if (tbl != container-tbl) {
  +  pr_warn(tce_vfio: detaching group #%u, expected group is 
  #%u\n,
  +  iommu_group_id(iommu_group),
  +  iommu_group_id(tbl-it_group));
  +  } else if (container-enabled) {
  +  pr_err(tce_vfio: detaching group #%u from enabled container\n,
  +  iommu_group_id(tbl-it_group));
 
  Hmm, something more than a pr_err needs to happen here.  Wouldn't this
  imply a disable and going back to an unprivileged container?
 
 Ah. It does not return error code... Then something like that?
 
 ...
   } else if (container-enabled) {
   pr_warn(tce_vfio: detaching group #%u from enabled 
 container, forcing disable\n,
   iommu_group_id(tbl-it_group));
   tce_iommu_disable(container);
 ...

Yep, but of course it also needs to fall through and set tbl = NULL and
release ownership as well.  BTW, don't you think the pr_debug in that
path is a little excessive?  Likewise the one in the attach path.
Thanks,

Alex
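
Putting Alexey's snippet and Alex's follow-up together, the detach path
being discussed would look roughly like this (a sketch with assumed names;
iommu_release_ownership() is taken to be the counterpart of the
iommu_take_ownership() call in the patch):

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);

	mutex_lock(&container->lock);
	if (tbl != container->tbl) {
		pr_warn("tce_vfio: detaching unexpected group #%u\n",
				iommu_group_id(iommu_group));
	} else {
		if (container->enabled) {
			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
					iommu_group_id(tbl->it_group));
			tce_iommu_disable(container);
		}
		/* Fall through: always drop the table and ownership. */
		container->tbl = NULL;
		iommu_release_ownership(tbl);
	}
	mutex_unlock(&container->lock);
}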




Re: Host kernel crash at pci_find_upstream_pcie_bridge on VM exit

2013-03-20 Thread Alex Williamson
On Tue, 2013-03-19 at 17:09 -0700, Ganesh Narayanaswamy wrote:
 Hi Alex,
 
 Thanks for your reply.  The pci devices in question are proprietary FPGAs.  
 Here is the lspci -tv output:
 
 -bash-4.1# lspci -tv
 -[:00]-+-00.0  Intel Corporation Sandy Bridge DRAM Controller
+-01.0-[01-04]00.0-[02-04]--+-01.0-[03]00.0  Broadcom 
 Corporation Device b850
|   \-02.0-[04]00.0  Broadcom 
 Corporation Device b850
+-01.1-[05]--
+-06.0-[06]--+-00.0  Intel Corporation Device 0434
|+-00.1  Intel Corporation Device 0438
|+-00.2  Intel Corporation Device 0438
|+-00.3  Intel Corporation Device 0436
|\-00.4  Intel Corporation Device 0436
+-1d.0  Intel Corporation Device 2334
+-1f.0  Intel Corporation Device 2310
+-1f.2  Intel Corporation Device 2323
+-1f.3  Intel Corporation Device 2330
+-1f.4  Intel Corporation Device 2331
+-1f.6  Intel Corporation Device 2332
\-1f.7  Intel Corporation Device 2360
 
 My qemu command line is as follows:
 
 qemu-system-x86_64 -M q35 --enable-kvm -m 2048 -nographic -vga std
 -usb -drive file=IMG file,if=none,id=drive-sata-disk0,format=raw
 -device ahci,id=ahci -device
 ide-drive,bus=ahci.0,drive=drive-sata-disk0,id=sata-disk0,bootindex=1
 -device pci-assign,host=04:00.0 -device pci-assign,host=03:00.0
 
 
 The PCIe bridge is a PLX 8613 device:
 
 01:00.0 PCI bridge: PLX Technology, Inc. PEX 8613 12-lane, 3-Port PCI Express 
 Gen 2 (5.0 GT/s) Switch (rev ba)
 02:01.0 PCI bridge: PLX Technology, Inc. PEX 8613 12-lane, 3-Port PCI Express 
 Gen 2 (5.0 GT/s) Switch (rev ba)
 02:02.0 PCI bridge: PLX Technology, Inc. PEX 8613 12-lane, 3-Port PCI Express 
 Gen 2 (5.0 GT/s) Switch (rev ba)
 
 As shown by the lspci -tv output, each of the PCI device being passed
 through is connected to one of the downstream ports of the PLX PCI
 bridge.

Are your FPGAs actually PCIe devices (they must be because they connect
to a PCIe switch) that do not expose a PCIe capability?  For example,
lspci -v:

Capabilities: [e0] Express Endpoint, MSI 00

If so, they're in violation of the PCI Express specification and likely
the cause of this problem.  Thanks,

Alex

 On Mar 19, 2013, at 3:28 PM, Alex Williamson wrote:
 
  On Tue, 2013-03-19 at 13:30 -0700, Ganesh Narayanaswamy wrote:
  Hi,
  
  I am running qemu with kvm and VT-d enabled and a couple of PCI
  devices assigned to the guest VM. Both host and guest are running
  linux 2.6 kernel.  
  
  The passthrough works fine, but when I exit the VM, the host kernel
  crashes with the following backtrace:
  
  4[ 5569.836893] Process qemu-system-x86 (pid: 2925, threadinfo 
  8801f5f4, task 88024fa28720)
  0[ 5569.944946] Stack:
  4[ 5569.968845]  8801f5f41aa8 811a45fb 88024f04b680 
  88024f049980
  4[ 5570.057156]  88024f04b680 88024f049988 8801f5f41b08 
  811a6371
  4[ 5570.145470]  8801f5f41ad8 81391045 0246 
  88024f049990
  0[ 5570.233785] Call Trace:
  4[ 5570.262880]  [811a45fb] 
  iommu_detach_dependent_devices+0x25/0x91
  4[ 5570.344958]  [811a6371] vm_domain_exit+0xf8/0x28b
  4[ 5570.411457]  [81391045] ? sub_preempt_count+0x92/0xa6
  4[ 5570.482106]  [811a651a] 
  intel_iommu_domain_destroy+0x16/0x18
  4[ 5570.560030]  [811fb5ea] iommu_domain_free+0x16/0x22
  4[ 5570.628611]  [a0006261] kvm_iommu_unmap_guest+0x22/0x28 
  [kvm]
  4[ 5570.707570]  [a0009b7b] kvm_arch_destroy_vm+0x19/0x12a 
  [kvm]
  4[ 5570.785492]  [a0002614] kvm_put_kvm+0xe6/0x129 [kvm]
  4[ 5570.855102]  [a0002eb3] kvm_vcpu_release+0x13/0x17 [kvm]
  4[ 5570.928867]  [8109cdfc] fput+0x117/0x1be
  4[ 5570.986013]  [8109a147] filp_close+0x63/0x6d
  4[ 5571.047314]  [810342dd] put_files_struct+0x6f/0xda
  4[ 5571.114845]  [8103438e] exit_files+0x46/0x4e
  4[ 5571.176145]  [81035b3d] do_exit+0x1fc/0x681
  4[ 5571.236416]  [a000dedc] ? 
  kvm_arch_vcpu_ioctl_run+0xc2d/0xc55 [kvm]
  4[ 5571.321605]  [8138cc41] ? __mutex_lock_slowpath+0x26c/0x294
  4[ 5571.398490]  [81036034] do_group_exit+0x72/0x9a
  4[ 5571.462907]  [8103fec9] get_signal_to_deliver+0x331/0x350
  4[ 5571.537719]  [81001f0f] do_signal+0x6d/0x69a
  4[ 5571.599013]  [811da1fc] ? put_ldisc+0x92/0x97
  4[ 5571.661353]  [810a95ea] ? do_vfs_ioctl+0x527/0x576
  4[ 5571.728887]  [81002563] do_notify_resume+0x27/0x51
  4[ 5571.796419]  [810a968c] ? sys_ioctl+0x53/0x65
  4[ 5571.858758]  [81002b9b] int_signal+0x12/0x17
  0[ 5571.920058] Code: 48 85 d2 0f 95 c0 c9 c3 55 80 7f 4a 00 48 89 f8 48 
  89 e5 75 46 31 d2 48 8b 40 10 48 83 78 10 00 75 05 48 89 d0 eb 36 48 8b 40 
  38 80 78 4a 00 48 89 c2 74 e3 

Re: [PATCH] vfio powerpc: implement IOMMU driver for VFIO

2013-03-20 Thread David Gibson
On Wed, Mar 20, 2013 at 02:45:24PM -0600, Alex Williamson wrote:
 On Tue, 2013-03-19 at 18:08 +1100, Alexey Kardashevskiy wrote:
  VFIO implements platform independent stuff such as
  a PCI driver, BAR access (via read/write on a file descriptor
  or direct mapping when possible) and IRQ signaling.
  
  The platform dependent part includes IOMMU initialization
  and handling. This patch implements an IOMMU driver for VFIO
  which does mapping/unmapping pages for the guest IO and
  provides information about DMA window (required by a POWERPC
  guest).
  
  The counterpart in QEMU is required to support this functionality.
  
  Changelog:
  * documentation updated
  * container enable/disable ioctls added
  * request_module(spapr_iommu) added
  * various locks fixed
  
  Cc: David Gibson da...@gibson.dropbear.id.au
  Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
  ---
 
 
 Looking pretty good.  There's one problem with the detach_group,
 otherwise just some trivial comments below.  What's the status of the
 tce code that this depends on?  Thanks,

[snip]
  +static void tce_iommu_detach_group(void *iommu_data,
  +   struct iommu_group *iommu_group)
  +{
  +   struct tce_container *container = iommu_data;
  +   struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
  +
  +   BUG_ON(!tbl);
  +   mutex_lock(container-lock);
  +   if (tbl != container-tbl) {
  +   pr_warn(tce_vfio: detaching group #%u, expected group is 
  #%u\n,
  +   iommu_group_id(iommu_group),
  +   iommu_group_id(tbl-it_group));
  +   } else if (container-enabled) {
  +   pr_err(tce_vfio: detaching group #%u from enabled container\n,
  +   iommu_group_id(tbl-it_group));
 
 Hmm, something more than a pr_err needs to happen here.  Wouldn't this
 imply a disable and going back to an unprivileged container?

Uh, no.  I think the idea here is that we use the enable/disable
semantic to address some other potential problems.  Specifically,
sidestepping the problems of what happens if you change the
container's capabilities by adding/removing groups while in the middle
of using it.  So the point is that the detach fails when the group is
enabled, rather than implicitly doing anything.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH] vfio powerpc: implement IOMMU driver for VFIO

2013-03-20 Thread Alex Williamson
On Thu, 2013-03-21 at 12:55 +1100, David Gibson wrote:
 On Wed, Mar 20, 2013 at 02:45:24PM -0600, Alex Williamson wrote:
  On Tue, 2013-03-19 at 18:08 +1100, Alexey Kardashevskiy wrote:
   VFIO implements platform independent stuff such as
   a PCI driver, BAR access (via read/write on a file descriptor
   or direct mapping when possible) and IRQ signaling.
   
   The platform dependent part includes IOMMU initialization
   and handling. This patch implements an IOMMU driver for VFIO
   which does mapping/unmapping pages for the guest IO and
   provides information about DMA window (required by a POWERPC
   guest).
   
   The counterpart in QEMU is required to support this functionality.
   
   Changelog:
   * documentation updated
   * container enable/disable ioctls added
   * request_module(spapr_iommu) added
   * various locks fixed
   
   Cc: David Gibson da...@gibson.dropbear.id.au
   Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
   ---
  
  
  Looking pretty good.  There's one problem with the detach_group,
  otherwise just some trivial comments below.  What's the status of the
  tce code that this depends on?  Thanks,
 
 [snip]
   +static void tce_iommu_detach_group(void *iommu_data,
   + struct iommu_group *iommu_group)
   +{
   + struct tce_container *container = iommu_data;
   + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
   +
   + BUG_ON(!tbl);
   + mutex_lock(container-lock);
   + if (tbl != container-tbl) {
   + pr_warn(tce_vfio: detaching group #%u, expected group is 
   #%u\n,
   + iommu_group_id(iommu_group),
   + iommu_group_id(tbl-it_group));
   + } else if (container-enabled) {
   + pr_err(tce_vfio: detaching group #%u from enabled container\n,
   + iommu_group_id(tbl-it_group));
  
  Hmm, something more than a pr_err needs to happen here.  Wouldn't this
  imply a disable and going back to an unprivileged container?
 
 Uh, no.  I think the idea here is that we use the enable/disable
 semantic to address some other potential problems.  Specifically,
 sidestepping the problems of what happens if you change the
 container's capabilities by adding/removing groups while in the middle
 of using it.  So the point is that the detach fails when the group is
 enabled, rather than implicitly doing anything.

The function returns void.  We're not failing the detach, just getting
into a broken state.  This is only called to unwind attaching groups
when the iommu is set or if the user explicitly calls
GROUP_UNSET_CONTAINER.  The former won't have had a chance to call
enable but the latter would need to be fixed.  Thanks,

Alex



RE: [PATCH v4 5/7] KVM: Recalculate destination vcpu map

2013-03-20 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-03-20:
 On Wed, Mar 20, 2013 at 07:36:17PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 Update the RTC interrupt's destination vcpu map when the ioapic entry of
 the RTC or an apic register (id, ldr, dfr) is changed.
 
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  virt/kvm/ioapic.c |9 +++--
  1 files changed, 7 insertions(+), 2 deletions(-)
 diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
 index ddf9414..91b4c08 100644
 --- a/virt/kvm/ioapic.c
 +++ b/virt/kvm/ioapic.c
 @@ -121,6 +121,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
  {
  	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
  	union kvm_ioapic_redirect_entry *e;
 +	unsigned long *rtc_map = ioapic->rtc_status.vcpu_map;
  	struct kvm_lapic_irq irqe;
  	int index;
 @@ -130,15 +131,19 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
  		if (!e->fields.mask &&
  			(e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
  			kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC,
 -				index))) {
 +				index) || index == 8)) {
  			irqe.dest_id = e->fields.dest_id;
  			irqe.vector = e->fields.vector;
  			irqe.dest_mode = e->fields.dest_mode;
  			irqe.shorthand = 0;
 
  			if (kvm_apic_match_dest(vcpu, NULL, irqe.shorthand,
 -					irqe.dest_id, irqe.dest_mode))
 +					irqe.dest_id, irqe.dest_mode)) {
  				__set_bit(irqe.vector, eoi_exit_bitmap);
 +				if (index == 8)
 +					__set_bit(vcpu->vcpu_id, rtc_map);
 +			} else if (index == 8)
 +				__clear_bit(vcpu->vcpu_id, rtc_map);
 rtc_map bitmap is accessed from different vcpus simultaneously so access
 has to be atomic. We also have a race:
 
 vcpu0   iothread
 ioapic config changes
 request scan ioapic
  inject rtc interrupt
  use old vcpu mask
 scan_ioapic()
 recalculate vcpu mask
 
 So this approach (suggested by me :() will not work.
 
 Need to think about it some more. May be your idea of building a bitmap
 while injecting the interrupt is the way to go indeed: pass a pointer to
 a bitmap to kvm_irq_delivery_to_apic() and build it there. Pass NULL
 pointer if caller does not need to track vcpus.
Or, we can block injecting the RTC interrupt while recalculating the vcpu map.

if (need_eoi > 0 && in_recalculating)
	return coalesced

Best regards,
Yang
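
An aside on the atomicity point Gleb raises above: __set_bit()/__clear_bit()
are the non-atomic bitmap operations, so if rtc_status.vcpu_map can be
touched by several vcpus at once, the locked variants are needed. A
hypothetical helper, for illustration only:

static void rtc_irq_mark_dest(struct kvm_ioapic *ioapic,
		struct kvm_vcpu *vcpu, bool is_dest)
{
	if (is_dest)
		set_bit(vcpu->vcpu_id, ioapic->rtc_status.vcpu_map);
	else
		clear_bit(vcpu->vcpu_id, ioapic->rtc_status.vcpu_map);
}

This alone does not fix the recalculation race in Gleb's diagram, of
course; it only makes the individual bit updates safe.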




Re: [PATCH] KVM: x86: Avoid busy loops over uninjectable pending APIC timers

2013-03-20 Thread Gleb Natapov
On Wed, Mar 20, 2013 at 08:19:13PM -0300, Marcelo Tosatti wrote:
 On Wed, Mar 20, 2013 at 11:32:38PM +0200, Gleb Natapov wrote:
  On Wed, Mar 20, 2013 at 05:03:19PM -0300, Marcelo Tosatti wrote:
   On Wed, Mar 20, 2013 at 04:30:33PM -0300, Marcelo Tosatti wrote:
On Sun, Mar 17, 2013 at 12:47:17PM +0200, Gleb Natapov wrote:
 On Sun, Mar 17, 2013 at 11:45:34AM +0100, Jan Kiszka wrote:
  On 2013-03-17 09:47, Gleb Natapov wrote:
   On Sat, Mar 16, 2013 at 09:49:07PM +0100, Jan Kiszka wrote:
   From: Jan Kiszka jan.kis...@siemens.com
  
   If the guest didn't take the last APIC timer interrupt yet and 
   generates
   another one on top, e.g. via periodic mode, we do not block the 
   VCPU
   even if the guest state is halted. The reason is that
   apic_has_pending_timer continues to return a non-zero value.
  
   Fix this busy loop by taking the IRR content for the LVT vector 
   in
   apic_has_pending_timer into account.
  
    Just drop coalescing tracking for the lapic interrupt. After posted
    interrupts are merged, __apic_accept_irq() will no longer return
    coalescing information, so the code will be dead anyway.
  
  That requires the RTC decoalescing series to go first to avoid a
  regression, no? Then let's postpone this topic for now.
  
 Yes, but decoalescing will work only for RTC :(

Are you proposing to drop LAPIC interrupt reinjection? 
   
   Since timer handling and injection is VCPU-local for LAPIC,
   __apic_accept_irq can (and must) return coalesced information (cannot
   drop LAPIC interrupt reinjection).
   
  Why can't we drop LAPIC interrupt reinjection? Proposed posted interrupt
  patches do not properly check for interrupt coalescing even for
  VCPU-local injection.
  
  --
  Gleb.
 
 Because older Linux guests depend on reinjection for proper timekeeping.
Which versions? Those without kvmclock? Can we make them use PIT instead?
Posted interrupts are going to break them.

--
Gleb.


Re: [PATCH v4 5/7] KVM: Recalculate destination vcpu map

2013-03-20 Thread Gleb Natapov
On Thu, Mar 21, 2013 at 03:42:46AM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-03-20:
  On Wed, Mar 20, 2013 at 07:36:17PM +0800, Yang Zhang wrote:
  From: Yang Zhang yang.z.zh...@intel.com
  
  Update the RTC interrupt's destination vcpu map when the ioapic entry of
  the RTC or an apic register (id, ldr, dfr) is changed.
  
  Signed-off-by: Yang Zhang yang.z.zh...@intel.com
  ---
   virt/kvm/ioapic.c |9 +++--
   1 files changed, 7 insertions(+), 2 deletions(-)
  diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
  index ddf9414..91b4c08 100644
  --- a/virt/kvm/ioapic.c
  +++ b/virt/kvm/ioapic.c
  @@ -121,6 +121,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
   { struct kvm_ioapic *ioapic = vcpu-kvm-arch.vioapic;union
   kvm_ioapic_redirect_entry *e; +   unsigned long *rtc_map =
   ioapic-rtc_status.vcpu_map;  struct kvm_lapic_irq irqe;  int 
  index;
  @@ -130,15 +131,19 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
 if (!e-fields.mask 
 (e-fields.trig_mode == IOAPIC_LEVEL_TRIG ||
  kvm_irq_has_notifier(ioapic-kvm, KVM_IRQCHIP_IOAPIC,
  -   index))) {
  +   index) || index == 8)) {
 irqe.dest_id = e-fields.dest_id;
 irqe.vector = e-fields.vector;
 irqe.dest_mode = e-fields.dest_mode;
 irqe.shorthand = 0;
   
 if (kvm_apic_match_dest(vcpu, NULL, irqe.shorthand,
  -  irqe.dest_id, irqe.dest_mode))
  +  irqe.dest_id, irqe.dest_mode)) {
 __set_bit(irqe.vector, eoi_exit_bitmap);
  +  if (index == 8)
  +  __set_bit(vcpu-vcpu_id, rtc_map);
  +  } else if (index == 8)
  +  __clear_bit(vcpu-vcpu_id, rtc_map);
  rtc_map bitmap is accessed from different vcpus simultaneously so access
  has to be atomic. We also have a race:
  
  vcpu0   iothread
  ioapic config changes
  request scan ioapic
   inject rtc interrupt
   use old vcpu mask
  scan_ioapic()
  recalculate vcpu mask
  
  So this approach (suggested by me :() will not work.
  
  Need to think about it some more. May be your idea of building a bitmap
  while injecting the interrupt is the way to go indeed: pass a pointer to
  a bitmap to kvm_irq_delivery_to_apic() and build it there. Pass NULL
  pointer if caller does not need to track vcpus.
 Or, we can block injecting the RTC interrupt while recalculating the vcpu map.
 
 if (need_eoi > 0 && in_recalculating)
 	return coalesced
 
This should be ||. Then you need to maintain in_recalculating, and
recalculation requests may overlap. Too complex and fragile.
 
--
Gleb.


RE: [PATCH v4 5/7] KVM: Recalculate destination vcpu map

2013-03-20 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-03-21:
 On Thu, Mar 21, 2013 at 03:42:46AM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-03-20:
 On Wed, Mar 20, 2013 at 07:36:17PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
  Update the RTC interrupt's destination vcpu map when the ioapic entry of
  the RTC or an apic register (id, ldr, dfr) is changed.
 
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  virt/kvm/ioapic.c |9 +++--
  1 files changed, 7 insertions(+), 2 deletions(-)
 diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
 index ddf9414..91b4c08 100644
 --- a/virt/kvm/ioapic.c
 +++ b/virt/kvm/ioapic.c
 @@ -121,6 +121,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
  { struct kvm_ioapic *ioapic = vcpu-kvm-arch.vioapic;union
  kvm_ioapic_redirect_entry *e; +   unsigned long *rtc_map =
  ioapic-rtc_status.vcpu_map;  struct kvm_lapic_irq irqe;  int 
 index;
 @@ -130,15 +131,19 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu
 *vcpu,
if (!e-fields.mask 
(e-fields.trig_mode == IOAPIC_LEVEL_TRIG ||
 kvm_irq_has_notifier(ioapic-kvm, KVM_IRQCHIP_IOAPIC,
 -   index))) {
 +   index) || index == 8)) {
irqe.dest_id = e-fields.dest_id;
irqe.vector = e-fields.vector;
irqe.dest_mode = e-fields.dest_mode;
irqe.shorthand = 0;
  
if (kvm_apic_match_dest(vcpu, NULL, irqe.shorthand,
 -  irqe.dest_id, irqe.dest_mode))
 +  irqe.dest_id, irqe.dest_mode)) {
__set_bit(irqe.vector, eoi_exit_bitmap);
 +  if (index == 8)
 +  __set_bit(vcpu-vcpu_id, rtc_map);
 +  } else if (index == 8)
 +  __clear_bit(vcpu-vcpu_id, rtc_map);
 rtc_map bitmap is accessed from different vcpus simultaneously so access
 has to be atomic. We also have a race:
 
 vcpu0   iothread
 ioapic config changes
 request scan ioapic
  inject rtc interrupt
  use old vcpu mask
 scan_ioapic()
 recalculate vcpu mask
 
 So this approach (suggested by me :() will not work.
 
 Need to think about it some more. May be your idea of building a bitmap
 while injecting the interrupt is the way to go indeed: pass a pointer to
 a bitmap to kvm_irq_delivery_to_apic() and build it there. Pass NULL
 pointer if caller does not need to track vcpus.
  Or, we can block injecting the RTC interrupt while recalculating the vcpu map.
  
  if (need_eoi > 0 && in_recalculating)
  	return coalesced
 
 This should be ||. Then you need to maintain in_recalculating and
 recalculations requests may overlap. Too complex and fragile.
It should not be too complex. How about the following logic?

When making the scan ioapic request:
kvm_vcpu_scan_ioapic()
{
kvm_for_each_vcpu()
in_recalculating++;
}

Then on each vcpu's request handler:
vcpu_scan_ioapic()
{
in_recalculating--;
}

And when delivering the RTC interrupt:
if (need_eoi > 0 || in_recalculating)
return coalesced


 
 --
   Gleb.


Best regards,
Yang
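
A concrete sketch of the counter logic Yang proposes, with assumed names
(Gleb's reply below points out why it is still fragile):

static atomic_t in_recalculating = ATOMIC_INIT(0);

static void kvm_vcpu_scan_ioapic(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm)
		atomic_inc(&in_recalculating);
	/* ...then raise the per-vcpu scan requests... */
}

static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
	/* ...recompute this vcpu's bit in rtc_status.vcpu_map... */
	atomic_dec(&in_recalculating);
}

static bool rtc_intr_coalesced(int need_eoi)
{
	/* Report coalesced while an EOI is outstanding or a
	 * recalculation is still pending on some vcpu. */
	return need_eoi > 0 || atomic_read(&in_recalculating);
}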




Re: [PATCH v4 5/7] KVM: Recalculate destination vcpu map

2013-03-20 Thread Gleb Natapov
On Thu, Mar 21, 2013 at 05:30:32AM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-03-21:
  On Thu, Mar 21, 2013 at 03:42:46AM +, Zhang, Yang Z wrote:
  Gleb Natapov wrote on 2013-03-20:
  On Wed, Mar 20, 2013 at 07:36:17PM +0800, Yang Zhang wrote:
  From: Yang Zhang yang.z.zh...@intel.com
  
   Update the RTC interrupt's destination vcpu map when the ioapic entry of
   the RTC or an apic register (id, ldr, dfr) is changed.
  
  Signed-off-by: Yang Zhang yang.z.zh...@intel.com
  ---
   virt/kvm/ioapic.c |9 +++--
   1 files changed, 7 insertions(+), 2 deletions(-)
  diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
  index ddf9414..91b4c08 100644
  --- a/virt/kvm/ioapic.c
  +++ b/virt/kvm/ioapic.c
  @@ -121,6 +121,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
   {
   	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
   	union kvm_ioapic_redirect_entry *e;
  +	unsigned long *rtc_map = ioapic->rtc_status.vcpu_map;
   	struct kvm_lapic_irq irqe;
   	int index;
  @@ -130,15 +131,19 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
   		if (!e->fields.mask &&
   			(e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
   			 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC,
  -				index))) {
  +				index) || index == 8)) {
   			irqe.dest_id = e->fields.dest_id;
   			irqe.vector = e->fields.vector;
   			irqe.dest_mode = e->fields.dest_mode;
   			irqe.shorthand = 0;

   			if (kvm_apic_match_dest(vcpu, NULL, irqe.shorthand,
  -					irqe.dest_id, irqe.dest_mode))
  +					irqe.dest_id, irqe.dest_mode)) {
   				__set_bit(irqe.vector, eoi_exit_bitmap);
  +				if (index == 8)
  +					__set_bit(vcpu->vcpu_id, rtc_map);
  +			} else if (index == 8)
  +				__clear_bit(vcpu->vcpu_id, rtc_map);
  rtc_map bitmap is accessed from different vcpus simultaneously so access
  has to be atomic. We also have a race:
  
  vcpu0   iothread
  ioapic config changes
  request scan ioapic
   inject rtc interrupt
   use old vcpu mask
  scan_ioapic()
  recalculate vcpu mask
  
  So this approach (suggested by me :() will not work.
  
  Need to think about it some more. May be your idea of building a bitmap
  while injecting the interrupt is the way to go indeed: pass a pointer to
  a bitmap to kvm_irq_delivery_to_apic() and build it there. Pass NULL
  pointer if caller does not need to track vcpus.
  Or, we can block injecting the RTC interrupt while recalculating the vcpu map.
  
  if (need_eoi > 0 && in_recalculating)
  	return coalesced
  
  This should be ||. Then you need to maintain in_recalculating, and
  recalculation requests may overlap. Too complex and fragile.
 It should not be too complex. How about the following logic?
 
 When making the scan ioapic request:
 kvm_vcpu_scan_ioapic()
 {
 kvm_for_each_vcpu()
   in_recalculating++;
 }
 
 Then on each vcpu's request handler:
 vcpu_scan_ioapic()
 {
 in_recalculating--;
 }
 
kvm_vcpu_scan_ioapic() can be called more often than vcpu_scan_ioapic():
scan requests are coalesced into a single pending request bit per vcpu, so
several increments can be matched by only one decrement and the counter
never drains.

 And when delivering RTC interrupt:
 if (need_eoi > 0 || in_recalculating)
 	return coalesced
 
 
  
  --
  Gleb.
 
 
 Best regards,
 Yang
 

--
Gleb.


RE: [PATCH v4 5/7] KVM: Recalculate destination vcpu map

2013-03-20 Thread Zhang, Yang Z
Gleb Natapov wrote on 2013-03-21:
 On Thu, Mar 21, 2013 at 05:30:32AM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-03-21:
 On Thu, Mar 21, 2013 at 03:42:46AM +, Zhang, Yang Z wrote:
 Gleb Natapov wrote on 2013-03-20:
 On Wed, Mar 20, 2013 at 07:36:17PM +0800, Yang Zhang wrote:
 From: Yang Zhang yang.z.zh...@intel.com
 
 Update RTC interrupt's destination vcpu map when ioapic entry of RTC
 or apic register (id, ldr, dfr) is changed.
 
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 ---
  virt/kvm/ioapic.c |9 +++--
  1 files changed, 7 insertions(+), 2 deletions(-)
 diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
 index ddf9414..91b4c08 100644
 --- a/virt/kvm/ioapic.c
 +++ b/virt/kvm/ioapic.c
 @@ -121,6 +121,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
  {
  	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
  	union kvm_ioapic_redirect_entry *e;
 +	unsigned long *rtc_map = ioapic->rtc_status.vcpu_map;
  	struct kvm_lapic_irq irqe;
  	int index;
 @@ -130,15 +131,19 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
  		if (!e->fields.mask &&
  			(e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
  			 kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC,
 -				index))) {
 +				index) || index == 8)) {
  			irqe.dest_id = e->fields.dest_id;
  			irqe.vector = e->fields.vector;
  			irqe.dest_mode = e->fields.dest_mode;
  			irqe.shorthand = 0;

  			if (kvm_apic_match_dest(vcpu, NULL, irqe.shorthand,
 -					irqe.dest_id, irqe.dest_mode))
 +					irqe.dest_id, irqe.dest_mode)) {
  				__set_bit(irqe.vector, eoi_exit_bitmap);
 +				if (index == 8)
 +					__set_bit(vcpu->vcpu_id, rtc_map);
 +			} else if (index == 8)
 +				__clear_bit(vcpu->vcpu_id, rtc_map);
 rtc_map bitmap is accessed from different vcpus simultaneously so access
 has to be atomic. We also have a race:
 
 vcpu0   iothread
 ioapic config changes
 request scan ioapic
  inject rtc interrupt
  use old vcpu mask
 scan_ioapic()
 recalculate vcpu mask
 
 So this approach (suggested by me :() will not work.
 
 Need to think about it some more. May be your idea of building a bitmap
 while injecting the interrupt is the way to go indeed: pass a pointer to
 a bitmap to kvm_irq_delivery_to_apic() and build it there. Pass NULL
 pointer if caller does not need to track vcpus.
 Or, we can block injecting the RTC interrupt while recalculating the vcpu map.
 
 if (need_eoi > 0 && in_recalculating)
 	return coalesced
 
 This should be ||. Then you need to maintain in_recalculating, and
 recalculation requests may overlap. Too complex and fragile.
 It should not be too complex. How about the following logic?
 
 When making the scan ioapic request:
 kvm_vcpu_scan_ioapic()
 {
 kvm_for_each_vcpu()
  in_recalculating++;
 }
 
 Then on each vcpu's request handler:
 vcpu_scan_ioapic()
 {
 in_recalculating--;
 }
 
 kvm_vcpu_scan_ioapic() can be called more often than vcpu_scan_ioapic()
Ok. I see your point. Maybe we need to roll back to the old idea.

Can you pick up the first two patches? If we roll back to the old way, it
will not touch that code.

 
 And when delivering RTC interrupt:
 if (need_eoi > 0 || in_recalculating)
 	return coalesced
 
 
 
 --
 Gleb.
 
 
 Best regards,
 Yang
 
 
 --
   Gleb.


Best regards,
Yang




Re: KVM: MMU: improve n_max_mmu_pages calculation with TDP

2013-03-20 Thread Xiao Guangrong
On 03/21/2013 04:14 AM, Marcelo Tosatti wrote:
 
 kvm_mmu_calculate_mmu_pages numbers:
 
 maximum number of shadow pages = 2% of mapped guest pages
 
 This does not make sense for TDP guests, where the number of pages
 needed to map all of guest memory with 4k pages cannot exceed
 mapped guest pages / 512 (not counting root pages).
 
 Allow that maximum for TDP, forcing the guest to recycle otherwise.
 
 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
 
 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index 956ca35..a9694a8d7 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -4293,7 +4293,7 @@ nomem:
  unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
  {
   unsigned int nr_mmu_pages;
 - unsigned int  nr_pages = 0;
 + unsigned int i, nr_pages = 0;
   struct kvm_memslots *slots;
   struct kvm_memory_slot *memslot;
 
 @@ -4302,7 +4302,19 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
   kvm_for_each_memslot(memslot, slots)
   nr_pages += memslot-npages;
 
 - nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
 + if (tdp_enabled) {
 + /* one root page */
 + nr_mmu_pages = 1;
 + /* nr_pages / (512^i) per level, due to
 +  * guest RAM map being linear */
 + for (i = 1; i  4; i++) {
 + int nr_pages_round = nr_pages + (1 << (9*i));
 + nr_mmu_pages += nr_pages_round >> (9*i);
 + }

Marcelo,

Can it work if a nested guest is used? Did you see any problem in practice
(a direct guest using more memory than your calculation)?

And MMIO can also build some page tables, which it seems are not
considered in this patch.
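For concreteness, the per-level bound computed by the tdp_enabled branch
above can be reproduced in a few lines of user-space C (illustrative
only):

#include <stdio.h>

/* One root page plus roughly nr_pages / 512^i page tables per level,
 * mirroring the loop in the patch. */
static unsigned int tdp_mmu_pages(unsigned long nr_pages)
{
	unsigned int nr_mmu_pages = 1;	/* one root page */

	for (int i = 1; i < 4; i++)
		nr_mmu_pages += (nr_pages + (1UL << (9 * i))) >> (9 * i);
	return nr_mmu_pages;
}

int main(void)
{
	unsigned long nr_pages = (4UL << 30) >> 12;	/* 4GiB in 4k pages */

	/* Prints 2056: about 2k shadow pages, versus ~21k from the
	 * 2% (KVM_PERMILLE_MMU_PAGES) rule: 1048576 * 20 / 1000. */
	printf("%u\n", tdp_mmu_pages(nr_pages));
	return 0;
}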




[PATCH] [RFC] bookehv: Handle debug exception on guest exit

2013-03-20 Thread Bharat Bhushan
EPCR.DUVD controls whether debug events can come in
hypervisor mode or not. When a KVM guest is using the debug
resources, we do not want debug events to be captured
in the guest entry/exit path. So we set EPCR.DUVD when entering
and clear EPCR.DUVD when exiting from the guest.

Debug "instruction complete" is a post-completion debug
exception, but the debug event gets posted on the basis of the MSR
before the instruction is executed. Now if the instruction
switches the context from guest mode (MSR.GS = 1) to hypervisor
mode (MSR.GS = 0), then xSRR0 points to the first instruction of
the KVM handler and xSRR1 shows that MSR.GS is clear
(hypervisor context). As xSRR1.GS is used to decide whether the
KVM handler or the host kernel debug handler will be invoked to
handle the exception, this leads to the host kernel debug handler
handling an exception which should instead be handled by KVM.

This is tested on e500mc in 32-bit mode.
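The misrouting can be sketched in a few lines of illustrative C
(simplified; the real check is the xSRR1/MSR.GS test done in assembly in
the patch below):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MSR_GS (1u << 28)	/* MSR[GS] guest-state bit */

/* The low-level handler routes on the saved MSR.GS, but a
 * post-completion debug event raised across a guest->host switch is
 * latched with GS already clear, so it is sent to the host debug
 * handler even though the event belongs to the guest. */
static bool route_to_kvm(uint32_t xsrr1)
{
	return (xsrr1 & MSR_GS) != 0;
}

int main(void)
{
	/* Event latched after the guest->host switch: GS = 0. */
	printf("route_to_kvm=%d\n", route_to_kvm(0));	/* 0: misrouted */
	return 0;
}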

Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
---
 arch/powerpc/kernel/exceptions-64e.S |   54 ++
 arch/powerpc/kernel/head_booke.h |   35 ++
 2 files changed, 89 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 4684e33..56882a0 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -450,6 +450,33 @@ interrupt_end_book3e:
andis.  r15,r14,DBSR_IC@h
beq+1f
 
+#ifdef CONFIG_KVM_BOOKE_HV
+   /*
+    * EPCR.DUVD controls whether debug events can come in
+    * hypervisor mode or not. When a KVM guest is using the debug
+    * resources, we do not want debug events to be captured
+    * in the guest entry/exit path. So we set EPCR.DUVD when entering
+    * and clear EPCR.DUVD when exiting from the guest.
+    * Debug "instruction complete" is a post-completion debug
+    * exception, but the debug event gets posted on the basis of the
+    * MSR before the instruction is executed. Now if the instruction
+    * switches the context from guest mode (MSR.GS = 1) to hypervisor
+    * mode (MSR.GS = 0), then xSRR0 points to the first instruction of
+    * the KVM handler and xSRR1 shows that MSR.GS is clear
+    * (hypervisor context). As xSRR1.GS is used to decide whether the
+    * KVM handler or the host kernel debug handler will be invoked to
+    * handle the exception, this leads to the host kernel debug handler
+    * handling an exception which should instead be handled by KVM.
+    */
+   mfspr   r10, SPRN_EPCR
+   andis.  r10,r10,SPRN_EPCR_DUVD@h
+   beq+2f
+
+   andis.  r10,r9,MSR_GS@h
+   beq+1f
+2:
+#endif
LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e)
cmpld   cr0,r10,r14
@@ -516,6 +543,33 @@ kernel_dbg_exc:
andis.  r15,r14,DBSR_IC@h
beq+1f
 
+#ifdef CONFIG_KVM_BOOKE_HV
+   /*
+    * EPCR.DUVD controls whether debug events can come in
+    * hypervisor mode or not. When a KVM guest is using the debug
+    * resources, we do not want debug events to be captured
+    * in the guest entry/exit path. So we set EPCR.DUVD when entering
+    * and clear EPCR.DUVD when exiting from the guest.
+    * Debug "instruction complete" is a post-completion debug
+    * exception, but the debug event gets posted on the basis of the
+    * MSR before the instruction is executed. Now if the instruction
+    * switches the context from guest mode (MSR.GS = 1) to hypervisor
+    * mode (MSR.GS = 0), then xSRR0 points to the first instruction of
+    * the KVM handler and xSRR1 shows that MSR.GS is clear
+    * (hypervisor context). As xSRR1.GS is used to decide whether the
+    * KVM handler or the host kernel debug handler will be invoked to
+    * handle the exception, this leads to the host kernel debug handler
+    * handling an exception which should instead be handled by KVM.
+    */
+   mfspr   r10, SPRN_EPCR
+   andis.  r10,r10,SPRN_EPCR_DUVD@h
+   beq+2f
+
+   andis.  r10,r9,MSR_GS@h
+   beq+1f
+2:
+#endif
LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e)
cmpld   cr0,r10,r14
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 5f051ee..040b0a3 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -285,7 +285,33 @@ label:
mfspr   r10,SPRN_DBSR;  /* check single-step/branch taken */  \
andis.  r10,r10,(DBSR_IC|DBSR_BT)@h;  \
beq+2f;   \
+#ifdef CONFIG_KVM_BOOKE_HV

[PATCH] bookehv: Handle debug exception on guest exit

2013-03-20 Thread Bharat Bhushan
EPCR.DUVD controls whether debug events can come in
hypervisor mode or not. When a KVM guest is using the debug
resources, we do not want debug events to be captured
in the guest entry/exit path. So we set EPCR.DUVD when entering
and clear EPCR.DUVD when exiting from the guest.

Debug "instruction complete" is a post-completion debug
exception, but the debug event gets posted on the basis of the MSR
before the instruction is executed. Now if the instruction
switches the context from guest mode (MSR.GS = 1) to hypervisor
mode (MSR.GS = 0), then xSRR0 points to the first instruction of
the KVM handler and xSRR1 shows that MSR.GS is clear
(hypervisor context). As xSRR1.GS is used to decide whether the
KVM handler or the host kernel debug handler will be invoked to
handle the exception, this leads to the host kernel debug handler
handling an exception which should instead be handled by KVM.

This is tested on e500mc in 32-bit mode.

Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
---
v0:
 - Do not apply this change for debug_crit as we do not know whether those
   chips have the issue or not.
 - Corrected 64-bit case branching.

 arch/powerpc/kernel/exceptions-64e.S |   29 -
 arch/powerpc/kernel/head_booke.h |   26 ++
 2 files changed, 54 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 4684e33..8b26294 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -516,6 +516,33 @@ kernel_dbg_exc:
andis.  r15,r14,DBSR_IC@h
beq+1f
 
+#ifdef CONFIG_KVM_BOOKE_HV
+   /*
+    * EPCR.DUVD controls whether debug events can come in
+    * hypervisor mode or not. When a KVM guest is using the debug
+    * resources, we do not want debug events to be captured
+    * in the guest entry/exit path. So we set EPCR.DUVD when entering
+    * and clear EPCR.DUVD when exiting from the guest.
+    * Debug "instruction complete" is a post-completion debug
+    * exception, but the debug event gets posted on the basis of the
+    * MSR before the instruction is executed. Now if the instruction
+    * switches the context from guest mode (MSR.GS = 1) to hypervisor
+    * mode (MSR.GS = 0), then xSRR0 points to the first instruction of
+    * the KVM handler and xSRR1 shows that MSR.GS is clear
+    * (hypervisor context). As xSRR1.GS is used to decide whether the
+    * KVM handler or the host kernel debug handler will be invoked to
+    * handle the exception, this leads to the host kernel debug handler
+    * handling an exception which should instead be handled by KVM.
+    */
+   mfspr   r10, SPRN_EPCR
+   andis.  r10,r10,SPRN_EPCR_DUVD@h
+   beq+2f
+
+   andis.  r10,r9,MSR_GS@h
+   beq+3f
+2:
+#endif
LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e)
cmpld   cr0,r10,r14
@@ -523,7 +550,7 @@ kernel_dbg_exc:
blt+cr0,1f
bge+cr1,1f
 
-   /* here it looks like we got an inappropriate debug exception. */
+3: /* here it looks like we got an inappropriate debug exception. */
lis r14,DBSR_IC@h   /* clear the IC event */
rlwinm  r11,r11,0,~MSR_DE   /* clear DE in the DSRR1 value */
mtspr   SPRN_DBSR,r14
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 5f051ee..edc6a3b 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -285,7 +285,33 @@ label:
mfspr   r10,SPRN_DBSR;  /* check single-step/branch taken */  \
andis.  r10,r10,(DBSR_IC|DBSR_BT)@h;  \
beq+2f;   \
+#ifdef CONFIG_KVM_BOOKE_HV   \
+   /*\
+    * EPCR.DUVD controls whether debug events can come in\
+    * hypervisor mode or not. When a KVM guest is using the debug  \
+    * resources, we do not want debug events to be captured   \
+    * in the guest entry/exit path. So we set EPCR.DUVD when entering\
+    * and clear EPCR.DUVD when exiting from the guest.  \
+    * Debug "instruction complete" is a post-completion debug  \
+    * exception, but the debug event gets posted on the basis of the \
+    * MSR before the instruction is executed. Now if the instruction \
+    * switches the context from guest mode (MSR.GS = 1) to hypervisor\
+    * mode (MSR.GS = 0), then xSRR0 points to the first instruction of\
+    * the KVM handler and xSRR1 shows that MSR.GS is clear  \
+