date:20140911

Re: [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast

2014-09-11 Thread Jan Kiszka

On 2014-09-11 07:06, Zhang Haoyu wrote:
 Currently, we call ioapic_service() immediately when we find the irq is still
 active during eoi broadcast. But for real hardware, there's some dealy between
 the EOI writing and irq delivery (system bus latency?). So we need to emulate
 this behavior. Otherwise, for a guest who haven't register a proper irq 
 handler
 , it would stay in the interrupt routine as this irq would be re-injected
 immediately after guest enables interrupt. This would lead guest can't move
 forward and may miss the possibility to get proper irq handler registered (one
 example is windows guest resuming from hibernation).
 
 As there's no way to differ the unhandled irq from new raised ones, this patch
 solve this problems by scheduling a delayed work when the count of irq 
 injected
 during eoi broadcast exceeds a threshold value. After this patch, the guest 
 can
 move a little forward when there's no suitable irq handler in case it may
 register one very soon and for guest who has a bad irq detection routine ( 
 such
 as note_interrupt() in linux ), this bad irq would be recognized soon as in 
 the
 past.
 
 Cc: Michael S. Tsirkin m...@redhat.com
 Signed-off-by: Jason Wang jasow...@redhat.com
 Signed-off-by: Zhang Haoyu zhan...@sangfor.com
 ---
  include/trace/events/kvm.h | 20 ++
  virt/kvm/ioapic.c  | 51 
 --
  virt/kvm/ioapic.h  |  6 ++
  3 files changed, 75 insertions(+), 2 deletions(-)
 
 diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
 index 908925a..b05f688 100644
 --- a/include/trace/events/kvm.h
 +++ b/include/trace/events/kvm.h
 @@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
 __entry-coalesced ?  (coalesced) : )
  );
  
 +TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
 + TP_PROTO(__u64 e),
 + TP_ARGS(e),
 +
 + TP_STRUCT__entry(
 + __field(__u64,  e   )
 + ),
 +
 + TP_fast_assign(
 + __entry-e  = e;
 + ),
 +
 + TP_printk(dst %x vec=%u (%s|%s|%s%s),
 +   (u8)(__entry-e  56), (u8)__entry-e,
 +   __print_symbolic((__entry-e  8  0x7), kvm_deliver_mode),
 +   (__entry-e  (111)) ? logical : physical,
 +   (__entry-e  (115)) ? level : edge,
 +   (__entry-e  (116)) ? |masked : )
 +);
 +
  TRACE_EVENT(kvm_msi_set_irq,
   TP_PROTO(__u64 address, __u64 data),
   TP_ARGS(address, data),
 diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
 index e8ce34c..a36c4c4 100644
 --- a/virt/kvm/ioapic.c
 +++ b/virt/kvm/ioapic.c
 @@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int 
 irq_source_id)
   spin_unlock(ioapic-lock);
  }
  
 +static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
 +{
 + int i;
 + struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
 +  eoi_inject.work);
 + spin_lock(ioapic-lock);
 + for (i = 0; i  IOAPIC_NUM_PINS; i++) {
 + union kvm_ioapic_redirect_entry *ent = ioapic-redirtbl[i];
 +
 + if (ent-fields.trig_mode != IOAPIC_LEVEL_TRIG)
 + continue;
 +
 + if (ioapic-irr  (1  i)  !ent-fields.remote_irr)
 + ioapic_service(ioapic, i, false);
 + }
 + spin_unlock(ioapic-lock);
 +}
 +
  static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
   struct kvm_ioapic *ioapic, int vector, int trigger_mode)
  {
 @@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu 
 *vcpu,
  
   ASSERT(ent-fields.trig_mode == IOAPIC_LEVEL_TRIG);
   ent-fields.remote_irr = 0;
 - if (ioapic-irr  (1  i))
 - ioapic_service(ioapic, i, false);
 + if (!ent-fields.mask  (ioapic-irr  (1  i))) {

The mask check is new - why now? You don't check it in the work handler
as well.

 + ++ioapic-irq_eoi[i];
 + if (ioapic-irq_eoi[i] == 
 IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
 + /*
 +  * Real hardware does not deliver the irq so
 +  * immediately during eoi broadcast, so we need
 +  * to emulate this behavior. Otherwise, for
 +  * guests who has not registered handler of a
 +  * level irq, this irq would be injected
 +  * immediately after guest enables interrupt
 +  * (which happens usually at the end of the
 +  * common interrupt routine). This would lead
 +  * guest can't move forward and may miss the
 +  * possibility to get proper irq handler
 +  * registered. So we

Re: [PATCH] KVM: ioapic: add check for RTC_GSI

2014-09-11 Thread Jan Kiszka

On 2014-09-10 18:53, Chris J Arges wrote:
 cppcheck found the following error:
 [ioapic.c:114]: (error) Array index -1 is out of bounds.
 
 If CONFIG_X86 is not defined, RTC_GSI == -1U which means that an out of bounds

That issue is limited to the dead ia64 architecture, no other builds
ioapic.c (besides x86, of course). Maybe finally remove KVM support for
that arch instead of bloating otherwise correct code?

Jan

 error could occur when accessing ioapic-redirtbl[RTC_GSI].
 
 This patch adds a check to kvm_rtc_eoi_tracking_restore_one that is similar to
 how kvm_rtc_eoi_tracking_restore_all checks this condition.
 
 Signed-off-by: Chris J Arges chris.j.ar...@canonical.com
 ---
  virt/kvm/ioapic.c | 3 +++
  1 file changed, 3 insertions(+)
 
 diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
 index e8ce34c..859f5b8 100644
 --- a/virt/kvm/ioapic.c
 +++ b/virt/kvm/ioapic.c
 @@ -136,6 +136,9 @@ void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu 
 *vcpu)
  {
   struct kvm_ioapic *ioapic = vcpu-kvm-arch.vioapic;
  
 + if (RTC_GSI = IOAPIC_NUM_PINS)
 + return;
 +
   spin_lock(ioapic-lock);
   __rtc_irq_eoi_tracking_restore_one(vcpu);
   spin_unlock(ioapic-lock);
 

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 1/2] virtio-rng cleanup: move some code out of mutex protection

2014-09-11 Thread Amit Shah

On (Wed) 10 Sep 2014 [17:07:06], Amos Kong wrote:
 It doesn't save too much cpu time as expected, just a cleanup.

Frankly I won't bother with this.  It doesn't completely remove all
copying from the mutex, so it's not worthwhile.

 Signed-off-by: Amos Kong ak...@redhat.com
 ---
  drivers/char/hw_random/core.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)
 
 diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
 index aa30a25..c591d7e 100644
 --- a/drivers/char/hw_random/core.c
 +++ b/drivers/char/hw_random/core.c
 @@ -270,8 +270,8 @@ static ssize_t hwrng_attr_current_show(struct device *dev,
   return -ERESTARTSYS;
   if (current_rng)
   name = current_rng-name;
 - ret = snprintf(buf, PAGE_SIZE, %s\n, name);
   mutex_unlock(rng_mutex);
 + ret = snprintf(buf, PAGE_SIZE, %s\n, name);
  
   return ret;
  }
 @@ -284,19 +284,19 @@ static ssize_t hwrng_attr_available_show(struct device 
 *dev,
   ssize_t ret = 0;
   struct hwrng *rng;
  
 + buf[0] = '\0';
   err = mutex_lock_interruptible(rng_mutex);
   if (err)
   return -ERESTARTSYS;
 - buf[0] = '\0';
   list_for_each_entry(rng, rng_list, list) {
   strncat(buf, rng-name, PAGE_SIZE - ret - 1);
   ret += strlen(rng-name);
   strncat(buf,  , PAGE_SIZE - ret - 1);
   ret++;
   }
 + mutex_unlock(rng_mutex);
   strncat(buf, \n, PAGE_SIZE - ret - 1);
   ret++;
 - mutex_unlock(rng_mutex);
  
   return ret;
  }
 -- 
 1.9.3
 


Amit
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes

2014-09-11 Thread Amit Shah

On (Wed) 10 Sep 2014 [17:07:07], Amos Kong wrote:
 When I check hwrng attributes in sysfs, cat process always gets
 stuck if guest has only 1 vcpu and uses a slow rng backend.
 
 Currently we check if there is any tasks waiting to be run on
 current cpu in rng_dev_read() by need_resched(). But need_resched()
 doesn't work because rng_dev_read() is executing in user context.
 
 This patch removed need_resched() and increase delay to 10 jiffies,
 then other tasks can have chance to execute protected code.
 Delaying 1 jiffy also works, but 10 jiffies is safer.

I'd prefer two patches for this one: one to remove the need_resched()
check, and the other to increase the timeout.

Anyway,

Reviewed-by: Amit Shah amit.s...@redhat.com

 
 Signed-off-by: Amos Kong ak...@redhat.com
 ---
  drivers/char/hw_random/core.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)
 
 diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
 index c591d7e..b5d1b6f 100644
 --- a/drivers/char/hw_random/core.c
 +++ b/drivers/char/hw_random/core.c
 @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char 
 __user *buf,
  
   mutex_unlock(rng_mutex);
  
 - if (need_resched())
 - schedule_timeout_interruptible(1);
 + schedule_timeout_interruptible(10);
  
   if (signal_pending(current)) {
   err = -ERESTARTSYS;
 -- 
 1.9.3
 

Amit
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [question] virtio-blk performance degradationhappenedwith virito-serial

2014-09-11 Thread Amit Shah

On (Sun) 07 Sep 2014 [17:46:26], Zhang Haoyu wrote:
 Hi, Paolo, Amit,
 any ideas?

I'll check this, thanks for testing with Linux guests.


Amit
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: EVENTFD: remove inclusion of irq.h

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 05:09, Christoffer Dall ha scritto:
 On Mon, Sep 01, 2014 at 12:11:19PM +0200, Paolo Bonzini wrote:
 Il 01/09/2014 10:36, Eric Auger ha scritto:
 No more needed. irq.h would be void on ARM.

 Signed-off-by: Eric Auger eric.au...@linaro.org

 ---

 I don't think irq.h is needed anymore since Paul Mackerras' work. However
 I did not compile for all architectures.
 ---
  virt/kvm/eventfd.c | 1 -
  1 file changed, 1 deletion(-)

 diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
 index 3c5981c..0c712a7 100644
 --- a/virt/kvm/eventfd.c
 +++ b/virt/kvm/eventfd.c
 @@ -36,7 +36,6 @@
  #include linux/seqlock.h
  #include trace/events/kvm.h
  
 -#include irq.h
  #include iodev.h
  
  #ifdef CONFIG_HAVE_KVM_IRQFD


 Acked-by: Paolo Bonzini pbonz...@redhat.com

 Christoffer, please include this via the ARM tree, together with ARM
 irqfd support.  Thanks,

 Marc is dealing with the tree this week and the next so he can apply it
 to kvmarm/queue.  Do you want it to wait and go with the irqfd patch
 (which has dependencies not yet resolved) or should we just queue it?

As you prefer.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] kvm: ioapic: conditionally delay irq delivery duringeoi broadcast

2014-09-11 Thread Zhang Haoyu

 Currently, we call ioapic_service() immediately when we find the irq is still
 active during eoi broadcast. But for real hardware, there's some dealy 
 between
 the EOI writing and irq delivery (system bus latency?). So we need to emulate
 this behavior. Otherwise, for a guest who haven't register a proper irq 
 handler
 , it would stay in the interrupt routine as this irq would be re-injected
 immediately after guest enables interrupt. This would lead guest can't move
 forward and may miss the possibility to get proper irq handler registered 
 (one
 example is windows guest resuming from hibernation).
 
 As there's no way to differ the unhandled irq from new raised ones, this 
 patch
 solve this problems by scheduling a delayed work when the count of irq 
 injected
 during eoi broadcast exceeds a threshold value. After this patch, the guest 
 can
 move a little forward when there's no suitable irq handler in case it may
 register one very soon and for guest who has a bad irq detection routine ( 
 such
 as note_interrupt() in linux ), this bad irq would be recognized soon as in 
 the
 past.
 
 Cc: Michael S. Tsirkin m...@redhat.com
 Signed-off-by: Jason Wang jasow...@redhat.com
 Signed-off-by: Zhang Haoyu zhan...@sangfor.com
 ---
  include/trace/events/kvm.h | 20 ++
  virt/kvm/ioapic.c  | 51 
 --
  virt/kvm/ioapic.h  |  6 ++
  3 files changed, 75 insertions(+), 2 deletions(-)
 
 diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
 index 908925a..b05f688 100644
 --- a/include/trace/events/kvm.h
 +++ b/include/trace/events/kvm.h
 @@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
__entry-coalesced ?  (coalesced) : )
  );
  
 +TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
 +TP_PROTO(__u64 e),
 +TP_ARGS(e),
 +
 +TP_STRUCT__entry(
 +__field(__u64,  e   )
 +),
 +
 +TP_fast_assign(
 +__entry-e  = e;
 +),
 +
 +TP_printk(dst %x vec=%u (%s|%s|%s%s),
 +  (u8)(__entry-e  56), (u8)__entry-e,
 +  __print_symbolic((__entry-e  8  0x7), kvm_deliver_mode),
 +  (__entry-e  (111)) ? logical : physical,
 +  (__entry-e  (115)) ? level : edge,
 +  (__entry-e  (116)) ? |masked : )
 +);
 +
  TRACE_EVENT(kvm_msi_set_irq,
  TP_PROTO(__u64 address, __u64 data),
  TP_ARGS(address, data),
 diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
 index e8ce34c..a36c4c4 100644
 --- a/virt/kvm/ioapic.c
 +++ b/virt/kvm/ioapic.c
 @@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, 
 int irq_source_id)
  spin_unlock(ioapic-lock);
  }
  
 +static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
 +{
 +int i;
 +struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
 + eoi_inject.work);
 +spin_lock(ioapic-lock);
 +for (i = 0; i  IOAPIC_NUM_PINS; i++) {
 +union kvm_ioapic_redirect_entry *ent = ioapic-redirtbl[i];
 +
 +if (ent-fields.trig_mode != IOAPIC_LEVEL_TRIG)
 +continue;
 +
 +if (ioapic-irr  (1  i)  !ent-fields.remote_irr)
 +ioapic_service(ioapic, i, false);
 +}
 +spin_unlock(ioapic-lock);
 +}
 +
  static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
  struct kvm_ioapic *ioapic, int vector, int trigger_mode)
  {
 @@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu 
 *vcpu,
  
  ASSERT(ent-fields.trig_mode == IOAPIC_LEVEL_TRIG);
  ent-fields.remote_irr = 0;
 -if (ioapic-irr  (1  i))
 -ioapic_service(ioapic, i, false);
 +if (!ent-fields.mask  (ioapic-irr  (1  i))) {

The mask check is new - why now? You don't check it in the work handler
as well.

The mask check is to avoid incrementing ioapic-irq_eoi[i] when this irq is 
masked, the count should be zeroed,
but needless to check it in the work handler, the check will be performed in 
ioapic_service().

 +++ioapic-irq_eoi[i];
 +if (ioapic-irq_eoi[i] == 
 IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
 +/*
 + * Real hardware does not deliver the irq so
 + * immediately during eoi broadcast, so we need
 + * to emulate this behavior. Otherwise, for
 + * guests who has not registered handler of a
 + * level irq, this irq would be injected
 + * immediately after guest enables interrupt
 + * (which happens usually at the end of the
 + * common interrupt routine). This would lead
 + * guest can't move forward and may miss the

Re: [PATCH v3] ARM: KVM: add irqfd support

2014-09-11 Thread Eric Auger

On 09/11/2014 05:09 AM, Christoffer Dall wrote:
 On Mon, Sep 01, 2014 at 10:53:04AM +0200, Eric Auger wrote:
 This patch enables irqfd on ARM.

 irqfd framework enables to inject a virtual IRQ into a guest upon an
 eventfd trigger. User-side uses KVM_IRQFD VM ioctl to provide KVM with
 a kvm_irqfd struct that associates a VM, an eventfd, a virtual IRQ number
 (aka. the gsi). When an actor signals the eventfd (typically a VFIO
 platform driver), the kvm irqfd subsystem injects the provided virtual
 IRQ into the guest.

 Resamplefd also is supported for level sensitive interrupts, ie. the
 user can provide another eventfd that is triggered when the completion
 of the virtual IRQ (gsi) is detected by the GIC.

 The gsi must correspond to a shared peripheral interrupt (SPI), ie the
 GIC interrupt ID is gsi+32.

 this patch enables CONFIG_HAVE_KVM_EVENTFD and CONFIG_HAVE_KVM_IRQFD.
 CONFIG_HAVE_KVM_IRQCHIP is removed. No IRQ routing table is used
 (irqchip.c and irqcomm.c are not used).

 Both KVM_CAP_IRQFD  KVM_CAP_IRQFD_RESAMPLE capabilities are exposed

 Signed-off-by: Eric Auger eric.au...@linaro.org

 ---

 This patch serie deprecates the previous serie featuring GSI routing
 (https://patches.linaro.org/32261/)

 The patch serie has the following dependencies:
 - arm/arm64: KVM: Various VGIC cleanups and improvements
   https://lists.cs.columbia.edu/pipermail/kvmarm/2014-June/009979.html
 - KVM: EVENTFD: remove inclusion of irq.h

 All pieces can be found on git://git.linaro.org/people/eric.auger/linux.git
 branch irqfd_norouting_integ_v3

 This work was tested with Calxeda Midway xgmac main interrupt with
 qemu-system-arm and QEMU VFIO platform device.

 v2 - v3:
 - removal of irq.h from eventfd.c put in a separate patch to increase
   visibility
 - properly expose KVM_CAP_IRQFD capability in arm.c
 - remove CONFIG_HAVE_KVM_IRQCHIP meaningfull only if irq_comm.c is used

 v1 - v2:
 - rebase on 3.17rc1
 - move of the dist unlock in process_maintenance
 - remove of dist lock in __kvm_vgic_sync_hwstate
 - rewording of the commit message (add resamplefd reference)
 - remove irq.h
 ---
  Documentation/virtual/kvm/api.txt |  5 +++-
  arch/arm/include/uapi/asm/kvm.h   |  3 +++
  arch/arm/kvm/Kconfig  |  4 +--
  arch/arm/kvm/Makefile |  2 +-
  arch/arm/kvm/arm.c|  3 +++
  virt/kvm/arm/vgic.c   | 56 
 ---
  6 files changed, 65 insertions(+), 8 deletions(-)

 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index beae3fd..8118b12 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -2204,7 +2204,7 @@ into the hash PTE second double word).
  4.75 KVM_IRQFD
  
  Capability: KVM_CAP_IRQFD
 -Architectures: x86 s390
 +Architectures: x86 s390 arm
  Type: vm ioctl
  Parameters: struct kvm_irqfd (in)
  Returns: 0 on success, -1 on error
 @@ -2230,6 +2230,9 @@ Note that closing the resamplefd is not sufficient to 
 disable the
  irqfd.  The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
  and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
  
 +On ARM/arm64 the injected must be a shared peripheral interrupt (SPI).
 +This means the programmed GIC interrupt ID is gsi+32.
 +
 
 See above comment.
Hi Christoffer,

sorry which comment do you refer to?  wrt your last comment do you
consider PPI injection support is a mandated feature for this patch to
be upstreamable?
 
  4.76 KVM_PPC_ALLOCATE_HTAB
  
  Capability: KVM_CAP_PPC_ALLOC_HTAB
 diff --git a/arch/arm/include/uapi/asm/kvm.h 
 b/arch/arm/include/uapi/asm/kvm.h
 index e6ebdd3..3034c66 100644
 --- a/arch/arm/include/uapi/asm/kvm.h
 +++ b/arch/arm/include/uapi/asm/kvm.h
 @@ -194,6 +194,9 @@ struct kvm_arch_memory_slot {
  /* Highest supported SPI, from VGIC_NR_IRQS */
  #define KVM_ARM_IRQ_GIC_MAX 127
  
 +/* One single KVM irqchip, ie. the VGIC */
 +#define KVM_NR_IRQCHIPS  1
 +
  /* PSCI interface */
  #define KVM_PSCI_FN_BASE0x95c1ba5e
  #define KVM_PSCI_FN(n)  (KVM_PSCI_FN_BASE + (n))
 diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
 index 466bd29..e519a40 100644
 --- a/arch/arm/kvm/Kconfig
 +++ b/arch/arm/kvm/Kconfig
 @@ -24,6 +24,7 @@ config KVM
  select KVM_MMIO
  select KVM_ARM_HOST
  depends on ARM_VIRT_EXT  ARM_LPAE
 +select HAVE_KVM_EVENTFD
  ---help---
Support hosting virtualized guest machines. You will also
need to select one or more of the processor modules below.
 @@ -55,7 +56,7 @@ config KVM_ARM_MAX_VCPUS
  config KVM_ARM_VGIC
  bool KVM support for Virtual GIC
  depends on KVM_ARM_HOST  OF
 -select HAVE_KVM_IRQCHIP
 +select HAVE_KVM_IRQFD
  default y
  ---help---
Adds support for a hardware assisted, in-kernel GIC emulation.
 @@ -63,7 +64,6 @@ config KVM_ARM_VGIC
  config KVM_ARM_TIMER
  bool KVM support for Architected Timers
  depends on

Re: [RFC v2 4/9] VFIO: platform: handler tests whether the IRQ is forwarded

2014-09-11 Thread Eric Auger

On 09/11/2014 05:10 AM, Christoffer Dall wrote:
 On Mon, Sep 01, 2014 at 02:52:43PM +0200, Eric Auger wrote:
 In case the IRQ is forwarded, the VFIO platform IRQ handler does not
 need to disable the IRQ anymore. In that mode, when the handler completes
 
 add a comma after completes
Hi Christoffer,
ok
 
 the IRQ is not deactivated but only its priority is lowered.

 Some other actor (typically a guest) is supposed to deactivate the IRQ,
 allowing at that time a new physical IRQ to hit.

 In virtualization use case, the physical IRQ is automatically completed
 by the interrupt controller when the guest completes the corresponding
 virtual IRQ.

 Signed-off-by: Eric Auger eric.au...@linaro.org
 ---
  drivers/vfio/platform/vfio_platform_irq.c | 7 ++-
  1 file changed, 6 insertions(+), 1 deletion(-)

 diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
 b/drivers/vfio/platform/vfio_platform_irq.c
 index 6768508..1f851b2 100644
 --- a/drivers/vfio/platform/vfio_platform_irq.c
 +++ b/drivers/vfio/platform/vfio_platform_irq.c
 @@ -88,13 +88,18 @@ static irqreturn_t vfio_irq_handler(int irq, void 
 *dev_id)
  struct vfio_platform_irq *irq_ctx = dev_id;
  unsigned long flags;
  int ret = IRQ_NONE;
 +struct irq_data *d;
 +bool is_forwarded;
  
  spin_lock_irqsave(irq_ctx-lock, flags);
  
  if (!irq_ctx-masked) {
  ret = IRQ_HANDLED;
 +d = irq_get_irq_data(irq_ctx-hwirq);
 +is_forwarded = irqd_irq_forwarded(d);
  
 -if (irq_ctx-flags  VFIO_IRQ_INFO_AUTOMASKED) {
 +if (irq_ctx-flags  VFIO_IRQ_INFO_AUTOMASKED 
 +!is_forwarded) {
  disable_irq_nosync(irq_ctx-hwirq);
  irq_ctx-masked = true;
  }
 -- 
 1.9.1

 It makes sense that these needs to be all controlled in the kernel, but
 I'm wondering if it would be cleaner / more correct to clear the
 AUTOMASKED flag when the IRQ is forwarded and have vfio refuse setting
 this flag as long as the irq is forwarded?

If I am not wrong, even if the user sets AUTOMASKED, this info never is
exploited by the vfio platform driver. AUTOMASKED only is set internally
to the driver, on init, for level sensitive IRQs.

It seems to be the same on PCI (for INTx). I do not see anywhere the
user flag curectly copied into a local storage. But I prefer to be
careful ;-)

If confirmed, although the flag value is exposed in the user API, the
user set value never is exploited so this removes the need to check.

the forwarded IRQ modality being fully dynamic currently, then I would
need to update the irq_ctx-flags on each vfio_irq_handler call. I don't
know if its better?

Best Regards

Eric


 
 -Christoffer
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 5/9] KVM: KVM-VFIO: update user API to program forwarded IRQ

2014-09-11 Thread Eric Auger

On 09/11/2014 05:10 AM, Christoffer Dall wrote:
 On Mon, Sep 01, 2014 at 02:52:44PM +0200, Eric Auger wrote:
 add new device group commands:
 - KVM_DEV_VFIO_DEVICE_FORWARD_IRQ and
   KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ

 which enable to turn forwarded IRQ mode on/off.

 the kvm_arch_forwarded_irq struct embodies a forwarded IRQ

 Signed-off-by: Eric Auger eric.au...@linaro.org

 ---

 v1 - v2:
 - struct kvm_arch_forwarded_irq moved from arch/arm/include/uapi/asm/kvm.h
   to include/uapi/linux/kvm.h
   also irq_index renamed into index and guest_irq renamed into gsi
 - ASSIGN/DEASSIGN renamed into FORWARD/UNFORWARD
 ---
  Documentation/virtual/kvm/devices/vfio.txt | 26 ++
  include/uapi/linux/kvm.h   |  9 +
  2 files changed, 35 insertions(+)

 diff --git a/Documentation/virtual/kvm/devices/vfio.txt 
 b/Documentation/virtual/kvm/devices/vfio.txt
 index ef51740..048baa0 100644
 --- a/Documentation/virtual/kvm/devices/vfio.txt
 +++ b/Documentation/virtual/kvm/devices/vfio.txt
 @@ -13,6 +13,7 @@ VFIO-group is held by KVM.
  
  Groups:
KVM_DEV_VFIO_GROUP
 +  KVM_DEV_VFIO_DEVICE
  
  KVM_DEV_VFIO_GROUP attributes:
KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking
 @@ -20,3 +21,28 @@ KVM_DEV_VFIO_GROUP attributes:
  
  For each, kvm_device_attr.addr points to an int32_t file descriptor
  for the VFIO group.
 +
 +KVM_DEV_VFIO_DEVICE attributes:
 +  KVM_DEV_VFIO_DEVICE_FORWARD_IRQ
 +  KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ
 +
 +For each, kvm_device_attr.addr points to a kvm_arch_forwarded_irq struct.
 +This user API makes possible to create a special IRQ handling mode,
 
   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ enables a special IRQ handling mode on
   hardware that supports it,
OK
 
 +where KVM and a VFIO platform driver collaborate to improve IRQ
 +handling performance.
 +
 +'fd represents the file descriptor of a valid VFIO device whose physical
 
 fd is described out of context here.  Can you copy the struct definition
 into this document, perhaps right after the For each, ... line above.
yes sure
 
 +IRQ, referenced by its index, is injected into the VM guest irq (gsi).
  as a virtual IRQ (specified
by the gsi field) into the
VM.
 
 +
 +On FORWARD_IRQ, KVM-VFIO device programs:
When setting the  KVM_DEV_VFIO_DEVICE_FORWARD_IRQ attribute, the
KVM-VFIO device tells the host (or VFIO?) to not complete the
physical IRQ, and instead ensures that KVM (or the VM) completes the
physical IRQ.
 
 +- the host, to not complete the physical IRQ itself.
 +- the GIC, to automatically complete the physical IRQ when the guest
 +  completes the virtual IRQ.
 
 and drop this bullet form.
ok
 
 +This avoids trapping the end-of-interrupt for level sensitive IRQ.
 
 avoid this last line, it's specific to ARM.
ok
 
 +
 +On UNFORWARD_IRQ, one returns to the mode where the host completes the
When setting the KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ attribute, the
host (VFIO?) will again complete the physical IRQ and KVM will not...
  
 +physical IRQ and the guest completes the virtual IRQ.
 +
 +It is up to the caller of this API to make sure the IRQ is not
 +outstanding when the FORWARD/UNFORWARD is called. This could lead to
 
 outstanding? can you be specific?
active? and I should add *physical* IRQ
 
 don't refer to FOWARD/UNFORWARD, either refer to these attributes by
 their full name or use a clear reference in proper English.
ok
 
 +some inconsistency on who is going to complete the IRQ.
 
 This sounds like the whole thing is fragile and if userspace doesn't do
 things right, IRQ handling of a piece of hardware is going to be
 inconsistent?  Is this the case?  If so, we need some stronger
 semantics.  If not, this should be rephrased.
Actually the KVM-VFIO device rejects any attempt to change the
forwarding mode if the physical IRQ is active. So I hope this is robust
and will change the explanation.

Thanks

Eric
 
 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
 index cf3a2ff..8cd7b0e 100644
 --- a/include/uapi/linux/kvm.h
 +++ b/include/uapi/linux/kvm.h
 @@ -947,6 +947,12 @@ struct kvm_device_attr {
  __u64   addr;   /* userspace address of attr data */
  };
  
 +struct kvm_arch_forwarded_irq {
 +__u32 fd; /* file desciptor of the VFIO device */
 +__u32 index; /* VFIO device IRQ index */
 +__u32 gsi; /* gsi, ie. virtual IRQ number */
 +};
 +
  #define KVM_DEV_TYPE_FSL_MPIC_201
  #define KVM_DEV_TYPE_FSL_MPIC_422
  #define KVM_DEV_TYPE_XICS   3
 @@ -954,6 +960,9 @@ struct kvm_device_attr {
  #define  KVM_DEV_VFIO_GROUP 1
  #define   KVM_DEV_VFIO_GROUP_ADD1
  #define   KVM_DEV_VFIO_GROUP_DEL2
 +#define  KVM_DEV_VFIO_DEVICE2
 +#define   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ

Re: [RFC v2 6/9] VFIO: Extend external user API

2014-09-11 Thread Eric Auger

On 09/11/2014 05:10 AM, Christoffer Dall wrote:
 On Mon, Sep 01, 2014 at 02:52:45PM +0200, Eric Auger wrote:
 New functions are added to be called from ARM KVM-VFIO device.
 
 This commit message seems somewhat random.  This patch doesn't deal with
 anything ARM specific, it introduces some generic functions that allows
 users external to vfio itself to retrieve information about a vfio
 platform device.

Yes you're right.
 

 - vfio_device_get_external_user enables to get a vfio device from
   its fd
 - vfio_device_put_external_user puts the vfio device
 - vfio_external_base_device returns the struct device*,
   useful to access the platform_device

 Signed-off-by: Eric Auger eric.au...@linaro.org

 ---

 v1 - v2:

 - vfio_external_get_base_device renamed into vfio_external_base_device
 - vfio_external_get_type removed
 ---
  drivers/vfio/vfio.c  | 24 
  include/linux/vfio.h |  3 +++
  2 files changed, 27 insertions(+)

 diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
 index 8e84471..282814e 100644
 --- a/drivers/vfio/vfio.c
 +++ b/drivers/vfio/vfio.c
 @@ -1401,6 +1401,30 @@ void vfio_group_put_external_user(struct vfio_group 
 *group)
  }
  EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
  
 +struct vfio_device *vfio_device_get_external_user(struct file *filep)
 +{
 +struct vfio_device *vdev = filep-private_data;
 +
 +if (filep-f_op != vfio_device_fops)
 +return ERR_PTR(-EINVAL);
 +
 +vfio_device_get(vdev);
 +return vdev;
 +}
 +EXPORT_SYMBOL_GPL(vfio_device_get_external_user);
 +
 +void vfio_device_put_external_user(struct vfio_device *vdev)
 +{
 +vfio_device_put(vdev);
 +}
 +EXPORT_SYMBOL_GPL(vfio_device_put_external_user);
 +
 +struct device *vfio_external_base_device(struct vfio_device *vdev)
 +{
 +return vdev-dev;
 +}
 +EXPORT_SYMBOL_GPL(vfio_external_base_device);
 +
  int vfio_external_user_iommu_id(struct vfio_group *group)
  {
  return iommu_group_id(group-iommu_group);
 diff --git a/include/linux/vfio.h b/include/linux/vfio.h
 index ffe04ed..bd4b6cb 100644
 --- a/include/linux/vfio.h
 +++ b/include/linux/vfio.h
 @@ -99,6 +99,9 @@ extern void vfio_group_put_external_user(struct vfio_group 
 *group);
  extern int vfio_external_user_iommu_id(struct vfio_group *group);
  extern long vfio_external_check_extension(struct vfio_group *group,
unsigned long arg);
 +extern struct vfio_device *vfio_device_get_external_user(struct file 
 *filep);
 +extern void vfio_device_put_external_user(struct vfio_device *vdev);
 +extern struct device *vfio_external_base_device(struct vfio_device *vdev);
  
  struct pci_dev;
  #ifdef CONFIG_EEH
 -- 
 1.9.1

 Looks good to me,
 -Christoffer
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 7/9] KVM: KVM-VFIO: add new VFIO external API hooks

2014-09-11 Thread Eric Auger

On 09/11/2014 05:10 AM, Christoffer Dall wrote:
 On Mon, Sep 01, 2014 at 02:52:46PM +0200, Eric Auger wrote:
 add functions that implement the gateway to the extended
 
 Capital letter when beginning a new sentence.  Also the reference to
 'the extended VFIO API' feels a bit weird.  Can't you make your commit
 message a little more descriptive of this patch, something along the
 lines of:
 
 Provide wrapper functions that allows KVM-VFIO device code to get an
 external handle on a struct vfio_device based on a vfio device file
 descriptor.  We provide this through three new functions:
 
 (assuming I got this right).
sure
 
 
 
 external VFIO API:
 - kvm_vfio_device_get_external_user
 - kvm_vfio_device_put_external_user
 - kvm_vfio_external_base_device

 Signed-off-by: Eric Auger eric.au...@linaro.org

 ---

 v1 - v2:
 - kvm_vfio_external_get_base_device renamed into
   kvm_vfio_external_base_device
 - kvm_vfio_external_get_type removed
 ---
  arch/arm/include/asm/kvm_host.h |  5 +
  virt/kvm/vfio.c | 45 
 +
  2 files changed, 50 insertions(+)

 diff --git a/arch/arm/include/asm/kvm_host.h 
 b/arch/arm/include/asm/kvm_host.h
 index 6dfb404..1aee6bb 100644
 --- a/arch/arm/include/asm/kvm_host.h
 +++ b/arch/arm/include/asm/kvm_host.h
 @@ -171,6 +171,11 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long 
 hva, pte_t pte);
  unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
  int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
  
 +struct vfio_device;
 +struct vfio_device *kvm_vfio_device_get_external_user(struct file *filep);
 +void kvm_vfio_device_put_external_user(struct vfio_device *vdev);
 +struct device *kvm_vfio_external_base_device(struct vfio_device *vdev);
 +
  /* We do not have shadow page tables, hence the empty hooks */
  static inline int kvm_age_hva(struct kvm *kvm, unsigned long hva)
  {
 diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
 index ba1a93f..76dc7a1 100644
 --- a/virt/kvm/vfio.c
 +++ b/virt/kvm/vfio.c
 @@ -59,6 +59,51 @@ static void kvm_vfio_group_put_external_user(struct 
 vfio_group *vfio_group)
  symbol_put(vfio_group_put_external_user);
  }
  
 +struct vfio_device *kvm_vfio_device_get_external_user(struct file *filep)
 +{
 +struct vfio_device *vdev;
 +struct vfio_device *(*fn)(struct file *);
 +
 +fn = symbol_get(vfio_device_get_external_user);
 +if (!fn)
 +return ERR_PTR(-EINVAL);
 +
 +vdev = fn(filep);
 +
 +symbol_put(vfio_device_get_external_user);
 +
 +return vdev;
 +}
 +
 +void kvm_vfio_device_put_external_user(struct vfio_device *vdev)
 +{
 +void (*fn)(struct vfio_device *);
 +
 +fn = symbol_get(vfio_device_put_external_user);
 +if (!fn)
 +return;
 +
 +fn(vdev);
 +
 +symbol_put(vfio_device_put_external_user);
 +}
 +
 +struct device *kvm_vfio_external_base_device(struct vfio_device *vdev)
 +{
 +struct device *(*fn)(struct vfio_device *);
 +struct device *dev;
 +
 +fn = symbol_get(vfio_external_base_device);
 +if (!fn)
 +return NULL;
 +
 +dev = fn(vdev);
 +
 +symbol_put(vfio_external_base_device);
 +
 +return dev;
 +}
 +
  static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group)
  {
  long (*fn)(struct vfio_group *, unsigned long);
 -- 
 1.9.1

 
 otherwise looks good to me!
 -Christoffer
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] kvm: ioapic: conditionally delay irq delivery duringeoi broadcast

2014-09-11 Thread Zhang Haoyu

Currently, we call ioapic_service() immediately when we find the irq is still
active during eoi broadcast. But for real hardware, there's some dealy between
the EOI writing and irq delivery (system bus latency?). So we need to emulate
this behavior. Otherwise, for a guest who haven't register a proper irq handler
, it would stay in the interrupt routine as this irq would be re-injected
immediately after guest enables interrupt. This would lead guest can't move
forward and may miss the possibility to get proper irq handler registered (one
example is windows guest resuming from hibernation).

As there's no way to differ the unhandled irq from new raised ones, this patch
solve this problems by scheduling a delayed work when the count of irq injected
during eoi broadcast exceeds a threshold value. After this patch, the guest can
move a little forward when there's no suitable irq handler in case it may
register one very soon and for guest who has a bad irq detection routine ( such
as note_interrupt() in linux ), this bad irq would be recognized soon as in the
past.

Cc: Michael S. Tsirkin m...@redhat.com
Signed-off-by: Jason Wang jasow...@redhat.com
Signed-off-by: Zhang Haoyu zhan...@sangfor.com
---
 include/trace/events/kvm.h | 20 +++
 virt/kvm/ioapic.c  | 50 --
 virt/kvm/ioapic.h  |  6 ++
 3 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 908925a..ab679c3 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
  __entry-coalesced ?  (coalesced) : )
 );
 
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+   TP_PROTO(__u64 e),
+   TP_ARGS(e),
+
+   TP_STRUCT__entry(
+   __field(__u64,  e   )
+   ),
+
+   TP_fast_assign(
+   __entry-e  = e;
+   ),
+
+   TP_printk(dst %x vec=%u (%s|%s|%s%s),
+ (u8)(__entry-e  56), (u8)__entry-e,
+ __print_symbolic((__entry-e  8  0x7), kvm_deliver_mode),
+ (__entry-e  (111)) ? logical : physical,
+ (__entry-e  (115)) ? level : edge,
+ (__entry-e  (116)) ? |masked : )
+);
+
 TRACE_EVENT(kvm_msi_set_irq,
TP_PROTO(__u64 address, __u64 data),
TP_ARGS(address, data),
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index e8ce34c..8e1dc67 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int 
irq_source_id)
spin_unlock(ioapic-lock);
 }
 
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+   int i;
+   struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+eoi_inject.work);
+   spin_lock(ioapic-lock);
+   for (i = 0; i  IOAPIC_NUM_PINS; i++) {
+   union kvm_ioapic_redirect_entry *ent = ioapic-redirtbl[i];
+
+   if (ent-fields.trig_mode != IOAPIC_LEVEL_TRIG)
+   continue;
+
+   if (ioapic-irr  (1  i)  !ent-fields.remote_irr)
+   ioapic_service(ioapic, i, false);
+   }
+   spin_unlock(ioapic-lock);
+}
+
 static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
@@ -435,8 +453,32 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 
ASSERT(ent-fields.trig_mode == IOAPIC_LEVEL_TRIG);
ent-fields.remote_irr = 0;
-   if (ioapic-irr  (1  i))
-   ioapic_service(ioapic, i, false);
+   if (!ent-fields.mask  (ioapic-irr  (1  i))) {
+   ++ioapic-irq_eoi[i];
+   if (ioapic-irq_eoi[i] == 
IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+   /*
+* Real hardware does not deliver the irq so
+* immediately during eoi broadcast, so we need
+* to emulate this behavior. Otherwise, for
+* guests who has not registered handler of a
+* level irq, this irq would be injected
+* immediately after guest enables interrupt
+* (which happens usually at the end of the
+* common interrupt routine). This would lead
+* guest can't move forward and may miss the
+* possibility to get proper irq handler
+* registered. So we need to give some breath to
+* guest.
+*/
+

Re: [PATCH] KVM: ioapic: add check for RTC_GSI

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 08:01, Jan Kiszka ha scritto:
 That issue is limited to the dead ia64 architecture, no other builds
 ioapic.c (besides x86, of course). Maybe finally remove KVM support for
 that arch instead of bloating otherwise correct code?

Yes, we should.  I've just been too busy to do it.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 1/7] kvm: Use APIC_DEFAULT_PHYS_BASE macro as the apic access page address.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
 We have APIC_DEFAULT_PHYS_BASE defined as 0xfee0, which is also the 
 address of
 apic access page. So use this macro.
 
 Signed-off-by: Tang Chen tangc...@cn.fujitsu.com
 Reviewed-by: Gleb Natapov g...@kernel.org
 ---
  arch/x86/kvm/svm.c | 3 ++-
  arch/x86/kvm/vmx.c | 6 +++---
  2 files changed, 5 insertions(+), 4 deletions(-)
 
 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
 index ddf7427..1d941ad 100644
 --- a/arch/x86/kvm/svm.c
 +++ b/arch/x86/kvm/svm.c
 @@ -1257,7 +1257,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm 
 *kvm, unsigned int id)
   svm-asid_generation = 0;
   init_vmcb(svm);
  
 - svm-vcpu.arch.apic_base = 0xfee0 | MSR_IA32_APICBASE_ENABLE;
 + svm-vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
 +MSR_IA32_APICBASE_ENABLE;
   if (kvm_vcpu_is_bsp(svm-vcpu))
   svm-vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
  
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index bfe11cf..4b80ead 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -3999,13 +3999,13 @@ static int alloc_apic_access_page(struct kvm *kvm)
   goto out;
   kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
   kvm_userspace_mem.flags = 0;
 - kvm_userspace_mem.guest_phys_addr = 0xfee0ULL;
 + kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE;
   kvm_userspace_mem.memory_size = PAGE_SIZE;
   r = __kvm_set_memory_region(kvm, kvm_userspace_mem);
   if (r)
   goto out;
  
 - page = gfn_to_page(kvm, 0xfee00);
 + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE  PAGE_SHIFT);
   if (is_error_page(page)) {
   r = -EFAULT;
   goto out;
 @@ -4477,7 +4477,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
  
   vmx-vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
   kvm_set_cr8(vmx-vcpu, 0);
 - apic_base_msr.data = 0xfee0 | MSR_IA32_APICBASE_ENABLE;
 + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
   if (kvm_vcpu_is_bsp(vmx-vcpu))
   apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
   apic_base_msr.host_initiated = true;
 

Applied, thanks.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [GIT PULL 0/7] KVM: s390: Fixes and features for next (3.18)

2014-09-11 Thread Paolo Bonzini

Il 10/09/2014 15:53, Christian Borntraeger ha scritto:
 Paolo,
 
 please have a look at the next bunch of s390 patches and consider to apply:
 
 The following changes since commit fd2752352bbc98850d83b5448a288d8991590317:
 
   KVM: x86: use guest maxphyaddr to check MTRR values (2014-08-29 18:56:24 
 +0200)
 
 are available in the git repository at:
 
   git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git  
 tags/kvm-s390-next-20140910
 
 for you to fetch changes up to bfac1f59a1afb13a3cf225bffd04be99a49c51a6:
 
   KVM: s390/interrupt: remove double assignment (2014-09-10 12:19:45 +0200)
 
 
 KVM: s390: Fixes and features for next (3.18)
 
 1. Crypto/CPACF support: To enable the MSA4 instructions we have to
provide a common control structure for each SIE control block
 2. Two cleanups found by a static code checker: one redundant assignment
and one useless if
 3. Fix the page handling of the diag10 ballooning interface. If the
guest freed the pages at absolute 0 some checks and frees were
incorrect
 4. Limit guests to 16TB
 5. Add __must_check to interrupt injection code
 
 
 Christian Borntraeger (6):
   KVM: s390: add __must_check to interrupt deliver functions
   KVM: s390: Limit guest size to 16TB
   KVM: s390: unintended fallthrough for external call
   KVM: s390: get rid of constant condition in ipte_unlock_simple
   KVM: s390/cmm: Fix prefix handling for diag 10 balloon
   KVM: s390/interrupt: remove double assignment
 
 Tony Krowiak (1):
   KVM: CPACF: Enable MSA4 instructions for kvm guest
 
  arch/s390/include/asm/kvm_host.h | 14 +-
  arch/s390/kvm/diag.c | 26 ++
  arch/s390/kvm/gaccess.c  |  3 +--
  arch/s390/kvm/interrupt.c| 14 +++---
  arch/s390/kvm/kvm-s390.c | 35 ++-
  arch/s390/kvm/kvm-s390.h |  2 +-
  6 files changed, 74 insertions(+), 20 deletions(-)
 

Thanks, applying to kvm/queue.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 2/7] kvm: Remove ept_identity_pagetable from struct kvm_arch.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
 kvm_arch-ept_identity_pagetable holds the ept identity pagetable page. But
 it is never used to refer to the page at all.
 
 In vcpu initialization, it indicates two things:
 1. indicates if ept page is allocated
 2. indicates if a memory slot for identity page is initialized
 
 Actually, kvm_arch-ept_identity_pagetable_done is enough to tell if the ept
 identity pagetable is initialized. So we can remove ept_identity_pagetable.
 
 NOTE: In the original code, ept identity pagetable page is pinned in memroy.
   As a result, it cannot be migrated/hot-removed. After this patch, since
   kvm_arch-ept_identity_pagetable is removed, ept identity pagetable page
   is no longer pinned in memory. And it can be migrated/hot-removed.
 
 Signed-off-by: Tang Chen tangc...@cn.fujitsu.com
 Reviewed-by: Gleb Natapov g...@kernel.org
 ---
  arch/x86/include/asm/kvm_host.h |  1 -
  arch/x86/kvm/vmx.c  | 50 
 -
  arch/x86/kvm/x86.c  |  2 --
  3 files changed, 25 insertions(+), 28 deletions(-)
 
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 index 7c492ed..35171c7 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -580,7 +580,6 @@ struct kvm_arch {
  
   gpa_t wall_clock;
  
 - struct page *ept_identity_pagetable;
   bool ept_identity_pagetable_done;
   gpa_t ept_identity_map_addr;
  
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 4b80ead..953d529 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -743,6 +743,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment 
 *var);
  static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
  static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
  static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 +static int alloc_identity_pagetable(struct kvm *kvm);
  
  static DEFINE_PER_CPU(struct vmcs *, vmxarea);
  static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 @@ -3938,21 +3939,27 @@ out:
  
  static int init_rmode_identity_map(struct kvm *kvm)
  {
 - int i, idx, r, ret;
 + int i, idx, r, ret = 0;
   pfn_t identity_map_pfn;
   u32 tmp;
  
   if (!enable_ept)
   return 1;
 - if (unlikely(!kvm-arch.ept_identity_pagetable)) {
 - printk(KERN_ERR EPT: identity-mapping pagetable 
 - haven't been allocated!\n);
 - return 0;
 +
 + /* Protect kvm-arch.ept_identity_pagetable_done. */
 + mutex_lock(kvm-slots_lock);
 +
 + if (likely(kvm-arch.ept_identity_pagetable_done)) {
 + ret = 1;
 + goto out2;
   }
 - if (likely(kvm-arch.ept_identity_pagetable_done))
 - return 1;
 - ret = 0;
 +
   identity_map_pfn = kvm-arch.ept_identity_map_addr  PAGE_SHIFT;
 +
 + r = alloc_identity_pagetable(kvm);
 + if (r)
 + goto out2;
 +
   idx = srcu_read_lock(kvm-srcu);
   r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
   if (r  0)
 @@ -3970,6 +3977,9 @@ static int init_rmode_identity_map(struct kvm *kvm)
   ret = 1;
  out:
   srcu_read_unlock(kvm-srcu, idx);
 +
 +out2:
 + mutex_unlock(kvm-slots_lock);
   return ret;
  }
  
 @@ -4019,31 +4029,23 @@ out:
  
  static int alloc_identity_pagetable(struct kvm *kvm)
  {
 - struct page *page;
 + /*
 +  * In init_rmode_identity_map(), kvm-arch.ept_identity_pagetable_done
 +  * is checked before calling this function and set to true after the
 +  * calling. The access to kvm-arch.ept_identity_pagetable_done should
 +  * be protected by kvm-slots_lock.
 +  */

Not just the access, also the call to this function must be protected by
the lock.

So replace this commend by just

/* Called with kvm-slots_lock held.  */

and add a BUG_ON that ept_identity_pagetable_done is false.

Paolo

   struct kvm_userspace_memory_region kvm_userspace_mem;
   int r = 0;
  
 - mutex_lock(kvm-slots_lock);
 - if (kvm-arch.ept_identity_pagetable)
 - goto out;
   kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
   kvm_userspace_mem.flags = 0;
   kvm_userspace_mem.guest_phys_addr =
   kvm-arch.ept_identity_map_addr;
   kvm_userspace_mem.memory_size = PAGE_SIZE;
   r = __kvm_set_memory_region(kvm, kvm_userspace_mem);
 - if (r)
 - goto out;
  
 - page = gfn_to_page(kvm, kvm-arch.ept_identity_map_addr  PAGE_SHIFT);
 - if (is_error_page(page)) {
 - r = -EFAULT;
 - goto out;
 - }
 -
 - kvm-arch.ept_identity_pagetable = page;
 -out:
 - mutex_unlock(kvm-slots_lock);
   return r;
  }
  
 @@ -7643,8 +7645,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm 
 *kvm, unsigned int id)
   kvm-arch.ept_identity_map_addr =
   VMX_EPT_IDENTITY_PAGETABLE_ADDR;

Re: [PATCH v5 3/7] kvm: Make init_rmode_identity_map() return 0 on success.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
 In init_rmode_identity_map(), there two variables indicating the return
 value, r and ret, and it return 0 on error, 1 on success. The function
 is only called by vmx_create_vcpu(), and r is redundant.
 
 This patch removes the redundant variable r, and make 
 init_rmode_identity_map()
 return 0 on success, -errno on failure.
 
 Signed-off-by: Tang Chen tangc...@cn.fujitsu.com
 ---
  arch/x86/kvm/vmx.c | 25 +++--
  1 file changed, 11 insertions(+), 14 deletions(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 953d529..63c4c3e 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -3939,45 +3939,42 @@ out:
  
  static int init_rmode_identity_map(struct kvm *kvm)
  {
 - int i, idx, r, ret = 0;
 + int i, idx, ret = 0;
   pfn_t identity_map_pfn;
   u32 tmp;
  
   if (!enable_ept)
 - return 1;
 + return 0;
  
   /* Protect kvm-arch.ept_identity_pagetable_done. */
   mutex_lock(kvm-slots_lock);
  
 - if (likely(kvm-arch.ept_identity_pagetable_done)) {
 - ret = 1;
 + if (likely(kvm-arch.ept_identity_pagetable_done))
   goto out2;
 - }
  
   identity_map_pfn = kvm-arch.ept_identity_map_addr  PAGE_SHIFT;
  
 - r = alloc_identity_pagetable(kvm);
 - if (r)
 + ret = alloc_identity_pagetable(kvm);
 + if (ret)
   goto out2;
  
   idx = srcu_read_lock(kvm-srcu);
 - r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
 - if (r  0)
 + ret = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
 + if (ret)
   goto out;
   /* Set up identity-mapping pagetable for EPT in real mode */
   for (i = 0; i  PT32_ENT_PER_PAGE; i++) {
   tmp = (i  22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
   _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
 - r = kvm_write_guest_page(kvm, identity_map_pfn,
 + ret = kvm_write_guest_page(kvm, identity_map_pfn,
   tmp, i * sizeof(tmp), sizeof(tmp));
 - if (r  0)
 + if (ret)
   goto out;
   }
   kvm-arch.ept_identity_pagetable_done = true;
 - ret = 1;
 +
  out:
   srcu_read_unlock(kvm-srcu, idx);
 -
  out2:
   mutex_unlock(kvm-slots_lock);
   return ret;
 @@ -7645,7 +7642,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm 
 *kvm, unsigned int id)
   kvm-arch.ept_identity_map_addr =
   VMX_EPT_IDENTITY_PAGETABLE_ADDR;
   err = -ENOMEM;
 - if (!init_rmode_identity_map(kvm))
 + if (init_rmode_identity_map(kvm))

Please add  0 here.  I would also consider setting err to the return
value of init_rmode_identity_map, and initializing it to -ENOMEM only
after the if.

Paolo

   goto free_vmcs;

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: ioapic: add check for RTC_GSI

2014-09-11 Thread Jan Kiszka

On 2014-09-11 11:08, Paolo Bonzini wrote:
 Il 11/09/2014 08:01, Jan Kiszka ha scritto:
 That issue is limited to the dead ia64 architecture, no other builds
 ioapic.c (besides x86, of course). Maybe finally remove KVM support for
 that arch instead of bloating otherwise correct code?
 
 Yes, we should.  I've just been too busy to do it.

As I enjoy creating large negative diffs :), I would almost do it - if I
had a test environment for IA64...

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
 apic access page is pinned in memory. As a result, it cannot be 
 migrated/hot-removed.
 Actually, it is not necessary to be pinned.
 
 The hpa of apic access page is stored in VMCS APIC_ACCESS_ADDR pointer. When
 the page is migrated, kvm_mmu_notifier_invalidate_page() will invalidate the
 corresponding ept entry. This patch introduces a new vcpu request named
 KVM_REQ_APIC_PAGE_RELOAD, and makes this request to all the vcpus at this 
 time,
 and force all the vcpus exit guest, and re-enter guest till they updates the 
 VMCS
 APIC_ACCESS_ADDR pointer to the new apic access page address, and updates
 kvm-arch.apic_access_page to the new page.
 
 Signed-off-by: Tang Chen tangc...@cn.fujitsu.com
 ---
  arch/x86/include/asm/kvm_host.h |  1 +
  arch/x86/kvm/svm.c  |  6 ++
  arch/x86/kvm/vmx.c  |  6 ++
  arch/x86/kvm/x86.c  | 15 +++
  include/linux/kvm_host.h|  2 ++
  virt/kvm/kvm_main.c | 12 
  6 files changed, 42 insertions(+)
 
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 index 35171c7..514183e 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -739,6 +739,7 @@ struct kvm_x86_ops {
   void (*hwapic_isr_update)(struct kvm *kvm, int isr);
   void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
   void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
 + void (*set_apic_access_page_addr)(struct kvm *kvm, hpa_t hpa);
   void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
   void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
   int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
 index 1d941ad..f2eacc4 100644
 --- a/arch/x86/kvm/svm.c
 +++ b/arch/x86/kvm/svm.c
 @@ -3619,6 +3619,11 @@ static void svm_set_virtual_x2apic_mode(struct 
 kvm_vcpu *vcpu, bool set)
   return;
  }
  
 +static void svm_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)
 +{
 + return;
 +}
 +
  static int svm_vm_has_apicv(struct kvm *kvm)
  {
   return 0;
 @@ -4373,6 +4378,7 @@ static struct kvm_x86_ops svm_x86_ops = {
   .enable_irq_window = enable_irq_window,
   .update_cr8_intercept = update_cr8_intercept,
   .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
 + .set_apic_access_page_addr = svm_set_apic_access_page_addr,
   .vm_has_apicv = svm_vm_has_apicv,
   .load_eoi_exitmap = svm_load_eoi_exitmap,
   .hwapic_isr_update = svm_hwapic_isr_update,
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 63c4c3e..da6d55d 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -7093,6 +7093,11 @@ static void vmx_set_virtual_x2apic_mode(struct 
 kvm_vcpu *vcpu, bool set)
   vmx_set_msr_bitmap(vcpu);
  }
  
 +static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)
 +{
 + vmcs_write64(APIC_ACCESS_ADDR, hpa);

This has to be guarded by if (!is_guest_mode(vcpu)).

 +}
 +
  static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
  {
   u16 status;
 @@ -8910,6 +8915,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
   .enable_irq_window = enable_irq_window,
   .update_cr8_intercept = update_cr8_intercept,
   .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
 + .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
   .vm_has_apicv = vmx_vm_has_apicv,
   .load_eoi_exitmap = vmx_load_eoi_exitmap,
   .hwapic_irr_update = vmx_hwapic_irr_update,
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index e05bd58..96f4188 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -5989,6 +5989,19 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
   kvm_apic_update_tmr(vcpu, tmr);
  }
  
 +static void vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 +{
 + /*
 +  * apic access page could be migrated. When the page is being migrated,
 +  * GUP will wait till the migrate entry is replaced with the new pte
 +  * entry pointing to the new page.
 +  */
 + vcpu-kvm-arch.apic_access_page = gfn_to_page(vcpu-kvm,
 + APIC_DEFAULT_PHYS_BASE  PAGE_SHIFT);
 + kvm_x86_ops-set_apic_access_page_addr(vcpu-kvm,
 + page_to_phys(vcpu-kvm-arch.apic_access_page));
 +}
 +
  /*
   * Returns 1 to let __vcpu_run() continue the guest execution loop without
   * exiting to the userspace.  Otherwise, the value will be returned to the
 @@ -6049,6 +6062,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
   kvm_deliver_pmi(vcpu);
   if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
   vcpu_scan_ioapic(vcpu);
 + if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
 + vcpu_reload_apic_access_page(vcpu);
   }
  
   if

Re: [PATCH v5 5/7] kvm, mem-hotplug: Reload L1's apic access page on migration when L2 is running.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
 This patch only handle L1 and L2 vm share one apic access page situation.
 
 When L1 vm is running, if the shared apic access page is migrated, 
 mmu_notifier will
 request all vcpus to exit to L0, and reload apic access page physical address 
 for
 all the vcpus' vmcs (which is done by patch 5/6). And when it enters L2 vm, 
 L2's vmcs
 will be updated in prepare_vmcs02() called by nested_vm_run(). So we need to 
 do
 nothing.
 
 When L2 vm is running, if the shared apic access page is migrated, 
 mmu_notifier will
 request all vcpus to exit to L0, and reload apic access page physical address 
 for
 all L2 vmcs. And this patch requests apic access page reload in L2-L1 vmexit.
 
 Signed-off-by: Tang Chen tangc...@cn.fujitsu.com
 ---
  arch/x86/kvm/vmx.c  | 7 +++
  virt/kvm/kvm_main.c | 1 +
  2 files changed, 8 insertions(+)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index da6d55d..e7704b2 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -8796,6 +8796,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
 u32 exit_reason,
   }
  
   /*
 +  * Do not call kvm_reload_apic_access_page() because we are now
 +  * running, mmu_notifier will force to reload the page's hpa for L2
 +  * vmcs. Need to reload it for L1 before entering L1.
 +  */
 + kvm_reload_apic_access_page(vcpu-kvm);

That would kill performance for large L1 hosts.
vcpu_reload_apic_access_page should be enough (please rename it to
kvm_vcpu_reload_apic_access_page in patch 4, and export it).

Paolo

 + /*
* Exiting from L2 to L1, we're now back to L1 which thinks it just
* finished a VMLAUNCH or VMRESUME instruction, so we need to set the
* success or failure flag accordingly.
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index d8280de..784127e 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -214,6 +214,7 @@ void kvm_reload_apic_access_page(struct kvm *kvm)
  {
   make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
  }
 +EXPORT_SYMBOL_GPL(kvm_reload_apic_access_page);
  
  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
  {
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

RE: [Qemu-devel] [PATCH] kvm: ioapic: conditionally delay irq delivery duringeoi broadcast

2014-09-11 Thread Gonglei (Arei)

 Subject: [Qemu-devel] [PATCH] kvm: ioapic: conditionally delay irq delivery
 duringeoi broadcast
 
 Currently, we call ioapic_service() immediately when we find the irq is still
 active during eoi broadcast. But for real hardware, there's some dealy between
 the EOI writing and irq delivery (system bus latency?). So we need to emulate
 this behavior. Otherwise, for a guest who haven't register a proper irq 
 handler
 , it would stay in the interrupt routine as this irq would be re-injected
 immediately after guest enables interrupt. This would lead guest can't move
 forward and may miss the possibility to get proper irq handler registered (one
 example is windows guest resuming from hibernation).
 
 As there's no way to differ the unhandled irq from new raised ones, this patch
 solve this problems by scheduling a delayed work when the count of irq 
 injected
 during eoi broadcast exceeds a threshold value. After this patch, the guest 
 can
 move a little forward when there's no suitable irq handler in case it may
 register one very soon and for guest who has a bad irq detection routine ( 
 such
 as note_interrupt() in linux ), this bad irq would be recognized soon as in 
 the
 past.
 
 Cc: Michael S. Tsirkin m...@redhat.com
 Signed-off-by: Jason Wang jasow...@redhat.com
 Signed-off-by: Zhang Haoyu zhan...@sangfor.com
 ---
  include/trace/events/kvm.h | 20 +++
  virt/kvm/ioapic.c  | 50
 --
  virt/kvm/ioapic.h  |  6 ++
  3 files changed, 74 insertions(+), 2 deletions(-)
 
If this is a new version, please add a v2/v3 suffix and describe the changes at
those different versions .

You can get more information from:
http://wiki.qemu.org/Contribute/SubmitAPatch

Best regards,
-Gonglei
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 7/7] kvm, mem-hotplug: Unpin and remove nested_vmx-apic_access_page.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
 Just like we removed kvm_arch-apic_access_page, nested_vmx-apic_access_page
 becomes useless for the same reason. This patch removes 
 nested_vmx-apic_access_page,
 and use gfn_to_page() to pin it in memory when we need it, and unpin it after 
 then.
 
 Signed-off-by: Tang Chen tangc...@cn.fujitsu.com
 ---
  arch/x86/kvm/vmx.c | 31 +--
  1 file changed, 9 insertions(+), 22 deletions(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 058c373..4aa73cb 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -374,11 +374,6 @@ struct nested_vmx {
   u64 vmcs01_tsc_offset;
   /* L2 must run next, and mustn't decide to exit to L1. */
   bool nested_run_pending;
 - /*
 -  * Guest pages referred to in vmcs02 with host-physical pointers, so
 -  * we must keep them pinned while L2 runs.
 -  */
 - struct page *apic_access_page;
   u64 msr_ia32_feature_control;
  
   struct hrtimer preemption_timer;
 @@ -6154,11 +6149,6 @@ static void free_nested(struct vcpu_vmx *vmx)
   nested_release_vmcs12(vmx);
   if (enable_shadow_vmcs)
   free_vmcs(vmx-nested.current_shadow_vmcs);
 - /* Unpin physical memory we referred to in current vmcs02 */
 - if (vmx-nested.apic_access_page) {
 - nested_release_page(vmx-nested.apic_access_page);
 - vmx-nested.apic_access_page = 0;
 - }
  
   nested_free_all_saved_vmcss(vmx);
  }
 @@ -7983,28 +7973,31 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
 struct vmcs12 *vmcs12)
   exec_control |= vmcs12-secondary_vm_exec_control;
  
   if (exec_control  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
 + struct page *page;
   /*
* Translate L1 physical address to host physical
* address for vmcs02. Keep the page pinned, so this
* physical address remains valid. We keep a reference
* to it so we can release it later.
*/
 - if (vmx-nested.apic_access_page) /* shouldn't happen */
 - 
 nested_release_page(vmx-nested.apic_access_page);
 - vmx-nested.apic_access_page =
 - nested_get_page(vcpu, vmcs12-apic_access_addr);
 + page = nested_get_page(vcpu, vmcs12-apic_access_addr);
   /*
* If translation failed, no matter: This feature asks
* to exit when accessing the given address, and if it
* can never be accessed, this feature won't do
* anything anyway.
*/
 - if (!vmx-nested.apic_access_page)
 + if (!page)
   exec_control =
 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
   else
   vmcs_write64(APIC_ACCESS_ADDR,
 -   page_to_phys(vmx-nested.apic_access_page));
 +  page_to_phys(page));
 + /*
 +  * Do not pin nested vm's apic access page in memory so
 +  * that memory hotplug process is able to migrate it.
 +  */
 + put_page(page);
   } else if (vm_need_virtualize_apic_accesses(vmx-vcpu.kvm)) {
   struct page *page = gfn_to_page(vmx-vcpu.kvm,
   APIC_DEFAULT_PHYS_BASE  PAGE_SHIFT);
 @@ -8807,12 +8800,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
 u32 exit_reason,
   /* This is needed for same reason as it was needed in prepare_vmcs02 */
   vmx-host_rsp = 0;
  
 - /* Unpin physical memory we referred to in vmcs02 */
 - if (vmx-nested.apic_access_page) {
 - nested_release_page(vmx-nested.apic_access_page);
 - vmx-nested.apic_access_page = 0;
 - }
 -
   /*
* Do not call kvm_reload_apic_access_page() because we are now
* running, mmu_notifier will force to reload the page's hpa for L2
 

This patch is not against the latest KVM tree.  The call to
nested_get_page is now in nested_get_vmcs12_pages, and you have to
handle virtual_apic_page in a similar manner.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 6/7] kvm, mem-hotplug: Unpin and remove kvm_arch-apic_access_page.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
 + if (vm_need_virtualize_apic_accesses(vmx-vcpu.kvm)) {
 + struct page *page = gfn_to_page(vmx-vcpu.kvm,
 + APIC_DEFAULT_PHYS_BASE  PAGE_SHIFT);
 + vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
 + /*
 +  * Do not pin apic access page in memory so that memory
 +  * hotplug process is able to migrate it.
 +  */
 + put_page(page);
 + }

Please reuse vcpu_reload_apic_access_page here, too.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: ioapic: add check for RTC_GSI

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 11:19, Jan Kiszka ha scritto:
 On 2014-09-11 11:08, Paolo Bonzini wrote:
 Il 11/09/2014 08:01, Jan Kiszka ha scritto:
 That issue is limited to the dead ia64 architecture, no other builds
 ioapic.c (besides x86, of course). Maybe finally remove KVM support for
 that arch instead of bloating otherwise correct code?

 Yes, we should.  I've just been too busy to do it.
 
 As I enjoy creating large negative diffs :), I would almost do it - if I
 had a test environment for IA64...

I have one. :)

Paolo

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Eric Auger

On 09/11/2014 05:10 AM, Christoffer Dall wrote:
 On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
 This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.

 This is a new control channel which enables KVM to cooperate with
 viable VFIO devices.

 The kvm-vfio device now holds a list of devices (kvm_vfio_device)
 in addition to a list of groups (kvm_vfio_group). The new
 infrastructure enables to check the validity of the VFIO device
 file descriptor, get and hold a reference to it.

 The first concrete implemented command is IRQ forward control:
 KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.

 It consists in programing the VFIO driver and KVM in a consistent manner
 so that an optimized IRQ injection/completion is set up. Each
 kvm_vfio_device holds a list of forwarded IRQ. When putting a
 kvm_vfio_device, the implementation makes sure the forwarded IRQs
 are set again in the normal handling state (non forwarded).
 
 'putting a kvm_vfio_device' sounds to like you're golf'ing :)
 
 When a kvm_vfio_device is released?
sure
 

 The forwarding programmming is architecture specific, embodied by the
 kvm_arch_set_fwd_state function. Its implementation is given in a
 separate patch file.
 
 I would drop the last sentence and instead indicate that this is handled
 properly when the architecture does not support such a feature.
ok
 

 The forwarding control modality is enabled by the
 __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.

 Signed-off-by: Eric Auger eric.au...@linaro.org

 ---

 v1 - v2:
 - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
 - original patch file separated into 2 parts: generic part moved in vfio.c
   and ARM specific part(kvm_arch_set_fwd_state)
 ---
  include/linux/kvm_host.h |  27 +++
  virt/kvm/vfio.c  | 452 
 ++-
  2 files changed, 477 insertions(+), 2 deletions(-)

 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
 index a4c33b3..24350dc 100644
 --- a/include/linux/kvm_host.h
 +++ b/include/linux/kvm_host.h
 @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
unsigned long arg);
  };
  
 +enum kvm_fwd_irq_action {
 +KVM_VFIO_IRQ_SET_FORWARD,
 +KVM_VFIO_IRQ_SET_NORMAL,
 +KVM_VFIO_IRQ_CLEANUP,
 
 This is KVM internal API, so it would probably be good to document this.
 Especially the CLEANUP bit worries me, see below.
I will document it
 
 +};
 +
 +/* internal structure describing a forwarded IRQ */
 +struct kvm_fwd_irq {
 +struct list_head link;
 
 this list entry is local to the kvm vfio device, right? that means you
 probably want a struct with just the below fields, and then have a
 containing struct in the generic device file, private to it's logic.
I will introduce 2 separate structs
 
 +__u32 index; /* platform device irq index */
 +__u32 hwirq; /*physical IRQ */
 +__u32 gsi; /* virtual IRQ */
 +struct kvm_vcpu *vcpu; /* vcpu to inject into*/
 +};
 +
  void kvm_device_get(struct kvm_device *dev);
  void kvm_device_put(struct kvm_device *dev);
  struct kvm_device *kvm_device_from_filp(struct file *filp);
 @@ -1075,6 +1090,18 @@ extern struct kvm_device_ops kvm_vfio_ops;
  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
  extern struct kvm_device_ops kvm_flic_ops;
  
 +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
 +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
 
 what's the 'p' in pfwd?
will rename
 
 +   enum kvm_fwd_irq_action action);
 +
 +#else
 +static inline int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
 + enum kvm_fwd_irq_action action)
 +{
 +return 0;
 +}
 +#endif
 +
  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
  
  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool 
 val)
 diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
 index 76dc7a1..e4a81c4 100644
 --- a/virt/kvm/vfio.c
 +++ b/virt/kvm/vfio.c
 @@ -18,14 +18,24 @@
  #include linux/slab.h
  #include linux/uaccess.h
  #include linux/vfio.h
 +#include linux/platform_device.h
  
  struct kvm_vfio_group {
  struct list_head node;
  struct vfio_group *vfio_group;
  };
  
 +struct kvm_vfio_device {
 +struct list_head node;
 +struct vfio_device *vfio_device;
 +/* list of forwarded IRQs for that VFIO device */
 +struct list_head fwd_irq_list;
 +int fd;
 +};
 +
  struct kvm_vfio {
  struct list_head group_list;
 +struct list_head device_list;
  struct mutex lock;
  bool noncoherent;
  };
 @@ -246,12 +256,441 @@ static int kvm_vfio_set_group(struct kvm_device *dev, 
 long attr, u64 arg)
  return -ENXIO;
  }
  
 +/**
 + * get_vfio_device - returns the vfio-device corresponding to this fd
 + * @fd:fd of the vfio platform device
 + *
 + * checks it is a vfio device
 + * increment its ref counter
 
 why the short lines?  Just write this out in proper English.
OK
 
 + */
 +static struct vfio_device *kvm_vfio_get_vfio_device(int fd)
 +{

Re: [PATCH v5 7/7] kvm, mem-hotplug: Unpin and remove nested_vmx-apic_access_page.

2014-09-11 Thread tangchen



On 09/11/2014 05:33 PM, Paolo Bonzini wrote:

This patch is not against the latest KVM tree.  The call to
nested_get_page is now in nested_get_vmcs12_pages, and you have to
handle virtual_apic_page in a similar manner.

Hi Paolo,

Thanks for the reviewing.

This patch-set is against Linux v3.17-rc4.
Will make it against the latest KVM tree, and resend a patch set 
following you comments.


Thanks.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [BUG] Guest kernel divide error in kvm_unlock_kick

2014-09-11 Thread Paolo Bonzini

Il 08/09/2014 15:28, Chris Webb ha scritto:
 divide error:  [#1] PREEMPT SMP 
 Modules linked in:
 CPU: 0 PID: 743 Comm: syslogd Not tainted 3.16.2-guest #2
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
 rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
 task: 88007c972580 ti: 88007cb7c000 task.ti: 88007cb7c000
 RIP: 0010:[81037fe2]  [81037fe2] kvm_unlock_kick+0x72/0x80
 RSP: :88007fc03ec8  EFLAGS: 00010046
 RAX: 0005 RBX:  RCX: 0003
 RDX: 0003 RSI: 81a466a0 RDI: 
 RBP: 81a466a0 R08: 81b98940 R09: 0246
 R10: 0400 R11:  R12: 00ea
 R13: 0009 R14: 0002 R15: 88007fc0d300
 FS:  7f2a6473e700() GS:88007fc0() knlGS:
 CS:  0010 DS:  ES:  CR0: 8005003b
 CR2: 004a8240 CR3: 7ac75000 CR4: 000406f0
 Stack:
  81a46400 0246 0001 8168979d
  0282 81110d97 0007 88007cb7ffd8
  88007c972580 4b0782e8 0002 81a0b0c8
 Call Trace:
  IRQ 
  [8168979d] ? _raw_spin_unlock_irqrestore+0x5d/0x80
  [81110d97] ? rcu_process_callbacks+0x337/0x4f0
  [810cde2d] ? __do_softirq+0xfd/0x210
  [810ce06e] ? irq_exit+0x7e/0xa0
  [8103063b] ? smp_apic_timer_interrupt+0x3b/0x50
  [8168b04d] ? apic_timer_interrupt+0x6d/0x80
  EOI 
  [8114180b] ? filemap_map_pages+0x17b/0x240
  [811418c0] ? filemap_map_pages+0x230/0x240
  [811679e2] ? do_read_fault.isra.70+0x2a2/0x320
  [811696cc] ? handle_mm_fault+0x37c/0xd00
  [8103bb45] ? __do_page_fault+0x185/0x4c0
  [8168b958] ? async_page_fault+0x28/0x30
  [813b9610] ? __put_user_4+0x20/0x30
  [8168b958] ? async_page_fault+0x28/0x30
 Code: c0 ca a7 81 48 8d 04 0b 48 8b 30 48 39 ee 75 c9 0f b6 40 08 44 38 e0 75 
 c0 48 c7 c0 22 b0 00 00 31 db 0f b7 0c 08 b8 05 00 00 00 0f 01 c1 0f 1f 00 
 5b 5d 41 5c c3 0f 1f 00 48 c7 c0 10 cf 00 00 

Hi Chris,

sorry for not following up on your previous patch.

This is a hypercall that should have kicked VCPU 3 (see rcx).

Can you please apply this patch and gather a trace of the host
(using trace-cmd -e kvm qemu-kvm arguments)?

Thanks,

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fb919c574e23..25ed29f68419 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -709,6 +709,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
int result = 0;
struct kvm_vcpu *vcpu = apic-vcpu;
 
+   trace_kvm_apic_accept_irq(vcpu-vcpu_id, delivery_mode,
+ trig_mode, vector, false);
switch (delivery_mode) {
case APIC_DM_LOWEST:
vcpu-arch.apic_arb_prio++;
@@ -730,8 +732,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
-   trace_kvm_apic_accept_irq(vcpu-vcpu_id, delivery_mode,
- trig_mode, vector, false);
break;
 
case APIC_DM_REMRD:



Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] KVM: x86: make apic_accept_irq tracepoint more generic

2014-09-11 Thread Paolo Bonzini

Initially the tracepoint was added only to the APIC_DM_FIXED case,
also because it reported coalesced interrupts that only made sense
for that case.  However, the coalesced argument is not used anymore
and tracing other delivery modes is useful, so hoist the call out
of the switch statement.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
 arch/x86/kvm/lapic.c |  4 ++--
 arch/x86/kvm/trace.h | 11 ---
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fb919c574e23..b8345dd41b25 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -709,6 +709,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
int result = 0;
struct kvm_vcpu *vcpu = apic-vcpu;
 
+   trace_kvm_apic_accept_irq(vcpu-vcpu_id, delivery_mode,
+ trig_mode, vector);
switch (delivery_mode) {
case APIC_DM_LOWEST:
vcpu-arch.apic_arb_prio++;
@@ -730,8 +732,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
-   trace_kvm_apic_accept_irq(vcpu-vcpu_id, delivery_mode,
- trig_mode, vector, false);
break;
 
case APIC_DM_REMRD:
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4c2868f36808..6b06ab8748dd 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -415,15 +415,14 @@ TRACE_EVENT(kvm_apic_ipi,
 );
 
 TRACE_EVENT(kvm_apic_accept_irq,
-   TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced),
-   TP_ARGS(apicid, dm, tm, vec, coalesced),
+   TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec),
+   TP_ARGS(apicid, dm, tm, vec),
 
TP_STRUCT__entry(
__field(__u32,  apicid  )
__field(__u16,  dm  )
__field(__u8,   tm  )
__field(__u8,   vec )
-   __field(bool,   coalesced   )
),
 
TP_fast_assign(
@@ -431,14 +430,12 @@ TRACE_EVENT(kvm_apic_accept_irq,
__entry-dm = dm;
__entry-tm = tm;
__entry-vec= vec;
-   __entry-coalesced  = coalesced;
),
 
-   TP_printk(apicid %x vec %u (%s|%s)%s,
+   TP_printk(apicid %x vec %u (%s|%s),
  __entry-apicid, __entry-vec,
  __print_symbolic((__entry-dm  8  0x7), kvm_deliver_mode),
- __entry-tm ? level : edge,
- __entry-coalesced ?  (coalesced) : )
+ __entry-tm ? level : edge)
 );
 
 TRACE_EVENT(kvm_eoi,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 11:21:49AM +0200, Paolo Bonzini wrote:
 Il 11/09/2014 07:38, Tang Chen ha scritto:
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index 63c4c3e..da6d55d 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -7093,6 +7093,11 @@ static void vmx_set_virtual_x2apic_mode(struct 
  kvm_vcpu *vcpu, bool set)
  vmx_set_msr_bitmap(vcpu);
   }
   
  +static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)
  +{
  +   vmcs_write64(APIC_ACCESS_ADDR, hpa);
 
 This has to be guarded by if (!is_guest_mode(vcpu)).
 
We do need to write it if L1 and L2 share APIC_ACCESS_ADDR and skip
it otherwise, no?

  +}
  +

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread tangchen



On 09/11/2014 05:21 PM, Paolo Bonzini wrote:

Il 11/09/2014 07:38, Tang Chen ha scritto:

apic access page is pinned in memory. As a result, it cannot be 
migrated/hot-removed.
Actually, it is not necessary to be pinned.

The hpa of apic access page is stored in VMCS APIC_ACCESS_ADDR pointer. When
the page is migrated, kvm_mmu_notifier_invalidate_page() will invalidate the
corresponding ept entry. This patch introduces a new vcpu request named
KVM_REQ_APIC_PAGE_RELOAD, and makes this request to all the vcpus at this time,
and force all the vcpus exit guest, and re-enter guest till they updates the 
VMCS
APIC_ACCESS_ADDR pointer to the new apic access page address, and updates
kvm-arch.apic_access_page to the new page.

Signed-off-by: Tang Chen tangc...@cn.fujitsu.com
---
  arch/x86/include/asm/kvm_host.h |  1 +
  arch/x86/kvm/svm.c  |  6 ++
  arch/x86/kvm/vmx.c  |  6 ++
  arch/x86/kvm/x86.c  | 15 +++
  include/linux/kvm_host.h|  2 ++
  virt/kvm/kvm_main.c | 12 
  6 files changed, 42 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 35171c7..514183e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -739,6 +739,7 @@ struct kvm_x86_ops {
void (*hwapic_isr_update)(struct kvm *kvm, int isr);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
+   void (*set_apic_access_page_addr)(struct kvm *kvm, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1d941ad..f2eacc4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3619,6 +3619,11 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu 
*vcpu, bool set)
return;
  }
  
+static void svm_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)

+{
+   return;
+}
+
  static int svm_vm_has_apicv(struct kvm *kvm)
  {
return 0;
@@ -4373,6 +4378,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
+   .set_apic_access_page_addr = svm_set_apic_access_page_addr,
.vm_has_apicv = svm_vm_has_apicv,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_isr_update = svm_hwapic_isr_update,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 63c4c3e..da6d55d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7093,6 +7093,11 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu 
*vcpu, bool set)
vmx_set_msr_bitmap(vcpu);
  }
  
+static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)

+{
+   vmcs_write64(APIC_ACCESS_ADDR, hpa);

This has to be guarded by if (!is_guest_mode(vcpu)).


Since we cannot get vcpu through kvm, I'd like to move this check to
vcpu_reload_apic_access_page() when 
kvm_x86_ops-set_apic_access_page_addr()

is called.

Thanks.



+}
+
  static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
  {
u16 status;
@@ -8910,6 +8915,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
+   .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
.vm_has_apicv = vmx_vm_has_apicv,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.hwapic_irr_update = vmx_hwapic_irr_update,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e05bd58..96f4188 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5989,6 +5989,19 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
kvm_apic_update_tmr(vcpu, tmr);
  }
  
+static void vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)

+{
+   /*
+* apic access page could be migrated. When the page is being migrated,
+* GUP will wait till the migrate entry is replaced with the new pte
+* entry pointing to the new page.
+*/
+   vcpu-kvm-arch.apic_access_page = gfn_to_page(vcpu-kvm,
+   APIC_DEFAULT_PHYS_BASE  PAGE_SHIFT);
+   kvm_x86_ops-set_apic_access_page_addr(vcpu-kvm,
+   page_to_phys(vcpu-kvm-arch.apic_access_page));
+}
+
  /*
   * Returns 1 to let __vcpu_run() continue the guest execution loop without
   * exiting to the userspace.  Otherwise, the value will be returned to the
@@ -6049,6 +6062,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_deliver_pmi(vcpu);
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))

Re: [PATCH v2] ARM/arm64: KVM: fix use of WnR bit in kvm_is_write_fault()

2014-09-11 Thread Marc Zyngier

On 11/09/14 04:12, Christoffer Dall wrote:
 On Tue, Sep 09, 2014 at 12:02:59PM +0100, Marc Zyngier wrote:
 [resending, as ARM email server seems to be in some mood]

 On 09/09/14 11:27, Ard Biesheuvel wrote:
 The ISS encoding for an exception from a Data Abort has a WnR
 bit[6] that indicates whether the Data Abort was caused by a
 read or a write instruction. While there are several fields
 in the encoding that are only valid if the ISV bit[24] is set,
 WnR is not one of them, so we can read it unconditionally.

 Instead of fixing both implementations of kvm_is_write_fault()
 in place, reimplement it just once using kvm_vcpu_dabt_iswrite(),
 which already does the right thing with respect to the WnR bit.
 Also fix up the callers to pass 'vcpu'

 Acked-by: Laszlo Ersek ler...@redhat.com
 Signed-off-by: Ard Biesheuvel ard.biesheu...@linaro.org

 Because I like that kind of diffstat:
 Acked-by: Marc Zyngier marc.zyng...@arm.com

 Christoffer, if you too are happy with that, I'll queue it right away.

 Extremely happy:
 
 Acked-by: Christoffer Dall christoffer.d...@linaro.org

Added to kvmarm/queue.

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 3/7] kvm: Make init_rmode_identity_map() return 0 on success.

2014-09-11 Thread tangchen



On 09/11/2014 05:17 PM, Paolo Bonzini wrote:

..
@@ -7645,7 +7642,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, 
unsigned int id)
kvm-arch.ept_identity_map_addr =
VMX_EPT_IDENTITY_PAGETABLE_ADDR;
err = -ENOMEM;
-   if (!init_rmode_identity_map(kvm))
+   if (init_rmode_identity_map(kvm))
Please add  0 here.  I would also consider setting err to the return
value of init_rmode_identity_map, and initializing it to -ENOMEM only
after the if.


I'd like to move err = -ENOMEM to the following place:

vmx_create_vcpu()
{
..
err = kvm_vcpu_init(vmx-vcpu, kvm, id);
if (err)
goto free_vcpu;

err = -ENOMEM;  -- move it here

vmx-guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);

vmx-loaded_vmcs-vmcs = alloc_vmcs();

}

So that it can be used to handle the next two memory allocation error.

Thanks.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: EVENTFD: remove inclusion of irq.h

2014-09-11 Thread Marc Zyngier

On 11/09/14 08:11, Paolo Bonzini wrote:
 Il 11/09/2014 05:09, Christoffer Dall ha scritto:
 On Mon, Sep 01, 2014 at 12:11:19PM +0200, Paolo Bonzini wrote:
 Il 01/09/2014 10:36, Eric Auger ha scritto:
 No more needed. irq.h would be void on ARM.

 Signed-off-by: Eric Auger eric.au...@linaro.org

 ---

 I don't think irq.h is needed anymore since Paul Mackerras' work. However
 I did not compile for all architectures.
 ---
  virt/kvm/eventfd.c | 1 -
  1 file changed, 1 deletion(-)

 diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
 index 3c5981c..0c712a7 100644
 --- a/virt/kvm/eventfd.c
 +++ b/virt/kvm/eventfd.c
 @@ -36,7 +36,6 @@
  #include linux/seqlock.h
  #include trace/events/kvm.h
  
 -#include irq.h
  #include iodev.h
  
  #ifdef CONFIG_HAVE_KVM_IRQFD


 Acked-by: Paolo Bonzini pbonz...@redhat.com

 Christoffer, please include this via the ARM tree, together with ARM
 irqfd support.  Thanks,

 Marc is dealing with the tree this week and the next so he can apply it
 to kvmarm/queue.  Do you want it to wait and go with the irqfd patch
 (which has dependencies not yet resolved) or should we just queue it?
 
 As you prefer.

Added to kvmarm/queue.

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

blk-mq crash under KVM in multiqueue block code (with virtio-blk and ext4)

2014-09-11 Thread Christian Borntraeger

Folks,

we have seen the following bug with 3.16 as a KVM guest. It suspect the blk-mq 
rework that happened between 3.15 and 3.16, but it can be something completely 
different.


[   65.992022] Unable to handle kernel pointer dereference in virtual kernel 
address space
[   65.992187] failing address: d000 TEID: d803
[   65.992363] Fault in home space mode while using kernel ASCE.
[   65.992365] AS:00a7c007 R3:0024 
[   65.993754] Oops: 0038 [#1] SMP 
[   65.993923] Modules linked in: iscsi_tcp libiscsi_tcp libiscsi 
scsi_transport_iscsi virtio_balloon vhost_net vhost macvtap macvlan kvm 
dm_multipath virtio_net virtio_blk sunrpc
[   65.994274] CPU: 0 PID: 44 Comm: kworker/u6:2 Not tainted 
3.16.0-20140814.0.c66c84c.fc18-s390xfrob #1
[   65.996043] Workqueue: writeback bdi_writeback_workfn (flush-251:32)
[   65.996222] task: 0225 ti: 02258000 task.ti: 
02258000
[   65.996228] Krnl PSW : 0704f0018000 003ed114 
(blk_mq_tag_to_rq+0x20/0x38)
[   65.997299]R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:3 PM:0 
EA:3
   Krnl GPRS: 0040  01619000 
004e
[   65.997301]004e  0001 
00a0de18
[   65.997302]77ffbe18 77ffbd50 6d72d620 
004f
[   65.997304]01a99400 0080 003eddee 
77ffbc28
[   65.997864] Krnl Code: 003ed106: e3102034lg  
%r1,48(%r2)
  003ed10c: 91082044tm  
68(%r2),8
 #003ed110: a7840009brc 8,3ed122
 003ed114: e34016880004lg  
%r4,1672(%r1)
  003ed11a: 59304100c   
%r3,256(%r4)
  003ed11e: a7840003brc 8,3ed124
  003ed122: 07febcr 15,%r14
  003ed124: b9040024lgr %r2,%r4
[   65.998221] Call Trace:
[   65.998224] ([0001] 0x1)
[   65.998227]  [003f17b6] blk_mq_tag_busy_iter+0x7a/0xc4
[   65.998228]  [003edcd6] blk_mq_rq_timer+0x96/0x13c
[   65.999226]  [0013ee60] call_timer_fn+0x40/0x110
[   65.999230]  [0013f642] run_timer_softirq+0x2de/0x3d0
[   65.999238]  [00135b70] __do_softirq+0x124/0x2ac
[   65.999241]  [00136000] irq_exit+0xc4/0xe4
[   65.999435]  [0010bc08] do_IRQ+0x64/0x84
[   66.437533]  [0067ccd8] ext_skip+0x42/0x46
[   66.437541]  [003ed7b4] __blk_mq_alloc_request+0x58/0x1e8
[   66.437544] ([003ed788] __blk_mq_alloc_request+0x2c/0x1e8)
[   66.437547]  [003eef82] blk_mq_map_request+0xc2/0x208
[   66.437549]  [003ef860] blk_sq_make_request+0xac/0x350
[   66.437721]  [003e2d6c] generic_make_request+0xc4/0xfc
[   66.437723]  [003e2e56] submit_bio+0xb2/0x1a8
[   66.438373]  [0031e8aa] ext4_io_submit+0x52/0x80
[   66.438375]  [0031ccfa] ext4_writepages+0x7c6/0xd0c
[   66.438378]  [002aea20] __writeback_single_inode+0x54/0x274
[   66.438379]  [002b0134] writeback_sb_inodes+0x28c/0x4ec
[   66.438380]  [002b042e] __writeback_inodes_wb+0x9a/0xe4
[   66.438382]  [002b06a2] wb_writeback+0x22a/0x358
[   66.438383]  [002b0cd0] bdi_writeback_workfn+0x354/0x538
[   66.438618]  [0014e3aa] process_one_work+0x1aa/0x418
[   66.438621]  [0014ef94] worker_thread+0x48/0x524
[   66.438625]  [001560ca] kthread+0xee/0x108
[   66.438627]  [0067c76e] kernel_thread_starter+0x6/0xc
[   66.438628]  [0067c768] kernel_thread_starter+0x0/0xc
[   66.438629] Last Breaking-Event-Address:
[   66.438631]  [003edde8] blk_mq_timeout_check+0x6c/0xb8

I looked into the dump, and the full function is  (annotated by me to match the 
source code)
r2= tags
r3= tag (4e)
Dump of assembler code for function blk_mq_tag_to_rq:
   0x003ed0f4 +0: lg  %r1,96(%r2) # r1 
has now tags-rqs
   0x003ed0fa +6: sllg%r2,%r3,3   # r2 
has tag*8
   0x003ed100 +12:lg  %r2,0(%r2,%r1)  # r2 
now has rq (=tags-rqs[tag])
   0x003ed106 +18:lg  %r1,48(%r2) # r1 
now has rq-q
   0x003ed10c +24:tm  68(%r2),8   # test 
for rq-cmd_flags  REQ_FLUSH_SEQ
   0x003ed110 +28:je  0x3ed122 blk_mq_tag_to_rq+46  #  if 
not goto 3ed122
   0x003ed114 +32:lg  %r4,1672(%r1)   # r4 = 
rq-q-flush_rq   CRASHES as rq-q points to 
   0x003ed11a +38:c   %r3,256(%r4)# 
compare tag with rq-q-flush_rq-tag
   0x003ed11e +42:je

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 12:20, tangchen ha scritto:

 +vmcs_write64(APIC_ACCESS_ADDR, hpa);
 This has to be guarded by if (!is_guest_mode(vcpu)).
 
 Since we cannot get vcpu through kvm, I'd like to move this check to
 vcpu_reload_apic_access_page() when
 kvm_x86_ops-set_apic_access_page_addr()
 is called.

Good idea!  Though passing the vcpu to vmx_set_apic_access_page_addr
would also work.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 12:12, Gleb Natapov ha scritto:
 On Thu, Sep 11, 2014 at 11:21:49AM +0200, Paolo Bonzini wrote:
 Il 11/09/2014 07:38, Tang Chen ha scritto:
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 63c4c3e..da6d55d 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -7093,6 +7093,11 @@ static void vmx_set_virtual_x2apic_mode(struct 
 kvm_vcpu *vcpu, bool set)
 vmx_set_msr_bitmap(vcpu);
  }
  
 +static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)
 +{
 +   vmcs_write64(APIC_ACCESS_ADDR, hpa);

 This has to be guarded by if (!is_guest_mode(vcpu)).

 We do need to write it if L1 and L2 share APIC_ACCESS_ADDR and skip
 it otherwise, no?

Yes, but this would be handled by patch 6:

} else if (vm_need_virtualize_apic_accesses(vmx-vcpu.kvm)) {
+   struct page *page = gfn_to_page(vmx-vcpu.kvm,
+   APIC_DEFAULT_PHYS_BASE  PAGE_SHIFT);
exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-   vmcs_write64(APIC_ACCESS_ADDR,
-   page_to_phys(vcpu-kvm-arch.apic_access_page));
+   vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+   /*
+* Do not pin apic access page in memory so that memory
+* hotplug process is able to migrate it.
+*/
+   put_page(page);
}

However, this is also useless code duplication because the above snippet could
reuse vcpu_reload_apic_access_page too.

So I think you cannot do the is_guest_mode check in
kvm_vcpu_reload_apic_access_page and also not in
vmx_reload_apic_access_page.  But you could do something like

kvm_vcpu_reload_apic_access_page(...)
{
...
kvm_x86_ops-reload_apic_access_page(...);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);

/* used in vcpu_enter_guest only */
vcpu_reload_apic_access_page(...)
{
if (!is_guest_mode(vcpu))
kvm_vcpu_reload_apic_access_page(...)
}

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 2/8] arm/arm64: KVM: vgic: switch to dynamic allocation

2014-09-11 Thread Marc Zyngier

So far, all the VGIC data structures are statically defined by the
*maximum* number of vcpus and interrupts it supports. It means that
we always have to oversize it to cater for the worse case.

Start by changing the data structures to be dynamically sizeable,
and allocate them at runtime.

The sizes are still very static though.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 arch/arm/kvm/arm.c |   3 +
 include/kvm/arm_vgic.h |  76 
 virt/kvm/arm/vgic.c| 237 ++---
 3 files changed, 267 insertions(+), 49 deletions(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index a99e0cd..923a01d 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -172,6 +172,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm-vcpus[i] = NULL;
}
}
+
+   kvm_vgic_destroy(kvm);
 }
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -253,6 +255,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
kvm_mmu_free_memory_caches(vcpu);
kvm_timer_vcpu_terminate(vcpu);
+   kvm_vgic_vcpu_destroy(vcpu);
kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index f074539..bdaac57 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -54,19 +54,33 @@
  * - a bunch of shared interrupts (SPI)
  */
 struct vgic_bitmap {
-   union {
-   u32 reg[VGIC_NR_PRIVATE_IRQS / 32];
-   DECLARE_BITMAP(reg_ul, VGIC_NR_PRIVATE_IRQS);
-   } percpu[VGIC_MAX_CPUS];
-   union {
-   u32 reg[VGIC_NR_SHARED_IRQS / 32];
-   DECLARE_BITMAP(reg_ul, VGIC_NR_SHARED_IRQS);
-   } shared;
+   /*
+* - One UL per VCPU for private interrupts (assumes UL is at
+*   least 32 bits)
+* - As many UL as necessary for shared interrupts.
+*
+* The private interrupts are accessed via the private
+* field, one UL per vcpu (the state for vcpu n is in
+* private[n]). The shared interrupts are accessed via the
+* shared pointer (IRQn state is at bit n-32 in the bitmap).
+*/
+   unsigned long *private;
+   unsigned long *shared;
 };
 
 struct vgic_bytemap {
-   u32 percpu[VGIC_MAX_CPUS][VGIC_NR_PRIVATE_IRQS / 4];
-   u32 shared[VGIC_NR_SHARED_IRQS  / 4];
+   /*
+* - 8 u32 per VCPU for private interrupts
+* - As many u32 as necessary for shared interrupts.
+*
+* The private interrupts are accessed via the private
+* field, (the state for vcpu n is in private[n*8] to
+* private[n*8 + 7]). The shared interrupts are accessed via
+* the shared pointer (IRQn state is at byte (n-32)%4 of the
+* shared[(n-32)/4] word).
+*/
+   u32 *private;
+   u32 *shared;
 };
 
 struct kvm_vcpu;
@@ -127,6 +141,9 @@ struct vgic_dist {
boolin_kernel;
boolready;
 
+   int nr_cpus;
+   int nr_irqs;
+
/* Virtual control interface mapping */
void __iomem*vctrl_base;
 
@@ -166,15 +183,36 @@ struct vgic_dist {
/* Level/edge triggered */
struct vgic_bitmap  irq_cfg;
 
-   /* Source CPU per SGI and target CPU */
-   u8  irq_sgi_sources[VGIC_MAX_CPUS][VGIC_NR_SGIS];
+   /*
+* Source CPU per SGI and target CPU:
+*
+* Each byte represent a SGI observable on a VCPU, each bit of
+* this byte indicating if the corresponding VCPU has
+* generated this interrupt. This is a GICv2 feature only.
+*
+* For VCPUn (n  8), irq_sgi_sources[n*16] to [n*16 + 15] are
+* the SGIs observable on VCPUn.
+*/
+   u8  *irq_sgi_sources;
 
-   /* Target CPU for each IRQ */
-   u8  irq_spi_cpu[VGIC_NR_SHARED_IRQS];
-   struct vgic_bitmap  irq_spi_target[VGIC_MAX_CPUS];
+   /*
+* Target CPU for each SPI:
+*
+* Array of available SPI, each byte indicating the target
+* VCPU for SPI. IRQn (n =32) is at irq_spi_cpu[n-32].
+*/
+   u8  *irq_spi_cpu;
+
+   /*
+* Reverse lookup of irq_spi_cpu for faster compute pending:
+*
+* Array of bitmaps, one per VCPU, describing is IRQn is
+* routed to a particular VCPU.
+*/
+   struct vgic_bitmap  *irq_spi_target;
 
/* Bitmap indicating which CPU has something pending */
-   unsigned long   irq_pending_on_cpu;
+   unsigned long   *irq_pending_on_cpu;
 #endif
 };
 
@@ -204,11 +242,11 @@ struct vgic_v3_cpu_if {
 struct vgic_cpu {
 #ifdef CONFIG_KVM_ARM_VGIC
/* per IRQ to LR mapping */
-   u8  vgic_irq_lr_map[VGIC_NR_IRQS];
+   u8

[PATCH v4 0/8] arm/arm64: KVM: dynamic VGIC sizing

2014-09-11 Thread Marc Zyngier

So far, the VGIC data structures have been statically sized, meaning
that we always have to support more interrupts than we actually want,
and more CPU interfaces than we should. This is a waste of resource,
and is the kind of things that should be tuneable.

This series addresses that issue by changing the data structures to be
dynamically allocated, and adds a new configuration attribute to
allocate the number of interrupts. When the attribute is not used, we
fallback to the old behaviour of allocating a fixed number of
interrupts.

This series is also the base for Andre Przywara's GICv3 distributor
emulation code (which can support far more than 8 vcpus and 1020
interrupts).

This has been tested on both ARM (TC2, A20) and arm64 (model and Juno).

The code is available from my kvm-arm64/kvmtool-vgic-dyn branch,
together with the corresponding kvmtool code.

* From v3 [3]
  - Number of comments added to the data structures, making slightly more
obvious the various mappings
  - Dropped the nr_irqs field from bitmap and bytemap structures, as
it was a leftover from the initial revision that only had a single
pointer
  - Small cleanups all over the place
  - Dropped the sub-page offset patch for now, as this need some
serious reworking
  - Rebased on top of Christoffer vgic cleanup series, with 3.17-rc4
thrown in for a good measure

* From v2 [2]
  - Fixed bug that broke QEMU (register access can trigger allocation)
  - irq_pending_on_cpu is now dynamic (needed for more than 32 or 64 vcpus)
  - Rebased on top of Victor's BE patches

* From v1 [1]
  - Rebased on top of 3.16-rc1
  - Lots of cleanup

[1]: https://lists.cs.columbia.edu/pipermail/kvmarm/2013-October/005879.html
[2]: https://lists.cs.columbia.edu/pipermail/kvmarm/2014-June/010050.html
[3]: https://lists.cs.columbia.edu/pipermail/kvmarm/2014-July/010383.html

Marc Zyngier (8):
  KVM: ARM: vgic: plug irq injection race
  arm/arm64: KVM: vgic: switch to dynamic allocation
  arm/arm64: KVM: vgic: Parametrize VGIC_NR_SHARED_IRQS
  arm/arm64: KVM: vgic: kill VGIC_MAX_CPUS
  arm/arm64: KVM: vgic: handle out-of-range MMIO accesses
  arm/arm64: KVM: vgic: kill VGIC_NR_IRQS
  arm/arm64: KVM: vgic: delay vgic allocation until init time
  arm/arm64: KVM: vgic: make number of irqs a configurable attribute

 Documentation/virtual/kvm/devices/arm-vgic.txt |  10 +
 arch/arm/include/uapi/asm/kvm.h|   1 +
 arch/arm/kvm/arm.c |  10 +-
 arch/arm64/include/uapi/asm/kvm.h  |   1 +
 include/kvm/arm_vgic.h |  88 --
 virt/kvm/arm/vgic.c| 396 +
 6 files changed, 413 insertions(+), 93 deletions(-)

-- 
2.0.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 3/8] arm/arm64: KVM: vgic: Parametrize VGIC_NR_SHARED_IRQS

2014-09-11 Thread Marc Zyngier

Having a dynamic number of supported interrupts means that we
cannot relly on VGIC_NR_SHARED_IRQS being fixed anymore.

Instead, make it take the distributor structure as a parameter,
so it can return the right value.

Reviewed-by: Christoffer Dall christoffer.d...@linaro.org
Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 include/kvm/arm_vgic.h |  1 -
 virt/kvm/arm/vgic.c| 16 +++-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index bdaac57..bdeb451 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -29,7 +29,6 @@
 #define VGIC_NR_SGIS   16
 #define VGIC_NR_PPIS   16
 #define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
-#define VGIC_NR_SHARED_IRQS(VGIC_NR_IRQS - VGIC_NR_PRIVATE_IRQS)
 #define VGIC_MAX_CPUS  KVM_MAX_VCPUS
 
 #define VGIC_V2_MAX_LRS(1  6)
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 92c086e..93fe73b 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1083,11 +1083,17 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, 
u32 reg)
}
 }
 
+static int vgic_nr_shared_irqs(struct vgic_dist *dist)
+{
+   return dist-nr_irqs - VGIC_NR_PRIVATE_IRQS;
+}
+
 static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
 {
struct vgic_dist *dist = vcpu-kvm-arch.vgic;
unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
unsigned long pending_private, pending_shared;
+   int nr_shared = vgic_nr_shared_irqs(dist);
int vcpu_id;
 
vcpu_id = vcpu-vcpu_id;
@@ -1100,15 +1106,15 @@ static int compute_pending_for_cpu(struct kvm_vcpu 
*vcpu)
 
pending = vgic_bitmap_get_shared_map(dist-irq_pending);
enabled = vgic_bitmap_get_shared_map(dist-irq_enabled);
-   bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS);
+   bitmap_and(pend_shared, pending, enabled, nr_shared);
bitmap_and(pend_shared, pend_shared,
   vgic_bitmap_get_shared_map(dist-irq_spi_target[vcpu_id]),
-  VGIC_NR_SHARED_IRQS);
+  nr_shared);
 
pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS);
-   pending_shared = find_first_bit(pend_shared, VGIC_NR_SHARED_IRQS);
+   pending_shared = find_first_bit(pend_shared, nr_shared);
return (pending_private  VGIC_NR_PRIVATE_IRQS ||
-   pending_shared  VGIC_NR_SHARED_IRQS);
+   pending_shared  vgic_nr_shared_irqs(dist));
 }
 
 /*
@@ -1365,7 +1371,7 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu 
*vcpu)
}
 
/* SPIs */
-   for_each_set_bit(i, vgic_cpu-pending_shared, VGIC_NR_SHARED_IRQS) {
+   for_each_set_bit(i, vgic_cpu-pending_shared, 
vgic_nr_shared_irqs(dist)) {
if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
overflow = 1;
}
-- 
2.0.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 1/8] KVM: ARM: vgic: plug irq injection race

2014-09-11 Thread Marc Zyngier

As it stands, nothing prevents userspace from injecting an interrupt
before the guest's GIC is actually initialized.

This goes unnoticed so far (as everything is pretty much statically
allocated), but ends up exploding in a spectacular way once we switch
to a more dynamic allocation (the GIC data structure isn't there yet).

The fix is to test for the ready flag in the VGIC distributor before
trying to inject the interrupt. Note that in order to avoid breaking
userspace, we have to ignore what is essentially an error.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
Acked-by: Christoffer Dall christoffer.d...@linaro.org
---
 virt/kvm/arm/vgic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index f7ab1ca..d3299d4 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1584,7 +1584,8 @@ out:
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
bool level)
 {
-   if (vgic_update_irq_pending(kvm, cpuid, irq_num, level))
+   if (likely(vgic_initialized(kvm)) 
+   vgic_update_irq_pending(kvm, cpuid, irq_num, level))
vgic_kick_vcpus(kvm);
 
return 0;
-- 
2.0.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 5/8] arm/arm64: KVM: vgic: handle out-of-range MMIO accesses

2014-09-11 Thread Marc Zyngier

Now that we can (almost) dynamically size the number of interrupts,
we're facing an interesting issue:

We have to evaluate at runtime whether or not an access hits a valid
register, based on the sizing of this particular instance of the
distributor. Furthermore, the GIC spec says that accessing a reserved
register is RAZ/WI.

For this, add a new field to our range structure, indicating the number
of bits a single interrupts uses. That allows us to find out whether or
not the access is in range.

Reviewed-by: Christoffer Dall christoffer.d...@linaro.org
Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 include/kvm/arm_vgic.h |  3 ++-
 virt/kvm/arm/vgic.c| 56 --
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 3900e31..97f5f57 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -32,6 +32,7 @@
 
 #define VGIC_V2_MAX_LRS(1  6)
 #define VGIC_V3_MAX_LRS16
+#define VGIC_MAX_IRQS  1024
 
 /* Sanity checks... */
 #if (KVM_MAX_VCPUS  8)
@@ -42,7 +43,7 @@
 #error VGIC_NR_IRQS must be a multiple of 32
 #endif
 
-#if (VGIC_NR_IRQS  1024)
+#if (VGIC_NR_IRQS  VGIC_MAX_IRQS)
 #error VGIC_NR_IRQS must be = 1024
 #endif
 
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 7e6e64d..ab01cab 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -892,6 +892,7 @@ static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
 struct mmio_range {
phys_addr_t base;
unsigned long len;
+   int bits_per_irq;
bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
phys_addr_t offset);
 };
@@ -900,56 +901,67 @@ static const struct mmio_range vgic_dist_ranges[] = {
{
.base   = GIC_DIST_CTRL,
.len= 12,
+   .bits_per_irq   = 0,
.handle_mmio= handle_mmio_misc,
},
{
.base   = GIC_DIST_IGROUP,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_raz_wi,
},
{
.base   = GIC_DIST_ENABLE_SET,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_set_enable_reg,
},
{
.base   = GIC_DIST_ENABLE_CLEAR,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_clear_enable_reg,
},
{
.base   = GIC_DIST_PENDING_SET,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_set_pending_reg,
},
{
.base   = GIC_DIST_PENDING_CLEAR,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_clear_pending_reg,
},
{
.base   = GIC_DIST_ACTIVE_SET,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_raz_wi,
},
{
.base   = GIC_DIST_ACTIVE_CLEAR,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_raz_wi,
},
{
.base   = GIC_DIST_PRI,
-   .len= VGIC_NR_IRQS,
+   .len= VGIC_MAX_IRQS,
+   .bits_per_irq   = 8,
.handle_mmio= handle_mmio_priority_reg,
},
{
.base   = GIC_DIST_TARGET,
-   .len= VGIC_NR_IRQS,
+   .len= VGIC_MAX_IRQS,
+   .bits_per_irq   = 8,
.handle_mmio= handle_mmio_target_reg,
},
{
.base   = GIC_DIST_CONFIG,
-   .len= VGIC_NR_IRQS / 4,
+   .len= VGIC_MAX_IRQS / 4,
+   .bits_per_irq   = 2,
.handle_mmio= handle_mmio_cfg_reg,
},
{
@@ -987,6 +999,22 @@ struct mmio_range *find_matching_range(const struct 
mmio_range *ranges,
return NULL;
 }
 
+static bool vgic_validate_access(const struct vgic_dist *dist,
+

[PATCH v4 8/8] arm/arm64: KVM: vgic: make number of irqs a configurable attribute

2014-09-11 Thread Marc Zyngier

In order to make the number of interrupts configurable, use the new
fancy device management API to add KVM_DEV_ARM_VGIC_GRP_NR_IRQS as
a VGIC configurable attribute.

Userspace can now specify the exact size of the GIC (by increments
of 32 interrupts).

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 Documentation/virtual/kvm/devices/arm-vgic.txt | 10 +++
 arch/arm/include/uapi/asm/kvm.h|  1 +
 arch/arm64/include/uapi/asm/kvm.h  |  1 +
 virt/kvm/arm/vgic.c| 37 ++
 4 files changed, 49 insertions(+)

diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt 
b/Documentation/virtual/kvm/devices/arm-vgic.txt
index 7f4e91b..df8b0c7 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -71,3 +71,13 @@ Groups:
   Errors:
 -ENODEV: Getting or setting this register is not yet supported
 -EBUSY: One or more VCPUs are running
+
+  KVM_DEV_ARM_VGIC_GRP_NR_IRQS
+  Attributes:
+A value describing the number of interrupts (SGI, PPI and SPI) for
+this GIC instance, ranging from 64 to 1024, in increments of 32.
+
+  Errors:
+-EINVAL: Value set is out of the expected range
+-EBUSY: Value has already be set, or GIC has already been initialized
+with default values.
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index e6ebdd3..8b51c1a 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -173,6 +173,7 @@ struct kvm_arch_memory_slot {
 #define   KVM_DEV_ARM_VGIC_CPUID_MASK  (0xffULL  
KVM_DEV_ARM_VGIC_CPUID_SHIFT)
 #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT0
 #define   KVM_DEV_ARM_VGIC_OFFSET_MASK (0xULL  
KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS   3
 
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT 24
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index e633ff8..b5cd6ed 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -159,6 +159,7 @@ struct kvm_arch_memory_slot {
 #define   KVM_DEV_ARM_VGIC_CPUID_MASK  (0xffULL  
KVM_DEV_ARM_VGIC_CPUID_SHIFT)
 #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT0
 #define   KVM_DEV_ARM_VGIC_OFFSET_MASK (0xULL  
KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS   3
 
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT 24
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 9180823..744388d 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -2331,6 +2331,36 @@ static int vgic_set_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
 
return vgic_attr_regs_access(dev, attr, reg, true);
}
+   case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+   u32 __user *uaddr = (u32 __user *)(long)attr-addr;
+   u32 val;
+   int ret = 0;
+
+   if (get_user(val, uaddr))
+   return -EFAULT;
+
+   /*
+* We require:
+* - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
+* - at most 1024 interrupts
+* - a multiple of 32 interrupts
+*/
+   if (val  (VGIC_NR_PRIVATE_IRQS + 32) ||
+   val  VGIC_MAX_IRQS ||
+   (val  31))
+   return -EINVAL;
+
+   mutex_lock(dev-kvm-lock);
+
+   if (vgic_initialized(dev-kvm) || dev-kvm-arch.vgic.nr_irqs)
+   ret = -EBUSY;
+   else
+   dev-kvm-arch.vgic.nr_irqs = val;
+
+   mutex_unlock(dev-kvm-lock);
+
+   return ret;
+   }
 
}
 
@@ -2367,6 +2397,11 @@ static int vgic_get_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
r = put_user(reg, uaddr);
break;
}
+   case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+   u32 __user *uaddr = (u32 __user *)(long)attr-addr;
+   r = put_user(dev-kvm-arch.vgic.nr_irqs, uaddr);
+   break;
+   }
 
}
 
@@ -2403,6 +2438,8 @@ static int vgic_has_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
offset = attr-attr  KVM_DEV_ARM_VGIC_OFFSET_MASK;
return vgic_has_attr_regs(vgic_cpu_ranges, offset);
+   case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+   return 0;
}
return -ENXIO;
 }
-- 
2.0.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 6/8] arm/arm64: KVM: vgic: kill VGIC_NR_IRQS

2014-09-11 Thread Marc Zyngier

Nuke VGIC_NR_IRQS entierly, now that the distributor instance
contains the number of IRQ allocated to this GIC.

Also add VGIC_NR_IRQS_LEGACY to preserve the current API.

Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 include/kvm/arm_vgic.h |  6 +++---
 virt/kvm/arm/vgic.c| 17 +++--
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 97f5f57..0a27564 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -25,7 +25,7 @@
 #include linux/spinlock.h
 #include linux/types.h
 
-#define VGIC_NR_IRQS   256
+#define VGIC_NR_IRQS_LEGACY256
 #define VGIC_NR_SGIS   16
 #define VGIC_NR_PPIS   16
 #define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
@@ -39,11 +39,11 @@
 #error Invalid number of CPU interfaces
 #endif
 
-#if (VGIC_NR_IRQS  31)
+#if (VGIC_NR_IRQS_LEGACY  31)
 #error VGIC_NR_IRQS must be a multiple of 32
 #endif
 
-#if (VGIC_NR_IRQS  VGIC_MAX_IRQS)
+#if (VGIC_NR_IRQS_LEGACY  VGIC_MAX_IRQS)
 #error VGIC_NR_IRQS must be = 1024
 #endif
 
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index ab01cab..dfa6430 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -436,7 +436,7 @@ static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
 
case 4: /* GICD_TYPER */
reg  = (atomic_read(vcpu-kvm-online_vcpus) - 1)  5;
-   reg |= (VGIC_NR_IRQS  5) - 1;
+   reg |= (vcpu-kvm-arch.vgic.nr_irqs  5) - 1;
vgic_reg_access(mmio, reg, word_offset,
ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
break;
@@ -1274,13 +1274,14 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu 
*vcpu)
 static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
 {
struct vgic_cpu *vgic_cpu = vcpu-arch.vgic_cpu;
+   struct vgic_dist *dist = vcpu-kvm-arch.vgic;
struct vgic_lr vlr;
int lr;
 
/* Sanitize the input... */
BUG_ON(sgi_source_id  ~7);
BUG_ON(sgi_source_id  irq = VGIC_NR_SGIS);
-   BUG_ON(irq = VGIC_NR_IRQS);
+   BUG_ON(irq = dist-nr_irqs);
 
kvm_debug(Queue IRQ%d\n, irq);
 
@@ -1516,7 +1517,7 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 
vlr = vgic_get_lr(vcpu, lr);
 
-   BUG_ON(vlr.irq = VGIC_NR_IRQS);
+   BUG_ON(vlr.irq = dist-nr_irqs);
vgic_cpu-vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
}
 
@@ -1738,7 +1739,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
if (vcpu-vcpu_id = dist-nr_cpus)
return -EBUSY;
 
-   for (i = 0; i  VGIC_NR_IRQS; i++) {
+   for (i = 0; i  dist-nr_irqs; i++) {
if (i  VGIC_NR_PPIS)
vgic_bitmap_set_irq_val(dist-irq_enabled,
vcpu-vcpu_id, i, 1);
@@ -1880,7 +1881,11 @@ static int vgic_init_maps(struct kvm *kvm)
int ret, i;
 
nr_cpus = dist-nr_cpus = KVM_MAX_VCPUS;
-   nr_irqs = dist-nr_irqs = VGIC_NR_IRQS;
+
+   if (!dist-nr_irqs)
+   dist-nr_irqs = VGIC_NR_IRQS_LEGACY;
+
+   nr_irqs = dist-nr_irqs;
 
ret  = vgic_init_bitmap(dist-irq_enabled, nr_cpus, nr_irqs);
ret |= vgic_init_bitmap(dist-irq_level, nr_cpus, nr_irqs);
@@ -1964,7 +1969,7 @@ int kvm_vgic_init(struct kvm *kvm)
goto out;
}
 
-   for (i = VGIC_NR_PRIVATE_IRQS; i  VGIC_NR_IRQS; i += 4)
+   for (i = VGIC_NR_PRIVATE_IRQS; i  kvm-arch.vgic.nr_irqs; i += 4)
vgic_set_target_reg(kvm, 0, i);
 
kvm-arch.vgic.ready = true;
-- 
2.0.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 7/8] arm/arm64: KVM: vgic: delay vgic allocation until init time

2014-09-11 Thread Marc Zyngier

It is now quite easy to delay the allocation of the vgic tables
until we actually require it to be up and running (when the first
vcpu is kicking around, or someones tries to access the GIC registers).

This allow us to allocate memory for the exact number of CPUs we
have. As nobody configures the number of interrupts just yet,
use a fallback to VGIC_NR_IRQS_LEGACY.

Reviewed-by: Christoffer Dall christoffer.d...@linaro.org
Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 arch/arm/kvm/arm.c |  7 ---
 include/kvm/arm_vgic.h |  1 -
 virt/kvm/arm/vgic.c| 42 +-
 3 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 923a01d..7d5065e 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -271,16 +271,9 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
-   int ret;
-
/* Force users to call KVM_ARM_VCPU_INIT */
vcpu-arch.target = -1;
 
-   /* Set up VGIC */
-   ret = kvm_vgic_vcpu_init(vcpu);
-   if (ret)
-   return ret;
-
/* Set up the timer */
kvm_timer_vcpu_init(vcpu);
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 0a27564..73cbb61 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -277,7 +277,6 @@ int kvm_vgic_hyp_init(void);
 int kvm_vgic_init(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm);
 void kvm_vgic_destroy(struct kvm *kvm);
-int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index dfa6430..9180823 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1730,15 +1730,12 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, 
int nr_irqs)
  * Initialize the vgic_cpu struct and vgic_dist struct fields pertaining to
  * this vcpu and enable the VGIC for this VCPU
  */
-int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
 {
struct vgic_cpu *vgic_cpu = vcpu-arch.vgic_cpu;
struct vgic_dist *dist = vcpu-kvm-arch.vgic;
int i;
 
-   if (vcpu-vcpu_id = dist-nr_cpus)
-   return -EBUSY;
-
for (i = 0; i  dist-nr_irqs; i++) {
if (i  VGIC_NR_PPIS)
vgic_bitmap_set_irq_val(dist-irq_enabled,
@@ -1758,8 +1755,6 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
vgic_cpu-nr_lr = vgic-nr_lr;
 
vgic_enable(vcpu);
-
-   return 0;
 }
 
 static void vgic_init_maintenance_interrupt(void *info)
@@ -1880,8 +1875,17 @@ static int vgic_init_maps(struct kvm *kvm)
int nr_cpus, nr_irqs;
int ret, i;
 
-   nr_cpus = dist-nr_cpus = KVM_MAX_VCPUS;
+   if (dist-nr_cpus)  /* Already allocated */
+   return 0;
+
+   nr_cpus = dist-nr_cpus = atomic_read(kvm-online_vcpus);
+   if (!nr_cpus)   /* No vcpus? Can't be good... */
+   return -EINVAL;
 
+   /*
+* If nobody configured the number of interrupts, use the
+* legacy one.
+*/
if (!dist-nr_irqs)
dist-nr_irqs = VGIC_NR_IRQS_LEGACY;
 
@@ -1927,6 +1931,9 @@ static int vgic_init_maps(struct kvm *kvm)
}
}
 
+   for (i = VGIC_NR_PRIVATE_IRQS; i  dist-nr_irqs; i += 4)
+   vgic_set_target_reg(kvm, 0, i);
+
 out:
if (ret)
kvm_vgic_destroy(kvm);
@@ -1945,6 +1952,7 @@ out:
  */
 int kvm_vgic_init(struct kvm *kvm)
 {
+   struct kvm_vcpu *vcpu;
int ret = 0, i;
 
if (!irqchip_in_kernel(kvm))
@@ -1962,6 +1970,12 @@ int kvm_vgic_init(struct kvm *kvm)
goto out;
}
 
+   ret = vgic_init_maps(kvm);
+   if (ret) {
+   kvm_err(Unable to allocate maps\n);
+   goto out;
+   }
+
ret = kvm_phys_addr_ioremap(kvm, kvm-arch.vgic.vgic_cpu_base,
vgic-vcpu_base, KVM_VGIC_V2_CPU_SIZE);
if (ret) {
@@ -1969,11 +1983,13 @@ int kvm_vgic_init(struct kvm *kvm)
goto out;
}
 
-   for (i = VGIC_NR_PRIVATE_IRQS; i  kvm-arch.vgic.nr_irqs; i += 4)
-   vgic_set_target_reg(kvm, 0, i);
+   kvm_for_each_vcpu(i, vcpu, kvm)
+   kvm_vgic_vcpu_init(vcpu);
 
kvm-arch.vgic.ready = true;
 out:
+   if (ret)
+   kvm_vgic_destroy(kvm);
mutex_unlock(kvm-lock);
return ret;
 }
@@ -2014,10 +2030,6 @@ int kvm_vgic_create(struct kvm *kvm)
kvm-arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
kvm-arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
 
-   ret = vgic_init_maps(kvm);
-   if (ret)
-   kvm_err(Unable to allocate maps\n);
-
 out_unlock:
for (; vcpu_lock_idx = 0;

[PATCH v4 4/8] arm/arm64: KVM: vgic: kill VGIC_MAX_CPUS

2014-09-11 Thread Marc Zyngier

We now have the information about the number of CPU interfaces in
the distributor itself. Let's get rid of VGIC_MAX_CPUS, and just
rely on KVM_MAX_VCPUS where we don't have the choice. Yet.

Reviewed-by: Christoffer Dall christoffer.d...@linaro.org
Signed-off-by: Marc Zyngier marc.zyng...@arm.com
---
 include/kvm/arm_vgic.h | 3 +--
 virt/kvm/arm/vgic.c| 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index bdeb451..3900e31 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -29,13 +29,12 @@
 #define VGIC_NR_SGIS   16
 #define VGIC_NR_PPIS   16
 #define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
-#define VGIC_MAX_CPUS  KVM_MAX_VCPUS
 
 #define VGIC_V2_MAX_LRS(1  6)
 #define VGIC_V3_MAX_LRS16
 
 /* Sanity checks... */
-#if (VGIC_MAX_CPUS  8)
+#if (KVM_MAX_VCPUS  8)
 #error Invalid number of CPU interfaces
 #endif
 
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 93fe73b..7e6e64d 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1294,7 +1294,7 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq)
 
sources = *vgic_get_sgi_sources(dist, vcpu_id, irq);
 
-   for_each_set_bit(c, sources, VGIC_MAX_CPUS) {
+   for_each_set_bit(c, sources, dist-nr_cpus) {
if (vgic_queue_irq(vcpu, c, irq))
clear_bit(c, sources);
}
@@ -1701,7 +1701,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
struct vgic_dist *dist = vcpu-kvm-arch.vgic;
int i;
 
-   if (vcpu-vcpu_id = VGIC_MAX_CPUS)
+   if (vcpu-vcpu_id = dist-nr_cpus)
return -EBUSY;
 
for (i = 0; i  VGIC_NR_IRQS; i++) {
@@ -1845,7 +1845,7 @@ static int vgic_init_maps(struct kvm *kvm)
int nr_cpus, nr_irqs;
int ret, i;
 
-   nr_cpus = dist-nr_cpus = VGIC_MAX_CPUS;
+   nr_cpus = dist-nr_cpus = KVM_MAX_VCPUS;
nr_irqs = dist-nr_irqs = VGIC_NR_IRQS;
 
ret  = vgic_init_bitmap(dist-irq_enabled, nr_cpus, nr_irqs);
-- 
2.0.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 12:47:16PM +0200, Paolo Bonzini wrote:
 Il 11/09/2014 12:12, Gleb Natapov ha scritto:
  On Thu, Sep 11, 2014 at 11:21:49AM +0200, Paolo Bonzini wrote:
  Il 11/09/2014 07:38, Tang Chen ha scritto:
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index 63c4c3e..da6d55d 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -7093,6 +7093,11 @@ static void vmx_set_virtual_x2apic_mode(struct 
  kvm_vcpu *vcpu, bool set)
vmx_set_msr_bitmap(vcpu);
   }
   
  +static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)
  +{
  + vmcs_write64(APIC_ACCESS_ADDR, hpa);
 
  This has to be guarded by if (!is_guest_mode(vcpu)).
 
  We do need to write it if L1 and L2 share APIC_ACCESS_ADDR and skip
  it otherwise, no?
 
 Yes, but this would be handled by patch 6:
 
   } else if (vm_need_virtualize_apic_accesses(vmx-vcpu.kvm)) {
 + struct page *page = gfn_to_page(vmx-vcpu.kvm,
 + APIC_DEFAULT_PHYS_BASE  PAGE_SHIFT);
   exec_control |=
   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 - vmcs_write64(APIC_ACCESS_ADDR,
 - page_to_phys(vcpu-kvm-arch.apic_access_page));
 + vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
 + /*
 +  * Do not pin apic access page in memory so that memory
 +  * hotplug process is able to migrate it.
 +  */
 + put_page(page);
   }
This code is in prepare_vmcs02() and is executed during L1-L2 vmentry. What 
happens
when apic access page is migrated while L2 is running? It needs to be update 
somewhere.

 
 However, this is also useless code duplication because the above snippet could
 reuse vcpu_reload_apic_access_page too.
 
 So I think you cannot do the is_guest_mode check in
 kvm_vcpu_reload_apic_access_page and also not in
 vmx_reload_apic_access_page.  But you could do something like
 
 kvm_vcpu_reload_apic_access_page(...)
 {
   ...
   kvm_x86_ops-reload_apic_access_page(...);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
 
 /* used in vcpu_enter_guest only */
 vcpu_reload_apic_access_page(...)
 {
   if (!is_guest_mode(vcpu))
   kvm_vcpu_reload_apic_access_page(...)
 }
 
 Paolo

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes

2014-09-11 Thread Rusty Russell

Amos Kong ak...@redhat.com writes:
 When I check hwrng attributes in sysfs, cat process always gets
 stuck if guest has only 1 vcpu and uses a slow rng backend.

 Currently we check if there is any tasks waiting to be run on
 current cpu in rng_dev_read() by need_resched(). But need_resched()
 doesn't work because rng_dev_read() is executing in user context.

I don't understand this explanation?  I'd expect the sysfs process to be
woken by the mutex_unlock().

If we're really high priority (vs. the sysfs process) then I can see why
we'd need schedule_timeout_interruptible() instead of just schedule(),
and in that case, need_resched() would be false too.

You could argue that's intended behaviour, but I can't see how it
happens in the normal case anyway.

What am I missing?

Thanks,
Rusty.

 This patch removed need_resched() and increase delay to 10 jiffies,
 then other tasks can have chance to execute protected code.
 Delaying 1 jiffy also works, but 10 jiffies is safer.

 Signed-off-by: Amos Kong ak...@redhat.com
 ---
  drivers/char/hw_random/core.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

 diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
 index c591d7e..b5d1b6f 100644
 --- a/drivers/char/hw_random/core.c
 +++ b/drivers/char/hw_random/core.c
 @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char 
 __user *buf,
  
   mutex_unlock(rng_mutex);
  
 - if (need_resched())
 - schedule_timeout_interruptible(1);
 + schedule_timeout_interruptible(10);
  
   if (signal_pending(current)) {
   err = -ERESTARTSYS;
 -- 
 1.9.3
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Howto connect to a terminal in an emalated linux-livecd?

2014-09-11 Thread Oliver Rath

Hi list,

after hours for searching in google-world, i didnt find any appropriate
for this problem:

I want to boot a live-cd (i.e. ubuntu 14.04.1-desktop) in qemu, which
starts with an graphical interface, done i.e. by

qemu-system-x86_64 -m 3G -smp 2 -drive
file=ubuntu-14.04.1-desktop-i386.iso,media=cdrom,if=virtio --enable-kvm

Now i want to access to the console of the ubuntu-livecd. At the moment
i can do this over changing to text mode via

sendkey ctrl-alt-f1

in qemu-console (Alt-2), then switching back to qemu-window (alt-1). Now
i have access to tty1 of my livecd.

But IMHO there should be a more simple way to access to such a console
with qemu, i.e. through a pipe, a serial console etc., but i didnt found
anything working. The best i got was with -chardev pty,id=myid, which
resulted in a char device redirected to /dev/pts/0 (label myid).  But
with a screen /dev/pts/0 i wasnt able to see any input or output.

ssh is unfortunatly not available at this time on the livecd (so i could
connect i.e. via -net user,hostfwd:tcp:10022-:22)

Any hints to connect directly to a console in an emulated linux?

Tfh!
Oliver


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Eric Auger

On 09/11/2014 07:05 AM, Alex Williamson wrote:
 On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
 On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
 This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.

 This is a new control channel which enables KVM to cooperate with
 viable VFIO devices.

 The kvm-vfio device now holds a list of devices (kvm_vfio_device)
 in addition to a list of groups (kvm_vfio_group). The new
 infrastructure enables to check the validity of the VFIO device
 file descriptor, get and hold a reference to it.

 The first concrete implemented command is IRQ forward control:
 KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.

 It consists in programing the VFIO driver and KVM in a consistent manner
 so that an optimized IRQ injection/completion is set up. Each
 kvm_vfio_device holds a list of forwarded IRQ. When putting a
 kvm_vfio_device, the implementation makes sure the forwarded IRQs
 are set again in the normal handling state (non forwarded).

 'putting a kvm_vfio_device' sounds to like you're golf'ing :)

 When a kvm_vfio_device is released?


 The forwarding programmming is architecture specific, embodied by the
 kvm_arch_set_fwd_state function. Its implementation is given in a
 separate patch file.

 I would drop the last sentence and instead indicate that this is handled
 properly when the architecture does not support such a feature.


 The forwarding control modality is enabled by the
 __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.

 Signed-off-by: Eric Auger eric.au...@linaro.org

 ---

 v1 - v2:
 - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
 - original patch file separated into 2 parts: generic part moved in vfio.c
   and ARM specific part(kvm_arch_set_fwd_state)
 ---
  include/linux/kvm_host.h |  27 +++
  virt/kvm/vfio.c  | 452 
 ++-
  2 files changed, 477 insertions(+), 2 deletions(-)

 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
 index a4c33b3..24350dc 100644
 --- a/include/linux/kvm_host.h
 +++ b/include/linux/kvm_host.h
 @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
   unsigned long arg);
  };
  
 +enum kvm_fwd_irq_action {
 +   KVM_VFIO_IRQ_SET_FORWARD,
 +   KVM_VFIO_IRQ_SET_NORMAL,
 +   KVM_VFIO_IRQ_CLEANUP,

 This is KVM internal API, so it would probably be good to document this.
 Especially the CLEANUP bit worries me, see below.
 
 This also doesn't match the user API, which is simply FORWARD/UNFORWARD.
Hi Alex,

will change that.
 Extra states worry me too.

I tried to explained the 2 motivations behind. Please let me know if it
makes sense.
 
 +};
 +
 +/* internal structure describing a forwarded IRQ */
 +struct kvm_fwd_irq {
 +   struct list_head link;

 this list entry is local to the kvm vfio device, right? that means you
 probably want a struct with just the below fields, and then have a
 containing struct in the generic device file, private to it's logic.
 
 Yes, this is part of the abstraction problem.
OK will fix that.
 
 +   __u32 index; /* platform device irq index */
 
 This is a vfio_device irq_index, but vfio_devices support indexes and
 sub-indexes.  At this level the API should match vfio, not the specifics
 of platform devices not supporting sub-index.
I will add sub-indexes then.
 
 +   __u32 hwirq; /*physical IRQ */
 +   __u32 gsi; /* virtual IRQ */
 +   struct kvm_vcpu *vcpu; /* vcpu to inject into*/
 
 Not sure I understand why vcpu is necessary.
vcpu is used when providing the physical IRQ/virtual IRQ mapping to the
virtual GIC. I can remove it from and add a vcpu struct * param to
kvm_arch_set_fwd_state if you prefer.

  Also I see a 'get' in the code below, but not a 'put'.
Sorry I do not understand your comment here? What 'get' do you mention?
 
 +};
 +
  void kvm_device_get(struct kvm_device *dev);
  void kvm_device_put(struct kvm_device *dev);
  struct kvm_device *kvm_device_from_filp(struct file *filp);
 @@ -1075,6 +1090,18 @@ extern struct kvm_device_ops kvm_vfio_ops;
  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
  extern struct kvm_device_ops kvm_flic_ops;
  
 +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
 +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,

 what's the 'p' in pfwd?
 
 p is for pointer?
yes it was ;-)
 
 +  enum kvm_fwd_irq_action action);
 +
 +#else
 +static inline int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
 +enum kvm_fwd_irq_action action)
 +{
 +   return 0;
 +}
 +#endif
 +
  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
  
  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool 
 val)
 diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
 index 76dc7a1..e4a81c4 100644
 --- a/virt/kvm/vfio.c
 +++ b/virt/kvm/vfio.c
 @@ -18,14 +18,24 @@
  #include linux/slab.h
  #include linux/uaccess.h
  #include linux/vfio.h
 +#include linux/platform_device.h
  
  struct kvm_vfio_group {
 struct

Re: [Qemu-devel] [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast

2014-09-11 Thread Zhang Haoyu

 Subject: [Qemu-devel] [PATCH] kvm: ioapic: conditionally delay irq delivery
 duringeoi broadcast
 
 Currently, we call ioapic_service() immediately when we find the irq is still
 active during eoi broadcast. But for real hardware, there's some dealy 
 between
 the EOI writing and irq delivery (system bus latency?). So we need to emulate
 this behavior. Otherwise, for a guest who haven't register a proper irq 
 handler
 , it would stay in the interrupt routine as this irq would be re-injected
 immediately after guest enables interrupt. This would lead guest can't move
 forward and may miss the possibility to get proper irq handler registered 
 (one
 example is windows guest resuming from hibernation).
 
 As there's no way to differ the unhandled irq from new raised ones, this 
 patch
 solve this problems by scheduling a delayed work when the count of irq 
 injected
 during eoi broadcast exceeds a threshold value. After this patch, the guest 
 can
 move a little forward when there's no suitable irq handler in case it may
 register one very soon and for guest who has a bad irq detection routine ( 
 such
 as note_interrupt() in linux ), this bad irq would be recognized soon as in 
 the
 past.
 
 Cc: Michael S. Tsirkin m...@redhat.com
 Signed-off-by: Jason Wang jasow...@redhat.com
 Signed-off-by: Zhang Haoyu zhan...@sangfor.com
 ---
  include/trace/events/kvm.h | 20 +++
  virt/kvm/ioapic.c  | 50
 --
  virt/kvm/ioapic.h  |  6 ++
  3 files changed, 74 insertions(+), 2 deletions(-)
 
If this is a new version, please add a v2/v3 suffix and describe the changes at
those different versions .

You can get more information from:
http://wiki.qemu.org/Contribute/SubmitAPatch

Okay, thanks.

Best regards,
-Gonglei

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 13:30, Gleb Natapov ha scritto:
  +  vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
  +  /*
  +   * Do not pin apic access page in memory so that memory
  +   * hotplug process is able to migrate it.
  +   */
  +  put_page(page);
 }
 This code is in prepare_vmcs02() and is executed during L1-L2 vmentry. What 
 happens
 when apic access page is migrated while L2 is running? It needs to be update 
 somewhere.

Before it is migrated, the MMU notifier is called and will force a
vmexit on all CPUs.  The reload code will call GUP again on the page
again and swap it in.

Paolo

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] KVM: PPC: Convert openpic lock to raw_spinlock

2014-09-11 Thread Bogdan Purcareata

This patch enables running intensive I/O workloads, e.g. netperf, in a guest
deployed on a RT host. No change for !RT kernels.

The openpic spinlock becomes a sleeping mutex on a RT system. This no longer
guarantees that EPR is atomic with exception delivery. The guest VCPU thread
fails due to a BUG_ON(preemptible()) when running netperf.

In order to make the kvmppc_mpic_set_epr() call safe on RT from non-atomic
context, convert the openpic lock to a raw_spinlock. A similar approach can
be seen for x86 platforms in the following commit [1].

Here are some comparative cyclitest measurements run inside a high priority RT
guest run on a RT host. The guest has 1 VCPU and the test has been run for 15
minutes. The guest runs ~750 hackbench processes as background stress.

  spinlock  raw_spinlock
Min latency (us)  4 4
Avg latency (us)  1519
Max latency (us)  7062

[1] https://lkml.org/lkml/2010/1/11/289

Signed-off-by: Bogdan Purcareata bogdan.purcare...@freescale.com
---
 arch/powerpc/kvm/mpic.c |   44 ++--
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 2861ae9..309036c 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -194,7 +194,7 @@ struct openpic {
int num_mmio_regions;
 
gpa_t reg_base;
-   spinlock_t lock;
+   raw_spinlock_t lock;
 
/* Behavior control */
struct fsl_mpic_info *fsl;
@@ -1105,9 +1105,9 @@ static int openpic_cpu_write_internal(void *opaque, gpa_t 
addr,
mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
}
 
-   spin_unlock(opp-lock);
+   raw_spin_unlock(opp-lock);
kvm_notify_acked_irq(opp-kvm, 0, notify_eoi);
-   spin_lock(opp-lock);
+   raw_spin_lock(opp-lock);
 
break;
}
@@ -1182,12 +1182,12 @@ void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
int cpu = vcpu-arch.irq_cpu_id;
unsigned long flags;
 
-   spin_lock_irqsave(opp-lock, flags);
+   raw_spin_lock_irqsave(opp-lock, flags);
 
if ((opp-gcr  opp-mpic_mode_mask) == GCR_MODE_PROXY)
kvmppc_set_epr(vcpu, openpic_iack(opp, opp-dst[cpu], cpu));
 
-   spin_unlock_irqrestore(opp-lock, flags);
+   raw_spin_unlock_irqrestore(opp-lock, flags);
 }
 
 static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
@@ -1387,9 +1387,9 @@ static int kvm_mpic_read(struct kvm_io_device *this, 
gpa_t addr,
return -EINVAL;
}
 
-   spin_lock_irq(opp-lock);
+   raw_spin_lock_irq(opp-lock);
ret = kvm_mpic_read_internal(opp, addr - opp-reg_base, u.val);
-   spin_unlock_irq(opp-lock);
+   raw_spin_unlock_irq(opp-lock);
 
/*
 * Technically only 32-bit accesses are allowed, but be nice to
@@ -1427,10 +1427,10 @@ static int kvm_mpic_write(struct kvm_io_device *this, 
gpa_t addr,
return -EOPNOTSUPP;
}
 
-   spin_lock_irq(opp-lock);
+   raw_spin_lock_irq(opp-lock);
ret = kvm_mpic_write_internal(opp, addr - opp-reg_base,
  *(const u32 *)ptr);
-   spin_unlock_irq(opp-lock);
+   raw_spin_unlock_irq(opp-lock);
 
pr_debug(%s: addr %llx ret %d val %x\n,
 __func__, addr, ret, *(const u32 *)ptr);
@@ -1501,14 +1501,14 @@ static int access_reg(struct openpic *opp, gpa_t addr, 
u32 *val, int type)
if (addr  3)
return -ENXIO;
 
-   spin_lock_irq(opp-lock);
+   raw_spin_lock_irq(opp-lock);
 
if (type == ATTR_SET)
ret = kvm_mpic_write_internal(opp, addr, *val);
else
ret = kvm_mpic_read_internal(opp, addr, val);
 
-   spin_unlock_irq(opp-lock);
+   raw_spin_unlock_irq(opp-lock);
 
pr_debug(%s: type %d addr %llx val %x\n, __func__, type, addr, *val);
 
@@ -1545,9 +1545,9 @@ static int mpic_set_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
if (attr32 != 0  attr32 != 1)
return -EINVAL;
 
-   spin_lock_irq(opp-lock);
+   raw_spin_lock_irq(opp-lock);
openpic_set_irq(opp, attr-attr, attr32);
-   spin_unlock_irq(opp-lock);
+   raw_spin_unlock_irq(opp-lock);
return 0;
}
 
@@ -1592,9 +1592,9 @@ static int mpic_get_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
if (attr-attr  MAX_SRC)
return -EINVAL;
 
-   spin_lock_irq(opp-lock);
+   raw_spin_lock_irq(opp-lock);
attr32 = opp-src[attr-attr].pending;
-   spin_unlock_irq(opp-lock);
+   raw_spin_unlock_irq(opp-lock);
 
if (put_user(attr32, (u32 __user *)(long)attr-addr))
return -EFAULT;
@@ -1669,7 +1669,7

[PATCH v2] kvm: ioapic: conditionally delay irq delivery duringeoi broadcast

2014-09-11 Thread Zhang Haoyu

Currently, we call ioapic_service() immediately when we find the irq is still
active during eoi broadcast. But for real hardware, there's some dealy between
the EOI writing and irq delivery (system bus latency?). So we need to emulate
this behavior. Otherwise, for a guest who haven't register a proper irq handler
, it would stay in the interrupt routine as this irq would be re-injected
immediately after guest enables interrupt. This would lead guest can't move
forward and may miss the possibility to get proper irq handler registered (one
example is windows guest resuming from hibernation).

As there's no way to differ the unhandled irq from new raised ones, this patch
solve this problems by scheduling a delayed work when the count of irq injected
during eoi broadcast exceeds a threshold value. After this patch, the guest can
move a little forward when there's no suitable irq handler in case it may
register one very soon and for guest who has a bad irq detection routine ( such
as note_interrupt() in linux ), this bad irq would be recognized soon as in the
past.

v1 - v2:
  - delete the TODO comment
  - adjust the coding style to kernel style
  - add the missing } for if (ioapic-irq_eoi[i] == 
IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {

Cc: Michael S. Tsirkin m...@redhat.com
Cc: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Jason Wang jasow...@redhat.com
Signed-off-by: Zhang Haoyu zhan...@sangfor.com
---
 include/trace/events/kvm.h | 20 +++
 virt/kvm/ioapic.c  | 50 --
 virt/kvm/ioapic.h  |  6 ++
 3 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 908925a..ab679c3 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
  __entry-coalesced ?  (coalesced) : )
 );
 
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+   TP_PROTO(__u64 e),
+   TP_ARGS(e),
+
+   TP_STRUCT__entry(
+   __field(__u64,  e   )
+   ),
+
+   TP_fast_assign(
+   __entry-e  = e;
+   ),
+
+   TP_printk(dst %x vec=%u (%s|%s|%s%s),
+ (u8)(__entry-e  56), (u8)__entry-e,
+ __print_symbolic((__entry-e  8  0x7), kvm_deliver_mode),
+ (__entry-e  (111)) ? logical : physical,
+ (__entry-e  (115)) ? level : edge,
+ (__entry-e  (116)) ? |masked : )
+);
+
 TRACE_EVENT(kvm_msi_set_irq,
TP_PROTO(__u64 address, __u64 data),
TP_ARGS(address, data),
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index e8ce34c..8e1dc67 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int 
irq_source_id)
spin_unlock(ioapic-lock);
 }
 
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+   int i;
+   struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+eoi_inject.work);
+   spin_lock(ioapic-lock);
+   for (i = 0; i  IOAPIC_NUM_PINS; i++) {
+   union kvm_ioapic_redirect_entry *ent = ioapic-redirtbl[i];
+
+   if (ent-fields.trig_mode != IOAPIC_LEVEL_TRIG)
+   continue;
+
+   if (ioapic-irr  (1  i)  !ent-fields.remote_irr)
+   ioapic_service(ioapic, i, false);
+   }
+   spin_unlock(ioapic-lock);
+}
+
 static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
@@ -435,8 +453,32 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 
ASSERT(ent-fields.trig_mode == IOAPIC_LEVEL_TRIG);
ent-fields.remote_irr = 0;
-   if (ioapic-irr  (1  i))
-   ioapic_service(ioapic, i, false);
+   if (!ent-fields.mask  (ioapic-irr  (1  i))) {
+   ++ioapic-irq_eoi[i];
+   if (ioapic-irq_eoi[i] == 
IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+   /*
+* Real hardware does not deliver the irq so
+* immediately during eoi broadcast, so we need
+* to emulate this behavior. Otherwise, for
+* guests who has not registered handler of a
+* level irq, this irq would be injected
+* immediately after guest enables interrupt
+* (which happens usually at the end of the
+* common interrupt routine). This would lead
+* guest can't move forward and may miss the
+* possibility to get proper irq

Re: [Qemu-devel] QEMU with KVM does not start Win8 on kernel 3.4.67 and core2duo

2014-09-11 Thread Erik Rull

 On August 6, 2014 at 1:19 PM Erik Rull erik.r...@rdsoftware.de wrote:


 Hi all,

 I did already several tests and I'm not completely sure what's going wrong,
 but
 here my scenario:

 When I start up QEMU w/ KVM 1.7.0 on a Core2Duo machine running a vanilla
 kernel
 3.4.67 to run a Windows 8.0 guest, the guest freezes at boot without any
 error.
 When I dump the CPU registers via info registers, nothing changes, that
 means
 the system really stalled. Same happens with QEMU 2.0.0.

 But - when I run the very same guest using Kernel 2.6.32.12 and QEMU 1.7.0 on
 the host side it works on the Core2Duo. Also the system above but just with an
 i3 or i5 CPU it works, too.

 I already disabled networking and USB for the guest and changed the graphics
 card - no effect. I assume that some mean bits and bytes have to be set up
 properly to get the thing running.

 Any hint what to change / test would be really appreciated.

 Thanks in advance,

 Best regards,

 Erik


Hi all,

I opened a qemu bug report on that and Jan helped me creating a kvm trace. I
attached it to the bug report.
https://bugs.launchpad.net/qemu/+bug/1366836

If you have further questions, please let me know.

Thanks.

Best regards,

Erik
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] QEMU with KVM does not start Win8 on kernel 3.4.67 and core2duo

2014-09-11 Thread Jan Kiszka

On 2014-09-11 15:25, Erik Rull wrote:
 On August 6, 2014 at 1:19 PM Erik Rull erik.r...@rdsoftware.de wrote:


 Hi all,

 I did already several tests and I'm not completely sure what's going wrong,
 but
 here my scenario:

 When I start up QEMU w/ KVM 1.7.0 on a Core2Duo machine running a vanilla
 kernel
 3.4.67 to run a Windows 8.0 guest, the guest freezes at boot without any
 error.
 When I dump the CPU registers via info registers, nothing changes, that
 means
 the system really stalled. Same happens with QEMU 2.0.0.

 But - when I run the very same guest using Kernel 2.6.32.12 and QEMU 1.7.0 on
 the host side it works on the Core2Duo. Also the system above but just with 
 an
 i3 or i5 CPU it works, too.

 I already disabled networking and USB for the guest and changed the graphics
 card - no effect. I assume that some mean bits and bytes have to be set up
 properly to get the thing running.

 Any hint what to change / test would be really appreciated.

 Thanks in advance,

 Best regards,

 Erik

 
 Hi all,
 
 I opened a qemu bug report on that and Jan helped me creating a kvm trace. I
 attached it to the bug report.
 https://bugs.launchpad.net/qemu/+bug/1366836
 
 If you have further questions, please let me know.

File possibly truncated. Need at least 346583040, but file size is
133414912.

Does trace-cmd report work for you? Is your file larger?

Again, please also validate the behavior on latest next branch from kvm.git.

Jan

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 03:05:05PM +0200, Paolo Bonzini wrote:
 Il 11/09/2014 13:30, Gleb Natapov ha scritto:
   +vmcs_write64(APIC_ACCESS_ADDR, 
   page_to_phys(page));
   +/*
   + * Do not pin apic access page in memory so 
   that memory
   + * hotplug process is able to migrate it.
   + */
   +put_page(page);
}
  This code is in prepare_vmcs02() and is executed during L1-L2 vmentry. 
  What happens
  when apic access page is migrated while L2 is running? It needs to be 
  update somewhere.
 
 Before it is migrated, the MMU notifier is called and will force a
 vmexit on all CPUs.  The reload code will call GUP again on the page
 again and swap it in.
 
This is how it will work without if (!is_guest_mode(vcpu)). But,
unless I am missing something, with this check it will not work while
vcpu is in L2.

Suppose vmcs01-APIC_ACCESS_ADDR = 0xf000. During L2 entry
vmcs02-APIC_ACCESS_ADDR is set to 0xf000 too (by prepare_vmcs02). Now
0xf000 is migrated to 0x8000, mmu notifier is called, it forces vmexit,
but vcpu is in a guest mode so vmcs02-APIC_ACCESS_ADDR is never updated
to 0x8000 because of if (!is_guest_mode(vcpu)) check. So what am I
missing here?

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 15:59, Gleb Natapov ha scritto:
 
 Suppose vmcs01-APIC_ACCESS_ADDR = 0xf000. During L2 entry
 vmcs02-APIC_ACCESS_ADDR is set to 0xf000 too (by prepare_vmcs02). Now
 0xf000 is migrated to 0x8000, mmu notifier is called, it forces vmexit,
 but vcpu is in a guest mode so vmcs02-APIC_ACCESS_ADDR is never updated
 to 0x8000 because of if (!is_guest_mode(vcpu)) check. So what am I
 missing here?

Right, guest mode isn't left as soon as you execute nested_vmx_vmexit,
because this isn't an L2-L1 exit.  So we need an else for that if
(!is_guest_mode(vcpu)), in which case the hpa is ignored and
vmcs12-apic_access_addr is used instead?

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] QEMU with KVM does not start Win8 on kernel 3.4.67 and core2duo

2014-09-11 Thread Erik Rull

 On September 11, 2014 at 3:32 PM Jan Kiszka jan.kis...@siemens.com wrote:


 On 2014-09-11 15:25, Erik Rull wrote:
  On August 6, 2014 at 1:19 PM Erik Rull erik.r...@rdsoftware.de wrote:
 
 
  Hi all,
 
  I did already several tests and I'm not completely sure what's going wrong,
  but
  here my scenario:
 
  When I start up QEMU w/ KVM 1.7.0 on a Core2Duo machine running a vanilla
  kernel
  3.4.67 to run a Windows 8.0 guest, the guest freezes at boot without any
  error.
  When I dump the CPU registers via info registers, nothing changes, that
  means
  the system really stalled. Same happens with QEMU 2.0.0.
 
  But - when I run the very same guest using Kernel 2.6.32.12 and QEMU 1.7.0
  on
  the host side it works on the Core2Duo. Also the system above but just with
  an
  i3 or i5 CPU it works, too.
 
  I already disabled networking and USB for the guest and changed the
  graphics
  card - no effect. I assume that some mean bits and bytes have to be set up
  properly to get the thing running.
 
  Any hint what to change / test would be really appreciated.
 
  Thanks in advance,
 
  Best regards,
 
  Erik
 
 
  Hi all,
 
  I opened a qemu bug report on that and Jan helped me creating a kvm trace. I
  attached it to the bug report.
  https://bugs.launchpad.net/qemu/+bug/1366836
 
  If you have further questions, please let me know.

 File possibly truncated. Need at least 346583040, but file size is
 133414912.

 Does trace-cmd report work for you? Is your file larger?

 Again, please also validate the behavior on latest next branch from kvm.git.

 Jan


Sorry for the corrupt file, didn't work on my side as well. I re-captured the
trace (trace-cmd report worked!) and posted it on the bugtracker. The kvm.git
test is in progress, my build system is already busy.

Best regards,

Erik
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 16:21, Gleb Natapov ha scritto:
 As far as I can tell the if that is needed there is:
 
 if (!is_guest_mode() || !(vmcs12-secondary_vm_exec_control  
 ECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 write(PIC_ACCESS_ADDR)
 
 In other words if L2 shares L1 apic access page then reload, otherwise do 
 nothing.

What if the page being swapped out is L1's APIC access page?  We don't
run prepare_vmcs12 in that case because it's an L2-L0-L2 entry, so we
need to do something.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 04:06:58PM +0200, Paolo Bonzini wrote:
 Il 11/09/2014 15:59, Gleb Natapov ha scritto:
  
  Suppose vmcs01-APIC_ACCESS_ADDR = 0xf000. During L2 entry
  vmcs02-APIC_ACCESS_ADDR is set to 0xf000 too (by prepare_vmcs02). Now
  0xf000 is migrated to 0x8000, mmu notifier is called, it forces vmexit,
  but vcpu is in a guest mode so vmcs02-APIC_ACCESS_ADDR is never updated
  to 0x8000 because of if (!is_guest_mode(vcpu)) check. So what am I
  missing here?
 
 Right, guest mode isn't left as soon as you execute nested_vmx_vmexit,
 because this isn't an L2-L1 exit.  So we need an else for that if
 (!is_guest_mode(vcpu)), in which case the hpa is ignored and
 vmcs12-apic_access_addr is used instead?
 
As far as I can tell the if that is needed there is:

if (!is_guest_mode() || !(vmcs12-secondary_vm_exec_control  
ECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
write(PIC_ACCESS_ADDR)

In other words if L2 shares L1 apic access page then reload, otherwise do 
nothing.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 04:24:04PM +0200, Paolo Bonzini wrote:
 Il 11/09/2014 16:21, Gleb Natapov ha scritto:
  As far as I can tell the if that is needed there is:
  
  if (!is_guest_mode() || !(vmcs12-secondary_vm_exec_control  
  ECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
  write(PIC_ACCESS_ADDR)
  
  In other words if L2 shares L1 apic access page then reload, otherwise do 
  nothing.
 
 What if the page being swapped out is L1's APIC access page?  We don't
 run prepare_vmcs12 in that case because it's an L2-L0-L2 entry, so we
 need to do something.
We will do something on L2-L1 exit. We will call kvm_reload_apic_access_page().
That is what patch 5 of this series is doing.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 16:31, Gleb Natapov ha scritto:
  What if the page being swapped out is L1's APIC access page?  We don't
  run prepare_vmcs12 in that case because it's an L2-L0-L2 entry, so we
  need to do something.
 We will do something on L2-L1 exit. We will call 
 kvm_reload_apic_access_page().
 That is what patch 5 of this series is doing.

Sorry, I meant the APIC access page prepared by L1 for L2's execution.

You wrote:

 if (!is_guest_mode() || !(vmcs12-secondary_vm_exec_control  
 ECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 write(PIC_ACCESS_ADDR)
 
 In other words if L2 shares L1 apic access page then reload, otherwise do 
 nothing.

but in that case you have to redo nested_get_page, so do nothing
doesn't work.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 04:37:39PM +0200, Paolo Bonzini wrote:
 Il 11/09/2014 16:31, Gleb Natapov ha scritto:
   What if the page being swapped out is L1's APIC access page?  We don't
   run prepare_vmcs12 in that case because it's an L2-L0-L2 entry, so we
   need to do something.
  We will do something on L2-L1 exit. We will call 
  kvm_reload_apic_access_page().
  That is what patch 5 of this series is doing.
 
 Sorry, I meant the APIC access page prepared by L1 for L2's execution.
 
 You wrote:
 
  if (!is_guest_mode() || !(vmcs12-secondary_vm_exec_control  
  ECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
  write(PIC_ACCESS_ADDR)
  
  In other words if L2 shares L1 apic access page then reload, otherwise do 
  nothing.
 
 but in that case you have to redo nested_get_page, so do nothing
 doesn't work.
 
Ah, 7/7 is new in this submission. Before that this page was still
pinned.  Looking at 7/7 now I do not see how it can work since it has no
code for mmu notifier to detect that it deals with such page and call
kvm_reload_apic_access_page().  I said to Tang previously that nested
kvm has a bunch of pinned page that are hard to deal with and suggested
to iron out non nested case first :(

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 1/4] kvmtool: ARM: Use KVM_ARM_PREFERRED_TARGET vm ioctl to determine target cpu

2014-09-11 Thread Andre Przywara

Hi Anup,

On 08/09/14 09:17, Anup Patel wrote:
 Instead, of trying out each and every target type we should
 use KVM_ARM_PREFERRED_TARGET vm ioctl to determine target type
 for KVM ARM/ARM64.
 
 If KVM_ARM_PREFERRED_TARGET vm ioctl fails then we fallback to
 old method of trying all known target types.
 
 If KVM_ARM_PREFERRED_TARGET vm ioctl succeeds but the returned
 target type is not known to KVMTOOL then we forcefully init
 VCPU with target type returned by KVM_ARM_PREFERRED_TARGET vm ioctl.
 
 Signed-off-by: Pranavkumar Sawargaonkar pranavku...@linaro.org
 Signed-off-by: Anup Patel anup.pa...@linaro.org
 ---
  tools/kvm/arm/kvm-cpu.c |   52 
 +--
  1 file changed, 41 insertions(+), 11 deletions(-)
 
 diff --git a/tools/kvm/arm/kvm-cpu.c b/tools/kvm/arm/kvm-cpu.c
 index aeaa4cf..ba7a762 100644
 --- a/tools/kvm/arm/kvm-cpu.c
 +++ b/tools/kvm/arm/kvm-cpu.c
 @@ -33,7 +33,8 @@ struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, 
 unsigned long cpu_id)
   struct kvm_arm_target *target;
   struct kvm_cpu *vcpu;
   int coalesced_offset, mmap_size, err = -1;
 - unsigned int i;
 + unsigned int i, target_type;
 + struct kvm_vcpu_init preferred_init;
   struct kvm_vcpu_init vcpu_init = {
   .features = ARM_VCPU_FEATURE_FLAGS(kvm, cpu_id)
   };
 @@ -55,19 +56,47 @@ struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, 
 unsigned long cpu_id)
   if (vcpu-kvm_run == MAP_FAILED)
   die(unable to mmap vcpu fd);
  
 - /* Find an appropriate target CPU type. */
 - for (i = 0; i  ARRAY_SIZE(kvm_arm_targets); ++i) {
 - if (!kvm_arm_targets[i])
 - continue;
 - target = kvm_arm_targets[i];
 - vcpu_init.target = target-id;
 - err = ioctl(vcpu-vcpu_fd, KVM_ARM_VCPU_INIT, vcpu_init);
 - if (!err)
 - break;
 + /*
 +  * If preferred target ioctl successful then use preferred target
 +  * else try each and every target type.
 +  */
 + err = ioctl(kvm-vm_fd, KVM_ARM_PREFERRED_TARGET, preferred_init);
 + if (!err) {
 + /* Match preferred target CPU type. */
 + target = NULL;
 + for (i = 0; i  ARRAY_SIZE(kvm_arm_targets); ++i) {
 + if (!kvm_arm_targets[i])
 + continue;
 + if (kvm_arm_targets[i]-id == preferred_init.target) {
 + target = kvm_arm_targets[i];
 + target_type = kvm_arm_targets[i]-id;
 + break;
 + }
 + }
 + if (!target) {
 + target = kvm_arm_targets[0];

I think you missed the part of the patch which adds the now magic zero
member of kvm_arm_targets[]. A simple static initializer should work.

 + target_type = preferred_init.target;

Can't you move that out of the loop, in front of it actually? Then you
can get rid of the line above setting the target_type also, since you
always use the same value now, regardless whether you found that CPU in
the list or not.

 + }
 + } else {
 + /* Find an appropriate target CPU type. */
 + for (i = 0; i  ARRAY_SIZE(kvm_arm_targets); ++i) {
 + if (!kvm_arm_targets[i])
 + continue;
 + target = kvm_arm_targets[i];
 + target_type = target-id;
 + vcpu_init.target = target_type;
 + err = ioctl(vcpu-vcpu_fd, KVM_ARM_VCPU_INIT, 
 vcpu_init);
 + if (!err)
 + break;
 + }
 + if (err)
 + die(Unable to find matching target);
   }
  
 + vcpu_init.target = target_type;
 + err = ioctl(vcpu-vcpu_fd, KVM_ARM_VCPU_INIT, vcpu_init);

You should do this only in the if-branch above, since you (try to) call
KVM_ARM_VCPU_INIT already in the else branch before. Otherwise in the
latter case you would do it twice.

Regards,
Andre.

   if (err || target-init(vcpu))
 - die(Unable to initialise ARM vcpu);
 + die(Unable to initialise vcpu);
  
   coalesced_offset = ioctl(kvm-sys_fd, KVM_CHECK_EXTENSION,
KVM_CAP_COALESCED_MMIO);
 @@ -81,6 +110,7 @@ struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, 
 unsigned long cpu_id)
   vcpu-cpu_type  = target-id;
   vcpu-cpu_compatible= target-compatible;
   vcpu-is_running= true;
 +
   return vcpu;
  }
  
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Alex Williamson

On Thu, 2014-09-11 at 14:04 +0200, Eric Auger wrote:
 On 09/11/2014 07:05 AM, Alex Williamson wrote:
  On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
  On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
  This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.
 
  This is a new control channel which enables KVM to cooperate with
  viable VFIO devices.
 
  The kvm-vfio device now holds a list of devices (kvm_vfio_device)
  in addition to a list of groups (kvm_vfio_group). The new
  infrastructure enables to check the validity of the VFIO device
  file descriptor, get and hold a reference to it.
 
  The first concrete implemented command is IRQ forward control:
  KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.
 
  It consists in programing the VFIO driver and KVM in a consistent manner
  so that an optimized IRQ injection/completion is set up. Each
  kvm_vfio_device holds a list of forwarded IRQ. When putting a
  kvm_vfio_device, the implementation makes sure the forwarded IRQs
  are set again in the normal handling state (non forwarded).
 
  'putting a kvm_vfio_device' sounds to like you're golf'ing :)
 
  When a kvm_vfio_device is released?
 
 
  The forwarding programmming is architecture specific, embodied by the
  kvm_arch_set_fwd_state function. Its implementation is given in a
  separate patch file.
 
  I would drop the last sentence and instead indicate that this is handled
  properly when the architecture does not support such a feature.
 
 
  The forwarding control modality is enabled by the
  __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.
 
  Signed-off-by: Eric Auger eric.au...@linaro.org
 
  ---
 
  v1 - v2:
  - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
  - original patch file separated into 2 parts: generic part moved in vfio.c
and ARM specific part(kvm_arch_set_fwd_state)
  ---
   include/linux/kvm_host.h |  27 +++
   virt/kvm/vfio.c  | 452 
  ++-
   2 files changed, 477 insertions(+), 2 deletions(-)
 
  diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
  index a4c33b3..24350dc 100644
  --- a/include/linux/kvm_host.h
  +++ b/include/linux/kvm_host.h
  @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
  unsigned long arg);
   };
   
  +enum kvm_fwd_irq_action {
  + KVM_VFIO_IRQ_SET_FORWARD,
  + KVM_VFIO_IRQ_SET_NORMAL,
  + KVM_VFIO_IRQ_CLEANUP,
 
  This is KVM internal API, so it would probably be good to document this.
  Especially the CLEANUP bit worries me, see below.
  
  This also doesn't match the user API, which is simply FORWARD/UNFORWARD.
 Hi Alex,
 
 will change that.
  Extra states worry me too.
 
 I tried to explained the 2 motivations behind. Please let me know if it
 makes sense.

Not really.  It seems like it's just a leak of arch specific handling
out into common code.

  +};
  +
  +/* internal structure describing a forwarded IRQ */
  +struct kvm_fwd_irq {
  + struct list_head link;
 
  this list entry is local to the kvm vfio device, right? that means you
  probably want a struct with just the below fields, and then have a
  containing struct in the generic device file, private to it's logic.
  
  Yes, this is part of the abstraction problem.
 OK will fix that.
  
  + __u32 index; /* platform device irq index */
  
  This is a vfio_device irq_index, but vfio_devices support indexes and
  sub-indexes.  At this level the API should match vfio, not the specifics
  of platform devices not supporting sub-index.
 I will add sub-indexes then.
  
  + __u32 hwirq; /*physical IRQ */
  + __u32 gsi; /* virtual IRQ */
  + struct kvm_vcpu *vcpu; /* vcpu to inject into*/
  
  Not sure I understand why vcpu is necessary.
 vcpu is used when providing the physical IRQ/virtual IRQ mapping to the
 virtual GIC. I can remove it from and add a vcpu struct * param to
 kvm_arch_set_fwd_state if you prefer.

The kvm-vfio API for this interface doesn't allow the user to indicate
which vcpu to inject to.  On x86, it would be the programming of the
interrupt controller that would decide that.  In the code here we
arbitrarily pick vcpu0.  It feels both architecture specific and a bit
unspecified.

 
   Also I see a 'get' in the code below, but not a 'put'.
 Sorry I do not understand your comment here? What 'get' do you mention?

I suppose vcpus don't subscribe to the get/put philosophy, I was
expecting a reference count, but there is none.  How do we know that
vcpu pointer is still valid later?

  
  +};
  +
   void kvm_device_get(struct kvm_device *dev);
   void kvm_device_put(struct kvm_device *dev);
   struct kvm_device *kvm_device_from_filp(struct file *filp);
  @@ -1075,6 +1090,18 @@ extern struct kvm_device_ops kvm_vfio_ops;
   extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
   extern struct kvm_device_ops kvm_flic_ops;
   
  +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
  +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
 
  what's the 'p' in pfwd?

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Alex Williamson

On Thu, 2014-09-11 at 11:35 +0200, Eric Auger wrote:
 On 09/11/2014 05:10 AM, Christoffer Dall wrote:
  On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
  This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.
 
  This is a new control channel which enables KVM to cooperate with
  viable VFIO devices.
 
  The kvm-vfio device now holds a list of devices (kvm_vfio_device)
  in addition to a list of groups (kvm_vfio_group). The new
  infrastructure enables to check the validity of the VFIO device
  file descriptor, get and hold a reference to it.
 
  The first concrete implemented command is IRQ forward control:
  KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.
 
  It consists in programing the VFIO driver and KVM in a consistent manner
  so that an optimized IRQ injection/completion is set up. Each
  kvm_vfio_device holds a list of forwarded IRQ. When putting a
  kvm_vfio_device, the implementation makes sure the forwarded IRQs
  are set again in the normal handling state (non forwarded).
  
  'putting a kvm_vfio_device' sounds to like you're golf'ing :)
  
  When a kvm_vfio_device is released?
 sure
  
 
  The forwarding programmming is architecture specific, embodied by the
  kvm_arch_set_fwd_state function. Its implementation is given in a
  separate patch file.
  
  I would drop the last sentence and instead indicate that this is handled
  properly when the architecture does not support such a feature.
 ok
  
 
  The forwarding control modality is enabled by the
  __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.
 
  Signed-off-by: Eric Auger eric.au...@linaro.org
 
  ---
 
  v1 - v2:
  - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
  - original patch file separated into 2 parts: generic part moved in vfio.c
and ARM specific part(kvm_arch_set_fwd_state)
  ---
   include/linux/kvm_host.h |  27 +++
   virt/kvm/vfio.c  | 452 
  ++-
   2 files changed, 477 insertions(+), 2 deletions(-)
 
  diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
  index a4c33b3..24350dc 100644
  --- a/include/linux/kvm_host.h
  +++ b/include/linux/kvm_host.h
  @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
   unsigned long arg);
   };
   
  +enum kvm_fwd_irq_action {
  +  KVM_VFIO_IRQ_SET_FORWARD,
  +  KVM_VFIO_IRQ_SET_NORMAL,
  +  KVM_VFIO_IRQ_CLEANUP,
  
  This is KVM internal API, so it would probably be good to document this.
  Especially the CLEANUP bit worries me, see below.
 I will document it
  
  +};
  +
  +/* internal structure describing a forwarded IRQ */
  +struct kvm_fwd_irq {
  +  struct list_head link;
  
  this list entry is local to the kvm vfio device, right? that means you
  probably want a struct with just the below fields, and then have a
  containing struct in the generic device file, private to it's logic.
 I will introduce 2 separate structs
  
  +  __u32 index; /* platform device irq index */
  +  __u32 hwirq; /*physical IRQ */
  +  __u32 gsi; /* virtual IRQ */
  +  struct kvm_vcpu *vcpu; /* vcpu to inject into*/
  +};
  +
   void kvm_device_get(struct kvm_device *dev);
   void kvm_device_put(struct kvm_device *dev);
   struct kvm_device *kvm_device_from_filp(struct file *filp);
  @@ -1075,6 +1090,18 @@ extern struct kvm_device_ops kvm_vfio_ops;
   extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
   extern struct kvm_device_ops kvm_flic_ops;
   
  +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
  +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
  
  what's the 'p' in pfwd?
 will rename
  
  + enum kvm_fwd_irq_action action);
  +
  +#else
  +static inline int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
  +   enum kvm_fwd_irq_action action)
  +{
  +  return 0;
  +}
  +#endif
  +
   #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
   
   static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool 
  val)
  diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
  index 76dc7a1..e4a81c4 100644
  --- a/virt/kvm/vfio.c
  +++ b/virt/kvm/vfio.c
  @@ -18,14 +18,24 @@
   #include linux/slab.h
   #include linux/uaccess.h
   #include linux/vfio.h
  +#include linux/platform_device.h
   
   struct kvm_vfio_group {
 struct list_head node;
 struct vfio_group *vfio_group;
   };
   
  +struct kvm_vfio_device {
  +  struct list_head node;
  +  struct vfio_device *vfio_device;
  +  /* list of forwarded IRQs for that VFIO device */
  +  struct list_head fwd_irq_list;
  +  int fd;
  +};
  +
   struct kvm_vfio {
 struct list_head group_list;
  +  struct list_head device_list;
 struct mutex lock;
 bool noncoherent;
   };
  @@ -246,12 +256,441 @@ static int kvm_vfio_set_group(struct kvm_device 
  *dev, long attr, u64 arg)
 return -ENXIO;
   }
   
  +/**
  + * get_vfio_device - returns the vfio-device corresponding to this fd
  + * @fd:fd of the vfio platform device
  + *
  + * checks it is a vfio device
  + *

Re: [PATCH v3 2/4] kvmtool: ARM64: Add target type potenza for aarch64

2014-09-11 Thread Andre Przywara

Anup,

On 08/09/14 09:17, Anup Patel wrote:
 The VCPU target type KVM_ARM_TARGET_XGENE_POTENZA is available
 in latest Linux-3.16-rcX or higher hence register aarch64 target
 type for it.

 This patch enables us to run KVMTOOL on X-Gene Potenza host.

Why do you need this still if the previous patch got rid of the need for
naming each and every CPU in kvmtool?
Do you care about kernels older than 3.12? I wouldn't bother so much
since you'd need a much newer kvmtool anyway.

Can you consider dropping this patch then?
I'd rather avoid adding CPUs to this list needlessly from now on.

Regards,
Andre.

 
 Signed-off-by: Pranavkumar Sawargaonkar pranavku...@linaro.org
 Signed-off-by: Anup Patel anup.pa...@linaro.org
 ---
  tools/kvm/arm/aarch64/arm-cpu.c |9 -
  1 file changed, 8 insertions(+), 1 deletion(-)
 
 diff --git a/tools/kvm/arm/aarch64/arm-cpu.c b/tools/kvm/arm/aarch64/arm-cpu.c
 index ce5ea2f..ce526e3 100644
 --- a/tools/kvm/arm/aarch64/arm-cpu.c
 +++ b/tools/kvm/arm/aarch64/arm-cpu.c
 @@ -41,10 +41,17 @@ static struct kvm_arm_target target_cortex_a57 = {
   .init   = arm_cpu__vcpu_init,
  };
  
 +static struct kvm_arm_target target_potenza = {
 + .id = KVM_ARM_TARGET_XGENE_POTENZA,
 + .compatible = arm,arm-v8,
 + .init   = arm_cpu__vcpu_init,
 +};
 +
  static int arm_cpu__core_init(struct kvm *kvm)
  {
   return (kvm_cpu__register_kvm_arm_target(target_aem_v8) ||
   kvm_cpu__register_kvm_arm_target(target_foundation_v8) ||
 - kvm_cpu__register_kvm_arm_target(target_cortex_a57));
 + kvm_cpu__register_kvm_arm_target(target_cortex_a57) ||
 + kvm_cpu__register_kvm_arm_target(target_potenza));
  }
  core_init(arm_cpu__core_init);
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 3/4] kvmtool: Handle exit reason KVM_EXIT_SYSTEM_EVENT

2014-09-11 Thread Andre Przywara


On 08/09/14 09:17, Anup Patel wrote:
 The KVM_EXIT_SYSTEM_EVENT exit reason was added to define
 architecture independent system-wide events for a Guest.
 
 Currently, it is used by in-kernel PSCI-0.2 emulation of
 KVM ARM/ARM64 to inform user space about PSCI SYSTEM_OFF
 or PSCI SYSTEM_RESET request.
 
 For now, we simply treat all system-wide guest events as
 shutdown request in KVMTOOL.

Is that really a good idea to default to exit_kvm?
I find a shutdown a rather drastic default.
Also I'd like to see RESET not easily mapped to shutdown. If the user
resets the box explicitly, it's probably expected to come up again (to
load an updated kernel or proceed with an install).
So what about a more explicit message like: ... please restart the VM
until we gain proper reboot support in kvmtool?

Regards,
Andre.

 Signed-off-by: Pranavkumar Sawargaonkar pranavku...@linaro.org
 Signed-off-by: Anup Patel anup.pa...@linaro.org
 ---
  tools/kvm/kvm-cpu.c |   19 +++
  1 file changed, 19 insertions(+)
 
 diff --git a/tools/kvm/kvm-cpu.c b/tools/kvm/kvm-cpu.c
 index ee0a8ec..6d01192 100644
 --- a/tools/kvm/kvm-cpu.c
 +++ b/tools/kvm/kvm-cpu.c
 @@ -160,6 +160,25 @@ int kvm_cpu__start(struct kvm_cpu *cpu)
   goto exit_kvm;
   case KVM_EXIT_SHUTDOWN:
   goto exit_kvm;
 + case KVM_EXIT_SYSTEM_EVENT:
 + /*
 +  * Print the type of system event and
 +  * treat all system events as shutdown request.
 +  */
 + switch (cpu-kvm_run-system_event.type) {
 + case KVM_SYSTEM_EVENT_SHUTDOWN:
 + printf(  # Info: shutdown system event\n);
 + break;
 + case KVM_SYSTEM_EVENT_RESET:
 + printf(  # Info: reset system event\n);
 + break;
 + default:
 + printf(  # Warning: unknown system event 
 type=%d\n,
 +cpu-kvm_run-system_event.type);
 + break;
 + };
 + printf(  # Info: exiting KVMTOOL\n);
 + goto exit_kvm;
   default: {
   bool ret;
  
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3] ARM: KVM: add irqfd support

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 10:14:13AM +0200, Eric Auger wrote:
 On 09/11/2014 05:09 AM, Christoffer Dall wrote:
  On Mon, Sep 01, 2014 at 10:53:04AM +0200, Eric Auger wrote:
  This patch enables irqfd on ARM.
 
  irqfd framework enables to inject a virtual IRQ into a guest upon an
  eventfd trigger. User-side uses KVM_IRQFD VM ioctl to provide KVM with
  a kvm_irqfd struct that associates a VM, an eventfd, a virtual IRQ number
  (aka. the gsi). When an actor signals the eventfd (typically a VFIO
  platform driver), the kvm irqfd subsystem injects the provided virtual
  IRQ into the guest.
 
  Resamplefd also is supported for level sensitive interrupts, ie. the
  user can provide another eventfd that is triggered when the completion
  of the virtual IRQ (gsi) is detected by the GIC.
 
  The gsi must correspond to a shared peripheral interrupt (SPI), ie the
  GIC interrupt ID is gsi+32.
 
  this patch enables CONFIG_HAVE_KVM_EVENTFD and CONFIG_HAVE_KVM_IRQFD.
  CONFIG_HAVE_KVM_IRQCHIP is removed. No IRQ routing table is used
  (irqchip.c and irqcomm.c are not used).
 
  Both KVM_CAP_IRQFD  KVM_CAP_IRQFD_RESAMPLE capabilities are exposed
 
  Signed-off-by: Eric Auger eric.au...@linaro.org
 
  ---
 
  This patch serie deprecates the previous serie featuring GSI routing
  (https://patches.linaro.org/32261/)
 
  The patch serie has the following dependencies:
  - arm/arm64: KVM: Various VGIC cleanups and improvements
https://lists.cs.columbia.edu/pipermail/kvmarm/2014-June/009979.html
  - KVM: EVENTFD: remove inclusion of irq.h
 
  All pieces can be found on git://git.linaro.org/people/eric.auger/linux.git
  branch irqfd_norouting_integ_v3
 
  This work was tested with Calxeda Midway xgmac main interrupt with
  qemu-system-arm and QEMU VFIO platform device.
 
  v2 - v3:
  - removal of irq.h from eventfd.c put in a separate patch to increase
visibility
  - properly expose KVM_CAP_IRQFD capability in arm.c
  - remove CONFIG_HAVE_KVM_IRQCHIP meaningfull only if irq_comm.c is used
 
  v1 - v2:
  - rebase on 3.17rc1
  - move of the dist unlock in process_maintenance
  - remove of dist lock in __kvm_vgic_sync_hwstate
  - rewording of the commit message (add resamplefd reference)
  - remove irq.h
  ---
   Documentation/virtual/kvm/api.txt |  5 +++-
   arch/arm/include/uapi/asm/kvm.h   |  3 +++
   arch/arm/kvm/Kconfig  |  4 +--
   arch/arm/kvm/Makefile |  2 +-
   arch/arm/kvm/arm.c|  3 +++
   virt/kvm/arm/vgic.c   | 56 
  ---
   6 files changed, 65 insertions(+), 8 deletions(-)
 
  diff --git a/Documentation/virtual/kvm/api.txt 
  b/Documentation/virtual/kvm/api.txt
  index beae3fd..8118b12 100644
  --- a/Documentation/virtual/kvm/api.txt
  +++ b/Documentation/virtual/kvm/api.txt
  @@ -2204,7 +2204,7 @@ into the hash PTE second double word).
   4.75 KVM_IRQFD
   
   Capability: KVM_CAP_IRQFD
  -Architectures: x86 s390
  +Architectures: x86 s390 arm
   Type: vm ioctl
   Parameters: struct kvm_irqfd (in)
   Returns: 0 on success, -1 on error
  @@ -2230,6 +2230,9 @@ Note that closing the resamplefd is not sufficient 
  to disable the
   irqfd.  The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
   and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
   
  +On ARM/arm64 the injected must be a shared peripheral interrupt (SPI).
  +This means the programmed GIC interrupt ID is gsi+32.
  +
  
  See above comment.
 Hi Christoffer,
 
 sorry which comment do you refer to?

good question, I thought I had a comment above, just disregard.

 wrt your last comment do you
 consider PPI injection support is a mandated feature for this patch to
 be upstreamable?

well, right now, the only reason it's not supported is we didn't bother
thinking about it or doing it and I haven't heard a valid reason for
why we should be designing a new user space API etc. without supporting
PPIs.

So yes, either argue why it's better to not include PPI support in the
first round, why we never need to, or just support it ;)

  
   4.76 KVM_PPC_ALLOCATE_HTAB
   
   Capability: KVM_CAP_PPC_ALLOC_HTAB
  diff --git a/arch/arm/include/uapi/asm/kvm.h 
  b/arch/arm/include/uapi/asm/kvm.h
  index e6ebdd3..3034c66 100644
  --- a/arch/arm/include/uapi/asm/kvm.h
  +++ b/arch/arm/include/uapi/asm/kvm.h
  @@ -194,6 +194,9 @@ struct kvm_arch_memory_slot {
   /* Highest supported SPI, from VGIC_NR_IRQS */
   #define KVM_ARM_IRQ_GIC_MAX   127
   
  +/* One single KVM irqchip, ie. the VGIC */
  +#define KVM_NR_IRQCHIPS  1
  +
   /* PSCI interface */
   #define KVM_PSCI_FN_BASE  0x95c1ba5e
   #define KVM_PSCI_FN(n)(KVM_PSCI_FN_BASE + (n))
  diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
  index 466bd29..e519a40 100644
  --- a/arch/arm/kvm/Kconfig
  +++ b/arch/arm/kvm/Kconfig
  @@ -24,6 +24,7 @@ config KVM
 select KVM_MMIO
 select KVM_ARM_HOST
 depends on ARM_VIRT_EXT  ARM_LPAE
  +  select

Re: [BUG] Guest kernel divide error in kvm_unlock_kick

2014-09-11 Thread Chris Webb

Paolo Bonzini pbonz...@redhat.com wrote:

 This is a hypercall that should have kicked VCPU 3 (see rcx).
 
 Can you please apply this patch and gather a trace of the host
 (using trace-cmd -e kvm qemu-kvm arguments)?

Sure, no problem. I've built the trace-cmd tool against udis86 (I hope) and
have put the resulting trace.dat at

  http://cdw.me.uk/tmp/trace.dat

This is actually for a -smp 2 qemu (failing to kick VCPU 1?) as I was having
trouble persuading the -smp 4 qemu to crash as reliably under tracing.
(Something timing related?) Otherwise the qemu-system-x86 command line is
exactly as before.

The guest kernel crash message which corresponds to this trace was:

divide error:  [#1] PREEMPT SMP 
Modules linked in:
CPU: 0 PID: 618 Comm: mkdir Not tainted 3.16.2-guest #2
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
task: 88007c997080 ti: 88007c614000 task.ti: 88007c614000
RIP: 0010:[81037fe2]  [81037fe2] kvm_unlock_kick+0x72/0x80
RSP: 0018:88007c617d40  EFLAGS: 00010046
RAX: 0005 RBX:  RCX: 0001
RDX: 0001 RSI: 88007fd11c40 RDI: 
RBP: 88007fd11c40 R08: 81b98940 R09: 0001
R10:  R11: 0007 R12: 00f6
R13: 0001 R14: 0001 R15: 00011c40
FS:  7f43eb1ed700() GS:88007fc0() knlGS:
CS:  0010 DS:  ES:  CR0: 8005003b
CR2: 7f43eace0a30 CR3: 01a12000 CR4: 000406f0
Stack:
 88007c994380 88007c9949aa 0046 81689715
 810f3174 0001 ea0001f16320 ea0001f17860
  88007c99e1e8 88007c997080 0001
Call Trace:
 [81689715] ? _raw_spin_unlock+0x45/0x70
 [810f3174] ? try_to_wake_up+0x2a4/0x330
 [81101e2c] ? __wake_up_common+0x4c/0x80
 [81102418] ? __wake_up_sync_key+0x38/0x60
 [810d873a] ? do_notify_parent+0x19a/0x280
 [810f4d56] ? sched_move_task+0xb6/0x190
 [810cb4fc] ? do_exit+0xa1c/0xab0
 [810cc344] ? do_group_exit+0x34/0xb0
 [810cc3cb] ? SyS_exit_group+0xb/0x10
 [8168a16d] ? system_call_fastpath+0x1a/0x1f
Code: c0 ca a7 81 48 8d 04 0b 48 8b 30 48 39 ee 75 c9 0f b6 40 08 44 38 e0 75 
c0 48 c7 c0 22 b0 00 00 31 db 0f b7 0c 08 b8 05 00 00 00 0f 01 c1 0f 1f 00 5b 
5d 41 5c c3 0f 1f 00 48 c7 c0 10 cf 00 00 
RIP  [81037fe2] kvm_unlock_kick+0x72/0x80
 RSP 88007c617d40
---[ end trace bf5a4445f9decdbb ]---
Fixing recursive fault but reboot is needed!
BUG: scheduling while atomic: mkdir/618/0x0006
Modules linked in:
CPU: 0 PID: 618 Comm: mkdir Tainted: G  D   3.16.2-guest #2
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
  c022d302 81684029 
 810ee956 81686266 00011c40 88007c617fd8
 00011c40 88007c997080 0006 0046
Call Trace:
 [81684029] ? dump_stack+0x49/0x6a
 [810ee956] ? __schedule_bug+0x46/0x60
 [81686266] ? __schedule+0x5a6/0x7c0
 [816828cd] ? printk+0x59/0x75
 [810cb33b] ? do_exit+0x85b/0xab0
 [816828cd] ? printk+0x59/0x75
 [8100614a] ? oops_end+0x7a/0x100
 [810033e5] ? do_error_trap+0x85/0x110
 [81037fe2] ? kvm_unlock_kick+0x72/0x80
 [8114a358] ? __alloc_pages_nodemask+0x108/0xa60
 [8168b57e] ? divide_error+0x1e/0x30
 [81037fe2] ? kvm_unlock_kick+0x72/0x80
 [81689715] ? _raw_spin_unlock+0x45/0x70
 [810f3174] ? try_to_wake_up+0x2a4/0x330
 [81101e2c] ? __wake_up_common+0x4c/0x80
 [81102418] ? __wake_up_sync_key+0x38/0x60
 [810d873a] ? do_notify_parent+0x19a/0x280
 [810f4d56] ? sched_move_task+0xb6/0x190
 [810cb4fc] ? do_exit+0xa1c/0xab0
 [810cc344] ? do_group_exit+0x34/0xb0
 [810cc3cb] ? SyS_exit_group+0xb/0x10
 [8168a16d] ? system_call_fastpath+0x1a/0x1f

Best wishes,

Chris.--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 4/9] VFIO: platform: handler tests whether the IRQ is forwarded

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 10:44:02AM +0200, Eric Auger wrote:
 On 09/11/2014 05:10 AM, Christoffer Dall wrote:
  On Mon, Sep 01, 2014 at 02:52:43PM +0200, Eric Auger wrote:
  In case the IRQ is forwarded, the VFIO platform IRQ handler does not
  need to disable the IRQ anymore. In that mode, when the handler completes
  
  add a comma after completes
 Hi Christoffer,
 ok
  
  the IRQ is not deactivated but only its priority is lowered.
 
  Some other actor (typically a guest) is supposed to deactivate the IRQ,
  allowing at that time a new physical IRQ to hit.
 
  In virtualization use case, the physical IRQ is automatically completed
  by the interrupt controller when the guest completes the corresponding
  virtual IRQ.
 
  Signed-off-by: Eric Auger eric.au...@linaro.org
  ---
   drivers/vfio/platform/vfio_platform_irq.c | 7 ++-
   1 file changed, 6 insertions(+), 1 deletion(-)
 
  diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
  b/drivers/vfio/platform/vfio_platform_irq.c
  index 6768508..1f851b2 100644
  --- a/drivers/vfio/platform/vfio_platform_irq.c
  +++ b/drivers/vfio/platform/vfio_platform_irq.c
  @@ -88,13 +88,18 @@ static irqreturn_t vfio_irq_handler(int irq, void 
  *dev_id)
 struct vfio_platform_irq *irq_ctx = dev_id;
 unsigned long flags;
 int ret = IRQ_NONE;
  +  struct irq_data *d;
  +  bool is_forwarded;
   
 spin_lock_irqsave(irq_ctx-lock, flags);
   
 if (!irq_ctx-masked) {
 ret = IRQ_HANDLED;
  +  d = irq_get_irq_data(irq_ctx-hwirq);
  +  is_forwarded = irqd_irq_forwarded(d);
   
  -  if (irq_ctx-flags  VFIO_IRQ_INFO_AUTOMASKED) {
  +  if (irq_ctx-flags  VFIO_IRQ_INFO_AUTOMASKED 
  +  !is_forwarded) {
 disable_irq_nosync(irq_ctx-hwirq);
 irq_ctx-masked = true;
 }
  -- 
  1.9.1
 
  It makes sense that these needs to be all controlled in the kernel, but
  I'm wondering if it would be cleaner / more correct to clear the
  AUTOMASKED flag when the IRQ is forwarded and have vfio refuse setting
  this flag as long as the irq is forwarded?
 
 If I am not wrong, even if the user sets AUTOMASKED, this info never is
 exploited by the vfio platform driver. AUTOMASKED only is set internally
 to the driver, on init, for level sensitive IRQs.
 
 It seems to be the same on PCI (for INTx). I do not see anywhere the
 user flag curectly copied into a local storage. But I prefer to be
 careful ;-)
 
 If confirmed, although the flag value is exposed in the user API, the
 user set value never is exploited so this removes the need to check.
 
 the forwarded IRQ modality being fully dynamic currently, then I would
 need to update the irq_ctx-flags on each vfio_irq_handler call. I don't
 know if its better?
 
I'm not an expert on vfio, so I'll leave that to Alex Williamson to
answer, but I'm just worried that we need to special-case the forwarded
IRQ here, and if that may get lost elsewhere in the vfio code.  If the
AUTOMASKED flag covers specifically this behavior, then why don't we
simply clear/set that flag when forwarding/unforwarding the specific
IRQ?

-Christoffer
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 5/9] KVM: KVM-VFIO: update user API to program forwarded IRQ

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 10:49:08AM +0200, Eric Auger wrote:
 On 09/11/2014 05:10 AM, Christoffer Dall wrote:
  On Mon, Sep 01, 2014 at 02:52:44PM +0200, Eric Auger wrote:

[...]

  +
  +It is up to the caller of this API to make sure the IRQ is not
  +outstanding when the FORWARD/UNFORWARD is called. This could lead to
  
  outstanding? can you be specific?
 active? and I should add *physical* IRQ
  
  don't refer to FOWARD/UNFORWARD, either refer to these attributes by
  their full name or use a clear reference in proper English.
 ok
  
  +some inconsistency on who is going to complete the IRQ.
  
  This sounds like the whole thing is fragile and if userspace doesn't do
  things right, IRQ handling of a piece of hardware is going to be
  inconsistent?  Is this the case?  If so, we need some stronger
  semantics.  If not, this should be rephrased.
 Actually the KVM-VFIO device rejects any attempt to change the
 forwarding mode if the physical IRQ is active. So I hope this is robust
 and will change the explanation.
 
ok, so what is the proposed method if the IRQ is indeed active, should
user space loop around and try or can user space make sure somehow?  If
user space should simply retry for a number of times, we should probalby
return a proper error code for this case -EINTR?

-Christoffer
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 4/9] VFIO: platform: handler tests whether the IRQ is forwarded

2014-09-11 Thread Antonios Motakis

On Thu, Sep 11, 2014 at 10:44 AM, Eric Auger eric.au...@linaro.org wrote:

 On 09/11/2014 05:10 AM, Christoffer Dall wrote:
  On Mon, Sep 01, 2014 at 02:52:43PM +0200, Eric Auger wrote:
  In case the IRQ is forwarded, the VFIO platform IRQ handler does not
  need to disable the IRQ anymore. In that mode, when the handler completes
 
  add a comma after completes
 Hi Christoffer,
 ok
 
  the IRQ is not deactivated but only its priority is lowered.
 
  Some other actor (typically a guest) is supposed to deactivate the IRQ,
  allowing at that time a new physical IRQ to hit.
 
  In virtualization use case, the physical IRQ is automatically completed
  by the interrupt controller when the guest completes the corresponding
  virtual IRQ.
 
  Signed-off-by: Eric Auger eric.au...@linaro.org
  ---
   drivers/vfio/platform/vfio_platform_irq.c | 7 ++-
   1 file changed, 6 insertions(+), 1 deletion(-)
 
  diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
  b/drivers/vfio/platform/vfio_platform_irq.c
  index 6768508..1f851b2 100644
  --- a/drivers/vfio/platform/vfio_platform_irq.c
  +++ b/drivers/vfio/platform/vfio_platform_irq.c
  @@ -88,13 +88,18 @@ static irqreturn_t vfio_irq_handler(int irq, void 
  *dev_id)
   struct vfio_platform_irq *irq_ctx = dev_id;
   unsigned long flags;
   int ret = IRQ_NONE;
  +struct irq_data *d;
  +bool is_forwarded;
 
   spin_lock_irqsave(irq_ctx-lock, flags);
 
   if (!irq_ctx-masked) {
   ret = IRQ_HANDLED;
  +d = irq_get_irq_data(irq_ctx-hwirq);
  +is_forwarded = irqd_irq_forwarded(d);
 
  -if (irq_ctx-flags  VFIO_IRQ_INFO_AUTOMASKED) {
  +if (irq_ctx-flags  VFIO_IRQ_INFO_AUTOMASKED 
  +!is_forwarded) {
   disable_irq_nosync(irq_ctx-hwirq);
   irq_ctx-masked = true;
   }
  --
  1.9.1
 
  It makes sense that these needs to be all controlled in the kernel, but
  I'm wondering if it would be cleaner / more correct to clear the
  AUTOMASKED flag when the IRQ is forwarded and have vfio refuse setting
  this flag as long as the irq is forwarded?

 If I am not wrong, even if the user sets AUTOMASKED, this info never is
 exploited by the vfio platform driver. AUTOMASKED only is set internally
 to the driver, on init, for level sensitive IRQs.

 It seems to be the same on PCI (for INTx). I do not see anywhere the
 user flag curectly copied into a local storage. But I prefer to be
 careful ;-)

 If confirmed, although the flag value is exposed in the user API, the
 user set value never is exploited so this removes the need to check.


Yeah, the way the API is right now the AUTOMASKED flag is only to be
communicated by the kernel to the user, never the other way around.

IMHO there shouldn't be a need to change that. The flag is there just
to inform the user for the kernel behavior for non-forwarded IRQs (and
it's still true if the user unforwards the IRQ later). The user
decides the mode of operation, but it might still be a bit of
information he wants to know.

 the forwarded IRQ modality being fully dynamic currently, then I would
 need to update the irq_ctx-flags on each vfio_irq_handler call. I don't
 know if its better?

 Best Regards

 Eric


 
  -Christoffer
 




-- 
Antonios Motakis
Virtual Open Systems
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Christoffer Dall

On Wed, Sep 10, 2014 at 11:05:49PM -0600, Alex Williamson wrote:
 On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
  On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:

[...]


   +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
   +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
  
  what's the 'p' in pfwd?
 
 p is for pointer?
 

shouldn't the type declation spell out quite clearly to me that I'm
dealing with a pointer?

[...]

  
  need some spaceing here, also, I would turn this around, first check if
  the strcmp fails, and then error out, then do you next check etc., to
  avoid so many nested statements.
  
   + /* is a ref to this device already owned by the KVM-VFIO device? */
  
  this comment is not particularly helpful in its current form, it would
  be helpful if you specified that we're checking whether that particular
  device/irq combo is already registered.
  
   + *kvm_vdev = kvm_vfio_find_device(kv, vdev);
   + if (*kvm_vdev) {
   + if (kvm_vfio_find_irq(*kvm_vdev, fwd_irq-index)) {
   + kvm_err(%s irq %d already forwarded\n,
   + __func__, *hwirq);
 
 Why didn't we do this first?
 
huh?

-Christoffer
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 0/4] Make kvm_device_ops registration dynamic

2014-09-11 Thread Will Deacon

Hi Paolo,

On Tue, Sep 02, 2014 at 10:27:32AM +0100, Will Deacon wrote:
 Hi all,
 
 This is version 3 of the patches originally posted here:
 
   v1: http://www.spinics.net/lists/kvm-arm/msg10219.html
   v2: http://www.spinics.net/lists/kvm/msg105197.html
 
 Changes since v2 include:
 
   - Rebased onto 3.17-rc* (the vgic code changed a lot!)
   - Added relevant acks
 
 The mpic, flic and xics are still not ported over, as I don't want to
 risk breaking those devices (it's not clear at which point they need to
 be registered).

Any further comments on this lot?

Will
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH v6 15/20] vfio/platform: support for maskable and automasked interrupts

2014-09-11 Thread Antonios Motakis

On Wed, Sep 10, 2014 at 12:13 PM, Christoffer Dall
christoffer.d...@linaro.org wrote:
 On Tue, Sep 02, 2014 at 06:06:17PM +0200, Antonios Motakis wrote:
 On Sun, Jun 8, 2014 at 12:17 PM, Christoffer Dall
 christoffer.d...@linaro.org wrote:
  On Thu, Jun 05, 2014 at 07:03:23PM +0200, Antonios Motakis wrote:
  Adds support to mask interrupts, and also for automasked interrupts.
  Level sensitive interrupts are exposed as automasked interrupts and
  are masked and disabled automatically when they fire.
 
  Signed-off-by: Antonios Motakis a.mota...@virtualopensystems.com
  ---
   drivers/vfio/platform/vfio_platform_irq.c | 112 
  --
   drivers/vfio/platform/vfio_platform_private.h |   2 +
   2 files changed, 109 insertions(+), 5 deletions(-)
 
  diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
  b/drivers/vfio/platform/vfio_platform_irq.c
  index d79f5af..10dfbf0 100644
  --- a/drivers/vfio/platform/vfio_platform_irq.c
  +++ b/drivers/vfio/platform/vfio_platform_irq.c
  @@ -51,9 +51,17 @@ int vfio_platform_irq_init(struct vfio_platform_device 
  *vdev)
if (hwirq  0)
goto err;
 
  - vdev-irq[i].flags = VFIO_IRQ_INFO_EVENTFD;
  + spin_lock_init(vdev-irq[i].lock);
  +
  + vdev-irq[i].flags = VFIO_IRQ_INFO_EVENTFD
  + | VFIO_IRQ_INFO_MASKABLE;
  +
  + if (irq_get_trigger_type(hwirq)  IRQ_TYPE_LEVEL_MASK)
  + vdev-irq[i].flags |= VFIO_IRQ_INFO_AUTOMASKED;
 
  This seems to rely on the fact that you had actually loaded a driver for
  your device to set the right type.  Is this assumption always correct?
 
  It seems to me that this configuration bit should now be up to your user
  space drive who is the best candidate to know details about your device
  at this point?
 

 Hm, I see this type being set usually either in a device tree source,
 or in the support code for a specific platform. Are there any
 situations where this is actually set by the driver? If I understand
 right this is not the case for the PL330, but if it is possible that
 it is the case for another device then I need to rethink this. Though
 as far as I understand this should not be the case.


 Wow, this has been incredibly long time since I looked at this code, so
 not sure if I remember my original reasoning anymore, however,

 while device properties are set in the DT, they would only be available
 to this code if you actually loaded a device driver for that device,
 right?  I'm just not sure that assumption always holds for devices used
 by VFIO, but I'm really not sure anymore.  Maybe I'm rambling.

The device I'm testing with, the PL330 DMAC, is one of the devices
that exposes level sensitive interrupts, and therefore for it to
properly work VFIO needs to be able to expose it as automasked.

I just tested the code on a kernel that doesn't include the native
PL330 DMA driver. It seems that even so, the unmasked property is
properly detected and exposed by VFIO. So for this scenario at least
the assumptions are true...

I'm afraid I have to admit that if there are any edge cases where this
might not be true, I don't know which they are :(


  +
vdev-irq[i].count = 1;
vdev-irq[i].hwirq = hwirq;
  + vdev-irq[i].masked = false;
}
 
vdev-num_irqs = cnt;
  @@ -77,11 +85,27 @@ void vfio_platform_irq_cleanup(struct 
  vfio_platform_device *vdev)
 
   static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
   {
  - struct eventfd_ctx *trigger = dev_id;
  + struct vfio_platform_irq *irq_ctx = dev_id;
  + unsigned long flags;
  + int ret = IRQ_NONE;
  +
  + spin_lock_irqsave(irq_ctx-lock, flags);
  +
  + if (!irq_ctx-masked) {
  + ret = IRQ_HANDLED;
  +
  + if (irq_ctx-flags  VFIO_IRQ_INFO_AUTOMASKED) {
  + disable_irq_nosync(irq_ctx-hwirq);
  + irq_ctx-masked = true;
  + }
  + }
 
  - eventfd_signal(trigger, 1);
  + spin_unlock_irqrestore(irq_ctx-lock, flags);
 
  - return IRQ_HANDLED;
  + if (ret == IRQ_HANDLED)
  + eventfd_signal(irq_ctx-trigger, 1);
  +
  + return ret;
   }
 
   static int vfio_set_trigger(struct vfio_platform_device *vdev,
  @@ -162,6 +186,82 @@ static int vfio_platform_set_irq_trigger(struct 
  vfio_platform_device *vdev,
return -EFAULT;
   }
 
  +static int vfio_platform_set_irq_unmask(struct vfio_platform_device 
  *vdev,
  + unsigned index, unsigned start,
  + unsigned count, uint32_t flags, void 
  *data)
  +{
  + uint8_t arr;
 
 
  arr?

 arr for array! As in, the VFIO API allows an array of IRQs. However
 for platform devices we don't use this, each IRQ is exposed
 independently as an array of 1 IRQ.


 but it's not an array.  If it contains IRQs, call it irqs.  Unless this
 is

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 02:04:39PM +0200, Eric Auger wrote:
 On 09/11/2014 07:05 AM, Alex Williamson wrote:
  On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
  On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
  This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.
 
  This is a new control channel which enables KVM to cooperate with
  viable VFIO devices.
 
  The kvm-vfio device now holds a list of devices (kvm_vfio_device)
  in addition to a list of groups (kvm_vfio_group). The new
  infrastructure enables to check the validity of the VFIO device
  file descriptor, get and hold a reference to it.
 
  The first concrete implemented command is IRQ forward control:
  KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.
 
  It consists in programing the VFIO driver and KVM in a consistent manner
  so that an optimized IRQ injection/completion is set up. Each
  kvm_vfio_device holds a list of forwarded IRQ. When putting a
  kvm_vfio_device, the implementation makes sure the forwarded IRQs
  are set again in the normal handling state (non forwarded).
 
  'putting a kvm_vfio_device' sounds to like you're golf'ing :)
 
  When a kvm_vfio_device is released?
 
 
  The forwarding programmming is architecture specific, embodied by the
  kvm_arch_set_fwd_state function. Its implementation is given in a
  separate patch file.
 
  I would drop the last sentence and instead indicate that this is handled
  properly when the architecture does not support such a feature.
 
 
  The forwarding control modality is enabled by the
  __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.
 
  Signed-off-by: Eric Auger eric.au...@linaro.org
 
  ---
 
  v1 - v2:
  - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
  - original patch file separated into 2 parts: generic part moved in vfio.c
and ARM specific part(kvm_arch_set_fwd_state)
  ---
   include/linux/kvm_host.h |  27 +++
   virt/kvm/vfio.c  | 452 
  ++-
   2 files changed, 477 insertions(+), 2 deletions(-)
 
  diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
  index a4c33b3..24350dc 100644
  --- a/include/linux/kvm_host.h
  +++ b/include/linux/kvm_host.h
  @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
  unsigned long arg);
   };
   
  +enum kvm_fwd_irq_action {
  + KVM_VFIO_IRQ_SET_FORWARD,
  + KVM_VFIO_IRQ_SET_NORMAL,
  + KVM_VFIO_IRQ_CLEANUP,
 
  This is KVM internal API, so it would probably be good to document this.
  Especially the CLEANUP bit worries me, see below.
  
  This also doesn't match the user API, which is simply FORWARD/UNFORWARD.
 Hi Alex,
 
 will change that.
  Extra states worry me too.
 
 I tried to explained the 2 motivations behind. Please let me know if it
 makes sense.
  
  +};
  +
  +/* internal structure describing a forwarded IRQ */
  +struct kvm_fwd_irq {
  + struct list_head link;
 
  this list entry is local to the kvm vfio device, right? that means you
  probably want a struct with just the below fields, and then have a
  containing struct in the generic device file, private to it's logic.
  
  Yes, this is part of the abstraction problem.
 OK will fix that.
  
  + __u32 index; /* platform device irq index */
  
  This is a vfio_device irq_index, but vfio_devices support indexes and
  sub-indexes.  At this level the API should match vfio, not the specifics
  of platform devices not supporting sub-index.
 I will add sub-indexes then.
  
  + __u32 hwirq; /*physical IRQ */
  + __u32 gsi; /* virtual IRQ */
  + struct kvm_vcpu *vcpu; /* vcpu to inject into*/
  
  Not sure I understand why vcpu is necessary.
 vcpu is used when providing the physical IRQ/virtual IRQ mapping to the
 virtual GIC. I can remove it from and add a vcpu struct * param to
 kvm_arch_set_fwd_state if you prefer.
 
   Also I see a 'get' in the code below, but not a 'put'.
 Sorry I do not understand your comment here? What 'get' do you mention?

he means kvm_get_vcpu(), but you are ok on that one, the kvm naming of
this function is unfortunate, because it doesn't increment any refcounts
but just resolves to an entry in the array.

  
  +};
  +
   void kvm_device_get(struct kvm_device *dev);
   void kvm_device_put(struct kvm_device *dev);
   struct kvm_device *kvm_device_from_filp(struct file *filp);
  @@ -1075,6 +1090,18 @@ extern struct kvm_device_ops kvm_vfio_ops;
   extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
   extern struct kvm_device_ops kvm_flic_ops;
   
  +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
  +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
 
  what's the 'p' in pfwd?
  
  p is for pointer?
 yes it was ;-)
  
  +enum kvm_fwd_irq_action action);
  +
  +#else
  +static inline int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
  +  enum kvm_fwd_irq_action action)
  +{
  + return 0;
  +}
  +#endif
  +
   #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
   
   static inline

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 09:59:24AM -0600, Alex Williamson wrote:
 On Thu, 2014-09-11 at 14:04 +0200, Eric Auger wrote:
  On 09/11/2014 07:05 AM, Alex Williamson wrote:
   On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
   On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
   This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.
  
   This is a new control channel which enables KVM to cooperate with
   viable VFIO devices.
  
   The kvm-vfio device now holds a list of devices (kvm_vfio_device)
   in addition to a list of groups (kvm_vfio_group). The new
   infrastructure enables to check the validity of the VFIO device
   file descriptor, get and hold a reference to it.
  
   The first concrete implemented command is IRQ forward control:
   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.
  
   It consists in programing the VFIO driver and KVM in a consistent manner
   so that an optimized IRQ injection/completion is set up. Each
   kvm_vfio_device holds a list of forwarded IRQ. When putting a
   kvm_vfio_device, the implementation makes sure the forwarded IRQs
   are set again in the normal handling state (non forwarded).
  
   'putting a kvm_vfio_device' sounds to like you're golf'ing :)
  
   When a kvm_vfio_device is released?
  
  
   The forwarding programmming is architecture specific, embodied by the
   kvm_arch_set_fwd_state function. Its implementation is given in a
   separate patch file.
  
   I would drop the last sentence and instead indicate that this is handled
   properly when the architecture does not support such a feature.
  
  
   The forwarding control modality is enabled by the
   __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.
  
   Signed-off-by: Eric Auger eric.au...@linaro.org
  
   ---
  
   v1 - v2:
   - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
   - original patch file separated into 2 parts: generic part moved in 
   vfio.c
 and ARM specific part(kvm_arch_set_fwd_state)
   ---
include/linux/kvm_host.h |  27 +++
virt/kvm/vfio.c  | 452 
   ++-
2 files changed, 477 insertions(+), 2 deletions(-)
  
   diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
   index a4c33b3..24350dc 100644
   --- a/include/linux/kvm_host.h
   +++ b/include/linux/kvm_host.h
   @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
 unsigned long arg);
};

   +enum kvm_fwd_irq_action {
   +   KVM_VFIO_IRQ_SET_FORWARD,
   +   KVM_VFIO_IRQ_SET_NORMAL,
   +   KVM_VFIO_IRQ_CLEANUP,
  
   This is KVM internal API, so it would probably be good to document this.
   Especially the CLEANUP bit worries me, see below.
   
   This also doesn't match the user API, which is simply FORWARD/UNFORWARD.
  Hi Alex,
  
  will change that.
   Extra states worry me too.
  
  I tried to explained the 2 motivations behind. Please let me know if it
  makes sense.
 
 Not really.  It seems like it's just a leak of arch specific handling
 out into common code.
 
   +};
   +
   +/* internal structure describing a forwarded IRQ */
   +struct kvm_fwd_irq {
   +   struct list_head link;
  
   this list entry is local to the kvm vfio device, right? that means you
   probably want a struct with just the below fields, and then have a
   containing struct in the generic device file, private to it's logic.
   
   Yes, this is part of the abstraction problem.
  OK will fix that.
   
   +   __u32 index; /* platform device irq index */
   
   This is a vfio_device irq_index, but vfio_devices support indexes and
   sub-indexes.  At this level the API should match vfio, not the specifics
   of platform devices not supporting sub-index.
  I will add sub-indexes then.
   
   +   __u32 hwirq; /*physical IRQ */
   +   __u32 gsi; /* virtual IRQ */
   +   struct kvm_vcpu *vcpu; /* vcpu to inject into*/
   
   Not sure I understand why vcpu is necessary.
  vcpu is used when providing the physical IRQ/virtual IRQ mapping to the
  virtual GIC. I can remove it from and add a vcpu struct * param to
  kvm_arch_set_fwd_state if you prefer.
 
 The kvm-vfio API for this interface doesn't allow the user to indicate
 which vcpu to inject to.  On x86, it would be the programming of the
 interrupt controller that would decide that.  In the code here we
 arbitrarily pick vcpu0.  It feels both architecture specific and a bit
 unspecified.
 
  
Also I see a 'get' in the code below, but not a 'put'.
  Sorry I do not understand your comment here? What 'get' do you mention?
 
 I suppose vcpus don't subscribe to the get/put philosophy, I was
 expecting a reference count, but there is none.  How do we know that
 vcpu pointer is still valid later?
 

Because it will stay valid for as long as you can have a handle to this
instance of the kvm vfio device?

The only way for it to go away is when the VM is completely dying, but
the KVM device API should keep it a alive

Re: [RFC v2 2/9] KVM: ARM: VGIC: add forwarded irq rbtree lock

2014-09-11 Thread Eric Auger

On 09/11/2014 05:09 AM, Christoffer Dall wrote:
 On Mon, Sep 01, 2014 at 02:52:41PM +0200, Eric Auger wrote:
 add a lock related to the rb tree manipulation. The rb tree can be
 
 Ok, I can't hold myself back any longer. 


 Please begin sentences with a
 capital letter. You don't do this in French? :)
Hi Christoffer,


yep that's understood ;-) Definitively we do. Just that I am discovering
it is common too in commits and comments ;-)
 
 searched in one thread (irqfd handler for instance) and map/unmap
 happen in another.

 Signed-off-by: Eric Auger eric.au...@linaro.org
 ---
  include/kvm/arm_vgic.h |  1 +
  virt/kvm/arm/vgic.c| 46 +-
  2 files changed, 38 insertions(+), 9 deletions(-)

 diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
 index 743020f..3da244f 100644
 --- a/include/kvm/arm_vgic.h
 +++ b/include/kvm/arm_vgic.h
 @@ -177,6 +177,7 @@ struct vgic_dist {
  unsigned long   irq_pending_on_cpu;
  
  struct rb_root  irq_phys_map;
 +spinlock_t  rb_tree_lock;
  #endif
  };
  
 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
 index 8ef495b..dbc2a5a 100644
 --- a/virt/kvm/arm/vgic.c
 +++ b/virt/kvm/arm/vgic.c
 @@ -1630,9 +1630,15 @@ static struct rb_root *vgic_get_irq_phys_map(struct 
 kvm_vcpu *vcpu,
  
  int vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq)
  {
 -struct rb_root *root = vgic_get_irq_phys_map(vcpu, virt_irq);
 -struct rb_node **new = root-rb_node, *parent = NULL;
 +struct rb_root *root;
 +struct rb_node **new, *parent = NULL;
  struct irq_phys_map *new_map;
 +struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +
 +spin_lock(dist-rb_tree_lock);
 +
 +root = vgic_get_irq_phys_map(vcpu, virt_irq);
 +new = root-rb_node;
  
  /* Boilerplate rb_tree code */
  while (*new) {
 @@ -1644,13 +1650,17 @@ int vgic_map_phys_irq(struct kvm_vcpu *vcpu, int 
 virt_irq, int phys_irq)
  new = (*new)-rb_left;
  else if (this-virt_irq  virt_irq)
  new = (*new)-rb_right;
 -else
 +else {
 +spin_unlock(dist-rb_tree_lock);
  return -EEXIST;
 +}
 
 can you initialize a ret variable to -EEXIST in the beginning of this
 function, and add an out label above the unlock below, replace this
 multi-line statement with a goto out, and set ret = 0 after the while
 loop?
sure
 
  }
  
  new_map = kzalloc(sizeof(*new_map), GFP_KERNEL);
 -if (!new_map)
 +if (!new_map) {
 +spin_unlock(dist-rb_tree_lock);
  return -ENOMEM;
 
 then this becomes ret = -ENOMEM; goto out;
OK
 
 +}
  
  new_map-virt_irq = virt_irq;
  new_map-phys_irq = phys_irq;
 @@ -1658,6 +1668,8 @@ int vgic_map_phys_irq(struct kvm_vcpu *vcpu, int 
 virt_irq, int phys_irq)
  rb_link_node(new_map-node, parent, new);
  rb_insert_color(new_map-node, root);
  
 +spin_unlock(dist-rb_tree_lock);
 +
 
 aren't you allocating memory with GFP_KERNEL while holding a spinlock
 here?
oups. Thanks for noticing. I Will move the lock.
 
  return 0;
  }
  
 @@ -1685,24 +1697,39 @@ static struct irq_phys_map 
 *vgic_irq_map_search(struct kvm_vcpu *vcpu,
  
  int vgic_get_phys_irq(struct kvm_vcpu *vcpu, int virt_irq)
  {
 -struct irq_phys_map *map = vgic_irq_map_search(vcpu, virt_irq);
 +struct irq_phys_map *map;
 +struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +int ret;
 +
 +spin_lock(dist-rb_tree_lock);
 +map = vgic_irq_map_search(vcpu, virt_irq);
  
  if (map)
 -return map-phys_irq;
 +ret = map-phys_irq;
 +else
 +ret =  -ENOENT;
 
 initialize ret to -ENOENT and avoid the else statement.
ok
 
 +
 +spin_unlock(dist-rb_tree_lock);
 +return ret;
  
 -return -ENOENT;
  }
  
  int vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq)
  {
 -struct irq_phys_map *map = vgic_irq_map_search(vcpu, virt_irq);
 +struct irq_phys_map *map;
 +struct vgic_dist *dist = vcpu-kvm-arch.vgic;
 +
 +spin_lock(dist-rb_tree_lock);
 +
 +map = vgic_irq_map_search(vcpu, virt_irq);
  
  if (map  map-phys_irq == phys_irq) {
  rb_erase(map-node, vgic_get_irq_phys_map(vcpu, virt_irq));
  kfree(map);
 +spin_unlock(dist-rb_tree_lock);
 
 can kfree sleep?  I don't remember.  In any case, you can unlock before
 calling kfree.
no it can't but I will move anyway.
 
  return 0;
  }
 -
 +spin_unlock(dist-rb_tree_lock);
  return -ENOENT;
 
 an out label and single unlock location would be preferred here as well
 I think.
ok

Thansk

Eric
 
  }
  
 @@ -1898,6 +1925,7 @@ int kvm_vgic_create(struct kvm *kvm)
  }
  
  spin_lock_init(kvm-arch.vgic.lock);
 +spin_lock_init(kvm-arch.vgic.rb_tree_lock);
  kvm-arch.vgic.in_kernel = true;
  kvm-arch.vgic.vctrl_base =

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 11:35:56AM +0200, Eric Auger wrote:
 On 09/11/2014 05:10 AM, Christoffer Dall wrote:
  On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:

[...]

  +  if (!pfwd)
  +  return -ENOMEM;
  +  pfwd-index = fwd_irq-index;
  +  pfwd-gsi = fwd_irq-gsi;
  +  pfwd-hwirq = hwirq;
  +  pfwd-vcpu = kvm_get_vcpu(kdev-kvm, 0);
  +  ret = kvm_arch_set_fwd_state(pfwd, KVM_VFIO_IRQ_SET_FORWARD);
  +  if (ret  0) {
  +  kvm_arch_set_fwd_state(pfwd, KVM_VFIO_IRQ_CLEANUP);
  
  this whole thing feels incredibly broken to me.  Setting a forward
  should either work or not work, not something in between that leaves
  something to be cleaned up.  Why this two-stage thingy here?
 I wanted to exploit the return value of vgic_map_phys_irq which is
 likely to fail if the phys/virt mapping exists at VGIC level.

then just have the kvm_arch_set_fwd_state return with -EXIST and it is
the responsibility of that function itself to cleanup from whatever it
was doing, not to rely on its caller to call a cleanup function.

 
 I already validated the injection from a KVM_VFIO_DEVICE point of view
 (the device/irq is not known internally). But what if another external
 component - which does not exist yet - maps the IRQ at VGIC level? Maybe
 I need to replace the existing validation check by querying the VGIC at
 low level instead of checking KVM-VFIO local variables.

No need to over-complicate this, in this case, the
kvm_arch_set_fwd_state() will simply fail (graceously), as I said above,
and you just return to the user, sorry, couldn't do what you asked me
because of this error code.


[...]

  + *
  + * When this function is called, the vcpu already are destroyed. No
  the VPUCs are already destroyed.
  + * vgic manipulation can happen hence the KVM_VFIO_IRQ_CLEANUP
  + * kvm_arch_set_fwd_state action
  
  this last bit didn't make any sense to me.  Also, why are we referring
  to the vgic in generic code?
 doesn't make sense anymore indeed. I wanted to emphasize the fact that
 VGIC KVM device is destroyed before the KVM VFIO device and this
 explains why I need a special CLEANUP cmd (besides the fact I need to
 call chip-irq_eoi(d) for the forwarded IRQs);
 

I don't think it explains why you need a special CLEANUP cmd.  When the
vgic is going away it must cleanup its state.  When the kvm vfio device
goes away, it must unforward any unforwarded IRQs, and the architecture
specific implementation MUST correctly unforward such IRQs - as a single
operation!

Hope this helps.
-Christoffer
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

hardware problem + travelling

2014-09-11 Thread Paolo Bonzini

An OOPS has just bricked my main machine (or so it seems---firmware
doesn't show up at all, not even with disks removed), so I'll likely
spend part of tomorrow reviving an older one which I've luckily kept.

Plus, I'll be travelling most of next week.

For this reason I might be slow answering the mailing list and, for the
rest of the 3.18 series, I might be applying fewer x86 patches due to
limited testing ability.

Thanks, and sorry for the inconvenience.

Paolo

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 4/9] VFIO: platform: handler tests whether the IRQ is forwarded

2014-09-11 Thread Alex Williamson

On Thu, 2014-09-11 at 19:05 +0200, Christoffer Dall wrote:
 On Thu, Sep 11, 2014 at 10:44:02AM +0200, Eric Auger wrote:
  On 09/11/2014 05:10 AM, Christoffer Dall wrote:
   On Mon, Sep 01, 2014 at 02:52:43PM +0200, Eric Auger wrote:
   In case the IRQ is forwarded, the VFIO platform IRQ handler does not
   need to disable the IRQ anymore. In that mode, when the handler completes
   
   add a comma after completes
  Hi Christoffer,
  ok
   
   the IRQ is not deactivated but only its priority is lowered.
  
   Some other actor (typically a guest) is supposed to deactivate the IRQ,
   allowing at that time a new physical IRQ to hit.
  
   In virtualization use case, the physical IRQ is automatically completed
   by the interrupt controller when the guest completes the corresponding
   virtual IRQ.
  
   Signed-off-by: Eric Auger eric.au...@linaro.org
   ---
drivers/vfio/platform/vfio_platform_irq.c | 7 ++-
1 file changed, 6 insertions(+), 1 deletion(-)
  
   diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
   b/drivers/vfio/platform/vfio_platform_irq.c
   index 6768508..1f851b2 100644
   --- a/drivers/vfio/platform/vfio_platform_irq.c
   +++ b/drivers/vfio/platform/vfio_platform_irq.c
   @@ -88,13 +88,18 @@ static irqreturn_t vfio_irq_handler(int irq, void 
   *dev_id)
struct vfio_platform_irq *irq_ctx = dev_id;
unsigned long flags;
int ret = IRQ_NONE;
   +struct irq_data *d;
   +bool is_forwarded;

spin_lock_irqsave(irq_ctx-lock, flags);

if (!irq_ctx-masked) {
ret = IRQ_HANDLED;
   +d = irq_get_irq_data(irq_ctx-hwirq);
   +is_forwarded = irqd_irq_forwarded(d);

   -if (irq_ctx-flags  VFIO_IRQ_INFO_AUTOMASKED) {
   +if (irq_ctx-flags  VFIO_IRQ_INFO_AUTOMASKED 
   +!is_forwarded) {
disable_irq_nosync(irq_ctx-hwirq);
irq_ctx-masked = true;
}
   -- 
   1.9.1
  
   It makes sense that these needs to be all controlled in the kernel, but
   I'm wondering if it would be cleaner / more correct to clear the
   AUTOMASKED flag when the IRQ is forwarded and have vfio refuse setting
   this flag as long as the irq is forwarded?
  
  If I am not wrong, even if the user sets AUTOMASKED, this info never is
  exploited by the vfio platform driver. AUTOMASKED only is set internally
  to the driver, on init, for level sensitive IRQs.
  
  It seems to be the same on PCI (for INTx). I do not see anywhere the
  user flag curectly copied into a local storage. But I prefer to be
  careful ;-)
  
  If confirmed, although the flag value is exposed in the user API, the
  user set value never is exploited so this removes the need to check.
  
  the forwarded IRQ modality being fully dynamic currently, then I would
  need to update the irq_ctx-flags on each vfio_irq_handler call. I don't
  know if its better?
  
 I'm not an expert on vfio, so I'll leave that to Alex Williamson to
 answer, but I'm just worried that we need to special-case the forwarded
 IRQ here, and if that may get lost elsewhere in the vfio code.  If the
 AUTOMASKED flag covers specifically this behavior, then why don't we
 simply clear/set that flag when forwarding/unforwarding the specific
 IRQ?

The way that VFIO_IRQ_INFO_AUTOMASKED is being used here is unique to
the platform device vfio backend.  In the rest of VFIO,
VFIO_IRQ_INFO_AUTOMASKED is simply a flag bit exposed via
VFIO_DEVICE_GET_IRQ_INFO.  The flags field of struct vfio_irq_info is
output-only.  vfio-pci knows by the IRQ index whether it is edge or
level.  I do agree though that changing the flag bit, or better yet a
bool, rather than adding extra tests that need to be handled as each
usage seems less error prone.

Things could get confusing for userspace though if suddenly
VFIO_DEVICE_GET_IRQ_INFO starts calling the index edge triggered once
forwarding mode is enabled.  Thanks,

Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Alex Williamson

On Thu, 2014-09-11 at 19:10 +0200, Christoffer Dall wrote:
 On Wed, Sep 10, 2014 at 11:05:49PM -0600, Alex Williamson wrote:
  On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
   On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
 
 [...]
 
 
+#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
+int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
   
   what's the 'p' in pfwd?
  
  p is for pointer?
  
 
 shouldn't the type declation spell out quite clearly to me that I'm
 dealing with a pointer?

Sure.  In the cases where I've done similar things it's more a matter of
not needing to come up with another variable, for instance if I need
both a struct and a struct* I might call them foo and pfoo if I can't
come up with anything more meaningful.


 [...]
 
   
   need some spaceing here, also, I would turn this around, first check if
   the strcmp fails, and then error out, then do you next check etc., to
   avoid so many nested statements.
   
+   /* is a ref to this device already owned by the KVM-VFIO 
device? */
   
   this comment is not particularly helpful in its current form, it would
   be helpful if you specified that we're checking whether that particular
   device/irq combo is already registered.
   
+   *kvm_vdev = kvm_vfio_find_device(kv, vdev);
+   if (*kvm_vdev) {
+   if (kvm_vfio_find_irq(*kvm_vdev, fwd_irq-index)) {
+   kvm_err(%s irq %d already forwarded\n,
+   __func__, *hwirq);
  
  Why didn't we do this first?
  
 huh?

The code is doing:

1. can the arch forward this irq
2. are we already forwarding this irq

It's backwards, test for duplicates locally before calling out into arch
code.  Besides, I think the arch code here should go away and just be
another error condition for the call-out.  Thanks,

Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 1/9] KVM: ARM: VGIC: fix multiple injection of level sensitive forwarded IRQ

2014-09-11 Thread Eric Auger

On 09/11/2014 05:09 AM, Christoffer Dall wrote:
 On Mon, Sep 01, 2014 at 02:52:40PM +0200, Eric Auger wrote:
 Fix multiple injection of level sensitive forwarded IRQs.
 With current code, the second injection fails since the state bitmaps
 are not reset (process_maintenance is not called anymore).
 New implementation consists in fully bypassing the vgic state
 management for forwarded IRQ (checks are ignored in
 vgic_update_irq_pending). This obviously assumes the forwarded IRQ is
 injected from kernel side.

 Signed-off-by: Eric Auger eric.au...@linaro.org

 ---

 It was attempted to reset the states in __kvm_vgic_sync_hwstate, checking
 the emptied LR of forwarded IRQ. However surprisingly this solution does
 not seem to work. Some times, a new forwarded IRQ injection is observed
 while the LR of the previous instance was not observed as empty.
 
 hmmm, concerning.  It would probably have been helpful overall if you
 could start by describing the problem with the current implementation in
 the commit message, and then explain the fix...
 

 v1 - v2:
 - fix vgic state bypass in vgic_queue_hwirq
 ---
  virt/kvm/arm/vgic.c | 13 ++---
  1 file changed, 10 insertions(+), 3 deletions(-)

 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
 index 0007300..8ef495b 100644
 --- a/virt/kvm/arm/vgic.c
 +++ b/virt/kvm/arm/vgic.c
 @@ -1259,7 +1259,9 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int 
 irq)
  
  static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
  {
 -if (vgic_irq_is_queued(vcpu, irq))
 +bool is_forwarded =  (vgic_get_phys_irq(vcpu, irq)  0);
 
 can you create a static function to factor this vgic_get_phys_irq check out, 
 please?
yes sure
 
 +
 +if (vgic_irq_is_queued(vcpu, irq)  !is_forwarded)
  return true; /* level interrupt, already queued */
 
 so essentially if an IRQ is already on a LR so we shouldn't resample the
 line, then we still resample the line if the IRQ is forwarded?
 
 I think you need to explain this, both to me here, and also in the code
 by moving the comment following the return statement above the check and
 comment this clearly.
Well, I admit it may look a bit pushy! When we discussed this issue with
Marc, the outcome was that the vgic states were not accurate with
forwarded IRQs and VGIC state may be fully bypassed. Since the first
injection still sets the state - and I did not want to modify this - the
2d one would fail due to that check, and the validate_injection. May be
cleaner to not update the states when injecting the fwd irq too.

 
  
  if (vgic_queue_irq(vcpu, 0, irq)) {
 @@ -1517,14 +1519,18 @@ static bool vgic_update_irq_pending(struct kvm *kvm, 
 int cpuid,
  int edge_triggered, level_triggered;
  int enabled;
  bool ret = true;
 +bool is_forwarded;
  
  spin_lock(dist-lock);
  
  vcpu = kvm_get_vcpu(kvm, cpuid);
 +is_forwarded = (vgic_get_phys_irq(vcpu, irq_num)  0);
 
 use your new function here as well.
ok
 
 +
  edge_triggered = vgic_irq_is_edge(vcpu, irq_num);
  level_triggered = !edge_triggered;
  
 -if (!vgic_validate_injection(vcpu, irq_num, level)) {
 +if (!is_forwarded 
 +!vgic_validate_injection(vcpu, irq_num, level)) {
 
 I don't see the rationale here either.  If an IRQ is forwarded, why do
 you need to do anything if the condition of the line hasn't changed for
 a level-triggered IRQ or if you have a falling edge on an edge-triggered
 IRQ (assuming active-HIGH)?
To me this even cannot cannot happen. a second fwd irq can only hit if
the same virtual IRQ was completed and completed the corresponding phys
IRQ. Still the problem is that on the 1st injection we updated the VGIC
state. I aknowledge this is a hack to work around the 1st injection
update the state and nothing reset them. So on subsequent injections, -
and even on the 1st one-  I never check the state.
 
  ret = false;
  goto out;
  }
 @@ -1557,7 +1563,8 @@ static bool vgic_update_irq_pending(struct kvm *kvm, 
 int cpuid,
  goto out;
  }
  
 -if (level_triggered  vgic_irq_is_queued(vcpu, irq_num)) {
 +if (!is_forwarded 
 +level_triggered  vgic_irq_is_queued(vcpu, irq_num)) {
 
 So here it's making sense for SPIs since you can have an EOIed interrupt
 on a CPU that didn't exit the VM yet, and this it's still queued, but
 you still need to resample the line to respect other CPUs.  Only, we
 ever only target a single CPU for SPIs IIRC (the first in the target
 list register) so we have to wait for that CPU to to exit the VM anyhow.
 
 This leads me to believe that, given a fowarded irq, you can only have
 XXX situations at this point:
 
 (1) is_queued  target_vcpu_in_vm:
 The vcpu should resample this line when it exits the VM, because we
 check the LRs for IRQs like this one, so we don't have to do anything
 and go to out here.
 
 (2) is_queued  !target_vcpu_in_vm::
 You have a bug because you exited the VM which must

Re: [PATCH] KVM: PPC: Convert openpic lock to raw_spinlock

2014-09-11 Thread Scott Wood

On Thu, 2014-09-11 at 15:25 -0400, Bogdan Purcareata wrote:
 This patch enables running intensive I/O workloads, e.g. netperf, in a guest
 deployed on a RT host. No change for !RT kernels.
 
 The openpic spinlock becomes a sleeping mutex on a RT system. This no longer
 guarantees that EPR is atomic with exception delivery. The guest VCPU thread
 fails due to a BUG_ON(preemptible()) when running netperf.
 
 In order to make the kvmppc_mpic_set_epr() call safe on RT from non-atomic
 context, convert the openpic lock to a raw_spinlock. A similar approach can
 be seen for x86 platforms in the following commit [1].
 
 Here are some comparative cyclitest measurements run inside a high priority RT
 guest run on a RT host. The guest has 1 VCPU and the test has been run for 15
 minutes. The guest runs ~750 hackbench processes as background stress.

Does hackbench involve triggering interrupts that would go through the
MPIC?  You may want to try an I/O-heavy benchmark to stress the MPIC
code (the more interrupt sources are active at once, the better).
Also try a guest with many vcpus.

-Scott


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Recommended Kernel and KVM version

2014-09-11 Thread Flávio Ramalho

Hi,

I am running an OpenStack infrastructure and some compute nodes are
frequently having kernel panic, as far as I see, the kernel panics are
related with KVM.

Do you guys have any recommendation about the kernel and KVM version
to be used in
an production environment?

Thanks,

Flávio
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 12:14:10PM -0600, Alex Williamson wrote:
 On Thu, 2014-09-11 at 19:10 +0200, Christoffer Dall wrote:
  On Wed, Sep 10, 2014 at 11:05:49PM -0600, Alex Williamson wrote:
   On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
  
  [...]
  
  
 +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
 +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,

what's the 'p' in pfwd?
   
   p is for pointer?
   
  
  shouldn't the type declation spell out quite clearly to me that I'm
  dealing with a pointer?
 
 Sure.  In the cases where I've done similar things it's more a matter of
 not needing to come up with another variable, for instance if I need
 both a struct and a struct* I might call them foo and pfoo if I can't
 come up with anything more meaningful.
 
 
  [...]
  

need some spaceing here, also, I would turn this around, first check if
the strcmp fails, and then error out, then do you next check etc., to
avoid so many nested statements.

 + /* is a ref to this device already owned by the KVM-VFIO 
 device? */

this comment is not particularly helpful in its current form, it would
be helpful if you specified that we're checking whether that particular
device/irq combo is already registered.

 + *kvm_vdev = kvm_vfio_find_device(kv, vdev);
 + if (*kvm_vdev) {
 + if (kvm_vfio_find_irq(*kvm_vdev, fwd_irq-index)) {
 + kvm_err(%s irq %d already forwarded\n,
 + __func__, *hwirq);
   
   Why didn't we do this first?
   
  huh?
 
 The code is doing:
 
 1. can the arch forward this irq
 2. are we already forwarding this irq
 
 It's backwards, test for duplicates locally before calling out into arch
 code.  Besides, I think the arch code here should go away and just be
 another error condition for the call-out.  Thanks,
 
Ah, right, you meant for the whole check.  I agree completely.

-Christoffer
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 1/9] KVM: ARM: VGIC: fix multiple injection of level sensitive forwarded IRQ

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 08:17:49PM +0200, Eric Auger wrote:
 On 09/11/2014 05:09 AM, Christoffer Dall wrote:
  On Mon, Sep 01, 2014 at 02:52:40PM +0200, Eric Auger wrote:
  Fix multiple injection of level sensitive forwarded IRQs.
  With current code, the second injection fails since the state bitmaps
  are not reset (process_maintenance is not called anymore).
  New implementation consists in fully bypassing the vgic state
  management for forwarded IRQ (checks are ignored in
  vgic_update_irq_pending). This obviously assumes the forwarded IRQ is
  injected from kernel side.
 
  Signed-off-by: Eric Auger eric.au...@linaro.org
 
  ---
 
  It was attempted to reset the states in __kvm_vgic_sync_hwstate, checking
  the emptied LR of forwarded IRQ. However surprisingly this solution does
  not seem to work. Some times, a new forwarded IRQ injection is observed
  while the LR of the previous instance was not observed as empty.
  
  hmmm, concerning.  It would probably have been helpful overall if you
  could start by describing the problem with the current implementation in
  the commit message, and then explain the fix...
  
 
  v1 - v2:
  - fix vgic state bypass in vgic_queue_hwirq
  ---
   virt/kvm/arm/vgic.c | 13 ++---
   1 file changed, 10 insertions(+), 3 deletions(-)
 
  diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
  index 0007300..8ef495b 100644
  --- a/virt/kvm/arm/vgic.c
  +++ b/virt/kvm/arm/vgic.c
  @@ -1259,7 +1259,9 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, 
  int irq)
   
   static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
   {
  -  if (vgic_irq_is_queued(vcpu, irq))
  +  bool is_forwarded =  (vgic_get_phys_irq(vcpu, irq)  0);
  
  can you create a static function to factor this vgic_get_phys_irq check 
  out, please?
 yes sure
  
  +
  +  if (vgic_irq_is_queued(vcpu, irq)  !is_forwarded)
 return true; /* level interrupt, already queued */
  
  so essentially if an IRQ is already on a LR so we shouldn't resample the
  line, then we still resample the line if the IRQ is forwarded?
  
  I think you need to explain this, both to me here, and also in the code
  by moving the comment following the return statement above the check and
  comment this clearly.
 Well, I admit it may look a bit pushy! When we discussed this issue with
 Marc, the outcome was that the vgic states were not accurate with
 forwarded IRQs and VGIC state may be fully bypassed.

Can you explain this in more details?  Perhaps with a concrete example?

 Since the first
 injection still sets the state - and I did not want to modify this - the
 2d one would fail due to that check, and the validate_injection. May be
 cleaner to not update the states when injecting the fwd irq too.

Hmmm, I don't think I understand you here.  I think you need to think
about the whole flow of things here and understand any posible sequence
of events combined with any potential state you may have.  Perhaps this
is better deferred to a face-to-face discussion.

 
  
   
 if (vgic_queue_irq(vcpu, 0, irq)) {
  @@ -1517,14 +1519,18 @@ static bool vgic_update_irq_pending(struct kvm 
  *kvm, int cpuid,
 int edge_triggered, level_triggered;
 int enabled;
 bool ret = true;
  +  bool is_forwarded;
   
 spin_lock(dist-lock);
   
 vcpu = kvm_get_vcpu(kvm, cpuid);
  +  is_forwarded = (vgic_get_phys_irq(vcpu, irq_num)  0);
  
  use your new function here as well.
 ok
  
  +
 edge_triggered = vgic_irq_is_edge(vcpu, irq_num);
 level_triggered = !edge_triggered;
   
  -  if (!vgic_validate_injection(vcpu, irq_num, level)) {
  +  if (!is_forwarded 
  +  !vgic_validate_injection(vcpu, irq_num, level)) {
  
  I don't see the rationale here either.  If an IRQ is forwarded, why do
  you need to do anything if the condition of the line hasn't changed for
  a level-triggered IRQ or if you have a falling edge on an edge-triggered
  IRQ (assuming active-HIGH)?
 To me this even cannot cannot happen. a second fwd irq can only hit if
 the same virtual IRQ was completed and completed the corresponding phys
 IRQ. Still the problem is that on the 1st injection we updated the VGIC
 state. 

Updated teh VGIC state?  Be more specific!

 I aknowledge this is a hack to work around the 1st injection
 update the state and nothing reset them. So on subsequent injections, -
 and even on the 1st one-  I never check the state.

Is the case here that you propogate the line state onto the vcpu pending
state when somebody calls this inject function, so you use this as a
chance to resample the line?

If so, we need to document this clearly (and you need to convince me
that this is in fact the right thing we're doing overall), and we may
have to reword and refactor some of this to not have these weird-looking
corner cases with outrageously complicated comments describing state in
what's basically becoming a non-trivial state machine.

  
 ret = false;
 goto out;
 }
  @@ -1557,7

Re: [PATCH v4 2/8] arm/arm64: KVM: vgic: switch to dynamic allocation

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 12:09:09PM +0100, Marc Zyngier wrote:
 So far, all the VGIC data structures are statically defined by the
 *maximum* number of vcpus and interrupts it supports. It means that
 we always have to oversize it to cater for the worse case.
 
 Start by changing the data structures to be dynamically sizeable,
 and allocate them at runtime.
 
 The sizes are still very static though.
 
 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 ---
  arch/arm/kvm/arm.c |   3 +
  include/kvm/arm_vgic.h |  76 
  virt/kvm/arm/vgic.c| 237 
 ++---
  3 files changed, 267 insertions(+), 49 deletions(-)
 
 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
 index a99e0cd..923a01d 100644
 --- a/arch/arm/kvm/arm.c
 +++ b/arch/arm/kvm/arm.c
 @@ -172,6 +172,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
   kvm-vcpus[i] = NULL;
   }
   }
 +
 + kvm_vgic_destroy(kvm);
  }
  
  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 @@ -253,6 +255,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
  {
   kvm_mmu_free_memory_caches(vcpu);
   kvm_timer_vcpu_terminate(vcpu);
 + kvm_vgic_vcpu_destroy(vcpu);
   kmem_cache_free(kvm_vcpu_cache, vcpu);
  }
  
 diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
 index f074539..bdaac57 100644
 --- a/include/kvm/arm_vgic.h
 +++ b/include/kvm/arm_vgic.h
 @@ -54,19 +54,33 @@
   * - a bunch of shared interrupts (SPI)
   */
  struct vgic_bitmap {
 - union {
 - u32 reg[VGIC_NR_PRIVATE_IRQS / 32];
 - DECLARE_BITMAP(reg_ul, VGIC_NR_PRIVATE_IRQS);
 - } percpu[VGIC_MAX_CPUS];
 - union {
 - u32 reg[VGIC_NR_SHARED_IRQS / 32];
 - DECLARE_BITMAP(reg_ul, VGIC_NR_SHARED_IRQS);
 - } shared;
 + /*
 +  * - One UL per VCPU for private interrupts (assumes UL is at
 +  *   least 32 bits)
 +  * - As many UL as necessary for shared interrupts.
 +  *
 +  * The private interrupts are accessed via the private
 +  * field, one UL per vcpu (the state for vcpu n is in
 +  * private[n]). The shared interrupts are accessed via the
 +  * shared pointer (IRQn state is at bit n-32 in the bitmap).
 +  */
 + unsigned long *private;
 + unsigned long *shared;

the comment above the define for REG_OFFSET_SWIZZLE still talks about
the unions in struct vgic_bitmap, which is no longer true.  Mind
updating that comment?

  };
  
  struct vgic_bytemap {
 - u32 percpu[VGIC_MAX_CPUS][VGIC_NR_PRIVATE_IRQS / 4];
 - u32 shared[VGIC_NR_SHARED_IRQS  / 4];
 + /*
 +  * - 8 u32 per VCPU for private interrupts
 +  * - As many u32 as necessary for shared interrupts.
 +  *
 +  * The private interrupts are accessed via the private
 +  * field, (the state for vcpu n is in private[n*8] to
 +  * private[n*8 + 7]). The shared interrupts are accessed via
 +  * the shared pointer (IRQn state is at byte (n-32)%4 of the
 +  * shared[(n-32)/4] word).
 +  */
 + u32 *private;
 + u32 *shared;
  };
  
  struct kvm_vcpu;
 @@ -127,6 +141,9 @@ struct vgic_dist {
   boolin_kernel;
   boolready;
  
 + int nr_cpus;
 + int nr_irqs;
 +
   /* Virtual control interface mapping */
   void __iomem*vctrl_base;
  
 @@ -166,15 +183,36 @@ struct vgic_dist {
   /* Level/edge triggered */
   struct vgic_bitmap  irq_cfg;
  
 - /* Source CPU per SGI and target CPU */
 - u8  irq_sgi_sources[VGIC_MAX_CPUS][VGIC_NR_SGIS];
 + /*
 +  * Source CPU per SGI and target CPU:
 +  *
 +  * Each byte represent a SGI observable on a VCPU, each bit of
 +  * this byte indicating if the corresponding VCPU has
 +  * generated this interrupt. This is a GICv2 feature only.
 +  *
 +  * For VCPUn (n  8), irq_sgi_sources[n*16] to [n*16 + 15] are
 +  * the SGIs observable on VCPUn.
 +  */
 + u8  *irq_sgi_sources;
  
 - /* Target CPU for each IRQ */
 - u8  irq_spi_cpu[VGIC_NR_SHARED_IRQS];
 - struct vgic_bitmap  irq_spi_target[VGIC_MAX_CPUS];
 + /*
 +  * Target CPU for each SPI:
 +  *
 +  * Array of available SPI, each byte indicating the target
 +  * VCPU for SPI. IRQn (n =32) is at irq_spi_cpu[n-32].
 +  */
 + u8  *irq_spi_cpu;
 +
 + /*
 +  * Reverse lookup of irq_spi_cpu for faster compute pending:
 +  *
 +  * Array of bitmaps, one per VCPU, describing is IRQn is

ah, describing *if* ?

 +  * routed to a particular VCPU.
 +  */
 + struct vgic_bitmap  *irq_spi_target;
  
   /* Bitmap indicating which CPU has something pending */
 - unsigned long   irq_pending_on_cpu;
 + unsigned long   *irq_pending_on_cpu;
  #endif
  };
  
 @@

Re: [PATCH v4 6/8] arm/arm64: KVM: vgic: kill VGIC_NR_IRQS

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 12:09:13PM +0100, Marc Zyngier wrote:
 Nuke VGIC_NR_IRQS entierly, now that the distributor instance
 contains the number of IRQ allocated to this GIC.
 
 Also add VGIC_NR_IRQS_LEGACY to preserve the current API.
 
 Signed-off-by: Marc Zyngier marc.zyng...@arm.com

Did anything dramtically change in this patch since last time around?

If not, I'll re-affirm my tag:

Reviewed-by: Christoffer Dall christoffer.d...@linaro.org

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4 8/8] arm/arm64: KVM: vgic: make number of irqs a configurable attribute

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 12:09:15PM +0100, Marc Zyngier wrote:
 In order to make the number of interrupts configurable, use the new
 fancy device management API to add KVM_DEV_ARM_VGIC_GRP_NR_IRQS as
 a VGIC configurable attribute.
 
 Userspace can now specify the exact size of the GIC (by increments
 of 32 interrupts).
 
 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 ---
  Documentation/virtual/kvm/devices/arm-vgic.txt | 10 +++
  arch/arm/include/uapi/asm/kvm.h|  1 +
  arch/arm64/include/uapi/asm/kvm.h  |  1 +
  virt/kvm/arm/vgic.c| 37 
 ++
  4 files changed, 49 insertions(+)
 
 diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt 
 b/Documentation/virtual/kvm/devices/arm-vgic.txt
 index 7f4e91b..df8b0c7 100644
 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt
 +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
 @@ -71,3 +71,13 @@ Groups:
Errors:
  -ENODEV: Getting or setting this register is not yet supported
  -EBUSY: One or more VCPUs are running
 +
 +  KVM_DEV_ARM_VGIC_GRP_NR_IRQS
 +  Attributes:
 +A value describing the number of interrupts (SGI, PPI and SPI) for
 +this GIC instance, ranging from 64 to 1024, in increments of 32.
 +
 +  Errors:
 +-EINVAL: Value set is out of the expected range
 +-EBUSY: Value has already be set, or GIC has already been initialized
 +with default values.
 diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
 index e6ebdd3..8b51c1a 100644
 --- a/arch/arm/include/uapi/asm/kvm.h
 +++ b/arch/arm/include/uapi/asm/kvm.h
 @@ -173,6 +173,7 @@ struct kvm_arch_memory_slot {
  #define   KVM_DEV_ARM_VGIC_CPUID_MASK(0xffULL  
 KVM_DEV_ARM_VGIC_CPUID_SHIFT)
  #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT  0
  #define   KVM_DEV_ARM_VGIC_OFFSET_MASK   (0xULL  
 KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
 +#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
  
  /* KVM_IRQ_LINE irq field index values */
  #define KVM_ARM_IRQ_TYPE_SHIFT   24
 diff --git a/arch/arm64/include/uapi/asm/kvm.h 
 b/arch/arm64/include/uapi/asm/kvm.h
 index e633ff8..b5cd6ed 100644
 --- a/arch/arm64/include/uapi/asm/kvm.h
 +++ b/arch/arm64/include/uapi/asm/kvm.h
 @@ -159,6 +159,7 @@ struct kvm_arch_memory_slot {
  #define   KVM_DEV_ARM_VGIC_CPUID_MASK(0xffULL  
 KVM_DEV_ARM_VGIC_CPUID_SHIFT)
  #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT  0
  #define   KVM_DEV_ARM_VGIC_OFFSET_MASK   (0xULL  
 KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
 +#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
  
  /* KVM_IRQ_LINE irq field index values */
  #define KVM_ARM_IRQ_TYPE_SHIFT   24
 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
 index 9180823..744388d 100644
 --- a/virt/kvm/arm/vgic.c
 +++ b/virt/kvm/arm/vgic.c
 @@ -2331,6 +2331,36 @@ static int vgic_set_attr(struct kvm_device *dev, 
 struct kvm_device_attr *attr)
  
   return vgic_attr_regs_access(dev, attr, reg, true);
   }
 + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
 + u32 __user *uaddr = (u32 __user *)(long)attr-addr;
 + u32 val;
 + int ret = 0;
 +
 + if (get_user(val, uaddr))
 + return -EFAULT;
 +
 + /*
 +  * We require:
 +  * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
 +  * - at most 1024 interrupts
 +  * - a multiple of 32 interrupts
 +  */
 + if (val  (VGIC_NR_PRIVATE_IRQS + 32) ||
 + val  VGIC_MAX_IRQS ||
 + (val  31))
 + return -EINVAL;
 +
 + mutex_lock(dev-kvm-lock);
 +
 + if (vgic_initialized(dev-kvm) || dev-kvm-arch.vgic.nr_irqs)
 + ret = -EBUSY;
 + else
 + dev-kvm-arch.vgic.nr_irqs = val;
 +
 + mutex_unlock(dev-kvm-lock);
 +
 + return ret;
 + }
  
   }
  
 @@ -2367,6 +2397,11 @@ static int vgic_get_attr(struct kvm_device *dev, 
 struct kvm_device_attr *attr)
   r = put_user(reg, uaddr);
   break;
   }
 + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
 + u32 __user *uaddr = (u32 __user *)(long)attr-addr;
 + r = put_user(dev-kvm-arch.vgic.nr_irqs, uaddr);
 + break;
 + }
  
   }
  
 @@ -2403,6 +2438,8 @@ static int vgic_has_attr(struct kvm_device *dev, struct 
 kvm_device_attr *attr)
   case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
   offset = attr-attr  KVM_DEV_ARM_VGIC_OFFSET_MASK;
   return vgic_has_attr_regs(vgic_cpu_ranges, offset);
 + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
 + return 0;
   }
   return -ENXIO;
  }
 -- 
 2.0.4
 

Reviewed-by: Christoffer Dall christoffer.d...@linaro.org
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at

Re: [PATCH 1/2] KVM: document KVM_SET_GUEST_DEBUG api

2014-09-11 Thread Christoffer Dall

On Tue, Sep 09, 2014 at 05:27:18PM +0100, Alex Bennée wrote:
 In preparation for working on the ARM implementation I noticed the debug
 interface was missing from the API document. I've pieced together the
 expected behaviour from the code and commit messages written it up as
 best I can.
 
 Signed-off-by: Alex Bennée alex.ben...@linaro.org
 
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index d3dde61..723d3f3 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -2575,6 +2575,50 @@ associated with the service will be forgotten, and 
 subsequent RTAS
  calls by the guest for that service will be passed to userspace to be
  handled.
  
 +4.87 KVM_SET_GUEST_DEBUG
 +
 +Capability: KVM_CAP_SET_GUEST_DEBUG
 +Architectures: x86, s390, ppc
 +Type: vcpu ioctl
 +Parameters: struct kvm_guest_debug (in)
 +Returns: 0 on success; -1 on error

Any specific error codes that need explaining here?

 +
 +struct kvm_guest_debug {
 +   __u32 control;
 +   __u32 pad;
 +   struct kvm_guest_debug_arch arch;
 +};
 +
 +Set up the processor specific debug registers and configure vcpu for

configure *the* vcpu?

 +handling guest debug events. There are two parts to the structure, the

handling guest debug events does this mean whatever user space needs to
configure so that the guest can deug things or is this for userspace to
debug the guest execution, we could probably be more specific.

 +first a control bitfield indicates the type of debug events to handle
 +when running. Common control bits are:

'when running'? 'when running the vcpu'?

 +
 +  - KVM_GUESTDBG_ENABLE:guest debugging is enabled
 +  - KVM_GUESTDBG_SINGLESTEP:the next run should single-step
 +
 +The top 16 bits of the control field are architecture specific control
 +flags which can include the following:
 +
 +  - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86]
 +  - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390]
 +  - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86]
 +  - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86]
 +  - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
 +
 +For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints
 +are enabled in memory so we need to ensure breakpoint exceptions are
 +correctly trapped and the KVM run loop exits at the breakpoint and not
 +running off into the normal guest vector. For KVM_GUESTDBG_USE_HW_BP

I didn't quite understand this bit, can you clarify slightly?  For
example, I don't know what it means that a software breakpoint is
'enabled in memory' and I'm not quite sure what the goal you are arguing
for here is; is this about if this ioctl is used to set a specific
breakpoint then we want to make sure that the breakpoint exception goes
to KVM and not to the guest?

also, are you not missing a 'that' before 'the KVM run loop...'

 +we need to ensure the guest vCPUs architecture specific registers are

please be consistent about the use of vcpu, VCPUs, vCPUs, etc.  The
document seems to prefer lowercase vcpus most places.

 +updated to the correct (supplied) values.
 +
 +The second part of the structure is architecture specific and
 +typically contains a set of debug registers.
 +
 +When debug events exit the main run loop with the reason

I think you should just talk about then the VCPU exits the guest and not
be specific about whether we have a main run loop or not.

 +KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
 +structure containing architecture specific debug information.

I feel like this sentence leaves me hanging; When they exit with the
debug information, then what?  Or did you mean to say that when they
exit, then the arch struct is filled with the debug info?

  
  5. The kvm_run structure
  
 -- 
 2.1.0
 

Thanks for taking care of this!
-Christoffer
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 0/2] KVM API documentation patches

2014-09-11 Thread Christoffer Dall

On Wed, Sep 10, 2014 at 11:34:53AM +0200, Paolo Bonzini wrote:
 Il 09/09/2014 18:27, Alex Bennée ha scritto:
  Hi,
  
  I'm preparing to add ARM KVM GDB support and I went to read the API
  documentation and found it surprisingly mute on the subject ;-)
  
  The first patch documents the new KVM_SET_GUEST_DEBUG ioctl based on
  reviewing the code. I've included a long CC list of people who've
  actually done the various implementations who I hope can sanity check
  the write-up. The second is a trivial formatting fix for what looks
  like a minor merge trip-up.
  
  Alex Bennée (2):
KVM: document KVM_SET_GUEST_DEBUG api
KVM: fix api documentation of KVM_GET_EMULATED_CPUID
  
   Documentation/virtual/kvm/api.txt | 184 
  +++---
   1 file changed, 114 insertions(+), 70 deletions(-)
  
 
 Thanks, applying both!
 
ha, read this after reviewing his text.  Was I too nit-picky? ;)

-Christoffer
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: arm64: add gic-400 compatible

2014-09-11 Thread Christoffer Dall

On Mon, Sep 08, 2014 at 10:21:44PM +, Joel Schopp wrote:
 Add a one liner to identify the gic-400.  It's gicv2 with optional MSI 
 extensions.
 
 Cc: Christoffer Dall christoffer.d...@linaro.org
 Signed-off-by: Joel Schopp joel.sch...@amd.com
 ---
  virt/kvm/arm/vgic.c |1 +
  1 file changed, 1 insertion(+)
 
 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
 index 73eba79..e81444e 100644
 --- a/virt/kvm/arm/vgic.c
 +++ b/virt/kvm/arm/vgic.c
 @@ -1550,6 +1550,7 @@ static struct notifier_block vgic_cpu_nb = {
  
  static const struct of_device_id vgic_ids[] = {
   { .compatible = arm,cortex-a15-gic, .data = vgic_v2_probe, },
 + { .compatible = arm,gic-400, .data = vgic_v2_probe, },
   { .compatible = arm,gic-v3, .data = vgic_v3_probe, },
   {},
  };
 
Acked-by: Christoffer Dall christoffer.d...@linaro.org
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 1/2] virtio-rng: fix stuck of hot-unplugging busy device

2014-09-11 Thread Rusty Russell

Amit Shah amit.s...@redhat.com writes:
 On (Wed) 10 Sep 2014 [14:11:36], Amos Kong wrote:
 When we try to hot-remove a busy virtio-rng device from QEMU monitor,
 the device can't be hot-removed. Because virtio-rng driver hangs at
 wait_for_completion_killable().
 
 This patch exits the waiting by completing have_data completion before
 unregistering, resets data_avail to avoid the hwrng core use wrong
 buffer bytes.
 
 Signed-off-by: Amos Kong ak...@redhat.com
 Cc: sta...@vger.kernel.org

 Reviewed-by: Amit Shah amit.s...@redhat.com

Thanks, applied.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread tangchen


Hi Gleb, Paolo,

On 09/11/2014 10:47 PM, Gleb Natapov wrote:

On Thu, Sep 11, 2014 at 04:37:39PM +0200, Paolo Bonzini wrote:

Il 11/09/2014 16:31, Gleb Natapov ha scritto:

What if the page being swapped out is L1's APIC access page?  We don't
run prepare_vmcs12 in that case because it's an L2-L0-L2 entry, so we
need to do something.

We will do something on L2-L1 exit. We will call kvm_reload_apic_access_page().
That is what patch 5 of this series is doing.

Sorry, I meant the APIC access page prepared by L1 for L2's execution.

You wrote:


if (!is_guest_mode() || !(vmcs12-secondary_vm_exec_control  
ECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 write(PIC_ACCESS_ADDR)

In other words if L2 shares L1 apic access page then reload, otherwise do 
nothing.

but in that case you have to redo nested_get_page, so do nothing
doesn't work.


Ah, 7/7 is new in this submission. Before that this page was still
pinned.  Looking at 7/7 now I do not see how it can work since it has no
code for mmu notifier to detect that it deals with such page and call
kvm_reload_apic_access_page().


Since L1 and L2 share one apic page, if the page is unmapped, 
mmu_notifier will

be called, and :

 - if vcpu is in L1, a L1-L0 exit is rised. apic page's pa will be 
updated in the next

   L0-L1 entry by making vcpu request.

 - if vcpu is in L2 (is_guest_mode, right?), a L2-L0 exit is rised. 
nested_vmx_vmexit()
   will not be called since it is called in L2-L1 exit. It returns 
from vmx_vcpu_run()
   directly, right ? So we should update apic page in L0-L2 entry. 
This is also done

   by making vcpu request, right ?.

   prepare_vmcs02() is called in L1-L2 entry, and nested_vmx_vmexit() 
is called in
   L2-L1 exit. So we also need to update L1's vmcs in 
nested_vmx_vmexit() in patch 5/7.


IIUC, I think patch 1~6 has done such things.

And yes, the is_guest_mode() check is not needed.


I said to Tang previously that nested
kvm has a bunch of pinned page that are hard to deal with and suggested
to iron out non nested case first :(


Yes, and maybe adding patch 7 is not a good idea for now.

Thanks.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread tangchen


Hi Paolo,

On 09/11/2014 10:24 PM, Paolo Bonzini wrote:

Il 11/09/2014 16:21, Gleb Natapov ha scritto:

As far as I can tell the if that is needed there is:

if (!is_guest_mode() || !(vmcs12-secondary_vm_exec_control  
ECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 write(PIC_ACCESS_ADDR)

In other words if L2 shares L1 apic access page then reload, otherwise do 
nothing.

What if the page being swapped out is L1's APIC access page?  We don't
run prepare_vmcs12 in that case because it's an L2-L0-L2 entry, so we
need to do something.


Are you talking about the case that L1 and L2 have different apic pages ?
I think I didn't deal with this situation in this patch set.

Sorry I didn't say it clearly. Here, I assume L1 and L2 share the same 
apic page.
If we are in L2, and the page is migrated, we updated L2's vmcs by 
making vcpu
request. And of course, we should also update L1's vmcs. This is done by 
patch 5.

We make vcpu request again in nested_vmx_exit().

Thanks.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [question] virtio-blk performance degradation happened with virito-serial

2014-09-11 Thread Zhang Haoyu

   If virtio-blk and virtio-serial share an IRQ, the guest operating 
   system has to check each virtqueue for activity. Maybe there is some 
   inefficiency doing that.
   AFAIK virtio-serial registers 64 virtqueues (on 31 ports + console) 
   even if everything is unused.
  
  That could be the case if MSI is disabled.
 
 Do the windows virtio drivers enable MSIs, in their inf file?

It depends on the version of the drivers, but it is a reasonable guess
at what differs between Linux and Windows.  Haoyu, can you give us the
output of lspci from a Linux guest?

I made a test with fio on rhel-6.5 guest, the same degradation happened too,  
this degradation can be reproduced on rhel6.5 guest 100%.
virtio_console module installed:
64K-write-sequence: 285 MBPS, 4380 IOPS
virtio_console module uninstalled:
64K-write-sequence: 370 MBPS, 5670 IOPS

I use top -d 1 -H -p qemu-pid to monitor the cpu usage, and found that,
virtio_console module installed:
qemu main thread cpu usage: 98%
virtio_console module uninstalled:
qemu main thread cpu usage: 60%

perf top -p qemu-pid result,
virtio_console module installed:
   PerfTop:9868 irqs/sec  kernel:76.4%  exact:  0.0% [4000Hz cycles],  
(target_pid: 88381)
--

11.80%  [kernel] [k] _raw_spin_lock_irqsave
 8.42%  [kernel] [k] _raw_spin_unlock_irqrestore
 7.33%  [kernel] [k] fget_light
 6.28%  [kernel] [k] fput
 3.61%  [kernel] [k] do_sys_poll
 3.30%  qemu-system-x86_64   [.] qcow2_check_metadata_overlap
 3.10%  [kernel] [k] __pollwait
 2.15%  qemu-system-x86_64   [.] qemu_iohandler_poll
 1.44%  libglib-2.0.so.0.3200.4  [.] g_array_append_vals
 1.36%  libc-2.13.so [.] 0x0011fc2a
 1.31%  libpthread-2.13.so   [.] pthread_mutex_lock
 1.24%  libglib-2.0.so.0.3200.4  [.] 0x0001f961
 1.20%  libpthread-2.13.so   [.] __pthread_mutex_unlock_usercnt
 0.99%  [kernel] [k] eventfd_poll
 0.98%  [vdso]   [.] 0x0771
 0.97%  [kernel] [k] remove_wait_queue
 0.96%  qemu-system-x86_64   [.] qemu_iohandler_fill
 0.95%  [kernel] [k] add_wait_queue
 0.69%  [kernel] [k] __srcu_read_lock
 0.58%  [kernel] [k] poll_freewait
 0.57%  [kernel] [k] _raw_spin_lock_irq
 0.54%  [kernel] [k] __srcu_read_unlock
 0.47%  [kernel] [k] copy_user_enhanced_fast_string
 0.46%  [kvm_intel]  [k] vmx_vcpu_run
 0.46%  [kvm][k] vcpu_enter_guest
 0.42%  [kernel] [k] tcp_poll
 0.41%  [kernel] [k] system_call_after_swapgs
 0.40%  libglib-2.0.so.0.3200.4  [.] g_slice_alloc
 0.40%  [kernel] [k] system_call
 0.38%  libpthread-2.13.so   [.] 0xe18d
 0.38%  libglib-2.0.so.0.3200.4  [.] g_slice_free1
 0.38%  qemu-system-x86_64   [.] address_space_translate_internal
 0.38%  [kernel] [k] _raw_spin_lock
 0.37%  qemu-system-x86_64   [.] phys_page_find
 0.36%  [kernel] [k] get_page_from_freelist
 0.35%  [kernel] [k] sock_poll
 0.34%  [kernel] [k] fsnotify
 0.31%  libglib-2.0.so.0.3200.4  [.] g_main_context_check
 0.30%  [kernel] [k] do_direct_IO
 0.29%  libpthread-2.13.so   [.] pthread_getspecific

virtio_console module uninstalled:
   PerfTop:9138 irqs/sec  kernel:71.7%  exact:  0.0% [4000Hz cycles],  
(target_pid: 88381)
--

 5.72%  qemu-system-x86_64   [.] qcow2_check_metadata_overlap
 4.51%  [kernel] [k] fget_light
 3.98%  [kernel] [k] _raw_spin_lock_irqsave
 2.55%  [kernel] [k] fput
 2.48%  libpthread-2.13.so   [.] pthread_mutex_lock
 2.46%  [kernel] [k] _raw_spin_unlock_irqrestore
 2.21%  libpthread-2.13.so   [.] __pthread_mutex_unlock_usercnt
 1.71%  [vdso]   [.] 0x060c
 1.68%  libc-2.13.so [.] 0x000e751f
 1.64%  libglib-2.0.so.0.3200.4  [.] 0x0004fca0
 1.20%  [kernel] [k] __srcu_read_lock
 1.14%  [kernel] [k] do_sys_poll
 0.96%  [kernel] [k] _raw_spin_lock_irq
 0.95%  [kernel] [k] __pollwait
 0.91%  [kernel] [k] __srcu_read_unlock
 0.78%  [kernel] [k] tcp_poll
 0.74%  [kvm][k]

[PATCH] KVM: Refactor making request to makes it meaningful

2014-09-11 Thread guohliu

This patch replace the set_bit method by kvm_make_request
to makes it more readable and consistency.

Signed-off-by: Guo Hui Liu liuguo...@gmail.com
---
 arch/x86/kvm/x86.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 916e895..5fed2de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1518,7 +1518,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
pvclock_update_vm_gtod_copy(kvm);
 
kvm_for_each_vcpu(i, vcpu, kvm)
-   set_bit(KVM_REQ_CLOCK_UPDATE, vcpu-requests);
+   kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
/* guest entries allowed */
kvm_for_each_vcpu(i, vcpu, kvm)
@@ -1661,7 +1661,7 @@ static void kvmclock_update_fn(struct work_struct *work)
struct kvm_vcpu *vcpu;
 
kvm_for_each_vcpu(i, vcpu, kvm) {
-   set_bit(KVM_REQ_CLOCK_UPDATE, vcpu-requests);
+   kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
kvm_vcpu_kick(vcpu);
}
 }
@@ -1670,7 +1670,7 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
 {
struct kvm *kvm = v-kvm;
 
-   set_bit(KVM_REQ_CLOCK_UPDATE, v-requests);
+   kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
schedule_delayed_work(kvm-arch.kvmclock_update_work,
KVMCLOCK_UPDATE_DELAY);
 }
@@ -2846,7 +2846,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (unlikely(vcpu-arch.tsc_offset_adjustment)) {
adjust_tsc_offset_host(vcpu, vcpu-arch.tsc_offset_adjustment);
vcpu-arch.tsc_offset_adjustment = 0;
-   set_bit(KVM_REQ_CLOCK_UPDATE, vcpu-requests);
+   kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
 
if (unlikely(vcpu-cpu != cpu) || check_tsc_unstable()) {
@@ -5600,7 +5600,7 @@ static void pvclock_gtod_update_fn(struct work_struct 
*work)
spin_lock(kvm_lock);
list_for_each_entry(kvm, vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
-   set_bit(KVM_REQ_MASTERCLOCK_UPDATE, vcpu-requests);
+   kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
atomic_set(kvm_guest_has_master_clock, 0);
spin_unlock(kvm_lock);
 }
@@ -6978,7 +6978,7 @@ int kvm_arch_hardware_enable(void)
list_for_each_entry(kvm, vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!stable  vcpu-cpu == smp_processor_id())
-   set_bit(KVM_REQ_CLOCK_UPDATE, vcpu-requests);
+   kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (stable  vcpu-arch.last_host_tsc  local_tsc) {
backwards_tsc = true;
if (vcpu-arch.last_host_tsc  max_tsc)
@@ -7032,8 +7032,7 @@ int kvm_arch_hardware_enable(void)
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu-arch.tsc_offset_adjustment += delta_cyc;
vcpu-arch.last_host_tsc = local_tsc;
-   set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
-   vcpu-requests);
+   kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, 
vcpu);
}
 
/*
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

1 2 >

1 - 100 of 102 matches

Mail list logo