Re: [PATCH 1/5] KVM: vmx: fix ept reserved bits for 1-GByte page

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 04:17, Wanpeng Li ha scritto:
  -if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
  +if (level == 1 || ((level == 3 || level == 2)
  + && (spte & (1ULL << 7)))) {
 
 This condition can be simplified by checking the return value of 
 ept_rsvd_mask.
 If it includes 0x38, this is a large page.

Oops, a not was missing. If it includes 0x38, this is _not_ a large
page (it is a page directory / page directory pointer / PML4).

 Otherwise it is a leaf page and
 you can go down the if.
 As you know, bits 5:3, which are used for the EPT memory type, are not 
 reserved bits, so I fail to understand why we should check the return value 
 of ept_rsvd_mask and why it is a large page if it includes 0x38. Could you 
 explain in more detail? ;-)

A non-leaf page will always have 0x38 in the ept_rsvd_mask.  A leaf page
will never have 0x38 in the ept_rsvd_mask.

Paolo


[PATCH v4] KVM: nVMX: nested TPR shadow/threshold emulation

2014-08-19 Thread Wanpeng Li
This patch fixes bug https://bugzilla.kernel.org/show_bug.cgi?id=61411

The TPR shadow/threshold feature is important for speeding up Windows guests.
Besides, it is a required feature for certain VMMs.

We map the virtual APIC page address and TPR threshold from the L1 VMCS. If
a TPR_BELOW_THRESHOLD VM exit is triggered by the L2 guest and L1 is
interested in it, we inject it into the L1 VMM for handling.

Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Wanpeng Li wanpeng...@linux.intel.com
---
v3 -> v4:
 * add Paolo's Reviewed-by
 * unconditionally fail the vmentry, with a comment  
 * setup the TPR_SHADOW/virtual_apic_page of vmcs02 based on vmcs01 if L2 owns 
the APIC
v2 -> v3:
 * nested vm entry failure if both tpr shadow and cr8 exiting bits are not set
v1 -> v2:
 * don't take L0's virtualize APIC accesses setting into account
 * virtual_apic_page do exactly the same thing that is done for apic_access_page
 * add the tpr threshold field to the read-write fields for shadow VMCS

 arch/x86/kvm/vmx.c | 49 +++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bfe11cf..c8d8e9a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -379,6 +379,7 @@ struct nested_vmx {
 * we must keep them pinned while L2 runs.
 */
struct page *apic_access_page;
+   struct page *virtual_apic_page;
u64 msr_ia32_feature_control;
 
struct hrtimer preemption_timer;
@@ -533,6 +534,7 @@ static int max_shadow_read_only_fields =
ARRAY_SIZE(shadow_read_only_fields);
 
 static unsigned long shadow_read_write_fields[] = {
+   TPR_THRESHOLD,
GUEST_RIP,
GUEST_RSP,
GUEST_CR0,
@@ -2330,7 +2332,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
-   CPU_BASED_PAUSE_EXITING |
+   CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
/*
 * We can allow some features even when not supported by the
@@ -6148,6 +6150,10 @@ static void free_nested(struct vcpu_vmx *vmx)
	nested_release_page(vmx->nested.apic_access_page);
	vmx->nested.apic_access_page = 0;
	}
+	if (vmx->nested.virtual_apic_page) {
+		nested_release_page(vmx->nested.virtual_apic_page);
+		vmx->nested.virtual_apic_page = 0;
+	}
 
nested_free_all_saved_vmcss(vmx);
 }
@@ -6936,7 +6942,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_MCE_DURING_VMENTRY:
return 0;
case EXIT_REASON_TPR_BELOW_THRESHOLD:
-   return 1;
+   return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
case EXIT_REASON_APIC_ACCESS:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
@@ -7057,6 +7063,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
+   struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	if (is_guest_mode(vcpu) &&
+	    nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+		return;
+
	if (irr == -1 || tpr < irr) {
vmcs_write32(TPR_THRESHOLD, 0);
return;
@@ -8024,6 +8036,35 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct 
vmcs12 *vmcs12)
	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
	exec_control &= ~CPU_BASED_TPR_SHADOW;
	exec_control |= vmcs12->cpu_based_vm_exec_control;
+
+	if (exec_control & CPU_BASED_TPR_SHADOW) {
+		if (vmx->nested.virtual_apic_page)
+			nested_release_page(vmx->nested.virtual_apic_page);
+		vmx->nested.virtual_apic_page =
+			nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
+		if (!vmx->nested.virtual_apic_page)
+			exec_control &=
+				~CPU_BASED_TPR_SHADOW;
+		else
+			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+				page_to_phys(vmx->nested.virtual_apic_page));
+
+   /*
+* If CR8 load exits are enabled, CR8 store exits are enabled,
+* and virtualize APIC access is disabled, the processor would
+* never notice. Doing it unconditionally is not correct, but
+* it is the simplest thing.
+*/
+	if (!(exec_control & CPU_BASED_TPR_SHADOW) &&
+	    !((exec_control & CPU_BASED_CR8_LOAD_EXITING) &&
+	      (exec_control & CPU_BASED_CR8_STORE_EXITING)))
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

Re: [PATCH 1/2] KVM: fix cache stale memslot info with correct mmio generation number

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 05:50, Xiao Guangrong ha scritto:
 
 Note in the step *, my approach detects the invalid generation-number which
 will invalidate the mmio spte properly .

You are right, in fact my mail included another part: Another 
alternative could be to use the low bit to mark an in-progress change, 
*and skip the caching if the low bit is set*.

I think if you always treat the low bit as zero in mmio sptes, you can 
do that without losing a bit of the generation.

Something like this (untested/uncompiled):

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 931467881da7..3a56d377c6d7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -199,16 +199,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
 /*
- * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
- * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
- * number.
+ * the low bit of the generation number is always presumed to be zero.
+ * This disables mmio caching during memslot updates.  The concept is
+ * similar to a seqcount but instead of retrying the access we just punt
+ * and ignore the cache.
+ *
+ * spte bits 3-11 are used as bits 1-9 of the generation number,
+ * the bits 52-61 are used as bits 10-19 of the generation number.
  */
-#define MMIO_SPTE_GEN_LOW_SHIFT	3
+#define MMIO_SPTE_GEN_LOW_SHIFT	2
 #define MMIO_SPTE_GEN_HIGH_SHIFT	52
 
-#define MMIO_GEN_SHIFT			19
-#define MMIO_GEN_LOW_SHIFT		9
-#define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_SHIFT			20
+#define MMIO_GEN_LOW_SHIFT		10
+#define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 2)
 #define MMIO_GEN_MASK			((1 << MMIO_GEN_SHIFT) - 1)
 #define MMIO_MAX_GEN			((1 << MMIO_GEN_SHIFT) - 1)
 
@@ -236,12 +240,7 @@ static unsigned int get_mmio_spte_generation(u64 spte)
 
 static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
 {
-   /*
-* Init kvm generation close to MMIO_MAX_GEN to easily test the
-* code of handling generation number wrap-around.
-*/
-	return (kvm_memslots(kvm)->generation +
-	      MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+	return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
 }
 
 static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a69a623938b8..c7e2800313b8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -474,6 +476,13 @@ static struct kvm *kvm_create_vm(unsigned long type)
	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
	if (!kvm->memslots)
goto out_err_no_srcu;
+
+   /*
+* Init kvm generation close to MMIO_MAX_GEN to easily test the
+* code of handling generation number wrap-around.
+*/
+	kvm->memslots->generation = -150;
+
kvm_init_memslots_id(kvm);
	if (init_srcu_struct(&kvm->srcu))
goto out_err_no_srcu;
@@ -725,6 +732,8 @@ static struct kvm_memslots *install_new_memslots(struct kvm 
*kvm,
	synchronize_srcu_expedited(&kvm->srcu);
 
kvm_arch_memslots_updated(kvm);
+	slots->generation++;
+	WARN_ON(slots->generation & 1);
 
return old_memslots;
 }

(modulo the changes to always set the generation in install_new_memslots, which
I'm eliding for clarity).

Moving the initialization to kvm_create_vm ensures that the low bit is untouched
between install_new_memslots and kvm_current_mmio_generation.


Re: [PATCH v4] KVM: nVMX: nested TPR shadow/threshold emulation

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 10:30, Wanpeng Li ha scritto:
 +	if (vmx->nested.virtual_apic_page)
 +		nested_release_page(vmx->nested.virtual_apic_page);
 +	vmx->nested.virtual_apic_page =
 +		nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
 +	if (!vmx->nested.virtual_apic_page)
 +		exec_control &=
 +			~CPU_BASED_TPR_SHADOW;
 +	else
 +		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
 +			page_to_phys(vmx->nested.virtual_apic_page));
 +
 + /*
 +  * If CR8 load exits are enabled, CR8 store exits are enabled,
 +  * and virtualize APIC access is disabled, the processor would
 +  * never notice. Doing it unconditionally is not correct, but
 +  * it is the simplest thing.
 +  */
 +	if (!(exec_control & CPU_BASED_TPR_SHADOW) &&
 +	    !((exec_control & CPU_BASED_CR8_LOAD_EXITING) &&
 +	      (exec_control & CPU_BASED_CR8_STORE_EXITING)))
 + nested_vmx_failValid(vcpu, 
 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 +

You aren't checking virtualize APIC access here, but the comment
mentions it.

As the comment says, failing the entry unconditionally could be the
simplest thing, which means moving the nested_vmx_failValid call inside
the if (!vmx->nested.virtual_apic_page).

If you want to check all of CR8_LOAD/CR8_STORE/VIRTUALIZE_APIC_ACCESS,
please mention in the comment that failing the vm entry is _not_ what
the processor does but it's basically the only possibility we have.  In
that case, I would also place the if within the if
(!vmx->nested.virtual_apic_page): it also simplifies the condition
because you don't have to check CPU_BASED_TPR_SHADOW anymore.
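A rough sketch of that restructuring (illustrative only; the exact shape and
error path are up to the v5 patch):

	if (exec_control & CPU_BASED_TPR_SHADOW) {
		if (vmx->nested.virtual_apic_page)
			nested_release_page(vmx->nested.virtual_apic_page);
		vmx->nested.virtual_apic_page =
			nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
		if (!vmx->nested.virtual_apic_page) {
			/*
			 * Failing the vm entry is _not_ what the processor
			 * does, but it is basically the only choice we have.
			 */
			nested_vmx_failValid(vcpu,
					VMXERR_ENTRY_INVALID_CONTROL_FIELD);
			/* ...and bail out of the nested entry here... */
		} else
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
				page_to_phys(vmx->nested.virtual_apic_page));
	}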

You can send v5 with these changes, and I'll apply it for 3.18.  Thanks!

Paolo


Re: [PATCH] vhost: Add polling mode

2014-08-19 Thread Razya Ladelsky
 That was just one example. There are many other possibilities.  Either
 actually make the systems load all host CPUs equally, or divide
 throughput by host CPU.
 

The polling patch adds this capability to vhost, reducing costly exit 
overhead when the vm is loaded.

In order to load the vm I ran netperf with a msg size of 256:

Without polling:  2480 Mbits/sec,  utilization: vm - 100%   vhost - 64% 
With Polling: 4160 Mbits/sec,  utilization: vm - 100%   vhost - 100% 

Therefore, throughput/cpu without polling is 15.1, and 20.8 with polling.
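For clarity, the throughput/cpu figures above appear to come from dividing
throughput by the summed vm+vhost utilization (an inference from the numbers,
not stated explicitly):

    2480 / (100 + 64)  ~= 15.1
    4160 / (100 + 100) =  20.8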

My intention was to load vhost as close as possible to 100% utilization 
without polling, in order to compare it to the polling utilization case 
(where vhost is always 100%). 
The best use case, of course, would be when the shared vhost thread work 
(TBD) is integrated and then vhost will actually be using its polling 
cycles to handle requests of multiple devices (even from multiple vms).

Thanks,
Razya




Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Christian Borntraeger
On 07/08/14 15:40, Paolo Bonzini wrote:
 Il 07/08/2014 11:59, Christian Borntraeger ha scritto:
 Paolo,

 are you willing to apply to kvm/queue?
 
 I asked a question, but anyway... not until the end of the merge window
 and my small vacation. :)
 
 Paolo
 
Absolutely, was on vacation myself :-) See my answers to the other mail.




Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Christian Borntraeger
On 07/08/14 15:39, Paolo Bonzini wrote:
 Il 05/08/2014 16:44, Christian Borntraeger ha scritto:
 We currently track the pid of the task that runs the VCPU in
 vcpu_load. Since we call vcpu_load for all kinds of ioctls on a
 CPU, this causes hiccups due to synchronize_rcu if one CPU is
 modified by another CPU or the main thread (e.g. initialization,
 reset). We track the pid only for the purpose of yielding, so
 let's update the pid only in the KVM_RUN ioctl.

 In addition, don't do a synchronize_rcu on startup (pid == 0).
 
 Speaking of QEMU, most ioctls should run from the VCPU anyway.  Which
 ioctls do you see called from elsewhere?  What speedup can you see if
 you just do the no synchronize_rcu on pid == 0 part?

I think on x86 "no synchronize_rcu on pid == 0" is the only thing that is
necessary.
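For reference, the pid == 0 part amounts to roughly the following in the
pid-tracking code (a sketch, not the exact patch):

	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
		/* The thread running this VCPU changed. */
		struct pid *oldpid = vcpu->pid;
		struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
		rcu_assign_pointer(vcpu->pid, newpid);
		/* no reader can hold a stale pid on the first publication,
		 * so the costly synchronize_rcu() can be skipped */
		if (oldpid)
			synchronize_rcu();
		put_pid(oldpid);
	}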

 
 The patch may be okay, but I'm worried that it might be hiding a bug in
 QEMU.

On s390 we call KVM_S390_INITIAL_RESET from several reset functions, e.g. 
during 
CPU creation. This is the first hiccup and the pid now points to the main 
thread.

The 2nd hiccup comes when the guest activates additional CPUs via SIGP (ipi). Here
the first ioctl in the vcpu thread will get the pid back to the vcpu thread.



 
 Paolo
 
 This speeds up guest boot time on s390 noticeably for some configs, e.g.
 HZ=100, no full state tracking, 64 guest cpus 32 host cpus.
 



Re: [PATCH 1/2] KVM: fix cache stale memslot info with correct mmio generation number

2014-08-19 Thread Xiao Guangrong
On 08/19/2014 04:28 PM, Paolo Bonzini wrote:
 Il 19/08/2014 05:50, Xiao Guangrong ha scritto:

 Note in the step *, my approach detects the invalid generation-number which
 will invalidate the mmio spte properly .
 
 You are right, in fact my mail included another part: Another 
 alternative could be to use the low bit to mark an in-progress change, 
 *and skip the caching if the low bit is set*.

Okay, what confused me is that it seemed that the single-line patch
was OK to you. :)

Now, do we really need to care about case 2? Like David said:
Sorry I didn't explain myself very well: Since we can get a single wrong
mmio exit no matter what, it has to be handled in userspace. So my point
was, it doesn't really help to fix that one very specific way that it can
happen, because it can just happen in other ways. (E.g. update memslots
occurs after is_noslot_pfn() and before mmio exit).

What's your idea?

 
 I think if you always treat the low bit as zero in mmio sptes, you can 
 do that without losing a bit of the generation.

What you did is avoid caching an invalid generation number into the spte, but
actually it is OK if we can figure it out when we check the mmio access. The
updated patch I posted should fix it; that way avoids doubly increasing the
number.

Okay, if you're interested in increasing the number doubly, there is a simpler
one:

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9314678..bf49170 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -236,6 +236,9 @@ static unsigned int get_mmio_spte_generation(u64 spte)

 static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
 {
+   /* The initialized generation number should be even. */
+	BUILD_BUG_ON((MMIO_MAX_GEN - 150) & 0x1);
+
/*
 * Init kvm generation close to MMIO_MAX_GEN to easily test the
 * code of handling generation number wrap-around.
@@ -292,6 +295,14 @@ static bool check_mmio_spte(struct kvm *kvm, u64 spte)
kvm_gen = kvm_current_mmio_generation(kvm);
spte_gen = get_mmio_spte_generation(spte);

+   /*
+	 * Aha, either we cached a generation number that was being updated,
+	 * or the generation number is currently being updated; let's do the
+	 * real check for the mmio access.
+	 */
+	if ((kvm_gen | spte_gen) & 0x1)
+   return false;
+
trace_check_mmio_spte(spte, kvm_gen, spte_gen);
return likely(kvm_gen == spte_gen);
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 33712fb..5c3f7b7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -725,7 +725,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm 
*kvm,
	update_memslots(slots, new, kvm->memslots->generation);
	rcu_assign_pointer(kvm->memslots, slots);
	synchronize_srcu_expedited(&kvm->srcu);
-
+	kvm->memslots->generation++;
kvm_arch_memslots_updated(kvm);

return old_memslots;



Re: [RFC PATCH] ARM: KVM: add irqfd support

2014-08-19 Thread Eric Auger
On 08/13/2014 04:55 PM, Christoffer Dall wrote:
 On Mon, Aug 04, 2014 at 02:08:22PM +0200, Eric Auger wrote:
 This patch enables irqfd on ARM.

 irqfd framework enables to inject a virtual IRQ into a guest upon an
 eventfd trigger. User-side uses KVM_IRQFD VM ioctl to provide KVM with
 a kvm_irqfd struct that associates a VM, an eventfd, an IRQ number
 (aka. the gsi). When an actor signals the eventfd (typically a VFIO
 platform driver), the kvm irqfd subsystem injects the provided virtual
 IRQ into the guest.

 The gsi must correspond to a shared peripheral interrupt (SPI), ie the
 GIC interrupt ID is gsi+32.
 
 Why can't we support PPIs?
Hi Christoffer,

Well, in case we want to support PPI at irqfd level, we would need to
change the semantic of the GSI value and use the same as KVM_IRQ_LINE,
to specify the target vcpu. This is obviously feasible but this also
induces changes in currently generic user parts, vfio, vhost. Is PPI
injection through irqfd a valid use case?
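For reference, the KVM_IRQ_LINE semantic mentioned above packs the target vcpu
into the irq value roughly as follows (per Documentation/virtual/kvm/api.txt;
quoted from memory, so please double-check the exact ranges):

	/*
	 * KVM_IRQ_LINE irq field on ARM -- unlike the irqfd gsi, it
	 * carries a vcpu index:
	 *
	 *   bits:  | 31 ... 24 | 23 ... 16 | 15 ... 0 |
	 *   field: | irq_type  | vcpu_index|  irq_id  |
	 *
	 * irq_type 0: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ
	 * irq_type 1: in-kernel GIC: SPI, irq_id between 32 and 1019
	 * irq_type 2: in-kernel GIC: PPI, irq_id between 16 and 31
	 *             (injected into the vcpu given by vcpu_index)
	 */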
 

 CONFIG_HAVE_KVM_EVENTFD and CONFIG_HAVE_KVM_IRQFD are turned on.
 
 This patch enables CONFIG_
 
OK

 No IRQ routing table is used thanks to Paul Mackerras' patch serie:
 IRQFD without IRQ routing, enabled for XICS
 (https://www.mail-archive.com/kvm@vger.kernel.org/msg104478.html)

 Signed-off-by: Eric Auger eric.au...@linaro.org

 ---

 This patch would deprecate the previous patch featuring GSI routing
 (https://patches.linaro.org/32261/)

 irqchip.c and irq_comm.c are not used at all.

 This RFC applies on top of Christoffer Dall's serie
 arm/arm64: KVM: Various VGIC cleanups and improvements
 https://lists.cs.columbia.edu/pipermail/kvmarm/2014-June/009979.html

 All pieces can be found on git://git.linaro.org/people/eric.auger/linux.git
 branch irqfd_integ_v4

 This work was tested with Calxeda Midway xgmac main interrupt with
 qemu-system-arm and QEMU VFIO platform device.
 ---
  Documentation/virtual/kvm/api.txt |  5 +++-
  arch/arm/include/uapi/asm/kvm.h   |  3 +++
  arch/arm/kvm/Kconfig  |  3 ++-
  arch/arm/kvm/Makefile |  2 +-
  arch/arm/kvm/irq.h| 25 ++
  virt/kvm/arm/vgic.c   | 54 
 ---
  6 files changed, 85 insertions(+), 7 deletions(-)
  create mode 100644 arch/arm/kvm/irq.h

 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index 0fe3649..04310d9 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -2132,7 +2132,7 @@ into the hash PTE second double word).
  4.75 KVM_IRQFD
  
  Capability: KVM_CAP_IRQFD
 -Architectures: x86 s390
 +Architectures: x86 s390 arm
  Type: vm ioctl
  Parameters: struct kvm_irqfd (in)
  Returns: 0 on success, -1 on error
 @@ -2158,6 +2158,9 @@ Note that closing the resamplefd is not sufficient to 
 disable the
  irqfd.  The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
  and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
  
 +On ARM/arm64 the injected interrupt must be a shared peripheral interrupt (SPI).
 +This means the programmed GIC interrupt ID is gsi+32.
 +
  4.76 KVM_PPC_ALLOCATE_HTAB
  
  Capability: KVM_CAP_PPC_ALLOC_HTAB
 diff --git a/arch/arm/include/uapi/asm/kvm.h 
 b/arch/arm/include/uapi/asm/kvm.h
 index e6ebdd3..3034c66 100644
 --- a/arch/arm/include/uapi/asm/kvm.h
 +++ b/arch/arm/include/uapi/asm/kvm.h
 @@ -194,6 +194,9 @@ struct kvm_arch_memory_slot {
  /* Highest supported SPI, from VGIC_NR_IRQS */
  #define KVM_ARM_IRQ_GIC_MAX 127
  
 +/* One single KVM irqchip, ie. the VGIC */
 +#define KVM_NR_IRQCHIPS  1
 +
  /* PSCI interface */
 #define KVM_PSCI_FN_BASE	0x95c1ba5e
  #define KVM_PSCI_FN(n)  (KVM_PSCI_FN_BASE + (n))
 diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
 index 4be5bb1..7800261 100644
 --- a/arch/arm/kvm/Kconfig
 +++ b/arch/arm/kvm/Kconfig
 @@ -24,6 +24,7 @@ config KVM
  select KVM_MMIO
  select KVM_ARM_HOST
 depends on ARM_VIRT_EXT && ARM_LPAE && !CPU_BIG_ENDIAN
 +select HAVE_KVM_EVENTFD
  ---help---
Support hosting virtualized guest machines. You will also
need to select one or more of the processor modules below.
 @@ -55,7 +56,7 @@ config KVM_ARM_MAX_VCPUS
  config KVM_ARM_VGIC
  bool KVM support for Virtual GIC
 depends on KVM_ARM_HOST && OF
 -select HAVE_KVM_IRQCHIP
 +select HAVE_KVM_IRQFD
  default y
  ---help---
Adds support for a hardware assisted, in-kernel GIC emulation.
 diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
 index 789bca9..2fa2f82 100644
 --- a/arch/arm/kvm/Makefile
 +++ b/arch/arm/kvm/Makefile
 @@ -15,7 +15,7 @@ AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt)
  AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt)
  
  KVM := ../../../virt/kvm
 -kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
 +kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o
  
  obj-y += kvm-arm.o init.o interrupts.o
  obj-y += 


[PATCH v2 1/3] KVM: vmx: fix ept reserved bits for 1-GByte page

2014-08-19 Thread Wanpeng Li
The EPT misconfig handler in kvm checks which reason led to the EPT 
misconfiguration after a vmexit. One of the reasons is that an EPT 
paging-structure entry is configured with settings reserved for 
future functionality. However, the handler can't identify whether the 
reserved bits of a paging-structure entry for a 1-GByte page are 
configured, since a PDPTE which points to a 1-GByte page reserves 
bits 29:12 instead of bits 7:3, which are reserved for a PDPTE that 
references an EPT Page Directory. This patch fixes it by reserving 
bits 29:12 for 1-GByte pages.

Signed-off-by: Wanpeng Li wanpeng...@linux.intel.com
---
v1 -> v2:
 * same if statement cover both 2MB and 1GB pages
 * return 0xf8 for level == 4
 * get the level by checking the return value of ept_rsvd_mask 

 arch/x86/kvm/vmx.c | 19 +++
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index cad37d5..2763f37 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5521,17 +5521,12 @@ static u64 ept_rsvd_mask(u64 spte, int level)
	for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
		mask |= (1ULL << i);
 
-	if (level > 2)
-		/* bits 7:3 reserved */
-		mask |= 0xf8;
-	else if (level == 2) {
-		if (spte & (1ULL << 7))
-			/* 2MB ref, bits 20:12 reserved */
-			mask |= 0x1ff000;
-		else
-			/* bits 6:3 reserved */
-			mask |= 0x78;
-	}
+	if (spte & (1ULL << 7))
+		/* 1GB/2MB page, bits 29:12 or 20:12 reserved respectively */
+		mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
+   else
+   /* bits 6:3 reserved */
+   mask |= 0x78;
 
return mask;
 }
@@ -5561,7 +5556,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu 
*vcpu, u64 spte,
WARN_ON(1);
}
 
-	if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
+	if (level == 1 || (rsvd_bits & 0x38)) {
		u64 ept_mem_type = (spte & 0x38) >> 3;
 
if (ept_mem_type == 2 || ept_mem_type == 3 ||
-- 
1.9.1



Re: [PATCH v2 1/3] KVM: vmx: fix ept reserved bits for 1-GByte page

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 11:04, Wanpeng Li ha scritto:
 The EPT misconfig handler in kvm checks which reason led to the EPT 
 misconfiguration after a vmexit. One of the reasons is that an EPT 
 paging-structure entry is configured with settings reserved for 
 future functionality. However, the handler can't identify whether the 
 reserved bits of a paging-structure entry for a 1-GByte page are 
 configured, since a PDPTE which points to a 1-GByte page reserves 
 bits 29:12 instead of bits 7:3, which are reserved for a PDPTE that 
 references an EPT Page Directory. This patch fixes it by reserving 
 bits 29:12 for 1-GByte pages.
 
 Signed-off-by: Wanpeng Li wanpeng...@linux.intel.com
 ---
 v1 -> v2:
  * same if statement cover both 2MB and 1GB pages
  * return 0xf8 for level == 4

I think you dropped this check by mistake.

  * get the level by checking the return value of ept_rsvd_mask 
 
  arch/x86/kvm/vmx.c | 19 +++
  1 file changed, 7 insertions(+), 12 deletions(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index cad37d5..2763f37 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -5521,17 +5521,12 @@ static u64 ept_rsvd_mask(u64 spte, int level)
  	for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
  		mask |= (1ULL << i);
  
 -	if (level > 2)
 - /* bits 7:3 reserved */
 - mask |= 0xf8;
 - else if (level == 2) {
 -		if (spte & (1ULL << 7))
 - /* 2MB ref, bits 20:12 reserved */
 - mask |= 0x1ff000;
 - else
 - /* bits 6:3 reserved */
 - mask |= 0x78;
 - }
 +	if (spte & (1ULL << 7))

You need to go this way if level == 1 too.  Otherwise, you would report
bits 6:3 reserved if the hypervisor is using the ignored bit 7 (Table
28-6, Format of an EPT Page-Table Entry).

 + /* 1GB/2MB page, bits 29:12 or 20:12 reserved respectively */
  +		mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
 + else
 + /* bits 6:3 reserved */
 + mask |= 0x78;
  
   return mask;
  }
 @@ -5561,7 +5556,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu 
 *vcpu, u64 spte,
   WARN_ON(1);
   }
  
 -	if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
 +	if (level == 1 || (rsvd_bits & 0x38)) {

- rsvd_bits will always be zero here.  You need to check the return
value of ept_rsvd_mask().  Let's call it rsvd_mask in the rest of this
email.

- the test is inverted, you need to check that bits 5:3 are _not_
reserved, hence (rsvd_mask & 0x38) == 0.

- once you do this, the test also covers level 1.
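Put together, the resulting check would look something like this (a sketch of
the suggestion, not the final patch):

	u64 rsvd_mask = ept_rsvd_mask(spte, level);
	...
	/* bits 5:3 are only reserved in non-leaf entries, so if they are
	 * not reserved this is a leaf entry at any level (4K, 2M or 1G
	 * page) and bits 5:3 hold the EPT memory type */
	if ((rsvd_mask & 0x38) == 0) {
		u64 ept_mem_type = (spte & 0x38) >> 3;
		...
	}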

I suggest that you write a testcase for kvm-unit-tests.

Paolo


[PATCH v2 3/3] KVM: x86: #GP when attempts to write reserved bits of Variable Range MTRRs

2014-08-19 Thread Wanpeng Li
Section 11.11.2.3 of the SDM mentions: "All other bits in the IA32_MTRR_PHYSBASEn 
and IA32_MTRR_PHYSMASKn registers are reserved; the processor generates a 
general-protection exception (#GP) if software attempts to write to them."
This patch does the same in kvm.

Signed-off-by: Wanpeng Li wanpeng...@linux.intel.com
---
 arch/x86/kvm/x86.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fb3ea7a..b85da5f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1726,6 +1726,7 @@ static bool valid_mtrr_type(unsigned t)
 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
int i;
+   u64 mask = 0;
 
if (!msr_mtrr_valid(msr))
return false;
@@ -1749,10 +1750,21 @@ static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, 
u64 data)
/* variable MTRRs */
	WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
 
-	if ((msr & 1) == 0)
+	for (i = 63; i > boot_cpu_data.x86_phys_bits; i--)
+		mask |= (1ULL << i);
+	if ((msr & 1) == 0) {
		/* MTRR base */
-		return valid_mtrr_type(data & 0xff);
-	/* MTRR mask */
+		if (!valid_mtrr_type(data & 0xff))
+			return false;
+		mask |= 0xf00;
+	} else
+		/* MTRR mask */
+		mask |= 0x7ff;
+	if (data & mask) {
+   kvm_inject_gp(vcpu, 0);
+   return false;
+   }
+
return true;
 }
 
-- 
1.9.1



[PATCH v2 2/3] KVM: x86: fix check legal type of Variable Range MTRRs

2014-08-19 Thread Wanpeng Li
The first entry in each pair (IA32_MTRR_PHYSBASEn) defines the base 
address and memory type for the range; the second entry (IA32_MTRR_PHYSMASKn)
contains a mask used to determine the address range. The legal values 
for the type field of IA32_MTRR_PHYSBASEn are 0, 1, 4, 5, and 6. However,
IA32_MTRR_PHYSMASKn doesn't have a type field. This patch avoids checking 
whether the type field is legal for IA32_MTRR_PHYSMASKn.

Signed-off-by: Wanpeng Li wanpeng...@linux.intel.com
---
v1 -> v2:
 * WARN_ON if not fall in variable Range MTRRs 
 * the base/mask can be separated just with an 

 arch/x86/kvm/x86.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5f5edb6..fb3ea7a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1747,7 +1747,13 @@ static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, 
u64 data)
}
 
/* variable MTRRs */
-	return valid_mtrr_type(data & 0xff);
+	WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
+
+	if ((msr & 1) == 0)
+		/* MTRR base */
+		return valid_mtrr_type(data & 0xff);
+   /* MTRR mask */
+   return true;
 }
 
 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-- 
1.9.1



Re: [PATCH 1/2] KVM: fix cache stale memslot info with correct mmio generation number

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 10:50, Xiao Guangrong ha scritto:
 Okay, what confused me is that it seemed that the single-line patch
 was OK to you. :)

No, it was late and I was confused. :)

 Now, do we really need to care about case 2? Like David said:
 Sorry I didn't explain myself very well: Since we can get a single wrong
 mmio exit no matter what, it has to be handled in userspace. So my point
 was, it doesn't really help to fix that one very specific way that it can
 happen, because it can just happen in other ways. (E.g. update memslots
 occurs after is_noslot_pfn() and before mmio exit).
 
 What's your idea?
 
  I think if you always treat the low bit as zero in mmio sptes, you can 
  do that without losing a bit of the generation.
 
 What you did is avoid caching an invalid generation number into the spte, but
 actually it is OK if we can figure it out when we check the mmio access. The
 updated patch I posted should fix it; that way avoids doubly increasing the
 number.

Yes.

 Okay, if you're interested in increasing the number doubly, there is a simpler
 one:

This wastes a bit in the mmio spte though.  My idea is to increase the
memslots generation twice, but drop the low bit in the mmio spte.
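Spelled out (a sketch of the idea, not a finished patch):

	/*
	 *   generation 2k (even)       stable; mmio sptes may cache 2k
	 *   update_memslots():  2k+1   odd = update in progress
	 *   rcu_assign_pointer() + synchronize_srcu_expedited()
	 *   generation++:       2k+2   even again, new stable epoch
	 *
	 * The mmio spte never stores the low bit of the generation, so
	 * while the generation is odd nothing can match it (the cache is
	 * effectively skipped), and an spte created during the window
	 * stores 2k, which no longer matches the post-update value 2k+2.
	 */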

Paolo


Re: [PATCH v2 3/3] KVM: x86: #GP when attempts to write reserved bits of Variable Range MTRRs

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 11:04, Wanpeng Li ha scritto:
 Section 11.11.2.3 of the SDM mentions: "All other bits in the IA32_MTRR_PHYSBASEn 
 and IA32_MTRR_PHYSMASKn registers are reserved; the processor generates a 
 general-protection exception (#GP) if software attempts to write to them."
 This patch does the same in kvm.
 
 Signed-off-by: Wanpeng Li wanpeng...@linux.intel.com
 ---
  arch/x86/kvm/x86.c | 18 +++---
  1 file changed, 15 insertions(+), 3 deletions(-)
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index fb3ea7a..b85da5f 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1726,6 +1726,7 @@ static bool valid_mtrr_type(unsigned t)
  static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
  {
   int i;
 + u64 mask = 0;
  
   if (!msr_mtrr_valid(msr))
   return false;
 @@ -1749,10 +1750,21 @@ static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 
 msr, u64 data)
   /* variable MTRRs */
  	WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
  
 -	if ((msr & 1) == 0)
 +	for (i = 63; i > boot_cpu_data.x86_phys_bits; i--)
 +		mask |= (1ULL << i);
 +	if ((msr & 1) == 0) {
  		/* MTRR base */
 -		return valid_mtrr_type(data & 0xff);
 -	/* MTRR mask */
 +		if (!valid_mtrr_type(data & 0xff))
 +			return false;
 +		mask |= 0xf00;
 +	} else
 +		/* MTRR mask */
 +		mask |= 0x7ff;
 +	if (data & mask) {
 + kvm_inject_gp(vcpu, 0);
 + return false;
 + }
 +
   return true;
  }
  
 

Thanks, these two patches look good.  Please write a testcase for
kvm-unit-tests (x86/msr.c), too.

Paolo


Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 10:38, Christian Borntraeger ha scritto:
  The patch may be okay, but I'm worried that it might be hiding a bug in
  QEMU.
 On s390 we call KVM_S390_INITIAL_RESET from several reset functions, e.g. 
 during 
 CPU creation. This is the first hiccup and the pid now points to the main 
 thread.

Any reason to have a special ioctl instead of SET_REGS/SET_ONE_REG/...
(via kvm_cpu_synchronize_state, which does the ioctls in the VCPU thread)?

Paolo


Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Christian Borntraeger
On 19/08/14 11:27, Paolo Bonzini wrote:
 Il 19/08/2014 10:38, Christian Borntraeger ha scritto:
 The patch may be okay, but I'm worried that it might be hiding a bug in
 QEMU.
 On s390 we call KVM_S390_INITIAL_RESET from several reset functions, e.g. 
 during 
 CPU creation. This is the first hiccup and the pid now points to the main 
 thread.
 
 Any reason to have a special ioctl instead of SET_REGS/SET_ONE_REG/...
 (via kvm_cpu_synchronize_state, which does the ioctls in the VCPU thread)?

Historical reasons mostly. Older kernels miss several interfaces to bring the 
CPU into a defined state (pending interrupts, cpu state, some registers...)

Good news is that we are working on getting rid of it: cpu states are now 
available as far as I can see, only local interrupt flushing is missing. This 
needs some more work on our side.  So in some months we probably will have a 
QEMU version that does not need to call this any more. For today's QEMU this 
patch helps though.

Christian



Re: [PATCH kvm-unit-tests 1/2] x86: Use host CPU parameter for apic test

2014-08-19 Thread Paolo Bonzini
Il 18/08/2014 21:43, Nadav Amit ha scritto:
 Currently, the TSC deadline test never runs, since TSC deadline is disabled
 unless the host cpu parameter is used. This patch changes the apic test to use
 the qemu host cpu parameter.

Better use
 Signed-off-by: Nadav Amit na...@cs.technion.ac.il
 ---
  x86/unittests.cfg | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/x86/unittests.cfg b/x86/unittests.cfg
 index 6d3e23a..f692b2b 100644
 --- a/x86/unittests.cfg
 +++ b/x86/unittests.cfg
 @@ -9,7 +9,7 @@
  [apic]
  file = apic.flat
  smp = 2
 -extra_params = -cpu qemu64,+x2apic
 +extra_params = -cpu host,+x2apic
  arch = x86_64
  
  [smptest]
 

Eduardo, I think we should add tsc_deadline to QEMU instead?
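If we do that, the test could also keep a fixed CPU model and just turn the
feature on explicitly, e.g. something like the following in unittests.cfg
(assuming QEMU's feature flag is spelled tsc-deadline):

   extra_params = -cpu qemu64,+x2apic,+tsc-deadline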

Paolo


Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 11:47, Christian Borntraeger ha scritto:
 On 19/08/14 11:27, Paolo Bonzini wrote:
 Il 19/08/2014 10:38, Christian Borntraeger ha scritto:
 The patch may be okay, but I'm worried that it might be
 hiding a bug in QEMU.
 On s390 we call KVM_S390_INITIAL_RESET from several reset
 functions, e.g. during CPU creation. This is the first hickup and
 the pid now points to the main thread.
 
 Any reason to have a special ioctl instead of
 SET_REGS/SET_ONE_REG/... (via kvm_cpu_synchronize_state, which does
 the ioctls in the VCPU thread)?
 
 Historical reasons mostly. Older kernels miss several interfaces to
 bring the CPU into a defined state (pending interrupts, cpu state, some
 registers...)
 
 Good news is that we are working on getting rid of it: cpu states are
 now available as far as I can see, only local interrupt flushing is
 missing. This needs some more work on our side.  So in some months we
 probably will have a QEMU version that does not need to call this any
 more. For today's QEMU this patch helps though.

Just by the sound of it, interrupt flushing seems dangerous to do in a
way that could be concurrent with KVM_RUN...

Paolo


Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Christian Borntraeger
On 19/08/14 11:53, Paolo Bonzini wrote:
 Il 19/08/2014 11:47, Christian Borntraeger ha scritto:
 On 19/08/14 11:27, Paolo Bonzini wrote:
 Il 19/08/2014 10:38, Christian Borntraeger ha scritto:
 The patch may be okay, but I'm worried that it might be
 hiding a bug in QEMU.
 On s390 we call KVM_S390_INITIAL_RESET from several reset
 functions, e.g. during CPU creation. This is the first hiccup and
 the pid now points to the main thread.

 Any reason to have a special ioctl instead of
 SET_REGS/SET_ONE_REG/... (via kvm_cpu_synchronize_state, which does
 the ioctls in the VCPU thread)?

 Historical reasons mostly. Older kernels miss several interfaces to
 bring the CPU into a defined state (pending interrupts, cpu state, some
 registers...)

 Good news is that we are working on getting rid of it: cpu states are
 now available as far as I can see, only local interrupt flushing is
 missing. This needs some more work on our side.  So in some months we
 probably will have a QEMU version that does not need to call this any
 more. For today's QEMU this patch helps though.
 
 Just by the sound of it, interrupt flushing seems dangerous to do in a
 way that could be concurrent with KVM_RUN...

It's only for the interrupts that are cpu local (like pending IPIs). In 
addition, we would do that only for the reset case (with an interface that can 
be used for migration).
Right now KVM_S390_INITIAL_RESET takes the vcpu_mutex, so this protects against 
KVM_RUN.

Christian




Re: [RFC PATCH] ARM: KVM: add irqfd support

2014-08-19 Thread Christoffer Dall
On Tue, Aug 19, 2014 at 10:53:16AM +0200, Eric Auger wrote:
 On 08/13/2014 04:55 PM, Christoffer Dall wrote:
  On Mon, Aug 04, 2014 at 02:08:22PM +0200, Eric Auger wrote:
  This patch enables irqfd on ARM.
 
  irqfd framework enables to inject a virtual IRQ into a guest upon an
  eventfd trigger. User-side uses KVM_IRQFD VM ioctl to provide KVM with
  a kvm_irqfd struct that associates a VM, an eventfd, an IRQ number
  (aka. the gsi). When an actor signals the eventfd (typically a VFIO
  platform driver), the kvm irqfd subsystem injects the provided virtual
  IRQ into the guest.
 
  The gsi must correspond to a shared peripheral interrupt (SPI), ie the
  GIC interrupt ID is gsi+32.
  
  Why can't we support PPIs?
 Hi Christoffer,
 
 Well, in case we want to support PPI at irqfd level, we would need to
 change the semantic of the GSI value and use the same as KVM_IRQ_LINE,
 to specify the target vcpu. This is obviously feasible but this also
 induces changes in currently generic user parts, vfio, vhost. Is PPI
 injection through irqfd a valid use case?

I don't see why you wouldn't want to be able to support PPIs as
passthrough interrupts to a guest?

[...]

  +
  +/* MSI not implemented yet */
  
  yet?  What is an MSI on ARM?
 Well some MSI support comes with GICv2m and GICv3. My current
 understanding is it makes sense to inject an MSI from an irqfd trigger.
 Don't you share this understanding?
 
Doh, I read GSI and wrote MSI, of course, complete brain meltdown on my
side.

-Christoffer


Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 11:59, Christian Borntraeger ha scritto:
 It's only for the interrupts that are cpu local (like pending IPIs).
 In addition, we would do that only for the reset case (with an
 interface that can be used for migration). Right now
 KVM_S390_INITIAL_RESET takes the vcpu_mutex, so this protects against
 KVM_RUN.

I'm not sure, this does seem like a workaround for another limitation
after all...  Gleb?

Paolo


Re: [PATCH kvm-unit-tests 2/2] x86: Check deadline counter is cleared after interrupt

2014-08-19 Thread Paolo Bonzini
Il 18/08/2014 21:43, Nadav Amit ha scritto:
 Once the local-apic timer is configured to use TSC deadline, the deadline
 should be cleared after the deadline passes.  This patch adds a check of this
 behavior.
 
 Signed-off-by: Nadav Amit na...@cs.technion.ac.il
 ---
  x86/apic.c | 1 +
  1 file changed, 1 insertion(+)
 
 diff --git a/x86/apic.c b/x86/apic.c
 index 487c248..3f463a5 100644
 --- a/x86/apic.c
 +++ b/x86/apic.c
 @@ -35,6 +35,7 @@ static void start_tsc_deadline_timer(void)
  wrmsr(MSR_IA32_TSCDEADLINE, rdmsr(MSR_IA32_TSC));
  	asm volatile ("nop");
  	report("tsc deadline timer", tdt_count == 1);
 +	report("tsc deadline timer clearing", rdmsr(MSR_IA32_TSCDEADLINE) == 0);
  }
  
  static int enable_tsc_deadline_timer(void)
 

Thanks, applying this patch.

Paolo


Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Christian Borntraeger
On 19/08/14 12:03, Paolo Bonzini wrote:
 Il 19/08/2014 11:59, Christian Borntraeger ha scritto:
 It's only for the interrupts that are cpu local (like pending IPIs).
 In addition, we would do that only for the reset case (with an
 interface that can be used for migration). Right now
 KVM_S390_INITIAL_RESET takes the vcpu_mutex, so this protects against
 KVM_RUN.
 
 I'm not sure, this does seem like a workaround for another limitation
 after all...  Gleb?

Yes. We want to get rid of KVM_S390_INITIAL_RESET in QEMU. This comes from a 
time when we had another userspace prototype for KVM on s390 (kuli). It's 
really a wart that has to go.
It's just that we are not there yet to remove the call to 
KVM_S390_INITIAL_RESET. Doing so can result in hard to debug errors after 
reboot, if an interrupt was made pending just before reboot that gets delivered 
in the new instance.

The new way for local interrupt read/write will probably be some onereg or 
syncreg interface with a bitmask register and payload registers. We have to 
solve some concurrency and implementation issues here.

Christian



Re: KVM call for agenda for 2014-08-19

2014-08-19 Thread Juan Quintela
Juan Quintela quint...@redhat.com wrote:
 Hi

 Please, send any topic that you are interested in covering.

 People have complained in the past that I don't cancel the call until
 the very last minute.  So, what do you think about making the deadline for
 submitting topics 23:00 UTC on Monday?

As there are no topics, the call gets cancelled.

Have a good week, Juan.


Re: [PATCH kvm-unit-tests] x86: Test task-switch with cs.rpl != cs.dpl

2014-08-19 Thread Paolo Bonzini
Il 17/08/2014 21:34, Nadav Amit ha scritto:
 Commit 5045b46803 added a check that cs.dpl equals cs.rpl during a task switch.
 This check is wrong, and this patch introduces a test in which cs.dpl !=
 cs.rpl. To do so, it configures tss.cs to be conforming with rpl=3 and dpl=0.
 Since the cpl after the call is 3, it does not print anything in the callee.
 
 Signed-off-by: Nadav Amit na...@cs.technion.ac.il
 ---
  x86/taskswitch2.c | 22 ++
  1 file changed, 22 insertions(+)
 
 diff --git a/x86/taskswitch2.c b/x86/taskswitch2.c
 index 92fc941..d96853f 100644
 --- a/x86/taskswitch2.c
 +++ b/x86/taskswitch2.c
 @@ -7,6 +7,8 @@
  
  #define MAIN_TSS_SEL (FIRST_SPARE_SEL + 0)
  #define VM86_TSS_SEL (FIRST_SPARE_SEL + 8)
 +#define USER_CS_SEL  (FIRST_SPARE_SEL + 16)

Please call it CONFORM_CS_SEL since it's not really a user-mode selector
(DPL=0), it's just used as one (RPL=3).

 +#define USER_DS_SEL  (FIRST_SPARE_SEL + 24)

Not needed, see below.

  
  static volatile int test_count;
  static volatile unsigned int test_divider;
 @@ -102,6 +104,14 @@ start:
   goto start;
  }
  
 +static void user_tss(void)
 +{
 +start:

Please add a printf and print_current_tss_info() here.

 + test_count++;
  +	asm volatile ("iret");
 + goto start;
 +}
 +
  void test_kernel_mode_int()
  {
   unsigned int res;
 @@ -201,6 +211,18 @@ void test_kernel_mode_int()
  	asm volatile ("ljmp $" xstr(TSS_INTR) ", $0xf4f4f4f4");
  	printf("Jump back succeeded\n");
  	report("ljmp", test_count == 1);
 +
 + /* test lcall with conforming segment, cs.dpl != cs.rpl */
 + test_count = 0;
 + set_intr_task_gate(0, user_tss);
 +
 + tss_intr.cs = USER_CS_SEL | 3;
 + tss_intr.ss = USER_DS_SEL | 3;
 + tss_intr.ds = tss_intr.gs = tss_intr.fs = tss_intr.ss;
 + set_gdt_entry(USER_CS_SEL, 0, 0x, 0x9f, 0xc0);
 + set_gdt_entry(USER_DS_SEL, 0, 0x, 0xf3, 0xc0);

You can use USER_DS here.  Also, please put the test in a separate
function and call it last (after test_vm86_switch), because a failure in
this test breaks test_vm86_switch too.

Paolo

  +	asm volatile("lcall $" xstr(TSS_INTR) ", $0xf4f4f4f4");
  +	report("lcall when cs.rpl != cs.dpl", test_count == 1);
  }
  
  void test_vm86_switch(void)
 



Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 12:09, Christian Borntraeger ha scritto:
 I'm not sure, this does seem like a workaround for another
 limitation after all...  Gleb?

 Yes. We want to get rid of KVM_S390_INITIAL_RESET in QEMU. This comes
 from a time when we had another userspace prototype for KVM on s390
 (kuli). It's really a wart that has to go. It's just that we are not
 there yet to remove the call to KVM_S390_INITIAL_RESET. Doing so can
 result in hard to debug errors after reboot, if an interrupt was made
 pending just before reboot that gets delivered in the new instance.
 
 The new way for local interrupt read/write will probably be some
 onereg or syncreg interface with a bitmask register and payload
 registers. We have to solve some concurrency and implementation issues
 here.

Yes, I understand; the plan is fine and it's good that it was already on
your todo list.

But since you acknowledge that KVM_S390_INITIAL_RESET will go, I'm not
sure we want to apply this patch (except for the pid == 0 part, of
course---that one is good).

Paolo


Re: [PATCH] KVM: x86: recalculate_apic_map after enabling apic

2014-08-19 Thread Paolo Bonzini
Il 18/08/2014 23:03, Nadav Amit ha scritto:
 Currently, recalculate_apic_map ignores vcpus whose lapic is software disabled
 through the spurious interrupt vector. However, once it is re-enabled, the map
 is not recalculated. Therefore, if the guest OS configured DFR while lapic is
 software-disabled, the map may be incorrect. This patch recalculates apic map
 after software enabling the lapic.
 
 Signed-off-by: Nadav Amit na...@cs.technion.ac.il
 ---
  arch/x86/kvm/lapic.c | 25 ++---
  1 file changed, 14 insertions(+), 11 deletions(-)
 
 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 08e8a89..4a736199 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -112,17 +112,6 @@ static inline int __apic_test_and_clear_vector(int vec, 
 void *bitmap)
  struct static_key_deferred apic_hw_disabled __read_mostly;
  struct static_key_deferred apic_sw_disabled __read_mostly;
  
 -static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
 -{
 -	if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) {
 -		if (val & APIC_SPIV_APIC_ENABLED)
 -			static_key_slow_dec_deferred(&apic_sw_disabled);
 -		else
 -			static_key_slow_inc(&apic_sw_disabled.key);
 - }
 - apic_set_reg(apic, APIC_SPIV, val);
 -}
 -
  static inline int apic_enabled(struct kvm_lapic *apic)
  {
  	return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
 @@ -210,6 +199,20 @@ out:
   kvm_vcpu_request_scan_ioapic(kvm);
  }
  
 +static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
 +{
 + u32 prev = kvm_apic_get_reg(apic, APIC_SPIV);
 +
 + apic_set_reg(apic, APIC_SPIV, val);
 +	if ((prev ^ val) & APIC_SPIV_APIC_ENABLED) {
 +		if (val & APIC_SPIV_APIC_ENABLED) {
 +			static_key_slow_dec_deferred(&apic_sw_disabled);
 +			recalculate_apic_map(apic->vcpu->kvm);
 +		} else
 +			static_key_slow_inc(&apic_sw_disabled.key);
 + }
 +}
 +
  static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
  {
  	apic_set_reg(apic, APIC_ID, id << 24);
 

Applied to kvm/queue, thanks.

Paolo


Re: [PATCH] KVM: x86: Clear apic tsc-deadline after deadline

2014-08-19 Thread Paolo Bonzini
Il 18/08/2014 21:42, Nadav Amit ha scritto:
 Intel SDM 10.5.4.1 says "When the timer generates an interrupt, it disarms
 itself and clears the IA32_TSC_DEADLINE MSR".
 
 This patch clears the MSR upon timer interrupt delivery when the interrupt is
 delivered in deadline mode.  Since the MSR may be reconfigured while an
 interrupt is pending, causing the new value to be overridden, pending timer
 interrupts are checked before setting a new deadline.
 
 Signed-off-by: Nadav Amit na...@cs.technion.ac.il
 ---
  arch/x86/kvm/lapic.c | 5 +
  1 file changed, 5 insertions(+)
 
 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 08e8a89..666c086 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -1352,6 +1352,9 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu 
 *vcpu, u64 data)
   return;
  
   hrtimer_cancel(&apic->lapic_timer.timer);
 + /* Inject here so clearing tscdeadline won't override new value */
 + if (apic_has_pending_timer(vcpu))
 + kvm_inject_apic_timer_irqs(vcpu);
   apic->lapic_timer.tscdeadline = data;
   start_apic_timer(apic);
  }
 @@ -1639,6 +1642,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
  
   if (atomic_read(&apic->lapic_timer.pending) > 0) {
   kvm_apic_local_deliver(apic, APIC_LVTT);
 + if (apic_lvtt_tscdeadline(apic))
 + apic->lapic_timer.tscdeadline = 0;
   atomic_set(&apic->lapic_timer.pending, 0);
   }
  }
 

Applied, thanks.  Also applied patch 2 to kvm-unit-tests.

Paolo


Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Christian Borntraeger
On 19/08/14 12:31, Paolo Bonzini wrote:
 Il 19/08/2014 12:09, Christian Borntraeger ha scritto:
 I'm not sure, this does seem like a workaround for another
 limitation after all...  Gleb?

 Yes. We want to get rid of KVM_S390_INITIAL_RESET in QEMU. This comes
 from a time, when we had another userspace prototype for KVM on s390
 (kuli). It's really a wart that has to go. It's just that we are not
 there yet to remove the call to KVM_S390_INITIAL_RESET. Doing so can
 result in hard to debug errors after reboot, if an interrupt was made
 pending just before reboot that gets delivered in the new instance.

 The new way for local interrupt read/write will probably be some
 onereg or syncreg interface with a bitmask register and payload
 registers. We have to solve some concurrency and implementation issues
 here.
 
 Yes, I understand; the plan is fine and it's good that it was already on
 your todo list.
 
 But since you acknowledge that KVM_S390_INITIAL_RESET will go, I'm not
 sure we want to apply this patch (except for the pid == 0 part, of
 course---that one is good).

Well, it makes today's QEMU (a lot) faster on s390 bootup with many CPUs.
According to strace on my system the first GET_FPU ioctl takes up to 0.079
sec; with 64 CPUs this sums up to several seconds.
But I understand your concern of touching generic KVM code only if really
necessary. Let me know if I should send a minimal pid == 0 version. (I would
prefer the full version, of course.)

Christian



Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 12:48, Christian Borntraeger ha scritto:
 But I understand your concern of touching generic KVM code only if
 really necessary. Let me know if I should send a minimal pid==0
 version. (I would prefer the full version, of course).

Yes, please do.

Paolo


Re: [PATCH v2 1/3] KVM: vmx: fix ept reserved bits for 1-GByte page

2014-08-19 Thread Wanpeng Li
On Tue, Aug 19, 2014 at 11:09:49AM +0200, Paolo Bonzini wrote:
Il 19/08/2014 11:04, Wanpeng Li ha scritto:
 The EPT misconfig handler in kvm checks which reason led to the EPT 
 misconfiguration after vmexit. One of the reasons is that an EPT 
 paging-structure entry is configured with settings reserved for 
 future functionality. However, the handler can't identify whether the 
 reserved bits of a paging-structure entry for a 1-GByte page are set, 
 since a PDPTE that maps a 1-GByte page reserves bits 29:12 instead of 
 bits 7:3, which are reserved for a PDPTE that references an EPT Page 
 Directory. This patch fixes it by reserving bits 29:12 for 1-GByte pages. 
 
 Signed-off-by: Wanpeng Li wanpeng...@linux.intel.com
 ---
 v1 - v2:
  * same if statement cover both 2MB and 1GB pages
  * return 0xf8 for level == 4

I think you dropped this check by mistake.

Indeed. I will do it in next version.


  * get the level by checking the return value of ept_rsvd_mask 
 
  arch/x86/kvm/vmx.c | 19 +++
  1 file changed, 7 insertions(+), 12 deletions(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index cad37d5..2763f37 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -5521,17 +5521,12 @@ static u64 ept_rsvd_mask(u64 spte, int level)
  for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
  mask |= (1ULL << i);
  
 -if (level > 2)
 -/* bits 7:3 reserved */
 -mask |= 0xf8;
 -else if (level == 2) {
 -if (spte & (1ULL << 7))
 -/* 2MB ref, bits 20:12 reserved */
 -mask |= 0x1ff000;
 -else
 -/* bits 6:3 reserved */
 -mask |= 0x78;
 -}
 +if (spte & (1ULL << 7))

You need to go this way if level == 1 too.  Otherwise, you would report
bits 6:3 reserved if the hypervisor is using the ignored bit 7 (Table
28-6, Format of an EPT Page-Table Entry).


Agreed. What still need to do here is to update the comments in order to  
include level == 1, right?

 +/* 1GB/2MB page, bits 29:12 or 20:12 reserved respectively */
 +mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
 +else
 +/* bits 6:3 reserved */
 +mask |= 0x78;
  
  return mask;
  }
 @@ -5561,7 +5556,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu 
 *vcpu, u64 spte,
  WARN_ON(1);
  }
  
 -if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
 +if (level == 1 || (rsvd_bits & 0x38)) {

- rsvd_bits will always be zero here.  You need to check the return
value of ept_rsvd_mask().  Let's call it rsvd_mask in the rest of this
email.

- the test is inverted, you need to check that bits 5:3 are _not_
reserved, hence (rsvd_mask & 0x38) == 0.

- once you do this, the test also covers level 1.

Agreed.


I suggest that you write a testcase for kvm-unit-tests.


Ok. 

Regards,
Wanpeng Li 

Paolo


[PATCH] kvm:iommu: fix the third parameter of kvm_iommu_put_pages

2014-08-19 Thread zhanghailiang
The third parameter of kvm_iommu_put_pages is wrong;
it should be 'gfn - slot->base_gfn'.

Signed-off-by: zhanghailiang zhang.zhanghaili...@huawei.com
---
 virt/kvm/iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 0df7d4b..c02f9a3 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -134,7 +134,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct 
kvm_memory_slot *slot)
return 0;
 
 unmap_pages:
-   kvm_iommu_put_pages(kvm, slot->base_gfn, gfn);
+   kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn);
return r;
 }
 
-- 
1.7.12.4




Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread David Hildenbrand
 Il 19/08/2014 10:38, Christian Borntraeger ha scritto:
   The patch may be okay, but I'm worried that it might be hiding a bug in
   QEMU.
  On s390 we call KVM_S390_INITIAL_RESET from several reset functions, e.g. 
  during 
  CPU creation. This is the first hickup and the pid now points to the main 
  thread.
 
 Any reason to have a special ioctl instead of SET_REGS/SET_ONE_REG/...
 (via kvm_cpu_synchronize_state, which does the ioctls in the VCPU thread)?
 
 Paolo

Looking at the code, kvm_cpu_synchronize_state() seems to do these ioctls in
the vcpu thread (e.g. comming from cpu_synchronize_all_states()), any reasons
why kvm_cpu_synchronize_post_reset() doesn't do the same (e.g. called from
cpu_synchronize_all_post_reset())?

David



[PATCH 1/2] KVM: Introduce gfn_to_hva_memslot_prot

2014-08-19 Thread Christoffer Dall
To support read-only memory regions on arm and arm64, we need to resolve a
gfn to an hva given a pointer to a memslot, both to avoid looping through
the memslots twice and to reuse the hva error checking of gfn_to_hva_prot().
Add a new gfn_to_hva_memslot_prot() function and refactor gfn_to_hva_prot()
to use it.

Acked-by: Marc Zyngier marc.zyng...@arm.com
Signed-off-by: Christoffer Dall christoffer.d...@linaro.org
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c  | 11 +--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a4c33b3..85875e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -528,6 +528,8 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable);
 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
+unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
+ bool *writable);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 33712fb..36b887d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1075,9 +1075,9 @@ EXPORT_SYMBOL_GPL(gfn_to_hva);
  * If writable is set to false, the hva returned by this function is only
  * allowed to be read.
  */
-unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
+unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
+ gfn_t gfn, bool *writable)
 {
-   struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
 
	if (!kvm_is_error_hva(hva) && writable)
@@ -1086,6 +1086,13 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t 
gfn, bool *writable)
return hva;
 }
 
+unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
+{
+   struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+   return gfn_to_hva_memslot_prot(slot, gfn, writable);
+}
+
 static int kvm_read_hva(void *data, void __user *hva, int len)
 {
return __copy_from_user(data, hva, len);
-- 
2.0.0



[PATCH 2/2] arm/arm64: KVM: Support KVM_CAP_READONLY_MEM

2014-08-19 Thread Christoffer Dall
When userspace loads code and data in a read-only memory regions, KVM
needs to be able to handle this on arm and arm64.  Specifically this is
used when running code directly from a read-only flash device; the
common scenario is a UEFI blob loaded with the -bios option in QEMU.

Note that the MMIO exit on writes to a read-only memory is ABI and can
be used to emulate block-erase style flash devices.

Acked-by: Marc Zyngier marc.zyng...@arm.com
Signed-off-by: Christoffer Dall christoffer.d...@linaro.org
---
 arch/arm/include/uapi/asm/kvm.h   |  1 +
 arch/arm/kvm/arm.c|  1 +
 arch/arm/kvm/mmu.c| 15 ---
 arch/arm64/include/uapi/asm/kvm.h |  1 +
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index e6ebdd3..51257fd 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -25,6 +25,7 @@
 
 #define __KVM_HAVE_GUEST_DEBUG
 #define __KVM_HAVE_IRQ_LINE
+#define __KVM_HAVE_READONLY_MEM
 
 #define KVM_REG_SIZE(id)   \
	(1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index a99e0cd..3ab3e60 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -188,6 +188,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ONE_REG:
case KVM_CAP_ARM_PSCI:
case KVM_CAP_ARM_PSCI_0_2:
+   case KVM_CAP_READONLY_MEM:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 16e7994..dcbe01e 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -747,14 +747,13 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, 
phys_addr_t *ipap)
 }
 
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- struct kvm_memory_slot *memslot,
+ struct kvm_memory_slot *memslot, unsigned long hva,
  unsigned long fault_status)
 {
int ret;
bool write_fault, writable, hugetlb = false, force_pte = false;
unsigned long mmu_seq;
	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
-	unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
@@ -863,7 +862,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
unsigned long fault_status;
phys_addr_t fault_ipa;
struct kvm_memory_slot *memslot;
-   bool is_iabt;
+   unsigned long hva;
+   bool is_iabt, write_fault, writable;
gfn_t gfn;
int ret, idx;
 
@@ -884,7 +884,10 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
	idx = srcu_read_lock(&vcpu->kvm->srcu);
 
	gfn = fault_ipa >> PAGE_SHIFT;
-	if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+	memslot = gfn_to_memslot(vcpu->kvm, gfn);
+	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
+	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
+	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
if (is_iabt) {
/* Prefetch Abort on I/O address */
kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
@@ -910,9 +913,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
goto out_unlock;
}
 
-	memslot = gfn_to_memslot(vcpu->kvm, gfn);
-
-   ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status);
+   ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
if (ret == 0)
ret = 1;
 out_unlock:
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index e633ff8..f4ec5a6 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -37,6 +37,7 @@
 
 #define __KVM_HAVE_GUEST_DEBUG
 #define __KVM_HAVE_IRQ_LINE
+#define __KVM_HAVE_READONLY_MEM
 
 #define KVM_REG_SIZE(id)   \
	(1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
-- 
2.0.0
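
For illustration, the userspace side of this feature amounts to registering
the flash region with the KVM_MEM_READONLY flag and then treating guest
writes to it as MMIO exits.  A minimal sketch follows (plain KVM ioctl API
rather than QEMU code; vm_fd, flash_mem, gpa, size and slot are assumed to be
set up by the caller, and error handling is omitted):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Register a read-only slot backed by the mmap'ed UEFI blob at flash_mem. */
static int register_readonly_flash(int vm_fd, void *flash_mem,
				   __u64 gpa, __u64 size, __u32 slot)
{
	struct kvm_userspace_memory_region region;

	memset(&region, 0, sizeof(region));
	region.slot = slot;
	region.flags = KVM_MEM_READONLY;	/* guest writes exit as MMIO */
	region.guest_phys_addr = gpa;
	region.memory_size = size;
	region.userspace_addr = (__u64)(unsigned long)flash_mem;

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}

In the vcpu run loop, a write to this region then shows up as a KVM_EXIT_MMIO
exit with run->mmio.is_write set, which is where block-erase style flash
command emulation would live.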



Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 13:28, David Hildenbrand ha scritto:
 Looking at the code, kvm_cpu_synchronize_state() seems to do these ioctls in
 the vcpu thread (e.g. comming from cpu_synchronize_all_states()), any reasons
 why kvm_cpu_synchronize_post_reset() doesn't do the same (e.g. called from
 cpu_synchronize_all_post_reset())?

No reason, feel free to post a patch for QEMU kvm-all.c.
Documentation/virtual/kvm/api.txt clearly says:

   Only run vcpu ioctls from the same thread that was used to create the
   vcpu.

Paolo
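
For illustration, the usual way to honour that rule in QEMU is to bounce the
work into the vcpu thread with run_on_cpu() instead of issuing the ioctls
from whatever thread triggered the reset.  A rough sketch of such a change to
kvm_cpu_synchronize_post_reset() in kvm-all.c is shown below; the helper name
do_kvm_cpu_synchronize_post_reset is made up here, and the actual QEMU patch
may differ in detail:

/* Runs in the vcpu thread, so the vcpu ioctls come from the creating thread. */
static void do_kvm_cpu_synchronize_post_reset(void *arg)
{
    CPUState *cpu = arg;

    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
    cpu->kvm_vcpu_dirty = false;
}

void kvm_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, cpu);
}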



Re: [PATCH v2 1/3] KVM: vmx: fix ept reserved bits for 1-GByte page

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 13:16, Wanpeng Li ha scritto:
 On Tue, Aug 19, 2014 at 11:09:49AM +0200, Paolo Bonzini wrote:
 Il 19/08/2014 11:04, Wanpeng Li ha scritto:
  The EPT misconfig handler in kvm checks which reason led to the EPT 
  misconfiguration after vmexit. One of the reasons is that an EPT 
  paging-structure entry is configured with settings reserved for 
  future functionality. However, the handler can't identify whether the 
  reserved bits of a paging-structure entry for a 1-GByte page are set, 
  since a PDPTE that maps a 1-GByte page reserves bits 29:12 instead of 
  bits 7:3, which are reserved for a PDPTE that references an EPT Page 
  Directory. This patch fixes it by reserving bits 29:12 for 1-GByte pages. 

 Signed-off-by: Wanpeng Li wanpeng...@linux.intel.com
 ---
 v1 - v2:
  * same if statement cover both 2MB and 1GB pages
  * return 0xf8 for level == 4

 I think you dropped this check by mistake.
 
 Indeed. I will do it in next version.
 

  * get the level by checking the return value of ept_rsvd_mask 

  arch/x86/kvm/vmx.c | 19 +++
  1 file changed, 7 insertions(+), 12 deletions(-)

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index cad37d5..2763f37 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -5521,17 +5521,12 @@ static u64 ept_rsvd_mask(u64 spte, int level)
 	for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
 	mask |= (1ULL << i);
  
 -   if (level > 2)
 -   /* bits 7:3 reserved */
 -   mask |= 0xf8;
 -   else if (level == 2) {
 -   if (spte & (1ULL << 7))
 -   /* 2MB ref, bits 20:12 reserved */
 -   mask |= 0x1ff000;
 -   else
 -   /* bits 6:3 reserved */
 -   mask |= 0x78;
 -   }
 +   if (spte & (1ULL << 7))

 You need to go this way if level == 1 too.  Otherwise, you would report
 bits 6:3 reserved if the hypervisor is using the ignored bit 7 (Table
 28-6, Format of an EPT Page-Table Entry).

 
 Agreed. What still need to do here is to update the comments in order to  
 include level == 1, right?

Yes.

 +   /* 1GB/2MB page, bits 29:12 or 20:12 reserved respectively */
 +   mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
 +   else
 +   /* bits 6:3 reserved */
 +   mask |= 0x78;
  
 return mask;
  }
 @@ -5561,7 +5556,7 @@ static void ept_misconfig_inspect_spte(struct 
 kvm_vcpu *vcpu, u64 spte,
 WARN_ON(1);
 }
  
 -   if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
 +   if (level == 1 || (rsvd_bits & 0x38)) {

 - rsvd_bits will always be zero here.  You need to check the return
 value of ept_rsvd_mask().  Let's call it rsvd_mask in the rest of this
 email.

 - the test is inverted, you need to check that bits 5:3 are _not_
 reserved, hence (rsvd_mask & 0x38) == 0.

 - once you do this, the test also covers level 1.
 
 Agreed.

Thanks,

Paolo
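
Putting the review points above together, the corrected helper would look
roughly like the sketch below (names and structure mirror the quoted diff;
the exact form of the eventual v3 patch may differ):

static u64 ept_rsvd_mask(u64 spte, int level)
{
	int i;
	u64 mask = 0;

	for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
		mask |= (1ULL << i);

	if (level == 4)
		/* PML4E: bits 7:3 reserved */
		mask |= 0xf8;
	else if (spte & (1ULL << 7))
		/*
		 * Leaf entry: 1GB/2MB page, bits 29:12 or 20:12 reserved.
		 * For level 1 this adds nothing, since bit 7 is ignored there.
		 */
		mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
	else if (level > 1)
		/* non-leaf PDPTE/PDE: bits 6:3 reserved */
		mask |= 0x78;

	return mask;
}

The caller in ept_misconfig_inspect_spte() can then distinguish leaf from
non-leaf entries purely from the returned mask: a non-leaf entry always has
0x38 set, so "(rsvd_mask & 0x38) == 0" selects the leaf (memory-type) checks
and covers level 1 as well.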


Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread David Hildenbrand
 Il 19/08/2014 13:28, David Hildenbrand ha scritto:
  Looking at the code, kvm_cpu_synchronize_state() seems to do these ioctls in
  the vcpu thread (e.g. comming from cpu_synchronize_all_states()), any 
  reasons
  why kvm_cpu_synchronize_post_reset() doesn't do the same (e.g. called from
  cpu_synchronize_all_post_reset())?
 
 No reason, feel free to post a patch for QEMU kvm-all.c.
 Documentation/virtual/kvm/api.txt clearly says:
 
Only run vcpu ioctls from the same thread that was used to create the
vcpu.
 
 Paolo
 

Thanks! A little more tweaking in the other parts of s390x resets
and we should be able to reduce the number of wrong ioctls (I think I found
most cases that are responsible for the performance degradation).

David



[Bug 82761] New: DMAR:[fault reason 06] PTE Read access is not set

2014-08-19 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=82761

Bug ID: 82761
   Summary: DMAR:[fault reason 06] PTE Read access is not set
   Product: Virtualization
   Version: unspecified
Kernel Version: 3.16.1
  Hardware: All
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: normal
  Priority: P1
 Component: kvm
  Assignee: virtualization_...@kernel-bugs.osdl.org
  Reporter: ansalonistef...@gmail.com
Regression: No

When I boot with intel_iommu=on parameter, I get these errors repeated over
and over again in dmesg:
---
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
---


lspci -vvs 05:00.0
05:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8169 PCI
Gigabit Ethernet Controller (rev 10)
Subsystem: Realtek Semiconductor Co., Ltd. RTL8169/8110 Family PCI
Gigabit Ethernet NIC
Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV+ VGASnoop- ParErr-
Stepping- SERR- FastB2B- DisINTx-
Status: Cap+ 66MHz+ UDF- FastB2B+ ParErr- DEVSEL=medium TAbort-
TAbort- MAbort- SERR- PERR- INTx-
Latency: 64 (8000ns min, 16000ns max), Cache Line Size: 32 bytes
Interrupt: pin A routed to IRQ 19
Region 0: I/O ports at c200 [size=256]
Region 1: Memory at f7862000 (32-bit, non-prefetchable) [size=256]
Expansion ROM at f784 [disabled] [size=128K]
Capabilities: [dc] Power Management version 2
Flags: PMEClk- DSI- D1+ D2+ AuxCurrent=375mA
PME(D0-,D1+,D2+,D3hot+,D3cold+)
Status: D0 NoSoftRst- PME-Enable- DSel=0 DScale=0 PME-
Kernel driver in use: r8169


lspci -vt
-[:00]-+-00.0  Intel Corporation Xeon E3-1200 v2/3rd Gen Core processor
DRAM Controller
   +-01.0-[01]--+-00.0  NVIDIA Corporation Device 0fc6
   |\-00.1  NVIDIA Corporation Device 0e1b
   +-02.0  Intel Corporation Xeon E3-1200 v2/3rd Gen Core processor
Graphics Controller
   +-14.0  Intel Corporation 7 Series/C210 Series Chipset Family USB
xHCI Host Controller
   +-16.0  Intel Corporation 7 Series/C210 Series Chipset Family MEI
Controller #1
   +-1a.0  Intel Corporation 7 Series/C210 Series Chipset Family USB
Enhanced Host Controller #2
   +-1b.0  Intel Corporation 7 Series/C210 Series Chipset Family High
Definition Audio Controller
   +-1c.0-[02]--
   +-1c.1-[03]00.0  Realtek Semiconductor Co., Ltd. RTL8111/8168B
PCI Express Gigabit Ethernet controller
   +-1c.3-[04-05]00.0-[05]--+-00.0  Realtek Semiconductor Co., Ltd.
RTL8169 PCI Gigabit Ethernet Controller
   |+-01.0  Realtek Semiconductor Co., Ltd.
RTL8169 PCI Gigabit Ethernet Controller
   |\-02.0  Realtek Semiconductor Co., Ltd.
RTL8169 PCI Gigabit Ethernet Controller
   +-1d.0  Intel Corporation 7 Series/C210 Series Chipset Family USB
Enhanced Host Controller #1
   +-1f.0  Intel Corporation H77 Express Chipset LPC Controller
   +-1f.2  Intel Corporation 7 Series/C210 Series Chipset Family 6-port
SATA Controller [AHCI mode]
   \-1f.3  Intel Corporation 7 Series/C210 Series Chipset Family SMBus
Controller


dmesg | grep IOMMU
Intel-IOMMU: enabled
dmar: IOMMU 0: reg_base_addr fed9 ver 1:0 cap c020e60262 ecap f0101a
dmar: IOMMU 1: reg_base_addr fed91000 ver 1:0 cap c9008020660262 ecap f0105a
IOAPIC id 2 under DRHD base  0xfed91000 IOMMU 1
IOMMU 0 0xfed9: using Queued invalidation
IOMMU 1 0xfed91000: using Queued invalidation
IOMMU: software identity mapping for device :00:00.0
IOMMU: software identity mapping for device :00:01.0
IOMMU: software identity mapping for device :00:14.0
IOMMU: software identity mapping for device :00:16.0
IOMMU: software identity mapping for device :00:1a.0
IOMMU: software identity mapping for device :00:1b.0
IOMMU: software identity mapping for device :00:1c.0
IOMMU: software identity mapping for device :00:1c.1
IOMMU: software identity mapping for device :00:1c.3
IOMMU: software identity mapping for device :00:1d.0
IOMMU: software identity mapping for device :00:1f.0
IOMMU: software identity mapping for device :00:1f.2
IOMMU: software identity mapping for device :00:1f.3
IOMMU: software identity mapping for device :01:00.0
IOMMU: software identity mapping for device :01:00.1
IOMMU: software identity mapping for device :03:00.0
IOMMU: Setting RMRR:
IOMMU: Setting identity map for device :00:02.0 [0xcb80 - 0xcf9f]
IOMMU: Setting identity map for device :00:14.0 [0xc8d17000 - 0xc8d24fff]
IOMMU: Setting identity map for device :00:1a.0 [0xc8d17000 - 0xc8d24fff]
IOMMU: Setting identity map for device :00:1d.0 [0xc8d17000 - 0xc8d24fff]
IOMMU: 

Re: [PATCH v4] arm64: fix VTTBR_BADDR_MASK

2014-08-19 Thread Christoffer Dall
On Mon, Aug 18, 2014 at 03:30:58PM -0500, Joel Schopp wrote:
 
  #endif /* __ARM_KVM_MMU_H__ */
 diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
 index 16e7994..70f0f02 100644
 --- a/arch/arm/kvm/mmu.c
 +++ b/arch/arm/kvm/mmu.c
 @@ -521,6 +521,7 @@ int create_hyp_io_mappings(void *from, void *to, 
 phys_addr_t phys_addr)
   */
  int kvm_alloc_stage2_pgd(struct kvm *kvm)
  {
 + unsigned int s2_pgds, s2_pgd_order;
   pgd_t *pgd;
  
   if (kvm->arch.pgd != NULL) {
 @@ -528,10 +529,18 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
   return -EINVAL;
   }
  
 - pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER);
 + s2_pgds = (1 << (kvm_get_phys_addr_shift() - PGDIR_SHIFT));
 + s2_pgd_order = get_order(s2_pgds * sizeof(pgd_t));
 +
 + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, s2_pgd_order);
   if (!pgd)
   return -ENOMEM;
  
 + if ((unsigned long)pgd & ~vttbr_baddr_mask) {
 + kvm_err("Stage-2 pgd not correctly aligned: %p\n", pgd);
 + return -EFAULT;
 + }
 
 
 There are two problems that I've found here.  The first problem is that
 vttbr_baddr_mask isn't allocated yet at this point in the code.

allocated? you mean assigned?
aren't you setting vttbr_baddr_mask in kvm_arch_init()?  that's
certainly called before kvm_arch_init_vm().


 The
 second problem is that pgd is a virtual address, ie pgd ==
 0xfe03bbb4 while the vttbr masks off the high bits for a
 physical address, ie vttbr_baddr_mask=0x7ffe .  Even
 correcting for those issues I haven't been able to make this check work
 properly.  I'll resend v5 the patch with all the other suggested changes.
 

What are the issues that you face?  Iow. what is the alignment of the
returned physical address?

(You should be able to just to virt_to_phys(pgd) and use that to test
for the vttbr_baddr_mask).


Thanks,
-Christoffer
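
Concretely, the suggestion is to test the physical address that will actually
be programmed into VTTBR rather than the kernel virtual address of the
allocation, roughly like this (sketch only, with the leak on the error path
plugged):

	pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, s2_pgd_order);
	if (!pgd)
		return -ENOMEM;

	/* VTTBR holds a physical address, so check alignment of that. */
	if (virt_to_phys(pgd) & ~vttbr_baddr_mask) {
		kvm_err("Stage-2 pgd not correctly aligned: %p\n", pgd);
		free_pages((unsigned long)pgd, s2_pgd_order);
		return -EFAULT;
	}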


Re: [PATCH v5] arm64: fix VTTBR_BADDR_MASK

2014-08-19 Thread Christoffer Dall
On Mon, Aug 18, 2014 at 03:36:04PM -0500, Joel Schopp wrote:
 The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
 systems.  Rather than just add a bit it seems like a good time to also set
 things at run-time instead of compile time to accommodate more hardware.
 
 This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask in runtime,
 not compile time.
 
 In ARMv8, EL2 physical address size (TCR_EL2.PS) and stage2 input address
 size (VTCR_EL2.T0SZE) cannot be determined in compile time since they
 depend on hardware capability.
 
 According to Table D4-23 and Table D4-25 in ARM DDI 0487A.b document,
 vttbr_x is calculated using different fixed values with consideration
 of T0SZ, granule size and the level of translation tables. Therefore,
 vttbr_baddr_mask should be determined dynamically.
 
 Changes since v4:
 More minor cleanups from review
 Moved some functions into headers
 
 Changes since v3:
 Another rebase
 Addressed minor comments from v2
 
 Changes since v2:
 Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git 
 next branch
 
 Changes since v1:
 Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to
 provide better long term fix.  Updated that patch to log error instead of
 silently fail on unaligned vttbr.
 
 Cc: Christoffer Dall christoffer.d...@linaro.org
 Cc: Sungjinn Chung sungjinn.ch...@samsung.com
 Signed-off-by: Jungseok Lee jays@samsung.com
 Signed-off-by: Joel Schopp joel.sch...@amd.com
 ---
  arch/arm/include/asm/kvm_mmu.h   |   12 ++
  arch/arm/kvm/arm.c   |   17 +++-
  arch/arm64/include/asm/kvm_arm.h |   17 +---
  arch/arm64/include/asm/kvm_mmu.h |   78 
 ++
  arch/arm64/kvm/hyp-init.S|   20 +++---
  5 files changed, 122 insertions(+), 22 deletions(-)
 
 diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
 index 5c7aa3c..73f6ff6 100644
 --- a/arch/arm/include/asm/kvm_mmu.h
 +++ b/arch/arm/include/asm/kvm_mmu.h
 @@ -166,6 +166,18 @@ static inline void coherent_cache_guest_page(struct 
 kvm_vcpu *vcpu, hva_t hva,
  
  void stage2_flush_vm(struct kvm *kvm);
  
 +static inline int kvm_get_phys_addr_shift(void)
 +{
 + return KVM_PHYS_SHIFT;
 +}
 +
 +static inline int set_vttbr_baddr_mask(void)
 +{
 + vttbr_baddr_mask = VTTBR_BADDR_MASK;
 + return 0;
 +}
 +
 +
  #endif   /* !__ASSEMBLY__ */
  
  #endif /* __ARM_KVM_MMU_H__ */
 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
 index 3c82b37..f396eb7 100644
 --- a/arch/arm/kvm/arm.c
 +++ b/arch/arm/kvm/arm.c
 @@ -37,6 +37,7 @@
  #include asm/mman.h
  #include asm/tlbflush.h
  #include asm/cacheflush.h
 +#include asm/cputype.h
  #include asm/virt.h
  #include asm/kvm_arm.h
  #include asm/kvm_asm.h
 @@ -466,8 +467,14 @@ static void update_vttbr(struct kvm *kvm)
   /* update vttbr to be used with the new vmid */
   pgd_phys = virt_to_phys(kvm->arch.pgd);
   vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
 - kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
 - kvm->arch.vttbr |= vmid;
 +
 + /*
 +  * If the VTTBR isn't aligned there is something wrong with the system
 +  * or kernel.
 +  */
 + BUG_ON(pgd_phys & ~vttbr_baddr_mask);
 +
 + kvm->arch.vttbr = pgd_phys | vmid;
  
   spin_unlock(&kvm_vmid_lock);
  }
 @@ -1052,6 +1059,12 @@ int kvm_arch_init(void *opaque)
   }
   }
  
 + err = set_vttbr_baddr_mask();
 + if (err) {
 + kvm_err("Cannot set vttbr_baddr_mask\n");
 + return -EINVAL;
 + }
 +
   cpu_notifier_register_begin();
  
   err = init_hyp_mode();
 diff --git a/arch/arm64/include/asm/kvm_arm.h 
 b/arch/arm64/include/asm/kvm_arm.h
 index 3d69030..8dbef70 100644
 --- a/arch/arm64/include/asm/kvm_arm.h
 +++ b/arch/arm64/include/asm/kvm_arm.h
 @@ -94,7 +94,6 @@
  /* TCR_EL2 Registers bits */
  #define TCR_EL2_TBI  (1 << 20)
  #define TCR_EL2_PS   (7 << 16)
 -#define TCR_EL2_PS_40B   (2 << 16)
  #define TCR_EL2_TG0  (1 << 14)
  #define TCR_EL2_SH0  (3 << 12)
  #define TCR_EL2_ORGN0    (3 << 10)
 @@ -103,8 +102,6 @@
  #define TCR_EL2_MASK (TCR_EL2_TG0 | TCR_EL2_SH0 | \
TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
  
 -#define TCR_EL2_FLAGS(TCR_EL2_PS_40B)
 -
  /* VTCR_EL2 Registers bits */
  #define VTCR_EL2_PS_MASK (7 << 16)
  #define VTCR_EL2_TG0_MASK    (1 << 14)
 @@ -119,36 +116,28 @@
  #define VTCR_EL2_SL0_MASK    (3 << 6)
  #define VTCR_EL2_SL0_LVL1    (1 << 6)
  #define VTCR_EL2_T0SZ_MASK   0x3f
 -#define VTCR_EL2_T0SZ_40B    24
 +#define VTCR_EL2_T0SZ(bits)  (64 - (bits))
  
  #ifdef CONFIG_ARM64_64K_PAGES
  /*
   * Stage2 translation configuration:
 - * 40bits output (PS = 2)
 - * 40bits input  (T0SZ = 24)
   * 64kB pages (TG0 = 1)
   * 2 level page tables (SL = 1)
   */
  #define VTCR_EL2_FLAGS   (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER 
 | \
VTCR_EL2_ORGN0_WBWA | 

[Bug 82761] DMAR:[fault reason 06] PTE Read access is not set

2014-08-19 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=82761

Alex Williamson alex.william...@redhat.com changed:

   What|Removed |Added

 CC||alex.william...@redhat.com

--- Comment #1 from Alex Williamson alex.william...@redhat.com ---
Does it work on 3.17-rc1?  Are all of the 8169 NICs on bus 05 up and running? 
Please provide lspci -vv info for 04:00.0.



Re: [PATCH] kvm:iommu: fix the third parameter of kvm_iommu_put_pages

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 13:14, zhanghailiang ha scritto:
 The third parameter of kvm_iommu_put_pages is wrong;
 it should be 'gfn - slot->base_gfn'.

This fixes a crash, doesn't it?  Please mention it in the commit
message, because we have to assign a CVE for this kind of issue.  It
will be CVE-2014-3601.

I have learnt about this crash last week from Michael Tsirkin, whose
patch also had the following hunk:

diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 0df7d4b34dfe..1e458a7d96f1 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -123,6 +123,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct
kvm_memory_slot *slot)
if (r) {
		printk(KERN_ERR "kvm_iommu_map_address: "
		       "iommu failed to map pfn=%llx\n", pfn);
+   kvm_unpin_pages(kvm, pfn, page_size);
goto unmap_pages;
}

So I'm going to apply his instead.

Thanks for the report,

Paolo

 Signed-off-by: zhanghailiang zhang.zhanghaili...@huawei.com
 ---
  virt/kvm/iommu.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
 index 0df7d4b..c02f9a3 100644
 --- a/virt/kvm/iommu.c
 +++ b/virt/kvm/iommu.c
 @@ -134,7 +134,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct 
 kvm_memory_slot *slot)
   return 0;
  
  unmap_pages:
 - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn);
 + kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn);
   return r;
  }
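
For clarity, the combined effect of the two changes on the error path of
kvm_iommu_map_pages() is roughly the following (illustrative only; it simply
stitches together the two hunks quoted in this thread):

	if (r) {
		printk(KERN_ERR "kvm_iommu_map_address: "
		       "iommu failed to map pfn=%llx\n", pfn);
		/* drop the pages we pinned but failed to map */
		kvm_unpin_pages(kvm, pfn, page_size);
		goto unmap_pages;
	}
	...
unmap_pages:
	/* the third argument is a page count, not an absolute gfn */
	kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn);
	return r;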
  
 



[PATCH kvm-unit-tests v2] x86: Test task-switch with cs.rpl != cs.dpl

2014-08-19 Thread Nadav Amit
Commit 5045b46803 added a check that cs.dpl equals cs.rpl during task-switch.
This is a wrong check, and this patch introduces a test in which cs.dpl !=
cs.rpl. To do so, it configures tss.cs to be conforming with rpl=3 and dpl=0.
Since the cpl after calling is 3, it does not make any prints in the callee.

Signed-off-by: Nadav Amit na...@cs.technion.ac.il
---
 x86/taskswitch2.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/x86/taskswitch2.c b/x86/taskswitch2.c
index 92fc941..1fe833e 100644
--- a/x86/taskswitch2.c
+++ b/x86/taskswitch2.c
@@ -7,6 +7,7 @@
 
 #define MAIN_TSS_SEL (FIRST_SPARE_SEL + 0)
 #define VM86_TSS_SEL (FIRST_SPARE_SEL + 8)
+#define CONFORM_CS_SEL  (FIRST_SPARE_SEL + 16)
 
 static volatile int test_count;
 static volatile unsigned int test_divider;
@@ -102,6 +103,14 @@ start:
goto start;
 }
 
+static void user_tss(void)
+{
+start:
+   test_count++;
+   asm volatile ("iret");
+   goto start;
+}
+
 void test_kernel_mode_int()
 {
unsigned int res;
@@ -248,6 +257,19 @@ void test_vm86_switch(void)
 report("VM86", 1);
 }
 
+void test_conforming_switch(void)
+{
+   /* test lcall with conforming segment, cs.dpl != cs.rpl */
+   test_count = 0;
+   set_intr_task_gate(0, user_tss);
+
+   tss_intr.cs = CONFORM_CS_SEL | 3;
+   tss_intr.ds = tss_intr.gs = tss_intr.fs = tss_intr.ss = USER_DS;
+   set_gdt_entry(CONFORM_CS_SEL, 0, 0x, 0x9f, 0xc0);
+   asm volatile("lcall $" xstr(TSS_INTR) ", $0xf4f4f4f4");
+   report("lcall with cs.rpl != cs.dpl", test_count == 1);
+}
+
 int main()
 {
setup_vm();
@@ -256,6 +278,7 @@ int main()
 
test_kernel_mode_int();
test_vm86_switch();
+   test_conforming_switch();
 
return report_summary();
 }
-- 
1.9.1



Re: [PATCH kvm-unit-tests 1/2] x86: Use host CPU parameter for apic test

2014-08-19 Thread Eduardo Habkost
On Tue, Aug 19, 2014 at 11:51:05AM +0200, Paolo Bonzini wrote:
 Il 18/08/2014 21:43, Nadav Amit ha scritto:
  Currently, the TSC deadline test never runs, since TSC deadline is disabled
  unless the host cpu parameter is used. This patch changes the apic test to 
  use
  the qemu host cpu parameter.
 
 Better use
  Signed-off-by: Nadav Amit na...@cs.technion.ac.il
  ---
   x86/unittests.cfg | 2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)
  
  diff --git a/x86/unittests.cfg b/x86/unittests.cfg
  index 6d3e23a..f692b2b 100644
  --- a/x86/unittests.cfg
  +++ b/x86/unittests.cfg
  @@ -9,7 +9,7 @@
   [apic]
   file = apic.flat
   smp = 2
  -extra_params = -cpu qemu64,+x2apic
  +extra_params = -cpu host,+x2apic
   arch = x86_64
   
   [smptest]
  
 
 Eduardo, I think we should add tsc_deadline to QEMU instead?

+tsc-deadline is supported by QEMU since v1.1.0.

-- 
Eduardo


Re: [PATCH kvm-unit-tests 1/2] x86: Use host CPU parameter for apic test

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 15:18, Eduardo Habkost ha scritto:
 On Tue, Aug 19, 2014 at 11:51:05AM +0200, Paolo Bonzini wrote:
 Il 18/08/2014 21:43, Nadav Amit ha scritto:
 Currently, the TSC deadline test never runs, since TSC deadline is disabled
 unless the host cpu parameter is used. This patch changes the apic test to 
 use
 the qemu host cpu parameter.

 Better use
 Signed-off-by: Nadav Amit na...@cs.technion.ac.il
 ---
  x86/unittests.cfg | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/x86/unittests.cfg b/x86/unittests.cfg
 index 6d3e23a..f692b2b 100644
 --- a/x86/unittests.cfg
 +++ b/x86/unittests.cfg
 @@ -9,7 +9,7 @@
  [apic]
  file = apic.flat
  smp = 2
 -extra_params = -cpu qemu64,+x2apic
 +extra_params = -cpu host,+x2apic
  arch = x86_64
  
  [smptest]


 Eduardo, I think we should add tsc_deadline to QEMU instead?
 
 +tsc-deadline is supported by QEMU since v1.1.0.

Thanks, I applied this:

diff --git a/x86/unittests.cfg b/x86/unittests.cfg
index 6d3e23a..0123944 100644
--- a/x86/unittests.cfg
+++ b/x86/unittests.cfg
@@ -9,7 +9,7 @@
 [apic]
 file = apic.flat
 smp = 2
-extra_params = -cpu qemu64,+x2apic
+extra_params = -cpu qemu64,+x2apic,+tsc-deadline
 arch = x86_64

 [smptest]

Paolo



Re: [PATCH kvm-unit-tests v2] x86: Test task-switch with cs.rpl != cs.dpl

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 15:04, Nadav Amit ha scritto:
 Commit 5045b46803 added a check that cs.dpl equals cs.rpl during task-switch.
 This is a wrong check, and this patch introduces a test in which cs.dpl !=
 cs.rpl. To do so, it configures tss.cs to be conforming with rpl=3 and dpl=0.
 Since the cpl after calling is 3, it does not make any prints in the callee.
 
 Signed-off-by: Nadav Amit na...@cs.technion.ac.il
 ---
  x86/taskswitch2.c | 23 +++
  1 file changed, 23 insertions(+)
 
 diff --git a/x86/taskswitch2.c b/x86/taskswitch2.c
 index 92fc941..1fe833e 100644
 --- a/x86/taskswitch2.c
 +++ b/x86/taskswitch2.c
 @@ -7,6 +7,7 @@
  
  #define MAIN_TSS_SEL (FIRST_SPARE_SEL + 0)
  #define VM86_TSS_SEL (FIRST_SPARE_SEL + 8)
 +#define CONFORM_CS_SEL  (FIRST_SPARE_SEL + 16)
  
  static volatile int test_count;
  static volatile unsigned int test_divider;
 @@ -102,6 +103,14 @@ start:
   goto start;
  }
  
 +static void user_tss(void)
 +{
 +start:
 + test_count++;
 + asm volatile ("iret");
 + goto start;
 +}
 +
  void test_kernel_mode_int()
  {
   unsigned int res;
 @@ -248,6 +257,19 @@ void test_vm86_switch(void)
  report("VM86", 1);
  }
  
 +void test_conforming_switch(void)
 +{
 + /* test lcall with conforming segment, cs.dpl != cs.rpl */
 + test_count = 0;
 + set_intr_task_gate(0, user_tss);

No need to use set_intr_task_gate, since the IDT is not involved here.

tss_intr.eip = (u32)user_tss;

is enough.

I fixed this up and applied the patch.  Thanks!

Paolo

 +
 + tss_intr.cs = CONFORM_CS_SEL | 3;
 + tss_intr.ds = tss_intr.gs = tss_intr.fs = tss_intr.ss = USER_DS;
 + set_gdt_entry(CONFORM_CS_SEL, 0, 0x, 0x9f, 0xc0);
 + asm volatile("lcall $" xstr(TSS_INTR) ", $0xf4f4f4f4");
 + report("lcall with cs.rpl != cs.dpl", test_count == 1);
 +}
 +
  int main()
  {
   setup_vm();
 @@ -256,6 +278,7 @@ int main()
  
   test_kernel_mode_int();
   test_vm86_switch();
 + test_conforming_switch();
  
   return report_summary();
  }
 



Re: [PATCH kvm-unit-tests v2] x86: Test task-switch with cs.rpl != cs.dpl

2014-08-19 Thread Nadav Amit

On Aug 19, 2014, at 4:28 PM, Paolo Bonzini pbonz...@redhat.com wrote:

 Il 19/08/2014 15:04, Nadav Amit ha scritto:
 
 +void test_conforming_switch(void)
 +{
 +/* test lcall with conforming segment, cs.dpl != cs.rpl */
 +test_count = 0;
 +set_intr_task_gate(0, user_tss);
 
 No need to use set_intr_task_gate, since the IDT is not involved here.
 
tss_intr.eip = (u32)user_tss;
 
 is enough.
 
 I fixed this up and applied the patch.  Thanks!

I know, but all the other ‘call' tests did. If there is an error, at least it 
should be consistent. ;-)

Thanks,
Nadav




Re: [PATCH kvm-unit-tests v2] x86: Test task-switch with cs.rpl != cs.dpl

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 15:34, Nadav Amit ha scritto:
 I know, but all the other ‘call' tests did. If there is an error, at
 least it should be consistent. ;-)

Right, let's fix it up:

diff --git a/x86/taskswitch2.c b/x86/taskswitch2.c
index fd9a404..3cfb467 100644
--- a/x86/taskswitch2.c
+++ b/x86/taskswitch2.c
@@ -190,7 +190,7 @@ void test_kernel_mode_int()

/* test that calling a task by lcall works */
test_count = 0;
-   set_intr_task_gate(0, irq_tss);
+   tss_intr.eip = (u32)irq_tss;
	printf("Calling task by lcall\n");
/* hlt opcode is 0xf4 I use destination IP 0xf4f4f4f4 to catch
   incorrect instruction length calculation */
@@ -205,7 +205,7 @@ void test_kernel_mode_int()

/* test that calling a task by ljmp works */
test_count = 0;
-   set_intr_task_gate(0, jmp_tss);
+   tss_intr.eip = (u32)jmp_tss;
	printf("Jumping to a task by ljmp\n");
	asm volatile ("ljmp $" xstr(TSS_INTR) ", $0xf4f4f4f4");
	printf("Jump back succeeded\n");


Paolo


Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Christian Borntraeger
On 18/08/14 07:02, Wanpeng Li wrote:
 Hi Christian,
 On Tue, Aug 05, 2014 at 04:44:14PM +0200, Christian Borntraeger wrote:
 We currently track the pid of the task that runs the VCPU in
 vcpu_load. Since we call vcpu_load for all kinds of ioctls on a
 CPU, this causes hiccups due to synchronize_rcu if one CPU is
 modified by another CPU or the main thread (e.g. initialization,
 reset). We track the pid only for the purpose of yielding, so
 let's update the pid only in the KVM_RUN ioctl.

 In addition, don't do a synchronize_rcu on startup (pid == 0).

 This speeds up guest boot time on s390 noticeably for some configs, e.g.
 HZ=100, no full state tracking, 64 guest cpus 32 host cpus.

 Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
 CC: Rik van Riel r...@redhat.com
 CC: Raghavendra K T raghavendra...@linux.vnet.ibm.com
 CC: Michael Mueller m...@linux.vnet.ibm.com
 ---
 virt/kvm/kvm_main.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index 9ae9135..ebc8f54 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -124,14 +124,6 @@ int vcpu_load(struct kvm_vcpu *vcpu)

  if (mutex_lock_killable(vcpu-mutex))
  return -EINTR;
 
 One question: 
 
 -if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
 
 When will vcpu->pid and current->pids[PIDTYPE_PID].pid be different?

If two different threads call an ioctl on the same vcpu fd. (It must be an ioctl
that does vcpu_load - almost all of them, except for some interrupt injections.)
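
In code terms, the idea is to drop the pid bookkeeping from vcpu_load() and do
it only on the KVM_RUN path of kvm_vcpu_ioctl(), skipping the RCU
synchronization when there was no previous pid.  A rough sketch (the actual
patch may differ in detail):

	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
			/* The thread running this vcpu changed. */
			struct pid *oldpid = vcpu->pid;
			struct pid *newpid = get_task_pid(current, PIDTYPE_PID);

			rcu_assign_pointer(vcpu->pid, newpid);
			if (oldpid)
				synchronize_rcu();
			put_pid(oldpid);
		}
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		break;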



Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Christian Borntraeger
On 19/08/14 14:14, David Hildenbrand wrote:
 Il 19/08/2014 13:28, David Hildenbrand ha scritto:
 Looking at the code, kvm_cpu_synchronize_state() seems to do these ioctls in
 the vcpu thread (e.g. comming from cpu_synchronize_all_states()), any 
 reasons
 why kvm_cpu_synchronize_post_reset() doesn't do the same (e.g. called from
 cpu_synchronize_all_post_reset())?

 No reason, feel free to post a patch for QEMU kvm-all.c.
 Documentation/virtual/kvm/api.txt clearly says:

Only run vcpu ioctls from the same thread that was used to create the
vcpu.

 Paolo

 
 Thanks! A little more tweaking in the other parts of s390x resets
 and we should be able to reduce the number of wrong ioctls (I think I found
 most cases that are responsible for the performance degradation).

Hmm. We don't just want to reduce them, we want them to be zero.
In addition to a reworked MP_STATE patch set, we might be able to change the 
code to call KVM_S390_INITIAL_RESET only from the cpu thread itself. 
If that simplifies things, we could avoid doing KVM_S390_INITIAL_RESET on CPU 
creation, because we know that all kernel version will do an implicit cpu reset 
on cpu creation anyway. Can you have a try on this as well when reworking that 
code? We could then fix this rcu performance penalty independent from getting 
rid of that ioctl.

Christian



Re: [PATCH v4] arm64: fix VTTBR_BADDR_MASK

2014-08-19 Thread Joel Schopp

On 08/19/2014 07:22 AM, Christoffer Dall wrote:
 On Mon, Aug 18, 2014 at 03:30:58PM -0500, Joel Schopp wrote:
  #endif /* __ARM_KVM_MMU_H__ */
 diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
 index 16e7994..70f0f02 100644
 --- a/arch/arm/kvm/mmu.c
 +++ b/arch/arm/kvm/mmu.c
 @@ -521,6 +521,7 @@ int create_hyp_io_mappings(void *from, void *to, 
 phys_addr_t phys_addr)
   */
  int kvm_alloc_stage2_pgd(struct kvm *kvm)
  {
 +unsigned int s2_pgds, s2_pgd_order;
  pgd_t *pgd;
  
   if (kvm->arch.pgd != NULL) {
 @@ -528,10 +529,18 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
  return -EINVAL;
  }
  
 -pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER);
 +s2_pgds = (1 << (kvm_get_phys_addr_shift() - PGDIR_SHIFT));
 +s2_pgd_order = get_order(s2_pgds * sizeof(pgd_t));
 +
 +pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, s2_pgd_order);
  if (!pgd)
  return -ENOMEM;
  
 +if ((unsigned long)pgd & ~vttbr_baddr_mask) {
 +kvm_err("Stage-2 pgd not correctly aligned: %p\n", pgd);
 +return -EFAULT;
 +}


 There are two problems that I've found here.  The first problem is that
 vttbr_baddr_mask isn't allocated yet at this point in the code.
 allocated? you mean assigned?
 aren't you setting vttbr_baddr_mask in kvm_arch_init()?  that's
 certainly called before kvm_arch_init_vm().
Yes, I mean assigned, at least I got the first letter correct :)  All I
know is that vttbr_baddr_mask was still zero and checking for zero and
calling the set function gave it a value.


 The
 second problem is that pgd is a virtual address, ie pgd ==
 0xfe03bbb4 while the vttbr masks off the high bits for a
 physical address, ie vttbr_baddr_mask=0x7ffe .  Even
 correcting for those issues I haven't been able to make this check work
 properly.  I'll resend v5 the patch with all the other suggested changes.

 What are the issues that you face?  Iow. what is the alignment of the
 returned physical address?

 (You should be able to just to virt_to_phys(pgd) and use that to test
 for the vttbr_baddr_mask).
The addresses above are actually from my system, 64K page aligned on a
64K page kernel.  I did use virt_to_phys() and the kernel got a null
dereference and panicked; I didn't trace down where the panic was occurring.



 Thanks,
 -Christoffer



Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread David Hildenbrand
 On 19/08/14 14:14, David Hildenbrand wrote:
  Il 19/08/2014 13:28, David Hildenbrand ha scritto:
  Looking at the code, kvm_cpu_synchronize_state() seems to do these ioctls 
  in
  the vcpu thread (e.g. comming from cpu_synchronize_all_states()), any 
  reasons
  why kvm_cpu_synchronize_post_reset() doesn't do the same (e.g. called from
  cpu_synchronize_all_post_reset())?
 
  No reason, feel free to post a patch for QEMU kvm-all.c.
  Documentation/virtual/kvm/api.txt clearly says:
 
 Only run vcpu ioctls from the same thread that was used to create the
 vcpu.
 
  Paolo
 
  
  Thanks! A little more tweaking in the other parts of s390x resets
  and we should be able to reduce the number of wrong ioctls (I think I 
  found
  most cases that are responsible for the performance degradation).
 
 Hmm. We want to not only reduce, we want them be zero.
 In addition to a reworked MP_STATE patch set, we might be able to change the 
 code to call KVM_S390_INITIAL_RESET only from the cpu thread itself. 
 If that simplifies things, we could avoid doing KVM_S390_INITIAL_RESET on CPU 
 creation, because we know that all kernel version will do an implicit cpu 
 reset on cpu creation anyway. Can you have a try on this as well when 
 reworking that code? We could then fix this rcu performance penalty 
 independent from getting rid of that ioctl.
 
 Christian
 

Already working on it, only one ioctl left on vcpu creation that is called
from wrong context, trying to hide from me. Restarts and resets are already
blasting fast.

David



Re: [PATCH v5] arm64: fix VTTBR_BADDR_MASK

2014-08-19 Thread Joel Schopp

On 08/19/2014 07:24 AM, Christoffer Dall wrote:
 On Mon, Aug 18, 2014 at 03:36:04PM -0500, Joel Schopp wrote:
 The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
 systems.  Rather than just add a bit it seems like a good time to also set
  things at run-time instead of compile time to accommodate more hardware.

 This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask in runtime,
 not compile time.

 In ARMv8, EL2 physical address size (TCR_EL2.PS) and stage2 input address
 size (VTCR_EL2.T0SZE) cannot be determined in compile time since they
 depend on hardware capability.

 According to Table D4-23 and Table D4-25 in ARM DDI 0487A.b document,
 vttbr_x is calculated using different fixed values with consideration
 of T0SZ, granule size and the level of translation tables. Therefore,
 vttbr_baddr_mask should be determined dynamically.

 Changes since v4:
 More minor cleanups from review
 Moved some functions into headers

 Changes since v3:
 Another rebase
 Addressed minor comments from v2

 Changes since v2:
 Rebased on https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git 
 next branch

 Changes since v1:
 Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to
 provide better long term fix.  Updated that patch to log error instead of
 silently fail on unaligned vttbr.

 Cc: Christoffer Dall christoffer.d...@linaro.org
 Cc: Sungjinn Chung sungjinn.ch...@samsung.com
 Signed-off-by: Jungseok Lee jays@samsung.com
 Signed-off-by: Joel Schopp joel.sch...@amd.com
 ---
  arch/arm/include/asm/kvm_mmu.h   |   12 ++
  arch/arm/kvm/arm.c   |   17 +++-
  arch/arm64/include/asm/kvm_arm.h |   17 +---
  arch/arm64/include/asm/kvm_mmu.h |   78 
 ++
  arch/arm64/kvm/hyp-init.S|   20 +++---
  5 files changed, 122 insertions(+), 22 deletions(-)

 diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
 index 5c7aa3c..73f6ff6 100644
 --- a/arch/arm/include/asm/kvm_mmu.h
 +++ b/arch/arm/include/asm/kvm_mmu.h
 @@ -166,6 +166,18 @@ static inline void coherent_cache_guest_page(struct 
 kvm_vcpu *vcpu, hva_t hva,
  
  void stage2_flush_vm(struct kvm *kvm);
  
 +static inline int kvm_get_phys_addr_shift(void)
 +{
 +return KVM_PHYS_SHIFT;
 +}
 +
 +static inline int set_vttbr_baddr_mask(void)
 +{
 +vttbr_baddr_mask = VTTBR_BADDR_MASK;
 +return 0;
 +}
 +
 +
  #endif  /* !__ASSEMBLY__ */
  
  #endif /* __ARM_KVM_MMU_H__ */
 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
 index 3c82b37..f396eb7 100644
 --- a/arch/arm/kvm/arm.c
 +++ b/arch/arm/kvm/arm.c
 @@ -37,6 +37,7 @@
  #include asm/mman.h
  #include asm/tlbflush.h
  #include asm/cacheflush.h
 +#include asm/cputype.h
  #include asm/virt.h
  #include asm/kvm_arm.h
  #include asm/kvm_asm.h
 @@ -466,8 +467,14 @@ static void update_vttbr(struct kvm *kvm)
  /* update vttbr to be used with the new vmid */
  pgd_phys = virt_to_phys(kvm->arch.pgd);
  vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
 -kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
 -kvm->arch.vttbr |= vmid;
 +
 +/*
 + * If the VTTBR isn't aligned there is something wrong with the system
 + * or kernel.
 + */
 +BUG_ON(pgd_phys & ~vttbr_baddr_mask);
 +
 +kvm->arch.vttbr = pgd_phys | vmid;
  
  spin_unlock(&kvm_vmid_lock);
  }
 @@ -1052,6 +1059,12 @@ int kvm_arch_init(void *opaque)
  }
  }
  
 +err = set_vttbr_baddr_mask();
 +if (err) {
 +kvm_err("Cannot set vttbr_baddr_mask\n");
 +return -EINVAL;
 +}
 +
  cpu_notifier_register_begin();
  
  err = init_hyp_mode();
 diff --git a/arch/arm64/include/asm/kvm_arm.h 
 b/arch/arm64/include/asm/kvm_arm.h
 index 3d69030..8dbef70 100644
 --- a/arch/arm64/include/asm/kvm_arm.h
 +++ b/arch/arm64/include/asm/kvm_arm.h
 @@ -94,7 +94,6 @@
  /* TCR_EL2 Registers bits */
  #define TCR_EL2_TBI (1 << 20)
  #define TCR_EL2_PS  (7 << 16)
 -#define TCR_EL2_PS_40B  (2 << 16)
  #define TCR_EL2_TG0 (1 << 14)
  #define TCR_EL2_SH0 (3 << 12)
  #define TCR_EL2_ORGN0   (3 << 10)
 @@ -103,8 +102,6 @@
  #define TCR_EL2_MASK(TCR_EL2_TG0 | TCR_EL2_SH0 | \
   TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
  
 -#define TCR_EL2_FLAGS   (TCR_EL2_PS_40B)
 -
  /* VTCR_EL2 Registers bits */
  #define VTCR_EL2_PS_MASK    (7 << 16)
  #define VTCR_EL2_TG0_MASK   (1 << 14)
 @@ -119,36 +116,28 @@
  #define VTCR_EL2_SL0_MASK   (3 << 6)
  #define VTCR_EL2_SL0_LVL1   (1 << 6)
  #define VTCR_EL2_T0SZ_MASK  0x3f
 -#define VTCR_EL2_T0SZ_40B   24
 +#define VTCR_EL2_T0SZ(bits) (64 - (bits))
  
  #ifdef CONFIG_ARM64_64K_PAGES
  /*
   * Stage2 translation configuration:
 - * 40bits output (PS = 2)
 - * 40bits input  (T0SZ = 24)
   * 64kB pages (TG0 = 1)
   * 2 level page tables (SL = 1)
   */
  #define VTCR_EL2_FLAGS  (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER 
 | \
   

Re: [PATCH v4] arm64: fix VTTBR_BADDR_MASK

2014-08-19 Thread Christoffer Dall
On Tue, Aug 19, 2014 at 09:05:09AM -0500, Joel Schopp wrote:
 
 On 08/19/2014 07:22 AM, Christoffer Dall wrote:
  On Mon, Aug 18, 2014 at 03:30:58PM -0500, Joel Schopp wrote:
   #endif /* __ARM_KVM_MMU_H__ */
  diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
  index 16e7994..70f0f02 100644
  --- a/arch/arm/kvm/mmu.c
  +++ b/arch/arm/kvm/mmu.c
  @@ -521,6 +521,7 @@ int create_hyp_io_mappings(void *from, void *to, 
  phys_addr_t phys_addr)
*/
   int kvm_alloc_stage2_pgd(struct kvm *kvm)
   {
  +  unsigned int s2_pgds, s2_pgd_order;
 pgd_t *pgd;
   
 	if (kvm->arch.pgd != NULL) {
  @@ -528,10 +529,18 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
 return -EINVAL;
 }
   
  -  pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER);
  +  s2_pgds = (1 << (kvm_get_phys_addr_shift() - PGDIR_SHIFT));
  +  s2_pgd_order = get_order(s2_pgds * sizeof(pgd_t));
  +
  +  pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, s2_pgd_order);
 if (!pgd)
 return -ENOMEM;
   
  +  if ((unsigned long)pgd & ~vttbr_baddr_mask) {
  +  kvm_err("Stage-2 pgd not correctly aligned: %p\n", pgd);
  +  return -EFAULT;
  +  }
 
 
  There are two problems that I've found here.  The first problem is that
  vttbr_baddr_mask isn't allocated yet at this point in the code.
  allocated? you mean assigned?
  aren't you setting vttbr_baddr_mask in kvm_arch_init()?  that's
  certainly called before kvm_arch_init_vm().
 Yes, I mean assigned, at least I got the first letter correct :)  All I
 know is that vttbr_baddr_mask was still zero and checking for zero and
 calling the set function gave it a value.

that sounds weird and wrong.  Hum.  Mind sticking a few prints in
there and figuring out what's causing this?

 
 
  The
  second problem is that pgd is a virtual address, ie pgd ==
  0xfe03bbb4 while the vttbr masks off the high bits for a
  physical address, ie vttbr_baddr_mask=0x7ffe .  Even
  correcting for those issues I haven't been able to make this check work
  properly.  I'll resend v5 the patch with all the other suggested changes.
 
  What are the issues that you face?  Iow. what is the alignment of the
  returned physical address?
 
  (You should be able to just do virt_to_phys(pgd) and use that to test
  for the vttbr_baddr_mask).
 The addresses above are actually from my system, 64K page aligned on a
 64K page kernel.  I did use virt_to_phys() and the kernel got a null
  dereference and panicked, I didn't trace down where the panic was occurring.
 
virt_to_phys() directly caused the null dereference?  That sounds bad!

Would you mind trying to trace this down?  I'll be happy to provide as
much help as I can along the way.

-Christoffer


Re: [PATCH v5] arm64: fix VTTBR_BADDR_MASK

2014-08-19 Thread Christoffer Dall
On Tue, Aug 19, 2014 at 09:23:57AM -0500, Joel Schopp wrote:
 
 On 08/19/2014 07:24 AM, Christoffer Dall wrote:
  On Mon, Aug 18, 2014 at 03:36:04PM -0500, Joel Schopp wrote:
  The current VTTBR_BADDR_MASK only masks 39 bits, which is broken on current
  systems.  Rather than just add a bit it seems like a good time to also set
  things at run-time instead of compile time to accommodate more hardware.
 
  This patch sets TCR_EL2.PS, VTCR_EL2.T0SZ and vttbr_baddr_mask at runtime,
  not compile time.
 
  In ARMv8, EL2 physical address size (TCR_EL2.PS) and stage2 input address
  size (VTCR_EL2.T0SZ) cannot be determined at compile time since they
  depend on hardware capability.
 
  According to Table D4-23 and Table D4-25 in ARM DDI 0487A.b document,
  vttbr_x is calculated using different fixed values with consideration
  of T0SZ, granule size and the level of translation tables. Therefore,
  vttbr_baddr_mask should be determined dynamically.
 
  Changes since v4:
  More minor cleanups from review
  Moved some functions into headers
 
  Changes since v3:
  Another rebase
  Addressed minor comments from v2
 
  Changes since v2:
  Rebased on 
  https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git next 
  branch
 
  Changes since v1:
  Rebased fix on Jungseok Lee's patch https://lkml.org/lkml/2014/5/12/189 to
  provide better long term fix.  Updated that patch to log error instead of
  silently fail on unaligned vttbr.
 
  Cc: Christoffer Dall christoffer.d...@linaro.org
  Cc: Sungjinn Chung sungjinn.ch...@samsung.com
  Signed-off-by: Jungseok Lee jays@samsung.com
  Signed-off-by: Joel Schopp joel.sch...@amd.com
  ---
   arch/arm/include/asm/kvm_mmu.h   |   12 ++
   arch/arm/kvm/arm.c   |   17 +++-
   arch/arm64/include/asm/kvm_arm.h |   17 +---
   arch/arm64/include/asm/kvm_mmu.h |   78 
  ++
   arch/arm64/kvm/hyp-init.S|   20 +++---
   5 files changed, 122 insertions(+), 22 deletions(-)
 
  diff --git a/arch/arm/include/asm/kvm_mmu.h 
  b/arch/arm/include/asm/kvm_mmu.h
  index 5c7aa3c..73f6ff6 100644
  --- a/arch/arm/include/asm/kvm_mmu.h
  +++ b/arch/arm/include/asm/kvm_mmu.h
  @@ -166,6 +166,18 @@ static inline void coherent_cache_guest_page(struct 
  kvm_vcpu *vcpu, hva_t hva,
   
   void stage2_flush_vm(struct kvm *kvm);
   
  +static inline int kvm_get_phys_addr_shift(void)
  +{
  +  return KVM_PHYS_SHIFT;
  +}
  +
  +static inline int set_vttbr_baddr_mask(void)
  +{
  +  vttbr_baddr_mask = VTTBR_BADDR_MASK;
  +  return 0;
  +}
  +
  +
   #endif/* !__ASSEMBLY__ */
   
   #endif /* __ARM_KVM_MMU_H__ */
  diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
  index 3c82b37..f396eb7 100644
  --- a/arch/arm/kvm/arm.c
  +++ b/arch/arm/kvm/arm.c
  @@ -37,6 +37,7 @@
   #include <asm/mman.h>
   #include <asm/tlbflush.h>
   #include <asm/cacheflush.h>
  +#include <asm/cputype.h>
   #include <asm/virt.h>
   #include <asm/kvm_arm.h>
   #include <asm/kvm_asm.h>
  @@ -466,8 +467,14 @@ static void update_vttbr(struct kvm *kvm)
 /* update vttbr to be used with the new vmid */
	pgd_phys = virt_to_phys(kvm->arch.pgd);
	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
  -  kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK;
  -  kvm->arch.vttbr |= vmid;
  +
  +  /*
  +   * If the VTTBR isn't aligned there is something wrong with the system
  +   * or kernel.
  +   */
  +  BUG_ON(pgd_phys & ~vttbr_baddr_mask);
  +
  +  kvm->arch.vttbr = pgd_phys | vmid;
   
 spin_unlock(kvm_vmid_lock);
   }
  @@ -1052,6 +1059,12 @@ int kvm_arch_init(void *opaque)
 }
 }
   
  +  err = set_vttbr_baddr_mask();
  +  if (err) {
  +  kvm_err("Cannot set vttbr_baddr_mask\n");
  +  return -EINVAL;
  +  }
  +
 cpu_notifier_register_begin();
   
 err = init_hyp_mode();
  diff --git a/arch/arm64/include/asm/kvm_arm.h 
  b/arch/arm64/include/asm/kvm_arm.h
  index 3d69030..8dbef70 100644
  --- a/arch/arm64/include/asm/kvm_arm.h
  +++ b/arch/arm64/include/asm/kvm_arm.h
  @@ -94,7 +94,6 @@
   /* TCR_EL2 Registers bits */
   #define TCR_EL2_TBI   (1 << 20)
   #define TCR_EL2_PS    (7 << 16)
  -#define TCR_EL2_PS_40B    (2 << 16)
   #define TCR_EL2_TG0   (1 << 14)
   #define TCR_EL2_SH0   (3 << 12)
   #define TCR_EL2_ORGN0 (3 << 10)
  @@ -103,8 +102,6 @@
   #define TCR_EL2_MASK  (TCR_EL2_TG0 | TCR_EL2_SH0 | \
  TCR_EL2_ORGN0 | TCR_EL2_IRGN0 | TCR_EL2_T0SZ)
   
  -#define TCR_EL2_FLAGS (TCR_EL2_PS_40B)
  -
   /* VTCR_EL2 Registers bits */
   #define VTCR_EL2_PS_MASK  (7 << 16)
   #define VTCR_EL2_TG0_MASK (1 << 14)
  @@ -119,36 +116,28 @@
   #define VTCR_EL2_SL0_MASK (3 << 6)
   #define VTCR_EL2_SL0_LVL1 (1 << 6)
   #define VTCR_EL2_T0SZ_MASK    0x3f
  -#define VTCR_EL2_T0SZ_40B 24
  +#define VTCR_EL2_T0SZ(bits)   (64 - (bits))
   
   #ifdef CONFIG_ARM64_64K_PAGES
   /*
* Stage2 translation configuration:
  - * 40bits output (PS = 2)
  - * 40bits input  (T0SZ = 24)
* 64kB 

Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Christian Borntraeger
On 19/08/14 16:23, David Hildenbrand wrote:
 On 19/08/14 14:14, David Hildenbrand wrote:
 Il 19/08/2014 13:28, David Hildenbrand ha scritto:
 Looking at the code, kvm_cpu_synchronize_state() seems to do these ioctls 
 in
 the vcpu thread (e.g. coming from cpu_synchronize_all_states()), any 
 reasons
 why kvm_cpu_synchronize_post_reset() doesn't do the same (e.g. called from
 cpu_synchronize_all_post_reset())?

 No reason, feel free to post a patch for QEMU kvm-all.c.
 Documentation/virtual/kvm/api.txt clearly says:

Only run vcpu ioctls from the same thread that was used to create the
vcpu.

 Paolo


 Thanks! A little more tweaking in the other parts of s390x resets
 and we should be able to reduce the number of wrong ioctls (I think I 
 found
 most cases that are responsible for the performance degradation).

 Hmm. We want to not only reduce them, we want them to be zero.
 In addition to a reworked MP_STATE patch set, we might be able to change the 
 code to call KVM_S390_INITIAL_RESET only from the cpu thread itself. 
 If that simplifies things, we could avoid doing KVM_S390_INITIAL_RESET on 
 CPU creation, because we know that all kernel versions will do an implicit 
 cpu reset on cpu creation anyway. Can you have a try on this as well when 
 reworking that code? We could then fix this rcu performance penalty 
 independent from getting rid of that ioctl.

 Christian

 
 Already working on it, only one ioctl left on vcpu creation that is called
 from wrong context, trying to hide from me. Restarts and resets are already

Maybe it's the synchronize when the oldpid is 0? Can you check the patch that I 
just sent?

 blasting fast.
 
 David
 



[PATCH] KVM: avoid unnecessary synchronize_rcu

2014-08-19 Thread Christian Borntraeger
We don't have to wait for a grace period if there is no oldpid that
we are going to free. put_pid() also checks for NULL, so this patch
only fences synchronize_rcu.

Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
---
 virt/kvm/kvm_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 33712fb..39b1603 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -129,7 +129,8 @@ int vcpu_load(struct kvm_vcpu *vcpu)
	struct pid *oldpid = vcpu->pid;
	struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
	rcu_assign_pointer(vcpu->pid, newpid);
-   synchronize_rcu();
+   if (oldpid)
+   synchronize_rcu();
put_pid(oldpid);
}
cpu = get_cpu();
-- 
1.8.4.2



Re: [PATCH v5] arm64: fix VTTBR_BADDR_MASK

2014-08-19 Thread Joel Schopp

 The return is a value, not just an error code. Because of this returning
 an error overloads that value.  0 just seemed like a convenient invalid
 value to check since a vttbr_x of 0 is invalid, but returning a negative
 error code would be as equally invalid.  If this is the only issue it
 doesn't seem worth respinning the patch for, but I'll change it to
 -EINVAL if for some reason a v6 is needed.
 Have you given up on doing the alignment check with the proper size on
 the pgd allocation for this patch?
Yes, I'd rather leave the extra check out of this patch.  If I were
changing the pgd allocation code I would make sure to add a check, or if
there were a static check there now I would update it for the dynamic
value from the hardware, but it seems unrelated to add several checks to
other parts of the code beyond those already in the patch.  I did leave
the functions in the headers such that checks like this could be added
when someone is updating the code for other reasons, say 4 level page
tables.

-Joel


Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread David Hildenbrand
  Hmm. We want to not only reduce, we want them be zero.
  In addition to a reworked MP_STATE patch set, we might be able to change 
  the code to call KVM_S390_INITIAL_RESET only from the cpu thread itself. 
  If that simplifies things, we could avoid doing KVM_S390_INITIAL_RESET on 
  CPU creation, because we know that all kernel versions will do an implicit 
  cpu reset on cpu creation anyway. Can you have a try on this as well when 
  reworking that code? We could then fix this rcu performance penalty 
  independent from getting rid of that ioctl.
 
  Christian
 
  
  Already working on it, only one ioctl left on vcpu creation that is called
  from wrong context, trying to hide from me. Restarts and resets are already
 
 Maybe it's the synchronize when the oldpid is 0? Can you check the patch that 
 I just sent?

Already got that in my code. Seems to be an architecture specific one called
from wrong context. (actually it is the third one being called
after SET_MP_STATE and SET_SIGNAL_MASK).

A few more minutes and I should have it :)

David

 
  blasting fast.
  
  David
  
 



Re: [PATCH v4] arm64: fix VTTBR_BADDR_MASK

2014-08-19 Thread Joel Schopp

On 08/19/2014 09:37 AM, Christoffer Dall wrote:
 On Tue, Aug 19, 2014 at 09:05:09AM -0500, Joel Schopp wrote:
 On 08/19/2014 07:22 AM, Christoffer Dall wrote:
 On Mon, Aug 18, 2014 at 03:30:58PM -0500, Joel Schopp wrote:
  #endif /* __ARM_KVM_MMU_H__ */
 diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
 index 16e7994..70f0f02 100644
 --- a/arch/arm/kvm/mmu.c
 +++ b/arch/arm/kvm/mmu.c
 @@ -521,6 +521,7 @@ int create_hyp_io_mappings(void *from, void *to, 
 phys_addr_t phys_addr)
   */
  int kvm_alloc_stage2_pgd(struct kvm *kvm)
  {
 +  unsigned int s2_pgds, s2_pgd_order;
pgd_t *pgd;
  
	if (kvm->arch.pgd != NULL) {
 @@ -528,10 +529,18 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
return -EINVAL;
}
  
 -  pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER);
  +  s2_pgds = (1 << (kvm_get_phys_addr_shift() - PGDIR_SHIFT));
 +  s2_pgd_order = get_order(s2_pgds * sizeof(pgd_t));
 +
 +  pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, s2_pgd_order);
if (!pgd)
return -ENOMEM;
  
  +  if ((unsigned long)pgd & ~vttbr_baddr_mask) {
  +  kvm_err("Stage-2 pgd not correctly aligned: %p\n", pgd);
 +  return -EFAULT;
 +  }


 There are two problems that I've found here.  The first problem is that
 vttbr_baddr_mask isn't allocated yet at this point in the code.
 allocated? you mean assigned?
 aren't you setting vttbr_baddr_mask in kvm_arch_init()?  that's
 certainly called before kvm_arch_init_vm().
 Yes, I mean assigned, at least I got the first letter correct :)  All I
 know is that vttbr_baddr_mask was still zero and checking for zero and
 calling the set function gave it a value.
 that sounds weird and wrong.  Hum.  Mind sticking a few prints in
 there and figuring out what's causing this?


 The
 second problem is that pgd is a virtual address, ie pgd ==
 0xfe03bbb4 while the vttbr masks off the high bits for a
 physical address, ie vttbr_baddr_mask=0x7ffe .  Even
 correcting for those issues I haven't been able to make this check work
 properly.  I'll resend v5 the patch with all the other suggested changes.

 What are the issues that you face?  Iow. what is the alignment of the
 returned physical address?

 (You should be able to just do virt_to_phys(pgd) and use that to test
 for the vttbr_baddr_mask).
 The addresses above are actually from my system, 64K page aligned on a
 64K page kernel.  I did use virt_to_phys() and the kernel got a null
 dereference and panicked, I didn't trace down where the panic was occurring.

 virt_to_phys() directly caused the null dereference?  That sounds bad!
I don't think it was the virt_to_phys() directly causing the null
dereference, but again I didn't trace it down.


 Would you mind trying to trace this down?  I'll be happy to provide as
 much help as I can along the way.
I can break the kvm_alloc_stage2_pgd check into a separate patch on top
of this one and circle back around to it after I finish another
unrelated thing I'm working on.
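
For reference, a minimal (untested) sketch of the check being discussed: it reuses
s2_pgd_order and vttbr_baddr_mask from the patch, assumes the mask has already been
set up by kvm_arch_init(), and compares the physical address rather than the virtual
one (pgd_phys here is a hypothetical local phys_addr_t):

	pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, s2_pgd_order);
	if (!pgd)
		return -ENOMEM;

	pgd_phys = virt_to_phys(pgd);
	if (pgd_phys & ~vttbr_baddr_mask) {
		kvm_err("Stage-2 pgd not correctly aligned: %pa\n", &pgd_phys);
		free_pages((unsigned long)pgd, s2_pgd_order);
		return -EFAULT;
	}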


Re: [PATCH v5] arm64: fix VTTBR_BADDR_MASK

2014-08-19 Thread Christoffer Dall
On Tue, Aug 19, 2014 at 09:49:07AM -0500, Joel Schopp wrote:
 
  The return is a value, not just an error code. Because of this returning
  an error overloads that value.  0 just seemed like a convenient invalid
  value to check since a vttbr_x of 0 is invalid, but returning a negative
  error code would be as equally invalid.  If this is the only issue it
  doesn't seem worth respinning the patch for, but I'll change it to
  -EINVAL if for some reason a v6 is needed.
  Have you given up on doing the alignment check with the proper size on
  the pgd allocation for this patch?
 Yes, I'd rather leave the extra check out of this patch.  If I were
 changing the pgd allocation code I would make sure to add a check, or if
 there were a static check there now I would update it for the dynamic
 value from the hardware, but it seems unrelated to add several checks to
 other parts of the code beyond those already in the patch.  I did leave
 the functions in the headers such that checks like this could be added
 when someone is updating the code for other reasons, say 4 level page
 tables.
 

hmmm, the point is that we need to ensure that we have a properly
aligned allocated PGD, that's what this patch currently addresses, and as
you pointed out, the BUG_ON() just before trying to run a VM is not the
nicest solution - we should really be dealing with this properly at
allocation time.

But, if you don't have time to look at that, then ok, I'll have to pick
it up myself.

However, you are hinting that we do not support 4 levels of page tables,
yet you do allow the t0sz_to_vttbr_x function to pass even when using
t0sz values only supported under 4 levels of page tables.  What is
the rationale for that?

-Christoffer


Re: [PATCH] KVM: avoid unnecessary synchronize_rcu

2014-08-19 Thread Raghavendra K T

On 08/19/2014 08:15 PM, Christian Borntraeger wrote:

We dont have to wait for a grace period if there is no oldpid that
we are going to free. putpid also checks for NULL, so this patch
only fences synchronize_rcu.

Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com

Feel free to add:
Reviewed-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com





Re: [PATCH v4] arm64: fix VTTBR_BADDR_MASK

2014-08-19 Thread Christoffer Dall
On Tue, Aug 19, 2014 at 09:53:51AM -0500, Joel Schopp wrote:
 
 On 08/19/2014 09:37 AM, Christoffer Dall wrote:
  On Tue, Aug 19, 2014 at 09:05:09AM -0500, Joel Schopp wrote:
  On 08/19/2014 07:22 AM, Christoffer Dall wrote:
  On Mon, Aug 18, 2014 at 03:30:58PM -0500, Joel Schopp wrote:
   #endif /* __ARM_KVM_MMU_H__ */
  diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
  index 16e7994..70f0f02 100644
  --- a/arch/arm/kvm/mmu.c
  +++ b/arch/arm/kvm/mmu.c
  @@ -521,6 +521,7 @@ int create_hyp_io_mappings(void *from, void *to, 
  phys_addr_t phys_addr)
*/
   int kvm_alloc_stage2_pgd(struct kvm *kvm)
   {
  +unsigned int s2_pgds, s2_pgd_order;
   pgd_t *pgd;
   
	if (kvm->arch.pgd != NULL) {
  @@ -528,10 +529,18 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
   return -EINVAL;
   }
   
  -pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, S2_PGD_ORDER);
   +	s2_pgds = (1 << (kvm_get_phys_addr_shift() - PGDIR_SHIFT));
  +s2_pgd_order = get_order(s2_pgds * sizeof(pgd_t));
  +
  +pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, s2_pgd_order);
   if (!pgd)
   return -ENOMEM;
   
   +	if ((unsigned long)pgd & ~vttbr_baddr_mask) {
   +		kvm_err("Stage-2 pgd not correctly aligned: %p\n", pgd);
  +return -EFAULT;
  +}
 
 
  There are two problems that I've found here.  The first problem is that
  vttbr_baddr_mask isn't allocated yet at this point in the code.
  allocated? you mean assigned?
  aren't you setting vttbr_baddr_mask in kvm_arch_init()?  that's
  certainly called before kvm_arch_init_vm().
  Yes, I mean assigned, at least I got the first letter correct :)  All I
  know is that vttbr_baddr_mask was still zero and checking for zero and
  calling the set function gave it a value.
  that sounds weird and wrong.  Hum.  Mind sticking a few prints in
  there and figuring out what's causing this?
 
 
  The
  second problem is that pgd is a virtual address, ie pgd ==
  0xfe03bbb4 while the vttbr masks off the high bits for a
  physical address, ie vttbr_baddr_mask=0x7ffe .  Even
  correcting for those issues I haven't been able to make this check work
  properly.  I'll resend v5 the patch with all the other suggested changes.
 
  What are the issues that you face?  Iow. what is the alignment of the
  returned physical address?
 
  (You should be able to just do virt_to_phys(pgd) and use that to test
  for the vttbr_baddr_mask).
  The addresses above are actually from my system, 64K page aligned on a
  64K page kernel.  I did use virt_to_phys() and the kernel got a null
  dereference and panicked, I didn't trace down where the panic was occurring.
 
  virt_to_phys() directly caused the null dereference?  That sounds bad!
 I don't think it was the virt_to_phys() directly causing the null
 dereference, but again I didn't trace it down.
 
 
  Would you mind trying to trace this down?  I'll be happy to provide as
  much help as I can along the way.
 I can break the kvm_alloc_stage2_pgd check into a separate patch on top
 of this one and circle back around to it after I finish another
 unrelated thing I'm working on.

that would be great, thanks.


Re: [PATCH v5] arm64: fix VTTBR_BADDR_MASK

2014-08-19 Thread Joel Schopp

 hmmm, the point is that we need to ensure that we have a properly
 aligned allocated PGD, that's what this patch currently addresses, and as
 you pointed out, the BUG_ON() just before trying to run a VM is not the
 nicest solution - we should really be dealing with this properly at
 allocation time.

 But, if you don't have time to look at that, then ok, I'll have to pick
 it up myself.

 However, you are hinting that we do not support 4 levels of page tables,
 yet you do allow the t0sz_to_vttbr_x funciton to pass even when using
 t0sz values only supported under 4 levels of page tables  What is
 the rationale for that?
Minimizing merge conflicts. I figure both are going in within the next
month or two.


Re: [PATCH] virt/kvm/assigned-dev.c: Set 'dev-irq_source_id' to '-1' after free it

2014-08-19 Thread Chen Gang
Hello maintainers:

Please help check this patch, when you have time.

Thanks.

On 08/08/2014 11:37 PM, Chen Gang wrote:
 As a generic function, deassign_guest_irq() assumes it can be called
 even if assign_guest_irq() was not called successfully (which can be
 triggered by ioctl from user mode, indirectly).
 
 So in the assign_guest_irq() failure path, we need to set 'dev->irq_source_id'
 to -1 after freeing 'dev->irq_source_id', or deassign_guest_irq() may free
 it again.
 
 Signed-off-by: Chen Gang gang.chen.5...@gmail.com
 ---
  virt/kvm/assigned-dev.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)
 
 diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
 index bf06577..5819a27 100644
 --- a/virt/kvm/assigned-dev.c
 +++ b/virt/kvm/assigned-dev.c
 @@ -526,8 +526,10 @@ static int assign_guest_irq(struct kvm *kvm,
 		dev->irq_requested_type |= guest_irq_type;
 		if (dev->ack_notifier.gsi != -1)
 			kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
 -	} else
 +	} else {
 		kvm_free_irq_source_id(kvm, dev->irq_source_id);
 +		dev->irq_source_id = -1;
 + }
  
   return r;
  }
 


-- 
Chen Gang

Open share and attitude like air water and life which God blessed


Re: [PATCH] virt/kvm/assigned-dev.c: Set 'dev-irq_source_id' to '-1' after free it

2014-08-19 Thread Paolo Bonzini
Il 19/08/2014 17:44, Chen Gang ha scritto:
 Hello maintainers:
 
 Please help check this patch, when you have time.

Hi, it's already on its way to 3.17-rc2, but I first have to run a bunch
of tests.

Paolo

 Thanks.
 
 On 08/08/2014 11:37 PM, Chen Gang wrote:
 As a generic function, deassign_guest_irq() assumes it can be called
  even if assign_guest_irq() was not called successfully (which can be
 triggered by ioctl from user mode, indirectly).

  So in the assign_guest_irq() failure path, we need to set 'dev->irq_source_id'
  to -1 after freeing 'dev->irq_source_id', or deassign_guest_irq() may free
 it again.

 Signed-off-by: Chen Gang gang.chen.5...@gmail.com
 ---
  virt/kvm/assigned-dev.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

 diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
 index bf06577..5819a27 100644
 --- a/virt/kvm/assigned-dev.c
 +++ b/virt/kvm/assigned-dev.c
 @@ -526,8 +526,10 @@ static int assign_guest_irq(struct kvm *kvm,
 		dev->irq_requested_type |= guest_irq_type;
 		if (dev->ack_notifier.gsi != -1)
 			kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
 -	} else
 +	} else {
 		kvm_free_irq_source_id(kvm, dev->irq_source_id);
 +		dev->irq_source_id = -1;
 +}
  
  return r;
  }




[Bug 82761] DMAR:[fault reason 06] PTE Read access is not set

2014-08-19 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=82761

--- Comment #2 from Ansa89 ansalonistef...@gmail.com ---
1) I would prefer to stay on a stable kernel if possible (which commits of
3.17-rc1 would be relevant for this bug?).

2) Yes, all of the 8169 NICs are up and running.

3) lspci -vvs 04:00.0
04:00.0 PCI bridge: ASMedia Technology Inc. ASM1083/1085 PCIe to PCI Bridge
(rev 03) (prog-if 00 [Normal decode])
Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr-
Stepping- SERR- FastB2B- DisINTx-
Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort-
MAbort- SERR- PERR- INTx-
Latency: 0, Cache Line Size: 64 bytes
Bus: primary=04, secondary=05, subordinate=05, sec-latency=32
I/O behind bridge: c000-cfff
Memory behind bridge: f780-f78f
Secondary status: 66MHz+ FastB2B- ParErr- DEVSEL=fast TAbort- TAbort-
MAbort+ SERR- PERR-
BridgeCtl: Parity- SERR- NoISA- VGA- MAbort- Reset- FastB2B-
PriDiscTmr- SecDiscTmr- DiscTmrStat- DiscTmrSERREn-
Capabilities: [50] MSI: Enable- Count=1/1 Maskable- 64bit+
Address:   Data: 
Capabilities: [78] Power Management version 3
Flags: PMEClk- DSI+ D1+ D2+ AuxCurrent=0mA
PME(D0+,D1+,D2+,D3hot+,D3cold+)
Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=0 PME-
Capabilities: [80] Express (v1) PCI/PCI-X Bridge, MSI 00
DevCap: MaxPayload 128 bytes, PhantFunc 0, Latency L0s 64ns,
L1 1us
ExtTag- AttnBtn- AttnInd- PwrInd- RBE+ FLReset-
DevCtl: Report errors: Correctable- Non-Fatal- Fatal-
Unsupported-
RlxdOrd- ExtTag- PhantFunc- AuxPwr- NoSnoop+
BrConfRtry-
MaxPayload 128 bytes, MaxReadReq 512 bytes
DevSta: CorrErr- UncorrErr+ FatalErr- UnsuppReq+ AuxPwr-
TransPend-
LnkCap: Port #1, Speed 2.5GT/s, Width x1, ASPM L0s L1, Latency
L0 2us, L1 2us
ClockPM- Surprise- LLActRep- BwNot-
LnkCtl: ASPM Disabled; Disabled- Retrain- CommClk-
ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-
LnkSta: Speed 2.5GT/s, Width x1, TrErr- Train- SlotClk-
DLActive- BWMgmt- ABWMgmt-
Capabilities: [c0] Subsystem: Micro-Star International Co., Ltd. Device
7758
Capabilities: [100 v1] Virtual Channel
Caps:   LPEVC=0 RefClk=100ns PATEntryBits=1
Arb:Fixed- WRR32- WRR64- WRR128-
Ctrl:   ArbSelect=Fixed
Status: InProgress-
VC0:Caps:   PATOffset=00 MaxTimeSlots=1 RejSnoopTrans-
Arb:Fixed- WRR32- WRR64- WRR128- TWRR128- WRR256-
Ctrl:   Enable+ ID=0 ArbSelect=Fixed TC/VC=01
Status: NegoPending- InProgress-



[Bug 82761] DMAR:[fault reason 06] PTE Read access is not set

2014-08-19 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=82761

--- Comment #3 from Alex Williamson alex.william...@redhat.com ---
(In reply to Ansa89 from comment #2)
 1) I would prefer stay on stable kernel if it's possible (which commits of
 3.17-rc1 would be relevant for this bug?).

579305f iommu/vt-d: Update to use PCI DMA aliases
e17f9ff iommu/vt-d: Use iommu_group_get_for_dev()
104a1c1 iommu/core: Create central IOMMU group lookup/creation interface
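
(If one wanted to try those on a stable tree instead of moving to 3.17-rc1, a rough,
untested starting point could be cherry-picking them, presumably oldest first, with
conflicts likely:

	git checkout -b iommu-dma-alias-backport v3.16.1
	git cherry-pick 104a1c1 e17f9ff 579305f
)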



[Bug 82761] DMAR:[fault reason 06] PTE Read access is not set

2014-08-19 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=82761

--- Comment #4 from Ansa89 ansalonistef...@gmail.com ---
I will try 3.17-rc1 (hoping it's stable enough for a home server).



virt-install: failed to initialize KVM: Permission denied

2014-08-19 Thread arnaud gaboury
$ uname -r
3.16.1-1-ARCH
-

As a regular user, member of the libvirt group, I run this command to
create a basic VM:

virt-install --connect qemu:///system --name=test --ram 2048 --cpu
host-model-only --os-variant=win7 --disk /myVM/test --boot cdrom,hd
--virt-type kvm --graphics spice --controller scsi,model=virtio-scsi
--cdrom=/drawer/myIso/w8.iso

It returns an error :
--
---
Starting install...
ERROR    internal error: process exited while connecting to monitor:
Could not access KVM kernel module: Permission denied
failed to initialize KVM: Permission denied
-

$ getfacl /dev/kvm

# file: dev/kvm
# owner: root
# group: kvm
user::rw-
user:martinus:rw-
group::rw-
mask::rw-
other::---

The command output seems to indicate the rights are correct.
$ lsmod shows that kvm and kvm_intel are loaded.

If I run the virt-install with qemu:///session, I do not have this
issue and can create the VM.

I found many entries about the KVM permission issue, but with no clear
answer to solve it.

Thank you for any suggestion

-- 

google.com/+arnaudgabourygabx
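
A few checks that are sometimes useful here (suggestions only, not a confirmed
diagnosis): group changes only take effect in a fresh login session, and with
qemu:///system libvirtd starts QEMU as the user/group configured in
/etc/libvirt/qemu.conf rather than as the invoking user, so that account also
needs access to /dev/kvm:

	id -nG                      # does the *current* session include the kvm/libvirt groups?
	ls -l /dev/kvm              # owner, group and mode of the device node
	grep -E '^(user|group)' /etc/libvirt/qemu.conf   # account QEMU runs as for qemu:///system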


Re: [RFC 0/4] VFIO: PLATFORM: Return device tree info for a platform device node

2014-08-19 Thread Joel Schopp

 This RFC's intention is to show what an interface to access device node
 properties for VFIO_PLATFORM can look like.

 If a device tree node corresponding to a platform device bound by 
 VFIO_PLATFORM
 is available, this patch series will allow the user to query the properties
 associated with this device node. This can be useful for userspace drivers
 to automatically query parameters related to the device.

 An API to return data from a device's device tree has been proposed before on
 these lists. The API proposed here is slightly different.

 Properties to parse from the device tree are not indexed by a numerical id.
 The host system doesn't guarantee any specific ordering for the available
 properties, or that those will remain the same; while this does not happen in
 practice, there is nothing preventing the host from changing the device nodes during
 operation. So properties are accessed by property name.

 The type of the property accessed must also be known by the user. Properties
 types implemented in this RFC:
 - VFIO_DEVTREE_ARR_TYPE_STRING (strings separated by the null character)
 - VFIO_DEVTREE_ARR_TYPE_U32
 - VFIO_DEVTREE_ARR_TYPE_U16
 - VFIO_DEVTREE_ARR_TYPE_U8

 These can all be accessed by the ioctl VFIO_DEVICE_GET_DEVTREE_INFO. A new ioctl
 was preferred instead of shoehorning the functionality in 
 VFIO_DEVICE_GET_INFO.
 The structure exchanged looks like this:

You'll have to forgive my ignorance on the history.  But with the dtc
tool already supporting a filesystem representation (--in-format=fs),
with the dtc tool source already built into qemu, and having an example
implementation of such an interface in /proc/device-tree/ why is an
ioctl interface preferred over a filesystem interface? 



[PATCH 1/9] KVM: add kvm_arch_sched_in

2014-08-19 Thread Radim Krčmář
Introduce preempt notifiers for architecture specific code.
Advantage over creating a new notifier in every arch is slightly simpler
code and guaranteed call order with respect to kvm_sched_in.

Signed-off-by: Radim Krčmář rkrc...@redhat.com
---
 arch/arm/kvm/arm.c | 4 
 arch/mips/kvm/mips.c   | 4 
 arch/powerpc/kvm/powerpc.c | 4 
 arch/s390/kvm/kvm-s390.c   | 4 
 arch/x86/kvm/x86.c | 4 
 include/linux/kvm_host.h   | 2 ++
 virt/kvm/kvm_main.c| 2 ++
 7 files changed, 24 insertions(+)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index a99e0cd..9f788eb 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -288,6 +288,10 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 }
 
+void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
	vcpu->cpu = cpu;
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index cd71141..2362df2 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1002,6 +1002,10 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 }
 
+void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
  struct kvm_translation *tr)
 {
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4c79284..cbc432f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -720,6 +720,10 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kvmppc_subarch_vcpu_uninit(vcpu);
 }
 
+void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 #ifdef CONFIG_BOOKE
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ce81eb2..a3c324e 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -555,6 +555,10 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
/* Nothing todo */
 }
 
+void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
	save_fp_ctl(&vcpu->arch.host_fpregs.fpc);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8f1e22d..d7c214f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7146,6 +7146,10 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
static_key_slow_dec(kvm_no_apic_vcpu);
 }
 
+void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
if (type)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a4c33b3..ebd7236 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -624,6 +624,8 @@ void kvm_arch_exit(void);
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
 
+void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu);
+
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 33712fb..d3c3ed0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3123,6 +3123,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int 
cpu)
	if (vcpu->preempted)
		vcpu->preempted = false;
 
+   kvm_arch_sched_in(vcpu, cpu);
+
kvm_arch_vcpu_load(vcpu, cpu);
 }
 
-- 
2.0.4



[PATCH 5/9] KVM: VMX: clamp PLE window

2014-08-19 Thread Radim Krčmář
Modifications could get unwanted values of PLE window. (low or negative)
Use ple_window and the maximal value that cannot overflow as bounds.

ple_window_max defaults to a very high value, but it would make sense to
set it to some fraction of the scheduler tick.

Signed-off-by: Radim Krčmář rkrc...@redhat.com
---
 arch/x86/kvm/vmx.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 66259fd..e1192fb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -144,6 +144,10 @@ module_param(ple_window_grow, int, S_IRUGO);
 static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
 module_param(ple_window_shrink, int, S_IRUGO);
 
+/* Default is to compute the maximum so we can never overflow. */
+static int ple_window_max = INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_max, int, S_IRUGO);
+
 extern const ulong vmx_return;
 
 #define NR_AUTOLOAD_MSRS 8
@@ -5704,7 +5708,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
else
new = old + ple_window_grow;
 
-	vmx->ple_window = new;
+	vmx->ple_window = min(new, ple_window_max);
 }
 
 static void shrink_ple_window(struct kvm_vcpu *vcpu)
@@ -5720,7 +5724,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
else
new = old - ple_window_shrink;
 
-	vmx->ple_window = new;
+	vmx->ple_window = max(new, ple_window);
 }
 
 /*
-- 
2.0.4



[PATCH 6/9] KVM: trace kvm_ple_window grow/shrink

2014-08-19 Thread Radim Krčmář
Tracepoint for dynamic PLE window, fired on every potential change.

Signed-off-by: Radim Krčmář rkrc...@redhat.com
---
 arch/x86/kvm/trace.h | 29 +
 arch/x86/kvm/vmx.c   |  4 
 arch/x86/kvm/x86.c   |  1 +
 3 files changed, 34 insertions(+)

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index e850a7d..e4682f5 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -848,6 +848,35 @@ TRACE_EVENT(kvm_track_tsc,
  __print_symbolic(__entry->host_clock, host_clocks))
 );
 
+TRACE_EVENT(kvm_ple_window,
+   TP_PROTO(int grow, unsigned int vcpu_id, int new, int old),
+   TP_ARGS(grow, vcpu_id, new, old),
+
+   TP_STRUCT__entry(
+   __field( int,  grow )
+   __field(unsigned int,   vcpu_id )
+   __field( int,   new )
+   __field( int,   old )
+   ),
+
+   TP_fast_assign(
+		__entry->grow    = grow;
+		__entry->vcpu_id = vcpu_id;
+		__entry->new     = new;
+		__entry->old     = old;
+   ),
+
+	TP_printk("vcpu %u: ple_window %d %s %d",
+		  __entry->vcpu_id,
+		  __entry->new,
+		  __entry->grow ? "+" : "-",
+		  __entry->old)
+);
+#define trace_kvm_ple_window_grow(vcpu_id, new, old) \
+   trace_kvm_ple_window(1, vcpu_id, new, old)
+#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \
+   trace_kvm_ple_window(0, vcpu_id, new, old)
+
 #endif /* CONFIG_X86_64 */
 
 #endif /* _TRACE_KVM_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e1192fb..a236a9f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5709,6 +5709,8 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
new = old + ple_window_grow;
 
 	vmx->ple_window = min(new, ple_window_max);
 
+	trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
 }
 
 static void shrink_ple_window(struct kvm_vcpu *vcpu)
@@ -5725,6 +5727,8 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
new = old - ple_window_shrink;
 
 	vmx->ple_window = max(new, ple_window);
 
+	trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
 }
 
 /*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5696ee7..814b20c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7648,3 +7648,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
-- 
2.0.4



[PATCH 9/9] KVM: VMX: automatic PLE window maximum

2014-08-19 Thread Radim Krčmář
Every increase of ple_window_grow creates potential overflows.
They are not serious, because we clamp ple_window and userspace is
expected to fix ple_window_max within a second.
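
(Illustration: with the default ple_window_grow of 2, ple_window_max defaults to
INT_MAX / 2, so old * 2 cannot exceed INT_MAX; raising ple_window_grow without
also lowering ple_window_max could let old * ple_window_grow wrap around, which
is why writes to ple_window_grow go through a setter that re-clamps
ple_window_max.)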
---
 arch/x86/kvm/vmx.c | 34 +-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d7f58e8..6873a0b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -138,7 +138,9 @@ module_param(ple_window, int, S_IRUGO | S_IWUSR);
 
 /* Default doubles per-vcpu window every exit. */
 static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
-module_param(ple_window_grow, int, S_IRUGO | S_IWUSR);
+static struct kernel_param_ops ple_window_grow_ops;
+module_param_cb(ple_window_grow, ple_window_grow_ops,
+ple_window_grow, S_IRUGO | S_IWUSR);
 
 /* Default resets per-vcpu window every exit to ple_window. */
 static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
@@ -5717,6 +5719,36 @@ static void type##_ple_window(struct kvm_vcpu *vcpu) \
 make_ple_window_modifier(grow,   *, +) /* grow_ple_window */
 make_ple_window_modifier(shrink, /, -) /* shrink_ple_window */
 
+static void clamp_ple_window_max(void)
+{
+   int maximum;
+
+	if (ple_window_grow < 1)
+		return;
+
+	if (ple_window_grow < ple_window)
+   maximum = INT_MAX / ple_window_grow;
+   else
+   maximum = INT_MAX - ple_window_grow;
+
+   ple_window_max = clamp(ple_window_max, ple_window, maximum);
+}
+
+static int set_ple_window_grow(const char *arg, const struct kernel_param *kp)
+{
+   int ret;
+
+   clamp_ple_window_max();
+   ret = param_set_int(arg, kp);
+
+   return ret;
+}
+
+static struct kernel_param_ops ple_window_grow_ops = {
+   .set = set_ple_window_grow,
+   .get = param_get_int,
+};
+
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
-- 
2.0.4



[PATCH 7/9] KVM: VMX: abstract ple_window modifiers

2014-08-19 Thread Radim Krčmář
They were almost identical and thus merged with a loathable macro.

Signed-off-by: Radim Krčmář rkrc...@redhat.com
---
 This solution is hopefully more acceptable than function pointers.

 arch/x86/kvm/vmx.c | 53 +++--
 1 file changed, 19 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a236a9f..c6cfb71 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5694,42 +5694,27 @@ static int handle_invalid_guest_state(struct kvm_vcpu 
*vcpu)
 out:
return ret;
 }
-
-static void grow_ple_window(struct kvm_vcpu *vcpu)
-{
-   struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int old = vmx->ple_window;
-	int new;
-
-	if (ple_window_grow < 1)
-		new = ple_window;
-	else if (ple_window_grow < ple_window)
-		new = old * ple_window_grow;
-	else
-		new = old + ple_window_grow;
-
-	vmx->ple_window = min(new, ple_window_max);
-
-	trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
+#define make_ple_window_modifier(type, oplt, opge, cmp, bound) \
+static void type##_ple_window(struct kvm_vcpu *vcpu) \
+{ \
+	struct vcpu_vmx *vmx = to_vmx(vcpu); \
+	int old = vmx->ple_window; \
+	int new; \
+\
+	if (ple_window_##type < 1) \
+		new = ple_window; \
+	else if (ple_window_##type < ple_window) \
+		new = old oplt ple_window_##type; \
+	else \
+		new = old opge ple_window_##type; \
+\
+	vmx->ple_window = cmp(new, bound); \
+\
+	trace_kvm_ple_window_##type(vcpu->vcpu_id, vmx->ple_window, old); \
 }
 
-static void shrink_ple_window(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int old = vmx->ple_window;
-	int new;
-
-	if (ple_window_shrink < 1)
-		new = ple_window;
-	else if (ple_window_shrink < ple_window)
-		new = old / ple_window_shrink;
-	else
-		new = old - ple_window_shrink;
-
-	vmx->ple_window = max(new, ple_window);
-
-	trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
-}
+make_ple_window_modifier(grow,   *, +, min, ple_window_max)
+make_ple_window_modifier(shrink, /, -, max, ple_window)
 
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
-- 
2.0.4



[PATCH 8/9] KVM: VMX: runtime knobs for dynamic PLE window

2014-08-19 Thread Radim Krčmář
ple_window is updated on every vmentry, so there is no reason to have it
read-only anymore.
ple_window_* weren't writable to prevent runtime overflow races;
they are mitigated by clamping the value of ple_window.

Signed-off-by: Radim Krčmář rkrc...@redhat.com
---
 If we decide to ignore insane overflows, last two hunks can be dropped.

 arch/x86/kvm/vmx.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c6cfb71..d7f58e8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -134,19 +134,19 @@ static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
 module_param(ple_gap, int, S_IRUGO);
 
 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
-module_param(ple_window, int, S_IRUGO);
+module_param(ple_window, int, S_IRUGO | S_IWUSR);
 
 /* Default doubles per-vcpu window every exit. */
 static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
-module_param(ple_window_grow, int, S_IRUGO);
+module_param(ple_window_grow, int, S_IRUGO | S_IWUSR);
 
 /* Default resets per-vcpu window every exit to ple_window. */
 static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
-module_param(ple_window_shrink, int, S_IRUGO);
+module_param(ple_window_shrink, int, S_IRUGO | S_IWUSR);
 
 /* Default is to compute the maximum so we can never overflow. */
 static int ple_window_max = INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
-module_param(ple_window_max, int, S_IRUGO);
+module_param(ple_window_max, int, S_IRUGO | S_IWUSR);
 
 extern const ulong vmx_return;
 
@@ -5694,7 +5694,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu 
*vcpu)
 out:
return ret;
 }
-#define make_ple_window_modifier(type, oplt, opge, cmp, bound) \
+
+#define make_ple_window_modifier(type, oplt, opge) \
 static void type##_ple_window(struct kvm_vcpu *vcpu) \
 { \
struct vcpu_vmx *vmx = to_vmx(vcpu); \
@@ -5708,13 +5709,13 @@ static void type##_ple_window(struct kvm_vcpu *vcpu) \
else \
new = old opge ple_window_##type; \
 \
-	vmx->ple_window = cmp(new, bound); \
+	vmx->ple_window = clamp(new, ple_window, ple_window_max); \
 \
 	trace_kvm_ple_window_##type(vcpu->vcpu_id, vmx->ple_window, old); \
 }
 
-make_ple_window_modifier(grow,   *, +, min, ple_window_max)
-make_ple_window_modifier(shrink, /, -, max, ple_window)
+make_ple_window_modifier(grow,   *, +) /* grow_ple_window */
+make_ple_window_modifier(shrink, /, -) /* shrink_ple_window */
 
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
-- 
2.0.4



[PATCH 4/9] KVM: VMX: dynamise PLE window

2014-08-19 Thread Radim Krčmář
Window is increased on every PLE exit and decreased on every sched_in.
The idea is that we don't want to PLE exit if there is no preemption
going on.

We do this with sched_in() because it does not hold rq lock.

There are two new kernel parameters for changing the window:
 ple_window_grow and ple_window_shrink
ple_window_grow affects the window on PLE exit and ple_window_shrink
does it on sched_in;  depending on their value, the window is modified
like this: (ple_window is kvm_intel's global)

  ple_window_shrink/ |
  ple_window_grow    | PLE exit           | sched_in
  -------------------+--------------------+----------------------
  < 1                |  = ple_window      |  = ple_window
  < ple_window       | *= ple_window_grow | /= ple_window_shrink
  otherwise          | += ple_window_grow | -= ple_window_shrink
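
For instance, with the defaults introduced by this patch (ple_window_grow = 2,
ple_window_shrink = 0, ple_window = 4096), every PLE exit doubles the vcpu's
window (4096, 8192, 16384, ...) and every sched_in resets it back to 4096.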

Signed-off-by: Radim Krčmář rkrc...@redhat.com
---
 arch/x86/kvm/vmx.c | 52 ++--
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index eaa5574..66259fd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -125,14 +125,25 @@ module_param(nested, bool, S_IRUGO);
  * Time is measured based on a counter that runs at the same rate as the TSC,
  * refer SDM volume 3b section 21.6.13 & 22.1.3.
  */
-#define KVM_VMX_DEFAULT_PLE_GAP    128
-#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+#define KVM_VMX_DEFAULT_PLE_GAP   128
+#define KVM_VMX_DEFAULT_PLE_WINDOW4096
+#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW   2
+#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
+
 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
 module_param(ple_gap, int, S_IRUGO);
 
 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 module_param(ple_window, int, S_IRUGO);
 
+/* Default doubles per-vcpu window every exit. */
+static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_grow, int, S_IRUGO);
+
+/* Default resets per-vcpu window every exit to ple_window. */
+static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(ple_window_shrink, int, S_IRUGO);
+
 extern const ulong vmx_return;
 
 #define NR_AUTOLOAD_MSRS 8
@@ -5680,12 +5691,47 @@ out:
return ret;
 }
 
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int old = vmx->ple_window;
+	int new;
+
+	if (ple_window_grow < 1)
+		new = ple_window;
+	else if (ple_window_grow < ple_window)
+		new = old * ple_window_grow;
+	else
+		new = old + ple_window_grow;
+
+	vmx->ple_window = new;
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int old = vmx->ple_window;
+	int new;
+
+	if (ple_window_shrink < 1)
+		new = ple_window;
+	else if (ple_window_shrink < ple_window)
+		new = old / ple_window_shrink;
+	else
+		new = old - ple_window_shrink;
+
+	vmx->ple_window = new;
+}
+
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
  */
 static int handle_pause(struct kvm_vcpu *vcpu)
 {
+   if (ple_gap)
+   grow_ple_window(vcpu);
+
skip_emulated_instruction(vcpu);
kvm_vcpu_on_spin(vcpu);
 
@@ -8855,6 +8901,8 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 
 void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+   if (ple_gap)
+   shrink_ple_window(vcpu);
 }
 
 static struct kvm_x86_ops vmx_x86_ops = {
-- 
2.0.4



[PATCH 2/9] KVM: x86: introduce sched_in to kvm_x86_ops

2014-08-19 Thread Radim Krčmář
sched_in preempt notifier is available for x86, allow its use in
specific virtualization technologies as well.

Signed-off-by: Radim Krčmář rkrc...@redhat.com
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 arch/x86/kvm/svm.c  | 6 ++
 arch/x86/kvm/vmx.c  | 6 ++
 arch/x86/kvm/x86.c  | 1 +
 4 files changed, 15 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5724601..358e2f3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -772,6 +772,8 @@ struct kvm_x86_ops {
bool (*mpx_supported)(void);
 
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
+
+   void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ddf7427..4baf1bc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4305,6 +4305,10 @@ static void svm_handle_external_intr(struct kvm_vcpu 
*vcpu)
local_irq_enable();
 }
 
+static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -4406,6 +4410,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 
.check_intercept = svm_check_intercept,
.handle_external_intr = svm_handle_external_intr,
+
+   .sched_in = svm_sched_in,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bfe11cf..2b306f9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8846,6 +8846,10 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
return X86EMUL_CONTINUE;
 }
 
+void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -8951,6 +8955,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.mpx_supported = vmx_mpx_supported,
 
.check_nested_events = vmx_check_nested_events,
+
+   .sched_in = vmx_sched_in,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d7c214f..5696ee7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7148,6 +7148,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+	kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
-- 
2.0.4



[PATCH 3/9] KVM: VMX: make PLE window per-vcpu

2014-08-19 Thread Radim Krčmář
Change PLE window into per-vcpu variable, seeded from module parameter,
to allow greater flexibility.

Brings in a small overhead on every vmentry.

Signed-off-by: Radim Krčmář rkrc...@redhat.com
---
 I've been thinking about a general hierarchical per-vcpu variable model,
 but it's hard to have current performance and sane code.

 arch/x86/kvm/vmx.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2b306f9..eaa5574 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -484,6 +484,9 @@ struct vcpu_vmx {
 
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;
+
+   /* Dynamic PLE window. */
+   int ple_window;
 };
 
 enum segment_cache_field {
@@ -4403,6 +4406,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
if (ple_gap) {
vmcs_write32(PLE_GAP, ple_gap);
vmcs_write32(PLE_WINDOW, ple_window);
+	vmx->ple_window = ple_window;
}
 
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
@@ -7387,6 +7391,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vmx->emulation_required)
 		return;
 
+	if (ple_gap)
+		vmcs_write32(PLE_WINDOW, vmx->ple_window);
+
 	if (vmx->nested.sync_shadow_vmcs) {
 		copy_vmcs12_to_shadow(vmx);
 		vmx->nested.sync_shadow_vmcs = false;
-- 
2.0.4



[PATCH 0/9] Dynamic Pause Loop Exiting window.

2014-08-19 Thread Radim Krčmář
PLE does not scale in its current form.  When increasing VCPU count
above 150, one can hit soft lockups because of runqueue lock contention.
(Which says a lot about performance.)

The main reason is that kvm_ple_loop cycles through all VCPUs.
Replacing it with a scalable solution would be ideal, but it has already
been well optimized for various workloads, so this series tries to
alleviate one different major problem while minimizing a chance of
regressions: we have too many useless PLE exits.

Just increasing PLE window would help some cases, but it still spirals
out of control.  By increasing the window after every PLE exit, we can
limit the amount of useless ones, so we don't reach the state where CPUs
spend 99% of the time waiting for a lock.

HP confirmed that this series avoids soft lockups and TSC sync errors on
large guests.

---
Design notes and questions:

Alternative to first two patches could be a new notifier.

All values are made changeable because defaults weren't selected after
weeks of benchmarking -- we can get improved performance by hardcoding
if someone is willing to do it.
(Or by presuming that no one is ever going to.)

Then, we can quite safely drop overflow checks: they are impossible to
hit with small increases and I don't think that anyone wants large ones.

Also, I'd argue against the last patch: it should be done in userspace,
but I'm not sure about Linux's policy.


Radim Krčmář (9):
  KVM: add kvm_arch_sched_in
  KVM: x86: introduce sched_in to kvm_x86_ops
  KVM: VMX: make PLE window per-vcpu
  KVM: VMX: dynamise PLE window
  KVM: VMX: clamp PLE window
  KVM: trace kvm_ple_window grow/shrink
  KVM: VMX: abstract ple_window modifiers
  KVM: VMX: runtime knobs for dynamic PLE window
  KVM: VMX: automatic PLE window maximum

 arch/arm/kvm/arm.c  |  4 ++
 arch/mips/kvm/mips.c|  4 ++
 arch/powerpc/kvm/powerpc.c  |  4 ++
 arch/s390/kvm/kvm-s390.c|  4 ++
 arch/x86/include/asm/kvm_host.h |  2 +
 arch/x86/kvm/svm.c  |  6 +++
 arch/x86/kvm/trace.h| 29 +
 arch/x86/kvm/vmx.c  | 93 +++--
 arch/x86/kvm/x86.c  |  6 +++
 include/linux/kvm_host.h|  2 +
 virt/kvm/kvm_main.c |  2 +
 11 files changed, 153 insertions(+), 3 deletions(-)

-- 
2.0.4



[Bug 82761] DMAR:[fault reason 06] PTE Read access is not set

2014-08-19 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=82761

--- Comment #5 from Ansa89 ansalonistef...@gmail.com ---
Tested with 3.17-rc1: the errors are still there, but the spam rate seems lower
than with 3.16.1 (with 3.16.1 I get the errors repeated many times and the count
grows fast; with 3.17-rc1 I get the same errors repeated fewer times and the
count seems to grow more slowly).

After ~10 minutes:
dmesg | grep -i dmar
ACPI: DMAR 0xC8EA83F0 B8 (v01 INTEL  SNB  0001 INTL
0001)
dmar: Host address width 36
dmar: DRHD base: 0x00fed9 flags: 0x0
dmar: IOMMU 0: reg_base_addr fed9 ver 1:0 cap c020e60262 ecap f0101a
dmar: DRHD base: 0x00fed91000 flags: 0x1
dmar: IOMMU 1: reg_base_addr fed91000 ver 1:0 cap c9008020660262 ecap f0105a
dmar: RMRR base: 0x00c8d17000 end: 0x00c8d24fff
dmar: RMRR base: 0x00cb80 end: 0x00cf9f
DMAR: No ATSR found
[drm] DMAR active, disabling use of stolen memory
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set
dmar: DRHD: handling fault status reg 3
dmar: DMAR:[DMA Read] Request device [05:00.0] fault addr ff3f4000 
DMAR:[fault reason 06] PTE Read access is not set


In the end, the bug does not seem to be fixed in 3.17-rc1.



[Bug 82761] DMAR:[fault reason 06] PTE Read access is not set

2014-08-19 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=82761

--- Comment #6 from Alex Williamson alex.william...@redhat.com ---
Ok, then it's probably not a result of the PCIe-to-PCI bridge since 05:00.0 is
the correct requester ID for all the devices behind the bridge.  Unfortunately
that means that the problem may not be fixable.  We're only seeing reads to a
single address, which may mean the NIC is using that read to synchronize
transaction ordering, ex. using a DMA read to flush a DMA write from the
device.  If the NIC driver has visibility of this address, then it could
attempt to do a coherent mapping for the device(s) to avoid the fault.  If it
doesn't, then these NICs may simply be incompatible with the IOMMU.
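
As a purely hypothetical sketch (function names and buffer usage here are
assumptions, not the actual r8169 driver code), a driver that knows about such
a flush address could set it up as a coherent mapping like this:

#include <linux/dma-mapping.h>

/* Hypothetical example: allocate a small coherent buffer that the device can
 * read at any time without faulting through the IOMMU, and program the NIC
 * to use its bus address (flush_dma) for the ordering read. */
static void *flush_buf;
static dma_addr_t flush_dma;

static int example_setup_flush_buffer(struct device *dev)
{
	flush_buf = dma_alloc_coherent(dev, PAGE_SIZE, &flush_dma, GFP_KERNEL);
	if (!flush_buf)
		return -ENOMEM;
	/* ... tell the hardware to read from flush_dma ... */
	return 0;
}

static void example_teardown_flush_buffer(struct device *dev)
{
	dma_free_coherent(dev, PAGE_SIZE, flush_buf, flush_dma);
}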

Are these 3 separate NICs plugged into PCI slots on the motherboard or is this
a single triple-port card with embedded PCIe-to-PCI bridge?

You might be able to run the IOMMU in passthrough mode with iommu=pt
r8169.use_dac=1, but note the warning in modinfo: "use_dac: Enable PCI DAC.
Unsafe on 32 bit PCI slot."  Unfortunately if you don't enable use_dac, then
intel_iommu will ignore the passthrough option for these devices.

Also note that this problem has nothing to do with Virtualization/KVM. 
Drivers/Network or perhaps Drivers/PCI would be a more appropriate
classification.



Bug: No irq handler for vector (irq -1) on C602

2014-08-19 Thread Andrey Korolyov
Hello,

ran into this error for the first time over a very large hardware
span/uptime (the server which experienced the error is identical to
the others, and I have had absolutely none of the MSI-related problems
with this hardware ever).

Running 3.10 on the host, I had one (of many) VMs on it which produced
an enormous number of context switches due to the mess inside it (hundreds
of active apache-itk workers). All VM threads are pinned to the first
sibling of every core on a two-head system, e.g. having 24 HT cores where
the second half is just HT siblings, and the cpuset cgroup limits threads
only to the first half. The error itself was produced a second after a
reset event for this VM (through libvirt, if the exact call matters):

[7696746.523478] do_IRQ: 11.233 No irq handler for vector (irq -1)

Since there are no recent hints about this exact error, and it is
triggered by a critical part of the kernel code, I think it may be
interesting to re-raise the issue (or, at least, to put a better bound
on the error source).


[Bug 82761] DMAR:[fault reason 06] PTE Read access is not set

2014-08-19 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=82761

--- Comment #7 from Alex Williamson alex.william...@redhat.com ---
I'm guessing this might be the motherboard here: MSI ZH77A-G43

Since you're apparently trying to use VT-d on this system for KVM and therefore
presumably device assignment, I'll note that you will never be able to
successfully assign the conventional PCI devices separately between guests or
between host and guests.  The IOMMU does not have the granularity to create
separate IOMMU domains per PCI slot in this topology.  Also, some (all?)
Realtek NICs have some strange backdoors to PCI configuration space that make
them poor targets for PCI device assignment:

http://git.qemu.org/?p=qemu.git;a=commit;h=4cb47d281a995cb49e4652cb26bafb3ab2d9bd28
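
A stand-alone way to see this grouping for yourself (an illustrative sketch
that just walks /sys/kernel/iommu_groups, the same information visible with ls):

#include <dirent.h>
#include <stdio.h>

/* Print which devices share each IOMMU group; everything behind the
 * PCIe-to-PCI bridge should show up in a single group. */
int main(void)
{
	const char *root = "/sys/kernel/iommu_groups";
	DIR *groups = opendir(root);
	struct dirent *g;

	if (!groups)
		return 1;
	while ((g = readdir(groups)) != NULL) {
		char path[512];
		DIR *devs;
		struct dirent *d;

		if (g->d_name[0] == '.')
			continue;
		snprintf(path, sizeof(path), "%s/%s/devices", root, g->d_name);
		devs = opendir(path);
		if (!devs)
			continue;
		printf("group %s:", g->d_name);
		while ((d = readdir(devs)) != NULL)
			if (d->d_name[0] != '.')
				printf(" %s", d->d_name);
		printf("\n");
		closedir(devs);
	}
	closedir(groups);
	return 0;
}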



Re: [PATCH/RFC] KVM: track pid for VCPU only on KVM_RUN ioctl

2014-08-19 Thread Wanpeng Li
On Tue, Aug 19, 2014 at 04:04:03PM +0200, Christian Borntraeger wrote:
On 18/08/14 07:02, Wanpeng Li wrote:
 Hi Christian,
 On Tue, Aug 05, 2014 at 04:44:14PM +0200, Christian Borntraeger wrote:
 We currently track the pid of the task that runs the VCPU in
 vcpu_load. Since we call vcpu_load for all kind of ioctls on a
 CPU, this causes hickups due to synchronize_rcu if one CPU is
 modified by another CPU or the main thread (e.g. initialization,
 reset). We track the pid only for the purpose of yielding, so
 let's update the pid only in the KVM_RUN ioctl.

 In addition, don't do a synchronize_rcu on startup (pid == 0).

 This speeds up guest boot time on s390 noticably for some configs, e.g.
 HZ=100, no full state tracking, 64 guest cpus 32 host cpus.

 Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
 CC: Rik van Riel r...@redhat.com
 CC: Raghavendra K T raghavendra...@linux.vnet.ibm.com
 CC: Michael Mueller m...@linux.vnet.ibm.com
 ---
 virt/kvm/kvm_main.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index 9ae9135..ebc8f54 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -124,14 +124,6 @@ int vcpu_load(struct kvm_vcpu *vcpu)

 if (mutex_lock_killable(vcpu-mutex))
 return -EINTR;
 
 One question: 
 
 -   if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
 
 When will vcpu->pid and current->pids[PIDTYPE_PID].pid be different?

If two different threads call an ioctl on a vcpu fd. (It must be an ioctl that
does vcpu_load - almost all of them, except for some interrupt injections.)

Thanks for your explanation. When can this happen?

Regards,
Wanpeng Li 



Re: [PATCH] virt/kvm/assigned-dev.c: Set 'dev-irq_source_id' to '-1' after free it

2014-08-19 Thread Chen Gang
On 08/19/2014 11:49 PM, Paolo Bonzini wrote:
 Il 19/08/2014 17:44, Chen Gang ha scritto:
  Hello maintainers:
  
  Please help check this patch, when you have time.
 Hi, it's already on its way to 3.17-rc2, but I first have to run a bunch
 of tests.

OK, thanks. Please also let me try the test, although I am not quite
familiar with KVM. Since I plan to focus on KVM/Xen next, I shall
construct the related environments for their common tests, at least.

I am just setting up the gcc common test environment on a new PC;
is a PC also enough for KVM/Xen common testing?

Any ideas, suggestions or additions are welcome (especially
information about KVM/Xen common testing).


Thanks.
-- 
Chen Gang

Open share and attitude like air water and life which God blessed


Re: [PATCH] virt/kvm/assigned-dev.c: Set 'dev-irq_source_id' to '-1' after free it

2014-08-19 Thread Chen Gang

By the way, at present I use Qemu as the user mode program; is there a common
test that covers both Qemu and KVM/Xen? And is a PC enough for that common test?

Thanks.

On 08/20/2014 07:58 AM, Chen Gang wrote:
 On 08/19/2014 11:49 PM, Paolo Bonzini wrote:
 Il 19/08/2014 17:44, Chen Gang ha scritto:
 Hello maintainers:

 Please help check this patch, when you have time.
 Hi, it's already on its way to 3.17-rc2, but I first have to run a bunch
 of tests.
 
 OK, thanks. Also can let me try the test, although I am not quite
 familiar with KVM. Since I plan to focus on KVM/Xen next, I shall
 construct related environments for its' common test, at least.
 
 I am just constructing the gcc common test environments under a new PC,
 is a PC also enough for KVM/Xen common test?
 
 Welcome any ideas, suggestions or completions about it (especially the
 information about KVM/Xen common test).
 
 
 Thanks.
 


-- 
Chen Gang

Open share and attitude like air water and life which God blessed


Re: [PATCH 1/2] KVM: fix cache stale memslot info with correct mmio generation number

2014-08-19 Thread Xiao Guangrong
On 08/19/2014 05:03 PM, Paolo Bonzini wrote:
 Il 19/08/2014 10:50, Xiao Guangrong ha scritto:
 Okay, what confused me it that it seems that the single line patch
 is ok to you. :)
 
 No, it was late and I was confused. :)
 
 Now, do we really need to care the case 2? like David said:
 Sorry I didn't explain myself very well: Since we can get a single wrong
 mmio exit no matter what, it has to be handled in userspace. So my point
 was, it doesn't really help to fix that one very specific way that it can
 happen, because it can just happen in other ways. (E.g. update memslots
 occurs after is_noslot_pfn() and before mmio exit).

 What's your idea?

 I think if you always treat the low bit as zero in mmio sptes, you can 
 do that without losing a bit of the generation.

 What's you did is avoiding cache a invalid generation number into spte, but
 actually if we can figure it out when we check mmio access, it's ok. Like the
 updated patch i posted should fix it, that way avoids doubly increase the 
 number.
 
 Yes.
 
 Okay, if you're interested increasing the number doubly, there is the simpler
 one:
 
 This wastes a bit in the mmio spte though.  My idea is to increase the
 memslots generation twice, but drop the low bit in the mmio spte.

Yeah, really smart idea. :)
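
(For clarity, here is a minimal stand-alone sketch of the scheme being
discussed -- bump the generation once around each memslot update and drop the
low bit in the value cached by an mmio spte.  It is purely illustrative, not
the eventual KVM patch:)

#include <stdbool.h>
#include <stdio.h>

static unsigned long slots_generation;

static void memslots_update_begin(void) { slots_generation++; }	/* now odd   */
static void memslots_update_end(void)   { slots_generation++; }	/* even again */

/* The value an mmio spte would cache: the generation with the low bit dropped. */
static unsigned long mmio_spte_generation(void)
{
	return slots_generation & ~1UL;
}

static bool mmio_spte_is_stale(unsigned long cached)
{
	return cached != (slots_generation & ~1UL);
}

int main(void)
{
	unsigned long cached;

	memslots_update_begin();		/* an update is in flight (odd generation)     */
	cached = mmio_spte_generation();	/* an mmio spte gets cached during the update  */
	memslots_update_end();			/* update finished (even generation again)     */

	printf("stale = %d\n", mmio_spte_is_stale(cached));	/* prints: stale = 1 */
	return 0;
}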

Paolo/David, would you mind making a patch for this (+ the comments in David's
patch)?

Please feel free to add my:
Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com



Re: [PATCH 1/2] KVM: fix cache stale memslot info with correct mmio generation number

2014-08-19 Thread David Matlack
On Tue, Aug 19, 2014 at 5:29 PM, Xiao Guangrong
xiaoguangr...@linux.vnet.ibm.com wrote:
 On 08/19/2014 05:03 PM, Paolo Bonzini wrote:
 Il 19/08/2014 10:50, Xiao Guangrong ha scritto:
 Okay, what confused me it that it seems that the single line patch
 is ok to you. :)

 No, it was late and I was confused. :)

 Now, do we really need to care the case 2? like David said:
 Sorry I didn't explain myself very well: Since we can get a single wrong
 mmio exit no matter what, it has to be handled in userspace. So my point
 was, it doesn't really help to fix that one very specific way that it can
 happen, because it can just happen in other ways. (E.g. update memslots
 occurs after is_noslot_pfn() and before mmio exit).

 What's your idea?

 I think if you always treat the low bit as zero in mmio sptes, you can
 do that without losing a bit of the generation.

 What's you did is avoiding cache a invalid generation number into spte, but
 actually if we can figure it out when we check mmio access, it's ok. Like 
 the
 updated patch i posted should fix it, that way avoids doubly increase the 
 number.

 Yes.

 Okay, if you're interested increasing the number doubly, there is the 
 simpler
 one:

 This wastes a bit in the mmio spte though.  My idea is to increase the
 memslots generation twice, but drop the low bit in the mmio spte.

 Yeah, really smart idea. :)

 Paolo/David, would you mind making a patch for this (+ the comments in David's
 patch)?

Paolo, since it was your idea would you like to write it? I don't mind either
way.


 Please feel free to add my:
 Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com


Re: [PATCH v2] PC, KVM, CMA: Fix regression caused by wrong get_order() use

2014-08-19 Thread Joonsoo Kim
On Thu, Aug 14, 2014 at 03:03:07PM +1000, Alexey Kardashevskiy wrote:
 fc95ca7284bc54953165cba76c3228bd2cdb9591 claims that there is no
 functional change but this is not true as it calls get_order() (which
 takes bytes) where it should have called ilog2() and the kernel stops
 on VM_BUG_ON().
 
 This replaces get_order() with order_base_2() (round-up version of ilog2).
 
 Suggested-by: Paul Mackerras pau...@samba.org
 Cc: Alexander Graf ag...@suse.de
 Cc: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 Cc: Joonsoo Kim iamjoonsoo@lge.com
 Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

Sorry for my fault. :(

Acked-by: Joonsoo Kim iamjoonsoo@lge.com

Thanks.


[PATCH v3] KVM: vmx: fix ept reserved bits for 1-GByte page

2014-08-19 Thread Wanpeng Li
The EPT misconfig handler in kvm checks which reason led to the EPT
misconfiguration after a vmexit. One of the reasons is that an EPT
paging-structure entry is configured with settings reserved for
future functionality. However, the handler can't identify whether
the reserved bits of a paging-structure entry for a 1-GByte page
are set, since a PDPTE which points to a 1-GByte page reserves
bits 29:12 instead of bits 7:3, which are reserved for a PDPTE that
references an EPT Page Directory. This patch fixes it by reserving
bits 29:12 for 1-GByte pages.

Signed-off-by: Wanpeng Li wanpeng...@linux.intel.com
---
v2 -> v3:
 * return 0xf8 for level == 4
 * check spte & (1ULL << 7) if level == 1
 * (rsvd_mask & 0x38) == 0 for large page or leaf page
v1 -> v2:
 * same if statement cover both 2MB and 1GB pages
 * return 0xf8 for level == 4
 * get the level by checking the return value of ept_rsvd_mask

 arch/x86/kvm/vmx.c | 22 --
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index cad37d5..3218359 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5521,17 +5521,18 @@ static u64 ept_rsvd_mask(u64 spte, int level)
	for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
		mask |= (1ULL << i);
 
-   if (level > 2)
+   if (level == 4)
/* bits 7:3 reserved */
mask |= 0xf8;
-   else if (level == 2) {
-   if (spte & (1ULL << 7))
-   /* 2MB ref, bits 20:12 reserved */
-   mask |= 0x1ff000;
-   else
-   /* bits 6:3 reserved */
-   mask |= 0x78;
-   }
+   else if (spte & (1ULL << 7))
+   /*
+* 1GB/2MB page, bits 29:12 or 20:12 reserved respectively,
+* level == 1 if the hypervisor is using the ignored bit 7.
+*/
+   mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
+   else
+   /* bits 6:3 reserved */
+   mask |= 0x78;
 
return mask;
 }
@@ -5561,7 +5562,8 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu 
*vcpu, u64 spte,
WARN_ON(1);
}
 
-   if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
+   /* bits 5:3 are _not_ reserved for large page or leaf page */
+   if ((rsvd_bits & 0x38) == 0) {
	u64 ept_mem_type = (spte & 0x38) >> 3;
 
if (ept_mem_type == 2 || ept_mem_type == 3 ||
-- 
1.9.1
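
For reference, a quick stand-alone check of the new mask arithmetic (assuming
the usual 4-KiB PAGE_SIZE); it prints 0x1ff000 for level 2 (2-MByte page) and
0x3ffff000 for level 3 (1-GByte page), i.e. bits 20:12 and 29:12 respectively:

#include <stdio.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
	for (int level = 2; level <= 3; level++) {
		unsigned long long mask =
			(PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
		printf("level %d: reserved mask 0x%llx\n", level, mask);
	}
	return 0;
}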



Re: [PATCH v2 1/3] KVM: vmx: fix ept reserved bits for 1-GByte page

2014-08-19 Thread Wanpeng Li
Hi Paolo,
On Tue, Aug 19, 2014 at 11:09:49AM +0200, Paolo Bonzini wrote:
[...]
I suggest that you write a testcase for kvm-unit-tests.


Just sent out v3. The testcase will be written later since I was not familiar
with kvm-unit-tests before and some time is still needed.

Regards,
Wanpeng Li 

Paolo


[PATCH v4] powerpc/kvm: support to handle sw breakpoint

2014-08-19 Thread Madhavan Srinivasan
This patch adds kernel-side support for software breakpoints.
The design is that, by using an illegal instruction, we trap to the hypervisor
via the Emulation Assistance interrupt, where we check for the illegal instruction
and accordingly return to the Host or the Guest. The patch also adds support for
software breakpoints in PR KVM.

Changes v3-v4:
 Made changes to code comments and removed #define of zero opcode
 Added a new function to handle the debug instruction emulation in book3s_hv
 Rebased the code to latest upstream source.

Changes v2-v3:
 Changed the debug instructions. Using the all zero opcode in the instruction 
word
  as illegal instruction as mentioned in Power ISA instead of ABS
 Removed reg updated in emulation assist and added a call to
  kvmppc_emulate_instruction for reg update.

Changes v1-v2:

 Moved the debug instruction #define to kvm_book3s.h. This way PR_KVM can also 
share it.
 Added code to use the KVM get one reg infrastructure to get the debug opcode.
 Updated emulate.c to include emulation of the debug instruction in case of PR_KVM.
 Made changes to the commit message.

Signed-off-by: Madhavan Srinivasan ma...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_book3s.h |  7 +++
 arch/powerpc/kvm/book3s.c |  3 ++-
 arch/powerpc/kvm/book3s_hv.c  | 32 ++--
 arch/powerpc/kvm/book3s_pr.c  |  3 +++
 arch/powerpc/kvm/emulate.c| 11 +++
 5 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 6acf0c2..a1944f8 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -24,6 +24,13 @@
 #include linux/kvm_host.h
 #include asm/kvm_book3s_asm.h
 
+/*
+ * KVMPPC_INST_BOOK3S_DEBUG is debug Instruction for supporting Software 
Breakpoint.
+ * Based on PowerISA v2.07, Instruction with primary opcode 0 will be treated 
as illegal
+ * instruction.
+ */
+#define KVMPPC_INST_BOOK3S_DEBUG   0x0000
+
 struct kvmppc_bat {
u64 raw;
u32 bepi;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index dd03f6b..00e9c9f 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -778,7 +778,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg)
 {
-   return -EINVAL;
+   vcpu->guest_debug = dbg->control;
+   return 0;
 }
 
 void kvmppc_decrementer_func(unsigned long data)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 27cced9..0a92e45 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -725,6 +725,14 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
return kvmppc_hcall_impl_hv_realmode(cmd);
 }
 
+static int kvmppc_emulate_debug_instruction_hv(struct kvm_run *run,
+   struct kvm_vcpu *vcpu)
+{
+   run->exit_reason = KVM_EXIT_DEBUG;
+   run->debug.arch.address = kvmppc_get_pc(vcpu);
+   return 0;
+}
+
 static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 struct task_struct *tsk)
 {
@@ -811,9 +819,26 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
 * we don't emulate any guest instructions at this stage.
 */
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
-   kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
-   r = RESUME_GUEST;
+   {
+   u32 last_inst;
+   if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
+   EMULATE_DONE) {
+   /*
+* Fetch failed, so return to guest and
+* try executing it again.
+*/
+   r = RESUME_GUEST;
+   } else {
+   if (last_inst == KVMPPC_INST_BOOK3S_DEBUG) {
+   kvmppc_emulate_debug_instruction_hv(run, vcpu);
+   r = RESUME_HOST;
+   } else {
+   kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+   r = RESUME_GUEST;
+   }
+   }
break;
+   }
/*
 * This occurs if the guest (kernel or userspace), does something that
 * is prohibited by HFSCR.  We just generate a program interrupt to
@@ -922,6 +947,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 
id,
long int i;
 
switch (id) {
+   case KVM_REG_PPC_DEBUG_INST:
+   *val = get_reg_val(id, KVMPPC_INST_BOOK3S_DEBUG);
+   break;
case KVM_REG_PPC_HIOR:
*val = get_reg_val(id, 0);
break;
diff --git 
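
For context, a hedged sketch of how userspace could consume this interface on
powerpc: query the breakpoint opcode via KVM_GET_ONE_REG and turn on software
breakpoints with KVM_SET_GUEST_DEBUG.  This is illustrative only (error handling
omitted, not taken from QEMU):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Ask KVM which instruction to plant as a software breakpoint. */
static uint32_t query_debug_inst(int vcpu_fd)
{
	uint32_t inst = 0;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_DEBUG_INST,
		.addr = (uintptr_t)&inst,
	};

	ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
	return inst;
}

/* Enable guest debugging with software breakpoints on this vcpu. */
static void enable_sw_breakpoints(int vcpu_fd)
{
	struct kvm_guest_debug dbg;

	memset(&dbg, 0, sizeof(dbg));
	dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
	ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}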
