Re: [PATCH 15/21] arm64: KVM: Add panic handling

2015-11-16 Thread Ard Biesheuvel
On 16 November 2015 at 14:11, Marc Zyngier  wrote:
> Add the panic handler, together with the small bits of assembly
> code to call the kernel's panic implementation.
>
> Signed-off-by: Marc Zyngier 
> ---
>  arch/arm64/kvm/hyp/hyp-entry.S | 11 ++-
>  arch/arm64/kvm/hyp/hyp.h   |  1 +
>  arch/arm64/kvm/hyp/switch.c| 35 +++
>  3 files changed, 46 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
> index e11a129..7218eed 100644
> --- a/arch/arm64/kvm/hyp/hyp-entry.S
> +++ b/arch/arm64/kvm/hyp/hyp-entry.S
> @@ -141,7 +141,16 @@ el1_irq:
> mov x1, #ARM_EXCEPTION_IRQ
> b   __guest_exit
>
> -.macro invalid_vector  label, target = __kvm_hyp_panic
> +ENTRY(__hyp_do_panic)
> +   mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
> + PSR_MODE_EL1h)
> +   msr spsr_el2, lr
> +   ldr lr, =panic
> +   msr elr_el2, lr
> +   eret
> +ENDPROC(__hyp_do_panic)
> +
> +.macro invalid_vector  label, target = __hyp_panic
> .align  2
>  \label:
> b \target
> diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h
> index 240fb79..d5d500d 100644
> --- a/arch/arm64/kvm/hyp/hyp.h
> +++ b/arch/arm64/kvm/hyp/hyp.h
> @@ -74,6 +74,7 @@ void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
>  void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
>
>  u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
> +void __noreturn __hyp_do_panic(unsigned long, ...);
>
>  #endif /* __ARM64_KVM_HYP_H__ */
>
> diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
> index 06d3e20..cdc2a96 100644
> --- a/arch/arm64/kvm/hyp/switch.c
> +++ b/arch/arm64/kvm/hyp/switch.c
> @@ -140,3 +140,38 @@ int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
>
> return exit_code;
>  }
> +
> +static const char *__hyp_panic_string = "HYP panic:\nPS:%08x PC:%p 
> ESR:%p\nFAR:%p HPFAR:%p PAR:%p\nVCPU:%p\n";
> +

Re separating the HYP text from the kernel proper: this is exactly the
thing that is likely to cause trouble when you execute the kernel text
from HYP.

__hyp_panic_string is a non-const char pointer containing the absolute
address of the string in the initializer, as seen from the high kernel
virtual mapping.
Better use 'static const char __hyp_panic_string[]' instead.

(If it currently works fine, it is only because the compiler optimizes
the entire variable away, and performs a relative access in the place
where the variable is referenced.)


> +void __hyp_text __noreturn __hyp_panic(void)
> +{
> +   u64 spsr = read_sysreg(spsr_el2);
> +   u64 elr = read_sysreg(elr_el2);
> +   u64 par = read_sysreg(par_el1);
> +
> +   if (read_sysreg(vttbr_el2)) {
> +   struct kvm_vcpu *vcpu;
> +   struct kvm_cpu_context *host_ctxt;
> +
> +   vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);
> +   host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> +   __deactivate_traps(vcpu);
> +   __deactivate_vm(vcpu);
> +   __sysreg_restore_state(host_ctxt);
> +
> +   write_sysreg(host_ctxt->gp_regs.sp_el1, sp_el1);
> +   }
> +
> +   /* Call panic for real */
> +   while (1) {
> +   unsigned long str_va = (unsigned long)__hyp_panic_string;
> +
> +   str_va -= HYP_PAGE_OFFSET;
> +   str_va += PAGE_OFFSET;
> +   __hyp_do_panic(str_va,
> +  spsr,  elr,
> +  read_sysreg(esr_el2),   read_sysreg(far_el2),
> +  read_sysreg(hpfar_el2), par,
> +  read_sysreg(tpidr_el2));
> +   }
> +}
> --
> 2.1.4
>
>
> ___
> linux-arm-kernel mailing list
> linux-arm-ker...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 15/21] arm64: KVM: Add panic handling

2015-11-16 Thread Marc Zyngier
On 16/11/15 15:53, Ard Biesheuvel wrote:
> On 16 November 2015 at 14:11, Marc Zyngier  wrote:
>> Add the panic handler, together with the small bits of assembly
>> code to call the kernel's panic implementation.
>>
>> Signed-off-by: Marc Zyngier 
>> ---
>>  arch/arm64/kvm/hyp/hyp-entry.S | 11 ++-
>>  arch/arm64/kvm/hyp/hyp.h   |  1 +
>>  arch/arm64/kvm/hyp/switch.c| 35 +++
>>  3 files changed, 46 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
>> index e11a129..7218eed 100644
>> --- a/arch/arm64/kvm/hyp/hyp-entry.S
>> +++ b/arch/arm64/kvm/hyp/hyp-entry.S
>> @@ -141,7 +141,16 @@ el1_irq:
>> mov x1, #ARM_EXCEPTION_IRQ
>> b   __guest_exit
>>
>> -.macro invalid_vector  label, target = __kvm_hyp_panic
>> +ENTRY(__hyp_do_panic)
>> +   mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
>> + PSR_MODE_EL1h)
>> +   msr spsr_el2, lr
>> +   ldr lr, =panic
>> +   msr elr_el2, lr
>> +   eret
>> +ENDPROC(__hyp_do_panic)
>> +
>> +.macro invalid_vector  label, target = __hyp_panic
>> .align  2
>>  \label:
>> b \target
>> diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h
>> index 240fb79..d5d500d 100644
>> --- a/arch/arm64/kvm/hyp/hyp.h
>> +++ b/arch/arm64/kvm/hyp/hyp.h
>> @@ -74,6 +74,7 @@ void __fpsimd_save_state(struct user_fpsimd_state 
>> *fp_regs);
>>  void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
>>
>>  u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
>> +void __noreturn __hyp_do_panic(unsigned long, ...);
>>
>>  #endif /* __ARM64_KVM_HYP_H__ */
>>
>> diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
>> index 06d3e20..cdc2a96 100644
>> --- a/arch/arm64/kvm/hyp/switch.c
>> +++ b/arch/arm64/kvm/hyp/switch.c
>> @@ -140,3 +140,38 @@ int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
>>
>> return exit_code;
>>  }
>> +
>> +static const char *__hyp_panic_string = "HYP panic:\nPS:%08x PC:%p 
>> ESR:%p\nFAR:%p HPFAR:%p PAR:%p\nVCPU:%p\n";
>> +
> 
> Re separating the HYP text from the kernel proper: this is exactly the
> thing that is likely to cause trouble when you execute the kernel text
> from HYP.
> 
> __hyp_panic_string is a non-const char pointer containing the absolute
> address of the string in the initializer, as seen from the high kernel
> virtual mapping.
> Better use 'static const char __hyp_panic_string[]' instead.

Definitely.

> (If it currently works fine, it is only because the compiler optimizes
> the entire variable away, and performs a relative access in the place
> where the variable is referenced.)

That, and the fact that only panic() gets passed a pointer to this
string, so it doesn't really matter where it lives in this case.

But you do have a point here, and I'll address this for the next round.

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] vhost: relax log address alignment

2015-11-16 Thread Michael S. Tsirkin
commit 5d9a07b0de512b77bf28d2401e5fe3351f00a240 ("vhost: relax used
address alignment") fixed the alignment for the used virtual address,
but not for the physical address used for logging.

That's a mistake: alignment should clearly be the same for virtual and
physical addresses.

Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index eec2f11..080422f 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -819,7 +819,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void 
__user *argp)
BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
(a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
-   (a.log_guest_addr & (sizeof(u64) - 1))) {
+   (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) {
r = -EINVAL;
break;
}
-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 2/2] target-i386: add support to migrate vcpu's TSC rate

2015-11-16 Thread Eduardo Habkost
On Mon, Nov 16, 2015 at 10:30:08PM +0800, Haozhong Zhang wrote:
> On 11/16/15 11:43, Eduardo Habkost wrote:
> > On Mon, Nov 16, 2015 at 04:04:08PM +0800, Haozhong Zhang wrote:
> > > This patch enables migrating vcpu's TSC rate. If KVM on the destination
> > > machine supports TSC scaling, guest programs will observe a consistent
> > > TSC rate across the migration.
> > > 
> > > If TSC scaling is not supported on the destination machine, the
> > > migration will not be aborted and QEMU on the destination will not set
> > > vcpu's TSC rate to the migrated value.
> > > 
> > > If vcpu's TSC rate specified by CPU option 'tsc-freq' on the destination
> > > machine is inconsistent with the migrated TSC rate, the migration will
> > > be aborted.
> > > 
> > > For backwards compatibility, the migration of vcpu's TSC rate is
> > > disabled on pc-*-2.4 and older machine types.
> > > 
> > > Signed-off-by: Haozhong Zhang 
[...]
> > > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > > index 9084b29..8448248 100644
> > > --- a/target-i386/kvm.c
> > > +++ b/target-i386/kvm.c
> > > @@ -2334,8 +2334,30 @@ static void kvm_arch_set_tsc_khz(CPUState *cs)
> > >  int r;
> > >  
> > >  /*
> > > - * If no user-specified TSC frequency is present, we will try to
> > > - * set env->tsc_khz to the value used by KVM.
> > > + * For other cases of env->tsc_khz and env->user_tsc_khz:
> > > + *
> > > + * - We have eliminated all cases that satisfy
> > > + *   env->tsc_khz && env->user_tsc_khz &&
> > > + *   env->tsc_khz != env->user_tsc_khz
> > > + *   in cpu_post_load().
> > > + *
> > > + * - If env->user_tsc_khz is not zero, then it must be equal to
> > > + *   env->tsc_khz (if the latter is not zero) and has been set in
> > > + *   kvm_arch_init_vcpu().
> > > + */
> > > +if (env->tsc_khz && !env->user_tsc_khz) {
> > > +r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
> > > +kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) : -ENOTSUP;
> > 
> > Please don't duplicate the code from kvm_arch_init_vcpu(). We can
> > handle both cases in kvm_arch_put_registers(KVM_PUT_FULL_STATE),
> > can't we?
> >
> 
> No. If KVM_SET_TSC_KHZ fails in kvm_arch_init_vcpu(), QEMU will
> exit. But if KVM_SET_TSC_KHZ fails in the migration, QEMU will not
> abort. And because the return value of
> kvm_arch_put_registers(KVM_PUT_FULL_STATE) is not checked by its
> caller do_kvm_cpu_synchronize_post_init(), I have to handle them in
> different ways.

Reporting errors back in kvm_put_registers() may be difficult, I
see, so handling user-provided TSC frequency in
kvm_arch_init_vcpu() makes sense. But you can still avoid code
duplication. Just reuse the same function in kvm_arch_init_vcpu()
and kvm_put_registers(), but return errors back to the caller in
kvm_arch_init_vcpu() in case env->user_tsc_khz is set.

kvm_put_registers() can ignore the error, and just print a
warning. But (on both cases) we should print a warning only if
env->tsc_khz doesn't match KVM_GET_TSC_KHZ, because we don't want
to print spurious warnings on every migration when TSC scaling
isn't supported. (You even suggested changes to the example code
that does that, at Message-ID:
<20151106023244.gb24...@hzzhang-optiplex-9020.sh.intel.com>).

Also, I believe it won't be a problem if we call KVM_SET_TSC_KHZ
twice in the case of incoming migration, so there's no need to
check user_tsc_khz in the kvm_arch_put_registers() path. Keeping
the code simple is more important than avoiding one extra ioctl()
on incoming migration, IMO.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 15/21] arm64: KVM: Add panic handling

2015-11-16 Thread Marc Zyngier
Add the panic handler, together with the small bits of assembly
code to call the kernel's panic implementation.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/hyp-entry.S | 11 ++-
 arch/arm64/kvm/hyp/hyp.h   |  1 +
 arch/arm64/kvm/hyp/switch.c| 35 +++
 3 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index e11a129..7218eed 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -141,7 +141,16 @@ el1_irq:
mov x1, #ARM_EXCEPTION_IRQ
b   __guest_exit
 
-.macro invalid_vector  label, target = __kvm_hyp_panic
+ENTRY(__hyp_do_panic)
+   mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
+ PSR_MODE_EL1h)
+   msr spsr_el2, lr
+   ldr lr, =panic
+   msr elr_el2, lr
+   eret
+ENDPROC(__hyp_do_panic)
+
+.macro invalid_vector  label, target = __hyp_panic
.align  2
 \label:
b \target
diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h
index 240fb79..d5d500d 100644
--- a/arch/arm64/kvm/hyp/hyp.h
+++ b/arch/arm64/kvm/hyp/hyp.h
@@ -74,6 +74,7 @@ void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
 void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
 
 u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
+void __noreturn __hyp_do_panic(unsigned long, ...);
 
 #endif /* __ARM64_KVM_HYP_H__ */
 
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 06d3e20..cdc2a96 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -140,3 +140,38 @@ int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
 
return exit_code;
 }
+
+static const char *__hyp_panic_string = "HYP panic:\nPS:%08x PC:%p 
ESR:%p\nFAR:%p HPFAR:%p PAR:%p\nVCPU:%p\n";
+
+void __hyp_text __noreturn __hyp_panic(void)
+{
+   u64 spsr = read_sysreg(spsr_el2);
+   u64 elr = read_sysreg(elr_el2);
+   u64 par = read_sysreg(par_el1);
+
+   if (read_sysreg(vttbr_el2)) {
+   struct kvm_vcpu *vcpu;
+   struct kvm_cpu_context *host_ctxt;
+
+   vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);
+   host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+   __deactivate_traps(vcpu);
+   __deactivate_vm(vcpu);
+   __sysreg_restore_state(host_ctxt);
+
+   write_sysreg(host_ctxt->gp_regs.sp_el1, sp_el1);
+   }
+
+   /* Call panic for real */
+   while (1) {
+   unsigned long str_va = (unsigned long)__hyp_panic_string;
+
+   str_va -= HYP_PAGE_OFFSET;
+   str_va += PAGE_OFFSET;
+   __hyp_do_panic(str_va,
+  spsr,  elr,
+  read_sysreg(esr_el2),   read_sysreg(far_el2),
+  read_sysreg(hpfar_el2), par,
+  read_sysreg(tpidr_el2));
+   }
+}
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 18/21] arm64: KVM: Move away from the assembly version of the world switch

2015-11-16 Thread Marc Zyngier
This is it. We remove all of the code that has now been rewritten.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/Makefile |2 -
 arch/arm64/kvm/hyp.S| 1071 +--
 arch/arm64/kvm/vgic-v2-switch.S |  134 -
 arch/arm64/kvm/vgic-v3-switch.S |  269 --
 4 files changed, 1 insertion(+), 1475 deletions(-)
 delete mode 100644 arch/arm64/kvm/vgic-v2-switch.S
 delete mode 100644 arch/arm64/kvm/vgic-v3-switch.S

diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index d31e4e5..caee9ee 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -23,8 +23,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o 
sys_regs.o sys_regs_generi
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v2-switch.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v3-switch.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 1599701..0ccdcbb 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -17,906 +17,7 @@
 
 #include 
 
-#include 
-#include 
 #include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#define CPU_GP_REG_OFFSET(x)   (CPU_GP_REGS + x)
-#define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x)
-#define CPU_SPSR_OFFSET(x) CPU_GP_REG_OFFSET(CPU_SPSR + 8*x)
-#define CPU_SYSREG_OFFSET(x)   (CPU_SYSREGS + 8*x)
-
-   .text
-   .pushsection.hyp.text, "ax"
-   .align  PAGE_SHIFT
-
-.macro save_common_regs
-   // x2: base address for cpu context
-   // x3: tmp register
-
-   add x3, x2, #CPU_XREG_OFFSET(19)
-   stp x19, x20, [x3]
-   stp x21, x22, [x3, #16]
-   stp x23, x24, [x3, #32]
-   stp x25, x26, [x3, #48]
-   stp x27, x28, [x3, #64]
-   stp x29, lr, [x3, #80]
-
-   mrs x19, sp_el0
-   mrs x20, elr_el2// pc before entering el2
-   mrs x21, spsr_el2   // pstate before entering el2
-
-   stp x19, x20, [x3, #96]
-   str x21, [x3, #112]
-
-   mrs x22, sp_el1
-   mrs x23, elr_el1
-   mrs x24, spsr_el1
-
-   str x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
-   str x23, [x2, #CPU_GP_REG_OFFSET(CPU_ELR_EL1)]
-   str x24, [x2, #CPU_SPSR_OFFSET(KVM_SPSR_EL1)]
-.endm
-
-.macro restore_common_regs
-   // x2: base address for cpu context
-   // x3: tmp register
-
-   ldr x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
-   ldr x23, [x2, #CPU_GP_REG_OFFSET(CPU_ELR_EL1)]
-   ldr x24, [x2, #CPU_SPSR_OFFSET(KVM_SPSR_EL1)]
-
-   msr sp_el1, x22
-   msr elr_el1, x23
-   msr spsr_el1, x24
-
-   add x3, x2, #CPU_XREG_OFFSET(31)// SP_EL0
-   ldp x19, x20, [x3]
-   ldr x21, [x3, #16]
-
-   msr sp_el0, x19
-   msr elr_el2, x20// pc on return from el2
-   msr spsr_el2, x21   // pstate on return from el2
-
-   add x3, x2, #CPU_XREG_OFFSET(19)
-   ldp x19, x20, [x3]
-   ldp x21, x22, [x3, #16]
-   ldp x23, x24, [x3, #32]
-   ldp x25, x26, [x3, #48]
-   ldp x27, x28, [x3, #64]
-   ldp x29, lr, [x3, #80]
-.endm
-
-.macro save_host_regs
-   save_common_regs
-.endm
-
-.macro restore_host_regs
-   restore_common_regs
-.endm
-
-.macro save_fpsimd
-   // x2: cpu context address
-   // x3, x4: tmp regs
-   add x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
-   fpsimd_save x3, 4
-.endm
-
-.macro restore_fpsimd
-   // x2: cpu context address
-   // x3, x4: tmp regs
-   add x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
-   fpsimd_restore x3, 4
-.endm
-
-.macro save_guest_regs
-   // x0 is the vcpu address
-   // x1 is the return code, do not corrupt!
-   // x2 is the cpu context
-   // x3 is a tmp register
-   // Guest's x0-x3 are on the stack
-
-   // Compute base to save registers
-   add x3, x2, #CPU_XREG_OFFSET(4)
-   stp x4, x5, [x3]
-   stp x6, x7, [x3, #16]
-   stp x8, x9, [x3, #32]
-   stp x10, x11, [x3, #48]
-   stp x12, x13, [x3, #64]
-   stp x14, x15, [x3, #80]
-   stp x16, x17, [x3, #96]
-   str x18, [x3, #112]
-
-   pop x6, x7  // x2, x3
-   pop x4, x5  // x0, x1
-
-   add x3, x2, #CPU_XREG_OFFSET(0)
-   stp x4, x5, [x3]
-   stp x6, x7, [x3, #16]
-
-   save_common_regs
-.endm
-
-.macro restore_guest_regs
-   // x0 is the vcpu address.
-   // x2 is the cpu context
-   // x3 is 

[PATCH 10/21] arm64: KVM: Add patchable function selector

2015-11-16 Thread Marc Zyngier
KVM so far relies on code patching, and is likely to use it more
in the future. The main issue is that our alternative system works
at the instruction level, while we'd like to have alternatives at
the function level.

In order to cope with this, add the "hyp_alternate_select" macro that
outputs a brief sequence of code that in turn can be patched, allowing
an alternative function to be selected.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/hyp.h | 16 
 1 file changed, 16 insertions(+)

diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h
index 2937552..bf13238 100644
--- a/arch/arm64/kvm/hyp/hyp.h
+++ b/arch/arm64/kvm/hyp/hyp.h
@@ -27,6 +27,22 @@
 
 #define kern_hyp_va(v) (typeof(v))((unsigned long)v & HYP_PAGE_OFFSET_MASK)
 
+/*
+ * Generates patchable code sequences that are used to switch between
+ * two implementations of a function, depending on the availability of
+ * a feature.
+ */
+#define hyp_alternate_select(fname, orig, alt, cond)   \
+typeof(orig) * __hyp_text fname(void)  \
+{  \
+   typeof(alt) *val = orig;\
+   asm volatile(ALTERNATIVE("nop   \n",\
+"mov   %0, %1  \n",\
+cond)  \
+: "+r" (val) : "r" (alt)); \
+   return val; \
+}
+
 void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
 void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
 
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/21] arm64: KVM: Implement guest entry

2015-11-16 Thread Marc Zyngier
Contrary to the previous patch, the guest entry is fairly different
from its assembly counterpart, mostly because it is only concerned
with saving/restoring the GP registers, and nothing else.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/Makefile |   1 +
 arch/arm64/kvm/hyp/entry.S  | 155 
 arch/arm64/kvm/hyp/hyp.h|   2 +
 3 files changed, 158 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/entry.S

diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index ec14cac..1e1ff06 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += entry.o
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
new file mode 100644
index 000..2c4449a
--- /dev/null
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define CPU_GP_REG_OFFSET(x)   (CPU_GP_REGS + x)
+#define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x)
+
+   .text
+   .pushsection.hyp.text, "ax"
+
+.macro save_common_regs ctxt
+   stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
+   stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
+   stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
+   stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+   stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+   stp x29, lr,  [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+.macro restore_common_regs ctxt
+   ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
+   ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
+   ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
+   ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+   ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+   ldp x29, lr,  [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+.macro save_host_regs reg
+   save_common_regs \reg
+.endm
+
+.macro restore_host_regs reg
+   restore_common_regs \reg
+.endm
+
+.macro save_guest_regs
+   // x0 is the vcpu address
+   // x1 is the return code, do not corrupt!
+   // x2 is the cpu context
+   // x3 is a tmp register
+   // Guest's x0-x3 are on the stack
+
+   add x2, x0, #VCPU_CONTEXT
+
+   // Compute base to save registers
+   stp x4, x5,   [x2, #CPU_XREG_OFFSET(4)]
+   stp x6, x7,   [x2, #CPU_XREG_OFFSET(6)]
+   stp x8, x9,   [x2, #CPU_XREG_OFFSET(8)]
+   stp x10, x11, [x2, #CPU_XREG_OFFSET(10)]
+   stp x12, x13, [x2, #CPU_XREG_OFFSET(12)]
+   stp x14, x15, [x2, #CPU_XREG_OFFSET(14)]
+   stp x16, x17, [x2, #CPU_XREG_OFFSET(16)]
+   str x18,  [x2, #CPU_XREG_OFFSET(18)]
+
+   pop x6, x7  // x2, x3
+   pop x4, x5  // x0, x1
+
+   stp x4, x5, [x2, #CPU_XREG_OFFSET(0)]
+   stp x6, x7, [x2, #CPU_XREG_OFFSET(2)]
+
+   save_common_regs x2
+.endm
+
+.macro restore_guest_regs
+   // Assume vcpu in x0, clobbers everything else
+
+   add x2, x0, #VCPU_CONTEXT
+
+   // Prepare x0-x3 for later restore
+   ldp x4, x5, [x2, #CPU_XREG_OFFSET(0)]
+   ldp x6, x7, [x2, #CPU_XREG_OFFSET(2)]
+   pushx4, x5  // Push x0-x3 on the stack
+   pushx6, x7
+
+   // x4-x18
+   ldp x4, x5,   [x2, #CPU_XREG_OFFSET(4)] 
+   ldp x6, x7,   [x2, #CPU_XREG_OFFSET(6)] 
+   ldp x8, x9,   [x2, #CPU_XREG_OFFSET(8)] 
+   ldp x10, x11, [x2, #CPU_XREG_OFFSET(10)]
+   ldp x12, x13, [x2, #CPU_XREG_OFFSET(12)]
+   ldp x14, x15, [x2, #CPU_XREG_OFFSET(14)]
+   ldp x16, x17, [x2, #CPU_XREG_OFFSET(16)]
+   ldr x18,  [x2, #CPU_XREG_OFFSET(18)]
+
+   // x19-x29, lr
+   restore_common_regs x2
+
+   // Last bits of the 64bit state
+   pop x2, x3
+   pop x0, x1
+
+   // Do not touch any register after this!
+.endm
+
+/*
+ * u64 __guest_enter(struct kvm_vcpu *vcpu,
+ *  struct kvm_cpu_context *host_ctxt);
+ */

[PATCH 08/21] arm64: KVM: Implement debug save/restore

2015-11-16 Thread Marc Zyngier
Implement the debug save restore as a direct translation of
the assembly code version.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/Makefile   |   1 +
 arch/arm64/kvm/hyp/debug-sr.c | 132 ++
 arch/arm64/kvm/hyp/hyp.h  |  13 +
 3 files changed, 146 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/debug-sr.c

diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index ec94200..ec14cac 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
new file mode 100644
index 000..118ea39
--- /dev/null
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#include 
+#include 
+
+#include 
+
+#include "hyp.h"
+
+#define read_debug(r,n)read_sysreg(r##n##_el1)
+#define write_debug(v,r,n) write_sysreg(v, r##n##_el1)
+
+#define save_debug(ptr,reg,nr) \
+   switch (nr) {   \
+   case 15:ptr[15] = read_debug(reg, 15);  \
+   case 14:ptr[14] = read_debug(reg, 14);  \
+   case 13:ptr[13] = read_debug(reg, 13);  \
+   case 12:ptr[12] = read_debug(reg, 12);  \
+   case 11:ptr[11] = read_debug(reg, 11);  \
+   case 10:ptr[10] = read_debug(reg, 10);  \
+   case 9: ptr[9] = read_debug(reg, 9);\
+   case 8: ptr[8] = read_debug(reg, 8);\
+   case 7: ptr[7] = read_debug(reg, 7);\
+   case 6: ptr[6] = read_debug(reg, 6);\
+   case 5: ptr[5] = read_debug(reg, 5);\
+   case 4: ptr[4] = read_debug(reg, 4);\
+   case 3: ptr[3] = read_debug(reg, 3);\
+   case 2: ptr[2] = read_debug(reg, 2);\
+   case 1: ptr[1] = read_debug(reg, 1);\
+   default:ptr[0] = read_debug(reg, 0);\
+   }
+
+#define restore_debug(ptr,reg,nr)  \
+   switch (nr) {   \
+   case 15:write_debug(ptr[15], reg, 15);  \
+   case 14:write_debug(ptr[14], reg, 14);  \
+   case 13:write_debug(ptr[13], reg, 13);  \
+   case 12:write_debug(ptr[12], reg, 12);  \
+   case 11:write_debug(ptr[11], reg, 11);  \
+   case 10:write_debug(ptr[10], reg, 10);  \
+   case 9: write_debug(ptr[9], reg, 9);\
+   case 8: write_debug(ptr[8], reg, 8);\
+   case 7: write_debug(ptr[7], reg, 7);\
+   case 6: write_debug(ptr[6], reg, 6);\
+   case 5: write_debug(ptr[5], reg, 5);\
+   case 4: write_debug(ptr[4], reg, 4);\
+   case 3: write_debug(ptr[3], reg, 3);\
+   case 2: write_debug(ptr[2], reg, 2);\
+   case 1: write_debug(ptr[1], reg, 1);\
+   default:write_debug(ptr[0], reg, 0);\
+   }
+
+void __hyp_text __debug_save_state(struct kvm_vcpu *vcpu,
+  struct kvm_guest_debug_arch *dbg,
+  struct kvm_cpu_context *ctxt)
+{
+   if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY) {
+   u64 aa64dfr0 = read_sysreg(id_aa64dfr0_el1);
+   int brps, wrps;
+
+   brps = (aa64dfr0 >> 12) & 0xf;
+   wrps = (aa64dfr0 >> 20) & 0xf;
+
+

[PATCH 07/21] arm64: KVM: Implement 32bit system register save/restore

2015-11-16 Thread Marc Zyngier
Implement the 32bit system register save restore as a direct
translation of the assembly code version.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/hyp.h   |  2 ++
 arch/arm64/kvm/hyp/sysreg-sr.c | 41 +
 2 files changed, 43 insertions(+)

diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h
index 087d3a5..4639330 100644
--- a/arch/arm64/kvm/hyp/hyp.h
+++ b/arch/arm64/kvm/hyp/hyp.h
@@ -38,6 +38,8 @@ void __timer_restore_state(struct kvm_vcpu *vcpu);
 
 void __sysreg_save_state(struct kvm_cpu_context *ctxt);
 void __sysreg_restore_state(struct kvm_cpu_context *ctxt);
+void __sysreg32_save_state(struct kvm_vcpu *vcpu);
+void __sysreg32_restore_state(struct kvm_vcpu *vcpu);
 
 #endif /* __ARM64_KVM_HYP_H__ */
 
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index add8fcb..3f81a4d 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -88,3 +88,44 @@ void __hyp_text __sysreg_restore_state(struct 
kvm_cpu_context *ctxt)
write_sysreg(ctxt->gp_regs.elr_el1, elr_el1);
write_sysreg(ctxt->gp_regs.spsr[KVM_SPSR_EL1], spsr_el1);
 }
+
+void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
+{
+   if (!(read_sysreg(hcr_el2) & HCR_RW)) {
+   u64 *spsr = vcpu->arch.ctxt.gp_regs.spsr;
+   u64 *sysreg = vcpu->arch.ctxt.sys_regs;
+
+   spsr[KVM_SPSR_ABT] = read_sysreg(spsr_abt);
+   spsr[KVM_SPSR_UND] = read_sysreg(spsr_und);
+   spsr[KVM_SPSR_IRQ] = read_sysreg(spsr_irq);
+   spsr[KVM_SPSR_FIQ] = read_sysreg(spsr_fiq);
+
+   sysreg[DACR32_EL2] = read_sysreg(dacr32_el2);
+   sysreg[IFSR32_EL2] = read_sysreg(ifsr32_el2);
+
+   if (!(read_sysreg(cptr_el2) & CPTR_EL2_TFP))
+   sysreg[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
+
+   if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
+   sysreg[DBGVCR32_EL2] = read_sysreg(dbgvcr32_el2);
+   }
+}
+
+void __hyp_text __sysreg32_restore_state(struct kvm_vcpu *vcpu)
+{
+   if (!(read_sysreg(hcr_el2) & HCR_RW)) {
+   u64 *spsr = vcpu->arch.ctxt.gp_regs.spsr;
+   u64 *sysreg = vcpu->arch.ctxt.sys_regs;
+
+   write_sysreg(spsr[KVM_SPSR_ABT], spsr_abt);
+   write_sysreg(spsr[KVM_SPSR_UND], spsr_und);
+   write_sysreg(spsr[KVM_SPSR_IRQ], spsr_irq);
+   write_sysreg(spsr[KVM_SPSR_FIQ], spsr_fiq);
+
+   write_sysreg(sysreg[DACR32_EL2], dacr32_el2);
+   write_sysreg(sysreg[IFSR32_EL2], ifsr32_el2);
+
+   if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
+   write_sysreg(sysreg[DBGVCR32_EL2], dbgvcr32_el2);
+   }
+}
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 17/21] arm64: KVM: Map the kernel RO section into HYP

2015-11-16 Thread Marc Zyngier
In order to run C code in HYP, we must make sure that the kernel's
RO section is mapped into HYP (otherwise things break badly).

Signed-off-by: Marc Zyngier 
---
 arch/arm/kvm/arm.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index eab83b2..6c4549a 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef REQUIRES_VIRT
 __asm__(".arch_extension   virt");
@@ -1072,6 +1073,12 @@ static int init_hyp_mode(void)
goto out_free_mappings;
}
 
+   err = create_hyp_mappings(__start_rodata, __end_rodata);
+   if (err) {
+   kvm_err("Cannot map rodata section\n");
+   goto out_free_mappings;
+   }
+
/*
 * Map the Hyp stack pages
 */
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/21] arm64: KVM: Implement system register save/restore

2015-11-16 Thread Marc Zyngier
Implement the system register save/restore as a direct translation of
the assembly code version.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/Makefile|  1 +
 arch/arm64/kvm/hyp/hyp.h   |  3 ++
 arch/arm64/kvm/hyp/sysreg-sr.c | 90 ++
 3 files changed, 94 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/sysreg-sr.c

diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 455dc0a..ec94200 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -5,3 +5,4 @@
 obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h
index 86aa5a2..087d3a5 100644
--- a/arch/arm64/kvm/hyp/hyp.h
+++ b/arch/arm64/kvm/hyp/hyp.h
@@ -36,5 +36,8 @@ void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
 void __timer_save_state(struct kvm_vcpu *vcpu);
 void __timer_restore_state(struct kvm_vcpu *vcpu);
 
+void __sysreg_save_state(struct kvm_cpu_context *ctxt);
+void __sysreg_restore_state(struct kvm_cpu_context *ctxt);
+
 #endif /* __ARM64_KVM_HYP_H__ */
 
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
new file mode 100644
index 000..add8fcb
--- /dev/null
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#include 
+#include 
+
+#include 
+
+#include "hyp.h"
+
+/* ctxt is already in the HYP VA space */
+void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
+{
+   ctxt->sys_regs[MPIDR_EL1]   = read_sysreg(vmpidr_el2);
+   ctxt->sys_regs[CSSELR_EL1]  = read_sysreg(csselr_el1);
+   ctxt->sys_regs[SCTLR_EL1]   = read_sysreg(sctlr_el1);
+   ctxt->sys_regs[ACTLR_EL1]   = read_sysreg(actlr_el1);
+   ctxt->sys_regs[CPACR_EL1]   = read_sysreg(cpacr_el1);
+   ctxt->sys_regs[TTBR0_EL1]   = read_sysreg(ttbr0_el1);
+   ctxt->sys_regs[TTBR1_EL1]   = read_sysreg(ttbr1_el1);
+   ctxt->sys_regs[TCR_EL1] = read_sysreg(tcr_el1);
+   ctxt->sys_regs[ESR_EL1] = read_sysreg(esr_el1);
+   ctxt->sys_regs[AFSR0_EL1]   = read_sysreg(afsr0_el1);
+   ctxt->sys_regs[AFSR1_EL1]   = read_sysreg(afsr1_el1);
+   ctxt->sys_regs[FAR_EL1] = read_sysreg(far_el1);
+   ctxt->sys_regs[MAIR_EL1]= read_sysreg(mair_el1);
+   ctxt->sys_regs[VBAR_EL1]= read_sysreg(vbar_el1);
+   ctxt->sys_regs[CONTEXTIDR_EL1]  = read_sysreg(contextidr_el1);
+   ctxt->sys_regs[TPIDR_EL0]   = read_sysreg(tpidr_el0);
+   ctxt->sys_regs[TPIDRRO_EL0] = read_sysreg(tpidrro_el0);
+   ctxt->sys_regs[TPIDR_EL1]   = read_sysreg(tpidr_el1);
+   ctxt->sys_regs[AMAIR_EL1]   = read_sysreg(amair_el1);
+   ctxt->sys_regs[CNTKCTL_EL1] = read_sysreg(cntkctl_el1);
+   ctxt->sys_regs[PAR_EL1] = read_sysreg(par_el1);
+   ctxt->sys_regs[MDSCR_EL1]   = read_sysreg(mdscr_el1);
+
+   ctxt->gp_regs.regs.sp   = read_sysreg(sp_el0);
+   ctxt->gp_regs.regs.pc   = read_sysreg(elr_el2);
+   ctxt->gp_regs.regs.pstate   = read_sysreg(spsr_el2);
+   ctxt->gp_regs.sp_el1= read_sysreg(sp_el1);
+   ctxt->gp_regs.elr_el1   = read_sysreg(elr_el1);
+   ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg(spsr_el1);
+}
+
+void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt)
+{
+   write_sysreg(ctxt->sys_regs[MPIDR_EL1],   vmpidr_el2);
+   write_sysreg(ctxt->sys_regs[CSSELR_EL1],  csselr_el1);
+   write_sysreg(ctxt->sys_regs[SCTLR_EL1],   sctlr_el1);
+   write_sysreg(ctxt->sys_regs[ACTLR_EL1],   actlr_el1);
+   write_sysreg(ctxt->sys_regs[CPACR_EL1],   cpacr_el1);
+   write_sysreg(ctxt->sys_regs[TTBR0_EL1],   ttbr0_el1);
+   write_sysreg(ctxt->sys_regs[TTBR1_EL1],   ttbr1_el1);
+   write_sysreg(ctxt->sys_regs[TCR_EL1], tcr_el1);
+   write_sysreg(ctxt->sys_regs[ESR_EL1], esr_el1);
+   write_sysreg(ctxt->sys_regs[AFSR0_EL1],   afsr0_el1);
+   write_sysreg(ctxt->sys_regs[AFSR1_EL1],   afsr1_el1);
+   write_sysreg(ctxt->sys_regs[FAR_EL1], far_el1);
+   write_sysreg(ctxt->sys_regs[MAIR_EL1],  

Re: [PATCH v4 2/2] target-i386: add support to migrate vcpu's TSC rate

2015-11-16 Thread Eduardo Habkost
On Mon, Nov 16, 2015 at 04:04:08PM +0800, Haozhong Zhang wrote:
> This patch enables migrating vcpu's TSC rate. If KVM on the destination
> machine supports TSC scaling, guest programs will observe a consistent
> TSC rate across the migration.
> 
> If TSC scaling is not supported on the destination machine, the
> migration will not be aborted and QEMU on the destination will not set
> vcpu's TSC rate to the migrated value.
> 
> If vcpu's TSC rate specified by CPU option 'tsc-freq' on the destination
> machine is inconsistent with the migrated TSC rate, the migration will
> be aborted.
> 
> For backwards compatibility, the migration of vcpu's TSC rate is
> disabled on pc-*-2.4 and older machine types.
> 
> Signed-off-by: Haozhong Zhang 
> ---
>  hw/i386/pc.c  |  1 +
>  hw/i386/pc_piix.c |  1 +
>  hw/i386/pc_q35.c  |  1 +
>  include/hw/i386/pc.h  |  1 +
>  target-i386/cpu.c |  2 +-
>  target-i386/cpu.h |  1 +
>  target-i386/kvm.c | 26 --
>  target-i386/machine.c | 28 
>  8 files changed, 58 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 0cb8afd..2f2fc93 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1952,6 +1952,7 @@ static void pc_machine_class_init(ObjectClass *oc, void 
> *data)
>  HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
>  
>  pcmc->get_hotplug_handler = mc->get_hotplug_handler;
> +pcmc->save_tsc_khz = true;
>  mc->get_hotplug_handler = pc_get_hotpug_handler;
>  mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id;
>  mc->default_boot_order = "cad";
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 07d0baa..7c5b0d2 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -489,6 +489,7 @@ static void pc_i440fx_2_4_machine_options(MachineClass *m)
>  m->alias = NULL;
>  m->is_default = 0;
>  pcmc->broken_reserved_end = true;
> +pcmc->save_tsc_khz = false;
>  SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
>  }
>  
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 0fdae09..fd8efe3 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -387,6 +387,7 @@ static void pc_q35_2_4_machine_options(MachineClass *m)
>  m->hw_version = "2.4.0";
>  m->alias = NULL;
>  pcmc->broken_reserved_end = true;
> +pcmc->save_tsc_khz = false;
>  SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
>  }
>  
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 4bbc0ff..fea0f28 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -60,6 +60,7 @@ struct PCMachineClass {
>  
>  /*< public >*/
>  bool broken_reserved_end;
> +bool save_tsc_khz;
>  HotplugHandler *(*get_hotplug_handler)(MachineState *machine,
> DeviceState *dev);
>  };
> diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> index e5f1c5b..98c6a4c 100644
> --- a/target-i386/cpu.c
> +++ b/target-i386/cpu.c
> @@ -1724,7 +1724,7 @@ static void x86_cpuid_set_tsc_freq(Object *obj, Visitor 
> *v, void *opaque,
>  return;
>  }
>  
> -cpu->env.tsc_khz = value / 1000;
> +cpu->env.tsc_khz = cpu->env.user_tsc_khz = value / 1000;
>  }
>  
>  static void x86_cpuid_get_apic_id(Object *obj, Visitor *v, void *opaque,
> diff --git a/target-i386/cpu.h b/target-i386/cpu.h
> index fc4a605..1ad1da8 100644
> --- a/target-i386/cpu.h
> +++ b/target-i386/cpu.h
> @@ -973,6 +973,7 @@ typedef struct CPUX86State {
>  uint32_t sipi_vector;
>  bool tsc_valid;
>  int64_t tsc_khz;
> +int64_t user_tsc_khz;
>  void *kvm_xsave_buf;
>  
>  uint64_t mcg_cap;
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 9084b29..8448248 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -2334,8 +2334,30 @@ static void kvm_arch_set_tsc_khz(CPUState *cs)
>  int r;
>  
>  /*
> - * If no user-specified TSC frequency is present, we will try to
> - * set env->tsc_khz to the value used by KVM.
> + * For other cases of env->tsc_khz and env->user_tsc_khz:
> + *
> + * - We have eliminated all cases that satisfy
> + *   env->tsc_khz && env->user_tsc_khz &&
> + *   env->tsc_khz != env->user_tsc_khz
> + *   in cpu_post_load().
> + *
> + * - If env->user_tsc_khz is not zero, then it must be equal to
> + *   env->tsc_khz (if the latter is not zero) and has been set in
> + *   kvm_arch_init_vcpu().
> + */
> +if (env->tsc_khz && !env->user_tsc_khz) {
> +r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
> +kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) : -ENOTSUP;

Please don't duplicate the code from kvm_arch_init_vcpu(). We can
handle both cases in kvm_arch_put_registers(KVM_PUT_FULL_STATE),
can't we?

> +if (r < 0) {
> +error_report("warning: TSC frequency mismatch between VM and 
> host, "
> + "and 

Re: [PATCH v4 1/2] target-i386: fallback vcpu's TSC rate to value returned by KVM

2015-11-16 Thread Eduardo Habkost
On Mon, Nov 16, 2015 at 04:04:07PM +0800, Haozhong Zhang wrote:
> If no user-specified TSC rate is present, we will try to set
> env->tsc_khz to the value returned by KVM_GET_TSC_KHZ.
> 
> Signed-off-by: Haozhong Zhang 
> ---
>  target-i386/kvm.c | 25 +
>  1 file changed, 25 insertions(+)
> 
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 2a9953b..9084b29 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -2327,6 +2327,27 @@ static int kvm_get_debugregs(X86CPU *cpu)
>  return 0;
>  }
>  
> +static void kvm_arch_set_tsc_khz(CPUState *cs)

> +{
> +X86CPU *cpu = X86_CPU(cs);
> +CPUX86State *env = &cpu->env;
> +int r;
> +
> +/*
> + * If no user-specified TSC frequency is present, we will try to
> + * set env->tsc_khz to the value used by KVM.
> + */
> +if (!env->tsc_khz) {
> +r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
> +kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;

Can't we do this on kvm_arch_init_vcpu()? kvm_arch_put_registers()'s purpose is
to just copy data from QEMU to KVM, not the other way around.


> +if (r < 0) {
> +error_report("warning: KVM_GET_TSC_KHZ failed");

Having a kernel that doesn't support KVM_CAP_GET_TSC_KHZ shouldn't trigger a
warning every time we run QEMU, unless the user is explicitly asking for a
feature that requires KVM_GET_TSC_KHZ.

> +} else {
> +env->tsc_khz = r;
> +}
> +}
> +}
> +
>  int kvm_arch_put_registers(CPUState *cpu, int level)
>  {
>  X86CPU *x86_cpu = X86_CPU(cpu);
> @@ -2341,6 +2362,10 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
>  }
>  }
>  
> +if (level == KVM_PUT_FULL_STATE) {
> +kvm_arch_set_tsc_khz(cpu);
> +}

I see that kvm_arch_set_tsc_khz() will be extended to call
KVM_SET_TSC_KHZ in the next patch, so the kvm_arch_set_tsc_khz()
seems to belong here. But the KVM_GET_TSC_KHZ call doesn't seem
to belong in kvm_arch_set_tsc_khz()

kvm_arch_put_registers() callers don't expect any QEMU-side data
to change, but just that KVM data is updated according to the
QEMU-side data.

> +
>  ret = kvm_getput_regs(x86_cpu, 1);
>  if (ret < 0) {
>  return ret;
> -- 
> 2.4.8
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 17/21] arm64: KVM: Map the kernel RO section into HYP

2015-11-16 Thread Mark Rutland
On Mon, Nov 16, 2015 at 01:11:55PM +, Marc Zyngier wrote:
> In order to run C code in HYP, we must make sure that the kernel's
> RO section is mapped into HYP (otherwise things break badly).

Somewhat tangential, but do we have any strong guarantees that the hyp
text is otherwise safe in its address space which differs from that of
the kernel proper?

i.e. do we need something like we did for the EFI stub in commit
e8f3010f7326c003 ("arm64/efi: isolate EFI stub from the kernel proper")?

Mark.

> Signed-off-by: Marc Zyngier 
> ---
>  arch/arm/kvm/arm.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index eab83b2..6c4549a 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -44,6 +44,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #ifdef REQUIRES_VIRT
>  __asm__(".arch_extension virt");
> @@ -1072,6 +1073,12 @@ static int init_hyp_mode(void)
>   goto out_free_mappings;
>   }
>  
> + err = create_hyp_mappings(__start_rodata, __end_rodata);
> + if (err) {
> + kvm_err("Cannot map rodata section\n");
> + goto out_free_mappings;
> + }
> +
>   /*
>* Map the Hyp stack pages
>*/
> -- 
> 2.1.4
> 
> ___
> kvmarm mailing list
> kvm...@lists.cs.columbia.edu
> https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 15/21] arm64: KVM: Add panic handling

2015-11-16 Thread Marc Zyngier
On 16/11/15 14:32, Mark Rutland wrote:
 +  /* Call panic for real */
 +  while (1) {
 +  unsigned long str_va = (unsigned long)__hyp_panic_string;
 +
 +  str_va -= HYP_PAGE_OFFSET;
 +  str_va += PAGE_OFFSET;
 +  __hyp_do_panic(str_va,
 + spsr,  elr,
 + read_sysreg(esr_el2),   read_sysreg(far_el2),
 + read_sysreg(hpfar_el2), par,
 + read_sysreg(tpidr_el2));
 +  }
 +}
>>>
>>> I think the while (1) here is confusing.
>>>
>>> Can we not jsut declare str_va at the start of the function and get rid
>>> of the loop?
>>
>> The while(1) is to prevent GCC from screaming (it otherwise believes
>> that the function actually returns, despite the __noreturn attribute).
> 
> Aha!
> 
> Perhaps a comment to that effect...?
> 
>> Or were you thinking of something else?
> 
> I just failed to derive the __noreturn problem from first principles.
> 
> Perhaps follow the __hyp_do_panic() call with an unreachable(), with the
> comment as to GCC failing to reason about the __noreturn? That would be
> less confusing than the loop, assuming that it works.

Worth giving it a try.

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 15/21] arm64: KVM: Add panic handling

2015-11-16 Thread Mark Rutland
> diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
> index 06d3e20..cdc2a96 100644
> --- a/arch/arm64/kvm/hyp/switch.c
> +++ b/arch/arm64/kvm/hyp/switch.c
> @@ -140,3 +140,38 @@ int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
>  
>   return exit_code;
>  }
> +
> +static const char *__hyp_panic_string = "HYP panic:\nPS:%08x PC:%p 
> ESR:%p\nFAR:%p HPFAR:%p PAR:%p\nVCPU:%p\n";

I assume that if [1] goes in we'll update this to match.

> +
> +void __hyp_text __noreturn __hyp_panic(void)
> +{
> + u64 spsr = read_sysreg(spsr_el2);
> + u64 elr = read_sysreg(elr_el2);
> + u64 par = read_sysreg(par_el1);
> +
> + if (read_sysreg(vttbr_el2)) {
> + struct kvm_vcpu *vcpu;
> + struct kvm_cpu_context *host_ctxt;
> +
> + vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);
> + host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> + __deactivate_traps(vcpu);
> + __deactivate_vm(vcpu);
> + __sysreg_restore_state(host_ctxt);
> +
> + write_sysreg(host_ctxt->gp_regs.sp_el1, sp_el1);

__sysreg_restore_state restores the host sp_el1, no?

> + }
> +
> + /* Call panic for real */
> + while (1) {
> + unsigned long str_va = (unsigned long)__hyp_panic_string;
> +
> + str_va -= HYP_PAGE_OFFSET;
> + str_va += PAGE_OFFSET;
> + __hyp_do_panic(str_va,
> +spsr,  elr,
> +read_sysreg(esr_el2),   read_sysreg(far_el2),
> +read_sysreg(hpfar_el2), par,
> +read_sysreg(tpidr_el2));
> + }
> +}

I think the while (1) here is confusing.

Can we not jsut declare str_va at the start of the function and get rid
of the loop?

Thanks,
Mark.

[1] 
http://lists.infradead.org/pipermail/linux-arm-kernel/2015-November/385199.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 21/21] arm64: KVM: Remove weak attributes

2015-11-16 Thread Marc Zyngier
As we've now switched to the new world switch implementation,
remove the weak attributes, as nobody is supposed to override
it anymore.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/debug-sr.c   |  5 ++---
 arch/arm64/kvm/hyp/hyp-entry.S  |  3 ---
 arch/arm64/kvm/hyp/switch.c |  5 ++---
 arch/arm64/kvm/hyp/tlb.c| 16 +++-
 arch/arm64/kvm/hyp/vgic-v3-sr.c |  5 ++---
 5 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
index c7da3ec..f8ec964 100644
--- a/arch/arm64/kvm/hyp/debug-sr.c
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -127,10 +127,9 @@ void __hyp_text __debug_clear_restore_state(struct 
kvm_vcpu *vcpu,
}
 }
 
-u32 __hyp_text __debug_read_mdcr_el2(void)
+static u32 __hyp_text __debug_read_mdcr_el2(void)
 {
return read_sysreg(mdcr_el2);
 }
 
-__alias(__debug_read_mdcr_el2)
-u32 __weak __kvm_get_mdcr_el2(void);
+__alias(__debug_read_mdcr_el2) u32 __kvm_get_mdcr_el2(void);
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 28de58f..658b9af 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -175,9 +175,7 @@ ENDPROC(\label)
 
.align 11
 
-   .weak   __kvm_hyp_vector
 ENTRY(__kvm_hyp_vector)
-ENTRY(__hyp_vector)
ventry  el2t_sync_invalid   // Synchronous EL2t
ventry  el2t_irq_invalid// IRQ EL2t
ventry  el2t_fiq_invalid// FIQ EL2t
@@ -197,5 +195,4 @@ ENTRY(__hyp_vector)
ventry  el1_irq // IRQ 32-bit EL1
ventry  el1_fiq_invalid // FIQ 32-bit EL1
ventry  el1_error_invalid   // Error 32-bit EL1
-ENDPROC(__hyp_vector)
 ENDPROC(__kvm_hyp_vector)
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index ef58066..a2885f5 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -84,7 +84,7 @@ static void __hyp_text __vgic_restore_state(struct kvm_vcpu 
*vcpu)
__vgic_call_restore_state()(vcpu);
 }
 
-int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
+static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
 {
struct kvm_cpu_context *host_ctxt;
struct kvm_cpu_context *guest_ctxt;
@@ -141,8 +141,7 @@ int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
return exit_code;
 }
 
-__alias(__guest_run)
-int __weak __kvm_vcpu_run(struct kvm_vcpu *vcpu);
+__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 static const char *__hyp_panic_string = "HYP panic:\nPS:%08x PC:%p 
ESR:%p\nFAR:%p HPFAR:%p PAR:%p\nVCPU:%p\n";
 
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index 2c279a8..250e06c 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -17,7 +17,7 @@
 
 #include "hyp.h"
 
-void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
dsb(ishst);
 
@@ -47,10 +47,10 @@ void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, 
phys_addr_t ipa)
write_sysreg(0, vttbr_el2);
 }
 
-__alias(__tlb_flush_vmid_ipa)
-void __weak __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
+__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm,
+   phys_addr_t ipa);
 
-void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
+static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
 {
dsb(ishst);
 
@@ -66,10 +66,9 @@ void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
write_sysreg(0, vttbr_el2);
 }
 
-__alias(__tlb_flush_vmid)
-void __weak __kvm_tlb_flush_vmid(struct kvm *kvm);
+__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm);
 
-void __hyp_text __tlb_flush_vm_context(void)
+static void __hyp_text __tlb_flush_vm_context(void)
 {
dsb(ishst);
asm volatile("tlbi alle1is  \n"
@@ -77,5 +76,4 @@ void __hyp_text __tlb_flush_vm_context(void)
dsb(ish);
 }
 
-__alias(__tlb_flush_vm_context)
-void __weak __kvm_flush_vm_context(void);
+__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void);
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 0cf316c..9189f71 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -216,10 +216,9 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu 
*vcpu)
}
 }
 
-u64 __hyp_text __vgic_v3_read_ich_vtr_el2(void)
+static u64 __hyp_text __vgic_v3_read_ich_vtr_el2(void)
 {
return read_gicreg(ICH_VTR_EL2);
 }
 
-__alias(__vgic_v3_read_ich_vtr_el2)
-u64 __weak __vgic_v3_get_ich_vtr_el2(void);
+__alias(__vgic_v3_read_ich_vtr_el2) u64 __vgic_v3_get_ich_vtr_el2(void);
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 12/21] arm64: KVM: Implement fpsimd save/restore

2015-11-16 Thread Marc Zyngier
Implement the fpsimd save restore, keeping the lazy part in
assembler (as returning to C would be overkill).

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/Makefile |  1 +
 arch/arm64/kvm/hyp/entry.S  | 32 +++-
 arch/arm64/kvm/hyp/fpsimd.S | 33 +
 arch/arm64/kvm/hyp/hyp.h|  3 +++
 arch/arm64/kvm/hyp/switch.c |  8 
 5 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/kvm/hyp/fpsimd.S

diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 9c11b0f..56238d0 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += entry.o
 obj-$(CONFIG_KVM_ARM_HOST) += switch.o
+obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 2c4449a..7552922 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -27,6 +27,7 @@
 
 #define CPU_GP_REG_OFFSET(x)   (CPU_GP_REGS + x)
 #define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x)
+#define CPU_SYSREG_OFFSET(x)   (CPU_SYSREGS + 8*x)
 
.text
.pushsection.hyp.text, "ax"
@@ -152,4 +153,33 @@ ENTRY(__guest_exit)
ret
 ENDPROC(__guest_exit)
 
-   /* Insert fault handling here */
+ENTRY(__fpsimd_guest_restore)
+   pushx4, lr
+
+   mrs x2, cptr_el2
+   bic x2, x2, #CPTR_EL2_TFP
+   msr cptr_el2, x2
+   isb
+
+   mrs x3, tpidr_el2
+
+   ldr x0, [x3, #VCPU_HOST_CONTEXT]
+   kern_hyp_va x0
+   add x0, x0, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
+   bl  __fpsimd_save_state
+
+   add x2, x3, #VCPU_CONTEXT
+   add x0, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
+   bl  __fpsimd_restore_state
+
+   mrs x1, hcr_el2
+   tbnzx1, #HCR_RW_SHIFT, 1f
+   ldr x4, [x2, #CPU_SYSREG_OFFSET(FPEXC32_EL2)]
+   msr fpexc32_el2, x4
+1:
+   pop x4, lr
+   pop x2, x3
+   pop x0, x1
+
+   eret
+ENDPROC(__fpsimd_guest_restore)
diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S
new file mode 100644
index 000..da3f22c
--- /dev/null
+++ b/arch/arm64/kvm/hyp/fpsimd.S
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#include 
+
+#include 
+
+   .text
+   .pushsection.hyp.text, "ax"
+
+ENTRY(__fpsimd_save_state)
+   fpsimd_save x0, 1
+   ret
+ENDPROC(__fpsimd_save_state)
+
+ENTRY(__fpsimd_restore_state)
+   fpsimd_restore  x0, 1
+   ret
+ENDPROC(__fpsimd_restore_state)
diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h
index bf13238..240fb79 100644
--- a/arch/arm64/kvm/hyp/hyp.h
+++ b/arch/arm64/kvm/hyp/hyp.h
@@ -70,6 +70,9 @@ void __debug_clear_restore_state(struct kvm_vcpu *vcpu,
 struct kvm_guest_debug_arch *dbg,
 struct kvm_cpu_context *ctxt);
 
+void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
+void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
+
 u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
 
 #endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index a3af81a..06d3e20 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -88,6 +88,7 @@ int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
 {
struct kvm_cpu_context *host_ctxt;
struct kvm_cpu_context *guest_ctxt;
+   bool fp_enabled;
u64 exit_code;
 
vcpu = kern_hyp_va(vcpu);
@@ -117,6 +118,8 @@ int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
exit_code = __guest_enter(vcpu, host_ctxt);
/* And we're baaack! */
 
+   fp_enabled = !!(read_sysreg(cptr_el2) & CPTR_EL2_TFP);
+
__sysreg_save_state(guest_ctxt);
__sysreg32_save_state(vcpu);
__timer_save_state(vcpu);
@@ -127,6 +130,11 @@ int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
 
__sysreg_restore_state(host_ctxt);
 
+   if (fp_enabled) {
+   __fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
+   __fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
+   

[PATCH 20/21] arm64: KVM: Cleanup asm-offset.c

2015-11-16 Thread Marc Zyngier
As we've now rewritten most of our code-base in C, most of the
KVM-specific code in asm-offset.c is useless. Delete-time again!

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kernel/asm-offsets.c | 39 ---
 1 file changed, 39 deletions(-)

diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 4b72231..94090a6 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -108,50 +108,11 @@ int main(void)
   DEFINE(CPU_GP_REGS,  offsetof(struct kvm_cpu_context, gp_regs));
   DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_regs, regs));
   DEFINE(CPU_FP_REGS,  offsetof(struct kvm_regs, fp_regs));
-  DEFINE(CPU_SP_EL1,   offsetof(struct kvm_regs, sp_el1));
-  DEFINE(CPU_ELR_EL1,  offsetof(struct kvm_regs, elr_el1));
-  DEFINE(CPU_SPSR, offsetof(struct kvm_regs, spsr));
-  DEFINE(CPU_SYSREGS,  offsetof(struct kvm_cpu_context, sys_regs));
   DEFINE(VCPU_FPEXC32_EL2, offsetof(struct kvm_vcpu, 
arch.ctxt.sys_regs[FPEXC32_EL2]));
   DEFINE(VCPU_ESR_EL2, offsetof(struct kvm_vcpu, arch.fault.esr_el2));
   DEFINE(VCPU_FAR_EL2, offsetof(struct kvm_vcpu, arch.fault.far_el2));
   DEFINE(VCPU_HPFAR_EL2,   offsetof(struct kvm_vcpu, 
arch.fault.hpfar_el2));
-  DEFINE(VCPU_DEBUG_FLAGS, offsetof(struct kvm_vcpu, arch.debug_flags));
-  DEFINE(VCPU_DEBUG_PTR,   offsetof(struct kvm_vcpu, arch.debug_ptr));
-  DEFINE(DEBUG_BCR,offsetof(struct kvm_guest_debug_arch, dbg_bcr));
-  DEFINE(DEBUG_BVR,offsetof(struct kvm_guest_debug_arch, dbg_bvr));
-  DEFINE(DEBUG_WCR,offsetof(struct kvm_guest_debug_arch, dbg_wcr));
-  DEFINE(DEBUG_WVR,offsetof(struct kvm_guest_debug_arch, dbg_wvr));
-  DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2));
-  DEFINE(VCPU_MDCR_EL2,offsetof(struct kvm_vcpu, arch.mdcr_el2));
-  DEFINE(VCPU_IRQ_LINES,   offsetof(struct kvm_vcpu, arch.irq_lines));
   DEFINE(VCPU_HOST_CONTEXT,offsetof(struct kvm_vcpu, 
arch.host_cpu_context));
-  DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, 
arch.host_debug_state));
-  DEFINE(VCPU_TIMER_CNTV_CTL,  offsetof(struct kvm_vcpu, 
arch.timer_cpu.cntv_ctl));
-  DEFINE(VCPU_TIMER_CNTV_CVAL, offsetof(struct kvm_vcpu, 
arch.timer_cpu.cntv_cval));
-  DEFINE(KVM_TIMER_CNTVOFF,offsetof(struct kvm, arch.timer.cntvoff));
-  DEFINE(KVM_TIMER_ENABLED,offsetof(struct kvm, arch.timer.enabled));
-  DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
-  DEFINE(VCPU_VGIC_CPU,offsetof(struct kvm_vcpu, 
arch.vgic_cpu));
-  DEFINE(VGIC_V2_CPU_HCR,  offsetof(struct vgic_cpu, vgic_v2.vgic_hcr));
-  DEFINE(VGIC_V2_CPU_VMCR, offsetof(struct vgic_cpu, vgic_v2.vgic_vmcr));
-  DEFINE(VGIC_V2_CPU_MISR, offsetof(struct vgic_cpu, vgic_v2.vgic_misr));
-  DEFINE(VGIC_V2_CPU_EISR, offsetof(struct vgic_cpu, vgic_v2.vgic_eisr));
-  DEFINE(VGIC_V2_CPU_ELRSR,offsetof(struct vgic_cpu, vgic_v2.vgic_elrsr));
-  DEFINE(VGIC_V2_CPU_APR,  offsetof(struct vgic_cpu, vgic_v2.vgic_apr));
-  DEFINE(VGIC_V2_CPU_LR,   offsetof(struct vgic_cpu, vgic_v2.vgic_lr));
-  DEFINE(VGIC_V3_CPU_SRE,  offsetof(struct vgic_cpu, vgic_v3.vgic_sre));
-  DEFINE(VGIC_V3_CPU_HCR,  offsetof(struct vgic_cpu, vgic_v3.vgic_hcr));
-  DEFINE(VGIC_V3_CPU_VMCR, offsetof(struct vgic_cpu, vgic_v3.vgic_vmcr));
-  DEFINE(VGIC_V3_CPU_MISR, offsetof(struct vgic_cpu, vgic_v3.vgic_misr));
-  DEFINE(VGIC_V3_CPU_EISR, offsetof(struct vgic_cpu, vgic_v3.vgic_eisr));
-  DEFINE(VGIC_V3_CPU_ELRSR,offsetof(struct vgic_cpu, vgic_v3.vgic_elrsr));
-  DEFINE(VGIC_V3_CPU_AP0R, offsetof(struct vgic_cpu, vgic_v3.vgic_ap0r));
-  DEFINE(VGIC_V3_CPU_AP1R, offsetof(struct vgic_cpu, vgic_v3.vgic_ap1r));
-  DEFINE(VGIC_V3_CPU_LR,   offsetof(struct vgic_cpu, vgic_v3.vgic_lr));
-  DEFINE(VGIC_CPU_NR_LR,   offsetof(struct vgic_cpu, nr_lr));
-  DEFINE(KVM_VTTBR,offsetof(struct kvm, arch.vttbr));
-  DEFINE(KVM_VGIC_VCTRL,   offsetof(struct kvm, arch.vgic.vctrl_base));
 #endif
 #ifdef CONFIG_CPU_PM
   DEFINE(CPU_SUSPEND_SZ,   sizeof(struct cpu_suspend_ctx));
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/21] arm64: KVM: Add a HYP-specific header file

2015-11-16 Thread Marc Zyngier
In order to expose the various EL2 services that are private to
the hypervisor, add a new hyp.h file.

So far, it only contains mundane things such as section annotation
and VA manipulation.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/hyp.h | 31 +++
 1 file changed, 31 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/hyp.h

diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h
new file mode 100644
index 000..dac843e
--- /dev/null
+++ b/arch/arm64/kvm/hyp/hyp.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#ifndef __ARM64_KVM_HYP_H__
+#define __ARM64_KVM_HYP_H__
+
+#include 
+#include 
+#include 
+#include 
+
+#define __hyp_text __section(.hyp.text) notrace
+
+#define kern_hyp_va(v) (typeof(v))((unsigned long)v & HYP_PAGE_OFFSET_MASK)
+
+#endif /* __ARM64_KVM_HYP_H__ */
+
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 00/21] arm64: KVM: world switch in C

2015-11-16 Thread Marc Zyngier
Once upon a time, the KVM/arm64 world switch was a nice, clean, lean
and mean piece of hand-crafted assembly code. Over time, features have
crept in, the code has become harder to maintain, and the smallest
change is a pain to introduce. The VHE patches are a prime example of
why this doesn't work anymore.

This series rewrites most of the existing assembly code in C, but keeps
the existing code structure in place (most function names will look
familiar to the reader). The biggest change is that we don't have to
deal with a static register allocation (the compiler does it for us),
we can easily follow structure and pointers, and only the lowest level
is still in assembly code. Oh, and a negative diffstat.

There is still a healthy dose of inline assembly (system register
accessors, runtime code patching), but I've tried not to make it too
invasive. The generated code, while not exactly brilliant, doesn't
look too shabby. I do expect a small performance degradation, but I
believe this is something we can improve over time (my initial
measurements don't show any obvious regression).

Eventually (and assuming people are happy with the general approach
taken here), it should be possible to make the 32bit converge with
this and reuse some parts of the code.

Patches are against 4.4-rc1 (mostly), and I've pushed a branch out
(kvm-arm64/wsinc). This has been tested on Juno, Seattle and the FVP
model. I also have pushed out kvm-arm64/vhe-wsinc that implements VHE
on top of these patches.
M.

Marc Zyngier (20):
  arm64: KVM: Add a HYP-specific header file
  arm64: KVM: Implement vgic-v2 save/restore
  arm64: KVM: Implement vgic-v3 save/restore
  arm64: KVM: Implement timer save/restore
  arm64: KVM: Implement system register save/restore
  arm64: KVM: Implement 32bit system register save/restore
  arm64: KVM: Implement debug save/restore
  arm64: KVM: Implement guest entry
  arm64: KVM: Add patchable function selector
  arm64: KVM: Implement the core world switch
  arm64: KVM: Implement fpsimd save/restore
  arm64: KVM: Implement TLB handling
  arm64: KVM: HYP mode entry points
  arm64: KVM: Add panic handling
  arm64: KVM: Add compatibility aliases
  arm64: KVM: Map the kernel RO section into HYP
  arm64: KVM: Move away from the assembly version of the world switch
  arm64: KVM: Turn system register numbers to an enum
  arm64: KVM: Cleanup asm-offset.c
  arm64: KVM: Remove weak attributes

Mark Rutland (1):
  arm64: add macros to read/write system registers

 arch/arm/kvm/arm.c   |7 +
 arch/arm64/include/asm/kvm_asm.h |   76 ---
 arch/arm64/include/asm/kvm_emulate.h |1 -
 arch/arm64/include/asm/kvm_host.h|   81 ++-
 arch/arm64/include/asm/kvm_mmio.h|1 -
 arch/arm64/include/asm/sysreg.h  |   17 +
 arch/arm64/kernel/asm-offsets.c  |   40 +-
 arch/arm64/kvm/Makefile  |3 +-
 arch/arm64/kvm/guest.c   |1 -
 arch/arm64/kvm/handle_exit.c |1 +
 arch/arm64/kvm/hyp.S | 1071 +-
 arch/arm64/kvm/hyp/Makefile  |   14 +
 arch/arm64/kvm/hyp/debug-sr.c|  135 +
 arch/arm64/kvm/hyp/entry.S   |  184 ++
 arch/arm64/kvm/hyp/fpsimd.S  |   33 ++
 arch/arm64/kvm/hyp/hyp-entry.S   |  198 +++
 arch/arm64/kvm/hyp/hyp.h |   80 +++
 arch/arm64/kvm/hyp/switch.c  |  179 ++
 arch/arm64/kvm/hyp/sysreg-sr.c   |  132 +
 arch/arm64/kvm/hyp/timer-sr.c|   68 +++
 arch/arm64/kvm/hyp/tlb.c |   79 +++
 arch/arm64/kvm/hyp/vgic-v2-sr.c  |   85 +++
 arch/arm64/kvm/hyp/vgic-v3-sr.c  |  224 +++
 arch/arm64/kvm/sys_regs.c|1 +
 arch/arm64/kvm/vgic-v2-switch.S  |  134 -
 arch/arm64/kvm/vgic-v3-switch.S  |  269 -
 virt/kvm/arm/vgic-v3.c   |1 +
 27 files changed, 1521 insertions(+), 1594 deletions(-)
 create mode 100644 arch/arm64/kvm/hyp/Makefile
 create mode 100644 arch/arm64/kvm/hyp/debug-sr.c
 create mode 100644 arch/arm64/kvm/hyp/entry.S
 create mode 100644 arch/arm64/kvm/hyp/fpsimd.S
 create mode 100644 arch/arm64/kvm/hyp/hyp-entry.S
 create mode 100644 arch/arm64/kvm/hyp/hyp.h
 create mode 100644 arch/arm64/kvm/hyp/switch.c
 create mode 100644 arch/arm64/kvm/hyp/sysreg-sr.c
 create mode 100644 arch/arm64/kvm/hyp/timer-sr.c
 create mode 100644 arch/arm64/kvm/hyp/tlb.c
 create mode 100644 arch/arm64/kvm/hyp/vgic-v2-sr.c
 create mode 100644 arch/arm64/kvm/hyp/vgic-v3-sr.c
 delete mode 100644 arch/arm64/kvm/vgic-v2-switch.S
 delete mode 100644 arch/arm64/kvm/vgic-v3-switch.S

-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/21] arm64: KVM: Implement vgic-v2 save/restore

2015-11-16 Thread Marc Zyngier
Implement the vgic-v2 save restore as a direct translation of
the assembly code version.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/Makefile |  1 +
 arch/arm64/kvm/hyp/Makefile |  5 +++
 arch/arm64/kvm/hyp/hyp.h|  3 ++
 arch/arm64/kvm/hyp/vgic-v2-sr.c | 85 +
 4 files changed, 94 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/Makefile
 create mode 100644 arch/arm64/kvm/hyp/vgic-v2-sr.c

diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 1949fe5..d31e4e5 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -10,6 +10,7 @@ KVM=../../../virt/kvm
 ARM=../../../arch/arm/kvm
 
 obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
+obj-$(CONFIG_KVM_ARM_HOST) += hyp/
 
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o 
$(KVM)/eventfd.o $(KVM)/vfio.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
new file mode 100644
index 000..d8d5968
--- /dev/null
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for Kernel-based Virtual Machine module, HYP part
+#
+
+obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-sr.o
diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h
index dac843e..78f25c4 100644
--- a/arch/arm64/kvm/hyp/hyp.h
+++ b/arch/arm64/kvm/hyp/hyp.h
@@ -27,5 +27,8 @@
 
 #define kern_hyp_va(v) (typeof(v))((unsigned long)v & HYP_PAGE_OFFSET_MASK)
 
+void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
+void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+
 #endif /* __ARM64_KVM_HYP_H__ */
 
diff --git a/arch/arm64/kvm/hyp/vgic-v2-sr.c b/arch/arm64/kvm/hyp/vgic-v2-sr.c
new file mode 100644
index 000..1382d2e
--- /dev/null
+++ b/arch/arm64/kvm/hyp/vgic-v2-sr.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include "hyp.h"
+
+/* vcpu is already in the HYP VA space */
+void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+   struct vgic_v2_cpu_if *cpu_if = &kvm->arch.vgic_cpu.vgic_v2;
+   struct vgic_dist *vgic = &kvm->arch.vgic;
+   void __iomem *base = kern_hyp_va(vgic->vctrl_base);
+   u32 __iomem *lr_base;
+   u32 eisr0, eisr1, elrsr0, elrsr1;
+   int i = 0, nr_lr;
+
+   if (!base)
+   return;
+
+   cpu_if->vgic_vmcr = readl_relaxed(base + GICH_VMCR);
+   cpu_if->vgic_misr = readl_relaxed(base + GICH_MISR);
+   eisr0  = readl_relaxed(base + GICH_EISR0);
+   eisr1  = readl_relaxed(base + GICH_EISR1);
+   elrsr0 = readl_relaxed(base + GICH_ELRSR0);
+   elrsr1 = readl_relaxed(base + GICH_ELRSR1);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+   cpu_if->vgic_eisr  = ((u64)eisr0 << 32) | eisr1;
+   cpu_if->vgic_elrsr = ((u64)elrsr0 << 32) | elrsr1;
+#else
+   cpu_if->vgic_eisr  = ((u64)eisr1 << 32) | eisr0;
+   cpu_if->vgic_elrsr = ((u64)elrsr1 << 32) | elrsr0;
+#endif
+   cpu_if->vgic_apr= readl_relaxed(base + GICH_APR);
+
+   writel_relaxed(0, base + GICH_HCR);
+
+   lr_base = base + GICH_LR0;
+   nr_lr = vcpu->arch.vgic_cpu.nr_lr;
+   do {
+   cpu_if->vgic_lr[i++] = readl_relaxed(lr_base++);
+   } while (--nr_lr);
+}
+
+void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+   struct vgic_v2_cpu_if *cpu_if = &kvm->arch.vgic_cpu.vgic_v2;
+   struct vgic_dist *vgic = &kvm->arch.vgic;
+   void __iomem *base = kern_hyp_va(vgic->vctrl_base);
+   u32 __iomem *lr_base;
+   unsigned int i = 0, nr_lr;
+
+   if (!base)
+   return;
+
+   writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
+   writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR);
+   writel_relaxed(cpu_if->vgic_apr, base + GICH_APR);
+
+   lr_base = base + GICH_LR0;
+   nr_lr = vcpu->arch.vgic_cpu.nr_lr;
+   do {
+   writel_relaxed(cpu_if->vgic_lr[i++], lr_base++);
+   } while (--nr_lr);
+}
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/21] arm64: KVM: Implement vgic-v3 save/restore

2015-11-16 Thread Marc Zyngier
Implement the vgic-v3 save restore as a direct translation of
the assembly code version.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/Makefile |   1 +
 arch/arm64/kvm/hyp/hyp.h|   3 +
 arch/arm64/kvm/hyp/vgic-v3-sr.c | 222 
 3 files changed, 226 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/vgic-v3-sr.c

diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index d8d5968..d1e38ce 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -3,3 +3,4 @@
 #
 
 obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h
index 78f25c4..a31cb6e 100644
--- a/arch/arm64/kvm/hyp/hyp.h
+++ b/arch/arm64/kvm/hyp/hyp.h
@@ -30,5 +30,8 @@
 void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
 void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
 
+void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
+void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
+
 #endif /* __ARM64_KVM_HYP_H__ */
 
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
new file mode 100644
index 000..f2289ab
--- /dev/null
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include "hyp.h"
+
+/*
+ * We store LRs in reverse order to let the CPU deal with streaming
+ * access. Use this macro to make it look saner...
+ */
+#define LR_OFFSET(n)   (15 - n)
+
+#define read_gicreg(r) \
+   ({  \
+   u64 reg;\
+   asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \
+   reg;\
+   })
+
+#define write_gicreg(v,r)  \
+   do {\
+   u64 __val = (v);\
+   asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\
+   } while (0)
+
+/* vcpu is already in the HYP VA space */
+void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
+{
+   struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+   u64 val;
+   u32 nr_lr, nr_pri;
+
+   /*
+* Make sure stores to the GIC via the memory mapped interface
+* are now visible to the system register interface.
+*/
+   dsb(st);
+
+   cpu_if->vgic_vmcr  = read_gicreg(ICH_VMCR_EL2);
+   cpu_if->vgic_misr  = read_gicreg(ICH_MISR_EL2);
+   cpu_if->vgic_eisr  = read_gicreg(ICH_EISR_EL2);
+   cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2);
+
+   write_gicreg(0, ICH_HCR_EL2);
+   val = read_gicreg(ICH_VTR_EL2);
+   nr_lr = val & 0xf;
+   nr_pri = ((u32)val >> 29) + 1;
+
+   switch (nr_lr) {
+   case 15:
+   cpu_if->vgic_lr[LR_OFFSET(15)] = read_gicreg(ICH_LR15_EL2);
+   case 14:
+   cpu_if->vgic_lr[LR_OFFSET(14)] = read_gicreg(ICH_LR14_EL2);
+   case 13:
+   cpu_if->vgic_lr[LR_OFFSET(13)] = read_gicreg(ICH_LR13_EL2);
+   case 12:
+   cpu_if->vgic_lr[LR_OFFSET(12)] = read_gicreg(ICH_LR12_EL2);
+   case 11:
+   cpu_if->vgic_lr[LR_OFFSET(11)] = read_gicreg(ICH_LR11_EL2);
+   case 10:
+   cpu_if->vgic_lr[LR_OFFSET(10)] = read_gicreg(ICH_LR10_EL2);
+   case 9:
+   cpu_if->vgic_lr[LR_OFFSET(9)] = read_gicreg(ICH_LR9_EL2);
+   case 8:
+   cpu_if->vgic_lr[LR_OFFSET(8)] = read_gicreg(ICH_LR8_EL2);
+   case 7:
+   cpu_if->vgic_lr[LR_OFFSET(7)] = read_gicreg(ICH_LR7_EL2);
+   case 6:
+   cpu_if->vgic_lr[LR_OFFSET(6)] = read_gicreg(ICH_LR6_EL2);
+   case 5:
+   cpu_if->vgic_lr[LR_OFFSET(5)] = read_gicreg(ICH_LR5_EL2);
+   case 4:
+   cpu_if->vgic_lr[LR_OFFSET(4)] = read_gicreg(ICH_LR4_EL2);
+   case 3:
+   cpu_if->vgic_lr[LR_OFFSET(3)] = read_gicreg(ICH_LR3_EL2);
+   case 2:
+   cpu_if->vgic_lr[LR_OFFSET(2)] = read_gicreg(ICH_LR2_EL2);
+   

[PATCH 01/21] arm64: add macros to read/write system registers

2015-11-16 Thread Marc Zyngier
From: Mark Rutland 

Rather than crafting custom macros for reading/writing each system
register provide generics accessors, read_sysreg and write_sysreg, for
this purpose.

Unlike read_cpuid, calls to read_exception_reg are never expected
to be optimized away or replaced with synthetic values.

Signed-off-by: Mark Rutland 
Cc: Catalin Marinas 
Cc: Marc Zyngier 
Cc: Suzuki Poulose 
Cc: Will Deacon 
Signed-off-by: Marc Zyngier 
---
 arch/arm64/include/asm/sysreg.h | 17 +
 1 file changed, 17 insertions(+)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index d48ab5b..c9c283a 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -20,6 +20,8 @@
 #ifndef __ASM_SYSREG_H
 #define __ASM_SYSREG_H
 
+#include 
+
 #include 
 
 /*
@@ -208,6 +210,8 @@
 
 #else
 
+#include 
+
 asm(
 "  .irp
num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n"
 "  .equ__reg_num_x\\num, \\num\n"
@@ -232,6 +236,19 @@ static inline void config_sctlr_el1(u32 clear, u32 set)
val |= set;
asm volatile("msr sctlr_el1, %0" : : "r" (val));
 }
+
+#define read_sysreg(r) ({  \
+   u64 __val;  \
+   asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \
+   __val;  \
+})
+
+#define write_sysreg(v, r) do {\
+   u64 __val = (u64)v; \
+   asm volatile("msr " __stringify(r) ", %0"   \
+: : "r" (__val));  \
+} while (0)
+
 #endif
 
 #endif /* __ASM_SYSREG_H */
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 13/21] arm64: KVM: Implement TLB handling

2015-11-16 Thread Marc Zyngier
Implement the TLB handling as a direct translation of the assembly
code version.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/Makefile |  1 +
 arch/arm64/kvm/hyp/tlb.c| 72 +
 2 files changed, 73 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/tlb.c

diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 56238d0..1a529f5 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += entry.o
 obj-$(CONFIG_KVM_ARM_HOST) += switch.o
 obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
+obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
new file mode 100644
index 000..d4a07d0
--- /dev/null
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "hyp.h"
+
+void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+{
+   dsb(ishst);
+
+   /* Switch to requested VMID */
+   kvm = kern_hyp_va(kvm);
+   write_sysreg(kvm->arch.vttbr, vttbr_el2);
+   isb();
+
+   /*
+* We could do so much better if we had the VA as well.
+* Instead, we invalidate Stage-2 for this IPA, and the
+* whole of Stage-1. Weep...
+*/
+   ipa >>= 12;
+   asm volatile("tlbi ipas2e1is, %0" : : "r" (ipa));
+   dsb(ish);
+   /*
+* We have to ensure completion of the invalidation at Stage-2,
+* since a table walk on another CPU could refill a TLB with a
+* complete (S1 + S2) walk based on the old Stage-2 mapping if
+* the Stage-1 invalidation happened first.
+*/
+   asm volatile("tlbi vmalle1is" : : );
+   dsb(ish);
+   isb();
+
+   write_sysreg(0, vttbr_el2);
+}
+
+void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
+{
+   dsb(ishst);
+
+   /* Switch to requested VMID */
+   kvm = kern_hyp_va(kvm);
+   write_sysreg(kvm->arch.vttbr, vttbr_el2);
+   isb();
+
+   asm volatile("tlbi vmalls12e1is" : : );
+   dsb(ish);
+   isb();
+
+   write_sysreg(0, vttbr_el2);
+}
+
+void __hyp_text __tlb_flush_vm_context(void)
+{
+   dsb(ishst);
+   asm volatile("tlbi alle1is  \n"
+"ic ialluis  ": : );
+   dsb(ish);
+}
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/21] arm64: KVM: Implement timer save/restore

2015-11-16 Thread Marc Zyngier
Implement the timer save restore as a direct translation of
the assembly code version.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/Makefile   |  1 +
 arch/arm64/kvm/hyp/hyp.h  |  3 ++
 arch/arm64/kvm/hyp/timer-sr.c | 68 +++
 3 files changed, 72 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/timer-sr.c

diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index d1e38ce..455dc0a 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -4,3 +4,4 @@
 
 obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o
diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h
index a31cb6e..86aa5a2 100644
--- a/arch/arm64/kvm/hyp/hyp.h
+++ b/arch/arm64/kvm/hyp/hyp.h
@@ -33,5 +33,8 @@ void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
 void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
 void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
 
+void __timer_save_state(struct kvm_vcpu *vcpu);
+void __timer_restore_state(struct kvm_vcpu *vcpu);
+
 #endif /* __ARM64_KVM_HYP_H__ */
 
diff --git a/arch/arm64/kvm/hyp/timer-sr.c b/arch/arm64/kvm/hyp/timer-sr.c
new file mode 100644
index 000..1a1d2ac
--- /dev/null
+++ b/arch/arm64/kvm/hyp/timer-sr.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include 
+#include 
+
+#include 
+
+#include "hyp.h"
+
+/* vcpu is already in the HYP VA space */
+void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+   struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+   if (kvm->arch.timer.enabled) {
+   timer->cntv_ctl = read_sysreg(cntv_ctl_el0);
+   isb();
+   timer->cntv_cval = read_sysreg(cntv_cval_el0);
+   }
+
+   /* Disable the virtual timer */
+   write_sysreg(0, cntv_ctl_el0);
+
+   /* Allow physical timer/counter access for the host */
+   write_sysreg(read_sysreg(cnthctl_el2) | 3, cnthctl_el2);
+
+   /* Clear cntvoff for the host */
+   write_sysreg(0, cntvoff_el2);
+}
+
+void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+   struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+   u64 val;
+
+   /*
+* Disallow physical timer access for the guest
+* Physical counter access is allowed
+*/
+   val = read_sysreg(cnthctl_el2);
+   val &= ~(1 << 1);
+   val |= 1;
+   write_sysreg(val, cnthctl_el2);
+
+   if (kvm->arch.timer.enabled) {
+   write_sysreg(kvm->arch.timer.cntvoff, cntvoff_el2);
+   write_sysreg(timer->cntv_cval, cntv_cval_el0);
+   isb();
+   write_sysreg(timer->cntv_ctl, cntv_ctl_el0);
+   }
+}
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 15/21] arm64: KVM: Add panic handling

2015-11-16 Thread Marc Zyngier
On 16/11/15 14:16, Mark Rutland wrote:
>> diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
>> index 06d3e20..cdc2a96 100644
>> --- a/arch/arm64/kvm/hyp/switch.c
>> +++ b/arch/arm64/kvm/hyp/switch.c
>> @@ -140,3 +140,38 @@ int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
>>  
>>  return exit_code;
>>  }
>> +
>> +static const char *__hyp_panic_string = "HYP panic:\nPS:%08x PC:%p 
>> ESR:%p\nFAR:%p HPFAR:%p PAR:%p\nVCPU:%p\n";
> 
> I assume that if [1] goes in we'll update this to match.

Definitely.

>> +
>> +void __hyp_text __noreturn __hyp_panic(void)
>> +{
>> +u64 spsr = read_sysreg(spsr_el2);
>> +u64 elr = read_sysreg(elr_el2);
>> +u64 par = read_sysreg(par_el1);
>> +
>> +if (read_sysreg(vttbr_el2)) {
>> +struct kvm_vcpu *vcpu;
>> +struct kvm_cpu_context *host_ctxt;
>> +
>> +vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);
>> +host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>> +__deactivate_traps(vcpu);
>> +__deactivate_vm(vcpu);
>> +__sysreg_restore_state(host_ctxt);
>> +
>> +write_sysreg(host_ctxt->gp_regs.sp_el1, sp_el1);
> 
> __sysreg_restore_state restores the host sp_el1, no?

Better safe than sorry! ;-) Looks like a leftover from some ancient
version... I'll fix that.

>> +}
>> +
>> +/* Call panic for real */
>> +while (1) {
>> +unsigned long str_va = (unsigned long)__hyp_panic_string;
>> +
>> +str_va -= HYP_PAGE_OFFSET;
>> +str_va += PAGE_OFFSET;
>> +__hyp_do_panic(str_va,
>> +   spsr,  elr,
>> +   read_sysreg(esr_el2),   read_sysreg(far_el2),
>> +   read_sysreg(hpfar_el2), par,
>> +   read_sysreg(tpidr_el2));
>> +}
>> +}
> 
> I think the while (1) here is confusing.
> 
> Can we not jsut declare str_va at the start of the function and get rid
> of the loop?

The while(1) is to prevent GCC from screaming (it otherwise believes
that the function actually returns, despite the __noreturn attribute).

Or were you thinking of something else?

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 19/21] arm64: KVM: Turn system register numbers to an enum

2015-11-16 Thread Marc Zyngier
Having the system register numbers as #defines has been a pain
since day one, as the ordering is pretty fragile, and moving
things around leads to renumbering and epic conflict resolutions.

Now that we're mostly acessing the sysreg file in C, an enum is
a much better type to use, and we can clean things up a bit.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/include/asm/kvm_asm.h | 76 -
 arch/arm64/include/asm/kvm_emulate.h |  1 -
 arch/arm64/include/asm/kvm_host.h| 81 +++-
 arch/arm64/include/asm/kvm_mmio.h|  1 -
 arch/arm64/kernel/asm-offsets.c  |  1 +
 arch/arm64/kvm/guest.c   |  1 -
 arch/arm64/kvm/handle_exit.c |  1 +
 arch/arm64/kvm/hyp/debug-sr.c|  1 +
 arch/arm64/kvm/hyp/entry.S   |  3 +-
 arch/arm64/kvm/hyp/sysreg-sr.c   |  1 +
 arch/arm64/kvm/sys_regs.c|  1 +
 virt/kvm/arm/vgic-v3.c   |  1 +
 12 files changed, 87 insertions(+), 82 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 5e37710..52b777b 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -20,82 +20,6 @@
 
 #include 
 
-/*
- * 0 is reserved as an invalid value.
- * Order *must* be kept in sync with the hyp switch code.
- */
-#defineMPIDR_EL1   1   /* MultiProcessor Affinity Register */
-#defineCSSELR_EL1  2   /* Cache Size Selection Register */
-#defineSCTLR_EL1   3   /* System Control Register */
-#defineACTLR_EL1   4   /* Auxiliary Control Register */
-#defineCPACR_EL1   5   /* Coprocessor Access Control */
-#defineTTBR0_EL1   6   /* Translation Table Base Register 0 */
-#defineTTBR1_EL1   7   /* Translation Table Base Register 1 */
-#defineTCR_EL1 8   /* Translation Control Register */
-#defineESR_EL1 9   /* Exception Syndrome Register */
-#defineAFSR0_EL1   10  /* Auxilary Fault Status Register 0 */
-#defineAFSR1_EL1   11  /* Auxilary Fault Status Register 1 */
-#defineFAR_EL1 12  /* Fault Address Register */
-#defineMAIR_EL113  /* Memory Attribute Indirection 
Register */
-#defineVBAR_EL114  /* Vector Base Address Register */
-#defineCONTEXTIDR_EL1  15  /* Context ID Register */
-#defineTPIDR_EL0   16  /* Thread ID, User R/W */
-#defineTPIDRRO_EL0 17  /* Thread ID, User R/O */
-#defineTPIDR_EL1   18  /* Thread ID, Privileged */
-#defineAMAIR_EL1   19  /* Aux Memory Attribute Indirection 
Register */
-#defineCNTKCTL_EL1 20  /* Timer Control Register (EL1) */
-#definePAR_EL1 21  /* Physical Address Register */
-#define MDSCR_EL1  22  /* Monitor Debug System Control Register */
-#define MDCCINT_EL123  /* Monitor Debug Comms Channel Interrupt Enable 
Reg */
-
-/* 32bit specific registers. Keep them at the end of the range */
-#defineDACR32_EL2  24  /* Domain Access Control Register */
-#defineIFSR32_EL2  25  /* Instruction Fault Status Register */
-#defineFPEXC32_EL2 26  /* Floating-Point Exception Control 
Register */
-#defineDBGVCR32_EL227  /* Debug Vector Catch Register */
-#defineNR_SYS_REGS 28
-
-/* 32bit mapping */
-#define c0_MPIDR   (MPIDR_EL1 * 2) /* MultiProcessor ID Register */
-#define c0_CSSELR  (CSSELR_EL1 * 2)/* Cache Size Selection Register */
-#define c1_SCTLR   (SCTLR_EL1 * 2) /* System Control Register */
-#define c1_ACTLR   (ACTLR_EL1 * 2) /* Auxiliary Control Register */
-#define c1_CPACR   (CPACR_EL1 * 2) /* Coprocessor Access Control */
-#define c2_TTBR0   (TTBR0_EL1 * 2) /* Translation Table Base Register 0 */
-#define c2_TTBR0_high  (c2_TTBR0 + 1)  /* TTBR0 top 32 bits */
-#define c2_TTBR1   (TTBR1_EL1 * 2) /* Translation Table Base Register 1 */
-#define c2_TTBR1_high  (c2_TTBR1 + 1)  /* TTBR1 top 32 bits */
-#define c2_TTBCR   (TCR_EL1 * 2)   /* Translation Table Base Control R. */
-#define c3_DACR(DACR32_EL2 * 2)/* Domain Access Control 
Register */
-#define c5_DFSR(ESR_EL1 * 2)   /* Data Fault Status Register */
-#define c5_IFSR(IFSR32_EL2 * 2)/* Instruction Fault Status 
Register */
-#define c5_ADFSR   (AFSR0_EL1 * 2) /* Auxiliary Data Fault Status R */
-#define c5_AIFSR   (AFSR1_EL1 * 2) /* Auxiliary Instr Fault Status R */
-#define c6_DFAR(FAR_EL1 * 2)   /* Data Fault Address Register 
*/
-#define c6_IFAR(c6_DFAR + 1)   /* Instruction Fault Address 
Register */
-#define c7_PAR (PAR_EL1 * 2)   /* Physical Address Register */
-#define c7_PAR_high(c7_PAR + 1)/* PAR top 32 bits */

[PATCH 11/21] arm64: KVM: Implement the core world switch

2015-11-16 Thread Marc Zyngier
Implement the core of the world switch in C. Not everything is there
yet, and there is nothing to re-enter the world switch either.

But this already outlines the code structure well enough.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/Makefile |   1 +
 arch/arm64/kvm/hyp/switch.c | 134 
 2 files changed, 135 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/switch.c

diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 1e1ff06..9c11b0f 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -8,3 +8,4 @@ obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += entry.o
+obj-$(CONFIG_KVM_ARM_HOST) += switch.o
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
new file mode 100644
index 000..a3af81a
--- /dev/null
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#include "hyp.h"
+
+static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
+{
+   u64 val;
+
+   /*
+* We are about to set CPTR_EL2.TFP to trap all floating point
+* register accesses to EL2, however, the ARM ARM clearly states that
+* traps are only taken to EL2 if the operation would not otherwise
+* trap to EL1.  Therefore, always make sure that for 32-bit guests,
+* we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
+*/
+   val = vcpu->arch.hcr_el2;
+   if (val & HCR_RW) {
+   write_sysreg(1 << 30, fpexc32_el2);
+   isb();
+   }
+   write_sysreg(val, hcr_el2);
+   write_sysreg(1 << 15, hstr_el2);
+   write_sysreg(CPTR_EL2_TTA | CPTR_EL2_TFP, cptr_el2);
+   write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
+}
+
+static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
+{
+   write_sysreg(HCR_RW, hcr_el2);
+   write_sysreg(0, hstr_el2);
+   write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2);
+   write_sysreg(0, cptr_el2);
+}
+
+static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+   write_sysreg(kvm->arch.vttbr, vttbr_el2);
+}
+
+static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
+{
+   write_sysreg(0, vttbr_el2);
+}
+
+static hyp_alternate_select(__vgic_call_save_state,
+   __vgic_v2_save_state, __vgic_v3_save_state,
+   ARM64_HAS_SYSREG_GIC_CPUIF);
+
+static hyp_alternate_select(__vgic_call_restore_state,
+   __vgic_v2_restore_state, __vgic_v3_restore_state,
+   ARM64_HAS_SYSREG_GIC_CPUIF);
+
+static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
+{
+   __vgic_call_save_state()(vcpu);
+   write_sysreg(read_sysreg(hcr_el2) & ~HCR_INT_OVERRIDE, hcr_el2);
+}
+
+static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
+{
+   u64 val;
+
+   val = read_sysreg(hcr_el2);
+   val |=  HCR_INT_OVERRIDE;
+   val |= vcpu->arch.irq_lines;
+   write_sysreg(val, hcr_el2);
+
+   __vgic_call_restore_state()(vcpu);
+}
+
+int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
+{
+   struct kvm_cpu_context *host_ctxt;
+   struct kvm_cpu_context *guest_ctxt;
+   u64 exit_code;
+
+   vcpu = kern_hyp_va(vcpu);
+   write_sysreg(vcpu, tpidr_el2);
+
+   host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+   guest_ctxt = >arch.ctxt;
+
+   __sysreg_save_state(host_ctxt);
+   __debug_cond_save_state(vcpu, >arch.host_debug_state, host_ctxt);
+
+   __activate_traps(vcpu);
+   __activate_vm(vcpu);
+
+   __vgic_restore_state(vcpu);
+   __timer_restore_state(vcpu);
+
+   /*
+* We must restore the 32-bit state before the sysregs, thanks
+* to Cortex-A57 erratum #852523.
+*/
+   __sysreg32_restore_state(vcpu);
+   __sysreg_restore_state(guest_ctxt);
+   __debug_restore_state(vcpu, >arch.vcpu_debug_state, guest_ctxt);
+
+   /* Jump in the fire! */
+   exit_code = __guest_enter(vcpu, host_ctxt);
+   /* And we're baaack! */
+
+   __sysreg_save_state(guest_ctxt);
+   

[PATCH 14/21] arm64: KVM: HYP mode entry points

2015-11-16 Thread Marc Zyngier
Add the entry points for HYP mode (both for hypercalls and
exception handling).

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/Makefile|   1 +
 arch/arm64/kvm/hyp/hyp-entry.S | 189 +
 2 files changed, 190 insertions(+)
 create mode 100644 arch/arm64/kvm/hyp/hyp-entry.S

diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 1a529f5..826032b 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -11,3 +11,4 @@ obj-$(CONFIG_KVM_ARM_HOST) += entry.o
 obj-$(CONFIG_KVM_ARM_HOST) += switch.o
 obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
 obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
+obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
new file mode 100644
index 000..e11a129
--- /dev/null
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -0,0 +1,189 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+   .text
+   .pushsection.hyp.text, "ax"
+
+el1_sync:  // Guest trapped into EL2
+   pushx0, x1
+   pushx2, x3
+
+   mrs x1, esr_el2
+   lsr x2, x1, #ESR_ELx_EC_SHIFT
+
+   cmp x2, #ESR_ELx_EC_HVC64
+   b.neel1_trap
+
+   mrs x3, vttbr_el2   // If vttbr is valid, the 64bit 
guest
+   cbnzx3, el1_trap// called HVC
+
+   /* Here, we're pretty sure the host called HVC. */
+   pop x2, x3
+   pop x0, x1
+
+   /* Check for __hyp_get_vectors */
+   cbnzx0, 1f
+   mrs x0, vbar_el2
+   b   2f
+
+1: pushlr, xzr
+
+   /*
+* Compute the function address in EL2, and shuffle the parameters.
+*/
+   kern_hyp_va x0
+   mov lr, x0
+   mov x0, x1
+   mov x1, x2
+   mov x2, x3
+   blr lr
+
+   pop lr, xzr
+2: eret
+
+el1_trap:
+   /*
+* x1: ESR
+* x2: ESR_EC
+*/
+
+   /* Guest accessed VFP/SIMD registers, save host, restore Guest */
+   cmp x2, #ESR_ELx_EC_FP_ASIMD
+   b.eq__fpsimd_guest_restore
+
+   cmp x2, #ESR_ELx_EC_DABT_LOW
+   mov x0, #ESR_ELx_EC_IABT_LOW
+   ccmpx2, x0, #4, ne
+   b.ne1f  // Not an abort we care about
+
+   /* This is an abort. Check for permission fault */
+   and x2, x1, #ESR_ELx_FSC_TYPE
+   cmp x2, #FSC_PERM
+   b.ne1f  // Not a permission fault
+
+   /*
+* Check for Stage-1 page table walk, which is guaranteed
+* to give a valid HPFAR_EL2.
+*/
+   tbnzx1, #7, 1f  // S1PTW is set
+
+   /* Preserve PAR_EL1 */
+   mrs x3, par_el1
+   pushx3, xzr
+
+   /*
+* Permission fault, HPFAR_EL2 is invalid.
+* Resolve the IPA the hard way using the guest VA.
+* Stage-1 translation already validated the memory access rights.
+* As such, we can use the EL1 translation regime, and don't have
+* to distinguish between EL0 and EL1 access.
+*/
+   mrs x2, far_el2
+   at  s1e1r, x2
+   isb
+
+   /* Read result */
+   mrs x3, par_el1
+   pop x0, xzr // Restore PAR_EL1 from the stack
+   msr par_el1, x0
+   tbnzx3, #0, 3f  // Bail out if we failed the translation
+   ubfxx3, x3, #12, #36// Extract IPA
+   lsl x3, x3, #4  // and present it like HPFAR
+   b   2f
+
+1: mrs x3, hpfar_el2
+   mrs x2, far_el2
+
+2: mrs x0, tpidr_el2
+   str w1, [x0, #VCPU_ESR_EL2]
+   str x2, [x0, #VCPU_FAR_EL2]
+   str x3, [x0, #VCPU_HPFAR_EL2]
+
+   mov x1, #ARM_EXCEPTION_TRAP
+   b   __guest_exit
+
+   /*
+* Translation failed. Just return to the guest and
+* let it fault again. Another CPU is probably playing
+* behind our back.
+*/
+3: pop x2, x3
+   pop x0, x1
+
+   eret
+
+el1_irq:
+   pushx0, x1
+   pushx2, x3
+   mrs x0, tpidr_el2
+   mov x1, #ARM_EXCEPTION_IRQ
+   b   __guest_exit
+

[PATCH 16/21] arm64: KVM: Add compatibility aliases

2015-11-16 Thread Marc Zyngier
So far, we've implemented the new world switch with a completely
different namespace, so that we could have both implementation
compiled in.

Let's take things one step further by adding weak aliases that
have the same names as the original implementation. The weak
attributes allows the new implementation to be overriden by the
old one, and everything still work.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/debug-sr.c   | 3 +++
 arch/arm64/kvm/hyp/hyp-entry.S  | 3 +++
 arch/arm64/kvm/hyp/switch.c | 3 +++
 arch/arm64/kvm/hyp/tlb.c| 9 +
 arch/arm64/kvm/hyp/vgic-v3-sr.c | 3 +++
 5 files changed, 21 insertions(+)

diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
index 118ea39..042d074 100644
--- a/arch/arm64/kvm/hyp/debug-sr.c
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -130,3 +130,6 @@ u32 __hyp_text __debug_read_mdcr_el2(void)
 {
return read_sysreg(mdcr_el2);
 }
+
+__alias(__debug_read_mdcr_el2)
+u32 __weak __kvm_get_mdcr_el2(void);
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 7218eed..28de58f 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -175,6 +175,8 @@ ENDPROC(\label)
 
.align 11
 
+   .weak   __kvm_hyp_vector
+ENTRY(__kvm_hyp_vector)
 ENTRY(__hyp_vector)
ventry  el2t_sync_invalid   // Synchronous EL2t
ventry  el2t_irq_invalid// IRQ EL2t
@@ -196,3 +198,4 @@ ENTRY(__hyp_vector)
ventry  el1_fiq_invalid // FIQ 32-bit EL1
ventry  el1_error_invalid   // Error 32-bit EL1
 ENDPROC(__hyp_vector)
+ENDPROC(__kvm_hyp_vector)
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index cdc2a96..ef58066 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -141,6 +141,9 @@ int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
return exit_code;
 }
 
+__alias(__guest_run)
+int __weak __kvm_vcpu_run(struct kvm_vcpu *vcpu);
+
 static const char *__hyp_panic_string = "HYP panic:\nPS:%08x PC:%p 
ESR:%p\nFAR:%p HPFAR:%p PAR:%p\nVCPU:%p\n";
 
 void __hyp_text __noreturn __hyp_panic(void)
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index d4a07d0..2c279a8 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -47,6 +47,9 @@ void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, 
phys_addr_t ipa)
write_sysreg(0, vttbr_el2);
 }
 
+__alias(__tlb_flush_vmid_ipa)
+void __weak __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
+
 void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
 {
dsb(ishst);
@@ -63,6 +66,9 @@ void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
write_sysreg(0, vttbr_el2);
 }
 
+__alias(__tlb_flush_vmid)
+void __weak __kvm_tlb_flush_vmid(struct kvm *kvm);
+
 void __hyp_text __tlb_flush_vm_context(void)
 {
dsb(ishst);
@@ -70,3 +76,6 @@ void __hyp_text __tlb_flush_vm_context(void)
 "ic ialluis  ": : );
dsb(ish);
 }
+
+__alias(__tlb_flush_vm_context)
+void __weak __kvm_flush_vm_context(void);
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index f2289ab..0cf316c 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -220,3 +220,6 @@ u64 __hyp_text __vgic_v3_read_ich_vtr_el2(void)
 {
return read_gicreg(ICH_VTR_EL2);
 }
+
+__alias(__vgic_v3_read_ich_vtr_el2)
+u64 __weak __vgic_v3_get_ich_vtr_el2(void);
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 2/2] target-i386: add support to migrate vcpu's TSC rate

2015-11-16 Thread Haozhong Zhang
On 11/16/15 11:43, Eduardo Habkost wrote:
> On Mon, Nov 16, 2015 at 04:04:08PM +0800, Haozhong Zhang wrote:
> > This patch enables migrating vcpu's TSC rate. If KVM on the destination
> > machine supports TSC scaling, guest programs will observe a consistent
> > TSC rate across the migration.
> > 
> > If TSC scaling is not supported on the destination machine, the
> > migration will not be aborted and QEMU on the destination will not set
> > vcpu's TSC rate to the migrated value.
> > 
> > If vcpu's TSC rate specified by CPU option 'tsc-freq' on the destination
> > machine is inconsistent with the migrated TSC rate, the migration will
> > be aborted.
> > 
> > For backwards compatibility, the migration of vcpu's TSC rate is
> > disabled on pc-*-2.4 and older machine types.
> > 
> > Signed-off-by: Haozhong Zhang 
> > ---
> >  hw/i386/pc.c  |  1 +
> >  hw/i386/pc_piix.c |  1 +
> >  hw/i386/pc_q35.c  |  1 +
> >  include/hw/i386/pc.h  |  1 +
> >  target-i386/cpu.c |  2 +-
> >  target-i386/cpu.h |  1 +
> >  target-i386/kvm.c | 26 --
> >  target-i386/machine.c | 28 
> >  8 files changed, 58 insertions(+), 3 deletions(-)
> > 
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index 0cb8afd..2f2fc93 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -1952,6 +1952,7 @@ static void pc_machine_class_init(ObjectClass *oc, 
> > void *data)
> >  HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
> >  
> >  pcmc->get_hotplug_handler = mc->get_hotplug_handler;
> > +pcmc->save_tsc_khz = true;
> >  mc->get_hotplug_handler = pc_get_hotpug_handler;
> >  mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id;
> >  mc->default_boot_order = "cad";
> > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> > index 07d0baa..7c5b0d2 100644
> > --- a/hw/i386/pc_piix.c
> > +++ b/hw/i386/pc_piix.c
> > @@ -489,6 +489,7 @@ static void pc_i440fx_2_4_machine_options(MachineClass 
> > *m)
> >  m->alias = NULL;
> >  m->is_default = 0;
> >  pcmc->broken_reserved_end = true;
> > +pcmc->save_tsc_khz = false;
> >  SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
> >  }
> >  
> > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> > index 0fdae09..fd8efe3 100644
> > --- a/hw/i386/pc_q35.c
> > +++ b/hw/i386/pc_q35.c
> > @@ -387,6 +387,7 @@ static void pc_q35_2_4_machine_options(MachineClass *m)
> >  m->hw_version = "2.4.0";
> >  m->alias = NULL;
> >  pcmc->broken_reserved_end = true;
> > +pcmc->save_tsc_khz = false;
> >  SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
> >  }
> >  
> > diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> > index 4bbc0ff..fea0f28 100644
> > --- a/include/hw/i386/pc.h
> > +++ b/include/hw/i386/pc.h
> > @@ -60,6 +60,7 @@ struct PCMachineClass {
> >  
> >  /*< public >*/
> >  bool broken_reserved_end;
> > +bool save_tsc_khz;
> >  HotplugHandler *(*get_hotplug_handler)(MachineState *machine,
> > DeviceState *dev);
> >  };
> > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > index e5f1c5b..98c6a4c 100644
> > --- a/target-i386/cpu.c
> > +++ b/target-i386/cpu.c
> > @@ -1724,7 +1724,7 @@ static void x86_cpuid_set_tsc_freq(Object *obj, 
> > Visitor *v, void *opaque,
> >  return;
> >  }
> >  
> > -cpu->env.tsc_khz = value / 1000;
> > +cpu->env.tsc_khz = cpu->env.user_tsc_khz = value / 1000;
> >  }
> >  
> >  static void x86_cpuid_get_apic_id(Object *obj, Visitor *v, void *opaque,
> > diff --git a/target-i386/cpu.h b/target-i386/cpu.h
> > index fc4a605..1ad1da8 100644
> > --- a/target-i386/cpu.h
> > +++ b/target-i386/cpu.h
> > @@ -973,6 +973,7 @@ typedef struct CPUX86State {
> >  uint32_t sipi_vector;
> >  bool tsc_valid;
> >  int64_t tsc_khz;
> > +int64_t user_tsc_khz;
> >  void *kvm_xsave_buf;
> >  
> >  uint64_t mcg_cap;
> > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > index 9084b29..8448248 100644
> > --- a/target-i386/kvm.c
> > +++ b/target-i386/kvm.c
> > @@ -2334,8 +2334,30 @@ static void kvm_arch_set_tsc_khz(CPUState *cs)
> >  int r;
> >  
> >  /*
> > - * If no user-specified TSC frequency is present, we will try to
> > - * set env->tsc_khz to the value used by KVM.
> > + * For other cases of env->tsc_khz and env->user_tsc_khz:
> > + *
> > + * - We have eliminated all cases that satisfy
> > + *   env->tsc_khz && env->user_tsc_khz &&
> > + *   env->tsc_khz != env->user_tsc_khz
> > + *   in cpu_post_load().
> > + *
> > + * - If env->user_tsc_khz is not zero, then it must be equal to
> > + *   env->tsc_khz (if the latter is not zero) and has been set in
> > + *   kvm_arch_init_vcpu().
> > + */
> > +if (env->tsc_khz && !env->user_tsc_khz) {
> > +r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
> > +kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, 

Re: [PATCH 15/21] arm64: KVM: Add panic handling

2015-11-16 Thread Mark Rutland
> >> +  /* Call panic for real */
> >> +  while (1) {
> >> +  unsigned long str_va = (unsigned long)__hyp_panic_string;
> >> +
> >> +  str_va -= HYP_PAGE_OFFSET;
> >> +  str_va += PAGE_OFFSET;
> >> +  __hyp_do_panic(str_va,
> >> + spsr,  elr,
> >> + read_sysreg(esr_el2),   read_sysreg(far_el2),
> >> + read_sysreg(hpfar_el2), par,
> >> + read_sysreg(tpidr_el2));
> >> +  }
> >> +}
> > 
> > I think the while (1) here is confusing.
> > 
> > Can we not jsut declare str_va at the start of the function and get rid
> > of the loop?
> 
> The while(1) is to prevent GCC from screaming (it otherwise believes
> that the function actually returns, despite the __noreturn attribute).

Aha!

Perhaps a comment to that effect...?

> Or were you thinking of something else?

I just failed to derive the __noreturn problem from first principles.

Perhaps follow the __hyp_do_panic() call with an unreachable(), with the
comment as to GCC failing to reason about the __noreturn? That would be
less confusing than the loop, assuming that it works.

Mark.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 1/2] target-i386: fallback vcpu's TSC rate to value returned by KVM

2015-11-16 Thread Haozhong Zhang
On 11/16/15 11:39, Eduardo Habkost wrote:
> On Mon, Nov 16, 2015 at 04:04:07PM +0800, Haozhong Zhang wrote:
> > If no user-specified TSC rate is present, we will try to set
> > env->tsc_khz to the value returned by KVM_GET_TSC_KHZ.
> > 
> > Signed-off-by: Haozhong Zhang 
> > ---
> >  target-i386/kvm.c | 25 +
> >  1 file changed, 25 insertions(+)
> > 
> > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > index 2a9953b..9084b29 100644
> > --- a/target-i386/kvm.c
> > +++ b/target-i386/kvm.c
> > @@ -2327,6 +2327,27 @@ static int kvm_get_debugregs(X86CPU *cpu)
> >  return 0;
> >  }
> >  
> > +static void kvm_arch_set_tsc_khz(CPUState *cs)
> 
> > +{
> > +X86CPU *cpu = X86_CPU(cs);
> > +CPUX86State *env = >env;
> > +int r;
> > +
> > +/*
> > + * If no user-specified TSC frequency is present, we will try to
> > + * set env->tsc_khz to the value used by KVM.
> > + */
> > +if (!env->tsc_khz) {
> > +r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
> > +kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
> 
> Can't we do this on kvm_arch_init_vcpu()? kvm_arch_put_registers()'s purpose 
> is
> to just copy data from QEMU to KVM, not the other way around.
>

I'll move this to kvm_arch_init_vcpu().

> 
> > +if (r < 0) {
> > +error_report("warning: KVM_GET_TSC_KHZ failed");
> 
> Having a kernel that doesn't support KVM_CAP_GET_TSC_KHZ shouldn't trigger a
> warning every time we run QEMU, unless the user is explicitly asking for a
> feature that requires KVM_GET_TSC_KHZ.
>

I'll remove the warning.

> > +} else {
> > +env->tsc_khz = r;
> > +}
> > +}
> > +}
> > +
> >  int kvm_arch_put_registers(CPUState *cpu, int level)
> >  {
> >  X86CPU *x86_cpu = X86_CPU(cpu);
> > @@ -2341,6 +2362,10 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
> >  }
> >  }
> >  
> > +if (level == KVM_PUT_FULL_STATE) {
> > +kvm_arch_set_tsc_khz(cpu);
> > +}
> 
> I see that kvm_arch_set_tsc_khz() will be extended to call
> KVM_SET_TSC_KHZ in the next patch, so the kvm_arch_set_tsc_khz()
> seems to belong here. But the KVM_GET_TSC_KHZ call doesn't seem
> to belong in kvm_arch_set_tsc_khz()
> 
> kvm_arch_put_registers() callers don't expect any QEMU-side data
> to change, but just that KVM data is updated according to the
> QEMU-side data.
>

Good to know this. As above, I'll move the KVM_GET_TSC_KHZ call to
kvm_arch_init_vcpu().

Haozhong

> > +
> >  ret = kvm_getput_regs(x86_cpu, 1);
> >  if (ret < 0) {
> >  return ret;
> > -- 
> > 2.4.8
> > 
> 
> -- 
> Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 17/21] arm64: KVM: Map the kernel RO section into HYP

2015-11-16 Thread Marc Zyngier
On 16/11/15 14:27, Mark Rutland wrote:
> On Mon, Nov 16, 2015 at 01:11:55PM +, Marc Zyngier wrote:
>> In order to run C code in HYP, we must make sure that the kernel's
>> RO section in mapped into HYP (otherwise things break badly).
> 
> Somewhat tangential, but do we have any strong guarantees that the hyp
> text is otherwise safe in its address space which differs from that of
> the kernel proper?
> 
> i.e. do we need something like we did for the EFI stub in commit
> e8f3010f7326c003 ("arm64/efi: isolate EFI stub from the kernel proper")?

Probably. That will make things more difficult for VHE, where there are
function calls between the kernel and the "hypervisor" (kvm_call_hyp()
and panic() are the most obvious ones).

I'll have a look, thanks for the pointer.

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm/vmx: EPTP switching test

2015-11-16 Thread Radim Krčmář
2015-11-15 18:00+0200, Michael S. Tsirkin:
> This patch adds a new parameter: eptp_switching_test, which enables
> 
> testing EPT switching on VMX if supported by hardware.  All EPT entries
> are initialized to the same value so this adds no useful functionality
> by itself, but can be used to test VMFUNC performance, and serve as a
> basis for future features based on EPTP switching.
> 
> Support for nested virt is not enabled.
> 
> This was tested using the following code within guest:
>   #define VMX_VMFUNC ".byte 0x0f,0x01,0xd4"
>   static void vmfunc(unsigned int nr, unsigned int ept)
>   {
>   asm volatile(VMX_VMFUNC
>:
>: "a"(nr), "c"(ept)
>: "memory");
>   }
> 
> VMFUNC instruction cost was measured at ~122 cycles.
> (Note: recent versions of gnu toolchain support
>  the vmfunc instruction - removing the need for writing
>  the bytecode manually).
> 
> Signed-off-by: Michael S. Tsirkin 
> ---
> 
> I think I'd like to put this upstream so future eptp switching work can
> be implemented on top. Comments?

I'd wait for the future.  Patch is already on the list so people
interested in benchmarking VMFUNC can quickly compile a kernel and
developers will need to overwrite the code anyway.

(And I think that eptp switching is expected to be used in conjuction
 with #VE, so it'd then make sense to implement a nop for it as well.)

> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> @@ -3011,6 +3035,7 @@ static __init int setup_vmcs_config(struct vmcs_config 
> *vmcs_conf)
>   SECONDARY_EXEC_PAUSE_LOOP_EXITING |
>   SECONDARY_EXEC_RDTSCP |
>   SECONDARY_EXEC_ENABLE_INVPCID |
> + SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |

The VMFUNC vmexit should be handled to prevent guests from triggering a
WARN_ON on the host.  (VMFUNC did just #UD before this patch.)

After that, it's ok for KVM.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm/vmx: EPTP switching test

2015-11-16 Thread Radim Krčmář
2015-11-16 19:59+0200, Michael S. Tsirkin:
> On Mon, Nov 16, 2015 at 06:51:06PM +0100, Radim Krčmář wrote:
>> 2015-11-15 18:00+0200, Michael S. Tsirkin:
>> (And I think that eptp switching is expected to be used in conjuction
>>  with #VE, so it'd then make sense to implement a nop for it as well.)
> 
> No idea how would I even test it, so I'm not interested in #VE at this
> point.  If you are - go ahead and post a patch for that on top though,
> why not.

I thought that it's going to be simpler to provide functionality (that
utilizes eptp switching) to the guest through #VE, which probably isn't
true as I think more about it.  (Not interested in implementing it :])
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm/vmx: EPTP switching test

2015-11-16 Thread Andy Lutomirski
On Mon, Nov 16, 2015 at 10:18 AM, Radim Krčmář wrote:
> 2015-11-16 19:59+0200, Michael S. Tsirkin:
>> On Mon, Nov 16, 2015 at 06:51:06PM +0100, Radim Krčmář wrote:
>>> 2015-11-15 18:00+0200, Michael S. Tsirkin:
>>> (And I think that eptp switching is expected to be used in conjuction
>>>  with #VE, so it'd then make sense to implement a nop for it as well.)
>>
>> No idea how would I even test it, so I'm not interested in #VE at this
>> point.  If you are - go ahead and post a patch for that on top though,
>> why not.
>
> I thought that it's going to be simpler to provide functionality (that
> utilizes eptp switching) to the guest through #VE, which probably isn't
> true as I think more about it.  (Not interested in implementing it :])

I'm willing to review a #VE stub.  I'm not likely to implement it.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: x86: Add lowest-priority support for vt-d posted-interrupts

2015-11-16 Thread Radim Krčmář
2015-11-09 10:46+0800, Feng Wu:
> Use vector-hashing to handle lowest-priority interrupts for
> posted-interrupts. As an example, modern Intel CPUs use this
> method to handle lowest-priority interrupts.

(I don't think it's a good idea that the algorithm differs from non-PI
 lowest priority delivery.  I'd make them both vector-hashing, which
 would be "fun" to explain to people expecting round robin ...)

> Signed-off-by: Feng Wu 
> ---
> diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
> +/*
> + * This routine handles lowest-priority interrupts using vector-hashing
> + * mechanism. As an example, modern Intel CPUs use this method to handle
> + * lowest-priority interrupts.
> + *
> + * Here is the details about the vector-hashing mechanism:
> + * 1. For lowest-priority interrupts, store all the possible destination
> + *vCPUs in an array.
> + * 2. Use "guest vector % max number of destination vCPUs" to find the right
> + *destination vCPU in the array for the lowest-priority interrupt.
> + */

(Is Skylake i7-6700 a modern Intel CPU?
 I didn't manage to get hashing ... all interrupts always went to the
 lowest APIC ID in the set :/
 Is there a simple way to verify the algorithm?)

> +struct kvm_vcpu *kvm_intr_vector_hashing_dest(struct kvm *kvm,
> +   struct kvm_lapic_irq *irq)
> +
> +{
> + unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
> + unsigned int dest_vcpus = 0;
> + struct kvm_vcpu *vcpu;
> + unsigned int i, mod, idx = 0;
> +
> + vcpu = kvm_intr_vector_hashing_dest_fast(kvm, irq);
> + if (vcpu)
> + return vcpu;

I think the rest of this function shouldn't be implemented:
 - Shorthands are only for IPIs and hence don't need to be handled,
 - Lowest priority physical broadcast is not supported,
 - Lowest priority cluster logical broadcast is not supported,
 - No point in optimizing mixed xAPIC and x2APIC mode,
 - The rest is handled by kvm_intr_vector_hashing_dest_fast().
   (Even lowest priority flat logical "broadcast".)
 - We do the work twice when vcpu == NULL means that there is no
   matching destination.

Is there a valid case that can be resolved by going through all vcpus?

> +
> + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (!kvm_apic_present(vcpu))
> + continue;
> +
> + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
> + irq->dest_id, irq->dest_mode))
> + continue;
> +
> + __set_bit(vcpu->vcpu_id, dest_vcpu_bitmap);
> + dest_vcpus++;
> + }
> +
> + if (dest_vcpus == 0)
> + return NULL;
> +
> + mod = irq->vector % dest_vcpus;
> +
> + for (i = 0; i <= mod; i++) {
> + idx = find_next_bit(dest_vcpu_bitmap, KVM_MAX_VCPUS, idx) + 1;
> + BUG_ON(idx >= KVM_MAX_VCPUS);
> + }
> +
> + return kvm_get_vcpu(kvm, idx - 1);
> +}
> +EXPORT_SYMBOL_GPL(kvm_intr_vector_hashing_dest);
> +
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> @@ -816,6 +816,63 @@ out:
> +struct kvm_vcpu *kvm_intr_vector_hashing_dest_fast(struct kvm *kvm,
> +struct kvm_lapic_irq *irq)

We now have three very similar functions :(

  kvm_irq_delivery_to_apic_fast
  kvm_intr_is_single_vcpu_fast
  kvm_intr_vector_hashing_dest_fast

By utilizing the gcc optimizer, they can be merged without introducing
many instructions to the hot path, kvm_irq_delivery_to_apic_fast.
(I would eventually do it, so you can save time by ignoring this.)

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm/vmx: EPTP switching test

2015-11-16 Thread Michael S. Tsirkin
On Mon, Nov 16, 2015 at 06:51:06PM +0100, Radim Krčmář wrote:
> 2015-11-15 18:00+0200, Michael S. Tsirkin:
> > This patch adds a new parameter: eptp_switching_test, which enables
> > 
> > testing EPT switching on VMX if supported by hardware.  All EPT entries
> > are initialized to the same value so this adds no useful functionality
> > by itself, but can be used to test VMFUNC performance, and serve as a
> > basis for future features based on EPTP switching.
> > 
> > Support for nested virt is not enabled.
> > 
> > This was tested using the following code within guest:
> > #define VMX_VMFUNC ".byte 0x0f,0x01,0xd4"
> > static void vmfunc(unsigned int nr, unsigned int ept)
> > {
> > asm volatile(VMX_VMFUNC
> >  :
> >  : "a"(nr), "c"(ept)
> >  : "memory");
> > }
> > 
> > VMFUNC instruction cost was measured at ~122 cycles.
> > (Note: recent versions of gnu toolchain support
> >  the vmfunc instruction - removing the need for writing
> >  the bytecode manually).
> > 
> > Signed-off-by: Michael S. Tsirkin 
> > ---
> > 
> > I think I'd like to put this upstream so future eptp switching work can
> > be implemented on top. Comments?
> 
> I'd wait for the future.  Patch is already on the list so people
> interested in benchmarking VMFUNC can quickly compile a kernel and
> developers will need to overwrite the code anyway.

It'll bitrot though.  But I'll let Paolo decide that.

> (And I think that eptp switching is expected to be used in conjuction
>  with #VE, so it'd then make sense to implement a nop for it as well.)

No idea how would I even test it, so I'm not interested in #VE at this
point.  If you are - go ahead and post a patch for that on top though,
why not.

> > diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> > @@ -3011,6 +3035,7 @@ static __init int setup_vmcs_config(struct 
> > vmcs_config *vmcs_conf)
> > SECONDARY_EXEC_PAUSE_LOOP_EXITING |
> > SECONDARY_EXEC_RDTSCP |
> > SECONDARY_EXEC_ENABLE_INVPCID |
> > +   SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |
> 
> The VMFUNC vmexit should be handled to prevent guests from triggering a
> WARN_ON on the host.  (VMFUNC did just #UD before this patch.)

Do you mean VMFUNC other than EPTP switch 0?  True, thanks!

> 
> After that, it's ok for KVM.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 2/2] dma-mapping-common: add DMA attribute - DMA_ATTR_IOMMU_BYPASS

2015-11-16 Thread Benjamin Serebrin
On Mon, Nov 16, 2015 at 12:42 AM, David Woodhouse  wrote:
>
> On Sun, 2015-11-15 at 22:54 -0800, Benjamin Serebrin wrote:
> > We looked into Intel IOMMU performance a while ago and learned a few
> > useful things.  We generally did a parallel 200 thread TCP_RR test,
> > as this churns through mappings quickly and uses all available cores.
> >
> > First, the main bottleneck was software performance[1].
>
> For the Intel IOMMU, *all* we need to do is put a PTE in place. For
> real hardware (i.e not an IOMMU emulated by qemu for a VM), we don't
> need to do an IOTLB flush. It's a single 64-bit write of the PTE.
>
> All else is software overhead.
>

Agreed!

>
> (I'm deliberately ignoring the stupid chipsets where DMA page tables
> aren't cache coherent and we need a clflush too. They make me too sad.)



How much does Linux need to care about such chipsets?  Can we argue
that they get very sad performance and so be it?

>
>
>
> >   This study preceded the recent patch to break the locks into pools
> > ("Break up monolithic iommu table/lock into finer graularity pools
> > and lock").  There were several points of lock contention:
> > - the RB tree ...
> > - the RB tree ...
> > - the RB tree ...
> >
> > Omer's paper (https://www.usenix.org/system/files/conference/atc15/at
> > c15-paper-peleg.pdf) has some promising approaches.  The magazine
> > avoids the RB tree issue.
>
> I'm thinking of ditching the RB tree altogether and switching to the
> allocator in lib/iommu-common.c (and thus getting the benefit of the
> finer granularity pools).


Sounds promising!  Is 4 parallel arenas enough?  We'll try to play
with that here.
I think lazy_flush leaves dangling references.

>
>
> > I'm interested in seeing if the dynamic 1:1 with a mostly-lock-free
> > page table cleanup algorithm could do well.
>
> When you say 'dynamic 1:1 mapping', is that the same thing that's been
> suggested elsewhere — avoiding the IOVA allocator completely by using a
> virtual address which *matches* the physical address, if that virtual
> address is available?


Yes.  We munge in 2 upper address bits in the IOVA to encode read and
write permissions as well.

>
> Simply cmpxchg on the PTE itself, and if it was
> already set *then* we fall back to the allocator, obviously configured
> to allocate from a range *higher* than the available physical memory.
>
> Jörg has been looking at this too, and was even trying to find space in
> the PTE for a use count so a given page could be in more than one
> mapping before we call back to the IOVA allocator.


Aren't bits 63:52 available at all levels?

>
>
>
> > There are correctness fixes and optimizations left in the
> > invalidation path: I want strict-ish semantics (a page doesn't go
> > back into the freelist until the last IOTLB/IOMMU TLB entry is
> > invalidated) with good performance, and that seems to imply that an
> > additional page reference should be gotten at dma_map time and put
> > back at the completion of the IOMMU flush routine.  (This is worthy
> > of much discussion.)
>
> We already do something like this for page table pages which are freed
> by an unmap, FWIW.


As I understood the code when I last looked, this was true only if a
single unmap operation covered an entire table's worth (2MByte, or
1GByte, etc) of mappings.  The caffeine hasn't hit yet, though, so I
can't even begin to dive into the new calls into mm.c.


>
>
> > Additionally, we can find ways to optimize the flush routine by
> > realizing that if we have frequent maps and unmaps, it may be because
> > the device creates and destroys buffers a lot; these kind of
> > workloads use an IOVA for one event and then never come back.  Maybe
> > TLBs don't do much good and we could just flush the entire IOMMU TLB
> > [and ATS cache] for that BDF.
>
> That would be a very interesting thing to look at. Although it would be
> nice if we had a better way to measure the performance impact of IOTLB
> misses — currently we don't have a lot of visibility at all.


All benchmarks are lies.  But we intend to run internal workloads as
well as well-agreed loads and see how things go.

>
>
> > 1: We verified that the IOMMU costs are almost entirely software
> > overheads by forcing software 1:1 mode, where we create page tables
> > for all physical addresses.  We tested using leaf nodes of size 4KB,
> > of 2MB, and of 1GB.  In call cases, there is zero runtime maintenance
> > of the page tables, and no IOMMU invalidations.  We did piles of DMA
> > maximizing x16 PCIe bandwidth on multiple lanes, to random DRAM
> > addresses.  At 4KB page size, we could see some bandwidth slowdown,
> > but at 2MB and 1GB, there was < 1% performance loss as compared with
> > IOMMU off.
>
> Was this with ATS on or off? With ATS, the cost of the page walk can be
> amortised in some cases — you can look up the physical address *before*
> you are ready to actually start the DMA to it, and don't take that
> latency at the time you're 

[PATCH v2 4/4] KVM: x86: fix ready_for_interrupt reporting in split IRQ chip case

2015-11-16 Thread Matt Gingell
This patch adds a call to kvm_vcpu_ready_for_interrupt_injection to
ensure ready for interrupt is reported to user space correctly. This
addresses a problem observed in QEMU when kvm->ready_for_interrupt is
set but the x86 interrupt flag is clear.

Reviewed-by: Steve Rutherford 
Signed-off-by: Matt Gingell 
---
 arch/x86/kvm/x86.c | 14 +++---
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d57bdd9..c0e6f94 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5836,17 +5836,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
-   if (!irqchip_in_kernel(vcpu->kvm))
-   kvm_run->ready_for_interrupt_injection =
-   kvm_arch_interrupt_allowed(vcpu) &&
-   !kvm_cpu_has_interrupt(vcpu) &&
-   !kvm_event_needs_reinjection(vcpu);
-   else if (!pic_in_kernel(vcpu->kvm))
-   kvm_run->ready_for_interrupt_injection =
-   kvm_apic_accept_pic_intr(vcpu) &&
-   !kvm_cpu_has_interrupt(vcpu);
-   else
-   kvm_run->ready_for_interrupt_injection = 1;
+   kvm_run->ready_for_interrupt_injection =
+   pic_in_kernel(vcpu->kvm) ||
+   kvm_vcpu_ready_for_interrupt_injection(vcpu);
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
-- 
2.6.0.rc2.230.g3dd15c0


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/4] KVM: x86: request interrupt window when IRQ chip is split

2015-11-16 Thread Matt Gingell
Before this patch, we incorrectly enter the guest without requesting an
interrupt window if the IRQ chip is split between user space and the
kernel.

Because lapic_in_kernel no longer implies the PIC is in the kernel, this
patch tests pic_in_kernel to determine whether an interrupt window
should be requested when entering the guest.

If the APIC is in the kernel and we request an interrupt window the
guest will return immediately. If the APIC is masked the guest will
not make forward progress and unmask it, leading to a loop when KVM
reenters and requests again. This patch adds a check to ensure the APIC
is ready to accept an interrupt before requesting a window.

Reviewed-by: Steve Rutherford 
Signed-off-by: Matt Gingell 
---
 arch/x86/kvm/x86.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c370eef..d57bdd9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6259,8 +6259,11 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm 
*kvm,
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
int r;
-   bool req_int_win = !lapic_in_kernel(vcpu) &&
-   vcpu->run->request_interrupt_window;
+   bool req_int_win =
+   vcpu->run->request_interrupt_window &&
+   likely(!pic_in_kernel(vcpu->kvm)) &&
+   (!lapic_in_kernel(vcpu) || kvm_apic_accept_pic_intr(vcpu));
+
bool req_immediate_exit = false;
 
if (vcpu->requests) {
-- 
2.6.0.rc2.230.g3dd15c0


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/4] KVM: x86: set KVM_REQ_EVENT on local interrupt request from user space

2015-11-16 Thread Matt Gingell
Set KVM_REQ_EVENT when a PIC in user space injects a local interrupt.

Currently a request is only made when neither the PIC nor the APIC is in
the kernel, which is not sufficient in the split IRQ chip case.

This addresses a problem in QEMU where interrupts are delayed until
another path invokes the event loop.

Reviewed-by: Steve Rutherford 
Signed-off-by: Matt Gingell 
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2e461b9..c370eef 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2691,6 +2691,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
return -EEXIST;
 
vcpu->arch.pending_external_vector = irq->irq;
+   kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
 }
 
-- 
2.6.0.rc2.230.g3dd15c0


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/4] KVM: x86: add new function kvm_vcpu_ready_for_interrupt_injection

2015-11-16 Thread Matt Gingell
This patch breaks out a new function kvm_vcpu_ready_for_interrupt_injection.
This routine encapsulates the logic required to determine whether a vcpu
is ready to accept an interrupt injection, which is now required on
multiple paths.

Reviewed-by: Steve Rutherford 
Signed-off-by: Matt Gingell 
---
 arch/x86/kvm/x86.c | 22 --
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 30723a4..2e461b9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2660,6 +2660,14 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu 
*vcpu,
return 0;
 }
 
+static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) {
+   return kvm_arch_interrupt_allowed(vcpu) &&
+   !kvm_cpu_has_interrupt(vcpu) &&
+   !kvm_event_needs_reinjection(vcpu) &&
+   (!lapic_in_kernel(vcpu) ||
+kvm_apic_accept_pic_intr(vcpu));
+}
+
 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
 {
@@ -5815,15 +5823,8 @@ static int emulator_fix_hypercall(struct 
x86_emulate_ctxt *ctxt)
  */
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
 {
-   if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm))
-   return false;
-
-   if (kvm_cpu_has_interrupt(vcpu))
-   return false;
-
-   return (irqchip_split(vcpu->kvm)
-   ? kvm_apic_accept_pic_intr(vcpu)
-   : kvm_arch_interrupt_allowed(vcpu));
+   return vcpu->run->request_interrupt_window &&
+   likely(!pic_in_kernel(vcpu->kvm));
 }
 
 static void post_kvm_run_save(struct kvm_vcpu *vcpu)
@@ -6561,7 +6562,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
 
-   if (dm_request_for_irq_injection(vcpu)) {
+   if (dm_request_for_irq_injection(vcpu) &&
+   kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
r = 0;
vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.request_irq_exits;
-- 
2.6.0.rc2.230.g3dd15c0


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] KVM: x86: always set accessed bit in shadow PTEs

2015-11-16 Thread Zhang, Yang Z
Paolo Bonzini wrote on 2015-11-13:
> Commit 7a1638ce4220 ("nEPT: Redefine EPT-specific link_shadow_page()",
> 2013-08-05) says:
> 
> Since nEPT doesn't support A/D bit, we should not set those bit
> when building the shadow page table.
> but this is not necessary.  Even though nEPT doesn't support A/D
> bits, and hence the vmcs12 EPT pointer will never enable them,
> we always use them for shadow page tables if available (see
> construct_eptp in vmx.c).  So we can set the A/D bits freely
> in the shadow page table.
> 
> This patch hence basically reverts commit 7a1638ce4220.
> 
> Cc: Yang Zhang 
> Cc: Takuya Yoshikawa 
> Signed-off-by: Paolo Bonzini 

Looks good to me.

Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] virtio_ring: Shadow available ring flags & index

2015-11-16 Thread Xie, Huawei
On 11/14/2015 7:41 AM, Venkatesh Srinivas wrote:
> On Wed, Nov 11, 2015 at 02:34:33PM +0200, Michael S. Tsirkin wrote:
>> On Tue, Nov 10, 2015 at 04:21:07PM -0800, Venkatesh Srinivas wrote:
>>> Improves cacheline transfer flow of available ring header.
>>>
>>> Virtqueues are implemented as a pair of rings, one producer->consumer
>>> avail ring and one consumer->producer used ring; preceding the
>>> avail ring in memory are two contiguous u16 fields -- avail->flags
>>> and avail->idx. A producer posts work by writing to avail->idx and
>>> a consumer reads avail->idx.
>>>
>>> The flags and idx fields only need to be written by a producer CPU
>>> and only read by a consumer CPU; when the producer and consumer are
>>> running on different CPUs and the virtio_ring code is structured to
>>> only have source writes/sink reads, we can continuously transfer the
>>> avail header cacheline between 'M' states between cores. This flow
>>> optimizes core -> core bandwidth on certain CPUs.
>>>
>>> (see: "Software Optimization Guide for AMD Family 15h Processors",
>>> Section 11.6; similar language appears in the 10h guide and should
>>> apply to CPUs w/ exclusive caches, using LLC as a transfer cache)
>>>
>>> Unfortunately the existing virtio_ring code issued reads to the
>>> avail->idx and read-modify-writes to avail->flags on the producer.
>>>
>>> This change shadows the flags and index fields in producer memory;
>>> the vring code now reads from the shadows and only ever writes to
>>> avail->flags and avail->idx, allowing the cacheline to transfer
>>> core -> core optimally.
>> Sounds logical, I'll apply this after a  bit of testing
>> of my own, thanks!
> Thanks!
Venkatesh:
Is it that your patch only applies to CPUs w/ exclusive caches? Do you
have perf data on Intel CPUs?
For the perf metric you provide, why not L1-dcache-load-misses which is
more meaning full?
>
>>> In a concurrent version of vring_bench, the time required for
>>> 10,000,000 buffer checkout/returns was reduced by ~2% (average
>>> across many runs) on an AMD Piledriver (15h) CPU:
>>>
>>> (w/o shadowing):
>>>  Performance counter stats for './vring_bench':
>>>  5,451,082,016  L1-dcache-loads
>>>  ...
>>>2.221477739 seconds time elapsed
>>>
>>> (w/ shadowing):
>>>  Performance counter stats for './vring_bench':
>>>  5,405,701,361  L1-dcache-loads
>>>  ...
>>>2.168405376 seconds time elapsed
>> Could you supply the full command line you used
>> to test this?
> Yes --
>
> perf stat -e L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores \
>   ./vring_bench
>
> The standard version of vring_bench is single-threaded (posted on this list
> but never submitted); my tests were with a version that has a worker thread
> polling the VQ. How should I share it? Should I just attach it to an email
> here?
>
>>> The further away (in a NUMA sense) virtio producers and consumers are
>>> from each other, the more we expect to benefit. Physical implementations
>>> of virtio devices and implementations of virtio where the consumer polls
>>> vring avail indexes (vhost) should also benefit.
>>>
>>> Signed-off-by: Venkatesh Srinivas 
>> Here's a similar patch for the ring itself:
>> https://lkml.org/lkml/2015/9/10/111
>>
>> Does it help you as well?
> I tested your patch in our environment; our virtqueues do not support
> Indirect entries and your patch does not manage to elide many writes, so I
> do not see a performance difference. In an environment with Indirect, your
> patch will likely be a win.
>
> (My patch gets most of its win by eliminating reads on the producer; when
> the producer reads avail fields at the same time the consumer is polling,
> we see cacheline transfers that hurt performance. Your patch eliminates
> writes, which is nice, but our tests w/ polling are not as sensitive to
> writes from the producer.)
>
> I have two quick comments on your patch --
> 1) I think you need to kfree vq->avail when deleting the virtqueue.
>
> 2) Should we avoid allocating a cache for virtqueues that are not
>performance critical? (ex: virtio-scsi eventq/controlq, virtio-net
>controlq)
>
> Should I post comments in reply to the original patch email (given that it
> is ~2 months old)?
>
> Thanks!
> -- vs;
>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] kvm/vmx: EPTP switching test

2015-11-16 Thread Zhang, Yang Z
Michael S. Tsirkin wrote on 2015-11-16:
> This patch adds a new parameter: eptp_switching_test, which enables
> testing EPT switching on VMX if supported by hardware.  All EPT
> entries are initialized to the same value so this adds no useful
> functionality by itself, but can be used to test VMFUNC performance,
> and serve as a basis for future features based on EPTP switching.
> 
> Support for nested virt is not enabled.
> 
> This was tested using the following code within guest:
>   #define VMX_VMFUNC ".byte 0x0f,0x01,0xd4"
>   static void vmfunc(unsigned int nr, unsigned int ept)
>   {
>   asm volatile(VMX_VMFUNC
>:
>: "a"(nr), "c"(ept)
>: "memory");
>   }
> 
> VMFUNC instruction cost was measured at ~122 cycles.
> (Note: recent versions of gnu toolchain support  the vmfunc
> instruction - removing the need for writing  the bytecode manually).
> 
> Signed-off-by: Michael S. Tsirkin 
> ---
> 
> I think I'd like to put this upstream so future eptp switching work
> can be implemented on top. Comments?

We have a different version in hand which is using separate EPTP. As you known, 
the patch will be more complex if using separate EPTP. And there are still lots 
of thing need to do in our version. We will send out for comments soon.

Best regards,
Yang


[unreadable mis-encoded mailing-list trailer omitted]

[Bug 108021] New: Kernel Panic in the Virtual Machines running kernel 3.13.0-44-generic (Ubuntu)

2015-11-16 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=108021

Bug ID: 108021
   Summary: Kernel Panic in the Virtual Machines running kernel
3.13.0-44-generic (Ubuntu)
   Product: Virtualization
   Version: unspecified
Kernel Version: 2.6.32-47
  Hardware: x86-64
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: blocking
  Priority: P1
 Component: kvm
  Assignee: virtualization_...@kernel-bugs.osdl.org
  Reporter: msbro...@gmail.com
Regression: No

All virtual machines boot and after a while they give the same kernel panic
message. It's my first report, so I'm sorry if it is not correctly classified.

How can I help you more?

In the virtual machine:
[0.00] Linux version 3.13.0-44-generic (buildd@lamiak) (gcc version
4.8.2 (Ubuntu 4.8.2-19ubuntu1) ) #73-Ubuntu SMP Tue Dec 16 00:22:43 UTC 2014
(Ubuntu 3.13.0-44.73-generic 3.13.11-ckt12)

[ 4654.445436] BUG: unable to handle kernel NULL pointer dereference at
0358
[ 4654.447011] IP: [] sym_int_sir+0x4cb/0x16e0 [sym53c8xx]
[ 4654.448400] PGD 98555067 PUD 985be067 PMD 0
[ 4654.449289] Oops:  [#1] SMP
[ 4654.449342] Modules linked in: autofs4 dm_crypt nfsd auth_rpcgss nfs_acl nfs
lockd fscache sunrpc serio_raw i2c_piix4 mac_hid lp parport cirrus syscopyarea
sysfillrect sysimgblt ttm drm_kms_helper psmouse e1000 drm sym53c8xx pata_acpi
floppy
[ 4654.449342] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.13.0-44-generic
#73-Ubuntu
[ 4654.449342] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[ 4654.449342] task: 81c15480 ti: 81c0 task.ti:
81c0
[ 4654.449342] RIP: 0010:[]  []
sym_int_sir+0x4cb/0x16e0 [sym53c8xx]
[ 4654.449342] RSP: 0018:8800bfa03df8  EFLAGS: 00010097
[ 4654.449342] RAX: 0012 RBX:  RCX:
0034
[ 4654.449342] RDX:  RSI: 88003656a000 RDI:
c962a006
[ 4654.449342] RBP: 8800bfa03e58 R08: 000d R09:
8800bc00
[ 4654.449342] R10: 000d R11:  R12:
88003658d000
[ 4654.449342] R13: 0012 R14: 88003658d090 R15:
0084
[ 4654.449342] FS:  () GS:8800bfa0()
knlGS:
[ 4654.449342] CS:  0010 DS:  ES:  CR0: 8005003b
[ 4654.449342] CR2: 0358 CR3: b9cd3000 CR4:
06f0
[ 4654.449342] Stack:
[ 4654.449342]  8800bfa03e18 8136c970 8800bfa0d1a0
8800bfa0dce0
[ 4654.449342]  8800bfa03e40 8108e2a5 8800bfa0d1a0

[ 4654.449342]  0001  88003658d000
0084
[ 4654.449342] Call Trace:
[ 4654.449342]  
[ 4654.449342]  [] ? timerqueue_add+0x60/0xb0
[ 4654.449342]  [] ? enqueue_hrtimer+0x25/0x80
[ 4654.449342]  [] sym_interrupt+0x612/0x790 [sym53c8xx]
[ 4654.449342]  [] sym53c8xx_intr+0x4a/0x90 [sym53c8xx]
[ 4654.449342]  [] handle_irq_event_percpu+0x3e/0x1d0
[ 4654.449342]  [] handle_irq_event+0x3d/0x60
[ 4654.449342]  [] handle_fasteoi_irq+0x5a/0x100
[ 4654.449342]  [] handle_irq+0x1e/0x30
[ 4654.449342]  [] do_IRQ+0x4d/0xc0
[ 4654.449342]  [] common_interrupt+0x6d/0x6d
 4654.449342]  
[ 4654.449342]  [] ? native_safe_halt+0x6/0x10
[ 4654.449342]  [] default_idle+0x1f/0xc0
[ 4654.449342]  [] arch_cpu_idle+0x26/0x30
[ 4654.449342]  [] cpu_startup_entry+0xc5/0x290
[ 4654.449342]  [] rest_init+0x77/0x80
[ 4654.449342]  [] start_kernel+0x438/0x443
[ 4654.449342]  [] ? repair_env_string+0x5c/0x5c
[ 4654.449342]  [] ? early_idt_handlers+0x120/0x120
[ 4654.449342]  [] x86_64_start_reservations+0x2a/0x2c
[ 4654.449342]  [] x86_64_start_kernel+0x143/0x152
[ 4654.449342] Code: e8 5b 0c 35 e1 80 8b 9b 03 00 00 10 e9 5f fc ff ff 45 0f
b6 84 24 8c 07 00 00 41 c6 84 24 8c 07 00 00 08 45 89 84 24 9c 07 00 00 <48> 8b
83 58 03 00 00 48 8b 90 80 00 00 00 48 8b 92 c8 00 00 00
[ 4654.449342] RIP  [] sym_int_sir+0x4cb/0x16e0 [sym53c8xx]
[ 4654.449342]  RSP 
[ 4654.449342] CR2: 0358
[ 4654.449342] ---[ end trace 7ed8af64cc95007a ]---
[ 4654.449342] Kernel panic - not syncing: Fatal exception in interrupt
[ 4654.449342] Shutting down cpus with NMI
[ 4654.449342] drm_kms_helper: panic occurred, switching back to text console

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 2/3] target-i386: reorganize TSC rate setting code

2015-11-16 Thread Haozhong Zhang
The following two changes are made to the TSC rate setting code in
kvm_arch_init_vcpu():
 * The code is moved to a new function kvm_arch_set_tsc_khz().
 * If setting user-specified TSC rate fails and the host TSC rate is
   inconsistent with the user-specified one, print a warning message.

Signed-off-by: Haozhong Zhang 
---
 target-i386/kvm.c | 45 ++---
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 9e4d27f..6a1acb4 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -524,6 +524,41 @@ static bool hyperv_enabled(X86CPU *cpu)
 cpu->hyperv_runtime);
 }
 
+/**
+ * Ask KVM to set vcpu's TSC rate to X86_CPU(cs)->env.tsc_khz.
+ *
+ * Returns: 0 if successful;
+ *  -ENOTSUP if KVM_CAP_TSC_CONTROL is unavailable;
+ *  -EIO if KVM_SET_TSC_KHZ fails.
+ */
+static int kvm_arch_set_tsc_khz(CPUState *cs)
+{
+X86CPU *cpu = X86_CPU(cs);
+CPUX86State *env = &cpu->env;
+int has_tsc_control, r = 0;
+
+if (!env->tsc_khz) {
+return 0;
+}
+
+has_tsc_control = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL);
+if (has_tsc_control) {
+r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz);
+}
+
+if (!has_tsc_control || r < 0) {
+r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
+kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
+if (r <= 0 || r != env->tsc_khz) {
+error_report("warning: TSC frequency mismatch between "
+ "VM and host, and TSC scaling unavailable");
+return has_tsc_control ? -EIO : -ENOTSUP;
+}
+}
+
+return 0;
+}
+
 static Error *invtsc_mig_blocker;
 
 #define KVM_MAX_CPUID_ENTRIES  100
@@ -823,13 +858,9 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return r;
 }
 
-r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL);
-if (r && env->tsc_khz) {
-r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz);
-if (r < 0) {
-fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
-return r;
-}
+if (kvm_arch_set_tsc_khz(cs) == -EIO) {
+fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
+return -EIO;
 }
 
 /*
-- 
2.4.8

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 3/3] target-i386: add support to migrate vcpu's TSC rate

2015-11-16 Thread Haozhong Zhang
This patch enables migrating vcpu's TSC rate. If KVM on the destination
machine supports TSC scaling, guest programs will observe a consistent
TSC rate across the migration.

If TSC scaling is not supported on the destination machine, the
migration will not be aborted and QEMU on the destination will not set
vcpu's TSC rate to the migrated value.

If vcpu's TSC rate specified by CPU option 'tsc-freq' on the destination
machine is inconsistent with the migrated TSC rate, the migration will
be aborted.

For backwards compatibility, the migration of vcpu's TSC rate is
disabled on pc-*-2.4 and older machine types.

Signed-off-by: Haozhong Zhang 
---
 hw/i386/pc.c  |  1 +
 hw/i386/pc_piix.c |  1 +
 hw/i386/pc_q35.c  |  1 +
 include/hw/i386/pc.h  |  1 +
 target-i386/cpu.c |  2 +-
 target-i386/cpu.h |  1 +
 target-i386/kvm.c |  4 
 target-i386/machine.c | 28 
 8 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 0cb8afd..2f2fc93 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1952,6 +1952,7 @@ static void pc_machine_class_init(ObjectClass *oc, void 
*data)
 HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
 
 pcmc->get_hotplug_handler = mc->get_hotplug_handler;
+pcmc->save_tsc_khz = true;
 mc->get_hotplug_handler = pc_get_hotpug_handler;
 mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id;
 mc->default_boot_order = "cad";
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 07d0baa..7c5b0d2 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -489,6 +489,7 @@ static void pc_i440fx_2_4_machine_options(MachineClass *m)
 m->alias = NULL;
 m->is_default = 0;
 pcmc->broken_reserved_end = true;
+pcmc->save_tsc_khz = false;
 SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
 }
 
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 0fdae09..fd8efe3 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -387,6 +387,7 @@ static void pc_q35_2_4_machine_options(MachineClass *m)
 m->hw_version = "2.4.0";
 m->alias = NULL;
 pcmc->broken_reserved_end = true;
+pcmc->save_tsc_khz = false;
 SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
 }
 
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 4bbc0ff..fea0f28 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -60,6 +60,7 @@ struct PCMachineClass {
 
 /*< public >*/
 bool broken_reserved_end;
+bool save_tsc_khz;
 HotplugHandler *(*get_hotplug_handler)(MachineState *machine,
DeviceState *dev);
 };
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index e5f1c5b..98c6a4c 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -1724,7 +1724,7 @@ static void x86_cpuid_set_tsc_freq(Object *obj, Visitor 
*v, void *opaque,
 return;
 }
 
-cpu->env.tsc_khz = value / 1000;
+cpu->env.tsc_khz = cpu->env.user_tsc_khz = value / 1000;
 }
 
 static void x86_cpuid_get_apic_id(Object *obj, Visitor *v, void *opaque,
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index fc4a605..ffe0bce 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -973,6 +973,7 @@ typedef struct CPUX86State {
 uint32_t sipi_vector;
 bool tsc_valid;
 int64_t tsc_khz;
+int64_t user_tsc_khz; /* for sanity check only */
 void *kvm_xsave_buf;
 
 uint64_t mcg_cap;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 6a1acb4..6856899 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -2384,6 +2384,10 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
 }
 }
 
+if (level == KVM_PUT_FULL_STATE) {
+kvm_arch_set_tsc_khz(cpu);
+}
+
 ret = kvm_getput_regs(x86_cpu, 1);
 if (ret < 0) {
 return ret;
diff --git a/target-i386/machine.c b/target-i386/machine.c
index a18e16e..3c5d24b 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -331,6 +331,13 @@ static int cpu_post_load(void *opaque, int version_id)
CPUX86State *env = &cpu->env;
 int i;
 
+if (env->tsc_khz && env->user_tsc_khz &&
+env->tsc_khz != env->user_tsc_khz) {
+fprintf(stderr, "Mismatch between user-specified TSC frequency and "
+"migrated TSC frequency\n");
+return -1;
+}
+
 /*
  * Real mode guest segments register DPL should be zero.
  * Older KVM version were setting it wrongly.
@@ -775,6 +782,26 @@ static const VMStateDescription vmstate_xss = {
 }
 };
 
+static bool tsc_khz_needed(void *opaque)
+{
+X86CPU *cpu = opaque;
+CPUX86State *env = >env;
+MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine());
+PCMachineClass *pcmc = PC_MACHINE_CLASS(mc);
+return env->tsc_khz && pcmc->save_tsc_khz;
+}
+
+static const VMStateDescription vmstate_tsc_khz = {
+.name = "cpu/tsc_khz",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = tsc_khz_needed,
+.fields = (VMStateField[]) {
+   

[PATCH v5 0/3] target-i386: save/restore vcpu's TSC rate during migration

2015-11-16 Thread Haozhong Zhang
This patchset enables QEMU to save/restore vcpu's TSC rate during the
migration on machine types pc-*-2.5 or newer.

On the source machine:
 * If the vcpu's TSC rate is specified by the cpu option 'tsc-freq',
   then this user-specified TSC rate will be migrated.
 * Otherwise, the TSC rate returned by KVM_GET_TSC_KHZ will be
   migrated. For a fresh VM, this is the host TSC rate.

On the destination machine:
 * If the vcpu's TSC rate has been specified by the cpu option
   'tsc-freq' and is inconsistent with the migrated TSC rate, then
   the migration will be aborted.
 * Otherwise, QEMU will try to use the migrated TSC rate. If KVM on
   the destination supports TSC scaling, guest programs will observe a
   consistent TSC rate across the migration. If TSC scaling is not
   supported, the migration will not be aborted and QEMU will behave
   like before, i.e using the host TSC rate instead.

Changes in v5:
 * Move KVM_GET_TSC_KHZ call to kvm_arch_init_vcpu().
 * Remove an unnecessary warning message.
 * Unify TSC rate setting code in kvm_arch_init_vcpu() and
   kvm_arch_put_registers().

Changes in v4:
 * Make all code x86 specific.
 * Abort the migration if the user-specified TSC rate is inconsistent
   with the migrated TSC rate.
 * Move the sanity check to cpu_post_load().
 * All KVM_SET_TSC_KHZ and save/restore use env->tsc_khz.
 * Replace env->tsc_khz_saved with env->user_tsc_khz, and only use the
   latter for sanity check.

Changes in v3:
 * Change the cpu option 'save-tsc-freq' to an internal flag.
 * Remove the cpu option 'load-tsc-freq' and change the logic of
   loading the migrated TSC rate as above.
 * Move the setup of migrated TSC rate back to
   do_kvm_cpu_synchronize_post_init().

Changes in v2:
 * Add a pair of cpu options 'save-tsc-freq' and 'load-tsc-freq' to
   control the migration of vcpu's TSC rate.
 * Move all logic of setting TSC rate to target-i386.
 * Remove the duplicated TSC setup in kvm_arch_init_vcpu().

Haozhong Zhang (3):
  target-i386: fallback vcpu's TSC rate to value returned by KVM
  target-i386: reorganize TSC rate setting code
  target-i386: add support to migrate vcpu's TSC rate

 hw/i386/pc.c  |  1 +
 hw/i386/pc_piix.c |  1 +
 hw/i386/pc_q35.c  |  1 +
 include/hw/i386/pc.h  |  1 +
 target-i386/cpu.c |  2 +-
 target-i386/cpu.h |  1 +
 target-i386/kvm.c | 59 +--
 target-i386/machine.c | 28 
 8 files changed, 87 insertions(+), 7 deletions(-)

-- 
2.4.8

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5 1/3] target-i386: fallback vcpu's TSC rate to value returned by KVM

2015-11-16 Thread Haozhong Zhang
If no user-specified TSC rate is present, we will try to set
env->tsc_khz to the value returned by KVM_GET_TSC_KHZ.

Signed-off-by: Haozhong Zhang 
---
 target-i386/kvm.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 2a9953b..9e4d27f 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -832,6 +832,18 @@ int kvm_arch_init_vcpu(CPUState *cs)
 }
 }
 
+/*
+ * If no user-specified TSC frequency is present, we will try to
+ * set env->tsc_khz to the value used by KVM.
+ */
+if (!env->tsc_khz) {
+r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
+kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
+if (r > 0) {
+env->tsc_khz = r;
+}
+}
+
 if (has_xsave) {
 env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
 }
-- 
2.4.8

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 2/2] target-i386: add support to migrate vcpu's TSC rate

2015-11-16 Thread Haozhong Zhang
On 11/16/15 13:35, Eduardo Habkost wrote:
> On Mon, Nov 16, 2015 at 10:30:08PM +0800, Haozhong Zhang wrote:
> > On 11/16/15 11:43, Eduardo Habkost wrote:
> > > On Mon, Nov 16, 2015 at 04:04:08PM +0800, Haozhong Zhang wrote:
> > > > This patch enables migrating vcpu's TSC rate. If KVM on the destination
> > > > machine supports TSC scaling, guest programs will observe a consistent
> > > > TSC rate across the migration.
> > > > 
> > > > If TSC scaling is not supported on the destination machine, the
> > > > migration will not be aborted and QEMU on the destination will not set
> > > > vcpu's TSC rate to the migrated value.
> > > > 
> > > > If vcpu's TSC rate specified by CPU option 'tsc-freq' on the destination
> > > > machine is inconsistent with the migrated TSC rate, the migration will
> > > > be aborted.
> > > > 
> > > > For backwards compatibility, the migration of vcpu's TSC rate is
> > > > disabled on pc-*-2.4 and older machine types.
> > > > 
> > > > Signed-off-by: Haozhong Zhang 
> [...]
> > > > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > > > index 9084b29..8448248 100644
> > > > --- a/target-i386/kvm.c
> > > > +++ b/target-i386/kvm.c
> > > > @@ -2334,8 +2334,30 @@ static void kvm_arch_set_tsc_khz(CPUState *cs)
> > > >  int r;
> > > >  
> > > >  /*
> > > > - * If no user-specified TSC frequency is present, we will try to
> > > > - * set env->tsc_khz to the value used by KVM.
> > > > + * For other cases of env->tsc_khz and env->user_tsc_khz:
> > > > + *
> > > > + * - We have eliminated all cases that satisfy
> > > > + *   env->tsc_khz && env->user_tsc_khz &&
> > > > + *   env->tsc_khz != env->user_tsc_khz
> > > > + *   in cpu_post_load().
> > > > + *
> > > > + * - If env->user_tsc_khz is not zero, then it must be equal to
> > > > + *   env->tsc_khz (if the latter is not zero) and has been set in
> > > > + *   kvm_arch_init_vcpu().
> > > > + */
> > > > +if (env->tsc_khz && !env->user_tsc_khz) {
> > > > +r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
> > > > +kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) : 
> > > > -ENOTSUP;
> > > 
> > > Please don't duplicate the code from kvm_arch_init_vcpu(). We can
> > > handle both cases in kvm_arch_put_registers(KVM_PUT_FULL_STATE),
> > > can't we?
> > >
> > 
> > No. If KVM_SET_TSC_KHZ fails in kvm_arch_init_vcpu(), QEMU will
> > exit. But if KVM_SET_TSC_KHZ fails in the migration, QEMU will not
> > abort. And because the return value of
> > kvm_arch_put_registers(KVM_PUT_FULL_STATE) is not checked by its
> > caller do_kvm_cpu_synchronize_post_init(), I have to handle them in
> > different ways.
> 
> Reporting errors back in kvm_put_registers() may be difficult, I
> see, so handling user-provided TSC frequency in
> kvm_arch_init_vcpu() makes sense. But you can still avoid code
> duplication. Just reuse the same function in kvm_arch_init_vcpu()
> and kvm_put_registers(), but return errors back to the caller in
> kvm_arch_init_vcpu() in case env->user_tsc_khz is set.
>

Agree on using the same function to set TSC rate. I'll change in the
next version.

> kvm_put_registers() can ignore the error, and just print a
> warning. But (on both cases) we should print a warning only if
> env->tsc_khz doesn't match KVM_GET_TSC_KHZ, because we don't want
> to print spurious warnings on every migration when TSC scaling
> isn't supported. (You even suggested changes to the example code
> that does that, at Message-ID:
> <20151106023244.gb24...@hzzhang-optiplex-9020.sh.intel.com>).
>

I'll check whether env->tsc_khz == KVM_GET_TSC_KHZ in the next
version.

> Also, I believe it won't be a problem if we call KVM_SET_TSC_KHZ
> twice in the case of incoming migration, so there's no need to
> check user_tsc_khz in the kvm_arch_put_registers() path. Keeping
> the code simple is more important than avoiding one extra ioctl()
> on incoming migration, IMO.
>

I'll use tsc_khz only in the next version.

Thanks,
Haozhong
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] vhost: relax log address alignment

2015-11-16 Thread Jason Wang


On 11/16/2015 11:00 PM, Michael S. Tsirkin wrote:
> commit 5d9a07b0de512b77bf28d2401e5fe3351f00a240 ("vhost: relax used
> address alignment") fixed the alignment for the used virtual address,
> but not for the physical address used for logging.
>
> That's a mistake: alignment should clearly be the same for virtual and
> physical addresses,
>
> Signed-off-by: Michael S. Tsirkin 
> ---
>  drivers/vhost/vhost.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index eec2f11..080422f 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -819,7 +819,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, 
> void __user *argp)
>   BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
>   if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
>   (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
> - (a.log_guest_addr & (sizeof(u64) - 1))) {
> + (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) {
>   r = -EINVAL;
>   break;
>   }

Acked-by: Jason Wang 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next RFC V3 0/3] basic busy polling support for vhost_net

2015-11-16 Thread Jason Wang


On 11/13/2015 05:20 PM, Jason Wang wrote:
>
> On 11/12/2015 08:02 PM, Felipe Franciosi wrote:
>> Hi Jason,
>>
>> I understand your busy loop timeout is quite conservative at 50us. Did you 
>> try any other values?
> I've also tried 20us. And results shows 50us was better in:
>
> - very small packet tx (e.g 64bytes at most 46% improvement)
> - TCP_RR (at most 11% improvement)
>
> But I will test bigger values. In fact, for net itself, we can be even
> more aggressive: make vhost poll forever but I haven't tired this.
>
>> Also, did you measure how polling affects many VMs talking to each other 
>> (e.g. 20 VMs on each host, perhaps with several vNICs each, transmitting to 
>> a corresponding VM/vNIC pair on another host)?
> Not yet, in my todo list.
>
>>
>> On a complete separate experiment (busy waiting on storage I/O rings on 
>> Xen), I have observed that bigger timeouts gave bigger benefits. On the 
>> other hand, all cases that contended for CPU were badly hurt with any sort 
>> of polling.
>>
>> The cases that contended for CPU consisted of many VMs generating workload 
>> over very fast I/O devices (in that case, several NVMe devices on a single 
>> host). And the metric that got affected was aggregate throughput from all 
>> VMs.
>>
>> The solution was to determine whether to poll depending on the host's 
>> overall CPU utilisation at that moment. That gave me the best of both worlds 
>> as polling made everything faster without slowing down any other metric.
> You mean a threshold and exit polling when it exceeds this? I use a
> simpler method: just exit the busy loop when there's more than one
> processes is in running state. I test this method in the past for socket
> busy read (http://www.gossamer-threads.com/lists/linux/kernel/1997531)
> which seems can solve the issue. But haven't tested this for vhost
> polling. Will run some simple test (e.g pin two vhost threads in one
> host cpu), and see how well it perform.
>
> Thanks

Run simple test like:

- starting two VMs, each vm has one vcpu
- pin both two vhost threads of VMs to cpu 0
- pin vcpu0 of VM1 to cpu1
- pin vcpu0 of VM2 to cpu2

Try two TCP_RR netperf tests in parallell:

/busy loop timeouts/trate1+trate2/-+%/
/no busy loop/13966.76+13987.31/+0%/
/20us/14097.89+14088.82/+0.08%/
/50us/15103.98+15103.73/+8.06%/


Busy loop can still give improvements even if two vhost threads are
contending one cpu on host.

>
>> Thanks,
>> Felipe
>>
>>
>>
>> On 12/11/2015 10:20, "kvm-ow...@vger.kernel.org on behalf of Jason Wang" 
>>  wrote:
>>
>>> On 11/12/2015 06:16 PM, Jason Wang wrote:
 Hi all:

 This series tries to add basic busy polling for vhost net. The idea is
 simple: at the end of tx/rx processing, busy polling for new tx added
 descriptor and rx receive socket for a while. The maximum number of
 time (in us) could be spent on busy polling was specified ioctl.

 Test were done through:

 - 50 us as busy loop timeout
 - Netperf 2.6
 - Two machines with back to back connected ixgbe
 - Guest with 1 vcpu and 1 queue

 Results:
 - For stream workload, ioexits were reduced dramatically in medium
   size (1024-2048) of tx (at most -39%) and almost all rx (at most
   -79%) as a result of polling. This compensate for the possible
   wasted cpu cycles more or less. That porbably why we can still see
   some increasing in the normalized throughput in some cases.
 - Throughput of tx were increased (at most 105%) expect for the huge
   write (16384). And we can send more packets in the case (+tpkts were
   increased).
 - Very minor rx regression in some cases.
 - Improvemnt on TCP_RR (at most 16%).
>>> Forget to mention, the following test results by order are:
>>>
>>> 1) Guest TX
>>> 2) Guest RX
>>> 3) TCP_RR
>>>
 size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/
64/ 1/   +9%/  -17%/   +5%/  +10%/   -2%
64/ 2/   +8%/  -18%/   +6%/  +10%/   -1%
64/ 4/   +4%/  -21%/   +6%/  +10%/   -1%
64/ 8/   +9%/  -17%/   +6%/   +9%/   -2%
   256/ 1/  +20%/   -1%/  +15%/  +11%/   -9%
   256/ 2/  +15%/   -6%/  +15%/   +8%/   -8%
   256/ 4/  +17%/   -4%/  +16%/   +8%/   -8%
   256/ 8/  -61%/  -69%/  +16%/  +10%/  -10%
   512/ 1/  +15%/   -3%/  +19%/  +18%/  -11%
   512/ 2/  +19%/0%/  +19%/  +13%/  -10%
   512/ 4/  +18%/   -2%/  +18%/  +15%/  -10%
   512/ 8/  +17%/   -1%/  +18%/  +15%/  -11%
  1024/ 1/  +25%/   +4%/  +27%/  +16%/  -21%
  1024/ 2/  +28%/   +8%/  +25%/  +15%/  -22%
  1024/ 4/  +25%/   +5%/  +25%/  +14%/  -21%
  1024/ 8/  +27%/   +7%/  +25%/  +16%/  -21%
  2048/ 1/  +32%/  +12%/  +31%/  +22%/  -38%
  2048/ 2/  +33%/  +12%/  +30%/  +23%/  -36%
  2048/ 4/  +31%/  +10%/  +31%/  +24%/  -37%
  2048/ 8/ +105%/  +75%/ 

Re: [PATCH v1 2/2] dma-mapping-common: add DMA attribute - DMA_ATTR_IOMMU_BYPASS

2015-11-16 Thread David Woodhouse
On Sun, 2015-11-15 at 22:54 -0800, Benjamin Serebrin wrote:
> We looked into Intel IOMMU performance a while ago and learned a few
> useful things.  We generally did a parallel 200 thread TCP_RR test,
> as this churns through mappings quickly and uses all available cores.
> 
> First, the main bottleneck was software performance[1].

For the Intel IOMMU, *all* we need to do is put a PTE in place. For
real hardware (i.e not an IOMMU emulated by qemu for a VM), we don't
need to do an IOTLB flush. It's a single 64-bit write of the PTE.

All else is software overhead.

(I'm deliberately ignoring the stupid chipsets where DMA page tables
aren't cache coherent and we need a clflush too. They make me too sad.)


>   This study preceded the recent patch to break the locks into pools
> ("Break up monolithic iommu table/lock into finer graularity pools
> and lock").  There were several points of lock contention:
> - the RB tree ...
> - the RB tree ...
> - the RB tree ...
> 
> Omer's paper (https://www.usenix.org/system/files/conference/atc15/at
> c15-paper-peleg.pdf) has some promising approaches.  The magazine
> avoids the RB tree issue.  

I'm thinking of ditching the RB tree altogether and switching to the
allocator in lib/iommu-common.c (and thus getting the benefit of the
finer granularity pools).

> I'm interested in seeing if the dynamic 1:1 with a mostly-lock-free
> page table cleanup algorithm could do well.

When you say 'dynamic 1:1 mapping', is that the same thing that's been
suggested elsewhere — avoiding the IOVA allocator completely by using a
virtual address which *matches* the physical address, if that virtual
address is available? Simply cmpxchg on the PTE itself, and if it was
already set *then* we fall back to the allocator, obviously configured
to allocate from a range *higher* than the available physical memory.

Jörg has been looking at this too, and was even trying to find space in
the PTE for a use count so a given page could be in more than one
mapping before we call back to the IOVA allocator.


> There are correctness fixes and optimizations left in the
> invalidation path: I want strict-ish semantics (a page doesn't go
> back into the freelist until the last IOTLB/IOMMU TLB entry is
> invalidated) with good performance, and that seems to imply that an
> additional page reference should be gotten at dma_map time and put
> back at the completion of the IOMMU flush routine.  (This is worthy
> of much discussion.)  

We already do something like this for page table pages which are freed
by an unmap, FWIW.

> Additionally, we can find ways to optimize the flush routine by
> realizing that if we have frequent maps and unmaps, it may be because
> the device creates and destroys buffers a lot; these kind of
> workloads use an IOVA for one event and then never come back.  Maybe
> TLBs don't do much good and we could just flush the entire IOMMU TLB
> [and ATS cache] for that BDF.

That would be a very interesting thing to look at. Although it would be
nice if we had a better way to measure the performance impact of IOTLB
misses — currently we don't have a lot of visibility at all.

> 1: We verified that the IOMMU costs are almost entirely software
> overheads by forcing software 1:1 mode, where we create page tables
> for all physical addresses.  We tested using leaf nodes of size 4KB,
> of 2MB, and of 1GB.  In call cases, there is zero runtime maintenance
> of the page tables, and no IOMMU invalidations.  We did piles of DMA
> maximizing x16 PCIe bandwidth on multiple lanes, to random DRAM
> addresses.  At 4KB page size, we could see some bandwidth slowdown,
> but at 2MB and 1GB, there was < 1% performance loss as compared with
> IOMMU off.

Was this with ATS on or off? With ATS, the cost of the page walk can be
amortised in some cases — you can look up the physical address *before*
you are ready to actually start the DMA to it, and don't take that
latency at the time you're actually moving the data.

-- 
David WoodhouseOpen Source Technology Centre
david.woodho...@intel.com  Intel Corporation



smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH V2 7/7] KVM, pkeys: disable PKU feature without ept

2015-11-16 Thread Paolo Bonzini


On 16/11/2015 08:51, Huaitong Han wrote:
> This patch disables CPUID:PKU without ept.

The commit message and probably the code too should say why.

Paolo

> Signed-off-by: Huaitong Han 
> 
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index ece687b..e1113ae 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -447,6 +447,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
> *entry, u32 function,
>   entry->ebx |= F(TSC_ADJUST);
>   entry->ecx &= kvm_supported_word11_x86_features;
>   cpuid_mask(>ecx, 13);
> + if (!tdp_enabled)
> + entry->ecx &= ~F(PKU);
>   } else {
>   entry->ebx = 0;
>   entry->ecx = 0;
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] arm64: KVM: Add workaround for Cortex-A57 erratum 834220

2015-11-16 Thread Marc Zyngier
Cortex-A57 parts up to r1p2 can misreport Stage 2 translation faults
when a Stage 1 permission fault or device alignment fault should
have been reported.

This patch implements the workaround (which is to validate that the
Stage-1 translation actually succeeds) by using code patching.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/Kconfig  | 21 +
 arch/arm64/include/asm/cpufeature.h |  3 ++-
 arch/arm64/kernel/cpu_errata.c  |  9 +
 arch/arm64/kvm/hyp.S|  6 ++
 4 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9ac16a4..746d985 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -316,6 +316,27 @@ config ARM64_ERRATUM_832075
 
  If unsure, say Y.
 
+config ARM64_ERRATUM_834220
+   bool "Cortex-A57: 834220: Stage 2 translation fault might be 
incorrectly reported in presence of a Stage 1 fault"
+   depends on KVM
+   default y
+   help
+ This option adds an alternative code sequence to work around ARM
+ erratum 834220 on Cortex-A57 parts up to r1p2.
+
+ Affected Cortex-A57 parts might report a Stage 2 translation
+ fault as the result of a Stage 1 fault for load crossing a
+ page boundary when there is a permission or device memory
+ alignment fault at Stage 1 and a translation fault at Stage 2.
+
+ The workaround is to verify that the Stage-1 translation
+ doesn't generate a fault before handling the Stage-2 fault.
+ Please note that this does not necessarily enable the workaround,
+ as it depends on the alternative framework, which will only patch
+ the kernel if an affected CPU is detected.
+
+ If unsure, say Y.
+
 config ARM64_ERRATUM_845719
bool "Cortex-A53: 845719: a load might read incorrect data"
depends on COMPAT
diff --git a/arch/arm64/include/asm/cpufeature.h 
b/arch/arm64/include/asm/cpufeature.h
index 11d5bb0f..52722ee 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -29,8 +29,9 @@
 #define ARM64_HAS_PAN  4
 #define ARM64_HAS_LSE_ATOMICS  5
 #define ARM64_WORKAROUND_CAVIUM_23154  6
+#define ARM64_WORKAROUND_834220	7
 
-#define ARM64_NCAPS			7
+#define ARM64_NCAPS			8
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 24926f2..feb6b4e 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -75,6 +75,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
   (1 << MIDR_VARIANT_SHIFT) | 2),
},
 #endif
+#ifdef CONFIG_ARM64_ERRATUM_834220
+   {
+   /* Cortex-A57 r0p0 - r1p2 */
+   .desc = "ARM erratum 834220",
+   .capability = ARM64_WORKAROUND_834220,
+   MIDR_RANGE(MIDR_CORTEX_A57, 0x00,
+  (1 << MIDR_VARIANT_SHIFT) | 2),
+   },
+#endif
 #ifdef CONFIG_ARM64_ERRATUM_845719
{
/* Cortex-A53 r0p[01234] */
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 1599701..ff2e038 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -1015,9 +1015,15 @@ el1_trap:
b.ne1f  // Not an abort we care about
 
/* This is an abort. Check for permission fault */
+alternative_if_not ARM64_WORKAROUND_834220
and x2, x1, #ESR_ELx_FSC_TYPE
cmp x2, #FSC_PERM
b.ne1f  // Not a permission fault
+alternative_else
+   nop // Use the permission fault path to
+   nop // check for a valid S1 translation,
+   nop // regardless of the ESR value.
+alternative_endif
 
/*
 * Check for Stage-1 page table walk, which is guaranteed
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] arm64: KVM: Fix AArch32 to AArch64 register mapping

2015-11-16 Thread Marc Zyngier
When running a 32bit guest under a 64bit hypervisor, the ARMv8
architecture defines a mapping of the 32bit registers in the 64bit
space. This includes banked registers that are being demultiplexed
over the 64bit ones.

On exception caused by an operation involving a 32bit register, the
HW exposes the register number in the ESR_EL2 register. It was so
far understood that SW had to compute which AArch64 register
was used (based on the current AArch32 mode and register
number).

It turns out that I misinterpreted the ARM ARM, and the clue is in
D1.20.1: "For some exceptions, the exception syndrome given in the
ESR_ELx identifies one or more register numbers from the issued
instruction that generated the exception. Where the exception is
taken from an Exception level using AArch32 these register numbers
give the AArch64 view of the register."

Which means that the HW is already giving us the translated version,
and that we shouldn't try to interpret it at all (for example, doing
an MMIO operation from the IRQ mode using the LR register leads to
very unexpected behaviours).

The fix is thus not to perform a call to vcpu_reg32() at all from
vcpu_reg(), and use whatever register number is supplied directly.
The only case we need to find out about the mapping is when we
actively generate a register access, which only occurs when injecting
a fault in a guest.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/include/asm/kvm_emulate.h | 8 +---
 arch/arm64/kvm/inject_fault.c| 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_emulate.h 
b/arch/arm64/include/asm/kvm_emulate.h
index 17e92f0..3ca894e 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -99,11 +99,13 @@ static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu)
*vcpu_cpsr(vcpu) |= COMPAT_PSR_T_BIT;
 }
 
+/*
+ * vcpu_reg should always be passed a register number coming from a
+ * read of ESR_EL2. Otherwise, it may give the wrong result on AArch32
+ * with banked registers.
+ */
 static inline unsigned long *vcpu_reg(const struct kvm_vcpu *vcpu, u8 reg_num)
 {
-   if (vcpu_mode_is_32bit(vcpu))
-   return vcpu_reg32(vcpu, reg_num);
-
return (unsigned long *)_gp_regs(vcpu)->regs.regs[reg_num];
 }
 
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 85c5715..648112e 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -48,7 +48,7 @@ static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, 
u32 vect_offset)
 
/* Note: These now point to the banked copies */
*vcpu_spsr(vcpu) = new_spsr_value;
-   *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
+   *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
 
/* Branch to exception vector */
if (sctlr & (1 << 13))
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] arm64: KVM: Fixes for 4.4-rc2

2015-11-16 Thread Marc Zyngier
Here's a couple of fixes for KVM/arm64:

- The first one addresses a misinterpretation of the architecture
  spec, leading to the mishandling of I/O accesses generated from an
  AArch32 guest using banked registers.

- The second one is a workaround for a Cortex-A57 erratum.

Both patches are based on v4.4-rc1.

Marc Zyngier (2):
  arm64: KVM: Fix AArch32 to AArch64 register mapping
  arm64: KVM: Add workaround for Cortex-A57 erratum 834220

 arch/arm64/Kconfig   | 21 +
 arch/arm64/include/asm/cpufeature.h  |  3 ++-
 arch/arm64/include/asm/kvm_emulate.h |  8 +---
 arch/arm64/kernel/cpu_errata.c   |  9 +
 arch/arm64/kvm/hyp.S |  6 ++
 arch/arm64/kvm/inject_fault.c|  2 +-
 6 files changed, 44 insertions(+), 5 deletions(-)

-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 4/5] nvdimm acpi: build ACPI nvdimm devices

2015-11-16 Thread Xiao Guangrong
NVDIMM devices is defined in ACPI 6.0 9.20 NVDIMM Devices

There is a root device under \_SB and specified NVDIMM devices are under the
root device. Each NVDIMM device has _ADR which returns its handle used to
associate MEMDEV structure in NFIT

Currently, we do not support any function on _DSM, that means, NVDIMM
label data has not been supported yet

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/nvdimm.c | 85 
 1 file changed, 85 insertions(+)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 98c004d..abe0daa 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -367,6 +367,90 @@ static void nvdimm_build_nfit(GSList *device_list, GArray 
*table_offsets,
 g_array_free(structures, true);
 }
 
+static void nvdimm_build_common_dsm(Aml *root_dev)
+{
+Aml *method, *ifctx, *function;
+uint8_t byte_list[1];
+
+method = aml_method("NCAL", 4);
+{
+function = aml_arg(2);
+
+/*
+ * function 0 is called to inquire what functions are supported by
+ * OSPM
+ */
+ifctx = aml_if(aml_equal(function, aml_int(0)));
+byte_list[0] = 0 /* No function Supported */;
+aml_append(ifctx, aml_return(aml_buffer(1, byte_list)));
+aml_append(method, ifctx);
+
+/* No function is supported yet. */
+byte_list[0] = 1 /* Not Supported */;
+aml_append(method, aml_return(aml_buffer(1, byte_list)));
+}
+aml_append(root_dev, method);
+}
+
+static void nvdimm_build_nvdimm_devices(GSList *device_list, Aml *root_dev)
+{
+for (; device_list; device_list = device_list->next) {
+DeviceState *dev = device_list->data;
+int slot = object_property_get_int(OBJECT(dev), PC_DIMM_SLOT_PROP,
+   NULL);
+uint32_t handle = nvdimm_slot_to_handle(slot);
+Aml *nvdimm_dev, *method;
+
+nvdimm_dev = aml_device("NV%02X", slot);
+aml_append(nvdimm_dev, aml_name_decl("_ADR", aml_int(handle)));
+
+method = aml_method("_DSM", 4);
+{
+aml_append(method, aml_return(aml_call4("NCAL", aml_arg(0),
+   aml_arg(1), aml_arg(2), aml_arg(3))));
+}
+aml_append(nvdimm_dev, method);
+
+aml_append(root_dev, nvdimm_dev);
+}
+}
+
+static void nvdimm_build_ssdt(GSList *device_list, GArray *table_offsets,
+  GArray *table_data, GArray *linker)
+{
+Aml *ssdt, *sb_scope, *dev, *method;
+
+acpi_add_table(table_offsets, table_data);
+
+ssdt = init_aml_allocator();
+acpi_data_push(ssdt->buf, sizeof(AcpiTableHeader));
+
+sb_scope = aml_scope("\\_SB");
+
+dev = aml_device("NVDR");
+aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0012")));
+
+nvdimm_build_common_dsm(dev);
+method = aml_method("_DSM", 4);
+{
+aml_append(method, aml_return(aml_call4("NCAL", aml_arg(0),
+   aml_arg(1), aml_arg(2), aml_arg(3))));
+}
+aml_append(dev, method);
+
+nvdimm_build_nvdimm_devices(device_list, dev);
+
+aml_append(sb_scope, dev);
+
+aml_append(ssdt, sb_scope);
+/* copy AML table into ACPI tables blob and patch header there */
+g_array_append_vals(table_data, ssdt->buf->data, ssdt->buf->len);
+build_header(linker, table_data,
+(void *)(table_data->data + table_data->len - ssdt->buf->len),
+"SSDT", ssdt->buf->len, 1, "NVDIMM");
+free_aml_allocator();
+}
+
 void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
GArray *linker)
 {
@@ -378,5 +462,6 @@ void nvdimm_build_acpi(GArray *table_offsets, GArray 
*table_data,
 return;
 }
 nvdimm_build_nfit(device_list, table_offsets, table_data, linker);
+nvdimm_build_ssdt(device_list, table_offsets, table_data, linker);
 g_slist_free(device_list);
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 2/5] acpi: support specified oem table id for build_header

2015-11-16 Thread Xiao Guangrong
Let build_header() support specified OEM table id so that we can build
multiple SSDT later

If the oem table id is not specified (aka, NULL), we use the default id
instead as the previous behavior

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 15 +++
 hw/arm/virt-acpi-build.c| 13 +++--
 hw/i386/acpi-build.c| 20 ++--
 include/hw/acpi/aml-build.h |  3 ++-
 4 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index a00a0ab..92873bb 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1137,14 +1137,21 @@ Aml *aml_unicode(const char *str)
 
 void
 build_header(GArray *linker, GArray *table_data,
- AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
+ AcpiTableHeader *h, const char *sig, int len, uint8_t rev,
+ const char *oem_table_id)
 {
 memcpy(>signature, sig, 4);
 h->length = cpu_to_le32(len);
 h->revision = rev;
 memcpy(h->oem_id, ACPI_BUILD_APPNAME6, 6);
-memcpy(h->oem_table_id, ACPI_BUILD_APPNAME4, 4);
-memcpy(h->oem_table_id + 4, sig, 4);
+
+if (oem_table_id) {
+strncpy((char *)h->oem_table_id, oem_table_id, 
sizeof(h->oem_table_id));
+} else {
+memcpy(h->oem_table_id, ACPI_BUILD_APPNAME4, 4);
+memcpy(h->oem_table_id + 4, sig, 4);
+}
+
 h->oem_revision = cpu_to_le32(1);
 memcpy(h->asl_compiler_id, ACPI_BUILD_APPNAME4, 4);
 h->asl_compiler_revision = cpu_to_le32(1);
@@ -1211,5 +1218,5 @@ build_rsdt(GArray *table_data, GArray *linker, GArray 
*table_offsets)
sizeof(uint32_t));
 }
 build_header(linker, table_data,
- (void *)rsdt, "RSDT", rsdt_len, 1);
+ (void *)rsdt, "RSDT", rsdt_len, 1, NULL);
 }
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 3c2c5d6..da17779 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -381,7 +381,8 @@ build_spcr(GArray *table_data, GArray *linker, 
VirtGuestInfo *guest_info)
 spcr->pci_device_id = 0x;  /* PCI Device ID: not a PCI device */
 spcr->pci_vendor_id = 0x;  /* PCI Vendor ID: not a PCI device */
 
-build_header(linker, table_data, (void *)spcr, "SPCR", sizeof(*spcr), 2);
+build_header(linker, table_data, (void *)spcr, "SPCR", sizeof(*spcr), 2,
+ NULL);
 }
 
 static void
@@ -400,7 +401,7 @@ build_mcfg(GArray *table_data, GArray *linker, 
VirtGuestInfo *guest_info)
 mcfg->allocation[0].end_bus_number = (memmap[VIRT_PCIE_ECAM].size
   / PCIE_MMCFG_SIZE_MIN) - 1;
 
-build_header(linker, table_data, (void *)mcfg, "MCFG", len, 1);
+build_header(linker, table_data, (void *)mcfg, "MCFG", len, 1, NULL);
 }
 
 /* GTDT */
@@ -426,7 +427,7 @@ build_gtdt(GArray *table_data, GArray *linker)
 
 build_header(linker, table_data,
  (void *)(table_data->data + gtdt_start), "GTDT",
- table_data->len - gtdt_start, 2);
+ table_data->len - gtdt_start, 2, NULL);
 }
 
 /* MADT */
@@ -488,7 +489,7 @@ build_madt(GArray *table_data, GArray *linker, 
VirtGuestInfo *guest_info,
 
 build_header(linker, table_data,
  (void *)(table_data->data + madt_start), "APIC",
- table_data->len - madt_start, 3);
+ table_data->len - madt_start, 3, NULL);
 }
 
 /* FADT */
@@ -513,7 +514,7 @@ build_fadt(GArray *table_data, GArray *linker, unsigned 
dsdt)
sizeof fadt->dsdt);
 
 build_header(linker, table_data,
- (void *)fadt, "FACP", sizeof(*fadt), 5);
+ (void *)fadt, "FACP", sizeof(*fadt), 5, NULL);
 }
 
 /* DSDT */
@@ -546,7 +547,7 @@ build_dsdt(GArray *table_data, GArray *linker, 
VirtGuestInfo *guest_info)
 g_array_append_vals(table_data, dsdt->buf->data, dsdt->buf->len);
 build_header(linker, table_data,
 (void *)(table_data->data + table_data->len - dsdt->buf->len),
-"DSDT", dsdt->buf->len, 2);
+"DSDT", dsdt->buf->len, 2, NULL);
 free_aml_allocator();
 }
 
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 95e0c65..215b58c 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -361,7 +361,7 @@ build_fadt(GArray *table_data, GArray *linker, AcpiPmInfo 
*pm,
 fadt_setup(fadt, pm);
 
 build_header(linker, table_data,
- (void *)fadt, "FACP", sizeof(*fadt), 1);
+ (void *)fadt, "FACP", sizeof(*fadt), 1, NULL);
 }
 
 static void
@@ -431,7 +431,7 @@ build_madt(GArray *table_data, GArray *linker, AcpiCpuInfo 
*cpu,
 
 build_header(linker, table_data,
  (void *)(table_data->data + madt_start), "APIC",
- table_data->len - madt_start, 1);
+ table_data->len - madt_start, 1, NULL);
 }
 
 /* Assign BSEL property to all 

[PATCH v8 3/5] nvdimm acpi: build ACPI NFIT table

2015-11-16 Thread Xiao Guangrong
NFIT is defined in ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT)

Currently, we only support PMEM mode. Each device has 3 structures:
- SPA structure, defines the PMEM region info

- MEM DEV structure, it has the @handle which is used to associate specified
  ACPI NVDIMM  device we will introduce in later patch.
  Also we can happily ignore the memory device's interleave, as the real
  nvdimm hardware access is hidden behind host

- DCR structure, it defines vendor ID used to associate specified vendor
  nvdimm driver. Since we only implement PMEM mode this time, Command
  window and Data window are not needed

The NVDIMM functionality is controlled by the 'nvdimm-support' parameter,
which is introduced for PIIX4_PM and ICH9-LPC; it is true by default and
false on 2.4 and earlier machine types to keep compatibility

Signed-off-by: Xiao Guangrong 
---
 default-configs/i386-softmmu.mak   |   1 +
 default-configs/x86_64-softmmu.mak |   1 +
 hw/acpi/Makefile.objs  |   1 +
 hw/acpi/ich9.c |  19 ++
 hw/acpi/nvdimm.c   | 382 +
 hw/acpi/piix4.c|   4 +
 hw/i386/acpi-build.c   |   6 +
 include/hw/acpi/ich9.h |   3 +
 include/hw/i386/pc.h   |  12 +-
 include/hw/mem/nvdimm.h|  12 ++
 10 files changed, 440 insertions(+), 1 deletion(-)
 create mode 100644 hw/acpi/nvdimm.c

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index 4c79d3b..53fb517 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -47,6 +47,7 @@ CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
 CONFIG_NVDIMM=y
+CONFIG_ACPI_NVDIMM=y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index e42d2fc..766c27c 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -47,6 +47,7 @@ CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
 CONFIG_NVDIMM=y
+CONFIG_ACPI_NVDIMM=y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
index 7d3230c..095597f 100644
--- a/hw/acpi/Makefile.objs
+++ b/hw/acpi/Makefile.objs
@@ -2,6 +2,7 @@ common-obj-$(CONFIG_ACPI_X86) += core.o piix4.o pcihp.o
 common-obj-$(CONFIG_ACPI_X86_ICH) += ich9.o tco.o
 common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu_hotplug.o
 common-obj-$(CONFIG_ACPI_MEMORY_HOTPLUG) += memory_hotplug.o
+common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
 common-obj-$(CONFIG_ACPI) += acpi_interface.o
 common-obj-$(CONFIG_ACPI) += bios-linker-loader.o
 common-obj-$(CONFIG_ACPI) += aml-build.o
diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c
index 1c7fcfa..275796f 100644
--- a/hw/acpi/ich9.c
+++ b/hw/acpi/ich9.c
@@ -307,6 +307,20 @@ static void ich9_pm_set_memory_hotplug_support(Object 
*obj, bool value,
 s->pm.acpi_memory_hotplug.is_enabled = value;
 }
 
+static bool ich9_pm_get_nvdimm_support(Object *obj, Error **errp)
+{
+ICH9LPCState *s = ICH9_LPC_DEVICE(obj);
+
+return s->pm.nvdimm_acpi_state.is_enabled;
+}
+
+static void ich9_pm_set_nvdimm_support(Object *obj, bool value, Error **errp)
+{
+ICH9LPCState *s = ICH9_LPC_DEVICE(obj);
+
+s->pm.nvdimm_acpi_state.is_enabled = value;
+}
+
 static void ich9_pm_get_disable_s3(Object *obj, Visitor *v,
void *opaque, const char *name,
Error **errp)
@@ -404,6 +418,7 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, 
Error **errp)
 {
 static const uint32_t gpe0_len = ICH9_PMIO_GPE0_LEN;
 pm->acpi_memory_hotplug.is_enabled = true;
+pm->nvdimm_acpi_state.is_enabled = true;
 pm->disable_s3 = 0;
 pm->disable_s4 = 0;
 pm->s4_val = 2;
@@ -419,6 +434,10 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs 
*pm, Error **errp)
  ich9_pm_get_memory_hotplug_support,
  ich9_pm_set_memory_hotplug_support,
  NULL);
+object_property_add_bool(obj, "nvdimm-support",
+ ich9_pm_get_nvdimm_support,
+ ich9_pm_set_nvdimm_support,
+ NULL);
 object_property_add(obj, ACPI_PM_PROP_S3_DISABLED, "uint8",
 ich9_pm_get_disable_s3,
 ich9_pm_set_disable_s3,
diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
new file mode 100644
index 000..98c004d
--- /dev/null
+++ b/hw/acpi/nvdimm.c
@@ -0,0 +1,382 @@
+/*
+ * NVDIMM ACPI Implementation
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ *  Xiao Guangrong 
+ *
+ * NFIT is defined in ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT)
+ * and the DSM specification can be found at:
+ *   http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf

[PATCH v8 5/5] nvdimm: add maintain info

2015-11-16 Thread Xiao Guangrong
Add NVDIMM maintainer

Signed-off-by: Xiao Guangrong 
---
 MAINTAINERS | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9e1fa72..da58bf4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -932,6 +932,13 @@ M: Jiri Pirko 
 S: Maintained
 F: hw/net/rocker/
 
+NVDIMM
+M: Xiao Guangrong 
+S: Maintained
+F: hw/acpi/nvdimm.c
+F: hw/mem/nvdimm.c
+F: include/hw/mem/nvdimm.h
+
 Subsystems
 --
 Audio
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 0/5] implement vNVDIMM

2015-11-16 Thread Xiao Guangrong
This patchset can be found at:
  https://github.com/xiaogr/qemu.git nvdimm-v8

It is based on pci branch on Michael's tree and the top commit is:
commit e3a4e177d9 (migration/ram: fix build on 32 bit hosts).

Changelog in v8:
We split the long patch series into small parts; as you see now, this
is the first part which enables NVDIMM without label data support.

The command line has been changed because some patches simplifying the
things have not been included into this series, you should specify the
file size exactly using the parameters as follows:
   -object memory-backend-file,id=mem1,share,mem-path=/tmp/nvdimm1,size=10G \
   -device nvdimm,memdev=mem1,id=nv1

Changelog in v7:
- changes from Vladimir Sementsov-Ogievskiy's comments:
  1) let gethugepagesize() realize if fstat is failed instead of get
 normal page size
  2) rename  open_file_path to open_ram_file_path
  3) better log the error message by using error_setg_errno
  4) update commit in the commit log to explain hugepage detection on
 Windows

- changes from Eduardo Habkost's comments:
  1) use 'Error**' to collect error message for qemu_file_get_page_size()
  2) move gethugepagesize() replacement to the same patch to make it
 better for review
  3) introduce qemu_get_file_size to unity the code with raw_getlength()

- changes from Stefan's comments:
  1) check the memory region is large enough to contain DSM output
 buffer

- changes from Eric Blake's comments:
  1) update the shell command in the commit log to generate the patch
 which drops 'pc-dimm' prefix
  
- others:
  pick up Reviewed-by from Stefan, Vladimir Sementsov-Ogievskiy, and
  Eric Blake.

Changelog in v6:
- changes from Stefan's comments:
  1) fix code style of struct naming by CamelCase way
  2) fix offset + length overflow when read/write label data
  3) compile hw/acpi/nvdimm.c for per target so that TARGET_PAGE_SIZE can
 be used to replace getpagesize()

Changelog in v5:
- changes from Michael's comments:
  1) prefix nvdimm_ to everything in NVDIMM source files
  2) make parsing _DSM Arg3 more clear
  3) comment style fix
  5) drop single used definition
  6) fix dirty dsm buffer lost due to memory write happened on host
  7) check dsm buffer if it is big enough to contain input data
  8) use build_append_int_noprefix to store single value to GArray

- changes from Michael's and Igor's comments:
  1) introduce 'nvdimm-support' parameter to control nvdimm
 enablement and it is disabled for 2.4 and its earlier versions
 to make live migration compatible
  2) only reserve 1 RAM page and 4 bytes IO Port for NVDIMM ACPI
 virtualization

- changes from Stefan's comments:
  1) do endian adjustment for the buffer length

- changes from Bharata B Rao's comments:
  1) fix compile on ppc

- others:
  1) the buffer length is directly got from IO read rather than got
 from dsm memory
  2) fix dirty label data lost due to memory write happened on host

Changelog in v4:
- changes from Michael's comments:
  1) show the message, "Memory is not allocated from HugeTlbfs", if file
 based memory is not allocated from hugetlbfs.
  2) introduce function, acpi_get_nvdimm_state(), to get NVDIMMState
 from Machine.
  3) statically define UUID and make its operation more clear
  4) use GArray to build device structures to avoid potential buffer
 overflow
  4) improve comments in the code
  5) improve code style

- changes from Igor's comments:
  1) add NVDIMM ACPI spec document
  2) use serialized method to avoid Mutex
  3) move NVDIMM ACPI's code to hw/acpi/nvdimm.c
  4) introduce a common ASL method used by _DSM for all devices to reduce
 ACPI size
  5) handle UUID in ACPI AML code. BTW, i'd keep handling revision in QEMU
 it's better to upgrade QEMU to support Rev2 in the future

- changes from Stefan's comments:
  1) copy input data from DSM memory to local buffer to avoid potential
 issues as DSM memory is visible to guest. Output data is handled
 in a similar way

- changes from Dan's comments:
  1) drop static namespace as Linux has already supported label-less
 nvdimm devices

- changes from Vladimir's comments:
  1) print better message, "failed to get file size for %s, can't create
 backend on it", if any file operation failed to obtain file size

- others:
  create a git repo on github.com for better review/test

Also, thanks for Eric Blake's review on QAPI's side.

Thank all of you to review this patchset.

Changelog in v3:
There is huge change in this version, thank Igor, Stefan, Paolo, Eduardo,
Michael for their valuable comments, the patchset finally gets better shape.
- changes from Igor's comments:
  1) abstract dimm device type from pc-dimm and create nvdimm device based on
 dimm, then it uses memory backend device as nvdimm's memory and NUMA has
 easily been implemented.
  2) let file-backend device support any kind of filesystem not only for
 hugetlbfs and let it work on file not only for 

[PATCH v8 1/5] nvdimm: implement NVDIMM device abstract

2015-11-16 Thread Xiao Guangrong
Introduce "nvdimm" device which is based on pc-dimm device type

Currently, nothing is specific for nvdimm but hotplug is disabled

Signed-off-by: Xiao Guangrong 
---
 default-configs/i386-softmmu.mak   |  1 +
 default-configs/x86_64-softmmu.mak |  1 +
 hw/acpi/memory_hotplug.c   |  5 +
 hw/mem/Makefile.objs   |  1 +
 hw/mem/nvdimm.c| 46 ++
 include/hw/mem/nvdimm.h| 29 
 6 files changed, 83 insertions(+)
 create mode 100644 hw/mem/nvdimm.c
 create mode 100644 include/hw/mem/nvdimm.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index 43c96d1..4c79d3b 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -46,6 +46,7 @@ CONFIG_APIC=y
 CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
+CONFIG_NVDIMM=y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index dfb8095..e42d2fc 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -46,6 +46,7 @@ CONFIG_APIC=y
 CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
+CONFIG_NVDIMM=y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c
index ce428df..8b6662b 100644
--- a/hw/acpi/memory_hotplug.c
+++ b/hw/acpi/memory_hotplug.c
@@ -230,6 +230,11 @@ void acpi_memory_plug_cb(ACPIREGS *ar, qemu_irq irq, 
MemHotplugState *mem_st,
  DeviceState *dev, Error **errp)
 {
 MemStatus *mdev;
+DeviceClass *dc = DEVICE_GET_CLASS(dev);
+
+if (!dc->hotpluggable) {
+return;
+}
 
 mdev = acpi_memory_slot_status(mem_st, dev, errp);
 if (!mdev) {
diff --git a/hw/mem/Makefile.objs b/hw/mem/Makefile.objs
index b000fb4..f12f8b9 100644
--- a/hw/mem/Makefile.objs
+++ b/hw/mem/Makefile.objs
@@ -1 +1,2 @@
 common-obj-$(CONFIG_MEM_HOTPLUG) += pc-dimm.o
+common-obj-$(CONFIG_NVDIMM) += nvdimm.o
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
new file mode 100644
index 000..4fd397f
--- /dev/null
+++ b/hw/mem/nvdimm.c
@@ -0,0 +1,46 @@
+/*
+ * Non-Volatile Dual In-line Memory Module Virtualization Implementation
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ *  Xiao Guangrong 
+ *
+ * Currently, it only supports PMEM Virtualization.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ */
+
+#include "hw/mem/nvdimm.h"
+
+static void nvdimm_class_init(ObjectClass *oc, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(oc);
+
+/* nvdimm hotplug has not been supported yet. */
+dc->hotpluggable = false;
+}
+
+static TypeInfo nvdimm_info = {
+.name  = TYPE_NVDIMM,
+.parent= TYPE_PC_DIMM,
+.class_init= nvdimm_class_init,
+};
+
+static void nvdimm_register_types(void)
+{
+type_register_static(&nvdimm_info);
+}
+
+type_init(nvdimm_register_types)
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
new file mode 100644
index 000..dbfa8d6
--- /dev/null
+++ b/include/hw/mem/nvdimm.h
@@ -0,0 +1,29 @@
+/*
+ * Non-Volatile Dual In-line Memory Module Virtualization Implementation
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ *  Xiao Guangrong 
+ *
+ * NVDIMM specifications and some documents can be found at:
+ * NVDIMM ACPI device and NFIT are introduced in ACPI 6:
+ *  http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
+ * NVDIMM Namespace specification:
+ *  http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf
+ * DSM Interface Example:
+ *  http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf
+ * Driver Writer's Guide:
+ *  http://pmem.io/documents/NVDIMM_Driver_Writers_Guide.pdf
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_NVDIMM_H
+#define QEMU_NVDIMM_H
+
+#include "hw/mem/pc-dimm.h"
+
+#define TYPE_NVDIMM  "nvdimm"
+#endif
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 0/2] target-i386: save/restore vcpu's TSC rate during migration

2015-11-16 Thread Haozhong Zhang
This patchset enables QEMU to save/restore vcpu's TSC rate during the
migration on machine types pc-*-2.5 or newer.

On the source machine:
 * If the vcpu's TSC rate is specified by the cpu option 'tsc-freq',
   then this user-specified TSC rate will be migrated.
 * Otherwise, the TSC rate returned by KVM_GET_TSC_KHZ will be
   migrated. For a fresh VM, this is the host TSC rate.

On the destination machine:
 * If the vcpu's TSC rate has been specified by the cpu option
   'tsc-freq' and is inconsistent with the migrated TSC rate, then
   the migration will be aborted.
 * Otherwise, QEMU will try to use the migrated TSC rate. If KVM on
   the destination supports TSC scaling, guest programs will observe a
   consistent TSC rate across the migration. If TSC scaling is not
   supported, the migration will not be aborted and QEMU will behave
   like before, i.e using the host TSC rate instead.

Changes in v4:
 * Make all code x86 specific.
 * Abort the migration if the user-specified TSC rate is inconsistent
   with the migrated TSC rate.
 * Move the sanity check to cpu_post_load().
 * All KVM_SET_TSC_KHZ and save/restore use env->tsc_khz.
 * Replace env->tsc_khz_saved with env->user_tsc_khz, and only use the
   latter for sanity check.

Changes in v3:
 * Change the cpu option 'save-tsc-freq' to an internal flag.
 * Remove the cpu option 'load-tsc-freq' and change the logic of
   loading the migrated TSC rate as above.
 * Move the setup of migrated TSC rate back to
   do_kvm_cpu_synchronize_post_init().

Changes in v2:
 * Add a pair of cpu options 'save-tsc-freq' and 'load-tsc-freq' to
   control the migration of vcpu's TSC rate.
 * Move all logic of setting TSC rate to target-i386.
 * Remove the duplicated TSC setup in kvm_arch_init_vcpu().

Haozhong Zhang (2):
  target-i386: fallback vcpu's TSC rate to value returned by KVM
  target-i386: add support to migrate vcpu's TSC rate

 hw/i386/pc.c  |  1 +
 hw/i386/pc_piix.c |  1 +
 hw/i386/pc_q35.c  |  1 +
 include/hw/i386/pc.h  |  1 +
 target-i386/cpu.c |  2 +-
 target-i386/cpu.h |  1 +
 target-i386/kvm.c | 47 +++
 target-i386/machine.c | 28 
 8 files changed, 81 insertions(+), 1 deletion(-)

-- 
2.6.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 1/2] target-i386: fallback vcpu's TSC rate to value returned by KVM

2015-11-16 Thread Haozhong Zhang
If no user-specified TSC rate is present, we will try to set
env->tsc_khz to the value returned by KVM_GET_TSC_KHZ.

Signed-off-by: Haozhong Zhang 
---
 target-i386/kvm.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 2a9953b..9084b29 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -2327,6 +2327,27 @@ static int kvm_get_debugregs(X86CPU *cpu)
 return 0;
 }
 
+static void kvm_arch_set_tsc_khz(CPUState *cs)
+{
+X86CPU *cpu = X86_CPU(cs);
+CPUX86State *env = &cpu->env;
+int r;
+
+/*
+ * If no user-specified TSC frequency is present, we will try to
+ * set env->tsc_khz to the value used by KVM.
+ */
+if (!env->tsc_khz) {
+r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
+kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
+if (r < 0) {
+error_report("warning: KVM_GET_TSC_KHZ failed");
+} else {
+env->tsc_khz = r;
+}
+}
+}
+
 int kvm_arch_put_registers(CPUState *cpu, int level)
 {
 X86CPU *x86_cpu = X86_CPU(cpu);
@@ -2341,6 +2362,10 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
 }
 }
 
+if (level == KVM_PUT_FULL_STATE) {
+kvm_arch_set_tsc_khz(cpu);
+}
+
 ret = kvm_getput_regs(x86_cpu, 1);
 if (ret < 0) {
 return ret;
-- 
2.4.8

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 2/2] target-i386: add support to migrate vcpu's TSC rate

2015-11-16 Thread Haozhong Zhang
This patch enables migrating vcpu's TSC rate. If KVM on the destination
machine supports TSC scaling, guest programs will observe a consistent
TSC rate across the migration.

If TSC scaling is not supported on the destination machine, the
migration will not be aborted and QEMU on the destination will not set
vcpu's TSC rate to the migrated value.

If vcpu's TSC rate specified by CPU option 'tsc-freq' on the destination
machine is inconsistent with the migrated TSC rate, the migration will
be aborted.

For backwards compatibility, the migration of vcpu's TSC rate is
disabled on pc-*-2.4 and older machine types.

Signed-off-by: Haozhong Zhang 
---
 hw/i386/pc.c  |  1 +
 hw/i386/pc_piix.c |  1 +
 hw/i386/pc_q35.c  |  1 +
 include/hw/i386/pc.h  |  1 +
 target-i386/cpu.c |  2 +-
 target-i386/cpu.h |  1 +
 target-i386/kvm.c | 26 --
 target-i386/machine.c | 28 
 8 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 0cb8afd..2f2fc93 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1952,6 +1952,7 @@ static void pc_machine_class_init(ObjectClass *oc, void 
*data)
 HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
 
 pcmc->get_hotplug_handler = mc->get_hotplug_handler;
+pcmc->save_tsc_khz = true;
 mc->get_hotplug_handler = pc_get_hotpug_handler;
 mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id;
 mc->default_boot_order = "cad";
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 07d0baa..7c5b0d2 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -489,6 +489,7 @@ static void pc_i440fx_2_4_machine_options(MachineClass *m)
 m->alias = NULL;
 m->is_default = 0;
 pcmc->broken_reserved_end = true;
+pcmc->save_tsc_khz = false;
 SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
 }
 
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 0fdae09..fd8efe3 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -387,6 +387,7 @@ static void pc_q35_2_4_machine_options(MachineClass *m)
 m->hw_version = "2.4.0";
 m->alias = NULL;
 pcmc->broken_reserved_end = true;
+pcmc->save_tsc_khz = false;
 SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
 }
 
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 4bbc0ff..fea0f28 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -60,6 +60,7 @@ struct PCMachineClass {
 
 /*< public >*/
 bool broken_reserved_end;
+bool save_tsc_khz;
 HotplugHandler *(*get_hotplug_handler)(MachineState *machine,
DeviceState *dev);
 };
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index e5f1c5b..98c6a4c 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -1724,7 +1724,7 @@ static void x86_cpuid_set_tsc_freq(Object *obj, Visitor 
*v, void *opaque,
 return;
 }
 
-cpu->env.tsc_khz = value / 1000;
+cpu->env.tsc_khz = cpu->env.user_tsc_khz = value / 1000;
 }
 
 static void x86_cpuid_get_apic_id(Object *obj, Visitor *v, void *opaque,
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index fc4a605..1ad1da8 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -973,6 +973,7 @@ typedef struct CPUX86State {
 uint32_t sipi_vector;
 bool tsc_valid;
 int64_t tsc_khz;
+int64_t user_tsc_khz;
 void *kvm_xsave_buf;
 
 uint64_t mcg_cap;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 9084b29..8448248 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -2334,8 +2334,30 @@ static void kvm_arch_set_tsc_khz(CPUState *cs)
 int r;
 
 /*
- * If no user-specified TSC frequency is present, we will try to
- * set env->tsc_khz to the value used by KVM.
+ * For other cases of env->tsc_khz and env->user_tsc_khz:
+ *
+ * - We have eliminated all cases that satisfy
+ *   env->tsc_khz && env->user_tsc_khz &&
+ *   env->tsc_khz != env->user_tsc_khz
+ *   in cpu_post_load().
+ *
+ * - If env->user_tsc_khz is not zero, then it must be equal to
+ *   env->tsc_khz (if the latter is not zero) and has been set in
+ *   kvm_arch_init_vcpu().
+ */
+if (env->tsc_khz && !env->user_tsc_khz) {
+r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
+kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) : -ENOTSUP;
+if (r < 0) {
+error_report("warning: TSC frequency mismatch between VM and host, 
"
+ "and TSC scaling unavailable");
+}
+}
+
+/*
+ * If neither the user-specified TSC frequency nor the migrated
+ * TSC frequency is present, we will try to set env->tsc_khz
+ * to the value used by KVM.
  */
 if (!env->tsc_khz) {
 r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
diff --git a/target-i386/machine.c b/target-i386/machine.c
index a18e16e..eb062d2 100644
--- a/target-i386/machine.c
+++