Re: [PATCH] kvm: x86: fix comment about {mmu,nested_mmu}.gva_to_gpa

2016-01-07 Thread Paolo Bonzini


On 30/12/2015 17:26, David Matlack wrote:
> The comment had the meaning of mmu.gva_to_gpa and nested_mmu.gva_to_gpa
> swapped. Fix that, and also add some details describing how each translation
> works.
> 
> Signed-off-by: David Matlack 
> ---
>  arch/x86/kvm/mmu.c | 10 ++
>  1 file changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index e7c2c14..098a9c2 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -4058,10 +4058,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
>   g_context->inject_page_fault = kvm_inject_page_fault;
>  
>   /*
> -  * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
> -  * translation of l2_gpa to l1_gpa addresses is done using the
> -  * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
> -  * functions between mmu and nested_mmu are swapped.
> +  * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
> +  * L1's nested page tables (e.g. EPT12). The nested translation
> +  * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
> +  * L2's page tables as the first level of translation and L1's
> +  * nested page tables as the second level of translation. Basically
> +  * the gva_to_gpa functions between mmu and nested_mmu are swapped.
>*/
>   if (!is_paging(vcpu)) {
>   g_context->nx = false;
> 

Applied, thanks.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 09/20] KVM: ARM64: Add access handler for event counter register

2016-01-07 Thread Marc Zyngier
On 22/12/15 08:08, Shannon Zhao wrote:
> From: Shannon Zhao 
> 
> These kind of registers include PMEVCNTRn, PMCCNTR and PMXEVCNTR which
> is mapped to PMEVCNTRn.
> 
> The access handler translates all aarch32 register offsets to aarch64
> ones and uses vcpu_sys_reg() to access their values to avoid taking care
> of big endian.
> 
> When reading these registers, return the sum of register value and the
> value perf event counts.
> 
> Signed-off-by: Shannon Zhao 
> ---
>  arch/arm64/kvm/sys_regs.c | 138 
> --
>  1 file changed, 134 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index ed2939b..1818947 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -569,6 +569,57 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, 
> struct sys_reg_params *p,
>   return true;
>  }
>  
> +static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
> +   struct sys_reg_params *p,
> +   const struct sys_reg_desc *r)
> +{
> + u64 idx, reg, val;
> +
> + if (!p->is_aarch32) {
> + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 2)
> + /* PMXEVCNTR_EL0 */
> + reg = 0;
> + else
> + /* PMEVCNTRn_EL0 or PMCCNTR_EL0 */
> + reg = r->reg;
> + } else {
> + if (r->CRn == 9 && r->CRm == 13) {
> + reg = (r->Op2 & 2) ? 0 : PMCCNTR_EL0;
> + } else {
> + reg = ((r->CRm & 3) << 3) | (r->Op2 & 7);
> + reg += PMEVCNTR0_EL0;
> + }
> + }

Same remark about the use of 0 instead of PMSELR_EL0.

> +
> + switch (reg) {
> + case PMEVCNTR0_EL0 ... PMEVCNTR30_EL0:
> + idx = reg - PMEVCNTR0_EL0;
> + if (!pmu_counter_idx_valid(vcpu, idx))
> + return true;
> + break;
> + case PMCCNTR_EL0:
> + idx = ARMV8_CYCLE_IDX;
> + break;
> + default:
> + /* PMXEVCNTR_EL0 */
> + idx = vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_COUNTER_MASK;
> + if (!pmu_counter_idx_valid(vcpu, idx))
> + return true;
> +
> + reg = (idx == ARMV8_CYCLE_IDX) ? PMCCNTR_EL0
> +  : PMEVCNTR0_EL0 + idx;
> + break;
> + }
> +
> + val = kvm_pmu_get_counter_value(vcpu, idx);
> + if (p->is_write)
> + vcpu_sys_reg(vcpu, reg) += (s64)p->regval - val;
> + else
> + p->regval = val;
> +
> + return true;
> +}
> +
>  /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */
>  #define DBG_BCR_BVR_WCR_WVR_EL1(n)   \
>   /* DBGBVRn_EL1 */   \
> @@ -584,6 +635,13 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, 
> struct sys_reg_params *p,
>   { Op0(0b10), Op1(0b000), CRn(0b), CRm((n)), Op2(0b111), \
> trap_wcr, reset_wcr, n, 0,  get_wcr, set_wcr }
>  
> +/* Macro to expand the PMEVCNTRn_EL0 register */
> +#define PMU_PMEVCNTR_EL0(n)  \
> + /* PMEVCNTRn_EL0 */ \
> + { Op0(0b11), Op1(0b011), CRn(0b1110),   \
> +   CRm((0b1000 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \
> +   access_pmu_evcntr, reset_unknown, (PMEVCNTR0_EL0 + n), }
> +
>  /* Macro to expand the PMEVTYPERn_EL0 register */
>  #define PMU_PMEVTYPER_EL0(n) \
>   /* PMEVTYPERn_EL0 */\
> @@ -784,13 +842,13 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> access_pmceid },
>   /* PMCCNTR_EL0 */
>   { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b000),
> -   trap_raz_wi },
> +   access_pmu_evcntr, reset_unknown, PMCCNTR_EL0 },
>   /* PMXEVTYPER_EL0 */
>   { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b001),
> access_pmu_evtyper },
>   /* PMXEVCNTR_EL0 */
>   { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b010),
> -   trap_raz_wi },
> +   access_pmu_evcntr },
>   /* PMUSERENR_EL0 */
>   { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b000),
> trap_raz_wi },
> @@ -805,6 +863,38 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>   { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b), Op2(0b011),
> NULL, reset_unknown, TPIDRRO_EL0 },
>  
> + /* PMEVCNTRn_EL0 */
> + PMU_PMEVCNTR_EL0(0),
> + PMU_PMEVCNTR_EL0(1),
> + PMU_PMEVCNTR_EL0(2),
> + PMU_PMEVCNTR_EL0(3),
> + PMU_PMEVCNTR_EL0(4),
> + PMU_PMEVCNTR_EL0(5),
> + PMU_PMEVCNTR_EL0(6),
> + PMU_PMEVCNTR_EL0(7),
> 

[PATCH] arm64: KVM: Fix AArch64 guest userspace exception injection

2016-01-07 Thread Marc Zyngier
At the moment, our fault injection is pretty limited. We always
generate a SYNC exception into EL1, as if the fault was actually
from EL1h, no matter how it was generated.

This is obviously wrong, as EL0 can generate faults of its own
(not to mention the pretty-much unused EL1t mode).

This patch fixes it by implementing section D1.10.2 of the ARMv8 ARM,
and in particular table D1-7 ("Vector offsets from vector table base
address"), which describes which vector to use depending on the source
exception level and type (synchronous, IRQ, FIQ or SError).

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/inject_fault.c | 38 +++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 648112e..4d1ac81 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -27,7 +27,11 @@
 
 #define PSTATE_FAULT_BITS_64   (PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | \
 PSR_I_BIT | PSR_D_BIT)
-#define EL1_EXCEPT_SYNC_OFFSET 0x200
+
+#define CURRENT_EL_SP_EL0_VECTOR   0x0
+#define CURRENT_EL_SP_ELx_VECTOR   0x200
+#define LOWER_EL_AArch64_VECTOR0x400
+#define LOWER_EL_AArch32_VECTOR0x600
 
 static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
 {
@@ -97,6 +101,34 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool 
is_pabt,
*fsr = 0x14;
 }
 
+enum exception_type {
+   except_type_sync= 0,
+   except_type_irq = 0x80,
+   except_type_fiq = 0x100,
+   except_type_serror  = 0x180,
+};
+
+static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type)
+{
+   u64 exc_offset;
+
+   switch (*vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT)) {
+   case PSR_MODE_EL1t:
+   exc_offset = CURRENT_EL_SP_EL0_VECTOR;
+   break;
+   case PSR_MODE_EL1h:
+   exc_offset = CURRENT_EL_SP_ELx_VECTOR;
+   break;
+   case PSR_MODE_EL0t:
+   exc_offset = LOWER_EL_AArch64_VECTOR;
+   break;
+   default:
+   exc_offset = LOWER_EL_AArch32_VECTOR;
+   }
+
+   return vcpu_sys_reg(vcpu, VBAR_EL1) + exc_offset + type;
+}
+
 static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long 
addr)
 {
unsigned long cpsr = *vcpu_cpsr(vcpu);
@@ -108,8 +140,8 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool 
is_iabt, unsigned long addr
*vcpu_spsr(vcpu) = cpsr;
*vcpu_elr_el1(vcpu) = *vcpu_pc(vcpu);
 
+   *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
*vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
-   *vcpu_pc(vcpu) = vcpu_sys_reg(vcpu, VBAR_EL1) + EL1_EXCEPT_SYNC_OFFSET;
 
vcpu_sys_reg(vcpu, FAR_EL1) = addr;
 
@@ -143,8 +175,8 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
*vcpu_spsr(vcpu) = cpsr;
*vcpu_elr_el1(vcpu) = *vcpu_pc(vcpu);
 
+   *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
*vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
-   *vcpu_pc(vcpu) = vcpu_sys_reg(vcpu, VBAR_EL1) + EL1_EXCEPT_SYNC_OFFSET;
 
/*
 * Build an unknown exception, depending on the instruction
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PULL] KVM/ARM updates for 4.5

2016-01-07 Thread Paolo Bonzini


On 24/12/2015 12:12, Marc Zyngier wrote:
> Hi Paolo,
> 
> This is the first pull request for the 4.5 merge window. Not much in
> terms of features, but a rewrite of our 64bit world switch, making it
> a lot nicer, maintainable, and much more likely to cope with things
> like VHE. Also support 16bit VMIDs for systems that need to run that
> many VMs concurrently.
> 
> I was really hoping that the PMU code would make it this time around,
> but it got slightly delayed, and the holiday season didn't help. If
> we're lucky enough (read: if all known issues have been addressed), I
> may send you another pull request early in the new year.
> 
> In the mean time, please pull!

Pulled, thanks.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


KVM pci-assign - iommu width is not sufficient for mapped address

2016-01-07 Thread Shyam
Hi All,

We are using Linux Kernel 3.18.19 for running KVM VM's with
pci-assign'ed SRIOV VF interfaces.

We understand that VFIO is the new recommended way, but unfortunately
it reduces performance significantly on our IO workloads (up to the
order of 40-50%) when compared to pci-passthrough. We run trusted VMs
& expose services to the external world. Since we control the VMs,
IOMMU security with VFIO is not exactly mandatory, but performance —
which we get with pci-assign — is much more important.

We observe a strange behaviour that has already been discussed in this
forum which is upon a VM spawn it causes the following fault resulting
in qemu-kvm crashing

Jan  7 09:41:57 q6-s1 kernel: [90037.228477] intel_iommu_map: iommu
width (48) is not sufficient for the mapped address (fe001000)
Jan  7 09:41:57 q6-s1 kernel: [90037.308229]
kvm_iommu_map_address:iommu failed to map pfn=95000

We observe that this problem happens only if a guest Linux running a 3.5
kernel is spun up & this problem doesn't happen when running guest
Linux with a 3.6 kernel (i.e. all guests with kernels like 3.2 etc. up
till 3.5 cause the above crash, whereas any guest kernel >=3.6 doesn't
cause this issue).

So something changed between kernel 3.5 and 3.6 in the guest that
doesn't expose this problem. We have two questions:
1 - we understand that VFIO suffered a similar problem & it was fixed
with 
https://github.com/qemu/qemu/commit/d3a2fd9b29e43e202315d5e99399b99622469c4a.
Alex Williamson suggested that KVM driver needs an equivalent version
of the fix. Can anybody suggest hints on where this fix should be
made?
2 - Any insights on what changed in the Linux kernel between 3.5 and 3.6 on
the guest side that avoids this problem?

Any help/input greatly appreciated. Thanks!

--Shyam
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm/s390: drop unpaired smp_mb

2016-01-07 Thread Christian Borntraeger
On 01/05/2016 05:20 PM, Michael S. Tsirkin wrote:
> smp_mb on vcpu destroy isn't paired with anything, violating pairing
> rules, and seems to be useless.
> 
> Drop it.
> 
> Signed-off-by: Michael S. Tsirkin 

Applied.
(I had to fix this up a little to match kvm/next)

Thanks

> ---
> 
> Untested.
> 
>  arch/s390/kvm/kvm-s390.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 8465892..7305d2c 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -1195,7 +1195,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
>   (__u64) vcpu->arch.sie_block)
>   vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
>   }
> - smp_mb();
> 
>   if (kvm_is_ucontrol(vcpu->kvm))
>   gmap_free(vcpu->arch.gmap);
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND] kvm:x86:Make sure kvm_write_guest successes for first call in kvm_write_wall_clock

2016-01-07 Thread Paolo Bonzini


On 30/12/2015 19:08, Nicholas Krause wrote:
> This makes sure that kvm_write_guest successes for the first call
> in order to make sure that the wall clock is successfully written
> to the host system before being calucated as required by the
> guest system.
> 
> Signed-off-by: Nicholas Krause 
> ---
>  arch/x86/kvm/x86.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index eed3228..7551f30 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1167,7 +1167,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t 
> wall_clock)
>  
>   ++version;
>  
> - kvm_write_guest(kvm, wall_clock, , sizeof(version));
> + if (kvm_write_guest(kvm, wall_clock, , sizeof(version)))
> + return;
>  
>   /*
>* The guest calculates current wall clock time by adding
> 

Applied, thanks.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] How to reserve guest physical region for ACPI

2016-01-07 Thread Igor Mammedov
On Tue, 5 Jan 2016 18:43:02 +0200
"Michael S. Tsirkin"  wrote:

> On Tue, Jan 05, 2016 at 05:30:25PM +0100, Igor Mammedov wrote:
> > > > bios-linker-loader is a great interface for initializing some
> > > > guest owned data and linking it together but I think it adds
> > > > unnecessary complexity and is misused if it's used to handle
> > > > device owned data/on device memory in this and VMGID cases.
> > > 
> > > I want a generic interface for guest to enumerate these things.  linker
> > > seems quite reasonable but if you see a reason why it won't do, or want
> > > to propose a better interface, fine.
> > > 
> > > PCI would do, too - though windows guys had concerns about
> > > returning PCI BARs from ACPI.  
> > There were potential issues with pSeries bootloader that treated
> > PCI_CLASS_MEMORY_RAM as conventional RAM but it was fixed.
> > Could you point out to discussion about windows issues?
> > 
> > What VMGEN patches that used PCI for mapping purposes were
> > stuck at, was that it was suggested to use PCI_CLASS_MEMORY_RAM
> > class id but we couldn't agree on it.
> > 
> > VMGEN v13 with full discussion is here
> > https://patchwork.ozlabs.org/patch/443554/
> > So to continue with this route we would need to pick some other
> > driver less class id so windows won't prompt for driver or
> > maybe supply our own driver stub to guarantee that no one
> > would touch it. Any suggestions?  
> 
> Pick any device/vendor id pair for which windows specifies no driver.
> There's a small risk that this will conflict with some
> guest but I think it's minimal.
The device/vendor id pair was QEMU specific so it doesn't conflict with
anything. The issue we were trying to solve was to prevent Windows from
asking for a driver, even though it does so only once if told not to ask
again.

That's why PCI_CLASS_MEMORY_RAM was selected, as it's a generic driver-less
device descriptor in the INF file which matches as the last resort if
there isn't any other driver that matched the device by its device/vendor
id pair.

> 
> 
> > > 
> > >   
> > > > There was RFC on list to make BIOS boot from NVDIMM already
> > > > doing some ACPI table lookup/parsing. Now if they were forced
> > > > to also parse and execute AML to initialize QEMU with guest
> > > > allocated address that would complicate them quite a bit.
> > > 
> > > If they just need to find a table by name, it won't be
> > > too bad, would it?  
> > that's what they were doing scanning memory for static NVDIMM table.
> > However if it were DataTable, BIOS side would have to execute
> > AML so that the table address could be told to QEMU.  
> 
> Not at all. You can find any table by its signature without
> parsing AML.
yep, and then BIOS would need to tell its address to QEMU
writing to IO port which is allocated statically in QEMU
for this purpose and is described in AML only on guest side.

> 
> 
> > In case of direct mapping or PCI BAR there is no need to initialize
> > QEMU side from AML.
> > That also saves us IO port where this address should be written
> > if bios-linker-loader approach is used.
> >   
> > >   
> > > > While with NVDIMM control memory region mapped directly by QEMU,
> > > > respective patches don't need in any way to initialize QEMU,
> > > > all they would need just read necessary data from control region.
> > > > 
> > > > Also using bios-linker-loader takes away some usable RAM
> > > > from guest and in the end that doesn't scale,
> > > > the more devices I add the less usable RAM is left for guest OS
> > > > while all the device needs is a piece of GPA address space
> > > > that would belong to it.
> > > 
> > > I don't get this comment. I don't think it's MMIO that is wanted.
> > > If it's backed by qemu virtual memory then it's RAM.  
> > Then why don't allocate video card VRAM the same way and try to explain
> > user that a guest started with '-m 128 -device cirrus-vga,vgamem_mb=64Mb'
> > only has 64Mb of available RAM because of we think that on device VRAM
> > is also RAM.
> > 
> > Maybe I've used MMIO term wrongly here but it roughly reflects the idea
> > that on device memory (whether it's VRAM, NVDIMM control block or VMGEN
> > area) is not allocated from guest's usable RAM (as described in E820)
> > but rather directly mapped in guest's GPA and doesn't consume available
> > RAM as guest sees it. That's also the way it's done on real hardware.
> > 
> > What we need in case of VMGEN ID and NVDIMM is on device memory
> > that could be directly accessed by guest.
> > Both direct mapping or PCI BAR do that job and we could use simple
> > static AML without any patching.  
> 
> At least with VMGEN the issue is that there's an AML method
> that returns the physical address.
> Then if guest OS moves the BAR (which is legal), it will break
> since caller has no way to know it's related to the BAR.
I've found the following MS doc "Firmware Allocation of PCI Device Resources in 
Windows". It looks like when MS implemented resource rebalancing in

Re: [PATCH v8 04/20] KVM: ARM64: Add access handler for PMCR register

2016-01-07 Thread Marc Zyngier
On 22/12/15 08:07, Shannon Zhao wrote:
> From: Shannon Zhao 
> 
> Add reset handler which gets host value of PMCR_EL0 and make writable
> bits architecturally UNKNOWN except PMCR.E which is zero. Add an access
> handler for PMCR.
> 
> Signed-off-by: Shannon Zhao 
> ---
>  arch/arm64/kvm/sys_regs.c | 39 +--
>  1 file changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index e8bf374..c60047e 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -34,6 +34,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  
> @@ -439,6 +440,40 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const 
> struct sys_reg_desc *r)
>   vcpu_sys_reg(vcpu, MPIDR_EL1) = (1ULL << 31) | mpidr;
>  }
>  
> +static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
> +{
> + u64 pmcr, val;
> +
> + asm volatile("mrs %0, pmcr_el0\n" : "=r" (pmcr));
> + /* Writable bits of PMCR_EL0 (ARMV8_PMCR_MASK) is reset to UNKNOWN
> +  * except PMCR.E resetting to zero.
> +  */
> + val = ((pmcr & ~ARMV8_PMCR_MASK) | (ARMV8_PMCR_MASK & 0xdecafbad))
> +   & (~ARMV8_PMCR_E);
> + vcpu_sys_reg(vcpu, r->reg) = val;
> +}
> +
> +static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> + const struct sys_reg_desc *r)
> +{
> + u64 val;
> +
> + if (p->is_write) {
> + /* Only update writeable bits of PMCR */
> + val = vcpu_sys_reg(vcpu, r->reg);
> + val &= ~ARMV8_PMCR_MASK;
> + val |= p->regval & ARMV8_PMCR_MASK;
> + vcpu_sys_reg(vcpu, r->reg) = val;
> + } else {
> + /* PMCR.P & PMCR.C are RAZ */
> + val = vcpu_sys_reg(vcpu, r->reg)
> +   & ~(ARMV8_PMCR_P | ARMV8_PMCR_C);
> + p->regval = val;
> + }

How can that work for 32bit, where r->reg is not populated from the trap
table? You *know* that you are accessing PMCR, so just use PMCR_EL0 as
an index into vcpu_sys_reg() in all cases. You can then drop PMCR_EL0
from the 64bit trap table entry.

> +
> + return true;
> +}
> +
>  /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */
>  #define DBG_BCR_BVR_WCR_WVR_EL1(n)   \
>   /* DBGBVRn_EL1 */   \
> @@ -623,7 +658,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>  
>   /* PMCR_EL0 */
>   { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b000),
> -   trap_raz_wi },
> +   access_pmcr, reset_pmcr, PMCR_EL0, },
>   /* PMCNTENSET_EL0 */
>   { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b001),
> trap_raz_wi },
> @@ -885,7 +920,7 @@ static const struct sys_reg_desc cp15_regs[] = {
>   { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw },
>  
>   /* PMU */
> - { Op1( 0), CRn( 9), CRm(12), Op2( 0), trap_raz_wi },
> + { Op1( 0), CRn( 9), CRm(12), Op2( 0), access_pmcr },
>   { Op1( 0), CRn( 9), CRm(12), Op2( 1), trap_raz_wi },
>   { Op1( 0), CRn( 9), CRm(12), Op2( 2), trap_raz_wi },
>   { Op1( 0), CRn( 9), CRm(12), Op2( 3), trap_raz_wi },
> 

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 11/20] KVM: ARM64: Add access handler for PMINTENSET and PMINTENCLR register

2016-01-07 Thread Marc Zyngier
On 22/12/15 08:08, Shannon Zhao wrote:
> From: Shannon Zhao 
> 
> Since the reset value of PMINTENSET and PMINTENCLR is UNKNOWN, use
> reset_unknown for its reset handler. Add a handler to emulate writing
> PMINTENSET or PMINTENCLR register.
> 
> Signed-off-by: Shannon Zhao 
> ---
>  arch/arm64/kvm/sys_regs.c | 27 +++
>  1 file changed, 23 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 3416881..24ce4fe 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -644,6 +644,25 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct 
> sys_reg_params *p,
>   return true;
>  }
>  
> +static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> +const struct sys_reg_desc *r)
> +{
> + u64 mask = kvm_pmu_valid_counter_mask(vcpu);
> +
> + if (p->is_write) {
> + if (r->Op2 & 0x1)
> + /* accessing PMINTENSET_EL1 */
> + vcpu_sys_reg(vcpu, r->reg) |= (p->regval & mask);
> + else
> + /* accessing PMINTENCLR_EL1 */
> + vcpu_sys_reg(vcpu, r->reg) &= ~(p->regval & mask);
> + } else {
> + p->regval = vcpu_sys_reg(vcpu, r->reg) & mask;
> + }
> +
> + return true;
> +}
> +

Same bug again.

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] KVM: Remove KVM_REQ_MCLOCK_INPROGRESS to save a bit in vcpu->requests

2016-01-07 Thread Takuya Yoshikawa
Now that we are running out of the bits in vcpu->requests, using one
of them just to call kvm_make_all_cpus_request() with a valid request
number should be avoided.

This patch achieves this by making kvm_make_all_cpus_request() handle
an empty request.

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/x86.c   |  2 --
 include/linux/kvm_host.h | 27 +--
 virt/kvm/kvm_main.c  |  8 +---
 3 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b6102c1..88260d0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1701,8 +1701,6 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
/* guest entries allowed */
-   kvm_for_each_vcpu(i, vcpu, kvm)
-   clear_bit(KVM_REQ_MCLOCK_INPROGRESS, >requests);
 
spin_unlock(>pvclock_gtod_sync_lock);
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ca9b93e..bb9ae4f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -131,19 +131,18 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_PMI   15
 #define KVM_REQ_WATCHDOG  16
 #define KVM_REQ_MASTERCLOCK_UPDATE 17
-#define KVM_REQ_MCLOCK_INPROGRESS 18
-#define KVM_REQ_EPR_EXIT  19
-#define KVM_REQ_SCAN_IOAPIC   20
-#define KVM_REQ_GLOBAL_CLOCK_UPDATE 21
-#define KVM_REQ_ENABLE_IBS22
-#define KVM_REQ_DISABLE_IBS   23
-#define KVM_REQ_APIC_PAGE_RELOAD  24
-#define KVM_REQ_SMI   25
-#define KVM_REQ_HV_CRASH  26
-#define KVM_REQ_IOAPIC_EOI_EXIT   27
-#define KVM_REQ_HV_RESET  28
-#define KVM_REQ_HV_EXIT   29
-#define KVM_REQ_HV_STIMER 30
+#define KVM_REQ_EPR_EXIT  18
+#define KVM_REQ_SCAN_IOAPIC   19
+#define KVM_REQ_GLOBAL_CLOCK_UPDATE 20
+#define KVM_REQ_ENABLE_IBS21
+#define KVM_REQ_DISABLE_IBS   22
+#define KVM_REQ_APIC_PAGE_RELOAD  23
+#define KVM_REQ_SMI   24
+#define KVM_REQ_HV_CRASH  25
+#define KVM_REQ_IOAPIC_EOI_EXIT   26
+#define KVM_REQ_HV_RESET  27
+#define KVM_REQ_HV_EXIT   28
+#define KVM_REQ_HV_STIMER 29
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID   1
@@ -685,7 +684,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 void kvm_make_mclock_inprogress_request(struct kvm *kvm);
 void kvm_make_scan_ioapic_request(struct kvm *kvm);
-bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
+bool kvm_make_all_cpus_request(struct kvm *kvm, int req);
 
 long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index be3cef1..0100a19 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -156,7 +156,7 @@ static void ack_flush(void *_completed)
 {
 }
 
-bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
+bool kvm_make_all_cpus_request(struct kvm *kvm, int req)
 {
int i, cpu, me;
cpumask_var_t cpus;
@@ -167,7 +167,9 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned 
int req)
 
me = get_cpu();
kvm_for_each_vcpu(i, vcpu, kvm) {
-   kvm_make_request(req, vcpu);
+   if (req >= 0)
+   kvm_make_request(req, vcpu);
+
cpu = vcpu->cpu;
 
/* Set ->requests bit before we read ->mode */
@@ -208,7 +210,7 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
 
 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
 {
-   kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+   kvm_make_all_cpus_request(kvm, -1);
 }
 
 void kvm_make_scan_ioapic_request(struct kvm *kvm)
-- 
2.1.0



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 16/20] KVM: ARM64: Add access handler for PMUSERENR register

2016-01-07 Thread Shannon Zhao


On 2016/1/7 18:14, Marc Zyngier wrote:
> On 22/12/15 08:08, Shannon Zhao wrote:
>> > From: Shannon Zhao 
>> > 
>> > This register resets as unknown in 64bit mode while it resets as zero
>> > in 32bit mode. Here we choose to reset it as zero for consistency.
>> > 
>> > PMUSERENR_EL0 holds some bits which decide whether PMU registers can be
>> > accessed from EL0. Add some check helpers to handle the access from EL0.
>> > 
>> > When these bits are zero, only reading PMUSERENR will trap to EL2 and
>> > writing PMUSERENR or reading/writing other PMU registers will trap to
>> > EL1 other than EL2 when HCR.TGE==0. To current KVM configuration
>> > (HCR.TGE==0) there is no way to get these traps. Here we write 0xf to
>> > physical PMUSERENR register on VM entry, so that it will trap PMU access
>> > from EL0 to EL2. Within the register access handler we check the real
>> > value of guest PMUSERENR register to decide whether this access is
>> > allowed. If not allowed, forward this trap to EL1.
>> > 
>> > Signed-off-by: Shannon Zhao 
>> > ---
>> >  arch/arm64/include/asm/pmu.h |   9 
>> >  arch/arm64/kvm/hyp/switch.c  |   3 ++
>> >  arch/arm64/kvm/sys_regs.c| 122 
>> > +--
>> >  3 files changed, 129 insertions(+), 5 deletions(-)
>> > 
>> > diff --git a/arch/arm64/include/asm/pmu.h b/arch/arm64/include/asm/pmu.h
>> > index 2588f9c..1238ade 100644
>> > --- a/arch/arm64/include/asm/pmu.h
>> > +++ b/arch/arm64/include/asm/pmu.h
>> > @@ -67,4 +67,13 @@
>> >  #define   ARMV8_EXCLUDE_EL0   (1 << 30)
>> >  #define   ARMV8_INCLUDE_EL2   (1 << 27)
>> >  
>> > +/*
>> > + * PMUSERENR: user enable reg
>> > + */
>> > +#define ARMV8_USERENR_MASK0xf /* Mask for writable 
>> > bits */
>> > +#define ARMV8_USERENR_EN  (1 << 0) /* PMU regs can be accessed at EL0 */
>> > +#define ARMV8_USERENR_SW  (1 << 1) /* PMSWINC can be written at EL0 */
>> > +#define ARMV8_USERENR_CR  (1 << 2) /* Cycle counter can be read at EL0 */
>> > +#define ARMV8_USERENR_ER  (1 << 3) /* Event counter can be read at EL0 */
>> > +
>> >  #endif /* __ASM_PMU_H */
>> > diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
>> > index ca8f5a5..a85375f 100644
>> > --- a/arch/arm64/kvm/hyp/switch.c
>> > +++ b/arch/arm64/kvm/hyp/switch.c
>> > @@ -37,6 +37,8 @@ static void __hyp_text __activate_traps(struct kvm_vcpu 
>> > *vcpu)
>> >/* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
>> >write_sysreg(1 << 15, hstr_el2);
>> >write_sysreg(CPTR_EL2_TTA | CPTR_EL2_TFP, cptr_el2);
>> > +  /* Make sure we trap PMU access from EL0 to EL2 */
>> > +  write_sysreg(15, pmuserenr_el0);
> Please use the ARMV8_USERENR_* constants here instead of a magic number
> (since you went through the hassle of defining them!).
> 
Ok.

>> >write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
>> >  }
>> >  
>> > @@ -45,6 +47,7 @@ static void __hyp_text __deactivate_traps(struct 
>> > kvm_vcpu *vcpu)
>> >write_sysreg(HCR_RW, hcr_el2);
>> >write_sysreg(0, hstr_el2);
>> >write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2);
>> > +  write_sysreg(0, pmuserenr_el0);
>> >write_sysreg(0, cptr_el2);
>> >  }
>> >  
>> > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> > index 04281f1..ac0cbf8 100644
>> > --- a/arch/arm64/kvm/sys_regs.c
>> > +++ b/arch/arm64/kvm/sys_regs.c
>> > @@ -453,11 +453,47 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const 
>> > struct sys_reg_desc *r)
>> >vcpu_sys_reg(vcpu, r->reg) = val;
>> >  }
>> >  
>> > +static inline bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu)
> Please drop all the inline attributes. The compiler knows its stuff well
> enough to do it automagically, and this is hardly a fast path...
> 
>> > +{
>> > +  u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
>> > +
>> > +  return !((reg & ARMV8_USERENR_EN) || vcpu_mode_priv(vcpu));
>> > +}
>> > +
>> > +static inline bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu)
>> > +{
>> > +  u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
>> > +
>> > +  return !((reg & (ARMV8_USERENR_SW | ARMV8_USERENR_EN))
>> > +   || vcpu_mode_priv(vcpu));
>> > +}
>> > +
>> > +static inline bool pmu_access_cycle_counter_el0_disabled(struct kvm_vcpu 
>> > *vcpu)
>> > +{
>> > +  u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
>> > +
>> > +  return !((reg & (ARMV8_USERENR_CR | ARMV8_USERENR_EN))
>> > +   || vcpu_mode_priv(vcpu));
>> > +}
>> > +
>> > +static inline bool pmu_access_event_counter_el0_disabled(struct kvm_vcpu 
>> > *vcpu)
>> > +{
>> > +  u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
>> > +
>> > +  return !((reg & (ARMV8_USERENR_ER | ARMV8_USERENR_EN))
>> > +   || vcpu_mode_priv(vcpu));
>> > +}
>> > +
>> >  static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
>> >const struct sys_reg_desc *r)
>> >  {
>> >u64 val;
>> >  
>> > +  if 

Re: [PATCH v8 08/20] KVM: ARM64: Add access handler for event typer register

2016-01-07 Thread Shannon Zhao


On 2015/12/22 16:08, Shannon Zhao wrote:
> From: Shannon Zhao 
> 
> These kind of registers include PMEVTYPERn, PMCCFILTR and PMXEVTYPER
> which is mapped to PMEVTYPERn or PMCCFILTR.
> 
> The access handler translates all aarch32 register offsets to aarch64
> ones and uses vcpu_sys_reg() to access their values to avoid taking care
> of big endian.
> 
> When writing to these registers, create a perf_event for the selected
> event type.
> 
> Signed-off-by: Shannon Zhao 
> ---
>  arch/arm64/kvm/sys_regs.c | 156 
> +-
>  1 file changed, 154 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 2552db1..ed2939b 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -505,6 +505,70 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct 
> sys_reg_params *p,
>   return true;
>  }
>  
> +static inline bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
> +{
> + u64 pmcr, val;
> +
> + pmcr = vcpu_sys_reg(vcpu, PMCR_EL0);
> + val = (pmcr >> ARMV8_PMCR_N_SHIFT) & ARMV8_PMCR_N_MASK;
> + if (idx >= val && idx != ARMV8_CYCLE_IDX)
> + return false;
> +
> + return true;
> +}
> +
> +static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params 
> *p,
> +const struct sys_reg_desc *r)
> +{
> + u64 idx, reg;
> +
> + if (r->CRn == 9) {
> + /* PMXEVTYPER_EL0 */
> + reg = 0;
> + } else {
> + if (!p->is_aarch32) {
> + /* PMEVTYPERn_EL0 or PMCCFILTR_EL0 */
> + reg = r->reg;
> + } else {
> + if (r->CRn == 14 && r->CRm == 15 && r->Op2 == 7) {
> + reg = PMCCFILTR_EL0;
> + } else {
> + reg = ((r->CRm & 3) << 3) | (r->Op2 & 7);
> + reg += PMEVTYPER0_EL0;
> + }
> + }
> + }
> +
> + switch (reg) {
> + case PMEVTYPER0_EL0 ... PMEVTYPER30_EL0:
> + idx = reg - PMEVTYPER0_EL0;
> + if (!pmu_counter_idx_valid(vcpu, idx))
> + return true;
Hi Marc,

Here should we return false to inject an UND since there is no
PMEVTYPER(idx)_EL0? The ARMv8 spec says it should.

Thanks,
-- 
Shannon

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 01/20] ARM64: Move PMU register related defines to asm/pmu.h

2016-01-07 Thread Marc Zyngier
On 22/12/15 08:07, Shannon Zhao wrote:
> From: Shannon Zhao 
> 
> To use the ARMv8 PMU related register defines from the KVM code,
> we move the relevant definitions to asm/pmu.h header file.
> 
> Signed-off-by: Anup Patel 
> Signed-off-by: Shannon Zhao 

Acked-by: Marc Zyngier 

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 06/20] KVM: ARM64: Add access handler for PMCEID0 and PMCEID1 register

2016-01-07 Thread Marc Zyngier
On 22/12/15 08:08, Shannon Zhao wrote:
> From: Shannon Zhao 
> 
> Add access handler which gets host value of PMCEID0 or PMCEID1 when
> guest access these registers. Writing action to PMCEID0 or PMCEID1 is
> UNDEFINED.
> 
> Signed-off-by: Shannon Zhao 
> ---
>  arch/arm64/kvm/sys_regs.c | 27 +++
>  1 file changed, 23 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index f9985fc..2552db1 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -486,6 +486,25 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct 
> sys_reg_params *p,
>   return true;
>  }
>  
> +static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> +   const struct sys_reg_desc *r)
> +{
> + u64 pmceid;
> +
> + if (p->is_write) {
> + kvm_inject_undefined(vcpu);

Just "return false", which will do the right thing.

> + } else {
> + if (!(p->Op2 & 1))
> + asm volatile("mrs %0, pmceid0_el0\n" : "=r" (pmceid));
> + else
> + asm volatile("mrs %0, pmceid1_el0\n" : "=r" (pmceid));
> +
> + p->regval = pmceid;
> + }
> +
> + return true;
> +}
> +
>  /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */
>  #define DBG_BCR_BVR_WCR_WVR_EL1(n)   \
>   /* DBGBVRn_EL1 */   \
> @@ -688,10 +707,10 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> access_pmselr, reset_unknown, PMSELR_EL0 },
>   /* PMCEID0_EL0 */
>   { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b110),
> -   trap_raz_wi },
> +   access_pmceid },
>   /* PMCEID1_EL0 */
>   { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b111),
> -   trap_raz_wi },
> +   access_pmceid },
>   /* PMCCNTR_EL0 */
>   { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b000),
> trap_raz_wi },
> @@ -937,8 +956,8 @@ static const struct sys_reg_desc cp15_regs[] = {
>   { Op1( 0), CRn( 9), CRm(12), Op2( 2), trap_raz_wi },
>   { Op1( 0), CRn( 9), CRm(12), Op2( 3), trap_raz_wi },
>   { Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmselr },
> - { Op1( 0), CRn( 9), CRm(12), Op2( 6), trap_raz_wi },
> - { Op1( 0), CRn( 9), CRm(12), Op2( 7), trap_raz_wi },
> + { Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmceid },
> + { Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmceid },
>   { Op1( 0), CRn( 9), CRm(13), Op2( 0), trap_raz_wi },
>   { Op1( 0), CRn( 9), CRm(13), Op2( 1), trap_raz_wi },
>   { Op1( 0), CRn( 9), CRm(13), Op2( 2), trap_raz_wi },
> 

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 05/20] KVM: ARM64: Add access handler for PMSELR register

2016-01-07 Thread Marc Zyngier
On 22/12/15 08:08, Shannon Zhao wrote:
> From: Shannon Zhao 
> 
> Since the reset value of PMSELR_EL0 is UNKNOWN, use reset_unknown for
> its reset handler. When reading PMSELR, return the PMSELR.SEL field to
> guest.
> 
> Signed-off-by: Shannon Zhao 
> ---
>  arch/arm64/kvm/sys_regs.c | 16 ++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index c60047e..f9985fc 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -474,6 +474,18 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct 
> sys_reg_params *p,
>   return true;
>  }
>  
> +static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> +   const struct sys_reg_desc *r)
> +{
> + if (p->is_write)
> + vcpu_sys_reg(vcpu, r->reg) = p->regval;
> + else
> + /* return PMSELR.SEL field */
> + p->regval = vcpu_sys_reg(vcpu, r->reg) & ARMV8_COUNTER_MASK;

Same 32bit bug again.

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 08/20] KVM: ARM64: Add access handler for event typer register

2016-01-07 Thread Shannon Zhao


On 2016/1/7 19:03, Marc Zyngier wrote:
> On 22/12/15 08:08, Shannon Zhao wrote:
>> > From: Shannon Zhao 
>> > 
>> > These kind of registers include PMEVTYPERn, PMCCFILTR and PMXEVTYPER
>> > which is mapped to PMEVTYPERn or PMCCFILTR.
>> > 
>> > The access handler translates all aarch32 register offsets to aarch64
>> > ones and uses vcpu_sys_reg() to access their values to avoid taking care
>> > of big endian.
>> > 
>> > When writing to these registers, create a perf_event for the selected
>> > event type.
>> > 
>> > Signed-off-by: Shannon Zhao 
>> > ---
>> >  arch/arm64/kvm/sys_regs.c | 156 
>> > +-
>> >  1 file changed, 154 insertions(+), 2 deletions(-)
>> > 
>> > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> > index 2552db1..ed2939b 100644
>> > --- a/arch/arm64/kvm/sys_regs.c
>> > +++ b/arch/arm64/kvm/sys_regs.c
>> > @@ -505,6 +505,70 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, 
>> > struct sys_reg_params *p,
>> >return true;
>> >  }
>> >  
>> > +static inline bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
>> > +{
>> > +  u64 pmcr, val;
>> > +
>> > +  pmcr = vcpu_sys_reg(vcpu, PMCR_EL0);
>> > +  val = (pmcr >> ARMV8_PMCR_N_SHIFT) & ARMV8_PMCR_N_MASK;
>> > +  if (idx >= val && idx != ARMV8_CYCLE_IDX)
>> > +  return false;
>> > +
>> > +  return true;
>> > +}
>> > +
>> > +static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct 
>> > sys_reg_params *p,
>> > + const struct sys_reg_desc *r)
>> > +{
>> > +  u64 idx, reg;
>> > +
>> > +  if (r->CRn == 9) {
>> > +  /* PMXEVTYPER_EL0 */
>> > +  reg = 0;
> Is there any particular reason why you're not setting reg to PMSELR_EL0,
> since this is what you're using?
> 
>> > +  } else {
>> > +  if (!p->is_aarch32) {
>> > +  /* PMEVTYPERn_EL0 or PMCCFILTR_EL0 */
>> > +  reg = r->reg;
>> > +  } else {
>> > +  if (r->CRn == 14 && r->CRm == 15 && r->Op2 == 7) {
>> > +  reg = PMCCFILTR_EL0;
>> > +  } else {
>> > +  reg = ((r->CRm & 3) << 3) | (r->Op2 & 7);
>> > +  reg += PMEVTYPER0_EL0;
>> > +  }
>> > +  }
>> > +  }
>> > +
>> > +  switch (reg) {
>> > +  case PMEVTYPER0_EL0 ... PMEVTYPER30_EL0:
>> > +  idx = reg - PMEVTYPER0_EL0;
>> > +  if (!pmu_counter_idx_valid(vcpu, idx))
>> > +  return true;
>> > +  break;
>> > +  case PMCCFILTR_EL0:
>> > +  idx = ARMV8_CYCLE_IDX;
>> > +  break;
>> > +  default:
> This would allow this case to be more precise, and we could have the
> default case as a bug handler.
> 
Ah, you're right. Will fix this.

Thanks,
-- 
Shannon

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 13/20] KVM: ARM64: Add access handler for PMSWINC register

2016-01-07 Thread Marc Zyngier
On 22/12/15 08:08, Shannon Zhao wrote:
> From: Shannon Zhao 
> 
> Add access handler which emulates writing and reading PMSWINC
> register and add support for creating software increment event.
> 
> Signed-off-by: Shannon Zhao 
> ---
>  arch/arm64/kvm/sys_regs.c | 18 +-
>  include/kvm/arm_pmu.h |  2 ++
>  virt/kvm/arm/pmu.c| 33 +
>  3 files changed, 52 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index d61f271dd..92021dc 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -682,6 +682,21 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct 
> sys_reg_params *p,
>   return true;
>  }
>  
> +static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> +const struct sys_reg_desc *r)
> +{
> + u64 mask;
> +
> + if (p->is_write) {
> + mask = kvm_pmu_valid_counter_mask(vcpu);
> + kvm_pmu_software_increment(vcpu, p->regval & mask);
> + } else {
> + kvm_inject_undefined(vcpu);

"return false;" instead.

> + }
> +
> + return true;
> +}
> +
>  /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */
>  #define DBG_BCR_BVR_WCR_WVR_EL1(n)   \
>   /* DBGBVRn_EL1 */   \
> @@ -892,7 +907,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> access_pmovs, NULL, PMOVSSET_EL0 },
>   /* PMSWINC_EL0 */
>   { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b100),
> -   trap_raz_wi },
> +   access_pmswinc, reset_unknown, PMSWINC_EL0 },
>   /* PMSELR_EL0 */
>   { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b101),
> access_pmselr, reset_unknown, PMSELR_EL0 },
> @@ -1231,6 +1246,7 @@ static const struct sys_reg_desc cp15_regs[] = {
>   { Op1( 0), CRn( 9), CRm(12), Op2( 1), access_pmcnten },
>   { Op1( 0), CRn( 9), CRm(12), Op2( 2), access_pmcnten },
>   { Op1( 0), CRn( 9), CRm(12), Op2( 3), access_pmovs },
> + { Op1( 0), CRn( 9), CRm(12), Op2( 4), access_pmswinc },
>   { Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmselr },
>   { Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmceid },
>   { Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmceid },
> diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
> index 244970b..67d168c 100644
> --- a/include/kvm/arm_pmu.h
> +++ b/include/kvm/arm_pmu.h
> @@ -40,6 +40,7 @@ u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu);
>  void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u64 val);
>  void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val);
>  void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val);
> +void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val);
>  void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
>   u64 select_idx);
>  #else
> @@ -57,6 +58,7 @@ u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
>  void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u64 val) {}
>  void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val) {}
>  void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val) {}
> +void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {}
>  void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
>   u64 select_idx) {}
>  #endif
> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
> index c23d57e..409f3c4 100644
> --- a/virt/kvm/arm/pmu.c
> +++ b/virt/kvm/arm/pmu.c
> @@ -160,6 +160,35 @@ void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val)
>   kvm_vcpu_kick(vcpu);
>  }
>  
> +/**
> + * kvm_pmu_software_increment - do software increment
> + * @vcpu: The vcpu pointer
> + * @val: the value guest writes to PMSWINC register
> + */
> +void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
> +{
> + int i;
> + u64 type, enable, reg;
> +
> + if (val == 0)
> + return;
> +
> + for (i = 0; i < ARMV8_CYCLE_IDX; i++) {
> + if (!(val & BIT(i)))
> + continue;
> + type = vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i)
> +& ARMV8_EVTYPE_EVENT;
> + enable = vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
> + if ((type == 0) && (enable & BIT(i))) {

nit: Should we have a ARMV8_EVTYPE_EVENT_SW_INCR instead of just
checking for zero?

> + reg = vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
> + reg = lower_32_bits(reg);
> + vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
> + if (!reg)
> + kvm_pmu_overflow_set(vcpu, BIT(i));
> + }
> + }
> +}
> +
>  static inline bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu,
>

Re: [PATCH v8 16/20] KVM: ARM64: Add access handler for PMUSERENR register

2016-01-07 Thread Marc Zyngier
On 22/12/15 08:08, Shannon Zhao wrote:
> From: Shannon Zhao 
> 
> This register resets as unknown in 64bit mode while it resets as zero
> in 32bit mode. Here we choose to reset it as zero for consistency.
> 
> PMUSERENR_EL0 holds some bits which decide whether PMU registers can be
> accessed from EL0. Add some check helpers to handle the access from EL0.
> 
> When these bits are zero, only reading PMUSERENR will trap to EL2 and
> writing PMUSERENR or reading/writing other PMU registers will trap to
> EL1 other than EL2 when HCR.TGE==0. To current KVM configuration
> (HCR.TGE==0) there is no way to get these traps. Here we write 0xf to
> physical PMUSERENR register on VM entry, so that it will trap PMU access
> from EL0 to EL2. Within the register access handler we check the real
> value of guest PMUSERENR register to decide whether this access is
> allowed. If not allowed, forward this trap to EL1.
> 
> Signed-off-by: Shannon Zhao 
> ---
>  arch/arm64/include/asm/pmu.h |   9 
>  arch/arm64/kvm/hyp/switch.c  |   3 ++
>  arch/arm64/kvm/sys_regs.c| 122 
> +--
>  3 files changed, 129 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/pmu.h b/arch/arm64/include/asm/pmu.h
> index 2588f9c..1238ade 100644
> --- a/arch/arm64/include/asm/pmu.h
> +++ b/arch/arm64/include/asm/pmu.h
> @@ -67,4 +67,13 @@
>  #define  ARMV8_EXCLUDE_EL0   (1 << 30)
>  #define  ARMV8_INCLUDE_EL2   (1 << 27)
>  
> +/*
> + * PMUSERENR: user enable reg
> + */
> +#define ARMV8_USERENR_MASK   0xf /* Mask for writable bits */
> +#define ARMV8_USERENR_EN (1 << 0) /* PMU regs can be accessed at EL0 */
> +#define ARMV8_USERENR_SW (1 << 1) /* PMSWINC can be written at EL0 */
> +#define ARMV8_USERENR_CR (1 << 2) /* Cycle counter can be read at EL0 */
> +#define ARMV8_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */
> +
>  #endif /* __ASM_PMU_H */
> diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
> index ca8f5a5..a85375f 100644
> --- a/arch/arm64/kvm/hyp/switch.c
> +++ b/arch/arm64/kvm/hyp/switch.c
> @@ -37,6 +37,8 @@ static void __hyp_text __activate_traps(struct kvm_vcpu 
> *vcpu)
>   /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
>   write_sysreg(1 << 15, hstr_el2);
>   write_sysreg(CPTR_EL2_TTA | CPTR_EL2_TFP, cptr_el2);
> + /* Make sure we trap PMU access from EL0 to EL2 */
> + write_sysreg(15, pmuserenr_el0);

Please use the ARMV8_USERENR_* constants here instead of a magic number
(since you went through the hassle of defining them!).

>   write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
>  }
>  
> @@ -45,6 +47,7 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu 
> *vcpu)
>   write_sysreg(HCR_RW, hcr_el2);
>   write_sysreg(0, hstr_el2);
>   write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2);
> + write_sysreg(0, pmuserenr_el0);
>   write_sysreg(0, cptr_el2);
>  }
>  
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 04281f1..ac0cbf8 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -453,11 +453,47 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const 
> struct sys_reg_desc *r)
>   vcpu_sys_reg(vcpu, r->reg) = val;
>  }
>  
> +static inline bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu)

Please drop all the inline attributes. The compiler knows its stuff well
enough to do it automagically, and this is hardly a fast path...

> +{
> + u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
> +
> + return !((reg & ARMV8_USERENR_EN) || vcpu_mode_priv(vcpu));
> +}
> +
> +static inline bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu)
> +{
> + u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
> +
> + return !((reg & (ARMV8_USERENR_SW | ARMV8_USERENR_EN))
> +  || vcpu_mode_priv(vcpu));
> +}
> +
> +static inline bool pmu_access_cycle_counter_el0_disabled(struct kvm_vcpu 
> *vcpu)
> +{
> + u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
> +
> + return !((reg & (ARMV8_USERENR_CR | ARMV8_USERENR_EN))
> +  || vcpu_mode_priv(vcpu));
> +}
> +
> +static inline bool pmu_access_event_counter_el0_disabled(struct kvm_vcpu 
> *vcpu)
> +{
> + u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
> +
> + return !((reg & (ARMV8_USERENR_ER | ARMV8_USERENR_EN))
> +  || vcpu_mode_priv(vcpu));
> +}
> +
>  static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
>   const struct sys_reg_desc *r)
>  {
>   u64 val;
>  
> + if (pmu_access_el0_disabled(vcpu)) {
> + kvm_forward_trap_to_el1(vcpu);
> + return true;
> + }

So with the patch I posted earlier
(http://www.spinics.net/lists/arm-kernel/msg472693.html), all the
instances similar to that code can be rewritten as

+   if (pmu_access_el0_disabled(vcpu))
+   return 

Re: [PATCH v8 03/20] KVM: ARM64: Add offset defines for PMU registers

2016-01-07 Thread Marc Zyngier
On 22/12/15 08:07, Shannon Zhao wrote:
> From: Shannon Zhao 
> 
> We are about to trap and emulate accesses to each PMU register
> individually. This adds the context offsets for the AArch64 PMU
> registers.
> 
> Signed-off-by: Shannon Zhao 

Reviewed-by: Marc Zyngier 

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] How to reserve guest physical region for ACPI

2016-01-07 Thread Michael S. Tsirkin
On Thu, Jan 07, 2016 at 11:30:25AM +0100, Igor Mammedov wrote:
> On Tue, 5 Jan 2016 18:43:02 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > On Tue, Jan 05, 2016 at 05:30:25PM +0100, Igor Mammedov wrote:
> > > > > bios-linker-loader is a great interface for initializing some
> > > > > guest owned data and linking it together but I think it adds
> > > > > unnecessary complexity and is misused if it's used to handle
> > > > > device owned data/on device memory in this and VMGID cases.
> > > > 
> > > > I want a generic interface for guest to enumerate these things.  linker
> > > > seems quite reasonable but if you see a reason why it won't do, or want
> > > > to propose a better interface, fine.
> > > > 
> > > > PCI would do, too - though windows guys had concerns about
> > > > returning PCI BARs from ACPI.  
> > > There were potential issues with pSeries bootloader that treated
> > > PCI_CLASS_MEMORY_RAM as conventional RAM but it was fixed.
> > > Could you point out to discussion about windows issues?
> > > 
> > > What VMGEN patches that used PCI for mapping purposes were
> > > stuck at, was that it was suggested to use PCI_CLASS_MEMORY_RAM
> > > class id but we couldn't agree on it.
> > > 
> > > VMGEN v13 with full discussion is here
> > > https://patchwork.ozlabs.org/patch/443554/
> > > So to continue with this route we would need to pick some other
> > > driver less class id so windows won't prompt for driver or
> > > maybe supply our own driver stub to guarantee that no one
> > > would touch it. Any suggestions?  
> > 
> > Pick any device/vendor id pair for which windows specifies no driver.
> > There's a small risk that this will conflict with some
> > guest but I think it's minimal.
> device/vendor id pair was QEMU specific so doesn't conflicts with anything
> issue we were trying to solve was to prevent windows asking for driver
> even though it does so only once if told not to ask again.
> 
> That's why PCI_CLASS_MEMORY_RAM was selected as it's generic driver-less
> device descriptor in INF file which matches as the last resort if
> there isn't any other diver that's matched device by device/vendor id pair.

I think this is the only class in this inf.
If you can't use it, you must use an existing device/vendor id pair,
there's some risk involved but probably not much.

> > 
> > 
> > > > 
> > > >   
> > > > > There was RFC on list to make BIOS boot from NVDIMM already
> > > > > doing some ACPI table lookup/parsing. Now if they were forced
> > > > > to also parse and execute AML to initialize QEMU with guest
> > > > > allocated address that would complicate them quite a bit.
> > > > 
> > > > If they just need to find a table by name, it won't be
> > > > too bad, would it?  
> > > that's what they were doing scanning memory for static NVDIMM table.
> > > However if it were DataTable, BIOS side would have to execute
> > > AML so that the table address could be told to QEMU.  
> > 
> > Not at all. You can find any table by its signature without
> > parsing AML.
> yep, and then BIOS would need to tell its address to QEMU
> writing to IO port which is allocated statically in QEMU
> for this purpose and is described in AML only on guest side.

io ports are an ABI too but they are way easier to
maintain.

> > 
> > 
> > > In case of direct mapping or PCI BAR there is no need to initialize
> > > QEMU side from AML.
> > > That also saves us IO port where this address should be written
> > > if bios-linker-loader approach is used.
> > >   
> > > >   
> > > > > While with NVDIMM control memory region mapped directly by QEMU,
> > > > > respective patches don't need in any way to initialize QEMU,
> > > > > all they would need just read necessary data from control region.
> > > > > 
> > > > > Also using bios-linker-loader takes away some usable RAM
> > > > > from guest and in the end that doesn't scale,
> > > > > the more devices I add the less usable RAM is left for guest OS
> > > > > while all the device needs is a piece of GPA address space
> > > > > that would belong to it.
> > > > 
> > > > I don't get this comment. I don't think it's MMIO that is wanted.
> > > > If it's backed by qemu virtual memory then it's RAM.  
> > > Then why don't allocate video card VRAM the same way and try to explain
> > > user that a guest started with '-m 128 -device cirrus-vga,vgamem_mb=64Mb'
> > > only has 64Mb of available RAM because of we think that on device VRAM
> > > is also RAM.
> > > 
> > > Maybe I've used MMIO term wrongly here but it roughly reflects the idea
> > > that on device memory (whether it's VRAM, NVDIMM control block or VMGEN
> > > area) is not allocated from guest's usable RAM (as described in E820)
> > > but rather directly mapped in guest's GPA and doesn't consume available
> > > RAM as guest sees it. That's also the way it's done on real hardware.
> > > 
> > > What we need in case of VMGEN ID and NVDIMM is on device memory
> > > that could be directly accessed by guest.
> > > Both 

Re: [PATCH v8 04/20] KVM: ARM64: Add access handler for PMCR register

2016-01-07 Thread Shannon Zhao


On 2016/1/7 18:43, Marc Zyngier wrote:
> On 22/12/15 08:07, Shannon Zhao wrote:
>> > From: Shannon Zhao 
>> > 
>> > Add reset handler which gets host value of PMCR_EL0 and make writable
>> > bits architecturally UNKNOWN except PMCR.E which is zero. Add an access
>> > handler for PMCR.
>> > 
>> > Signed-off-by: Shannon Zhao 
>> > ---
>> >  arch/arm64/kvm/sys_regs.c | 39 +--
>> >  1 file changed, 37 insertions(+), 2 deletions(-)
>> > 
>> > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> > index e8bf374..c60047e 100644
>> > --- a/arch/arm64/kvm/sys_regs.c
>> > +++ b/arch/arm64/kvm/sys_regs.c
>> > @@ -34,6 +34,7 @@
>> >  #include 
>> >  #include 
>> >  #include 
>> > +#include 
>> >  
>> >  #include 
>> >  
>> > @@ -439,6 +440,40 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const 
>> > struct sys_reg_desc *r)
>> >vcpu_sys_reg(vcpu, MPIDR_EL1) = (1ULL << 31) | mpidr;
>> >  }
>> >  
>> > +static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc 
>> > *r)
>> > +{
>> > +  u64 pmcr, val;
>> > +
>> > +  asm volatile("mrs %0, pmcr_el0\n" : "=r" (pmcr));
>> > +  /* Writable bits of PMCR_EL0 (ARMV8_PMCR_MASK) is reset to UNKNOWN
>> > +   * except PMCR.E resetting to zero.
>> > +   */
>> > +  val = ((pmcr & ~ARMV8_PMCR_MASK) | (ARMV8_PMCR_MASK & 0xdecafbad))
>> > +& (~ARMV8_PMCR_E);
>> > +  vcpu_sys_reg(vcpu, r->reg) = val;
>> > +}
>> > +
>> > +static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
>> > +  const struct sys_reg_desc *r)
>> > +{
>> > +  u64 val;
>> > +
>> > +  if (p->is_write) {
>> > +  /* Only update writeable bits of PMCR */
>> > +  val = vcpu_sys_reg(vcpu, r->reg);
>> > +  val &= ~ARMV8_PMCR_MASK;
>> > +  val |= p->regval & ARMV8_PMCR_MASK;
>> > +  vcpu_sys_reg(vcpu, r->reg) = val;
>> > +  } else {
>> > +  /* PMCR.P & PMCR.C are RAZ */
>> > +  val = vcpu_sys_reg(vcpu, r->reg)
>> > +& ~(ARMV8_PMCR_P | ARMV8_PMCR_C);
>> > +  p->regval = val;
>> > +  }
> How can that work for 32bit, where r->reg is not populated from the trap
> table? You *know* that you are accessing PMCR, so just use PMCR_EL0 as
> an index into vcpu_sys_reg() in all cases. You can then drop PMCR_EL0
> from the 64bit trap table entry.
> 
Oh, sorry for this bug. Will fix this and those in other places.

Thanks,
-- 
Shannon

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 08/20] KVM: ARM64: Add access handler for event typer register

2016-01-07 Thread Marc Zyngier
On 07/01/16 12:09, Shannon Zhao wrote:
> 
> 
> On 2015/12/22 16:08, Shannon Zhao wrote:
>> From: Shannon Zhao 
>>
>> These kind of registers include PMEVTYPERn, PMCCFILTR and PMXEVTYPER
>> which is mapped to PMEVTYPERn or PMCCFILTR.
>>
>> The access handler translates all aarch32 register offsets to aarch64
>> ones and uses vcpu_sys_reg() to access their values to avoid taking care
>> of big endian.
>>
>> When writing to these registers, create a perf_event for the selected
>> event type.
>>
>> Signed-off-by: Shannon Zhao 
>> ---
>>  arch/arm64/kvm/sys_regs.c | 156 
>> +-
>>  1 file changed, 154 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> index 2552db1..ed2939b 100644
>> --- a/arch/arm64/kvm/sys_regs.c
>> +++ b/arch/arm64/kvm/sys_regs.c
>> @@ -505,6 +505,70 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct 
>> sys_reg_params *p,
>>  return true;
>>  }
>>  
>> +static inline bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
>> +{
>> +u64 pmcr, val;
>> +
>> +pmcr = vcpu_sys_reg(vcpu, PMCR_EL0);
>> +val = (pmcr >> ARMV8_PMCR_N_SHIFT) & ARMV8_PMCR_N_MASK;
>> +if (idx >= val && idx != ARMV8_CYCLE_IDX)
>> +return false;
>> +
>> +return true;
>> +}
>> +
>> +static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params 
>> *p,
>> +   const struct sys_reg_desc *r)
>> +{
>> +u64 idx, reg;
>> +
>> +if (r->CRn == 9) {
>> +/* PMXEVTYPER_EL0 */
>> +reg = 0;
>> +} else {
>> +if (!p->is_aarch32) {
>> +/* PMEVTYPERn_EL0 or PMCCFILTR_EL0 */
>> +reg = r->reg;
>> +} else {
>> +if (r->CRn == 14 && r->CRm == 15 && r->Op2 == 7) {
>> +reg = PMCCFILTR_EL0;
>> +} else {
>> +reg = ((r->CRm & 3) << 3) | (r->Op2 & 7);
>> +reg += PMEVTYPER0_EL0;
>> +}
>> +}
>> +}
>> +
>> +switch (reg) {
>> +case PMEVTYPER0_EL0 ... PMEVTYPER30_EL0:
>> +idx = reg - PMEVTYPER0_EL0;
>> +if (!pmu_counter_idx_valid(vcpu, idx))
>> +return true;
> Hi Marc,
> 
> Here should we return false to inject an UND since there is no
> PMEVTYPER(idx)_EL0? The ARMv8 spec says it should.

The spec says that the following behaviours are valid:
- Accesses to the register are UNDEFINED.
- Accesses to the register behave as RAZ/WI.
- Accesses to the register execute as a NOP.

Same for the counters. So you can either return true (act as a NOP), or
return false (UNDEF), and even zero r->regval on read and return true
(RAZ/WI).

This is entirely up to you. My personal preference is indeed to UNDEF,
but your current implementation is valid.

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/6] nvdimm acpi: let qemu handle _DSM method

2016-01-07 Thread Igor Mammedov
On Tue,  5 Jan 2016 02:52:07 +0800
Xiao Guangrong  wrote:

> If dsm memory is successfully patched, we let qemu fully emulate
> the dsm method
> 
> This patch saves _DSM input parameters into dsm memory, tell dsm
> memory address to QEMU, then fetch the result from the dsm memory
you also need to add NVDR._CRS method that would report
resources used by operation regions.

NVDIMM_COMMON_DSM - probably should be serialized, otherwise
there is a race risk, when several callers would write to
control region.


> 
> Signed-off-by: Xiao Guangrong 
> ---
>  hw/acpi/aml-build.c |  27 ++
>  hw/acpi/nvdimm.c| 124 
> ++--
>  include/hw/acpi/aml-build.h |   2 +
>  3 files changed, 150 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
> index 677c1a6..e65171f 100644
> --- a/hw/acpi/aml-build.c
> +++ b/hw/acpi/aml-build.c
> @@ -1013,6 +1013,19 @@ Aml *create_field_common(int opcode, Aml *srcbuf, Aml 
> *index, const char *name)
>  return var;
>  }
>  
> +/* ACPI 1.0b: 16.2.5.2 Named Objects Encoding: DefCreateField */
> +Aml *aml_create_field(Aml *srcbuf, Aml *index, Aml *len, const char *name)
> +{
> +Aml *var = aml_alloc();
> +build_append_byte(var->buf, 0x5B); /* ExtOpPrefix */
> +build_append_byte(var->buf, 0x13); /* CreateFieldOp */
> +aml_append(var, srcbuf);
> +aml_append(var, index);
> +aml_append(var, len);
> +build_append_namestring(var->buf, "%s", name);
> +return var;
> +}
> +
>  /* ACPI 1.0b: 16.2.5.2 Named Objects Encoding: DefCreateDWordField */
>  Aml *aml_create_dword_field(Aml *srcbuf, Aml *index, const char *name)
>  {
> @@ -1439,6 +1452,20 @@ Aml *aml_alias(const char *source_object, const char 
> *alias_object)
>  return var;
>  }
>  
> +/* ACPI 1.0b: 16.2.5.4 Type 2 Opcodes Encoding: DefConcat */
> +Aml *aml_concatenate(Aml *source1, Aml *source2, Aml *target)
> +{
> +Aml *var = aml_opcode(0x73 /* ConcatOp */);
> +aml_append(var, source1);
> +aml_append(var, source2);
> +
> +if (target) {
> +aml_append(var, target);
> +}
> +
> +return var;
> +}
> +
>  void
>  build_header(GArray *linker, GArray *table_data,
>   AcpiTableHeader *h, const char *sig, int len, uint8_t rev,
> diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
> index a72104c..dfccbc0 100644
> --- a/hw/acpi/nvdimm.c
> +++ b/hw/acpi/nvdimm.c
> @@ -369,6 +369,24 @@ static void nvdimm_build_nfit(GSList *device_list, 
> GArray *table_offsets,
>  g_array_free(structures, true);
>  }
>  
> +struct NvdimmDsmIn {
> +uint32_t handle;
> +uint32_t revision;
> +uint32_t function;
> +   /* the remaining size in the page is used by arg3. */
> +union {
> +uint8_t arg3[0];
> +};
> +} QEMU_PACKED;
> +typedef struct NvdimmDsmIn NvdimmDsmIn;
> +
> +struct NvdimmDsmOut {
> +/* the size of buffer filled by QEMU. */
> +uint32_t len;
> +uint8_t data[0];
> +} QEMU_PACKED;
> +typedef struct NvdimmDsmOut NvdimmDsmOut;
> +
>  static uint64_t
>  nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
>  {
> @@ -408,11 +426,21 @@ void nvdimm_init_acpi_state(AcpiNVDIMMState *state, 
> MemoryRegion *io,
>  
>  static void nvdimm_build_common_dsm(Aml *dev)
>  {
> -Aml *method, *ifctx, *function;
> +Aml *method, *ifctx, *function, *unpatched, *field, *high_dsm_mem;
> +Aml *result_size, *dsm_mem;
>  uint8_t byte_list[1];
>  
>  method = aml_method(NVDIMM_COMMON_DSM, 4, AML_NOTSERIALIZED);
>  function = aml_arg(2);
> +dsm_mem = aml_arg(3);
> +
> +aml_append(method, aml_store(aml_call0(NVDIMM_GET_DSM_MEM), dsm_mem));
> +
> +/*
> + * do not support any method if DSM memory address has not been
> + * patched.
> + */
> +unpatched = aml_if(aml_equal(dsm_mem, aml_int64(0x0)));
>  
>  /*
>   * function 0 is called to inquire what functions are supported by
> @@ -421,12 +449,102 @@ static void nvdimm_build_common_dsm(Aml *dev)
>  ifctx = aml_if(aml_equal(function, aml_int(0)));
>  byte_list[0] = 0 /* No function Supported */;
>  aml_append(ifctx, aml_return(aml_buffer(1, byte_list)));
> -aml_append(method, ifctx);
> +aml_append(unpatched, ifctx);
>  
>  /* No function is supported yet. */
>  byte_list[0] = 1 /* Not Supported */;
> -aml_append(method, aml_return(aml_buffer(1, byte_list)));
> +aml_append(unpatched, aml_return(aml_buffer(1, byte_list)));
> +aml_append(method, unpatched);
> +
> +/* map DSM memory and IO into ACPI namespace. */
> +aml_append(method, aml_operation_region("NPIO", AML_SYSTEM_IO,
> +   aml_int(NVDIMM_ACPI_IO_BASE), NVDIMM_ACPI_IO_LEN));
> +aml_append(method, aml_operation_region("NRAM", AML_SYSTEM_MEMORY,
> +dsm_mem, TARGET_PAGE_SIZE));
> +
> +/*
> + * DSM notifier:
> + * LNTF: write 

Re: [PATCH v8 20/20] KVM: ARM64: Add a new kvm ARM PMU device

2016-01-07 Thread Peter Maydell
On 22 December 2015 at 08:08, Shannon Zhao  wrote:
> From: Shannon Zhao 
>
> Add a new kvm device type KVM_DEV_TYPE_ARM_PMU_V3 for ARM PMU. Implement
> the kvm_device_ops for it.
>
> Signed-off-by: Shannon Zhao 
> ---
>  Documentation/virtual/kvm/devices/arm-pmu.txt |  24 +
>  arch/arm64/include/uapi/asm/kvm.h |   4 +
>  include/linux/kvm_host.h  |   1 +
>  include/uapi/linux/kvm.h  |   2 +
>  virt/kvm/arm/pmu.c| 128 
> ++
>  virt/kvm/kvm_main.c   |   4 +
>  6 files changed, 163 insertions(+)
>  create mode 100644 Documentation/virtual/kvm/devices/arm-pmu.txt
>
> diff --git a/Documentation/virtual/kvm/devices/arm-pmu.txt 
> b/Documentation/virtual/kvm/devices/arm-pmu.txt
> new file mode 100644
> index 000..dda864e
> --- /dev/null
> +++ b/Documentation/virtual/kvm/devices/arm-pmu.txt
> @@ -0,0 +1,24 @@
> +ARM Virtual Performance Monitor Unit (vPMU)
> +===
> +
> +Device types supported:
> +  KVM_DEV_TYPE_ARM_PMU_V3 ARM Performance Monitor Unit v3
> +
> +Instantiate one PMU instance for per VCPU through this API.

Do you mean that userspace has to call this API once per vCPU to
create each PMU, or that calling the device create ioctl once makes
the kernel instantiate a PMU for each vCPU?

(It's a little bit confusing that we say "this API" to mean
"not the API documented in this file at all but actually
the KVM_CREATE_DEVICE ioctl", but I see we do that in the GIC
API docs too.)

> +
> +Groups:
> +  KVM_DEV_ARM_PMU_GRP_IRQ
> +  Attributes:
> +The attr field of kvm_device_attr encodes one value:
> +bits: | 63  32 | 31   0 |
> +values:   |  reserved  | vcpu_index |
> +A value describing the PMU overflow interrupt number for the specified
> +vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM the
> +interrupt type must be same for each vcpu. As a PPI, the interrupt 
> number is
> +same for all vcpus, while as a SPI it must be different for each vcpu.

I see we're using vcpu_index rather than MPIDR affinity value
for specifying which CPU we're configuring. Is this in line with
our planned API for GICv3 configuration?

> +  Errors:
> +-ENXIO: Unsupported attribute group
> +-EBUSY: The PMU overflow interrupt is already set
> +-ENODEV: Getting the PMU overflow interrupt number while it's not set
> +-EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied

What happens if you create a PMU but then never set the IRQ number?
Is there a default or does the VM refuse to run or something?

Do we want an attribute like KVM_DEV_ARM_VGIC_CTRL_INIT so userspace
can say "I have definitely completed configuration of this device now" ?

We wound up with a fair amount of legacy mess to deal with in the
GIC code because we didn't put one of those in from the start.
(Perhaps we should aim to standardize on all kvm devices having one?)

thanks
-- PMM
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] KVM: renumber architecture-dependent requests

2016-01-07 Thread Christian Borntraeger
On 01/07/2016 03:17 PM, Paolo Bonzini wrote:
> Leave room for 4 more arch-independent requests.
> 


The patch subject is wrong.

"renumber architecture-dependent requests"

--> "renumber kvm requests"

as we also renumber the architecture independent ones.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: use PIT channel index in hpet_legacy_start mode

2016-01-07 Thread P J P
+-- On Thu, 7 Jan 2016, Paolo Bonzini wrote --+
| > Will this trigger the same issue like CVE-2015-7513 ?
| > 
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=0185604c2d82c560dab2f2933a18f797e74ab5a8
| 
| I am not sure (--verbose please :))

  IIUC, it shouldn't, because pit_load_count() does

/*
 * The largest possible initial count is 0; this is equivalent
 * to 2^16 for binary counting and 10^4 for BCD counting.
 */
if (val == 0)
val = 0x10000;


| but the right fix is to change the caller like this:
 
| diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
| @@ -420,6 +420,7 @@ void kvm_pit_load_count(struct kvm *kvm, int channel, u32 
val, int hpet_legacy_s
|   u8 saved_mode;
|   if (hpet_legacy_start) {
|   /* save existing mode for later reenablement */
| + WARN_ON(channel != 0);
|   saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
|   kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable 
timer */
|   pit_load_count(kvm, channel, val);

  In that case I guess, 'pit_load_count' could be called as

+   pit_load_count(kvm, 0, val); 


Thank you.
--
 - P J P
47AF CE69 3A90 54AA 9045 1053 DD13 3D32 FE5B 041F
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 19/20] KVM: ARM64: Free perf event of PMU when destroying vcpu

2016-01-07 Thread Marc Zyngier
On Tue, 22 Dec 2015 16:08:14 +0800
Shannon Zhao  wrote:

> From: Shannon Zhao 
> 
> When KVM frees VCPU, it needs to free the perf_event of PMU.
> 
> Signed-off-by: Shannon Zhao 

Reviewed-by: Marc Zyngier 

M.
-- 
Jazz is not dead. It just smells funny.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: How to reserve guest physical region for ACPI

2016-01-07 Thread Igor Mammedov
On Mon, 4 Jan 2016 21:17:31 +0100
Laszlo Ersek  wrote:

> Michael CC'd me on the grandparent of the email below. I'll try to add
> my thoughts in a single go, with regard to OVMF.
> 
> On 12/30/15 20:52, Michael S. Tsirkin wrote:
> > On Wed, Dec 30, 2015 at 04:55:54PM +0100, Igor Mammedov wrote:  
> >> On Mon, 28 Dec 2015 14:50:15 +0200
> >> "Michael S. Tsirkin"  wrote:
> >>  
> >>> On Mon, Dec 28, 2015 at 10:39:04AM +0800, Xiao Guangrong wrote:  
> 
>  Hi Michael, Paolo,
> 
>  Now it is the time to return to the challenge that how to reserve guest
>  physical region internally used by ACPI.
> 
>  Igor suggested that:
>  | An alternative place to allocate reserve from could be high memory.
>  | For pc we have "reserved-memory-end" which currently makes sure
>  | that hotpluggable memory range isn't used by firmware
>  (https://lists.nongnu.org/archive/html/qemu-devel/2015-11/msg00926.html) 
>   
> 
> OVMF has no support for the "reserved-memory-end" fw_cfg file. The
> reason is that nobody wrote that patch, nor asked for the patch to be
> written. (Not implying that just requesting the patch would be
> sufficient for the patch to be written.)
Hijacking this part of thread to check if OVMF would work with memory-hotplug
and if it needs "reserved-memory-end" support at all.

How OVMF determines which GPA ranges to use for initializing PCI BARs
at boot time, more specifically 64-bit BARs.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 0/6] NVDIMM ACPI: introduce the framework of QEMU emulated DSM

2016-01-07 Thread Igor Mammedov
On Tue,  5 Jan 2016 02:52:02 +0800
Xiao Guangrong  wrote:

> This patchset is against commit 5530427f0ca (acpi: extend aml_and() to
> accept target argument) on pci branch of Michael's git tree
> and can be found at:
>   https://github.com/xiaogr/qemu.git nvdimm-acpi-v1
> 
> This is the second part of vNVDIMM implementation which implements the
> BIOS patched dsm memory and introduces the framework that allows QEMU
> to emulate DSM method
> 
> Thanks to Michael's idea, we do not reserve any memory for NVDIMM ACPI,
> instead we let BIOS allocate the memory and patch the address to the
> offset we want
> 
> IO port is still enabled as it plays as the way to notify QEMU and pass
> the patched dsm memory address, so that IO port region, 0x0a18 - 0xa20,
> is reserved and it is divided into two 32 bits ports and used to pass
> the low 32 bits and high 32 bits of dsm memory address to QEMU
> 
> Thanks Igor's idea, this patchset also extends DSDT/SSDT to revision 2
> to apply 64 bit operations, in order to keeping compatibility, old
> version (<= 2.5) still uses revision 1. Since 64 bit operations breaks
> old guests (such as windows XP), we should keep the 64 bits stuff in
> the private place where common ACPI operation does not touch it
> 

general notes:
1. could you split out AML API additions/changes into separate patches?
   even if series nvdims patches couldn't be accepted on next respin,
   AML API patches could be good and we could pick them up just
   for API completeness. That also would make them easier to review
   and reduces count of patches you'd need to respin.
2. add test case for NVDIMM table blob, see tests/bios-tables-test.c
   at the beginning of series.
3. make V=1 check would show you ASL diff your patches are introducing,
   it will save you from booting real guest and dumping/decompiling
   tables manually.
4. at the end of series add NVDIMM table test blob with new table.
   you can use tests/acpi-test-data/rebuild-expected-aml.sh to make it
5. if make check by some miracle passes with these patches,
   dump NVDIMM table in guest and try to decompile and then compile it
   back with IASL, it will show you what needs to be fixed.
   
PS:
 under NVDIMM table I mean SSDT NVMDIM table.

> Igor Mammedov (1):
>   pc: acpi: bump DSDT/SSDT compliance revision to v2
> 
> Xiao Guangrong (5):
>   nvdimm acpi: initialize the resource used by NVDIMM ACPI
>   nvdimm acpi: introduce patched dsm memory
>   acpi: allow using acpi named offset for OperationRegion
>   nvdimm acpi: let qemu handle _DSM method
>   nvdimm acpi: emulate dsm method
> 
>  hw/acpi/Makefile.objs   |   2 +-
>  hw/acpi/aml-build.c |  45 +++-
>  hw/acpi/ich9.c  |  32 +
>  hw/acpi/nvdimm.c| 276 
> ++--
>  hw/acpi/piix4.c |   3 +
>  hw/i386/acpi-build.c|  41 ---
>  hw/i386/pc.c|   8 +-
>  hw/i386/pc_piix.c   |   5 +
>  hw/i386/pc_q35.c|   8 +-
>  include/hw/acpi/aml-build.h |   6 +-
>  include/hw/acpi/ich9.h  |   2 +
>  include/hw/i386/pc.h|  19 ++-
>  include/hw/mem/nvdimm.h |  44 ++-
>  13 files changed, 449 insertions(+), 42 deletions(-)
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] KVM: Remove unused KVM_REQ_KICK to save a bit in vcpu->requests

2016-01-07 Thread Paolo Bonzini


On 07/01/2016 12:43, Takuya Yoshikawa wrote:
> Signed-off-by: Takuya Yoshikawa 
> ---
>  include/linux/kvm_host.h | 45 ++---
>  1 file changed, 22 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 61c3e6c..ca9b93e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -122,29 +122,28 @@ static inline bool is_error_page(struct page *page)
>  #define KVM_REQ_UNHALT 6
>  #define KVM_REQ_MMU_SYNC   7
>  #define KVM_REQ_CLOCK_UPDATE   8
> -#define KVM_REQ_KICK   9

I'd prefer to leave just an /* unused: 9 */ here.

This patch can go in for 4.5.  Regarding the other patch,
KVM_REQ_MCLOCK_INPROGRESS is indeed not really necessary, see
http://www.spinics.net/lists/kvm/msg95944.html and the follow-up.  Were
you thinking of the same?  If so I would prefer to have some comments.

Paolo

> -#define KVM_REQ_DEACTIVATE_FPU10
> -#define KVM_REQ_EVENT 11
> -#define KVM_REQ_APF_HALT  12
> -#define KVM_REQ_STEAL_UPDATE  13
> -#define KVM_REQ_NMI   14
> -#define KVM_REQ_PMU   15
> -#define KVM_REQ_PMI   16
> -#define KVM_REQ_WATCHDOG  17
> -#define KVM_REQ_MASTERCLOCK_UPDATE 18
> -#define KVM_REQ_MCLOCK_INPROGRESS 19
> -#define KVM_REQ_EPR_EXIT  20
> -#define KVM_REQ_SCAN_IOAPIC   21
> -#define KVM_REQ_GLOBAL_CLOCK_UPDATE 22
> -#define KVM_REQ_ENABLE_IBS23
> -#define KVM_REQ_DISABLE_IBS   24
> -#define KVM_REQ_APIC_PAGE_RELOAD  25
> -#define KVM_REQ_SMI   26
> -#define KVM_REQ_HV_CRASH  27
> -#define KVM_REQ_IOAPIC_EOI_EXIT   28
> -#define KVM_REQ_HV_RESET  29
> -#define KVM_REQ_HV_EXIT   30
> -#define KVM_REQ_HV_STIMER 31
> +#define KVM_REQ_DEACTIVATE_FPU 9
> +#define KVM_REQ_EVENT 10
> +#define KVM_REQ_APF_HALT  11
> +#define KVM_REQ_STEAL_UPDATE  12
> +#define KVM_REQ_NMI   13
> +#define KVM_REQ_PMU   14
> +#define KVM_REQ_PMI   15
> +#define KVM_REQ_WATCHDOG  16
> +#define KVM_REQ_MASTERCLOCK_UPDATE 17
> +#define KVM_REQ_MCLOCK_INPROGRESS 18
> +#define KVM_REQ_EPR_EXIT  19
> +#define KVM_REQ_SCAN_IOAPIC   20
> +#define KVM_REQ_GLOBAL_CLOCK_UPDATE 21
> +#define KVM_REQ_ENABLE_IBS22
> +#define KVM_REQ_DISABLE_IBS   23
> +#define KVM_REQ_APIC_PAGE_RELOAD  24
> +#define KVM_REQ_SMI   25
> +#define KVM_REQ_HV_CRASH  26
> +#define KVM_REQ_IOAPIC_EOI_EXIT   27
> +#define KVM_REQ_HV_RESET  28
> +#define KVM_REQ_HV_EXIT   29
> +#define KVM_REQ_HV_STIMER 30
>  
>  #define KVM_USERSPACE_IRQ_SOURCE_ID  0
>  #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: use PIT channel index in hpet_legacy_start mode

2016-01-07 Thread Yang Zhang

On 2016/1/7 20:32, P J P wrote:

From: P J P 

While setting the KVM PIT counters in 'kvm_pit_load_count', if
'hpet_legacy_start' is set, the function disables the timer on
channel[0], instead of the respective index 'channel'. Update it
to use 'channel' index parameter.

Signed-off-by: P J P 
---
  arch/x86/kvm/i8254.c | 7 ---
  1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 08116ff..154e936 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -420,10 +420,11 @@ void kvm_pit_load_count(struct kvm *kvm, int channel, u32 
val, int hpet_legacy_s
u8 saved_mode;
if (hpet_legacy_start) {
/* save existing mode for later reenablement */
-   saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
-   kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable 
timer */
+   saved_mode = kvm->arch.vpit->pit_state.channels[channel].mode;
+   /* disable timer */
+   kvm->arch.vpit->pit_state.channels[channel].mode = 0xff;
pit_load_count(kvm, channel, val);
-   kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
+   kvm->arch.vpit->pit_state.channels[channel].mode = saved_mode;
} else {
pit_load_count(kvm, channel, val);
}



Will this trigger the same issue like CVE-2015-7513 ?

https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=0185604c2d82c560dab2f2933a18f797e74ab5a8

--
best regards
yang
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 08/20] KVM: ARM64: Add access handler for event typer register

2016-01-07 Thread Shannon Zhao


On 2016/1/7 19:03, Marc Zyngier wrote:
> On 22/12/15 08:08, Shannon Zhao wrote:
>> > From: Shannon Zhao 
>> > 
>> > These kind of registers include PMEVTYPERn, PMCCFILTR and PMXEVTYPER
>> > which is mapped to PMEVTYPERn or PMCCFILTR.
>> > 
>> > The access handler translates all aarch32 register offsets to aarch64
>> > ones and uses vcpu_sys_reg() to access their values to avoid taking care
>> > of big endian.
>> > 
>> > When writing to these registers, create a perf_event for the selected
>> > event type.
>> > 
>> > Signed-off-by: Shannon Zhao 
>> > ---
>> >  arch/arm64/kvm/sys_regs.c | 156 
>> > +-
>> >  1 file changed, 154 insertions(+), 2 deletions(-)
>> > 
>> > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> > index 2552db1..ed2939b 100644
>> > --- a/arch/arm64/kvm/sys_regs.c
>> > +++ b/arch/arm64/kvm/sys_regs.c
>> > @@ -505,6 +505,70 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, 
>> > struct sys_reg_params *p,
>> >return true;
>> >  }
>> >  
>> > +static inline bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
>> > +{
>> > +  u64 pmcr, val;
>> > +
>> > +  pmcr = vcpu_sys_reg(vcpu, PMCR_EL0);
>> > +  val = (pmcr >> ARMV8_PMCR_N_SHIFT) & ARMV8_PMCR_N_MASK;
>> > +  if (idx >= val && idx != ARMV8_CYCLE_IDX)
>> > +  return false;
>> > +
>> > +  return true;
>> > +}
>> > +
>> > +static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct 
>> > sys_reg_params *p,
>> > + const struct sys_reg_desc *r)
>> > +{
>> > +  u64 idx, reg;
>> > +
>> > +  if (r->CRn == 9) {
>> > +  /* PMXEVTYPER_EL0 */
>> > +  reg = 0;
> Is there any particular reason why you're not setting reg to PMSELR_EL0,
> since this is what you're using?
> 
>> > +  } else {
>> > +  if (!p->is_aarch32) {
>> > +  /* PMEVTYPERn_EL0 or PMCCFILTR_EL0 */
>> > +  reg = r->reg;
>> > +  } else {
>> > +  if (r->CRn == 14 && r->CRm == 15 && r->Op2 == 7) {
>> > +  reg = PMCCFILTR_EL0;
>> > +  } else {
>> > +  reg = ((r->CRm & 3) << 3) | (r->Op2 & 7);
>> > +  reg += PMEVTYPER0_EL0;
>> > +  }
>> > +  }
>> > +  }
>> > +
>> > +  switch (reg) {
>> > +  case PMEVTYPER0_EL0 ... PMEVTYPER30_EL0:
>> > +  idx = reg - PMEVTYPER0_EL0;
>> > +  if (!pmu_counter_idx_valid(vcpu, idx))
>> > +  return true;
>> > +  break;
>> > +  case PMCCFILTR_EL0:
>> > +  idx = ARMV8_CYCLE_IDX;
>> > +  break;
>> > +  default:
> This would allow this case to be more precise, and we could have the
> default case as a bug handler.
> 

It turns out that I refactor this function like below:

+static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct
sys_reg_params *p,
+  const struct sys_reg_desc *r)
+{
+   u64 idx, reg = 0;
+
+   if (r->CRn == 9) {
+   /* PMXEVTYPER_EL0 */
+   idx = vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_COUNTER_MASK;
+   reg = PMEVTYPER0_EL0 + idx;
+   } else {
+   if (r->CRm == 15 && r->Op2 == 7) {
+   idx = ARMV8_CYCLE_IDX;
+   reg = PMCCFILTR_EL0;
+   } else {
+   /* PMEVTYPERn_EL0 */
+   idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
+   reg = PMEVTYPER0_EL0 + idx;
+   }
+   }
+
+   BUG_ON(reg == 0);
+
+   if (!pmu_counter_idx_valid(vcpu, idx))
+   return false;
+
+   if (p->is_write) {
+   kvm_pmu_set_counter_event_type(vcpu, p->regval, idx);
+   vcpu_sys_reg(vcpu, reg) = p->regval & ARMV8_EVTYPE_MASK;
+   } else {
+   p->regval = vcpu_sys_reg(vcpu, reg) & ARMV8_EVTYPE_MASK;
+   }
+
+   return true;
+}

How about this?

Thanks,
-- 
Shannon

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 07/20] KVM: ARM64: PMU: Add perf event map and introduce perf event creating function

2016-01-07 Thread Marc Zyngier
On Tue, 22 Dec 2015 16:08:02 +0800
Shannon Zhao  wrote:

> From: Shannon Zhao 
> 
> When we use tools like perf on host, perf passes the event type and the
> id of this event type category to kernel, then kernel will map them to
> hardware event number and write this number to PMU PMEVTYPER_EL0
> register. When getting the event number in KVM, directly use raw event
> type to create a perf_event for it.
> 
> Signed-off-by: Shannon Zhao 
> ---
>  arch/arm64/include/asm/pmu.h |   3 ++
>  arch/arm64/kvm/Makefile  |   1 +
>  include/kvm/arm_pmu.h|  11 
>  virt/kvm/arm/pmu.c   | 122 
> +++
>  4 files changed, 137 insertions(+)
>  create mode 100644 virt/kvm/arm/pmu.c
> 
> diff --git a/arch/arm64/include/asm/pmu.h b/arch/arm64/include/asm/pmu.h
> index 4406184..2588f9c 100644
> --- a/arch/arm64/include/asm/pmu.h
> +++ b/arch/arm64/include/asm/pmu.h
> @@ -21,6 +21,7 @@
>  
>  #define ARMV8_MAX_COUNTERS  32
>  #define ARMV8_COUNTER_MASK  (ARMV8_MAX_COUNTERS - 1)
> +#define ARMV8_CYCLE_IDX (ARMV8_MAX_COUNTERS - 1)
>  
>  /*
>   * Per-CPU PMCR: config reg
> @@ -31,6 +32,8 @@
>  #define ARMV8_PMCR_D (1 << 3) /* CCNT counts every 64th cpu cycle */
>  #define ARMV8_PMCR_X (1 << 4) /* Export to ETM */
>  #define ARMV8_PMCR_DP(1 << 5) /* Disable CCNT if 
> non-invasive debug*/
> +/* Determines which PMCCNTR_EL0 bit generates an overflow */
> +#define ARMV8_PMCR_LC(1 << 6)
>  #define  ARMV8_PMCR_N_SHIFT  11   /* Number of counters 
> supported */
>  #define  ARMV8_PMCR_N_MASK   0x1f
>  #define  ARMV8_PMCR_MASK 0x3f /* Mask for writable bits */
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index caee9ee..122cff4 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -26,3 +26,4 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
>  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
>  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
>  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
> +kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
> diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
> index ddcb5b2..14bedb0 100644
> --- a/include/kvm/arm_pmu.h
> +++ b/include/kvm/arm_pmu.h
> @@ -34,9 +34,20 @@ struct kvm_pmu {
>   int irq_num;
>   struct kvm_pmc pmc[ARMV8_MAX_COUNTERS];
>  };
> +
> +u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx);
> +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
> + u64 select_idx);
>  #else
>  struct kvm_pmu {
>  };
> +
> +u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
> +{
> + return 0;
> +}
> +void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
> + u64 select_idx) {}
>  #endif
>  
>  #endif
> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
> new file mode 100644
> index 000..9d27999
> --- /dev/null
> +++ b/virt/kvm/arm/pmu.c
> @@ -0,0 +1,122 @@
> +/*
> + * Copyright (C) 2015 Linaro Ltd.
> + * Author: Shannon Zhao 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see .
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +/**
> + * kvm_pmu_get_counter_value - get PMU counter value
> + * @vcpu: The vcpu pointer
> + * @select_idx: The counter index
> + */
> +u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
> +{
> + u64 counter, reg, enabled, running;
> + struct kvm_pmu *pmu = >arch.pmu;
> + struct kvm_pmc *pmc = >pmc[select_idx];
> +
> + reg = (select_idx == ARMV8_CYCLE_IDX)
> +   ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
> + counter = vcpu_sys_reg(vcpu, reg);
> +
> + /* The real counter value is equal to the value of counter register plus
> +  * the value perf event counts.
> +  */
> + if (pmc->perf_event)
> + counter += perf_event_read_value(pmc->perf_event, ,
> +  );
> +
> + return counter & pmc->bitmask;
> +}
> +
> +/**
> + * kvm_pmu_stop_counter - stop PMU counter
> + * @pmc: The PMU counter pointer
> + *
> + * If this counter has been configured to monitor some event, release it 
> here.

Re: [PATCH v8 00/20] KVM: ARM64: Add guest PMU support

2016-01-07 Thread Marc Zyngier
On 07/01/16 14:12, Will Deacon wrote:
> On Thu, Jan 07, 2016 at 02:10:38PM +, Marc Zyngier wrote:
>> On 22/12/15 08:07, Shannon Zhao wrote:
>>> From: Shannon Zhao 
>>>
>>> This patchset adds guest PMU support for KVM on ARM64. It takes
>>> trap-and-emulate approach. When guest wants to monitor one event, it
>>> will be trapped by KVM and KVM will call perf_event API to create a perf
>>> event and call relevant perf_event APIs to get the count value of event.
>>>
>>> Use perf to test this patchset in guest. When using "perf list", it
>>> shows the list of the hardware events and hardware cache events perf
>>> supports. Then use "perf stat -e EVENT" to monitor some event. For
>>> example, use "perf stat -e cycles" to count cpu cycles and
>>> "perf stat -e cache-misses" to count cache misses.
>>
>> I finally feel like we're pretty close to something we could merge. My
>> current concerns are:
> 
> The merge window opens on Monday and I'm not prepared to take further
> PMU changes at this point (unless they're bug fixes, of course).
> 
> This needs to wait until 4.6.

Fair enough. I guess I'll continue the review process and queue that
early in the next cycle.

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 20/20] KVM: ARM64: Add a new kvm ARM PMU device

2016-01-07 Thread Peter Maydell
On 7 January 2016 at 14:49, Shannon Zhao  wrote:
>
>
> On 2016/1/7 22:36, Peter Maydell wrote:
>> On 22 December 2015 at 08:08, Shannon Zhao  wrote:
>>> From: Shannon Zhao 
>>>
>>> Add a new kvm device type KVM_DEV_TYPE_ARM_PMU_V3 for ARM PMU. Implement
>>> the kvm_device_ops for it.
>>>
>>> Signed-off-by: Shannon Zhao 
>>> ---
>>>  Documentation/virtual/kvm/devices/arm-pmu.txt |  24 +
>>>  arch/arm64/include/uapi/asm/kvm.h |   4 +
>>>  include/linux/kvm_host.h  |   1 +
>>>  include/uapi/linux/kvm.h  |   2 +
>>>  virt/kvm/arm/pmu.c| 128 
>>> ++
>>>  virt/kvm/kvm_main.c   |   4 +
>>>  6 files changed, 163 insertions(+)
>>>  create mode 100644 Documentation/virtual/kvm/devices/arm-pmu.txt
>>>
>>> diff --git a/Documentation/virtual/kvm/devices/arm-pmu.txt 
>>> b/Documentation/virtual/kvm/devices/arm-pmu.txt
>>> new file mode 100644
>>> index 000..dda864e
>>> --- /dev/null
>>> +++ b/Documentation/virtual/kvm/devices/arm-pmu.txt
>>> @@ -0,0 +1,24 @@
>>> +ARM Virtual Performance Monitor Unit (vPMU)
>>> +===
>>> +
>>> +Device types supported:
>>> +  KVM_DEV_TYPE_ARM_PMU_V3 ARM Performance Monitor Unit v3
>>> +
>>> +Instantiate one PMU instance for per VCPU through this API.
>>
>> Do you mean that userspace has to call this API once per vCPU to
>> create each PMU, or that calling the device create ioctl once makes
>> the kernel instantiate a PMU for each vCPU?
>>
> Call the device create ioctl once and kvm will create a PMU for each
> vCPU. But userspace should set the irqs for each PMU since for SPI they
> are different.
>
>> (It's a little bit confusing that we say "this API" to mean
>> "not the API documented in this file at all but actually
>> the KVM_CREATE_DEVICE ioctl", but I see we do that in the GIC
>> API docs too.)
>>
>>> +
>>> +Groups:
>>> +  KVM_DEV_ARM_PMU_GRP_IRQ
>>> +  Attributes:
>>> +The attr field of kvm_device_attr encodes one value:
>>> +bits: | 63  32 | 31   0 |
>>> +values:   |  reserved  | vcpu_index |
>>> +A value describing the PMU overflow interrupt number for the specified
>>> +vcpu_index vcpu. This interrupt could be a PPI or SPI, but for one VM 
>>> the
>>> +interrupt type must be same for each vcpu. As a PPI, the interrupt 
>>> number is
>>> +same for all vcpus, while as a SPI it must be different for each vcpu.
>>
>> I see we're using vcpu_index rather than MPIDR affinity value
>> for specifying which CPU we're configuring. Is this in line with
>> our planned API for GICv3 configuration?
>>
> Here vcpu_index is used to indexing the vCPU, no special use.

Yes, but you can identify the CPU by index, or by its MPIDR.
We had a discussion about which was the best way for doing
the VGIC API, and I can't remember which way round we ended up
going for. Whichever we chose, we should do the same thing here.

>>> +  Errors:
>>> +-ENXIO: Unsupported attribute group
>>> +-EBUSY: The PMU overflow interrupt is already set
>>> +-ENODEV: Getting the PMU overflow interrupt number while it's not set
>>> +-EINVAL: Invalid vcpu_index or PMU overflow interrupt number supplied
>>
>> What happens if you create a PMU but then never set the IRQ number?
>> Is there a default or does the VM refuse to run or something?
>>
> If userspace doesn't specify the irq number, the guest will not receive
> the PMU interrupt because we check if the irq is initialized when we
> inject the interrupt. But guest could still use the vPMU if QEMU
> generates a proper DTB or ACPI.

So is it a valid use case to create a PMU with the interrupt not wired
up to anything? (If it's never valid it would be nice to diagnose it
rather than just silently letting the guest run but not work right.)

thanks
-- PMM
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] virtio/s390: use dev_to_virtio

2016-01-07 Thread Cornelia Huck
On Wed, 30 Dec 2015 22:05:25 +0800
Geliang Tang  wrote:

> Use dev_to_virtio() instead of open-coding it.
> 
> Signed-off-by: Geliang Tang 
> ---
>  drivers/s390/virtio/virtio_ccw.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)

Thanks, added to my queue.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] KVM: renumber architecture-dependent requests

2016-01-07 Thread Paolo Bonzini


On 07/01/2016 16:27, Christian Borntraeger wrote:
> On 01/07/2016 03:17 PM, Paolo Bonzini wrote:
>> Leave room for 4 more arch-independent requests.
> 
> The patch subject is wrong.
> 
> "renumber architecture-dependent requests"
> 
> --> "renumber kvm requests"
> 
> as we also renumber the architecture independent ones.

Right.  Ok with that change?

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] KVM: move architecture-dependent requests to arch/

2016-01-07 Thread Paolo Bonzini


On 07/01/2016 16:54, Christian Borntraeger wrote:
> On 01/07/2016 03:17 PM, Paolo Bonzini wrote:
> 
> Can you add at least a one line patch description?

Yes, and it will be more than one line. :)

"Since the numbers now overlap, it makes sense to enumerate
them in asm/kvm_host.h rather than linux/kvm_host.h.  Functions
that refer to architecture-specific requests are also moved
to arch/."

Paolo

>> Signed-off-by: Paolo Bonzini 
> 
> 
> Reviewed-by: Christian Borntraeger 
>> ---
>>  arch/powerpc/include/asm/kvm_host.h |  4 
>>  arch/s390/include/asm/kvm_host.h|  4 
>>  arch/x86/include/asm/kvm_host.h | 28 +++
>>  arch/x86/kvm/x86.c  | 10 ++
>>  include/linux/kvm_host.h| 38 
>> ++---
>>  virt/kvm/kvm_main.c | 10 --
>>  6 files changed, 48 insertions(+), 46 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/kvm_host.h 
>> b/arch/powerpc/include/asm/kvm_host.h
>> index cfa758c6b4f6..271fefbbe521 100644
>> --- a/arch/powerpc/include/asm/kvm_host.h
>> +++ b/arch/powerpc/include/asm/kvm_host.h
>> @@ -50,6 +50,10 @@
>>  #define KVM_NR_IRQCHIPS  1
>>  #define KVM_IRQCHIP_NUM_PINS 256
>>
>> +/* PPC-specific vcpu->requests bit members */
>> +#define KVM_REQ_WATCHDOG   8
>> +#define KVM_REQ_EPR_EXIT   9
>> +
>>  #include 
>>
>>  #define KVM_ARCH_WANT_MMU_NOTIFIER
>> diff --git a/arch/s390/include/asm/kvm_host.h 
>> b/arch/s390/include/asm/kvm_host.h
>> index c83144110ea9..31fe20f4d129 100644
>> --- a/arch/s390/include/asm/kvm_host.h
>> +++ b/arch/s390/include/asm/kvm_host.h
>> @@ -39,6 +39,10 @@
>>  #define KVM_IRQCHIP_NUM_PINS 4096
>>  #define KVM_HALT_POLL_NS_DEFAULT 0
>>
>> +/* s390-specific vcpu->requests bit members */
>> +#define KVM_REQ_ENABLE_IBS 8
>> +#define KVM_REQ_DISABLE_IBS9
>> +
>>  #define SIGP_CTRL_C 0x80
>>  #define SIGP_CTRL_SCN_MASK  0x3f
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h 
>> b/arch/x86/include/asm/kvm_host.h
>> index a7c89876698b..44adbb819041 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -46,6 +46,31 @@
>>
>>  #define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
>>
>> +/* x86-specific vcpu->requests bit members */
>> +#define KVM_REQ_MIGRATE_TIMER  8
>> +#define KVM_REQ_REPORT_TPR_ACCESS  9
>> +#define KVM_REQ_TRIPLE_FAULT  10
>> +#define KVM_REQ_MMU_SYNC  11
>> +#define KVM_REQ_CLOCK_UPDATE  12
>> +#define KVM_REQ_DEACTIVATE_FPU13
>> +#define KVM_REQ_EVENT 14
>> +#define KVM_REQ_APF_HALT  15
>> +#define KVM_REQ_STEAL_UPDATE  16
>> +#define KVM_REQ_NMI   17
>> +#define KVM_REQ_PMU   18
>> +#define KVM_REQ_PMI   19
>> +#define KVM_REQ_SMI   20
>> +#define KVM_REQ_MASTERCLOCK_UPDATE 21
>> +#define KVM_REQ_MCLOCK_INPROGRESS 22
>> +#define KVM_REQ_SCAN_IOAPIC   23
>> +#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24
>> +#define KVM_REQ_APIC_PAGE_RELOAD  25
>> +#define KVM_REQ_HV_CRASH  26
>> +#define KVM_REQ_IOAPIC_EOI_EXIT   27
>> +#define KVM_REQ_HV_RESET  28
>> +#define KVM_REQ_HV_EXIT   29
>> +#define KVM_REQ_HV_STIMER 30
>> +
>>  #define CR0_RESERVED_BITS   \
>>  (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
>>| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
>> @@ -1268,6 +1293,9 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 
>> host_tsc);
>>  unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
>>  bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
>>
>> +void kvm_make_mclock_inprogress_request(struct kvm *kvm);
>> +void kvm_make_scan_ioapic_request(struct kvm *kvm);
>> +
>>  void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
>>   struct kvm_async_pf *work);
>>  void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 102c3028513f..dc6b37f47cd7 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -1686,6 +1686,11 @@ static void pvclock_update_vm_gtod_copy(struct kvm 
>> *kvm)
>>  #endif
>>  }
>>
>> +void kvm_make_mclock_inprogress_request(struct kvm *kvm)
>> +{
>> +kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
>> +}
>> +
>>  static void kvm_gen_update_masterclock(struct kvm *kvm)
>>  {
>>  #ifdef CONFIG_X86_64
>> @@ -6337,6 +6342,11 @@ static void process_smi(struct kvm_vcpu *vcpu)
>>  kvm_mmu_reset_context(vcpu);
>>  }
>>
>> +void kvm_make_scan_ioapic_request(struct kvm *kvm)
>> +{
>> +kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
>> +}
>> +
>>  static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
>>  {
>>  u64 eoi_exit_bitmap[4];
>> diff --git a/include/linux/kvm_host.h 

Re: [PATCH v1 0/6] KVM: Hyper-V SynIC timers migration fixes

2016-01-07 Thread Paolo Bonzini


On 23/12/2015 12:28, Andrey Smetanin wrote:
> During testing of Windows 2012R2 guest migration with
> Hyper-V SynIC timers enabled we found several bugs
> which lead to restoring guest in a hung state.
> 
> This patch series provides several fixes to make the
> migration of guest with Hyper-V SynIC timers enabled
> succeed.
> 
> The series applies on top of
> 'kvm/x86: Remove Hyper-V SynIC timer stopping'
> previously sent.
> 
> Signed-off-by: Andrey Smetanin 
> Reviewed-by: Roman Kagan 
> CC: Gleb Natapov 
> CC: Paolo Bonzini 
> CC: Roman Kagan 
> CC: Denis V. Lunev 
> CC: qemu-de...@nongnu.org
> 
> Andrey Smetanin (6):
>   kvm/x86: Drop stimer_stop() function
>   kvm/x86: Hyper-V unify stimer_start() and stimer_restart()
>   kvm/x86: Reorg stimer_expiration() to better control timer restart
>   kvm/x86: Hyper-V fix SynIC timer disabling condition
>   kvm/x86: Skip SynIC vector check for QEMU side
>   kvm/x86: Update SynIC timers on guest entry only
> 
>  arch/x86/kvm/hyperv.c | 112 
> +++---
>  arch/x86/kvm/x86.c|   6 +++
>  2 files changed, 58 insertions(+), 60 deletions(-)
> 

Applied, let me know if I should fix up patch 3 (the bug is preexisting
anyway, if it is a bug as I suspect).

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] KVM: move architecture-dependent requests to arch/

2016-01-07 Thread Christian Borntraeger
On 01/07/2016 03:17 PM, Paolo Bonzini wrote:

Can you add at least a one line patch description?

> Signed-off-by: Paolo Bonzini 


Reviewed-by: Christian Borntraeger 
> ---
>  arch/powerpc/include/asm/kvm_host.h |  4 
>  arch/s390/include/asm/kvm_host.h|  4 
>  arch/x86/include/asm/kvm_host.h | 28 +++
>  arch/x86/kvm/x86.c  | 10 ++
>  include/linux/kvm_host.h| 38 
> ++---
>  virt/kvm/kvm_main.c | 10 --
>  6 files changed, 48 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h 
> b/arch/powerpc/include/asm/kvm_host.h
> index cfa758c6b4f6..271fefbbe521 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -50,6 +50,10 @@
>  #define KVM_NR_IRQCHIPS  1
>  #define KVM_IRQCHIP_NUM_PINS 256
> 
> +/* PPC-specific vcpu->requests bit members */
> +#define KVM_REQ_WATCHDOG   8
> +#define KVM_REQ_EPR_EXIT   9
> +
>  #include 
> 
>  #define KVM_ARCH_WANT_MMU_NOTIFIER
> diff --git a/arch/s390/include/asm/kvm_host.h 
> b/arch/s390/include/asm/kvm_host.h
> index c83144110ea9..31fe20f4d129 100644
> --- a/arch/s390/include/asm/kvm_host.h
> +++ b/arch/s390/include/asm/kvm_host.h
> @@ -39,6 +39,10 @@
>  #define KVM_IRQCHIP_NUM_PINS 4096
>  #define KVM_HALT_POLL_NS_DEFAULT 0
> 
> +/* s390-specific vcpu->requests bit members */
> +#define KVM_REQ_ENABLE_IBS 8
> +#define KVM_REQ_DISABLE_IBS9
> +
>  #define SIGP_CTRL_C  0x80
>  #define SIGP_CTRL_SCN_MASK   0x3f
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index a7c89876698b..44adbb819041 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -46,6 +46,31 @@
> 
>  #define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
> 
> +/* x86-specific vcpu->requests bit members */
> +#define KVM_REQ_MIGRATE_TIMER  8
> +#define KVM_REQ_REPORT_TPR_ACCESS  9
> +#define KVM_REQ_TRIPLE_FAULT  10
> +#define KVM_REQ_MMU_SYNC  11
> +#define KVM_REQ_CLOCK_UPDATE  12
> +#define KVM_REQ_DEACTIVATE_FPU13
> +#define KVM_REQ_EVENT 14
> +#define KVM_REQ_APF_HALT  15
> +#define KVM_REQ_STEAL_UPDATE  16
> +#define KVM_REQ_NMI   17
> +#define KVM_REQ_PMU   18
> +#define KVM_REQ_PMI   19
> +#define KVM_REQ_SMI   20
> +#define KVM_REQ_MASTERCLOCK_UPDATE 21
> +#define KVM_REQ_MCLOCK_INPROGRESS 22
> +#define KVM_REQ_SCAN_IOAPIC   23
> +#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24
> +#define KVM_REQ_APIC_PAGE_RELOAD  25
> +#define KVM_REQ_HV_CRASH  26
> +#define KVM_REQ_IOAPIC_EOI_EXIT   27
> +#define KVM_REQ_HV_RESET  28
> +#define KVM_REQ_HV_EXIT   29
> +#define KVM_REQ_HV_STIMER 30
> +
>  #define CR0_RESERVED_BITS   \
>   (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
> @@ -1268,6 +1293,9 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 
> host_tsc);
>  unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
>  bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
> 
> +void kvm_make_mclock_inprogress_request(struct kvm *kvm);
> +void kvm_make_scan_ioapic_request(struct kvm *kvm);
> +
>  void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
>struct kvm_async_pf *work);
>  void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 102c3028513f..dc6b37f47cd7 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1686,6 +1686,11 @@ static void pvclock_update_vm_gtod_copy(struct kvm 
> *kvm)
>  #endif
>  }
> 
> +void kvm_make_mclock_inprogress_request(struct kvm *kvm)
> +{
> + kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
> +}
> +
>  static void kvm_gen_update_masterclock(struct kvm *kvm)
>  {
>  #ifdef CONFIG_X86_64
> @@ -6337,6 +6342,11 @@ static void process_smi(struct kvm_vcpu *vcpu)
>   kvm_mmu_reset_context(vcpu);
>  }
> 
> +void kvm_make_scan_ioapic_request(struct kvm *kvm)
> +{
> + kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
> +}
> +
>  static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
>  {
>   u64 eoi_exit_bitmap[4];
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index b0ec0f778192..81c35dff52fd 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -111,46 +111,14 @@ static inline bool is_error_page(struct page *page)
>  }
> 
>  /*
> - * vcpu->requests bit members
> + * Architecture-independent vcpu->requests bit members
> + * Bits 4-7 are reserved for more arch-independent bits.
>   */
>  #define KVM_REQ_TLB_FLUSH  0
>  #define 

Re: [PATCH v2 3/7] kvm/x86: Hyper-V unify stimer_start() and stimer_restart()

2016-01-07 Thread Paolo Bonzini


On 28/12/2015 16:27, Andrey Smetanin wrote:
> This will be used in future to start Hyper-V SynIC timer
> in several places by one logic in one function.
> 
> Changes v2:
> * drop stimer->count == 0 check inside stimer_start()
> * comment stimer_start() assumptions

Can you replace comments with WARN_ON_ONCE?

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


lovely meet dear

2016-01-07 Thread happie2
Hi nice to meet you i"m ms happy by name you got me interested on 
serverfault.com ;how are you doing?
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3.16.y-ckt 061/126] MIPS: KVM: Fix CACHE immediate offset sign extension

2016-01-06 Thread Luis Henriques
3.16.7-ckt22 -stable review patch.  If anyone has any objections, please let me 
know.

--

From: James Hogan 

commit c5c2a3b998f1ff5a586f9d37e154070b8d550d17 upstream.

The immediate field of the CACHE instruction is signed, so ensure that
it gets sign extended by casting it to an int16_t rather than just
masking the low 16 bits.

Fixes: e685c689f3a8 ("KVM/MIPS32: Privileged instruction/target branch 
emulation.")
Signed-off-by: James Hogan 
Cc: Ralf Baechle 
Cc: Paolo Bonzini 
Cc: Gleb Natapov 
Cc: linux-m...@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini 
[ luis: backported to 3.16:
  - file rename: emulate.c -> kvm_mips_emul.c ]
Signed-off-by: Luis Henriques 
---
 arch/mips/kvm/kvm_mips_emul.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kvm/kvm_mips_emul.c b/arch/mips/kvm/kvm_mips_emul.c
index 18b4e2fdae33..950229176c2f 100644
--- a/arch/mips/kvm/kvm_mips_emul.c
+++ b/arch/mips/kvm/kvm_mips_emul.c
@@ -1434,7 +1434,7 @@ kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, 
uint32_t cause,
 
base = (inst >> 21) & 0x1f;
op_inst = (inst >> 16) & 0x1f;
-   offset = inst & 0x;
+   offset = (int16_t)inst;
cache = (inst >> 16) & 0x3;
op = (inst >> 18) & 0x7;
 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3.16.y-ckt 060/126] MIPS: KVM: Fix ASID restoration logic

2016-01-06 Thread Luis Henriques
3.16.7-ckt22 -stable review patch.  If anyone has any objections, please let me 
know.

--

From: James Hogan 

commit 002374f371bd02df864cce1fe85d90dc5b292837 upstream.

ASID restoration on guest resume should determine the guest execution
mode based on the guest Status register rather than bit 30 of the guest
PC.

Fix the two places in locore.S that do this, loading the guest status
from the cop0 area. Note, this assembly is specific to the trap &
emulate implementation of KVM, so it doesn't need to check the
supervisor bit as that mode is not implemented in the guest.

Fixes: b680f70fc111 ("KVM/MIPS32: Entry point for trampolining to...")
Signed-off-by: James Hogan 
Cc: Ralf Baechle 
Cc: Paolo Bonzini 
Cc: Gleb Natapov 
Cc: linux-m...@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini 
[ luis: backported to 3.16:
  - file rename: locore.S -> kvm_locore.S ]
Signed-off-by: Luis Henriques 
---
 arch/mips/kvm/kvm_locore.S | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S
index 17376cd838e6..fc24acb3a837 100644
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@@ -159,9 +159,11 @@ FEXPORT(__kvm_mips_vcpu_run)
 
 FEXPORT(__kvm_mips_load_asid)
/* Set the ASID for the Guest Kernel */
-   INT_SLL t0, t0, 1   /* with kseg0 @ 0x4000, kernel */
-   /* addresses shift to 0x8000 */
-   bltzt0, 1f  /* If kernel */
+   PTR_L   t0, VCPU_COP0(k1)
+   LONG_L  t0, COP0_STATUS(t0)
+   andit0, KSU_USER | ST0_ERL | ST0_EXL
+   xorit0, KSU_USER
+   bnezt0, 1f  /* If kernel */
 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID/* else user */
 1:
@@ -438,9 +440,11 @@ __kvm_mips_return_to_guest:
mtc0t0, CP0_EPC
 
/* Set the ASID for the Guest Kernel */
-   INT_SLL t0, t0, 1   /* with kseg0 @ 0x4000, kernel */
-   /* addresses shift to 0x8000 */
-   bltzt0, 1f  /* If kernel */
+   PTR_L   t0, VCPU_COP0(k1)
+   LONG_L  t0, COP0_STATUS(t0)
+   andit0, KSU_USER | ST0_ERL | ST0_EXL
+   xorit0, KSU_USER
+   bnezt0, 1f  /* If kernel */
 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID/* else user */
 1:
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3.16.y-ckt 062/126] MIPS: KVM: Uninit VCPU in vcpu_create error path

2016-01-06 Thread Luis Henriques
3.16.7-ckt22 -stable review patch.  If anyone has any objections, please let me 
know.

--

From: James Hogan 

commit 585bb8f9a5e592f2ce7abbe5ed3112d5438d2754 upstream.

If either of the memory allocations in kvm_arch_vcpu_create() fail, the
vcpu which has been allocated and kvm_vcpu_init'd doesn't get uninit'd
in the error handling path. Add a call to kvm_vcpu_uninit() to fix this.

Fixes: 669e846e6c4e ("KVM/MIPS32: MIPS arch specific APIs for KVM")
Signed-off-by: James Hogan 
Cc: Ralf Baechle 
Cc: Paolo Bonzini 
Cc: Gleb Natapov 
Cc: linux-m...@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini 
[ luis: backported to 3.16:
  - file rename: mips.c -> kvm_mips.c ]
Signed-off-by: Luis Henriques 
---
 arch/mips/kvm/kvm_mips.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
index cc721a3c8996..2c81c2c9e8dc 100644
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -307,7 +307,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 
unsigned int id)
 
if (!gebase) {
err = -ENOMEM;
-   goto out_free_cpu;
+   goto out_uninit_cpu;
}
kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n",
  ALIGN(size, PAGE_SIZE), gebase);
@@ -368,6 +368,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 
unsigned int id)
 out_free_gebase:
kfree(gebase);
 
+out_uninit_cpu:
+   kvm_vcpu_uninit(vcpu);
+
 out_free_cpu:
kfree(vcpu);
 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1] kvm/x86: Hyper-V tsc page setup

2016-01-06 Thread Andrey Smetanin



On 01/06/2016 12:48 AM, Peter Hornyack wrote:

On Thu, Dec 24, 2015 at 1:33 AM, Andrey Smetanin
 wrote:

Lately tsc page was implemented but filled with empty
values. This patch setup tsc page scale and offset based
on vcpu tsc, tsc_khz and  HV_X64_MSR_TIME_REF_COUNT value.

The valid tsc page drops HV_X64_MSR_TIME_REF_COUNT msr
reads count to zero which potentially improves performance.

The patch applies on top of
'kvm: Make vcpu->requests as 64 bit bitmap'
previously sent.

Signed-off-by: Andrey Smetanin 
CC: Paolo Bonzini 
CC: Gleb Natapov 
CC: Roman Kagan 
CC: Denis V. Lunev 
CC: qemu-de...@nongnu.org

Reviewed-by: Peter Hornyack 



---
  arch/x86/kvm/hyperv.c| 117 +--
  arch/x86/kvm/hyperv.h|   2 +
  arch/x86/kvm/x86.c   |  12 +
  include/linux/kvm_host.h |   1 +
  4 files changed, 117 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index d50675a..504fdc7 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -753,6 +753,105 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu 
*vcpu,
 return 0;
  }

+static u64 calc_tsc_page_scale(u32 tsc_khz)
+{
+   /*
+* reftime (in 100ns) = tsc * tsc_scale / 2^64 + tsc_offset
+* so reftime_delta = (tsc_delta * tsc_scale) / 2^64
+* so tsc_scale = (2^64 * reftime_delta)/tsc_delta
+* so tsc_scale = (2^64 * 10 * 10^6) / tsc_hz = (2^64 * 1) / tsc_khz
+* so tsc_scale = (2^63 * 2 * 1) / tsc_khz
+*/
+   return mul_u64_u32_div(1ULL << 63, 2 * 1, tsc_khz);
+}
+
+static int write_tsc_page(struct kvm *kvm, u64 gfn,
+ PHV_REFERENCE_TSC_PAGE tsc_ref)
+{
+   if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+   tsc_ref, sizeof(*tsc_ref)))
+   return 1;
+   mark_page_dirty(kvm, gfn);
+   return 0;
+}
+
+static int read_tsc_page(struct kvm *kvm, u64 gfn,
+PHV_REFERENCE_TSC_PAGE tsc_ref)
+{
+   if (kvm_read_guest(kvm, gfn_to_gpa(gfn),
+  tsc_ref, sizeof(*tsc_ref)))
+   return 1;
+   return 0;
+}
+
+static u64 calc_tsc_page_time(struct kvm_vcpu *vcpu,
+ PHV_REFERENCE_TSC_PAGE tsc_ref)
+{
+
+   u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+   return mul_u64_u64_shr(tsc, tsc_ref->tsc_scale, 64)
+   + tsc_ref->tsc_offset;
+}
+
+static int setup_blank_tsc_page(struct kvm_vcpu *vcpu, u64 gfn)
+{
+   HV_REFERENCE_TSC_PAGE tsc_ref;
+
+   memset(&tsc_ref, 0, sizeof(tsc_ref));
+   return write_tsc_page(vcpu->kvm, gfn, &tsc_ref);
+}
+
+int kvm_hv_setup_tsc_page(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_hv *hv = &kvm->arch.hyperv;
+   HV_REFERENCE_TSC_PAGE tsc_ref;
+   u32 tsc_khz;
+   int r;
+   u64 gfn, ref_time, tsc_scale, tsc_offset, tsc;
+
+   if (WARN_ON_ONCE(!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)))
+   return -EINVAL;
+
+   gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+   vcpu_debug(vcpu, "tsc page gfn 0x%llx\n", gfn);
+
+   tsc_khz = vcpu->arch.virtual_tsc_khz;
+   if (!tsc_khz) {
+   vcpu_unimpl(vcpu, "no tsc khz\n");
+   return setup_blank_tsc_page(vcpu, gfn);
+   }
+
+   r = read_tsc_page(kvm, gfn, &tsc_ref);
+   if (r) {
+   vcpu_err(vcpu, "can't access tsc page gfn 0x%llx\n", gfn);
+   return r;
+   }
+
+   tsc_scale = calc_tsc_page_scale(tsc_khz);
+   ref_time = get_time_ref_counter(kvm);
+   tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+   /* tsc_offset = reftime - tsc * tsc_scale / 2^64 */
+   tsc_offset = ref_time - mul_u64_u64_shr(tsc, tsc_scale, 64);
+   vcpu_debug(vcpu, "tsc khz %u tsc %llu scale %llu offset %llu\n",
+  tsc_khz, tsc, tsc_scale, tsc_offset);
+
+   tsc_ref.tsc_sequence++;
+   if (tsc_ref.tsc_sequence == 0)


Also avoid tsc_sequence == 0x here. In the Hyper-V TLFS 4.0
(Win2012 R2) 0x is the special sequence number to disable the
reference TSC page.


we already discussed with Microsoft
that documentation contains wrong sequence number
- 0x (instead of 0). please take a look into details here:
https://lkml.org/lkml/2015/11/2/655

+   tsc_ref.tsc_sequence = 1;
+
+   tsc_ref.tsc_scale = tsc_scale;
+   tsc_ref.tsc_offset = tsc_offset;
+
+   vcpu_debug(vcpu, "tsc page calibration time %llu vs. reftime %llu\n",
+  calc_tsc_page_time(vcpu, &tsc_ref),
+  get_time_ref_counter(kvm));
+
+   return write_tsc_page(kvm, gfn, &tsc_ref);
+}
+
  static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
  bool host)
  {
@@ 

Re: [RFC PATCH v2 3/3] vfio-pci: Allow to mmap MSI-X table if EEH is supported

2016-01-06 Thread Yongji Xie

On 2016/1/5 5:42, Benjamin Herrenschmidt wrote:

On Mon, 2016-01-04 at 14:07 -0700, Alex Williamson wrote:

On Thu, 2015-12-31 at 16:50 +0800, Yongji Xie wrote:

Current vfio-pci implementation disallows to mmap MSI-X
table in case that user get to touch this directly.

However, EEH mechanism can ensure that a given pci device
can only shoot the MSIs assigned for its PE. So we think
it's safe to expose the MSI-X table to userspace because
the exposed MSI-X table can't be used to do harm to other
memory space.

And with MSI-X table mmapped, some performance issues which
are caused when PCI adapters have critical registers in the
same page as the MSI-X table also can be resolved.

So this patch adds a Kconfig option, VFIO_PCI_MMAP_MSIX,
to support for mmapping MSI-X table.

Signed-off-by: Yongji Xie 
---
  drivers/vfio/pci/Kconfig|4 
  drivers/vfio/pci/vfio_pci.c |6 --
  2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 02912f1..67b0a2c 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -23,6 +23,10 @@ config VFIO_PCI_MMAP
depends on VFIO_PCI
def_bool y if !S390
  
+config VFIO_PCI_MMAP_MSIX

+   depends on VFIO_PCI_MMAP
+   def_bool y if EEH

Does CONFIG_EEH necessarily mean the EEH is enabled?  Could the
system
not support EEH or could EEH be disabled via kernel commandline
options?

EEH is definitely the wrong thing to test here anyway. What needs to be
tested is that the PCI Host bridge supports filtering of MSIs, so
ideally this should be some kind of host bridge attribute set by the
architecture backend.


So do you mean this attribute can be added in pci_host_bridge like this:

--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -412,6 +412,7 @@ struct pci_host_bridge {
void (*release_fn)(struct pci_host_bridge *);
void *release_data;
unsigned int ignore_reset_delay:1;  /* for entire hierarchy */
+   unsigned int msix_filtered:1;   /* support filtering of MSIs */
/* Resource alignment requirements */
resource_size_t (*align_resource)(struct pci_dev *dev,
const struct resource *res,

I can surely do it if there is no objection from PCI folks. Thanks.

Regards,
Yongji Xie


This can happen with or without CONFIG_EEH and you are right,
CONFIG_EEH can be enabled and the machine not support it.

Any IODA bridge will support this.

Cheers,
Ben.



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: How to reserve guest physical region for ACPI

2016-01-06 Thread Igor Mammedov
On Tue, 5 Jan 2016 18:22:33 +0100
Laszlo Ersek  wrote:

> On 01/05/16 18:08, Igor Mammedov wrote:
> > On Mon, 4 Jan 2016 21:17:31 +0100
> > Laszlo Ersek  wrote:
> >   
> >> Michael CC'd me on the grandparent of the email below. I'll try to add
> >> my thoughts in a single go, with regard to OVMF.
> >>
> >> On 12/30/15 20:52, Michael S. Tsirkin wrote:  
> >>> On Wed, Dec 30, 2015 at 04:55:54PM +0100, Igor Mammedov wrote:
>  On Mon, 28 Dec 2015 14:50:15 +0200
>  "Michael S. Tsirkin"  wrote:
> 
> > On Mon, Dec 28, 2015 at 10:39:04AM +0800, Xiao Guangrong wrote:
> >>
> >> Hi Michael, Paolo,
> >>
> >> Now it is the time to return to the challenge that how to reserve guest
> >> physical region internally used by ACPI.
> >>
> >> Igor suggested that:
> >> | An alternative place to allocate reserve from could be high memory.
> >> | For pc we have "reserved-memory-end" which currently makes sure
> >> | that hotpluggable memory range isn't used by firmware
> >> (https://lists.nongnu.org/archive/html/qemu-devel/2015-11/msg00926.html)
> >> 
> >>
> >> OVMF has no support for the "reserved-memory-end" fw_cfg file. The
> >> reason is that nobody wrote that patch, nor asked for the patch to be
> >> written. (Not implying that just requesting the patch would be
> >> sufficient for the patch to be written.)
> >>  
> > I don't want to tie things to reserved-memory-end because this
> > does not scale: next time we need to reserve memory,
> > we'll need to find yet another way to figure out what is where.
>  Could you elaborate a bit more on a problem you're seeing?
> 
>  To me it looks like it scales rather well.
>  For example lets imagine that we adding a device
>  that has some on device memory that should be mapped into GPA
>  code to do so would look like:
> 
>    pc_machine_device_plug_cb(dev)
>    {
> ...
> if (dev == OUR_NEW_DEVICE_TYPE) {
> memory_region_add_subregion(as, current_reserved_end, &dev->mr);
> set_new_reserved_end(current_reserved_end + 
>  memory_region_size(&dev->mr));
> }
>    }
> 
>  we can practically add any number of new devices that way.
> >>>
> >>> Yes but we'll have to build a host side allocator for these, and that's
> >>> nasty. We'll also have to maintain these addresses indefinitely (at
> >>> least per machine version) as they are guest visible.
> >>> Not only that, there's no way for guest to know if we move things
> >>> around, so basically we'll never be able to change addresses.
> >>>
> >>> 
>   
> > I would like ./hw/acpi/bios-linker-loader.c interface to be extended to
> > support 64 bit RAM instead
> >>
> >> This looks quite doable in OVMF, as long as the blob to allocate from
> >> high memory contains *zero* ACPI tables.
> >>
> >> (
> >> Namely, each ACPI table is installed from the containing fw_cfg blob
> >> with EFI_ACPI_TABLE_PROTOCOL.InstallAcpiTable(), and the latter has its
> >> own allocation policy for the *copies* of ACPI tables it installs.
> >>
> >> This allocation policy is left unspecified in the section of the UEFI
> >> spec that governs EFI_ACPI_TABLE_PROTOCOL.
> >>
> >> The current policy in edk2 (= the reference implementation) seems to be
> >> "allocate from under 4GB". It is currently being changed to "try to
> >> allocate from under 4GB, and if that fails, retry from high memory". (It
> >> is motivated by Aarch64 machines that may have no DRAM at all under 4GB.)
> >> )
> >>  
> > (and maybe a way to allocate and
> > zero-initialize buffer without loading it through fwcfg),
> >>
> >> Sounds reasonable.
> >>  
> > this way bios
> > does the allocation, and addresses can be patched into acpi.
>  and then guest side needs to parse/execute some AML that would
>  initialize QEMU side so it would know where to write data.
> >>>
> >>> Well not really - we can put it in a data table, by itself
> >>> so it's easy to find.
> >>
> >> Do you mean acpi_tb_find_table(), acpi_get_table_by_index() /
> >> acpi_get_table_with_size()?
> >>  
> >>>
> >>> AML is only needed if access from ACPI is desired.
> >>>
> >>> 
>  bios-linker-loader is a great interface for initializing some
>  guest owned data and linking it together but I think it adds
>  unnecessary complexity and is misused if it's used to handle
>  device owned data/on device memory in this and VMGID cases.
> >>>
> >>> I want a generic interface for guest to enumerate these things.  linker
> >>> seems quite reasonable but if you see a reason why it won't do, or want
> >>> to propose a better interface, fine.
> >>
> >> * The guest could do the following:
> >> - while processing the ALLOCATE commands, it would make a note where in
> >> GPA space each fw_cfg blob gets allocated
> >> - at the 

Re: [PATCH 3/6] nvdimm acpi: introduce patched dsm memory

2016-01-06 Thread Xiao Guangrong



On 01/06/2016 11:23 PM, Igor Mammedov wrote:

On Tue,  5 Jan 2016 02:52:05 +0800
Xiao Guangrong  wrote:


The dsm memory is used to save the input parameters and store
the dsm result which is filled by QEMU.

The address of dsm memory is decided by bios and patched into
int64 object returned by "MEMA" method

Signed-off-by: Xiao Guangrong 
---
  hw/acpi/aml-build.c | 12 
  hw/acpi/nvdimm.c| 24 ++--
  include/hw/acpi/aml-build.h |  1 +
  3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 78e1290..83eadb3 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -394,6 +394,18 @@ Aml *aml_int(const uint64_t val)
  }

  /*
+ * ACPI 1.0b: 16.2.3 Data Objects Encoding:
+ * encode: QWordConst
+ */
+Aml *aml_int64(const uint64_t val)
+{
+Aml *var = aml_alloc();
+build_append_byte(var->buf, 0x0E); /* QWordPrefix */
+build_append_int_noprefix(var->buf, val, 8);
+return var;
+}
+
+/*
   * helper to construct NameString, which returns Aml object
   * for using with aml_append or other aml_* terms
   */
diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index bc7cd8f..a72104c 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -28,6 +28,7 @@

  #include "hw/acpi/acpi.h"
  #include "hw/acpi/aml-build.h"
+#include "hw/acpi/bios-linker-loader.h"
  #include "hw/nvram/fw_cfg.h"
  #include "hw/mem/nvdimm.h"

@@ -402,7 +403,8 @@ void nvdimm_init_acpi_state(AcpiNVDIMMState *state, 
MemoryRegion *io,
  state->dsm_mem->len);
  }

-#define NVDIMM_COMMON_DSM  "NCAL"
+#define NVDIMM_GET_DSM_MEM  "MEMA"
+#define NVDIMM_COMMON_DSM   "NCAL"

  static void nvdimm_build_common_dsm(Aml *dev)
  {
@@ -468,7 +470,8 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray 
*table_offsets,
GArray *table_data, GArray *linker,
uint8_t revision)
  {
-Aml *ssdt, *sb_scope, *dev;
+Aml *ssdt, *sb_scope, *dev, *method;
+int offset;

  acpi_add_table(table_offsets, table_data);

@@ -499,9 +502,26 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray 
*table_offsets,

  aml_append(sb_scope, dev);

+/*
+ * leave it at the end of ssdt so that we can conveniently get the
+ * offset of int64 object returned by the function which will be
+ * patched with the real address of the dsm memory by BIOS.
+ */
+method = aml_method(NVDIMM_GET_DSM_MEM, 0, AML_NOTSERIALIZED);
+aml_append(method, aml_return(aml_int64(0x0)));

there is no need in dedicated aml_int64(), you can use aml_int(0x64) 
trick


We can not do this due to the trick in  bios_linker_loader_add_pointer() which 
will
issue a COMMAND_ADD_POINTER to BIOS, however, this request does:
/*
 * COMMAND_ADD_POINTER - patch the table (originating from
 * @dest_file) at @pointer.offset, by adding a pointer to the table
 * originating from @src_file. 1,2,4 or 8 byte unsigned
 * addition is used depending on @pointer.size.
 */

that means the new-offset = old-offset + the address of the new table allocated 
by BIOS.

So we expect 0 offset here.




+aml_append(sb_scope, method);
  aml_append(ssdt, sb_scope);
  /* copy AML table into ACPI tables blob and patch header there */
  g_array_append_vals(table_data, ssdt->buf->data, ssdt->buf->len);
+
+offset = table_data->len - 8;
+
+bios_linker_loader_alloc(linker, NVDIMM_DSM_MEM_FILE, TARGET_PAGE_SIZE,
+ false /* high memory */);
+bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
+   NVDIMM_DSM_MEM_FILE, table_data,
+   table_data->data + offset,
+   sizeof(uint64_t));

this offset magic will break badly as soon as someone add something
to the end of SSDT.



Yes, it is, so don't do that, :) and this is why we made the comment here:
 +/*
 + * leave it at the end of ssdt so that we can conveniently get the
 + * offset of int64 object returned by the function which will be
 + * patched with the real address of the dsm memory by BIOS.
 + */


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: How to reserve guest physical region for ACPI

2016-01-06 Thread Laszlo Ersek
On 01/06/16 14:39, Igor Mammedov wrote:
> On Tue, 5 Jan 2016 18:22:33 +0100
> Laszlo Ersek  wrote:
> 
>> On 01/05/16 18:08, Igor Mammedov wrote:
>>> On Mon, 4 Jan 2016 21:17:31 +0100
>>> Laszlo Ersek  wrote:
>>>   
 Michael CC'd me on the grandparent of the email below. I'll try to add
 my thoughts in a single go, with regard to OVMF.

 On 12/30/15 20:52, Michael S. Tsirkin wrote:  
> On Wed, Dec 30, 2015 at 04:55:54PM +0100, Igor Mammedov wrote:
>> On Mon, 28 Dec 2015 14:50:15 +0200
>> "Michael S. Tsirkin"  wrote:
>>
>>> On Mon, Dec 28, 2015 at 10:39:04AM +0800, Xiao Guangrong wrote:

 Hi Michael, Paolo,

 Now it is the time to return to the challenge that how to reserve guest
 physical region internally used by ACPI.

 Igor suggested that:
 | An alternative place to allocate reserve from could be high memory.
 | For pc we have "reserved-memory-end" which currently makes sure
 | that hotpluggable memory range isn't used by firmware
 (https://lists.nongnu.org/archive/html/qemu-devel/2015-11/msg00926.html)
 

 OVMF has no support for the "reserved-memory-end" fw_cfg file. The
 reason is that nobody wrote that patch, nor asked for the patch to be
 written. (Not implying that just requesting the patch would be
 sufficient for the patch to be written.)
  
>>> I don't want to tie things to reserved-memory-end because this
>>> does not scale: next time we need to reserve memory,
>>> we'll need to find yet another way to figure out what is where.
>> Could you elaborate a bit more on a problem you're seeing?
>>
>> To me it looks like it scales rather well.
>> For example lets imagine that we adding a device
>> that has some on device memory that should be mapped into GPA
>> code to do so would look like:
>>
>>   pc_machine_device_plug_cb(dev)
>>   {
>>...
>>if (dev == OUR_NEW_DEVICE_TYPE) {
>>memory_region_add_subregion(as, current_reserved_end, &dev->mr);
>>set_new_reserved_end(current_reserved_end + 
>> memory_region_size(&dev->mr));
>>}
>>   }
>>
>> we can practically add any number of new devices that way.
>
> Yes but we'll have to build a host side allocator for these, and that's
> nasty. We'll also have to maintain these addresses indefinitely (at
> least per machine version) as they are guest visible.
> Not only that, there's no way for guest to know if we move things
> around, so basically we'll never be able to change addresses.
>
> 
>>  
>>> I would like ./hw/acpi/bios-linker-loader.c interface to be extended to
>>> support 64 bit RAM instead

 This looks quite doable in OVMF, as long as the blob to allocate from
 high memory contains *zero* ACPI tables.

 (
 Namely, each ACPI table is installed from the containing fw_cfg blob
 with EFI_ACPI_TABLE_PROTOCOL.InstallAcpiTable(), and the latter has its
 own allocation policy for the *copies* of ACPI tables it installs.

 This allocation policy is left unspecified in the section of the UEFI
 spec that governs EFI_ACPI_TABLE_PROTOCOL.

 The current policy in edk2 (= the reference implementation) seems to be
 "allocate from under 4GB". It is currently being changed to "try to
 allocate from under 4GB, and if that fails, retry from high memory". (It
 is motivated by Aarch64 machines that may have no DRAM at all under 4GB.)
 )
  
>>> (and maybe a way to allocate and
>>> zero-initialize buffer without loading it through fwcfg),

 Sounds reasonable.
  
>>> this way bios
>>> does the allocation, and addresses can be patched into acpi.
>> and then guest side needs to parse/execute some AML that would
>> initialize QEMU side so it would know where to write data.
>
> Well not really - we can put it in a data table, by itself
> so it's easy to find.

 Do you mean acpi_tb_find_table(), acpi_get_table_by_index() /
 acpi_get_table_with_size()?
  
>
> AML is only needed if access from ACPI is desired.
>
> 
>> bios-linker-loader is a great interface for initializing some
>> guest owned data and linking it together but I think it adds
>> unnecessary complexity and is misused if it's used to handle
>> device owned data/on device memory in this and VMGID cases.
>
> I want a generic interface for guest to enumerate these things.  linker
> seems quite reasonable but if you see a reason why it won't do, or want
> to propose a better interface, fine.

 * The guest could do the following:
 - while processing the ALLOCATE commands, it would make a note where in
 GPA space 

Re: [PATCH 3/6] nvdimm acpi: introduce patched dsm memory

2016-01-06 Thread Igor Mammedov
On Tue,  5 Jan 2016 02:52:05 +0800
Xiao Guangrong  wrote:

> The dsm memory is used to save the input parameters and store
> the dsm result which is filled by QEMU.
> 
> The address of dsm memory is decided by bios and patched into
> int64 object returned by "MEMA" method
> 
> Signed-off-by: Xiao Guangrong 
> ---
>  hw/acpi/aml-build.c | 12 
>  hw/acpi/nvdimm.c| 24 ++--
>  include/hw/acpi/aml-build.h |  1 +
>  3 files changed, 35 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
> index 78e1290..83eadb3 100644
> --- a/hw/acpi/aml-build.c
> +++ b/hw/acpi/aml-build.c
> @@ -394,6 +394,18 @@ Aml *aml_int(const uint64_t val)
>  }
>  
>  /*
> + * ACPI 1.0b: 16.2.3 Data Objects Encoding:
> + * encode: QWordConst
> + */
> +Aml *aml_int64(const uint64_t val)
> +{
> +Aml *var = aml_alloc();
> +build_append_byte(var->buf, 0x0E); /* QWordPrefix */
> +build_append_int_noprefix(var->buf, val, 8);
> +return var;
> +}
> +
> +/*
>   * helper to construct NameString, which returns Aml object
>   * for using with aml_append or other aml_* terms
>   */
> diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
> index bc7cd8f..a72104c 100644
> --- a/hw/acpi/nvdimm.c
> +++ b/hw/acpi/nvdimm.c
> @@ -28,6 +28,7 @@
>  
>  #include "hw/acpi/acpi.h"
>  #include "hw/acpi/aml-build.h"
> +#include "hw/acpi/bios-linker-loader.h"
>  #include "hw/nvram/fw_cfg.h"
>  #include "hw/mem/nvdimm.h"
>  
> @@ -402,7 +403,8 @@ void nvdimm_init_acpi_state(AcpiNVDIMMState *state, 
> MemoryRegion *io,
>  state->dsm_mem->len);
>  }
>  
> -#define NVDIMM_COMMON_DSM  "NCAL"
> +#define NVDIMM_GET_DSM_MEM  "MEMA"
> +#define NVDIMM_COMMON_DSM   "NCAL"
>  
>  static void nvdimm_build_common_dsm(Aml *dev)
>  {
> @@ -468,7 +470,8 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray 
> *table_offsets,
>GArray *table_data, GArray *linker,
>uint8_t revision)
>  {
> -Aml *ssdt, *sb_scope, *dev;
> +Aml *ssdt, *sb_scope, *dev, *method;
> +int offset;
>  
>  acpi_add_table(table_offsets, table_data);
>  
> @@ -499,9 +502,26 @@ static void nvdimm_build_ssdt(GSList *device_list, 
> GArray *table_offsets,
>  
>  aml_append(sb_scope, dev);
>  
> +/*
> + * leave it at the end of ssdt so that we can conveniently get the
> + * offset of int64 object returned by the function which will be
> + * patched with the real address of the dsm memory by BIOS.
> + */
> +method = aml_method(NVDIMM_GET_DSM_MEM, 0, AML_NOTSERIALIZED);
> +aml_append(method, aml_return(aml_int64(0x0)));
there is no need in dedicated aml_int64(), you can use aml_int(0x64) 
trick

> +aml_append(sb_scope, method);
>  aml_append(ssdt, sb_scope);
>  /* copy AML table into ACPI tables blob and patch header there */
>  g_array_append_vals(table_data, ssdt->buf->data, ssdt->buf->len);
> +
> +offset = table_data->len - 8;
> +
> +bios_linker_loader_alloc(linker, NVDIMM_DSM_MEM_FILE, TARGET_PAGE_SIZE,
> + false /* high memory */);
> +bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
> +   NVDIMM_DSM_MEM_FILE, table_data,
> +   table_data->data + offset,
> +   sizeof(uint64_t));
this offset magic will break badly as soon as someone add something
to the end of SSDT.


>  build_header(linker, table_data,
>  (void *)(table_data->data + table_data->len - ssdt->buf->len),
>  "SSDT", ssdt->buf->len, revision, "NVDIMM");
> diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
> index ef44d02..b4726a4 100644
> --- a/include/hw/acpi/aml-build.h
> +++ b/include/hw/acpi/aml-build.h
> @@ -246,6 +246,7 @@ Aml *aml_name(const char *name_format, ...) 
> GCC_FMT_ATTR(1, 2);
>  Aml *aml_name_decl(const char *name, Aml *val);
>  Aml *aml_return(Aml *val);
>  Aml *aml_int(const uint64_t val);
> +Aml *aml_int64(const uint64_t val);
>  Aml *aml_arg(int pos);
>  Aml *aml_to_integer(Aml *arg);
>  Aml *aml_to_hexstring(Aml *src, Aml *dst);

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 110441] New: KVM guests randomly get I/O errors on VirtIO based devices

2016-01-06 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=110441

Bug ID: 110441
   Summary: KVM guests randomly get I/O errors on VirtIO based
devices
   Product: Virtualization
   Version: unspecified
Kernel Version: 3.16.7-ckt11-1+deb8u5
  Hardware: All
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: normal
  Priority: P1
 Component: kvm
  Assignee: virtualization_...@kernel-bugs.osdl.org
  Reporter: jordi.mall...@collabora.co.uk
Regression: No

We've been seeing a strange bug in KVM guests hosted by a Debian jessie box
(running 3.16.7-ckt11-1+deb8u5 on x86-64),

Basically, we are getting random VirtIO errors inside our guests, resulting in
stuff like this

[4735406.568235] blk_update_request: I/O error, dev vda, sector 142339584
[4735406.572008] EXT4-fs warning (device dm-0): ext4_end_bio:317: I/O error -5
writing to inode 1184437 (offset 0 size 208896 starting block 17729472)
[4735406.572008] Buffer I/O error on device dm-0, logical block 17729472
[ ... ]
[4735406.572008] Buffer I/O error on device dm-0, logical block 17729481
[4735406.643486] blk_update_request: I/O error, dev vda, sector 142356480
[ ... ]
[4735406.748456] blk_update_request: I/O error, dev vda, sector 38587480
[4735411.020309] Buffer I/O error on dev dm-0, logical block 12640808, lost
sync page write
[4735411.055184] Aborting journal on device dm-0-8.
[4735411.056148] Buffer I/O error on dev dm-0, logical block 12615680, lost
sync page write
[4735411.057626] JBD2: Error -5 detected when updating journal superblock for
dm-0-8.
[4735411.057936] Buffer I/O error on dev dm-0, logical block 0, lost sync page
write
[4735411.057946] EXT4-fs error (device dm-0): ext4_journal_check_start:56:
Detected aborted journal
[4735411.057948] EXT4-fs (dm-0): Remounting filesystem read-only
[4735411.057949] EXT4-fs (dm-0): previous I/O error to superblock detected

(From an Ubuntu 15.04 guest, EXT4 on LVM2)

Or,

Jan 06 03:39:11 titanium kernel: end_request: I/O error, dev vda, sector
1592467904
Jan 06 03:39:11 titanium kernel: EXT4-fs warning (device vda3):
ext4_end_bio:317: I/O error -5 writing to inode 31169653 (offset 0 size 0
starting block 199058492)
Jan 06 03:39:11 titanium kernel: Buffer I/O error on device vda3, logical block
198899256
[...]
Jan 06 03:39:12 titanium kernel: Aborting journal on device vda3-8.
Jan 06 03:39:12 titanium kernel: Buffer I/O error on device vda3, logical block
99647488

(From a Debian jessie guest, EXT4 directly on a VirtIO-based block device)

When this happens, it affects multiple guests on the hosts at the same time.
Normally they are severe enough that they end up with a r/o file system, but
we've seen a few hosts survive with a non-fatal I/O error. The host's dmesg has
nothing interesting to see.

We've seen this happen with quite heterogeneous guests:

Debian 6, 7 and 8 (Debian kernels 2.6.32, 3.2 and 3.16)
Ubuntu 14.09 and 15.04 (Ubuntu kernels)
32 bit and 64 bit installs.

In short, we haven't seen a clear characteristic in any guest, other than the
affected hosts being the ones with some sustained I/O load (build machines,
cgit servers, PostgreSQL RDBMs...). Most of the times, hosts that just sit
there doing nothing with their disks are not affected.

The host is a stock Debian jessie install that manages libvirt-based QEMU
guests. All the guests have their block devices using virtio drivers, some of
them on spinning media based on LSI RAID (was a 3ware card before, got replaced
as we were very suspicious about it, but are getting the same results), and
some of them based on PCIe SSD storage. We have some other 3 hosts, similar
setup except they run Debian wheezy (and honestly we're not too keen on
upgrading them yet, just in case), none of them has ever shown this kind of
problem

We've been seeing this since last summer, and haven't found a pattern that
tells us where these I/O error bugs are coming from. Google isn't revealing
other people with a similar problem, and we're finding that quite surprising as
our setup is quite basic.

This has also been reported downstream at the Debian BTS as Bug#810121
(https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=810121).

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: HAPPY NEW YEAR

2016-01-06 Thread Carl Leinbach


From: Carl Leinbach
Sent: Wednesday, January 06, 2016 4:21 PM
To: Carl Leinbach
Subject: HAPPY NEW YEAR


Donation has been made to you by Mrs. Liliane Bettencourt. Contact email: 
mrslilianebettencou...@gmail.com  for more details

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: HAPPY NEW YEAR

2016-01-06 Thread Carl Leinbach


From: Carl Leinbach
Sent: Wednesday, January 06, 2016 4:21 PM
To: Carl Leinbach
Subject: HAPPY NEW YEAR


Donation has been made to you by Mrs. Liliane Bettencourt. Contact email: 
mrslilianebettencou...@gmail.com  for more details

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


What's kvmclock's custom sched_clock for?

2016-01-06 Thread Andy Lutomirski
AFAICT KVM reliably passes a monotonic TSC through to guests, even if
the host suspends.  That's all that sched_clock needs, I think.

So why does kvmclock have a custom sched_clock?

On a related note, KVM doesn't pass the "invariant TSC" feature
through to guests on my machine even though "invtsc" is set in QEMU
and the kernel host code appears to support it.  What gives?

--Andy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 0/4] scsi: cleanup ioctl headers and provide UAPI versions

2016-01-06 Thread Martin K. Petersen
> "Paolo" == Paolo Bonzini  writes:

>> This is v3 of the series to provide an "official" sg.h header (and
>> scsi_ioctl.h too, though it's basically obsolete) together with the
>> other userspace API definitions.  The change from v2 to v3 is that
>> defaults for sg.c are not exported in include/uapi/linux/sg.c.

Paolo> What happened to these patches?...

They predate me being patch monkey. Please repost with any review tags
or acks you may have received.

-- 
Martin K. Petersen  Oracle Linux Engineering
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Michael S. Tsirkin
On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
> >> The two mechanisms referenced above would likely require coordination with
> >> QEMU and as such are open to discussion.  I haven't attempted to address
> >> them as I am not sure there is a consensus as of yet.  My personal
> >> preference would be to add a vendor-specific configuration block to the
> >> emulated pci-bridge interfaces created by QEMU that would allow us to
> >> essentially extend shpc to support guest live migration with pass-through
> >> devices.
> >
> > shpc?
> 
> That is kind of what I was thinking.  We basically need some mechanism
> to allow for the host to ask the device to quiesce.  It has been
> proposed to possibly even look at something like an ACPI interface
> since I know ACPI is used by QEMU to manage hot-plug in the standard
> case.
> 
> - Alex


Start by using hot-unplug for this!

Really use your patch guest side, and write host side
to allow starting migration with the device, but
defer completing it.

So

1.- host tells guest to start tracking memory writes
2.- guest acks
3.- migration starts
4.- most memory is migrated
5.- host tells guest to eject device
6.- guest acks
7.- stop vm and migrate rest of state


It will already be a win since hot unplug after migration starts and
most memory has been migrated is better than hot unplug before migration
starts.

Then measure downtime and profile. Then we can look at ways
to quiesce device faster which really means step 5 is replaced
with "host tells guest to quiesce device and dirty (or just unmap!)
all memory mapped for write by device".

-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Dr. David Alan Gilbert
* Michael S. Tsirkin (m...@redhat.com) wrote:
> On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
> > >> The two mechanisms referenced above would likely require coordination 
> > >> with
> > >> QEMU and as such are open to discussion.  I haven't attempted to address
> > >> them as I am not sure there is a consensus as of yet.  My personal
> > >> preference would be to add a vendor-specific configuration block to the
> > >> emulated pci-bridge interfaces created by QEMU that would allow us to
> > >> essentially extend shpc to support guest live migration with pass-through
> > >> devices.
> > >
> > > shpc?
> > 
> > That is kind of what I was thinking.  We basically need some mechanism
> > to allow for the host to ask the device to quiesce.  It has been
> > proposed to possibly even look at something like an ACPI interface
> > since I know ACPI is used by QEMU to manage hot-plug in the standard
> > case.
> > 
> > - Alex
> 
> 
> Start by using hot-unplug for this!
> 
> Really use your patch guest side, and write host side
> to allow starting migration with the device, but
> defer completing it.
> 
> So
> 
> 1.- host tells guest to start tracking memory writes
> 2.- guest acks
> 3.- migration starts
> 4.- most memory is migrated
> 5.- host tells guest to eject device
> 6.- guest acks
> 7.- stop vm and migrate rest of state
> 
> 
> It will already be a win since hot unplug after migration starts and
> most memory has been migrated is better than hot unplug before migration
> starts.
> 
> Then measure downtime and profile. Then we can look at ways
> to quiesce device faster which really means step 5 is replaced
> with "host tells guest to quiesce device and dirty (or just unmap!)
> all memory mapped for write by device".


Doing a hot-unplug is going to upset the guests network stacks view
of the world; that's something we don't want to change.

Dave

> 
> -- 
> MST
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Michael S. Tsirkin
On Tue, Jan 05, 2016 at 10:01:04AM +, Dr. David Alan Gilbert wrote:
> * Michael S. Tsirkin (m...@redhat.com) wrote:
> > On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
> > > >> The two mechanisms referenced above would likely require coordination 
> > > >> with
> > > >> QEMU and as such are open to discussion.  I haven't attempted to 
> > > >> address
> > > >> them as I am not sure there is a consensus as of yet.  My personal
> > > >> preference would be to add a vendor-specific configuration block to the
> > > >> emulated pci-bridge interfaces created by QEMU that would allow us to
> > > >> essentially extend shpc to support guest live migration with 
> > > >> pass-through
> > > >> devices.
> > > >
> > > > shpc?
> > > 
> > > That is kind of what I was thinking.  We basically need some mechanism
> > > to allow for the host to ask the device to quiesce.  It has been
> > > proposed to possibly even look at something like an ACPI interface
> > > since I know ACPI is used by QEMU to manage hot-plug in the standard
> > > case.
> > > 
> > > - Alex
> > 
> > 
> > Start by using hot-unplug for this!
> > 
> > Really use your patch guest side, and write host side
> > to allow starting migration with the device, but
> > defer completing it.
> > 
> > So
> > 
> > 1.- host tells guest to start tracking memory writes
> > 2.- guest acks
> > 3.- migration starts
> > 4.- most memory is migrated
> > 5.- host tells guest to eject device
> > 6.- guest acks
> > 7.- stop vm and migrate rest of state
> > 
> > 
> > It will already be a win since hot unplug after migration starts and
> > most memory has been migrated is better than hot unplug before migration
> > starts.
> > 
> > Then measure downtime and profile. Then we can look at ways
> > to quiesce device faster which really means step 5 is replaced
> > with "host tells guest to quiesce device and dirty (or just unmap!)
> > all memory mapped for write by device".
> 
> 
> Doing a hot-unplug is going to upset the guests network stacks view
> of the world; that's something we don't want to change.
> 
> Dave

It might but if you store the IP and restore it quickly
after migration e.g. using guest agent, as opposed to DHCP,
then it won't.

It allows calming the device down in a generic way,
specific drivers can then implement the fast quiesce.

> > 
> > -- 
> > MST
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Alexander Duyck
On Tue, Jan 5, 2016 at 1:40 AM, Michael S. Tsirkin  wrote:
> On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
>> >> The two mechanisms referenced above would likely require coordination with
>> >> QEMU and as such are open to discussion.  I haven't attempted to address
>> >> them as I am not sure there is a consensus as of yet.  My personal
>> >> preference would be to add a vendor-specific configuration block to the
>> >> emulated pci-bridge interfaces created by QEMU that would allow us to
>> >> essentially extend shpc to support guest live migration with pass-through
>> >> devices.
>> >
>> > shpc?
>>
>> That is kind of what I was thinking.  We basically need some mechanism
>> to allow for the host to ask the device to quiesce.  It has been
>> proposed to possibly even look at something like an ACPI interface
>> since I know ACPI is used by QEMU to manage hot-plug in the standard
>> case.
>>
>> - Alex
>
>
> Start by using hot-unplug for this!
>
> Really use your patch guest side, and write host side
> to allow starting migration with the device, but
> defer completing it.

Yeah, I'm fully on board with this idea, though I'm not really working
on this right now since last I knew the folks on this thread from
Intel were working on it.  My patches were mostly meant to be a nudge
in this direction so that we could get away from the driver specific
code.

> So
>
> 1.- host tells guest to start tracking memory writes
> 2.- guest acks
> 3.- migration starts
> 4.- most memory is migrated
> 5.- host tells guest to eject device
> 6.- guest acks
> 7.- stop vm and migrate rest of state
>

Sounds about right.  The only way this differs from what I see as the
final solution for this is that instead of fully ejecting the device
in step 5 the driver would instead pause the device and give the host
something like 10 seconds to stop the VM and resume with the same
device connected if it is available.  We would probably also need to
look at a solution that would force the device to be ejected or abort
prior to starting the migration if it doesn't give us the ack in step
2.

> It will already be a win since hot unplug after migration starts and
> most memory has been migrated is better than hot unplug before migration
> starts.

Right.  Generally the longer the VF can be maintained as a part of the
guest the longer the network performance is improved versus using a
purely virtual interface.

> Then measure downtime and profile. Then we can look at ways
> to quiesce device faster which really means step 5 is replaced
> with "host tells guest to quiesce device and dirty (or just unmap!)
> all memory mapped for write by device".

Step 5 will be the spot where we really need to start modifying
drivers.  Specifically we probably need to go through and clean-up
things so that we can reduce as many of the delays in the driver
suspend/resume path as possible.  I suspect there is quite a bit that
can be done there that would probably also improve boot and shutdown
times since those are also impacted by the devices.

- Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: How to reserve guest physical region for ACPI

2016-01-05 Thread Igor Mammedov
On Wed, 30 Dec 2015 21:52:32 +0200
"Michael S. Tsirkin"  wrote:

> On Wed, Dec 30, 2015 at 04:55:54PM +0100, Igor Mammedov wrote:
> > On Mon, 28 Dec 2015 14:50:15 +0200
> > "Michael S. Tsirkin"  wrote:
> >   
> > > On Mon, Dec 28, 2015 at 10:39:04AM +0800, Xiao Guangrong wrote:  
> > > > 
> > > > Hi Michael, Paolo,
> > > > 
> > > > Now it is the time to return to the challenge that how to reserve guest
> > > > physical region internally used by ACPI.
> > > > 
> > > > Igor suggested that:
> > > > | An alternative place to allocate reserve from could be high memory.
> > > > | For pc we have "reserved-memory-end" which currently makes sure
> > > > | that hotpluggable memory range isn't used by firmware
> > > > (https://lists.nongnu.org/archive/html/qemu-devel/2015-11/msg00926.html)
> > > >   
> > > 
> > > I don't want to tie things to reserved-memory-end because this
> > > does not scale: next time we need to reserve memory,
> > > we'll need to find yet another way to figure out what is where.  
> > Could you elaborate a bit more on a problem you're seeing?
> > 
> > To me it looks like it scales rather well.
> > For example lets imagine that we adding a device
> > that has some on device memory that should be mapped into GPA
> > code to do so would look like:
> > 
> >   pc_machine_device_plug_cb(dev)
> >   {
> >...
> >if (dev == OUR_NEW_DEVICE_TYPE) {
> >memory_region_add_subregion(as, current_reserved_end, &dev->mr);
> >set_new_reserved_end(current_reserved_end + 
> > memory_region_size(&dev->mr));
> >}
> >   }
> > 
> > we can practically add any number of new devices that way.  
> 
> Yes but we'll have to build a host side allocator for these, and that's
> nasty. We'll also have to maintain these addresses indefinitely (at
> least per machine version) as they are guest visible.
> Not only that, there's no way for guest to know if we move things
> around, so basically we'll never be able to change addresses.
simplistic GPA allocator in snippet above does the job,

if one unconditionally adds a device in new version then yes
code has to have compat code based on machine version.
But that applies to any device that has a state to migrate
or to any address space layout change.

However device that directly maps addresses doesn't have to
have fixed address though, it could behave the same way as
PCI device with BARs, with only difference that its
MemoryRegions are mapped before guest is running vs
BARs mapped by BIOS.
It could be worth to create a generic base device class
that would do above. Then it could be inherited from and
extended by concrete device implementations.

> >
> > > I would like ./hw/acpi/bios-linker-loader.c interface to be extended to
> > > support 64 bit RAM instead (and maybe a way to allocate and
> > > zero-initialize buffer without loading it through fwcfg), this way bios
> > > does the allocation, and addresses can be patched into acpi.  
> > and then guest side needs to parse/execute some AML that would
> > initialize QEMU side so it would know where to write data.  
> 
> Well not really - we can put it in a data table, by itself
> so it's easy to find.
> 
> AML is only needed if access from ACPI is desired.
in both cases (VMGEN, NVDIMM) access from ACPI is required
at minimum to write the address back to QEMU and for NVDIMM
to pass _DSM method data between guest and QEMU.

> 
> 
> > bios-linker-loader is a great interface for initializing some
> > guest owned data and linking it together but I think it adds
> > unnecessary complexity and is misused if it's used to handle
> > device owned data/on device memory in this and VMGID cases.  
> 
> I want a generic interface for guest to enumerate these things.  linker
> seems quite reasonable but if you see a reason why it won't do, or want
> to propose a better interface, fine.
> 
> PCI would do, too - though windows guys had concerns about
> returning PCI BARs from ACPI.
There were potential issues with pSeries bootloader that treated
PCI_CLASS_MEMORY_RAM as conventional RAM but it was fixed.
Could you point out to discussion about windows issues?

What VMGEN patches that used PCI for mapping purposes were
stuck at, was that it was suggested to use PCI_CLASS_MEMORY_RAM
class id but we couldn't agree on it.

VMGEN v13 with full discussion is here
https://patchwork.ozlabs.org/patch/443554/
So to continue with this route we would need to pick some other
driver less class id so windows won't prompt for driver or
maybe supply our own driver stub to guarantee that no one
would touch it. Any suggestions?

> 
> 
> > There was RFC on list to make BIOS boot from NVDIMM already
> > doing some ACPI table lookup/parsing. Now if they were forced
> > to also parse and execute AML to initialize QEMU with guest
> > allocated address that would complicate them quite a bit.  
> 
> If they just need to find a table by name, it won't be
> too bad, would it?
that's what they were doing scanning memory for 

Re: How to reserve guest physical region for ACPI

2016-01-05 Thread Igor Mammedov
On Mon, 4 Jan 2016 21:17:31 +0100
Laszlo Ersek  wrote:

> Michael CC'd me on the grandparent of the email below. I'll try to add
> my thoughts in a single go, with regard to OVMF.
> 
> On 12/30/15 20:52, Michael S. Tsirkin wrote:
> > On Wed, Dec 30, 2015 at 04:55:54PM +0100, Igor Mammedov wrote:  
> >> On Mon, 28 Dec 2015 14:50:15 +0200
> >> "Michael S. Tsirkin"  wrote:
> >>  
> >>> On Mon, Dec 28, 2015 at 10:39:04AM +0800, Xiao Guangrong wrote:  
> 
>  Hi Michael, Paolo,
> 
>  Now it is the time to return to the challenge that how to reserve guest
>  physical region internally used by ACPI.
> 
>  Igor suggested that:
>  | An alternative place to allocate reserve from could be high memory.
>  | For pc we have "reserved-memory-end" which currently makes sure
>  | that hotpluggable memory range isn't used by firmware
>  (https://lists.nongnu.org/archive/html/qemu-devel/2015-11/msg00926.html) 
>   
> 
> OVMF has no support for the "reserved-memory-end" fw_cfg file. The
> reason is that nobody wrote that patch, nor asked for the patch to be
> written. (Not implying that just requesting the patch would be
> sufficient for the patch to be written.)
> 
> >>> I don't want to tie things to reserved-memory-end because this
> >>> does not scale: next time we need to reserve memory,
> >>> we'll need to find yet another way to figure out what is where.  
> >> Could you elaborate a bit more on a problem you're seeing?
> >>
> >> To me it looks like it scales rather well.
> >> For example lets imagine that we adding a device
> >> that has some on device memory that should be mapped into GPA
> >> code to do so would look like:
> >>
> >>   pc_machine_device_plug_cb(dev)
> >>   {
> >>...
> >>if (dev == OUR_NEW_DEVICE_TYPE) {
> >>memory_region_add_subregion(as, current_reserved_end, &dev->mr);
> >>set_new_reserved_end(current_reserved_end + 
> >> memory_region_size(&dev->mr));
> >>}
> >>   }
> >>
> >> we can practically add any number of new devices that way.  
> > 
> > Yes but we'll have to build a host side allocator for these, and that's
> > nasty. We'll also have to maintain these addresses indefinitely (at
> > least per machine version) as they are guest visible.
> > Not only that, there's no way for guest to know if we move things
> > around, so basically we'll never be able to change addresses.
> > 
> >   
> >>
> >>> I would like ./hw/acpi/bios-linker-loader.c interface to be extended to
> >>> support 64 bit RAM instead  
> 
> This looks quite doable in OVMF, as long as the blob to allocate from
> high memory contains *zero* ACPI tables.
> 
> (
> Namely, each ACPI table is installed from the containing fw_cfg blob
> with EFI_ACPI_TABLE_PROTOCOL.InstallAcpiTable(), and the latter has its
> own allocation policy for the *copies* of ACPI tables it installs.
> 
> This allocation policy is left unspecified in the section of the UEFI
> spec that governs EFI_ACPI_TABLE_PROTOCOL.
> 
> The current policy in edk2 (= the reference implementation) seems to be
> "allocate from under 4GB". It is currently being changed to "try to
> allocate from under 4GB, and if that fails, retry from high memory". (It
> is motivated by Aarch64 machines that may have no DRAM at all under 4GB.)
> )
> 
> >>> (and maybe a way to allocate and
> >>> zero-initialize buffer without loading it through fwcfg),  
> 
> Sounds reasonable.
> 
> >>> this way bios
> >>> does the allocation, and addresses can be patched into acpi.  
> >> and then guest side needs to parse/execute some AML that would
> >> initialize QEMU side so it would know where to write data.  
> > 
> > Well not really - we can put it in a data table, by itself
> > so it's easy to find.  
> 
> Do you mean acpi_tb_find_table(), acpi_get_table_by_index() /
> acpi_get_table_with_size()?
> 
> > 
> > AML is only needed if access from ACPI is desired.
> > 
> >   
> >> bios-linker-loader is a great interface for initializing some
> >> guest owned data and linking it together but I think it adds
> >> unnecessary complexity and is misused if it's used to handle
> >> device owned data/on device memory in this and VMGID cases.  
> > 
> > I want a generic interface for guest to enumerate these things.  linker
> > seems quite reasonable but if you see a reason why it won't do, or want
> > to propose a better interface, fine.  
> 
> * The guest could do the following:
> - while processing the ALLOCATE commands, it would make a note where in
> GPA space each fw_cfg blob gets allocated
> - at the end the guest would prepare a temporary array with a predefined
> record format, that associates each fw_cfg blob's name with the concrete
> allocation address
> - it would create an FWCfgDmaAccess structure pointing at this array,
> with a new "control" bit set (or something similar)
> - the guest could write the address of the FWCfgDmaAccess struct to the
> appropriate register, as always.
> 
> * Another idea 

Re: How to reserve guest physical region for ACPI

2016-01-05 Thread Xiao Guangrong



On 01/06/2016 12:43 AM, Michael S. Tsirkin wrote:


Yes - if address is static, you need to put it outside
the table. Can come right before or right after this.


Also if OperationRegion() is used, then one has to patch
DefOpRegion directly as RegionOffset must be Integer,
using variable names is not permitted there.


I am not sure the comment was understood correctly.
The comment says really "we can't use DataTableRegion
so here is an alternative".

so how are you going to access data at which patched
NameString point to?
for that you'd need a normal patched OperationRegion
as well since DataTableRegion isn't usable.


For VMGENID you would patch the method that
returns the address - you do not need an op region
as you never access it.

I don't know about NVDIMM. Maybe OperationRegion can
use the patched NameString? Will need some thought.


The ACPI spec says that the offsetTerm in OperationRegion
is evaluated as Int, so the named object is allowed to be
used in OperationRegion, that is exact what my patchset
is doing (http://marc.info/?l=kvm&m=145193395624537&w=2):

+dsm_mem = aml_arg(3);
+aml_append(method, aml_store(aml_call0(NVDIMM_GET_DSM_MEM), dsm_mem));

+aml_append(method, aml_operation_region("NRAM", AML_SYSTEM_MEMORY,
+dsm_mem, TARGET_PAGE_SIZE));

We hide the int64 object which is patched by BIOS in the method,
NVDIMM_GET_DSM_MEM, to make windows XP happy.

However, the disadvantages i see are:
a) as Igor pointed out, we need a way to tell QEMU what is the patched
   address, in NVDIMM ACPI, we used a 64 bit IO ports to pass the address
   to QEMU.

b) BIOS allocated memory is RAM based so it stops us from using MMIO in ACPI,
   MMIO is the more scalable resource than IO port as it has larger region
   and supports 64 bits operation.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm/s390: drop unpaired smp_mb

2016-01-05 Thread Michael S. Tsirkin
smp_mb on vcpu destroy isn't paired with anything, violating pairing
rules, and seems to be useless.

Drop it.

Signed-off-by: Michael S. Tsirkin 
---

Untested.

 arch/s390/kvm/kvm-s390.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8465892..7305d2c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1195,7 +1195,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
(__u64) vcpu->arch.sie_block)
vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
}
-   smp_mb();
 
if (kvm_is_ucontrol(vcpu->kvm))
gmap_free(vcpu->arch.gmap);
-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: How to reserve guest physical region for ACPI

2016-01-05 Thread Laszlo Ersek
On 01/05/16 18:08, Igor Mammedov wrote:
> On Mon, 4 Jan 2016 21:17:31 +0100
> Laszlo Ersek  wrote:
> 
>> Michael CC'd me on the grandparent of the email below. I'll try to add
>> my thoughts in a single go, with regard to OVMF.
>>
>> On 12/30/15 20:52, Michael S. Tsirkin wrote:
>>> On Wed, Dec 30, 2015 at 04:55:54PM +0100, Igor Mammedov wrote:  
 On Mon, 28 Dec 2015 14:50:15 +0200
 "Michael S. Tsirkin"  wrote:
  
> On Mon, Dec 28, 2015 at 10:39:04AM +0800, Xiao Guangrong wrote:  
>>
>> Hi Michael, Paolo,
>>
>> Now it is the time to return to the challenge that how to reserve guest
>> physical region internally used by ACPI.
>>
>> Igor suggested that:
>> | An alternative place to allocate reserve from could be high memory.
>> | For pc we have "reserved-memory-end" which currently makes sure
>> | that hotpluggable memory range isn't used by firmware
>> (https://lists.nongnu.org/archive/html/qemu-devel/2015-11/msg00926.html) 
>>  
>>
>> OVMF has no support for the "reserved-memory-end" fw_cfg file. The
>> reason is that nobody wrote that patch, nor asked for the patch to be
>> written. (Not implying that just requesting the patch would be
>> sufficient for the patch to be written.)
>>
> I don't want to tie things to reserved-memory-end because this
> does not scale: next time we need to reserve memory,
> we'll need to find yet another way to figure out what is where.  
 Could you elaborate a bit more on a problem you're seeing?

 To me it looks like it scales rather well.
 For example lets imagine that we adding a device
 that has some on device memory that should be mapped into GPA
 code to do so would look like:

   pc_machine_device_plug_cb(dev)
   {
...
if (dev == OUR_NEW_DEVICE_TYPE) {
 memory_region_add_subregion(as, current_reserved_end, &dev->mr);
 set_new_reserved_end(current_reserved_end + 
  memory_region_size(&dev->mr));
}
   }

 we can practically add any number of new devices that way.  
>>>
>>> Yes but we'll have to build a host side allocator for these, and that's
>>> nasty. We'll also have to maintain these addresses indefinitely (at
>>> least per machine version) as they are guest visible.
>>> Not only that, there's no way for guest to know if we move things
>>> around, so basically we'll never be able to change addresses.
>>>
>>>   

> I would like ./hw/acpi/bios-linker-loader.c interface to be extended to
> support 64 bit RAM instead  
>>
>> This looks quite doable in OVMF, as long as the blob to allocate from
>> high memory contains *zero* ACPI tables.
>>
>> (
>> Namely, each ACPI table is installed from the containing fw_cfg blob
>> with EFI_ACPI_TABLE_PROTOCOL.InstallAcpiTable(), and the latter has its
>> own allocation policy for the *copies* of ACPI tables it installs.
>>
>> This allocation policy is left unspecified in the section of the UEFI
>> spec that governs EFI_ACPI_TABLE_PROTOCOL.
>>
>> The current policy in edk2 (= the reference implementation) seems to be
>> "allocate from under 4GB". It is currently being changed to "try to
>> allocate from under 4GB, and if that fails, retry from high memory". (It
>> is motivated by Aarch64 machines that may have no DRAM at all under 4GB.)
>> )
>>
> (and maybe a way to allocate and
> zero-initialize buffer without loading it through fwcfg),  
>>
>> Sounds reasonable.
>>
> this way bios
> does the allocation, and addresses can be patched into acpi.  
 and then guest side needs to parse/execute some AML that would
 initialize QEMU side so it would know where to write data.  
>>>
>>> Well not really - we can put it in a data table, by itself
>>> so it's easy to find.  
>>
>> Do you mean acpi_tb_find_table(), acpi_get_table_by_index() /
>> acpi_get_table_with_size()?
>>
>>>
>>> AML is only needed if access from ACPI is desired.
>>>
>>>   
 bios-linker-loader is a great interface for initializing some
 guest owned data and linking it together but I think it adds
 unnecessary complexity and is misused if it's used to handle
 device owned data/on device memory in this and VMGID cases.  
>>>
>>> I want a generic interface for guest to enumerate these things.  linker
>>> seems quite reasonable but if you see a reason why it won't do, or want
>>> to propose a better interface, fine.  
>>
>> * The guest could do the following:
>> - while processing the ALLOCATE commands, it would make a note where in
>> GPA space each fw_cfg blob gets allocated
>> - at the end the guest would prepare a temporary array with a predefined
>> record format, that associates each fw_cfg blob's name with the concrete
>> allocation address
> >> - it would create an FWCfgDmaAccess structure pointing at this array,
>> with a new "control" bit set (or something similar)
>> - the guest could write the address of the FWCfgDmaAccess 

Re: How to reserve guest physical region for ACPI

2016-01-05 Thread Michael S. Tsirkin
On Tue, Jan 05, 2016 at 05:30:25PM +0100, Igor Mammedov wrote:
> > > bios-linker-loader is a great interface for initializing some
> > > guest owned data and linking it together but I think it adds
> > > unnecessary complexity and is misused if it's used to handle
> > > device owned data/on device memory in this and VMGID cases.  
> > 
> > I want a generic interface for guest to enumerate these things.  linker
> > seems quite reasonable but if you see a reason why it won't do, or want
> > to propose a better interface, fine.
> > 
> > PCI would do, too - though windows guys had concerns about
> > returning PCI BARs from ACPI.
> There were potential issues with pSeries bootloader that treated
> PCI_CLASS_MEMORY_RAM as conventional RAM but it was fixed.
> Could you point out to discussion about windows issues?
> 
> What VMGEN patches that used PCI for mapping purposes were
> stuck at, was that it was suggested to use PCI_CLASS_MEMORY_RAM
> class id but we couldn't agree on it.
> 
> VMGEN v13 with full discussion is here
> https://patchwork.ozlabs.org/patch/443554/
> So to continue with this route we would need to pick some other
> driver less class id so windows won't prompt for driver or
> maybe supply our own driver stub to guarantee that no one
> would touch it. Any suggestions?

Pick any device/vendor id pair for which windows specifies no driver.
There's a small risk that this will conflict with some
guest but I think it's minimal.


> > 
> > 
> > > There was RFC on list to make BIOS boot from NVDIMM already
> > > doing some ACPI table lookup/parsing. Now if they were forced
> > > to also parse and execute AML to initialize QEMU with guest
> > > allocated address that would complicate them quite a bit.  
> > 
> > If they just need to find a table by name, it won't be
> > too bad, would it?
> that's what they were doing, scanning memory for a static NVDIMM table.
> However if it were DataTable, BIOS side would have to execute
> AML so that the table address could be told to QEMU.

Not at all. You can find any table by its signature without
parsing AML.


> In case of direct mapping or PCI BAR there is no need to initialize
> QEMU side from AML.
> That also saves us IO port where this address should be written
> if bios-linker-loader approach is used.
> 
> > 
> > > While with NVDIMM control memory region mapped directly by QEMU,
> > > respective patches don't need in any way to initialize QEMU,
> > > all they would need just read necessary data from control region.
> > > 
> > > Also using bios-linker-loader takes away some usable RAM
> > > from guest and in the end that doesn't scale,
> > > the more devices I add the less usable RAM is left for guest OS
> > > while all the device needs is a piece of GPA address space
> > > that would belong to it.  
> > 
> > I don't get this comment. I don't think it's MMIO that is wanted.
> > If it's backed by qemu virtual memory then it's RAM.
> Then why don't allocate video card VRAM the same way and try to explain
> user that a guest started with '-m 128 -device cirrus-vga,vgamem_mb=64Mb'
> only has 64Mb of available RAM because we think that on device VRAM
> is also RAM.
> 
> Maybe I've used MMIO term wrongly here but it roughly reflects the idea
> that on device memory (whether it's VRAM, NVDIMM control block or VMGEN
> area) is not allocated from guest's usable RAM (as described in E820)
> but rather directly mapped in guest's GPA and doesn't consume available
> RAM as guest sees it. That's also the way it's done on real hardware.
> 
> What we need in case of VMGEN ID and NVDIMM is on device memory
> that could be directly accessed by guest.
> Both direct mapping or PCI BAR do that job and we could use simple
> static AML without any patching.

At least with VMGEN the issue is that there's an AML method
that returns the physical address.
Then if guest OS moves the BAR (which is legal), it will break
since caller has no way to know it's related to the BAR.


> > > > 
> > > > See patch at the bottom that might be handy.
> > > >   
> > > > > he also innovated a way to use 64-bit address in DSDT/SSDT.rev = 1:
> > > > > | when writing ASL one shall make sure that only XP supported
> > > > > | features are in global scope, which is evaluated when tables
> > > > > | are loaded and features of rev2 and higher are inside methods.
> > > > > | That way XP doesn't crash as far as it doesn't evaluate unsupported
> > > > > | opcode and one can guard those opcodes checking _REV object if 
> > > > > necessary.
> > > > > (https://lists.nongnu.org/archive/html/qemu-devel/2015-11/msg01010.html)
> > > > >   
> > > > 
> > > > Yes, this technique works.
> > > > 
> > > > An alternative is to add an XSDT, XP ignores that.
> > > > XSDT at the moment breaks OVMF (because it loads both
> > > > the RSDT and the XSDT, which is wrong), but I think
> > > > Laszlo was working on a fix for that.  
> > > Using XSDT would increase ACPI tables occupied RAM
> > > as it would duplicate DSDT + non XP 

Re: How to reserve guest physical region for ACPI

2016-01-05 Thread Laszlo Ersek
On 01/05/16 17:43, Michael S. Tsirkin wrote:
> On Tue, Jan 05, 2016 at 05:30:25PM +0100, Igor Mammedov wrote:
 bios-linker-loader is a great interface for initializing some
 guest owned data and linking it together but I think it adds
 unnecessary complexity and is misused if it's used to handle
 device owned data/on device memory in this and VMGID cases.  
>>>
>>> I want a generic interface for guest to enumerate these things.  linker
>>> seems quite reasonable but if you see a reason why it won't do, or want
>>> to propose a better interface, fine.
>>>
>>> PCI would do, too - though windows guys had concerns about
>>> returning PCI BARs from ACPI.
>> There were potential issues with pSeries bootloader that treated
>> PCI_CLASS_MEMORY_RAM as conventional RAM but it was fixed.
>> Could you point out to discussion about windows issues?
>>
>> What VMGEN patches that used PCI for mapping purposes were
>> stuck at, was that it was suggested to use PCI_CLASS_MEMORY_RAM
>> class id but we couldn't agree on it.
>>
>> VMGEN v13 with full discussion is here
>> https://patchwork.ozlabs.org/patch/443554/
>> So to continue with this route we would need to pick some other
>> driver less class id so windows won't prompt for driver or
>> maybe supply our own driver stub to guarantee that no one
>> would touch it. Any suggestions?
> 
> Pick any device/vendor id pair for which windows specifies no driver.
> There's a small risk that this will conflict with some
> guest but I think it's minimal.
> 
> 
>>>
>>>
 There was RFC on list to make BIOS boot from NVDIMM already
 doing some ACPI table lookup/parsing. Now if they were forced
 to also parse and execute AML to initialize QEMU with guest
 allocated address that would complicate them quite a bit.  
>>>
>>> If they just need to find a table by name, it won't be
>>> too bad, would it?
>> that's what they were doing, scanning memory for a static NVDIMM table.
>> However if it were DataTable, BIOS side would have to execute
>> AML so that the table address could be told to QEMU.
> 
> Not at all. You can find any table by its signature without
> parsing AML.
> 
> 
>> In case of direct mapping or PCI BAR there is no need to initialize
>> QEMU side from AML.
>> That also saves us IO port where this address should be written
>> if bios-linker-loader approach is used.
>>
>>>
 While with NVDIMM control memory region mapped directly by QEMU,
 respective patches don't need in any way to initialize QEMU,
 all they would need just read necessary data from control region.

 Also using bios-linker-loader takes away some usable RAM
 from guest and in the end that doesn't scale,
 the more devices I add the less usable RAM is left for guest OS
 while all the device needs is a piece of GPA address space
 that would belong to it.  
>>>
>>> I don't get this comment. I don't think it's MMIO that is wanted.
>>> If it's backed by qemu virtual memory then it's RAM.
>> Then why don't allocate video card VRAM the same way and try to explain
>> user that a guest started with '-m 128 -device cirrus-vga,vgamem_mb=64Mb'
>> only has 64Mb of available RAM because we think that on device VRAM
>> is also RAM.
>>
>> Maybe I've used MMIO term wrongly here but it roughly reflects the idea
>> that on device memory (whether it's VRAM, NVDIMM control block or VMGEN
>> area) is not allocated from guest's usable RAM (as described in E820)
>> but rather directly mapped in guest's GPA and doesn't consume available
>> RAM as guest sees it. That's also the way it's done on real hardware.
>>
>> What we need in case of VMGEN ID and NVDIMM is on device memory
>> that could be directly accessed by guest.
>> Both direct mapping or PCI BAR do that job and we could use simple
>> static AML without any patching.
> 
> At least with VMGEN the issue is that there's an AML method
> that returns the physical address.
> Then if guest OS moves the BAR (which is legal), it will break
> since caller has no way to know it's related to the BAR.
> 
> 
>
> See patch at the bottom that might be handy.
>   
>> he also innovated a way to use 64-bit address in DSDT/SSDT.rev = 1:
>> | when writing ASL one shall make sure that only XP supported
>> | features are in global scope, which is evaluated when tables
>> | are loaded and features of rev2 and higher are inside methods.
>> | That way XP doesn't crash as far as it doesn't evaluate unsupported
>> | opcode and one can guard those opcodes checking _REV object if 
>> necessary.
>> (https://lists.nongnu.org/archive/html/qemu-devel/2015-11/msg01010.html) 
>>  
>
> Yes, this technique works.
>
> An alternative is to add an XSDT, XP ignores that.
> XSDT at the moment breaks OVMF (because it loads both
> the RSDT and the XSDT, which is wrong), but I think
> Laszlo was working on a fix for that.  
 Using XSDT would increase ACPI tables occupied RAM
 

[patch added to the 3.12 stable tree] MIPS: KVM: Fix CACHE immediate offset sign extension

2016-01-05 Thread Jiri Slaby
From: James Hogan 

This patch has been added to the 3.12 stable tree. If you have any
objections, please let us know.

===

commit c5c2a3b998f1ff5a586f9d37e154070b8d550d17 upstream.

The immediate field of the CACHE instruction is signed, so ensure that
it gets sign extended by casting it to an int16_t rather than just
masking the low 16 bits.

Fixes: e685c689f3a8 ("KVM/MIPS32: Privileged instruction/target branch 
emulation.")
Signed-off-by: James Hogan 
Cc: Ralf Baechle 
Cc: Paolo Bonzini 
Cc: Gleb Natapov 
Cc: linux-m...@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini 
Signed-off-by: James Hogan 
Signed-off-by: Jiri Slaby 
---
 arch/mips/kvm/kvm_mips_emul.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kvm/kvm_mips_emul.c b/arch/mips/kvm/kvm_mips_emul.c
index c76f297b7149..33085819cd89 100644
--- a/arch/mips/kvm/kvm_mips_emul.c
+++ b/arch/mips/kvm/kvm_mips_emul.c
@@ -935,7 +935,7 @@ kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, 
uint32_t cause,
 
base = (inst >> 21) & 0x1f;
op_inst = (inst >> 16) & 0x1f;
-   offset = inst & 0xffff;
+   offset = (int16_t)inst;
cache = (inst >> 16) & 0x3;
op = (inst >> 18) & 0x7;
 
-- 
2.6.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch added to the 3.12 stable tree] MIPS: KVM: Uninit VCPU in vcpu_create error path

2016-01-05 Thread Jiri Slaby
From: James Hogan 

This patch has been added to the 3.12 stable tree. If you have any
objections, please let us know.

===

commit 585bb8f9a5e592f2ce7abbe5ed3112d5438d2754 upstream.

If either of the memory allocations in kvm_arch_vcpu_create() fail, the
vcpu which has been allocated and kvm_vcpu_init'd doesn't get uninit'd
in the error handling path. Add a call to kvm_vcpu_uninit() to fix this.

Fixes: 669e846e6c4e ("KVM/MIPS32: MIPS arch specific APIs for KVM")
Signed-off-by: James Hogan 
Cc: Ralf Baechle 
Cc: Paolo Bonzini 
Cc: Gleb Natapov 
Cc: linux-m...@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini 
Signed-off-by: James Hogan 
Signed-off-by: Jiri Slaby 
---
 arch/mips/kvm/kvm_mips.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
index 2cb24788a8a6..7e7de1f2b8ed 100644
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -312,7 +312,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 
unsigned int id)
 
if (!gebase) {
err = -ENOMEM;
-   goto out_free_cpu;
+   goto out_uninit_cpu;
}
kvm_info("Allocated %d bytes for KVM Exception Handlers @ %p\n",
 ALIGN(size, PAGE_SIZE), gebase);
@@ -372,6 +372,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 
unsigned int id)
 out_free_gebase:
kfree(gebase);
 
+out_uninit_cpu:
+   kvm_vcpu_uninit(vcpu);
+
 out_free_cpu:
kfree(vcpu);
 
-- 
2.6.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kbuild-all] [kvm:queue 27/38] arch/x86/kvm/hyperv.c:186:41: sparse: incorrect type in argument 2 (different modifiers)

2016-01-05 Thread Luc Van Oostenryck
On Fri, Sep 18, 2015 at 11:06:44PM +0800, Fengguang Wu wrote:
> [CC sparse people]
> 
> On Fri, Sep 18, 2015 at 04:41:56PM +0200, Paolo Bonzini wrote:
> > 
> > 
> > On 18/09/2015 16:40, Roman Kagan wrote:
> > > typedef unsigned long __nocast cputime_t;
> > > 
> > > extern void task_cputime_adjusted(cputime_t *);
> > > extern void current_task_runtime_100ns(void);
> > > 
> > > void current_task_runtime_100ns(void)
> > > {
> > > cputime_t utime;
> > > 
> > > task_cputime_adjusted();
> > > }
> > > %%% gcc -c x.c -Wall -Werror -O2; echo $?
> > > 0
> > > %%% sparse x.c
> > > x.c:16:32: warning: incorrect type in argument 1 (different modifiers)
> > > x.c:16:32:expected unsigned long [nocast] [usertype] *
> > > x.c:16:32:got unsigned long *
> > > x.c:16:32: warning: implicit cast to nocast type
> > > 
> > > Looks like a sparse bug to me.
> > 
> > Indeed...
> > 
> > Paolo

The problem is that the intent and semantic of 'nocast' is not clear at all.
There is an explanation about 'nocast' vs. 'bitwise' here:

https://git.kernel.org/cgit/devel/sparse/sparse.git/tree/Documentation/sparse.txt
but it doesn't give much info about the exact wanted behaviour.

Since for the kernel 'nocast' is only used for cputime_t & cputime64_t,
I think it should be clarified and fixed if needed.
A patch proposal is following.

Regards,
Luc
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch added to the 3.12 stable tree] MIPS: KVM: Fix ASID restoration logic

2016-01-05 Thread Jiri Slaby
From: James Hogan 

This patch has been added to the 3.12 stable tree. If you have any
objections, please let us know.

===

commit 002374f371bd02df864cce1fe85d90dc5b292837 upstream.

ASID restoration on guest resume should determine the guest execution
mode based on the guest Status register rather than bit 30 of the guest
PC.

Fix the two places in locore.S that do this, loading the guest status
from the cop0 area. Note, this assembly is specific to the trap &
emulate implementation of KVM, so it doesn't need to check the
supervisor bit as that mode is not implemented in the guest.

Fixes: b680f70fc111 ("KVM/MIPS32: Entry point for trampolining to...")
Signed-off-by: James Hogan 
Cc: Ralf Baechle 
Cc: Paolo Bonzini 
Cc: Gleb Natapov 
Cc: linux-m...@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini 
Signed-off-by: James Hogan 
Signed-off-by: Jiri Slaby 
---
 arch/mips/kvm/kvm_locore.S | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S
index 03a2db58b22d..ba5ce99c021d 100644
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@@ -159,9 +159,11 @@ FEXPORT(__kvm_mips_vcpu_run)
 
 FEXPORT(__kvm_mips_load_asid)
/* Set the ASID for the Guest Kernel */
-   INT_SLL t0, t0, 1   /* with kseg0 @ 0x40000000, kernel */
-   /* addresses shift to 0x80000000 */
-   bltz    t0, 1f  /* If kernel */
+   PTR_L   t0, VCPU_COP0(k1)
+   LONG_L  t0, COP0_STATUS(t0)
+   andi    t0, KSU_USER | ST0_ERL | ST0_EXL
+   xori    t0, KSU_USER
+   bnez    t0, 1f  /* If kernel */
 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID/* else user */
 1:
@@ -438,9 +440,11 @@ __kvm_mips_return_to_guest:
    mtc0    t0, CP0_EPC
 
    /* Set the ASID for the Guest Kernel */
-   INT_SLL t0, t0, 1   /* with kseg0 @ 0x40000000, kernel */
-   /* addresses shift to 0x80000000 */
-   bltz    t0, 1f  /* If kernel */
+   PTR_L   t0, VCPU_COP0(k1)
+   LONG_L  t0, COP0_STATUS(t0)
+   andi    t0, KSU_USER | ST0_ERL | ST0_EXL
+   xori    t0, KSU_USER
+   bnez    t0, 1f  /* If kernel */
 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID/* else user */
 1:
-- 
2.6.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 1/6] arm/arm64: KVM: Introduce armv7 fp/simd vcpu fields and helpers

2016-01-05 Thread Christoffer Dall
On Sat, Dec 26, 2015 at 01:54:55PM -0800, Mario Smarduch wrote:
> Add helper functions to enable access to fp/smid on guest entry and save host
> fpexc on vcpu put, check if fp/simd registers are dirty and add new vcpu
> fields.
> 
> Signed-off-by: Mario Smarduch 
> ---
>  arch/arm/include/asm/kvm_emulate.h   | 42 
> 
>  arch/arm/include/asm/kvm_host.h  |  6 ++
>  arch/arm64/include/asm/kvm_emulate.h |  8 +++
>  3 files changed, 56 insertions(+)
> 
> diff --git a/arch/arm/include/asm/kvm_emulate.h 
> b/arch/arm/include/asm/kvm_emulate.h
> index 3095df0..d4d9da1 100644
> --- a/arch/arm/include/asm/kvm_emulate.h
> +++ b/arch/arm/include/asm/kvm_emulate.h
> @@ -24,6 +24,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include "../vfp/vfpinstr.h"

this looks dodgy...

can you move vfpinstr.h instead?

>  
>  unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num);
>  unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu);
> @@ -255,4 +257,44 @@ static inline unsigned long 
> vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
>   }
>  }
>  
> +#ifdef CONFIG_VFPv3
> +/* Called from vcpu_load - save fpexc and enable guest access to fp/simd 
> unit */

the comment is misleading, you're not enabling guest access to the
fp/simd unit, you're just setting the enabled bit to ensure guest
accesses trap.

> +static inline void vcpu_trap_vfp_enable(struct kvm_vcpu *vcpu)
> +{
> + u32 fpexc;
> +
> + /* Save host fpexc, and enable guest access to fp unit */
> + fpexc = fmrx(FPEXC);
> + vcpu->arch.host_fpexc = fpexc;
> + fpexc |= FPEXC_EN;
> + fmxr(FPEXC, fpexc);
> +
> + /* Configure HCPTR to trap on tracing and fp/simd access */
> + vcpu->arch.hcptr = HCPTR_TTA | HCPTR_TCP(10)  | HCPTR_TCP(11);
> +}
> +
> +/* Called from vcpu_put - restore host fpexc */

I would probably get rid of the "Called from" stuff and just describe
what these functions do locally.  Comments like this are likely to be
out of date soon'ish.

> +static inline void vcpu_restore_host_fpexc(struct kvm_vcpu *vcpu)
> +{
> + fmxr(FPEXC, vcpu->arch.host_fpexc);
> +}
> +
> +/* If trap bits are reset then fp/simd registers are dirty */
> +static inline bool vcpu_vfp_isdirty(struct kvm_vcpu *vcpu)
> +{
> + return !(vcpu->arch.hcptr & (HCPTR_TCP(10) | HCPTR_TCP(11)));
> +}
> +#else
> +static inline void vcpu_trap_vfp_enable(struct kvm_vcpu *vcpu)
> +{
> + vcpu->arch.hcptr = HCPTR_TTA;
> +}
> +
> +static inline void vcpu_restore_host_fpexc(struct kvm_vcpu *vcpu) {}
> +static inline bool vcpu_vfp_isdirty(struct kvm_vcpu *vcpu)
> +{
> + return false;
> +}
> +#endif

this kind of feels like it belongs in its own C-file instead of a header
file, perhaps arch/arm/kvm/vfp.C.

Marc, what do you think?

> +
>  #endif /* __ARM_KVM_EMULATE_H__ */
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index f9f2779..d3ef58a 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -105,6 +105,12 @@ struct kvm_vcpu_arch {
>   /* HYP trapping configuration */
>   u32 hcr;
>  
> + /* HYP Co-processor fp/simd and trace trapping configuration */
> + u32 hcptr;
> +
> + /* Save host FPEXC register to later restore on vcpu put */
> + u32 host_fpexc;
> +
>   /* Interrupt related fields */
>   u32 irq_lines;  /* IRQ and FIQ levels */
>  
> diff --git a/arch/arm64/include/asm/kvm_emulate.h 
> b/arch/arm64/include/asm/kvm_emulate.h
> index 3066328..ffe8ccf 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -299,4 +299,12 @@ static inline unsigned long 
> vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
>   return data;/* Leave LE untouched */
>  }
>  
> +static inline void vcpu_trap_vfp_enable(struct kvm_vcpu *vcpu) {}
> +static inline void vcpu_restore_host_fpexc(struct kvm_vcpu *vcpu) {}
> +
> +static inline bool vcpu_vfp_isdirty(struct kvm_vcpu *vcpu)
> +{
> + return false;
> +}
> +
>  #endif /* __ARM64_KVM_EMULATE_H__ */
> -- 
> 1.9.1
> 

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 1/6] arm/arm64: KVM: Introduce armv7 fp/simd vcpu fields and helpers

2016-01-05 Thread Mario Smarduch


On 1/5/2016 7:00 AM, Christoffer Dall wrote:
> On Sat, Dec 26, 2015 at 01:54:55PM -0800, Mario Smarduch wrote:
>> Add helper functions to enable access to fp/smid on guest entry and save host
>> fpexc on vcpu put, check if fp/simd registers are dirty and add new vcpu
>> fields.
>>
>> Signed-off-by: Mario Smarduch 
>> ---
>>  arch/arm/include/asm/kvm_emulate.h   | 42 
>> 
>>  arch/arm/include/asm/kvm_host.h  |  6 ++
>>  arch/arm64/include/asm/kvm_emulate.h |  8 +++
>>  3 files changed, 56 insertions(+)
>>
>> diff --git a/arch/arm/include/asm/kvm_emulate.h 
>> b/arch/arm/include/asm/kvm_emulate.h
>> index 3095df0..d4d9da1 100644
>> --- a/arch/arm/include/asm/kvm_emulate.h
>> +++ b/arch/arm/include/asm/kvm_emulate.h
>> @@ -24,6 +24,8 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>> +#include "../vfp/vfpinstr.h"
> 
> this looks dodgy...
> 
> can you move vfpinstr.h instead?
Sure I'll fix it up, it's in couple other places in kernel and kvm
 - copied it.
> 
>>  
>>  unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num);
>>  unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu);
>> @@ -255,4 +257,44 @@ static inline unsigned long 
>> vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
>>  }
>>  }
>>  
>> +#ifdef CONFIG_VFPv3
>> +/* Called from vcpu_load - save fpexc and enable guest access to fp/simd 
>> unit */
> 
> the comment is misleading, you're not enabling guest access to the
> fp/simd unit, you're just setting the enabled bit to ensure guest
> accesses trap.

That's more accurate.
> 
>> +static inline void vcpu_trap_vfp_enable(struct kvm_vcpu *vcpu)
>> +{
>> +u32 fpexc;
>> +
>> +/* Save host fpexc, and enable guest access to fp unit */
>> +fpexc = fmrx(FPEXC);
>> +vcpu->arch.host_fpexc = fpexc;
>> +fpexc |= FPEXC_EN;
>> +fmxr(FPEXC, fpexc);
>> +
>> +/* Configure HCPTR to trap on tracing and fp/simd access */
>> +vcpu->arch.hcptr = HCPTR_TTA | HCPTR_TCP(10)  | HCPTR_TCP(11);
>> +}
>> +
>> +/* Called from vcpu_put - restore host fpexc */
> 
> I would probably get rid of the "Called from" stuff and just describe
> what these functions do locally.  Comments like this are likely to be
> out of date soon'ish.

Yeah true, will do.
> 
>> +static inline void vcpu_restore_host_fpexc(struct kvm_vcpu *vcpu)
>> +{
>> +fmxr(FPEXC, vcpu->arch.host_fpexc);
>> +}
>> +
>> +/* If trap bits are reset then fp/simd registers are dirty */
>> +static inline bool vcpu_vfp_isdirty(struct kvm_vcpu *vcpu)
>> +{
>> +return !(vcpu->arch.hcptr & (HCPTR_TCP(10) | HCPTR_TCP(11)));
>> +}
>> +#else
>> +static inline void vcpu_trap_vfp_enable(struct kvm_vcpu *vcpu)
>> +{
>> +vcpu->arch.hcptr = HCPTR_TTA;
>> +}
>> +
>> +static inline void vcpu_restore_host_fpexc(struct kvm_vcpu *vcpu) {}
>> +static inline bool vcpu_vfp_isdirty(struct kvm_vcpu *vcpu)
>> +{
>> +return false;
>> +}
>> +#endif
> 
> this kind of feels like it belongs in its own C-file instead of a header
> file, perhaps arch/arm/kvm/vfp.C.
> 
> Marc, what do you think?
> 

That would be starting from vcpu_trap_vfp_enable()? The file is getting
a little overloaded.

I'm also thinking that 3rd patch should have one function call for vcpu_put
like vcpu_load does instead of exposing arm32/arm64, arm32 only relevant logic.
When you have a chance to review that patch please keep that in mind.

>> +
>>  #endif /* __ARM_KVM_EMULATE_H__ */
>> diff --git a/arch/arm/include/asm/kvm_host.h 
>> b/arch/arm/include/asm/kvm_host.h
>> index f9f2779..d3ef58a 100644
>> --- a/arch/arm/include/asm/kvm_host.h
>> +++ b/arch/arm/include/asm/kvm_host.h
>> @@ -105,6 +105,12 @@ struct kvm_vcpu_arch {
>>  /* HYP trapping configuration */
>>  u32 hcr;
>>  
>> +/* HYP Co-processor fp/simd and trace trapping configuration */
>> +u32 hcptr;
>> +
>> +/* Save host FPEXC register to later restore on vcpu put */
>> +u32 host_fpexc;
>> +
>>  /* Interrupt related fields */
>>  u32 irq_lines;  /* IRQ and FIQ levels */
>>  
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h 
>> b/arch/arm64/include/asm/kvm_emulate.h
>> index 3066328..ffe8ccf 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -299,4 +299,12 @@ static inline unsigned long 
>> vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
>>  return data;/* Leave LE untouched */
>>  }
>>  
>> +static inline void vcpu_trap_vfp_enable(struct kvm_vcpu *vcpu) {}
>> +static inline void vcpu_restore_host_fpexc(struct kvm_vcpu *vcpu) {}
>> +
>> +static inline bool vcpu_vfp_isdirty(struct kvm_vcpu *vcpu)
>> +{
>> +return false;
>> +}
>> +
>>  #endif /* __ARM64_KVM_EMULATE_H__ */
>> -- 
>> 1.9.1
>>
> 
> Thanks,
> -Christoffer
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3.12 04/91] MIPS: KVM: Uninit VCPU in vcpu_create error path

2016-01-05 Thread Jiri Slaby
From: James Hogan 

3.12-stable review patch.  If anyone has any objections, please let me know.

===

commit 585bb8f9a5e592f2ce7abbe5ed3112d5438d2754 upstream.

If either of the memory allocations in kvm_arch_vcpu_create() fail, the
vcpu which has been allocated and kvm_vcpu_init'd doesn't get uninit'd
in the error handling path. Add a call to kvm_vcpu_uninit() to fix this.

Fixes: 669e846e6c4e ("KVM/MIPS32: MIPS arch specific APIs for KVM")
Signed-off-by: James Hogan 
Cc: Ralf Baechle 
Cc: Paolo Bonzini 
Cc: Gleb Natapov 
Cc: linux-m...@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini 
Signed-off-by: James Hogan 
Signed-off-by: Jiri Slaby 
---
 arch/mips/kvm/kvm_mips.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
index 2cb24788a8a6..7e7de1f2b8ed 100644
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -312,7 +312,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 
unsigned int id)
 
if (!gebase) {
err = -ENOMEM;
-   goto out_free_cpu;
+   goto out_uninit_cpu;
}
kvm_info("Allocated %d bytes for KVM Exception Handlers @ %p\n",
 ALIGN(size, PAGE_SIZE), gebase);
@@ -372,6 +372,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 
unsigned int id)
 out_free_gebase:
kfree(gebase);
 
+out_uninit_cpu:
+   kvm_vcpu_uninit(vcpu);
+
 out_free_cpu:
kfree(vcpu);
 
-- 
2.6.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3.12 03/91] MIPS: KVM: Fix CACHE immediate offset sign extension

2016-01-05 Thread Jiri Slaby
From: James Hogan 

3.12-stable review patch.  If anyone has any objections, please let me know.

===

commit c5c2a3b998f1ff5a586f9d37e154070b8d550d17 upstream.

The immediate field of the CACHE instruction is signed, so ensure that
it gets sign extended by casting it to an int16_t rather than just
masking the low 16 bits.

Fixes: e685c689f3a8 ("KVM/MIPS32: Privileged instruction/target branch 
emulation.")
Signed-off-by: James Hogan 
Cc: Ralf Baechle 
Cc: Paolo Bonzini 
Cc: Gleb Natapov 
Cc: linux-m...@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini 
Signed-off-by: James Hogan 
Signed-off-by: Jiri Slaby 
---
 arch/mips/kvm/kvm_mips_emul.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kvm/kvm_mips_emul.c b/arch/mips/kvm/kvm_mips_emul.c
index c76f297b7149..33085819cd89 100644
--- a/arch/mips/kvm/kvm_mips_emul.c
+++ b/arch/mips/kvm/kvm_mips_emul.c
@@ -935,7 +935,7 @@ kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, 
uint32_t cause,
 
base = (inst >> 21) & 0x1f;
op_inst = (inst >> 16) & 0x1f;
-   offset = inst & 0xffff;
+   offset = (int16_t)inst;
cache = (inst >> 16) & 0x3;
op = (inst >> 18) & 0x7;
 
-- 
2.6.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3.12 02/91] MIPS: KVM: Fix ASID restoration logic

2016-01-05 Thread Jiri Slaby
From: James Hogan 

3.12-stable review patch.  If anyone has any objections, please let me know.

===

commit 002374f371bd02df864cce1fe85d90dc5b292837 upstream.

ASID restoration on guest resume should determine the guest execution
mode based on the guest Status register rather than bit 30 of the guest
PC.

Fix the two places in locore.S that do this, loading the guest status
from the cop0 area. Note, this assembly is specific to the trap &
emulate implementation of KVM, so it doesn't need to check the
supervisor bit as that mode is not implemented in the guest.

Fixes: b680f70fc111 ("KVM/MIPS32: Entry point for trampolining to...")
Signed-off-by: James Hogan 
Cc: Ralf Baechle 
Cc: Paolo Bonzini 
Cc: Gleb Natapov 
Cc: linux-m...@linux-mips.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini 
Signed-off-by: James Hogan 
Signed-off-by: Jiri Slaby 
---
 arch/mips/kvm/kvm_locore.S | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S
index 03a2db58b22d..ba5ce99c021d 100644
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@@ -159,9 +159,11 @@ FEXPORT(__kvm_mips_vcpu_run)
 
 FEXPORT(__kvm_mips_load_asid)
/* Set the ASID for the Guest Kernel */
-   INT_SLL t0, t0, 1   /* with kseg0 @ 0x40000000, kernel */
-   /* addresses shift to 0x80000000 */
-   bltz    t0, 1f  /* If kernel */
+   PTR_L   t0, VCPU_COP0(k1)
+   LONG_L  t0, COP0_STATUS(t0)
+   andi    t0, KSU_USER | ST0_ERL | ST0_EXL
+   xori    t0, KSU_USER
+   bnez    t0, 1f  /* If kernel */
 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID/* else user */
 1:
@@ -438,9 +440,11 @@ __kvm_mips_return_to_guest:
    mtc0    t0, CP0_EPC
 
    /* Set the ASID for the Guest Kernel */
-   INT_SLL t0, t0, 1   /* with kseg0 @ 0x40000000, kernel */
-   /* addresses shift to 0x80000000 */
-   bltz    t0, 1f  /* If kernel */
+   PTR_L   t0, VCPU_COP0(k1)
+   LONG_L  t0, COP0_STATUS(t0)
+   andi    t0, KSU_USER | ST0_ERL | ST0_EXL
+   xori    t0, KSU_USER
+   bnez    t0, 1f  /* If kernel */
 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID  /* (BD)  */
INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID/* else user */
 1:
-- 
2.6.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] vhost: basic device IOTLB support

2016-01-05 Thread Jason Wang


On 01/05/2016 11:18 AM, Yang Zhang wrote:
> On 2016/1/4 14:22, Jason Wang wrote:
>>
>>
>> On 01/04/2016 09:39 AM, Yang Zhang wrote:
>>> On 2015/12/31 15:13, Jason Wang wrote:
 This patch tries to implement an device IOTLB for vhost. This could be
 used with for co-operation with userspace(qemu) implementation of
 iommu for a secure DMA environment in guest.

 The idea is simple. When vhost meets an IOTLB miss, it will request
 the assistance of userspace to do the translation, this is done
 through:

 - Fill the translation request in a preset userspace address (This
 address is set through ioctl VHOST_SET_IOTLB_REQUEST_ENTRY).
 - Notify userspace through eventfd (This eventfd was set through ioctl
 VHOST_SET_IOTLB_FD).

 When userspace finishes the translation, it will update the vhost
 IOTLB through VHOST_UPDATE_IOTLB ioctl. Userspace is also in charge of
 snooping the IOTLB invalidation of IOMMU IOTLB and use
 VHOST_UPDATE_IOTLB to invalidate the possible entry in vhost.
>>>
>>> Is there any performance data shows the difference with IOTLB
>>> supporting?
>>
>> Basic testing show it was slower than without IOTLB.
>>
>>> I doubt we may see performance decrease since the flush code path is
>>> longer than before.
>>>
>>
>> Yes, it also depend on the TLB hit rate.
>>
>> If lots of dynamic mappings and unmappings are used in guest (e.g normal
>> Linux driver). This method should be much more slower since:
>>
>> - lots of invalidation and its path is slow.
>> - the hit rate is low and the high price of userspace assisted address
>> translation.
>> - limitation of userspace IOMMU/IOTLB implementation (qemu's vtd
>> emulation simply empty all entries when it's full).
>>
>> Another method is to implement kernel IOMMU (e.g vtd). But I'm not sure
>> vhost is the best place to do this, since vhost should be architecture
>> independent. Maybe we'd better to do it in kvm or have a pv IOMMU
>> implementation in vhost.
>
> Actually, I have the kernel IOMMU(virtual vtd) patch which can pass
> through the physical device to L2 guest on hand.

A little bit confused, I believe the first step is to exporting an IOMMU
to L1 guest for it to use for a assigned device?

> But it is just a draft patch which was written several years ago. If
> there is real requirement for it, I can rebase it and send out it for
> review.

Interesting but I think the goal is different. This patch tries to make
vhost/virtio works with emulated IOMMU.

>
>>
>> Another side, if fixed mappings were used in guest, (e.g dpdk in guest).
>> We have the possibility to have 100% hit rate with almost no
>> invalidation, the performance penalty should be ignorable, this should
>> be the main use case for this patch.
>>
>> The patch is just a prototype for discussion. Any other ideas are
>> welcomed.
>>
>> Thanks
>>
>
>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/5] Threaded MSI interrupt for VFIO PCI device

2016-01-05 Thread Yunhong Jiang
On Wed, Dec 16, 2015 at 10:55:12PM +0100, Paolo Bonzini wrote:
> 
> 
> On 16/12/2015 20:15, Alex Williamson wrote:
> > The consumers would be, for instance, Intel PI + the threaded handler
> > added in this series.  These run independently, the PI bypass simply
> > makes the interrupt disappear from the host when it catches it, but if
> > the vCPU isn't running in the right place at the time of the interrupt,
> > it gets delivered to the host, in which case the secondary consumer
> > implementing handle_irq() provides a lower latency injection than the
> > eventfd path.  If PI isn't supported, only this latter consumer is
> > registered.
> 
> I would implement the two in a single consumer, knowing that only one of
> the two parts would effectively run.  But because of the possibility of
> multiple consumers implementing handle_irq(), I am not sure if this is
> feasible.

So is it possible that we limit only one consumer with handle_irq(), as my 
previous response to Alex? We can extend it in future if we do need support 
multiple consumder implementing handle_irq()?

Thanks
--jyh

> 
> > On the surface it seems like a reasonable solution, though having
> > multiple consumers implementing handle_irq() seems problematic.  Do we
> > get multiple injections if we call them all?
> 
> Indeed.
> 
> > Should we have some way
> > to prioritize one handler versus another?  Perhaps KVM should have a
> > single unified consumer that can provide that sort of logic, though we
> > still need the srcu code added here to protect against registration and
> > irq_handler() races.  Thanks,
> 
> I'm happy to see that we have the same doubts. :)
> 
> Paolo
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/5] Threaded MSI interrupt for VFIO PCI device

2016-01-05 Thread Yunhong Jiang
On Wed, Dec 16, 2015 at 12:15:23PM -0700, Alex Williamson wrote:
> On Wed, 2015-12-16 at 18:56 +0100, Paolo Bonzini wrote:
> > Alex,
> > 
> > can you take a look at the extension to the irq bypass interface in
> > patch 2?  I'm not sure I understand what is the case where you have
> > multiple consumers for the same token.
> 
> The consumers would be, for instance, Intel PI + the threaded handler
> added in this series.  These run independently, the PI bypass simply
> makes the interrupt disappear from the host when it catches it, but if
> the vCPU isn't running in the right place at the time of the interrupt,
> it gets delivered to the host, in which case the secondary consumer
> implementing handle_irq() provides a lower latency injection than the

Sorry for slow response.

If the PI is delivered to the host because guest is not running, I think it 
will not trigger the secondary consumer. The reason is, with PI, the 
interrupt will be delivered as the POSTED_INTR_VECTOR or 
POSTED_INTR_WAKEUP_VECTOR. So for the PI consumer will not be invoked on run 
time scenario.

> eventfd path.  If PI isn't supported, only this latter consumer is
> registered.
> 
> On the surface it seems like a reasonable solution, though having
> multiple consumers implementing handle_irq() seems problematic.  Do we

Yes, agree that has multiple consumers implementing handle_irq() seems not 
good. But I do think it can be helpful. A naive case is, a consumer can be 
created to log all the interrupt event, or to create a pipe for analysis.

> get multiple injections if we call them all?  Should we have some way

As discussed above, currently I think we have only one consumer to 
handle_irq(), so it should be ok? We can limit the framework to support only 
one consumer with handle_irq()?

> to prioritize one handler versus another?  Perhaps KVM should have a
> single unified consumer that can provide that sort of logic, though we
I'd think still different consumer for the PI and this fast_IRQ.

Thanks
--jyh

> still need the srcu code added here to protect against registration and
> irq_handler() races.  Thanks,
> 
> Alex
> 
> > On 03/12/2015 19:22, Yunhong Jiang wrote:
> > > When assigning a VFIO device to a KVM guest with low latency
> > > requirement, it  
> > > is better to handle the interrupt in the hard interrupt context, to
> > > reduce 
> > > the context switch to/from the IRQ thread.
> > > 
> > > Based on discussion on https://lkml.org/lkml/2015/10/26/764, the
> > > VFIO msi 
> > > interrupt is changed to use request_threaded_irq(). The primary
> > > interrupt 
> > > handler tries to set the guest interrupt atomically. If it fails to
> > > achieve 
> > > it, a threaded interrupt handler will be invoked.
> > > 
> > > The irq_bypass manager is extended for this purpose. The KVM
> > > eventfd will 
> > > provide a irqbypass consumer to handle the interrupt at hard
> > > interrupt 
> > > context. The producer will invoke the consumer's handler then.
> > > 
> > > Yunhong Jiang (5):
> > >   Extract the irqfd_wakeup_pollin/irqfd_wakeup_pollup
> > >   Support runtime irq_bypass consumer
> > >   Support threaded interrupt handling on VFIO
> > >   Add the irq handling consumer
> > >   Expose x86 kvm_arch_set_irq_inatomic()
> > > 
> > >  arch/x86/kvm/Kconfig  |   1 +
> > >  drivers/vfio/pci/vfio_pci_intrs.c |  39 ++--
> > >  include/linux/irqbypass.h |   8 +++
> > >  include/linux/kvm_host.h  |  19 +-
> > >  include/linux/kvm_irqfd.h |   1 +
> > >  virt/kvm/Kconfig  |   3 +
> > >  virt/kvm/eventfd.c| 131
> > > ++
> > >  virt/lib/irqbypass.c  |  82 ++--
> > >  8 files changed, 214 insertions(+), 70 deletions(-)
> > > 
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Dr. David Alan Gilbert
* Michael S. Tsirkin (m...@redhat.com) wrote:
> On Tue, Jan 05, 2016 at 10:01:04AM +, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
> > > > >> The two mechanisms referenced above would likely require 
> > > > >> coordination with
> > > > >> QEMU and as such are open to discussion.  I haven't attempted to 
> > > > >> address
> > > > >> them as I am not sure there is a consensus as of yet.  My personal
> > > > >> preference would be to add a vendor-specific configuration block to 
> > > > >> the
> > > > >> emulated pci-bridge interfaces created by QEMU that would allow us to
> > > > >> essentially extend shpc to support guest live migration with 
> > > > >> pass-through
> > > > >> devices.
> > > > >
> > > > > shpc?
> > > > 
> > > > That is kind of what I was thinking.  We basically need some mechanism
> > > > to allow for the host to ask the device to quiesce.  It has been
> > > > proposed to possibly even look at something like an ACPI interface
> > > > since I know ACPI is used by QEMU to manage hot-plug in the standard
> > > > case.
> > > > 
> > > > - Alex
> > > 
> > > 
> > > Start by using hot-unplug for this!
> > > 
> > > Really use your patch guest side, and write host side
> > > to allow starting migration with the device, but
> > > defer completing it.
> > > 
> > > So
> > > 
> > > 1.- host tells guest to start tracking memory writes
> > > 2.- guest acks
> > > 3.- migration starts
> > > 4.- most memory is migrated
> > > 5.- host tells guest to eject device
> > > 6.- guest acks
> > > 7.- stop vm and migrate rest of state
> > > 
> > > 
> > > It will already be a win since hot unplug after migration starts and
> > > most memory has been migrated is better than hot unplug before migration
> > > starts.
> > > 
> > > Then measure downtime and profile. Then we can look at ways
> > > to quiesce device faster which really means step 5 is replaced
> > > with "host tells guest to quiesce device and dirty (or just unmap!)
> > > all memory mapped for write by device".
> > 
> > 
> > Doing a hot-unplug is going to upset the guests network stacks view
> > of the world; that's something we don't want to change.
> > 
> > Dave
> 
> It might but if you store the IP and restore it quickly
> after migration e.g. using guest agent, as opposed to DHCP,
> then it won't.

I thought if you hot-unplug then it will lose any outstanding connections
on that device.

> It allows calming the device down in a generic way,
> specific drivers can then implement the fast quiesce.

Except that if it breaks the guest networking it's useless.

Dave

> 
> > > 
> > > -- 
> > > MST
> > --
> > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Michael S. Tsirkin
On Tue, Jan 05, 2016 at 10:45:25AM +, Dr. David Alan Gilbert wrote:
> * Michael S. Tsirkin (m...@redhat.com) wrote:
> > On Tue, Jan 05, 2016 at 10:01:04AM +, Dr. David Alan Gilbert wrote:
> > > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > > On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
> > > > > >> The two mechanisms referenced above would likely require 
> > > > > >> coordination with
> > > > > >> QEMU and as such are open to discussion.  I haven't attempted to 
> > > > > >> address
> > > > > >> them as I am not sure there is a consensus as of yet.  My personal
> > > > > >> preference would be to add a vendor-specific configuration block 
> > > > > >> to the
> > > > > >> emulated pci-bridge interfaces created by QEMU that would allow us 
> > > > > >> to
> > > > > >> essentially extend shpc to support guest live migration with 
> > > > > >> pass-through
> > > > > >> devices.
> > > > > >
> > > > > > shpc?
> > > > > 
> > > > > That is kind of what I was thinking.  We basically need some mechanism
> > > > > to allow for the host to ask the device to quiesce.  It has been
> > > > > proposed to possibly even look at something like an ACPI interface
> > > > > since I know ACPI is used by QEMU to manage hot-plug in the standard
> > > > > case.
> > > > > 
> > > > > - Alex
> > > > 
> > > > 
> > > > Start by using hot-unplug for this!
> > > > 
> > > > Really use your patch guest side, and write host side
> > > > to allow starting migration with the device, but
> > > > defer completing it.
> > > > 
> > > > So
> > > > 
> > > > 1.- host tells guest to start tracking memory writes
> > > > 2.- guest acks
> > > > 3.- migration starts
> > > > 4.- most memory is migrated
> > > > 5.- host tells guest to eject device
> > > > 6.- guest acks
> > > > 7.- stop vm and migrate rest of state
> > > > 
> > > > 
> > > > It will already be a win since hot unplug after migration starts and
> > > > most memory has been migrated is better than hot unplug before migration
> > > > starts.
> > > > 
> > > > Then measure downtime and profile. Then we can look at ways
> > > > to quiesce device faster which really means step 5 is replaced
> > > > with "host tells guest to quiesce device and dirty (or just unmap!)
> > > > all memory mapped for write by device".
> > > 
> > > 
> > > Doing a hot-unplug is going to upset the guests network stacks view
> > > of the world; that's something we don't want to change.
> > > 
> > > Dave
> > 
> > It might but if you store the IP and restore it quickly
> > after migration e.g. using guest agent, as opposed to DHCP,
> > then it won't.
> 
> I thought if you hot-unplug then it will lose any outstanding connections
> on that device.

Which connections and which device?  TCP connections and an ethernet
device?  These are on different layers so of course you don't lose them.
Just do not change the IP address.

Some guests send a signal to applications to close connections
when all links go down. One can work around this
in a variety of ways.

> > It allows calming the device down in a generic way,
> > specific drivers can then implement the fast quiesce.
> 
> Except that if it breaks the guest networking it's useless.
> 
> Dave
> 
> > 
> > > > 
> > > > -- 
> > > > MST
> > > --
> > > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v2 1/3] PCI: Add support for enforcing all MMIO BARs to be page aligned

2016-01-05 Thread Yongji Xie

On 2016/1/5 4:47, Alex Williamson wrote:

On Thu, 2015-12-31 at 16:50 +0800, Yongji Xie wrote:

When vfio passthrough a PCI device of which MMIO BARs
are smaller than PAGE_SIZE, guest will not handle the
mmio accesses to the BARs which leads to mmio emulations
in host.

This is because vfio will not allow to passthrough one
BAR's mmio page which may be shared with other BARs.

To solve this performance issue, this patch adds a kernel
parameter "pci=resource_page_aligned=on" to enforce
the alignments of all MMIO BARs to be at least PAGE_SIZE,
so that one BAR's mmio page would not be shared with other
BARs. We can also disable it through kernel parameter
"pci=resource_page_aligned=off".

Shouldn't this somehow be associated with the realloc option?  I don't
think PCI code will attempt to reprogram anything unless it needs to
otherwise.


So you mean we need to ignore firmware setup and force re-assigning all
resources if we want to use the option "pci=resource_page_aligned=on"?


For the default value of this parameter, we think it should be
arch-independent, so we add a macro PCI_RESOURCE_PAGE_ALIGNED
to change it. And we define this macro to enable this parameter
by default on PPC64 platform which can easily hit this
performance issue because its PAGE_SIZE is 64KB.

Signed-off-by: Yongji Xie 
---
  Documentation/kernel-parameters.txt |4 
  arch/powerpc/include/asm/pci.h  |   11 +++
  drivers/pci/pci.c   |   17 +
  drivers/pci/pci.h   |7 ++-
  include/linux/pci.h |2 ++
  5 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 742f69d..a53aaee 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2857,6 +2857,10 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
PAGE_SIZE is used as alignment.
PCI-PCI bridge can be specified, if resource
windows need to be expanded.
+   resource_page_aligned=  Enable/disable enforcing the alignment
+   of all PCI devices' memory resources to be
+   at least PAGE_SIZE.
+   Format: { "on" | "off" }
ecrc=   Enable/disable PCIe ECRC (transaction layer
end-to-end CRC checking).
bios: Use BIOS/firmware settings. This is the
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 3453bd8..27bff59 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -136,6 +136,17 @@ extern pgprot_tpci_phys_mem_access_prot(struct file 
*file,
 unsigned long pfn,
 unsigned long size,
 pgprot_t prot);
+#ifdef CONFIG_PPC64
+
+/* For PPC64, We enforce all PCI MMIO BARs to be page aligned
+ * by default. This would be helpful to improve performance
+ * when we passthrough a PCI device of which BARs are smaller
+ * than PAGE_SIZE(64KB). And we can use bootcmd
+ * "pci=resource_page_aligned=off" to disable it.
+ */
+#define PCI_ENABLE_RESOURCE_PAGE_ALIGNED
+
+#endif

This should be done with something like HAVE_PCI_DEFAULT_RESOURCE_PAGE_
ALIGNED in arch/powerpc/include/asm


OK, I will fix it in next version.


  #define HAVE_ARCH_PCI_RESOURCE_TO_USER
  extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 314db8c..9f14ba5 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -99,6 +99,13 @@ u8 pci_cache_line_size;
   */
  unsigned int pcibios_max_latency = 255;
  
+#ifdef PCI_ENABLE_RESOURCE_PAGE_ALIGNED

+bool pci_resource_page_aligned = true;
+#else
+bool pci_resource_page_aligned;
+#endif
+EXPORT_SYMBOL(pci_resource_page_aligned);

Couldn't this be done in a single line with IS_ENABLED() macro?


I'm not sure whether IS_ENABLED() macro should be used there because
it is always used for CONFIG_ options.


Should this symbol be GPL-only?


Yes, it will be fixed in next version.


+
  /* If set, the PCIe ARI capability will not be used. */
  static bool pcie_ari_disabled;
  
@@ -4746,6 +4753,14 @@ static ssize_t pci_resource_alignment_store(struct bus_type *bus,

  BUS_ATTR(resource_alignment, 0644, pci_resource_alignment_show,
pci_resource_alignment_store);
  
+static void pci_resource_get_page_aligned(char *str)

+{
+   if (!strncmp(str, "off", 3))
+   pci_resource_page_aligned = false;
+   else if (!strncmp(str, "on", 2))
+   pci_resource_page_aligned = true;
+}
+
  static int __init pci_resource_alignment_sysfs_init(void)
  {
 

Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Michael S. Tsirkin
On Tue, Jan 05, 2016 at 12:59:54PM +0200, Michael S. Tsirkin wrote:
> On Tue, Jan 05, 2016 at 10:45:25AM +, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > On Tue, Jan 05, 2016 at 10:01:04AM +, Dr. David Alan Gilbert wrote:
> > > > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > > > On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
> > > > > > >> The two mechanisms referenced above would likely require 
> > > > > > >> coordination with
> > > > > > >> QEMU and as such are open to discussion.  I haven't attempted to 
> > > > > > >> address
> > > > > > >> them as I am not sure there is a consensus as of yet.  My 
> > > > > > >> personal
> > > > > > >> preference would be to add a vendor-specific configuration block 
> > > > > > >> to the
> > > > > > >> emulated pci-bridge interfaces created by QEMU that would allow 
> > > > > > >> us to
> > > > > > >> essentially extend shpc to support guest live migration with 
> > > > > > >> pass-through
> > > > > > >> devices.
> > > > > > >
> > > > > > > shpc?
> > > > > > 
> > > > > > That is kind of what I was thinking.  We basically need some 
> > > > > > mechanism
> > > > > > to allow for the host to ask the device to quiesce.  It has been
> > > > > > proposed to possibly even look at something like an ACPI interface
> > > > > > since I know ACPI is used by QEMU to manage hot-plug in the standard
> > > > > > case.
> > > > > > 
> > > > > > - Alex
> > > > > 
> > > > > 
> > > > > Start by using hot-unplug for this!
> > > > > 
> > > > > Really use your patch guest side, and write host side
> > > > > to allow starting migration with the device, but
> > > > > defer completing it.
> > > > > 
> > > > > So
> > > > > 
> > > > > 1.- host tells guest to start tracking memory writes
> > > > > 2.- guest acks
> > > > > 3.- migration starts
> > > > > 4.- most memory is migrated
> > > > > 5.- host tells guest to eject device
> > > > > 6.- guest acks
> > > > > 7.- stop vm and migrate rest of state
> > > > > 
> > > > > 
> > > > > It will already be a win since hot unplug after migration starts and
> > > > > most memory has been migrated is better than hot unplug before 
> > > > > migration
> > > > > starts.
> > > > > 
> > > > > Then measure downtime and profile. Then we can look at ways
> > > > > to quiesce device faster which really means step 5 is replaced
> > > > > with "host tells guest to quiesce device and dirty (or just unmap!)
> > > > > all memory mapped for write by device".
> > > > 
> > > > 
> > > > Doing a hot-unplug is going to upset the guests network stacks view
> > > > of the world; that's something we don't want to change.
> > > > 
> > > > Dave
> > > 
> > > It might but if you store the IP and restore it quickly
> > > after migration e.g. using guest agent, as opposed to DHCP,
> > > then it won't.
> > 
> > I thought if you hot-unplug then it will lose any outstanding connections
> > on that device.
> > 
> > > It allows calming the device down in a generic way,
> > > specific drivers can then implement the fast quiesce.
> > 
> > Except that if it breaks the guest networking it's useless.
> > 
> > Dave
> 
> Is hot unplug useless then?

Actually I misunderstood the question, unplug does not
have to break guest networking.

> > > 
> > > > > 
> > > > > -- 
> > > > > MST
> > > > --
> > > > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> > --
> > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 0/2] VFIO SRIOV support

2016-01-05 Thread Haggai Eran
On 24/12/2015 15:51, Alex Williamson wrote:
> No.  A privileged entity needs to grant a user ownership of a group and
> sufficient locked memory limits to make it useful, but then use of the
> group does not require root permission.

So we're thinking how we can force the VFs in these cases to be in the same
IOMMU group with the PF, and make sure it is vfio-pci that probes them. We
thought about the following:

We could add a flag to pci_dev->dev_flags on the PF, that says that the PF's
VFs must be in the same IOMMU group with it. Modify
iommu_group_get_for_pci_dev() so that it will return the PFs group for VFs
whose PF has that flag set.

In the vfio_group_nb_add_dev() function set driver_override to "vfio-pci" for
PCI devices that are added to a live group. That would prevent the host from
probing these devices with the default driver.

What do you think?

Regards,
Haggai and Ilya
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: VIA Eden X4

2016-01-05 Thread Matwey V. Kornilov
2016-01-05 10:20 GMT+03:00 Bandan Das :
> "Matwey V. Kornilov"  writes:
>
>> Hello,
>>
>> According to WikiPedia VIA claims x86 hardware assisted virtualization
>> for VIA Eden X4 CPU.
>> Does anybody know if it is supported by Linux KVM?
>>
>
> I can't say for sure but my guess is that it should work since VIA implements
> VT-x like virtualization extensions, so KVM will find VMX capable hardware.
>

Do I understand correctly, that VIA engineers are not frequent guests
here and VIA doesn't disclose docs for their CPUs?

> Bandan
>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
With best regards,
Matwey V. Kornilov
http://blog.matwey.name
xmpp://0x2...@jabber.ru
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Michael S. Tsirkin
On Tue, Jan 05, 2016 at 10:45:25AM +, Dr. David Alan Gilbert wrote:
> * Michael S. Tsirkin (m...@redhat.com) wrote:
> > On Tue, Jan 05, 2016 at 10:01:04AM +, Dr. David Alan Gilbert wrote:
> > > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > > On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
> > > > > >> The two mechanisms referenced above would likely require 
> > > > > >> coordination with
> > > > > >> QEMU and as such are open to discussion.  I haven't attempted to 
> > > > > >> address
> > > > > >> them as I am not sure there is a consensus as of yet.  My personal
> > > > > >> preference would be to add a vendor-specific configuration block 
> > > > > >> to the
> > > > > >> emulated pci-bridge interfaces created by QEMU that would allow us 
> > > > > >> to
> > > > > >> essentially extend shpc to support guest live migration with 
> > > > > >> pass-through
> > > > > >> devices.
> > > > > >
> > > > > > shpc?
> > > > > 
> > > > > That is kind of what I was thinking.  We basically need some mechanism
> > > > > to allow for the host to ask the device to quiesce.  It has been
> > > > > proposed to possibly even look at something like an ACPI interface
> > > > > since I know ACPI is used by QEMU to manage hot-plug in the standard
> > > > > case.
> > > > > 
> > > > > - Alex
> > > > 
> > > > 
> > > > Start by using hot-unplug for this!
> > > > 
> > > > Really use your patch guest side, and write host side
> > > > to allow starting migration with the device, but
> > > > defer completing it.
> > > > 
> > > > So
> > > > 
> > > > 1.- host tells guest to start tracking memory writes
> > > > 2.- guest acks
> > > > 3.- migration starts
> > > > 4.- most memory is migrated
> > > > 5.- host tells guest to eject device
> > > > 6.- guest acks
> > > > 7.- stop vm and migrate rest of state
> > > > 
> > > > 
> > > > It will already be a win since hot unplug after migration starts and
> > > > most memory has been migrated is better than hot unplug before migration
> > > > starts.
> > > > 
> > > > Then measure downtime and profile. Then we can look at ways
> > > > to quiesce device faster which really means step 5 is replaced
> > > > with "host tells guest to quiesce device and dirty (or just unmap!)
> > > > all memory mapped for write by device".
> > > 
> > > 
> > > Doing a hot-unplug is going to upset the guests network stacks view
> > > of the world; that's something we don't want to change.
> > > 
> > > Dave
> > 
> > It might but if you store the IP and restore it quickly
> > after migration e.g. using guest agent, as opposed to DHCP,
> > then it won't.
> 
> I thought if you hot-unplug then it will lose any outstanding connections
> on that device.
> 
> > It allows calming the device down in a generic way,
> > specific drivers can then implement the fast quiesce.
> 
> Except that if it breaks the guest networking it's useless.
> 
> Dave

Is hot unplug useless then?

> > 
> > > > 
> > > > -- 
> > > > MST
> > > --
> > > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Dr. David Alan Gilbert
* Michael S. Tsirkin (m...@redhat.com) wrote:
> On Tue, Jan 05, 2016 at 10:45:25AM +, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > On Tue, Jan 05, 2016 at 10:01:04AM +, Dr. David Alan Gilbert wrote:
> > > > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > > > On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
> > > > > > >> The two mechanisms referenced above would likely require 
> > > > > > >> coordination with
> > > > > > >> QEMU and as such are open to discussion.  I haven't attempted to 
> > > > > > >> address
> > > > > > >> them as I am not sure there is a consensus as of yet.  My 
> > > > > > >> personal
> > > > > > >> preference would be to add a vendor-specific configuration block 
> > > > > > >> to the
> > > > > > >> emulated pci-bridge interfaces created by QEMU that would allow 
> > > > > > >> us to
> > > > > > >> essentially extend shpc to support guest live migration with 
> > > > > > >> pass-through
> > > > > > >> devices.
> > > > > > >
> > > > > > > shpc?
> > > > > > 
> > > > > > That is kind of what I was thinking.  We basically need some 
> > > > > > mechanism
> > > > > > to allow for the host to ask the device to quiesce.  It has been
> > > > > > proposed to possibly even look at something like an ACPI interface
> > > > > > since I know ACPI is used by QEMU to manage hot-plug in the standard
> > > > > > case.
> > > > > > 
> > > > > > - Alex
> > > > > 
> > > > > 
> > > > > Start by using hot-unplug for this!
> > > > > 
> > > > > Really use your patch guest side, and write host side
> > > > > to allow starting migration with the device, but
> > > > > defer completing it.
> > > > > 
> > > > > So
> > > > > 
> > > > > 1.- host tells guest to start tracking memory writes
> > > > > 2.- guest acks
> > > > > 3.- migration starts
> > > > > 4.- most memory is migrated
> > > > > 5.- host tells guest to eject device
> > > > > 6.- guest acks
> > > > > 7.- stop vm and migrate rest of state
> > > > > 
> > > > > 
> > > > > It will already be a win since hot unplug after migration starts and
> > > > > most memory has been migrated is better than hot unplug before 
> > > > > migration
> > > > > starts.
> > > > > 
> > > > > Then measure downtime and profile. Then we can look at ways
> > > > > to quiesce device faster which really means step 5 is replaced
> > > > > with "host tells guest to quiesce device and dirty (or just unmap!)
> > > > > all memory mapped for write by device".
> > > > 
> > > > 
> > > > Doing a hot-unplug is going to upset the guests network stacks view
> > > > of the world; that's something we don't want to change.
> > > > 
> > > > Dave
> > > 
> > > It might but if you store the IP and restore it quickly
> > > after migration e.g. using guest agent, as opposed to DHCP,
> > > then it won't.
> > 
> > I thought if you hot-unplug then it will lose any outstanding connections
> > on that device.
> > 
> > > It allows calming the device down in a generic way,
> > > specific drivers can then implement the fast quiesce.
> > 
> > Except that if it breaks the guest networking it's useless.
> > 
> > Dave
> 
> Is hot unplug useless then?

As a migration hack, yes, unless it's paired with a second network device
as a redundent route.
To do what's being suggested here it's got to be done at the device level
and not visible to the networking stack.

Dave

> 
> > > 
> > > > > 
> > > > > -- 
> > > > > MST
> > > > --
> > > > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> > --
> > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Michael S. Tsirkin
On Tue, Jan 05, 2016 at 11:03:38AM +, Dr. David Alan Gilbert wrote:
> * Michael S. Tsirkin (m...@redhat.com) wrote:
> > On Tue, Jan 05, 2016 at 10:45:25AM +, Dr. David Alan Gilbert wrote:
> > > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > > On Tue, Jan 05, 2016 at 10:01:04AM +, Dr. David Alan Gilbert wrote:
> > > > > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > > > > On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
> > > > > > > >> The two mechanisms referenced above would likely require 
> > > > > > > >> coordination with
> > > > > > > >> QEMU and as such are open to discussion.  I haven't attempted 
> > > > > > > >> to address
> > > > > > > >> them as I am not sure there is a consensus as of yet.  My 
> > > > > > > >> personal
> > > > > > > >> preference would be to add a vendor-specific configuration 
> > > > > > > >> block to the
> > > > > > > >> emulated pci-bridge interfaces created by QEMU that would 
> > > > > > > >> allow us to
> > > > > > > >> essentially extend shpc to support guest live migration with 
> > > > > > > >> pass-through
> > > > > > > >> devices.
> > > > > > > >
> > > > > > > > shpc?
> > > > > > > 
> > > > > > > That is kind of what I was thinking.  We basically need some 
> > > > > > > mechanism
> > > > > > > to allow for the host to ask the device to quiesce.  It has been
> > > > > > > proposed to possibly even look at something like an ACPI interface
> > > > > > > since I know ACPI is used by QEMU to manage hot-plug in the 
> > > > > > > standard
> > > > > > > case.
> > > > > > > 
> > > > > > > - Alex
> > > > > > 
> > > > > > 
> > > > > > Start by using hot-unplug for this!
> > > > > > 
> > > > > > Really use your patch guest side, and write host side
> > > > > > to allow starting migration with the device, but
> > > > > > defer completing it.
> > > > > > 
> > > > > > So
> > > > > > 
> > > > > > 1.- host tells guest to start tracking memory writes
> > > > > > 2.- guest acks
> > > > > > 3.- migration starts
> > > > > > 4.- most memory is migrated
> > > > > > 5.- host tells guest to eject device
> > > > > > 6.- guest acks
> > > > > > 7.- stop vm and migrate rest of state
> > > > > > 
> > > > > > 
> > > > > > It will already be a win since hot unplug after migration starts and
> > > > > > most memory has been migrated is better than hot unplug before 
> > > > > > migration
> > > > > > starts.
> > > > > > 
> > > > > > Then measure downtime and profile. Then we can look at ways
> > > > > > to quiesce device faster which really means step 5 is replaced
> > > > > > with "host tells guest to quiesce device and dirty (or just unmap!)
> > > > > > all memory mapped for write by device".
> > > > > 
> > > > > 
> > > > > Doing a hot-unplug is going to upset the guests network stacks view
> > > > > of the world; that's something we don't want to change.
> > > > > 
> > > > > Dave
> > > > 
> > > > It might but if you store the IP and restore it quickly
> > > > after migration e.g. using guest agent, as opposed to DHCP,
> > > > then it won't.
> > > 
> > > I thought if you hot-unplug then it will lose any outstanding connections
> > > on that device.
> > > 
> > > > It allows calming the device down in a generic way,
> > > > specific drivers can then implement the fast quiesce.
> > > 
> > > Except that if it breaks the guest networking it's useless.
> > > 
> > > Dave
> > 
> > Is hot unplug useless then?
> 
> As a migration hack, yes,

Based on a premise that it breaks connections but it does not
have to.

> unless it's paired with a second network device
> as a redundent route.

You can do this too.

But this is not a must at all.

> To do what's being suggested here it's got to be done at the device level
> and not visible to the networking stack.
> 
> Dave

Need for this was never demonstrated.

> > 
> > > > 
> > > > > > 
> > > > > > -- 
> > > > > > MST
> > > > > --
> > > > > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> > > --
> > > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1] kvm/x86: Hyper-V tsc page setup

2016-01-05 Thread Peter Hornyack
On Thu, Dec 24, 2015 at 1:33 AM, Andrey Smetanin
 wrote:
> Lately tsc page was implemented but filled with empty
> values. This patch setup tsc page scale and offset based
> on vcpu tsc, tsc_khz and  HV_X64_MSR_TIME_REF_COUNT value.
>
> The valid tsc page drops HV_X64_MSR_TIME_REF_COUNT msr
> reads count to zero which potentially improves performance.
>
> The patch applies on top of
> 'kvm: Make vcpu->requests as 64 bit bitmap'
> previously sent.
>
> Signed-off-by: Andrey Smetanin 
> CC: Paolo Bonzini 
> CC: Gleb Natapov 
> CC: Roman Kagan 
> CC: Denis V. Lunev 
> CC: qemu-de...@nongnu.org
Reviewed-by: Peter Hornyack 

>
> ---
>  arch/x86/kvm/hyperv.c| 117 
> +--
>  arch/x86/kvm/hyperv.h|   2 +
>  arch/x86/kvm/x86.c   |  12 +
>  include/linux/kvm_host.h |   1 +
>  4 files changed, 117 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index d50675a..504fdc7 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -753,6 +753,105 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu 
> *vcpu,
> return 0;
>  }
>
> +static u64 calc_tsc_page_scale(u32 tsc_khz)
> +{
> +   /*
> +* reftime (in 100ns) = tsc * tsc_scale / 2^64 + tsc_offset
> +* so reftime_delta = (tsc_delta * tsc_scale) / 2^64
> +* so tsc_scale = (2^64 * reftime_delta)/tsc_delta
> +* so tsc_scale = (2^64 * 10 * 10^6) / tsc_hz = (2^64 * 10000) / 
> tsc_khz
> +* so tsc_scale = (2^63 * 2 * 10000) / tsc_khz
> +*/
> +   return mul_u64_u32_div(1ULL << 63, 2 * 10000, tsc_khz);
> +}
> +
> +static int write_tsc_page(struct kvm *kvm, u64 gfn,
> + PHV_REFERENCE_TSC_PAGE tsc_ref)
> +{
> +   if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
> +   tsc_ref, sizeof(*tsc_ref)))
> +   return 1;
> +   mark_page_dirty(kvm, gfn);
> +   return 0;
> +}
> +
> +static int read_tsc_page(struct kvm *kvm, u64 gfn,
> +PHV_REFERENCE_TSC_PAGE tsc_ref)
> +{
> +   if (kvm_read_guest(kvm, gfn_to_gpa(gfn),
> +  tsc_ref, sizeof(*tsc_ref)))
> +   return 1;
> +   return 0;
> +}
> +
> +static u64 calc_tsc_page_time(struct kvm_vcpu *vcpu,
> + PHV_REFERENCE_TSC_PAGE tsc_ref)
> +{
> +
> +   u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> +
> +   return mul_u64_u64_shr(tsc, tsc_ref->tsc_scale, 64)
> +   + tsc_ref->tsc_offset;
> +}
> +
> +static int setup_blank_tsc_page(struct kvm_vcpu *vcpu, u64 gfn)
> +{
> +   HV_REFERENCE_TSC_PAGE tsc_ref;
> +
> +   memset(&tsc_ref, 0, sizeof(tsc_ref));
> +   return write_tsc_page(vcpu->kvm, gfn, &tsc_ref);
> +}
> +
> +int kvm_hv_setup_tsc_page(struct kvm_vcpu *vcpu)
> +{
> +   struct kvm *kvm = vcpu->kvm;
> +   struct kvm_hv *hv = &kvm->arch.hyperv;
> +   HV_REFERENCE_TSC_PAGE tsc_ref;
> +   u32 tsc_khz;
> +   int r;
> +   u64 gfn, ref_time, tsc_scale, tsc_offset, tsc;
> +
> +   if (WARN_ON_ONCE(!(hv->hv_tsc_page & 
> HV_X64_MSR_TSC_REFERENCE_ENABLE)))
> +   return -EINVAL;
> +
> +   gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
> +   vcpu_debug(vcpu, "tsc page gfn 0x%llx\n", gfn);
> +
> +   tsc_khz = vcpu->arch.virtual_tsc_khz;
> +   if (!tsc_khz) {
> +   vcpu_unimpl(vcpu, "no tsc khz\n");
> +   return setup_blank_tsc_page(vcpu, gfn);
> +   }
> +
> +   r = read_tsc_page(kvm, gfn, &tsc_ref);
> +   if (r) {
> +   vcpu_err(vcpu, "can't access tsc page gfn 0x%llx\n", gfn);
> +   return r;
> +   }
> +
> +   tsc_scale = calc_tsc_page_scale(tsc_khz);
> +   ref_time = get_time_ref_counter(kvm);
> +   tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> +
> +   /* tsc_offset = reftime - tsc * tsc_scale / 2^64 */
> +   tsc_offset = ref_time - mul_u64_u64_shr(tsc, tsc_scale, 64);
> +   vcpu_debug(vcpu, "tsc khz %u tsc %llu scale %llu offset %llu\n",
> +  tsc_khz, tsc, tsc_scale, tsc_offset);
> +
> +   tsc_ref.tsc_sequence++;
> +   if (tsc_ref.tsc_sequence == 0)

Also avoid tsc_sequence == 0xFFFFFFFF here. In the Hyper-V TLFS 4.0
(Win2012 R2) 0xFFFFFFFF is the special sequence number to disable the
reference TSC page.

> +   tsc_ref.tsc_sequence = 1;
> +
> +   tsc_ref.tsc_scale = tsc_scale;
> +   tsc_ref.tsc_offset = tsc_offset;
> +
> +   vcpu_debug(vcpu, "tsc page calibration time %llu vs. reftime %llu\n",
> +  calc_tsc_page_time(vcpu, &tsc_ref),
> +  get_time_ref_counter(kvm));
> +
> +   return write_tsc_page(kvm, gfn, &tsc_ref);
> +}
> +
>  static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
> 

Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Michael S. Tsirkin
On Tue, Jan 05, 2016 at 12:43:03PM +, Dr. David Alan Gilbert wrote:
> * Michael S. Tsirkin (m...@redhat.com) wrote:
> > On Tue, Jan 05, 2016 at 10:45:25AM +, Dr. David Alan Gilbert wrote:
> > > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > > On Tue, Jan 05, 2016 at 10:01:04AM +, Dr. David Alan Gilbert wrote:
> > > > > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > > > > On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
> > > > > > > >> The two mechanisms referenced above would likely require 
> > > > > > > >> coordination with
> > > > > > > >> QEMU and as such are open to discussion.  I haven't attempted 
> > > > > > > >> to address
> > > > > > > >> them as I am not sure there is a consensus as of yet.  My 
> > > > > > > >> personal
> > > > > > > >> preference would be to add a vendor-specific configuration 
> > > > > > > >> block to the
> > > > > > > >> emulated pci-bridge interfaces created by QEMU that would 
> > > > > > > >> allow us to
> > > > > > > >> essentially extend shpc to support guest live migration with 
> > > > > > > >> pass-through
> > > > > > > >> devices.
> > > > > > > >
> > > > > > > > shpc?
> > > > > > > 
> > > > > > > That is kind of what I was thinking.  We basically need some 
> > > > > > > mechanism
> > > > > > > to allow for the host to ask the device to quiesce.  It has been
> > > > > > > proposed to possibly even look at something like an ACPI interface
> > > > > > > since I know ACPI is used by QEMU to manage hot-plug in the 
> > > > > > > standard
> > > > > > > case.
> > > > > > > 
> > > > > > > - Alex
> > > > > > 
> > > > > > 
> > > > > > Start by using hot-unplug for this!
> > > > > > 
> > > > > > Really use your patch guest side, and write host side
> > > > > > to allow starting migration with the device, but
> > > > > > defer completing it.
> > > > > > 
> > > > > > So
> > > > > > 
> > > > > > 1.- host tells guest to start tracking memory writes
> > > > > > 2.- guest acks
> > > > > > 3.- migration starts
> > > > > > 4.- most memory is migrated
> > > > > > 5.- host tells guest to eject device
> > > > > > 6.- guest acks
> > > > > > 7.- stop vm and migrate rest of state
> > > > > > 
> > > > > > 
> > > > > > It will already be a win since hot unplug after migration starts and
> > > > > > most memory has been migrated is better than hot unplug before 
> > > > > > migration
> > > > > > starts.
> > > > > > 
> > > > > > Then measure downtime and profile. Then we can look at ways
> > > > > > to quiesce device faster which really means step 5 is replaced
> > > > > > with "host tells guest to quiesce device and dirty (or just unmap!)
> > > > > > all memory mapped for write by device".
> > > > > 
> > > > > 
> > > > > Doing a hot-unplug is going to upset the guests network stacks view
> > > > > of the world; that's something we don't want to change.
> > > > > 
> > > > > Dave
> > > > 
> > > > It might but if you store the IP and restore it quickly
> > > > after migration e.g. using guest agent, as opposed to DHCP,
> > > > then it won't.
> > > 
> > > I thought if you hot-unplug then it will lose any outstanding connections
> > > on that device.
> > 
> > Which connections and which device?  TCP connections and an ethernet
> > device?  These are on different layers so of course you don't lose them.
> > Just do not change the IP address.
> > 
> > Some guests send a signal to applications to close connections
> > when all links go down. One can work around this
> > in a variety of ways.
> 
> So, OK, I was surprised that a simple connection didn't go down when
> I tested and just removed the network card; I'd thought stuff was more
> aggressive when there was no route.
> But as you say, some stuff does close connections when the links go down/away
> so we do need to work around that; and any new outgoing connections get
> a 'no route to host'.


You can create a dummy device in guest for the duration of migration.
Use guest agent to move IP address there and that should be enough to trick 
most guests.


>  So I'm still nervous what will break.
> 
> Dave

I'm not saying nothing breaks.  Far from it.  For example, some NAT
or firewall implementations keep state per interface and these might
lose state (if using NAT/stateful firewall within guest).


So yes it *would* be useful to teach guests, for example, that a device
is "not dead, just resting" and that another device will shortly come
and take its place.


But the simple setup is already useful and worth supporting, and merging
things gradually will help this project finally get off the ground.


> > 
> > > > It allows calming the device down in a generic way,
> > > > specific drivers can then implement the fast quiesce.
> > > 
> > > Except that if it breaks the guest networking it's useless.
> > > 
> > > Dave
> > > 
> > > > 
> > > > > > 
> > > > > > -- 
> > > > > > MST
> > > > > --
> > > > > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> > > --
> > > Dr. David Alan Gilbert / 

Re: [RFC PATCH 0/3] x86: Add support for guest DMA dirty page tracking

2016-01-05 Thread Dr. David Alan Gilbert
* Michael S. Tsirkin (m...@redhat.com) wrote:
> On Tue, Jan 05, 2016 at 10:45:25AM +, Dr. David Alan Gilbert wrote:
> > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > On Tue, Jan 05, 2016 at 10:01:04AM +, Dr. David Alan Gilbert wrote:
> > > > * Michael S. Tsirkin (m...@redhat.com) wrote:
> > > > > On Mon, Jan 04, 2016 at 07:11:25PM -0800, Alexander Duyck wrote:
> > > > > > >> The two mechanisms referenced above would likely require 
> > > > > > >> coordination with
> > > > > > >> QEMU and as such are open to discussion.  I haven't attempted to 
> > > > > > >> address
> > > > > > >> them as I am not sure there is a consensus as of yet.  My 
> > > > > > >> personal
> > > > > > >> preference would be to add a vendor-specific configuration block 
> > > > > > >> to the
> > > > > > >> emulated pci-bridge interfaces created by QEMU that would allow 
> > > > > > >> us to
> > > > > > >> essentially extend shpc to support guest live migration with 
> > > > > > >> pass-through
> > > > > > >> devices.
> > > > > > >
> > > > > > > shpc?
> > > > > > 
> > > > > > That is kind of what I was thinking.  We basically need some 
> > > > > > mechanism
> > > > > > to allow for the host to ask the device to quiesce.  It has been
> > > > > > proposed to possibly even look at something like an ACPI interface
> > > > > > since I know ACPI is used by QEMU to manage hot-plug in the standard
> > > > > > case.
> > > > > > 
> > > > > > - Alex
> > > > > 
> > > > > 
> > > > > Start by using hot-unplug for this!
> > > > > 
> > > > > Really use your patch guest side, and write host side
> > > > > to allow starting migration with the device, but
> > > > > defer completing it.
> > > > > 
> > > > > So
> > > > > 
> > > > > 1.- host tells guest to start tracking memory writes
> > > > > 2.- guest acks
> > > > > 3.- migration starts
> > > > > 4.- most memory is migrated
> > > > > 5.- host tells guest to eject device
> > > > > 6.- guest acks
> > > > > 7.- stop vm and migrate rest of state
> > > > > 
> > > > > 
> > > > > It will already be a win since hot unplug after migration starts and
> > > > > most memory has been migrated is better than hot unplug before 
> > > > > migration
> > > > > starts.
> > > > > 
> > > > > Then measure downtime and profile. Then we can look at ways
> > > > > to quiesce device faster which really means step 5 is replaced
> > > > > with "host tells guest to quiesce device and dirty (or just unmap!)
> > > > > all memory mapped for write by device".
> > > > 
> > > > 
> > > > Doing a hot-unplug is going to upset the guests network stacks view
> > > > of the world; that's something we don't want to change.
> > > > 
> > > > Dave
> > > 
> > > It might but if you store the IP and restore it quickly
> > > after migration e.g. using guest agent, as opposed to DHCP,
> > > then it won't.
> > 
> > I thought if you hot-unplug then it will lose any outstanding connections
> > on that device.
> 
> Which connections and which device?  TCP connections and an ethernet
> device?  These are on different layers so of course you don't lose them.
> Just do not change the IP address.
> 
> Some guests send a signal to applications to close connections
> when all links go down. One can work around this
> in a variety of ways.

So, OK, I was surprised that a simple connection didn't go down when
I tested and just removed the network card; I'd thought stuff was more
aggressive when there was no route.
But as you say, some stuff does close connections when the links go down/away
so we do need to work around that; and any new outgoing connections get
a 'no route to host'.  So I'm still nervous what will break.

Dave

> 
> > > It allows calming the device down in a generic way,
> > > specific drivers can then implement the fast quiesce.
> > 
> > Except that if it breaks the guest networking it's useless.
> > 
> > Dave
> > 
> > > 
> > > > > 
> > > > > -- 
> > > > > MST
> > > > --
> > > > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> > --
> > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/4] x86, vdso, pvclock: Simplify and speed up the vdso pvclock reader

2016-01-04 Thread Marcelo Tosatti
On Sun, Dec 20, 2015 at 03:05:41AM -0800, Andy Lutomirski wrote:
> From: Andy Lutomirski 
> 
> The pvclock vdso code was too abstracted to understand easily and
> excessively paranoid.  Simplify it for a huge speedup.
> 
> This opens the door for additional simplifications, as the vdso no
> longer accesses the pvti for any vcpu other than vcpu 0.
> 
> Before, vclock_gettime using kvm-clock took about 45ns on my machine.
> With this change, it takes 29ns, which is almost as fast as the pure TSC
> implementation.
> 
> Reviewed-by: Paolo Bonzini 
> Signed-off-by: Andy Lutomirski 
> ---
>  arch/x86/entry/vdso/vclock_gettime.c | 81 
> 
>  1 file changed, 46 insertions(+), 35 deletions(-)
> 
> diff --git a/arch/x86/entry/vdso/vclock_gettime.c 
> b/arch/x86/entry/vdso/vclock_gettime.c
> index ca94fa649251..c325ba1bdddf 100644
> --- a/arch/x86/entry/vdso/vclock_gettime.c
> +++ b/arch/x86/entry/vdso/vclock_gettime.c
> @@ -78,47 +78,58 @@ static notrace const struct pvclock_vsyscall_time_info 
> *get_pvti(int cpu)
>  
>  static notrace cycle_t vread_pvclock(int *mode)
>  {
> - const struct pvclock_vsyscall_time_info *pvti;
> + const struct pvclock_vcpu_time_info *pvti = _pvti(0)->pvti;
>   cycle_t ret;
> - u64 last;
> - u32 version;
> - u8 flags;
> - unsigned cpu, cpu1;
> -
> + u64 tsc, pvti_tsc;
> + u64 last, delta, pvti_system_time;
> + u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
>  
>   /*
> -  * Note: hypervisor must guarantee that:
> -  * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
> -  * 2. that per-CPU pvclock time info is updated if the
> -  *underlying CPU changes.
> -  * 3. that version is increased whenever underlying CPU
> -  *changes.
> +  * Note: The kernel and hypervisor must guarantee that cpu ID
> +  * number maps 1:1 to per-CPU pvclock time info.
> +  *
> +  * Because the hypervisor is entirely unaware of guest userspace
> +  * preemption, it cannot guarantee that per-CPU pvclock time
> +  * info is updated if the underlying CPU changes or that that
> +  * version is increased whenever underlying CPU changes.
>*
> +  * On KVM, we are guaranteed that pvti updates for any vCPU are
> +  * atomic as seen by *all* vCPUs.  This is an even stronger
> +  * guarantee than we get with a normal seqlock.
> +  *
> +  * On Xen, we don't appear to have that guarantee, but Xen still
> +  * supplies a valid seqlock using the version field.
> +
> +  * We only do pvclock vdso timing at all if
> +  * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
> +  * mean that all vCPUs have matching pvti and that the TSC is
> +  * synced, so we can just look at vCPU 0's pvti.
>*/
> - do {
> - cpu = __getcpu() & VGETCPU_CPU_MASK;
> - /* TODO: We can put vcpu id into higher bits of pvti.version.
> -  * This will save a couple of cycles by getting rid of
> -  * __getcpu() calls (Gleb).
> -  */
> -
> - pvti = get_pvti(cpu);
> -
> - version = __pvclock_read_cycles(>pvti, , );
> -
> - /*
> -  * Test we're still on the cpu as well as the version.
> -  * We could have been migrated just after the first
> -  * vgetcpu but before fetching the version, so we
> -  * wouldn't notice a version change.
> -  */
> - cpu1 = __getcpu() & VGETCPU_CPU_MASK;
> - } while (unlikely(cpu != cpu1 ||
> -   (pvti->pvti.version & 1) ||
> -   pvti->pvti.version != version));
> -
> - if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
> +
> + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
>   *mode = VCLOCK_NONE;
> + return 0;
> + }
> +
> + do {
> + version = pvti->version;
> +
> + /* This is also a read barrier, so we'll read version first. */
> + tsc = rdtsc_ordered();
> +
> + pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
> + pvti_tsc_shift = pvti->tsc_shift;
> + pvti_system_time = pvti->system_time;
> + pvti_tsc = pvti->tsc_timestamp;
> +
> + /* Make sure that the version double-check is last. */
> + smp_rmb();
> + } while (unlikely((version & 1) || version != pvti->version));

Andy,

What happens if PVCLOCK_TSC_STABLE_BIT is disabled here?

> +
> + delta = tsc - pvti_tsc;
> + ret = pvti_system_time +
> + pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
> + pvti_tsc_shift);
>  
>   /* refer to tsc.c read_tsc() comment for rationale */
>   last = gtod->cycle_last;
> -- 
> 2.5.0
> 
> --
> To unsubscribe from this list: send the line 

Re: How to reserve guest physical region for ACPI

2016-01-04 Thread Laszlo Ersek
Michael CC'd me on the grandparent of the email below. I'll try to add
my thoughts in a single go, with regard to OVMF.

On 12/30/15 20:52, Michael S. Tsirkin wrote:
> On Wed, Dec 30, 2015 at 04:55:54PM +0100, Igor Mammedov wrote:
>> On Mon, 28 Dec 2015 14:50:15 +0200
>> "Michael S. Tsirkin"  wrote:
>>
>>> On Mon, Dec 28, 2015 at 10:39:04AM +0800, Xiao Guangrong wrote:

 Hi Michael, Paolo,

 Now it is the time to return to the challenge that how to reserve guest
 physical region internally used by ACPI.

 Igor suggested that:
 | An alternative place to allocate reserve from could be high memory.
 | For pc we have "reserved-memory-end" which currently makes sure
 | that hotpluggable memory range isn't used by firmware
 (https://lists.nongnu.org/archive/html/qemu-devel/2015-11/msg00926.html)

OVMF has no support for the "reserved-memory-end" fw_cfg file. The
reason is that nobody wrote that patch, nor asked for the patch to be
written. (Not implying that just requesting the patch would be
sufficient for the patch to be written.)

>>> I don't want to tie things to reserved-memory-end because this
>>> does not scale: next time we need to reserve memory,
>>> we'll need to find yet another way to figure out what is where.
>> Could you elaborate a bit more on a problem you're seeing?
>>
>> To me it looks like it scales rather well.
>> For example lets imagine that we adding a device
>> that has some on device memory that should be mapped into GPA
>> code to do so would look like:
>>
>>   pc_machine_device_plug_cb(dev)
>>   {
>>...
>>if (dev == OUR_NEW_DEVICE_TYPE) {
>>memory_region_add_subregion(as, current_reserved_end, >mr);
>>set_new_reserved_end(current_reserved_end + 
>> memory_region_size(>mr));
>>}
>>   }
>>
>> we can practically add any number of new devices that way.
> 
> Yes but we'll have to build a host side allocator for these, and that's
> nasty. We'll also have to maintain these addresses indefinitely (at
> least per machine version) as they are guest visible.
> Not only that, there's no way for guest to know if we move things
> around, so basically we'll never be able to change addresses.
> 
> 
>>  
>>> I would like ./hw/acpi/bios-linker-loader.c interface to be extended to
>>> support 64 bit RAM instead

This looks quite doable in OVMF, as long as the blob to allocate from
high memory contains *zero* ACPI tables.

(
Namely, each ACPI table is installed from the containing fw_cfg blob
with EFI_ACPI_TABLE_PROTOCOL.InstallAcpiTable(), and the latter has its
own allocation policy for the *copies* of ACPI tables it installs.

This allocation policy is left unspecified in the section of the UEFI
spec that governs EFI_ACPI_TABLE_PROTOCOL.

The current policy in edk2 (= the reference implementation) seems to be
"allocate from under 4GB". It is currently being changed to "try to
allocate from under 4GB, and if that fails, retry from high memory". (It
is motivated by Aarch64 machines that may have no DRAM at all under 4GB.)
)

>>> (and maybe a way to allocate and
>>> zero-initialize buffer without loading it through fwcfg),

Sounds reasonable.

>>> this way bios
>>> does the allocation, and addresses can be patched into acpi.
>> and then guest side needs to parse/execute some AML that would
>> initialize QEMU side so it would know where to write data.
> 
> Well not really - we can put it in a data table, by itself
> so it's easy to find.

Do you mean acpi_tb_find_table(), acpi_get_table_by_index() /
acpi_get_table_with_size()?

> 
> AML is only needed if access from ACPI is desired.
> 
> 
>> bios-linker-loader is a great interface for initializing some
>> guest owned data and linking it together but I think it adds
>> unnecessary complexity and is misused if it's used to handle
>> device owned data/on device memory in this and VMGID cases.
> 
> I want a generic interface for guest to enumerate these things.  linker
> seems quite reasonable but if you see a reason why it won't do, or want
> to propose a better interface, fine.

* The guest could do the following:
- while processing the ALLOCATE commands, it would make a note where in
GPA space each fw_cfg blob gets allocated
- at the end the guest would prepare a temporary array with a predefined
record format, that associates each fw_cfg blob's name with the concrete
allocation address
- it would create an FWCfgDmaAccess stucture pointing at this array,
with a new "control" bit set (or something similar)
- the guest could write the address of the FWCfgDmaAccess struct to the
appropriate register, as always.

* Another idea would be a GET_ALLOCATION_ADDRESS linker/loader command,
specifying:
- the fw_cfg blob's name, for which to retrieve the guest-allocated
  address (this command could only follow the matching ALLOCATE
  command, never precede it)
- a flag whether the address should be written to IO or MMIO space
  (would be likely IO on x86, MMIO on ARM)
- a unique 

Re: [RFC PATCH v2 3/3] vfio-pci: Allow to mmap MSI-X table if EEH is supported

2016-01-04 Thread Alex Williamson
On Thu, 2015-12-31 at 16:50 +0800, Yongji Xie wrote:
> Current vfio-pci implementation disallows to mmap MSI-X
> table in case that user get to touch this directly.
> 
> However, EEH mechanism can ensure that a given pci device
> can only shoot the MSIs assigned for its PE. So we think
> it's safe to expose the MSI-X table to userspace because
> the exposed MSI-X table can't be used to do harm to other
> memory space.
> 
> And with MSI-X table mmapped, some performance issues which
> are caused when PCI adapters have critical registers in the
> same page as the MSI-X table also can be resolved.
> 
> So this patch adds a Kconfig option, VFIO_PCI_MMAP_MSIX,
> to support for mmapping MSI-X table.
> 
> Signed-off-by: Yongji Xie 
> ---
>  drivers/vfio/pci/Kconfig|4 
>  drivers/vfio/pci/vfio_pci.c |6 --
>  2 files changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 02912f1..67b0a2c 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -23,6 +23,10 @@ config VFIO_PCI_MMAP
>   depends on VFIO_PCI
>   def_bool y if !S390
>  
> +config VFIO_PCI_MMAP_MSIX
> + depends on VFIO_PCI_MMAP
> + def_bool y if EEH

Does CONFIG_EEH necessarily mean the EEH is enabled?  Could the system
not support EEH or could EEH be disabled via kernel commandline
options?

> +
>  config VFIO_PCI_INTX
>   depends on VFIO_PCI
>   def_bool y if !S390
> diff --git a/drivers/vfio/pci/vfio_pci.c
> b/drivers/vfio/pci/vfio_pci.c
> index 09b3805..d536985 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -555,7 +555,8 @@ static long vfio_pci_ioctl(void *device_data,
>   IORESOURCE_MEM && (info.size >=
> PAGE_SIZE ||
>   pci_resource_page_aligned)) {
>   info.flags |=
> VFIO_REGION_INFO_FLAG_MMAP;
> - if (info.index == vdev->msix_bar) {
> + if
> (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP_MSIX) &&
> + info.index == vdev->msix_bar) {
>   ret =
> msix_sparse_mmap_cap(vdev, );
>   if (ret)
>   return ret;
> @@ -967,7 +968,8 @@ static int vfio_pci_mmap(void *device_data,
> struct vm_area_struct *vma)
>   if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
>   return -EINVAL;
>  
> - if (index == vdev->msix_bar) {
> + if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP_MSIX) &&
> + index == vdev->msix_bar) {
>   /*
>    * Disallow mmaps overlapping the MSI-X table; users
> don't
>    * get to touch this directly.  We could find
> somewhere

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v2 1/3] PCI: Add support for enforcing all MMIO BARs to be page aligned

2016-01-04 Thread Alex Williamson
On Thu, 2015-12-31 at 16:50 +0800, Yongji Xie wrote:
> When vfio passthrough a PCI device of which MMIO BARs
> are smaller than PAGE_SIZE, guest will not handle the
> mmio accesses to the BARs which leads to mmio emulations
> in host.
> 
> This is because vfio will not allow to passthrough one
> BAR's mmio page which may be shared with other BARs.
> 
> To solve this performance issue, this patch adds a kernel
> parameter "pci=resource_page_aligned=on" to enforce
> the alignments of all MMIO BARs to be at least PAGE_SIZE,
> so that one BAR's mmio page would not be shared with other
> BARs. We can also disable it through kernel parameter
> "pci=resource_page_aligned=off".

Shouldn't this somehow be associated with the realloc option?  I don't
think PCI code will attempt to reprogram anything unless it needs to
otherwise.

> For the default value of this parameter, we think it should be
> arch-independent, so we add a macro PCI_RESOURCE_PAGE_ALIGNED
> to change it. And we define this macro to enable this parameter
> by default on PPC64 platform which can easily hit this
> performance issue because its PAGE_SIZE is 64KB.
> 
> Signed-off-by: Yongji Xie 
> ---
>  Documentation/kernel-parameters.txt |4 
>  arch/powerpc/include/asm/pci.h  |   11 +++
>  drivers/pci/pci.c   |   17 +
>  drivers/pci/pci.h   |7 ++-
>  include/linux/pci.h |2 ++
>  5 files changed, 40 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/kernel-parameters.txt 
> b/Documentation/kernel-parameters.txt
> index 742f69d..a53aaee 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -2857,6 +2857,10 @@ bytes respectively. Such letter suffixes can also be 
> entirely omitted.
>   PAGE_SIZE is used as alignment.
>   PCI-PCI bridge can be specified, if resource
>   windows need to be expanded.
> + resource_page_aligned=  Enable/disable enforcing the alignment
> + of all PCI devices' memory resources to be
> + at least PAGE_SIZE.
> + Format: { "on" | "off" }
>   ecrc=   Enable/disable PCIe ECRC (transaction layer
>   end-to-end CRC checking).
>   bios: Use BIOS/firmware settings. This is the
> diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
> index 3453bd8..27bff59 100644
> --- a/arch/powerpc/include/asm/pci.h
> +++ b/arch/powerpc/include/asm/pci.h
> @@ -136,6 +136,17 @@ extern pgprot_t  pci_phys_mem_access_prot(struct file 
> *file,
>    unsigned long pfn,
>    unsigned long size,
>    pgprot_t prot);
> +#ifdef CONFIG_PPC64
> +
> +/* For PPC64, We enforce all PCI MMIO BARs to be page aligned
> + * by default. This would be helpful to improve performance
> + * when we passthrough a PCI device of which BARs are smaller
> + * than PAGE_SIZE(64KB). And we can use bootcmd
> + * "pci=resource_page_aligned=off" to disable it.
> + */
> +#define PCI_ENABLE_RESOURCE_PAGE_ALIGNED
> +
> +#endif

This should be done with something like HAVE_PCI_DEFAULT_RESOURCE_PAGE_ALIGNED
in arch/powerpc/include/asm

>  #define HAVE_ARCH_PCI_RESOURCE_TO_USER
>  extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 314db8c..9f14ba5 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -99,6 +99,13 @@ u8 pci_cache_line_size;
>   */
>  unsigned int pcibios_max_latency = 255;
>  
> +#ifdef PCI_ENABLE_RESOURCE_PAGE_ALIGNED
> +bool pci_resource_page_aligned = true;
> +#else
> +bool pci_resource_page_aligned;
> +#endif
> +EXPORT_SYMBOL(pci_resource_page_aligned);

Couldn't this be done in a single line with IS_ENABLED() macro?

Should this symbol be GPL-only?

> +
>  /* If set, the PCIe ARI capability will not be used. */
>  static bool pcie_ari_disabled;
>  
> @@ -4746,6 +4753,14 @@ static ssize_t pci_resource_alignment_store(struct 
> bus_type *bus,
>  BUS_ATTR(resource_alignment, 0644, pci_resource_alignment_show,
>   pci_resource_alignment_store);
>  
> +static void pci_resource_get_page_aligned(char *str)
> +{
> + if (!strncmp(str, "off", 3))
> + pci_resource_page_aligned = false;
> + else if (!strncmp(str, "on", 2))
> + pci_resource_page_aligned = true;
> +}
> +
>  static int __init pci_resource_alignment_sysfs_init(void)
>  {
>   return bus_create_file(_bus_type,
> @@ -4859,6 +4874,8 @@ static int __init pci_setup(char *str)
>   } else if (!strncmp(str, "resource_alignment=", 19)) {
>   

[PATCH 3/6] nvdimm acpi: introduce patched dsm memory

2016-01-04 Thread Xiao Guangrong
The dsm memory is used to save the input parameters and store
the dsm result which is filled by QEMU.

The address of dsm memory is decided by bios and patched into
int64 object returned by "MEMA" method

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 12 
 hw/acpi/nvdimm.c| 24 ++--
 include/hw/acpi/aml-build.h |  1 +
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 78e1290..83eadb3 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -394,6 +394,18 @@ Aml *aml_int(const uint64_t val)
 }
 
 /*
+ * ACPI 1.0b: 16.2.3 Data Objects Encoding:
+ * encode: QWordConst
+ */
+Aml *aml_int64(const uint64_t val)
+{
+Aml *var = aml_alloc();
+build_append_byte(var->buf, 0x0E); /* QWordPrefix */
+build_append_int_noprefix(var->buf, val, 8);
+return var;
+}
+
+/*
  * helper to construct NameString, which returns Aml object
  * for using with aml_append or other aml_* terms
  */
diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index bc7cd8f..a72104c 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -28,6 +28,7 @@
 
 #include "hw/acpi/acpi.h"
 #include "hw/acpi/aml-build.h"
+#include "hw/acpi/bios-linker-loader.h"
 #include "hw/nvram/fw_cfg.h"
 #include "hw/mem/nvdimm.h"
 
@@ -402,7 +403,8 @@ void nvdimm_init_acpi_state(AcpiNVDIMMState *state, 
MemoryRegion *io,
 state->dsm_mem->len);
 }
 
-#define NVDIMM_COMMON_DSM  "NCAL"
+#define NVDIMM_GET_DSM_MEM  "MEMA"
+#define NVDIMM_COMMON_DSM   "NCAL"
 
 static void nvdimm_build_common_dsm(Aml *dev)
 {
@@ -468,7 +470,8 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray 
*table_offsets,
   GArray *table_data, GArray *linker,
   uint8_t revision)
 {
-Aml *ssdt, *sb_scope, *dev;
+Aml *ssdt, *sb_scope, *dev, *method;
+int offset;
 
 acpi_add_table(table_offsets, table_data);
 
@@ -499,9 +502,26 @@ static void nvdimm_build_ssdt(GSList *device_list, GArray 
*table_offsets,
 
 aml_append(sb_scope, dev);
 
+/*
+ * leave it at the end of ssdt so that we can conveniently get the
+ * offset of int64 object returned by the function which will be
+ * patched with the real address of the dsm memory by BIOS.
+ */
+method = aml_method(NVDIMM_GET_DSM_MEM, 0, AML_NOTSERIALIZED);
+aml_append(method, aml_return(aml_int64(0x0)));
+aml_append(sb_scope, method);
 aml_append(ssdt, sb_scope);
 /* copy AML table into ACPI tables blob and patch header there */
 g_array_append_vals(table_data, ssdt->buf->data, ssdt->buf->len);
+
+offset = table_data->len - 8;
+
+bios_linker_loader_alloc(linker, NVDIMM_DSM_MEM_FILE, TARGET_PAGE_SIZE,
+ false /* high memory */);
+bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
+   NVDIMM_DSM_MEM_FILE, table_data,
+   table_data->data + offset,
+   sizeof(uint64_t));
 build_header(linker, table_data,
 (void *)(table_data->data + table_data->len - ssdt->buf->len),
 "SSDT", ssdt->buf->len, revision, "NVDIMM");
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index ef44d02..b4726a4 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -246,6 +246,7 @@ Aml *aml_name(const char *name_format, ...) GCC_FMT_ATTR(1, 
2);
 Aml *aml_name_decl(const char *name, Aml *val);
 Aml *aml_return(Aml *val);
 Aml *aml_int(const uint64_t val);
+Aml *aml_int64(const uint64_t val);
 Aml *aml_arg(int pos);
 Aml *aml_to_integer(Aml *arg);
 Aml *aml_to_hexstring(Aml *src, Aml *dst);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/6] nvdimm acpi: emulate dsm method

2016-01-04 Thread Xiao Guangrong
Emulate dsm method after IO VM-exit

Currently, we only introduce the framework and no function is actually
supported

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c |  2 +-
 hw/acpi/nvdimm.c| 83 -
 include/hw/acpi/aml-build.h |  1 +
 include/hw/mem/nvdimm.h | 17 ++
 4 files changed, 101 insertions(+), 2 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index e65171f..5a7644a 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -231,7 +231,7 @@ static void build_extop_package(GArray *package, uint8_t op)
 build_prepend_byte(package, 0x5B); /* ExtOpPrefix */
 }
 
-static void build_append_int_noprefix(GArray *table, uint64_t value, int size)
+void build_append_int_noprefix(GArray *table, uint64_t value, int size)
 {
 int i;
 
diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index dfccbc0..7be9857 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -390,12 +390,80 @@ typedef struct NvdimmDsmOut NvdimmDsmOut;
 static uint64_t
 nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
 {
+fprintf(stderr, "BUG: we never read _DSM IO Port.\n");
 return 0;
 }
 
 static void
 nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
 {
+AcpiNVDIMMState *state = opaque;
+NvdimmDsmIn *in;
+hwaddr dsm_mem_addr;
+GArray *out;
+uint32_t buf_size;
+
+nvdimm_debug("write address %#lx value %#lx.\n", addr, val);
+
+if (size != sizeof(uint32_t)) {
+fprintf(stderr, "BUG: invalid IO bit width %#x.\n", size);
+return;
+}
+
+switch (addr) {
+case 0:
+state->low_dsm_mem_addr = val;
+return;
+case sizeof(uint32_t):
+state->high_dsm_mem_addr = val;
+break;
+default:
+fprintf(stderr, "BUG: IO access address %#lx is not dword"
+" aligned.\n", addr);
+return;
+};
+
+dsm_mem_addr = state->low_dsm_mem_addr;
+dsm_mem_addr |= (hwaddr)state->high_dsm_mem_addr << (sizeof(uint32_t) *
+BITS_PER_BYTE);
+nvdimm_debug("dsm address %#lx\n", dsm_mem_addr);
+
+/*
+ * The DSM memory is mapped to guest address space so an evil guest
+ * can change its content while we are doing DSM emulation. Avoid
+ * this by copying DSM memory to QEMU local memory.
+ */
+in = g_malloc(TARGET_PAGE_SIZE);
+cpu_physical_memory_read(dsm_mem_addr, in, TARGET_PAGE_SIZE);
+
+le32_to_cpus(>revision);
+le32_to_cpus(>function);
+le32_to_cpus(>handle);
+
+nvdimm_debug("Revision %#x Handler %#x Function %#x.\n", in->revision,
+ in->handle, in->function);
+
+out = g_array_new(false, true /* clear */, 1);
+
+/*
+ * function 0 is called to inquire what functions are supported by
+ * OSPM
+ */
+if (in->function == 0) {
+build_append_int_noprefix(out, 0 /* No function Supported */,
+  sizeof(uint8_t));
+} else {
+/* No function is supported yet. */
+build_append_int_noprefix(out, 1 /* Not Supported */,
+  sizeof(uint8_t));
+}
+
+buf_size = cpu_to_le32(out->len);
+cpu_physical_memory_write(dsm_mem_addr, _size, sizeof(buf_size));
+cpu_physical_memory_write(dsm_mem_addr + sizeof(buf_size), out->data,
+  out->len);
+g_free(in);
+g_array_free(out, true);
 }
 
 static const MemoryRegionOps nvdimm_dsm_ops = {
@@ -408,6 +476,17 @@ static const MemoryRegionOps nvdimm_dsm_ops = {
 },
 };
 
+static const VMStateDescription nvdimm_acpi_vmstate = {
+.name = "nvdimm_acpi_vmstate",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_UINT32(low_dsm_mem_addr, AcpiNVDIMMState),
+VMSTATE_UINT32(high_dsm_mem_addr, AcpiNVDIMMState),
+VMSTATE_END_OF_LIST()
+},
+};
+
 void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io,
 FWCfgState *fw_cfg, Object *owner)
 {
@@ -419,6 +498,8 @@ void nvdimm_init_acpi_state(AcpiNVDIMMState *state, 
MemoryRegion *io,
 acpi_data_push(state->dsm_mem, TARGET_PAGE_SIZE);
 fw_cfg_add_file(fw_cfg, NVDIMM_DSM_MEM_FILE, state->dsm_mem->data,
 state->dsm_mem->len);
+
+vmstate_register(NULL, 0, &nvdimm_acpi_vmstate, state);
 }
 
 #define NVDIMM_GET_DSM_MEM  "MEMA"
@@ -430,7 +511,7 @@ static void nvdimm_build_common_dsm(Aml *dev)
 Aml *result_size, *dsm_mem;
 uint8_t byte_list[1];
 
-method = aml_method(NVDIMM_COMMON_DSM, 4, AML_NOTSERIALIZED);
+method = aml_method(NVDIMM_COMMON_DSM, 4, AML_SERIALIZED);
 function = aml_arg(2);
 dsm_mem = aml_arg(3);
 
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 6c1816e..2fa8daa 100644
--- a/include/hw/acpi/aml-build.h
+++ 

[PATCH 4/6] acpi: allow using acpi named offset for OperationRegion

2016-01-04 Thread Xiao Guangrong
Extend aml_operation_region() to use named object

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 4 ++--
 hw/i386/acpi-build.c| 7 ---
 include/hw/acpi/aml-build.h | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 83eadb3..677c1a6 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -958,14 +958,14 @@ Aml *aml_package(uint8_t num_elements)
 
 /* ACPI 1.0b: 16.2.5.2 Named Objects Encoding: DefOpRegion */
 Aml *aml_operation_region(const char *name, AmlRegionSpace rs,
-  uint32_t offset, uint32_t len)
+  Aml *offset, uint32_t len)
 {
 Aml *var = aml_alloc();
 build_append_byte(var->buf, 0x5B); /* ExtOpPrefix */
 build_append_byte(var->buf, 0x80); /* OpRegionOp */
 build_append_namestring(var->buf, "%s", name);
 build_append_byte(var->buf, rs);
-build_append_int(var->buf, offset);
+aml_append(var, offset);
 build_append_int(var->buf, len);
 return var;
 }
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 0836119..ad10c48 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1139,7 +1139,7 @@ build_ssdt(GArray *table_data, GArray *linker,
 aml_append(dev, aml_name_decl("_CRS", crs));
 
 aml_append(dev, aml_operation_region("PEOR", AML_SYSTEM_IO,
-  misc->pvpanic_port, 1));
+  aml_int(misc->pvpanic_port), 1));
 field = aml_field("PEOR", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE);
 aml_append(field, aml_named_field("PEPT", 8));
 aml_append(dev, field);
@@ -1179,7 +1179,8 @@ build_ssdt(GArray *table_data, GArray *linker,
 aml_append(sb_scope, dev);
 /* declare CPU hotplug MMIO region and PRS field to access it */
 aml_append(sb_scope, aml_operation_region(
-"PRST", AML_SYSTEM_IO, pm->cpu_hp_io_base, pm->cpu_hp_io_len));
+"PRST", AML_SYSTEM_IO, aml_int(pm->cpu_hp_io_base),
+pm->cpu_hp_io_len));
 field = aml_field("PRST", AML_BYTE_ACC, AML_NOLOCK, AML_PRESERVE);
 aml_append(field, aml_named_field("PRS", 256));
 aml_append(sb_scope, field);
@@ -1251,7 +1252,7 @@ build_ssdt(GArray *table_data, GArray *linker,
 
 aml_append(scope, aml_operation_region(
 stringify(MEMORY_HOTPLUG_IO_REGION), AML_SYSTEM_IO,
-pm->mem_hp_io_base, pm->mem_hp_io_len)
+aml_int(pm->mem_hp_io_base), pm->mem_hp_io_len)
 );
 
 field = aml_field(stringify(MEMORY_HOTPLUG_IO_REGION), AML_DWORD_ACC,
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index b4726a4..a8d8f3b 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -285,7 +285,7 @@ Aml *aml_interrupt(AmlConsumerAndProducer con_and_pro,
 Aml *aml_io(AmlIODecode dec, uint16_t min_base, uint16_t max_base,
 uint8_t aln, uint8_t len);
 Aml *aml_operation_region(const char *name, AmlRegionSpace rs,
-  uint32_t offset, uint32_t len);
+  Aml *offset, uint32_t len);
 Aml *aml_irq_no_flags(uint8_t irq);
 Aml *aml_named_field(const char *name, unsigned length);
 Aml *aml_reserved_field(unsigned length);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/6] nvdimm acpi: initialize the resource used by NVDIMM ACPI

2016-01-04 Thread Xiao Guangrong
IO port 0x0a18 - 0x0a20 in guest is reserved for NVDIMM ACPI emulation,
the table, NVDIMM_DSM_MEM_FILE, will be patched into NVDIMM ACPI
binary code

OSPM uses this port to tell QEMU the final address of the DSM memory
and notify QEMU to emulate the DSM method

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/Makefile.objs   |  2 +-
 hw/acpi/nvdimm.c| 35 +++
 hw/i386/acpi-build.c| 10 +-
 hw/i386/pc.c|  8 +---
 hw/i386/pc_piix.c   |  5 +
 hw/i386/pc_q35.c|  8 +++-
 include/hw/i386/pc.h|  5 -
 include/hw/mem/nvdimm.h | 25 -
 8 files changed, 82 insertions(+), 16 deletions(-)

diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
index 095597f..84c082d 100644
--- a/hw/acpi/Makefile.objs
+++ b/hw/acpi/Makefile.objs
@@ -2,7 +2,7 @@ common-obj-$(CONFIG_ACPI_X86) += core.o piix4.o pcihp.o
 common-obj-$(CONFIG_ACPI_X86_ICH) += ich9.o tco.o
 common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu_hotplug.o
 common-obj-$(CONFIG_ACPI_MEMORY_HOTPLUG) += memory_hotplug.o
-common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
+obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
 common-obj-$(CONFIG_ACPI) += acpi_interface.o
 common-obj-$(CONFIG_ACPI) += bios-linker-loader.o
 common-obj-$(CONFIG_ACPI) += aml-build.o
diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index a2c58dd..bc7cd8f 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -28,6 +28,7 @@
 
 #include "hw/acpi/acpi.h"
 #include "hw/acpi/aml-build.h"
+#include "hw/nvram/fw_cfg.h"
 #include "hw/mem/nvdimm.h"
 
 static int nvdimm_plugged_device_list(Object *obj, void *opaque)
@@ -367,6 +368,40 @@ static void nvdimm_build_nfit(GSList *device_list, GArray 
*table_offsets,
 g_array_free(structures, true);
 }
 
+static uint64_t
+nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
+{
+return 0;
+}
+
+static void
+nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
+{
+}
+
+static const MemoryRegionOps nvdimm_dsm_ops = {
+.read = nvdimm_dsm_read,
+.write = nvdimm_dsm_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+.valid = {
+.min_access_size = 4,
+.max_access_size = 4,
+},
+};
+
+void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io,
+FWCfgState *fw_cfg, Object *owner)
+{
+memory_region_init_io(&state->io_mr, owner, &nvdimm_dsm_ops, state,
+  "nvdimm-acpi-io", NVDIMM_ACPI_IO_LEN);
+memory_region_add_subregion(io, NVDIMM_ACPI_IO_BASE, &state->io_mr);
+
+state->dsm_mem = g_array_new(false, true /* clear */, 1);
+acpi_data_push(state->dsm_mem, TARGET_PAGE_SIZE);
+fw_cfg_add_file(fw_cfg, NVDIMM_DSM_MEM_FILE, state->dsm_mem->data,
+state->dsm_mem->len);
+}
+
 #define NVDIMM_COMMON_DSM  "NCAL"
 
 static void nvdimm_build_common_dsm(Aml *dev)
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 4674461..0836119 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -39,7 +39,6 @@
 #include "hw/loader.h"
 #include "hw/isa/isa.h"
 #include "hw/acpi/memory_hotplug.h"
-#include "hw/mem/nvdimm.h"
 #include "sysemu/tpm.h"
 #include "hw/acpi/tpm.h"
 #include "sysemu/tpm_backend.h"
@@ -1696,13 +1695,6 @@ static bool acpi_has_iommu(void)
 return intel_iommu && !ambiguous;
 }
 
-static bool acpi_has_nvdimm(void)
-{
-PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
-
-return pcms->nvdimm;
-}
-
 static
 void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
 {
@@ -1787,7 +1779,7 @@ void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables 
*tables)
 build_dmar_q35(tables_blob, tables->linker);
 }
 
-if (acpi_has_nvdimm()) {
+if (guest_info->has_nvdimm) {
 nvdimm_build_acpi(table_offsets, tables_blob, tables->linker,
   pm.dsdt_revision);
 }
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 459260b..c7819e7 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1220,6 +1220,8 @@ PcGuestInfo *pc_guest_info_init(PCMachineState *pcms)
 }
 }
 
+guest_info->has_nvdimm = pcms->acpi_nvdimm_state.is_enabled;
+
 guest_info_state->machine_done.notify = pc_guest_info_machine_done;
+qemu_add_machine_init_done_notifier(&guest_info_state->machine_done);
 return guest_info;
@@ -1869,14 +1871,14 @@ static bool pc_machine_get_nvdimm(Object *obj, Error 
**errp)
 {
 PCMachineState *pcms = PC_MACHINE(obj);
 
-return pcms->nvdimm;
+return pcms->acpi_nvdimm_state.is_enabled;
 }
 
 static void pc_machine_set_nvdimm(Object *obj, bool value, Error **errp)
 {
 PCMachineState *pcms = PC_MACHINE(obj);
 
-pcms->nvdimm = value;
+pcms->acpi_nvdimm_state.is_enabled = value;
 }
 
 static void pc_machine_initfn(Object *obj)
@@ -1915,7 +1917,7 @@ static void pc_machine_initfn(Object *obj)
 _abort);
 
 /* nvdimm is disabled on default. */
-pcms->nvdimm = false;
+   

  1   2   3   4   5   6   7   8   9   10   >