Re: [PATCH 1/2] KVM: arm64: PSCI: Narrow input registers when using 32bit functions

2020-04-02 Thread Christoffer Dall
On Wed, Apr 01, 2020 at 05:58:15PM +0100, Marc Zyngier wrote:
> When a guest deliberately uses an SMC32 function number (which is allowed),
> we should make sure we drop the top 32bits from the input arguments, as they
> could legitimately be junk.
> 
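A minimal standalone sketch of what the narrowing means in practice (illustrative values only, not part of the patch): an SMC32 caller only guarantees the low 32 bits of each argument register, so the hypervisor must truncate before using them.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical x1 as seen at EL2: the caller only defined w1 = 1 */
	uint64_t x1 = 0xcafef00d00000001ULL;

	/* What a 32-bit PSCI call such as CPU_ON should actually consume */
	uint64_t target = (uint32_t)x1;

	printf("raw: %#llx, narrowed: %#llx\n",
	       (unsigned long long)x1, (unsigned long long)target);
	return 0;
}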
> Reported-by: Christoffer Dall 
> Signed-off-by: Marc Zyngier 
> ---
>  virt/kvm/arm/psci.c | 16 
>  1 file changed, 16 insertions(+)
> 
> diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
> index 17e2bdd4b76f..69ff4a51ceb5 100644
> --- a/virt/kvm/arm/psci.c
> +++ b/virt/kvm/arm/psci.c
> @@ -187,6 +187,18 @@ static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
>   kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
>  }
>  
> +static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
> +{
> + int i;
> +
> + /*
> +  * Zero the input registers' upper 32 bits. They will be fully
> +  * zeroed on exit, so we're fine changing them in place.
> +  */
> + for (i = 1; i < 4; i++)
> + vcpu_set_reg(vcpu, i, (u32)vcpu_get_reg(vcpu, i));
> +}
> +
>  static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
>  {
>   struct kvm *kvm = vcpu->kvm;
> @@ -211,12 +223,16 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
>   val = PSCI_RET_SUCCESS;
>   break;
>   case PSCI_0_2_FN_CPU_ON:
> + kvm_psci_narrow_to_32bit(vcpu);
> + fallthrough;
>   case PSCI_0_2_FN64_CPU_ON:
>   mutex_lock(&kvm->lock);
>   val = kvm_psci_vcpu_on(vcpu);
>   mutex_unlock(&kvm->lock);
>   break;
>   case PSCI_0_2_FN_AFFINITY_INFO:
> + kvm_psci_narrow_to_32bit(vcpu);
> + fallthrough;
>   case PSCI_0_2_FN64_AFFINITY_INFO:
>   val = kvm_psci_vcpu_affinity_info(vcpu);
>   break;
> -- 
> 2.25.0
> 

Reviewed-by: Christoffer Dall 
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 2/2] KVM: arm64: PSCI: Forbid 64bit functions for 32bit guests

2020-04-02 Thread Christoffer Dall
On Wed, Apr 01, 2020 at 05:58:16PM +0100, Marc Zyngier wrote:
> Implementing (and even advertising) 64bit PSCI functions to 32bit
> guests is at least a bit odd, if not altogether violating the
> spec which says ("5.2.1 Register usage in arguments and return values"):
> 
> "Adherence to the SMC Calling Conventions implies that any AArch32
> caller of an SMC64 function will get a return code of 0xFFFFFFFF (int32).
> This matches the NOT_SUPPORTED error code used in PSCI"
> 
> Tighten the implementation by pretending these functions are not
> there for 32bit guests.
> 
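A small userspace sketch (not kernel code) of why returning PSCI_RET_NOT_SUPPORTED satisfies the spec text quoted above: the value is -1, which an AArch32 caller observes in r0 as 0xFFFFFFFF.

#include <stdio.h>
#include <stdint.h>

#define PSCI_RET_NOT_SUPPORTED	(-1)	/* as in include/uapi/linux/psci.h */

int main(void)
{
	int64_t ret = PSCI_RET_NOT_SUPPORTED;

	/* The 32 bits an AArch32 caller can actually see */
	printf("%#x\n", (uint32_t)ret);	/* prints 0xffffffff */
	return 0;
}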
> Signed-off-by: Marc Zyngier 
> ---
>  virt/kvm/arm/psci.c | 24 
>  1 file changed, 24 insertions(+)
> 
> diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
> index 69ff4a51ceb5..122795cdd984 100644
> --- a/virt/kvm/arm/psci.c
> +++ b/virt/kvm/arm/psci.c
> @@ -199,6 +199,21 @@ static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
>   vcpu_set_reg(vcpu, i, (u32)vcpu_get_reg(vcpu, i));
>  }
>  
> +static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn)
> +{
> + switch(fn) {
> + case PSCI_0_2_FN64_CPU_SUSPEND:
> + case PSCI_0_2_FN64_CPU_ON:
> + case PSCI_0_2_FN64_AFFINITY_INFO:
> + /* Disallow these functions for 32bit guests */
> + if (vcpu_mode_is_32bit(vcpu))
> + return PSCI_RET_NOT_SUPPORTED;
> + break;
> + }
> +
> + return 0;
> +}
> +
>  static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
>  {
>   struct kvm *kvm = vcpu->kvm;
> @@ -206,6 +221,10 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
>   unsigned long val;
>   int ret = 1;
>  
> + val = kvm_psci_check_allowed_function(vcpu, psci_fn);
> + if (val)
> + goto out;
> +
>   switch (psci_fn) {
>   case PSCI_0_2_FN_PSCI_VERSION:
>   /*
> @@ -273,6 +292,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
>   break;
>   }
>  
> +out:
>   smccc_set_retval(vcpu, val, 0, 0, 0);
>   return ret;
>  }
> @@ -290,6 +310,10 @@ static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
>   break;
>   case PSCI_1_0_FN_PSCI_FEATURES:
>   feature = smccc_get_arg1(vcpu);
> + val = kvm_psci_check_allowed_function(vcpu, feature);
> +     if (val)
> + break;
> +
>   switch(feature) {
>   case PSCI_0_2_FN_PSCI_VERSION:
>   case PSCI_0_2_FN_CPU_SUSPEND:
> -- 
> 2.25.0
> 

Reviewed-by: Christoffer Dall 
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC PATCH 0/5] Removing support for 32bit KVM/arm host

2020-02-19 Thread Christoffer Dall
On Mon, Feb 10, 2020 at 02:13:19PM +, Marc Zyngier wrote:
> KVM/arm was merged just over 7 years ago, and has lived a very quiet
> life so far. It mostly works if you're prepared to deal with its
> limitations, it has been a good prototype for the arm64 version,
> but it suffers a few problems:
> 
> - It is incomplete (no debug support, no PMU)
> - It hasn't followed any of the architectural evolutions
> - It has zero users (I don't count myself here)
> - It is more and more getting in the way of new arm64 developments
> 
> So here it is: unless someone screams and shows that they rely on
> KVM/arm to be maintained upstream, I'll remove 32bit host support
> from the tree. One of the reasons that makes me confident nobody is
> using it is that I never receive *any* bug report. Yes, it is perfect.
> But if you depend on KVM/arm being available in mainline, please shout.
> 
> To reiterate: 32bit guest support for arm64 stays, of course. Only
> 32bit host goes. Once this is merged, I plan to move virt/kvm/arm to
> arm64, and cleanup all the now unnecessary abstractions.
> 
> The patches have been generated with the -D option to avoid spamming
> everyone with huge diffs, and there is a kvm-arm/goodbye branch in
> my kernel.org repository.

Acked-by: Christoffer Dall 
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2] KVM: arm/arm64: Re-check VMA on detecting a poisoned page

2019-12-18 Thread Christoffer Dall
On Tue, Dec 17, 2019 at 12:38:09PM +, James Morse wrote:
> When we check for a poisoned page, we use the VMA to tell userspace
> about the looming disaster. But we pass a pointer to this VMA
> after having released the mmap_sem, which isn't a good idea.
> 
> Instead, stash the shift value that goes with this pfn while
> we are holding the mmap_sem.
> 
> Reported-by: Marc Zyngier 
> Signed-off-by: James Morse 
> ---
> 
> Based on Marc's patch:
> Link: lore.kernel.org/r/20191211165651.7889-3-...@kernel.org
> 
>  virt/kvm/arm/mmu.c | 20 +---
>  1 file changed, 9 insertions(+), 11 deletions(-)
> 
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 38b4c910b6c3..bb0f8d648678 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1591,16 +1591,8 @@ static void invalidate_icache_guest_page(kvm_pfn_t 
> pfn, unsigned long size)
>   __invalidate_icache_guest_page(pfn, size);
>  }
>  
> -static void kvm_send_hwpoison_signal(unsigned long address,
> -  struct vm_area_struct *vma)
> +static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
>  {
> - short lsb;
> -
> - if (is_vm_hugetlb_page(vma))
> - lsb = huge_page_shift(hstate_vma(vma));
> - else
> - lsb = PAGE_SHIFT;
> -
>   send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
>  }
>  
> @@ -1673,6 +1665,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
>   struct kvm *kvm = vcpu->kvm;
>   struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>   struct vm_area_struct *vma;
> + short vma_shift;
>   kvm_pfn_t pfn;
>   pgprot_t mem_type = PAGE_S2;
>   bool logging_active = memslot_is_logging(memslot);
> @@ -1696,7 +1689,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
>   return -EFAULT;
>   }
>  
> - vma_pagesize = vma_kernel_pagesize(vma);
> + if (is_vm_hugetlb_page(vma))
> + vma_shift = huge_page_shift(hstate_vma(vma));
> + else
> + vma_shift = PAGE_SHIFT;
> +
> + vma_pagesize = 1ULL << vma_shift;
>   if (logging_active ||
>   !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
>   force_pte = true;
> @@ -1735,7 +1733,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
>  
>   pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
>   if (pfn == KVM_PFN_ERR_HWPOISON) {
> - kvm_send_hwpoison_signal(hva, vma);
> + kvm_send_hwpoison_signal(hva, vma_shift);
>   return 0;
>   }
>   if (is_error_noslot_pfn(pfn))
> -- 
> 2.24.0
> 
> 
Reviewed-by: Christoffer Dall 
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 1/3] KVM: arm/arm64: Properly handle faulting of device mappings

2019-12-18 Thread Christoffer Dall
On Mon, Dec 16, 2019 at 10:31:19AM +, Marc Zyngier wrote:
> On 2019-12-13 11:14, Christoffer Dall wrote:
> > On Fri, Dec 13, 2019 at 09:28:59AM +, Marc Zyngier wrote:
> > > Hi Christoffer,
> > > 
> > > On 2019-12-13 08:29, Christoffer Dall wrote:
> > > > Hi Marc,
> > > >
> > > > On Wed, Dec 11, 2019 at 04:56:48PM +, Marc Zyngier wrote:
> > > > > A device mapping is normally always mapped at Stage-2, since
> > > there
> > > > > is very little gain in having it faulted in.
> > > >
> > > > It is actually becoming less clear to me what the real benefits of
> > > > pre-populating the stage 2 page table are, especially given that
> > > we can
> > > > provoke a situation where they're faulted in anyhow.  Do you
> > > recall if
> > > > we had any specific case that motivated us to pre-fault in the
> > > pages?
> > > 
> > > It's only a minor performance optimization that was introduced by
> > > Ard in
> > > 8eef91239e57d. Which makes sense for platform devices that have a
> > > single
> > > fixed location in memory. It makes slightly less sense for PCI,
> > > where
> > > you can move things around.
> > 
> > User space could still decide to move things around in its VA map even
> > if the device is fixed.
> > 
> > Anyway, I was thinking more if there was some sort of device, like a
> > framebuffer, which for example crosses page boundaries and where it would
> > be visible to the user that there's a sudden performance drop while
> > operating the device over page boundaries.  Anything like that?
> > 
> > > 
> > > > > Nonetheless, it is possible to end-up in a situation where the
> > > > > device
> > > > > mapping has been removed from Stage-2 (userspace munmaped the
> > > VFIO
> > > > > region, and the MMU notifier did its job), but present in a
> > > > > userspace
> > > > > mapping (userspace has mapped it back at the same address). In
> > > such
> > > > > a situation, the device mapping will be demand-paged as the
> > > guest
> > > > > performs memory accesses.
> > > > >
> > > > > This requires to be careful when dealing with mapping size,
> > > cache
> > > > > management, and to handle potential execution of a device
> > > mapping.
> > > > >
> > > > > Cc: sta...@vger.kernel.org
> > > > > Reported-by: Alexandru Elisei 
> > > > > Signed-off-by: Marc Zyngier 
> > > > > ---
> > > > >  virt/kvm/arm/mmu.c | 21 +
> > > > >  1 file changed, 17 insertions(+), 4 deletions(-)
> > > > >
> > > > > diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> > > > > index a48994af70b8..0b32a904a1bb 100644
> > > > > --- a/virt/kvm/arm/mmu.c
> > > > > +++ b/virt/kvm/arm/mmu.c
> > > > > @@ -38,6 +38,11 @@ static unsigned long io_map_base;
> > > > >  #define KVM_S2PTE_FLAG_IS_IOMAP  (1UL << 0)
> > > > >  #define KVM_S2_FLAG_LOGGING_ACTIVE   (1UL << 1)
> > > > >
> > > > > +static bool is_iomap(unsigned long flags)
> > > > > +{
> > > > > + return flags & KVM_S2PTE_FLAG_IS_IOMAP;
> > > > > +}
> > > > > +
> > > >
> > > > nit: I'm not really sure this indirection makes the code more
> > > readable,
> > > > but I guess that's a matter of taste.
> > > >
> > > > >  static bool memslot_is_logging(struct kvm_memory_slot *memslot)
> > > > >  {
> > > > >   return memslot->dirty_bitmap && !(memslot->flags &
> > > > > KVM_MEM_READONLY);
> > > > > @@ -1698,6 +1703,7 @@ static int user_mem_abort(struct kvm_vcpu
> > > > > *vcpu, phys_addr_t fault_ipa,
> > > > >
> > > > >   vma_pagesize = vma_kernel_pagesize(vma);
> > > > >   if (logging_active ||
> > > > > + (vma->vm_flags & VM_PFNMAP) ||
> > > >
> > > > What is actually the rationale for this?
> > > >
> > > > Why is a huge mapping not permitted to device memory?
> > > >
> > > > Are we guaranteed that VM_PFNMAP on the vma results in device
> > > mappings?
> > > &

Re: [PATCH] KVM: arm64: Only sign-extend MMIO up to register width

2019-12-13 Thread Christoffer Dall
On Fri, Dec 13, 2019 at 03:05:19PM +0100, Djordje Kovacevic wrote:
>Hi Christoffer,
> 
>I have run some test payload to get the exact behavior of all nine
>LDR[S][W|H|B] [Xt|Wt] instructions. Here it is:
> 
> # instruction      sas  sse  sf   Xt contents
> ====================================================================
>
> 1 LDR    Xt, ...     3    0    1   b[63:0]  = MEM[63:0]
> 2 LDR    Wt, ...     2    0    0   b[63:32] = '0..0'   b[31:0]  = MEM[31:0]
> 3 LDRH   Wt, ...     1    0    0   b[63:16] = '0..0'   b[15:0]  = MEM[15:0]
> 4 LDRB   Wt, ...     0    0    0   b[63:8]  = '0..0'   b[7:0]   = MEM[7:0]
> 5 LDRSW  Xt, ...     2    1    1   b[63:32] = MEM[31]  b[31:0]  = MEM[31:0]
> 6 LDRSH  Xt, ...     1    1    1   b[63:16] = MEM[15]  b[15:0]  = MEM[15:0]
> 7 LDRSH  Wt, ...     1    1    0   b[63:32] = '0..0'   b[31:16] = MEM[15]  b[15:0] = MEM[15:0]
> 8 LDRSB  Xt, ...     0    1    1   b[63:8]  = MEM[7]   b[7:0]   = MEM[7:0]
> 9 LDRSB  Wt, ...     0    1    0   b[63:32] = '0..0'   b[31:8]  = MEM[7]   b[7:0] = MEM[7:0]
> 
>Any surprises?

No, this looks as I expected it to.

Thanks for the test.

Christoffer
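For reference, a self-contained sketch (an assumed helper, not the kernel function) of how the sas/sse/sf fields in the table above translate into the value written back to the destination register; it mirrors the masking done in the patch quoted further down this thread.

#include <stdio.h>
#include <stdint.h>

/*
 * data: raw bytes read by the MMIO emulation, zero-extended
 * len:  access size in bytes (1 << sas)
 * sse:  sign-extend the loaded value
 * sf:   destination is an Xt (64-bit) register
 */
static uint64_t extend_mmio_data(uint64_t data, unsigned int len, int sse, int sf)
{
	if (sse) {
		uint64_t mask = 1ULL << (len * 8 - 1);
		data = (data ^ mask) - mask;	/* sign-extend to 64 bits */
	}
	if (!sf)
		data &= 0xffffffffULL;		/* Wt destination: bits [63:32] read as zero */
	return data;
}

int main(void)
{
	/* Row 7 above: LDRSH Wt with MEM[15:0] = 0x8000 -> 0x00000000ffff8000 */
	printf("%#llx\n", (unsigned long long)extend_mmio_data(0x8000, 2, 1, 0));
	return 0;
}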

>  ______
> 
>From: Christoffer Dall 
>Sent: 13 December 2019 10:56
>To: Marc Zyngier 
>Cc: kvmarm@lists.cs.columbia.edu ;
>linux-arm-ker...@lists.infradead.org
>; Djordje Kovacevic
>; James Morse ; Julien
>Thierry ; Suzuki Poulose
>
>Subject: Re: [PATCH] KVM: arm64: Only sign-extend MMIO up to register
>width
> 
>On Fri, Dec 13, 2019 at 10:12:19AM +, Marc Zyngier wrote:
>> On 2019-12-12 19:50, Christoffer Dall wrote:
>> > On AArch64 you can do a sign-extended load to either a 32-bit or
>64-bit
>> > register, and we should only sign extend the register up to the
>width of
>> > the register as specified in the operation (by using the 32-bit Wn
>or
>> > 64-bit Xn register specifier).
>>
>> Nice catch. It's only been there for... Oh wait! ;-)
>>
>> >
>> > As it turns out, the architecture provides this decoding
>information in
>> > the SF ("Sixty-Four" -- how cute...) bit.
>> >
>> > Let's take advantage of this with the usual 32-bit/64-bit header
>file
>> > dance and do the right thing on AArch64 hosts.
>> >
>> > Signed-off-by: Christoffer Dall 
>>
>> Cc: stable?
>>
>Yes, good idea.
>> > ---
>> >  arch/arm/include/asm/kvm_emulate.h   | 5 +
>> >  arch/arm/include/asm/kvm_mmio.h  | 2 ++
>> >  arch/arm64/include/asm/kvm_emulate.h | 5 +
>> >  arch/arm64/include/asm/kvm_mmio.h| 6 ++
>> >  virt/kvm/arm/mmio.c  | 8 +++-
>> >  5 files changed, 21 insertions(+), 5 deletions(-)
>> >
>> > diff --git a/arch/arm/include/asm/kvm_emulate.h
>> > b/arch/arm/include/asm/kvm_emulate.h
>> > index 9b118516d2db..fe55d8737a11 100644
>> > --- a/arch/arm/include/asm/kvm_emulate.h
>> > +++ b/arch/arm/include/asm/kvm_emulate.h
>> > @@ -182,6 +182,11 @@ static inline bool kvm_vcpu_dabt_issext(struct
>> > kvm_vcpu *vcpu)
>> >  return kvm_vcpu_get_hsr(vcpu) & HSR_SSE;
>> >  }
>> >
>> > +static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu)
>> > +{
>> > +   return false;
>> > +}
>> > +
>> >  static inline int kvm_vcpu_dabt_get_rd(struct kvm_vcpu *vcpu)
>> >  {
>> >  return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >>
>HSR_SRT_SHIFT;
>> > diff --git a/arch/arm/include/asm/kvm_mmio.h
>> > b/arch/arm/include/asm/kvm_mmio.h
>> > index 7c0eddb0adb2..32fbf82e3ebc 100644
>> > --- a/arch/arm/include/asm/kvm_mmio.h
>> > +++ b/arch/arm/include/asm/kvm_mmio.h
>> > @@ -14,6 +14,8 @@
>> >  struct kvm_decode {
>> >  unsigned long rt;
>> >  bool sign_extend;
>> > +   /* Not used on 32-bit arm */
>> > +   bool sixty_four;
>> >  };
>> >
>> >  void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long
>> > data);
>> > diff --git a/arch/arm64/include/asm/kvm_emulate.h
>> > b/arch/arm64/include/asm/kvm_emulate.h
>> > index 5e

Re: [PATCH 1/3] KVM: arm/arm64: Properly handle faulting of device mappings

2019-12-13 Thread Christoffer Dall
On Fri, Dec 13, 2019 at 09:28:59AM +, Marc Zyngier wrote:
> Hi Christoffer,
> 
> On 2019-12-13 08:29, Christoffer Dall wrote:
> > Hi Marc,
> > 
> > On Wed, Dec 11, 2019 at 04:56:48PM +, Marc Zyngier wrote:
> > > A device mapping is normally always mapped at Stage-2, since there
> > > is very little gain in having it faulted in.
> > 
> > It is actually becoming less clear to me what the real benefits of
> > pre-populating the stage 2 page table are, especially given that we can
> > provoke a situation where they're faulted in anyhow.  Do you recall if
> > we had any specific case that motivated us to pre-fault in the pages?
> 
> It's only a minor performance optimization that was introduced by Ard in
> 8eef91239e57d. Which makes sense for platform devices that have a single
> fixed location in memory. It makes slightly less sense for PCI, where
> you can move things around.

User space could still decide to move things around in its VA map even
if the device is fixed.

Anyway, I was thinking more if there was some sort of device, like a
frambuffer, which for example crosses page boundaries and where it would
be visible to the user that there's a sudden performance drop while
operating the device over page boundaries.  Anything like that?

> 
> > > Nonetheless, it is possible to end-up in a situation where the
> > > device
> > > mapping has been removed from Stage-2 (userspace munmaped the VFIO
> > > region, and the MMU notifier did its job), but present in a
> > > userspace
> > > mapping (userspace has mapped it back at the same address). In such
> > > a situation, the device mapping will be demand-paged as the guest
> > > performs memory accesses.
> > > 
> > > This requires to be careful when dealing with mapping size, cache
> > > management, and to handle potential execution of a device mapping.
> > > 
> > > Cc: sta...@vger.kernel.org
> > > Reported-by: Alexandru Elisei 
> > > Signed-off-by: Marc Zyngier 
> > > ---
> > >  virt/kvm/arm/mmu.c | 21 +
> > >  1 file changed, 17 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> > > index a48994af70b8..0b32a904a1bb 100644
> > > --- a/virt/kvm/arm/mmu.c
> > > +++ b/virt/kvm/arm/mmu.c
> > > @@ -38,6 +38,11 @@ static unsigned long io_map_base;
> > >  #define KVM_S2PTE_FLAG_IS_IOMAP  (1UL << 0)
> > >  #define KVM_S2_FLAG_LOGGING_ACTIVE   (1UL << 1)
> > > 
> > > +static bool is_iomap(unsigned long flags)
> > > +{
> > > + return flags & KVM_S2PTE_FLAG_IS_IOMAP;
> > > +}
> > > +
> > 
> > nit: I'm not really sure this indirection makes the code more readable,
> > but I guess that's a matter of taste.
> > 
> > >  static bool memslot_is_logging(struct kvm_memory_slot *memslot)
> > >  {
> > >   return memslot->dirty_bitmap && !(memslot->flags &
> > > KVM_MEM_READONLY);
> > > @@ -1698,6 +1703,7 @@ static int user_mem_abort(struct kvm_vcpu
> > > *vcpu, phys_addr_t fault_ipa,
> > > 
> > >   vma_pagesize = vma_kernel_pagesize(vma);
> > >   if (logging_active ||
> > > + (vma->vm_flags & VM_PFNMAP) ||
> > 
> > What is actually the rationale for this?
> > 
> > Why is a huge mapping not permitted to device memory?
> > 
> > Are we guaranteed that VM_PFNMAP on the vma results in device mappings?
> > I'm not convinced this is the case, and it would be better if we can
> > stick to a single primitive (either kvm_is_device_pfn, or VM_PFNMAP) to
> > detect device mappings.
> 
> For now, I've tried to keep the two paths that deal with mapping devices
> (or rather, things that we interpret as devices) as close as possible.
> If we drop the "eager" mapping, then we're at liberty to restructure
> this in creative ways.
> 
> This includes potential huge mappings, but I'm not sure the rest of the
> kernel uses them for devices anyway (I need to find out).
> 
> > As a subsequent patch, I'd like to make sure that at the very least our
> > memslot prepare function follows the exact same logic for mapping device
> > memory as a fault-in approach does, or that we simply always fault pages
> > in.
> 
> As far as I can see, the two approach are now identical. Am I missing
> something?
> And yes, getting rid of the eager mapping works for me.
> 

As far as I can tell, our user_mem_abort() uses gfn_to_pfn_prot() which
goes doesn a long trail whi

Re: [PATCH] KVM: arm64: Only sign-extend MMIO up to register width

2019-12-13 Thread Christoffer Dall
On Fri, Dec 13, 2019 at 10:12:19AM +, Marc Zyngier wrote:
> On 2019-12-12 19:50, Christoffer Dall wrote:
> > On AArch64 you can do a sign-extended load to either a 32-bit or 64-bit
> > register, and we should only sign extend the register up to the width of
> > the register as specified in the operation (by using the 32-bit Wn or
> > 64-bit Xn register specifier).
> 
> Nice catch. It's only been there for... Oh wait! ;-)
> 
> > 
> > As it turns out, the architecture provides this decoding information in
> > the SF ("Sixty-Four" -- how cute...) bit.
> > 
> > Let's take advantage of this with the usual 32-bit/64-bit header file
> > dance and do the right thing on AArch64 hosts.
> > 
> > Signed-off-by: Christoffer Dall 
> 
> Cc: stable?
> 

Yes, good idea.

> > ---
> >  arch/arm/include/asm/kvm_emulate.h   | 5 +
> >  arch/arm/include/asm/kvm_mmio.h  | 2 ++
> >  arch/arm64/include/asm/kvm_emulate.h | 5 +
> >  arch/arm64/include/asm/kvm_mmio.h| 6 ++
> >  virt/kvm/arm/mmio.c  | 8 +++-
> >  5 files changed, 21 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/arm/include/asm/kvm_emulate.h
> > b/arch/arm/include/asm/kvm_emulate.h
> > index 9b118516d2db..fe55d8737a11 100644
> > --- a/arch/arm/include/asm/kvm_emulate.h
> > +++ b/arch/arm/include/asm/kvm_emulate.h
> > @@ -182,6 +182,11 @@ static inline bool kvm_vcpu_dabt_issext(struct
> > kvm_vcpu *vcpu)
> > return kvm_vcpu_get_hsr(vcpu) & HSR_SSE;
> >  }
> > 
> > +static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu)
> > +{
> > +   return false;
> > +}
> > +
> >  static inline int kvm_vcpu_dabt_get_rd(struct kvm_vcpu *vcpu)
> >  {
> > return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >> HSR_SRT_SHIFT;
> > diff --git a/arch/arm/include/asm/kvm_mmio.h
> > b/arch/arm/include/asm/kvm_mmio.h
> > index 7c0eddb0adb2..32fbf82e3ebc 100644
> > --- a/arch/arm/include/asm/kvm_mmio.h
> > +++ b/arch/arm/include/asm/kvm_mmio.h
> > @@ -14,6 +14,8 @@
> >  struct kvm_decode {
> > unsigned long rt;
> > bool sign_extend;
> > +   /* Not used on 32-bit arm */
> > +   bool sixty_four;
> >  };
> > 
> >  void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long
> > data);
> > diff --git a/arch/arm64/include/asm/kvm_emulate.h
> > b/arch/arm64/include/asm/kvm_emulate.h
> > index 5efe5ca8fecf..f407b6bdad2e 100644
> > --- a/arch/arm64/include/asm/kvm_emulate.h
> > +++ b/arch/arm64/include/asm/kvm_emulate.h
> > @@ -283,6 +283,11 @@ static inline bool kvm_vcpu_dabt_issext(const
> > struct kvm_vcpu *vcpu)
> > return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SSE);
> >  }
> > 
> > +static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu)
> > +{
> > +   return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SF);
> > +}
> > +
> >  static inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu)
> >  {
> > return (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SRT_MASK) >>
> > ESR_ELx_SRT_SHIFT;
> > diff --git a/arch/arm64/include/asm/kvm_mmio.h
> > b/arch/arm64/include/asm/kvm_mmio.h
> > index 02b5c48fd467..b204501a0c39 100644
> > --- a/arch/arm64/include/asm/kvm_mmio.h
> > +++ b/arch/arm64/include/asm/kvm_mmio.h
> > @@ -10,13 +10,11 @@
> >  #include 
> >  #include 
> > 
> > -/*
> > - * This is annoying. The mmio code requires this, even if we don't
> > - * need any decoding. To be fixed.
> > - */
> >  struct kvm_decode {
> > unsigned long rt;
> > bool sign_extend;
> > +   /* Width of the register accessed by the faulting instruction is 64-bits */
> > +   bool sixty_four;
> >  };
> > 
> >  void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long
> > data);
> > diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
> > index 70d3b449692c..e62454b2e529 100644
> > --- a/virt/kvm/arm/mmio.c
> > +++ b/virt/kvm/arm/mmio.c
> > @@ -83,7 +83,7 @@ unsigned long kvm_mmio_read_buf(const void *buf,
> > unsigned int len)
> >  int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
> >  {
> > unsigned long data;
> > -   unsigned int len;
> > +   unsigned int len, regsize;
> 
> Unused variable?
> 

Ah, yes, whoops.  Guess which unstaged change I still have in my tree...

> > int mask;
> > 
> > /* Detect an already handled MMIO return */
> > @@ -105,6 +105,9 @@ int kvm_handl

Re: [PATCH 3/3] KVM: arm/arm64: Drop spurious message when faulting on a non-existent mapping

2019-12-13 Thread Christoffer Dall
On Wed, Dec 11, 2019 at 04:56:50PM +, Marc Zyngier wrote:
> Should userspace unmap memory whilst the guest is running, we exit
> with a -EFAULT, but also having spat a useless message on the console.
> 
> Get rid of it.

Acked-by: Christoffer Dall 

> 
> Signed-off-by: Marc Zyngier 
> ---
>  virt/kvm/arm/mmu.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index f73393f5ddb7..fbfdffb8fe8e 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1696,7 +1696,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
>   down_read(&current->mm->mmap_sem);
>   vma = find_vma_intersection(current->mm, hva, hva + 1);
>   if (unlikely(!vma)) {
> - kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
>   up_read(&current->mm->mmap_sem);
>   return -EFAULT;
>   }
> -- 
> 2.20.1
> 
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 2/3] KVM: arm/arm64: Re-check VMA on detecting a poisoned page

2019-12-13 Thread Christoffer Dall
Hi James,

On Thu, Dec 12, 2019 at 03:34:31PM +, James Morse wrote:
> Hi Marc,
> 
> On 12/12/2019 11:33, Marc Zyngier wrote:
> > On 2019-12-11 16:56, Marc Zyngier wrote:

[...]

> 
> (allocating from a kmemcache while holding current's mmap_sem. I don't want 
> to think about
> it!)
> 
> Can we be lazier? We want the VMA to get the size of the poisoned mapping 
> correct in the
> signal. The bug is that this could change when we drop the lock, before 
> queuing the
> signal, so we report hwpoison on old-vmas:pfn with new-vmas:size.
> 
> Can't it equally change when we drop the lock after queuing the signal? Any 
> time before
> the thread returns to user-space to take the signal gives us a stale value.
> 
> I think all that matters is the size goes with the pfn that was poisoned. If 
> we look the
> vma up by hva again, we have to check if the pfn has changed too... (which 
> you are doing)
> 
> Can we stash the size in the existing mmap_sem region, and use that in
> kvm_send_hwpoison_signal()? We know it matches the pfn we saw as poisoned.
> 
> The vma could be changed before/after we send the signal, but user-space 
> can't know which.
> This is user-spaces' problem for messing with the memslots while a vpcu is 
> running.
> 

(I should clearly have expanded this thread before I replied to the
original patch...)

> 
> How about (untested):
> -%<-
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 38b4c910b6c3..80212d4935bd 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1591,16 +1591,8 @@ static void invalidate_icache_guest_page(kvm_pfn_t 
> pfn, unsigned
> long size)
> __invalidate_icache_guest_page(pfn, size);
>  }
> 
> -static void kvm_send_hwpoison_signal(unsigned long address,
> -struct vm_area_struct *vma)
> +static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
>  {
> -   short lsb;
> -
> -   if (is_vm_hugetlb_page(vma))
> -   lsb = huge_page_shift(hstate_vma(vma));
> -   else
> -   lsb = PAGE_SHIFT;
> -
> send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
>  }
> 
> @@ -1673,6 +1665,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
> struct kvm *kvm = vcpu->kvm;
> struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
> struct vm_area_struct *vma;
> +   short stage1_vma_size;
> kvm_pfn_t pfn;
> pgprot_t mem_type = PAGE_S2;
> bool logging_active = memslot_is_logging(memslot);
> 
> @@ -1703,6 +1696,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
> vma_pagesize = PAGE_SIZE;
> }
> 
> +   /* For signals due to hwpoison, we need to use the stage1 size */
> +   if (is_vm_hugetlb_page(vma))
> +   stage1_vma_size = huge_page_shift(hstate_vma(vma));
> +   else
> +   stage1_vma_size = PAGE_SHIFT;
> +

But (see my patch) as far as I can tell, this is already what we have in
vma_pagesize, and do we really have to provide the stage 1 size to user
space if the fault happened within a smaller boundary?  Isn't that just
providing more precise information to the user?


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 2/3] KVM: arm/arm64: Re-check VMA on detecting a poisoned page

2019-12-13 Thread Christoffer Dall
On Wed, Dec 11, 2019 at 04:56:49PM +, Marc Zyngier wrote:
> When we check for a poisoned page, we use the VMA to tell userspace
> about the looming disaster. But we pass a pointer to this VMA
> after having released the mmap_sem, which isn't a good idea.
> 
> Instead, re-check that we still have a VMA, and that this
> VMA still points to a poisoned page. If the VMA isn't there,
> userspace is playing with our nerves, so let's give it a -EFAULT
> (it deserves it). If the PFN isn't poisoned anymore, let's restart
> from the top and handle the fault again.
> 
> Signed-off-by: Marc Zyngier 
> ---
>  virt/kvm/arm/mmu.c | 25 +++--
>  1 file changed, 23 insertions(+), 2 deletions(-)
> 
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 0b32a904a1bb..f73393f5ddb7 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1741,9 +1741,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
>  
>   pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
>   if (pfn == KVM_PFN_ERR_HWPOISON) {
> - kvm_send_hwpoison_signal(hva, vma);
> - return 0;
> + /*
> +  * Search for the VMA again, as it may have been
> +  * removed in the interval...
> +  */
> + down_read(&current->mm->mmap_sem);
> + vma = find_vma_intersection(current->mm, hva, hva + 1);
> + if (vma) {
> + /*
> +  * Recheck for a poisoned page. If something changed
> +  * behind our back, don't do a thing and take the
> +  * fault again.
> +  */
> + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
> + if (pfn == KVM_PFN_ERR_HWPOISON)
> + kvm_send_hwpoison_signal(hva, vma);
> +
> + ret = 0;
> + } else {
> + ret = -EFAULT;
> + }
> + up_read(&current->mm->mmap_sem);
> + return ret;
>   }
> +
>   if (is_error_noslot_pfn(pfn))
>   return -EFAULT;
>  
> -- 
> 2.20.1
> 

If I read this code correctly, then all we use the VMA for is to find
the page size used by the MMU to back the VMA, which we've already
established in the vma_pagesize (and possibly adjusted to something more
accurate based on our constraints in stage 2 which generated the error),
so all we need is the size and a way to convert that into a shift.

Not being 100% confident about the semantics of the lsb bit we pass to
user space (is it indicating the size of the mapping which caused the
error or the size of the mapping where user space could potentially
trigger an error?), or whether we care enough at that level, could we
consider something like the following instead?

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 38b4c910b6c3..2509d9dec42d 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1592,15 +1592,9 @@ static void invalidate_icache_guest_page(kvm_pfn_t pfn, 
unsigned long size)
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address,
-struct vm_area_struct *vma)
+unsigned long vma_pagesize)
 {
-   short lsb;
-
-   if (is_vm_hugetlb_page(vma))
-   lsb = huge_page_shift(hstate_vma(vma));
-   else
-   lsb = PAGE_SHIFT;
-
+   short lsb = __ffs(vma_pagesize);
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
 }
 
@@ -1735,7 +1729,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
 
	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
if (pfn == KVM_PFN_ERR_HWPOISON) {
-   kvm_send_hwpoison_signal(hva, vma);
+   kvm_send_hwpoison_signal(hva, vma_pagesize);
return 0;
}
if (is_error_noslot_pfn(pfn))


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 1/3] KVM: arm/arm64: Properly handle faulting of device mappings

2019-12-13 Thread Christoffer Dall
Hi Marc,

On Wed, Dec 11, 2019 at 04:56:48PM +, Marc Zyngier wrote:
> A device mapping is normally always mapped at Stage-2, since there
> is very little gain in having it faulted in.

It is actually becoming less clear to me what the real benefits of
pre-populating the stage 2 page table are, especially given that we can
provoke a situation where they're faulted in anyhow.  Do you recall if
we had any specific case that motivated us to pre-fault in the pages?

> 
> Nonetheless, it is possible to end-up in a situation where the device
> mapping has been removed from Stage-2 (userspace munmaped the VFIO
> region, and the MMU notifier did its job), but present in a userspace
> mapping (userspace has mapped it back at the same address). In such
> a situation, the device mapping will be demand-paged as the guest
> performs memory accesses.
> 
> This requires to be careful when dealing with mapping size, cache
> management, and to handle potential execution of a device mapping.
> 
> Cc: sta...@vger.kernel.org
> Reported-by: Alexandru Elisei 
> Signed-off-by: Marc Zyngier 
> ---
>  virt/kvm/arm/mmu.c | 21 +
>  1 file changed, 17 insertions(+), 4 deletions(-)
> 
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index a48994af70b8..0b32a904a1bb 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -38,6 +38,11 @@ static unsigned long io_map_base;
>  #define KVM_S2PTE_FLAG_IS_IOMAP  (1UL << 0)
>  #define KVM_S2_FLAG_LOGGING_ACTIVE   (1UL << 1)
>  
> +static bool is_iomap(unsigned long flags)
> +{
> + return flags & KVM_S2PTE_FLAG_IS_IOMAP;
> +}
> +

nit: I'm not really sure this indirection makes the code more readable,
but I guess that's a matter of taste.

>  static bool memslot_is_logging(struct kvm_memory_slot *memslot)
>  {
>   return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
> @@ -1698,6 +1703,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
>  
>   vma_pagesize = vma_kernel_pagesize(vma);
>   if (logging_active ||
> + (vma->vm_flags & VM_PFNMAP) ||

What is actually the rationale for this?

Why is a huge mapping not permitted to device memory?

Are we guaranteed that VM_PFNMAP on the vma results in device mappings?
I'm not convinced this is the case, and it would be better if we can
stick to a single primitive (either kvm_is_device_pfn, or VM_PFNMAP) to
detect device mappings.

As a subsequent patch, I'd like to make sure that at the very least our
memslot prepare function follows the exact same logic for mapping device
memory as a fault-in approach does, or that we simply always fault pages
in.

>   !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
>   force_pte = true;
>   vma_pagesize = PAGE_SIZE;
> @@ -1760,6 +1766,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
>   writable = false;
>   }
>  
> + if (exec_fault && is_iomap(flags))
> + return -ENOEXEC;
> +

nit: why don't you just do this when checking kvm_is_device_pfn() and
avoid having logic in two places to deal with this case?

>   spin_lock(&kvm->mmu_lock);
>   if (mmu_notifier_retry(kvm, mmu_seq))
>   goto out_unlock;
> @@ -1781,7 +1790,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
>   if (writable)
>   kvm_set_pfn_dirty(pfn);
>  
> - if (fault_status != FSC_PERM)
> + if (fault_status != FSC_PERM && !is_iomap(flags))
>   clean_dcache_guest_page(pfn, vma_pagesize);
>  
>   if (exec_fault)
> @@ -1948,9 +1957,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, 
> struct kvm_run *run)
>   if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
>   if (is_iabt) {
>   /* Prefetch Abort on I/O address */
> - kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
> - ret = 1;
> - goto out_unlock;
> + ret = -ENOEXEC;
> + goto out;
>   }
>  
>   /*
> @@ -1992,6 +2000,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, 
> struct kvm_run *run)
>   ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
>   if (ret == 0)
>   ret = 1;
> +out:
> + if (ret == -ENOEXEC) {
> + kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
> + ret = 1;
> + }
>  out_unlock:
>   srcu_read_unlock(&vcpu->kvm->srcu, idx);
>   return ret;
> -- 
> 2.20.1
> 

I can't seem to decide for myself if I think there's a semantic
difference between trying to execute from somewhere the VMM has
explicitly told us is device memory and from somewhere which we happen
to have mapped with VM_PFNMAP from user space.  But I also can't seem to
really fault it (pun intended).  Thoughts?


Thanks,

Christoffer

[PATCH] KVM: arm64: Only sign-extend MMIO up to register width

2019-12-12 Thread Christoffer Dall
On AArch64 you can do a sign-extended load to either a 32-bit or 64-bit
register, and we should only sign extend the register up to the width of
the register as specified in the operation (by using the 32-bit Wn or
64-bit Xn register specifier).

As it turns out, the architecture provides this decoding information in
the SF ("Sixty-Four" -- how cute...) bit.

Let's take advantage of this with the usual 32-bit/64-bit header file
dance and do the right thing on AArch64 hosts.

Signed-off-by: Christoffer Dall 
---
 arch/arm/include/asm/kvm_emulate.h   | 5 +
 arch/arm/include/asm/kvm_mmio.h  | 2 ++
 arch/arm64/include/asm/kvm_emulate.h | 5 +
 arch/arm64/include/asm/kvm_mmio.h| 6 ++
 virt/kvm/arm/mmio.c  | 8 +++-
 5 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/arch/arm/include/asm/kvm_emulate.h 
b/arch/arm/include/asm/kvm_emulate.h
index 9b118516d2db..fe55d8737a11 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -182,6 +182,11 @@ static inline bool kvm_vcpu_dabt_issext(struct kvm_vcpu 
*vcpu)
return kvm_vcpu_get_hsr(vcpu) & HSR_SSE;
 }
 
+static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu)
+{
+   return false;
+}
+
 static inline int kvm_vcpu_dabt_get_rd(struct kvm_vcpu *vcpu)
 {
return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >> HSR_SRT_SHIFT;
diff --git a/arch/arm/include/asm/kvm_mmio.h b/arch/arm/include/asm/kvm_mmio.h
index 7c0eddb0adb2..32fbf82e3ebc 100644
--- a/arch/arm/include/asm/kvm_mmio.h
+++ b/arch/arm/include/asm/kvm_mmio.h
@@ -14,6 +14,8 @@
 struct kvm_decode {
unsigned long rt;
bool sign_extend;
+   /* Not used on 32-bit arm */
+   bool sixty_four;
 };
 
 void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
diff --git a/arch/arm64/include/asm/kvm_emulate.h 
b/arch/arm64/include/asm/kvm_emulate.h
index 5efe5ca8fecf..f407b6bdad2e 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -283,6 +283,11 @@ static inline bool kvm_vcpu_dabt_issext(const struct 
kvm_vcpu *vcpu)
return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SSE);
 }
 
+static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu)
+{
+   return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SF);
+}
+
 static inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu)
 {
return (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT;
diff --git a/arch/arm64/include/asm/kvm_mmio.h 
b/arch/arm64/include/asm/kvm_mmio.h
index 02b5c48fd467..b204501a0c39 100644
--- a/arch/arm64/include/asm/kvm_mmio.h
+++ b/arch/arm64/include/asm/kvm_mmio.h
@@ -10,13 +10,11 @@
 #include 
 #include 
 
-/*
- * This is annoying. The mmio code requires this, even if we don't
- * need any decoding. To be fixed.
- */
 struct kvm_decode {
unsigned long rt;
bool sign_extend;
+   /* Width of the register accessed by the faulting instruction is 64-bits */
+   bool sixty_four;
 };
 
 void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
index 70d3b449692c..e62454b2e529 100644
--- a/virt/kvm/arm/mmio.c
+++ b/virt/kvm/arm/mmio.c
@@ -83,7 +83,7 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int 
len)
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
unsigned long data;
-   unsigned int len;
+   unsigned int len, regsize;
int mask;
 
/* Detect an already handled MMIO return */
@@ -105,6 +105,9 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
data = (data ^ mask) - mask;
}
 
+   if (!vcpu->arch.mmio_decode.sixty_four)
+   data = data & 0xffffffff;
+
trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
		       &data);
data = vcpu_data_host_to_guest(vcpu, data, len);
@@ -125,6 +128,7 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool 
*is_write, int *len)
unsigned long rt;
int access_size;
bool sign_extend;
+   bool sixty_four;
 
if (kvm_vcpu_dabt_iss1tw(vcpu)) {
/* page table accesses IO mem: tell guest to fix its TTBR */
@@ -138,11 +142,13 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool 
*is_write, int *len)
 
*is_write = kvm_vcpu_dabt_iswrite(vcpu);
sign_extend = kvm_vcpu_dabt_issext(vcpu);
+   sixty_four = kvm_vcpu_dabt_issf(vcpu);
rt = kvm_vcpu_dabt_get_rd(vcpu);
 
*len = access_size;
vcpu->arch.mmio_decode.sign_extend = sign_extend;
vcpu->arch.mmio_decode.rt = rt;
+   vcpu->arch.mmio_decode.sixty_four = sixty_four;
 
return 0;
 }
-- 
2.18.0

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH] KVM: arm: remove excessive permission check in kvm_arch_prepare_memory_region

2019-12-06 Thread Christoffer Dall
On Fri, Dec 06, 2019 at 10:08:02AM +0800, Jia He wrote:
> In kvm_arch_prepare_memory_region, arm kvm regards the memory region as
> writable if the flag has no KVM_MEM_READONLY, and the vm is readonly if
> !VM_WRITE.
> 
> But there is common usage for setting kvm memory region as follows:
> e.g. qemu side (see the PROT_NONE flag)
> 1. mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>memory_region_init_ram_ptr()
> 2. re mmap the above area with read/write authority.
> 
> Such example is used in virtio-fs qemu codes which hasn't been upstreamed
> [1]. But seems we can't forbid this example.
> 
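A hedged sketch of the userspace pattern being described (it mirrors the QEMU flow referenced below; error handling trimmed): the range is first reserved with PROT_NONE and only later re-mapped read/write, so the VMA flags at memslot-registration time say nothing about how the memory will eventually be used.

#include <sys/mman.h>
#include <stddef.h>

void *reserve_then_enable(size_t size)
{
	/* Step 1: reserve the address range with no access rights */
	void *p = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED)
		return NULL;

	/* ... the range may be registered as a memslot at this point ... */

	/* Step 2: re-map the same range with read/write access */
	return mmap(p, size, PROT_READ | PROT_WRITE,
		    MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
}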
> Without this patch, it will cause an EPERM during kvm_set_memory_region()
> and cause qemu boot crash.
> 
> As told by Ard, "the underlying assumption is incorrect, i.e., that the
> value of vm_flags at this point in time defines how the VMA is used
> during its lifetime. There may be other cases where a VMA is created
> with VM_READ vm_flags that are changed to VM_READ|VM_WRITE later, and
> we are currently rejecting this use case as well."
> 
> [1] 
> https://gitlab.com/virtio-fs/qemu/blob/5a356e/hw/virtio/vhost-user-fs.c#L488

Reviewed-by: Christoffer Dall 

> 
> Cc: Ard Biesheuvel 
> Suggested-by: Ard Biesheuvel 
> Signed-off-by: Jia He 
> ---
>  virt/kvm/arm/mmu.c | 9 -
>  1 file changed, 9 deletions(-)
> 
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 38b4c910b6c3..a48994af70b8 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -2301,15 +2301,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>   if (!vma || vma->vm_start >= reg_end)
>   break;
>  
> - /*
> -  * Mapping a read-only VMA is only allowed if the
> -  * memory region is configured as read-only.
> -  */
> - if (writable && !(vma->vm_flags & VM_WRITE)) {
> - ret = -EPERM;
> - break;
> - }
> -
>   /*
>* Take the intersection of this VMA with the memory region
>*/
> -- 
> 2.17.1
> 
> 
> ___
> linux-arm-kernel mailing list
> linux-arm-ker...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2] arm64: Fix Kconfig indentation

2019-11-22 Thread Christoffer Dall
On Thu, Nov 21, 2019 at 04:20:20AM +0100, Krzysztof Kozlowski wrote:
> Adjust indentation from spaces to tab (+optional two spaces) as in
> coding style with command like:
>   $ sed -e 's/^        /\t/' -i */Kconfig
> 
> Signed-off-by: Krzysztof Kozlowski 
> 

FWIW, the config file should read fine in editors and I think this serves
little other purpose than making it harder to use git blame.

Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: Memory regions and VMAs across architectures

2019-11-21 Thread Christoffer Dall
On Wed, Nov 20, 2019 at 07:28:07AM -0800, Sean Christopherson wrote:
> On Wed, Nov 20, 2019 at 12:52:16PM +0100, Christoffer Dall wrote:
> > On Tue, Nov 19, 2019 at 07:44:48PM -0800, Sean Christopherson wrote:
> > > On Fri, Nov 08, 2019 at 12:19:20PM +0100, Christoffer Dall wrote:
> > > > First, what prevents user space from messing around with the VMAs after
> > > > kvm_arch_prepare_memory_region() completes?  If nothing, then what is
> > > > the value of the checks we perform wrt. to VMAs?
> > > 
> > > Arm's prepare_memory_region() holds mmap_sem and mmu_lock while processing
> > > the VMAs and populating the stage 2 page tables.  Holding mmap_sem 
> > > prevents
> > > the VMAs from being invalidated while the stage 2 tables are populated,
> > > e.g. prevents racing with the mmu notifier.  The VMAs could be modified
> > > after prepare_memory_region(), but the mmu notifier will ensure they are
> > > unmapped from stage2 prior the the host change taking effect.  So I think
> > > you're safe (famous last words).
> > > 
> > 
> > So we for example check:
> > 
> > writeable = !(memslot->flags & KVM_MEM_READONLY);
> > if (writeable && !(vma->vm_flags & VM_WRITE))
> > return -EPERM;
> > 
> > And yes, user space can then unmap the VMAs and MMU notifiers will
> > unmap the stage 2 entries, but user space can then create a new
> > read-only VMA covering the area of the memslot and the fault-handling
> > path will have to deal with this same check later.  Only, the fault
> > handling path, via gfn_to_pfn_prot(), returns an address based on an
> > entirely different set of mechanics, than our prepare_memory_region,
> > which I think indicates we are doing something wrong somewhere, and we
> > should have a common path for faulting things in, for I/O, both if we do
> > this up-front or if we do this at fault time.
> 
> Unconditionally interpreting vm_pgoff as a physical address does not seem
> correct.  There are cases where that might be correct, e.g. if the backing
> (virtual) file is a flat representation of the address space, which appears
> to be the case on some architectures, e.g. for PCI handling.  But even then
> there should be some confirmation that the VMA is actually associated with
> such a file, otherwise KVM is at the mercy of userspace to do the right
> thing (unless there are other guarantees on arm I am unaware of).
> 
> > > > Second, why would arm/arm64 need special handling for I/O mappings
> > > > compared to other architectures, and how is this dealt with for
> > > > x86/s390/power/... ?
> > > 
> > > As Ard mentioned, it looks like an optimization.  The "passthrough"
> > > part from the changelog implies that VM_PFNMAP memory regions are 
> > > exclusive
> > > to the guest.  Mapping the entire thing would be a nice boot optimization
> > > as it would save taking page faults on every page of the MMIO region.
> > > 
> > > As for how this is different from other archs... at least on x86, 
> > > VM_PFNMAP
> > > isn't guaranteed to be passthrough or even MMIO, e.g. prefaulting the
> > > pages may actually trigger allocation, and remapping the addresses could 
> > > be
> > > flat out wrong.
> > 
> > What does VM_PFNMAP mean on x86?  I didn't think we were relying on
> > anything architecture specific in their meaning in the arm code, and I
> > thought the VM_PFNMAP was a generic mm flag with generic mm meaning,
> > but I could be wrong here?
> 
> No, you're correct, VM_PFNMAP is a generic flag that state the VMA doesn't
> have an associated struct page and is being managed directly by something
> other than the core mmu.
> 
> But not having a struct page doesn't guarantee that the PFN is backed by
> MMIO, or that it is exclusive to the guest (although in practice this is
> probably the case 99.% of the time).  E.g. x86 supports having guest
> memory backed by regular ram that is hidden from the host kernel via
> 'mem=', which will show up as VM_PFNMAP.
> 
> > Is there any valid semantics for creating a memslot backed by a
> > VM_PFNMAP on x86, and if so, what are those?
> > 
> > Similarly, if you do map a device region straight to the guest on x86,
> > how is that handled?  (A pointer to the right place in the myriad of EPT
> > and shadow code in x86 would be much appreciated.)
> 
> There is no special handling in x86 for VM_PFNMAP memory, we rely on KVM's
> generic __gfn_to_pfn_memslot() to retrieve the PFN on demand, and use
> mmu_notifier_seq to ensure the stale PFNs (invalidated in the host) aren't
> inserted into the guest page tables.  Effectively the same thing arm does,
> sans the prepare_memory_region() shenanigans.

Thanks Sean, I'll have a look at reworking said shenanigans ;)

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: Memory regions and VMAs across architectures

2019-11-20 Thread Christoffer Dall
On Tue, Nov 19, 2019 at 07:44:48PM -0800, Sean Christopherson wrote:
> On Fri, Nov 08, 2019 at 12:19:20PM +0100, Christoffer Dall wrote:
> > Hi,
> > 
> > I had a look at our relatively complicated logic in
> > kvm_arch_prepare_memory_region(), and was wondering if there was room to
> > unify some of this handling between architectures.
> > 
> > (If you haven't seen our implementation, you can find it in
> > virt/kvm/arm/mmu.c, and it has lovely ASCII art!)
> > 
> > I then had a look at the x86 code, but that doesn't actually do anything
> > when creating memory regions, which makes me wonder why the architectures
> > differ in this aspect.
> > 
> > The reason we added the logic that we have for arm/arm64 is that we
> > don't really want to take faults for I/O accesses.  I'm not actually
> > sure if this is a correctness thing, or an optimization effort, and the
> > original commit message doesn't really explain.  Ard, you wrote that
> > code, do you recall the details?
> > 
> > In any case, what we do is to check for each VMA backing a memslot, we
> > check if the memslot flags and vma flags are a reasonable match, and we
> > try to detect I/O mappings by looking for the VM_PFNMAP flag on the VMA
> > and pre-populate stage 2 page tables (our equivalent of EPT/NPT/...).
> > However, there are some things which are not clear to me:
> > 
> > First, what prevents user space from messing around with the VMAs after
> > kvm_arch_prepare_memory_region() completes?  If nothing, then what is
> the value of the checks we perform wrt. to VMAs?
> 
> Arm's prepare_memory_region() holds mmap_sem and mmu_lock while processing
> the VMAs and populating the stage 2 page tables.  Holding mmap_sem prevents
> the VMAs from being invalidated while the stage 2 tables are populated,
> e.g. prevents racing with the mmu notifier.  The VMAs could be modified
> after prepare_memory_region(), but the mmu notifier will ensure they are
> unmapped from stage2 prior the the host change taking effect.  So I think
> you're safe (famous last words).
> 

So we for example check:

writeable = !(memslot->flags & KVM_MEM_READONLY);
if (writeable && !(vma->vm_flags & VM_WRITE))
return -EPERM;

And yes, user space can then unmap the VMAs and MMU notifiers will
unmap the stage 2 entries, but user space can then create a new
read-only VMA covering the area of the memslot and the fault-handling
path will have to deal with this same check later.  Only, the fault
handling path, via gfn_to_pfn_prot(), returns an address based on an
entirely different set of mechanics, than our prepare_memory_region,
which I think indicates we are doing something wrong somewhere, and we
should have a common path for faulting things in, for I/O, both if we do
this up-front or if we do this at fault time.


> > Second, why would arm/arm64 need special handling for I/O mappings
> > compared to other architectures, and how is this dealt with for
> > x86/s390/power/... ?
> 
> As Ard mentioned, it looks like an optimization.  The "passthrough"
> part from the changelog implies that VM_PFNMAP memory regions are exclusive
> to the guest.  Mapping the entire thing would be a nice boot optimization
> as it would save taking page faults on every page of the MMIO region.
> 
> As for how this is different from other archs... at least on x86, VM_PFNMAP
> isn't guaranteed to be passthrough or even MMIO, e.g. prefaulting the
> pages may actually trigger allocation, and remapping the addresses could be
> flat out wrong.

What does VM_PFNMAP mean on x86?  I didn't think we were relying on
anything architecture specific in their meaning in the arm code, and I
thought the VM_PFNMAP was a generic mm flag with generic mm meaning,
but I could be wrong here?

Is there any valid semantics for creating a memslot backed by a
VM_PFNMAP on x86, and if so, what are those?

Similarly, if you do map a device region straight to the guest on x86,
how is that handled?  (A pointer to the right place in the myriad of EPT
and shadow code in x86 would be much appreciated.)


Thanks!

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Memory regions and VMAs across architectures

2019-11-08 Thread Christoffer Dall
Hi,

I had a look at our relatively complicated logic in
kvm_arch_prepare_memory_region(), and was wondering if there was room to
unify some of this handling between architectures.

(If you haven't seen our implementation, you can find it in
virt/kvm/arm/mmu.c, and it has lovely ASCII art!)

I then had a look at the x86 code, but that doesn't actually do anything
when creating memory regions, which makes me wonder why the architectures
differ in this aspect.

The reason we added the logic that we have for arm/arm64 is that we
don't really want to take faults for I/O accesses.  I'm not actually
sure if this is a correctness thing, or an optimization effort, and the
original commit message doesn't really explain.  Ard, you wrote that
code, do you recall the details?

In any case, what we do is to check for each VMA backing a memslot, we
check if the memslot flags and vma flags are a reasonable match, and we
try to detect I/O mappings by looking for the VM_PFNMAP flag on the VMA
and pre-populate stage 2 page tables (our equivalent of EPT/NPT/...).
However, there are some things which are not clear to me:

First, what prevents user space from messing around with the VMAs after
kvm_arch_prepare_memory_region() completes?  If nothing, then what is
the value of the checks we perform wrt. to VMAs?

Second, why would arm/arm64 need special handling for I/O mappings
compared to other architectures, and how is this dealt with for
x86/s390/power/... ?


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 0/2] KVM: arm64: Reduce occurrence of GICv4 doorbells on non-oversubscribed systems

2019-11-07 Thread Christoffer Dall
On Thu, Nov 07, 2019 at 04:04:10PM +, Marc Zyngier wrote:
> As I was cleaning up some of the GICv4 code to make way for GICv4.1 it
> occured to me that we could drastically reduce the impact of the GICv4
> doorbells on systems that are not oversubscribed (each vcpu "owns" a
> physical CPU).
> 
> The technique borrows its logic from the way we disable WFE trapping
> when a vcpu is the only process on the CPU run-queue. If this vcpu is
> the target of VLPIs, it is then beneficial not to trap blocking WFIs
> and to leave the vcpu waiting for interrupts in guest state.
> 
> All we need to do here is to track whether VLPIs are associated to a
> vcpu (which is easily done by using a counter that we update on MAPI,
> DISCARD and MOVI).
> 
> It has been *very lightly* tested on a D05, and behaved pretty well in
> my limited test cases (I get almost no doorbell at all in the non
> oversubscribed case, and the usual hailstorm as soon as there is
> oversubscription). I'd welcome some testing on more current HW.
> 
Reviewed-by: Christoffer Dall 
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2] KVM: arm64: Don't set HCR_EL2.TVM when S2FWB is supported

2019-11-06 Thread Christoffer Dall
Hi Alexandru,

On Wed, Nov 06, 2019 at 01:02:21PM +, Alexandru Elisei wrote:
> 
> On 10/28/19 1:05 PM, Christoffer Dall wrote:
> > On CPUs that support S2FWB (Armv8.4+), KVM configures the stage 2 page
> > tables to override the memory attributes of memory accesses, regardless
> > of the stage 1 page table configurations, and also when the stage 1 MMU
> > is turned off.  This results in all memory accesses to RAM being
> > cacheable, including during early boot of the guest.
> >
> > On CPUs without this feature, memory accesses were non-cacheable during
> > boot until the guest turned on the stage 1 MMU, and we had to detect
> > when the guest turned on the MMU, such that we could invalidate all cache
> > entries and ensure a consistent view of memory with the MMU turned on.
> > When the guest turned on the caches, we would call stage2_flush_vm()
> > from kvm_toggle_cache().
> >
> > However, stage2_flush_vm() walks all the stage 2 tables, and calls
> > __kvm_flush_dcache_pte, which on a system with S2FWB does ... absolutely
> > nothing.
> >
> > We can avoid that whole song and dance, and simply not set TVM when
> > creating a VM on a system that has S2FWB.
> >
> > Signed-off-by: Christoffer Dall 
> > Reviewed-by: Mark Rutland 
> > ---
> > I was only able to test this on the model with cache modeling enabled,
> > but even removing TVM from HCR_EL2 without having FWB also worked with
> > that setup, so the testing of this has been light.  It seems like it
> > should obviously work, but it would be good if someone with access to
> > appropriate hardware could give this a spin.
> >
> >  arch/arm64/include/asm/kvm_arm.h |  3 +--
> >  arch/arm64/include/asm/kvm_emulate.h | 12 +++-
> >  2 files changed, 12 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/kvm_arm.h 
> > b/arch/arm64/include/asm/kvm_arm.h
> > index ddf9d762ac62..6e5d839f42b5 100644
> > --- a/arch/arm64/include/asm/kvm_arm.h
> > +++ b/arch/arm64/include/asm/kvm_arm.h
> > @@ -61,7 +61,6 @@
> >   * RW: 64bit by default, can be overridden for 32bit VMs
> >   * TAC:Trap ACTLR
> >   * TSC:Trap SMC
> > - * TVM:Trap VM ops (until M+C set in SCTLR_EL1)
> >   * TSW:Trap cache operations by set/way
> >   * TWE:Trap WFE
> >   * TWI:Trap WFI
> > @@ -74,7 +73,7 @@
> >   * SWIO:   Turn set/way invalidates into set/way clean+invalidate
> >   */
> >  #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
> > -HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
> > +HCR_BSU_IS | HCR_FB | HCR_TAC | \
> >  HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
> >  HCR_FMO | HCR_IMO)
> >  #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
> > diff --git a/arch/arm64/include/asm/kvm_emulate.h 
> > b/arch/arm64/include/asm/kvm_emulate.h
> > index d69c1efc63e7..70509799a2a9 100644
> > --- a/arch/arm64/include/asm/kvm_emulate.h
> > +++ b/arch/arm64/include/asm/kvm_emulate.h
> > @@ -53,8 +53,18 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
> > /* trap error record accesses */
> > vcpu->arch.hcr_el2 |= HCR_TERR;
> > }
> > -   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> > +
> > +   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
> > vcpu->arch.hcr_el2 |= HCR_FWB;
> > +   } else {
> > +   /*
> > +* For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C
> > +* get set in SCTLR_EL1 such that we can detect when the guest
> > +* MMU gets turned off and do the necessary cache maintenance
> > +* then.
> > +*/
> > +   vcpu->arch.hcr_el2 |= HCR_TVM;
> > +   }
> >  
> > if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features))
> > vcpu->arch.hcr_el2 &= ~HCR_RW;
> 
> This patch makes sense to me: when FWB is available, the guest memory is 
> cacheable
> even when the stage 1 MMU is disabled, which means it's now impossible to 
> have a
> situation where the data in memory is newer than the data in the cache.
> 
> I tested the patch with the fix suggested by Marc by doing a linux boot and 
> then a
> 'ls -R /', and by running kvm-unit-tests in a loop a couple dozen times. For 
> what
> it's worth:
> 
> Tested-by: Alexandru Elisei 
> 
> I do need to point out t

[PATCH v4 5/5] KVM: mips: Move to common kvm_mmu_memcache infrastructure

2019-11-05 Thread Christoffer Dall
Now that we have a common infrastructure for doing MMU cache
allocations, use this for mips as well.

This will change the GFP flags used for mips from plain GFP_KERNEL to
GFP_PGTABLE_USER.  This means that mips KVM page table allocations now
gain __GFP_ACCOUNT and __GFP_ZERO.  There should be no harm in the
former, and while the latter might result in slight overhead for zeroing
the page, it seems this is what a hypervisor should do on page table
allocations.

Signed-off-by: Christoffer Dall 
---
 arch/mips/include/asm/kvm_host.h  | 15 ++---
 arch/mips/include/asm/kvm_types.h |  4 +++
 arch/mips/kvm/mips.c  |  2 +-
 arch/mips/kvm/mmu.c   | 54 ++-
 4 files changed, 17 insertions(+), 58 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 41204a49cf95..418c941f1382 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -293,17 +293,6 @@ struct kvm_mips_tlb {
long tlb_lo[2];
 };
 
-#define KVM_NR_MEM_OBJS 4
-
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_mmu_memory_cache {
-   int nobjs;
-   void *objects[KVM_NR_MEM_OBJS];
-};
-
 #define KVM_MIPS_AUX_FPU   0x1
 #define KVM_MIPS_AUX_MSA   0x2
 
@@ -378,7 +367,7 @@ struct kvm_vcpu_arch {
unsigned int last_user_gasid;
 
/* Cache some mmu pages needed inside spinlock regions */
-   struct kvm_mmu_memory_cache mmu_page_cache;
+   struct kvm_mmu_memcache mmu_page_cache;
 
 #ifdef CONFIG_KVM_MIPS_VZ
/* vcpu's vzguestid is different on each host cpu in an smp system */
@@ -915,7 +904,7 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush 
flags);
 bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
 int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
 pgd_t *kvm_pgd_alloc(void);
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_memcaches(struct kvm_vcpu *vcpu);
 void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr,
  bool user);
 void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu);
diff --git a/arch/mips/include/asm/kvm_types.h 
b/arch/mips/include/asm/kvm_types.h
index 5efeb32a5926..c7b906568a0e 100644
--- a/arch/mips/include/asm/kvm_types.h
+++ b/arch/mips/include/asm/kvm_types.h
@@ -2,4 +2,8 @@
 #ifndef _ASM_MIPS_KVM_TYPES_H
 #define _ASM_MIPS_KVM_TYPES_H
 
+#define KVM_ARCH_WANT_MMU_MEMCACHE
+
+#define KVM_MMU_NR_MEMCACHE_OBJS 4
+
 #endif /* _ASM_MIPS_KVM_TYPES_H */
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 1109924560d8..8bf12ed539b5 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -415,7 +415,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 
kvm_mips_dump_stats(vcpu);
 
-   kvm_mmu_free_memory_caches(vcpu);
+   kvm_mmu_free_memcaches(vcpu);
kfree(vcpu->arch.guest_ebase);
kfree(vcpu->arch.kseg0_commpage);
kfree(vcpu);
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 97e538a8c1be..aed5284d642e 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -25,41 +25,9 @@
 #define KVM_MMU_CACHE_MIN_PAGES 2
 #endif
 
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- int min, int max)
+void kvm_mmu_free_memcaches(struct kvm_vcpu *vcpu)
 {
-   void *page;
-
-   BUG_ON(max > KVM_NR_MEM_OBJS);
-   if (cache->nobjs >= min)
-   return 0;
-   while (cache->nobjs < max) {
-   page = (void *)__get_free_page(GFP_KERNEL);
-   if (!page)
-   return -ENOMEM;
-   cache->objects[cache->nobjs++] = page;
-   }
-   return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
-   while (mc->nobjs)
-   free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
-{
-   void *p;
-
-   BUG_ON(!mc || !mc->nobjs);
-   p = mc->objects[--mc->nobjs];
-   return p;
-}
-
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
-   mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+   kvm_mmu_free_memcache_page(&vcpu->arch.mmu_page_cache);
 }
 
 /**
@@ -133,7 +101,7 @@ pgd_t *kvm_pgd_alloc(void)
  * NULL if a page table doesn't exist for @addr and !@cache.
  * NULL if a page table allocation failed.
  */
-static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache,
+static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memcache *cache,
unsigned long addr)
 {
pud_t *pud;
@@ -151,7 +119,7 @@ static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct 
kvm_mm

[PATCH v4 3/5] KVM: x86: Rename mmu_memory_cache to kvm_mmu_memcache

2019-11-05 Thread Christoffer Dall
As we have moved the mmu memory cache definitions and functions to
common code, they are exported as symbols to the rest of the kernel.

Let's rename the functions and data types to have a kvm_ prefix to make
it clear where these functions belong and take this chance to rename
memory_cache to memcache to avoid overly long lines.

This is a bit tedious on the callsites but ends up looking more
palatable.

Signed-off-by: Christoffer Dall 
---
 arch/x86/include/asm/kvm_host.h  |  6 ++---
 arch/x86/include/asm/kvm_types.h |  4 ++--
 arch/x86/kvm/mmu.c   | 38 
 arch/x86/kvm/paging_tmpl.h   |  4 ++--
 include/linux/kvm_host.h | 14 ++--
 include/linux/kvm_types.h|  6 ++---
 virt/kvm/kvm_main.c  | 14 ++--
 7 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e5080b618f3c..47e183ca0fb2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -586,9 +586,9 @@ struct kvm_vcpu_arch {
 */
struct kvm_mmu *walk_mmu;
 
-   struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
-   struct kvm_mmu_memory_cache mmu_page_cache;
-   struct kvm_mmu_memory_cache mmu_page_header_cache;
+   struct kvm_mmu_memcache mmu_pte_list_desc_cache;
+   struct kvm_mmu_memcache mmu_page_cache;
+   struct kvm_mmu_memcache mmu_page_header_cache;
 
/*
 * QEMU userspace and the guest each have their own FPU state.
diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h
index 40428651dc7a..d391490ab8d1 100644
--- a/arch/x86/include/asm/kvm_types.h
+++ b/arch/x86/include/asm/kvm_types.h
@@ -2,8 +2,8 @@
 #ifndef _ASM_X86_KVM_TYPES_H
 #define _ASM_X86_KVM_TYPES_H
 
-#define KVM_ARCH_WANT_MMU_MEMORY_CACHE
+#define KVM_ARCH_WANT_MMU_MEMCACHE
 
-#define KVM_NR_MEM_OBJS 40
+#define KVM_MMU_NR_MEMCACHE_OBJS 40
 
 #endif /* _ASM_X86_KVM_TYPES_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index abcdb47b0ac7..431ac346a1e8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1017,35 +1017,35 @@ static void walk_shadow_page_lockless_end(struct 
kvm_vcpu *vcpu)
local_irq_enable();
 }
 
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+static int kvm_mmu_topup_memcaches(struct kvm_vcpu *vcpu)
 {
int r;
 
-   r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+   r = kvm_mmu_topup_memcache(&vcpu->arch.mmu_pte_list_desc_cache,
   pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
if (r)
goto out;
-   r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+   r = kvm_mmu_topup_memcache_page(&vcpu->arch.mmu_page_cache, 8);
if (r)
goto out;
-   r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+   r = kvm_mmu_topup_memcache(&vcpu->arch.mmu_page_header_cache,
   mmu_page_header_cache, 4);
 out:
return r;
 }
 
-static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+static void kvm_mmu_free_memcaches(struct kvm_vcpu *vcpu)
 {
-   mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+   kvm_mmu_free_memcache(&vcpu->arch.mmu_pte_list_desc_cache,
				pte_list_desc_cache);
-   mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
-   mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+   kvm_mmu_free_memcache_page(&vcpu->arch.mmu_page_cache);
+   kvm_mmu_free_memcache(&vcpu->arch.mmu_page_header_cache,
mmu_page_header_cache);
 }
 
 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 {
-   return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
+   return kvm_mmu_memcache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
 }
 
 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -1371,10 +1371,10 @@ static struct kvm_rmap_head *gfn_to_rmap(struct kvm 
*kvm, gfn_t gfn,
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
 {
-   struct kvm_mmu_memory_cache *cache;
+   struct kvm_mmu_memcache *cache;
 
cache = &vcpu->arch.mmu_pte_list_desc_cache;
-   return mmu_memory_cache_free_objects(cache);
+   return kvm_mmu_memcache_free_objects(cache);
 }
 
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -2062,10 +2062,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct 
kvm_vcpu *vcpu, int direct
 {
struct kvm_mmu_page *sp;
 
-   sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
-   sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+   sp = kvm_mmu_memcache_alloc(&vcpu->arch.mmu_page_header_cache);
+   sp->spt = kvm_mmu_memcache_alloc(&vcpu->arch.mmu_page_cache);
if (!direct)
-   sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+

[PATCH v4 1/5] KVM: x86: Move memcache allocation to GFP_PGTABLE_USER

2019-11-05 Thread Christoffer Dall
Recent commit 50f11a8a4620eee6b6831e69ab5d42456546d7d8 moved page table
allocations for both KVM and normal user page table allocations to
GFP_PGTABLE_USER in order to get __GFP_ACCOUNT for the page tables.

However, while KVM on other architectures such as arm64 were included in
this change, curiously KVM on x86 was not.

Currently, KVM on x86 uses kmem_cache_zalloc(GFP_KERNEL_ACCOUNT) for
kmem_cache-based allocations, which expands in the following way:
  kmem_cache_zalloc(..., GFP_KERNEL_ACCOUNT) =>
  kmem_cache_alloc(..., GFP_KERNEL_ACCOUNT | __GFP_ZERO) =>
  kmem_cache_alloc(..., GFP_KERNEL | __GFP_ACCOUNT | __GFP_ZERO)

It so happens that GFP_PGTABLE_USER expands as:
  GFP_PGTABLE_USER =>
  (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) =>
  ((GFP_KERNEL | __GFP_ZERO) | __GFP_ACCOUNT) =>
  (GFP_KERNEL | __GFP_ACCOUNT | __GFP_ZERO)

Which means that we can replace the current KVM on x86 call as:
-  obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
+  obj = kmem_cache_alloc(base_cache, GFP_PGTABLE_USER);

For the single page cache topup allocation, KVM on x86 currently uses
__get_free_page(GFP_KERNEL_ACCOUNT).  It seems to me that is equivalent
to the above, except that the allocated page is not guaranteed to be
zero (unless I missed the place where __get_free_page(!__GFP_ZERO) is
still guaranteed to be zeroed).  It seems natural (and in fact desired)
to have both topup functions implement the same expectations towards the
caller, and we therefore move to GFP_PGTABLE_USER here as well.

This will make it easier to unify the memcache implementation between
architectures.

Signed-off-by: Christoffer Dall 
---
 arch/x86/kvm/mmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 24c23c66b226..540190cee3cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -40,6 +40,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1025,7 +1026,7 @@ static int mmu_topup_memory_cache(struct 
kvm_mmu_memory_cache *cache,
if (cache->nobjs >= min)
return 0;
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-   obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
+   obj = kmem_cache_alloc(base_cache, GFP_PGTABLE_USER);
if (!obj)
return cache->nobjs >= min ? 0 : -ENOMEM;
cache->objects[cache->nobjs++] = obj;
@@ -1053,7 +1054,7 @@ static int mmu_topup_memory_cache_page(struct 
kvm_mmu_memory_cache *cache,
if (cache->nobjs >= min)
return 0;
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-   page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+   page = (void *)__get_free_page(GFP_PGTABLE_USER);
if (!page)
return cache->nobjs >= min ? 0 : -ENOMEM;
cache->objects[cache->nobjs++] = page;
-- 
2.18.0

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v4 2/5] KVM: x86: Move mmu_memory_cache functions to common code

2019-11-05 Thread Christoffer Dall
We are currently duplicating the mmu memory cache functionality quite
heavily between the architectures that support KVM.  As a first step,
move the x86 implementation (which seems to have the most recently
maintained version of the mmu memory cache) to common code.

We introduce an arch-specific kvm_types.h which can be used to specify
how many objects are required in the memory cache, an aspect which
diverges across architectures.  Since kvm_host.h defines structures with
fields of the memcache object, we define the memcache structure in
kvm_types.h, and we include the architecture-specific kvm_types.h to
know the size of object in kvm_host.h.

We only define the functions and data types if
KVM_ARCH_WANT_MMU_MEMORY_CACHE is defined, because not all architectures
require the mmu memory cache.

Signed-off-by: Christoffer Dall 
---
 arch/arm/include/asm/kvm_types.h |  5 +++
 arch/arm64/include/asm/kvm_types.h   |  6 +++
 arch/mips/include/asm/kvm_types.h|  5 +++
 arch/powerpc/include/asm/kvm_types.h |  5 +++
 arch/s390/include/asm/kvm_types.h|  5 +++
 arch/x86/include/asm/kvm_host.h  | 11 -
 arch/x86/include/asm/kvm_types.h |  9 
 arch/x86/kvm/mmu.c   | 60 ---
 include/linux/kvm_host.h | 11 +
 include/linux/kvm_types.h| 13 ++
 virt/kvm/kvm_main.c  | 61 
 11 files changed, 120 insertions(+), 71 deletions(-)
 create mode 100644 arch/arm/include/asm/kvm_types.h
 create mode 100644 arch/arm64/include/asm/kvm_types.h
 create mode 100644 arch/mips/include/asm/kvm_types.h
 create mode 100644 arch/powerpc/include/asm/kvm_types.h
 create mode 100644 arch/s390/include/asm/kvm_types.h
 create mode 100644 arch/x86/include/asm/kvm_types.h

diff --git a/arch/arm/include/asm/kvm_types.h b/arch/arm/include/asm/kvm_types.h
new file mode 100644
index ..bc389f82e88d
--- /dev/null
+++ b/arch/arm/include/asm/kvm_types.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ARM_KVM_TYPES_H
+#define _ASM_ARM_KVM_TYPES_H
+
+#endif /* _ASM_ARM_KVM_TYPES_H */
diff --git a/arch/arm64/include/asm/kvm_types.h 
b/arch/arm64/include/asm/kvm_types.h
new file mode 100644
index ..d0987007d581
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_types.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ARM64_KVM_TYPES_H
+#define _ASM_ARM64_KVM_TYPES_H
+
+#endif /* _ASM_ARM64_KVM_TYPES_H */
+
diff --git a/arch/mips/include/asm/kvm_types.h 
b/arch/mips/include/asm/kvm_types.h
new file mode 100644
index ..5efeb32a5926
--- /dev/null
+++ b/arch/mips/include/asm/kvm_types.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_MIPS_KVM_TYPES_H
+#define _ASM_MIPS_KVM_TYPES_H
+
+#endif /* _ASM_MIPS_KVM_TYPES_H */
diff --git a/arch/powerpc/include/asm/kvm_types.h 
b/arch/powerpc/include/asm/kvm_types.h
new file mode 100644
index ..f627eceaa314
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_types.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_KVM_TYPES_H
+#define _ASM_POWERPC_KVM_TYPES_H
+
+#endif /* _ASM_POWERPC_KVM_TYPES_H */
diff --git a/arch/s390/include/asm/kvm_types.h 
b/arch/s390/include/asm/kvm_types.h
new file mode 100644
index ..b66a81f8a354
--- /dev/null
+++ b/arch/s390/include/asm/kvm_types.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_KVM_TYPES_H
+#define _ASM_S390_KVM_TYPES_H
+
+#endif /* _ASM_S390_KVM_TYPES_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 50eb430b0ad8..e5080b618f3c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -179,8 +179,6 @@ enum {
 
 #include 
 
-#define KVM_NR_MEM_OBJS 40
-
 #define KVM_NR_DB_REGS 4
 
 #define DR6_BD (1 << 13)
@@ -231,15 +229,6 @@ enum {
 
 struct kvm_kernel_irq_routing_entry;
 
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_mmu_memory_cache {
-   int nobjs;
-   void *objects[KVM_NR_MEM_OBJS];
-};
-
 /*
  * the pages used as guest page table on soft mmu are tracked by
  * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h
new file mode 100644
index ..40428651dc7a
--- /dev/null
+++ b/arch/x86/include/asm/kvm_types.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_TYPES_H
+#define _ASM_X86_KVM_TYPES_H
+
+#define KVM_ARCH_WANT_MMU_MEMORY_CACHE
+
+#define KVM_NR_MEM_OBJS 40
+
+#endif /* _ASM_X86_KVM_TYPES_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 540190cee3cb..abcdb47b0ac7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -40,7 +40,6 @@
 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -1018,56 +

[PATCH v4 0/5] KVM: Unify mmu_memory_cache functionality across architectures

2019-11-05 Thread Christoffer Dall
We currently have duplicated functionality for the mmu_memory_cache used
to pre-allocate memory for the page table manipulation code which cannot
allocate memory while holding spinlocks.  This functionality is
duplicated across x86, arm/arm64, and mips.
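
For readers unfamiliar with these caches, the usage pattern they exist for
looks roughly like the sketch below (illustration only; the names loosely
follow the arm/arm64 flavour and are not taken verbatim from any one
architecture): top the cache up with sleeping allocations before taking the
MMU lock, then consume objects from it while the lock is held.

static int sketch_handle_fault(struct kvm_vcpu *vcpu)
{
    struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_page_cache;
    void *new_table;
    int ret;

    /* May sleep, so this must happen before the spinlock is taken. */
    ret = mmu_topup_memory_cache(cache, KVM_MMU_CACHE_MIN_PAGES,
                                 KVM_NR_MEM_OBJS);
    if (ret)
        return ret;

    spin_lock(&vcpu->kvm->mmu_lock);
    /* Cannot sleep here; this just pops a preallocated object. */
    new_table = mmu_memory_cache_alloc(cache);
    /* ... wire new_table into the stage 2 / shadow page tables ... */
    spin_unlock(&vcpu->kvm->mmu_lock);

    return 0;
}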

This was motivated by a debate of modifying the arm code to be more in
line with the x86 code and some discussions around changing the page
flags used for allocation.  This series should make it easier to take a
uniform approach across architectures.

While there's not a huge amount of code sharing, we come out with a net
gain, and the real win is in the consistency of how we allocate memory
for page tables used by secondary MMUs driven by KVM in Linux.

Only tested on arm/arm64, and only compile-tested on x86 and mips.  I'm
especially curious on getting feedback on the change of GFP flags for
x86 (patch 1) and on the use of __GFP_ACCOUNT for mips.

Changes since v3:
 - Moved to common GFP_PGTABLE_USER definition for page allocations in
   the MMU cache for all three architectures.  This follows recent work
   which already did this for arm/arm64.
 - Rebased on v5.4-rc4.

Changes since v2:
 - Simplified kalloc flag definitions as per Paolo's review comment.

Changes since v1:
 - Split out rename from initial x86 patch to have separate patches to
   move the logic to common code and to rename.
 - Introduce KVM_ARCH_WANT_MMU_MEMCACHE to avoid compile breakage on
   architectures that don't use this functionality.
 - Rename KVM_NR_MEM_OBJS to KVM_MMU_NR_MEMCACHE_OBJS

Christoffer Dall (5):
  KVM: x86: Move memcache allocation to GFP_PGTABLE_USER
  KVM: x86: Move mmu_memory_cache functions to common code
  KVM: x86: Rename mmu_memory_cache to kvm_mmu_memcache
  KVM: arm/arm64: Move to common kvm_mmu_memcache infrastructure
  KVM: mips: Move to common kvm_mmu_memcache infrastructure

 arch/arm/include/asm/kvm_host.h  | 13 +---
 arch/arm/include/asm/kvm_mmu.h   |  2 +-
 arch/arm/include/asm/kvm_types.h |  9 +++
 arch/arm64/include/asm/kvm_host.h| 13 +---
 arch/arm64/include/asm/kvm_mmu.h |  2 +-
 arch/arm64/include/asm/kvm_types.h   |  9 +++
 arch/mips/include/asm/kvm_host.h | 15 +
 arch/mips/include/asm/kvm_types.h|  9 +++
 arch/mips/kvm/mips.c |  2 +-
 arch/mips/kvm/mmu.c  | 54 +++-
 arch/powerpc/include/asm/kvm_types.h |  5 ++
 arch/s390/include/asm/kvm_types.h|  5 ++
 arch/x86/include/asm/kvm_host.h  | 17 +
 arch/x86/include/asm/kvm_types.h |  9 +++
 arch/x86/kvm/mmu.c   | 97 ++--
 arch/x86/kvm/paging_tmpl.h   |  4 +-
 include/linux/kvm_host.h | 11 
 include/linux/kvm_types.h| 13 
 virt/kvm/arm/arm.c   |  2 +-
 virt/kvm/arm/mmu.c   | 68 +--
 virt/kvm/kvm_main.c  | 61 +
 21 files changed, 190 insertions(+), 230 deletions(-)
 create mode 100644 arch/arm/include/asm/kvm_types.h
 create mode 100644 arch/arm64/include/asm/kvm_types.h
 create mode 100644 arch/mips/include/asm/kvm_types.h
 create mode 100644 arch/powerpc/include/asm/kvm_types.h
 create mode 100644 arch/s390/include/asm/kvm_types.h
 create mode 100644 arch/x86/include/asm/kvm_types.h

-- 
2.18.0

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v4 4/5] KVM: arm/arm64: Move to common kvm_mmu_memcache infrastructure

2019-11-05 Thread Christoffer Dall
Now that we have a common mmu memcache implementation, we can reuse
this for arm and arm64.

The common implementation has a slightly different behavior when
allocating objects under high memory pressure; whereas the current
arm/arm64 implementation will give up and return -ENOMEM if the full
size of the cache cannot be allocated during topup, the common
implementation is happy with any allocation between min and max.  There
should be no architecture-specific requirement for doing it one way or
the other and it's in fact better to enforce a cross-architecture KVM
policy on this behavior.
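
To make the behavioural difference concrete, here is a stripped-down,
illustration-only sketch of the common topup logic (modelled loosely on the
x86-derived code earlier in the series); note the early success once 'min'
objects are available, where the old arm/arm64 code would instead have
returned -ENOMEM on the first failed allocation:

static int sketch_topup(struct kvm_mmu_memcache *mc, int min, int max)
{
    while (mc->nobjs < max) {
        void *page = (void *)__get_free_page(GFP_PGTABLE_USER);

        if (!page)
            return mc->nobjs >= min ? 0 : -ENOMEM;
        mc->objects[mc->nobjs++] = page;
    }
    return 0;
}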

Signed-off-by: Christoffer Dall 
---
 arch/arm/include/asm/kvm_host.h| 13 +-
 arch/arm/include/asm/kvm_mmu.h |  2 +-
 arch/arm/include/asm/kvm_types.h   |  4 ++
 arch/arm64/include/asm/kvm_host.h  | 13 +-
 arch/arm64/include/asm/kvm_mmu.h   |  2 +-
 arch/arm64/include/asm/kvm_types.h |  5 ++-
 virt/kvm/arm/arm.c |  2 +-
 virt/kvm/arm/mmu.c | 68 --
 8 files changed, 30 insertions(+), 79 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 8a37c8e89777..04e7c5868132 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -78,17 +78,6 @@ struct kvm_arch {
u32 psci_version;
 };
 
-#define KVM_NR_MEM_OBJS 40
-
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_mmu_memory_cache {
-   int nobjs;
-   void *objects[KVM_NR_MEM_OBJS];
-};
-
 struct kvm_vcpu_fault_info {
u32 hsr;/* Hyp Syndrome Register */
u32 hxfar;  /* Hyp Data/Inst. Fault Address Register */
@@ -196,7 +185,7 @@ struct kvm_vcpu_arch {
struct kvm_decode mmio_decode;
 
/* Cache some mmu pages needed inside spinlock regions */
-   struct kvm_mmu_memory_cache mmu_page_cache;
+   struct kvm_mmu_memcache mmu_page_cache;
 
struct vcpu_reset_state reset_state;
 
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 0d84d50bf9ba..b1ff76aac0cd 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -59,7 +59,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t 
guest_ipa,
 
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_memcaches(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
diff --git a/arch/arm/include/asm/kvm_types.h b/arch/arm/include/asm/kvm_types.h
index bc389f82e88d..de5be31a5a77 100644
--- a/arch/arm/include/asm/kvm_types.h
+++ b/arch/arm/include/asm/kvm_types.h
@@ -2,4 +2,8 @@
 #ifndef _ASM_ARM_KVM_TYPES_H
 #define _ASM_ARM_KVM_TYPES_H
 
+#define KVM_ARCH_WANT_MMU_MEMCACHE
+
+#define KVM_MMU_NR_MEMCACHE_OBJS 40
+
 #endif /* _ASM_ARM_KVM_TYPES_H */
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index f656169db8c3..00b8d1f65e44 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -85,17 +85,6 @@ struct kvm_arch {
u32 psci_version;
 };
 
-#define KVM_NR_MEM_OBJS 40
-
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_mmu_memory_cache {
-   int nobjs;
-   void *objects[KVM_NR_MEM_OBJS];
-};
-
 struct kvm_vcpu_fault_info {
u32 esr_el2;/* Hyp Syndrom Register */
u64 far_el2;/* Hyp Fault Address Register */
@@ -320,7 +309,7 @@ struct kvm_vcpu_arch {
struct kvm_decode mmio_decode;
 
/* Cache some mmu pages needed inside spinlock regions */
-   struct kvm_mmu_memory_cache mmu_page_cache;
+   struct kvm_mmu_memcache mmu_page_cache;
 
/* Target CPU and feature flags */
int target;
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index befe37d4bc0e..e23e91f368ae 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -160,7 +160,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t 
guest_ipa,
 
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_memcaches(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
diff --git a/arch/arm64/include/asm/kvm_types.h 
b/arch/arm64/include/asm/kvm_types.h
index d0987007d581..89b15f62e466 100644
--- a/arch/arm64/include/asm/kvm_types.h
+++ b/arch/arm64/include/asm/kvm_types.h
@@ -2,5 +2,8 @@
 #ifndef _ASM_ARM64_KVM_TYPES_H
 #define _ASM_ARM64_KVM_TYPES_H
 
-#endif /* _ASM_ARM64_KVM_TYPES_H */
+#define KVM_ARCH_WANT_MMU_MEMCACHE
+
+#define KVM_MMU_NR_MEMCACHE_OBJS 40
 
+#endif

[PATCH] KVM: arm64: Don't set HCR_EL2.TVM when S2FWB is supported

2019-10-25 Thread Christoffer Dall
On CPUs that support S2FWB (Armv8.4+), KVM configures the stage 2 page
tables to override the memory attributes of memory accesses, regardless
of the stage 1 page table configurations, and also when the stage 1 MMU
is turned off.  This results in all memory accesses to RAM being
cacheable, including during early boot of the guest.

On CPUs without this feature, memory accesses were non-cacheable during
boot until the guest turned on the stage 1 MMU, and we had to detect
when the guest turned on the MMU, such that we could invalidate all cache
entries and ensure a consistent view of memory with the MMU turned on.
When the guest turned on the caches, we would call stage2_flush_vm()
from kvm_toggle_cache().
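
For context, that trap path looks roughly like the sketch below
(illustration only; the real handler lives in arch/arm64/kvm/sys_regs.c and
the names are only approximate): with HCR_EL2.TVM set, the guest's write to
SCTLR_EL1 traps to KVM, which can then notice the MMU/caches being toggled
and perform the flush.

static bool sketch_access_sctlr(struct kvm_vcpu *vcpu, u64 val)
{
    bool was_enabled = vcpu_has_cache_enabled(vcpu);

    vcpu_write_sys_reg(vcpu, val, SCTLR_EL1);
    /* Calls stage2_flush_vm() when the cache enable state toggles. */
    kvm_toggle_cache(vcpu, was_enabled);
    return true;
}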

However, stage2_flush_vm() walks all the stage 2 tables, and calls
__kvm_flush_dcache_pte, which on a system with S2FWB does ... absolutely
nothing.

We can avoid that whole song and dance, and simply not set TVM when
creating a VM on a system that has S2FWB.

Signed-off-by: Christoffer Dall 
Cc: Mark Rutland 
Cc: Alexandru Elisei 
---
I was only able to test this on the model with cache modeling enabled,
but even removing TVM from HCR_EL2 without having FWB also worked with
that setup, so the testing of this has been light.  It seems like it
should obviously work, but it would be good if someone with access to
appropriate hardware could give this a spin.

 arch/arm64/include/asm/kvm_emulate.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_emulate.h 
b/arch/arm64/include/asm/kvm_emulate.h
index d69c1efc63e7..41820c3e70b8 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -53,8 +53,10 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
/* trap error record accesses */
vcpu->arch.hcr_el2 |= HCR_TERR;
}
-   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
+   vcpu->arch.hcr_el2 &= ~HCR_TVM;
vcpu->arch.hcr_el2 |= HCR_FWB;
+   }
 
if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features))
vcpu->arch.hcr_el2 &= ~HCR_RW;
-- 
2.18.0

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v3 00/15] KVM: Dynamically size memslot arrays

2019-10-25 Thread Christoffer Dall
On Thu, Oct 24, 2019 at 04:07:29PM -0700, Sean Christopherson wrote:
> The end goal of this series is to dynamically size the memslot array so
> that KVM allocates memory based on the number of memslots in use, as
> opposed to unconditionally allocating memory for the maximum number of
> memslots.  On x86, each memslot consumes 88 bytes, and so with 2 address
> spaces of 512 memslots, each VM consumes ~90k bytes for the memslots.
> E.g. given a VM that uses a total of 30 memslots, dynamic sizing reduces
> the memory footprint from 90k to ~2.6k bytes.
> 
> The changes required to support dynamic sizing are relatively small,
> e.g. are essentially contained in patches 14/15 and 15/15.  Patches 1-13
> clean up the memslot code, which has gotten quite crusty, especially
> __kvm_set_memory_region().  The clean up is likely not strictly necessary
> to switch to dynamic sizing, but I didn't have a remotely reasonable
> level of confidence in the correctness of the dynamic sizing without first
> doing the clean up.
> 
> Christoffer, I added your Tested-by to the patches that I was confident
> would be fully tested based on the desription of what you tested.  Let me
> know if you disagree with any of 'em.
> 
The only testing I've done of patch 9 would be via the vm_free part of
kvm selftest, so not sure how valid that is, but sure.

Looks fine otherwise.


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH RFC 0/7] Support KVM being compiled as a kernel module on arm64

2019-10-25 Thread Christoffer Dall
On Fri, Oct 25, 2019 at 10:48:46AM +0800, Shannon Zhao wrote:
> 
> 
> On 2019/10/24 21:41, Marc Zyngier wrote:
> > On 2019-10-24 11:58, James Morse wrote:
> > > Hi Shannon,
> > > 
> > > On 24/10/2019 11:27, Shannon Zhao wrote:
> > > > Currently KVM ARM64 doesn't support being compiled as a kernel module. It's
> > > > useful to compile KVM as a module.
> > > 
> > > > For example, it could reload kvm without rebooting host machine.
> > > 
> > > What problem does this solve?
> > > 
> > > KVM has some funny requirements that aren't normal for a module. On
> > > v8.0 hardware it must
> > > have an idmap. Modules don't usually expect their code to be
> > > physically contiguous, but
> > > KVM does. KVM influences they way some of the irqchip stuff is set up
> > > during early boot
> > > (EOI mode ... not that I understand it).
> > 
> > We change the EOImode solely based on how we were booted (EL2 or not).
> > KVM doesn't directly influences that (it comes in the picture much
> > later).
> > 
> > > (I think KVM-as-a-module on x86 is an artifact of how it was developed)
> > > 
> > > 
> > > > This patchset support this feature while there are some limitations
> > > > to be solved. But I just send it out as RFC to get more suggestion and
> > > > comments.
> > > 
> > > > Currently it only supports VHE systems due to the hyp code section
> > > > address variables like __hyp_text_start.
> > > 
> > > We still need to support !VHE systems, and we need to do it with a
> > > single image.
> > > 
> > > 
> > > > Also it can't call
> > > > kvm_update_va_mask when loading kvm module and kernel panic with below
> > > > errors. So I make kern_hyp_va into a nop function.
> > > 
> > > Making this work for the single-Image on v8.0 is going to be a
> > > tremendous amount of work.
> > > What is the payoff?
> > 
> > I can only agree. !VHE is something we're going to support for the
> > foreseeable
> > future (which is roughly equivalent to "forever"), and modules have
> > properties
> > that are fundamentally incompatible with the way KVM works with !VHE.
> > 
> Yes, with this patchset we still support !VHE system with built-in KVM.
> While for VHE system we could support kernel module and check at module init
> to avoid wrong usage of kvm module on !VHE systems.
> 
> > If the only purpose of this work is to be able to swap KVM implementations
> > in a development environment, then it really isn't worth the effort.
> > 
> Making KVM as a kernel module has many advantages both for development and
> real use environment. For example, we can backport and update KVM codes
> independently and don't need to recompile kernel. Also making KVM as a
> kernel module is a basic for kvm hot upgrade feature without shutdown VMs
> and hosts. This is very important for Cloud Service Provider to provides
> non-stop services for its customers.
> 
But KVM on arm64 is pretty intertwined with the rest of the kernel, and
things like the arch timers, for example, really depend on the exact
semantics of how the rest of the kernel changes.  I fear that you'd end
up back-porting patches that depend on changes to irqchip and timers in
the core code, and you'll get even more oddly-defined behavior in the
wild.

How would you manage that, and how would the end result be a more stable
environment than what you have today?

Also, I'm curious if you expect to find more bugs in the hypervisor
itself than in the rest of the kernel, because it's only in the former
case you can avoid a reboot of the host, and all things considered this
would appear to only help in a small fraction of the cases where you
have to patch things?


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2 00/15] KVM: Dynamically size memslot arrays

2019-10-23 Thread Christoffer Dall
On Mon, Oct 21, 2019 at 05:35:22PM -0700, Sean Christopherson wrote:
> The end goal of this series is to dynamically size the memslot array so
> that KVM allocates memory based on the number of memslots in use, as
> opposed to unconditionally allocating memory for the maximum number of
> memslots.  On x86, each memslot consumes 88 bytes, and so with 2 address
> spaces of 512 memslots, each VM consumes ~90k bytes for the memslots.
> E.g. given a VM that uses a total of 30 memslots, dynamic sizing reduces
> the memory footprint from 90k to ~2.6k bytes.
> 
> The changes required to support dynamic sizing are relatively small,
> e.g. are essentially contained in patches 12/13 and 13/13.  Patches 1-11
> clean up the memslot code, which has gotten quite crusty, especially
> __kvm_set_memory_region().  The clean up is likely not strictly necessary
> to switch to dynamic sizing, but I didn't have a remotely reasonable
> level of confidence in the correctness of the dynamic sizing without first
> doing the clean up.
> 
> Testing, especially non-x86 platforms, would be greatly appreciated.  The
> non-x86 changes are for all intents and purposes untested, e.g. I compile
> tested pieces of the code by copying them into x86, but that's it.  In
> theory, the vast majority of the functional changes are arch agnostic, in
> theory...

I've built this for arm/arm64, and I've ran my usual set of tests which
pass fine.  I've also run the selftest framework's tests for the dirty
logging and the migration loop test for arm64, and they pass fine.

You can add my (for arm64):

Tested-by: Christoffer Dall 
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2 01/15] KVM: Reinstall old memslots if arch preparation fails

2019-10-23 Thread Christoffer Dall
On Mon, Oct 21, 2019 at 05:35:23PM -0700, Sean Christopherson wrote:
> Reinstall the old memslots if preparing the new memory region fails
> after invalidating a to-be-{re}moved memslot.
> 
> Remove the superfluous 'old_memslots' variable so that it's somewhat
> clear that the error handling path needs to free the unused memslots,
> not simply the 'old' memslots.
> 
> Fixes: bc6678a33d9b9 ("KVM: introduce kvm->srcu and convert 
> kvm_set_memory_region to SRCU update")
> Signed-off-by: Sean Christopherson 
> ---
>  virt/kvm/kvm_main.c | 23 ---
>  1 file changed, 12 insertions(+), 11 deletions(-)
> 
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 67ef3f2e19e8..9afd706dc038 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -924,7 +924,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
>   unsigned long npages;
>   struct kvm_memory_slot *slot;
>   struct kvm_memory_slot old, new;
> - struct kvm_memslots *slots = NULL, *old_memslots;
> + struct kvm_memslots *slots;
>   int as_id, id;
>   enum kvm_mr_change change;
>  
> @@ -1032,7 +1032,13 @@ int __kvm_set_memory_region(struct kvm *kvm,
>   slot = id_to_memslot(slots, id);
>   slot->flags |= KVM_MEMSLOT_INVALID;
>  
> - old_memslots = install_new_memslots(kvm, as_id, slots);
> + /*
> +  * We can re-use the old memslots, the only difference from the
> +  * newly installed memslots is the invalid flag, which will get
> +  * dropped by update_memslots anyway.  We'll also revert to the
> +  * old memslots if preparing the new memory region fails.
> +  */
> + slots = install_new_memslots(kvm, as_id, slots);
>  
>   /* From this point no new shadow pages pointing to a deleted,
>* or moved, memslot will be created.
> @@ -1042,13 +1048,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
>*  - kvm_is_visible_gfn (mmu_check_roots)
>*/
>   kvm_arch_flush_shadow_memslot(kvm, slot);
> -
> - /*
> -  * We can re-use the old_memslots from above, the only 
> difference
> -  * from the currently installed memslots is the invalid flag.  
> This
> -  * will get overwritten by update_memslots anyway.
> -  */
> - slots = old_memslots;
>   }
>  
>   r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
> @@ -1062,15 +1061,17 @@ int __kvm_set_memory_region(struct kvm *kvm,
>   }
>  
>   update_memslots(slots, &new, change);
> - old_memslots = install_new_memslots(kvm, as_id, slots);
> + slots = install_new_memslots(kvm, as_id, slots);
>  
>   kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
>  
>   kvm_free_memslot(kvm, &old, &new);
> - kvfree(old_memslots);
> + kvfree(slots);
>   return 0;
>  
>  out_slots:
> +     if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
> + slots = install_new_memslots(kvm, as_id, slots);
>   kvfree(slots);
>  out_free:
>   kvm_free_memslot(kvm, &new, &old);
> -- 
> 2.22.0

Reviewed-by: Christoffer Dall 
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2 02/15] KVM: Don't free new memslot if allocation of said memslot fails

2019-10-23 Thread Christoffer Dall
On Mon, Oct 21, 2019 at 05:35:24PM -0700, Sean Christopherson wrote:
> The two implementations of kvm_arch_create_memslot() in x86 and PPC are
> both good citizens and free up all local resources if creation fails.
> Return immediately (via a superfluous goto) instead of calling
> kvm_free_memslot().
> 
> Note, the call to kvm_free_memslot() is effectively an expensive nop in
> this case as there are no resources to be freed.
> 
> No functional change intended.
> 
> Signed-off-by: Sean Christopherson 
> ---
>  virt/kvm/kvm_main.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 9afd706dc038..2cb38b2148cb 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1014,7 +1014,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
>   new.userspace_addr = mem->userspace_addr;
>  
>   if (kvm_arch_create_memslot(kvm, , npages))
> - goto out_free;
> + goto out;
>   }
>  
>       /* Allocate page dirty bitmap if needed */
> -- 
> 2.22.0
> 

Acked-by: Christoffer Dall 


Re: [PATCH v2 09/15] KVM: Move memslot deletion to helper function

2019-10-23 Thread Christoffer Dall
Hi Sean,

On Mon, Oct 21, 2019 at 05:35:31PM -0700, Sean Christopherson wrote:
> Move memslot deletion into its own routine so that the success path for
> other memslot updates does not need to use kvm_free_memslot(), i.e. can
> explicitly destroy the dirty bitmap when necessary.  This paves the way
> for dropping @dont from kvm_free_memslot(), i.e. all callers now pass
> NULL for @dont.
> 
> Add a comment above the code to make a copy of the existing memslot
> prior to deletion, it is not at all obvious that the pointer will become
> stale due sorting and/or installation of new memslots.

nit: due to / during

> 
> Note, kvm_arch_commit_memory_region() allows an architecture to free
> resources when moving a memslot or changing its flags, i.e. implement
> logic similar to the dirty bitmap is handling, if such functionality is

nit: s/is handling/handling/

> needed in the future.
> 
> Signed-off-by: Sean Christopherson 

Otherwise looks good to me.

Acked-by: Christoffer Dall 


Re: [PATCH 00/45] KVM: Refactor vCPU creation

2019-10-22 Thread Christoffer Dall
Hi Sean,

On Mon, Oct 21, 2019 at 06:58:40PM -0700, Sean Christopherson wrote:
> *** DISCLAIMER **
> The non-x86 arch specific patches are completely untested.  Although the
> changes are conceptually straightforward, I'm not remotely confident that
> the patches are bug free, e.g. checkpatch caught several blatant typos
> that would break compilation.
> *
> 
> The end goal of this series is to strip down the interface between common
> KVM code and arch specific code so that there is precisely one arch hook
> for creating a vCPU and one hook for destroying a vCPU.  In addition to
> cleaning up the code base, simplifying the interface gives architectures
> more freedom to organize their vCPU creation code.
> 
> KVM's vCPU creation code is comically messy.  kvm_vm_ioctl_create_vcpu()
> calls three separate arch hooks: init(), create() and setup().  The init()
> call is especially nasty as it's hidden away in a common KVM function,
> kvm_init_vcpu(), that for all intents and purposes must be immediately
> invoked after the vcpu object is allocated.
> 
> Not to be outdone, vCPU destruction also has three arch hooks: uninit(),
> destroy() and free(), the latter of which isn't actually invoked by common
> KVM code, but the hook declaration still exists because architectures are
> relying on its forward declaration.
> 
> Eliminating the extra arch hooks is relatively straightforward, just
> tedious.  For the most part, there is no fundamental constraint that
> necessitated the proliferation of arch hooks, rather they crept in over
> time, usually when x86-centric code was moved out of generic KVM and into
> x86 code.
> 
> E.g. kvm_arch_vcpu_setup() was added to allow x86 to do vcpu_load(), which
> can only be done after preempt_notifier initialization, but adding setup()
> overlooked the fact that the preempt_notifier was only initialized after
> kvm_arch_vcpu_create() because preemption support was added when x86's MMU
> setup (the vcpu_load() user) was called from common KVM code.
> 
> For all intents and purposes, there is no true functional change in this
> series.  The order of some allocations will change, and a few memory leaks
> are fixed, but the actual functionality of a guest should be unaffected.
> 
> Patches 01-03 are bug fixes in error handling paths that were found by
> inspection when refactoring the associated code.
> 
> Patches 04-43 refactor each arch implementation so that the unwanted arch
> hooks can be dropped without a functional change, e.g. move code out of
> kvm_arch_vcpu_setup() so that all implementations are empty, then drop the
> functions and caller.
> 
> Patches 44-45 are minor clean up to eliminate kvm_vcpu_uninit().
> 
> 
> The net result is to go from this:
> 
> vcpu = kvm_arch_vcpu_create(kvm, id);
>|
>|-> kvm_vcpu_init()
>|
>|-> kvm_arch_vcpu_init()
> 
> if (IS_ERR(vcpu)) {
> r = PTR_ERR(vcpu);
> goto vcpu_decrement;
> }
> 
> preempt_notifier_init(>preempt_notifier, _preempt_ops);
> 
> r = kvm_arch_vcpu_setup(vcpu);
> if (r)
> goto vcpu_destroy;
> 
> to this:
> 
> r = kvm_arch_vcpu_precreate(kvm, id);
> if (r)
> goto vcpu_decrement;
> 
> vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
> if (!vcpu) {
> r = -ENOMEM;
> goto vcpu_decrement;
> }
> 
> page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> if (!page) {
> r = -ENOMEM;
> goto vcpu_free;
> }
> vcpu->run = page_address(page);
> 
> kvm_vcpu_init(vcpu, kvm, id);
> 
> r = kvm_arch_vcpu_create(vcpu);
> if (r)
> goto vcpu_free_run_page;
> 

What a fantastically welcome piece of work!  Thanks for doing this,
many's the time I waded through all those calls to ensure a patch was
doing the right thing.

Modulo the nit in patch 42, the arm64 changes survive a guest boot +
hackbench and build fine.  The lack of changing the arm-specific destroy
function to a void also causes a series of warnings for a 32-bit arm
build, but otherwise builds fine.

You can add my:

  Acked-by: Christoffer Dall 

To the arm/arm64 and generic parts.


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 42/45] KVM: arm64: Free sve_state via arm specific hook

2019-10-22 Thread Christoffer Dall
On Mon, Oct 21, 2019 at 06:59:22PM -0700, Sean Christopherson wrote:
> Add an arm specific hook to free the arm64-only sve_state.  Doing so
> eliminates the last functional code from kvm_arch_vcpu_uninit() across
> all architectures and paves the way for removing kvm_arch_vcpu_init()
> and kvm_arch_vcpu_uninit() entirely.
> 
> Signed-off-by: Sean Christopherson 
> ---
>  arch/arm/include/asm/kvm_host.h   | 1 +
>  arch/arm64/include/asm/kvm_host.h | 1 +
>  arch/arm64/kvm/reset.c| 5 +
>  virt/kvm/arm/arm.c| 2 ++
>  4 files changed, 9 insertions(+)
> 
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 8a37c8e89777..cc414de5acd3 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -333,6 +333,7 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) 
> {}
>  static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
>  static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
>  static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
> +static inline int kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) {}
>  
>  static inline void kvm_arm_init_debug(void) {}
>  static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
> diff --git a/arch/arm64/include/asm/kvm_host.h 
> b/arch/arm64/include/asm/kvm_host.h
> index f656169db8c3..92d7c384a4ed 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -52,6 +52,7 @@ int kvm_arm_init_sve(void);
>  
>  int __attribute_const__ kvm_target_cpu(void);
>  int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
> +int kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
>  void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
>  int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
>  void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t 
> idmap_start);
> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> index f4a8ae918827..98abc4278f42 100644
> --- a/arch/arm64/kvm/reset.c
> +++ b/arch/arm64/kvm/reset.c
> @@ -205,6 +205,11 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu)
>  }
>  
>  void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
> +{
> +
> +}
> +
> +int kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
>  {
>   kfree(vcpu->arch.sve_state);
>  }

nit: warning: control reaches end of non-void function

> diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
> index aac4e5a1a521..b38088415cde 100644
> --- a/virt/kvm/arm/arm.c
> +++ b/virt/kvm/arm/arm.c
> @@ -298,6 +298,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
>   kvm_mmu_free_memory_caches(vcpu);
>   kvm_timer_vcpu_terminate(vcpu);
>   kvm_pmu_vcpu_destroy(vcpu);
> +
> + kvm_arm_vcpu_destroy(vcpu);
>  }
>  
>  int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
> -- 
> 2.22.0
> 
> ___

Thanks,

Christoffer


Re: [PATCH v3 0/2] Improve handling of stage 2 aborts without instruction decode

2019-10-21 Thread Christoffer Dall
On Sun, Oct 20, 2019 at 11:25:24AM +0100, Marc Zyngier wrote:
> Hi Christoffer,
> 
> On Fri, 11 Oct 2019 12:07:04 +0100,
> Christoffer Dall  wrote:
> > 
> > When a guest accesses memory outside the memory slots, KVM usually
> > bounces the access back to userspace with KVM_EXIT_MMIO.  However, on
> > arm/arm64 systems, certain load/store instructions did not provide
> > decoding info for the hypervisor to emulate the instruction, and in this
> > case KVM has rather rudely returned -ENOSYS and printed a not overly
> > helpful error message:
> > 
> >   load/store instruction decoding not implemented
> > 
> > This patch series improves the error message and allows userspace to be
> > notified of this event instead of receiving -ENOSYS, and also allows
> > userspace to ask KVM to inject an external abort to the guest, which it
can use for any memory access that it cannot otherwise handle.
> > 
> > One remaining case which this patch set does not address is if the guest
> > accesses an in-kernel emulated device, such as the VGIC, but using a
> > load/store instruction which doesn't provide decode info.  With these
> > patches, this will return to userspace for it to handle, but there's no
> > way for userspace to return the decoding information to KVM and have KVM
> > complete the access to the in-kernel emulated device.  I have no plans
> > to address this limitation.
> 
> You had some pending comments on patch 2, and you seem to indicate
> that you would respin the series. Do you have plans to do so in the
> coming days? I'd like to put that series into -next, but I can either
> wait for your respin, or queue it as it is and apply fixes on top.
> 

I think those comments were for v2, and this is v3, which should address
those concerns.  Did I miss something or manage to confuse myself here?

Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[kvmtool v3 5/5] arm: Inject external data aborts when accessing holes in the memory map

2019-10-11 Thread Christoffer Dall
Occasionally guests will attempt to access parts of the guest memory map
where there is... nothing at all.  Until now, we've handled this by
either forcefully killing the guest, or silently (unless a debug option
was enabled) ignoring the access.  Neither is very helpful to a user,
who is most likely running either a broken or misconfigured guest.

A more appropriate action is to inject an external abort to the guest.
Luckily, with KVM_CAP_ARM_INJECT_EXT_DABT, we can use the set event
mechanism and ask KVM to do this for us.

So we add an architecture-specific hook to handle accesses to MMIO
regions which cannot be found, and allow it to report whether the invalid
access was handled or not.

Signed-off-by: Christoffer Dall 
---
 arm/include/arm-common/kvm-cpu-arch.h | 16 
 arm/kvm-cpu.c |  2 +-
 mips/include/kvm/kvm-cpu-arch.h   |  5 +
 mmio.c|  3 ++-
 powerpc/include/kvm/kvm-cpu-arch.h|  5 +
 x86/include/kvm/kvm-cpu-arch.h|  5 +
 6 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/arm/include/arm-common/kvm-cpu-arch.h 
b/arm/include/arm-common/kvm-cpu-arch.h
index 923d2c4..33defa2 100644
--- a/arm/include/arm-common/kvm-cpu-arch.h
+++ b/arm/include/arm-common/kvm-cpu-arch.h
@@ -57,6 +57,22 @@ static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu 
*vcpu, u64 phys_addr,
return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
 }
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   struct kvm_vcpu_events events = {
+   .exception.ext_dabt_pending = 1,
+   };
+   int err;
+
+   if (!kvm__supports_extension(vcpu->kvm, KVM_CAP_ARM_INJECT_EXT_DABT))
+   return false;
+
+   err = ioctl(vcpu->vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
+   if (err)
+   die("failed to inject external abort");
+   return true;
+}
+
 unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu);
 
 #endif /* ARM_COMMON__KVM_CPU_ARCH_H */
diff --git a/arm/kvm-cpu.c b/arm/kvm-cpu.c
index 25bd3ed..321a3e4 100644
--- a/arm/kvm-cpu.c
+++ b/arm/kvm-cpu.c
@@ -142,7 +142,7 @@ bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
 
if (!arm_addr_in_ioport_region(phys_addr) &&
!kvm__mmio_exists(vcpu, phys_addr))
-   die("Guest accessed memory outside RAM and IO ranges");
+   return kvm_cpu__mmio_not_found(vcpu, phys_addr);
 
/*
 * We cannot fetch and decode instructions from a KVM guest,
diff --git a/mips/include/kvm/kvm-cpu-arch.h b/mips/include/kvm/kvm-cpu-arch.h
index 45e69f6..512ab34 100644
--- a/mips/include/kvm/kvm-cpu-arch.h
+++ b/mips/include/kvm/kvm-cpu-arch.h
@@ -40,4 +40,9 @@ static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu 
*vcpu, u64 phys_addr, u8
return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
 }
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   return false;
+}
+
 #endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/mmio.c b/mmio.c
index 2ab7fa7..d6df303 100644
--- a/mmio.c
+++ b/mmio.c
@@ -130,7 +130,8 @@ bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, 
u8 *data, u32 len, u
if (mmio)
mmio->mmio_fn(vcpu, phys_addr, data, len, is_write, mmio->ptr);
else {
-   if (vcpu->kvm->cfg.mmio_debug)
+   if (!kvm_cpu__mmio_not_found(vcpu, phys_addr) &&
+   vcpu->kvm->cfg.mmio_debug)
fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx (length %u)\n",
to_direction(is_write),
(unsigned long long)phys_addr, len);
diff --git a/powerpc/include/kvm/kvm-cpu-arch.h 
b/powerpc/include/kvm/kvm-cpu-arch.h
index a69e0cc..64b69b1 100644
--- a/powerpc/include/kvm/kvm-cpu-arch.h
+++ b/powerpc/include/kvm/kvm-cpu-arch.h
@@ -76,4 +76,9 @@ static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, 
u16 port, void *dat
 
 bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 
len, u8 is_write);
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   return false;
+}
+
 #endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/x86/include/kvm/kvm-cpu-arch.h b/x86/include/kvm/kvm-cpu-arch.h
index 05e5bb6..10cbe6e 100644
--- a/x86/include/kvm/kvm-cpu-arch.h
+++ b/x86/include/kvm/kvm-cpu-arch.h
@@ -47,4 +47,9 @@ static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu 
*vcpu, u64 phys_addr, u8
return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
 }
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   return false;
+}
+
 #endif /* KVM__KVM_CPU_ARCH_H */
-- 
2.18.0


[kvmtool v3 3/5] update headers: Update the KVM headers for new Arm fault reporting features

2019-10-11 Thread Christoffer Dall
In preparation for improving our handling of guest aborts with missing
decode info or outside any mapped resource, sync updated Linux header
files.

NOTE: This is a development update and these headers are not yet in an
upstream tree.  DO NOT MERGE.

Signed-off-by: Christoffer Dall 
---
 arm/aarch32/include/asm/kvm.h | 3 ++-
 arm/aarch64/include/asm/kvm.h | 3 ++-
 include/linux/kvm.h   | 8 
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arm/aarch32/include/asm/kvm.h b/arm/aarch32/include/asm/kvm.h
index 4602464..b450900 100644
--- a/arm/aarch32/include/asm/kvm.h
+++ b/arm/aarch32/include/asm/kvm.h
@@ -131,8 +131,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/arm/aarch64/include/asm/kvm.h b/arm/aarch64/include/asm/kvm.h
index 97c3478..e4cf9bd 100644
--- a/arm/aarch64/include/asm/kvm.h
+++ b/arm/aarch64/include/asm/kvm.h
@@ -160,8 +160,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6d4ea4b..fadebb4 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -235,6 +235,7 @@ struct kvm_hyperv_exit {
 #define KVM_EXIT_S390_STSI25
 #define KVM_EXIT_IOAPIC_EOI   26
 #define KVM_EXIT_HYPERV   27
+#define KVM_EXIT_ARM_NISV 28
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -392,6 +393,11 @@ struct kvm_run {
} eoi;
/* KVM_EXIT_HYPERV */
struct kvm_hyperv_exit hyperv;
+   /* KVM_EXIT_ARM_NISV */
+   struct {
+   __u64 esr_iss;
+   __u64 fault_ipa;
+   } arm_nisv;
/* Fix the size of the union. */
char padding[256];
};
@@ -988,6 +994,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_VM_IPA_SIZE 165
 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
 #define KVM_CAP_HYPERV_CPUID 167
+#define KVM_CAP_ARM_NISV_TO_USER 176
+#define KVM_CAP_ARM_INJECT_EXT_DABT 177
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.18.0



[PATCH v3 0/2] Improve handling of stage 2 aborts without instruction decode

2019-10-11 Thread Christoffer Dall
When a guest accesses memory outside the memory slots, KVM usually
bounces the access back to userspace with KVM_EXIT_MMIO.  However, on
arm/arm64 systems, certain load/store instructions did not provide
decoding info for the hypervisor to emulate the instruction, and in this
case KVM has rather rudely returned -ENOSYS and printed a not overly
helpful error message:

  load/store instruction decoding not implemented

This patch series improves the error message and allows userspace to be
notified of this event instead of receiving -ENOSYS, and also allows
userspace to ask KVM to inject an external abort to the guest, which it
can use for any memory access that it cannot otherwise handle.

One remaining case which this patch set does not address is if the guest
accesses an in-kernel emulated device, such as the VGIC, but using a
load/store instruction which doesn't provide decode info.  With these
patches, this will return to userspace for it to handle, but there's no
way for userspace to return the decoding information to KVM and have KVM
complete the access to the in-kernel emulated device.  I have no plans
to address this limitation.

Changes since v2:
 - Clean up inconsistent use of has_ in patch 2 and allow injecting
   SError and external abort simultaneously with a single call to set
   VCPU events.

Changes since v1:
 - Rebased on v5.4-rc2
 - Fixed some documentation and coding nits in review of v1

Christoffer Dall (2):
  KVM: arm/arm64: Allow reporting non-ISV data aborts to userspace
  KVM: arm/arm64: Allow user injection of external data aborts

 Documentation/virt/kvm/api.txt   | 55 +++-
 arch/arm/include/asm/kvm_arm.h   |  1 +
 arch/arm/include/asm/kvm_emulate.h   |  5 +++
 arch/arm/include/asm/kvm_host.h  |  8 
 arch/arm/include/uapi/asm/kvm.h  |  3 +-
 arch/arm/kvm/guest.c | 10 +
 arch/arm64/include/asm/kvm_emulate.h |  5 +++
 arch/arm64/include/asm/kvm_host.h|  8 
 arch/arm64/include/uapi/asm/kvm.h|  3 +-
 arch/arm64/kvm/guest.c   | 10 +
 arch/arm64/kvm/inject_fault.c|  4 +-
 include/uapi/linux/kvm.h |  8 
 virt/kvm/arm/arm.c   | 22 +++
 virt/kvm/arm/mmio.c  |  9 -
 14 files changed, 145 insertions(+), 6 deletions(-)

-- 
2.18.0



[PATCH v3 1/2] KVM: arm/arm64: Allow reporting non-ISV data aborts to userspace

2019-10-11 Thread Christoffer Dall
For a long time, if a guest accessed memory outside of a memslot using
any of the load/store instructions in the architecture which don't
supply decoding information in the ESR_EL2 (the ISV bit is not set), the
kernel would print the following message and terminate the VM as a
result of returning -ENOSYS to userspace:

  load/store instruction decoding not implemented

The reason behind this message is that KVM assumes that all accesses
outside a memslot are MMIO accesses which should be handled by
userspace, and we originally expected to eventually implement some sort
of decoding of load/store instructions where the ISV bit was not set.

However, it turns out that many of the instructions which don't provide
decoding information on abort are not safe to use for MMIO accesses, and
the remaining few that would potentially make sense to use on MMIO
accesses, such as those with register writeback, are not used in
practice.  It also turns out that fetching an instruction from guest
memory can be a pretty horrible affair, involving stopping all CPUs on
SMP systems, handling multiple corner cases of address translation in
software, and more.  It doesn't appear likely that we'll ever implement
this in the kernel.

What is much more common is that a user has misconfigured his/her guest
and is actually not accessing an MMIO region, but just hitting some
random hole in the IPA space.  In this scenario, the error message above
is almost misleading and has led to a great deal of confusion over the
years.

It is, nevertheless, ABI to userspace, and we therefore need to
introduce a new capability that userspace explicitly enables to change
behavior.

This patch introduces KVM_CAP_ARM_NISV_TO_USER (NISV meaning Non-ISV)
which does exactly that, and introduces a new exit reason to report the
event to userspace.  User space can then emulate an exception to the
guest, restart the guest, suspend the guest, or take any other
appropriate action as per the policy of the running system.
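
For illustration only (this sketch is not part of the patch), userspace
consumption of the new ABI could look roughly as follows, assuming vm_fd
and vcpu_fd have already been set up and 'run' is the vcpu's mmap()ed
struct kvm_run; error handling is elided:

  #include <linux/kvm.h>
  #include <sys/ioctl.h>
  #include <stdio.h>

  static void enable_nisv_to_user(int vm_fd)
  {
          struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_NISV_TO_USER };

          /* Must be enabled at VM creation time, before any vcpu runs */
          if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_NISV_TO_USER) > 0)
                  ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }

  static int handle_exit(struct kvm_run *run)
  {
          switch (run->exit_reason) {
          case KVM_EXIT_ARM_NISV:
                  /* No instruction decode available; report instead of dying */
                  fprintf(stderr, "no-ISV abort at IPA 0x%llx (ISS 0x%llx)\n",
                          (unsigned long long)run->arm_nisv.fault_ipa,
                          (unsigned long long)run->arm_nisv.esr_iss);
                  return -1;      /* or inject an abort, dump state, etc. */
          default:
                  return 0;
          }
  }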

Reported-by: Heinrich Schuchardt 
Signed-off-by: Christoffer Dall 
Reviewed-by: Alexander Graf 
---
 Documentation/virt/kvm/api.txt   | 33 
 arch/arm/include/asm/kvm_arm.h   |  1 +
 arch/arm/include/asm/kvm_emulate.h   |  5 +
 arch/arm/include/asm/kvm_host.h  |  8 +++
 arch/arm64/include/asm/kvm_emulate.h |  5 +
 arch/arm64/include/asm/kvm_host.h|  8 +++
 include/uapi/linux/kvm.h |  7 ++
 virt/kvm/arm/arm.c   | 21 ++
 virt/kvm/arm/mmio.c  |  9 +++-
 9 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index 4833904d32a5..7403f15657c2 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -4468,6 +4468,39 @@ Hyper-V SynIC state change. Notification is used to 
remap SynIC
 event/message pages and to enable/disable SynIC messages/events processing
 in userspace.
 
+   /* KVM_EXIT_ARM_NISV */
+   struct {
+   __u64 esr_iss;
+   __u64 fault_ipa;
+   } arm_nisv;
+
+Used on arm and arm64 systems. If a guest accesses memory not in a memslot,
+KVM will typically return to userspace and ask it to do MMIO emulation on its
+behalf. However, for certain classes of instructions, no instruction decode
+(direction, length of memory access) is provided, and fetching and decoding
+the instruction from the VM is overly complicated to live in the kernel.
+
+Historically, when this situation occurred, KVM would print a warning and kill
+the VM. KVM assumed that if the guest accessed non-memslot memory, it was
+trying to do I/O, which just couldn't be emulated, and the warning message was
+phrased accordingly. However, what happened more often was that a guest bug
+caused access outside the guest memory areas which should lead to a more
+meaningful warning message and an external abort in the guest, if the access
+did not fall within an I/O window.
+
+Userspace implementations can query for KVM_CAP_ARM_NISV_TO_USER, and enable
+this capability at VM creation. Once this is done, these types of errors will
+instead return to userspace with KVM_EXIT_ARM_NISV, with the valid bits from
+the HSR (arm) and ESR_EL2 (arm64) in the esr_iss field, and the faulting IPA
+in the fault_ipa field. Userspace can either fix up the access if it's
+actually an I/O access by decoding the instruction from guest memory (if it's
+very brave) and continue executing the guest, or it can decide to suspend,
+dump, or restart the guest.
+
+Note that KVM does not skip the faulting instruction as it does for
+KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state
+if it decides to decode and emulate the instruction.
+
/* Fix the size of the union. */
char padding[256];
};
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm

[PATCH v3 2/2] KVM: arm/arm64: Allow user injection of external data aborts

2019-10-11 Thread Christoffer Dall
In some scenarios, such as buggy guest or incorrect configuration of the
VMM and firmware description data, userspace will detect a memory access
to a portion of the IPA, which is not mapped to any MMIO region.

For this purpose, the appropriate action is to inject an external abort
to the guest.  The kernel already has functionality to inject an
external abort, but we need to wire up a signal from user space that
lets user space tell the kernel to do this.

It turns out, we already have the set event functionality which we can
perfectly reuse for this.
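
For illustration only (not part of this patch), the userspace side could
be as simple as the following sketch, assuming vcpu_fd is the vcpu file
descriptor and the most recent exit was KVM_EXIT_MMIO or KVM_EXIT_ARM_NISV:

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  static int inject_external_dabt(int vcpu_fd)
  {
          struct kvm_vcpu_events events = {
                  .exception.ext_dabt_pending = 1,
          };

          /* KVM reuses the faulting address from the exit; no IPA is passed */
          return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
  }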

Signed-off-by: Christoffer Dall 
---
 Documentation/virt/kvm/api.txt| 22 +-
 arch/arm/include/uapi/asm/kvm.h   |  3 ++-
 arch/arm/kvm/guest.c  | 10 ++
 arch/arm64/include/uapi/asm/kvm.h |  3 ++-
 arch/arm64/kvm/guest.c| 10 ++
 arch/arm64/kvm/inject_fault.c |  4 ++--
 include/uapi/linux/kvm.h  |  1 +
 virt/kvm/arm/arm.c|  1 +
 8 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index 7403f15657c2..bd29d44af32b 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -1002,12 +1002,18 @@ Specifying exception.has_esr on a system that does not 
support it will return
 -EINVAL. Setting anything other than the lower 24bits of exception.serror_esr
 will return -EINVAL.
 
+It is not possible to read back a pending external abort (injected via
+KVM_SET_VCPU_EVENTS or otherwise) because such an exception is always delivered
+directly to the virtual CPU.
+
+
 struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
@@ -1051,9 +1057,23 @@ contain a valid state and shall be written into the VCPU.
 
 ARM/ARM64:
 
+User space may need to inject several types of events to the guest.
+
 Set the pending SError exception state for this VCPU. It is not possible to
 'cancel' an Serror that has been made pending.
 
+If the guest performed an access to I/O memory which could not be handled by
+userspace, for example because of missing instruction syndrome decode
+information or because there is no device mapped at the accessed IPA, then
+userspace can ask the kernel to inject an external abort using the address
+from the exiting fault on the VCPU. It is a programming error to set
+ext_dabt_pending after an exit which was not either KVM_EXIT_MMIO or
+KVM_EXIT_ARM_NISV. This feature is only available if the system supports
+KVM_CAP_ARM_INJECT_EXT_DABT. This is a helper which provides commonality in
+how userspace reports accesses for the above cases to guests, across different
+userspace implementations. Nevertheless, userspace can still emulate all Arm
+exceptions by manipulating individual registers using the KVM_SET_ONE_REG API.
+
 See KVM_GET_VCPU_EVENTS for the data structure.
 
 
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 2769360f195c..03cd7c19a683 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -131,8 +131,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index 684cf64b4033..735f9b007e58 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -255,6 +255,12 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
 {
events->exception.serror_pending = !!(*vcpu_hcr(vcpu) & HCR_VA);
 
+   /*
+* We never return a pending ext_dabt here because we deliver it to
+* the virtual CPU directly when setting the event and it's no longer
+* 'pending' at this point.
+*/
+
return 0;
 }
 
@@ -263,12 +269,16 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 {
bool serror_pending = events->exception.serror_pending;
bool has_esr = events->exception.serror_has_esr;
+   bool ext_dabt_pending = events->exception.ext_dabt_pending;
 
if (serror_pending && has_esr)
return -EINVAL;
else if (serror_pending)
kvm_inject_vabt(vcpu);
 
+   if (ext_dabt_pending)
+   kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+
return 0;
 }
 
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index 67c21f9bdbad..d49c17a80491 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@

[kvmtool v3 4/5] arm: Handle exits from undecoded load/store instructions

2019-10-11 Thread Christoffer Dall
KVM occasionally encounters guests that attempt to access memory outside
the registered RAM memory slots using instructions that don't provide
decoding information in the ESR_EL2 (the ISV bit is not set), and
historically this has led to the kernel printing a confusing error
message in dmesg and returning -ENOSYS from KVM_RUN.

KVM/Arm now has KVM_CAP_ARM_NISV_TO_USER, which can be enabled from
userspace, and which allows us to handle this with a little bit more
helpful information to the user.  For example, we can at least tell the
user if the guest just hit a hole in the guest's memory map, or if this
appeared to be an attempt at doing MMIO.

Signed-off-by: Christoffer Dall 
---
 arm/kvm-cpu.c | 20 +++-
 arm/kvm.c |  8 
 include/kvm/kvm.h |  1 +
 kvm.c |  1 +
 mmio.c| 11 +++
 5 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/arm/kvm-cpu.c b/arm/kvm-cpu.c
index 7780251..25bd3ed 100644
--- a/arm/kvm-cpu.c
+++ b/arm/kvm-cpu.c
@@ -136,7 +136,25 @@ void kvm_cpu__delete(struct kvm_cpu *vcpu)
 
 bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
 {
-   return false;
+   switch (vcpu->kvm_run->exit_reason) {
+   case KVM_EXIT_ARM_NISV: {
+   u64 phys_addr = vcpu->kvm_run->arm_nisv.fault_ipa;
+
+   if (!arm_addr_in_ioport_region(phys_addr) &&
+   !kvm__mmio_exists(vcpu, phys_addr))
+   die("Guest accessed memory outside RAM and IO ranges");
+
+   /*
+* We cannot fetch and decode instructions from a KVM guest,
+* which used a load/store instruction that doesn't get
+* decoded in the ESR towards an I/O device, so we have no
+* choice but to exit to the user with an error.
+*/
+   die("Guest accessed I/O device with unsupported load/store 
instruction");
+   }
+   default:
+   return false;
+   }
 }
 
 void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
diff --git a/arm/kvm.c b/arm/kvm.c
index 1f85fc6..2572ac2 100644
--- a/arm/kvm.c
+++ b/arm/kvm.c
@@ -59,6 +59,8 @@ void kvm__arch_set_cmdline(char *cmdline, bool video)
 
 void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
 {
+   struct kvm_enable_cap enable_cap = { .flags = 0 };
+
/*
 * Allocate guest memory. We must align our buffer to 64K to
 * correlate with the maximum guest page size for virtio-mmio.
@@ -83,6 +85,12 @@ void kvm__arch_init(struct kvm *kvm, const char 
*hugetlbfs_path, u64 ram_size)
madvise(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size,
MADV_HUGEPAGE);
 
+   if (kvm__supports_extension(kvm, KVM_CAP_ARM_NISV_TO_USER)) {
+   enable_cap.cap = KVM_CAP_ARM_NISV_TO_USER;
+   if (ioctl(kvm->vm_fd, KVM_ENABLE_CAP, &enable_cap) < 0)
+   die("unable to enable NISV_TO_USER capability");
+   }
+
/* Create the virtual GIC. */
if (gic__create(kvm, kvm->cfg.arch.irqchip))
die("Failed to create virtual GIC");
diff --git a/include/kvm/kvm.h b/include/kvm/kvm.h
index 7a73818..05d90ee 100644
--- a/include/kvm/kvm.h
+++ b/include/kvm/kvm.h
@@ -107,6 +107,7 @@ bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void 
*data, int direction,
 bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, 
u8 is_write);
 int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void 
*userspace_addr,
  enum kvm_mem_type type);
+bool kvm__mmio_exists(struct kvm_cpu *vcpu, u64 phys_addr);
 static inline int kvm__register_ram(struct kvm *kvm, u64 guest_phys, u64 size,
void *userspace_addr)
 {
diff --git a/kvm.c b/kvm.c
index 57c4ff9..03ec43f 100644
--- a/kvm.c
+++ b/kvm.c
@@ -55,6 +55,7 @@ const char *kvm_exit_reasons[] = {
 #ifdef CONFIG_PPC64
DEFINE_KVM_EXIT_REASON(KVM_EXIT_PAPR_HCALL),
 #endif
+   DEFINE_KVM_EXIT_REASON(KVM_EXIT_ARM_NISV),
 };
 
 static int pause_event;
diff --git a/mmio.c b/mmio.c
index 61e1d47..2ab7fa7 100644
--- a/mmio.c
+++ b/mmio.c
@@ -139,3 +139,14 @@ bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 
phys_addr, u8 *data, u32 len, u
 
return true;
 }
+
+bool kvm__mmio_exists(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   struct mmio_mapping *mmio;
+
+   br_read_lock(vcpu->kvm);
+   mmio = mmio_search(&mmio_tree, phys_addr, 1);
+   br_read_unlock(vcpu->kvm);
+
+   return mmio != NULL;
+}
-- 
2.18.0



Re: [PATCH v2 2/2] KVM: arm/arm64: Allow user injection of external data aborts

2019-10-09 Thread Christoffer Dall
On Tue, Oct 08, 2019 at 02:03:07PM +0200, Alexander Graf wrote:
> 
> 
> On 08.10.19 11:36, Christoffer Dall wrote:
> > In some scenarios, such as buggy guest or incorrect configuration of the
> > VMM and firmware description data, userspace will detect a memory access
> > to a portion of the IPA, which is not mapped to any MMIO region.
> > 
> > For this purpose, the appropriate action is to inject an external abort
> > to the guest.  The kernel already has functionality to inject an
> > external abort, but we need to wire up a signal from user space that
> > lets user space tell the kernel to do this.
> > 
> > It turns out, we already have the set event functionality which we can
> > perfectly reuse for this.
> > 
> > Signed-off-by: Christoffer Dall 
> > ---
> >   Documentation/virt/kvm/api.txt| 18 +-
> >   arch/arm/include/uapi/asm/kvm.h   |  3 ++-
> >   arch/arm/kvm/guest.c  |  3 +++
> >   arch/arm64/include/uapi/asm/kvm.h |  3 ++-
> >   arch/arm64/kvm/guest.c|  3 +++
> >   arch/arm64/kvm/inject_fault.c |  4 ++--
> >   include/uapi/linux/kvm.h  |  1 +
> >   virt/kvm/arm/arm.c|  1 +
> >   8 files changed, 31 insertions(+), 5 deletions(-)
> > 
> > diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
> > index 7403f15657c2..10ebe8cfda29 100644
> > --- a/Documentation/virt/kvm/api.txt
> > +++ b/Documentation/virt/kvm/api.txt
> > @@ -968,6 +968,8 @@ The following bits are defined in the flags field:
> >   ARM/ARM64:
> > +User space may need to inject several types of events to the guest.
> > +
> >   If the guest accesses a device that is being emulated by the host kernel 
> > in
> >   such a way that a real device would generate a physical SError, KVM may 
> > make
> >   a virtual SError pending for that VCPU. This system error interrupt 
> > remains
> > @@ -1002,12 +1004,26 @@ Specifying exception.has_esr on a system that does 
> > not support it will return
> >   -EINVAL. Setting anything other than the lower 24bits of 
> > exception.serror_esr
> >   will return -EINVAL.
> > +If the guest performed an access to I/O memory which could not be handled 
> > by
> > +userspace, for example because of missing instruction syndrome decode
> > +information or because there is no device mapped at the accessed IPA, then
> > +userspace can ask the kernel to inject an external abort using the address
> > +from the exiting fault on the VCPU. It is a programming error to set
> > +ext_dabt_pending at the same time as any of the serror fields, or to set
> > +ext_dabt_pending after an exit which was not either KVM_EXIT_MMIO or
> > +KVM_EXIT_ARM_NISV. This feature is only available if the system supports
> > +KVM_CAP_ARM_INJECT_EXT_DABT. This is a helper which provides commonality in
> > +how userspace reports accesses for the above cases to guests, across 
> > different
> > +userspace implementations. Nevertheless, userspace can still emulate all 
> > Arm
> > +exceptions by manipulating individual registers using the KVM_SET_ONE_REG 
> > API.
> > +
> >   struct kvm_vcpu_events {
> > struct {
> > __u8 serror_pending;
> > __u8 serror_has_esr;
> > +   __u8 ext_dabt_pending;
> > /* Align it to 8 bytes */
> > -   __u8 pad[6];
> > +   __u8 pad[5];
> > __u64 serror_esr;
> > } exception;
> > __u32 reserved[12];
> > diff --git a/arch/arm/include/uapi/asm/kvm.h 
> > b/arch/arm/include/uapi/asm/kvm.h
> > index 2769360f195c..03cd7c19a683 100644
> > --- a/arch/arm/include/uapi/asm/kvm.h
> > +++ b/arch/arm/include/uapi/asm/kvm.h
> > @@ -131,8 +131,9 @@ struct kvm_vcpu_events {
> > struct {
> > __u8 serror_pending;
> > __u8 serror_has_esr;
> > +   __u8 ext_dabt_pending;
> > /* Align it to 8 bytes */
> > -   __u8 pad[6];
> > +   __u8 pad[5];
> > __u64 serror_esr;
> > } exception;
> > __u32 reserved[12];
> > diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
> > index 684cf64b4033..4154c5589501 100644
> > --- a/arch/arm/kvm/guest.c
> > +++ b/arch/arm/kvm/guest.c
> > @@ -263,11 +263,14 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
> >   {
> > bool serror_pending = events->exception.serror_pending;
> > bool has_esr = events->exception.serror_has_esr;
> > +   bool has_ext_da

[PATCH v2 0/2] Improve handling of stage 2 aborts without instruction decode

2019-10-08 Thread Christoffer Dall
When a guest accesses memory outside the memory slots, KVM usually
bounces the access back to userspace with KVM_EXIT_MMIO.  However, on
arm/arm64 systems, certain load/store instructions did not provide
decoding info for the hypervisor to emulate the instruction, and in this
case KVM has rather rudely returned -ENOSYS and printed a not overly
helpful error message:

  load/store instruction decoding not implemented

This patch series improves the error message and allows userspace to be
notified of this event instead of receiving -ENOSYS, and also allows
userspace to ask KVM to inject an external abort to the guest, which it
can use for any memory access that it cannot otherwise handle.

One remaining case which this patch set does not address is if the guest
accesses an in-kernel emulated device, such as the VGIC, but using a
load/store instruction which doesn't provide decode info.  With these
patches, this will return to userspace for it to handle, but there's no
way for userspace to return the decoding information to KVM and have KVM
complete the access to the in-kernel emulated device.  I have no plans
to address this limitation.

Changes since v1:
 - Rebased on v5.4-rc2
 - Fixed some documentation and coding nits in review of v1

Christoffer Dall (2):
  KVM: arm/arm64: Allow reporting non-ISV data aborts to userspace
  KVM: arm/arm64: Allow user injection of external data aborts

 Documentation/virt/kvm/api.txt   | 51 +++-
 arch/arm/include/asm/kvm_arm.h   |  1 +
 arch/arm/include/asm/kvm_emulate.h   |  5 +++
 arch/arm/include/asm/kvm_host.h  |  8 +
 arch/arm/include/uapi/asm/kvm.h  |  3 +-
 arch/arm/kvm/guest.c |  3 ++
 arch/arm64/include/asm/kvm_emulate.h |  5 +++
 arch/arm64/include/asm/kvm_host.h|  8 +
 arch/arm64/include/uapi/asm/kvm.h|  3 +-
 arch/arm64/kvm/guest.c   |  3 ++
 arch/arm64/kvm/inject_fault.c|  4 +--
 include/uapi/linux/kvm.h |  8 +
 virt/kvm/arm/arm.c   | 22 
 virt/kvm/arm/mmio.c  |  9 -
 14 files changed, 127 insertions(+), 6 deletions(-)

-- 
2.18.0



[PATCH v2 1/2] KVM: arm/arm64: Allow reporting non-ISV data aborts to userspace

2019-10-08 Thread Christoffer Dall
For a long time, if a guest accessed memory outside of a memslot using
any of the load/store instructions in the architecture which don't
supply decoding information in the ESR_EL2 (the ISV bit is not set), the
kernel would print the following message and terminate the VM as a
result of returning -ENOSYS to userspace:

  load/store instruction decoding not implemented

The reason behind this message is that KVM assumes that all accesses
outside a memslot are MMIO accesses which should be handled by
userspace, and we originally expected to eventually implement some sort
of decoding of load/store instructions where the ISV bit was not set.

However, it turns out that many of the instructions which don't provide
decoding information on abort are not safe to use for MMIO accesses, and
the remaining few that would potentially make sense to use on MMIO
accesses, such as those with register writeback, are not used in
practice.  It also turns out that fetching an instruction from guest
memory can be a pretty horrible affair, involving stopping all CPUs on
SMP systems, handling multiple corner cases of address translation in
software, and more.  It doesn't appear likely that we'll ever implement
this in the kernel.

What is much more common is that a user has misconfigured his/her guest
and is actually not accessing an MMIO region, but just hitting some
random hole in the IPA space.  In this scenario, the error message above
is almost misleading and has led to a great deal of confusion over the
years.

It is, nevertheless, ABI to userspace, and we therefore need to
introduce a new capability that userspace explicitly enables to change
behavior.

This patch introduces KVM_CAP_ARM_NISV_TO_USER (NISV meaning Non-ISV)
which does exactly that, and introduces a new exit reason to report the
event to userspace.  User space can then emulate an exception to the
guest, restart the guest, suspend the guest, or take any other
appropriate action as per the policy of the running system.

Reported-by: Heinrich Schuchardt 
Signed-off-by: Christoffer Dall 
---
 Documentation/virt/kvm/api.txt   | 33 
 arch/arm/include/asm/kvm_arm.h   |  1 +
 arch/arm/include/asm/kvm_emulate.h   |  5 +
 arch/arm/include/asm/kvm_host.h  |  8 +++
 arch/arm64/include/asm/kvm_emulate.h |  5 +
 arch/arm64/include/asm/kvm_host.h|  8 +++
 include/uapi/linux/kvm.h |  7 ++
 virt/kvm/arm/arm.c   | 21 ++
 virt/kvm/arm/mmio.c  |  9 +++-
 9 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index 4833904d32a5..7403f15657c2 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -4468,6 +4468,39 @@ Hyper-V SynIC state change. Notification is used to 
remap SynIC
 event/message pages and to enable/disable SynIC messages/events processing
 in userspace.
 
+   /* KVM_EXIT_ARM_NISV */
+   struct {
+   __u64 esr_iss;
+   __u64 fault_ipa;
+   } arm_nisv;
+
+Used on arm and arm64 systems. If a guest accesses memory not in a memslot,
+KVM will typically return to userspace and ask it to do MMIO emulation on its
+behalf. However, for certain classes of instructions, no instruction decode
+(direction, length of memory access) is provided, and fetching and decoding
+the instruction from the VM is overly complicated to live in the kernel.
+
+Historically, when this situation occurred, KVM would print a warning and kill
+the VM. KVM assumed that if the guest accessed non-memslot memory, it was
+trying to do I/O, which just couldn't be emulated, and the warning message was
+phrased accordingly. However, what happened more often was that a guest bug
+caused access outside the guest memory areas which should lead to a more
+meaningful warning message and an external abort in the guest, if the access
+did not fall within an I/O window.
+
+Userspace implementations can query for KVM_CAP_ARM_NISV_TO_USER, and enable
+this capability at VM creation. Once this is done, these types of errors will
+instead return to userspace with KVM_EXIT_ARM_NISV, with the valid bits from
+the HSR (arm) and ESR_EL2 (arm64) in the esr_iss field, and the faulting IPA
+in the fault_ipa field. Userspace can either fix up the access if it's
+actually an I/O access by decoding the instruction from guest memory (if it's
+very brave) and continue executing the guest, or it can decide to suspend,
+dump, or restart the guest.
+
+Note that KVM does not skip the faulting instruction as it does for
+KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state
+if it decides to decode and emulate the instruction.
+
/* Fix the size of the union. */
char padding[256];
};
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index

[kvmtool v2 3/5] update headers: Update the KVM headers for new Arm fault reporting features

2019-10-08 Thread Christoffer Dall
In preparation for improving our handling of guest aborts with missing
decode info or outside any mapped resource, sync updated Linux header
files.

NOTE: This is a development update and these headers are not yet in an
upstream tree.  DO NOT MERGE.

Signed-off-by: Christoffer Dall 
---
 arm/aarch32/include/asm/kvm.h | 3 ++-
 arm/aarch64/include/asm/kvm.h | 3 ++-
 include/linux/kvm.h   | 8 
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arm/aarch32/include/asm/kvm.h b/arm/aarch32/include/asm/kvm.h
index 4602464..b450900 100644
--- a/arm/aarch32/include/asm/kvm.h
+++ b/arm/aarch32/include/asm/kvm.h
@@ -131,8 +131,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/arm/aarch64/include/asm/kvm.h b/arm/aarch64/include/asm/kvm.h
index 97c3478..e4cf9bd 100644
--- a/arm/aarch64/include/asm/kvm.h
+++ b/arm/aarch64/include/asm/kvm.h
@@ -160,8 +160,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6d4ea4b..fadebb4 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -235,6 +235,7 @@ struct kvm_hyperv_exit {
 #define KVM_EXIT_S390_STSI25
 #define KVM_EXIT_IOAPIC_EOI   26
 #define KVM_EXIT_HYPERV   27
+#define KVM_EXIT_ARM_NISV 28
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -392,6 +393,11 @@ struct kvm_run {
} eoi;
/* KVM_EXIT_HYPERV */
struct kvm_hyperv_exit hyperv;
+   /* KVM_EXIT_ARM_NISV */
+   struct {
+   __u64 esr_iss;
+   __u64 fault_ipa;
+   } arm_nisv;
/* Fix the size of the union. */
char padding[256];
};
@@ -988,6 +994,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_VM_IPA_SIZE 165
 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
 #define KVM_CAP_HYPERV_CPUID 167
+#define KVM_CAP_ARM_NISV_TO_USER 176
+#define KVM_CAP_ARM_INJECT_EXT_DABT 177
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.18.0



[kvmtool v2 5/5] arm: Inject external data aborts when accessing holes in the memory map

2019-10-08 Thread Christoffer Dall
Occasionally guests will attempt to access parts of the guest memory map
where there is... nothing at all.  Until now, we've handled this by
either forcefully killing the guest, or silently (unless a debug option
was enabled) ignoring the access.  Neither is very helpful to a user,
who is most likely running either a broken or misconfigured guest.

A more appropriate action is to inject an external abort to the guest.
Luckily, with KVM_CAP_ARM_INJECT_EXT_DABT, we can use the set event
mechanism and ask KVM to do this for us.

So we add an architecture-specific hook to handle accesses to MMIO
regions which cannot be found, and allow them to report whether the
invalid access was handled or not.

Signed-off-by: Christoffer Dall 
---
 arm/include/arm-common/kvm-cpu-arch.h | 16 
 arm/kvm-cpu.c |  2 +-
 mips/include/kvm/kvm-cpu-arch.h   |  5 +
 mmio.c|  3 ++-
 powerpc/include/kvm/kvm-cpu-arch.h|  5 +
 x86/include/kvm/kvm-cpu-arch.h|  5 +
 6 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/arm/include/arm-common/kvm-cpu-arch.h 
b/arm/include/arm-common/kvm-cpu-arch.h
index 923d2c4..33defa2 100644
--- a/arm/include/arm-common/kvm-cpu-arch.h
+++ b/arm/include/arm-common/kvm-cpu-arch.h
@@ -57,6 +57,22 @@ static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu 
*vcpu, u64 phys_addr,
return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
 }
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   struct kvm_vcpu_events events = {
+   .exception.ext_dabt_pending = 1,
+   };
+   int err;
+
+   if (!kvm__supports_extension(vcpu->kvm, KVM_CAP_ARM_INJECT_EXT_DABT))
+   return false;
+
+   err = ioctl(vcpu->vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
+   if (err)
+   die("failed to inject external abort");
+   return true;
+}
+
 unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu);
 
 #endif /* ARM_COMMON__KVM_CPU_ARCH_H */
diff --git a/arm/kvm-cpu.c b/arm/kvm-cpu.c
index 25bd3ed..321a3e4 100644
--- a/arm/kvm-cpu.c
+++ b/arm/kvm-cpu.c
@@ -142,7 +142,7 @@ bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
 
if (!arm_addr_in_ioport_region(phys_addr) &&
!kvm__mmio_exists(vcpu, phys_addr))
-   die("Guest accessed memory outside RAM and IO ranges");
+   return kvm_cpu__mmio_not_found(vcpu, phys_addr);
 
/*
 * We cannot fetch and decode instructions from a KVM guest,
diff --git a/mips/include/kvm/kvm-cpu-arch.h b/mips/include/kvm/kvm-cpu-arch.h
index 45e69f6..512ab34 100644
--- a/mips/include/kvm/kvm-cpu-arch.h
+++ b/mips/include/kvm/kvm-cpu-arch.h
@@ -40,4 +40,9 @@ static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu 
*vcpu, u64 phys_addr, u8
return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
 }
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   return false;
+}
+
 #endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/mmio.c b/mmio.c
index 2ab7fa7..d6df303 100644
--- a/mmio.c
+++ b/mmio.c
@@ -130,7 +130,8 @@ bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, 
u8 *data, u32 len, u
if (mmio)
mmio->mmio_fn(vcpu, phys_addr, data, len, is_write, mmio->ptr);
else {
-   if (vcpu->kvm->cfg.mmio_debug)
+   if (!kvm_cpu__mmio_not_found(vcpu, phys_addr) &&
+   vcpu->kvm->cfg.mmio_debug)
fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx (length %u)\n",
to_direction(is_write),
(unsigned long long)phys_addr, len);
diff --git a/powerpc/include/kvm/kvm-cpu-arch.h 
b/powerpc/include/kvm/kvm-cpu-arch.h
index a69e0cc..64b69b1 100644
--- a/powerpc/include/kvm/kvm-cpu-arch.h
+++ b/powerpc/include/kvm/kvm-cpu-arch.h
@@ -76,4 +76,9 @@ static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, 
u16 port, void *dat
 
 bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 
len, u8 is_write);
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   return false;
+}
+
 #endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/x86/include/kvm/kvm-cpu-arch.h b/x86/include/kvm/kvm-cpu-arch.h
index 05e5bb6..10cbe6e 100644
--- a/x86/include/kvm/kvm-cpu-arch.h
+++ b/x86/include/kvm/kvm-cpu-arch.h
@@ -47,4 +47,9 @@ static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu 
*vcpu, u64 phys_addr, u8
return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
 }
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   return false;
+}
+
 #endif /* KVM__KVM_CPU_ARCH_H */
-- 
2.18.0


[PATCH v2 2/2] KVM: arm/arm64: Allow user injection of external data aborts

2019-10-08 Thread Christoffer Dall
In some scenarios, such as buggy guest or incorrect configuration of the
VMM and firmware description data, userspace will detect a memory access
to a portion of the IPA, which is not mapped to any MMIO region.

For this purpose, the appropriate action is to inject an external abort
to the guest.  The kernel already has functionality to inject an
external abort, but we need to wire up a signal from user space that
lets user space tell the kernel to do this.

It turns out, we already have the set event functionality which we can
perfectly reuse for this.

Signed-off-by: Christoffer Dall 
---
 Documentation/virt/kvm/api.txt| 18 +-
 arch/arm/include/uapi/asm/kvm.h   |  3 ++-
 arch/arm/kvm/guest.c  |  3 +++
 arch/arm64/include/uapi/asm/kvm.h |  3 ++-
 arch/arm64/kvm/guest.c|  3 +++
 arch/arm64/kvm/inject_fault.c |  4 ++--
 include/uapi/linux/kvm.h  |  1 +
 virt/kvm/arm/arm.c|  1 +
 8 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index 7403f15657c2..10ebe8cfda29 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -968,6 +968,8 @@ The following bits are defined in the flags field:
 
 ARM/ARM64:
 
+User space may need to inject several types of events to the guest.
+
 If the guest accesses a device that is being emulated by the host kernel in
 such a way that a real device would generate a physical SError, KVM may make
 a virtual SError pending for that VCPU. This system error interrupt remains
@@ -1002,12 +1004,26 @@ Specifying exception.has_esr on a system that does not 
support it will return
 -EINVAL. Setting anything other than the lower 24bits of exception.serror_esr
 will return -EINVAL.
 
+If the guest performed an access to I/O memory which could not be handled by
+userspace, for example because of missing instruction syndrome decode
+information or because there is no device mapped at the accessed IPA, then
+userspace can ask the kernel to inject an external abort using the address
+from the exiting fault on the VCPU. It is a programming error to set
+ext_dabt_pending at the same time as any of the serror fields, or to set
+ext_dabt_pending after an exit which was not either KVM_EXIT_MMIO or
+KVM_EXIT_ARM_NISV. This feature is only available if the system supports
+KVM_CAP_ARM_INJECT_EXT_DABT. This is a helper which provides commonality in
+how userspace reports accesses for the above cases to guests, across different
+userspace implementations. Nevertheless, userspace can still emulate all Arm
+exceptions by manipulating individual registers using the KVM_SET_ONE_REG API.
+
 struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 2769360f195c..03cd7c19a683 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -131,8 +131,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index 684cf64b4033..4154c5589501 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -263,11 +263,14 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 {
bool serror_pending = events->exception.serror_pending;
bool has_esr = events->exception.serror_has_esr;
+   bool has_ext_dabt_pending = events->exception.ext_dabt_pending;
 
if (serror_pending && has_esr)
return -EINVAL;
else if (serror_pending)
kvm_inject_vabt(vcpu);
+   else if (has_ext_dabt_pending)
+   kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
 
return 0;
 }
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index 67c21f9bdbad..d49c17a80491 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -164,8 +164,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index dfd626447482..10e6e2144dca 100644
--- a/arch/arm64/kvm/g

[kvmtool v2 4/5] arm: Handle exits from undecoded load/store instructions

2019-10-08 Thread Christoffer Dall
KVM occasionally encounters guests that attempt to access memory outside
the registered RAM memory slots using instructions that don't provide
decoding information in the ESR_EL2 (the ISV bit is not set), and
historically this has led to the kernel printing a confusing error
message in dmesg and returning -ENOSYS from KVM_RUN.

KVM/Arm now has KVM_CAP_ARM_NISV_TO_USER, which can be enabled from
userspace, and which allows us to handle this with a little bit more
helpful information to the user.  For example, we can at least tell the
user if the guest just hit a hole in the guest's memory map, or if this
appeared to be an attempt at doing MMIO.

Signed-off-by: Christoffer Dall 
---
 arm/kvm-cpu.c | 20 +++-
 arm/kvm.c |  8 
 include/kvm/kvm.h |  1 +
 kvm.c |  1 +
 mmio.c| 11 +++
 5 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/arm/kvm-cpu.c b/arm/kvm-cpu.c
index 7780251..25bd3ed 100644
--- a/arm/kvm-cpu.c
+++ b/arm/kvm-cpu.c
@@ -136,7 +136,25 @@ void kvm_cpu__delete(struct kvm_cpu *vcpu)
 
 bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
 {
-   return false;
+   switch (vcpu->kvm_run->exit_reason) {
+   case KVM_EXIT_ARM_NISV: {
+   u64 phys_addr = vcpu->kvm_run->arm_nisv.fault_ipa;
+
+   if (!arm_addr_in_ioport_region(phys_addr) &&
+   !kvm__mmio_exists(vcpu, phys_addr))
+   die("Guest accessed memory outside RAM and IO ranges");
+
+   /*
+* We cannot fetch and decode instructions from a KVM guest,
+* which used a load/store instruction that doesn't get
+* decoded in the ESR towards an I/O device, so we have no
+* choice but to exit to the user with an error.
+*/
+   die("Guest accessed I/O device with unsupported load/store 
instruction");
+   }
+   default:
+   return false;
+   }
 }
 
 void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
diff --git a/arm/kvm.c b/arm/kvm.c
index 1f85fc6..2572ac2 100644
--- a/arm/kvm.c
+++ b/arm/kvm.c
@@ -59,6 +59,8 @@ void kvm__arch_set_cmdline(char *cmdline, bool video)
 
 void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
 {
+   struct kvm_enable_cap enable_cap = { .flags = 0 };
+
/*
 * Allocate guest memory. We must align our buffer to 64K to
 * correlate with the maximum guest page size for virtio-mmio.
@@ -83,6 +85,12 @@ void kvm__arch_init(struct kvm *kvm, const char 
*hugetlbfs_path, u64 ram_size)
madvise(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size,
MADV_HUGEPAGE);
 
+   if (kvm__supports_extension(kvm, KVM_CAP_ARM_NISV_TO_USER)) {
+   enable_cap.cap = KVM_CAP_ARM_NISV_TO_USER;
+   if (ioctl(kvm->vm_fd, KVM_ENABLE_CAP, &enable_cap) < 0)
+   die("unable to enable NISV_TO_USER capability");
+   }
+
/* Create the virtual GIC. */
if (gic__create(kvm, kvm->cfg.arch.irqchip))
die("Failed to create virtual GIC");
diff --git a/include/kvm/kvm.h b/include/kvm/kvm.h
index 7a73818..05d90ee 100644
--- a/include/kvm/kvm.h
+++ b/include/kvm/kvm.h
@@ -107,6 +107,7 @@ bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void 
*data, int direction,
 bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, 
u8 is_write);
 int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void 
*userspace_addr,
  enum kvm_mem_type type);
+bool kvm__mmio_exists(struct kvm_cpu *vcpu, u64 phys_addr);
 static inline int kvm__register_ram(struct kvm *kvm, u64 guest_phys, u64 size,
void *userspace_addr)
 {
diff --git a/kvm.c b/kvm.c
index 57c4ff9..03ec43f 100644
--- a/kvm.c
+++ b/kvm.c
@@ -55,6 +55,7 @@ const char *kvm_exit_reasons[] = {
 #ifdef CONFIG_PPC64
DEFINE_KVM_EXIT_REASON(KVM_EXIT_PAPR_HCALL),
 #endif
+   DEFINE_KVM_EXIT_REASON(KVM_EXIT_ARM_NISV),
 };
 
 static int pause_event;
diff --git a/mmio.c b/mmio.c
index 61e1d47..2ab7fa7 100644
--- a/mmio.c
+++ b/mmio.c
@@ -139,3 +139,14 @@ bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 
phys_addr, u8 *data, u32 len, u
 
return true;
 }
+
+bool kvm__mmio_exists(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   struct mmio_mapping *mmio;
+
+   br_read_lock(vcpu->kvm);
+   mmio = mmio_search(&mmio_tree, phys_addr, 1);
+   br_read_unlock(vcpu->kvm);
+
+   return mmio != NULL;
+}
-- 
2.18.0



Re: [PATCH 2/2] KVM: arm/arm64: Allow user injection of external data aborts

2019-10-08 Thread Christoffer Dall
On Thu, Sep 26, 2019 at 03:09:11PM +0100, Marc Zyngier wrote:
> On 09/09/2019 13:13, Christoffer Dall wrote:
> > In some scenarios, such as buggy guest or incorrect configuration of the
> > VMM and firmware description data, userspace will detect a memory access
> > to a portion of the IPA, which is not mapped to any MMIO region.
> > 
> > For this purpose, the appropriate action is to inject an external abort
> > to the guest.  The kernel already has functionality to inject an
> > external abort, but we need to wire up a signal from user space that
> > lets user space tell the kernel to do this.
> > 
> > It turns out, we already have the set event functionality which we can
> > perfectly reuse for this.
> > 
> > Signed-off-by: Christoffer Dall 
> > ---
> >  Documentation/virt/kvm/api.txt| 15 ++-
> >  arch/arm/include/uapi/asm/kvm.h   |  3 ++-
> >  arch/arm/kvm/guest.c  |  3 +++
> >  arch/arm64/include/uapi/asm/kvm.h |  3 ++-
> >  arch/arm64/kvm/guest.c|  3 +++
> >  arch/arm64/kvm/inject_fault.c |  4 ++--
> >  include/uapi/linux/kvm.h  |  1 +
> >  virt/kvm/arm/arm.c|  1 +
> >  8 files changed, 28 insertions(+), 5 deletions(-)
> > 
> > diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
> > index 02501333f746..edd6cdc470ca 100644
> > --- a/Documentation/virt/kvm/api.txt
> > +++ b/Documentation/virt/kvm/api.txt
> > @@ -955,6 +955,8 @@ The following bits are defined in the flags field:
> >  
> >  ARM/ARM64:
> >  
> > +User space may need to inject several types of events to the guest.
> > +
> >  If the guest accesses a device that is being emulated by the host kernel in
> >  such a way that a real device would generate a physical SError, KVM may 
> > make
> >  a virtual SError pending for that VCPU. This system error interrupt remains
> > @@ -989,12 +991,23 @@ Specifying exception.has_esr on a system that does 
> > not support it will return
> >  -EINVAL. Setting anything other than the lower 24bits of 
> > exception.serror_esr
> >  will return -EINVAL.
> >  
> > +If the guest performed an access to I/O memory which could not be handled 
> > by
> > +user space, for example because of missing instruction syndrome decode
> > +information or because there is no device mapped at the accessed IPA, then
> > +user space can ask the kernel to inject an external abort using the address
> > +from the exiting fault on the VCPU. It is a programming error to set
> > +ext_dabt_pending at the same time as any of the serror fields, or to set
> > +ext_dabt_pending on an exit which was not either KVM_EXIT_MMIO or
> 
> ... on *re-entry from* an exit?
> 
> > +KVM_EXIT_ARM_NISV. This feature is only available if the system supports
> > +KVM_CAP_ARM_INJECT_EXT_DABT;
> 
> s/;/./
> 
> Can we add some wording to the fact that this is only a helper for the
> most common case? Most of the ARM exceptions can otherwise be
> constructed/injected using the SET_ONE_REG API.
> 
> > +
> >  struct kvm_vcpu_events {
> > struct {
> > __u8 serror_pending;
> > __u8 serror_has_esr;
> > +   __u8 ext_dabt_pending;
> > /* Align it to 8 bytes */
> > -   __u8 pad[6];
> > +   __u8 pad[5];
> > __u64 serror_esr;
> > } exception;
> > __u32 reserved[12];
> > diff --git a/arch/arm/include/uapi/asm/kvm.h 
> > b/arch/arm/include/uapi/asm/kvm.h
> > index a4217c1a5d01..d2449a5bf8d5 100644
> > --- a/arch/arm/include/uapi/asm/kvm.h
> > +++ b/arch/arm/include/uapi/asm/kvm.h
> > @@ -131,8 +131,9 @@ struct kvm_vcpu_events {
> > struct {
> > __u8 serror_pending;
> > __u8 serror_has_esr;
> > +   __u8 ext_dabt_pending;
> > /* Align it to 8 bytes */
> > -   __u8 pad[6];
> > +   __u8 pad[5];
> > __u64 serror_esr;
> > } exception;
> > __u32 reserved[12];
> > diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
> > index 684cf64b4033..4154c5589501 100644
> > --- a/arch/arm/kvm/guest.c
> > +++ b/arch/arm/kvm/guest.c
> > @@ -263,11 +263,14 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
> >  {
> > bool serror_pending = events->exception.serror_pending;
> > bool has_esr = events->exception.serror_has_esr;
> > +   bool has_ext_dabt_pending = events->exception.ext_dabt_pending;
> >  
> > if (serror_pendi

Re: [PATCH 1/2] KVM: arm/arm64: Allow reporting non-ISV data aborts to userspace

2019-10-08 Thread Christoffer Dall
On Tue, Oct 01, 2019 at 06:21:43PM +0100, James Morse wrote:
> Hi Christoffer,
> 
> On 09/09/2019 13:13, Christoffer Dall wrote:
> > For a long time, if a guest accessed memory outside of a memslot using
> > any of the load/store instructions in the architecture which doesn't
> > supply decoding information in the ESR_EL2 (the ISV bit is not set), the
> > kernel would print the following message and terminate the VM as a
> > result of returning -ENOSYS to userspace:
> > 
> >   load/store instruction decoding not implemented
> > 
> > The reason behind this message is that KVM assumes that all accesses
> > outside a memslot is an MMIO access which should be handled by
> > userspace, and we originally expected to eventually implement some sort
> > of decoding of load/store instructions where the ISV bit was not set.
> 
> > However, it turns out that many of the instructions which don't provide
> > decoding information on abort are not safe to use for MMIO accesses, and
> > the remaining few that would potentially make sense to use on MMIO
> > accesses, such as those with register writeback, are not used in
> > practice.  It also turns out that fetching an instruction from guest
> > memory can be a pretty horrible affair, involving stopping all CPUs on
> > SMP systems, handling multiple corner cases of address translation in
> > software, and more.  It doesn't appear likely that we'll ever implement
> > this in the kernel.
> 
> > What is much more common is that a user has misconfigured his/her guest
> > and is actually not accessing an MMIO region, but just hitting some
> > random hole in the IPA space.  In this scenario, the error message above
> > is almost misleading and has led to a great deal of confusion over the
> > years.
> > 
> > It is, nevertheless, ABI to userspace, and we therefore need to
> > introduce a new capability that userspace explicitly enables to change
> > behavior.
> > 
> > This patch introduces KVM_CAP_ARM_NISV_TO_USER (NISV meaning Non-ISV)
> > which does exactly that, and introduces a new exit reason to report the
> > event to userspace.  User space can then emulate an exception to the
> > guest, restart the guest, suspend the guest, or take any other
> > appropriate action as per the policy of the running system.
> 
> 
> 
> > diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
> > index 2d067767b617..02501333f746 100644
> > --- a/Documentation/virt/kvm/api.txt
> > +++ b/Documentation/virt/kvm/api.txt
> > @@ -4453,6 +4453,35 @@ Hyper-V SynIC state change. Notification is used to 
> > remap SynIC
> >  event/message pages and to enable/disable SynIC messages/events processing
> >  in userspace.
> >  
> > +   /* KVM_EXIT_ARM_NISV */
> > +   struct {
> > +   __u64 esr_iss;
> > +   __u64 fault_ipa;
> > +   } arm_nisv;
> > +
> > +Used on arm and arm64 systems. If a guest accesses memory not in a memslot,
> > +KVM will typically return to userspace and ask it to do MMIO emulation on 
> > its
> > +behalf. However, for certain classes of instructions, no instruction decode
> > +(direction, length of memory access) is provided, and fetching and decoding
> > +the instruction from the VM is overly complicated to live in the kernel.
> > +
> > +Historically, when this situation occurred, KVM would print a warning and 
> > kill
> > +the VM. KVM assumed that if the guest accessed non-memslot memory, it was
> > +trying to do I/O, which just couldn't be emulated, and the warning message 
> > was
> > +phrased accordingly. However, what happened more often was that a guest bug
> > +caused access outside the guest memory areas which should lead to a more
> > +mearningful warning message and an external abort in the guest, if the 
> > access
> > +did not fall within an I/O window.
> > +
> > +Userspace implementations can query for KVM_CAP_ARM_NISV_TO_USER, and 
> > enable
> > +this capability at VM creation. Once this is done, these types of errors 
> > will
> > +instead return to userspace with KVM_EXIT_ARM_NISV, with the valid bits 
> > from
> > +the HSR (arm) and ESR_EL2 (arm64) in the esr_iss field, and the faulting 
> > IPA
> > +in the fault_ipa field. Userspace can either fix up the access if it's
> > +actually an I/O access by decoding the instruction from guest memory (if 
> > it's
> > +very brave) and continue executing the guest, or it can decide to suspend,
> > +dump, or restart the guest.
> 
> S

Re: [PATCH 1/2] KVM: arm/arm64: Allow reporting non-ISV data aborts to userspace

2019-10-08 Thread Christoffer Dall
On Thu, Sep 26, 2019 at 02:47:55PM +0100, Marc Zyngier wrote:
> On 09/09/2019 13:13, Christoffer Dall wrote:
> > For a long time, if a guest accessed memory outside of a memslot using
> > any of the load/store instructions in the architecture which doesn't
> > supply decoding information in the ESR_EL2 (the ISV bit is not set), the
> > kernel would print the following message and terminate the VM as a
> > result of returning -ENOSYS to userspace:
> > 
> >   load/store instruction decoding not implemented
> > 
> > The reason behind this message is that KVM assumes that all accesses
> > outside a memslot is an MMIO access which should be handled by
> > userspace, and we originally expected to eventually implement some sort
> > of decoding of load/store instructions where the ISV bit was not set.
> > 
> > However, it turns out that many of the instructions which don't provide
> > decoding information on abort are not safe to use for MMIO accesses, and
> > the remaining few that would potentially make sense to use on MMIO
> > accesses, such as those with register writeback, are not used in
> > practice.  It also turns out that fetching an instruction from guest
> > memory can be a pretty horrible affair, involving stopping all CPUs on
> > SMP systems, handling multiple corner cases of address translation in
> > software, and more.  It doesn't appear likely that we'll ever implement
> > this in the kernel.
> > 
> > What is much more common is that a user has misconfigured his/her guest
> > and is actually not accessing an MMIO region, but just hitting some
> > random hole in the IPA space.  In this scenario, the error message above
> > is almost misleading and has led to a great deal of confusion over the
> > years.
> > 
> > It is, nevertheless, ABI to userspace, and we therefore need to
> > introduce a new capability that userspace explicitly enables to change
> > behavior.
> > 
> > This patch introduces KVM_CAP_ARM_NISV_TO_USER (NISV meaning Non-ISV)
> > which does exactly that, and introduces a new exit reason to report the
> > event to userspace.  User space can then emulate an exception to the
> > guest, restart the guest, suspend the guest, or take any other
> > appropriate action as per the policy of the running system.
> > 
> > Reported-by: Heinrich Schuchardt 
> > Signed-off-by: Christoffer Dall 
> > ---
> >  Documentation/virt/kvm/api.txt   | 29 
> >  arch/arm/include/asm/kvm_arm.h   |  2 ++
> >  arch/arm/include/asm/kvm_emulate.h   |  5 +
> >  arch/arm/include/asm/kvm_host.h  |  8 
> >  arch/arm64/include/asm/kvm_emulate.h |  5 +
> >  arch/arm64/include/asm/kvm_host.h|  8 
> >  include/uapi/linux/kvm.h |  7 +++
> >  virt/kvm/arm/arm.c   | 21 
> >  virt/kvm/arm/mmio.c  | 11 +--
> >  9 files changed, 94 insertions(+), 2 deletions(-)
> > 
> > diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
> > index 2d067767b617..02501333f746 100644
> > --- a/Documentation/virt/kvm/api.txt
> > +++ b/Documentation/virt/kvm/api.txt
> > @@ -4453,6 +4453,35 @@ Hyper-V SynIC state change. Notification is used to 
> > remap SynIC
> >  event/message pages and to enable/disable SynIC messages/events processing
> >  in userspace.
> >  
> > +   /* KVM_EXIT_ARM_NISV */
> > +   struct {
> > +   __u64 esr_iss;
> > +   __u64 fault_ipa;
> > +   } arm_nisv;
> > +
> > +Used on arm and arm64 systems. If a guest accesses memory not in a memslot,
> > +KVM will typically return to userspace and ask it to do MMIO emulation on 
> > its
> > +behalf. However, for certain classes of instructions, no instruction decode
> > +(direction, length of memory access) is provided, and fetching and decoding
> > +the instruction from the VM is overly complicated to live in the kernel.
> > +
> > +Historically, when this situation occurred, KVM would print a warning and 
> > kill
> > +the VM. KVM assumed that if the guest accessed non-memslot memory, it was
> > +trying to do I/O, which just couldn't be emulated, and the warning message 
> > was
> > +phrased accordingly. However, what happened more often was that a guest bug
> > +caused access outside the guest memory areas which should lead to a more
> > +mearningful warning message and an external abort in the guest, if the 
> > access
> 
> meaningful?
> 
> > +did 

Re: [PATCH 2/2] KVM: arm/arm64: Allow user injection of external data aborts

2019-09-09 Thread Christoffer Dall
On Mon, Sep 09, 2019 at 04:56:23PM +0100, Peter Maydell wrote:
> On Mon, 9 Sep 2019 at 16:16, Christoffer Dall  
> wrote:
> >
> > On Mon, Sep 09, 2019 at 01:32:46PM +0100, Peter Maydell wrote:
> > > This API seems to be missing support for userspace to specify
> > > whether the ESR_ELx for the guest should have the EA bit set
> > > (and more generally other syndrome/fault status bits). I think
> > > if we have an API for "KVM_EXIT_MMIO but the access failed"
> > > then it should either (a) be architecture agnostic, since
> > > pretty much any architecture might have a concept of "access
> > > gave some bus-error-type failure" and it would be nice if userspace
> > > didn't have to special case them all in arch-specific code,
> > > or (b) have the same flexibility for specifying exactly what
> > > kind of fault as the architecture does. This sort of seems to
> > > fall between two stools. (My ideal for KVM_EXIT_MMIO faults
> > > would be a generic API which included space for optional
> > > arch-specific info, which for Arm would pretty much just be
> > > the EA bit.)
> >
> > I'm not sure I understand exactly what would be improved by making this
> > either more architecture specific or more architecture generic.  The
> > EA bit will always be set, that's why the field is called
> > 'ext_dabt_pending'.
> 
> ESR_EL1.EA doesn't mean "this is an external abort". It means
> "given that this is an external abort as indicated by ESR_EL1.DFSC,
> specify the external abort type". Traditionally this is 0 for
> an AXI bus Decode error ("interconnect says there's nothing there")
> and 1 for a Slave error ("there's something there but it told us
> to go away"), though architecturally it's specified as impdef
> because not everybody uses AXI. In QEMU we track the difference
> between these two things and for TCG will raise external aborts
> with the correct EA bit value.
> 

Ah, I missed that.  I don't think we want to allow userspace to supply
any implementation defined values for the VM, though.

> > I thought as per the previous discussion, that we were specifically
> > trying to avoid userspace emulating the exception in detail, so I
> > designed this to provide the minimal effort API for userspace.
> >
> > Since we already have an architecture specific ioctl, kvm_vcpu_events, I
> > don't think we're painting ourselves into a corner by using that.  Is a
> > natural consequence of what you're saying not that we should try to make
> > that whole call architecture generic?
> >
> > Unless we already have specific examples of how other architectures
> > would want to use something like this, and given the impact of this
> > patch, I'm not sure it's worth trying to speculate about that.
> 
> In QEMU, use of a generic API would look something like
> this in kvm-all.c:
> 
> case KVM_EXIT_MMIO:
> DPRINTF("handle_mmio\n");
> /* Called outside BQL */
> MemTxResult res;
> 
> res = address_space_rw(&address_space_memory,
>                        run->mmio.phys_addr, attrs,
>                        run->mmio.data,
>                        run->mmio.len,
>                        run->mmio.is_write);
> if (res != MEMTX_OK) {
> /* tell the kernel the access failed, eg
>  * by updating the kvm_run struct to say so
>  */
> } else {
> /* access passed, we have updated the kvm_run
>  * struct's mmio subfield, proceed as usual
>  */
> }
> ret = 0;
> break;
> 
> [this is exactly the current QEMU code except that today
> we throw away the 'res' that tells us if the transaction
> succeeded because we have no way to report it to KVM and
> effectively always RAZ/WI the access.]
> 
> This is nice because you don't need anything here that has to do
> "bail out to architecture specific handling of anything",
> you just say "nope, the access failed", and let the kernel handle
> that however the CPU would handle it. It just immediately works
> for all architectures on the userspace side (assuming the kernel
> defaults to not actually trying to report an abort to the guest
> if nobody's implemented that on the kernel side, which is exactly
> what happens today where there's no way to report the error for
> any architecture).
> The downside is that you lose the ability to be more specific about
> ar

Re: [PATCH 2/2] KVM: arm/arm64: Allow user injection of external data aborts

2019-09-09 Thread Christoffer Dall
On Mon, Sep 09, 2019 at 01:32:46PM +0100, Peter Maydell wrote:
> On Mon, 9 Sep 2019 at 13:13, Christoffer Dall  
> wrote:
> >
> > In some scenarios, such as buggy guest or incorrect configuration of the
> > VMM and firmware description data, userspace will detect a memory access
> > to a portion of the IPA, which is not mapped to any MMIO region.
> >
> > For this purpose, the appropriate action is to inject an external abort
> > to the guest.  The kernel already has functionality to inject an
> > external abort, but we need to wire up a signal from user space that
> > lets user space tell the kernel to do this.
> >
> > It turns out, we already have the set event functionality which we can
> > perfectly reuse for this.
> >
> > Signed-off-by: Christoffer Dall 
> > ---
> >  Documentation/virt/kvm/api.txt| 15 ++-
> >  arch/arm/include/uapi/asm/kvm.h   |  3 ++-
> >  arch/arm/kvm/guest.c  |  3 +++
> >  arch/arm64/include/uapi/asm/kvm.h |  3 ++-
> >  arch/arm64/kvm/guest.c|  3 +++
> >  arch/arm64/kvm/inject_fault.c |  4 ++--
> >  include/uapi/linux/kvm.h  |  1 +
> >  virt/kvm/arm/arm.c|  1 +
> >  8 files changed, 28 insertions(+), 5 deletions(-)
> >
> > diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
> > index 02501333f746..edd6cdc470ca 100644
> > --- a/Documentation/virt/kvm/api.txt
> > +++ b/Documentation/virt/kvm/api.txt
> > @@ -955,6 +955,8 @@ The following bits are defined in the flags field:
> >
> >  ARM/ARM64:
> >
> > +User space may need to inject several types of events to the guest.
> > +
> >  If the guest accesses a device that is being emulated by the host kernel in
> >  such a way that a real device would generate a physical SError, KVM may 
> > make
> >  a virtual SError pending for that VCPU. This system error interrupt remains
> > @@ -989,12 +991,23 @@ Specifying exception.has_esr on a system that does 
> > not support it will return
> >  -EINVAL. Setting anything other than the lower 24bits of 
> > exception.serror_esr
> >  will return -EINVAL.
> >
> > +If the guest performed an access to I/O memory which could not be handled 
> > by
> > +user space, for example because of missing instruction syndrome decode
> > +information or because there is no device mapped at the accessed IPA, then
> > +user space can ask the kernel to inject an external abort using the address
> > +from the exiting fault on the VCPU. It is a programming error to set
> > +ext_dabt_pending at the same time as any of the serror fields, or to set
> > +ext_dabt_pending on an exit which was not either KVM_EXIT_MMIO or
> > +KVM_EXIT_ARM_NISV. This feature is only available if the system supports
> > +KVM_CAP_ARM_INJECT_EXT_DABT;
> > +
> >  struct kvm_vcpu_events {
> > struct {
> > __u8 serror_pending;
> > __u8 serror_has_esr;
> > +   __u8 ext_dabt_pending;
> > /* Align it to 8 bytes */
> > -   __u8 pad[6];
> > +   __u8 pad[5];
> > __u64 serror_esr;
> > } exception;
> > __u32 reserved[12];
> 
> This API seems to be missing support for userspace to specify
> whether the ESR_ELx for the guest should have the EA bit set
> (and more generally other syndrome/fault status bits). I think
> if we have an API for "KVM_EXIT_MMIO but the access failed"
> then it should either (a) be architecture agnostic, since
> pretty much any architecture might have a concept of "access
> gave some bus-error-type failure" and it would be nice if userspace
> didn't have to special case them all in arch-specific code,
> or (b) have the same flexibility for specifying exactly what
> kind of fault as the architecture does. This sort of seems to
> fall between two stools. (My ideal for KVM_EXIT_MMIO faults
> would be a generic API which included space for optional
> arch-specific info, which for Arm would pretty much just be
> the EA bit.)

I'm not sure I understand exactly what would be improved by making this
either more architecture specific or more architecture generic.  The
EA bit will always be set, that's why the field is called
'ext_dabt_pending'.

I thought as per the previous discussion, that we were specifically
trying to avoid userspace emulating the exception in detail, so I
designed this to provide the minimal effort API for userspace.

Since we already have an architecture specific ioctl, kvm_vcpu_events, I
don't think we're painting ourselves into a corner by

[kvmtool PATCH 3/5] update headers: Update the KVM headers for new Arm fault reporting features

2019-09-09 Thread Christoffer Dall
In preparation for improving our handling of guest aborts with missing
decode info or outside any mapped resource, sync updated Linux header
files.

NOTE: This is a development update and these headers are not yet in an
upstream tree.  DO NOT MERGE.

Signed-off-by: Christoffer Dall 
---
 arm/aarch32/include/asm/kvm.h | 3 ++-
 arm/aarch64/include/asm/kvm.h | 3 ++-
 include/linux/kvm.h   | 8 
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arm/aarch32/include/asm/kvm.h b/arm/aarch32/include/asm/kvm.h
index 4602464..b450900 100644
--- a/arm/aarch32/include/asm/kvm.h
+++ b/arm/aarch32/include/asm/kvm.h
@@ -131,8 +131,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/arm/aarch64/include/asm/kvm.h b/arm/aarch64/include/asm/kvm.h
index 97c3478..e4cf9bd 100644
--- a/arm/aarch64/include/asm/kvm.h
+++ b/arm/aarch64/include/asm/kvm.h
@@ -160,8 +160,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6d4ea4b..6e2d2df 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -235,6 +235,7 @@ struct kvm_hyperv_exit {
 #define KVM_EXIT_S390_STSI        25
 #define KVM_EXIT_IOAPIC_EOI   26
 #define KVM_EXIT_HYPERV   27
+#define KVM_EXIT_ARM_NISV 28
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -392,6 +393,11 @@ struct kvm_run {
} eoi;
/* KVM_EXIT_HYPERV */
struct kvm_hyperv_exit hyperv;
+   /* KVM_EXIT_ARM_NISV */
+   struct {
+   __u64 esr_iss;
+   __u64 fault_ipa;
+   } arm_nisv;
/* Fix the size of the union. */
char padding[256];
};
@@ -988,6 +994,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_VM_IPA_SIZE 165
 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
 #define KVM_CAP_HYPERV_CPUID 167
+#define KVM_CAP_ARM_NISV_TO_USER 174
+#define KVM_CAP_ARM_INJECT_EXT_DABT 175
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.17.1



[kvmtool PATCH 5/5] arm: Inject external data aborts when accessing holes in the memory map

2019-09-09 Thread Christoffer Dall
Occasionally guests will attempt to access parts of the guest memory map
where there is... nothing at all.  Until now, we've handled this by
either forcefully killing the guest, or silently (unless a debug option
was enabled) ignoring the access.  Neither is very helpful to a user,
who is most likely running either a broken or misconfigured guest.

A more appropriate action is to inject an external abort to the guest.
Luckily, with KVM_CAP_ARM_INJECT_EXT_DABT, we can use the set event
mechanism and ask KVM to do this for us.

So we add an architecture specific hook to handle accesses to MMIO
regions which cannot be found, and allow them to return if the invalid
access was handled or not.

Signed-off-by: Christoffer Dall 
---
 arm/include/arm-common/kvm-cpu-arch.h | 16 
 arm/kvm-cpu.c |  2 +-
 mips/include/kvm/kvm-cpu-arch.h   |  5 +
 mmio.c|  3 ++-
 powerpc/include/kvm/kvm-cpu-arch.h|  5 +
 x86/include/kvm/kvm-cpu-arch.h|  5 +
 6 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/arm/include/arm-common/kvm-cpu-arch.h 
b/arm/include/arm-common/kvm-cpu-arch.h
index 923d2c4..33defa2 100644
--- a/arm/include/arm-common/kvm-cpu-arch.h
+++ b/arm/include/arm-common/kvm-cpu-arch.h
@@ -57,6 +57,22 @@ static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu 
*vcpu, u64 phys_addr,
return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
 }
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   struct kvm_vcpu_events events = {
+   .exception.ext_dabt_pending = 1,
+   };
+   int err;
+
+   if (!kvm__supports_extension(vcpu->kvm, KVM_CAP_ARM_INJECT_EXT_DABT))
+   return false;
+
+   err = ioctl(vcpu->vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
+   if (err)
+   die("failed to inject external abort");
+   return true;
+}
+
 unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu);
 
 #endif /* ARM_COMMON__KVM_CPU_ARCH_H */
diff --git a/arm/kvm-cpu.c b/arm/kvm-cpu.c
index 25bd3ed..321a3e4 100644
--- a/arm/kvm-cpu.c
+++ b/arm/kvm-cpu.c
@@ -142,7 +142,7 @@ bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
 
if (!arm_addr_in_ioport_region(phys_addr) &&
!kvm__mmio_exists(vcpu, phys_addr))
-   die("Guest accessed memory outside RAM and IO ranges");
+   return kvm_cpu__mmio_not_found(vcpu, phys_addr);
 
/*
 * We cannot fetch and decode instructions from a KVM guest,
diff --git a/mips/include/kvm/kvm-cpu-arch.h b/mips/include/kvm/kvm-cpu-arch.h
index 45e69f6..512ab34 100644
--- a/mips/include/kvm/kvm-cpu-arch.h
+++ b/mips/include/kvm/kvm-cpu-arch.h
@@ -40,4 +40,9 @@ static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu 
*vcpu, u64 phys_addr, u8
return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
 }
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   return false;
+}
+
 #endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/mmio.c b/mmio.c
index 2ab7fa7..d6df303 100644
--- a/mmio.c
+++ b/mmio.c
@@ -130,7 +130,8 @@ bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, 
u8 *data, u32 len, u
if (mmio)
mmio->mmio_fn(vcpu, phys_addr, data, len, is_write, mmio->ptr);
else {
-   if (vcpu->kvm->cfg.mmio_debug)
+   if (!kvm_cpu__mmio_not_found(vcpu, phys_addr) &&
+   vcpu->kvm->cfg.mmio_debug)
fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx 
(length %u)\n",
to_direction(is_write),
(unsigned long long)phys_addr, len);
diff --git a/powerpc/include/kvm/kvm-cpu-arch.h 
b/powerpc/include/kvm/kvm-cpu-arch.h
index a69e0cc..64b69b1 100644
--- a/powerpc/include/kvm/kvm-cpu-arch.h
+++ b/powerpc/include/kvm/kvm-cpu-arch.h
@@ -76,4 +76,9 @@ static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, 
u16 port, void *dat
 
 bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 
len, u8 is_write);
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   return false;
+}
+
 #endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/x86/include/kvm/kvm-cpu-arch.h b/x86/include/kvm/kvm-cpu-arch.h
index 05e5bb6..10cbe6e 100644
--- a/x86/include/kvm/kvm-cpu-arch.h
+++ b/x86/include/kvm/kvm-cpu-arch.h
@@ -47,4 +47,9 @@ static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu 
*vcpu, u64 phys_addr, u8
return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
 }
 
+static inline bool kvm_cpu__mmio_not_found(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   return false;
+}
+
 #endif /* KVM__KVM_CPU_ARCH_H */
-- 
2.17.1


[PATCH 0/2] Improve handling of stage 2 aborts without instruction decode

2019-09-09 Thread Christoffer Dall
When a guest accesses memory outside the memory slots, KVM usually
bounces the access back to userspace with KVM_EXIT_MMIO.  However, on
arm/arm64 systems, certain load/store instructions did not provide
decoding info for the hypervisor to emulate the instruction, and in this
case KVM has rather rudely returned -ENOSYS and printed a not overly
helpful error message:

  load/store instruction decoding not implemented

This patch series improves the error message and allows userspace to be
notified of this event instead of receiving -ENOSYS, and also allows
userspace to ask KVM to inject an external abort to the guest, which it
can use for any memory access that it cannot otherwise handle.

One remaining case which this patch set does not address is if the guest
accesses an in-kernel emulated device, such as the VGIC, but using a
load/store instruction which doesn't provide decode info.  With these
patches, this will return to userspace for it to handle, but there's no
way for userspace to return the decoding information to KVM and have KVM
complete the access to the in-kernel emulated device.  I have no plans
to address this limitation.
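
For illustration, a minimal sketch of what the consumer side of this series
could look like in a VMM (not part of the series: it assumes the /dev/kvm, VM
and VCPU file descriptors and the mmap'ed kvm_run area are already set up,
vmm_addr_is_mmio() is a made-up placeholder for the VMM's own MMIO region
lookup, and injecting the abort relies on the KVM_CAP_ARM_INJECT_EXT_DABT
support from patch 2):

#include <err.h>
#include <stdbool.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Placeholder: does the VMM emulate a device at this IPA? (assumed helper) */
extern bool vmm_addr_is_mmio(__u64 ipa);

/* At VM creation: opt in to KVM_EXIT_ARM_NISV instead of the old -ENOSYS. */
static void arm_nisv_setup(int kvm_fd, int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_ARM_NISV_TO_USER,
	};

	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_NISV_TO_USER) > 0 &&
	    ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		err(1, "KVM_ENABLE_CAP(KVM_CAP_ARM_NISV_TO_USER)");
}

/* In the vcpu run loop, when run->exit_reason == KVM_EXIT_ARM_NISV. */
static void arm_nisv_handle(int vcpu_fd, struct kvm_run *run)
{
	__u64 ipa = run->arm_nisv.fault_ipa;

	if (!vmm_addr_is_mmio(ipa)) {
		/* A hole in the guest memory map: inject an external data abort. */
		struct kvm_vcpu_events events;

		memset(&events, 0, sizeof(events));
		events.exception.ext_dabt_pending = 1;
		if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events) < 0)
			err(1, "KVM_SET_VCPU_EVENTS");
		return;
	}

	/* A real MMIO access without decode info: decode it ourselves, or give up. */
	errx(1, "unsupported load/store to MMIO, IPA=%#llx, ESR ISS=%#llx",
	     (unsigned long long)ipa, (unsigned long long)run->arm_nisv.esr_iss);
}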

Christoffer Dall (2):
  KVM: arm/arm64: Allow reporting non-ISV data aborts to userspace
  KVM: arm/arm64: Allow user injection of external data aborts

 Documentation/virt/kvm/api.txt   | 44 +++-
 arch/arm/include/asm/kvm_arm.h   |  2 ++
 arch/arm/include/asm/kvm_emulate.h   |  5 
 arch/arm/include/asm/kvm_host.h  |  8 +
 arch/arm/include/uapi/asm/kvm.h  |  3 +-
 arch/arm/kvm/guest.c |  3 ++
 arch/arm64/include/asm/kvm_emulate.h |  5 
 arch/arm64/include/asm/kvm_host.h|  8 +
 arch/arm64/include/uapi/asm/kvm.h|  3 +-
 arch/arm64/kvm/guest.c   |  3 ++
 arch/arm64/kvm/inject_fault.c|  4 +--
 include/uapi/linux/kvm.h |  8 +
 virt/kvm/arm/arm.c   | 22 ++
 virt/kvm/arm/mmio.c  | 11 +--
 14 files changed, 122 insertions(+), 7 deletions(-)

-- 
2.17.1



[PATCH 1/2] KVM: arm/arm64: Allow reporting non-ISV data aborts to userspace

2019-09-09 Thread Christoffer Dall
For a long time, if a guest accessed memory outside of a memslot using
any of the load/store instructions in the architecture which doesn't
supply decoding information in the ESR_EL2 (the ISV bit is not set), the
kernel would print the following message and terminate the VM as a
result of returning -ENOSYS to userspace:

  load/store instruction decoding not implemented

The reason behind this message is that KVM assumes that all accesses
outside a memslot is an MMIO access which should be handled by
userspace, and we originally expected to eventually implement some sort
of decoding of load/store instructions where the ISV bit was not set.

However, it turns out that many of the instructions which don't provide
decoding information on abort are not safe to use for MMIO accesses, and
the remaining few that would potentially make sense to use on MMIO
accesses, such as those with register writeback, are not used in
practice.  It also turns out that fetching an instruction from guest
memory can be a pretty horrible affair, involving stopping all CPUs on
SMP systems, handling multiple corner cases of address translation in
software, and more.  It doesn't appear likely that we'll ever implement
this in the kernel.

What is much more common is that a user has misconfigured his/her guest
and is actually not accessing an MMIO region, but just hitting some
random hole in the IPA space.  In this scenario, the error message above
is almost misleading and has led to a great deal of confusion over the
years.

It is, nevertheless, ABI to userspace, and we therefore need to
introduce a new capability that userspace explicitly enables to change
behavior.

This patch introduces KVM_CAP_ARM_NISV_TO_USER (NISV meaning Non-ISV)
which does exactly that, and introduces a new exit reason to report the
event to userspace.  User space can then emulate an exception to the
guest, restart the guest, suspend the guest, or take any other
appropriate action as per the policy of the running system.

Reported-by: Heinrich Schuchardt 
Signed-off-by: Christoffer Dall 
---
 Documentation/virt/kvm/api.txt   | 29 
 arch/arm/include/asm/kvm_arm.h   |  2 ++
 arch/arm/include/asm/kvm_emulate.h   |  5 +
 arch/arm/include/asm/kvm_host.h  |  8 
 arch/arm64/include/asm/kvm_emulate.h |  5 +
 arch/arm64/include/asm/kvm_host.h|  8 
 include/uapi/linux/kvm.h |  7 +++
 virt/kvm/arm/arm.c   | 21 
 virt/kvm/arm/mmio.c  | 11 +--
 9 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index 2d067767b617..02501333f746 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -4453,6 +4453,35 @@ Hyper-V SynIC state change. Notification is used to 
remap SynIC
 event/message pages and to enable/disable SynIC messages/events processing
 in userspace.
 
+   /* KVM_EXIT_ARM_NISV */
+   struct {
+   __u64 esr_iss;
+   __u64 fault_ipa;
+   } arm_nisv;
+
+Used on arm and arm64 systems. If a guest accesses memory not in a memslot,
+KVM will typically return to userspace and ask it to do MMIO emulation on its
+behalf. However, for certain classes of instructions, no instruction decode
+(direction, length of memory access) is provided, and fetching and decoding
+the instruction from the VM is overly complicated to live in the kernel.
+
+Historically, when this situation occurred, KVM would print a warning and kill
+the VM. KVM assumed that if the guest accessed non-memslot memory, it was
+trying to do I/O, which just couldn't be emulated, and the warning message was
+phrased accordingly. However, what happened more often was that a guest bug
+caused access outside the guest memory areas which should lead to a more
+mearningful warning message and an external abort in the guest, if the access
+did not fall within an I/O window.
+
+Userspace implementations can query for KVM_CAP_ARM_NISV_TO_USER, and enable
+this capability at VM creation. Once this is done, these types of errors will
+instead return to userspace with KVM_EXIT_ARM_NISV, with the valid bits from
+the HSR (arm) and ESR_EL2 (arm64) in the esr_iss field, and the faulting IPA
+in the fault_ipa field. Userspace can either fix up the access if it's
+actually an I/O access by decoding the instruction from guest memory (if it's
+very brave) and continue executing the guest, or it can decide to suspend,
+dump, or restart the guest.
+
/* Fix the size of the union. */
char padding[256];
};
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 0125aa059d5b..ce61b3b0058d 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -162,6 +162,8 @@
 #define HSR_ISV		(_AC(1, UL) << HSR_ISV_SHIFT)
 #

[kvmtool PATCH 4/5] arm: Handle exits from undecoded load/store instructions

2019-09-09 Thread Christoffer Dall
KVM occasionally encounters guests that attempt to access memory outside
the registered RAM memory slots using instructions that don't provide
decoding information in the ESR_EL2 (the ISV bit is not set), and
historically this has led to the kernel printing a confusing error
message in dmesg and returning -ENOSYS from KVM_RUN.

KVM/Arm now has KVM_CAP_ARM_NISV_TO_USER, which can be enabled from
userspace, and which allows us to handle this with a little bit more
helpful information to the user.  For example, we can at least tell the
user if the guest just hit a hole in the guest's memory map, or if this
appeared to be an attempt at doing MMIO.

Signed-off-by: Christoffer Dall 
---
 arm/kvm-cpu.c | 20 +++-
 arm/kvm.c |  8 
 include/kvm/kvm.h |  1 +
 kvm.c |  1 +
 mmio.c| 11 +++
 5 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/arm/kvm-cpu.c b/arm/kvm-cpu.c
index 7780251..25bd3ed 100644
--- a/arm/kvm-cpu.c
+++ b/arm/kvm-cpu.c
@@ -136,7 +136,25 @@ void kvm_cpu__delete(struct kvm_cpu *vcpu)
 
 bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
 {
-   return false;
+   switch (vcpu->kvm_run->exit_reason) {
+   case KVM_EXIT_ARM_NISV: {
+   u64 phys_addr = vcpu->kvm_run->arm_nisv.fault_ipa;
+
+   if (!arm_addr_in_ioport_region(phys_addr) &&
+   !kvm__mmio_exists(vcpu, phys_addr))
+   die("Guest accessed memory outside RAM and IO ranges");
+
+   /*
+* We cannot fetch and decode instructions from a KVM guest,
+* which used a load/store instruction that doesn't get
+* decoded in the ESR towards an I/O device, so we have no
+* choice but to exit to the user with an error.
+*/
+   die("Guest accessed I/O device with unsupported load/store 
instruction");
+   }
+   default:
+   return false;
+   }
 }
 
 void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
diff --git a/arm/kvm.c b/arm/kvm.c
index 1f85fc6..2572ac2 100644
--- a/arm/kvm.c
+++ b/arm/kvm.c
@@ -59,6 +59,8 @@ void kvm__arch_set_cmdline(char *cmdline, bool video)
 
 void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
 {
+   struct kvm_enable_cap enable_cap = { .flags = 0 };
+
/*
 * Allocate guest memory. We must align our buffer to 64K to
 * correlate with the maximum guest page size for virtio-mmio.
@@ -83,6 +85,12 @@ void kvm__arch_init(struct kvm *kvm, const char 
*hugetlbfs_path, u64 ram_size)
madvise(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size,
MADV_HUGEPAGE);
 
+   if (kvm__supports_extension(kvm, KVM_CAP_ARM_NISV_TO_USER)) {
+   enable_cap.cap = KVM_CAP_ARM_NISV_TO_USER;
+   if (ioctl(kvm->vm_fd, KVM_ENABLE_CAP, &enable_cap) < 0)
+   die("unable to enable NISV_TO_USER capability");
+   }
+
/* Create the virtual GIC. */
if (gic__create(kvm, kvm->cfg.arch.irqchip))
die("Failed to create virtual GIC");
diff --git a/include/kvm/kvm.h b/include/kvm/kvm.h
index 7a73818..05d90ee 100644
--- a/include/kvm/kvm.h
+++ b/include/kvm/kvm.h
@@ -107,6 +107,7 @@ bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void 
*data, int direction,
 bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, 
u8 is_write);
 int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void 
*userspace_addr,
  enum kvm_mem_type type);
+bool kvm__mmio_exists(struct kvm_cpu *vcpu, u64 phys_addr);
 static inline int kvm__register_ram(struct kvm *kvm, u64 guest_phys, u64 size,
void *userspace_addr)
 {
diff --git a/kvm.c b/kvm.c
index 57c4ff9..03ec43f 100644
--- a/kvm.c
+++ b/kvm.c
@@ -55,6 +55,7 @@ const char *kvm_exit_reasons[] = {
 #ifdef CONFIG_PPC64
DEFINE_KVM_EXIT_REASON(KVM_EXIT_PAPR_HCALL),
 #endif
+   DEFINE_KVM_EXIT_REASON(KVM_EXIT_ARM_NISV),
 };
 
 static int pause_event;
diff --git a/mmio.c b/mmio.c
index 61e1d47..2ab7fa7 100644
--- a/mmio.c
+++ b/mmio.c
@@ -139,3 +139,14 @@ bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 
phys_addr, u8 *data, u32 len, u
 
return true;
 }
+
+bool kvm__mmio_exists(struct kvm_cpu *vcpu, u64 phys_addr)
+{
+   struct mmio_mapping *mmio;
+
+   br_read_lock(vcpu->kvm);
+   mmio = mmio_search(&mmio_tree, phys_addr, 1);
+   br_read_unlock(vcpu->kvm);
+
+   return mmio != NULL;
+}
-- 
2.17.1



[PATCH 2/2] KVM: arm/arm64: Allow user injection of external data aborts

2019-09-09 Thread Christoffer Dall
In some scenarios, such as buggy guest or incorrect configuration of the
VMM and firmware description data, userspace will detect a memory access
to a portion of the IPA, which is not mapped to any MMIO region.

For this purpose, the appropriate action is to inject an external abort
to the guest.  The kernel already has functionality to inject an
external abort, but we need to wire up a signal from user space that
lets user space tell the kernel to do this.

It turns out, we already have the set event functionality which we can
perfectly reuse for this.

Signed-off-by: Christoffer Dall 
---
 Documentation/virt/kvm/api.txt| 15 ++-
 arch/arm/include/uapi/asm/kvm.h   |  3 ++-
 arch/arm/kvm/guest.c  |  3 +++
 arch/arm64/include/uapi/asm/kvm.h |  3 ++-
 arch/arm64/kvm/guest.c|  3 +++
 arch/arm64/kvm/inject_fault.c |  4 ++--
 include/uapi/linux/kvm.h  |  1 +
 virt/kvm/arm/arm.c|  1 +
 8 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index 02501333f746..edd6cdc470ca 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -955,6 +955,8 @@ The following bits are defined in the flags field:
 
 ARM/ARM64:
 
+User space may need to inject several types of events to the guest.
+
 If the guest accesses a device that is being emulated by the host kernel in
 such a way that a real device would generate a physical SError, KVM may make
 a virtual SError pending for that VCPU. This system error interrupt remains
@@ -989,12 +991,23 @@ Specifying exception.has_esr on a system that does not 
support it will return
 -EINVAL. Setting anything other than the lower 24bits of exception.serror_esr
 will return -EINVAL.
 
+If the guest performed an access to I/O memory which could not be handled by
+user space, for example because of missing instruction syndrome decode
+information or because there is no device mapped at the accessed IPA, then
+user space can ask the kernel to inject an external abort using the address
+from the exiting fault on the VCPU. It is a programming error to set
+ext_dabt_pending at the same time as any of the serror fields, or to set
+ext_dabt_pending on an exit which was not either KVM_EXIT_MMIO or
+KVM_EXIT_ARM_NISV. This feature is only available if the system supports
+KVM_CAP_ARM_INJECT_EXT_DABT;
+
 struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index a4217c1a5d01..d2449a5bf8d5 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -131,8 +131,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index 684cf64b4033..4154c5589501 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -263,11 +263,14 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 {
bool serror_pending = events->exception.serror_pending;
bool has_esr = events->exception.serror_has_esr;
+   bool has_ext_dabt_pending = events->exception.ext_dabt_pending;
 
if (serror_pending && has_esr)
return -EINVAL;
else if (serror_pending)
kvm_inject_vabt(vcpu);
+   else if (has_ext_dabt_pending)
+   kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
 
return 0;
 }
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index 9a507716ae2f..7729efdb1c0c 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -164,8 +164,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+   __u8 ext_dabt_pending;
/* Align it to 8 bytes */
-   __u8 pad[6];
+   __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index dfd626447482..10e6e2144dca 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -720,6 +720,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 {
bool serror_pending = events->exception.serror_pending;
bool has_esr = events->exception.serror_has_esr;
+   bool has_ext_dabt_pending = events->

Re: [PATCH 1/1] KVM: inject data abort if instruction cannot be decoded

2019-09-06 Thread Christoffer Dall
On Fri, Sep 06, 2019 at 02:31:42PM +0100, Peter Maydell wrote:
> On Fri, 6 Sep 2019 at 14:13, Christoffer Dall  
> wrote:
> > I'd prefer leaving it to userspace to worry about, but I thought Peter
> > said that had been problematic historically, which I took at face value,
> > but I could have misunderstood.
> >
> > If QEMU, kvmtool, and whatever the crazy^H cool kids are using in
> > userspace these days are happy emulating the exception, then that's a
> > viable approach.  The main concern I have with that is whether they'll
> > all get it right, and since we already have the code in the kernel to do
> > this, it might make sense to re-use the kernel logic for it.
> 
> Well, for QEMU we've had code that in theory might do this but
> in practice it's never been tested. Essentially the problem is
> that nobody ever wants to inject an exception from userspace
> except in incredibly rare cases like "trying to use h/w breakpoints
> simultaneously inside the VM and also to debug the VM from outside"
> or "we're running on RAS hardware that just told us that the VM's
> RAM was faulty". There's no even vaguely commonly-used usecase
> for it today; and this ABI suggestion adds another "this is in
> practice almost never going to happen" case to the pile. The
> codepath is unreliable in QEMU because (a) it requires getting
> syncing of sysreg values to and from the kernel right -- this is
> about the only situation where userspace wants to modify sysregs
> during execution of the VM, as opposed to just reading them -- and
> (b) we try to reuse the code we already have that does TCG exception
> injection, which might or might not be a design mistake, and
> (c) as noted above it's a never-actually-used untested codepath...
> 
> So I think if I were you I wouldn't commit to the kernel ABI until
> somebody had at least written some RFC-quality patches for QEMU and
> tested that they work and the ABI is OK in that sense. (For the
> avoidance of doubt, I'm not volunteering to do that myself.)
> I don't object to the idea in principle, though.
> 
> PS: the other "injecting exceptions to the guest" oddity is that
> if the kernel *does* find the ISV information and returns to userspace
> for it to handle the MMIO, there's no way for userspace to say
> "actually that address is supposed to generate a data abort".
> 

That's a good point.  A synchronous interface with a separate mechanism
to ask the kernel to inject an exception might be a good solution, if
there's an elegant way to do the latter.  I'll have a look at that.

Thanks,

Christoffer
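
To make that direction concrete, the ext_dabt_pending mechanism from the later
kvm_vcpu_events patches in this thread could be used by a VMM on a failed
KVM_EXIT_MMIO access roughly as follows (a sketch only, not code from the
series; vmm_bus_access() is a made-up stand-in for the VMM's bus dispatch, and
KVM_CAP_ARM_INJECT_EXT_DABT is assumed to have been checked already):

/* Assumes <linux/kvm.h>, <sys/ioctl.h>, <string.h>, <stdbool.h> and <err.h>. */

/* Placeholder for the VMM's bus dispatch; returns false if nothing claims the access. */
extern bool vmm_bus_access(__u64 addr, void *data, __u32 len, bool is_write);

static void handle_mmio_exit(int vcpu_fd, struct kvm_run *run)
{
	struct kvm_vcpu_events events;

	if (vmm_bus_access(run->mmio.phys_addr, run->mmio.data,
			   run->mmio.len, run->mmio.is_write))
		return;		/* access completed, proceed as usual */

	/* The bus reported a failure: make an external data abort pending. */
	memset(&events, 0, sizeof(events));
	events.exception.ext_dabt_pending = 1;
	if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events) < 0)
		err(1, "KVM_SET_VCPU_EVENTS");
}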


Re: [PATCH 1/1] KVM: inject data abort if instruction cannot be decoded

2019-09-06 Thread Christoffer Dall
On Fri, Sep 06, 2019 at 02:08:15PM +0200, Alexander Graf wrote:
> 
> 
> On 06.09.19 10:00, Christoffer Dall wrote:
> > On Thu, Sep 05, 2019 at 02:09:18PM +0100, Marc Zyngier wrote:
> > > On 05/09/2019 10:22, Christoffer Dall wrote:
> > > > On Thu, Sep 05, 2019 at 09:56:44AM +0100, Peter Maydell wrote:
> > > > > On Thu, 5 Sep 2019 at 09:52, Marc Zyngier  wrote:
> > > > > > 
> > > > > > On Thu, 05 Sep 2019 09:16:54 +0100,
> > > > > > Peter Maydell  wrote:
> > > > > > > This is true, but the problem is that barfing out to userspace
> > > > > > > makes it harder to debug the guest because it means that
> > > > > > > the VM is immediately destroyed, whereas AIUI if we
> > > > > > > inject some kind of exception then (assuming you're set up
> > > > > > > to do kernel-debug via gdbstub) you can actually examine
> > > > > > > the offending guest code with a debugger because at least
> > > > > > > your VM is still around to inspect...
> > > > > > 
> > > > > > To Christoffer's point, I find the benefit a bit dubious. Yes, you 
> > > > > > get
> > > > > > an exception, but the instruction that caused it may be completely
> > > > > > legal (store with post-increment, for example), leading to an even
> > > > > > more puzzled developer (that exception should never have been
> > > > > > delivered the first place).
> > > > > 
> > > > > Right, but the combination of "host kernel prints a message
> > > > > about an unsupported load/store insn" and "within-guest debug
> > > > > dump/stack trace/etc" is much more useful than just having
> > > > > "host kernel prints message" and "QEMU exits"; and it requires
> > > > > about 3 lines of code change...
> > > > > 
> > > > > > I'm far more in favour of dumping the state of the access in the run
> > > > > > structure (much like we do for a MMIO access) and let userspace do
> > > > > > something about it (such as dumping information on the console or
> > > > > > breaking). It could even inject an exception *if* the user has asked
> > > > > > for it.
> > > > > 
> > > > > ...whereas this requires agreement on a kernel-userspace API,
> > > > > larger changes in the kernel, somebody to implement the userspace
> > > > > side of things, and the user to update both the kernel and QEMU.
> > > > > It's hard for me to see that the benefit here over the 3-line
> > > > > approach really outweighs the extra effort needed. In practice
> > > > > saying "we should do this" is saying "we're going to do nothing",
> > > > > based on the historical record.
> > > > > 
> > > > 
> > > > How about something like the following (completely untested, liable for
> > > > ABI discussions etc. etc., but for illustration purposes).
> > > > 
> > > > I think it raises the question (and likely many other) of whether we can
> > > > break the existing 'ABI' and change behavior for missing ISV
> > > > retrospectively for legacy user space when the issue has occurred?
> > > > Someone might have written code that reacts to the -ENOSYS, so I've
> > > > taken the conservative approach for this for the time being.
> > > > 
> > > > 
> > > > diff --git a/arch/arm/include/asm/kvm_host.h 
> > > > b/arch/arm/include/asm/kvm_host.h
> > > > index 8a37c8e89777..19a92c49039c 100644
> > > > --- a/arch/arm/include/asm/kvm_host.h
> > > > +++ b/arch/arm/include/asm/kvm_host.h
> > > > @@ -76,6 +76,14 @@ struct kvm_arch {
> > > > /* Mandated version of PSCI */
> > > > u32 psci_version;
> > > > +
> > > > +   /*
> > > > +* If we encounter a data abort without valid instruction 
> > > > syndrome
> > > > +* information, report this to user space.  User space can (and
> > > > +* should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
> > > > +* supported.
> > > > +*/
> > > > +   bool return_nisv_io_abort_to_user;
> > > >   };
> > > >   #define KVM_NR_MEM_OBJS   

Re: [PATCH 1/1] KVM: inject data abort if instruction cannot be decoded

2019-09-06 Thread Christoffer Dall
On Thu, Sep 05, 2019 at 02:09:18PM +0100, Marc Zyngier wrote:
> On 05/09/2019 10:22, Christoffer Dall wrote:
> > On Thu, Sep 05, 2019 at 09:56:44AM +0100, Peter Maydell wrote:
> >> On Thu, 5 Sep 2019 at 09:52, Marc Zyngier  wrote:
> >>>
> >>> On Thu, 05 Sep 2019 09:16:54 +0100,
> >>> Peter Maydell  wrote:
> >>>> This is true, but the problem is that barfing out to userspace
> >>>> makes it harder to debug the guest because it means that
> >>>> the VM is immediately destroyed, whereas AIUI if we
> >>>> inject some kind of exception then (assuming you're set up
> >>>> to do kernel-debug via gdbstub) you can actually examine
> >>>> the offending guest code with a debugger because at least
> >>>> your VM is still around to inspect...
> >>>
> >>> To Christoffer's point, I find the benefit a bit dubious. Yes, you get
> >>> an exception, but the instruction that caused it may be completely
> >>> legal (store with post-increment, for example), leading to an even
> >>> more puzzled developer (that exception should never have been
> >>> delivered the first place).
> >>
> >> Right, but the combination of "host kernel prints a message
> >> about an unsupported load/store insn" and "within-guest debug
> >> dump/stack trace/etc" is much more useful than just having
> >> "host kernel prints message" and "QEMU exits"; and it requires
> >> about 3 lines of code change...
> >>
> >>> I'm far more in favour of dumping the state of the access in the run
> >>> structure (much like we do for a MMIO access) and let userspace do
> >>> something about it (such as dumping information on the console or
> >>> breaking). It could even inject an exception *if* the user has asked
> >>> for it.
> >>
> >> ...whereas this requires agreement on a kernel-userspace API,
> >> larger changes in the kernel, somebody to implement the userspace
> >> side of things, and the user to update both the kernel and QEMU.
> >> It's hard for me to see that the benefit here over the 3-line
> >> approach really outweighs the extra effort needed. In practice
> >> saying "we should do this" is saying "we're going to do nothing",
> >> based on the historical record.
> >>
> > 
> > How about something like the following (completely untested, liable for
> > ABI discussions etc. etc., but for illustration purposes).
> > 
> > I think it raises the question (and likely many other) of whether we can
> > break the existing 'ABI' and change behavior for missing ISV
> > retrospectively for legacy user space when the issue has occurred?
> >
> > Someone might have written code that reacts to the -ENOSYS, so I've
> > taken the conservative approach for this for the time being.
> > 
> > 
> > diff --git a/arch/arm/include/asm/kvm_host.h 
> > b/arch/arm/include/asm/kvm_host.h
> > index 8a37c8e89777..19a92c49039c 100644
> > --- a/arch/arm/include/asm/kvm_host.h
> > +++ b/arch/arm/include/asm/kvm_host.h
> > @@ -76,6 +76,14 @@ struct kvm_arch {
> >  
> > /* Mandated version of PSCI */
> > u32 psci_version;
> > +
> > +   /*
> > +* If we encounter a data abort without valid instruction syndrome
> > +* information, report this to user space.  User space can (and
> > +* should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
> > +* supported.
> > +*/
> > +   bool return_nisv_io_abort_to_user;
> >  };
> >  
> >  #define KVM_NR_MEM_OBJS 40
> > diff --git a/arch/arm64/include/asm/kvm_host.h 
> > b/arch/arm64/include/asm/kvm_host.h
> > index f656169db8c3..019bc560edc1 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -83,6 +83,14 @@ struct kvm_arch {
> >  
> > /* Mandated version of PSCI */
> > u32 psci_version;
> > +
> > +   /*
> > +* If we encounter a data abort without valid instruction syndrome
> > +* information, report this to user space.  User space can (and
> > +* should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
> > +* supported.
> > +*/
> > +   bool return_nisv_io_abort_to_user;
> >  };
> >  
> >  #define KVM_NR_MEM_OBJS 40
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 5e3f12d5359e..a4dd004d0db9 100644

Re: [PATCH 1/1] KVM: inject data abort if instruction cannot be decoded

2019-09-06 Thread Christoffer Dall
On Thu, Sep 05, 2019 at 03:25:47PM +0200, Heinrich Schuchardt wrote:
> On 9/5/19 11:22 AM, Christoffer Dall wrote:
> > On Thu, Sep 05, 2019 at 09:56:44AM +0100, Peter Maydell wrote:
> > > On Thu, 5 Sep 2019 at 09:52, Marc Zyngier  wrote:
> > > > 
> > > > On Thu, 05 Sep 2019 09:16:54 +0100,
> > > > Peter Maydell  wrote:
> > > > > This is true, but the problem is that barfing out to userspace
> > > > > makes it harder to debug the guest because it means that
> > > > > the VM is immediately destroyed, whereas AIUI if we
> > > > > inject some kind of exception then (assuming you're set up
> > > > > to do kernel-debug via gdbstub) you can actually examine
> > > > > the offending guest code with a debugger because at least
> > > > > your VM is still around to inspect...
> > > > 
> > > > To Christoffer's point, I find the benefit a bit dubious. Yes, you get
> > > > an exception, but the instruction that caused it may be completely
> > > > legal (store with post-increment, for example), leading to an even
> > > > more puzzled developer (that exception should never have been
> > > > delivered the first place).
> > > 
> > > Right, but the combination of "host kernel prints a message
> > > about an unsupported load/store insn" and "within-guest debug
> > > dump/stack trace/etc" is much more useful than just having
> > > "host kernel prints message" and "QEMU exits"; and it requires
> > > about 3 lines of code change...
> > > 
> > > > I'm far more in favour of dumping the state of the access in the run
> > > > structure (much like we do for a MMIO access) and let userspace do
> > > > something about it (such as dumping information on the console or
> > > > breaking). It could even inject an exception *if* the user has asked
> > > > for it.
> > > 
> > > ...whereas this requires agreement on a kernel-userspace API,
> > > larger changes in the kernel, somebody to implement the userspace
> > > side of things, and the user to update both the kernel and QEMU.
> > > It's hard for me to see that the benefit here over the 3-line
> > > approach really outweighs the extra effort needed. In practice
> > > saying "we should do this" is saying "we're going to do nothing",
> > > based on the historical record.
> > > 
> > 
> > How about something like the following (completely untested, liable for
> > ABI discussions etc. etc., but for illustration purposes).
> > 
> > I think it raises the question (and likely many other) of whether we can
> > break the existing 'ABI' and change behavior for missing ISV
> > retrospectively for legacy user space when the issue has occurred?
> > 
> > Someone might have written code that reacts to the -ENOSYS, so I've
> > taken the conservative approach for this for the time being.
> > 
> > 
> > diff --git a/arch/arm/include/asm/kvm_host.h 
> > b/arch/arm/include/asm/kvm_host.h
> > index 8a37c8e89777..19a92c49039c 100644
> > --- a/arch/arm/include/asm/kvm_host.h
> > +++ b/arch/arm/include/asm/kvm_host.h
> > @@ -76,6 +76,14 @@ struct kvm_arch {
> > 
> > /* Mandated version of PSCI */
> > u32 psci_version;
> > +
> > +   /*
> > +* If we encounter a data abort without valid instruction syndrome
> > +* information, report this to user space.  User space can (and
> > +* should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
> > +* supported.
> > +*/
> > +   bool return_nisv_io_abort_to_user;
> >   };
> > 
> >   #define KVM_NR_MEM_OBJS 40
> > diff --git a/arch/arm64/include/asm/kvm_host.h 
> > b/arch/arm64/include/asm/kvm_host.h
> > index f656169db8c3..019bc560edc1 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -83,6 +83,14 @@ struct kvm_arch {
> > 
> > /* Mandated version of PSCI */
> > u32 psci_version;
> > +
> > +   /*
> > +* If we encounter a data abort without valid instruction syndrome
> > +* information, report this to user space.  User space can (and
> > +* should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
> > +* supported.
> > +*/
> > +   bool return_nisv_io_abort_to_user;
> 
> How about 32bit ARM?
> 

What about it?  Not sure I understand the comment.

> >   };
> > 
&

Re: [PATCH 1/1] KVM: inject data abort if instruction cannot be decoded

2019-09-05 Thread Christoffer Dall
Hi Heinrich,

On Thu, Sep 05, 2019 at 02:01:36PM +0200, Heinrich Schuchardt wrote:
> On 9/5/19 11:20 AM, Stefan Hajnoczi wrote:
> > On Wed, Sep 04, 2019 at 08:07:36PM +0200, Heinrich Schuchardt wrote:
> > > If an application tries to access memory that is not mapped, an error
> > > ENOSYS, "load/store instruction decoding not implemented" may occur.
> > > QEMU will hang with a register dump.
> > > 
> > > Instead create a data abort that can be handled gracefully by the
> > > application running in the virtual environment.
> > > 
> > > Now the virtual machine can react to the event in the most appropriate
> > > way - by recovering, by writing an informative log, or by rebooting.
> > > 
> > > Signed-off-by: Heinrich Schuchardt 
> > > ---
> > >   virt/kvm/arm/mmio.c | 4 ++--
> > >   1 file changed, 2 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
> > > index a8a6a0c883f1..0cbed7d6a0f4 100644
> > > --- a/virt/kvm/arm/mmio.c
> > > +++ b/virt/kvm/arm/mmio.c
> > > @@ -161,8 +161,8 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct 
> > > kvm_run *run,
> > >   if (ret)
> > >   return ret;
> > >   } else {
> > > - kvm_err("load/store instruction decoding not implemented\n");
> > > - return -ENOSYS;
> > > + kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
> > > + return 1;
> > 
> > I see this more as a temporary debugging hack than something to merge.
> > 
> > It sounds like in your case the guest environment provided good
> > debugging information and you preferred it over debugging this from the
> > host side.  That's fine, but allowing the guest to continue running in
> > the general case makes it much harder to track down the root cause of a
> > problem because many guest CPU instructions may be executed after the
> > original problem occurs.  Other guest software may fail silently in
> > weird ways.  IMO it's best to fail early.
> > 
> > Stefan
> > 
> 
> As virtual machines are ubiquitous, expect mission critical systems to
> run on them as well. At development time halting a machine may be a good
> idea. In production this is often the worst solution. Rebooting may be
> essential for survival.
> 
> For an anecdotal example see:
> https://www.hq.nasa.gov/alsj/a11/a11.1201-pa.html
> 
> I am convinced that leaving it to the guest to decide how to react is
> the best choice.
> 
Maintaining strong adherence to the architecture is equally important,
and I'm sure we can find anecdotes to support how not doing the
expected can also lead to disastrous outcomes.

Have you had a look at the suggested patch I sent?  The idea is that we
can preserve existing legacy ABI, allow for a better debugging
experience, allow userspace to do emulation if it so wishes, and provide
a better error message if userspace doesn't handle this properly.

One thing we could change from my proposed patch would be to have KVM
inject the access as an external abort if the target address also
doesn't hit an MMIO device, which is by far the common scenario reported
here on the list.

Hopefully, a mission critical deployment based on KVM/Arm (scary as that
sounds), would use a recent and patched VMM (QEMU) that either causes
the external abort, or reboots the VM, as per the configuration of the
particular system in question.


Thanks,

Christoffer


Re: [PATCH 1/1] KVM: inject data abort if instruction cannot be decoded

2019-09-05 Thread Christoffer Dall
On Thu, Sep 05, 2019 at 09:56:44AM +0100, Peter Maydell wrote:
> On Thu, 5 Sep 2019 at 09:52, Marc Zyngier  wrote:
> >
> > On Thu, 05 Sep 2019 09:16:54 +0100,
> > Peter Maydell  wrote:
> > > This is true, but the problem is that barfing out to userspace
> > > makes it harder to debug the guest because it means that
> > > the VM is immediately destroyed, whereas AIUI if we
> > > inject some kind of exception then (assuming you're set up
> > > to do kernel-debug via gdbstub) you can actually examine
> > > the offending guest code with a debugger because at least
> > > your VM is still around to inspect...
> >
> > To Christoffer's point, I find the benefit a bit dubious. Yes, you get
> > an exception, but the instruction that caused it may be completely
> > legal (store with post-increment, for example), leading to an even
> > more puzzled developer (that exception should never have been
> > delivered the first place).
> 
> Right, but the combination of "host kernel prints a message
> about an unsupported load/store insn" and "within-guest debug
> dump/stack trace/etc" is much more useful than just having
> "host kernel prints message" and "QEMU exits"; and it requires
> about 3 lines of code change...
> 
> > I'm far more in favour of dumping the state of the access in the run
> > structure (much like we do for a MMIO access) and let userspace do
> > something about it (such as dumping information on the console or
> > breaking). It could even inject an exception *if* the user has asked
> > for it.
> 
> ...whereas this requires agreement on a kernel-userspace API,
> larger changes in the kernel, somebody to implement the userspace
> side of things, and the user to update both the kernel and QEMU.
> It's hard for me to see that the benefit here over the 3-line
> approach really outweighs the extra effort needed. In practice
> saying "we should do this" is saying "we're going to do nothing",
> based on the historical record.
> 

How about something like the following (completely untested, liable for
ABI discussions etc. etc., but for illustration purposes).

I think it raises the question (and likely many other) of whether we can
break the existing 'ABI' and change behavior for missing ISV
retrospectively for legacy user space when the issue has occurred?
   
Someone might have written code that reacts to the -ENOSYS, so I've
taken the conservative approach for this for the time being.


diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 8a37c8e89777..19a92c49039c 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -76,6 +76,14 @@ struct kvm_arch {
 
/* Mandated version of PSCI */
u32 psci_version;
+
+   /*
+* If we encounter a data abort without valid instruction syndrome
+* information, report this to user space.  User space can (and
+* should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
+* supported.
+*/
+   bool return_nisv_io_abort_to_user;
 };
 
 #define KVM_NR_MEM_OBJS 40
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index f656169db8c3..019bc560edc1 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -83,6 +83,14 @@ struct kvm_arch {
 
/* Mandated version of PSCI */
u32 psci_version;
+
+   /*
+* If we encounter a data abort without valid instruction syndrome
+* information, report this to user space.  User space can (and
+* should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
+* supported.
+*/
+   bool return_nisv_io_abort_to_user;
 };
 
 #define KVM_NR_MEM_OBJS 40
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 5e3f12d5359e..a4dd004d0db9 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -235,6 +235,7 @@ struct kvm_hyperv_exit {
 #define KVM_EXIT_S390_STSI        25
 #define KVM_EXIT_IOAPIC_EOI   26
 #define KVM_EXIT_HYPERV   27
+#define KVM_EXIT_ARM_NISV 28
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -996,6 +997,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_PTRAUTH_ADDRESS 171
 #define KVM_CAP_ARM_PTRAUTH_GENERIC 172
 #define KVM_CAP_PMU_EVENT_FILTER 173
+#define KVM_CAP_ARM_NISV_TO_USER 174
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 35a069815baf..2ce94bd9d4a9 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -98,6 +98,26 @@ int kvm_arch_check_processor_compat(void)
return 0;
 }
 
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+   struct kvm_enable_cap *cap)
+{
+   int r;
+
+   if (cap->flags)
+   return -EINVAL;
+
+   switch (cap->cap) {
+   case KVM_CAP_ARM_NISV_TO_USER:
+   r = 0;
+   kvm->arch.return_nisv_io_abort_to_user = true;
+   break;
+   
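
For illustration only (this is not part of the patch above, and the diff is
truncated before the exit path is shown): a VMM opting in to the proposed
capability would use the existing KVM_ENABLE_CAP vm ioctl and then watch for
the new exit reason after KVM_RUN. The exit payload is deliberately left out
here because it is not visible in the snippet above; names of the helper
functions below are made up for the example.

/* Hypothetical userspace sketch, not part of the kernel patch. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <stdlib.h>

static void enable_nisv_to_user(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_ARM_NISV_TO_USER,
	};

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0) {
		perror("KVM_ENABLE_CAP(KVM_CAP_ARM_NISV_TO_USER)");
		exit(1);
	}
}

static void handle_vcpu_exit(struct kvm_run *run)
{
	switch (run->exit_reason) {
	case KVM_EXIT_ARM_NISV:
		/* Policy is now up to userspace: dump state, suspend the
		 * VM for a debugger, or inject an abort if asked to. */
		fprintf(stderr, "MMIO access without instruction syndrome\n");
		break;
	default:
		break;
	}
}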

Re: [PATCH 1/1] KVM: inject data abort if instruction cannot be decoded

2019-09-05 Thread Christoffer Dall
On Thu, Sep 05, 2019 at 09:16:54AM +0100, Peter Maydell wrote:
> On Thu, 5 Sep 2019 at 09:04, Marc Zyngier  wrote:
> > How can you tell that the access would fault? You have no idea at that
> > stage (the kernel doesn't know about the MMIO ranges that userspace
> > handles). All you know is that you're faced with a memory access that
> > you cannot emulate in the kernel. Injecting a data abort at that stage
> > is not something that the architecture allows.
> 
> To be fair, locking up the whole CPU (which is effectively
> what the kvm_err/ENOSYS is going to do to the VM) isn't
> something the architecture allows either :-)
> 
> > Of course, the best thing would be to actually fix the guest so that
> > it doesn't use non-emulatable MMIO accesses. In general, that the sign
> > of a bug in low-level accessors.
> 
> This is true, but the problem is that barfing out to userspace
> makes it harder to debug the guest because it means that
> the VM is immediately destroyed, whereas AIUI if we
> inject some kind of exception then (assuming you're set up
> to do kernel-debug via gdbstub) you can actually examine
> the offending guest code with a debugger because at least
> your VM is still around to inspect...
> 

Is it really going to be easier to debug a guest that sees behavior
which may not be architecturally correct?  For example, seeing a data
abort on an access to an MMIO region because the guest used a strange
instruction?

I appreciate that the current way we handle this is confusing and has
led many people down a rabbit hole, so we should do better.

Would a better approach not be to return to userspace saying, "we can't
handle this in the kernel, you decide", without printing the dubious
kernel error message?  Then user space could suspend the VM and print a
lengthy explanation of all the possible problems there could be, or
re-inject something back into the guest, or whatever, for a particular
environment.

Thoughts?


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 0/3] arm64: KVM: Kiss hyp_alternate_select() goodbye

2019-09-02 Thread Christoffer Dall
On Sun, Sep 01, 2019 at 10:12:34PM +0100, Marc Zyngier wrote:
> hyp_alternate_select() is a leftover from the my second attempt at
> supporting VHE (the first one was never merged, thankfully), and is
> now an irrelevant relic. It was a way to patch function pointers
> without having to dereference memory, a bit like static keys for
> function calls.
> 
> Lovely idea, but since Christoffer mostly separated the VHE and !VHE
> hypervisor paths, most of the uses of hyp_alternate_select() are
> gone. What is left is two instances that are better replaced by
> already existing static keys. One of the instances becomes
> cpus_have_const_cap(), and the rest is a light sprinkling of
> has_vhe().
> 
> So off it goes.

I'm not sure I want to kiss hyp_alternate_select() at all, but away it
must go!

Reviewed-by: Christoffer Dall 
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v4 05/10] KVM: arm64: Support stolen time reporting via shared structure

2019-08-30 Thread Christoffer Dall
On Fri, Aug 30, 2019 at 09:42:50AM +0100, Steven Price wrote:
> Implement the service call for configuring a shared structure between a
> VCPU and the hypervisor in which the hypervisor can write the time
> stolen from the VCPU's execution time by other tasks on the host.
> 
> The hypervisor allocates memory which is placed at an IPA chosen by user
> space. The hypervisor then updates the shared structure using
> kvm_put_guest() to ensure single copy atomicity of the 64-bit value
> reporting the stolen time in nanoseconds.
> 
> Whenever stolen time is enabled by the guest, the stolen time counter is
> reset.
> 
> The stolen time itself is retrieved from the sched_info structure
> maintained by the Linux scheduler code. We enable SCHEDSTATS when
> selecting KVM Kconfig to ensure this value is meaningful.
> 
> Signed-off-by: Steven Price 
> ---
>  arch/arm/include/asm/kvm_host.h   | 20 +++
>  arch/arm64/include/asm/kvm_host.h | 21 +++-
>  arch/arm64/kvm/Kconfig|  1 +
>  include/linux/kvm_types.h |  2 ++
>  virt/kvm/arm/arm.c| 11 ++
>  virt/kvm/arm/hypercalls.c |  3 ++
>  virt/kvm/arm/pvtime.c | 56 +++
>  7 files changed, 113 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 5a0c3569ebde..5c401482d62d 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -39,6 +39,7 @@
>   KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_IRQ_PENDING  KVM_ARCH_REQ(1)
>  #define KVM_REQ_VCPU_RESET   KVM_ARCH_REQ(2)
> +#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3)
>  
>  DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
>  
> @@ -329,6 +330,25 @@ static inline long kvm_hypercall_pv_features(struct 
> kvm_vcpu *vcpu)
>   return SMCCC_RET_NOT_SUPPORTED;
>  }
>  
> +static inline long kvm_hypercall_stolen_time(struct kvm_vcpu *vcpu)
> +{
> + return SMCCC_RET_NOT_SUPPORTED;
> +}
> +
> +static inline int kvm_update_stolen_time(struct kvm_vcpu *vcpu, bool init)
> +{
> + return -ENOTSUPP;
> +}
> +
> +static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
> +{
> +}
> +
> +static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
> +{
> + return false;
> +}
> +
>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>  
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
> diff --git a/arch/arm64/include/asm/kvm_host.h 
> b/arch/arm64/include/asm/kvm_host.h
> index 93b46d9526d0..1697e63f6dd8 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -44,6 +44,7 @@
>   KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_IRQ_PENDING  KVM_ARCH_REQ(1)
>  #define KVM_REQ_VCPU_RESET   KVM_ARCH_REQ(2)
> +#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3)
>  
>  DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
>  
> @@ -338,8 +339,14 @@ struct kvm_vcpu_arch {
>   /* True when deferrable sysregs are loaded on the physical CPU,
>* see kvm_vcpu_load_sysregs and kvm_vcpu_put_sysregs. */
>   bool sysregs_loaded_on_cpu;
> -};
>  
> + /* Guest PV state */
> + struct {
> + u64 steal;
> + u64 last_steal;
> + gpa_t base;
> + } steal;
> +};
>  /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
>  #define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \
> sve_ffr_offset((vcpu)->arch.sve_max_vl)))
> @@ -479,6 +486,18 @@ int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
>  long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu);
> +long kvm_hypercall_stolen_time(struct kvm_vcpu *vcpu);
> +int kvm_update_stolen_time(struct kvm_vcpu *vcpu, bool init);
> +
> +static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
> +{
> + vcpu_arch->steal.base = GPA_INVALID;
> +}
> +
> +static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
> +{
> + return (vcpu_arch->steal.base != GPA_INVALID);
> +}
>  
>  void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
>  
> diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
> index a67121d419a2..d8b88e40d223 100644
> --- a/arch/arm64/kvm/Kconfig
> +++ b/arch/arm64/kvm/Kconfig
> @@ -39,6 +39,7 @@ config KVM
>   select IRQ_BYPASS_MANAGER
>   select HAVE_KVM_IRQ_BYPASS
>   select HAVE_KVM_VCPU_RUN_PID_CHANGE
> + select SCHEDSTATS
>   ---help---
> Support hosting virtualized guest machines.
> We don't support KVM with 16K page tables yet, due to the multiple
> diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
> index bde5374ae021..1c88e69db3d9 100644
> --- a/include/linux/kvm_types.h
> +++ b/include/linux/kvm_types.h
> @@ -35,6 +35,8 @@ typedef unsigned long  gva_t;
>  typedef u64   

Re: [PATCH v3 01/10] KVM: arm64: Document PV-time interface

2019-08-30 Thread Christoffer Dall
On Wed, Aug 28, 2019 at 01:09:15PM +0100, Steven Price wrote:
> On 27/08/2019 09:57, Christoffer Dall wrote:
> > On Wed, Aug 21, 2019 at 04:36:47PM +0100, Steven Price wrote:
> >> Introduce a paravirtualization interface for KVM/arm64 based on the
> >> "Arm Paravirtualized Time for Arm-Base Systems" specification DEN 0057A.
> >>
> >> This only adds the details about "Stolen Time" as the details of "Live
> >> Physical Time" have not been fully agreed.
> >>
> >> User space can specify a reserved area of memory for the guest and
> >> inform KVM to populate the memory with information on time that the host
> >> kernel has stolen from the guest.
> >>
> >> A hypercall interface is provided for the guest to interrogate the
> >> hypervisor's support for this interface and the location of the shared
> >> memory structures.
> >>
> >> Signed-off-by: Steven Price 
> >> ---
> >>  Documentation/virt/kvm/arm/pvtime.txt | 100 ++
> >>  1 file changed, 100 insertions(+)
> >>  create mode 100644 Documentation/virt/kvm/arm/pvtime.txt
> >>
> >> diff --git a/Documentation/virt/kvm/arm/pvtime.txt 
> >> b/Documentation/virt/kvm/arm/pvtime.txt
> >> new file mode 100644
> >> index ..1ceb118694e7
> >> --- /dev/null
> >> +++ b/Documentation/virt/kvm/arm/pvtime.txt
> >> @@ -0,0 +1,100 @@
> >> +Paravirtualized time support for arm64
> >> +==
> >> +
> >> +Arm specification DEN0057/A defined a standard for paravirtualised time
> >> +support for AArch64 guests:
> >> +
> >> +https://developer.arm.com/docs/den0057/a
> >> +
> >> +KVM/arm64 implements the stolen time part of this specification by 
> >> providing
> >> +some hypervisor service calls to support a paravirtualized guest 
> >> obtaining a
> >> +view of the amount of time stolen from its execution.
> >> +
> >> +Two new SMCCC compatible hypercalls are defined:
> >> +
> >> +PV_FEATURES 0xC520
> >> +PV_TIME_ST  0xC522
> >> +
> >> +These are only available in the SMC64/HVC64 calling convention as
> >> +paravirtualized time is not available to 32 bit Arm guests. The existence 
> >> of
> >> +the PV_FEATURES hypercall should be probed using the SMCCC 1.1 
> >> ARCH_FEATURES
> >> +mechanism before calling it.
> >> +
> >> +PV_FEATURES
> >> +Function ID:  (uint32)  : 0xC520
> >> +PV_func_id:   (uint32)  : Either PV_TIME_LPT or PV_TIME_ST
> >> +Return value: (int32)   : NOT_SUPPORTED (-1) or SUCCESS (0) if the 
> >> relevant
> >> +  PV-time feature is supported by the 
> >> hypervisor.
> >> +
> >> +PV_TIME_ST
> >> +Function ID:  (uint32)  : 0xC522
> >> +Return value: (int64)   : IPA of the stolen time data structure for 
> >> this
> >> +  (V)CPU. On failure:
> >> +  NOT_SUPPORTED (-1)
> >> +
> >> +The IPA returned by PV_TIME_ST should be mapped by the guest as normal 
> >> memory
> >> +with inner and outer write back caching attributes, in the inner shareable
> >> +domain. A total of 16 bytes from the IPA returned are guaranteed to be
> >> +meaningfully filled by the hypervisor (see structure below).
> >> +
> >> +PV_TIME_ST returns the structure for the calling VCPU.
> >> +
> >> +Stolen Time
> >> +---
> >> +
> >> +The structure pointed to by the PV_TIME_ST hypercall is as follows:
> >> +
> >> +  Field   | Byte Length | Byte Offset | Description
> >> +  --- | --- | --- | --
> >> +  Revision|  4  |  0  | Must be 0 for version 0.1
> >> +  Attributes  |  4  |  4  | Must be 0
> >> +  Stolen time |  8  |  8  | Stolen time in unsigned
> >> +  | | | nanoseconds indicating how
> >> +  | | | much time this VCPU thread
> >> +  | | | was involuntarily not
> >> +  | | | running on a physical CPU.
> >> +
> >> +The structure will be updated by the hypervisor prior to scheduling a 
> >> VCPU. It
> >

Re: [PATCH v3 01/10] KVM: arm64: Document PV-time interface

2019-08-28 Thread Christoffer Dall
On Tue, Aug 27, 2019 at 10:57:06AM +0200, Christoffer Dall wrote:
> On Wed, Aug 21, 2019 at 04:36:47PM +0100, Steven Price wrote:
> > Introduce a paravirtualization interface for KVM/arm64 based on the
> > "Arm Paravirtualized Time for Arm-Base Systems" specification DEN 0057A.
> > 
> > This only adds the details about "Stolen Time" as the details of "Live
> > Physical Time" have not been fully agreed.
> > 
> > User space can specify a reserved area of memory for the guest and
> > inform KVM to populate the memory with information on time that the host
> > kernel has stolen from the guest.
> > 
> > A hypercall interface is provided for the guest to interrogate the
> > hypervisor's support for this interface and the location of the shared
> > memory structures.
> > 
> > Signed-off-by: Steven Price 
> > ---
> >  Documentation/virt/kvm/arm/pvtime.txt | 100 ++
> >  1 file changed, 100 insertions(+)
> >  create mode 100644 Documentation/virt/kvm/arm/pvtime.txt
> > 
> > diff --git a/Documentation/virt/kvm/arm/pvtime.txt 
> > b/Documentation/virt/kvm/arm/pvtime.txt
> > new file mode 100644
> > index ..1ceb118694e7
> > --- /dev/null
> > +++ b/Documentation/virt/kvm/arm/pvtime.txt
> > @@ -0,0 +1,100 @@
> > +Paravirtualized time support for arm64
> > +==
> > +
> > +Arm specification DEN0057/A defined a standard for paravirtualised time
> > +support for AArch64 guests:
> > +
> > +https://developer.arm.com/docs/den0057/a
> > +
> > +KVM/arm64 implements the stolen time part of this specification by 
> > providing
> > +some hypervisor service calls to support a paravirtualized guest obtaining 
> > a
> > +view of the amount of time stolen from its execution.
> > +
> > +Two new SMCCC compatible hypercalls are defined:
> > +
> > +PV_FEATURES 0xC520
> > +PV_TIME_ST  0xC522
> > +
> > +These are only available in the SMC64/HVC64 calling convention as
> > +paravirtualized time is not available to 32 bit Arm guests. The existence 
> > of
> > +the PV_FEATURES hypercall should be probed using the SMCCC 1.1 
> > ARCH_FEATURES
> > +mechanism before calling it.
> > +
> > +PV_FEATURES
> > +Function ID:  (uint32)  : 0xC520
> > +PV_func_id:   (uint32)  : Either PV_TIME_LPT or PV_TIME_ST
> > +Return value: (int32)   : NOT_SUPPORTED (-1) or SUCCESS (0) if the 
> > relevant
> > +  PV-time feature is supported by the 
> > hypervisor.
> > +
> > +PV_TIME_ST
> > +Function ID:  (uint32)  : 0xC522
> > +Return value: (int64)   : IPA of the stolen time data structure for 
> > this
> > +  (V)CPU. On failure:
> > +  NOT_SUPPORTED (-1)
> > +
> > +The IPA returned by PV_TIME_ST should be mapped by the guest as normal 
> > memory
> > +with inner and outer write back caching attributes, in the inner shareable
> > +domain. A total of 16 bytes from the IPA returned are guaranteed to be
> > +meaningfully filled by the hypervisor (see structure below).
> > +
> > +PV_TIME_ST returns the structure for the calling VCPU.
> > +
> > +Stolen Time
> > +---
> > +
> > +The structure pointed to by the PV_TIME_ST hypercall is as follows:
> > +
> > +  Field   | Byte Length | Byte Offset | Description
> > +  --- | --- | --- | --
> > +  Revision|  4  |  0  | Must be 0 for version 0.1
> > +  Attributes  |  4  |  4  | Must be 0
> > +  Stolen time |  8  |  8  | Stolen time in unsigned
> > +  | | | nanoseconds indicating how
> > +  | | | much time this VCPU thread
> > +  | | | was involuntarily not
> > +  | | | running on a physical CPU.
> > +
> > +The structure will be updated by the hypervisor prior to scheduling a 
> > VCPU. It
> > +will be present within a reserved region of the normal memory given to the
> > +guest. The guest should not attempt to write into this memory. There is a
> > +structure per VCPU of the guest.
> > +
> > +User space interface
> > +
> > +
> > +User space can request that KVM provide the paravirtualized time interface 
> > to
> > +a guest by

Re: [PATCH v3 01/10] KVM: arm64: Document PV-time interface

2019-08-27 Thread Christoffer Dall
On Wed, Aug 21, 2019 at 04:36:47PM +0100, Steven Price wrote:
> Introduce a paravirtualization interface for KVM/arm64 based on the
> "Arm Paravirtualized Time for Arm-Base Systems" specification DEN 0057A.
> 
> This only adds the details about "Stolen Time" as the details of "Live
> Physical Time" have not been fully agreed.
> 
> User space can specify a reserved area of memory for the guest and
> inform KVM to populate the memory with information on time that the host
> kernel has stolen from the guest.
> 
> A hypercall interface is provided for the guest to interrogate the
> hypervisor's support for this interface and the location of the shared
> memory structures.
> 
> Signed-off-by: Steven Price 
> ---
>  Documentation/virt/kvm/arm/pvtime.txt | 100 ++
>  1 file changed, 100 insertions(+)
>  create mode 100644 Documentation/virt/kvm/arm/pvtime.txt
> 
> diff --git a/Documentation/virt/kvm/arm/pvtime.txt 
> b/Documentation/virt/kvm/arm/pvtime.txt
> new file mode 100644
> index ..1ceb118694e7
> --- /dev/null
> +++ b/Documentation/virt/kvm/arm/pvtime.txt
> @@ -0,0 +1,100 @@
> +Paravirtualized time support for arm64
> +==
> +
> +Arm specification DEN0057/A defined a standard for paravirtualised time
> +support for AArch64 guests:
> +
> +https://developer.arm.com/docs/den0057/a
> +
> +KVM/arm64 implements the stolen time part of this specification by providing
> +some hypervisor service calls to support a paravirtualized guest obtaining a
> +view of the amount of time stolen from its execution.
> +
> +Two new SMCCC compatible hypercalls are defined:
> +
> +PV_FEATURES 0xC520
> +PV_TIME_ST  0xC522
> +
> +These are only available in the SMC64/HVC64 calling convention as
> +paravirtualized time is not available to 32 bit Arm guests. The existence of
> +the PV_FEATURES hypercall should be probed using the SMCCC 1.1 ARCH_FEATURES
> +mechanism before calling it.
> +
> +PV_FEATURES
> +Function ID:  (uint32)  : 0xC520
> +PV_func_id:   (uint32)  : Either PV_TIME_LPT or PV_TIME_ST
> +Return value: (int32)   : NOT_SUPPORTED (-1) or SUCCESS (0) if the 
> relevant
> +  PV-time feature is supported by the hypervisor.
> +
> +PV_TIME_ST
> +Function ID:  (uint32)  : 0xC522
> +Return value: (int64)   : IPA of the stolen time data structure for this
> +  (V)CPU. On failure:
> +  NOT_SUPPORTED (-1)
> +
> +The IPA returned by PV_TIME_ST should be mapped by the guest as normal memory
> +with inner and outer write back caching attributes, in the inner shareable
> +domain. A total of 16 bytes from the IPA returned are guaranteed to be
> +meaningfully filled by the hypervisor (see structure below).
> +
> +PV_TIME_ST returns the structure for the calling VCPU.
> +
> +Stolen Time
> +---
> +
> +The structure pointed to by the PV_TIME_ST hypercall is as follows:
> +
> +  Field   | Byte Length | Byte Offset | Description
> +  --- | --- | --- | --
> +  Revision|  4  |  0  | Must be 0 for version 0.1
> +  Attributes  |  4  |  4  | Must be 0
> +  Stolen time |  8  |  8  | Stolen time in unsigned
> +  | | | nanoseconds indicating how
> +  | | | much time this VCPU thread
> +  | | | was involuntarily not
> +  | | | running on a physical CPU.
> +
> +The structure will be updated by the hypervisor prior to scheduling a VCPU. 
> It
> +will be present within a reserved region of the normal memory given to the
> +guest. The guest should not attempt to write into this memory. There is a
> +structure per VCPU of the guest.
> +
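
(As an aside, not part of the patch: the table above maps onto a trivial C
layout. Below is a minimal sketch of how a guest might declare the record,
assuming natural alignment; the field names are made up here, only the sizes
and offsets come from the table.)

#include <stdint.h>

struct pv_time_stolen {
	uint32_t revision;	/* offset 0: must be 0 for version 0.1 */
	uint32_t attributes;	/* offset 4: must be 0 */
	uint64_t stolen_time;	/* offset 8: stolen time in nanoseconds,
				 * updated by the hypervisor */
};

_Static_assert(sizeof(struct pv_time_stolen) == 16,
	       "covers the 16 bytes guaranteed by PV_TIME_ST");
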
> +User space interface
> +
> +
> +User space can request that KVM provide the paravirtualized time interface to
> +a guest by creating a KVM_DEV_TYPE_ARM_PV_TIME device, for example:
> +
> +struct kvm_create_device pvtime_device = {
> +.type = KVM_DEV_TYPE_ARM_PV_TIME,
> +.attr = 0,
> +.flags = 0,
> +};
> +
> +pvtime_fd = ioctl(vm_fd, KVM_CREATE_DEVICE, &pvtime_device);
> +
> +Creation of the device should be done after creating the vCPUs of the virtual
> +machine.
> +
> +The IPA of the structures must be given to KVM. This is the base address
> +of an array of stolen time structures (one for each VCPU). The base address
> +must be page aligned. The size must be at least 64 * number of VCPUs and be a
> +multiple of PAGE_SIZE.
> +
> +The memory for these structures should be added to the guest in the usual
> +manner (e.g. using KVM_SET_USER_MEMORY_REGION).
> +
> +For example:
> +
> +struct kvm_dev_arm_st_region region = {
> +.gpa = ,
> +.size = 
> +};

This feels fragile; how are you handling 

Re: [PATCH v3 01/10] KVM: arm64: Document PV-time interface

2019-08-27 Thread Christoffer Dall
On Wed, Aug 21, 2019 at 04:36:47PM +0100, Steven Price wrote:
> Introduce a paravirtualization interface for KVM/arm64 based on the
> "Arm Paravirtualized Time for Arm-Base Systems" specification DEN 0057A.
> 
> This only adds the details about "Stolen Time" as the details of "Live
> Physical Time" have not been fully agreed.
> 
> User space can specify a reserved area of memory for the guest and
> inform KVM to populate the memory with information on time that the host
> kernel has stolen from the guest.
> 
> A hypercall interface is provided for the guest to interrogate the
> hypervisor's support for this interface and the location of the shared
> memory structures.
> 
> Signed-off-by: Steven Price 
> ---
>  Documentation/virt/kvm/arm/pvtime.txt | 100 ++
>  1 file changed, 100 insertions(+)
>  create mode 100644 Documentation/virt/kvm/arm/pvtime.txt
> 
> diff --git a/Documentation/virt/kvm/arm/pvtime.txt 
> b/Documentation/virt/kvm/arm/pvtime.txt
> new file mode 100644
> index ..1ceb118694e7
> --- /dev/null
> +++ b/Documentation/virt/kvm/arm/pvtime.txt
> @@ -0,0 +1,100 @@
> +Paravirtualized time support for arm64
> +==
> +
> +Arm specification DEN0057/A defined a standard for paravirtualised time
> +support for AArch64 guests:
> +
> +https://developer.arm.com/docs/den0057/a
> +
> +KVM/arm64 implements the stolen time part of this specification by providing
> +some hypervisor service calls to support a paravirtualized guest obtaining a
> +view of the amount of time stolen from its execution.
> +
> +Two new SMCCC compatible hypercalls are defined:
> +
> +PV_FEATURES 0xC520
> +PV_TIME_ST  0xC522
> +
> +These are only available in the SMC64/HVC64 calling convention as
> +paravirtualized time is not available to 32 bit Arm guests. The existence of
> +the PV_FEATURES hypercall should be probed using the SMCCC 1.1 ARCH_FEATURES
> +mechanism before calling it.
> +
> +PV_FEATURES
> +Function ID:  (uint32)  : 0xC520
> +PV_func_id:   (uint32)  : Either PV_TIME_LPT or PV_TIME_ST
> +Return value: (int32)   : NOT_SUPPORTED (-1) or SUCCESS (0) if the 
> relevant
> +  PV-time feature is supported by the hypervisor.
> +
> +PV_TIME_ST
> +Function ID:  (uint32)  : 0xC522
> +Return value: (int64)   : IPA of the stolen time data structure for this
> +  (V)CPU. On failure:
> +  NOT_SUPPORTED (-1)
> +
> +The IPA returned by PV_TIME_ST should be mapped by the guest as normal memory
> +with inner and outer write back caching attributes, in the inner shareable
> +domain. A total of 16 bytes from the IPA returned are guaranteed to be
> +meaningfully filled by the hypervisor (see structure below).
> +
> +PV_TIME_ST returns the structure for the calling VCPU.
> +
> +Stolen Time
> +---
> +
> +The structure pointed to by the PV_TIME_ST hypercall is as follows:
> +
> +  Field   | Byte Length | Byte Offset | Description
> +  --- | --- | --- | --
> +  Revision|  4  |  0  | Must be 0 for version 0.1
> +  Attributes  |  4  |  4  | Must be 0
> +  Stolen time |  8  |  8  | Stolen time in unsigned
> +  | | | nanoseconds indicating how
> +  | | | much time this VCPU thread
> +  | | | was involuntarily not
> +  | | | running on a physical CPU.
> +
> +The structure will be updated by the hypervisor prior to scheduling a VCPU. 
> It
> +will be present within a reserved region of the normal memory given to the
> +guest. The guest should not attempt to write into this memory. There is a
> +structure per VCPU of the guest.
> +
> +User space interface
> +
> +
> +User space can request that KVM provide the paravirtualized time interface to
> +a guest by creating a KVM_DEV_TYPE_ARM_PV_TIME device, for example:
> +

I feel it would be more consistent to have the details of this in
Documentation/virt/kvm/devices/arm-pv-time.txt and refer to this
document from here.

> +struct kvm_create_device pvtime_device = {
> +.type = KVM_DEV_TYPE_ARM_PV_TIME,
> +.attr = 0,
> +.flags = 0,
> +};
> +
> +pvtime_fd = ioctl(vm_fd, KVM_CREATE_DEVICE, &pvtime_device);
> +
> +Creation of the device should be done after creating the vCPUs of the virtual
> +machine.
> +
> +The IPA of the structures must be given to KVM. This is the base address
> +of an array of stolen time structures (one for each VCPU). The base address
> +must be page aligned. The size must be at least 64 * number of VCPUs and be a
> +multiple of PAGE_SIZE.
> +
> +The memory for these structures should be added to the guest in the usual
> +manner (e.g. using KVM_SET_USER_MEMORY_REGION).
> +
> +For example:

Re: [PATCH] kvm: arm: Promote KVM_ARM_TARGET_CORTEX_A7 to generic V7 core

2019-08-26 Thread Christoffer Dall
On Wed, Jul 31, 2019 at 10:17:53AM +0200, Jan Kiszka wrote:
> On 30.06.19 17:19, Jan Kiszka wrote:
> > From: Jan Kiszka 
> > 
> > The only difference between the currently supported A15 and A7 target
> > cores is the reset state of bit 11 in SCTLR. This bit is RES1 or RAO/WI
> > in other ARM cores, including ARMv8 ones. By promoting A7 to a generic
> > default target, this allows to use yet unsupported core types. E.g.,
> > this enables KVM on the A72 of the RPi4.
> > 
> > Signed-off-by: Jan Kiszka 
> > ---
> >  arch/arm/include/uapi/asm/kvm.h|  1 +
> >  arch/arm/kvm/Makefile  |  2 +-
> >  arch/arm/kvm/{coproc_a7.c => coproc_generic.c} | 18 +-
> >  arch/arm/kvm/guest.c   |  4 +---
> >  arch/arm/kvm/reset.c   |  5 +
> >  5 files changed, 13 insertions(+), 17 deletions(-)
> >  rename arch/arm/kvm/{coproc_a7.c => coproc_generic.c} (70%)
> > 
> > diff --git a/arch/arm/include/uapi/asm/kvm.h 
> > b/arch/arm/include/uapi/asm/kvm.h
> > index 4602464ebdfb..e0c5bbec3d3d 100644
> > --- a/arch/arm/include/uapi/asm/kvm.h
> > +++ b/arch/arm/include/uapi/asm/kvm.h
> > @@ -70,6 +70,7 @@ struct kvm_regs {
> >  /* Supported Processor Types */
> >  #define KVM_ARM_TARGET_CORTEX_A15  0
> >  #define KVM_ARM_TARGET_CORTEX_A7   1
> > +#define KVM_ARM_TARGET_GENERIC_V7  KVM_ARM_TARGET_CORTEX_A7
> >  #define KVM_ARM_NUM_TARGETS2
> > 
> >  /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
> > diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
> > index 531e59f5be9c..d959f89135d6 100644
> > --- a/arch/arm/kvm/Makefile
> > +++ b/arch/arm/kvm/Makefile
> > @@ -21,7 +21,7 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/
> > 
> >  obj-y += kvm-arm.o init.o interrupts.o
> >  obj-y += handle_exit.o guest.o emulate.o reset.o
> > -obj-y += coproc.o coproc_a15.o coproc_a7.o   vgic-v3-coproc.o
> > +obj-y += coproc.o coproc_a15.o coproc_generic.o   vgic-v3-coproc.o
> >  obj-y += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o
> >  obj-y += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
> >  obj-y += $(KVM)/arm/aarch32.o
> > diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_generic.c
> > similarity index 70%
> > rename from arch/arm/kvm/coproc_a7.c
> > rename to arch/arm/kvm/coproc_generic.c
> > index 40f643e1e05c..b32a541ad7bf 100644
> > --- a/arch/arm/kvm/coproc_a7.c
> > +++ b/arch/arm/kvm/coproc_generic.c
> > @@ -15,28 +15,28 @@
> >  #include "coproc.h"
> > 
> >  /*
> > - * Cortex-A7 specific CP15 registers.
> > + * Generic CP15 registers.
> >   * CRn denotes the primary register number, but is copied to the CRm in the
> >   * user space API for 64-bit register access in line with the terminology 
> > used
> >   * in the ARM ARM.
> >   * Important: Must be sorted ascending by CRn, CRM, Op1, Op2 and with 
> > 64-bit
> >   *registers preceding 32-bit ones.
> >   */
> > -static const struct coproc_reg a7_regs[] = {
> > +static const struct coproc_reg generic_regs[] = {
> > /* SCTLR: swapped by interrupt.S. */
> > { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
> > access_vm_reg, reset_val, c1_SCTLR, 0x00C50878 },
> >  };
> > 
> > -static struct kvm_coproc_target_table a7_target_table = {
> > -   .target = KVM_ARM_TARGET_CORTEX_A7,
> > -   .table = a7_regs,
> > -   .num = ARRAY_SIZE(a7_regs),
> > +static struct kvm_coproc_target_table generic_target_table = {
> > +   .target = KVM_ARM_TARGET_GENERIC_V7,
> > +   .table = generic_regs,
> > +   .num = ARRAY_SIZE(generic_regs),
> >  };
> > 
> > -static int __init coproc_a7_init(void)
> > +static int __init coproc_generic_init(void)
> >  {
> > -   kvm_register_target_coproc_table(&a7_target_table);
> > +   kvm_register_target_coproc_table(&generic_target_table);
> > return 0;
> >  }
> > -late_initcall(coproc_a7_init);
> > +late_initcall(coproc_generic_init);
> > diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
> > index 684cf64b4033..d33a24e70f49 100644
> > --- a/arch/arm/kvm/guest.c
> > +++ b/arch/arm/kvm/guest.c
> > @@ -275,12 +275,10 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
> >  int __attribute_const__ kvm_target_cpu(void)
> >  {
> > switch (read_cpuid_part()) {
> > -   case ARM_CPU_PART_CORTEX_A7:
> > -   return KVM_ARM_TARGET_CORTEX_A7;
> > case ARM_CPU_PART_CORTEX_A15:
> > return KVM_ARM_TARGET_CORTEX_A15;
> > default:
> > -   return -EINVAL;
> > +   return KVM_ARM_TARGET_GENERIC_V7;
> > }
> >  }
> > 
> > diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
> > index eb4174f6ebbd..d6e07500bab4 100644
> > --- a/arch/arm/kvm/reset.c
> > +++ b/arch/arm/kvm/reset.c
> > @@ -43,13 +43,10 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
> > struct kvm_regs *reset_regs;
> > 
> > switch (vcpu->arch.target) {
> > -   case KVM_ARM_TARGET_CORTEX_A7:
> > -   case KVM_ARM_TARGET_CORTEX_A15:
> > +   default:
> > reset_regs = &cortexa_regs_reset;
> > 

Re: KVM works on RPi4

2019-08-26 Thread Christoffer Dall
Hi Jan,

On Sun, Jun 30, 2019 at 12:18:59PM +0200, Jan Kiszka wrote:
> On 30.06.19 11:34, Jan Kiszka wrote:
> >On 30.06.19 00:42, Marc Zyngier wrote:
> >>On Sat, 29 Jun 2019 19:09:37 +0200
> >>Jan Kiszka  wrote:
> >>>However, as the Raspberry kernel is not yet ready for 64-bit (and
> >>>upstream is not in sight), I had to use legacy 32-bit mode. And there we
> >>>stumble over the core detection. This little patch made it work, though:
> >>>
> >>>diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
> >>>index 2b8de885b2bf..01606aad73cc 100644
> >>>--- a/arch/arm/kvm/guest.c
> >>>+++ b/arch/arm/kvm/guest.c
> >>>@@ -290,6 +290,7 @@ int __attribute_const__ kvm_target_cpu(void)
> >>>  case ARM_CPU_PART_CORTEX_A7:
> >>>  return KVM_ARM_TARGET_CORTEX_A7;
> >>>  case ARM_CPU_PART_CORTEX_A15:
> >>>+    case ARM_CPU_PART_CORTEX_A72:
> >>>  return KVM_ARM_TARGET_CORTEX_A15;
> >>>  default:
> >>>  return -EINVAL;
> >>>
>>>That raises the question if this is a hack or a valid change and if there
> >>>is general interest in mapping 64-bit cores on 32-bit if they happen to
> >>>run in 32-bit mode.
> >>
> >>The real thing to do here would be to move to a generic target, much
> >>like we did on the 64bit side. Could you investigate that instead? It
> >>would also allow KVM to be used on other 32bit cores such as
> >>A12/A17/A32.
> >
> >You mean something like KVM_ARM_TARGET_GENERIC_V8? Need to study that...
> >
> 
> Hmm, looking at what KVM_ARM_TARGET_CORTEX_A7 and ..._A15 differentiates, I
> found nothing so far:
> 
> kvm_reset_vcpu:
> switch (vcpu->arch.target) {
> case KVM_ARM_TARGET_CORTEX_A7:
> case KVM_ARM_TARGET_CORTEX_A15:
> reset_regs = &cortexa_regs_reset;
> vcpu->arch.midr = read_cpuid_id();
> break;
> 
> And arch/arm/kvm/coproc_a15.c looks like a copy of coproc_a7.c, just with some
> symbols renamed.
> 
> What's the purpose of all that? Planned for something bigger but never
> implemented? From that perspective, there seems to be no need for
> arch.target and kvm_coproc_target_table at all.
> 

There was some speculation involved here, and we needed to figure out
how we would deal with implementation defined behavior, so we built this
support for each type of CPU etc.

In reality, most CPUs that we support are pretty similar and that's why
we did the generic CPU type instead.  In practice, there might be a more
light-weight approach to handling the minor differences between CPU
implementations than what we have here.


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: Reference count on pages held in secondary MMUs

2019-06-26 Thread Christoffer Dall
On Sat, Jun 22, 2019 at 03:11:36PM -0400, Andrea Arcangeli wrote:
> Hello Christoffer,
> 
> On Tue, Jun 11, 2019 at 01:51:32PM +0200, Christoffer Dall wrote:
> > Sorry, what does this mean?  Do you mean that we can either do:
> > 
> >   on_vm_s2_fault() {
> >   page = gup();
> >   map_page_in_s2_mmu();
> >   put_page();
> >   }
> > 
> >   mmu_notifier_invalidate() {
> >   unmap_page_in_s2_mmu();
> >   }
> > 
> > or
> > 
> >   on_vm_s2_fault() {
> >   page = gup();
> >   map_page_in_s2_mmu();
> >   }
> > 
> >   mmu_notifier_invalidate() {
> >   unmap_page_in_s2_mmu();
> >   put_page();
> >   }
> 
> Yes both work, refcounting always works.
> 
> > > and in fact Jerome also thinks
> > > like me that we should eventually optimize away the FOLL_GET and not
> > > take the refcount in the first place, 
> > 
> > So if I understood the above correct, the next point is that there are
> > advantages to avoiding keeping the extra reference on that page, because
> > we have problematic race conditions related to set_page_dirty(), and we
> > can reduce the problem of race conditions further by not getting a
> > reference on the page at all when going GUP as part of a KVM fault?
> 
> You could still keep the extra reference until the
> invalidate.
> 
> The set_page_dirty however if you do in the context of the secondary
> MMU fault (i.e. atomically with the mapping of the page in the
> secondary MMU, with respect of MMU notifier invalidates), it solves
> the whole problem with the ->mkwrite/mkclean and then you can keep a
> GUP long term pin fully safely already. That is a solution that always
> works and becomes guaranteed by design by the MMU notifier not to
> interfere with the _current_ writeback code in the filesystem. It also
> already provides stable pages.
> 

Ok.

> > Can you explain, or provide a pointer to, the root cause of the
> > problem with holding a reference on the page and setting it dirty?
> 
> The filesystem/VM doesn't possibly expect set_page_dirty to be called
> again after it called page_mkclean. Supposedly a wrprotect fault
> should have been generated if somebody tried to write to the page
> under writeback, so page_mkwrite should have run again before you
> could have called set_page_dirty.
> 
> Instead page_mkclean failed to get rid of the long term GUP obtained
> with FOLL_WRITE because it simply can't ask the device to release it
> without MMU notifier, so the device can later still call
> set_page_dirty despite page_mkclean already run.
> 

I see, I'm now able to link this to recent articles on LWN.

> > > but a whole different chapter is
> > > dedicated on the set_page_dirty_lock crash on MAP_SHARED mappings
> > > after long term GUP pins. So since you're looking into how to handle
> > > the page struct in the MMU notifier it's worth mentioning the issues
> > > related to set_page_dirty too.
> > 
> > Is there some background info on the "set_page_dirty_lock crash on
> > MAP_SHARED" ?  I'm having trouble following this without the background.
> 
> Jan Kara leaded the topic explained all the details on this filesystem
> issue at the LSF-MM and also last year.
> 
> Which is what makes me think there can't be too many use cases that
> require writeback to work while long term GUP pins allow some device to
> write to the pages at any given time, if nobody requires this to be
> urgently fixed.
> 
> You can find coverage on lwn and on linux-mm.
> 
> > 
> > > 
> > > To achieve the cleanest writeback fix to avoid crashes in
> > > set_page_dirty_lock on long term secondary MMU mappings that supports
> > > MMU notifier like KVM shadow MMU, the ideal is to mark the page dirty
> > > before establishing a writable the mapping in the secondary MMU like
> > > in the model below.
> > > 
> > > The below solution works also for those secondary MMU that are like a
> > > TLB and if there are two concurrent invalidates on the same page
> > > invoked at the same time (a potential problem Jerome noticed), you
> > > don't know which come out first and you would risk to call
> > > set_page_dirty twice, which would be still potentially kernel crashing
> > > (even if only a theoretical issue like O_DIRECT).
> > 
> > Why is it problematic to call set_page_dirty() twice?  I thought that at
> > worst it would only lead to writing out data to disk unnecessarily ?
> 
> According to Jerome, after the first set_page_dirty returns, wri

Re: [PATCH] KVM: arm/arm64: fix emulated ptimer irq injection

2019-06-13 Thread Christoffer Dall
On Thu, Jun 13, 2019 at 11:01:41AM +0100, Marc Zyngier wrote:
> On Mon, 03 Jun 2019 13:14:40 +0100,
> Andrew Jones  wrote:
> > 
> > On Wed, May 29, 2019 at 12:03:11PM +0200, Christoffer Dall wrote:
> > > On Wed, May 29, 2019 at 10:13:21AM +0100, Marc Zyngier wrote:
> > > > On 29/05/2019 10:08, Christoffer Dall wrote:
> > > > > On Tue, May 28, 2019 at 05:08:53PM +0100, Marc Zyngier wrote:
> > > > >> On 28/05/2019 14:40, Andrew Jones wrote:
> > > > >>> On Tue, May 28, 2019 at 03:12:15PM +0200, Christoffer Dall wrote:
> > > > >>>> On Tue, May 28, 2019 at 01:25:52PM +0100, Marc Zyngier wrote:
> > > > >>>>> On 28/05/2019 12:01, Christoffer Dall wrote:
> > > > >>>>>> On Mon, May 27, 2019 at 01:46:19PM +0200, Andrew Jones wrote:
> > > > >>>>>>> The emulated ptimer needs to track the level changes, otherwise 
> > > > >>>>>>> the
> > > > >>>>>>> the interrupt will never get deasserted, resulting in the guest 
> > > > >>>>>>> getting
> > > > >>>>>>> stuck in an interrupt storm if it enables ptimer interrupts. 
> > > > >>>>>>> This was
> > > > >>>>>>> found with kvm-unit-tests; the ptimer tests hung as soon as 
> > > > >>>>>>> interrupts
> > > > >>>>>>> were enabled. Typical Linux guests don't have a problem as they 
> > > > >>>>>>> prefer
> > > > >>>>>>> using the virtual timer.
> > > > >>>>>>>
> > > > >>>>>>> Fixes: bee038a674875 ("KVM: arm/arm64: Rework the timer code to 
> > > > >>>>>>> use a timer_map")
> > > > >>>>>>> Signed-off-by: Andrew Jones 
> > > > >>>>>>> ---
> > > > >>>>>>>  virt/kvm/arm/arch_timer.c | 7 ++-
> > > > >>>>>>>  1 file changed, 6 insertions(+), 1 deletion(-)
> > > > >>>>>>>
> > > > >>>>>>> diff --git a/virt/kvm/arm/arch_timer.c 
> > > > >>>>>>> b/virt/kvm/arm/arch_timer.c
> > > > >>>>>>> index 7fc272ecae16..9f5d8cc8b5e5 100644
> > > > >>>>>>> --- a/virt/kvm/arm/arch_timer.c
> > > > >>>>>>> +++ b/virt/kvm/arm/arch_timer.c
> > > > >>>>>>> @@ -324,10 +324,15 @@ static void kvm_timer_update_irq(struct 
> > > > >>>>>>> kvm_vcpu *vcpu, bool new_level,
> > > > >>>>>>>  static void timer_emulate(struct arch_timer_context *ctx)
> > > > >>>>>>>  {
> > > > >>>>>>> bool should_fire = kvm_timer_should_fire(ctx);
> > > > >>>>>>> +   struct timer_map map;
> > > > >>>>>>> +
> > > > >>>>>>> +   get_timer_map(ctx->vcpu, &map);
> > > > >>>>>>>  
> > > > >>>>>>> trace_kvm_timer_emulate(ctx, should_fire);
> > > > >>>>>>>  
> > > > >>>>>>> -   if (should_fire) {
> > > > >>>>>>> +   if (ctx == map.emul_ptimer && should_fire != 
> > > > >>>>>>> ctx->irq.level) {
> > > > >>>>>>> +   kvm_timer_update_irq(ctx->vcpu, 
> > > > >>>>>>> !ctx->irq.level, ctx);
> > > > >>>>>>> +   } else if (should_fire) {
> > > > >>>>>>> kvm_timer_update_irq(ctx->vcpu, true, ctx);
> > > > >>>>>>> return;
> > > > >>>>>>> }
> > > > >>>>>>
> > > > >>>>>> Hmm, this doesn't feel completely right.
> > > > >>>
> > > > >>> I won't try to argue that this is the right fix, as I haven't fully
> > > > >>> grasped how all this code works, but, afaict, this is how it worked
> > > > >>> prior to bee038a6.
> > > > >>>
> > > > >>>>>>
> 

Re: Reference count on pages held in secondary MMUs

2019-06-11 Thread Christoffer Dall
On Sun, Jun 09, 2019 at 01:40:24PM -0400, Andrea Arcangeli wrote:
> Hello,
> 
> On Sun, Jun 09, 2019 at 11:37:19AM +0200, Paolo Bonzini wrote:
> > On 09/06/19 10:18, Christoffer Dall wrote:
> > > In some sense, we are thus maintaining a 'hidden', or internal,
> > > reference to the page, which is not counted anywhere.
> > > 
> > > I am wondering if it would be equally valid to take a reference on the
> > > page, and remove that reference when unmapping via MMU notifiers, and if
> > > so, if there would be any advantages/drawbacks in doing so?
> > 
> > If I understand correctly, I think the MMU notifier would not fire if
> > you took an actual reference; the page would be pinned in memory and
> > could not be swapped out.
> 
> MMU notifiers still fires, the refcount is simple and can be dropped
> also in the mmu notifier invalidate 

Sorry, what does this mean?  Do you mean that we can either do:

  on_vm_s2_fault() {
  page = gup();
  map_page_in_s2_mmu();
  put_page();
  }

  mmu_notifier_invalidate() {
  unmap_page_in_s2_mmu();
  }

or

  on_vm_s2_fault() {
  page = gup();
  map_page_in_s2_mmu();
  }

  mmu_notifier_invalidate() {
  unmap_page_in_s2_mmu();
  put_page();
  }


> and in fact Jerome also thinks
> like me that we should eventually optimize away the FOLL_GET and not
> take the refcount in the first place, 

So if I understood the above correct, the next point is that there are
advantages to avoiding keeping the extra reference on that page, because
we have problematic race conditions related to set_page_dirty(), and we
can reduce the problem of race conditions further by not getting a
reference on the page at all when going GUP as part of a KVM fault?

Can you explain, or provide a pointer to, the root cause of the
problem with holding a reference on the page and setting it dirty?

> but a whole different chapter is
> dedicated on the set_page_dirty_lock crash on MAP_SHARED mappings
> after long term GUP pins. So since you're looking into how to handle
> the page struct in the MMU notifier it's worth mentioning the issues
> related to set_page_dirty too.

Is there some background info on the "set_page_dirty_lock crash on
MAP_SHARED" ?  I'm having trouble following this without the background.

> 
> To achieve the cleanest writeback fix to avoid crashes in
> set_page_dirty_lock on long term secondary MMU mappings that supports
> MMU notifier like KVM shadow MMU, the ideal is to mark the page dirty
> before establishing a writable the mapping in the secondary MMU like
> in the model below.
> 
> The below solution works also for those secondary MMU that are like a
> TLB and if there are two concurrent invalidates on the same page
> invoked at the same time (a potential problem Jerome noticed), you
> don't know which come out first and you would risk to call
> set_page_dirty twice, which would be still potentially kernel crashing
> (even if only a theoretical issue like O_DIRECT).

Why is it problematic to call set_page_dirty() twice?  I thought that at
worst it would only lead to writing out data to disk unnecessarily ?

I am also not familiar with a problem related to KVM and O_DIRECT, so
I'm having trouble keeping up here as well :(


> So the below model
> will solve that and it's also valid for KVM/vhost accelleration,
> despite KVM can figure out how to issue a single set_page_dirty call
> for each spte that gets invalidated by concurrent invalidates on the
> same page because it has shadow pagetables and it's not just a TLB.
> 
>   access = FOLL_WRITE|FOLL_GET
> 
> repeat:
>   page = gup(access)
>   put_page(page)
> 
>   spin_lock(mmu_notifier_lock);
>   if (race with invalidate) {
> spin_unlock..
> goto repeat;
>   }
>   if (access == FOLL_WRITE)
> set_page_dirty(page)
>   establish writable mapping in secondary MMU on page
>   spin_unlock
> 
> The above solves the crash in set_page_dirty_lock without having to
> modify any filesystem, it should work theoretically safer than the
> O_DIRECT short term GUP pin.

That is not exactly how we do things today on the arm64 side.  We do
something that looks like:

  /*
   * user_mem_abort is our function for a secondary MMU fault that
   * resolves to a memslot.
   */
  user_mem_abort() {
   page = gup(access, &writable);
   spin_lock(&kvm->mmu_lock);
  if (mmu_notifier_retry(kvm, mmu_seq))
  goto out; /* run the VM again and see what happens */

  if (writable)
  kvm_set_pfn_dirty(page_to_pfn(page));
  stage2_set_pte(); /* establish_writable mapping in secondary MMU on page 
*/

  out:
   spin_unlock(&kvm->mmu_lock);
  put_page(page);
  }

Should we rework this to address the race you are refering to, and are
other architectures 

Re: Reference count on pages held in secondary MMUs

2019-06-11 Thread Christoffer Dall
On Sun, Jun 09, 2019 at 11:37:19AM +0200, Paolo Bonzini wrote:
> On 09/06/19 10:18, Christoffer Dall wrote:
> > In some sense, we are thus maintaining a 'hidden', or internal,
> > reference to the page, which is not counted anywhere.
> > 
> > I am wondering if it would be equally valid to take a reference on the
> > page, and remove that reference when unmapping via MMU notifiers, and if
> > so, if there would be any advantages/drawbacks in doing so?
> 
> If I understand correctly, I think the MMU notifier would not fire if
> you took an actual reference; the page would be pinned in memory and
> could not be swapped out.
> 

That was my understanding too, but I can't find the code path that would
support this theory.

The closest thing I could find was is_page_cache_freeable(), and as far
as I'm able to understand that code, that is called (via pageout()) later in
shrink_page_list() than try_to_unmap() which fires the MMU notifiers
through the rmap code.

It is entirely possible that I'm looking at the wrong place and missing
something overall though?


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Reference count on pages held in secondary MMUs

2019-06-09 Thread Christoffer Dall
Hi,

I have been looking at how we deal with page_count(page) on pages held
in stage 2 page tables on KVM/arm64.

What we do currently is to drop the reference on the page we get from
get_user_pages() once the page is inserted into our stage 2 page table,
typically leaving page_count(page) == page_mapcount(page) == 1 which
represents the userspace stage 1 mapping of the page,
and we rely on MMU notifiers to remove the stage 2 mapping if the
corresponding stage 1 mapping is unmapped.

I believe this is analogous to what other architectures do?

In some sense, we are thus maintaining a 'hidden', or internal,
reference to the page, which is not counted anywhere.

I am wondering if it would be equally valid to take a reference on the
page, and remove that reference when unmapping via MMU notifiers, and if
so, if there would be any advantages/drawbacks in doing so?


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 0/4] KVM: Unify mmu_memory_cache functionality across architectures

2019-06-04 Thread Christoffer Dall
We currently have duplicated functionality for the mmu_memory_cache used
to pre-allocate memory for the page table manipulation code which cannot
allocate memory while holding spinlocks.  This functionality is
duplicated across x86, arm/arm64, and mips.
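
(To make the calling convention concrete: below is a rough sketch of the
pattern the common code ends up providing, using the kvm_-prefixed names
introduced later in the series. It is illustrative only; the local variable
names are made up, and the real signatures are in the individual patches.)

	/* Fill the per-vcpu cache while sleeping/allocating is still legal. */
	r = kvm_mmu_topup_memcache_page(&vcpu->arch.mmu_page_cache, min);
	if (r)
		return r;

	spin_lock(&kvm->mmu_lock);
	/*
	 * Page table code can now take pages from the cache without
	 * sleeping and without having to handle allocation failure.
	 */
	new_pte_page = kvm_mmu_memcache_alloc(&vcpu->arch.mmu_page_cache);
	...
	spin_unlock(&kvm->mmu_lock);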

There was recently a debate about modifying the arm code to be more in
line with the x86 code, and some discussions around changing the page
flags used for allocation. This series should make it easier to take a
uniform approach across architectures.

While there's not a huge amount of code sharing, we come out with a net
gain.

Only tested on arm/arm64, and only compile-tested on x86 and mips.

Changes since v2:
 - Simplified kalloc flag definitions as per Paolo's review comment.

Changes since v1:
 - Split out rename from initial x86 patch to have separate patches to
   move the logic to common code and to rename.
 - Introduce KVM_ARCH_WANT_MMU_MEMCACHE to avoid compile breakage on
   architectures that don't use this functionality.
 - Rename KVM_NR_MEM_OBJS to KVM_MMU_NR_MEMCACHE_OBJS

Christoffer Dall (4):
  KVM: x86: Move mmu_memory_cache functions to common code
  KVM: x86: Rename mmu_memory_cache to kvm_mmu_memcache
  KVM: arm/arm64: Move to common kvm_mmu_memcache infrastructure
  KVM: mips: Move to common kvm_mmu_memcache infrastructure

 arch/arm/include/asm/kvm_host.h  | 13 +---
 arch/arm/include/asm/kvm_mmu.h   |  2 +-
 arch/arm/include/asm/kvm_types.h | 11 
 arch/arm64/include/asm/kvm_host.h| 13 +---
 arch/arm64/include/asm/kvm_mmu.h |  2 +-
 arch/arm64/include/asm/kvm_types.h   | 12 
 arch/mips/include/asm/kvm_host.h | 15 +
 arch/mips/include/asm/kvm_types.h| 11 
 arch/mips/kvm/mips.c |  2 +-
 arch/mips/kvm/mmu.c  | 54 +++-
 arch/powerpc/include/asm/kvm_types.h |  5 ++
 arch/s390/include/asm/kvm_types.h|  5 ++
 arch/x86/include/asm/kvm_host.h  | 17 +
 arch/x86/include/asm/kvm_types.h | 11 
 arch/x86/kvm/mmu.c   | 97 ++--
 arch/x86/kvm/paging_tmpl.h   |  4 +-
 include/linux/kvm_host.h | 11 
 include/linux/kvm_types.h| 13 
 virt/kvm/arm/arm.c   |  2 +-
 virt/kvm/arm/mmu.c   | 68 +--
 virt/kvm/kvm_main.c  | 60 +
 21 files changed, 198 insertions(+), 230 deletions(-)
 create mode 100644 arch/arm/include/asm/kvm_types.h
 create mode 100644 arch/arm64/include/asm/kvm_types.h
 create mode 100644 arch/mips/include/asm/kvm_types.h
 create mode 100644 arch/powerpc/include/asm/kvm_types.h
 create mode 100644 arch/s390/include/asm/kvm_types.h
 create mode 100644 arch/x86/include/asm/kvm_types.h

-- 
2.18.0

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 2/4] KVM: x86: Rename mmu_memory_cache to kvm_mmu_memcache

2019-06-04 Thread Christoffer Dall
As we have moved the mmu memory cache definitions and functions to
common code, they are exported as symbols to the rest of the kernel.

Let's rename the functions and data types to have a kvm_ prefix to make
it clear where these functions belong and take this chance to rename
memory_cache to memcache to avoid overly long lines.

This is a bit tedious on the callsites but ends up looking more
palatable.

Signed-off-by: Christoffer Dall 
---
 arch/x86/include/asm/kvm_host.h  |  6 ++---
 arch/x86/include/asm/kvm_types.h |  4 ++--
 arch/x86/kvm/mmu.c   | 38 
 arch/x86/kvm/paging_tmpl.h   |  4 ++--
 include/linux/kvm_host.h | 14 ++--
 include/linux/kvm_types.h|  6 ++---
 virt/kvm/kvm_main.c  | 14 ++--
 7 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 85d54aff72ec..908e07fb2368 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -593,9 +593,9 @@ struct kvm_vcpu_arch {
 */
struct kvm_mmu *walk_mmu;
 
-   struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
-   struct kvm_mmu_memory_cache mmu_page_cache;
-   struct kvm_mmu_memory_cache mmu_page_header_cache;
+   struct kvm_mmu_memcache mmu_pte_list_desc_cache;
+   struct kvm_mmu_memcache mmu_page_cache;
+   struct kvm_mmu_memcache mmu_page_header_cache;
 
/*
 * QEMU userspace and the guest each have their own FPU state.
diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h
index ef35a627f69e..d2da445502ba 100644
--- a/arch/x86/include/asm/kvm_types.h
+++ b/arch/x86/include/asm/kvm_types.h
@@ -2,9 +2,9 @@
 #ifndef _ASM_X86_KVM_TYPES_H
 #define _ASM_X86_KVM_TYPES_H
 
-#define KVM_ARCH_WANT_MMU_MEMORY_CACHE
+#define KVM_ARCH_WANT_MMU_MEMCACHE
 
-#define KVM_NR_MEM_OBJS 40
+#define KVM_MMU_NR_MEMCACHE_OBJS 40
 
 #define KVM_MMU_CACHE_GFP_FLAGS   __GFP_ACCOUNT
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 79cf345e5d7c..0cfa219b186a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -971,35 +971,35 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu 
*vcpu)
local_irq_enable();
 }
 
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+static int kvm_mmu_topup_memcaches(struct kvm_vcpu *vcpu)
 {
int r;
 
-   r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+   r = kvm_mmu_topup_memcache(&vcpu->arch.mmu_pte_list_desc_cache,
   pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
if (r)
goto out;
-   r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+   r = kvm_mmu_topup_memcache_page(&vcpu->arch.mmu_page_cache, 8);
if (r)
goto out;
-   r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+   r = kvm_mmu_topup_memcache(&vcpu->arch.mmu_page_header_cache,
   mmu_page_header_cache, 4);
 out:
return r;
 }
 
-static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+static void kvm_mmu_free_memcaches(struct kvm_vcpu *vcpu)
 {
-   mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+   kvm_mmu_free_memcache(&vcpu->arch.mmu_pte_list_desc_cache,
pte_list_desc_cache);
-   mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
-   mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+   kvm_mmu_free_memcache_page(&vcpu->arch.mmu_page_cache);
+   kvm_mmu_free_memcache(&vcpu->arch.mmu_page_header_cache,
mmu_page_header_cache);
 }
 
 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 {
-   return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
+   return kvm_mmu_memcache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
 }
 
 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -1319,10 +1319,10 @@ static struct kvm_rmap_head *gfn_to_rmap(struct kvm 
*kvm, gfn_t gfn,
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
 {
-   struct kvm_mmu_memory_cache *cache;
+   struct kvm_mmu_memcache *cache;
 
cache = &vcpu->arch.mmu_pte_list_desc_cache;
-   return mmu_memory_cache_free_objects(cache);
+   return kvm_mmu_memcache_free_objects(cache);
 }
 
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -2005,10 +2005,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct 
kvm_vcpu *vcpu, int direct
 {
struct kvm_mmu_page *sp;
 
-   sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
-   sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+   sp = kvm_mmu_memcache_alloc(&vcpu->arch.mmu_page_header_cache);
+   sp->spt = kvm_mmu_memcache_alloc(&vcpu->arch.mmu_page_cache);
if (!direct)
-   sp->gfns = mmu_memory_cache_alloc(&vcpu->ar

[PATCH v3 4/4] KVM: mips: Move to common kvm_mmu_memcache infrastructure

2019-06-04 Thread Christoffer Dall
Now that we have a common infrastructure for doing MMU cache
allocations, use this for mips as well.

Signed-off-by: Christoffer Dall 
---
 arch/mips/include/asm/kvm_host.h  | 15 ++---
 arch/mips/include/asm/kvm_types.h |  6 
 arch/mips/kvm/mips.c  |  2 +-
 arch/mips/kvm/mmu.c   | 54 ++-
 4 files changed, 19 insertions(+), 58 deletions(-)

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 41204a49cf95..418c941f1382 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -293,17 +293,6 @@ struct kvm_mips_tlb {
long tlb_lo[2];
 };
 
-#define KVM_NR_MEM_OBJS 4
-
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_mmu_memory_cache {
-   int nobjs;
-   void *objects[KVM_NR_MEM_OBJS];
-};
-
 #define KVM_MIPS_AUX_FPU   0x1
 #define KVM_MIPS_AUX_MSA   0x2
 
@@ -378,7 +367,7 @@ struct kvm_vcpu_arch {
unsigned int last_user_gasid;
 
/* Cache some mmu pages needed inside spinlock regions */
-   struct kvm_mmu_memory_cache mmu_page_cache;
+   struct kvm_mmu_memcache mmu_page_cache;
 
 #ifdef CONFIG_KVM_MIPS_VZ
/* vcpu's vzguestid is different on each host cpu in an smp system */
@@ -915,7 +904,7 @@ void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush 
flags);
 bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
 int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
 pgd_t *kvm_pgd_alloc(void);
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_memcaches(struct kvm_vcpu *vcpu);
 void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr,
  bool user);
 void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu);
diff --git a/arch/mips/include/asm/kvm_types.h 
b/arch/mips/include/asm/kvm_types.h
index 5efeb32a5926..f821c659a5b1 100644
--- a/arch/mips/include/asm/kvm_types.h
+++ b/arch/mips/include/asm/kvm_types.h
@@ -2,4 +2,10 @@
 #ifndef _ASM_MIPS_KVM_TYPES_H
 #define _ASM_MIPS_KVM_TYPES_H
 
+#define KVM_ARCH_WANT_MMU_MEMCACHE
+
+#define KVM_MMU_NR_MEMCACHE_OBJS 4
+
+#define KVM_MMU_CACHE_GFP_FLAGS 0
+
 #endif /* _ASM_MIPS_KVM_TYPES_H */
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 6d0517ac18e5..2737f837cd9f 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -425,7 +425,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 
kvm_mips_dump_stats(vcpu);
 
-   kvm_mmu_free_memory_caches(vcpu);
+   kvm_mmu_free_memcaches(vcpu);
kfree(vcpu->arch.guest_ebase);
kfree(vcpu->arch.kseg0_commpage);
kfree(vcpu);
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 97e538a8c1be..aed5284d642e 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -25,41 +25,9 @@
 #define KVM_MMU_CACHE_MIN_PAGES 2
 #endif
 
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- int min, int max)
+void kvm_mmu_free_memcaches(struct kvm_vcpu *vcpu)
 {
-   void *page;
-
-   BUG_ON(max > KVM_NR_MEM_OBJS);
-   if (cache->nobjs >= min)
-   return 0;
-   while (cache->nobjs < max) {
-   page = (void *)__get_free_page(GFP_KERNEL);
-   if (!page)
-   return -ENOMEM;
-   cache->objects[cache->nobjs++] = page;
-   }
-   return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
-   while (mc->nobjs)
-   free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
-{
-   void *p;
-
-   BUG_ON(!mc || !mc->nobjs);
-   p = mc->objects[--mc->nobjs];
-   return p;
-}
-
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
-   mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+   kvm_mmu_free_memcache_page(&vcpu->arch.mmu_page_cache);
 }
 
 /**
@@ -133,7 +101,7 @@ pgd_t *kvm_pgd_alloc(void)
  * NULL if a page table doesn't exist for @addr and !@cache.
  * NULL if a page table allocation failed.
  */
-static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache,
+static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memcache *cache,
unsigned long addr)
 {
pud_t *pud;
@@ -151,7 +119,7 @@ static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct 
kvm_mmu_memory_cache *cache,
 
if (!cache)
return NULL;
-   new_pmd = mmu_memory_cache_alloc(cache);
+   new_pmd = kvm_mmu_memcache_alloc(cache);
pmd_init((unsigned long)new_pmd,
 (unsigned long)invalid_pte_table);
pud_popul

[PATCH v3 1/4] KVM: x86: Move mmu_memory_cache functions to common code

2019-06-04 Thread Christoffer Dall
We are currently duplicating the mmu memory cache functionality quite
heavily between the architectures that support KVM.  As a first step,
move the x86 implementation (which seems to have the most recently
maintained version of the mmu memory cache) to common code.

We introduce an arch-specific kvm_types.h which can be used to
define the architecture-specific GFP flags for allocating memory to the
memory cache, and to specify how many objects are required in the memory
cache.  These are the two points where the current implementations
diverge across architectures.  Since kvm_host.h defines structures with
fields of the memcache object, we define the memcache structure in
kvm_types.h, and we include the architecture-specific kvm_types.h to
know the size of object in kvm_host.h.

We only define the functions and data types if
KVM_ARCH_WANT_MMU_MEMORY_CACHE is defined, because not all architectures
require the mmu memory cache.
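
As a compact illustration of the opt-in contract described above (a
summary sketch, not the patch text itself -- the real files follow in
the diff below):

  /* arch/<arch>/include/asm/kvm_types.h: what an architecture provides */
  #define KVM_ARCH_WANT_MMU_MEMORY_CACHE
  #define KVM_NR_MEM_OBJS         40              /* cache depth, arch-specific */
  #define KVM_MMU_CACHE_GFP_FLAGS __GFP_ACCOUNT   /* extra GFP flags, may be 0 */

  /* include/linux/kvm_types.h: common code keys off the opt-in define */
  #ifdef KVM_ARCH_WANT_MMU_MEMORY_CACHE
  struct kvm_mmu_memory_cache {
          int nobjs;
          void *objects[KVM_NR_MEM_OBJS];
  };
  #endif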

Signed-off-by: Christoffer Dall 
---
 arch/arm/include/asm/kvm_types.h |  5 +++
 arch/arm64/include/asm/kvm_types.h   |  6 +++
 arch/mips/include/asm/kvm_types.h|  5 +++
 arch/powerpc/include/asm/kvm_types.h |  5 +++
 arch/s390/include/asm/kvm_types.h|  5 +++
 arch/x86/include/asm/kvm_host.h  | 11 -
 arch/x86/include/asm/kvm_types.h | 11 +
 arch/x86/kvm/mmu.c   | 59 ---
 include/linux/kvm_host.h | 11 +
 include/linux/kvm_types.h| 13 ++
 virt/kvm/kvm_main.c  | 60 
 11 files changed, 121 insertions(+), 70 deletions(-)
 create mode 100644 arch/arm/include/asm/kvm_types.h
 create mode 100644 arch/arm64/include/asm/kvm_types.h
 create mode 100644 arch/mips/include/asm/kvm_types.h
 create mode 100644 arch/powerpc/include/asm/kvm_types.h
 create mode 100644 arch/s390/include/asm/kvm_types.h
 create mode 100644 arch/x86/include/asm/kvm_types.h

diff --git a/arch/arm/include/asm/kvm_types.h b/arch/arm/include/asm/kvm_types.h
new file mode 100644
index ..bc389f82e88d
--- /dev/null
+++ b/arch/arm/include/asm/kvm_types.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ARM_KVM_TYPES_H
+#define _ASM_ARM_KVM_TYPES_H
+
+#endif /* _ASM_ARM_KVM_TYPES_H */
diff --git a/arch/arm64/include/asm/kvm_types.h 
b/arch/arm64/include/asm/kvm_types.h
new file mode 100644
index ..d0987007d581
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_types.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ARM64_KVM_TYPES_H
+#define _ASM_ARM64_KVM_TYPES_H
+
+#endif /* _ASM_ARM64_KVM_TYPES_H */
+
diff --git a/arch/mips/include/asm/kvm_types.h 
b/arch/mips/include/asm/kvm_types.h
new file mode 100644
index ..5efeb32a5926
--- /dev/null
+++ b/arch/mips/include/asm/kvm_types.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_MIPS_KVM_TYPES_H
+#define _ASM_MIPS_KVM_TYPES_H
+
+#endif /* _ASM_MIPS_KVM_TYPES_H */
diff --git a/arch/powerpc/include/asm/kvm_types.h 
b/arch/powerpc/include/asm/kvm_types.h
new file mode 100644
index ..f627eceaa314
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_types.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_KVM_TYPES_H
+#define _ASM_POWERPC_KVM_TYPES_H
+
+#endif /* _ASM_POWERPC_KVM_TYPES_H */
diff --git a/arch/s390/include/asm/kvm_types.h 
b/arch/s390/include/asm/kvm_types.h
new file mode 100644
index ..b66a81f8a354
--- /dev/null
+++ b/arch/s390/include/asm/kvm_types.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_KVM_TYPES_H
+#define _ASM_S390_KVM_TYPES_H
+
+#endif /* _ASM_S390_KVM_TYPES_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 450d69a1e6fa..85d54aff72ec 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -180,8 +180,6 @@ enum {
 
 #include 
 
-#define KVM_NR_MEM_OBJS 40
-
 #define KVM_NR_DB_REGS 4
 
 #define DR6_BD (1 << 13)
@@ -239,15 +237,6 @@ enum {
 
 struct kvm_kernel_irq_routing_entry;
 
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_mmu_memory_cache {
-   int nobjs;
-   void *objects[KVM_NR_MEM_OBJS];
-};
-
 /*
  * the pages used as guest page table on soft mmu are tracked by
  * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h
new file mode 100644
index ..ef35a627f69e
--- /dev/null
+++ b/arch/x86/include/asm/kvm_types.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_TYPES_H
+#define _ASM_X86_KVM_TYPES_H
+
+#define KVM_ARCH_WANT_MMU_MEMORY_CACHE
+
+#define KVM_NR_MEM_OBJS 40
+
+#define KVM_MMU_CACHE_GFP_FLAGS __GFP_ACCOUNT
+
+#endif /* _ASM_X86_KVM_TYPES_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x

[PATCH v3 3/4] KVM: arm/arm64: Move to common kvm_mmu_memcache infrastructure

2019-06-04 Thread Christoffer Dall
Now that we have a common mmu memcache implementation, we can reuse
this for arm and arm64.

The common implementation has a slightly different behavior when
allocating objects under high memory pressure; whereas the current
arm/arm64 implementation will give up and return -ENOMEM if the full
size of the cache cannot be allocated during topup, the common
implementation is happy with any allocation between min and max.  There
should be no architecture-specific requirement for doing it one way or
the other and it's in fact better to enforce a cross-architecture KVM
policy on this behavior.
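
To make that concrete, the topup path in the common code behaves roughly
like the sketch below (illustrative only; names and GFP flags approximate
the series, the exact code lives in the common patch):

  static int kvm_mmu_topup_memcache_page(struct kvm_mmu_memcache *mc, int min)
  {
          void *page;

          if (mc->nobjs >= min)
                  return 0;
          while (mc->nobjs < KVM_MMU_NR_MEMCACHE_OBJS) {
                  page = (void *)__get_free_page(GFP_KERNEL | KVM_MMU_CACHE_GFP_FLAGS);
                  if (!page)
                          /* partial fill is fine as long as 'min' is satisfied */
                          return mc->nobjs >= min ? 0 : -ENOMEM;
                  mc->objects[mc->nobjs++] = page;
          }
          return 0;
  }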

Signed-off-by: Christoffer Dall 
---
 arch/arm/include/asm/kvm_host.h| 13 +-
 arch/arm/include/asm/kvm_mmu.h |  2 +-
 arch/arm/include/asm/kvm_types.h   |  6 +++
 arch/arm64/include/asm/kvm_host.h  | 13 +-
 arch/arm64/include/asm/kvm_mmu.h   |  2 +-
 arch/arm64/include/asm/kvm_types.h |  6 +++
 virt/kvm/arm/arm.c |  2 +-
 virt/kvm/arm/mmu.c | 68 --
 8 files changed, 34 insertions(+), 78 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 075e1921fdd9..6f3f4ab65c8e 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -91,17 +91,6 @@ struct kvm_arch {
u32 psci_version;
 };
 
-#define KVM_NR_MEM_OBJS 40
-
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_mmu_memory_cache {
-   int nobjs;
-   void *objects[KVM_NR_MEM_OBJS];
-};
-
 struct kvm_vcpu_fault_info {
u32 hsr;/* Hyp Syndrome Register */
u32 hxfar;  /* Hyp Data/Inst. Fault Address Register */
@@ -210,7 +199,7 @@ struct kvm_vcpu_arch {
struct kvm_decode mmio_decode;
 
/* Cache some mmu pages needed inside spinlock regions */
-   struct kvm_mmu_memory_cache mmu_page_cache;
+   struct kvm_mmu_memcache mmu_page_cache;
 
struct vcpu_reset_state reset_state;
 
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 31de4ab93005..4a61159fadfb 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -71,7 +71,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t 
guest_ipa,
 
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_memcaches(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
diff --git a/arch/arm/include/asm/kvm_types.h b/arch/arm/include/asm/kvm_types.h
index bc389f82e88d..9f944872a746 100644
--- a/arch/arm/include/asm/kvm_types.h
+++ b/arch/arm/include/asm/kvm_types.h
@@ -2,4 +2,10 @@
 #ifndef _ASM_ARM_KVM_TYPES_H
 #define _ASM_ARM_KVM_TYPES_H
 
+#define KVM_ARCH_WANT_MMU_MEMCACHE
+
+#define KVM_MMU_NR_MEMCACHE_OBJS 40
+
+#define KVM_MMU_CACHE_GFP_FLAGS __GFP_ZERO
+
 #endif /* _ASM_ARM_KVM_TYPES_H */
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 2a8d3f8ca22c..5f6bf47e165c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -96,17 +96,6 @@ struct kvm_arch {
u32 psci_version;
 };
 
-#define KVM_NR_MEM_OBJS 40
-
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_mmu_memory_cache {
-   int nobjs;
-   void *objects[KVM_NR_MEM_OBJS];
-};
-
 struct kvm_vcpu_fault_info {
u32 esr_el2;/* Hyp Syndrom Register */
u64 far_el2;/* Hyp Fault Address Register */
@@ -331,7 +320,7 @@ struct kvm_vcpu_arch {
struct kvm_decode mmio_decode;
 
/* Cache some mmu pages needed inside spinlock regions */
-   struct kvm_mmu_memory_cache mmu_page_cache;
+   struct kvm_mmu_memcache mmu_page_cache;
 
/* Target CPU and feature flags */
int target;
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index ebeefcf835e8..0b686498e64a 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -171,7 +171,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t 
guest_ipa,
 
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_memcaches(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
diff --git a/arch/arm64/include/asm/kvm_types.h 
b/arch/arm64/include/asm/kvm_types.h
index d0987007d581..e28cf83b782b 100644
--- a/arch/arm64/include/asm/kvm_types.h
+++ b/arch/arm64/include/asm/kvm_types.h
@@ -2,5 +2,11 @@
 #ifndef _ASM_ARM64_KVM_TYPES_H
 #define _ASM_ARM64_KVM_TYPES_H
 
+#define KVM_ARCH_WANT_MMU_MEMCACHE
+
+#define

Re: [PATCH] KVM: arm/arm64: fix emulated ptimer irq injection

2019-05-29 Thread Christoffer Dall
On Wed, May 29, 2019 at 10:13:21AM +0100, Marc Zyngier wrote:
> On 29/05/2019 10:08, Christoffer Dall wrote:
> > On Tue, May 28, 2019 at 05:08:53PM +0100, Marc Zyngier wrote:
> >> On 28/05/2019 14:40, Andrew Jones wrote:
> >>> On Tue, May 28, 2019 at 03:12:15PM +0200, Christoffer Dall wrote:
> >>>> On Tue, May 28, 2019 at 01:25:52PM +0100, Marc Zyngier wrote:
> >>>>> On 28/05/2019 12:01, Christoffer Dall wrote:
> >>>>>> On Mon, May 27, 2019 at 01:46:19PM +0200, Andrew Jones wrote:
> >>>>>>> The emulated ptimer needs to track the level changes, otherwise the
> >>>>>>> the interrupt will never get deasserted, resulting in the guest 
> >>>>>>> getting
> >>>>>>> stuck in an interrupt storm if it enables ptimer interrupts. This was
> >>>>>>> found with kvm-unit-tests; the ptimer tests hung as soon as interrupts
> >>>>>>> were enabled. Typical Linux guests don't have a problem as they prefer
> >>>>>>> using the virtual timer.
> >>>>>>>
> >>>>>>> Fixes: bee038a674875 ("KVM: arm/arm64: Rework the timer code to use a 
> >>>>>>> timer_map")
> >>>>>>> Signed-off-by: Andrew Jones 
> >>>>>>> ---
> >>>>>>>  virt/kvm/arm/arch_timer.c | 7 ++-
> >>>>>>>  1 file changed, 6 insertions(+), 1 deletion(-)
> >>>>>>>
> >>>>>>> diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
> >>>>>>> index 7fc272ecae16..9f5d8cc8b5e5 100644
> >>>>>>> --- a/virt/kvm/arm/arch_timer.c
> >>>>>>> +++ b/virt/kvm/arm/arch_timer.c
> >>>>>>> @@ -324,10 +324,15 @@ static void kvm_timer_update_irq(struct 
> >>>>>>> kvm_vcpu *vcpu, bool new_level,
> >>>>>>>  static void timer_emulate(struct arch_timer_context *ctx)
> >>>>>>>  {
> >>>>>>>   bool should_fire = kvm_timer_should_fire(ctx);
> >>>>>>> + struct timer_map map;
> >>>>>>> +
> >>>>>>> + get_timer_map(ctx->vcpu, &map);
> >>>>>>>  
> >>>>>>>   trace_kvm_timer_emulate(ctx, should_fire);
> >>>>>>>  
> >>>>>>> - if (should_fire) {
> >>>>>>> + if (ctx == map.emul_ptimer && should_fire != ctx->irq.level) {
> >>>>>>> + kvm_timer_update_irq(ctx->vcpu, !ctx->irq.level, ctx);
> >>>>>>> + } else if (should_fire) {
> >>>>>>>   kvm_timer_update_irq(ctx->vcpu, true, ctx);
> >>>>>>>   return;
> >>>>>>>   }
> >>>>>>
> >>>>>> Hmm, this doesn't feel completely right.
> >>>
> >>> I won't try to argue that this is the right fix, as I haven't fully
> >>> grasped how all this code works, but, afaict, this is how it worked
> >>> prior to bee038a6.
> >>>
> >>>>>>
> >>>>>> Lowering the line of an emulated timer should only ever happen when the
> >>>>>> guest (or user space) writes to one of the system registers for that
> >>>>>> timer, which should be trapped and that should cause an update of the
> >>>>>> line.
> >>>>>>
> >>>>>> Are we missing a call to kvm_timer_update_irq() from
> >>>>>> kvm_arm_timer_set_reg() ?
> >>>>>
> >>>>> Which is exactly what we removed in 6bc210003dff, for good reasons.
> >>>>>
> >>>>
> >>>> Ah well, I can be wrong twice.  Or even three times.
> >>>>
> >>>>> Looking at kvm_arm_timer_write_sysreg(), we end-up calling 
> >>>>> kvm_timer_vcpu_load, but not updating the irq status.
> >>>>>
> >>>>> How about something like this instead (untested):
> >>>>>
> >>>>> diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
> >>>>> index 7fc272ecae16..6a418dcc5433 100644
> >>>>> --- a/virt/kvm/arm/arch_timer.c
> >>>>> +++ b/virt/kvm/arm/arch_timer.c
> >>>>> @@ -882,10 +882,14 @@ void kvm_a

Re: [PATCH] KVM: arm/arm64: fix emulated ptimer irq injection

2019-05-29 Thread Christoffer Dall
On Tue, May 28, 2019 at 05:08:53PM +0100, Marc Zyngier wrote:
> On 28/05/2019 14:40, Andrew Jones wrote:
> > On Tue, May 28, 2019 at 03:12:15PM +0200, Christoffer Dall wrote:
> >> On Tue, May 28, 2019 at 01:25:52PM +0100, Marc Zyngier wrote:
> >>> On 28/05/2019 12:01, Christoffer Dall wrote:
> >>>> On Mon, May 27, 2019 at 01:46:19PM +0200, Andrew Jones wrote:
> >>>>> The emulated ptimer needs to track the level changes, otherwise the
> >>>>> the interrupt will never get deasserted, resulting in the guest getting
> >>>>> stuck in an interrupt storm if it enables ptimer interrupts. This was
> >>>>> found with kvm-unit-tests; the ptimer tests hung as soon as interrupts
> >>>>> were enabled. Typical Linux guests don't have a problem as they prefer
> >>>>> using the virtual timer.
> >>>>>
> >>>>> Fixes: bee038a674875 ("KVM: arm/arm64: Rework the timer code to use a 
> >>>>> timer_map")
> >>>>> Signed-off-by: Andrew Jones 
> >>>>> ---
> >>>>>  virt/kvm/arm/arch_timer.c | 7 ++-
> >>>>>  1 file changed, 6 insertions(+), 1 deletion(-)
> >>>>>
> >>>>> diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
> >>>>> index 7fc272ecae16..9f5d8cc8b5e5 100644
> >>>>> --- a/virt/kvm/arm/arch_timer.c
> >>>>> +++ b/virt/kvm/arm/arch_timer.c
> >>>>> @@ -324,10 +324,15 @@ static void kvm_timer_update_irq(struct kvm_vcpu 
> >>>>> *vcpu, bool new_level,
> >>>>>  static void timer_emulate(struct arch_timer_context *ctx)
> >>>>>  {
> >>>>> bool should_fire = kvm_timer_should_fire(ctx);
> >>>>> +   struct timer_map map;
> >>>>> +
> >>>>> +   get_timer_map(ctx->vcpu, &map);
> >>>>>  
> >>>>> trace_kvm_timer_emulate(ctx, should_fire);
> >>>>>  
> >>>>> -   if (should_fire) {
> >>>>> +   if (ctx == map.emul_ptimer && should_fire != ctx->irq.level) {
> >>>>> +   kvm_timer_update_irq(ctx->vcpu, !ctx->irq.level, ctx);
> >>>>> +   } else if (should_fire) {
> >>>>> kvm_timer_update_irq(ctx->vcpu, true, ctx);
> >>>>> return;
> >>>>> }
> >>>>
> >>>> Hmm, this doesn't feel completely right.
> > 
> > I won't try to argue that this is the right fix, as I haven't fully
> > grasped how all this code works, but, afaict, this is how it worked
> > prior to bee038a6.
> > 
> >>>>
> >>>> Lowering the line of an emulated timer should only ever happen when the
> >>>> guest (or user space) writes to one of the system registers for that
> >>>> timer, which should be trapped and that should cause an update of the
> >>>> line.
> >>>>
> >>>> Are we missing a call to kvm_timer_update_irq() from
> >>>> kvm_arm_timer_set_reg() ?
> >>>
> >>> Which is exactly what we removed in 6bc210003dff, for good reasons.
> >>>
> >>
> >> Ah well, I can be wrong twice.  Or even three times.
> >>
> >>> Looking at kvm_arm_timer_write_sysreg(), we end-up calling 
> >>> kvm_timer_vcpu_load, but not updating the irq status.
> >>>
> >>> How about something like this instead (untested):
> >>>
> >>> diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
> >>> index 7fc272ecae16..6a418dcc5433 100644
> >>> --- a/virt/kvm/arm/arch_timer.c
> >>> +++ b/virt/kvm/arm/arch_timer.c
> >>> @@ -882,10 +882,14 @@ void kvm_arm_timer_write_sysreg(struct kvm_vcpu 
> >>> *vcpu,
> >>>   enum kvm_arch_timer_regs treg,
> >>>   u64 val)
> >>>  {
> >>> + struct arch_timer_context *timer;
> >>> +
> >>>   preempt_disable();
> >>>   kvm_timer_vcpu_put(vcpu);
> >>>  
> >>> - kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
> >>> + timer = vcpu_get_timer(vcpu, tmr);
> >>> + kvm_arm_timer_write(vcpu, timer, treg, val);
> >>> + kvm_timer_update_irq(vcpu, kvm_timer_should_fire(timer), timer);
> >>>  
&g

Re: [PATCH] KVM: arm/arm64: fix emulated ptimer irq injection

2019-05-28 Thread Christoffer Dall
On Tue, May 28, 2019 at 01:25:52PM +0100, Marc Zyngier wrote:
> On 28/05/2019 12:01, Christoffer Dall wrote:
> > On Mon, May 27, 2019 at 01:46:19PM +0200, Andrew Jones wrote:
> >> The emulated ptimer needs to track the level changes, otherwise the
> >> the interrupt will never get deasserted, resulting in the guest getting
> >> stuck in an interrupt storm if it enables ptimer interrupts. This was
> >> found with kvm-unit-tests; the ptimer tests hung as soon as interrupts
> >> were enabled. Typical Linux guests don't have a problem as they prefer
> >> using the virtual timer.
> >>
> >> Fixes: bee038a674875 ("KVM: arm/arm64: Rework the timer code to use a 
> >> timer_map")
> >> Signed-off-by: Andrew Jones 
> >> ---
> >>  virt/kvm/arm/arch_timer.c | 7 ++-
> >>  1 file changed, 6 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
> >> index 7fc272ecae16..9f5d8cc8b5e5 100644
> >> --- a/virt/kvm/arm/arch_timer.c
> >> +++ b/virt/kvm/arm/arch_timer.c
> >> @@ -324,10 +324,15 @@ static void kvm_timer_update_irq(struct kvm_vcpu 
> >> *vcpu, bool new_level,
> >>  static void timer_emulate(struct arch_timer_context *ctx)
> >>  {
> >>bool should_fire = kvm_timer_should_fire(ctx);
> >> +  struct timer_map map;
> >> +
> >> +  get_timer_map(ctx->vcpu, &map);
> >>  
> >>trace_kvm_timer_emulate(ctx, should_fire);
> >>  
> >> -  if (should_fire) {
> >> +  if (ctx == map.emul_ptimer && should_fire != ctx->irq.level) {
> >> +  kvm_timer_update_irq(ctx->vcpu, !ctx->irq.level, ctx);
> >> +  } else if (should_fire) {
> >>kvm_timer_update_irq(ctx->vcpu, true, ctx);
> >>return;
> >>}
> > 
> > Hmm, this doesn't feel completely right.
> > 
> > Lowering the line of an emulated timer should only ever happen when the
> > guest (or user space) writes to one of the system registers for that
> > timer, which should be trapped and that should cause an update of the
> > line.
> > 
> > Are we missing a call to kvm_timer_update_irq() from
> > kvm_arm_timer_set_reg() ?
> 
> Which is exactly what we removed in 6bc210003dff, for good reasons.
> 

Ah well, I can be wrong twice.  Or even three times.

> Looking at kvm_arm_timer_write_sysreg(), we end-up calling 
> kvm_timer_vcpu_load, but not updating the irq status.
> 
> How about something like this instead (untested):
> 
> diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
> index 7fc272ecae16..6a418dcc5433 100644
> --- a/virt/kvm/arm/arch_timer.c
> +++ b/virt/kvm/arm/arch_timer.c
> @@ -882,10 +882,14 @@ void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
>   enum kvm_arch_timer_regs treg,
>   u64 val)
>  {
> + struct arch_timer_context *timer;
> +
>   preempt_disable();
>   kvm_timer_vcpu_put(vcpu);
>  
> - kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
> + timer = vcpu_get_timer(vcpu, tmr);
> + kvm_arm_timer_write(vcpu, timer, treg, val);
> + kvm_timer_update_irq(vcpu, kvm_timer_should_fire(timer), timer);
>  
>   kvm_timer_vcpu_load(vcpu);
>   preempt_enable();
> 

Yes, that looks reasonable.  Basically, in 6bc210003dff we should have
only removed the call to timer_emulate, and not messed around with
kvm_timer_update_irq()?

After this patch, we'll have moved the call to kvm_timer_update_irq()
from kvm_arm_timer_set_reg() to kvm_arm_timer_write_sysreg().  I can't
seem to decide if it clearly belongs in one place or the other.

Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH] KVM: arm/arm64: fix emulated ptimer irq injection

2019-05-28 Thread Christoffer Dall
On Mon, May 27, 2019 at 01:46:19PM +0200, Andrew Jones wrote:
> The emulated ptimer needs to track the level changes, otherwise the
> the interrupt will never get deasserted, resulting in the guest getting
> stuck in an interrupt storm if it enables ptimer interrupts. This was
> found with kvm-unit-tests; the ptimer tests hung as soon as interrupts
> were enabled. Typical Linux guests don't have a problem as they prefer
> using the virtual timer.
> 
> Fixes: bee038a674875 ("KVM: arm/arm64: Rework the timer code to use a 
> timer_map")
> Signed-off-by: Andrew Jones 
> ---
>  virt/kvm/arm/arch_timer.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
> index 7fc272ecae16..9f5d8cc8b5e5 100644
> --- a/virt/kvm/arm/arch_timer.c
> +++ b/virt/kvm/arm/arch_timer.c
> @@ -324,10 +324,15 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, 
> bool new_level,
>  static void timer_emulate(struct arch_timer_context *ctx)
>  {
>   bool should_fire = kvm_timer_should_fire(ctx);
> + struct timer_map map;
> +
> + get_timer_map(ctx->vcpu, &map);
>  
>   trace_kvm_timer_emulate(ctx, should_fire);
>  
> - if (should_fire) {
> + if (ctx == map.emul_ptimer && should_fire != ctx->irq.level) {
> + kvm_timer_update_irq(ctx->vcpu, !ctx->irq.level, ctx);
> + } else if (should_fire) {
>   kvm_timer_update_irq(ctx->vcpu, true, ctx);
>   return;
>   }

Hmm, this doesn't feel completely right.

Lowering the line of an emulated timer should only ever happen when the
guest (or user space) writes to one of the system registers for that
timer, which should be trapped and that should cause an update of the
line.

Are we missing a call to kvm_timer_update_irq() from
kvm_arm_timer_set_reg() ?


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH] MAINTAINERS: KVM: arm/arm64: Remove myself as maintainer

2019-05-21 Thread Christoffer Dall
I no longer have time to actively review patches and manage the tree and
it's time to make that official.

Huge thanks to the incredible Linux community and all the contributors
who have put up with me over the past years.

I also take this opportunity to remove the website link to the Columbia
web page, as that information is no longer up to date and I don't know
who manages that anymore.

Signed-off-by: Christoffer Dall 
---
 MAINTAINERS | 2 --
 1 file changed, 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 5cfbea4ce575..4ba271a8e0ef 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8611,14 +8611,12 @@ F:  arch/x86/include/asm/svm.h
 F: arch/x86/kvm/svm.c
 
 KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64)
-M: Christoffer Dall 
 M: Marc Zyngier 
 R: James Morse 
 R: Julien Thierry 
 R: Suzuki K Pouloze 
 L: linux-arm-ker...@lists.infradead.org (moderated for non-subscribers)
 L: kvmarm@lists.cs.columbia.edu
-W: http://systems.cs.columbia.edu/projects/kvm-arm
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
 S: Maintained
 F: arch/arm/include/uapi/asm/kvm*
-- 
2.18.0

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC] arm/cpu: fix soft lockup panic after resuming from stop

2019-03-15 Thread Christoffer Dall
Hi Steve,

On Wed, Mar 13, 2019 at 10:11:30AM +, Steven Price wrote:
> 
> Personally I think what we need is:
> 
> * Either a patch like the one from Heyi Guo (save/restore CNTVCT_EL0) or
> alternatively hooking up KVM_KVMCLOCK_CTRL to prevent the watchdog
> firing when user space explicitly stops scheduling the guest for a while.

If we save/restore CNTVCT_EL0 and the warning goes away, does the guest
wall clock timekeeping get all confused and does it figure this out
automagically somehow?

Does KVM_KVMCLOCK_CTRL solve that problem?

> 
> * KVM itself saving/restoring CNTVCT_EL0 during suspend/resume so the
> guest doesn't see time pass during a suspend.

This smells like policy to me so I'd much prefer keeping as much
functionality in user space as possible.  If we already have the APIs we
need from KVM, let's use them.

> 
> * Something equivalent to MSR_KVM_WALL_CLOCK_NEW for arm which allows
> the guest to query the wall clock time from the host and provides an
> offset between CNTVCT_EL0 to wall clock time which the KVM can update
> during suspend/resume. This means that during a suspend/resume the guest
> can observe that wall clock time has passed, without having to be
> bothered about CNTVCT_EL0 jumping forwards.
> 

Isn't the proper Arm architectural solution for this to read the
physical counter for wall clock timekeeping?

(Yes that will require a trap on physical counter reads after migration
on current systems, but migration sucks in terms of timekeeping
already.)


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: Kick cpu when WFI in single-threaded kvm integration

2019-03-15 Thread Christoffer Dall
Hi Jan,

On Thu, Mar 14, 2019 at 12:19:02PM +, Jan Bolke wrote:
> Hi all,
> 
> Currently I am working on a SystemC integration of kvm on arm.
> Therefore, I use the kvm api and of course SystemC (library to simulate 
> hardware platforms with C++).
> 
> As I need the virtual cpu to interrupt its execution loop from time to time 
> to let the rest of the SystemC simulation execute,
> I use a perf_event and let the kernel send a signal on overflow to the 
> simulation thread which kicks the virtual cpu (suggested by this mailing 
> list, thanks again).
> Thus I am able to simulate a quantum mechanism for the virtual cpu.
> 
> As I am running benchmarks (e.g. Coremark) on my virtual platform this works 
> fine.
> 
> I also get to boot Linux until it spawns the terminal and then wait for 
> interrupts from my virtual uart.
> Here comes the problem:
> The perf event counting mechanism does increment its counted instructions 
> very very slowly when the virtual cpu executes wfi.
> Thus my whole simulation starts to hang.
> As my simulation is single threaded I need the signal from the kernel to kick 
> my cpu to let the virtual uart deliver its interrupt to react to my input.
> I tried to use the request_interrupt_window flag but this does not seem to 
> work.
> 
> Is there a way to kick the virtual cpu when it is waiting for interrupts? Or 
> do I have to patch my kvm code?
> 

Let me see if I understand your question properly; you are running a KVM
virtual CPU which executes WFI in the guest, and then you are not
receiving interrupts in the guest nor getting events back from KVM which
you somehow use to run a backend simulation in userspace?

KVM/Arm can do two things for WFI:

 1. Let the guest directly execute it without trapping to the hypervisor
(the physical CPU will NOT exit the guest until there's a physical
interrupt on the CPU).

 2. Trap WFI to KVM.  KVM asks Linux to schedule another process until
there's a virtual interrupt for the VCPU.  This is what mainline
KVM/Arm does.


I suspect what's happening is that you are using a normal kernel
configured as (2), and therefore you only count cycles for the perf
event while the guest runs the timer ISR, which is obviously much
less than if you had a constantly running VCPU.  Am I on the right track?

If so, there are a couple of things you could try.

First, you can try disabling the trap on WFI (note that this changes
pretty fundamental behavior of KVM, and this is not recommended for
production use or for systems level performance investigations where
more than one workload contends for a single physical CPU):

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 7f9d2bfcf82e..b38a5a134fef 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -84,7 +84,7 @@
  * FMO:Override CPSR.F and enable signaling with VF
  * SWIO:   Turn set/way invalidates into set/way clean+invalidate
  */
-#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
+#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_VM | \
 HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
 HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
 HCR_FMO | HCR_IMO)


Note that I'm not sure how the performance counter counts on your
particular platform when the CPU is in WFI, so this may not help at all.


Second, and possibly preferred, you can hook up your simulation event to
a timer event in the case of trapping on a WFI.  See kvm_handle_wfx() in
arch/arm64/kvm/handle_exit.c and follow kvm_vcpu_block() from there to
see how KVM/Arm handles this event.
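
On the userspace side, one way to keep your quantum mechanism working
across WFI is to drive the kick from a wall-clock timer instead of a perf
overflow, so the vCPU thread still gets a signal even when (almost) no
guest instructions retire.  A minimal, untested sketch, assuming the
single simulation thread is also the thread issuing KVM_RUN and that the
quantum stays below one second (setitimer/SIGALRM picked purely for
illustration):

  #include <signal.h>
  #include <string.h>
  #include <sys/time.h>

  static void kick_handler(int sig)
  {
          /* empty on purpose: signal delivery alone makes KVM_RUN return -EINTR */
  }

  static void setup_vcpu_kick(long quantum_us)
  {
          struct sigaction sa;
          struct itimerval itv;

          memset(&sa, 0, sizeof(sa));
          sa.sa_handler = kick_handler;   /* no SA_RESTART: don't restart KVM_RUN */
          sigaction(SIGALRM, &sa, NULL);

          memset(&itv, 0, sizeof(itv));
          itv.it_interval.tv_usec = quantum_us;
          itv.it_value.tv_usec = quantum_us;
          setitimer(ITIMER_REAL, &itv, NULL);
  }

With that in place the KVM_RUN ioctl fails with EINTR (exit_reason
KVM_EXIT_INTR) once per quantum even while the guest sits in WFI, and you
can run the SystemC kernel before re-entering the guest.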


Hope this helps,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC] arm/cpu: fix soft lockup panic after resuming from stop

2019-03-12 Thread Christoffer Dall
[Adding Steven Price, who has recently looked at this, in cc]

On Tue, Mar 12, 2019 at 10:08:47AM +, Peter Maydell wrote:
> On Tue, 12 Mar 2019 at 06:10, Heyi Guo  wrote:
> >
> > When we stop a VM for more than 30 seconds and then resume it, by qemu
> > monitor command "stop" and "cont", Linux on VM will complain of "soft
> > lockup - CPU#x stuck for xxs!" as below:
> >
> > [ 2783.809517] watchdog: BUG: soft lockup - CPU#3 stuck for 2395s!
> > [ 2783.809559] watchdog: BUG: soft lockup - CPU#2 stuck for 2395s!
> > [ 2783.809561] watchdog: BUG: soft lockup - CPU#1 stuck for 2395s!
> > [ 2783.809563] Modules linked in...
> >
> > This is because Guest Linux uses generic timer virtual counter as
> > a software watchdog, and CNTVCT_EL0 does not stop when VM is stopped
> > by qemu.
> >
> > This patch is to fix this issue by saving the value of CNTVCT_EL0 when
> > stopping and restoring it when resuming.
> 
> Hi -- I know we have issues with the passage of time in Arm VMs
> running under KVM when the VM is suspended, but the topic is
> a tricky one, and it's not clear to me that this is the correct
> way to fix it. I would prefer to see us start with a discussion
> on the kvm-arm mailing list about the best approach to the problem.
> 
> I've cc'd that list and a couple of the Arm KVM maintainers
> for their opinion.
> 
> QEMU patch left below for context -- the brief summary is that
> it uses KVM_GET_ONE_REG/KVM_SET_ONE_REG on the timer CNT register
> to save it on VM pause and write that value back on VM resume.
> 
> thanks
> -- PMM
> 
> 
> 
> > Cc: Peter Maydell 
> > Signed-off-by: Heyi Guo 
> > ---
> >  target/arm/cpu.c | 65 
> > 
> >  1 file changed, 65 insertions(+)
> >
> > diff --git a/target/arm/cpu.c b/target/arm/cpu.c
> > index 96f0ff0..7bbba3d 100644
> > --- a/target/arm/cpu.c
> > +++ b/target/arm/cpu.c
> > @@ -896,6 +896,60 @@ static void arm_cpu_finalizefn(Object *obj)
> >  #endif
> >  }
> >
> > +static int get_vcpu_timer_tick(CPUState *cs, uint64_t *tick_at_pause)
> > +{
> > +int err;
> > +struct kvm_one_reg reg;
> > +
> > +reg.id = KVM_REG_ARM_TIMER_CNT;
> > +reg.addr = (uintptr_t) tick_at_pause;
> > +
> > +err = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
> > +return err;
> > +}
> > +
> > +static int set_vcpu_timer_tick(CPUState *cs, uint64_t tick_at_pause)
> > +{
> > +int err;
> > +struct kvm_one_reg reg;
> > +
> > +reg.id = KVM_REG_ARM_TIMER_CNT;
> > +reg.addr = (uintptr_t) &tick_at_pause;
> > +
> > +err = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
> > +return err;
> > +}
> > +
> > +static void arch_timer_change_state_handler(void *opaque, int running,
> > +RunState state)
> > +{
> > +static uint64_t hw_ticks_at_paused;
> > +static RunState pre_state = RUN_STATE__MAX;
> > +int err;
> > +CPUState *cs = (CPUState *)opaque;
> > +
> > +switch (state) {
> > +case RUN_STATE_PAUSED:
> > +err = get_vcpu_timer_tick(cs, &hw_ticks_at_paused);
> > +if (err) {
> > +error_report("Get vcpu timer tick failed: %d", err);
> > +}
> > +break;
> > +case RUN_STATE_RUNNING:
> > +if (pre_state == RUN_STATE_PAUSED) {
> > +err = set_vcpu_timer_tick(cs, hw_ticks_at_paused);
> > +if (err) {
> > +error_report("Resume vcpu timer tick failed: %d", err);
> > +}
> > +}
> > +break;
> > +default:
> > +break;
> > +}
> > +
> > +pre_state = state;
> > +}
> > +
> >  static void arm_cpu_realizefn(DeviceState *dev, Error **errp)
> >  {
> >  CPUState *cs = CPU(dev);
> > @@ -906,6 +960,12 @@ static void arm_cpu_realizefn(DeviceState *dev, Error 
> > **errp)
> >  Error *local_err = NULL;
> >  bool no_aa32 = false;
> >
> > +/*
> > + * Only add change state handler for arch timer once, for KVM will 
> > help to
> > + * synchronize virtual timer of all VCPUs.
> > + */
> > +static bool arch_timer_change_state_handler_added;
> > +
> >  /* If we needed to query the host kernel for the CPU features
> >   * then it's possible that might have failed in the initfn, but
> >   * this is the first point where we can report it.
> > @@ -1181,6 +1241,11 @@ static void arm_cpu_realizefn(DeviceState *dev, 
> > Error **errp)
> >
> >  init_cpreg_list(cpu);
> >
> > +if (!arch_timer_change_state_handler_added && kvm_enabled()) {
> > +qemu_add_vm_change_state_handler(arch_timer_change_state_handler, 
> > cs);
> > +arch_timer_change_state_handler_added = true;
> > +}
> > +
> >  #ifndef CONFIG_USER_ONLY
> >  if (cpu->has_el3 || arm_feature(env, ARM_FEATURE_M_SECURITY)) {
> >  cs->num_ases = 2;
> > --
> > 1.8.3.1
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v10 4/5] arm64: arm_pmu: Add support for exclude_host/exclude_guest attributes

2019-03-06 Thread Christoffer Dall
On Tue, Mar 05, 2019 at 11:45:52AM +, Andrew Murray wrote:
> On Mon, Mar 04, 2019 at 11:14:24AM +, Andrew Murray wrote:
> > On Tue, Feb 26, 2019 at 01:44:59PM +0100, Christoffer Dall wrote:
> > > On Wed, Feb 20, 2019 at 04:15:40PM +, Andrew Murray wrote:
> > > > On Mon, Feb 18, 2019 at 10:53:07PM +0100, Christoffer Dall wrote:
> > > > > On Mon, Jan 14, 2019 at 04:11:47PM +, Andrew Murray wrote:
> > > > > > Add support for the :G and :H attributes in perf by handling the
> > > > > > exclude_host/exclude_guest event attributes.
> > > > > > 
> > > > > > We notify KVM of counters that we wish to be enabled or disabled on
> > > > > > guest entry/exit and thus defer from starting or stopping :G events
> > > > > > as per the events exclude_host attribute.
> > > > > > 
> > > > > > With both VHE and non-VHE we switch the counters between host/guest
> > > > > > at EL2. We are able to eliminate counters counting host events on
> > > > > > the boundaries of guest entry/exit when using :G by filtering out
> > > > > > EL2 for exclude_host. However when using :H unless exclude_hv is set
> > > > > > on non-VHE then there is a small blackout window at the guest
> > > > > > entry/exit where host events are not captured.
> > > > > > 
> > > > > > Signed-off-by: Andrew Murray 
> > > > > > Reviewed-by: Suzuki K Poulose 
> > > > > > ---
> 
> > > > > 
> > > > > Let me see if I get this right:
> > > > > 
> > > > >   exclude_user:   VHE: Don't count EL0
> > > > >   Non-VHE: Don't count EL0
> > > > > 
> > > > >   exclude_kernel: VHE: Don't count EL2 and don't count EL1
> > > > >   Non-VHE: Don't count EL1
> > > > > 
> > > > >   exclude_hv: VHE: No effect
> > > > >   Non-VHE: Don't count EL2
> > > > > 
> > > > >   exclude_host:   VHE: Don't count EL2 + enable/disable on guest 
> > > > > entry/exit
> > > > >   Non-VHE: disable on guest entry/disable on 
> > > > > guest entry/exit
> > > > > 
> > > > > And the logic I extract is that _user applies across both guest and
> > > > > host, as does _kernel (regardless of the mode the kernel on the 
> > > > > current
> > > > > system runs in, might be only EL1, might be EL1 and EL2), and _hv is
> > > > > specific to non-VHE systems to measure events in a specific piece of 
> > > > > KVM
> > > > > code that runs at EL2.
> > > > > 
> > > > > As I expressed before, that doesn't seem to be the intent behind the
> > > > > exclude_hv flag, but I'm not sure how other architectures actually
> > > > > implement things today, and even if it's a curiosity of the Arm
> > > > > architecture and has value to non-VHE hypervisor hackers, and we don't
> > > > > really have to care about uniformity with the other architectures, 
> > > > > then
> > > > > fine.
> > > > > 
> > > > > It has taken me a while to make sense of this code change, so I really
> > > > > wish we can find a suitable place to document the semantics clearly 
> > > > > for
> > > > > perf users on arm64.
> > > > > 
> > > > > Now, another thing comes to mind:  Do we really need to enable and
> > > > > disable anything on a VHE system on entry/exit to/from a guest?  Can 
> > > > > we
> > > > > instead do the following:
> > > > > 
> > > > >   exclude_host:   Disable EL2 counting
> > > > >   Disable EL0 counting
> > > > >   Enable EL0 counting on vcpu_load
> > > > > (unless exclude_user is also set)
> > > > >   Disable EL0 counting on vcpu_put
> > > > > 
> > > > >   exclude_guest:  Disable EL1 counting
> > > > >   Disable EL0 counting on vcpu_load
> > > > >   Enable EL0 counting on vcpu_put
> > > > > (unless exclude_user is also set)
> > > >

Re: [PATCH v10 4/5] arm64: arm_pmu: Add support for exclude_host/exclude_guest attributes

2019-02-26 Thread Christoffer Dall
On Wed, Feb 20, 2019 at 04:15:40PM +, Andrew Murray wrote:
> On Mon, Feb 18, 2019 at 10:53:07PM +0100, Christoffer Dall wrote:
> > On Mon, Jan 14, 2019 at 04:11:47PM +, Andrew Murray wrote:
> > > Add support for the :G and :H attributes in perf by handling the
> > > exclude_host/exclude_guest event attributes.
> > > 
> > > We notify KVM of counters that we wish to be enabled or disabled on
> > > guest entry/exit and thus defer from starting or stopping :G events
> > > as per the events exclude_host attribute.
> > > 
> > > With both VHE and non-VHE we switch the counters between host/guest
> > > at EL2. We are able to eliminate counters counting host events on
> > > the boundaries of guest entry/exit when using :G by filtering out
> > > EL2 for exclude_host. However when using :H unless exclude_hv is set
> > > on non-VHE then there is a small blackout window at the guest
> > > entry/exit where host events are not captured.
> > > 
> > > Signed-off-by: Andrew Murray 
> > > Reviewed-by: Suzuki K Poulose 
> > > ---
> > >  arch/arm64/kernel/perf_event.c | 53 
> > > --
> > >  1 file changed, 46 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/arch/arm64/kernel/perf_event.c 
> > > b/arch/arm64/kernel/perf_event.c
> > > index 1c71796..21c6831 100644
> > > --- a/arch/arm64/kernel/perf_event.c
> > > +++ b/arch/arm64/kernel/perf_event.c
> > > @@ -26,6 +26,7 @@
> > >  
> > >  #include 
> > >  #include 
> > > +#include 
> > >  #include 
> > >  #include 
> > >  #include 
> > > @@ -528,11 +529,27 @@ static inline int armv8pmu_enable_counter(int idx)
> > >  
> > >  static inline void armv8pmu_enable_event_counter(struct perf_event 
> > > *event)
> > >  {
> > > + struct perf_event_attr *attr = &event->attr;
> > >   int idx = event->hw.idx;
> > > + int flags = 0;
> > > + u32 counter_bits = BIT(ARMV8_IDX_TO_COUNTER(idx));
> > >  
> > > - armv8pmu_enable_counter(idx);
> > >   if (armv8pmu_event_is_chained(event))
> > > - armv8pmu_enable_counter(idx - 1);
> > > + counter_bits |= BIT(ARMV8_IDX_TO_COUNTER(idx - 1));
> > > +
> > > + if (!attr->exclude_host)
> > > + flags |= KVM_PMU_EVENTS_HOST;
> > > + if (!attr->exclude_guest)
> > > + flags |= KVM_PMU_EVENTS_GUEST;
> > > +
> > > + kvm_set_pmu_events(counter_bits, flags);
> > > +
> > > + /* We rely on the hypervisor switch code to enable guest counters */
> > > + if (!attr->exclude_host) {
> > > + armv8pmu_enable_counter(idx);
> > > + if (armv8pmu_event_is_chained(event))
> > > + armv8pmu_enable_counter(idx - 1);
> > > + }
> > >  }
> > >  
> > >  static inline int armv8pmu_disable_counter(int idx)
> > > @@ -545,11 +562,21 @@ static inline int armv8pmu_disable_counter(int idx)
> > >  static inline void armv8pmu_disable_event_counter(struct perf_event 
> > > *event)
> > >  {
> > >   struct hw_perf_event *hwc = &event->hw;
> > > + struct perf_event_attr *attr = &event->attr;
> > >   int idx = hwc->idx;
> > > + u32 counter_bits = BIT(ARMV8_IDX_TO_COUNTER(idx));
> > >  
> > >   if (armv8pmu_event_is_chained(event))
> > > - armv8pmu_disable_counter(idx - 1);
> > > - armv8pmu_disable_counter(idx);
> > > + counter_bits |= BIT(ARMV8_IDX_TO_COUNTER(idx - 1));
> > > +
> > > + kvm_clr_pmu_events(counter_bits);
> > > +
> > > + /* We rely on the hypervisor switch code to disable guest counters */
> > > + if (!attr->exclude_host) {
> > > + if (armv8pmu_event_is_chained(event))
> > > + armv8pmu_disable_counter(idx - 1);
> > > + armv8pmu_disable_counter(idx);
> > > + }
> > >  }
> > >  
> > >  static inline int armv8pmu_enable_intens(int idx)
> > > @@ -824,16 +851,25 @@ static int armv8pmu_set_event_filter(struct 
> > > hw_perf_event *event,
> > >* Therefore we ignore exclude_hv in this configuration, since
> > >* there's no hypervisor to sample anyway. This is consistent
> > >* with other architectures (x86 and Power).
> > > +  *
> > > +  * To eliminate counting host events on the boundaries of
> >^comma

Re: [PATCH 1/5] KVM: arm/arm64: Reset the VCPU without preemption and vcpu state loaded

2019-02-26 Thread Christoffer Dall
On Wed, Feb 20, 2019 at 07:14:53PM +, Dave Martin wrote:
> On Fri, Jan 25, 2019 at 10:46:52AM +0100, Christoffer Dall wrote:
> > Resetting the VCPU state modifies the system register state in memory,
> > but this may interact with vcpu_load/vcpu_put if running with preemption
> > disabled, which in turn may lead to corrupted system register state.
> 
> Should this be "enabled"?
> 
> Too late now, but I want to make sure I understand this right for
> patches that will go on top.
> 
> > Address this by disabling preemption and doing put/load if required
> > around the reset logic.
> > 
> > Signed-off-by: Christoffer Dall 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  arch/arm64/kvm/reset.c | 26 --
> >  1 file changed, 24 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> > index b72a3dd56204..f21a2a575939 100644
> > --- a/arch/arm64/kvm/reset.c
> > +++ b/arch/arm64/kvm/reset.c
> > @@ -105,16 +105,33 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm 
> > *kvm, long ext)
> >   * This function finds the right table above and sets the registers on
> >   * the virtual CPU struct to their architecturally defined reset
> >   * values.
> > + *
> > + * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT
> > + * ioctl or as part of handling a request issued by another VCPU in the 
> > PSCI
> > + * handling code.  In the first case, the VCPU will not be loaded, and in 
> > the
> > + * second case the VCPU will be loaded.  Because this function operates 
> > purely
> > + * on the memory-backed valus of system registers, we want to do a full 
> > put if
> > + * we were loaded (handling a request) and load the values back at the end 
> > of
> > + * the function.  Otherwise we leave the state alone.  In both cases, we
> > + * disable preemption around the vcpu reset as we would otherwise race with
> > + * preempt notifiers which also call put/load.
> >   */
> >  int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
> >  {
> > const struct kvm_regs *cpu_reset;
> > +   int ret = -EINVAL;
> > +   bool loaded;
> > +
> > +   preempt_disable();
> > +   loaded = (vcpu->cpu != -1);
> > +   if (loaded)
> > +   kvm_arch_vcpu_put(vcpu);
> >  
> > switch (vcpu->arch.target) {
> > default:
> > if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) {
> > if (!cpu_has_32bit_el1())
> > -   return -EINVAL;
> > +   goto out;
> > cpu_reset = &default_regs_reset32;
> > } else {
> > cpu_reset = &default_regs_reset;
> > @@ -137,7 +154,12 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
> > vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
> >  
> > /* Reset timer */
> > -   return kvm_timer_vcpu_reset(vcpu);
> > +   ret = kvm_timer_vcpu_reset(vcpu);
> > +out:
> > +   if (loaded)
> > +   kvm_arch_vcpu_load(vcpu, smp_processor_id());
> > +   preempt_enable();
> > +   return ret;
> >  }
> >  
> >  void kvm_set_ipa_limit(void)
> 
> I was really confused by this: as far as I can see, we don't really need
> to disable preemption here once kvm_arch_vcpu_put() is complete -- at
> least not for the purpose of avoiding corruption of the reg state.  But
> we _do_ need to disable the preempt notifier so that it doesn't fire
> before we are ready.
> 
> It actually seems a bit surprising for a powered-off CPU to sit with the
> VM regs live and preempt notifier armed, when the vcpu thread is
> heading to interruptible sleep anyway until someone turns it on.
> Perhaps an alternative approach would be to nobble the preempt notifier
> and stick an explicit vcpu_put()...vcpu_load() around the
> swait_event_interruptible_exclusive() call in vcpu_req_sleep().  This
> is not fast path.
> 
> 

I think you've understood the problem correctly, and the thing here is
that we (sort-of) "abuse" disabling preemption as a way to disable
preempt notifiers, which I don't think we have.  So we could add that,
and do something like:

  preempt_disable();
  vcpu_put(vcpu);
  disable_preempt_notifiers(vcpu);
  preempt_enable();
  funky_stuff();
  preempt_disable();
  enable_preempt_notifiers(vcpu);
  vcpu_load(vcpu);
  preempt_enable();

But I think that's additional complexity to get a slightly shorter
section with disabled preemption.

We could also re-architect a lot of the vcpu_load/vcpu_put functionality
more drastically, but that is difficult and requires understanding of
how the other architectures work, so at the end of the day we just use
this pattern in multiple places, which is:

  preempt_disable();
  vcpu_put();
  modify_vcpu_state_in_memory();
  vcpu_load();
  preempt_enable();

Does that help?


Thanks,

Christoffer
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 12/14] KVM: arm/arm64: arch_timer: Assign the phys timer on VHE systems

2019-02-19 Thread Christoffer Dall
On Tue, Feb 19, 2019 at 11:39:50AM +, Alexandru Elisei wrote:
> 
> On 1/24/19 2:00 PM, Christoffer Dall wrote:
> > VHE systems don't have to emulate the physical timer, we can simply
> > assigne the EL1 physical timer directly to the VM as the host always
> > uses the EL2 timers.
> >
> > In order to minimize the amount of cruft, AArch32 gets definitions for
> > the physical timer too, but is should be generally unused on this
> > architecture.
> >
> > Co-written with Marc Zyngier 
> >
> > Signed-off-by: Marc Zyngier 
> > Signed-off-by: Christoffer Dall 
> > ---
> >  arch/arm/include/asm/kvm_hyp.h |   4 +
> >  include/kvm/arm_arch_timer.h   |   6 +
> >  virt/kvm/arm/arch_timer.c  | 206 ++---
> >  3 files changed, 171 insertions(+), 45 deletions(-)
> >
> > diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
> > index e93a0cac9add..87bcd18df8d5 100644
> > --- a/arch/arm/include/asm/kvm_hyp.h
> > +++ b/arch/arm/include/asm/kvm_hyp.h
> > @@ -40,6 +40,7 @@
> >  #define TTBR1  __ACCESS_CP15_64(1, c2)
> >  #define VTTBR  __ACCESS_CP15_64(6, c2)
> >  #define PAR__ACCESS_CP15_64(0, c7)
> > +#define CNTP_CVAL  __ACCESS_CP15_64(2, c14)
> >  #define CNTV_CVAL  __ACCESS_CP15_64(3, c14)
> >  #define CNTVOFF__ACCESS_CP15_64(4, c14)
> >  
> > @@ -85,6 +86,7 @@
> >  #define TID_PRIV   __ACCESS_CP15(c13, 0, c0, 4)
> >  #define HTPIDR __ACCESS_CP15(c13, 4, c0, 2)
> >  #define CNTKCTL__ACCESS_CP15(c14, 0, c1, 0)
> > +#define CNTP_CTL   __ACCESS_CP15(c14, 0, c2, 1)
> >  #define CNTV_CTL   __ACCESS_CP15(c14, 0, c3, 1)
> >  #define CNTHCTL__ACCESS_CP15(c14, 4, c1, 0)
> >  
> > @@ -94,6 +96,8 @@
> >  #define read_sysreg_el0(r) read_sysreg(r##_el0)
> >  #define write_sysreg_el0(v, r) write_sysreg(v, r##_el0)
> >  
> > +#define cntp_ctl_el0   CNTP_CTL
> > +#define cntp_cval_el0  CNTP_CVAL
> >  #define cntv_ctl_el0   CNTV_CTL
> >  #define cntv_cval_el0  CNTV_CVAL
> >  #define cntvoff_el2CNTVOFF
> > diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
> > index d40fe57a2d0d..722e0481f310 100644
> > --- a/include/kvm/arm_arch_timer.h
> > +++ b/include/kvm/arm_arch_timer.h
> > @@ -50,6 +50,10 @@ struct arch_timer_context {
> >  
> > /* Emulated Timer (may be unused) */
> > struct hrtimer  hrtimer;
> > +
> > +   /* Duplicated state from arch_timer.c for convenience */
> > +   u32 host_timer_irq;
> > +   u32 host_timer_irq_flags;
> >  };
> >  
> >  enum loaded_timer_state {
> > @@ -107,6 +111,8 @@ bool kvm_arch_timer_get_input_level(int vintid);
> >  #define vcpu_vtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_VTIMER])
> >  #define vcpu_ptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_PTIMER])
> >  
> > +#define arch_timer_ctx_index(ctx)  ((ctx) - 
> > vcpu_timer((ctx)->vcpu)->timers)
> > +
> >  u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
> >   enum kvm_arch_timers tmr,
> >   enum kvm_arch_timer_regs treg);
> > diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
> > index 8b0eca5fbad1..eed8f48fbf9b 100644
> > --- a/virt/kvm/arm/arch_timer.c
> > +++ b/virt/kvm/arm/arch_timer.c
> > @@ -35,7 +35,9 @@
> >  
> >  static struct timecounter *timecounter;
> >  static unsigned int host_vtimer_irq;
> > +static unsigned int host_ptimer_irq;
> >  static u32 host_vtimer_irq_flags;
> > +static u32 host_ptimer_irq_flags;
> >  
> >  static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
> >  
> > @@ -86,20 +88,24 @@ static void soft_timer_cancel(struct hrtimer *hrt)
> >  static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
> >  {
> > struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
> > -   struct arch_timer_context *vtimer;
> > +   struct arch_timer_context *ctx;
> >  
> > /*
> >  * We may see a timer interrupt after vcpu_put() has been called which
> >  * sets the CPU's vcpu pointer to NULL, because even though the timer
> > -* has been disabled in vtimer_save_state(), the hardware interrupt
> > +* has been disabled in timer_save_state(), the hardware interrupt
> > 

Re: [PATCH 12/14] KVM: arm/arm64: arch_timer: Assign the phys timer on VHE systems

2019-02-19 Thread Christoffer Dall
On Mon, Feb 18, 2019 at 03:10:49PM +, André Przywara wrote:
> On Thu, 24 Jan 2019 15:00:30 +0100
> Christoffer Dall  wrote:
> 
> Hi,
> 
> > VHE systems don't have to emulate the physical timer, we can simply
> > assigne the EL1 physical timer directly to the VM as the host always
> > uses the EL2 timers.
> > 
> > In order to minimize the amount of cruft, AArch32 gets definitions for
> > the physical timer too, but is should be generally unused on this
> > architecture.
> > 
> > Co-written with Marc Zyngier 
> > 
> > Signed-off-by: Marc Zyngier 
> > Signed-off-by: Christoffer Dall 
> > ---
> >  arch/arm/include/asm/kvm_hyp.h |   4 +
> >  include/kvm/arm_arch_timer.h   |   6 +
> >  virt/kvm/arm/arch_timer.c  | 206 ++---
> >  3 files changed, 171 insertions(+), 45 deletions(-)
> > 
> > diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
> > index e93a0cac9add..87bcd18df8d5 100644
> > --- a/arch/arm/include/asm/kvm_hyp.h
> > +++ b/arch/arm/include/asm/kvm_hyp.h
> > @@ -40,6 +40,7 @@
> >  #define TTBR1  __ACCESS_CP15_64(1, c2)
> >  #define VTTBR  __ACCESS_CP15_64(6, c2)
> >  #define PAR__ACCESS_CP15_64(0, c7)
> > +#define CNTP_CVAL  __ACCESS_CP15_64(2, c14)
> >  #define CNTV_CVAL  __ACCESS_CP15_64(3, c14)
> >  #define CNTVOFF__ACCESS_CP15_64(4, c14)
> >  
> > @@ -85,6 +86,7 @@
> >  #define TID_PRIV   __ACCESS_CP15(c13, 0, c0, 4)
> >  #define HTPIDR __ACCESS_CP15(c13, 4, c0, 2)
> >  #define CNTKCTL__ACCESS_CP15(c14, 0, c1, 0)
> > +#define CNTP_CTL   __ACCESS_CP15(c14, 0, c2, 1)
> >  #define CNTV_CTL   __ACCESS_CP15(c14, 0, c3, 1)
> >  #define CNTHCTL__ACCESS_CP15(c14, 4, c1, 0)
> >  
> > @@ -94,6 +96,8 @@
> >  #define read_sysreg_el0(r) read_sysreg(r##_el0)
> >  #define write_sysreg_el0(v, r) write_sysreg(v, r##_el0)
> >  
> > +#define cntp_ctl_el0   CNTP_CTL
> > +#define cntp_cval_el0  CNTP_CVAL
> >  #define cntv_ctl_el0   CNTV_CTL
> >  #define cntv_cval_el0  CNTV_CVAL
> >  #define cntvoff_el2CNTVOFF
> > diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
> > index d40fe57a2d0d..722e0481f310 100644
> > --- a/include/kvm/arm_arch_timer.h
> > +++ b/include/kvm/arm_arch_timer.h
> > @@ -50,6 +50,10 @@ struct arch_timer_context {
> >  
> > /* Emulated Timer (may be unused) */
> > struct hrtimer  hrtimer;
> > +
> > +   /* Duplicated state from arch_timer.c for convenience */
> > +   u32 host_timer_irq;
> > +   u32 host_timer_irq_flags;
> >  };
> >  
> >  enum loaded_timer_state {
> > @@ -107,6 +111,8 @@ bool kvm_arch_timer_get_input_level(int vintid);
> >  #define vcpu_vtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_VTIMER])
> >  #define vcpu_ptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_PTIMER])
> >  
> > +#define arch_timer_ctx_index(ctx)  ((ctx) - 
> > vcpu_timer((ctx)->vcpu)->timers)
> > +
> >  u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
> >   enum kvm_arch_timers tmr,
> >   enum kvm_arch_timer_regs treg);
> > diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
> > index 8b0eca5fbad1..eed8f48fbf9b 100644
> > --- a/virt/kvm/arm/arch_timer.c
> > +++ b/virt/kvm/arm/arch_timer.c
> > @@ -35,7 +35,9 @@
> >  
> >  static struct timecounter *timecounter;
> >  static unsigned int host_vtimer_irq;
> > +static unsigned int host_ptimer_irq;
> >  static u32 host_vtimer_irq_flags;
> > +static u32 host_ptimer_irq_flags;
> >  
> >  static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
> >  
> > @@ -86,20 +88,24 @@ static void soft_timer_cancel(struct hrtimer *hrt)
> >  static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
> >  {
> > struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
> > -   struct arch_timer_context *vtimer;
> > +   struct arch_timer_context *ctx;
> >  
> > /*
> >  * We may see a timer interrupt after vcpu_put() has been called which
> >  * sets the CPU's vcpu pointer to NULL, because even though the timer
> > -* has been disabled in vtimer_save_state(), the hardware interrupt
> > +* has been disabled in timer_save_state(), the hardware interrupt
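
As an aside for readers unfamiliar with the AArch32 register macros being
extended here, below is a minimal stand-alone C sketch (not kernel code; the
accessor names are made-up stand-ins) of the token-pasting indirection that
lets shared timer code spell the access as read_sysreg_el0(cntp_ctl) on both
architectures:

/*
 * Stand-alone sketch of the sysreg macro indirection the hunk above relies
 * on.  The "register reads" are made-up stand-ins, not the real mrc/mrs
 * accessors; only the token pasting and the cntp_ctl_el0 alias are the
 * point.  Build with -DAARCH32_SKETCH to follow the 32bit path.
 */
#include <stdio.h>

#define read_cp15_CNTP_CTL()     0x1u   /* stand-in for the CP15 accessor */
#define read_mrs_cntp_ctl_el0()  0x2u   /* stand-in for the AArch64 sysreg read */

#ifdef AARCH32_SKETCH
#define __read_sysreg(r)  read_cp15_##r()
#define read_sysreg(r)    __read_sysreg(r)
#define cntp_ctl_el0      CNTP_CTL      /* the alias the patch adds */
#else
#define __read_sysreg(r)  read_mrs_##r()
#define read_sysreg(r)    __read_sysreg(r)
#endif

#define read_sysreg_el0(r)  read_sysreg(r##_el0)

int main(void)
{
	/* Shared timer code can use the same spelling on both architectures. */
	printf("CNTP_CTL = %#x\n", read_sysreg_el0(cntp_ctl));
	return 0;
}

On AArch64 no alias is needed, since cntp_ctl_el0 is already the architected
sysreg name.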

Re: [PATCH 11/14] KVM: arm/arm64: timer: Rework data structures for multiple timers

2019-02-19 Thread Christoffer Dall
On Mon, Feb 18, 2019 at 03:10:16PM +, André Przywara wrote:
> On Thu, 24 Jan 2019 15:00:29 +0100
> Christoffer Dall  wrote:
> 
> Hi,
> 
> I already looked at most of these patches earlier, without finding
> serious issues, but figured I would give those without any Reviewed-by:
> or Acked-by: tags a closer look.
> (This patch just carries a S-o-b: tag from Marc in the kvm-arm git repo.)
> 
> > Prepare for having 4 timer data structures (2 for now).
> > 
> > Change loaded to an enum so that we know not just whether *some* state
> > is loaded on the CPU, but also *which* state is loaded.
> > 
> > Move loaded to the cpu data structure and not the individual timer
> > structure, in preparation for assigning the EL1 phys timer as well.
> > 
> > Signed-off-by: Christoffer Dall 
> > Acked-by: Marc Zyngier 
> > ---
> >  include/kvm/arm_arch_timer.h | 44 ++-
> >  virt/kvm/arm/arch_timer.c| 58 +++-
> >  2 files changed, 54 insertions(+), 48 deletions(-)
> > 
> > diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
> > index d26b7fde9935..d40fe57a2d0d 100644
> > --- a/include/kvm/arm_arch_timer.h
> > +++ b/include/kvm/arm_arch_timer.h
> > @@ -36,6 +36,8 @@ enum kvm_arch_timer_regs {
> >  };
> >  
> >  struct arch_timer_context {
> > +   struct kvm_vcpu *vcpu;
> > +
> > /* Registers: control register, timer value */
> > u32 cnt_ctl;
> > u64 cnt_cval;
> > @@ -43,32 +45,34 @@ struct arch_timer_context {
> > /* Timer IRQ */
> > struct kvm_irq_levelirq;
> >  
> > -   /*
> > -* We have multiple paths which can save/restore the timer state
> > -* onto the hardware, so we need some way of keeping track of
> > -* where the latest state is.
> > -*
> > -* loaded == true:  State is loaded on the hardware registers.
> > -* loaded == false: State is stored in memory.
> > -*/
> > -   boolloaded;
> > -
> > /* Virtual offset */
> > -   u64 cntvoff;
> > +   u64 cntvoff;
> > +
> > +   /* Emulated Timer (may be unused) */
> > +   struct hrtimer  hrtimer;
> > +};
> > +
> > +enum loaded_timer_state {
> > +   TIMER_NOT_LOADED,
> > +   TIMER_EL1_LOADED,
> 
> So this gets reverted in PATCH 13/14, and I don't see it reappearing in
> the nv series later on.
> Is that just needed for assigning the phys timer in the next patch, and
> does it become obsolete with the timer_map?
> Or is this a rebase artefact and we don't actually need this?

I think this is a rebase problem and we could have optimized this out to
reduce the patch diff.  The end result is the same though.

> 
> The rest of the patch looks like valid transformations to me.
> 
Thanks for having a look.

Christoffer
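
For reference, here is a minimal stand-alone C sketch (simplified stand-in
structures, not the kernel definitions) of the layout this patch moves
towards, showing how the vcpu back-pointer lets a timer context recover its
own index by pointer subtraction, as the arch_timer_ctx_index() macro added
in the following patch does:

/*
 * Stand-alone sketch: per-vcpu array of timer contexts, each context holding
 * a back-pointer to its vcpu.  Names and types are simplified stand-ins.
 */
#include <stdio.h>

enum kvm_arch_timers { TIMER_VTIMER, TIMER_PTIMER, NR_KVM_TIMERS };

struct kvm_vcpu;

struct arch_timer_context {
	struct kvm_vcpu *vcpu;          /* back-pointer, new in this patch */
	unsigned int cnt_ctl;
	unsigned long long cnt_cval;
};

struct arch_timer_cpu {
	struct arch_timer_context timers[NR_KVM_TIMERS];
	int loaded;                     /* moved here from the per-timer struct */
};

struct kvm_vcpu {
	struct arch_timer_cpu timer_cpu;
};

#define vcpu_timer(v)  (&(v)->timer_cpu)
#define arch_timer_ctx_index(ctx)  ((ctx) - vcpu_timer((ctx)->vcpu)->timers)

int main(void)
{
	struct kvm_vcpu vcpu = { 0 };
	struct arch_timer_context *ptimer = &vcpu.timer_cpu.timers[TIMER_PTIMER];

	ptimer->vcpu = &vcpu;
	printf("ptimer index = %ld\n", (long)arch_timer_ctx_index(ptimer));
	return 0;
}

The same subtraction works for any number of timers, which is what lets the
series grow the array beyond the two contexts used here.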
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v10 5/5] arm64: KVM: Enable support for :G/:H perf event modifiers

2019-02-18 Thread Christoffer Dall
On Mon, Jan 14, 2019 at 04:11:48PM +, Andrew Murray wrote:
> Enable/disable event counters as appropriate when entering and exiting
> the guest to enable support for guest or host only event counting.
> 
> For both VHE and non-VHE we switch the counters between host/guest at
> EL2. EL2 is filtered out by the PMU when we are using the :G modifier.

I don't think the last part is strictly true: as per the former patch, on a
non-vhe system it doesn't hold if you have the :h modifier, so maybe just
leave that out of the commit message.

> 
> The PMU may be on when we change which counters are enabled; however,
> we avoid adding an isb, as we instead rely on existing context
> synchronisation events: the isb in kvm_arm_vhe_guest_exit for VHE and
> the eret from the hvc in kvm_call_hyp.
> 
> Signed-off-by: Andrew Murray 
> Reviewed-by: Suzuki K Poulose 
> ---
>  arch/arm64/kvm/hyp/switch.c | 60 +
>  1 file changed, 60 insertions(+)
> 
> diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
> index b0b1478..9018fb3 100644
> --- a/arch/arm64/kvm/hyp/switch.c
> +++ b/arch/arm64/kvm/hyp/switch.c
> @@ -357,6 +357,54 @@ static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu)
>   return true;
>  }
>  
> +static bool __hyp_text __pmu_switch_to_guest(struct kvm_cpu_context *host_ctxt)
> +{
> + struct kvm_host_data *host;
> + struct kvm_pmu_events *pmu;
> + u32 clr, set;
> +
> + host = container_of(host_ctxt, struct kvm_host_data, host_ctxt);
> + pmu = &host->pmu_events;
> +
> + /* We can potentially avoid a sysreg write by only changing bits that
> +  * differ between the guest/host. E.g. where events are enabled in
> +  * both guest and host
> +  */

super nit: kernel coding style requires 'wings' on both sides of a
multi-line comment.  Only if you respin anyhow.

> + clr = pmu->events_host & ~pmu->events_guest;
> + set = pmu->events_guest & ~pmu->events_host;
> +
> + if (clr)
> + write_sysreg(clr, pmcntenclr_el0);
> +
> + if (set)
> + write_sysreg(set, pmcntenset_el0);
> +
> + return (clr || set);
> +}
> +
> +static void __hyp_text __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt)
> +{
> + struct kvm_host_data *host;
> + struct kvm_pmu_events *pmu;
> + u32 clr, set;
> +
> + host = container_of(host_ctxt, struct kvm_host_data, host_ctxt);
> + pmu = &host->pmu_events;
> +
> + /* We can potentially avoid a sysreg write by only changing bits that
> +  * differ between the guest/host. E.g. where events are enabled in
> +  * both guest and host
> +  */

ditto

> + clr = pmu->events_guest & ~pmu->events_host;
> + set = pmu->events_host & ~pmu->events_guest;
> +
> + if (clr)
> + write_sysreg(clr, pmcntenclr_el0);
> +
> + if (set)
> + write_sysreg(set, pmcntenset_el0);
> +}
> +
>  /*
>   * Return true when we were able to fixup the guest exit and should return to
>   * the guest, false when we should restore the host state and return to the
> @@ -464,12 +512,15 @@ int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
>  {
>   struct kvm_cpu_context *host_ctxt;
>   struct kvm_cpu_context *guest_ctxt;
> + bool pmu_switch_needed;
>   u64 exit_code;
>  
>   host_ctxt = vcpu->arch.host_cpu_context;
>   host_ctxt->__hyp_running_vcpu = vcpu;
>   guest_ctxt = &vcpu->arch.ctxt;
>  
> + pmu_switch_needed = __pmu_switch_to_guest(host_ctxt);
> +
>   sysreg_save_host_state_vhe(host_ctxt);
>  
>   /*
> @@ -511,6 +562,9 @@ int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
>  
>   __debug_switch_to_host(vcpu);
>  
> + if (pmu_switch_needed)
> + __pmu_switch_to_host(host_ctxt);
> +
>   return exit_code;
>  }
>  
> @@ -519,6 +573,7 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
>  {
>   struct kvm_cpu_context *host_ctxt;
>   struct kvm_cpu_context *guest_ctxt;
> + bool pmu_switch_needed;
>   u64 exit_code;
>  
>   vcpu = kern_hyp_va(vcpu);
> @@ -527,6 +582,8 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
>   host_ctxt->__hyp_running_vcpu = vcpu;
>   guest_ctxt = &vcpu->arch.ctxt;
>  
> + pmu_switch_needed = __pmu_switch_to_guest(host_ctxt);
> +
>   __sysreg_save_state_nvhe(host_ctxt);
>  
>   __activate_vm(kern_hyp_va(vcpu->kvm));
> @@ -573,6 +630,9 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
>*/
>   __debug_switch_to_host(vcpu);
>  
> + if (pmu_switch_needed)
> + __pmu_switch_to_host(host_ctxt);
> +
>   return exit_code;
>  }
>  
> -- 
> 2.7.4
> 

Thanks,

Christoffer
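
For illustration, here is a small stand-alone C sketch of the clr/set
computation discussed above, using made-up counter bitmaps; it only
demonstrates the bit arithmetic and prints where the hyp code would write
the sysregs:

/*
 * Stand-alone sketch of the clr/set computation, with made-up counter
 * bitmaps.  It shows the bit arithmetic that lets the hyp code skip
 * pmcntenclr/pmcntenset writes when guest and host enable the same
 * counters; the sysreg writes themselves are only printed.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t events_host  = 0x0000000d;  /* counters 0, 2, 3 enabled for :H */
	uint32_t events_guest = 0x00000005;  /* counters 0, 2 enabled for :G */

	/* Guest entry (__pmu_switch_to_guest): turn off host-only counters... */
	uint32_t clr = events_host & ~events_guest;  /* 0x8: counter 3 */
	/* ...and turn on guest-only counters. */
	uint32_t set = events_guest & ~events_host;  /* 0x0: nothing to do */

	if (clr)
		printf("write pmcntenclr_el0 = %#x\n", clr);
	if (set)
		printf("write pmcntenset_el0 = %#x\n", set);

	/* (clr || set) tells the exit path whether a switch back is needed. */
	printf("pmu_switch_needed = %d\n", clr || set);
	return 0;
}

Returning (clr || set) from __pmu_switch_to_guest() means the exit path only
undoes the switch when one was actually performed, which is why both run
functions test pmu_switch_needed before calling __pmu_switch_to_host().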
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm

