Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler

2021-02-26 Thread wangyanan (Y)



On 2021/2/25 17:55, Marc Zyngier wrote:

Hi Yanan,

On Mon, 08 Feb 2021 11:22:47 +,
Yanan Wang  wrote:

We currently uniformly clean the dcache in user_mem_abort() before calling the
fault handlers, if we take a translation fault and the pfn is cacheable.
But if there are concurrent translation faults on the same page or block,
only the first dcache clean is necessary; the later ones are not.

By moving the dcache clean into the map handler, we can easily identify the
conditions where CMOs are really needed and avoid the unnecessary ones.
Performing CMOs is a time-consuming process, especially when flushing a block
range, so this solution reduces the load on KVM and improves the efficiency
of creating mappings.

That's an interesting approach. However, wouldn't it be better to
identify early that there is already something mapped, and return to
the guest ASAP?

Can you quantify the benefit of this patch alone?


Signed-off-by: Yanan Wang 
---
  arch/arm64/include/asm/kvm_mmu.h | 16 --
  arch/arm64/kvm/hyp/pgtable.c | 38 
  arch/arm64/kvm/mmu.c | 14 +++-
  3 files changed, 27 insertions(+), 41 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index e52d82aeadca..4ec9879e82ed 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
  }
  
-static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   void *va = page_address(pfn_to_page(pfn));
-
-   /*
-* With FWB, we ensure that the guest always accesses memory using
-* cacheable attributes, and we don't have to clean to PoC when
-* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
-* PoU is not required either in this case.
-*/
-   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-   return;
-
-   kvm_flush_dcache_to_poc(va, size);
-}
-
  static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
  unsigned long size)
  {
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 4d177ce1d536..2f4f87021980 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
return 0;
  }
  
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
+   return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+   /*
+* With FWB, we ensure that the guest always accesses memory using
+* cacheable attributes, and we don't have to clean to PoC when
+* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
+* PoU is not required either in this case.
+*/
+   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+   return;
+
+   __flush_dcache_area(addr, size);
+}
+
  static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
  kvm_pte_t *ptep,
  struct stage2_map_data *data)
@@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
put_page(page);
}
  
+	/* Flush data cache before installation of the new PTE */
+   if (stage2_pte_cacheable(new))
+   stage2_flush_dcache(__va(phys), granule);
+
smp_store_release(ptep, new);
get_page(page);
data->phys += granule;
@@ -651,20 +675,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
return ret;
  }
  
-static void stage2_flush_dcache(void *addr, u64 size)
-{
-   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-   return;
-
-   __flush_dcache_area(addr, size);
-}
-
-static bool stage2_pte_cacheable(kvm_pte_t pte)
-{
-   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
-   return memattr == PAGE_S2_MEMATTR(NORMAL);
-}
-
  static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
   enum kvm_pgtable_walk_flags flag,
   void * const arg)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 77cb2d28f2a4..d151927a7d62 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -609,11 +609,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
  }
  
-static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   __clean_dcache_guest_page(pfn, size);
-}
-
  static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
  {

Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler

2021-02-26 Thread wangyanan (Y)

Hi Marc, Alex,

On 2021/2/26 2:30, Marc Zyngier wrote:

On Thu, 25 Feb 2021 17:39:00 +,
Alexandru Elisei  wrote:

Hi Marc,

On 2/25/21 9:55 AM, Marc Zyngier wrote:

Hi Yanan,

On Mon, 08 Feb 2021 11:22:47 +,
Yanan Wang  wrote:

We currently uniformly clean the dcache in user_mem_abort() before calling the
fault handlers, if we take a translation fault and the pfn is cacheable.
But if there are concurrent translation faults on the same page or block,
only the first dcache clean is necessary; the later ones are not.

By moving the dcache clean into the map handler, we can easily identify the
conditions where CMOs are really needed and avoid the unnecessary ones.
Performing CMOs is a time-consuming process, especially when flushing a block
range, so this solution reduces the load on KVM and improves the efficiency
of creating mappings.

That's an interesting approach. However, wouldn't it be better to
identify early that there is already something mapped, and return to
the guest ASAP?

Wouldn't that introduce overhead for the common case, when there's
only one VCPU that faults on an address? For each data abort caused
by a missing stage 2 entry we would now have to determine if the IPA
isn't already mapped and that means walking the stage 2 tables.

The problem is that there is no easy way to define a "common case". It all
depends on what you are running in the guest.


Or am I mistaken and either:

(a) The common case is multiple simultaneous translation faults from
different VCPUs on the same IPA. Or

(b) There's a fast way to check if an IPA is mapped at stage 2 and
the overhead would be negligible.

Checking that something is mapped is simple enough: walk the S2 PT (in
SW or using AT/PAR), and return early if there is *anything*. You
already have taken the fault, which is the most expensive part of the
handling.
I think it may be better to move the CMOs (both dcache and icache) to the
fault handlers. The map path and the permission path already perform a page
table walk, and within those paths we can now easily distinguish the
conditions that need CMOs from the ones that don't. Why add one more PTW
early on just to identify the CMO cases, while ignoring the walk we already
have?


Besides, even if we know in advance that something is already mapped (the
table entry is valid), it may not be appropriate to just return early in all
cases. What if we are going to change the output address (OA) of the existing
table entry? We can't simply return in that case. I'm not sure whether this
is a valid example, though :).
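
As a rough, hypothetical sketch of that situation (not a quote of the code in
this tree, and the real comparison is more tolerant, e.g. of permission-only
changes), a map walker that finds a valid entry has to do something like the
following rather than unconditionally returning early:

	if (kvm_pte_valid(old)) {
		/* Recreating the same mapping: nothing to do, go back to the guest. */
		if (old == new)
			return -EAGAIN;

		/*
		 * The leaf really changes (e.g. a different output address):
		 * break-before-make, then fall through and install 'new'.
		 */
		kvm_set_invalid_pte(ptep);
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
		put_page(page);
	}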


Actually, moving the CMOs to the fault handlers does not disturb the existing
stage 2 page table framework, and the code change involved is fairly small.
Please see below.

Can you quantify the benefit of this patch alone?

And this ^^^ part is crucial to evaluating the merit of this patch,
especially outside of the micro-benchmark space.
The following test results show the benefit of this patch alone, and they
indicate that the benefit increases as the page table granularity increases.
Selftest:
https://lore.kernel.org/lkml/20210208090841.333724-1-wangyana...@huawei.com/



---
hardware platform: HiSilicon Kunpeng920 Server (FWB not supported)
host kernel: Linux mainline v5.11-rc6 (with series of
https://lore.kernel.org/r/20210114121350.123684-4-wangyana...@huawei.com applied)

(1) multiple vcpus concurrently access 1G memory.
    execution time of: a) KVM create new page mappings (normal 4K),
    b) update the mappings from RO to RW.

cmdline: ./kvm_page_table_test -m 4 -t 0 -g 4K -s 1G -v 50
   (50 vcpus, 1G memory, page mappings (normal 4K))
a) Before patch: KVM_CREATE_MAPPINGS: 62.752s 62.123s 61.733s 62.562s 61.847s
   After  patch: KVM_CREATE_MAPPINGS: 58.800s 58.364s 58.163s 58.370s 58.677s *average 7% improvement*
b) Before patch: KVM_UPDATE_MAPPINGS: 49.083s 49.920s 49.484s 49.551s 49.410s
   After  patch: KVM_UPDATE_MAPPINGS: 48.723s 49.259s 49.204s 48.207s 49.112s *no change*

cmdline: ./kvm_page_table_test -m 4 -t 0 -g 4K -s 1G -v 100
   (100 vcpus, 1G memory, page mappings (normal 4K))
a) Before patch: KVM_CREATE_MAPPINGS: 129.70s 129.66s 126.78s 126.07s 130.21s
   After  patch: KVM_CREATE_MAPPINGS: 120.69s 120.28s 120.68s 121.09s 121.34s *average 9% improvement*
b) Before patch: KVM_UPDATE_MAPPINGS: 94.097s 94.501s 92.589s 93.957s 94.317s
   After  patch: KVM_UPDATE_MAPPINGS: 93.677s 93.701s 93.036s 93.484s 93.584s *no change*

(2) multiple vcpus concurrently access 20G memory.
    execution time of: a) KVM create new block mappings (THP 2M),
    b) split the blocks in dirty logging, c) reconstitute the blocks after dirty logging.

cmdline: ./kvm_page_table_test -m 4 -t 1 -g 2M -s 20G -v 20
   (20 vcpus, 20G memory, block mappings (THP 2M))
a) Before patch: KVM_CREATE_MAPPINGS: 12.546s 13.300s 12.448s 12.496s 12.420s
   After  patch: KVM_CREATE_MAPPINGS:  5.679s  5.773s  5.759s  5.698s  5.835s *average 54% improvement*
b) Before patch: 

Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler

2021-02-25 Thread Marc Zyngier
On Thu, 25 Feb 2021 17:39:00 +,
Alexandru Elisei  wrote:
> 
> Hi Marc,
> 
> On 2/25/21 9:55 AM, Marc Zyngier wrote:
> > Hi Yanan,
> >
> > On Mon, 08 Feb 2021 11:22:47 +,
> > Yanan Wang  wrote:
> >> We currently uniformly clean the dcache in user_mem_abort() before calling the
> >> fault handlers, if we take a translation fault and the pfn is cacheable.
> >> But if there are concurrent translation faults on the same page or block,
> >> only the first dcache clean is necessary; the later ones are not.
> >>
> >> By moving the dcache clean into the map handler, we can easily identify the
> >> conditions where CMOs are really needed and avoid the unnecessary ones.
> >> Performing CMOs is a time-consuming process, especially when flushing a block
> >> range, so this solution reduces the load on KVM and improves the efficiency
> >> of creating mappings.
> > That's an interesting approach. However, wouldn't it be better to
> > identify early that there is already something mapped, and return to
> > the guest ASAP?
> 
> Wouldn't that introduce overhead for the common case, when there's
> only one VCPU that faults on an address? For each data abort caused
> by a missing stage 2 entry we would now have to determine if the IPA
> isn't already mapped and that means walking the stage 2 tables.

The problem is that there is no easy way to define a "common case". It all
depends on what you are running in the guest.

> Or am I mistaken and either:
> 
> (a) The common case is multiple simultaneous translation faults from
> different VCPUs on the same IPA. Or
> 
> (b) There's a fast way to check if an IPA is mapped at stage 2 and
> the overhead would be negligible.

Checking that something is mapped is simple enough: walk the S2 PT (in
SW or using AT/PAR), and return early if there is *anything*. You
already have taken the fault, which is the most expensive part of the
handling.
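
A minimal sketch of such an early-exit check (hypothetical code, not part of
this series, assuming the generic walker in pgtable.c, i.e. kvm_pgtable_walk()
with KVM_PGTABLE_WALK_LEAF and the kvm_pte_valid() helper):

static int stage2_mapped_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
				enum kvm_pgtable_walk_flags flag,
				void * const arg)
{
	/* Any valid leaf ends the walk early; the error code is just a marker. */
	return kvm_pte_valid(*ptep) ? -EEXIST : 0;
}

static bool stage2_is_mapped(struct kvm_pgtable *pgt, u64 ipa)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_mapped_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
	};

	/* -EEXIST from the callback means a valid leaf already covers this IPA. */
	return kvm_pgtable_walk(pgt, ipa, PAGE_SIZE, &walker) == -EEXIST;
}

user_mem_abort() could return to the guest as soon as such a check says the
IPA is mapped, at the cost of the extra walk per fault discussed above.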

> 
> >
> > Can you quantify the benefit of this patch alone?

And this ^^^ part is crucial to evaluating the merit of this patch,
especially outside of the micro-benchmark space.

> >
> >> Signed-off-by: Yanan Wang 
> >> ---
> >>  arch/arm64/include/asm/kvm_mmu.h | 16 --
> >>  arch/arm64/kvm/hyp/pgtable.c | 38 
> >>  arch/arm64/kvm/mmu.c | 14 +++-
> >>  3 files changed, 27 insertions(+), 41 deletions(-)
> >>
> >> diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> >> b/arch/arm64/include/asm/kvm_mmu.h
> >> index e52d82aeadca..4ec9879e82ed 100644
> >> --- a/arch/arm64/include/asm/kvm_mmu.h
> >> +++ b/arch/arm64/include/asm/kvm_mmu.h
> >> @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct 
> >> kvm_vcpu *vcpu)
> >>return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
> >>  }
> >>  
> >> -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long 
> >> size)
> >> -{
> >> -  void *va = page_address(pfn_to_page(pfn));
> >> -
> >> -  /*
> >> -   * With FWB, we ensure that the guest always accesses memory using
> >> -   * cacheable attributes, and we don't have to clean to PoC when
> >> -   * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
> >> -   * PoU is not required either in this case.
> >> -   */
> >> -  if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> >> -  return;
> >> -
> >> -  kvm_flush_dcache_to_poc(va, size);
> >> -}
> >> -
> >>  static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
> >>  unsigned long size)
> >>  {
> >> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> >> index 4d177ce1d536..2f4f87021980 100644
> >> --- a/arch/arm64/kvm/hyp/pgtable.c
> >> +++ b/arch/arm64/kvm/hyp/pgtable.c
> >> @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum 
> >> kvm_pgtable_prot prot,
> >>return 0;
> >>  }
> >>  
> >> +static bool stage2_pte_cacheable(kvm_pte_t pte)
> >> +{
> >> +  u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
> >> +  return memattr == PAGE_S2_MEMATTR(NORMAL);
> >> +}
> >> +
> >> +static void stage2_flush_dcache(void *addr, u64 size)
> >> +{
> >> +  /*
> >> +   * With FWB, we ensure that the guest always accesses memory using
> >> +   * cacheable attributes, and we don't have to clean to PoC when
> >> +   * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
> >> +   * PoU is not required either in this case.
> >> +   */
> >> +  if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> >> +  return;
> >> +
> >> +  __flush_dcache_area(addr, size);
> >> +}
> >> +
> >>  static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
> >>  kvm_pte_t *ptep,
> >>  struct stage2_map_data *data)
> >> @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 
> >> end, u32 level,
> >>put_page(page);
> >>}
> >>  
> >> +  /* Flush data cache before installation of the new PTE */
> >> +  if 

Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler

2021-02-25 Thread Alexandru Elisei
Hi Marc,

On 2/25/21 9:55 AM, Marc Zyngier wrote:
> Hi Yanan,
>
> On Mon, 08 Feb 2021 11:22:47 +,
> Yanan Wang  wrote:
>> We currently uniformly clean the dcache in user_mem_abort() before calling the
>> fault handlers, if we take a translation fault and the pfn is cacheable.
>> But if there are concurrent translation faults on the same page or block,
>> only the first dcache clean is necessary; the later ones are not.
>>
>> By moving the dcache clean into the map handler, we can easily identify the
>> conditions where CMOs are really needed and avoid the unnecessary ones.
>> Performing CMOs is a time-consuming process, especially when flushing a block
>> range, so this solution reduces the load on KVM and improves the efficiency
>> of creating mappings.
> That's an interesting approach. However, wouldn't it be better to
> identify early that there is already something mapped, and return to
> the guest ASAP?

Wouldn't that introduce overhead for the common case, when there's only one VCPU
that faults on an address? For each data abort caused by a missing stage 2 entry
we would now have to determine if the IPA isn't already mapped and that means
walking the stage 2 tables.

Or am I mistaken and either:

(a) The common case is multiple simultaneous translation faults from different
VCPUs on the same IPA. Or

(b) There's a fast way to check if an IPA is mapped at stage 2 and the overhead
would be negligible.

>
> Can you quantify the benefit of this patch alone?
>
>> Signed-off-by: Yanan Wang 
>> ---
>>  arch/arm64/include/asm/kvm_mmu.h | 16 --
>>  arch/arm64/kvm/hyp/pgtable.c | 38 
>>  arch/arm64/kvm/mmu.c | 14 +++-
>>  3 files changed, 27 insertions(+), 41 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_mmu.h 
>> b/arch/arm64/include/asm/kvm_mmu.h
>> index e52d82aeadca..4ec9879e82ed 100644
>> --- a/arch/arm64/include/asm/kvm_mmu.h
>> +++ b/arch/arm64/include/asm/kvm_mmu.h
>> @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct 
>> kvm_vcpu *vcpu)
>>  return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
>>  }
>>  
>> -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long 
>> size)
>> -{
>> -void *va = page_address(pfn_to_page(pfn));
>> -
>> -/*
>> - * With FWB, we ensure that the guest always accesses memory using
>> - * cacheable attributes, and we don't have to clean to PoC when
>> - * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
>> - * PoU is not required either in this case.
>> - */
>> -if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
>> -return;
>> -
>> -kvm_flush_dcache_to_poc(va, size);
>> -}
>> -
>>  static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
>>unsigned long size)
>>  {
>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
>> index 4d177ce1d536..2f4f87021980 100644
>> --- a/arch/arm64/kvm/hyp/pgtable.c
>> +++ b/arch/arm64/kvm/hyp/pgtable.c
>> @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum 
>> kvm_pgtable_prot prot,
>>  return 0;
>>  }
>>  
>> +static bool stage2_pte_cacheable(kvm_pte_t pte)
>> +{
>> +u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
>> +return memattr == PAGE_S2_MEMATTR(NORMAL);
>> +}
>> +
>> +static void stage2_flush_dcache(void *addr, u64 size)
>> +{
>> +/*
>> + * With FWB, we ensure that the guest always accesses memory using
>> + * cacheable attributes, and we don't have to clean to PoC when
>> + * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
>> + * PoU is not required either in this case.
>> + */
>> +if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
>> +return;
>> +
>> +__flush_dcache_area(addr, size);
>> +}
>> +
>>  static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
>>kvm_pte_t *ptep,
>>struct stage2_map_data *data)
>> @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 
>> end, u32 level,
>>  put_page(page);
>>  }
>>  
>> +/* Flush data cache before installation of the new PTE */
>> +if (stage2_pte_cacheable(new))
>> +stage2_flush_dcache(__va(phys), granule);
>> +
>>  smp_store_release(ptep, new);
>>  get_page(page);
>>  data->phys += granule;
>> @@ -651,20 +675,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 
>> addr, u64 size,
>>  return ret;
>>  }
>>  
>> -static void stage2_flush_dcache(void *addr, u64 size)
>> -{
>> -if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
>> -return;
>> -
>> -__flush_dcache_area(addr, size);
>> -}
>> -
>> -static bool stage2_pte_cacheable(kvm_pte_t pte)
>> -{
>> -u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
>> -return memattr == PAGE_S2_MEMATTR(NORMAL);
>> -}
>> 

Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler

2021-02-25 Thread Alexandru Elisei
Hi Marc,

On 2/24/21 5:39 PM, Marc Zyngier wrote:
> On Wed, 24 Feb 2021 17:21:22 +,
> Alexandru Elisei  wrote:
>> Hello,
>>
>> On 2/8/21 11:22 AM, Yanan Wang wrote:
>>> We currently uniformly clean the dcache in user_mem_abort() before calling the
>>> fault handlers, if we take a translation fault and the pfn is cacheable.
>>> But if there are concurrent translation faults on the same page or block,
>>> only the first dcache clean is necessary; the later ones are not.
>>>
>>> By moving the dcache clean into the map handler, we can easily identify the
>>> conditions where CMOs are really needed and avoid the unnecessary ones.
>>> Performing CMOs is a time-consuming process, especially when flushing a block
>>> range, so this solution reduces the load on KVM and improves the efficiency
>>> of creating mappings.
>>>
>>> Signed-off-by: Yanan Wang 
>>> ---
>>>  arch/arm64/include/asm/kvm_mmu.h | 16 --
>>>  arch/arm64/kvm/hyp/pgtable.c | 38 
>>>  arch/arm64/kvm/mmu.c | 14 +++-
>>>  3 files changed, 27 insertions(+), 41 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/kvm_mmu.h 
>>> b/arch/arm64/include/asm/kvm_mmu.h
>>> index e52d82aeadca..4ec9879e82ed 100644
>>> --- a/arch/arm64/include/asm/kvm_mmu.h
>>> +++ b/arch/arm64/include/asm/kvm_mmu.h
>>> @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct 
>>> kvm_vcpu *vcpu)
>>> return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
>>>  }
>>>  
>>> -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long 
>>> size)
>>> -{
>>> -   void *va = page_address(pfn_to_page(pfn));
>>> -
>>> -   /*
>>> -* With FWB, we ensure that the guest always accesses memory using
>>> -* cacheable attributes, and we don't have to clean to PoC when
>>> -* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
>>> -* PoU is not required either in this case.
>>> -*/
>>> -   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
>>> -   return;
>>> -
>>> -   kvm_flush_dcache_to_poc(va, size);
>>> -}
>>> -
>>>  static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
>>>   unsigned long size)
>>>  {
>>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
>>> index 4d177ce1d536..2f4f87021980 100644
>>> --- a/arch/arm64/kvm/hyp/pgtable.c
>>> +++ b/arch/arm64/kvm/hyp/pgtable.c
>>> @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum 
>>> kvm_pgtable_prot prot,
>>> return 0;
>>>  }
>>>  
>>> +static bool stage2_pte_cacheable(kvm_pte_t pte)
>>> +{
>>> +   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
>>> +   return memattr == PAGE_S2_MEMATTR(NORMAL);
>>> +}
>>> +
>>> +static void stage2_flush_dcache(void *addr, u64 size)
>>> +{
>>> +   /*
>>> +* With FWB, we ensure that the guest always accesses memory using
>>> +* cacheable attributes, and we don't have to clean to PoC when
>>> +* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
>>> +* PoU is not required either in this case.
>>> +*/
>>> +   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
>>> +   return;
>>> +
>>> +   __flush_dcache_area(addr, size);
>>> +}
>>> +
>>>  static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
>>>   kvm_pte_t *ptep,
>>>   struct stage2_map_data *data)
>>> @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 
>>> end, u32 level,
>>> put_page(page);
>>> }
>>>  
>>> +   /* Flush data cache before installation of the new PTE */
>>> +   if (stage2_pte_cacheable(new))
>>> +   stage2_flush_dcache(__va(phys), granule);
>> This makes sense to me. kvm_pgtable_stage2_map() is protected
>> against concurrent calls by the kvm->mmu_lock, so only one VCPU can
>> change the stage 2 translation table at any given moment. In the
>> case of concurrent translation faults on the same IPA, the first
>> VCPU that will take the lock will create the mapping and do the
>> dcache clean+invalidate. The other VCPUs will return -EAGAIN because
>> the mapping they are trying to install is almost identical* to the
>> mapping created by the first VCPU that took the lock.
>>
>> I have a question. Why are you doing the cache maintenance *before*
>> installing the new mapping? This is what the kernel already does, so
>> I'm not saying it's incorrect, I'm just curious about the reason
>> behind it.
> The guarantee KVM offers to the guest is that by the time it can
> access the memory, it is cleaned to the PoC. If you establish a
> mapping before cleaning, another vcpu can access the PoC (no fault,
> you just set up S2) and not see it up to date.

Right, I knew I was missing something, thanks for the explanation.

Thanks,

Alex



Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler

2021-02-24 Thread Marc Zyngier
On Wed, 24 Feb 2021 17:21:22 +,
Alexandru Elisei  wrote:
> 
> Hello,
> 
> On 2/8/21 11:22 AM, Yanan Wang wrote:
> > We currently uniformly clean the dcache in user_mem_abort() before calling the
> > fault handlers, if we take a translation fault and the pfn is cacheable.
> > But if there are concurrent translation faults on the same page or block,
> > only the first dcache clean is necessary; the later ones are not.
> >
> > By moving the dcache clean into the map handler, we can easily identify the
> > conditions where CMOs are really needed and avoid the unnecessary ones.
> > Performing CMOs is a time-consuming process, especially when flushing a block
> > range, so this solution reduces the load on KVM and improves the efficiency
> > of creating mappings.
> >
> > Signed-off-by: Yanan Wang 
> > ---
> >  arch/arm64/include/asm/kvm_mmu.h | 16 --
> >  arch/arm64/kvm/hyp/pgtable.c | 38 
> >  arch/arm64/kvm/mmu.c | 14 +++-
> >  3 files changed, 27 insertions(+), 41 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> > b/arch/arm64/include/asm/kvm_mmu.h
> > index e52d82aeadca..4ec9879e82ed 100644
> > --- a/arch/arm64/include/asm/kvm_mmu.h
> > +++ b/arch/arm64/include/asm/kvm_mmu.h
> > @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct 
> > kvm_vcpu *vcpu)
> > return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
> >  }
> >  
> > -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long 
> > size)
> > -{
> > -   void *va = page_address(pfn_to_page(pfn));
> > -
> > -   /*
> > -* With FWB, we ensure that the guest always accesses memory using
> > -* cacheable attributes, and we don't have to clean to PoC when
> > -* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
> > -* PoU is not required either in this case.
> > -*/
> > -   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> > -   return;
> > -
> > -   kvm_flush_dcache_to_poc(va, size);
> > -}
> > -
> >  static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
> >   unsigned long size)
> >  {
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index 4d177ce1d536..2f4f87021980 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum 
> > kvm_pgtable_prot prot,
> > return 0;
> >  }
> >  
> > +static bool stage2_pte_cacheable(kvm_pte_t pte)
> > +{
> > +   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
> > +   return memattr == PAGE_S2_MEMATTR(NORMAL);
> > +}
> > +
> > +static void stage2_flush_dcache(void *addr, u64 size)
> > +{
> > +   /*
> > +* With FWB, we ensure that the guest always accesses memory using
> > +* cacheable attributes, and we don't have to clean to PoC when
> > +* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
> > +* PoU is not required either in this case.
> > +*/
> > +   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> > +   return;
> > +
> > +   __flush_dcache_area(addr, size);
> > +}
> > +
> >  static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
> >   kvm_pte_t *ptep,
> >   struct stage2_map_data *data)
> > @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 
> > end, u32 level,
> > put_page(page);
> > }
> >  
> > +   /* Flush data cache before installation of the new PTE */
> > +   if (stage2_pte_cacheable(new))
> > +   stage2_flush_dcache(__va(phys), granule);
> 
> This makes sense to me. kvm_pgtable_stage2_map() is protected
> against concurrent calls by the kvm->mmu_lock, so only one VCPU can
> change the stage 2 translation table at any given moment. In the
> case of concurrent translation faults on the same IPA, the first
> VCPU that will take the lock will create the mapping and do the
> dcache clean+invalidate. The other VCPUs will return -EAGAIN because
> the mapping they are trying to install is almost identical* to the
> mapping created by the first VCPU that took the lock.
> 
> I have a question. Why are you doing the cache maintenance *before*
> installing the new mapping? This is what the kernel already does, so
> I'm not saying it's incorrect, I'm just curious about the reason
> behind it.

The guarantee KVM offers to the guest is that by the time it can
access the memory, it is cleaned to the PoC. If you establish a
mapping before cleaning, another vcpu can access the PoC (no fault,
you just set up S2) and not see it up to date.
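
Concretely, the ordering the patch relies on in stage2_map_walker_try_leaf()
is "complete the clean, then publish the PTE"; the hunk from this patch,
annotated:

	/*
	 * Clean the data cache to the PoC *before* the new PTE becomes
	 * visible: once it is published, another vcpu can translate through
	 * it without faulting and must not observe stale data.
	 */
	if (stage2_pte_cacheable(new))
		stage2_flush_dcache(__va(phys), granule);

	/* Publish the new mapping only after the clean has completed. */
	smp_store_release(ptep, new);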

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler

2021-02-24 Thread Alexandru Elisei
Hello,

On 2/8/21 11:22 AM, Yanan Wang wrote:
> We currently uniformly clean the dcache in user_mem_abort() before calling the
> fault handlers, if we take a translation fault and the pfn is cacheable.
> But if there are concurrent translation faults on the same page or block,
> only the first dcache clean is necessary; the later ones are not.
>
> By moving the dcache clean into the map handler, we can easily identify the
> conditions where CMOs are really needed and avoid the unnecessary ones.
> Performing CMOs is a time-consuming process, especially when flushing a block
> range, so this solution reduces the load on KVM and improves the efficiency
> of creating mappings.
>
> Signed-off-by: Yanan Wang 
> ---
>  arch/arm64/include/asm/kvm_mmu.h | 16 --
>  arch/arm64/kvm/hyp/pgtable.c | 38 
>  arch/arm64/kvm/mmu.c | 14 +++-
>  3 files changed, 27 insertions(+), 41 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> b/arch/arm64/include/asm/kvm_mmu.h
> index e52d82aeadca..4ec9879e82ed 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct 
> kvm_vcpu *vcpu)
>   return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
>  }
>  
> -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long 
> size)
> -{
> - void *va = page_address(pfn_to_page(pfn));
> -
> - /*
> -  * With FWB, we ensure that the guest always accesses memory using
> -  * cacheable attributes, and we don't have to clean to PoC when
> -  * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
> -  * PoU is not required either in this case.
> -  */
> - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> - return;
> -
> - kvm_flush_dcache_to_poc(va, size);
> -}
> -
>  static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
> unsigned long size)
>  {
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 4d177ce1d536..2f4f87021980 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum 
> kvm_pgtable_prot prot,
>   return 0;
>  }
>  
> +static bool stage2_pte_cacheable(kvm_pte_t pte)
> +{
> + u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
> + return memattr == PAGE_S2_MEMATTR(NORMAL);
> +}
> +
> +static void stage2_flush_dcache(void *addr, u64 size)
> +{
> + /*
> +  * With FWB, we ensure that the guest always accesses memory using
> +  * cacheable attributes, and we don't have to clean to PoC when
> +  * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
> +  * PoU is not required either in this case.
> +  */
> + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> + return;
> +
> + __flush_dcache_area(addr, size);
> +}
> +
>  static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
> kvm_pte_t *ptep,
> struct stage2_map_data *data)
> @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
> u32 level,
>   put_page(page);
>   }
>  
> + /* Flush data cache before installation of the new PTE */
> + if (stage2_pte_cacheable(new))
> + stage2_flush_dcache(__va(phys), granule);

This makes sense to me. kvm_pgtable_stage2_map() is protected against
concurrent calls by the kvm->mmu_lock, so only one VCPU can change the stage 2
translation table at any given moment. In the case of concurrent translation
faults on the same IPA, the first VCPU that will take the lock will create the
mapping and do the dcache clean+invalidate. The other VCPUs will return -EAGAIN
because the mapping they are trying to install is almost identical* to the
mapping created by the first VCPU that took the lock.

I have a question. Why are you doing the cache maintenance *before* installing
the new mapping? This is what the kernel already does, so I'm not saying it's
incorrect, I'm just curious about the reason behind it.

*permissions might be different.
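
For reference, the serialization described above comes from the caller side;
simplified from the 5.11-era user_mem_abort() (most details elided):

	spin_lock(&kvm->mmu_lock);
	/* Only one vcpu at a time can reach the stage 2 map path. */
	ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
				     __pfn_to_phys(pfn), prot, memcache);
	spin_unlock(&kvm->mmu_lock);

	/* -EAGAIN (the mapping is already there) just re-enters the guest. */
	return ret != -EAGAIN ? ret : 0;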

Thanks,

Alex

> +
>   smp_store_release(ptep, new);
>   get_page(page);
>   data->phys += granule;
> @@ -651,20 +675,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 
> addr, u64 size,
>   return ret;
>  }
>  
> -static void stage2_flush_dcache(void *addr, u64 size)
> -{
> - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> - return;
> -
> - __flush_dcache_area(addr, size);
> -}
> -
> -static bool stage2_pte_cacheable(kvm_pte_t pte)
> -{
> - u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
> - return memattr == PAGE_S2_MEMATTR(NORMAL);
> -}
> -
>  static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> 

[RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler

2021-02-08 Thread Yanan Wang
We currently uniformly clean the dcache in user_mem_abort() before calling the
fault handlers, if we take a translation fault and the pfn is cacheable.
But if there are concurrent translation faults on the same page or block,
only the first dcache clean is necessary; the later ones are not.

By moving the dcache clean into the map handler, we can easily identify the
conditions where CMOs are really needed and avoid the unnecessary ones.
Performing CMOs is a time-consuming process, especially when flushing a block
range, so this solution reduces the load on KVM and improves the efficiency
of creating mappings.

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/kvm_mmu.h | 16 --
 arch/arm64/kvm/hyp/pgtable.c | 38 
 arch/arm64/kvm/mmu.c | 14 +++-
 3 files changed, 27 insertions(+), 41 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index e52d82aeadca..4ec9879e82ed 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   void *va = page_address(pfn_to_page(pfn));
-
-   /*
-* With FWB, we ensure that the guest always accesses memory using
-* cacheable attributes, and we don't have to clean to PoC when
-* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
-* PoU is not required either in this case.
-*/
-   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-   return;
-
-   kvm_flush_dcache_to_poc(va, size);
-}
-
 static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
  unsigned long size)
 {
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 4d177ce1d536..2f4f87021980 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
return 0;
 }
 
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
+   return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+   /*
+* With FWB, we ensure that the guest always accesses memory using
+* cacheable attributes, and we don't have to clean to PoC when
+* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
+* PoU is not required either in this case.
+*/
+   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+   return;
+
+   __flush_dcache_area(addr, size);
+}
+
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
  kvm_pte_t *ptep,
  struct stage2_map_data *data)
@@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
put_page(page);
}
 
+   /* Flush data cache before installation of the new PTE */
+   if (stage2_pte_cacheable(new))
+   stage2_flush_dcache(__va(phys), granule);
+
smp_store_release(ptep, new);
get_page(page);
data->phys += granule;
@@ -651,20 +675,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
return ret;
 }
 
-static void stage2_flush_dcache(void *addr, u64 size)
-{
-   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-   return;
-
-   __flush_dcache_area(addr, size);
-}
-
-static bool stage2_pte_cacheable(kvm_pte_t pte)
-{
-   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
-   return memattr == PAGE_S2_MEMATTR(NORMAL);
-}
-
 static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
   enum kvm_pgtable_walk_flags flag,
   void * const arg)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 77cb2d28f2a4..d151927a7d62 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -609,11 +609,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   __clean_dcache_guest_page(pfn, size);
-}
-
 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
 {
__invalidate_icache_guest_page(pfn, size);
@@ -882,9 +877,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (writable)
prot |= KVM_PGTABLE_PROT_W;
 
-   if (fault_status != FSC_PERM && !device)
-   clean_dcache_guest_page(pfn, vma_pagesize);
-
if