Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler
On 2021/2/25 17:55, Marc Zyngier wrote: Hi Yanan, On Mon, 08 Feb 2021 11:22:47 +, Yanan Wang wrote: We currently uniformly clean dcache in user_mem_abort() before calling the fault handlers, if we take a translation fault and the pfn is cacheable. But if there are concurrent translation faults on the same page or block, clean of dcache for the first time is necessary while the others are not. By moving clean of dcache to the map handler, we can easily identify the conditions where CMOs are really needed and avoid the unnecessary ones. As it's a time consuming process to perform CMOs especially when flushing a block range, so this solution reduces much load of kvm and improve the efficiency of creating mappings. That's an interesting approach. However, wouldn't it be better to identify early that there is already something mapped, and return to the guest ASAP? Can you quantify the benefit of this patch alone? Signed-off-by: Yanan Wang --- arch/arm64/include/asm/kvm_mmu.h | 16 -- arch/arm64/kvm/hyp/pgtable.c | 38 arch/arm64/kvm/mmu.c | 14 +++- 3 files changed, 27 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e52d82aeadca..4ec9879e82ed 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; } -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) -{ - void *va = page_address(pfn_to_page(pfn)); - - /* -* With FWB, we ensure that the guest always accesses memory using -* cacheable attributes, and we don't have to clean to PoC when -* faulting in pages. Furthermore, FWB implies IDC, so cleaning to -* PoU is not required either in this case. 
-*/ - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) - return; - - kvm_flush_dcache_to_poc(va, size); -} - static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) { diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 4d177ce1d536..2f4f87021980 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot, return 0; } +static bool stage2_pte_cacheable(kvm_pte_t pte) +{ + u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; + return memattr == PAGE_S2_MEMATTR(NORMAL); +} + +static void stage2_flush_dcache(void *addr, u64 size) +{ + /* +* With FWB, we ensure that the guest always accesses memory using +* cacheable attributes, and we don't have to clean to PoC when +* faulting in pages. Furthermore, FWB implies IDC, so cleaning to +* PoU is not required either in this case. +*/ + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + return; + + __flush_dcache_area(addr, size); +} + static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, put_page(page); } + /* Flush data cache before installation of the new PTE */ + if (stage2_pte_cacheable(new)) + stage2_flush_dcache(__va(phys), granule); + smp_store_release(ptep, new); get_page(page); data->phys += granule; @@ -651,20 +675,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, return ret; } -static void stage2_flush_dcache(void *addr, u64 size) -{ - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) - return; - - __flush_dcache_area(addr, size); -} - -static bool stage2_pte_cacheable(kvm_pte_t pte) -{ - u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; - return memattr == PAGE_S2_MEMATTR(NORMAL); -} - static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags 
flag, void * const arg) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 77cb2d28f2a4..d151927a7d62 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -609,11 +609,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) -{ - __clean_dcache_guest_page(pfn, size); -} - static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) {
Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler
Hi Marc, Alex, On 2021/2/26 2:30, Marc Zyngier wrote: On Thu, 25 Feb 2021 17:39:00 +, Alexandru Elisei wrote: Hi Marc, On 2/25/21 9:55 AM, Marc Zyngier wrote: Hi Yanan, On Mon, 08 Feb 2021 11:22:47 +, Yanan Wang wrote: We currently uniformly clean dcache in user_mem_abort() before calling the fault handlers, if we take a translation fault and the pfn is cacheable. But if there are concurrent translation faults on the same page or block, clean of dcache for the first time is necessary while the others are not. By moving clean of dcache to the map handler, we can easily identify the conditions where CMOs are really needed and avoid the unnecessary ones. As it's a time consuming process to perform CMOs especially when flushing a block range, so this solution reduces much load of kvm and improve the efficiency of creating mappings. That's an interesting approach. However, wouldn't it be better to identify early that there is already something mapped, and return to the guest ASAP? Wouldn't that introduce overhead for the common case, when there's only one VCPU that faults on an address? For each data abort caused by a missing stage 2 entry we would now have to determine if the IPA isn't already mapped and that means walking the stage 2 tables. The problem is that there is no easy way to define "common case". It all depends on what you are running in the guest. Or am I mistaken and either: (a) The common case is multiple simultaneous translation faults from different VCPUs on the same IPA. Or (b) There's a fast way to check if an IPA is mapped at stage 2 and the overhead would be negligible. Checking that something is mapped is simple enough: walk the S2 PT (in SW or using AT/PAR), and return early if there is *anything*. You already have taken the fault, which is the most expensive part of the handling. I think maybe it could be better to move CMOs (both dcache and icache) to the fault handlers. 
The map path and permission path are actually a page table walk, and we can easily distinguish between conditions that need CMOs and the ones that don't in the paths now. Why do we have to add one more PTW early just for identifying the cases of CMOs and ignore the existing one? Besides, if we know in advance there is already something mapped (page table is valid), maybe it's not appropriate to just return early in all cases. What if we are going to change the output address (OA) of the existing table entry? We can't just return in this case. I'm not sure whether this is a correct example :). Actually, moving CMOs to the fault handlers will not ruin the existing stage2 page table framework, and there will not be so much code change. Please see below. Can you quantify the benefit of this patch alone? And this ^^^ part is crucial to evaluating the merit of this patch, specially outside of the micro-benchmark space. The following test results represent the benefit of this patch alone, and it's indicated that the benefit increases as the page table granularity increases. Selftest: https://lore.kernel.org/lkml/20210208090841.333724-1-wangyana...@huawei.com/ --- hardware platform: HiSilicon Kunpeng920 Server(FWB not supported) host kernel: Linux mainline v5.11-rc6 (with series of https://lore.kernel.org/r/20210114121350.123684-4-wangyana...@huawei.com applied) (1) multiple vcpus concurrently access 1G memory. execution time of: a) KVM create new page mappings(normal 4K), b) update the mappings from RO to RW. 
cmdline: ./kvm_page_table_test -m 4 -t 0 -g 4K -s 1G -v 50 (50 vcpus, 1G memory, page mappings(normal 4K)) a) Before patch: KVM_CREATE_MAPPINGS: 62.752s 62.123s 61.733s 62.562s 61.847s After patch: KVM_CREATE_MAPPINGS: 58.800s 58.364s 58.163s 58.370s 58.677s *average 7% improvement* b) Before patch: KVM_UPDATE_MAPPINGS: 49.083s 49.920s 49.484s 49.551s 49.410s After patch: KVM_UPDATE_MAPPINGS: 48.723s 49.259s 49.204s 48.207s 49.112s *no change* cmdline: ./kvm_page_table_test -m 4 -t 0 -g 4K -s 1G -v 100 (100 vcpus, 1G memory, page mappings(normal 4K)) a) Before patch: KVM_CREATE_MAPPINGS: 129.70s 129.66s 126.78s 126.07s 130.21s After patch: KVM_CREATE_MAPPINGS: 120.69s 120.28s 120.68s 121.09s 121.34s *average 9% improvement* b) Before patch: KVM_UPDATE_MAPPINGS: 94.097s 94.501s 92.589s 93.957s 94.317s After patch: KVM_UPDATE_MAPPINGS: 93.677s 93.701s 93.036s 93.484s 93.584s *no change* (2) multiple vcpus concurrently access 20G memory. execution time of: a) KVM create new block mappings(THP 2M), b) split the blocks in dirty logging, c) reconstitute the blocks after dirty logging. cmdline: ./kvm_page_table_test -m 4 -t 1 -g 2M -s 20G -v 20 (20 vcpus, 20G memory, block mappings(THP 2M)) a) Before patch: KVM_CREATE_MAPPINGS: 12.546s 13.300s 12.448s 12.496s 12.420s After patch: KVM_CREATE_MAPPINGS: 5.679s 5.773s 5.759s 5.698s 5.835s *average 54% improvement* b) Before patch:
Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler
On Thu, 25 Feb 2021 17:39:00 +, Alexandru Elisei wrote: > > Hi Marc, > > On 2/25/21 9:55 AM, Marc Zyngier wrote: > > Hi Yanan, > > > > On Mon, 08 Feb 2021 11:22:47 +, > > Yanan Wang wrote: > >> We currently uniformly clean dcache in user_mem_abort() before calling the > >> fault handlers, if we take a translation fault and the pfn is cacheable. > >> But if there are concurrent translation faults on the same page or block, > >> clean of dcache for the first time is necessary while the others are not. > >> > >> By moving clean of dcache to the map handler, we can easily identify the > >> conditions where CMOs are really needed and avoid the unnecessary ones. > >> As it's a time consuming process to perform CMOs especially when flushing > >> a block range, so this solution reduces much load of kvm and improve the > >> efficiency of creating mappings. > > That's an interesting approach. However, wouldn't it be better to > > identify early that there is already something mapped, and return to > > the guest ASAP? > > Wouldn't that introduce overhead for the common case, when there's > only one VCPU that faults on an address? For each data abort caused > by a missing stage 2 entry we would now have to determine if the IPA > isn't already mapped and that means walking the stage 2 tables. The problem is that there is no easy to define "common case". It all depends on what you are running in the guest. > Or am I mistaken and either: > > (a) The common case is multiple simultaneous translation faults from > different VCPUs on the same IPA. Or > > (b) There's a fast way to check if an IPA is mapped at stage 2 and > the overhead would be negligible. Checking that something is mapped is simple enough: walk the S2 PT (in SW or using AT/PAR), and return early if there is *anything*. You already have taken the fault, which is the most expensive part of the handling. > > > > > Can you quantify the benefit of this patch alone? 
And this ^^^ part is crucial to evaluating the merit of this patch, specially outside of the micro-benchmark space. > > > >> Signed-off-by: Yanan Wang > >> --- > >> arch/arm64/include/asm/kvm_mmu.h | 16 -- > >> arch/arm64/kvm/hyp/pgtable.c | 38 > >> arch/arm64/kvm/mmu.c | 14 +++- > >> 3 files changed, 27 insertions(+), 41 deletions(-) > >> > >> diff --git a/arch/arm64/include/asm/kvm_mmu.h > >> b/arch/arm64/include/asm/kvm_mmu.h > >> index e52d82aeadca..4ec9879e82ed 100644 > >> --- a/arch/arm64/include/asm/kvm_mmu.h > >> +++ b/arch/arm64/include/asm/kvm_mmu.h > >> @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct > >> kvm_vcpu *vcpu) > >>return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; > >> } > >> > >> -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long > >> size) > >> -{ > >> - void *va = page_address(pfn_to_page(pfn)); > >> - > >> - /* > >> - * With FWB, we ensure that the guest always accesses memory using > >> - * cacheable attributes, and we don't have to clean to PoC when > >> - * faulting in pages. Furthermore, FWB implies IDC, so cleaning to > >> - * PoU is not required either in this case. 
> >> - */ > >> - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) > >> - return; > >> - > >> - kvm_flush_dcache_to_poc(va, size); > >> -} > >> - > >> static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, > >> unsigned long size) > >> { > >> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c > >> index 4d177ce1d536..2f4f87021980 100644 > >> --- a/arch/arm64/kvm/hyp/pgtable.c > >> +++ b/arch/arm64/kvm/hyp/pgtable.c > >> @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum > >> kvm_pgtable_prot prot, > >>return 0; > >> } > >> > >> +static bool stage2_pte_cacheable(kvm_pte_t pte) > >> +{ > >> + u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; > >> + return memattr == PAGE_S2_MEMATTR(NORMAL); > >> +} > >> + > >> +static void stage2_flush_dcache(void *addr, u64 size) > >> +{ > >> + /* > >> + * With FWB, we ensure that the guest always accesses memory using > >> + * cacheable attributes, and we don't have to clean to PoC when > >> + * faulting in pages. Furthermore, FWB implies IDC, so cleaning to > >> + * PoU is not required either in this case. > >> + */ > >> + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) > >> + return; > >> + > >> + __flush_dcache_area(addr, size); > >> +} > >> + > >> static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, > >> kvm_pte_t *ptep, > >> struct stage2_map_data *data) > >> @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 > >> end, u32 level, > >>put_page(page); > >>} > >> > >> + /* Flush data cache before installation of the new PTE */ > >> + if
Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler
Hi Marc, On 2/25/21 9:55 AM, Marc Zyngier wrote: > Hi Yanan, > > On Mon, 08 Feb 2021 11:22:47 +, > Yanan Wang wrote: >> We currently uniformly clean dcache in user_mem_abort() before calling the >> fault handlers, if we take a translation fault and the pfn is cacheable. >> But if there are concurrent translation faults on the same page or block, >> clean of dcache for the first time is necessary while the others are not. >> >> By moving clean of dcache to the map handler, we can easily identify the >> conditions where CMOs are really needed and avoid the unnecessary ones. >> As it's a time consuming process to perform CMOs especially when flushing >> a block range, so this solution reduces much load of kvm and improve the >> efficiency of creating mappings. > That's an interesting approach. However, wouldn't it be better to > identify early that there is already something mapped, and return to > the guest ASAP? Wouldn't that introduce overhead for the common case, when there's only one VCPU that faults on an address? For each data abort caused by a missing stage 2 entry we would now have to determine if the IPA isn't already mapped and that means walking the stage 2 tables. Or am I mistaken and either: (a) The common case is multiple simultaneous translation faults from different VCPUs on the same IPA. Or (b) There's a fast way to check if an IPA is mapped at stage 2 and the overhead would be negligible. > > Can you quantify the benefit of this patch alone? 
> >> Signed-off-by: Yanan Wang >> --- >> arch/arm64/include/asm/kvm_mmu.h | 16 -- >> arch/arm64/kvm/hyp/pgtable.c | 38 >> arch/arm64/kvm/mmu.c | 14 +++- >> 3 files changed, 27 insertions(+), 41 deletions(-) >> >> diff --git a/arch/arm64/include/asm/kvm_mmu.h >> b/arch/arm64/include/asm/kvm_mmu.h >> index e52d82aeadca..4ec9879e82ed 100644 >> --- a/arch/arm64/include/asm/kvm_mmu.h >> +++ b/arch/arm64/include/asm/kvm_mmu.h >> @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct >> kvm_vcpu *vcpu) >> return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; >> } >> >> -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long >> size) >> -{ >> -void *va = page_address(pfn_to_page(pfn)); >> - >> -/* >> - * With FWB, we ensure that the guest always accesses memory using >> - * cacheable attributes, and we don't have to clean to PoC when >> - * faulting in pages. Furthermore, FWB implies IDC, so cleaning to >> - * PoU is not required either in this case. >> - */ >> -if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) >> -return; >> - >> -kvm_flush_dcache_to_poc(va, size); >> -} >> - >> static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, >>unsigned long size) >> { >> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c >> index 4d177ce1d536..2f4f87021980 100644 >> --- a/arch/arm64/kvm/hyp/pgtable.c >> +++ b/arch/arm64/kvm/hyp/pgtable.c >> @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum >> kvm_pgtable_prot prot, >> return 0; >> } >> >> +static bool stage2_pte_cacheable(kvm_pte_t pte) >> +{ >> +u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; >> +return memattr == PAGE_S2_MEMATTR(NORMAL); >> +} >> + >> +static void stage2_flush_dcache(void *addr, u64 size) >> +{ >> +/* >> + * With FWB, we ensure that the guest always accesses memory using >> + * cacheable attributes, and we don't have to clean to PoC when >> + * faulting in pages. 
Furthermore, FWB implies IDC, so cleaning to >> + * PoU is not required either in this case. >> + */ >> +if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) >> +return; >> + >> +__flush_dcache_area(addr, size); >> +} >> + >> static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, >>kvm_pte_t *ptep, >>struct stage2_map_data *data) >> @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 >> end, u32 level, >> put_page(page); >> } >> >> +/* Flush data cache before installation of the new PTE */ >> +if (stage2_pte_cacheable(new)) >> +stage2_flush_dcache(__va(phys), granule); >> + >> smp_store_release(ptep, new); >> get_page(page); >> data->phys += granule; >> @@ -651,20 +675,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 >> addr, u64 size, >> return ret; >> } >> >> -static void stage2_flush_dcache(void *addr, u64 size) >> -{ >> -if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) >> -return; >> - >> -__flush_dcache_area(addr, size); >> -} >> - >> -static bool stage2_pte_cacheable(kvm_pte_t pte) >> -{ >> -u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; >> -return memattr == PAGE_S2_MEMATTR(NORMAL); >> -} >>
Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler
Hi Marc, On 2/24/21 5:39 PM, Marc Zyngier wrote: > On Wed, 24 Feb 2021 17:21:22 +, > Alexandru Elisei wrote: >> Hello, >> >> On 2/8/21 11:22 AM, Yanan Wang wrote: >>> We currently uniformly clean dcache in user_mem_abort() before calling the >>> fault handlers, if we take a translation fault and the pfn is cacheable. >>> But if there are concurrent translation faults on the same page or block, >>> clean of dcache for the first time is necessary while the others are not. >>> >>> By moving clean of dcache to the map handler, we can easily identify the >>> conditions where CMOs are really needed and avoid the unnecessary ones. >>> As it's a time consuming process to perform CMOs especially when flushing >>> a block range, so this solution reduces much load of kvm and improve the >>> efficiency of creating mappings. >>> >>> Signed-off-by: Yanan Wang >>> --- >>> arch/arm64/include/asm/kvm_mmu.h | 16 -- >>> arch/arm64/kvm/hyp/pgtable.c | 38 >>> arch/arm64/kvm/mmu.c | 14 +++- >>> 3 files changed, 27 insertions(+), 41 deletions(-) >>> >>> diff --git a/arch/arm64/include/asm/kvm_mmu.h >>> b/arch/arm64/include/asm/kvm_mmu.h >>> index e52d82aeadca..4ec9879e82ed 100644 >>> --- a/arch/arm64/include/asm/kvm_mmu.h >>> +++ b/arch/arm64/include/asm/kvm_mmu.h >>> @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct >>> kvm_vcpu *vcpu) >>> return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; >>> } >>> >>> -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long >>> size) >>> -{ >>> - void *va = page_address(pfn_to_page(pfn)); >>> - >>> - /* >>> -* With FWB, we ensure that the guest always accesses memory using >>> -* cacheable attributes, and we don't have to clean to PoC when >>> -* faulting in pages. Furthermore, FWB implies IDC, so cleaning to >>> -* PoU is not required either in this case. 
>>> -*/ >>> - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) >>> - return; >>> - >>> - kvm_flush_dcache_to_poc(va, size); >>> -} >>> - >>> static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, >>> unsigned long size) >>> { >>> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c >>> index 4d177ce1d536..2f4f87021980 100644 >>> --- a/arch/arm64/kvm/hyp/pgtable.c >>> +++ b/arch/arm64/kvm/hyp/pgtable.c >>> @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum >>> kvm_pgtable_prot prot, >>> return 0; >>> } >>> >>> +static bool stage2_pte_cacheable(kvm_pte_t pte) >>> +{ >>> + u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; >>> + return memattr == PAGE_S2_MEMATTR(NORMAL); >>> +} >>> + >>> +static void stage2_flush_dcache(void *addr, u64 size) >>> +{ >>> + /* >>> +* With FWB, we ensure that the guest always accesses memory using >>> +* cacheable attributes, and we don't have to clean to PoC when >>> +* faulting in pages. Furthermore, FWB implies IDC, so cleaning to >>> +* PoU is not required either in this case. >>> +*/ >>> + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) >>> + return; >>> + >>> + __flush_dcache_area(addr, size); >>> +} >>> + >>> static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, >>> kvm_pte_t *ptep, >>> struct stage2_map_data *data) >>> @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 >>> end, u32 level, >>> put_page(page); >>> } >>> >>> + /* Flush data cache before installation of the new PTE */ >>> + if (stage2_pte_cacheable(new)) >>> + stage2_flush_dcache(__va(phys), granule); >> This makes sense to me. kvm_pgtable_stage2_map() is protected >> against concurrent calls by the kvm->mmu_lock, so only one VCPU can >> change the stage 2 translation table at any given moment. In the >> case of concurrent translation faults on the same IPA, the first >> VCPU that will take the lock will create the mapping and do the >> dcache clean+invalidate. 
The other VCPUs will return -EAGAIN because >> the mapping they are trying to install is almost identical* to the >> mapping created by the first VCPU that took the lock. >> >> I have a question. Why are you doing the cache maintenance *before* >> installing the new mapping? This is what the kernel already does, so >> I'm not saying it's incorrect, I'm just curious about the reason >> behind it. > The guarantee KVM offers to the guest is that by the time it can > access the memory, it is cleaned to the PoC. If you establish a > mapping before cleaning, another vcpu can access the PoC (no fault, > you just set up S2) and not see it up to date. Right, I knew I was missing something, thanks for the explanation. Thanks, Alex
Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler
On Wed, 24 Feb 2021 17:21:22 +, Alexandru Elisei wrote: > > Hello, > > On 2/8/21 11:22 AM, Yanan Wang wrote: > > We currently uniformly clean dcache in user_mem_abort() before calling the > > fault handlers, if we take a translation fault and the pfn is cacheable. > > But if there are concurrent translation faults on the same page or block, > > clean of dcache for the first time is necessary while the others are not. > > > > By moving clean of dcache to the map handler, we can easily identify the > > conditions where CMOs are really needed and avoid the unnecessary ones. > > As it's a time consuming process to perform CMOs especially when flushing > > a block range, so this solution reduces much load of kvm and improve the > > efficiency of creating mappings. > > > > Signed-off-by: Yanan Wang > > --- > > arch/arm64/include/asm/kvm_mmu.h | 16 -- > > arch/arm64/kvm/hyp/pgtable.c | 38 > > arch/arm64/kvm/mmu.c | 14 +++- > > 3 files changed, 27 insertions(+), 41 deletions(-) > > > > diff --git a/arch/arm64/include/asm/kvm_mmu.h > > b/arch/arm64/include/asm/kvm_mmu.h > > index e52d82aeadca..4ec9879e82ed 100644 > > --- a/arch/arm64/include/asm/kvm_mmu.h > > +++ b/arch/arm64/include/asm/kvm_mmu.h > > @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct > > kvm_vcpu *vcpu) > > return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; > > } > > > > -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long > > size) > > -{ > > - void *va = page_address(pfn_to_page(pfn)); > > - > > - /* > > -* With FWB, we ensure that the guest always accesses memory using > > -* cacheable attributes, and we don't have to clean to PoC when > > -* faulting in pages. Furthermore, FWB implies IDC, so cleaning to > > -* PoU is not required either in this case. 
> > -*/ > > - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) > > - return; > > - > > - kvm_flush_dcache_to_poc(va, size); > > -} > > - > > static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, > > unsigned long size) > > { > > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c > > index 4d177ce1d536..2f4f87021980 100644 > > --- a/arch/arm64/kvm/hyp/pgtable.c > > +++ b/arch/arm64/kvm/hyp/pgtable.c > > @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum > > kvm_pgtable_prot prot, > > return 0; > > } > > > > +static bool stage2_pte_cacheable(kvm_pte_t pte) > > +{ > > + u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; > > + return memattr == PAGE_S2_MEMATTR(NORMAL); > > +} > > + > > +static void stage2_flush_dcache(void *addr, u64 size) > > +{ > > + /* > > +* With FWB, we ensure that the guest always accesses memory using > > +* cacheable attributes, and we don't have to clean to PoC when > > +* faulting in pages. Furthermore, FWB implies IDC, so cleaning to > > +* PoU is not required either in this case. > > +*/ > > + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) > > + return; > > + > > + __flush_dcache_area(addr, size); > > +} > > + > > static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, > > kvm_pte_t *ptep, > > struct stage2_map_data *data) > > @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 > > end, u32 level, > > put_page(page); > > } > > > > + /* Flush data cache before installation of the new PTE */ > > + if (stage2_pte_cacheable(new)) > > + stage2_flush_dcache(__va(phys), granule); > > This makes sense to me. kvm_pgtable_stage2_map() is protected > against concurrent calls by the kvm->mmu_lock, so only one VCPU can > change the stage 2 translation table at any given moment. In the > case of concurrent translation faults on the same IPA, the first > VCPU that will take the lock will create the mapping and do the > dcache clean+invalidate. 
The other VCPUs will return -EAGAIN because > the mapping they are trying to install is almost identical* to the > mapping created by the first VCPU that took the lock. > > I have a question. Why are you doing the cache maintenance *before* > installing the new mapping? This is what the kernel already does, so > I'm not saying it's incorrect, I'm just curious about the reason > behind it. The guarantee KVM offers to the guest is that by the time it can access the memory, it is cleaned to the PoC. If you establish a mapping before cleaning, another vcpu can access the PoC (no fault, you just set up S2) and not see it up to date. Thanks, M. -- Without deviation from the norm, progress is not possible.
Re: [RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler
Hello, On 2/8/21 11:22 AM, Yanan Wang wrote: > We currently uniformly clean dcache in user_mem_abort() before calling the > fault handlers, if we take a translation fault and the pfn is cacheable. > But if there are concurrent translation faults on the same page or block, > clean of dcache for the first time is necessary while the others are not. > > By moving clean of dcache to the map handler, we can easily identify the > conditions where CMOs are really needed and avoid the unnecessary ones. > As it's a time consuming process to perform CMOs especially when flushing > a block range, so this solution reduces much load of kvm and improve the > efficiency of creating mappings. > > Signed-off-by: Yanan Wang > --- > arch/arm64/include/asm/kvm_mmu.h | 16 -- > arch/arm64/kvm/hyp/pgtable.c | 38 > arch/arm64/kvm/mmu.c | 14 +++- > 3 files changed, 27 insertions(+), 41 deletions(-) > > diff --git a/arch/arm64/include/asm/kvm_mmu.h > b/arch/arm64/include/asm/kvm_mmu.h > index e52d82aeadca..4ec9879e82ed 100644 > --- a/arch/arm64/include/asm/kvm_mmu.h > +++ b/arch/arm64/include/asm/kvm_mmu.h > @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct > kvm_vcpu *vcpu) > return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; > } > > -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long > size) > -{ > - void *va = page_address(pfn_to_page(pfn)); > - > - /* > - * With FWB, we ensure that the guest always accesses memory using > - * cacheable attributes, and we don't have to clean to PoC when > - * faulting in pages. Furthermore, FWB implies IDC, so cleaning to > - * PoU is not required either in this case. 
> - */ > - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) > - return; > - > - kvm_flush_dcache_to_poc(va, size); > -} > - > static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, > unsigned long size) > { > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c > index 4d177ce1d536..2f4f87021980 100644 > --- a/arch/arm64/kvm/hyp/pgtable.c > +++ b/arch/arm64/kvm/hyp/pgtable.c > @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum > kvm_pgtable_prot prot, > return 0; > } > > +static bool stage2_pte_cacheable(kvm_pte_t pte) > +{ > + u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; > + return memattr == PAGE_S2_MEMATTR(NORMAL); > +} > + > +static void stage2_flush_dcache(void *addr, u64 size) > +{ > + /* > + * With FWB, we ensure that the guest always accesses memory using > + * cacheable attributes, and we don't have to clean to PoC when > + * faulting in pages. Furthermore, FWB implies IDC, so cleaning to > + * PoU is not required either in this case. > + */ > + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) > + return; > + > + __flush_dcache_area(addr, size); > +} > + > static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, > kvm_pte_t *ptep, > struct stage2_map_data *data) > @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, > u32 level, > put_page(page); > } > > + /* Flush data cache before installation of the new PTE */ > + if (stage2_pte_cacheable(new)) > + stage2_flush_dcache(__va(phys), granule); This makes sense to me. kvm_pgtable_stage2_map() is protected against concurrent calls by the kvm->mmu_lock, so only one VCPU can change the stage 2 translation table at any given moment. In the case of concurrent translation faults on the same IPA, the first VCPU that will take the lock will create the mapping and do the dcache clean+invalidate. 
The other VCPUs will return -EAGAIN because the mapping they are trying to install is almost identical* to the mapping created by the first VCPU that took the lock. I have a question. Why are you doing the cache maintenance *before* installing the new mapping? This is what the kernel already does, so I'm not saying it's incorrect, I'm just curious about the reason behind it. *permissions might be different. Thanks, Alex > + > smp_store_release(ptep, new); > get_page(page); > data->phys += granule; > @@ -651,20 +675,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 > addr, u64 size, > return ret; > } > > -static void stage2_flush_dcache(void *addr, u64 size) > -{ > - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) > - return; > - > - __flush_dcache_area(addr, size); > -} > - > -static bool stage2_pte_cacheable(kvm_pte_t pte) > -{ > - u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; > - return memattr == PAGE_S2_MEMATTR(NORMAL); > -} > - > static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, >
[RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler
We currently uniformly clean dcache in user_mem_abort() before calling the fault handlers, if we take a translation fault and the pfn is cacheable. But if there are concurrent translation faults on the same page or block, clean of dcache for the first time is necessary while the others are not. By moving clean of dcache to the map handler, we can easily identify the conditions where CMOs are really needed and avoid the unnecessary ones. As it's a time-consuming process to perform CMOs, especially when flushing a block range, this solution reduces much of the load on KVM and improves the efficiency of creating mappings. Signed-off-by: Yanan Wang --- arch/arm64/include/asm/kvm_mmu.h | 16 -- arch/arm64/kvm/hyp/pgtable.c | 38 arch/arm64/kvm/mmu.c | 14 +++- 3 files changed, 27 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e52d82aeadca..4ec9879e82ed 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; } -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) -{ - void *va = page_address(pfn_to_page(pfn)); - - /* -* With FWB, we ensure that the guest always accesses memory using -* cacheable attributes, and we don't have to clean to PoC when -* faulting in pages. Furthermore, FWB implies IDC, so cleaning to -* PoU is not required either in this case. 
-*/ - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) - return; - - kvm_flush_dcache_to_poc(va, size); -} - static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) { diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 4d177ce1d536..2f4f87021980 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot, return 0; } +static bool stage2_pte_cacheable(kvm_pte_t pte) +{ + u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; + return memattr == PAGE_S2_MEMATTR(NORMAL); +} + +static void stage2_flush_dcache(void *addr, u64 size) +{ + /* +* With FWB, we ensure that the guest always accesses memory using +* cacheable attributes, and we don't have to clean to PoC when +* faulting in pages. Furthermore, FWB implies IDC, so cleaning to +* PoU is not required either in this case. +*/ + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + return; + + __flush_dcache_area(addr, size); +} + static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) @@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, put_page(page); } + /* Flush data cache before installation of the new PTE */ + if (stage2_pte_cacheable(new)) + stage2_flush_dcache(__va(phys), granule); + smp_store_release(ptep, new); get_page(page); data->phys += granule; @@ -651,20 +675,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, return ret; } -static void stage2_flush_dcache(void *addr, u64 size) -{ - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) - return; - - __flush_dcache_area(addr, size); -} - -static bool stage2_pte_cacheable(kvm_pte_t pte) -{ - u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; - return memattr == PAGE_S2_MEMATTR(NORMAL); -} - static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags 
flag, void * const arg) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 77cb2d28f2a4..d151927a7d62 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -609,11 +609,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) -{ - __clean_dcache_guest_page(pfn, size); -} - static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) { __invalidate_icache_guest_page(pfn, size); @@ -882,9 +877,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (writable) prot |= KVM_PGTABLE_PROT_W; - if (fault_status != FSC_PERM && !device) - clean_dcache_guest_page(pfn, vma_pagesize); - if