Re: [Xen-devel] [PATCH v2 4/9] x86/mm/tlb: Flush remote and local TLBs concurrently
> On Jul 3, 2019, at 10:43 AM, Andrew Cooper wrote: > > On 03/07/2019 18:02, Nadav Amit wrote: >>> On Jul 3, 2019, at 7:04 AM, Juergen Gross wrote: >>> >>> On 03.07.19 01:51, Nadav Amit wrote: To improve TLB shootdown performance, flush the remote and local TLBs concurrently. Introduce flush_tlb_multi() that does so. Introduce paravirtual versions of flush_tlb_multi() for KVM, Xen and hyper-v (Xen and hyper-v are only compile-tested). While the updated smp infrastructure is capable of running a function on a single local core, it is not optimized for this case. The multiple function calls and the indirect branch introduce some overhead, and might make local TLB flushes slower than they were before the recent changes. Before calling the SMP infrastructure, check if only a local TLB flush is needed to restore the lost performance in this common case. This requires to check mm_cpumask() one more time, but unless this mask is updated very frequently, this should impact performance negatively. Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Sasha Levin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: x...@kernel.org Cc: Juergen Gross Cc: Paolo Bonzini Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Boris Ostrovsky Cc: linux-hyp...@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualizat...@lists.linux-foundation.org Cc: k...@vger.kernel.org Cc: xen-de...@lists.xenproject.org Signed-off-by: Nadav Amit --- arch/x86/hyperv/mmu.c | 13 +++--- arch/x86/include/asm/paravirt.h | 6 +-- arch/x86/include/asm/paravirt_types.h | 4 +- arch/x86/include/asm/tlbflush.h | 9 ++-- arch/x86/include/asm/trace/hyperv.h | 2 +- arch/x86/kernel/kvm.c | 11 +++-- arch/x86/kernel/paravirt.c| 2 +- arch/x86/mm/tlb.c | 65 --- arch/x86/xen/mmu_pv.c | 20 ++--- include/trace/events/xen.h| 2 +- 10 files changed, 91 insertions(+), 43 deletions(-) >>> ... 
>>> diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index beb44e22afdf..19e481e6e904 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1355,8 +1355,8 @@ static void xen_flush_tlb_one_user(unsigned long addr) preempt_enable(); } -static void xen_flush_tlb_others(const struct cpumask *cpus, - const struct flush_tlb_info *info) +static void xen_flush_tlb_multi(const struct cpumask *cpus, + const struct flush_tlb_info *info) { struct { struct mmuext_op op; @@ -1366,7 +1366,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, const size_t mc_entry_size = sizeof(args->op) + sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus()); - trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end); + trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end); if (cpumask_empty(cpus)) return; /* nothing to do */ @@ -1375,9 +1375,17 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, args = mcs.args; args->op.arg2.vcpumask = to_cpumask(args->mask); - /* Remove us, and any offline CPUS. */ + /* Flush locally if needed and remove us */ + if (cpumask_test_cpu(smp_processor_id(), to_cpumask(args->mask))) { + local_irq_disable(); + flush_tlb_func_local(info); >>> I think this isn't the correct function for PV guests. >>> >>> In fact it should be much easier: just don't clear the own cpu from the >>> mask, that's all what's needed. The hypervisor is just fine having the >>> current cpu in the mask and it will do the right thing. >> Thanks. I will do so in v3. I don’t think Hyper-V people would want to do >> the same, unfortunately, since it would induce VM-exit on TLB flushes. > > Why do you believe the vmexit matters? You're talking one anyway for > the IPI. > > Intel only have virtualised self-IPI, and while AMD do have working > non-self IPIs, you still take a vmexit anyway if any destination vcpu > isn't currently running in non-root mode (IIRC). 
> > At that point, you might as well have the hypervisor do all the hard > work via a multi-cpu shootdown/flush hypercall, rather than trying to > arrange it locally. I forgot that xen_flush_tlb_multi() should actually only be called when there are some remote CPUs (as I optimized the case in which there is only a single local CPU that needs to be flushed), so you are right.
Re: [Xen-devel] [PATCH v2 4/9] x86/mm/tlb: Flush remote and local TLBs concurrently
On 03/07/2019 18:02, Nadav Amit wrote: >> On Jul 3, 2019, at 7:04 AM, Juergen Gross wrote: >> >> On 03.07.19 01:51, Nadav Amit wrote: >>> To improve TLB shootdown performance, flush the remote and local TLBs >>> concurrently. Introduce flush_tlb_multi() that does so. Introduce >>> paravirtual versions of flush_tlb_multi() for KVM, Xen and hyper-v (Xen >>> and hyper-v are only compile-tested). >>> While the updated smp infrastructure is capable of running a function on >>> a single local core, it is not optimized for this case. The multiple >>> function calls and the indirect branch introduce some overhead, and >>> might make local TLB flushes slower than they were before the recent >>> changes. >>> Before calling the SMP infrastructure, check if only a local TLB flush >>> is needed to restore the lost performance in this common case. This >>> requires checking mm_cpumask() one more time, but unless this mask is >>> updated very frequently, this should not impact performance negatively. >>> Cc: "K. Y. 
Srinivasan" >>> Cc: Haiyang Zhang >>> Cc: Stephen Hemminger >>> Cc: Sasha Levin >>> Cc: Thomas Gleixner >>> Cc: Ingo Molnar >>> Cc: Borislav Petkov >>> Cc: x...@kernel.org >>> Cc: Juergen Gross >>> Cc: Paolo Bonzini >>> Cc: Dave Hansen >>> Cc: Andy Lutomirski >>> Cc: Peter Zijlstra >>> Cc: Boris Ostrovsky >>> Cc: linux-hyp...@vger.kernel.org >>> Cc: linux-kernel@vger.kernel.org >>> Cc: virtualizat...@lists.linux-foundation.org >>> Cc: k...@vger.kernel.org >>> Cc: xen-de...@lists.xenproject.org >>> Signed-off-by: Nadav Amit >>> --- >>> arch/x86/hyperv/mmu.c | 13 +++--- >>> arch/x86/include/asm/paravirt.h | 6 +-- >>> arch/x86/include/asm/paravirt_types.h | 4 +- >>> arch/x86/include/asm/tlbflush.h | 9 ++-- >>> arch/x86/include/asm/trace/hyperv.h | 2 +- >>> arch/x86/kernel/kvm.c | 11 +++-- >>> arch/x86/kernel/paravirt.c| 2 +- >>> arch/x86/mm/tlb.c | 65 --- >>> arch/x86/xen/mmu_pv.c | 20 ++--- >>> include/trace/events/xen.h| 2 +- >>> 10 files changed, 91 insertions(+), 43 deletions(-) >> ... 
>> >>> diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c >>> index beb44e22afdf..19e481e6e904 100644 >>> --- a/arch/x86/xen/mmu_pv.c >>> +++ b/arch/x86/xen/mmu_pv.c >>> @@ -1355,8 +1355,8 @@ static void xen_flush_tlb_one_user(unsigned long addr) >>> preempt_enable(); >>> } >>> -static void xen_flush_tlb_others(const struct cpumask *cpus, >>> -const struct flush_tlb_info *info) >>> +static void xen_flush_tlb_multi(const struct cpumask *cpus, >>> + const struct flush_tlb_info *info) >>> { >>> struct { >>> struct mmuext_op op; >>> @@ -1366,7 +1366,7 @@ static void xen_flush_tlb_others(const struct cpumask >>> *cpus, >>> const size_t mc_entry_size = sizeof(args->op) + >>> sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus()); >>> - trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end); >>> + trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end); >>> if (cpumask_empty(cpus)) >>> return; /* nothing to do */ >>> @@ -1375,9 +1375,17 @@ static void xen_flush_tlb_others(const struct >>> cpumask *cpus, >>> args = mcs.args; >>> args->op.arg2.vcpumask = to_cpumask(args->mask); >>> - /* Remove us, and any offline CPUS. */ >>> + /* Flush locally if needed and remove us */ >>> + if (cpumask_test_cpu(smp_processor_id(), to_cpumask(args->mask))) { >>> + local_irq_disable(); >>> + flush_tlb_func_local(info); >> I think this isn't the correct function for PV guests. >> >> In fact it should be much easier: just don't clear the own cpu from the >> mask, that's all that's needed. The hypervisor is just fine having the >> current cpu in the mask and it will do the right thing. > Thanks. I will do so in v3. I don’t think Hyper-V people would want to do > the same, unfortunately, since it would induce VM-exit on TLB flushes. Why do you believe the vmexit matters? You're taking one anyway for the IPI. 
Intel only have virtualised self-IPI, and while AMD do have working non-self IPIs, you still take a vmexit anyway if any destination vcpu isn't currently running in non-root mode (IIRC). At that point, you might as well have the hypervisor do all the hard work via a multi-cpu shootdown/flush hypercall, rather than trying to arrange it locally. ~Andrew
Re: [PATCH v2 4/9] x86/mm/tlb: Flush remote and local TLBs concurrently
> On Jul 3, 2019, at 7:04 AM, Juergen Gross wrote: > > On 03.07.19 01:51, Nadav Amit wrote: >> To improve TLB shootdown performance, flush the remote and local TLBs >> concurrently. Introduce flush_tlb_multi() that does so. Introduce >> paravirtual versions of flush_tlb_multi() for KVM, Xen and hyper-v (Xen >> and hyper-v are only compile-tested). >> While the updated smp infrastructure is capable of running a function on >> a single local core, it is not optimized for this case. The multiple >> function calls and the indirect branch introduce some overhead, and >> might make local TLB flushes slower than they were before the recent >> changes. >> Before calling the SMP infrastructure, check if only a local TLB flush >> is needed to restore the lost performance in this common case. This >> requires to check mm_cpumask() one more time, but unless this mask is >> updated very frequently, this should impact performance negatively. >> Cc: "K. Y. Srinivasan" >> Cc: Haiyang Zhang >> Cc: Stephen Hemminger >> Cc: Sasha Levin >> Cc: Thomas Gleixner >> Cc: Ingo Molnar >> Cc: Borislav Petkov >> Cc: x...@kernel.org >> Cc: Juergen Gross >> Cc: Paolo Bonzini >> Cc: Dave Hansen >> Cc: Andy Lutomirski >> Cc: Peter Zijlstra >> Cc: Boris Ostrovsky >> Cc: linux-hyp...@vger.kernel.org >> Cc: linux-kernel@vger.kernel.org >> Cc: virtualizat...@lists.linux-foundation.org >> Cc: k...@vger.kernel.org >> Cc: xen-de...@lists.xenproject.org >> Signed-off-by: Nadav Amit >> --- >> arch/x86/hyperv/mmu.c | 13 +++--- >> arch/x86/include/asm/paravirt.h | 6 +-- >> arch/x86/include/asm/paravirt_types.h | 4 +- >> arch/x86/include/asm/tlbflush.h | 9 ++-- >> arch/x86/include/asm/trace/hyperv.h | 2 +- >> arch/x86/kernel/kvm.c | 11 +++-- >> arch/x86/kernel/paravirt.c| 2 +- >> arch/x86/mm/tlb.c | 65 --- >> arch/x86/xen/mmu_pv.c | 20 ++--- >> include/trace/events/xen.h| 2 +- >> 10 files changed, 91 insertions(+), 43 deletions(-) > > ... 
> >> diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c >> index beb44e22afdf..19e481e6e904 100644 >> --- a/arch/x86/xen/mmu_pv.c >> +++ b/arch/x86/xen/mmu_pv.c >> @@ -1355,8 +1355,8 @@ static void xen_flush_tlb_one_user(unsigned long addr) >> preempt_enable(); >> } >> -static void xen_flush_tlb_others(const struct cpumask *cpus, >> - const struct flush_tlb_info *info) >> +static void xen_flush_tlb_multi(const struct cpumask *cpus, >> +const struct flush_tlb_info *info) >> { >> struct { >> struct mmuext_op op; >> @@ -1366,7 +1366,7 @@ static void xen_flush_tlb_others(const struct cpumask >> *cpus, >> const size_t mc_entry_size = sizeof(args->op) + >> sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus()); >> - trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end); >> +trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end); >> if (cpumask_empty(cpus)) >> return; /* nothing to do */ >> @@ -1375,9 +1375,17 @@ static void xen_flush_tlb_others(const struct cpumask >> *cpus, >> args = mcs.args; >> args->op.arg2.vcpumask = to_cpumask(args->mask); >> - /* Remove us, and any offline CPUS. */ >> +/* Flush locally if needed and remove us */ >> +if (cpumask_test_cpu(smp_processor_id(), to_cpumask(args->mask))) { >> +local_irq_disable(); >> +flush_tlb_func_local(info); > > I think this isn't the correct function for PV guests. > > In fact it should be much easier: just don't clear the own cpu from the > mask, that's all what's needed. The hypervisor is just fine having the > current cpu in the mask and it will do the right thing. Thanks. I will do so in v3. I don’t think Hyper-V people would want to do the same, unfortunately, since it would induce VM-exit on TLB flushes. But if they do - I’ll be able not to expose flush_tlb_func_local().
Re: [PATCH v2 4/9] x86/mm/tlb: Flush remote and local TLBs concurrently
On 03.07.19 01:51, Nadav Amit wrote: To improve TLB shootdown performance, flush the remote and local TLBs concurrently. Introduce flush_tlb_multi() that does so. Introduce paravirtual versions of flush_tlb_multi() for KVM, Xen and hyper-v (Xen and hyper-v are only compile-tested). While the updated smp infrastructure is capable of running a function on a single local core, it is not optimized for this case. The multiple function calls and the indirect branch introduce some overhead, and might make local TLB flushes slower than they were before the recent changes. Before calling the SMP infrastructure, check if only a local TLB flush is needed to restore the lost performance in this common case. This requires to check mm_cpumask() one more time, but unless this mask is updated very frequently, this should impact performance negatively. Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Sasha Levin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: x...@kernel.org Cc: Juergen Gross Cc: Paolo Bonzini Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Boris Ostrovsky Cc: linux-hyp...@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualizat...@lists.linux-foundation.org Cc: k...@vger.kernel.org Cc: xen-de...@lists.xenproject.org Signed-off-by: Nadav Amit --- arch/x86/hyperv/mmu.c | 13 +++--- arch/x86/include/asm/paravirt.h | 6 +-- arch/x86/include/asm/paravirt_types.h | 4 +- arch/x86/include/asm/tlbflush.h | 9 ++-- arch/x86/include/asm/trace/hyperv.h | 2 +- arch/x86/kernel/kvm.c | 11 +++-- arch/x86/kernel/paravirt.c| 2 +- arch/x86/mm/tlb.c | 65 --- arch/x86/xen/mmu_pv.c | 20 ++--- include/trace/events/xen.h| 2 +- 10 files changed, 91 insertions(+), 43 deletions(-) ... 
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index beb44e22afdf..19e481e6e904 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1355,8 +1355,8 @@ static void xen_flush_tlb_one_user(unsigned long addr) preempt_enable(); } -static void xen_flush_tlb_others(const struct cpumask *cpus, -const struct flush_tlb_info *info) +static void xen_flush_tlb_multi(const struct cpumask *cpus, + const struct flush_tlb_info *info) { struct { struct mmuext_op op; @@ -1366,7 +1366,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, const size_t mc_entry_size = sizeof(args->op) + sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus()); - trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end); + trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end); if (cpumask_empty(cpus)) return; /* nothing to do */ @@ -1375,9 +1375,17 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, args = mcs.args; args->op.arg2.vcpumask = to_cpumask(args->mask); - /* Remove us, and any offline CPUS. */ + /* Flush locally if needed and remove us */ + if (cpumask_test_cpu(smp_processor_id(), to_cpumask(args->mask))) { + local_irq_disable(); + flush_tlb_func_local(info); I think this isn't the correct function for PV guests. In fact it should be much easier: just don't clear the own cpu from the mask, that's all what's needed. The hypervisor is just fine having the current cpu in the mask and it will do the right thing. Juergen
[PATCH v2 4/9] x86/mm/tlb: Flush remote and local TLBs concurrently
To improve TLB shootdown performance, flush the remote and local TLBs concurrently. Introduce flush_tlb_multi() that does so. Introduce paravirtual versions of flush_tlb_multi() for KVM, Xen and hyper-v (Xen and hyper-v are only compile-tested). While the updated smp infrastructure is capable of running a function on a single local core, it is not optimized for this case. The multiple function calls and the indirect branch introduce some overhead, and might make local TLB flushes slower than they were before the recent changes. Before calling the SMP infrastructure, check if only a local TLB flush is needed to restore the lost performance in this common case. This requires checking mm_cpumask() one more time, but unless this mask is updated very frequently, this should not impact performance negatively. Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Sasha Levin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: x...@kernel.org Cc: Juergen Gross Cc: Paolo Bonzini Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Boris Ostrovsky Cc: linux-hyp...@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualizat...@lists.linux-foundation.org Cc: k...@vger.kernel.org Cc: xen-de...@lists.xenproject.org Signed-off-by: Nadav Amit --- arch/x86/hyperv/mmu.c | 13 +++--- arch/x86/include/asm/paravirt.h | 6 +-- arch/x86/include/asm/paravirt_types.h | 4 +- arch/x86/include/asm/tlbflush.h | 9 ++-- arch/x86/include/asm/trace/hyperv.h | 2 +- arch/x86/kernel/kvm.c | 11 +++-- arch/x86/kernel/paravirt.c| 2 +- arch/x86/mm/tlb.c | 65 --- arch/x86/xen/mmu_pv.c | 20 ++--- include/trace/events/xen.h| 2 +- 10 files changed, 91 insertions(+), 43 deletions(-) diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index e65d7fe6489f..1177f863e4cd 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -50,8 +50,8 @@ static inline int fill_gva_list(u64 gva_list[], int offset, return gva_n - offset; } -static void hyperv_flush_tlb_others(const 
struct cpumask *cpus, - const struct flush_tlb_info *info) +static void hyperv_flush_tlb_multi(const struct cpumask *cpus, + const struct flush_tlb_info *info) { int cpu, vcpu, gva_n, max_gvas; struct hv_tlb_flush **flush_pcpu; @@ -59,7 +59,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, u64 status = U64_MAX; unsigned long flags; - trace_hyperv_mmu_flush_tlb_others(cpus, info); + trace_hyperv_mmu_flush_tlb_multi(cpus, info); if (!hv_hypercall_pg) goto do_native; @@ -69,6 +69,9 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, local_irq_save(flags); + if (cpumask_test_cpu(smp_processor_id(), cpus)) + flush_tlb_func_local(info); + flush_pcpu = (struct hv_tlb_flush **) this_cpu_ptr(hyperv_pcpu_input_arg); @@ -156,7 +159,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, if (!(status & HV_HYPERCALL_RESULT_MASK)) return; do_native: - native_flush_tlb_others(cpus, info); + native_flush_tlb_multi(cpus, info); } static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, @@ -231,6 +234,6 @@ void hyperv_setup_mmu_ops(void) return; pr_info("Using hypercall for remote TLB flush\n"); - pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others; + pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; pv_ops.mmu.tlb_remove_table = tlb_remove_table; } diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c25c38a05c1c..316959e89258 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -62,10 +62,10 @@ static inline void __flush_tlb_one_user(unsigned long addr) PVOP_VCALL1(mmu.flush_tlb_one_user, addr); } -static inline void flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) +static inline void flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { - PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info); + PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); } static inline void 
paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 946f8f1f1efc..54f4c718b5b0 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -211,8 +211,8 @@ struct pv_mmu_ops { void (*flush_tlb_user)(void); void (*flush_tlb_kernel)(void); void (*flush_tlb_one_user)(unsigned long addr); -