On Wed, Apr 03, 2019 at 11:34:13AM -0600, Khalid Aziz wrote:
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 999d6d8f0bef..cc806a01a0eb 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -37,6 +37,20 @@
> */
> #define LAST_USER_MM_IBPB 0x1UL
>
> +/*
> + * A TLB flush may be needed to flush stale TLB entries
> + * for pages that have been mapped into userspace and unmapped
> + * from kernel space. This TLB flush needs to be propagated to
> + * all CPUs. Asynchronous flush requests to all CPUs can cause
> + * a significant performance impact. Queue a pending flush for
> + * a CPU instead. Multiple such requests can then be handled by
> + * a CPU in one go at a less disruptive time, like context
> + * switch, which reduces the performance impact significantly.
> + * The following data structure keeps track of CPUs with a
> + * pending full TLB flush forced by XPFO.
> + */
> +static cpumask_t pending_xpfo_flush;
> +
> /*
> * We get here when we do something requiring a TLB invalidation
> * but could not go invalidate all of the contexts. We do the
> @@ -321,6 +335,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> __flush_tlb_all();
> }
> #endif
> +
> +	/*
> +	 * If there is a pending TLB flush for this CPU due to XPFO
> +	 * flush, do it now.
> +	 */
> +	if (cpumask_test_and_clear_cpu(cpu, &pending_xpfo_flush)) {
> +		count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
> +		__flush_tlb_all();
> +	}
That really should be:
	if (cpumask_test_cpu(cpu, &pending_xpfo_flush)) {
		cpumask_clear_cpu(cpu, &pending_xpfo_flush);
		count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
		__flush_tlb_all();
	}
test_and_clear is an unconditional RmW and can cause cacheline
contention between adjacent CPUs even if none of the bits are set.
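FWIW that is the usual pattern for shared cpumasks: do a plain read first
and only fall back to the atomic RmW when there is actually work to do. A
minimal sketch of the idiom (the helper name is made up, not code from
this series):

	static inline bool xpfo_take_pending_flush(int cpu, cpumask_t *mask)
	{
		/* Plain read; the cacheline can stay shared when the bit is clear. */
		if (!cpumask_test_cpu(cpu, mask))
			return false;

		/* Atomic RmW only when this CPU really has a pending flush. */
		cpumask_clear_cpu(cpu, mask);
		return true;
	}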
> +
> this_cpu_write(cpu_tlbstate.is_lazy, false);
>
> /*
> @@ -803,6 +827,34 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> }
> }
>
> +void xpfo_flush_tlb_kernel_range(unsigned long start, unsigned long end)
> +{
> +	struct cpumask tmp_mask;
> +
> +	/*
> +	 * Balance as user space task's flush, a bit conservative.
> +	 * Do a local flush immediately and post a pending flush on all
> +	 * other CPUs. Local flush can be a range flush or full flush
> +	 * depending upon the number of entries to be flushed. Remote
> +	 * flushes will be done by individual processors at the time of
> +	 * context switch and this allows multiple flush requests from
> +	 * other CPUs to be batched together.
> +	 */
> +	if (end == TLB_FLUSH_ALL ||
> +	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
> +		do_flush_tlb_all(NULL);
> +	} else {
> +		struct flush_tlb_info info;
> +
> +		info.start = start;
> +		info.end = end;
> +		do_kernel_range_flush(&info);
> +	}
> +	cpumask_setall(&tmp_mask);
> +	__cpumask_clear_cpu(smp_processor_id(), &tmp_mask);
> +	cpumask_or(&pending_xpfo_flush, &pending_xpfo_flush, &tmp_mask);
> +}
> +
> void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
> {
> struct flush_tlb_info info = {
> diff --git a/arch/x86/mm/xpfo.c b/arch/x86/mm/xpfo.c
> index b42513347865..638eee5b1f09 100644
> --- a/arch/x86/mm/xpfo.c
> +++ b/arch/x86/mm/xpfo.c
> @@ -118,7 +118,7 @@ inline void xpfo_flush_kernel_tlb(struct page *page, int order)
> return;
> }
>
> -	flush_tlb_kernel_range(kaddr, kaddr + (1 << order) * size);
> +	xpfo_flush_tlb_kernel_range(kaddr, kaddr + (1 << order) * size);
> }
> EXPORT_SYMBOL_GPL(xpfo_flush_kernel_tlb);
So this patch is the one that makes it 'work', but I'm with Andy on
hating it something fierce.
Up until this point x86_64 is completely buggered in this series; after
this it sorta works, but *urgh*, what crap.
All in all, your changelog is complete and utter garbage; this is _NOT_ a
performance issue. It is very much a correctness issue.
Also, I distinctly dislike the inconsistent TLB states this generates.
It makes it very hard to argue for its correctness.
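To be explicit about why this is a correctness problem, here is roughly
the window the deferred flush leaves open (a sketch of one possible
interleaving, not taken from the series):

	CPU A                                   CPU B
	-----                                   -----
	xpfo unmaps page P from the
	  kernel direct map
	flushes its own TLB
	sets B's bit in pending_xpfo_flush
	                                        still has a stale TLB entry
	                                        for P; a kernel access through
	                                        the old direct-map address
	                                        keeps working until B happens
	                                        to context switch and runs
	                                        __flush_tlb_all()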