Hi Aneesh,

> For a page walk cache flush, we don't need to loop with set number.
> The set number is ignored with RIC=1 (pwc flush).
> 
> For RIC=2 (flush all), in order to flush implementation dependent
> caches, we can ignore the set number. Hence we do a RIC=2 flush with
> set no: 0, so we do both the tlb flush for set 0 and the
> implementation dependent cache flushes. This is then followed with
> tlb flush for set 1-127

I've applied your two previous radix tlbiel optimisations as my
baseline, and using the simple exec microbenchmark in a7a9dcd882a6 I
see:

HPT:            100%
Radix baseline: 248%
Radix patched:   95%

So this patch fixes the large regression we see with radix, and is even
faster than our HPT number now. Nice work!

Acked-by: Anton Blanchard <an...@samba.org>

Anton

> Signed-off-by: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>
> ---
> Note: not yet tested.
> 
>  arch/powerpc/mm/tlb-radix.c | 28 +++++++++++++++++++++++-----
>  1 file changed, 23 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> index b68b5219cf45..b827aef38b90 100644
> --- a/arch/powerpc/mm/tlb-radix.c
> +++ b/arch/powerpc/mm/tlb-radix.c
> @@ -43,12 +43,30 @@ static inline void __tlbiel_pid(unsigned long
> pid, int set, */
>  static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
>  {
> -     int set;
> +     int set = 0;
>  
>       asm volatile("ptesync": : :"memory");
> -     for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
> -             __tlbiel_pid(pid, set, ric);
> +     if (ric == RIC_FLUSH_ALL) {
> +             ric = RIC_FLUSH_TLB;
> +             set = 1;
> +             /* Use set 0 to flush all */
> +             __tlbiel_pid(pid, 0, RIC_FLUSH_ALL);
>       }
> +
> +     for (; set < POWER9_TLB_SETS_RADIX ; set++)
> +             __tlbiel_pid(pid, set, ric);
> +
> +     asm volatile("ptesync": : :"memory");
> +     asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
> +}
> +
> +static inline void _tlbiel_pwc(unsigned long pid)
> +{
> +     asm volatile("ptesync": : :"memory");
> +     /*
> +      * for PWC flush, we don't look at set number
> +      */
> +     __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
>       asm volatile("ptesync": : :"memory");
>       asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
>  }
> @@ -140,7 +158,7 @@ void radix__local_flush_tlb_pwc(struct mmu_gather
> *tlb, unsigned long addr) 
>       pid = mm->context.id;
>       if (pid != MMU_NO_CONTEXT)
> -             _tlbiel_pid(pid, RIC_FLUSH_PWC);
> +             _tlbiel_pwc(pid);
>  
>       preempt_enable();
>  }
> @@ -222,7 +240,7 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb,
> unsigned long addr) if (lock_tlbie)
>                       raw_spin_unlock(&native_tlbie_lock);
>       } else
> -             _tlbiel_pid(pid, RIC_FLUSH_PWC);
> +             _tlbiel_pwc(pid);
>  no_context:
>       preempt_enable();
>  }

Reply via email to