Hi Aneesh, > For a page walk cache flush, we don't need to loop with set number. > The set number is ignored with RIC=1 (pwc flush). > > For RIC=2 (flush all), in order to flush implementation dependent > caches, we can ignore the set number. Hence we do a RIC=2 flush with > set no: 0, so we do both the tlb flush for set 0 and the > implementation dependent cache flushes. This is then followed by a > tlb flush for sets 1-127
I've applied your two previous radix tlbiel optimisations as my baseline, and using the simple exec microbenchmark in a7a9dcd882a6 I see: HPT: 100% Radix baseline: 248% Radix patched: 95% So this patch fixes the large regression we see with radix, and is even faster than our HPT number now. Nice work! Acked-by: Anton Blanchard <an...@samba.org> Anton > Signed-off-by: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com> > --- > Note: not yet tested. > > arch/powerpc/mm/tlb-radix.c | 28 +++++++++++++++++++++++----- > 1 file changed, 23 insertions(+), 5 deletions(-) > > diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c > index b68b5219cf45..b827aef38b90 100644 > --- a/arch/powerpc/mm/tlb-radix.c > +++ b/arch/powerpc/mm/tlb-radix.c > @@ -43,12 +43,30 @@ static inline void __tlbiel_pid(unsigned long > pid, int set, */ > static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) > { > - int set; > + int set = 0; > > asm volatile("ptesync": : :"memory"); > - for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { > - __tlbiel_pid(pid, set, ric); > + if (ric == RIC_FLUSH_ALL) { > + ric = RIC_FLUSH_TLB; > + set = 1; > + /* Use set 0 to flush all */ > + __tlbiel_pid(pid, 0, RIC_FLUSH_ALL); > } > + > + for (; set < POWER9_TLB_SETS_RADIX ; set++) > + __tlbiel_pid(pid, set, ric); > + > + asm volatile("ptesync": : :"memory"); > + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); > +} > + > +static inline void _tlbiel_pwc(unsigned long pid) > +{ > + asm volatile("ptesync": : :"memory"); > + /* > + * for PWC flush, we don't look at set number > + */ > + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); > asm volatile("ptesync": : :"memory"); > asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); > } > @@ -140,7 +158,7 @@ void radix__local_flush_tlb_pwc(struct mmu_gather > *tlb, unsigned long addr) > pid = mm->context.id; > if (pid != MMU_NO_CONTEXT) > - _tlbiel_pid(pid, RIC_FLUSH_PWC); > + _tlbiel_pwc(pid); > > preempt_enable(); > } > @@ -222,7 +240,7 
@@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, > unsigned long addr) if (lock_tlbie) > raw_spin_unlock(&native_tlbie_lock); > } else > - _tlbiel_pid(pid, RIC_FLUSH_PWC); > + _tlbiel_pwc(pid); > no_context: > preempt_enable(); > }