Excerpts from Nicholas Piggin's message of July 1, 2021 5:37 pm:
> Excerpts from Christopher M. Riedl's message of July 1, 2021 4:53 pm:
>> On Thu Jul 1, 2021 at 1:04 AM CDT, Nicholas Piggin wrote:
>>> Excerpts from Christopher M. Riedl's message of July 1, 2021 3:28 pm:
>>> > On Wed Jun 30, 2021 at 11:15 PM CDT, Nicholas Piggin wrote:
>>> >> Excerpts from Christopher M. Riedl's message of July 1, 2021 1:48 pm:
>>> >> > On Sun Jun 20, 2021 at 10:13 PM CDT, Daniel Axtens wrote:
>>> >> >> "Christopher M. Riedl" <c...@linux.ibm.com> writes:
>>> >> >>
>>> >> >> > Switching to a different mm with Hash translation causes SLB entries to
>>> >> >> > be preloaded from the current thread_info. This reduces SLB faults, for
>>> >> >> > example when threads share a common mm but operate on different address
>>> >> >> > ranges.
>>> >> >> >
>>> >> >> > Preloading entries from the thread_info struct may not always be
>>> >> >> > appropriate - such as when switching to a temporary mm. Introduce a new
>>> >> >> > boolean in mm_context_t to skip the SLB preload entirely. Also move the
>>> >> >> > SLB preload code into a separate function since switch_slb() is already
>>> >> >> > quite long. The default behavior (preloading SLB entries from the
>>> >> >> > current thread_info struct) remains unchanged.
>>> >> >> >
>>> >> >> > Signed-off-by: Christopher M. Riedl <c...@linux.ibm.com>
>>> >> >> >
>>> >> >> > ---
>>> >> >> >
>>> >> >> > v4:  * New to series.
>>> >> >> > ---
>>> >> >> >  arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
>>> >> >> >  arch/powerpc/include/asm/mmu_context.h   | 13 ++++++
>>> >> >> >  arch/powerpc/mm/book3s64/mmu_context.c   |  2 +
>>> >> >> >  arch/powerpc/mm/book3s64/slb.c           | 56 ++++++++++++++----------
>>> >> >> >  4 files changed, 50 insertions(+), 24 deletions(-)
>>> >> >> >
>>> >> >> > diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
>>> >> >> > index eace8c3f7b0a1..b23a9dcdee5af 100644
>>> >> >> > --- a/arch/powerpc/include/asm/book3s/64/mmu.h
>>> >> >> > +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
>>> >> >> > @@ -130,6 +130,9 @@ typedef struct {
>>> >> >> >  	u32 pkey_allocation_map;
>>> >> >> >  	s16 execute_only_pkey; /* key holding execute-only protection */
>>> >> >> >  #endif
>>> >> >> > +
>>> >> >> > +	/* Do not preload SLB entries from thread_info during switch_slb() */
>>> >> >> > +	bool skip_slb_preload;
>>> >> >> >  } mm_context_t;
>>> >> >> >
>>> >> >> >  static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
>>> >> >> > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
>>> >> >> > index 4bc45d3ed8b0e..264787e90b1a1 100644
>>> >> >> > --- a/arch/powerpc/include/asm/mmu_context.h
>>> >> >> > +++ b/arch/powerpc/include/asm/mmu_context.h
>>> >> >> > @@ -298,6 +298,19 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
>>> >> >> >  	return 0;
>>> >> >> >  }
>>> >> >> >
>>> >> >> > +#ifdef CONFIG_PPC_BOOK3S_64
>>> >> >> > +
>>> >> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm)
>>> >> >> > +{
>>> >> >> > +	mm->context.skip_slb_preload = true;
>>> >> >> > +}
>>> >> >> > +
>>> >> >> > +#else
>>> >> >> > +
>>> >> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm) {}
>>> >> >> > +
>>> >> >> > +#endif /* CONFIG_PPC_BOOK3S_64 */
>>> >> >> > +
>>> >> >> >  #include <asm-generic/mmu_context.h>
>>> >> >> >
>>> >> >> >  #endif /* __KERNEL__ */
>>> >> >> > diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
>>> >> >> > index c10fc8a72fb37..3479910264c59 100644
>>> >> >> > --- a/arch/powerpc/mm/book3s64/mmu_context.c
>>> >> >> > +++ b/arch/powerpc/mm/book3s64/mmu_context.c
>>> >> >> > @@ -202,6 +202,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>>> >> >> >  	atomic_set(&mm->context.active_cpus, 0);
>>> >> >> >  	atomic_set(&mm->context.copros, 0);
>>> >> >> >
>>> >> >> > +	mm->context.skip_slb_preload = false;
>>> >> >> > +
>>> >> >> >  	return 0;
>>> >> >> >  }
>>> >> >> >
>>> >> >> > diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
>>> >> >> > index c91bd85eb90e3..da0836cb855af 100644
>>> >> >> > --- a/arch/powerpc/mm/book3s64/slb.c
>>> >> >> > +++ b/arch/powerpc/mm/book3s64/slb.c
>>> >> >> > @@ -441,10 +441,39 @@ static void slb_cache_slbie_user(unsigned int index)
>>> >> >> >  	asm volatile("slbie %0" : : "r" (slbie_data));
>>> >> >> >  }
>>> >> >> >
>>> >> >> > +static void preload_slb_entries(struct task_struct *tsk, struct mm_struct *mm)
>>> >> >>
>>> >> >> Should this be explicitly inline or even __always_inline? I'm thinking
>>> >> >> switch_slb is probably a fairly hot path on hash?
>>> >> >
>>> >> > Yes absolutely. I'll make this change in v5.
>>> >> >
>>> >> >>
>>> >> >> > +{
>>> >> >> > +	struct thread_info *ti = task_thread_info(tsk);
>>> >> >> > +	unsigned char i;
>>> >> >> > +
>>> >> >> > +	/*
>>> >> >> > +	 * We gradually age out SLBs after a number of context switches to
>>> >> >> > +	 * reduce reload overhead of unused entries (like we do with FP/VEC
>>> >> >> > +	 * reload). Each time we wrap 256 switches, take an entry out of the
>>> >> >> > +	 * SLB preload cache.
>>> >> >> > +	 */
>>> >> >> > +	tsk->thread.load_slb++;
>>> >> >> > +	if (!tsk->thread.load_slb) {
>>> >> >> > +		unsigned long pc = KSTK_EIP(tsk);
>>> >> >> > +
>>> >> >> > +		preload_age(ti);
>>> >> >> > +		preload_add(ti, pc);
>>> >> >> > +	}
>>> >> >> > +
>>> >> >> > +	for (i = 0; i < ti->slb_preload_nr; i++) {
>>> >> >> > +		unsigned char idx;
>>> >> >> > +		unsigned long ea;
>>> >> >> > +
>>> >> >> > +		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
>>> >> >> > +		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
>>> >> >> > +
>>> >> >> > +		slb_allocate_user(mm, ea);
>>> >> >> > +	}
>>> >> >> > +}
>>> >> >> > +
>>> >> >> >  /* Flush all user entries from the segment table of the current processor. */
>>> >> >> >  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>>> >> >> >  {
>>> >> >> > -	struct thread_info *ti = task_thread_info(tsk);
>>> >> >> >  	unsigned char i;
>>> >> >> >
>>> >> >> >  	/*
>>> >> >> > @@ -502,29 +531,8 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>>> >> >> >
>>> >> >> >  	copy_mm_to_paca(mm);
>>> >> >> >
>>> >> >> > -	/*
>>> >> >> > -	 * We gradually age out SLBs after a number of context switches to
>>> >> >> > -	 * reduce reload overhead of unused entries (like we do with FP/VEC
>>> >> >> > -	 * reload). Each time we wrap 256 switches, take an entry out of the
>>> >> >> > -	 * SLB preload cache.
>>> >> >> > -	 */
>>> >> >> > -	tsk->thread.load_slb++;
>>> >> >> > -	if (!tsk->thread.load_slb) {
>>> >> >> > -		unsigned long pc = KSTK_EIP(tsk);
>>> >> >> > -
>>> >> >> > -		preload_age(ti);
>>> >> >> > -		preload_add(ti, pc);
>>> >> >> > -	}
>>> >> >> > -
>>> >> >> > -	for (i = 0; i < ti->slb_preload_nr; i++) {
>>> >> >> > -		unsigned char idx;
>>> >> >> > -		unsigned long ea;
>>> >> >> > -
>>> >> >> > -		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
>>> >> >> > -		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
>>> >> >> > -
>>> >> >> > -		slb_allocate_user(mm, ea);
>>> >> >> > -	}
>>> >> >> > +	if (!mm->context.skip_slb_preload)
>>> >> >> > +		preload_slb_entries(tsk, mm);
>>> >> >>
>>> >> >> Should this be wrapped in likely()?
>>> >> >
>>> >> > Seems like a good idea - yes.
>>> >> >
>>> >> >>
>>> >> >> >
>>> >> >> >  	/*
>>> >> >> >  	 * Synchronize slbmte preloads with possible subsequent user memory
>>> >> >>
>>> >> >> Right below this comment is the isync. It seems to be specifically
>>> >> >> concerned with synchronising preloaded slbs. Do you need it if you are
>>> >> >> skipping SLB preloads?
>>> >> >>
>>> >> >> It's probably not a big deal to have an extra isync in the fairly rare
>>> >> >> path when we're skipping preloads, but I thought I'd check.
>>> >> >
>>> >> > I don't _think_ we need the `isync` if we are skipping the SLB preloads,
>>> >> > but then again it was always in the code-path before. If someone can
>>> >> > make a compelling argument to drop it when not preloading SLBs I will,
>>> >> > otherwise (considering some of the other non-obvious things I stepped
>>> >> > into with the Hash code) I will keep it here for now.
>>> >>
>>> >> The ISA says slbia wants an isync afterward, so we probably should keep
>>> >> it. The comment is a bit misleading in that case.
>>> >>
>>> >> Why isn't preloading appropriate for a temporary mm?
>>> >
>>> > The preloaded entries come from the thread_info struct which isn't
>>> > necessarily related to the temporary mm at all. I saw SLB multihits
>>> > while testing this series with my LKDTM test where the "patching
>>> > address" (userspace address for the temporary mapping w/
>>> > write-permissions) ends up in a thread's preload list and then we
>>> > explicitly insert it again in map_patch() when trying to patch. At that
>>> > point the SLB multihit triggers.
>>>
>>> Hmm, so what if we use a mm, take some SLB faults then unuse it and
>>> use a different one? I wonder if kthread_use_mm has existing problems
>>> with this incorrect SLB preloading. Quite possibly. We should clear
>>> the preload whenever mm changes I think. That should cover this as
>>> well.
>>
>> I actually did this initially but thought it was a bit too intrusive to
>> include as part of this series and hurt performance. I agree that
>> preloading the SLB from the thread may be a problem in general when
>> switching in/out an mm.
>>
>> kthread_use_mm may not be affected unless we explicitly insert some SLB
>> entries which could collide with an existing preload (which I don't
>> think we do anywhere until this series).
>
> kthread_use_mm(mm1);
> *ea = blah; /* slb preload[n++][ea] = va */
> kthread_unuse_mm(mm1);
>
> kthread_use_mm(mm2);
> switch_slb();
> schedule();
> /* preload ea=va? */
> x = *ea;
> kthread_unuse_mm(mm2);
>
> ? I'm sure we'd have a bug in existing code if you're hitting a bug
> there.
Something like this I think should prevent it. I thought there was a
better arch hook for it, but doesn't seem so.

I have an unexplained SLB crash bug somewhere too, better check if it
matches...

Thanks,
Nick

diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index c91bd85eb90e..cb8c8a5d861e 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -502,6 +502,9 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)

 	copy_mm_to_paca(mm);

+	if (unlikely(tsk->flags & PF_KTHREAD))
+		goto no_preload;
+
 	/*
 	 * We gradually age out SLBs after a number of context switches to
 	 * reduce reload overhead of unused entries (like we do with FP/VEC
@@ -526,10 +529,11 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 		slb_allocate_user(mm, ea);
 	}

+no_preload:
 	/*
-	 * Synchronize slbmte preloads with possible subsequent user memory
-	 * address accesses by the kernel (user mode won't happen until
-	 * rfid, which is safe).
+	 * Synchronize slbias and slbmte preloads with possible subsequent user
+	 * memory address accesses by the kernel (user mode won't happen until
+	 * rfid, which is synchronizing).
 	 */
 	isync();
 }
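For reference, a minimal sketch of how a caller creating a temporary
patching mm might opt out of the preload with the skip_slb_preload_mm()
helper added by the patch quoted above. The setup function, the
patching_mm variable and the use of mm_alloc() here are illustrative
assumptions, not code taken from the series:

/* Illustrative only: a temporary mm used for code patching. */
static struct mm_struct *patching_mm;

static int __init setup_patching_mm(void)
{
	/* Private mm; it never runs in user mode. */
	patching_mm = mm_alloc();
	if (!patching_mm)
		return -ENOMEM;

	/*
	 * The thread's SLB preload cache describes its normal user mm, not
	 * this temporary mm, so ask switch_slb() not to preload from it.
	 * Otherwise a later explicit SLB insertion for the patching address
	 * could collide with a preloaded entry and cause an SLB multihit.
	 */
	skip_slb_preload_mm(patching_mm);

	return 0;
}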