Re: [PATCH 06/17] powerpc/qspinlock: theft prevention to control latency

2022-11-10 Thread Nicholas Piggin
On Thu Nov 10, 2022 at 10:40 AM AEST, Jordan Niethe wrote:
> On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> [resend as utf-8, not utf-7]
> > Give the queue head the ability to stop stealers. After a number of
> > spins without successfully acquiring the lock, the queue head employs
> > this, which will assure it is the next owner.
> > ---
> >  arch/powerpc/include/asm/qspinlock_types.h | 10 +++-
> >  arch/powerpc/lib/qspinlock.c   | 56 +-
> >  2 files changed, 63 insertions(+), 3 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h
> > index 210adf05b235..8b20f5e22bba 100644
> > --- a/arch/powerpc/include/asm/qspinlock_types.h
> > +++ b/arch/powerpc/include/asm/qspinlock_types.h
> > @@ -29,7 +29,8 @@ typedef struct qspinlock {
> >   * Bitfields in the lock word:
> >   *
> >   * 0: locked bit
> > - * 16-31: tail cpu (+1)
> > + *16: must queue bit
> > + * 17-31: tail cpu (+1)
> >   */
> >  #define _Q_SET_MASK(type)  (((1U << _Q_ ## type ## _BITS) - 1)\
> >   << _Q_ ## type ## _OFFSET)
> > @@ -38,7 +39,12 @@ typedef struct qspinlock {
> >  #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED)
> >  #define _Q_LOCKED_VAL  (1U << _Q_LOCKED_OFFSET)
> >  
> > -#define _Q_TAIL_CPU_OFFSET 16
> > +#define _Q_MUST_Q_OFFSET   16
> > +#define _Q_MUST_Q_BITS 1
> > +#define _Q_MUST_Q_MASK _Q_SET_MASK(MUST_Q)
> > +#define _Q_MUST_Q_VAL  (1U << _Q_MUST_Q_OFFSET)
> > +
> > +#define _Q_TAIL_CPU_OFFSET 17
> >  #define _Q_TAIL_CPU_BITS   (32 - _Q_TAIL_CPU_OFFSET)
> >  #define _Q_TAIL_CPU_MASK   _Q_SET_MASK(TAIL_CPU)
>
> Not a big deal but some of these values could be calculated like in the
> generic version. e.g.
>
>   #define _Q_PENDING_OFFSET   (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)

Yeah, we don't *really* have more than one locked bit though. Haven't
made up my mind about these defines yet.

> > diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> > index 1625cce714b2..a906cc8f15fa 100644
> > --- a/arch/powerpc/lib/qspinlock.c
> > +++ b/arch/powerpc/lib/qspinlock.c
> > @@ -22,6 +22,7 @@ struct qnodes {
> >  /* Tuning parameters */
> >  static int STEAL_SPINS __read_mostly = (1<<5);
> >  static bool MAYBE_STEALERS __read_mostly = true;
> > +static int HEAD_SPINS __read_mostly = (1<<8);
> >  
> >  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
> >  
> > @@ -30,6 +31,11 @@ static __always_inline int get_steal_spins(void)
> > return STEAL_SPINS;
> >  }
> >  
> > +static __always_inline int get_head_spins(void)
> > +{
> > +   return HEAD_SPINS;
> > +}
> > +
> >  static inline u32 encode_tail_cpu(void)
> >  {
> > return (smp_processor_id() + 1) << _Q_TAIL_CPU_OFFSET;
> > @@ -142,6 +148,23 @@ static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
> > return prev;
> >  }
> >  
> > +static __always_inline u32 lock_set_mustq(struct qspinlock *lock)
> > +{
> > +   u32 new = _Q_MUST_Q_VAL;
> > +   u32 prev;
> > +
> > +   asm volatile(
> > +"1:lwarx   %0,0,%1 # lock_set_mustq
> > \n"
>
> Is the EH bit not set because we don't hold the lock here?

Right, we're still waiting for it.
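
For context, lwarx takes an optional EH ("exclusive access hint") operand; very roughly, and not quoting the actual hunks from the other patches:

	lwarx	%0,0,%1,1	# EH=1: reservation taken to acquire and then hold the lock
	lwarx	%0,0,%1		# EH omitted, as in lock_set_mustq: we only update the lock word

The acquiring primitives elsewhere in the series can use the hinted form; here the queue head is still waiting for the lock.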

> > +"  or  %0,%0,%2\n"
> > +"  stwcx.  %0,0,%1 \n"
> > +"  bne-1b  \n"
> > +   : "=" (prev)
> > +   : "r" (>val), "r" (new)
> > +   : "cr0", "memory");
>
> This is another usage close to the DEFINE_TESTOP() pattern.
>
> > +
> > +   return prev;
> > +}
> > +
> >  static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
> >  {
> > int cpu = get_tail_cpu(val);
> > @@ -165,6 +188,9 @@ static inline bool try_to_steal_lock(struct qspinlock *lock)
> > for (;;) {
> > u32 val = READ_ONCE(lock->val);
> >  
> > +   if (val & _Q_MUST_Q_VAL)
> > +   break;
> > +
> > if (unlikely(!(val & _Q_LOCKED_VAL))) {
> > if (trylock_with_tail_cpu(lock, val))
> > return true;
> > @@ -246,11 +272,22 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
> > /* We must be the owner, just set the lock bit and acquire */
> > lock_set_locked(lock);
> > } else {
> > +   int iters = 0;
> > +   bool set_mustq = false;
> > +
> >  again:
> > /* We're at the head of the waitqueue, wait for the lock. */
> > -   while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
> > +   while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
> > cpu_relax();
> >  
> > +   iters++;
>
> It seems instead of using set_mustq, (val & _Q_MUST_Q_VAL) could be checked?

I wanted to give the 

Re: [PATCH 06/17] powerpc/qspinlock: theft prevention to control latency

2022-11-09 Thread Jordan Niethe
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
[resend as utf-8, not utf-7]
> Give the queue head the ability to stop stealers. After a number of
> spins without successfully acquiring the lock, the queue head employs
> this, which will assure it is the next owner.
> ---
>  arch/powerpc/include/asm/qspinlock_types.h | 10 +++-
>  arch/powerpc/lib/qspinlock.c   | 56 +-
>  2 files changed, 63 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h
> index 210adf05b235..8b20f5e22bba 100644
> --- a/arch/powerpc/include/asm/qspinlock_types.h
> +++ b/arch/powerpc/include/asm/qspinlock_types.h
> @@ -29,7 +29,8 @@ typedef struct qspinlock {
>   * Bitfields in the lock word:
>   *
>   * 0: locked bit
> - * 16-31: tail cpu (+1)
> + *16: must queue bit
> + * 17-31: tail cpu (+1)
>   */
>  #define  _Q_SET_MASK(type)   (((1U << _Q_ ## type ## _BITS) - 1)\
> << _Q_ ## type ## _OFFSET)
> @@ -38,7 +39,12 @@ typedef struct qspinlock {
>  #define _Q_LOCKED_MASK   _Q_SET_MASK(LOCKED)
>  #define _Q_LOCKED_VAL(1U << _Q_LOCKED_OFFSET)
>  
> -#define _Q_TAIL_CPU_OFFSET   16
> +#define _Q_MUST_Q_OFFSET 16
> +#define _Q_MUST_Q_BITS   1
> +#define _Q_MUST_Q_MASK   _Q_SET_MASK(MUST_Q)
> +#define _Q_MUST_Q_VAL(1U << _Q_MUST_Q_OFFSET)
> +
> +#define _Q_TAIL_CPU_OFFSET   17
>  #define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET)
>  #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU)

Not a big deal but some of these values could be calculated like in the
generic version. e.g.

#define _Q_PENDING_OFFSET   (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
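
Applied to this patch, that might look roughly like the following (untested sketch; the must-queue offset itself still needs its literal value, only the tail fields are derived from it):

#define _Q_MUST_Q_OFFSET	16
#define _Q_MUST_Q_BITS		1
#define _Q_MUST_Q_MASK		_Q_SET_MASK(MUST_Q)
#define _Q_MUST_Q_VAL		(1U << _Q_MUST_Q_OFFSET)

#define _Q_TAIL_CPU_OFFSET	(_Q_MUST_Q_OFFSET + _Q_MUST_Q_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)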

>  
> diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
> index 1625cce714b2..a906cc8f15fa 100644
> --- a/arch/powerpc/lib/qspinlock.c
> +++ b/arch/powerpc/lib/qspinlock.c
> @@ -22,6 +22,7 @@ struct qnodes {
>  /* Tuning parameters */
>  static int STEAL_SPINS __read_mostly = (1<<5);
>  static bool MAYBE_STEALERS __read_mostly = true;
> +static int HEAD_SPINS __read_mostly = (1<<8);
>  
>  static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
>  
> @@ -30,6 +31,11 @@ static __always_inline int get_steal_spins(void)
>   return STEAL_SPINS;
>  }
>  
> +static __always_inline int get_head_spins(void)
> +{
> + return HEAD_SPINS;
> +}
> +
>  static inline u32 encode_tail_cpu(void)
>  {
>   return (smp_processor_id() + 1) << _Q_TAIL_CPU_OFFSET;
> @@ -142,6 +148,23 @@ static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
>   return prev;
>  }
>  
> +static __always_inline u32 lock_set_mustq(struct qspinlock *lock)
> +{
> + u32 new = _Q_MUST_Q_VAL;
> + u32 prev;
> +
> + asm volatile(
> +"1:  lwarx   %0,0,%1 # lock_set_mustq\n"

Is the EH bit not set because we don't hold the lock here?

> +"or  %0,%0,%2\n"
> +"stwcx.  %0,0,%1 \n"
> +"bne-1b  \n"
> + : "=" (prev)
> + : "r" (>val), "r" (new)
> + : "cr0", "memory");

This is another usage close to the DEFINE_TESTOP() pattern.
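
As a rough illustration only (not the actual DEFINE_TESTOP() from asm/bitops.h; the DEFINE_LOCKWORD_OP name is made up here), a helper in that spirit could look something like:

#define DEFINE_LOCKWORD_OP(fn, op)					\
static __always_inline u32 fn(struct qspinlock *lock, u32 operand)	\
{									\
	u32 prev, tmp;							\
									\
	asm volatile(							\
"1:	lwarx	%0,0,%2		# " #fn "	\n"			\
"	" #op "	%1,%0,%3	\n"					\
"	stwcx.	%1,0,%2		\n"					\
"	bne-	1b		\n"					\
	: "=&r" (prev), "=&r" (tmp)					\
	: "r" (&lock->val), "r" (operand)				\
	: "cr0", "memory");						\
									\
	return prev;							\
}

so lock_set_mustq() would reduce to something like DEFINE_LOCKWORD_OP(lock_or, or) plus a lock_or(lock, _Q_MUST_Q_VAL) call.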

> +
> + return prev;
> +}
> +
>  static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
>  {
>   int cpu = get_tail_cpu(val);
> @@ -165,6 +188,9 @@ static inline bool try_to_steal_lock(struct qspinlock *lock)
>   for (;;) {
>   u32 val = READ_ONCE(lock->val);
>  
> + if (val & _Q_MUST_Q_VAL)
> + break;
> +
>   if (unlikely(!(val & _Q_LOCKED_VAL))) {
>   if (trylock_with_tail_cpu(lock, val))
>   return true;
> @@ -246,11 +272,22 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
>   /* We must be the owner, just set the lock bit and acquire */
>   lock_set_locked(lock);
>   } else {
> + int iters = 0;
> + bool set_mustq = false;
> +
>  again:
>   /* We're at the head of the waitqueue, wait for the lock. */
> - while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL)
> + while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
>   cpu_relax();
>  
> + iters++;

It seems instead of using set_mustq, (val & _Q_MUST_Q_VAL) could be checked?
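
i.e. something roughly like this (untested sketch):

	while ((val = READ_ONCE(lock->val)) & _Q_LOCKED_VAL) {
		cpu_relax();

		iters++;
		if (!(val & _Q_MUST_Q_VAL) && iters >= get_head_spins())
			lock_set_mustq(lock);
	}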

> + if (!set_mustq && iters >= get_head_spins()) {
> + set_mustq = true;
> + lock_set_mustq(lock);
> + val |= _Q_MUST_Q_VAL;
> + }
> + }
> +
>   /* If we're the last queued, must clean up the tail. */
>   if ((val & _Q_TAIL_CPU_MASK) == tail) {
