Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-20 Thread xinhui



On 2016-07-16 00:35, Peter Zijlstra wrote:

On Fri, Jul 15, 2016 at 12:07:03PM +0200, Peter Zijlstra wrote:

So if we are kicked by the unlock_slowpath, and the lock is stolen by
someone else, we need to hash its node again and set l->locked to
_Q_SLOW_VAL, then enter pv_wait.


Right, let me go think about this a bit.


Urgh, brain hurt.

So I _think_ the below does for it but I could easily have missed yet
another case.

Waiman's patch has the problem that it can have two pv_hash() calls for
the same lock in progress and I'm thinking that means we can hit the
BUG() in pv_hash() due to that.

If we can't, it still has a problem because it's not telling us either.
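
For reference, pv_hash() probes an open-addressed table that is sized on the
assumption that every blocked lock consumes at most one entry, and it BUG()s
when the probe sequence finds no free bucket. A simplified sketch of its
shape (statistics counters omitted):

static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
        unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
        struct pv_hash_entry *he;

        for_each_hash_entry(he, offset, hash) {
                /* Claim the first free bucket on the probe sequence. */
                if (!cmpxchg(&he->lock, NULL, lock)) {
                        WRITE_ONCE(he->node, node);
                        return &he->lock;
                }
        }

        /*
         * The table is sized for one entry per held lock, so running out
         * of buckets means the single-entry invariant was broken -- which
         * is exactly what two in-flight insertions for the same lock
         * would do.
         */
        BUG();
}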



--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -20,7 +20,8 @@
   * native_queued_spin_unlock().
   */

-#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+#define _Q_HASH_VAL (3U << _Q_LOCKED_OFFSET)
+#define _Q_SLOW_VAL (7U << _Q_LOCKED_OFFSET)

  /*
   * Queue Node Adaptive Spinning
@@ -36,14 +37,11 @@
   */
  #define PV_PREV_CHECK_MASK 0xff

-/*
- * Queue node uses: vcpu_running & vcpu_halted.
- * Queue head uses: vcpu_running & vcpu_hashed.
- */
  enum vcpu_state {
-   vcpu_running = 0,
-   vcpu_halted,/* Used only in pv_wait_node */
-   vcpu_hashed,/* = pv_hash'ed + vcpu_halted */
+   vcpu_node_running = 0,
+   vcpu_node_halted,
+   vcpu_head_running,
+   vcpu_head_halted,
  };

  struct pv_node {
@@ -263,7 +261,7 @@ pv_wait_early(struct pv_node *prev, int
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;

-   return READ_ONCE(prev->state) != vcpu_running;
+   return READ_ONCE(prev->state) & 1;
  }

  /*
@@ -311,20 +309,19 @@ static void pv_wait_node(struct mcs_spin
 *
 * Matches the cmpxchg() from pv_kick_node().
 */
-   smp_store_mb(pn->state, vcpu_halted);
+   smp_store_mb(pn->state, vcpu_node_halted);

-   if (!READ_ONCE(node->locked)) {
-   qstat_inc(qstat_pv_wait_node, true);
-   qstat_inc(qstat_pv_wait_early, wait_early);
-   pv_wait(&pn->state, vcpu_halted);
-   }
+   if (READ_ONCE(node->locked))
+   return;
+
+   qstat_inc(qstat_pv_wait_node, true);
+   qstat_inc(qstat_pv_wait_early, wait_early);
+   pv_wait(&pn->state, vcpu_node_halted);

/*
-* If pv_kick_node() changed us to vcpu_hashed, retain that
-* value so that pv_wait_head_or_lock() knows to not also try
-* to hash this lock.
+* If pv_kick_node() advanced us, retain that state.
 */
-   cmpxchg(&pn->state, vcpu_halted, vcpu_running);
+   cmpxchg(&pn->state, vcpu_node_halted, vcpu_node_running);

/*
 * If the locked flag is still not set after wakeup, it is a
@@ -362,18 +359,17 @@ static void pv_kick_node(struct qspinloc
 *
 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
 */
-   if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+   if (cmpxchg(&pn->state, vcpu_node_halted, vcpu_head_running) != vcpu_node_halted)

Well, I think it should be
if (cmpxchg(&pn->state, vcpu_node_halted, vcpu_head_halted) != vcpu_node_halted)

Yes, the node has become the *head*, but its running state does not change.

The others look OK to me.
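
To make the state question concrete, the encoding in Peter's draft gives the
following picture (an illustration only, not text from either patch):

/*
 * Draft encoding: bit 0 = halted, bit 1 = queue head.
 *
 *   vcpu_node_running (0) --pv_wait_node() about to sleep---> vcpu_node_halted (1)
 *   vcpu_node_halted  (1) --woken in pv_wait_node()---------> vcpu_node_running (0)
 *   vcpu_node_halted  (1) --pv_kick_node(): becomes head----> vcpu_head_halted (3)
 *   vcpu_head_halted  (3) --pv_wait_head_or_lock() resumes--> vcpu_head_running (2)
 *
 * pv_kick_node() only records that the node is now the queue head; the
 * vCPU itself stays halted until the unlock-time pv_kick(), so the halted
 * bit (bit 0) should survive the transition, i.e.
 * vcpu_node_halted -> vcpu_head_halted rather than -> vcpu_head_running.
 */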


return;

/*
-* Put the lock into the hash table and set the _Q_SLOW_VAL.
-*
-* As this is the same vCPU that will check the _Q_SLOW_VAL value and
-* the hash table later on at unlock time, no atomic instruction is
-* needed.
+* See pv_wait_head_or_lock(). We have to hash and force the unlock
+* into the slow path to deliver the actual kick for waking.
 */
-   WRITE_ONCE(l->locked, _Q_SLOW_VAL);
-   (void)pv_hash(lock, pn);
+   if (cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_HASH_VAL) == _Q_LOCKED_VAL) {
+   (void)pv_hash(lock, pn);
+   smp_store_release(&l->locked, _Q_SLOW_VAL);
+   }
  }

  /*
@@ -388,28 +384,22 @@ pv_wait_head_or_lock(struct qspinlock *l
  {
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
-   struct qspinlock **lp = NULL;
int waitcnt = 0;
int loop;

/*
-* If pv_kick_node() already advanced our state, we don't need to
-* insert ourselves into the hash table anymore.
-*/
-   if (READ_ONCE(pn->state) == vcpu_hashed)
-   lp = (struct qspinlock **)1;
-
-   /*
 * Tracking # of slowpath locking operations
 */
qstat_inc(qstat_pv_lock_slowpath, true);

for (;; waitcnt++) {
+   u8 locked;
+
 

Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-17 Thread Wanpeng Li
2016-07-18 7:10 GMT+08:00 Waiman Long :
> On 07/17/2016 07:07 PM, Waiman Long wrote:
>>
>> On 07/15/2016 09:16 PM, Boqun Feng wrote:
>>>
>>> On Fri, Jul 15, 2016 at 06:35:56PM +0200, Peter Zijlstra wrote:

 On Fri, Jul 15, 2016 at 12:07:03PM +0200, Peter Zijlstra wrote:
>>
>> So if we are kicked by the unlock_slowpath, and the lock is stolen by
>> someone else, we need to hash its node again and set l->locked to
>> _Q_SLOW_VAL, then enter pv_wait.
>
> Right, let me go think about this a bit.

 Urgh, brain hurt.

 So I _think_ the below does for it but I could easily have missed yet
 another case.

 Waiman's patch has the problem that it can have two pv_hash() calls for
 the same lock in progress and I'm thinking that means we can hit the
 BUG() in pv_hash() due to that.

>>> I think Waiman's patch does have the problem of two pv_hash() calls for
>>> the same lock in progress. As I mentioned in the first version:
>>>
>>> http://lkml.kernel.org/g/20160527074331.GB8096@insomnia
>>>
>>> And he tried to address this in the patch #3 of this series. However,
>>> I think what is proper here is either to reorder patch 2 and 3 or to
>>> merge patch 2 and 3, otherwise, we are introducing a bug in the middle
>>> of this series.
>>>
>>> Thoughts, Waiman?
>>
>>
>> Patches 2 and 3 can be reversed.
>>
>>>
>>> That said, I found Peter's way is much simpler and easier to understand
>>> ;-)
>>
>>
>> I agree. Peter's patch is better than mine.
>>
 If we can't, it still has a problem because it's not telling us either.



 --- a/kernel/locking/qspinlock_paravirt.h
 +++ b/kernel/locking/qspinlock_paravirt.h
 @@ -20,7 +20,8 @@
* native_queued_spin_unlock().
*/

 -#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
 +#define _Q_HASH_VAL (3U << _Q_LOCKED_OFFSET)
 +#define _Q_SLOW_VAL (7U << _Q_LOCKED_OFFSET)

   /*
* Queue Node Adaptive Spinning
 @@ -36,14 +37,11 @@
*/
    #define PV_PREV_CHECK_MASK 0xff

 -/*
 - * Queue node uses: vcpu_running & vcpu_halted.
 - * Queue head uses: vcpu_running & vcpu_hashed.
 - */
   enum vcpu_state {
 -vcpu_running = 0,
 -vcpu_halted,/* Used only in pv_wait_node */
 -vcpu_hashed,/* = pv_hash'ed + vcpu_halted */
 +vcpu_node_running = 0,
 +vcpu_node_halted,
 +vcpu_head_running,
>>>
>>> We actually don't need this extra running state, right? Because nobody
>>> cares about the difference between two running states right now.
>>
>>
>> That addresses the problem in Xinhui patch that changed the state from
>> halted to hashed. With that separation, that change is no longer necessary.
>
>
> Oh, I meant Wanpeng's double hash race patch, not Xinhui's patch.

This patch can be based on top of mine; I think it has already been done this way. :)

Regards,
Wanpeng Li


Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-17 Thread Waiman Long

On 07/17/2016 07:07 PM, Waiman Long wrote:

On 07/15/2016 09:16 PM, Boqun Feng wrote:

On Fri, Jul 15, 2016 at 06:35:56PM +0200, Peter Zijlstra wrote:

On Fri, Jul 15, 2016 at 12:07:03PM +0200, Peter Zijlstra wrote:
So if we are kicked by the unlock_slowpath, and the lock is stolen by
someone else, we need to hash its node again and set l->locked to
_Q_SLOW_VAL, then enter pv_wait.

Right, let me go think about this a bit.

Urgh, brain hurt.

So I _think_ the below does for it but I could easily have missed yet
another case.

Waiman's patch has the problem that it can have two pv_hash() calls for
the same lock in progress and I'm thinking that means we can hit the
BUG() in pv_hash() due to that.


I think Waiman's patch does have the problem of two pv_hash() calls for
the same lock in progress. As I mentioned in the first version:

http://lkml.kernel.org/g/20160527074331.GB8096@insomnia

And he tried to address this in the patch #3 of this series. However,
I think what is proper here is either to reorder patch 2 and 3 or to
merge patch 2 and 3, otherwise, we are introducing a bug in the middle
of this series.

Thoughts, Waiman?


Patches 2 and 3 can be reversed.



That said, I found Peter's way is much simpler and easier to understand
;-)


I agree. Peter's patch is better than mine.


If we can't, it still has a problem because it's not telling us either.



--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -20,7 +20,8 @@
   * native_queued_spin_unlock().
   */

-#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+#define _Q_HASH_VAL (3U << _Q_LOCKED_OFFSET)
+#define _Q_SLOW_VAL (7U << _Q_LOCKED_OFFSET)

  /*
   * Queue Node Adaptive Spinning
@@ -36,14 +37,11 @@
   */
  #define PV_PREV_CHECK_MASK 0xff

-/*
- * Queue node uses: vcpu_running & vcpu_halted.
- * Queue head uses: vcpu_running & vcpu_hashed.
- */
  enum vcpu_state {
-vcpu_running = 0,
-vcpu_halted,/* Used only in pv_wait_node */
-vcpu_hashed,/* = pv_hash'ed + vcpu_halted */
+vcpu_node_running = 0,
+vcpu_node_halted,
+vcpu_head_running,

We actually don't need this extra running state, right? Because nobody
cares about the difference between two running states right now.


That addresses the problem in Xinhui's patch that changed the state from
halted to hashed. With that separation, that change is no longer necessary.


Oh, I meant Wanpeng's double hash race patch, not Xinhui's patch.

Cheers,
Longman


Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-17 Thread Waiman Long

On 07/15/2016 09:16 PM, Boqun Feng wrote:

On Fri, Jul 15, 2016 at 06:35:56PM +0200, Peter Zijlstra wrote:

On Fri, Jul 15, 2016 at 12:07:03PM +0200, Peter Zijlstra wrote:

So if we are kicked by the unlock_slowpath, and the lock is stolen by
someone else, we need to hash its node again and set l->locked to
_Q_SLOW_VAL, then enter pv_wait.

Right, let me go think about this a bit.

Urgh, brain hurt.

So I _think_ the below does for it but I could easily have missed yet
another case.

Waiman's patch has the problem that it can have two pv_hash() calls for
the same lock in progress and I'm thinking that means we can hit the
BUG() in pv_hash() due to that.


I think Waiman's patch does have the problem of two pv_hash() calls for
the same lock in progress. As I mentioned in the first version:

http://lkml.kernel.org/g/20160527074331.GB8096@insomnia

And he tried to address this in the patch #3 of this series. However,
I think what is proper here is either to reorder patch 2 and 3 or to
merge patch 2 and 3, otherwise, we are introducing a bug in the middle
of this series.

Thoughts, Waiman?


Patches 2 and 3 can be reversed.



That said, I found Peter's way is much simpler and easier to understand
;-)


I agree. Peter's patch is better than mine.


If we can't, it still has a problem because it's not telling us either.



--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -20,7 +20,8 @@
   * native_queued_spin_unlock().
   */

-#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+#define _Q_HASH_VAL (3U << _Q_LOCKED_OFFSET)
+#define _Q_SLOW_VAL (7U << _Q_LOCKED_OFFSET)

  /*
   * Queue Node Adaptive Spinning
@@ -36,14 +37,11 @@
   */
  #define PV_PREV_CHECK_MASK 0xff

-/*
- * Queue node uses: vcpu_running & vcpu_halted.
- * Queue head uses: vcpu_running & vcpu_hashed.
- */
  enum vcpu_state {
-   vcpu_running = 0,
-   vcpu_halted,/* Used only in pv_wait_node */
-   vcpu_hashed,/* = pv_hash'ed + vcpu_halted */
+   vcpu_node_running = 0,
+   vcpu_node_halted,
+   vcpu_head_running,

We actually don't need this extra running state, right? Because nobody
cares about the difference between two running states right now.


That addresses the problem in Xinhui's patch that changed the state from
halted to hashed. With that separation, that change is no longer necessary.



+   vcpu_head_halted,
  };

  struct pv_node {
@@ -263,7 +261,7 @@ pv_wait_early(struct pv_node *prev, int
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;

-   return READ_ONCE(prev->state) != vcpu_running;
+   return READ_ONCE(prev->state) & 1;
  }

  /*
@@ -311,20 +309,19 @@ static void pv_wait_node(struct mcs_spin
 *
 * Matches the cmpxchg() from pv_kick_node().
 */
-   smp_store_mb(pn->state, vcpu_halted);
+   smp_store_mb(pn->state, vcpu_node_halted);

-   if (!READ_ONCE(node->locked)) {
-   qstat_inc(qstat_pv_wait_node, true);
-   qstat_inc(qstat_pv_wait_early, wait_early);
-   pv_wait(&pn->state, vcpu_halted);
-   }
+   if (READ_ONCE(node->locked))
+   return;
+
+   qstat_inc(qstat_pv_wait_node, true);
+   qstat_inc(qstat_pv_wait_early, wait_early);
+   pv_wait(&pn->state, vcpu_node_halted);

/*
-* If pv_kick_node() changed us to vcpu_hashed, retain that
-* value so that pv_wait_head_or_lock() knows to not also try
-* to hash this lock.
+* If pv_kick_node() advanced us, retain that state.
 */
-   cmpxchg(&pn->state, vcpu_halted, vcpu_running);
+   cmpxchg(&pn->state, vcpu_node_halted, vcpu_node_running);

/*
 * If the locked flag is still not set after wakeup, it is a
@@ -362,18 +359,17 @@ static void pv_kick_node(struct qspinloc
 *
 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
 */
-   if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+   if (cmpxchg(&pn->state, vcpu_node_halted, vcpu_head_running) != vcpu_node_halted)
return;

/*
-* Put the lock into the hash table and set the _Q_SLOW_VAL.
-*
-* As this is the same vCPU that will check the _Q_SLOW_VAL value and
-* the hash table later on at unlock time, no atomic instruction is
-* needed.
+* See pv_wait_head_or_lock(). We have to hash and force the unlock
+* into the slow path to deliver the actual kick for waking.
 */
-   WRITE_ONCE(l->locked, _Q_SLOW_VAL);
-   (void)pv_hash(lock, pn);
+   if (cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_HASH_VAL) == _Q_LOCKED_VAL) {
+   (void)pv_hash(lock, pn);
+   smp_store_release(

Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-17 Thread Waiman Long

On 07/15/2016 12:35 PM, Peter Zijlstra wrote:

On Fri, Jul 15, 2016 at 12:07:03PM +0200, Peter Zijlstra wrote:

So if we are kicked by the unlock_slowpath, and the lock is stolen by
someone else, we need to hash its node again and set l->locked to
_Q_SLOW_VAL, then enter pv_wait.

Right, let me go think about this a bit.

Urgh, brain hurt.

So I _think_ the below does for it but I could easily have missed yet
another case.

Waiman's patch has the problem that it can have two pv_hash() calls for
the same lock in progress and I'm thinking that means we can hit the
BUG() in pv_hash() due to that.

If we can't, it still has a problem because it's not telling us either.



--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -20,7 +20,8 @@
   * native_queued_spin_unlock().
   */

-#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+#define _Q_HASH_VAL (3U << _Q_LOCKED_OFFSET)
+#define _Q_SLOW_VAL (7U << _Q_LOCKED_OFFSET)


The hash state is a transient state. It was not obvious until I read
through the code. Maybe some comments on the allowable state transitions
will help.
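
Something like the following comment, based only on the hunks visible in this
draft, might do (illustrative wording, not part of the posted patch):

/*
 * l->locked transitions in this scheme:
 *
 *   0             -> _Q_LOCKED_VAL   lock acquired (fastpath or steal)
 *   _Q_LOCKED_VAL -> _Q_HASH_VAL     a waiter won the right to hash the
 *                                    lock (cmpxchg in pv_kick_node(), and
 *                                    presumably in pv_wait_head_or_lock())
 *   _Q_HASH_VAL   -> _Q_SLOW_VAL     hash entry published; the unlocker
 *                                    must take the slowpath and pv_kick()
 *   _Q_SLOW_VAL   -> 0               __pv_queued_spin_unlock_slowpath(),
 *                                    after un-hashing the node
 *   _Q_LOCKED_VAL -> 0               plain unlock, nobody to wake
 *
 * _Q_HASH_VAL is transient: only the winner of the _Q_LOCKED_VAL ->
 * _Q_HASH_VAL cmpxchg may call pv_hash() and then publish _Q_SLOW_VAL
 * with smp_store_release(), which is what rules out a double pv_hash().
 */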




  /*
   * Queue Node Adaptive Spinning
@@ -36,14 +37,11 @@
   */
  #define PV_PREV_CHECK_MASK 0xff

-/*
- * Queue node uses: vcpu_running & vcpu_halted.
- * Queue head uses: vcpu_running & vcpu_hashed.
- */
  enum vcpu_state {
-   vcpu_running = 0,
-   vcpu_halted,/* Used only in pv_wait_node */
-   vcpu_hashed,/* = pv_hash'ed + vcpu_halted */
+   vcpu_node_running = 0,
+   vcpu_node_halted,
+   vcpu_head_running,
+   vcpu_head_halted,
  };

  struct pv_node {
@@ -263,7 +261,7 @@ pv_wait_early(struct pv_node *prev, int
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;

-   return READ_ONCE(prev->state) != vcpu_running;
+   return READ_ONCE(prev->state) & 1;
  }
  


This relies on the implicit ordering of the enum vcpu_state values. I
think we need a warning above it that all halted states must have bit 0
set and all running states must have it cleared. We want to make sure
that any future change to vcpu_state won't break that rule.
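
A sketch of what such a warning could look like, for example as compile-time
checks next to the enum (placement and naming here are illustrative, not from
the posted patch):

enum vcpu_state {
        vcpu_node_running = 0,  /* bit 0 clear: running */
        vcpu_node_halted,       /* bit 0 set:   halted  */
        vcpu_head_running,
        vcpu_head_halted,
};

/*
 * pv_wait_early() tests "READ_ONCE(prev->state) & 1", so every halted
 * state must keep bit 0 set and every running state must keep it clear.
 */
static inline void pv_vcpu_state_sanity(void)
{
        BUILD_BUG_ON(!(vcpu_node_halted & 1));
        BUILD_BUG_ON(!(vcpu_head_halted & 1));
        BUILD_BUG_ON(vcpu_node_running & 1);
        BUILD_BUG_ON(vcpu_head_running & 1);
}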



  /*
@@ -311,20 +309,19 @@ static void pv_wait_node(struct mcs_spin
 *
 * Matches the cmpxchg() from pv_kick_node().
 */
-   smp_store_mb(pn->state, vcpu_halted);
+   smp_store_mb(pn->state, vcpu_node_halted);

-   if (!READ_ONCE(node->locked)) {
-   qstat_inc(qstat_pv_wait_node, true);
-   qstat_inc(qstat_pv_wait_early, wait_early);
-   pv_wait(&pn->state, vcpu_halted);
-   }
+   if (READ_ONCE(node->locked))
+   return;
+
+   qstat_inc(qstat_pv_wait_node, true);
+   qstat_inc(qstat_pv_wait_early, wait_early);
+   pv_wait(&pn->state, vcpu_node_halted);

/*
-* If pv_kick_node() changed us to vcpu_hashed, retain that
-* value so that pv_wait_head_or_lock() knows to not also try
-* to hash this lock.
+* If pv_kick_node() advanced us, retain that state.
 */
-   cmpxchg(&pn->state, vcpu_halted, vcpu_running);
+   cmpxchg(&pn->state, vcpu_node_halted, vcpu_node_running);

/*
 * If the locked flag is still not set after wakeup, it is a
@@ -362,18 +359,17 @@ static void pv_kick_node(struct qspinloc
 *
 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
 */
-   if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+   if (cmpxchg(&pn->state, vcpu_node_halted, vcpu_head_running) != vcpu_node_halted)
return;

/*
-* Put the lock into the hash table and set the _Q_SLOW_VAL.
-*
-* As this is the same vCPU that will check the _Q_SLOW_VAL value and
-* the hash table later on at unlock time, no atomic instruction is
-* needed.
+* See pv_wait_head_or_lock(). We have to hash and force the unlock
+* into the slow path to deliver the actual kick for waking.
 */
-   WRITE_ONCE(l->locked, _Q_SLOW_VAL);
-   (void)pv_hash(lock, pn);
+   if (cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_HASH_VAL) == _Q_LOCKED_VAL) {
+   (void)pv_hash(lock, pn);
+   smp_store_release(&l->locked, _Q_SLOW_VAL);
+   }
  }

  /*
@@ -388,28 +384,22 @@ pv_wait_head_or_lock(struct qspinlock *l
  {
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
-   struct qspinlock **lp = NULL;
int waitcnt = 0;
int loop;

/*
-* If pv_kick_node() already advanced our state, we don't need to
-* insert ourselves into the hash table anymore.
-*/
-   if (READ_ONCE(pn->state) == vcpu_hashed)
-  

Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-15 Thread Boqun Feng
On Fri, Jul 15, 2016 at 06:35:56PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 15, 2016 at 12:07:03PM +0200, Peter Zijlstra wrote:
> > > So if we are kicked by the unlock_slowpath, and the lock is stolen by
> > > someone else, we need to hash its node again and set l->locked to
> > > _Q_SLOW_VAL, then enter pv_wait.
> > 
> > Right, let me go think about this a bit.
> 
> Urgh, brain hurt.
> 
> So I _think_ the below does for it but I could easily have missed yet
> another case.
> 
> Waiman's patch has the problem that it can have two pv_hash() calls for
> the same lock in progress and I'm thinking that means we can hit the
> BUG() in pv_hash() due to that.
> 

I think Waiman's patch does have the problem of two pv_hash() calls for
the same lock in progress. As I mentioned in the first version:

http://lkml.kernel.org/g/20160527074331.GB8096@insomnia

And he tried to address this in the patch #3 of this series. However,
I think what is proper here is either to reorder patch 2 and 3 or to
merge patch 2 and 3, otherwise, we are introducing a bug in the middle
of this series.

Thoughts, Waiman?

That said, I found Peter's way is much simpler and easier to understand
;-)

> If we can't, it still has a problem because it's not telling us either.
> 
> 
> 
> --- a/kernel/locking/qspinlock_paravirt.h
> +++ b/kernel/locking/qspinlock_paravirt.h
> @@ -20,7 +20,8 @@
>   * native_queued_spin_unlock().
>   */
>  
> -#define _Q_SLOW_VAL  (3U << _Q_LOCKED_OFFSET)
> +#define _Q_HASH_VAL  (3U << _Q_LOCKED_OFFSET)
> +#define _Q_SLOW_VAL  (7U << _Q_LOCKED_OFFSET)
>  
>  /*
>   * Queue Node Adaptive Spinning
> @@ -36,14 +37,11 @@
>   */
>  #define PV_PREV_CHECK_MASK   0xff
>  
> -/*
> - * Queue node uses: vcpu_running & vcpu_halted.
> - * Queue head uses: vcpu_running & vcpu_hashed.
> - */
>  enum vcpu_state {
> - vcpu_running = 0,
> - vcpu_halted,/* Used only in pv_wait_node */
> - vcpu_hashed,/* = pv_hash'ed + vcpu_halted */
> + vcpu_node_running = 0,
> + vcpu_node_halted,
> + vcpu_head_running,

We actually don't need this extra running state, right? Because nobody
cares about the difference between two running states right now.

> + vcpu_head_halted,
>  };
>  
>  struct pv_node {
> @@ -263,7 +261,7 @@ pv_wait_early(struct pv_node *prev, int
>   if ((loop & PV_PREV_CHECK_MASK) != 0)
>   return false;
>  
> - return READ_ONCE(prev->state) != vcpu_running;
> + return READ_ONCE(prev->state) & 1;
>  }
>  
>  /*
> @@ -311,20 +309,19 @@ static void pv_wait_node(struct mcs_spin
>*
>* Matches the cmpxchg() from pv_kick_node().
>*/
> - smp_store_mb(pn->state, vcpu_halted);
> + smp_store_mb(pn->state, vcpu_node_halted);
>  
> - if (!READ_ONCE(node->locked)) {
> - qstat_inc(qstat_pv_wait_node, true);
> - qstat_inc(qstat_pv_wait_early, wait_early);
> - pv_wait(&pn->state, vcpu_halted);
> - }
> + if (READ_ONCE(node->locked))
> + return;
> +
> + qstat_inc(qstat_pv_wait_node, true);
> + qstat_inc(qstat_pv_wait_early, wait_early);
> + pv_wait(&pn->state, vcpu_node_halted);
>  
>   /*
> -  * If pv_kick_node() changed us to vcpu_hashed, retain that
> -  * value so that pv_wait_head_or_lock() knows to not also try
> -  * to hash this lock.
> +  * If pv_kick_node() advanced us, retain that state.
>*/
> - cmpxchg(&pn->state, vcpu_halted, vcpu_running);
> + cmpxchg(&pn->state, vcpu_node_halted, vcpu_node_running);
>  
>   /*
>* If the locked flag is still not set after wakeup, it is a
> @@ -362,18 +359,17 @@ static void pv_kick_node(struct qspinloc
>*
>* Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
>*/
> - if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
> + if (cmpxchg(&pn->state, vcpu_node_halted, vcpu_head_running) != vcpu_node_halted)
>   return;
>  
>   /*
> -  * Put the lock into the hash table and set the _Q_SLOW_VAL.
> -  *
> -  * As this is the same vCPU that will check the _Q_SLOW_VAL value and
> -  * the hash table later on at unlock time, no atomic instruction is
> -  * needed.
> +  * See pv_wait_head_or_lock(). We have to hash and force the unlock
> +  * into the slow path to deliver the actual kick for waking.
>*/
> - WRITE_ONCE(l->locked, _Q_SLOW_VAL);
> - (void)pv_hash(lock, pn);
> + if (cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_HASH_VAL) == _Q_LOCKED_VAL) {
> + (void)pv_hash(lock, pn);
> + smp_store_release(&l->locked, _Q_SLOW_VAL);
> + }
>  }
>  
>  /*
> @@ -388,28 +384,22 @@ pv_wait_head_or_lock(struct qspinlock *l
>  {
>   struct pv_node *pn = (struct pv

Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-15 Thread Waiman Long

On 07/15/2016 06:07 AM, Peter Zijlstra wrote:

On Fri, Jul 15, 2016 at 05:39:46PM +0800, Pan Xinhui wrote:

I'm thinking you're trying to say this:


CPU0                        CPU1                        CPU2

__pv_queued_spin_unlock_slowpath()
  ...
  smp_store_release(&l->locked, 0);
__pv_queued_spin_lock_slowpath()
  ...
  pv_queued_spin_steal_lock()
cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0


pv_wait_head_or_lock()

  pv_kick(node->cpu);  -->  pv_wait(&l->locked, _Q_SLOW_VAL);

__pv_queued_spin_unlock()
  cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL

  for () {
trylock_clear_pending();
cpu_relax();
  }

  pv_wait(&l->locked, _Q_SLOW_VAL);


Which is indeed 'bad', but not fatal, note that the later pv_wait() will
not in fact go wait, since l->locked will _not_ be _Q_SLOW_VAL.

The problem is that "this later pv_wait will do nothing as l->locked
is not _Q_SLOW_VAL", so it is not paravirt-friendly. We will go into
the trylock loop again and again until the lock is unlocked.

Agreed, which is 'bad'. But the patch spoke about a missing wakeup,
which is worse, as that would completely inhibit progress.


Sorry, it is my mistake. There is no missing pv_wait().


So if we are kicked by the unlock_slowpath, and the lock is stolen by
someone else, we need to hash its node again and set l->locked to
_Q_SLOW_VAL, then enter pv_wait.

Right, let me go think about this a bit.


Yes, the purpose of this patch is to do exactly that: let the queue
head vCPU sleep until the lock holder releases the lock and wakes it
up.





But I am worried about lock stealing. Could a node in the queue
starve for a long time? I notice that the latency of pv_wait on an
over-committed guest can be bigger than 300us. I have not seen such a
starvation case, but I think it is possible.

I share that worry, which is why we limit the steal attempt to one.
But yes, theoretically it's possible to starve things AFAICT.

We've not come up with a sensible way to completely avoid starvation.


If you guys are worried about the lock constantly getting stolen between
the pv_kick() of the queue head vCPU and the point where it is ready to
take the lock, we can keep the pending bit set across pv_wait() when it
is the 2nd or later time that pv_wait() is called. That will ensure that
no lock stealing can happen and cap the maximum wait time at about 2x
(spin + pv_wait). I will add that patch to my patch series.
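
A rough sketch of that idea, placed inside the pv_wait_head_or_lock() waiting
loop (the set_pending()/clear_pending() placement below is an assumption about
what such a patch could look like, not the patch that was eventually posted):

        for (;; waitcnt++) {
                for (loop = SPIN_THRESHOLD; loop; loop--) {
                        if (trylock_clear_pending(lock))
                                goto gotlock;
                        cpu_relax();
                }

                /*
                 * First time around leave the pending bit clear so a single
                 * steal can still get through; from the second pv_wait()
                 * onwards keep pending set so pv_queued_spin_steal_lock()
                 * backs off while this vCPU sleeps.
                 */
                if (waitcnt)
                        set_pending(lock);
                else
                        clear_pending(lock);

                /* ... hash the lock, set _Q_SLOW_VAL and pv_wait() as before ... */
        }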


Cheers,
Longman


Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-15 Thread Waiman Long

On 07/15/2016 04:47 AM, Peter Zijlstra wrote:

So the reason I never get around to this is because the patch stinks.

It simply doesn't make sense... Remember, the harder you make a reviewer
work the less likely the review will be done.

Present things in clear concise language and draw a picture.

On Tue, May 31, 2016 at 12:53:48PM -0400, Waiman Long wrote:

Currently, calling pv_hash() and setting _Q_SLOW_VAL is only
done once for any pv_node. It is either in pv_kick_node() or in
pv_wait_head_or_lock().

So far so good


Because of lock stealing, a pv_kick'ed node is
not guaranteed to get the lock before the spinning threshold expires
and has to call pv_wait() again. As a result, the new lock holder
won't see _Q_SLOW_VAL and so won't wake up the sleeping vCPU.

*brain melts* what!? pv_kick'ed node reads like pv_kick_node() and that
doesn't make any kind of sense.


Sorry for the confusion. I will clean up the commit log to explain what
I actually mean.



I'm thinking you're trying to say this:


CPU0                        CPU1                        CPU2

__pv_queued_spin_unlock_slowpath()
   ...
   smp_store_release(&l->locked, 0);
__pv_queued_spin_lock_slowpath()
  ...
  pv_queued_spin_steal_lock()
cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0


pv_wait_head_or_lock()

   pv_kick(node->cpu);  -->   pv_wait(&l->locked, _Q_SLOW_VAL);

__pv_queued_spin_unlock()
  cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL

  for () {
trylock_clear_pending();
cpu_relax();
  }

  pv_wait(&l->locked, _Q_SLOW_VAL);



Yes, that is the scenario that I have in mind.


Which is indeed 'bad', but not fatal, note that the later pv_wait() will
not in fact go wait, since l->locked will _not_ be _Q_SLOW_VAL.

Is this indeed the 3 CPU scenario you tried to describe in a scant 4
lines of text, or is there more to it?


You are right. The vCPU isn't actually going to wait; it will get out
and spin again. I will correct the patch title. However, it is still
not good, as it is not doing what it is supposed to do.

Cheers,
Longman




Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-15 Thread Peter Zijlstra
On Fri, Jul 15, 2016 at 12:07:03PM +0200, Peter Zijlstra wrote:
> > So if we are kicked by the unlock_slowpath, and the lock is stolen by
> > someone else, we need to hash its node again and set l->locked to
> > _Q_SLOW_VAL, then enter pv_wait.
> 
> Right, let me go think about this a bit.

Urgh, brain hurt.

So I _think_ the below does for it but I could easily have missed yet
another case.

Waiman's patch has the problem that it can have two pv_hash() calls for
the same lock in progress and I'm thinking that means we can hit the
BUG() in pv_hash() due to that.

If we can't, it still has a problem because it's not telling us either.



--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -20,7 +20,8 @@
  * native_queued_spin_unlock().
  */
 
-#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+#define _Q_HASH_VAL (3U << _Q_LOCKED_OFFSET)
+#define _Q_SLOW_VAL (7U << _Q_LOCKED_OFFSET)
 
 /*
  * Queue Node Adaptive Spinning
@@ -36,14 +37,11 @@
  */
 #define PV_PREV_CHECK_MASK 0xff
 
-/*
- * Queue node uses: vcpu_running & vcpu_halted.
- * Queue head uses: vcpu_running & vcpu_hashed.
- */
 enum vcpu_state {
-   vcpu_running = 0,
-   vcpu_halted,/* Used only in pv_wait_node */
-   vcpu_hashed,/* = pv_hash'ed + vcpu_halted */
+   vcpu_node_running = 0,
+   vcpu_node_halted,
+   vcpu_head_running,
+   vcpu_head_halted,
 };
 
 struct pv_node {
@@ -263,7 +261,7 @@ pv_wait_early(struct pv_node *prev, int
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;
 
-   return READ_ONCE(prev->state) != vcpu_running;
+   return READ_ONCE(prev->state) & 1;
 }
 
 /*
@@ -311,20 +309,19 @@ static void pv_wait_node(struct mcs_spin
 *
 * Matches the cmpxchg() from pv_kick_node().
 */
-   smp_store_mb(pn->state, vcpu_halted);
+   smp_store_mb(pn->state, vcpu_node_halted);
 
-   if (!READ_ONCE(node->locked)) {
-   qstat_inc(qstat_pv_wait_node, true);
-   qstat_inc(qstat_pv_wait_early, wait_early);
-   pv_wait(&pn->state, vcpu_halted);
-   }
+   if (READ_ONCE(node->locked))
+   return;
+
+   qstat_inc(qstat_pv_wait_node, true);
+   qstat_inc(qstat_pv_wait_early, wait_early);
+   pv_wait(&pn->state, vcpu_node_halted);
 
/*
-* If pv_kick_node() changed us to vcpu_hashed, retain that
-* value so that pv_wait_head_or_lock() knows to not also try
-* to hash this lock.
+* If pv_kick_node() advanced us, retain that state.
 */
-   cmpxchg(&pn->state, vcpu_halted, vcpu_running);
+   cmpxchg(&pn->state, vcpu_node_halted, vcpu_node_running);
 
/*
 * If the locked flag is still not set after wakeup, it is a
@@ -362,18 +359,17 @@ static void pv_kick_node(struct qspinloc
 *
 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
 */
-   if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+   if (cmpxchg(&pn->state, vcpu_node_halted, vcpu_head_running) != vcpu_node_halted)
return;
 
/*
-* Put the lock into the hash table and set the _Q_SLOW_VAL.
-*
-* As this is the same vCPU that will check the _Q_SLOW_VAL value and
-* the hash table later on at unlock time, no atomic instruction is
-* needed.
+* See pv_wait_head_or_lock(). We have to hash and force the unlock
+* into the slow path to deliver the actual kick for waking.
 */
-   WRITE_ONCE(l->locked, _Q_SLOW_VAL);
-   (void)pv_hash(lock, pn);
+   if (cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_HASH_VAL) == _Q_LOCKED_VAL) {
+   (void)pv_hash(lock, pn);
+   smp_store_release(&l->locked, _Q_SLOW_VAL);
+   }
 }
 
 /*
@@ -388,28 +384,22 @@ pv_wait_head_or_lock(struct qspinlock *l
 {
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
-   struct qspinlock **lp = NULL;
int waitcnt = 0;
int loop;
 
/*
-* If pv_kick_node() already advanced our state, we don't need to
-* insert ourselves into the hash table anymore.
-*/
-   if (READ_ONCE(pn->state) == vcpu_hashed)
-   lp = (struct qspinlock **)1;
-
-   /*
 * Tracking # of slowpath locking operations
 */
qstat_inc(qstat_pv_lock_slowpath, true);
 
for (;; waitcnt++) {
+   u8 locked;
+
/*
 * Set correct vCPU state to be used by queue node wait-early
 * mechanism.
 */
-   WRITE_ONCE(pn->state, vcpu_running);
+   WRITE_ONCE(pn->state, vcpu_head_running);

Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-15 Thread Peter Zijlstra
On Fri, Jul 15, 2016 at 05:39:46PM +0800, Pan Xinhui wrote:
> >I'm thinking you're trying to say this:
> >
> >
> > CPU0                        CPU1                        CPU2
> >
> >__pv_queued_spin_unlock_slowpath()
> >  ...
> >  smp_store_release(&l->locked, 0);
> > __pv_queued_spin_lock_slowpath()
> >   ...
> >   pv_queued_spin_steal_lock()
> > cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0
> >
> >
> > pv_wait_head_or_lock()
> >
> >  pv_kick(node->cpu);  --> pv_wait(&l->locked, _Q_SLOW_VAL);
> >
> > __pv_queued_spin_unlock()
> >   cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL
> >
> >   for () {
> > trylock_clear_pending();
> > cpu_relax();
> >   }
> >
> >   pv_wait(&l->locked, _Q_SLOW_VAL);
> >
> >
> >Which is indeed 'bad', but not fatal, note that the later pv_wait() will
> >not in fact go wait, since l->locked will _not_ be _Q_SLOW_VAL.

> 
> The problem is that "this later pv_wait will do nothing as l->locked
> is not _Q_SLOW_VAL", so it is not paravirt-friendly. We will go into
> the trylock loop again and again until the lock is unlocked.

Agreed, which is 'bad'. But the patch spoke about a missing wakeup,
which is worse, as that would completely inhibit progress.
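
For reference, the reason that later pv_wait() "does nothing" is the pv_wait()
contract itself: the hook only halts the vCPU while the watched byte still
holds the expected value. A simplified sketch modeled on the KVM
implementation (kvm_wait(), with details such as the NMI and irqs-disabled
handling trimmed):

static void pv_wait_sketch(u8 *ptr, u8 val)
{
        unsigned long flags;

        local_irq_save(flags);

        /*
         * The value already changed (for example a stealer made it
         * _Q_LOCKED_VAL again): do not halt, return to the caller's
         * spin loop instead.
         */
        if (READ_ONCE(*ptr) != val)
                goto out;

        /* Halt until the next interrupt/kick wakes this vCPU. */
        safe_halt();
out:
        local_irq_restore(flags);
}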

> So if we are kicked by the unlock_slowpath, and the lock is stolen by
> someone else, we need to hash its node again and set l->locked to
> _Q_SLOW_VAL, then enter pv_wait.

Right, let me go think about this a bit.

> But I am worried about lock stealing. Could a node in the queue
> starve for a long time? I notice that the latency of pv_wait on an
> over-committed guest can be bigger than 300us. I have not seen such a
> starvation case, but I think it is possible.

I share that worry, which is why we limit the steal attempt to one.
But yes, theoretically it's possible to starve things AFAICT.

We've not come up with a sensible way to completely avoid starvation.


Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-15 Thread Pan Xinhui



On 2016-07-15 16:47, Peter Zijlstra wrote:


So the reason I never get around to this is because the patch stinks.

It simply doesn't make sense... Remember, the harder you make a reviewer
work the less likely the review will be done.

Present things in clear concise language and draw a picture.

On Tue, May 31, 2016 at 12:53:48PM -0400, Waiman Long wrote:

Currently, calling pv_hash() and setting _Q_SLOW_VAL is only
done once for any pv_node. It is either in pv_kick_node() or in
pv_wait_head_or_lock().


So far so good


Because of lock stealing, a pv_kick'ed node is
not guaranteed to get the lock before the spinning threshold expires
and has to call pv_wait() again. As a result, the new lock holder
won't see _Q_SLOW_VAL and so won't wake up the sleeping vCPU.



Waiman, it might be: "As a result, the head node will not really enter the wait
state because ->locked is not _Q_SLOW_VAL; the pv_wait will return directly."


*brain melts* what!? pv_kick'ed node reads like pv_kick_node() and that
doesn't make any kind of sense.

I'm thinking you're trying to say this:


CPU0                        CPU1                        CPU2

__pv_queued_spin_unlock_slowpath()
  ...
  smp_store_release(&l->locked, 0);
__pv_queued_spin_lock_slowpath()
  ...
  pv_queued_spin_steal_lock()
cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0


pv_wait_head_or_lock()

  pv_kick(node->cpu);  -->  pv_wait(&l->locked, _Q_SLOW_VAL);

__pv_queued_spin_unlock()
  cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL

  for () {
trylock_clear_pending();
cpu_relax();
  }

  pv_wait(&l->locked, _Q_SLOW_VAL);


Which is indeed 'bad', but not fatal, note that the later pv_wait() will
not in fact go wait, since l->locked will _not_ be _Q_SLOW_VAL.


Hi Peter,

The problem is that "this later pv_wait will do nothing as l->locked is not
_Q_SLOW_VAL", so it is not paravirt-friendly. We will go into the trylock
loop again and again until the lock is unlocked.

So if we are kicked by the unlock_slowpath, and the lock is stolen by someone
else, we need to hash its node again and set l->locked to _Q_SLOW_VAL, then
enter pv_wait.

But I am worried about lock stealing. Could a node in the queue starve for a
long time? I notice that the latency of pv_wait on an over-committed guest can
be bigger than 300us. I have not seen such a starvation case, but I think it
is possible.

thanks
xinhui


Is this indeed the 3 CPU scenario you tried to describe in a scant 4
lines of text, or is there more to it?





Re: [PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-07-15 Thread Peter Zijlstra

So the reason I never get around to this is because the patch stinks.

It simply doesn't make sense... Remember, the harder you make a reviewer
work the less likely the review will be done.

Present things in clear concise language and draw a picture.

On Tue, May 31, 2016 at 12:53:48PM -0400, Waiman Long wrote:
> Currently, calling pv_hash() and setting _Q_SLOW_VAL is only
> done once for any pv_node. It is either in pv_kick_node() or in
> pv_wait_head_or_lock().

So far so good

> Because of lock stealing, a pv_kick'ed node is
> not guaranteed to get the lock before the spinning threshold expires
> and has to call pv_wait() again. As a result, the new lock holder
> won't see _Q_SLOW_VAL and so won't wake up the sleeping vCPU.

*brain melts* what!? pv_kick'ed node reads like pv_kick_node() and that
doesn't make any kind of sense.

I'm thinking you're trying to say this:


CPU0                        CPU1                        CPU2

__pv_queued_spin_unlock_slowpath()
  ...
  smp_store_release(&l->locked, 0);
                            __pv_queued_spin_lock_slowpath()
                              ...
                              pv_queued_spin_steal_lock()
                                cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0

                                                        pv_wait_head_or_lock()
  pv_kick(node->cpu);  -->                                pv_wait(&l->locked, _Q_SLOW_VAL);

                            __pv_queued_spin_unlock()
                              cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL

                                                          for () {
                                                            trylock_clear_pending();
                                                            cpu_relax();
                                                          }

                                                          pv_wait(&l->locked, _Q_SLOW_VAL);


Which is indeed 'bad', but not fatal, note that the later pv_wait() will
not in fact go wait, since l->locked will _not_ be _Q_SLOW_VAL.

Is this indeed the 3 CPU scenario you tried to describe in a scant 4
lines of text, or is there more to it?


[PATCH v2 2/5] locking/pvqspinlock: Fix missed PV wakeup problem

2016-05-31 Thread Waiman Long
Currently, calling pv_hash() and setting _Q_SLOW_VAL is only
done once for any pv_node. It is either in pv_kick_node() or in
pv_wait_head_or_lock(). Because of lock stealing, a pv_kick'ed node is
not guaranteed to get the lock before the spinning threshold expires
and has to call pv_wait() again. As a result, the new lock holder
won't see _Q_SLOW_VAL and so won't wake up the sleeping vCPU.

This patch fixes this missed PV wakeup problem by allowing multiple
_Q_SLOW_VAL settings within pv_wait_head_or_lock() and matching each
successful setting of _Q_SLOW_VAL to a pv_hash() call.

Reported-by: Pan Xinhui 
Signed-off-by: Waiman Long 
---
 kernel/locking/qspinlock_paravirt.h |   48 ++
 1 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 75cc207..3df975d 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -366,12 +366,16 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
/*
 * Put the lock into the hash table and set the _Q_SLOW_VAL.
 *
-* As this is the same vCPU that will check the _Q_SLOW_VAL value and
-* the hash table later on at unlock time, no atomic instruction is
-* needed.
+* It is very unlikely that this will race with the _Q_SLOW_VAL setting
+* in pv_wait_head_or_lock(). However, we use cmpxchg() here to be
+* sure that we won't do a double pv_hash().
+*
+* As it is the lock holder, it won't race with
+* __pv_queued_spin_unlock().
 */
-   WRITE_ONCE(l->locked, _Q_SLOW_VAL);
-   (void)pv_hash(lock, pn);
+   if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)
+   == _Q_LOCKED_VAL))
+   pv_hash(lock, pn);
 }
 
 /*
@@ -386,18 +390,10 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 {
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
-   struct qspinlock **lp = NULL;
int waitcnt = 0;
int loop;
 
/*
-* If pv_kick_node() already advanced our state, we don't need to
-* insert ourselves into the hash table anymore.
-*/
-   if (READ_ONCE(pn->state) == vcpu_hashed)
-   lp = (struct qspinlock **)1;
-
-   /*
 * Tracking # of slowpath locking operations
 */
qstat_inc(qstat_pv_lock_slowpath, true);
@@ -419,11 +415,19 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
goto gotlock;
cpu_relax();
}
-   clear_pending(lock);
 
+   /*
+* Make sure the lock value check below is executed after
+* all the previous loads.
+*/
+   smp_rmb();
 
-   if (!lp) { /* ONCE */
-   lp = pv_hash(lock, pn);
+   /*
+* Set _Q_SLOW_VAL and hash the PV node, if necessary.
+*/
+   if (READ_ONCE(l->locked) != _Q_SLOW_VAL) {
+   struct qspinlock **lp = pv_hash(lock, pn);
+   u8 locked;
 
/*
 * We must hash before setting _Q_SLOW_VAL, such that
@@ -436,7 +440,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 *
 * Matches the smp_rmb() in __pv_queued_spin_unlock().
 */
-   if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
+   locked = xchg(&l->locked, _Q_SLOW_VAL);
+   if (locked == 0) {
/*
 * The lock was free and now we own the lock.
 * Change the lock value back to _Q_LOCKED_VAL
@@ -444,9 +449,18 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 */
WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
+   clear_pending(lock);
goto gotlock;
+   } else if (unlikely(locked == _Q_SLOW_VAL)) {
+   /*
+* Racing with pv_kick_node(), need to undo
+* the pv_hash().
+*/
+   WRITE_ONCE(*lp, NULL);
}
}
+   clear_pending(lock);/* Enable lock stealing */
+
WRITE_ONCE(pn->state, vcpu_halted);
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
-- 
1.7.1