[PATCH v5 2/6] locking/pvqspinlock: Add pending bit support

2015-08-07  Waiman Long
Like the native qspinlock, using the pending bit to acquire the lock
when it is lightly loaded is faster than going through the PV queuing
process, which is even slower than the native queuing process. It also
avoids loading two additional cachelines (the MCS and PV nodes).

This patch adds pending bit support to the PV qspinlock. The pending
bit code uses a smaller spin threshold (1<<10). It falls back to the
queuing method if it cannot acquire the lock within that limit.
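
As an illustration only (not the patch code itself), the sketch below
shows the general shape of such a bounded pending-bit fast path using
portable C11 atomics: take the lock if it is free, otherwise set the
pending bit and spin a limited number of times for the pending->locked
hand-over, clearing the pending bit on timeout so the caller can fall
back to queuing. The helper name, the lock-word constants and the
threshold value are assumptions made for this sketch.

#include <stdatomic.h>
#include <stdbool.h>

#define LOCKED_VAL   0x001U        /* lock is held                    */
#define PENDING_VAL  0x100U        /* one waiter spinning, not queued */
#define SPIN_LIMIT   (1U << 10)    /* bounded spin before queuing     */

/* Returns true if the lock was taken; false means "go queue". */
bool pending_trylock(atomic_uint *lock)
{
        unsigned int val = atomic_load(lock);
        unsigned int old, loop = SPIN_LIMIT;

        /* Pending bit or queue tail already set? Queue right away. */
        if (val & ~LOCKED_VAL)
                return false;

        /* Lock free: try to take it. Lock held: advertise via pending. */
        old = val;
        if (!atomic_compare_exchange_strong(lock, &old,
                        val ? (val | PENDING_VAL) : LOCKED_VAL))
                return false;
        if (!val)
                return true;            /* trylock succeeded */

        /* Wait a bounded time for the holder to release the lock. */
        while (loop--) {
                val = atomic_load(lock);
                if (!(val & LOCKED_VAL) &&
                    atomic_compare_exchange_strong(lock, &val, LOCKED_VAL))
                        return true;    /* pending -> locked hand-over */
        }

        /* Threshold exceeded: clear pending so the caller can queue. */
        atomic_fetch_and(lock, ~PENDING_VAL);
        return false;
}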

On a VM with 32 vCPUs on a 32-core Westmere-EX box, the kernel
build times on 4.2-rc1 based kernels were:

  Kernel      Build Time  Sys Time
  ------      ----------  --------
  w/o patch   3m28.5s     28m17.5s
  with patch  3m19.3s     23m55.7s

Using a locking microbenchmark on the same system, the locking
rates (in kops/s) were:

  Threads           Rate w/o patch  Rate with patch
  -------           --------------  ---------------
  2 (same socket)   6,515,265       7,077,476
  2 (diff sockets)  2,967,145       4,353,851

Signed-off-by: Waiman Long 
---
 kernel/locking/qspinlock.c  |   27 -
 kernel/locking/qspinlock_paravirt.h |   73 +++
 2 files changed, 99 insertions(+), 1 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 337c881..94fdd27 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -162,6 +162,17 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
 }
 
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+   struct __qspinlock *l = (void *)lock;
+
+   WRITE_ONCE(l->pending, 0);
+}
+
 /*
  * xchg_tail - Put in the new queue tail code word & retrieve previous one
  * @lock : Pointer to queued spinlock structure
@@ -193,6 +204,15 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
 }
 
 /**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+   atomic_add(-_Q_PENDING_VAL, &lock->val);
+}
+
+/**
  * xchg_tail - Put in the new queue tail code word & retrieve previous one
  * @lock : Pointer to queued spinlock structure
  * @tail : The new queue tail code word
@@ -245,6 +265,7 @@ static __always_inline void __pv_wait_head(struct qspinlock *lock,
   struct mcs_spinlock *node) { }
 
 #define pv_enabled()   false
+#define pv_pending_lock(l, v)  false
 
 #define pv_init_node   __pv_init_node
 #define pv_wait_node   __pv_wait_node
@@ -286,8 +307,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 
BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
 
-   if (pv_enabled())
+   if (pv_enabled()) {
+   if (pv_pending_lock(lock, val))
+   return; /* Got the lock via pending bit */
goto queue;
+   }
 
if (virt_queued_spin_lock(lock))
return;
@@ -463,6 +487,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 #undef pv_wait_node
 #undef pv_kick_node
 #undef pv_wait_head
+#undef pv_pending_lock
 
 #undef  queued_spin_lock_slowpath
 #define queued_spin_lock_slowpath  __pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 6eafb9e..94f9adf 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,6 +23,14 @@
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
 /*
+ * Queued Spinlock Spin Threshold
+ *
+ * The vCPU will spin a relatively short time in pending mode before falling
+ * back to queuing.
+ */
+#define PENDING_SPIN_THRESHOLD (SPIN_THRESHOLD >> 5)
+
+/*
  * Queue node uses: vcpu_running & vcpu_halted.
  * Queue head uses: vcpu_running & vcpu_hashed.
  */
@@ -157,6 +165,71 @@ static void pv_init_node(struct mcs_spinlock *node)
 }
 
 /*
+ * Try to acquire the lock and wait using the pending bit within a certain
+ * threshold as specified by PENDING_SPIN_THRESHOLD. If the threshold has
+ * been exceeded without getting the lock, we fall back to queuing.
+ */
+static int pv_pending_lock(struct qspinlock *lock, u32 val)
+{
+   int loop = PENDING_SPIN_THRESHOLD;
+   u32 new, old;
+
+   /*
+* wait for in-progress pending->locked hand-overs
+*/
+   while ((val == _Q_PENDING_VAL) && loop) {
+   cpu_relax();
+   val = atomic_read(&lock->val);
+   loop--;
+   }
+
+   /*
+* trylock || pending
+*
+* This loop does a trylock if lock is free or sets the pending bit
+* if lock is taken until the cmpxchg succeeds. As it is expected
+* that 

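As a hedged illustration of how a trylock-or-pending fast path of the
kind described in the comment above combines with the queuing fallback
(mirroring the queued_spin_lock_slowpath hunk earlier, but not the
patch's own code): pending_trylock() is the helper sketched after the
commit message, and queue_slowpath_stub() is a trivial stand-in for the
real MCS/PV queuing path.

#include <stdatomic.h>
#include <stdbool.h>

bool pending_trylock(atomic_uint *lock);        /* sketched earlier */

/*
 * Placeholder slow path: spin until the whole lock word is clear, then
 * take the lock. The real slow path queues on an MCS/PV node instead.
 */
static void queue_slowpath_stub(atomic_uint *lock)
{
        unsigned int expect;

        for (;;) {
                expect = 0;
                if (atomic_compare_exchange_weak(lock, &expect, 0x001U))
                        return;
        }
}

void example_lock(atomic_uint *lock)
{
        if (pending_trylock(lock))
                return;                 /* got the lock via the pending bit */

        queue_slowpath_stub(lock);      /* spin threshold exceeded: queue */
}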