[PATCH v5 5/6] locking/pvqspinlock: Allow vCPUs kick-ahead

2015-08-07, Waiman Long
Frequent CPU halting (vmexit) and CPU kicking (vmenter) lengthen the
critical section and block forward progress.  This patch implements
a kick-ahead mechanism where the unlocker kicks not only the halted
queue head vCPU but also up to four additional halted vCPUs next to
it in the queue.  The kicks are issued after exiting the critical
section to improve parallelism, as in the sketch below.
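In outline, the unlock-time behavior looks like the following sketch.
It is a simplified illustration, not the literal patch body:
pv_unlock_kick_sketch is a hypothetical name, and pv_get_kick_node
stands for the "next kickable node" helper described near the end of
the diff below.

	/*
	 * Simplified sketch of kick-ahead at unlock time: wake the
	 * halted queue head, then walk the queue and wake up to
	 * pv_kick_ahead additional halted vCPUs.
	 */
	static void pv_unlock_kick_sketch(struct pv_node *node)
	{
		struct pv_node *next = node;
		int i;

		pv_kick(node->cpu);		/* wake the halted queue head */
		for (i = 0; i < pv_kick_ahead; i++) {
			next = pv_get_kick_node(next);	/* NULL if none halted */
			if (!next)
				break;
			pv_kick(next->cpu);	/* kick-ahead wakeup */
		}
	}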

The amount of kick-ahead allowed depends on the number of vCPUs in
the VM guest. Currently, 1 vCPU kick-ahead is allowed per 4 available
vCPUs, up to a maximum of PV_KICK_AHEAD_MAX (4). There are diminishing
returns in raising the maximum. The current value of 4 is a compromise
that gets a nice performance boost without penalizing the one vCPU
doing all the kicking too much; a worked example of the scaling rule
follows.
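Concretely, the scaling rule is the single min() that appears in the
hunk below; the values in the comments are worked examples:

	pv_kick_ahead = min(ncpus / 4, PV_KICK_AHEAD_MAX);
	/* ncpus = 48:  min(12, 4) = 4 kick-aheads          */
	/* ncpus = 12:  min(3, 4)  = 3 kick-aheads          */
	/* ncpus = 3:   min(0, 4)  = 0, kick-ahead disabled */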

Linux kernel builds were run in a KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems were configured to have 32 physical
CPUs. The kernel build times before and after the patch were:

                 Westmere                Haswell
  Patch          32 vCPUs   48 vCPUs    32 vCPUs   48 vCPUs
  -----          --------   --------    --------   --------
  Before patch   3m21.9s    11m20.6s    2m08.6s    17m12.8s
  After patch    3m03.2s     9m21.1s    2m08.9s    16m14.8s

This improves performance quite substantially on Westmere, but not
so much on Haswell.

Signed-off-by: Waiman Long 
---
 kernel/locking/qspinlock_paravirt.h |   71 +-
 1 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 7c9d6ed..9996609 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -57,6 +57,7 @@ enum pv_qlock_stat {
pvstat_wait_again,
pvstat_kick_wait,
pvstat_kick_unlock,
+   pvstat_kick_ahead,
pvstat_pend_lock,
pvstat_pend_fail,
pvstat_spurious,
@@ -77,6 +78,7 @@ static const char * const stat_fsnames[pvstat_num] = {
[pvstat_wait_again]  = "wait_again_count",
[pvstat_kick_wait]   = "kick_wait_count",
[pvstat_kick_unlock] = "kick_unlock_count",
+   [pvstat_kick_ahead]  = "kick_ahead_count",
[pvstat_pend_lock]   = "pending_lock_count",
[pvstat_pend_fail]   = "pending_fail_count",
[pvstat_spurious]= "spurious_wakeup",
@@ -89,7 +91,7 @@ static atomic_t pvstats[pvstat_num];
  * pv_kick_latencies = sum of all pv_kick latencies in ns
  * pv_wake_latencies = sum of all wakeup latencies in ns
  *
- * Avg kick latency   = pv_kick_latencies/kick_unlock_count
- * Avg kick latency   = pv_kick_latencies/kick_unlock_count
+ * Avg kick latency   = pv_kick_latencies/(kick_unlock_count + kick_ahead_count)
  * Avg wake latency   = pv_wake_latencies/kick_wait_count
  * Avg # of hops/hash = hash_hops_count/kick_unlock_count
  */
@@ -221,6 +223,18 @@ static struct pv_hash_entry *pv_lock_hash;
 static unsigned int pv_lock_hash_bits __read_mostly;
 
 /*
+ * Allow kick-ahead of vCPUs at unlock time
+ *
+ * The pv_kick_ahead value is set by a simple formula that 1 vCPU kick-ahead
+ * is allowed per 4 vCPUs available up to a maximum of PV_KICK_AHEAD_MAX.
+ * There are diminishing returns in increasing PV_KICK_AHEAD_MAX. The current
+ * value of 4 is a good compromise that gives a good performance boost without
+ * penalizing the vCPU that is doing the kicking by too much.
+ */
+#define PV_KICK_AHEAD_MAX  4
+static int pv_kick_ahead __read_mostly;
+
+/*
  * Allocate memory for the PV qspinlock hash buckets
  *
  * This function should be called from the paravirt spinlock initialization
@@ -228,7 +242,8 @@ static unsigned int pv_lock_hash_bits __read_mostly;
  */
 void __init __pv_init_lock_hash(void)
 {
-   int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+   int ncpus = num_possible_cpus();
+   int pv_hash_size = ALIGN(4 * ncpus, PV_HE_PER_LINE);
 
if (pv_hash_size < PV_HE_MIN)
pv_hash_size = PV_HE_MIN;
@@ -242,6 +257,13 @@ void __init __pv_init_lock_hash(void)
   pv_hash_size, 0, HASH_EARLY,
   &pv_lock_hash_bits, NULL,
   pv_hash_size, pv_hash_size);
+   /*
+    * Enable the unlock kick ahead mode according to the number of
+    * vCPUs available.
+    */
+   pv_kick_ahead = min(ncpus/4, PV_KICK_AHEAD_MAX);
+   if (pv_kick_ahead)
+           pr_info("PV unlock kick ahead max count = %d\n", pv_kick_ahead);
 }
 
 #define for_each_hash_entry(he, offset, hash)				\
@@ -551,6 +573,26 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 }
 
 /*
+ * Helper to get the address of the next kickable node
+ *
+ * The node has to be in the halted state. The state will then be
+ * transitioned to the running state. If no kickable node is found,
+ * NULL will be returned.
+ */
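A minimal sketch of such a helper, consistent with the comment above,
might look like this. The function name, the vcpu_halted/vcpu_running
states, and the xchg()-based transition are assumptions based on the
rest of this file, not taken from the hunk itself:

	static struct pv_node *pv_get_kick_node(struct pv_node *node)
	{
		/* Fetch the next queue node, if any */
		struct pv_node *next = (struct pv_node *)READ_ONCE(node->mcs.next);

		if (!next)
			return NULL;

		/*
		 * Only a halted vCPU is kickable; atomically move it to
		 * the running state so that it is kicked only once.
		 */
		if ((READ_ONCE(next->state) != vcpu_halted) ||
		    (xchg(&next->state, vcpu_running) != vcpu_halted))
			return NULL;	/* not kickable */

		return next;
	}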
