For a virtual guest with the qspinlock patch, a simple unfair byte lock
will be used if PV spinlock is not configured in or the hypervisor
isn't either KVM or Xen. The byte lock works fine with small guest
of just a few vCPUs. On a much larger guest, however, byte lock can
have serious performance problem.
There are 2 major drawbacks for the unfair byte lock:
1) The constant reading and occasionally writing to the lock word can
put a lot of cacheline contention traffic on the affected
cacheline.
2) Lock starvation is a real possibility especially if the number
of vCPUs is large.
This patch introduces a queue based unfair lock where all the vCPUs
on the queue can opportunistically steal the lock, but the frequency
of doing so decreases exponentially the further it is away from the
queue head. This can greatly reduce cacheline contention problem as
well as encouraging a more FIFO like order of getting the lock.
This patch has no impact on native qspinlock performance at all.
To measure the performance benefit of such a scheme, a linux kernel
build workload (make -j n) was run on a virtual KVM guest with
different configurations on an 8-socket Westmere-EX server with 4 active
cores/socket (8x4). n is the number of vCPUs in the guest. The
test configurations were:
1) 32 NUMA-pinned vCPUs (8x4)
2) 32 vCPUs (no pinning or NUMA)
3) 60 vCPUs (overcommitted)
The kernel build times for different 4.0-rc6 based kernels were:
Kernel              VM1        VM2        VM3
------              ---        ---        ---
PV ticket lock      3m49.7s    11m45.6s   15m59.3s
PV qspinlock        3m50.4s    5m51.6s    15m33.6s
unfair byte lock    3m50.6s    61m01.1s   122m07.7s
unfair qspinlock    3m48.3s    5m03.1s    4m31.1s
For VM1, all the locks give essentially the same performance. In both
VM2 and VM3, the performance of byte lock was terrible whereas the
unfair qspinlock was the fastest of all. For this particular test, the
unfair qspinlock can be more than 10X faster than a simple byte lock.
VM2 also illustrated the performance benefit of qspinlock versus
ticket spinlock which got reduced in VM3 due to the overhead of
constant vCPUs halting and kicking.
Signed-off-by: Waiman Long <waiman.long@hp.com>
---
 arch/x86/include/asm/qspinlock.h  |  15 +--
 kernel/locking/qspinlock.c        |  94 +++++++--
 kernel/locking/qspinlock_unfair.h | 231 ++++++++++++++++++++++++++
 3 files changed, 319 insertions(+), 21 deletions(-)
create mode 100644 kernel/locking/qspinlock_unfair.h
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index c8290db..8113685 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -39,17 +39,16 @@ static inline void queue_spin_unlock(struct qspinlock *lock)
}
#endif
-#define virt_queue_spin_lock virt_queue_spin_lock
+#ifndef static_cpu_has_hypervisor
+#define static_cpu_has_hypervisor static_cpu_has(X86_FEATURE_HYPERVISOR)
+#endif
-static inline bool virt_queue_spin_lock(struct qspinlock *lock)
+#define queue_spin_trylock_unfair queue_spin_trylock_unfair
+static inline bool queue_spin_trylock_unfair(struct qspinlock *lock)
{
- if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
- return false;
-
- while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
- cpu_relax();
+ u8 *l = (u8 *)lock;
- return true;
+ return !READ_ONCE(*l) && (xchg(l, _Q_LOCKED_VAL) == 0);
}
#include <asm-generic/qspinlock.h>
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b9ba83b..5fda6d5 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -19,7 +19,11 @@
* Peter Zijlstra <peterz@infradead.org>
*/
-#ifndef _GEN_PV_LOCK_SLOWPATH
+#if defined(_GEN_PV_LOCK_SLOWPATH) || defined(_GEN_UNFAIR_LOCK_SLOWPATH)
+#define _GEN_LOCK_SLOWPATH
+#endif
+
+#ifndef _GEN_LOCK_SLOWPATH
#include <linux/smp.h>
#include <linux/bug.h>
@@ -68,12 +72,6 @@
#include mcs_spinlock.h
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-#define MAX_NODES 8
-#else
-#define MAX_NODES 4
-#endif
-
/*
* Per-CPU queue node structures; we can never have more than 4 nested
* contexts: task, softirq, hardirq, nmi.
@@ -81,7 +79,9 @@
* Exactly fits one 64-byte cacheline on a 64-bit architecture.
*
* PV doubles the storage and uses the second cacheline for PV state.
+ * Unfair lock (mutually exclusive to PV) also uses the second cacheline.
*/
+#define MAX_NODES 8
static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
/*
@@ -234,7 +234,7 @@ static __always_inline void set_locked(struct qspinlock *lock)
/*
* Generate the native code for queue_spin_unlock_slowpath(); provide NOPs for
- * all the PV callbacks.
+ * all the PV and unfair callbacks.
*/
static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
@@ -244,19 +244,36 @@ static __always_inline