This patch enables the accumulation of kicking- and waiting-related
PV qspinlock statistics when the new QUEUED_LOCK_STAT configuration
option is selected. It also enables the collection of data from which
the kicking and wakeup latencies, which depend heavily on the CPUs
being used, can be computed.

The statistics counters are per-cpu variables to minimize the
performance overhead of updating them. The counters are exported
via sysfs under the /sys/kernel/qlockstat directory. When the
corresponding sysfs files are read, the per-cpu values are summed
and the required results computed.
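
For illustration, a counter can be read from userspace with a minimal
program like the sketch below (illustrative only; it assumes a kernel
built with QUEUED_LOCK_STAT and must be run as root, since the stat
files are created with mode 0400):

/* Minimal sketch: read one qlockstat counter from sysfs. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/qlockstat/pv_kick_unlock";
	unsigned long long val;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%llu", &val) == 1)
		printf("%s = %llu\n", path, val);
	fclose(f);
	return 0;
}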

The measured latencies for different CPUs are:

        CPU             Wakeup          Kicking
        ---             ------          -------
        Haswell-EX      63.6us           7.4us
        Westmere-EX     67.6us           9.3us

The measured latencies varied a bit from run to run. The wakeup
latency is much higher than the kicking latency.

A sample of the statistics counts after system bootup (with vCPU
overcommit) is shown below. The pv_hash_hops and pv_latency_* values
are averages computed when the files are read; the other entries are
raw event counts.

pv_hash_hops=1.00
pv_kick_unlock=1148
pv_kick_wake=1146
pv_latency_kick=11040
pv_latency_wake=194840
pv_spurious_wakeup=7
pv_wait_again=4
pv_wait_head=23
pv_wait_node=1129
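
Before collecting such a sample for a particular workload, the
counters can be cleared by writing to the reset_counters file. A
minimal sketch (again illustrative only; needs root, as the file is
created with mode 0200):

/* Minimal sketch: clear all qlockstat counters. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/qlockstat/reset_counters", "w");

	if (!f) {
		perror("reset_counters");
		return 1;
	}
	fputs("1\n", f);	/* the written value is ignored; any write resets */
	fclose(f);
	return 0;
}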

Signed-off-by: Waiman Long <waiman.l...@hpe.com>
---
 arch/x86/Kconfig                    |    8 +
 kernel/locking/qspinlock_paravirt.h |   32 ++++-
 kernel/locking/qspinlock_stat.h     |  291 +++++++++++++++++++++++++++++++++++
 3 files changed, 326 insertions(+), 5 deletions(-)
 create mode 100644 kernel/locking/qspinlock_stat.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4a9b9a9..403bfea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -688,6 +688,14 @@ config PARAVIRT_SPINLOCKS
 
          If you are unsure how to answer this question, answer Y.
 
+config QUEUED_LOCK_STAT
+       bool "Paravirt queued spinlock statistics"
+       depends on PARAVIRT_SPINLOCKS && SYSFS && QUEUED_SPINLOCKS
+       ---help---
+         Enable the collection of statistical data on the slowpath
+         behavior of paravirtualized queued spinlocks and report
+         them on sysfs.
+
 source "arch/x86/xen/Kconfig"
 
 config KVM_GUEST
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 4bd323d..aaeeefb 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -41,6 +41,11 @@ struct pv_node {
 };
 
 /*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
  * Lock and MCS node addresses hash table for fast lookup
  *
  * Hashing is done on a per-cacheline basis to minimize the need to access
@@ -100,10 +105,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 {
        unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
        struct pv_hash_entry *he;
+       int hopcnt = 0;
 
        for_each_hash_entry(he, offset, hash) {
+               hopcnt++;
                if (!cmpxchg(&he->lock, NULL, lock)) {
                        WRITE_ONCE(he->node, node);
+                       qstat_hop(hopcnt);
                        return &he->lock;
                }
        }
@@ -164,9 +172,11 @@ static void pv_init_node(struct mcs_spinlock *node)
 static void pv_wait_node(struct mcs_spinlock *node)
 {
        struct pv_node *pn = (struct pv_node *)node;
+       int waitcnt = 0;
        int loop;
 
-       for (;;) {
+       /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
+       for (;; waitcnt++) {
                for (loop = SPIN_THRESHOLD; loop; loop--) {
                        if (READ_ONCE(node->locked))
                                return;
@@ -184,12 +194,16 @@ static void pv_wait_node(struct mcs_spinlock *node)
                 */
                smp_store_mb(pn->state, vcpu_halted);
 
-               if (!READ_ONCE(node->locked))
+               if (!READ_ONCE(node->locked)) {
+                       qstat_inc(qstat_pv_wait_node, true);
+                       qstat_inc(qstat_pv_wait_again, waitcnt);
                        pv_wait(&pn->state, vcpu_halted);
+               }
 
                /*
-                * If pv_kick_node() changed us to vcpu_hashed, retain that value
-                * so that pv_wait_head() knows to not also try to hash this lock.
+                * If pv_kick_node() changed us to vcpu_hashed, retain that
+                * value so that pv_wait_head() knows to not also try to hash
+                * this lock.
                 */
                cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
@@ -200,6 +214,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
                 * So it is better to spin for a while in the hope that the
                 * MCS lock will be released soon.
                 */
+               qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
        }
 
        /*
@@ -250,6 +265,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
        struct pv_node *pn = (struct pv_node *)node;
        struct __qspinlock *l = (void *)lock;
        struct qspinlock **lp = NULL;
+       int waitcnt = 0;
        int loop;
 
        /*
@@ -259,7 +275,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
        if (READ_ONCE(pn->state) == vcpu_hashed)
                lp = (struct qspinlock **)1;
 
-       for (;;) {
+       for (;; waitcnt++) {
                for (loop = SPIN_THRESHOLD; loop; loop--) {
                        if (!READ_ONCE(l->locked))
                                return;
@@ -290,14 +306,19 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
                                return;
                        }
                }
+               qstat_inc(qstat_pv_wait_head, true);
+               qstat_inc(qstat_pv_wait_again, waitcnt);
                pv_wait(&l->locked, _Q_SLOW_VAL);
 
+               if (!READ_ONCE(l->locked))
+                       return;
                /*
                 * The unlocker should have freed the lock before kicking the
                 * CPU. So if the lock is still not free, it is a spurious
                 * wakeup and so the vCPU should wait again after spinning for
                 * a while.
                 */
+               qstat_inc(qstat_pv_spurious_wakeup, true);
        }
 
        /*
@@ -352,6 +373,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
         * vCPU is harmless other than the additional latency in completing
         * the unlock.
         */
+       qstat_inc(qstat_pv_kick_unlock, true);
        pv_kick(node->cpu);
 }
 
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644
index 0000000..16b84b2
--- /dev/null
+++ b/kernel/locking/qspinlock_stat.h
@@ -0,0 +1,291 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.l...@hpe.com>
+ */
+
+/*
+ * When queued spinlock statistics are enabled, the following sysfs files
+ * will be created to hold the statistics counters:
+ *
+ * /sys/kernel/qlockstat/
+ *   pv_hash_hops      - average # of hops per hashing operation
+ *   pv_kick_unlock    - # of vCPU kicks issued at unlock time
+ *   pv_kick_wake      - # of vCPU kicks used for computing pv_latency_wake
+ *   pv_latency_kick   - average latency (ns) of vCPU kick operation
+ *   pv_latency_wake   - average latency (ns) from vCPU kick to wakeup
+ *   pv_spurious_wakeup        - # of spurious wakeups
+ *   pv_wait_again     - # of vCPU wait's that happened after a vCPU kick
+ *   pv_wait_head      - # of vCPU wait's at the queue head
+ *   pv_wait_node      - # of vCPU wait's at a non-head queue node
+ *
+ * Writing to the "reset_counters" file will reset all the above counter
+ * values.
+ *
+ * These statistics counters are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding sysfs files are read. This
+ * minimizes added overhead making the counters usable even in a production
+ * environment.
+ *
+ * There may be a slight difference between pv_kick_wake and pv_kick_unlock.
+ */
+enum qlock_stats {
+       qstat_pv_hash_hops,
+       qstat_pv_kick_unlock,
+       qstat_pv_kick_wake,
+       qstat_pv_latency_kick,
+       qstat_pv_latency_wake,
+       qstat_pv_spurious_wakeup,
+       qstat_pv_wait_again,
+       qstat_pv_wait_head,
+       qstat_pv_wait_node,
+       qstat_num,      /* Total number of statistics counters */
+       qstat_reset_cnts = qstat_num,
+};
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statistics
+ */
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/sched.h>
+
+static const char * const qstat_names[qstat_num + 1] = {
+       [qstat_pv_hash_hops]       = "pv_hash_hops",
+       [qstat_pv_kick_unlock]     = "pv_kick_unlock",
+       [qstat_pv_kick_wake]       = "pv_kick_wake",
+       [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
+       [qstat_pv_latency_kick]    = "pv_latency_kick",
+       [qstat_pv_latency_wake]    = "pv_latency_wake",
+       [qstat_pv_wait_again]      = "pv_wait_again",
+       [qstat_pv_wait_head]       = "pv_wait_head",
+       [qstat_pv_wait_node]       = "pv_wait_node",
+       [qstat_reset_cnts]         = "reset_counters",
+};
+
+/*
+ * Per-cpu counters
+ */
+static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Sysfs data structures
+ */
+static struct kobj_attribute qstat_kobj_attrs[qstat_num + 1];
+static struct attribute *attrs[qstat_num + 2];
+static struct kobject *qstat_kobj;
+static struct attribute_group attr_group = {
+       .attrs = attrs,
+};
+
+/*
+ * Function to show the qlock statistics count
+ */
+static ssize_t
+qstat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+       int cpu, idx;
+       u64 stat = 0;
+
+       /*
+        * Compute the index of the kobj_attribute in the array and use
+        * it as the index into the per-cpu qstats array.
+        */
+       idx = attr - qstat_kobj_attrs;
+
+       for_each_online_cpu(cpu)
+               stat += per_cpu(qstats[idx], cpu);
+       return sprintf(buf, "%llu\n", stat);
+}
+
+/*
+ * Return the average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ */
+static ssize_t
+kick_latency_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+       int cpu;
+       u64 latencies = 0, kicks = 0;
+
+       for_each_online_cpu(cpu) {
+               kicks     += per_cpu(qstats[qstat_pv_kick_unlock],  cpu);
+               latencies += per_cpu(qstats[qstat_pv_latency_kick], cpu);
+       }
+
+       /* Rounded to the nearest ns */
+       return sprintf(buf, "%llu\n", kicks ? (latencies + kicks/2)/kicks : 0);
+}
+
+/*
+ * Return the average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ */
+static ssize_t
+wake_latency_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+       int cpu;
+       u64 latencies = 0, kicks = 0;
+
+       for_each_online_cpu(cpu) {
+               kicks     += per_cpu(qstats[qstat_pv_kick_wake],    cpu);
+               latencies += per_cpu(qstats[qstat_pv_latency_wake], cpu);
+       }
+
+       /* Rounded to the nearest ns */
+       return sprintf(buf, "%llu\n", kicks ? (latencies + kicks/2)/kicks : 0);
+}
+
+/*
+ * Return the average hops/hash = pv_hash_hops/pv_kick_unlock
+ */
+static ssize_t
+hash_hop_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+       int cpu;
+       u64 hops = 0, kicks = 0;
+
+       for_each_online_cpu(cpu) {
+               kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+               hops  += per_cpu(qstats[qstat_pv_hash_hops],   cpu);
+       }
+
+       if (!kicks)
+               return sprintf(buf, "0\n");
+
+       /*
+        * Return a X.XX decimal number
+        */
+       return sprintf(buf, "%llu.%02llu\n", hops/kicks,
+                     ((hops%kicks)*100 + kicks/2)/kicks);
+}
+
+/*
+ * Reset all the counter values
+ *
+ * Since the counter updates aren't atomic, the reset is done twice to
+ * make it very likely that all the counters are cleared.
+ */
+static ssize_t
+reset_counters_store(struct kobject *kobj, struct kobj_attribute *attr,
+                    const char *buf, size_t count)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               int i;
+               unsigned long *ptr = per_cpu_ptr(qstats, cpu);
+
+               for (i = 0 ; i < qstat_num; i++)
+                       WRITE_ONCE(ptr[i], 0);
+               for (i = 0 ; i < qstat_num; i++)
+                       WRITE_ONCE(ptr[i], 0);
+       }
+       return count;
+}
+
+/*
+ * Initialize sysfs for the qspinlock statistics
+ */
+static int __init init_qspinlock_stat(void)
+{
+       int i, retval;
+
+       qstat_kobj = kobject_create_and_add("qlockstat", kernel_kobj);
+       if (qstat_kobj == NULL)
+               return -ENOMEM;
+
+       /*
+        * Initialize the attribute table
+        *
+        * As reading from and writing to the stat files can be slow, only
+        * root is allowed to do the read/write to limit impact to system
+        * performance.
+        */
+       for (i = 0; i <= qstat_num; i++) {
+               qstat_kobj_attrs[i].attr.name = qstat_names[i];
+               qstat_kobj_attrs[i].attr.mode = 0400;
+               qstat_kobj_attrs[i].show      = qstat_show;
+               attrs[i]                      = &qstat_kobj_attrs[i].attr;
+       }
+       qstat_kobj_attrs[qstat_pv_hash_hops].show    = hash_hop_show;
+       qstat_kobj_attrs[qstat_pv_latency_kick].show = kick_latency_show;
+       qstat_kobj_attrs[qstat_pv_latency_wake].show = wake_latency_show;
+
+       /*
+        * Set attributes for reset_counters
+        */
+       qstat_kobj_attrs[qstat_reset_cnts].attr.mode = 0200;
+       qstat_kobj_attrs[qstat_reset_cnts].show      = NULL;
+       qstat_kobj_attrs[qstat_reset_cnts].store     = reset_counters_store;
+
+       retval = sysfs_create_group(qstat_kobj, &attr_group);
+       if (retval)
+               kobject_put(qstat_kobj);
+
+       return retval;
+}
+fs_initcall(init_qspinlock_stat);
+
+/*
+ * Increment the PV qspinlock statistics counters
+ */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)
+{
+       if (cond)
+               this_cpu_inc(qstats[stat]);
+}
+
+/*
+ * PV hash hop count
+ */
+static inline void qstat_hop(int hopcnt)
+{
+       this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+}
+
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+       u64 start = sched_clock();
+
+       per_cpu(pv_kick_time, cpu) = start;
+       pv_kick(cpu);
+       this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+}
+
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+       u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+       *pkick_time = 0;
+       pv_wait(ptr, val);
+       if (*pkick_time) {
+               this_cpu_add(qstats[qstat_pv_latency_wake],
+                            sched_clock() - *pkick_time);
+               qstat_inc(qstat_pv_kick_wake, true);
+       }
+}
+
+#define pv_kick(c)     __pv_kick(c)
+#define pv_wait(p, v)  __pv_wait(p, v)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
+static inline void qstat_hop(int hopcnt)                       { }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */
-- 
1.7.1
