[PATCH 07/11] perf/x86/intel: Add Intel Cache QoS Monitoring support

2014-11-14 Thread Matt Fleming
From: Matt Fleming <matt.flem...@intel.com>

Future Intel Xeon processors support a Cache QoS Monitoring feature that
allows tracking of the LLC occupancy for a task or task group, i.e. the
amount of data pulled into the LLC for the task (group).
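
(Not part of the patch, just for orientation: the occupancy counter is
reported in units of an upscaling factor enumerated by CPUID, which the
driver below keeps in cqm_l3_scale. A minimal sketch of the conversion to
bytes, reusing the status bits defined later in this file; whether the
kernel or the perf tooling applies the scale is outside the hunks shown
here, so treat this purely as an illustration:)

    static inline u64 cqm_occupancy_bytes(u64 qm_ctr_val)
    {
            /* Bits 63/62 flag an invalid read; report no occupancy then. */
            if (qm_ctr_val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
                    return 0;

            /* One counter unit supposedly corresponds to cqm_l3_scale bytes. */
            return qm_ctr_val * cqm_l3_scale;
    }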

Currently the PMU only supports per-cpu events. We create an event for
each cpu and read out all the LLC occupancy values.

Because this results in duplicate values being written out to userspace,
we also export a .per-pkg event file so that the perf tools only
accumulate values for one cpu per package.
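
(Illustrative only, not a quote from the patch: .per-pkg is a sysfs event
attribute. Assuming the EVENT_ATTR_STR() helper from perf_event.h, exporting
it could look roughly like the lines below; the real attribute definitions
live in the part of the new file that is cut off in this excerpt, and the
event string here is a guess:)

    /* Hypothetical strings -- see the full patch for the actual ones. */
    EVENT_ATTR_STR(llc_occupancy,         intel_cqm_llc,     "event=0x01");
    EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");

With such a per-pkg file present, perf still opens the event on every CPU but
only sums one CPU's value per package, which avoids the duplication described
above.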

Cc: Jiri Olsa <jo...@redhat.com>
Cc: Arnaldo Carvalho de Melo <a...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Signed-off-by: Matt Fleming <matt.flem...@intel.com>
---
 arch/x86/kernel/cpu/Makefile   |   2 +-
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 528 +
 include/linux/perf_event.h |   7 +
 3 files changed, 536 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_cqm.c

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index e27b49d7c922..18e5341bb3fd 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_CPU_SUP_AMD)              += perf_event_amd_iommu.o
 endif
 obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL)            += perf_event_intel_rapl.o perf_event_intel_cqm.o
 
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
   perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
new file mode 100644
index ..b16458ff274e
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -0,0 +1,528 @@
+/*
+ * Intel Cache Quality-of-Service Monitoring (CQM) support.
+ *
+ * Based very, very heavily on work by Peter Zijlstra.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+#define MSR_IA32_PQR_ASSOC 0x0c8f
+#define MSR_IA32_QM_CTR    0x0c8e
+#define MSR_IA32_QM_EVTSEL 0x0c8d
+
+static unsigned int cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
+struct intel_cqm_state {
+   raw_spinlock_t  lock;
+   int rmid;
+   int cnt;
+};
+
+static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+
+/*
+ * Protects cache_cgroups.
+ */
+static DEFINE_MUTEX(cache_mutex);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * Mask of CPUs for reading CQM values. We only need one per-socket.
+ */
+static cpumask_t cqm_cpumask;
+
+#define RMID_VAL_ERROR (1ULL << 63)
+#define RMID_VAL_UNAVAIL   (1ULL << 62)
+
+#define QOS_L3_OCCUP_EVENT_ID  (1 << 0)
+
+#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+
+static u64 __rmid_read(unsigned long rmid)
+{
+   u64 val;
+
+   /*
+* Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+* it just says that to increase confusion.
+*/
+   wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
+   rdmsrl(MSR_IA32_QM_CTR, val);
+
+   /*
+* Aside from the ERROR and UNAVAIL bits, assume this thing returns
+* the number of cachelines tagged with @rmid.
+*/
+   return val;
+}
+
+static unsigned long *cqm_rmid_bitmap;
+
+/*
+ * Returns < 0 on fail.
+ */
+static int __get_rmid(void)
+{
+   return bitmap_find_free_region(cqm_rmid_bitmap, cqm_max_rmid, 0);
+}
+
+static void __put_rmid(int rmid)
+{
+   bitmap_release_region(cqm_rmid_bitmap, rmid, 0);
+}
+
+static int intel_cqm_setup_rmid_cache(void)
+{
+   cqm_rmid_bitmap = kmalloc(sizeof(long) * BITS_TO_LONGS(cqm_max_rmid), GFP_KERNEL);
+   if (!cqm_rmid_bitmap)
+   return -ENOMEM;
+
+   bitmap_zero(cqm_rmid_bitmap, cqm_max_rmid);
+
+   /*
+* RMID 0 is special and is always allocated. It's used for all
+* tasks that are not monitored.
+*/
+   bitmap_allocate_region(cqm_rmid_bitmap, 0, 0);
+
+   return 0;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+   if ((a->attach_state & PERF_ATTACH_TASK) !=
+   (b->attach_state & PERF_ATTACH_TASK))
+   return false;
+
+   /* not task */
+
+   return true; /* if not task, we're machine wide */
+}
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+   /*
+* If one of them is not a task, same story as above with cgroups.
+*/
+   if (!(a->attach_state & PERF_ATTACH_TASK) ||
+   
