Introduce struct pkg_data, which contains all per-package CQM data for the
new CQM driver. The per-package data is:

 1) A pool of free prmids (per-package, per-RMID structures). Each package
    may have a different number of prmids (a different hw max_rmid_index).

 2) A lock and a mutex that protect the prmid pools, changes to pmonr
    state, and the rotation logic. The per-package separation of locks
    reduces contention on each lock and mutex compared with the previous
    version, which used a system-wide mutex and lock.
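As an illustrative sketch only (not part of the patch): a hot path that
previously serialized on the global lock now takes only its own package's
lock. cqm_pkgs_data, struct pkg_data and the fields below are the
definitions this patch adds to cqm.h:

	/* Sketch: grab a free prmid from the local package's pool. */
	struct prmid *prmid = NULL;
	unsigned long flags;
	u16 pkg_id = topology_physical_package_id(smp_processor_id());
	struct pkg_data *pkg_data = cqm_pkgs_data[pkg_id];

	raw_spin_lock_irqsave(&pkg_data->pkg_data_lock, flags);
	if (!list_empty(&pkg_data->free_prmids_pool))
		prmid = list_first_entry(&pkg_data->free_prmids_pool,
					 struct prmid, pool_entry);
	raw_spin_unlock_irqrestore(&pkg_data->pkg_data_lock, flags);

Cross-package operations (e.g. changes to the monr hierarchy) must still
take every package's lock, as documented in struct pkg_data.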
More per-package data will be added in future patches in this series.

Reviewed-by: Stephane Eranian <eran...@google.com>
Signed-off-by: David Carrillo-Cisneros <davi...@google.com>
---
 arch/x86/events/intel/cqm.c | 499 ++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/events/intel/cqm.h |  62 ++++++
 include/linux/perf_event.h |   7 +
 3 files changed, 568 insertions(+)

diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 2daee37..54f219f 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -12,6 +12,8 @@
 #define MSR_IA32_QM_CTR		0x0c8e
 #define MSR_IA32_QM_EVTSEL	0x0c8d
 
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
 #define RMID_VAL_ERROR		(1ULL << 63)
 #define RMID_VAL_UNAVAIL	(1ULL << 62)
 
@@ -69,3 +71,500 @@ static inline int __cqm_prmid_update(struct prmid *prmid,
 
 	return 1;
 }
+
+/*
+ * A cache group is a group of perf_events with the same target (thread,
+ * cgroup, CPU or system-wide). Each cache group has one RMID.
+ * Cache groups are protected by cqm_mutex.
+ */
+static LIST_HEAD(cache_groups);
+static DEFINE_MUTEX(cqm_mutex);
+
+struct pkg_data **cqm_pkgs_data;
+
+static inline bool __valid_pkg_id(u16 pkg_id)
+{
+	return pkg_id < topology_max_packages();
+}
+
+/* Init cqm pkg_data for @cpu's package. */
+static int pkg_data_init_cpu(int cpu)
+{
+	struct pkg_data *pkg_data;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	u16 pkg_id = topology_physical_package_id(cpu);
+
+	if (cqm_pkgs_data[pkg_id])
+		return 0;
+
+	pkg_data = kmalloc_node(sizeof(struct pkg_data),
+				GFP_KERNEL, cpu_to_node(cpu));
+	if (!pkg_data)
+		return -ENOMEM;
+
+	pkg_data->max_rmid = c->x86_cache_max_rmid;
+
+	/* Does the hardware have more RMIDs than this driver can handle? */
+	if (WARN_ON(pkg_data->max_rmid >= INVALID_RMID))
+		pkg_data->max_rmid = INVALID_RMID - 1;
+
+	if (c->x86_cache_occ_scale != cqm_l3_scale) {
+		pr_err("Multiple LLC scale values, disabling\n");
+		kfree(pkg_data);
+		return -EINVAL;
+	}
+
+	pkg_data->prmids_by_rmid = kmalloc_node(
+		sizeof(struct prmid *) * (1 + pkg_data->max_rmid),
+		GFP_KERNEL, cpu_to_node(cpu));
+	if (!pkg_data->prmids_by_rmid) {
+		kfree(pkg_data);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&pkg_data->free_prmids_pool);
+
+	mutex_init(&pkg_data->pkg_data_mutex);
+	raw_spin_lock_init(&pkg_data->pkg_data_lock);
+
+	/* XXX: Chosen arbitrarily. */
+	pkg_data->rotation_cpu = cpu;
+
+	cqm_pkgs_data[pkg_id] = pkg_data;
+	return 0;
+}
+
+static int intel_cqm_setup_pkg_prmid_pools(u16 pkg_id)
+{
+	int r;
+	unsigned long flags;
+	struct prmid *prmid;
+	struct pkg_data *pkg_data = cqm_pkgs_data[pkg_id];
+
+	if (!__valid_pkg_id(pkg_id))
+		return -EINVAL;
+
+	for (r = 0; r <= pkg_data->max_rmid; r++) {
+		prmid = kmalloc_node(sizeof(struct prmid), GFP_KERNEL,
+				     cpu_to_node(pkg_data->rotation_cpu));
+		if (!prmid)
+			goto fail;
+
+		atomic64_set(&prmid->last_read_value, 0L);
+		atomic64_set(&prmid->last_read_time, 0L);
+		INIT_LIST_HEAD(&prmid->pool_entry);
+		prmid->rmid = r;
+
+		/* Lock needed if called during CPU hotplug. */
+		raw_spin_lock_irqsave_nested(
+			&pkg_data->pkg_data_lock, flags, pkg_id);
+		pkg_data->prmids_by_rmid[r] = prmid;
+
+		/* RMID 0 is special and is the root of the rmid hierarchy. */
+		if (r != 0)
+			list_add_tail(&prmid->pool_entry,
+				      &pkg_data->free_prmids_pool);
+		raw_spin_unlock_irqrestore(&pkg_data->pkg_data_lock, flags);
+	}
+	return 0;
+fail:
+	while (!list_empty(&pkg_data->free_prmids_pool)) {
+		prmid = list_first_entry(&pkg_data->free_prmids_pool,
+					 struct prmid, pool_entry);
+		list_del(&prmid->pool_entry);
+		pkg_data->prmids_by_rmid[prmid->rmid] = NULL;
+		kfree(prmid);
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+	/* Per-cpu and task events don't mix */
+	if ((a->attach_state & PERF_ATTACH_TASK) !=
+	    (b->attach_state & PERF_ATTACH_TASK))
+		return false;
+
+#ifdef CONFIG_CGROUP_PERF
+	if (a->cgrp != b->cgrp)
+		return false;
+#endif
+
+	/* If not a task event, it's a cgroup or a non-task cpu event. */
+	if (!(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Events that target the same task are placed into the same
+	 * cache group.
+	 */
+	if (a->hw.target == b->hw.target)
+		return true;
+
+	/*
+	 * Are we an inherited event?
+	 */
+	if (b->parent == a)
+		return true;
+
+	return false;
+}
+
+static struct pmu intel_cqm_pmu;
+
+/*
+ * Find a group and setup RMID.
+ *
+ * If we're part of a group, we use the group's monr.
+ */
+static int
+intel_cqm_setup_event(struct perf_event *event, struct perf_event **group)
+{
+	struct perf_event *iter;
+
+	list_for_each_entry(iter, &cache_groups, hw.cqm_event_groups_entry) {
+		if (__match_event(iter, event)) {
+			*group = iter;
+			return 0;
+		}
+	}
+	return 0;
+}
+
+/* Read current package immediately and remote pkg (if any) from cache. */
+static void intel_cqm_event_read(struct perf_event *event)
+{
+}
+
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+	if (!(event->hw.state & PERF_HES_STOPPED))
+		return;
+
+	event->hw.state &= ~PERF_HES_STOPPED;
+}
+
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
+	event->hw.state |= PERF_HES_STOPPED;
+}
+
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+	event->hw.state = PERF_HES_STOPPED;
+
+	return 0;
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+	return !list_empty(&event->hw.cqm_event_groups_entry);
+}
+
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+	struct perf_event *group_other = NULL;
+
+	mutex_lock(&cqm_mutex);
+	/*
+	 * If there's another event in this group...
+	 */
+	if (!list_empty(&event->hw.cqm_event_group_entry)) {
+		group_other = list_first_entry(&event->hw.cqm_event_group_entry,
+					       struct perf_event,
+					       hw.cqm_event_group_entry);
+		list_del(&event->hw.cqm_event_group_entry);
+	}
+	/*
+	 * And we're the group leader...
+	 */
+	if (!cqm_group_leader(event))
+		goto exit;
+
+	/*
+	 * If there was a group_other, make that the leader, otherwise
+	 * destroy the group and return the RMID.
+	 */
+	if (group_other) {
+		/* Update monr reference to group head. */
+		list_replace(&event->hw.cqm_event_groups_entry,
+			     &group_other->hw.cqm_event_groups_entry);
+		goto exit;
+	}
+
+	/*
+	 * Event is the only event in the cache group.
+	 */
+	list_del(&event->hw.cqm_event_groups_entry);
+
+exit:
+	mutex_unlock(&cqm_mutex);
+}
+
+static int intel_cqm_event_init(struct perf_event *event)
+{
+	struct perf_event *group = NULL;
+	int ret;
+
+	if (event->attr.type != intel_cqm_pmu.type)
+		return -ENOENT;
+
+	if (event->attr.config & ~QOS_EVENT_MASK)
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&event->hw.cqm_event_groups_entry);
+	INIT_LIST_HEAD(&event->hw.cqm_event_group_entry);
+
+	event->destroy = intel_cqm_event_destroy;
+
+	mutex_lock(&cqm_mutex);
+
+	/* Will also set rmid. */
+	ret = intel_cqm_setup_event(event, &group);
+	if (ret) {
+		mutex_unlock(&cqm_mutex);
+		return ret;
+	}
+
+	if (group) {
+		list_add_tail(&event->hw.cqm_event_group_entry,
+			      &group->hw.cqm_event_group_entry);
+	} else {
+		list_add_tail(&event->hw.cqm_event_groups_entry,
+			      &cache_groups);
+	}
+
+	mutex_unlock(&cqm_mutex);
+
+	return 0;
+}
+
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+static struct attribute *intel_cqm_events_attr[] = {
+	EVENT_PTR(intel_cqm_llc),
+	EVENT_PTR(intel_cqm_llc_pkg),
+	EVENT_PTR(intel_cqm_llc_unit),
+	EVENT_PTR(intel_cqm_llc_scale),
+	EVENT_PTR(intel_cqm_llc_snapshot),
+	NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+	.name = "events",
+	.attrs = intel_cqm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+	.name = "format",
+	.attrs = intel_cqm_formats_attr,
+};
+
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+	&intel_cqm_events_group,
+	&intel_cqm_format_group,
+	NULL,
+};
+
+static struct pmu intel_cqm_pmu = {
+	.hrtimer_interval_ms = CQM_DEFAULT_ROTATION_PERIOD,
+	.attr_groups	     = intel_cqm_attr_groups,
+	.task_ctx_nr	     = perf_sw_context,
+	.event_init	     = intel_cqm_event_init,
+	.add		     = intel_cqm_event_add,
+	.del		     = intel_cqm_event_stop,
+	.start		     = intel_cqm_event_start,
+	.stop		     = intel_cqm_event_stop,
+	.read		     = intel_cqm_event_read,
+};
+
+static inline void cqm_pick_event_reader(int cpu)
+{
+	u16 pkg_id = topology_physical_package_id(cpu);
+
+	/* XXX: lock, check if rotation cpu is online, maybe */
+	/*
+	 * Pick a reader if there isn't one already.
+	 */
+	if (cqm_pkgs_data[pkg_id]->rotation_cpu == -1)
+		cqm_pkgs_data[pkg_id]->rotation_cpu = cpu;
+}
+
+static void intel_cqm_cpu_starting(unsigned int cpu)
+{
+	struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	u16 pkg_id = topology_physical_package_id(cpu);
+
+	state->rmid = 0;
+	state->closid = 0;
+
+	/* XXX: lock */
+	/* XXX: Make sure this case is handled when hotplug happens. */
+	WARN_ON(c->x86_cache_max_rmid != cqm_pkgs_data[pkg_id]->max_rmid);
+	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+}
+
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+	/*
+	 * Is @cpu a designated cqm reader?
+	 */
+	int target;
+	u16 pkg_id = topology_physical_package_id(cpu);
+
+	if (cqm_pkgs_data[pkg_id]->rotation_cpu != cpu)
+		return;
+
+	/* XXX: do remove unused packages */
+	target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+	cqm_pkgs_data[pkg_id]->rotation_cpu =
+		target < nr_cpu_ids ? target : -1;
+}
+
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+				  unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		intel_cqm_cpu_exit(cpu);
+		break;
+	case CPU_STARTING:
+		pkg_data_init_cpu(cpu);
+		intel_cqm_cpu_starting(cpu);
+		cqm_pick_event_reader(cpu);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id intel_cqm_match[] = {
+	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+	{}
+};
+
+static int __init intel_cqm_init(void)
+{
+	char *str, scale[20];
+	int i, cpu, ret = 0, min_max_rmid = INT_MAX;
+
+	if (!x86_match_cpu(intel_cqm_match))
+		return -ENODEV;
+
+	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+	if (WARN_ON(cqm_l3_scale == 0))
+		cqm_l3_scale = 1;
+
+	cqm_pkgs_data = kmalloc(
+		sizeof(struct pkg_data *) * topology_max_packages(),
+		GFP_KERNEL);
+	if (!cqm_pkgs_data)
+		return -ENOMEM;
+
+	for (i = 0; i < topology_max_packages(); i++)
+		cqm_pkgs_data[i] = NULL;
+
+	/*
+	 * It's possible that not all resources support the same number
+	 * of RMIDs. Instead of making scheduling much more complicated
+	 * (where we have to match a task's RMID to a cpu that supports
+	 * that many RMIDs) just find the minimum RMIDs supported across
+	 * all cpus.
+	 *
+	 * Also, check that the scales match on all cpus.
+	 */
+	cpu_notifier_register_begin();
+
+	/* XXX: assert all cpus in pkg have same nr rmids (they should). */
+	for_each_online_cpu(cpu) {
+		ret = pkg_data_init_cpu(cpu);
+		if (ret)
+			goto error;
+	}
+
+	/*
+	 * Select the minimum of the maximum rmids to use as limit for
+	 * threshold. XXX: per-package threshold.
+	 */
+	cqm_pkg_id_for_each_online(i) {
+		if (cqm_pkgs_data[i]->max_rmid < min_max_rmid)
+			min_max_rmid = cqm_pkgs_data[i]->max_rmid;
+		ret = intel_cqm_setup_pkg_prmid_pools(i);
+		if (ret)
+			goto error;
+	}
+
+	/*
+	 * A reasonable upper limit on the max threshold is the number
+	 * of lines tagged per RMID if all RMIDs have the same number of
+	 * lines tagged in the LLC.
+	 *
+	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+	 */
+	__intel_cqm_max_threshold =
+		boot_cpu_data.x86_cache_size * 1024 / (min_max_rmid + 1);
+
+	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+	str = kstrdup(scale, GFP_KERNEL);
+	if (!str) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	event_attr_intel_cqm_llc_scale.event_str = str;
+
+	for_each_online_cpu(i) {
+		intel_cqm_cpu_starting(i);
+		cqm_pick_event_reader(i);
+	}
+
+	__perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+	if (ret)
+		goto error;
+
+	cpu_notifier_register_done();
+
+	pr_info("Intel CQM monitoring enabled with at least %u rmids per package.\n",
+		min_max_rmid + 1);
+
+	return ret;
+
+error:
+	pr_err("Intel CQM perf registration failed: %d\n", ret);
+	cpu_notifier_register_done();
+
+	return ret;
+}
+
+device_initcall(intel_cqm_init);
diff --git a/arch/x86/events/intel/cqm.h b/arch/x86/events/intel/cqm.h
index 06964cd..08623b5 100644
--- a/arch/x86/events/intel/cqm.h
+++ b/arch/x86/events/intel/cqm.h
@@ -41,9 +41,71 @@ struct prmid {
 };
 
 /*
+ * struct pkg_data: Per-package CQM data.
+ * @max_rmid:		Max rmid valid for cpus in this package.
+ * @prmids_by_rmid:	Utility mapping between rmid values and prmids.
+ *			XXX: Make it an array of prmids.
+ * @free_prmids_pool:	Free prmids.
+ * @pkg_data_mutex:	Hold for stability when modifying the pmonr
+ *			hierarchy.
+ * @pkg_data_lock:	Hold to protect variables that may be accessed
+ *			during process scheduling. The locks for all
+ *			packages must be held when modifying the monr
+ *			hierarchy.
+ * @rotation_cpu:	CPU to run @rotation_work on; it must be in the
+ *			package associated with this instance of pkg_data.
+ */
+struct pkg_data {
+	u32			max_rmid;
+
+	/* Quick map from rmids to prmids. */
+	struct prmid		**prmids_by_rmid;
+
+	/*
+	 * Pools of prmids used in rotation logic.
+	 */
+	struct list_head	free_prmids_pool;
+
+	struct mutex		pkg_data_mutex;
+	raw_spinlock_t		pkg_data_lock;
+
+	int			rotation_cpu;
+};
+
+extern struct pkg_data **cqm_pkgs_data;
+
+static inline u16 __cqm_pkgs_data_next_online(u16 pkg_id)
+{
+	while (++pkg_id < topology_max_packages() && !cqm_pkgs_data[pkg_id])
+		;
+	return pkg_id;
+}
+
+static inline u16 __cqm_pkgs_data_first_online(void)
+{
+	if (cqm_pkgs_data[0])
+		return 0;
+	return __cqm_pkgs_data_next_online(0);
+}
+
+/* Iterate over each online package's data. */
+#define cqm_pkg_id_for_each_online(pkg_id__)			\
+	for (pkg_id__ = __cqm_pkgs_data_first_online();		\
+	     pkg_id__ < topology_max_packages();		\
+	     pkg_id__ = __cqm_pkgs_data_next_online(pkg_id__))
+
+#define __pkg_data(pmonr, member) cqm_pkgs_data[pmonr->pkg_id]->member
+
+/*
  * Time between execution of rotation logic. The frequency of execution does
  * not affect the rate at which RMIDs are recycled, except for the delay
  * in updating the prmids and their pools.
  * The rotation period is stored in pmu->hrtimer_interval_ms.
  */
 #define CQM_DEFAULT_ROTATION_PERIOD	1200 /* ms */
+
+/*
+ * __intel_cqm_max_threshold provides an upper bound on the threshold, and
+ * is measured in bytes because it is exposed to userland. It must be scaled
+ * by cqm_l3_scale to obtain cache lines.
+ */
+static unsigned int __intel_cqm_max_threshold;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1417d3b..02b8e24 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -118,6 +118,13 @@ struct hw_perf_event {
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
+#ifdef CONFIG_INTEL_RDT
+		struct { /* intel_cqm */
+			void			*cqm_monr;
+			struct list_head	cqm_event_group_entry;
+			struct list_head	cqm_event_groups_entry;
+		};
+#endif
 		struct { /* itrace */
 			int			itrace_started;
 		};
-- 
2.8.0.rc3.226.g39d4020