+Mark

On 10/01/17 11:38, Punit Agrawal wrote:
> Both AArch32 and AArch64 mode of the ARMv8 architecture support trapping
> certain VM operations, e.g, TLB and cache maintenance
> operations. Selective trapping of these operations for specific VMs can
> be used to track the frequency with which these occur during execution.
> 
> Add a software PMU on the host that can support tracking VM
> operations (in the form of PMU events). Supporting new events requires
> providing callbacks to configure the VM to enable/disable the trapping
> and read a count of the frequency.
> 
> The host PMU events can be controlled by tools like perf that use
> standard kernel perf interfaces.
> 
> Signed-off-by: Punit Agrawal <[email protected]>
> Cc: Christoffer Dall <[email protected]>
> Cc: Marc Zyngier <[email protected]>
> Cc: Will Deacon <[email protected]>
> ---
>  arch/arm/include/asm/kvm_host.h   |   8 ++
>  arch/arm/kvm/arm.c                |   2 +
>  arch/arm64/include/asm/kvm_host.h |   8 ++
>  virt/kvm/arm/host_pmu.c           | 272 
> ++++++++++++++++++++++++++++++++++++++
>  4 files changed, 290 insertions(+)
>  create mode 100644 virt/kvm/arm/host_pmu.c
> 
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 26f0c8a0b790..b988f8801b86 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -289,6 +289,14 @@ static inline int 
> kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
> +#if !defined(CONFIG_KVM_HOST_PMU)
> +static inline int kvm_host_pmu_init(void) { return 0; }
> +static inline void kvm_host_pmu_teardown(void) { }
> +#else
> +int kvm_host_pmu_init(void);
> +void kvm_host_pmu_teardown(void);
> +#endif
> +
>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>  
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 11676787ad49..058626b65b8d 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -1263,6 +1263,7 @@ static int init_subsystems(void)
>               goto out;
>  
>       kvm_perf_init();
> +     kvm_host_pmu_init();
>       kvm_coproc_table_init();
>  
>  out:
> @@ -1453,6 +1454,7 @@ int kvm_arch_init(void *opaque)
>  void kvm_arch_exit(void)
>  {
>       kvm_perf_teardown();
> +     kvm_host_pmu_teardown();
>  }
>  
>  static int arm_init(void)
> diff --git a/arch/arm64/include/asm/kvm_host.h 
> b/arch/arm64/include/asm/kvm_host.h
> index 1e83b707f14c..018f887e8964 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -349,6 +349,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run 
> *run,
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
>  
> +#if !defined(CONFIG_KVM_HOST_PMU)
> +static inline int kvm_host_pmu_init(void) { return 0; }
> +static inline void kvm_host_pmu_teardown(void) { }
> +#else
> +int kvm_host_pmu_init(void);
> +void kvm_host_pmu_teardown(void);
> +#endif
> +
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
>  
>  static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
> diff --git a/virt/kvm/arm/host_pmu.c b/virt/kvm/arm/host_pmu.c
> new file mode 100644
> index 000000000000..fc610ccc169a
> --- /dev/null
> +++ b/virt/kvm/arm/host_pmu.c
> @@ -0,0 +1,272 @@
> +#include <linux/cpumask.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +#include <linux/list.h>
> +#include <linux/perf_event.h>
> +#include <linux/pid.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock_types.h>
> +#include <linux/sysfs.h>
> +
> +#include <asm/kvm_emulate.h>
> +
> +enum host_pmu_events {
> +     KVM_HOST_MAX_EVENTS,
> +};
> +
> +struct host_pmu {
> +     struct pmu pmu;
> +     spinlock_t event_list_lock;
> +     struct list_head event_list_head;
> +} host_pmu;
> +#define to_host_pmu(p) (container_of(p, struct host_pmu, pmu))
> +
> +typedef void (*configure_event_fn)(struct kvm *kvm, bool enable);
> +typedef u64 (*get_event_count_fn)(struct kvm *kvm);
> +
> +struct kvm_event_cb {
> +     enum host_pmu_events event;
> +     get_event_count_fn get_event_count;
> +     configure_event_fn configure_event;
> +};
> +
> +struct event_data {
> +     bool enable;
> +     struct kvm *kvm;
> +     struct kvm_event_cb *cb;
> +     struct work_struct work;
> +     struct list_head event_list;
> +};
> +
> +static struct kvm_event_cb event_callbacks[] = {
> +};
> +
> +static struct attribute *event_attrs[] = {
> +     NULL,
> +};
> +
> +static struct attribute_group events_attr_group = {
> +     .name   = "events",
> +     .attrs  = event_attrs,
> +};
> +
> +
> +#define VM_MASK      GENMASK_ULL(31, 0)
> +#define EVENT_MASK   GENMASK_ULL(32, 39)
> +#define EVENT_SHIFT  (32)
> +
> +#define to_pid(cfg)  ((cfg) & VM_MASK)
> +#define to_event(cfg)        (((cfg) & EVENT_MASK) >> EVENT_SHIFT)
> +
> +PMU_FORMAT_ATTR(vm, "config:0-31");
> +PMU_FORMAT_ATTR(event, "config:32-39");

I'm a bit confused by these. Can't you get the PID of the VM you're
tracing directly from perf, without having to encode things? And if you
can't, surely this should be a function of the size of pid_t?

Mark, can you shine some light here?

> +
> +static struct attribute *format_attrs[] = {
> +     &format_attr_vm.attr,
> +     &format_attr_event.attr,
> +     NULL,
> +};
> +
> +static struct attribute_group format_attr_group = {
> +     .name   = "format",
> +     .attrs  = format_attrs,
> +};
> +
> +static const struct attribute_group *attr_groups[] = {
> +     &events_attr_group,
> +     &format_attr_group,
> +     NULL,
> +};
> +
> +static void host_event_destroy(struct perf_event *event)
> +{
> +     struct host_pmu *host_pmu = to_host_pmu(event->pmu);
> +     struct event_data *e_data = event->pmu_private;
> +
> +     /*
> +      * Ensure outstanding work items related to this event are
> +      * completed before freeing resources.
> +      */
> +     flush_work(&e_data->work);
> +
> +     kvm_put_kvm(e_data->kvm);
> +
> +     spin_lock(&host_pmu->event_list_lock);
> +     list_del(&e_data->event_list);
> +     spin_unlock(&host_pmu->event_list_lock);
> +     kfree(e_data);
> +}
> +
> +void host_event_work(struct work_struct *work)
> +{
> +     struct event_data *e_data = container_of(work, struct event_data, work);
> +     struct kvm *kvm = e_data->kvm;
> +
> +     e_data->cb->configure_event(kvm, e_data->enable);
> +}
> +
> +static int host_event_init(struct perf_event *event)
> +{
> +     struct host_pmu *host_pmu = to_host_pmu(event->pmu);
> +     int event_id = to_event(event->attr.config);
> +     pid_t task_pid = to_pid(event->attr.config);
> +     struct event_data *e_data, *pos;
> +     bool found = false;
> +     struct pid *pid;
> +     struct kvm *kvm;
> +     int ret = 0;
> +
> +     if (event->attr.type != event->pmu->type)
> +             return -ENOENT;
> +
> +     if (has_branch_stack(event)     ||
> +         is_sampling_event(event)    ||
> +         event->attr.exclude_user    ||
> +         event->attr.exclude_kernel  ||
> +         event->attr.exclude_hv      ||
> +         event->attr.exclude_idle    ||
> +         event->attr.exclude_guest) {
> +             return -EINVAL;
> +     }
> +
> +     if (event->attach_state == PERF_ATTACH_TASK)
> +             return -EOPNOTSUPP;
> +
> +     if (event->cpu < 0)
> +             return -EINVAL;
> +
> +     if (event_id >= KVM_HOST_MAX_EVENTS)
> +             return -EINVAL;
> +
> +     pid = find_get_pid(task_pid);
> +     spin_lock(&kvm_lock);
> +     list_for_each_entry(kvm, &vm_list, vm_list) {
> +             if (kvm->pid == pid) {
> +                     kvm_get_kvm(kvm);
> +                     found = true;
> +                     break;
> +             }
> +     }
> +     spin_unlock(&kvm_lock);
> +     put_pid(pid);
> +
> +     if (!found)
> +             return -EINVAL;
> +
> +     spin_lock(&host_pmu->event_list_lock);
> +     /* Make sure we don't already have the (event_id, kvm) pair */
> +     list_for_each_entry(pos, &host_pmu->event_list_head, event_list) {
> +             if (pos->cb->event == event_id &&
> +                 pos->kvm->pid == pid) {
> +                     kvm_put_kvm(kvm);
> +                     ret = -EOPNOTSUPP;
> +                     goto unlock;
> +             }
> +     }
> +
> +     e_data = kzalloc(sizeof(*e_data), GFP_KERNEL);
> +     e_data->kvm = kvm;
> +     e_data->cb = &event_callbacks[event_id];
> +     INIT_WORK(&e_data->work, host_event_work);
> +     event->pmu_private = e_data;
> +     event->cpu = cpumask_first(cpu_online_mask);
> +     event->destroy = host_event_destroy;
> +
> +     list_add_tail(&e_data->event_list, &host_pmu->event_list_head);
> +
> +unlock:
> +     spin_unlock(&host_pmu->event_list_lock);
> +
> +     return ret;
> +}
> +
> +static void host_event_update(struct perf_event *event)
> +{
> +     struct event_data *e_data = event->pmu_private;
> +     struct kvm_event_cb *cb = e_data->cb;
> +     struct kvm *kvm = e_data->kvm;
> +     struct hw_perf_event *hw = &event->hw;
> +     u64 prev_count, new_count;
> +
> +     do {
> +             prev_count = local64_read(&hw->prev_count);
> +             new_count = cb->get_event_count(kvm);
> +     } while (local64_xchg(&hw->prev_count, new_count) != prev_count);
> +
> +     local64_add(new_count - prev_count, &event->count);
> +}
> +
> +static void host_event_start(struct perf_event *event, int flags)
> +{
> +     struct event_data *e_data = event->pmu_private;
> +     struct kvm_event_cb *cb = e_data->cb;
> +     struct kvm *kvm = e_data->kvm;
> +     u64 val;
> +
> +     val = cb->get_event_count(kvm);
> +     local64_set(&event->hw.prev_count, val);
> +
> +     e_data->enable = true;
> +     schedule_work(&e_data->work);
> +}
> +
> +static void host_event_stop(struct perf_event *event, int flags)
> +{
> +     struct event_data *e_data = event->pmu_private;
> +
> +     e_data->enable = false;
> +     schedule_work(&e_data->work);
> +
> +     if (flags & PERF_EF_UPDATE)
> +             host_event_update(event);
> +}
> +
> +static int host_event_add(struct perf_event *event, int flags)
> +{
> +     if (flags & PERF_EF_START)
> +             host_event_start(event, flags);
> +
> +     return 0;
> +}
> +
> +static void host_event_del(struct perf_event *event, int flags)
> +{
> +     host_event_stop(event, PERF_EF_UPDATE);
> +}
> +
> +static void host_event_read(struct perf_event *event)
> +{
> +     host_event_update(event);
> +}
> +
> +static void init_host_pmu(struct host_pmu *host_pmu)
> +{
> +     host_pmu->pmu = (struct pmu) {
> +             .task_ctx_nr    = perf_sw_context,
> +             .attr_groups    = attr_groups,
> +             .event_init     = host_event_init,
> +             .add            = host_event_add,
> +             .del            = host_event_del,
> +             .start          = host_event_start,
> +             .stop           = host_event_stop,
> +             .read           = host_event_read,
> +             .capabilities   = PERF_PMU_CAP_NO_INTERRUPT,
> +     };
> +
> +     INIT_LIST_HEAD(&host_pmu->event_list_head);
> +     spin_lock_init(&host_pmu->event_list_lock);
> +}
> +
> +int kvm_host_pmu_init(void)
> +{
> +     init_host_pmu(&host_pmu);
> +
> +     return perf_pmu_register(&host_pmu.pmu, "kvm", -1);
> +}
> +
> +void kvm_host_pmu_teardown(void)
> +{
> +     perf_pmu_unregister(&host_pmu.pmu);
> +}
> 

This patch really makes me think that there is nothing arm-specific in
here at all. Why can't it be a generic feature through which
architectures can expose events in a generic way (or as close as
possible to being generic)?

Thanks,

        M.
-- 
Jazz is not dead. It just smells funny...

Reply via email to