Use perf_events to emulate an architectural PMU, version 1.

Caveats:
- counters that have PMI (interrupt) enabled stop counting after the
  interrupt is signalled.  This is because we need one-shot samples
  that keep counting, which perf doesn't support yet
- some combinations of INV and CMASK are not supported
- counters keep on counting in the host as well as the guest

Signed-off-by: Avi Kivity <[email protected]>
---
 arch/x86/include/asm/kvm_host.h |   29 +++++
 arch/x86/kvm/Makefile           |    2 +-
 arch/x86/kvm/pmu.c              |  255 +++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |   16 ++--
 4 files changed, 293 insertions(+), 9 deletions(-)
 create mode 100644 arch/x86/kvm/pmu.c

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fc38eca..86f49a2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -16,6 +16,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/tracepoint.h>
 #include <linux/cpumask.h>
+#include <linux/irq_work.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -287,6 +288,24 @@ struct kvm_mmu {
        u64 pdptrs[4]; /* pae */
 };
 
+#define KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS 4
+
+/* State of one emulated general-purpose performance counter. */
+struct kvm_pmc {
+       u64 counter;    /* base count; the perf event's count is added on read */
+       u64 eventsel;   /* value returned when the event-select MSR is read */
+       struct perf_event *perf_event;  /* backing host perf event, or NULL */
+       struct kvm_vcpu *vcpu;  /* owning vcpu, filled in by kvm_pmu_init() */
+};
+
+/* Per-vcpu state of the emulated PMU; refreshed from CPUID leaf 0xA. */
+struct kvm_pmu {
+       unsigned nr_arch_gp_counters;   /* usable entries in gp_counters[] */
+       unsigned available_event_types; /* bit i set: arch event i may be used */
+       u64 counter_bitmask;            /* mask applied to counter values on read */
+       u8 version;                     /* low byte of CPUID 0xA EAX */
+       struct kvm_pmc gp_counters[KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS];
+       struct irq_work irq_work;       /* queued when a counter overflows */
+};
+
 struct kvm_vcpu_arch {
        /*
         * rip and regs accesses must go through
@@ -414,6 +433,8 @@ struct kvm_vcpu_arch {
        u64 mcg_ctl;
        u64 *mce_banks;
 
+       struct kvm_pmu pmu;
+
        /* used for guest single stepping over the given code position */
        unsigned long singlestep_rip;
 
@@ -870,4 +891,12 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
 
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
+bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..cfca03f 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API)       += $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)     += $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
-                          i8254.o timer.o
+                          i8254.o timer.o pmu.o
 kvm-intel-y            += vmx.o
 kvm-amd-y              += svm.o
 
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
new file mode 100644
index 0000000..763e763
--- /dev/null
+++ b/arch/x86/kvm/pmu.c
@@ -0,0 +1,255 @@
+/*
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Avi Kivity   <[email protected]>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "pmu.h"
+#include "lapic.h"
+
+/*
+ * Translation table from a guest (event select, unit mask) pair to the
+ * generic perf hardware event used to emulate it.  reprogram_gp_counter()
+ * searches it linearly and additionally requires the entry's index bit to
+ * be set in pmu->available_event_types.
+ *
+ * NOTE(review): 'inexact' is not set by any initializer below and is not
+ * read anywhere in the code shown here.
+ */
+static struct kvm_arch_event_perf_mapping {
+       u8 eventsel;            /* bits 7:0 of the event-select MSR */
+       u8 unit_mask;           /* bits 15:8 of the event-select MSR */
+       unsigned event_type;    /* PERF_COUNT_HW_* id placed in attr.config */
+       bool inexact;
+} arch_events[] = {
+       /* Index must match CPUID 0x0A.EBX bit vector */
+       [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+       [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+       [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES  },
+       [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
+       [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
+       [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+       [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+};
+
+/*
+ * Map an MSR number onto a general-purpose counter, treating 'base' as the
+ * MSR of counter 0.  Returns NULL when 'msr' does not fall inside the
+ * nr_arch_gp_counters MSRs starting at 'base'.
+ */
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+                                        u32 base)
+{
+       if (msr < base || msr >= base + pmu->nr_arch_gp_counters)
+               return NULL;
+
+       return &pmu->gp_counters[msr - base];
+}
+
+/*
+ * irq_work handler: deliver the performance-counter LVT entry to the vcpu
+ * that embeds this irq_work.
+ */
+static void __kvm_perf_overflow(struct irq_work *irq_work)
+{
+       struct kvm_pmu *pmu;
+       struct kvm_vcpu *vcpu;
+
+       pmu = container_of(irq_work, struct kvm_pmu, irq_work);
+       vcpu = container_of(pmu, struct kvm_vcpu, arch.pmu);
+
+       /* Without a local APIC object there is nothing to deliver through. */
+       if (!vcpu->arch.apic)
+               return;
+
+       kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+}
+
+/*
+ * perf overflow callback installed by reprogram_gp_counter().  It only
+ * queues the PMU's irq_work; the actual interrupt delivery happens in
+ * __kvm_perf_overflow().
+ */
+static void kvm_perf_overflow(struct perf_event *perf_event,
+                             int nmi,
+                             struct perf_sample_data *data,
+                             struct pt_regs *regs)
+{
+       struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+       struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+
+       irq_work_queue(&pmu->irq_work);
+}
+
+/*
+ * Current guest-visible value of a counter: the stored base count plus
+ * whatever the backing perf event (if any) has counted, truncated with
+ * pmu->counter_bitmask.
+ */
+static u64 read_gp_pmc(struct kvm_pmu *pmu, struct kvm_pmc *pmc)
+{
+       u64 total = pmc->counter;
+       u64 time_enabled, time_running;
+
+       if (pmc->perf_event)
+               total += perf_event_read_value(pmc->perf_event,
+                                              &time_enabled, &time_running);
+
+       /* FIXME: Scaling needed? */
+
+       return total & pmu->counter_bitmask;
+}
+
+/*
+ * Re-create the host perf event backing a general-purpose counter after the
+ * guest wrote a new event-select value.
+ *
+ * Any existing perf event is first folded into pmc->counter and released.
+ * The new value is then recorded in pmc->eventsel unconditionally, so that a
+ * write which leaves the counter disabled is still what a later read of the
+ * event-select MSR returns.
+ *
+ * Returns 0 on success (including "counter left disabled" and the ignored
+ * INV/CMASK case), 1 if the event/unit-mask pair is not an available
+ * architectural event, or a negative errno from perf.
+ */
+static int reprogram_gp_counter(struct kvm_pmu *pmu, struct kvm_pmc *pmc,
+                               u64 eventsel)
+{
+       struct perf_event_attr attr = { };
+       struct perf_event *event;
+       int i;
+       u8 event_select, unit_mask, cmask;
+       perf_overflow_handler_t callback = NULL;
+       bool inv;
+
+       if (pmc->perf_event) {
+               /* Preserve what has been counted so far before dropping it. */
+               pmc->counter = read_gp_pmc(pmu, pmc);
+               perf_event_release_kernel(pmc->perf_event);
+               pmc->perf_event = NULL;
+               /* Wait for overflow work the old event may have queued. */
+               irq_work_sync(&pmu->irq_work);
+       }
+
+       /*
+        * Record the selector regardless of whether an event existed before
+        * or whether the ENABLE bit is set; otherwise a disabled selector
+        * written to an idle counter would be lost.
+        */
+       pmc->eventsel = eventsel;
+
+       if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE))
+               return 0;
+
+       attr.type = PERF_TYPE_HARDWARE;
+       attr.size = sizeof(attr);
+       attr.exclude_idle = true;
+
+       event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+       unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+
+       /* Only architectural events advertised to the guest are accepted. */
+       for (i = 0; i < ARRAY_SIZE(arch_events); ++i) {
+               if (arch_events[i].eventsel == event_select
+                   && arch_events[i].unit_mask == unit_mask
+                   && (pmu->available_event_types & (1 << i))) {
+                       attr.config = arch_events[i].event_type;
+                       break;
+               }
+       }
+       if (i == ARRAY_SIZE(arch_events))
+               return 1;
+
+       attr.exclude_user = !(eventsel & ARCH_PERFMON_EVENTSEL_USR);
+       attr.exclude_kernel = !(eventsel & ARCH_PERFMON_EVENTSEL_OS);
+
+       if (eventsel & ARCH_PERFMON_EVENTSEL_EDGE)
+               printk_once("kvm: pmu ignoring edge bit\n");
+
+       if (eventsel & ARCH_PERFMON_EVENTSEL_INT) {
+               /* Created disabled; armed below via perf_event_refresh(). */
+               callback = kvm_perf_overflow;
+               attr.disabled = true;
+       }
+
+       inv = eventsel & ARCH_PERFMON_EVENTSEL_INV;
+       cmask = (eventsel & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
+
+       if (inv || cmask > 1) {
+               printk_once("kvm: pmu ignoring difficult inv/cmask combo\n");
+               return 0;
+       }
+
+       /* Distance from the current count to wrap-around of the counter. */
+       attr.sample_period = (-pmc->counter) & pmu->counter_bitmask;
+
+       event = perf_event_create_kernel_counter(&attr, -1, current,
+                                                callback, pmc);
+       if (IS_ERR(event))
+               return PTR_ERR(event);
+
+       if (callback)
+               perf_event_refresh(event, 1);
+
+       pmc->perf_event = event;
+       return 0;
+}
+
+/*
+ * Report whether 'msr' is one of the counter or event-select MSRs currently
+ * handled by the emulated PMU of this vcpu.
+ */
+bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+       if (get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))
+               return true;
+
+       return get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) != NULL;
+}
+
+/*
+ * Read a PMU MSR.  Counter MSRs yield the current counter value, event-select
+ * MSRs yield the stored selector.  Returns 0 on success and 1 when 'index'
+ * is not a PMU MSR of this vcpu.
+ */
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       struct kvm_pmc *pmc;
+
+       pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0);
+       if (pmc) {
+               *data = read_gp_pmc(pmu, pmc);
+               return 0;
+       }
+
+       pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0);
+       if (pmc) {
+               *data = pmc->eventsel;
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Write a PMU MSR.  Returns 0 on success; 1 when 'index' is not a PMU MSR
+ * of this vcpu or the selector has bit 21 or any of bits 63:32 set.
+ * Errors from reprogram_gp_counter() are passed through.
+ */
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       struct kvm_pmc *pmc;
+
+       pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0);
+       if (pmc) {
+               /* Only the low 32 bits are taken, sign-extended to 64. */
+               u64 value = (s64)(s32)data;
+
+               /* Shift the base so that a read now returns 'value'. */
+               pmc->counter += value - read_gp_pmc(pmu, pmc);
+               return 0;
+       }
+
+       pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0);
+       if (!pmc)
+               return 1;
+
+       /* Rewriting the current selector is a no-op. */
+       if (data == pmc->eventsel)
+               return 0;
+
+       if (data & 0xffffffff00200000ULL)
+               return 1;
+
+       return reprogram_gp_counter(pmu, pmc, data);
+}
+
+/*
+ * Read counter number 'pmc' into *data.  Bit 31 of 'pmc' selects "fast"
+ * mode, in which only the low 32 bits of the counter are returned; the
+ * remaining bits are the counter index.  Returns 1 if the index is out of
+ * range, 0 otherwise.
+ */
+int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       bool fast_mode = pmc & (1u << 31);
+       unsigned idx = pmc & ~(1u << 31);
+       u64 ctr;
+
+       if (idx >= pmu->nr_arch_gp_counters)
+               return 1;
+
+       ctr = read_gp_pmc(pmu, &pmu->gp_counters[idx]);
+       *data = fast_mode ? (u32)ctr : ctr;
+
+       return 0;
+}
+
+/*
+ * Refresh the PMU configuration from the guest's CPUID leaf 0xA.
+ *
+ * All derived fields are reset first, so a missing leaf leaves the PMU with
+ * no counters and no stale masks.  The leaf contents are supplied from
+ * outside this file, so the counter width and the event bitmap length are
+ * clamped before being used as shift counts: shifting a 64-bit value by 64
+ * or more is undefined behaviour.
+ */
+void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       struct kvm_cpuid_entry2 *entry;
+       unsigned nr_counters, width, bitmap_len;
+
+       pmu->nr_arch_gp_counters = 0;
+       pmu->version = 0;
+       pmu->counter_bitmask = 0;
+       pmu->available_event_types = 0;
+       entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+       if (!entry)
+               return;
+       pmu->version = entry->eax & 0xff;
+
+       /* EAX[15:8]: number of counters, capped at what we can emulate. */
+       nr_counters = (entry->eax >> 8) & 0xff;
+       if (nr_counters > KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS)
+               nr_counters = KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS;
+       pmu->nr_arch_gp_counters = nr_counters;
+
+       /* EAX[23:16]: counter width in bits. */
+       width = (entry->eax >> 16) & 0xff;
+       if (width >= 64)
+               pmu->counter_bitmask = ~(u64)0;
+       else
+               pmu->counter_bitmask = ((u64)1 << width) - 1;
+
+       /*
+        * EAX[31:24]: length of the EBX bit vector.  A set EBX bit means the
+        * event is NOT available, hence the inversion.  The result is stored
+        * in a field no wider than 'unsigned', so longer lengths add nothing.
+        */
+       bitmap_len = (entry->eax >> 24) & 0xff;
+       if (bitmap_len > 8 * sizeof(pmu->available_event_types))
+               bitmap_len = 8 * sizeof(pmu->available_event_types);
+       pmu->available_event_types = ~entry->ebx & ((1ULL << bitmap_len) - 1);
+}
+
+/*
+ * Initialise the per-vcpu PMU state: zero it, point every counter back at
+ * its vcpu, set up the overflow irq_work and pick up the current CPUID
+ * configuration.
+ */
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       struct kvm_pmc *pmc;
+
+       memset(pmu, 0, sizeof(*pmu));
+
+       for (pmc = pmu->gp_counters;
+            pmc < pmu->gp_counters + KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS;
+            ++pmc)
+               pmc->vcpu = vcpu;
+
+       init_irq_work(&pmu->irq_work, __kvm_perf_overflow);
+       kvm_pmu_cpuid_update(vcpu);
+}
+
+/*
+ * Tear down the per-vcpu PMU state.
+ *
+ * The perf events are released before waiting on the irq_work, the same
+ * order reprogram_gp_counter() uses.  Syncing first would leave a window in
+ * which a still-live event overflows and queues work that then runs against
+ * a vcpu that is being freed.
+ */
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = &vcpu->arch.pmu;
+       struct kvm_pmc *pmc;
+       int i;
+
+       for (i = 0; i < KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS; ++i) {
+               pmc = &pmu->gp_counters[i];
+               if (pmc->perf_event) {
+                       perf_event_release_kernel(pmc->perf_event);
+                       /* Do not leave a dangling pointer behind. */
+                       pmc->perf_event = NULL;
+               }
+       }
+
+       /* No event is left to queue new work; wait for anything pending. */
+       irq_work_sync(&pmu->irq_work);
+}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84f4607..258769f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -602,6 +602,8 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
                if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
                        best->ecx |= bit(X86_FEATURE_OSXSAVE);
        }
+
+       kvm_pmu_cpuid_update(vcpu);
 }
 
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1571,8 +1573,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
         * which we perfectly emulate ;-). Any other value should be at least
         * reported, some guests depend on them.
         */
-       case MSR_P6_EVNTSEL0:
-       case MSR_P6_EVNTSEL1:
        case MSR_K7_EVNTSEL0:
        case MSR_K7_EVNTSEL1:
        case MSR_K7_EVNTSEL2:
@@ -1584,8 +1584,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        /* at least RHEL 4 unconditionally writes to the perfctr registers,
         * so we ignore writes to make it happy.
         */
-       case MSR_P6_PERFCTR0:
-       case MSR_P6_PERFCTR1:
        case MSR_K7_PERFCTR0:
        case MSR_K7_PERFCTR1:
        case MSR_K7_PERFCTR2:
@@ -1622,6 +1620,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        default:
                if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
                        return xen_hvm_config(vcpu, data);
+               if (kvm_pmu_msr(vcpu, msr))
+                       return kvm_pmu_set_msr(vcpu, msr, data);
                if (!ignore_msrs) {
                        pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
                                msr, data);
@@ -1782,10 +1782,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_K8_SYSCFG:
        case MSR_K7_HWCR:
        case MSR_VM_HSAVE_PA:
-       case MSR_P6_PERFCTR0:
-       case MSR_P6_PERFCTR1:
-       case MSR_P6_EVNTSEL0:
-       case MSR_P6_EVNTSEL1:
        case MSR_K7_EVNTSEL0:
        case MSR_K7_PERFCTR0:
        case MSR_K8_INT_PENDING_MSG:
@@ -1887,6 +1883,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
                data = 0xbe702111;
                break;
        default:
+               if (kvm_pmu_msr(vcpu, msr))
+                       return kvm_pmu_get_msr(vcpu, msr, pdata);
                if (!ignore_msrs) {
                        pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
                        return 1;
@@ -6290,6 +6288,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                goto fail_free_mce_banks;
 
        kvm_async_pf_hash_reset(vcpu);
+       kvm_pmu_init(vcpu);
 
        return 0;
 fail_free_mce_banks:
@@ -6308,6 +6307,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
        int idx;
 
+       kvm_pmu_destroy(vcpu);
        kfree(vcpu->arch.mce_banks);
        kvm_free_lapic(vcpu);
        idx = srcu_read_lock(&vcpu->kvm->srcu);
-- 
1.7.5.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to