[RFC v1 3/9] KVM: x86: Implement MSR_IA32_PEBS_ENABLE read/write emulation
This patch implements the MSR_IA32_PEBS_ENABLE register read/write emulation for KVM guest. MSR_IA32_PEBS_ENABLE register can be accessed only when PEBS is supported in KVM. VMM need to reprogram the counter when the value of this MSR changed because some of the counters will be created or destroyed. Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/include/asm/msr-index.h | 3 +++ arch/x86/kvm/vmx/pmu_intel.c | 42 +--- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3463326..df966c9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -471,6 +471,8 @@ struct kvm_pmu { u64 global_ctrl_mask; u64 global_ovf_ctrl_mask; u64 reserved_bits; + u64 pebs_enable; + u64 pebs_enable_mask; u8 version; bool pebs_pt; /* PEBS output to Intel PT */ struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3dd166a..a9e8720 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -131,6 +131,9 @@ #define LBR_INFO_ABORT BIT_ULL(61) #define LBR_INFO_CYCLES0x +#define MSR_IA32_PEBS_PMI_AFTER_REC(1UL << 60) +#define MSR_IA32_PEBS_OUTPUT_PT(1UL << 61) +#define MSR_IA32_PEBS_OUTPUT_MASK (3UL << 61) #define MSR_IA32_PEBS_ENABLE 0x03f1 #define MSR_PEBS_DATA_CFG 0x03f2 #define MSR_IA32_DS_AREA 0x0600 diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index e1c987f..fc79cc6 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -66,6 +66,20 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) reprogram_counter(pmu, bit); } +static void pebs_enable_changed(struct kvm_pmu *pmu, u64 data) +{ + int bit; + u64 mask = ((1ull << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << + INTEL_PMC_IDX_FIXED); + u64 diff = (pmu->pebs_enable ^ data) & mask; + + 
pmu->pebs_enable = data; + + for_each_set_bit(bit, (unsigned long *), X86_PMC_IDX_MAX) + reprogram_counter(pmu, bit); +} + static unsigned intel_find_arch_event(struct kvm_pmu *pmu, u8 event_select, u8 unit_mask) @@ -155,6 +169,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: ret = pmu->version > 1; break; + case MSR_IA32_PEBS_ENABLE: + ret = pmu->pebs_pt; + break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || @@ -183,6 +200,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: *data = pmu->global_ovf_ctrl; return 0; + case MSR_IA32_PEBS_ENABLE: + *data = pmu->pebs_enable; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { u64 val = pmc_read_counter(pmc); @@ -240,6 +260,16 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_PEBS_ENABLE: + if (pmu->pebs_enable == data) + return 0; + if (!(data & pmu->pebs_enable_mask) && +(data & MSR_IA32_PEBS_OUTPUT_MASK) == + MSR_IA32_PEBS_OUTPUT_PT) { + pebs_enable_changed(pmu, data); + return 0; + } + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { if (msr_info->host_initiated) @@ -270,6 +300,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; + u64 cnts_mask; pmu->nr_arch_gp_counters = 0; pmu->nr_arch_fixed_counters = 0; @@ -304,9 +335,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ((u64)1 << edx.split.bit_width_fixed) - 1; } - pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | + cnts_mask = ((1ull << pmu->nr_arch_gp_counters) - 1) |
[RFC v1 4/9] KVM: x86: Implement counter reload MSRs read/write emulation
This patch implements the counter reload register MSR_RELOAD_PMCx/FIXED_CTRx read/write emulation. These registers can be accessed only when PEBS is supported in KVM. VMM need to reprogram the counters to make the host PMU framework load the value to real hardware after configuration has been changed. Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/msr-index.h | 3 +++ arch/x86/kvm/vmx/pmu_intel.c | 22 +- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index df966c9..9b930b5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -454,6 +454,7 @@ struct kvm_pmc { enum pmc_type type; u8 idx; u64 counter; + u64 reload_cnt; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a9e8720..6321acb 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -141,6 +141,9 @@ #define MSR_IA32_PERF_CAPABILITIES 0x0345 #define MSR_PEBS_LD_LAT_THRESHOLD 0x03f6 +#define MSR_IA32_RELOAD_PMC0 0x14c1 +#define MSR_IA32_RELOAD_FIXED_CTR0 0x1309 + #define MSR_IA32_RTIT_CTL 0x0570 #define RTIT_CTL_TRACEEN BIT(0) #define RTIT_CTL_CYCLEACC BIT(1) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index fc79cc6..ebd3efc 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -175,7 +175,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || - get_fixed_pmc(pmu, msr, MSR_CORE_PERF_FIXED_CTR0); + get_fixed_pmc(pmu, msr, MSR_CORE_PERF_FIXED_CTR0) || + get_gp_pmc(pmu, msr, MSR_IA32_RELOAD_PMC0) || + get_fixed_pmc(pmu, msr, MSR_IA32_RELOAD_FIXED_CTR0); break; } @@ -216,6 +218,11 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) } else if 
((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { *data = pmc->eventsel; return 0; + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_RELOAD_PMC0)) || + (pmc = get_fixed_pmc(pmu, msr, + MSR_IA32_RELOAD_FIXED_CTR0))) { + *data = pmc->reload_cnt; + return 0; } } @@ -288,6 +295,19 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) reprogram_gp_counter(pmc, data); return 0; } + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_RELOAD_PMC0)) || + (pmc = get_fixed_pmc(pmu, msr, + MSR_IA32_RELOAD_FIXED_CTR0))) { + if (data == pmc->reload_cnt) + return 0; + if (!(data & ~pmc_bitmask(pmc))) { + int pmc_idx = pmc_is_fixed(pmc) ? + pmc->idx + INTEL_PMC_IDX_FIXED : + pmc->idx; + pmc->reload_cnt = data; + reprogram_counter(pmu, pmc_idx); + return 0; + } } } -- 1.8.3.1
[RFC v1 9/9] KVM: x86: Expose PEBS feature to guest
Expose PEBS feature to guest by IA32_MISC_ENABLE[bit12]. IA32_MISC_ENABLE[bit12] is Processor Event Based Sampling (PEBS) Unavailable (RO) flag: 1 = PEBS is not supported; 0 = PEBS is supported. Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/x86.c | 22 +- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 94af338..f6a5630 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1130,6 +1130,7 @@ struct kvm_x86_ops { bool (*xsaves_supported)(void); bool (*umip_emulated)(void); bool (*pt_supported)(void); + bool (*pebs_supported)(void); bool (*pdcm_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8ae6716..2b271fc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6005,6 +6005,11 @@ static bool svm_pt_supported(void) return false; } +static bool svm_pebs_supported(void) +{ + return false; +} + static bool svm_pdcm_supported(void) { return false; @@ -7298,6 +7303,7 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, .pt_supported = svm_pt_supported, + .pebs_supported = svm_pebs_supported, .pdcm_supported = svm_pdcm_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 71e3d42..d85f19b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7782,6 +7782,7 @@ static __exit void hardware_unsetup(void) .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, .pt_supported = vmx_pt_supported, + .pebs_supported = vmx_pebs_supported, .pdcm_supported = vmx_pdcm_supported, .request_immediate_exit = vmx_request_immediate_exit, diff --git a/arch/x86/kvm/x86.c 
b/arch/x86/kvm/x86.c index 290c3c3..8ad501d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2483,6 +2483,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { bool pr = false; + bool update_cpuid = false; u32 msr = msr_info->index; u64 data = msr_info->data; @@ -2563,11 +2564,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; - vcpu->arch.ia32_misc_enable_msr = data; - kvm_update_cpuid(vcpu); - } else { - vcpu->arch.ia32_misc_enable_msr = data; + update_cpuid = true; } + + if (kvm_x86_ops->pebs_supported()) + data &= ~MSR_IA32_MISC_ENABLE_PEBS; + else + data |= MSR_IA32_MISC_ENABLE_PEBS; + + vcpu->arch.ia32_misc_enable_msr = data; + if (update_cpuid) + kvm_update_cpuid(vcpu); break; case MSR_IA32_SMBASE: if (!msr_info->host_initiated) @@ -2875,7 +2882,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; break; case MSR_IA32_MISC_ENABLE: - msr_info->data = vcpu->arch.ia32_misc_enable_msr; + if (kvm_x86_ops->pebs_supported()) + msr_info->data = (vcpu->arch.ia32_misc_enable_msr & + ~MSR_IA32_MISC_ENABLE_PEBS); + else + msr_info->data = (vcpu->arch.ia32_misc_enable_msr | + MSR_IA32_MISC_ENABLE_PEBS); break; case MSR_IA32_SMBASE: if (!msr_info->host_initiated) -- 1.8.3.1
[RFC v1 5/9] KVM: x86: Allocate performance counter for PEBS event
This patch add a new parameter "pebs" that to make the host PMU framework allocate performance counter for guest PEBS event. Signed-off-by: Luwei Kang --- arch/x86/kvm/pmu.c | 23 +++ arch/x86/kvm/pmu.h | 5 +++-- arch/x86/kvm/pmu_amd.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 7 +-- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 46875bb..6bdc282 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -99,7 +99,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, unsigned config, bool exclude_user, bool exclude_kernel, bool intr, - bool in_tx, bool in_tx_cp) + bool in_tx, bool in_tx_cp, bool pebs) { struct perf_event *event; struct perf_event_attr attr = { @@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_user = exclude_user, .exclude_kernel = exclude_kernel, .config = config, + .precise_ip = pebs ? 1 : 0, + .aux_output = pebs ? 1 : 0, }; - attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); + attr.sample_period = pebs ? 
(-pmc->reload_cnt) & pmc_bitmask(pmc) : + (-pmc->counter) & pmc_bitmask(pmc); if (in_tx) attr.config |= HSW_IN_TX; @@ -140,7 +143,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, clear_bit(pmc->idx, (unsigned long*)_to_pmu(pmc)->reprogram_pmi); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel, bool pebs) { unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; @@ -198,11 +201,12 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) !(eventsel & ARCH_PERFMON_EVENTSEL_OS), eventsel & ARCH_PERFMON_EVENTSEL_INT, (eventsel & HSW_IN_TX), - (eventsel & HSW_IN_TX_CHECKPOINTED)); + (eventsel & HSW_IN_TX_CHECKPOINTED), + pebs); } EXPORT_SYMBOL_GPL(reprogram_gp_counter); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) +void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx, bool pebs) { unsigned en_field = ctrl & 0x3; bool pmi = ctrl & 0x8; @@ -228,7 +232,8 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) kvm_x86_ops->pmu_ops->find_fixed_event(idx), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ - pmi, false, false); + pmi, false, false, + pebs); } EXPORT_SYMBOL_GPL(reprogram_fixed_counter); @@ -240,12 +245,14 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) return; if (pmc_is_gp(pmc)) - reprogram_gp_counter(pmc, pmc->eventsel); + reprogram_gp_counter(pmc, pmc->eventsel, + (pmu->pebs_enable & (1ul << pmc_idx))); else { int idx = pmc_idx - INTEL_PMC_IDX_FIXED; u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); - reprogram_fixed_counter(pmc, ctrl, idx); + reprogram_fixed_counter(pmc, ctrl, idx, + (pmu->pebs_enable & (1ul << pmc_idx))); } } EXPORT_SYMBOL_GPL(reprogram_counter); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index c62a1ff..0c59a15 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -102,8 +102,9 @@ static inline struct kvm_pmc 
*get_fixed_pmc(struct kvm_pmu *pmu, u32 msr, return NULL; } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); +void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel, bool pebs); +void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx, + bool pebs); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index c838838..7b3e307 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -248,7 +248,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_i
[RFC v1 8/9] KVM: X86: MSR_IA32_PERF_CAPABILITIES MSR emulation
Expose some bits of definition which relate with enable PEBS to KVM guest especially PEBS via PT feature. Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/msr-index.h | 3 +++ arch/x86/kvm/vmx/vmx.c | 14 ++ 3 files changed, 18 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2d9b0f9..94af338 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -576,6 +576,7 @@ struct kvm_vcpu_arch { u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; + u64 ia32_perf_capabilities; /* * Paging state of the vcpu diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6321acb..4932dec 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -137,6 +137,9 @@ #define MSR_IA32_PEBS_ENABLE 0x03f1 #define MSR_PEBS_DATA_CFG 0x03f2 #define MSR_IA32_DS_AREA 0x0600 +#define MSR_IA32_PERF_CAP_PEBS_TRAP(1UL << 6) +#define MSR_IA32_PERF_CAP_PEBS_ARCH_REG(1UL << 7) +#define MSR_IA32_PERF_CAP_PEBS_REC_FMT (0xfUL << 8) #define MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT (1UL << 16) #define MSR_IA32_PERF_CAPABILITIES 0x0345 #define MSR_PEBS_LD_LAT_THRESHOLD 0x03f6 diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index dbff8f0..71e3d42 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1737,6 +1737,16 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.ia32_xss; break; + case MSR_IA32_PERF_CAPABILITIES: + if (!vmx_pdcm_supported() || !vmx_pebs_supported()) + return 1; + rdmsrl(MSR_IA32_PERF_CAPABILITIES, msr_info->data); + msr_info->data = msr_info->data & + (MSR_IA32_PERF_CAP_PEBS_TRAP | +MSR_IA32_PERF_CAP_PEBS_ARCH_REG | +MSR_IA32_PERF_CAP_PEBS_REC_FMT | +MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT); + break; case MSR_IA32_RTIT_CTL: if (pt_mode != PT_MODE_HOST_GUEST) return 1; @@ -1981,6 +1991,10 @@ static int vmx_set_msr(struct kvm_vcpu 
*vcpu, struct msr_data *msr_info) else clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; + case MSR_IA32_PERF_CAPABILITIES: + if (!vmx_pdcm_supported() || !vmx_pebs_supported()) + return 1; + break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || vmx_rtit_ctl_check(vcpu, data) || -- 1.8.3.1
[RFC v1 6/9] KVM: x86: Add shadow value of PEBS status
The performance counter seen from the guest's perspective may differ from the counter allocated on real hardware (e.g. the guest driver gets counter 0 for PEBS but the host PMU driver may allocate a different counter for this event). Introduce a new field that maps the PEBS enable status from the guest to real hardware. Update the shadow value of PEBS enable before VM-entry when PT is enabled in the guest. Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/pmu.c | 34 ++ arch/x86/kvm/pmu.h | 1 + arch/x86/kvm/vmx/vmx.c | 8 +++- 4 files changed, 43 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9b930b5..07d3b21 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -473,6 +473,7 @@ struct kvm_pmu { u64 global_ovf_ctrl_mask; u64 reserved_bits; u64 pebs_enable; + u64 pebs_enable_shadow; u64 pebs_enable_mask; u8 version; bool pebs_pt; /* PEBS output to Intel PT */ diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 6bdc282..89d3e4c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -257,6 +257,40 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) } EXPORT_SYMBOL_GPL(reprogram_counter); +void kvm_pmu_pebs_shadow(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct perf_event *event; + int i; + + if (!pmu->pebs_pt) + return; + + pmu->pebs_enable_shadow = MSR_IA32_PEBS_OUTPUT_PT; + + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + if (!test_bit(i, (unsigned long *)&pmu->pebs_enable)) + continue; + + event = pmu->gp_counters[i].perf_event; + if (event && (event->hw.idx != -1)) + set_bit(event->hw.idx, + (unsigned long *)&pmu->pebs_enable_shadow); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + if (!test_bit(i + INTEL_PMC_IDX_FIXED, + (unsigned long *)&pmu->pebs_enable)) + continue; + + event = pmu->fixed_counters[i].perf_event; + if (event && (event->hw.idx != -1)) + set_bit(event->hw.idx, + (unsigned long 
*)&pmu->pebs_enable_shadow); + } +} +EXPORT_SYMBOL_GPL(kvm_pmu_pebs_shadow); + void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 0c59a15..81c35c9 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -119,6 +119,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx, void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); +void kvm_pmu_pebs_shadow(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c030c96..4090c08 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1019,6 +1019,7 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) wrmsrl(MSR_IA32_RTIT_CTL, 0); pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); + kvm_pmu_pebs_shadow(&vmx->vcpu); } } @@ -6365,12 +6366,17 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) if (!msrs) return; - for (i = 0; i < nr_msrs; i++) + for (i = 0; i < nr_msrs; i++) { + if (msrs[i].msr == MSR_IA32_PEBS_ENABLE) + msrs[i].guest = + vcpu_to_pmu(&vmx->vcpu)->pebs_enable_shadow; + if (msrs[i].host == msrs[i].guest) clear_atomic_switch_msr(vmx, msrs[i].msr); else add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, msrs[i].host, false); + } } static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) -- 1.8.3.1
[RFC v1 2/9] KVM: x86: PEBS via Intel PT HW feature detection
PEBS can be enabled in KVM guest by direct PEBS record into the Intel Processor Trace output buffer. This patch adds a new flag to detect if PEBS can be supported in KVM guest. It not only need HW support PEBS output Intel PT (IA32_PERF_CAPABILITIES.PEBS_OUTPUT_PT_AVAIL[16]=1) but also depends on: 1. PEBS feature is supported by HW (IA32_MISC_ENABLE[Bit12]=0); 2. Intel PT must be working in HOST_GUEST mode. Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/msr-index.h | 3 +++ arch/x86/kvm/vmx/capabilities.h | 11 +++ arch/x86/kvm/vmx/pmu_intel.c | 7 ++- 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74e88e5..3463326 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -472,6 +472,7 @@ struct kvm_pmu { u64 global_ovf_ctrl_mask; u64 reserved_bits; u8 version; + bool pebs_pt; /* PEBS output to Intel PT */ struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; struct irq_work irq_work; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 271d837..3dd166a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -134,6 +134,7 @@ #define MSR_IA32_PEBS_ENABLE 0x03f1 #define MSR_PEBS_DATA_CFG 0x03f2 #define MSR_IA32_DS_AREA 0x0600 +#define MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT (1UL << 16) #define MSR_IA32_PERF_CAPABILITIES 0x0345 #define MSR_PEBS_LD_LAT_THRESHOLD 0x03f6 @@ -660,6 +661,8 @@ #define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT) #define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT10 #define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX(1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT) +#define MSR_IA32_MISC_ENABLE_PEBS_BIT 12 +#define MSR_IA32_MISC_ENABLE_PEBS (1ULL << MSR_IA32_MISC_ENABLE_PEBS_BIT) #define MSR_IA32_MISC_ENABLE_TM2_BIT 13 #define MSR_IA32_MISC_ENABLE_TM2 (1ULL << 
MSR_IA32_MISC_ENABLE_TM2_BIT) #define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19 diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index d6664ee..4bcb6b4 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -342,4 +342,15 @@ static inline bool cpu_has_vmx_intel_pt(void) (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } +static inline bool cpu_has_vmx_pebs_output_pt(void) +{ + u64 misc, perf_cap; + + rdmsrl(MSR_IA32_MISC_ENABLE, misc); + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); + + return (!(misc & MSR_IA32_MISC_ENABLE_PEBS) && + (perf_cap & MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT)); +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 01441be..e1c987f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -12,6 +12,7 @@ #include #include #include +#include "capabilities.h" #include "x86.h" #include "cpuid.h" #include "lapic.h" @@ -309,10 +310,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); - if (kvm_x86_ops->pt_supported()) + if (kvm_x86_ops->pt_supported()) { pmu->global_ovf_ctrl_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; + if (cpu_has_vmx_pebs_output_pt()) + pmu->pebs_pt = true; + } + entry = kvm_find_cpuid_entry(vcpu, 7, 0); if (entry && (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && -- 1.8.3.1
[RFC v1 7/9] KVM: X86: Expose PDCM cpuid to guest
PDCM (Perfmon and Debug Capability) indicates the processor supports the performance and debug feature indication MSR IA32_PERF_CAPABILITIES. PEBS enabling in KVM guest depend on PEBS via PT, and PEBS via PT is detected by IA32_PERF_CAPABILITIES[Bit16]. Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c| 3 ++- arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx/capabilities.h | 10 ++ arch/x86/kvm/vmx/vmx.c | 1 + 5 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 07d3b21..2d9b0f9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1129,6 +1129,7 @@ struct kvm_x86_ops { bool (*xsaves_supported)(void); bool (*umip_emulated)(void); bool (*pt_supported)(void); + bool (*pdcm_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 22c2720..d12e7af 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -430,6 +430,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; + unsigned f_pdcm = kvm_x86_ops->pdcm_supported() ? 
F(PDCM) : 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -458,7 +459,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | + F(FMA) | F(CX16) | 0 /* xTPR Update */ | f_pdcm | F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e036807..8ae6716 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6005,6 +6005,11 @@ static bool svm_pt_supported(void) return false; } +static bool svm_pdcm_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -7293,6 +7298,7 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, .pt_supported = svm_pt_supported, + .pdcm_supported = svm_pdcm_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 4bcb6b4..82ca51d 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -353,4 +353,14 @@ static inline bool cpu_has_vmx_pebs_output_pt(void) (perf_cap & MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT)); } +static inline bool vmx_pebs_supported(void) +{ + return (cpu_has_vmx_pebs_output_pt() && pt_mode == PT_MODE_HOST_GUEST); +} + +static inline bool vmx_pdcm_supported(void) +{ + return boot_cpu_has(X86_FEATURE_PDCM) && vmx_pebs_supported(); +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4090c08..dbff8f0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7768,6 +7768,7 @@ static __exit void hardware_unsetup(void) 
.xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, .pt_supported = vmx_pt_supported, + .pdcm_supported = vmx_pdcm_supported, .request_immediate_exit = vmx_request_immediate_exit, -- 1.8.3.1
[RFC v1 1/9] KVM: x86: Add base address parameter for get_fixed_pmc function
PEBS output to Intel PT introduces some new MSRs (MSR_RELOAD_FIXED_CTRx) for fixed-function counters, which are used to automatically reload the preset value after a PEBS event is written out. Introduce a base MSR address parameter so that this function can look up the performance monitor counter structure from the MSR_RELOAD_FIXED_CTRx registers. Signed-off-by: Luwei Kang --- arch/x86/kvm/pmu.h | 5 ++--- arch/x86/kvm/vmx/pmu_intel.c | 14 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 58265f7..c62a1ff 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -93,10 +93,9 @@ static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, } /* returns fixed PMC with the specified MSR */ -static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) +static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr, + int base) { - int base = MSR_CORE_PERF_FIXED_CTR0; - if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) return &pmu->fixed_counters[msr - base]; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 4dea0e0..01441be 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -41,7 +41,8 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); struct kvm_pmc *pmc; - pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i, + MSR_CORE_PERF_FIXED_CTR0); if (old_ctrl == new_ctrl) continue; @@ -106,7 +107,8 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) else { u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0, + MSR_CORE_PERF_FIXED_CTR0); } } @@ -155,7 +157,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || 
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || - get_fixed_pmc(pmu, msr); + get_fixed_pmc(pmu, msr, MSR_CORE_PERF_FIXED_CTR0); break; } @@ -185,7 +187,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) u64 val = pmc_read_counter(pmc); *data = val & pmu->counter_bitmask[KVM_PMC_GP]; return 0; - } else if ((pmc = get_fixed_pmc(pmu, msr))) { + } else if ((pmc = get_fixed_pmc(pmu, msr, + MSR_CORE_PERF_FIXED_CTR0))) { u64 val = pmc_read_counter(pmc); *data = val & pmu->counter_bitmask[KVM_PMC_FIXED]; return 0; @@ -243,7 +246,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else pmc->counter = (s32)data; return 0; - } else if ((pmc = get_fixed_pmc(pmu, msr))) { + } else if ((pmc = get_fixed_pmc(pmu, msr, + MSR_CORE_PERF_FIXED_CTR0))) { pmc->counter = data; return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { -- 1.8.3.1
[RFC v1 0/9] PEBS enabling in KVM guest
New Intel hardware introduces some Precise Event-Based Sampling (PEBS) extensions that output the PEBS record to the Intel PT stream instead of the DS area. The PEBS record is packaged in a specific format when output to Intel PT. This patch set enables PEBS functionality in a KVM guest via PEBS output to Intel PT. The native driver is at [1] (still under review). [1] https://www.spinics.net/lists/kernel/msg3215354.html Luwei Kang (9): KVM: x86: Add base address parameter for get_fixed_pmc function KVM: x86: PEBS via Intel PT HW feature detection KVM: x86: Implement MSR_IA32_PEBS_ENABLE read/write emulation KVM: x86: Implement counter reload MSRs read/write emulation KVM: x86: Allocate performance counter for PEBS event KVM: x86: Add shadow value of PEBS status KVM: X86: Expose PDCM cpuid to guest KVM: X86: MSR_IA32_PERF_CAPABILITIES MSR emulation KVM: x86: Expose PEBS feature to guest arch/x86/include/asm/kvm_host.h | 8 arch/x86/include/asm/msr-index.h | 12 ++ arch/x86/kvm/cpuid.c | 3 +- arch/x86/kvm/pmu.c | 57 ++ arch/x86/kvm/pmu.h | 11 ++--- arch/x86/kvm/pmu_amd.c | 2 +- arch/x86/kvm/svm.c | 12 ++ arch/x86/kvm/vmx/capabilities.h | 21 ++ arch/x86/kvm/vmx/pmu_intel.c | 88 +++- arch/x86/kvm/vmx/vmx.c | 24 ++- arch/x86/kvm/x86.c | 22 +++--- 11 files changed, 229 insertions(+), 31 deletions(-) -- 1.8.3.1
[PATCH] KVM: LAPIC: Do not mask the local interrupts when LAPIC is sw disabled
The current code masks all the local interrupts in the local vector table when the LAPIC is disabled by the SVR (Spurious-Interrupt Vector Register) "APIC Software Enable/Disable" flag (bit8). This may block a local interrupt from being delivered to the target vCPU even if the LAPIC is later enabled by setting SVR bit8 to 1. For example, resetting a vCPU masks all the local interrupts and sets the SVR to its default value FFH (LAPIC is disabled because SVR[bit8] == 0). The guest may try to enable some local interrupts (e.g. LVTPC) by clearing bit16 of the LVT entry before enabling the LAPIC. But bit16 can't be cleared while the LAPIC is "software disabled", so this local interrupt stays disabled after the LAPIC is "software enabled". This patch stops masking the local interrupts when the LAPIC is "software disabled" and adds a LAPIC "software enabled" check before delivering a local interrupt. Signed-off-by: Luwei Kang --- arch/x86/kvm/lapic.c | 19 ++- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fcf42a3..a199f47 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1892,15 +1892,6 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) mask |= APIC_SPIV_DIRECTED_EOI; apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { - int i; - u32 lvt_val; - - for (i = 0; i < KVM_APIC_LVT_NUM; i++) { - lvt_val = kvm_lapic_get_reg(apic, - APIC_LVTT + 0x10 * i); - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, -lvt_val | APIC_LVT_MASKED); - } apic_update_lvtt(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -1926,18 +1917,12 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTPC: case APIC_LVT1: case APIC_LVTERR: - /* TODO: Check vector */ - if (!kvm_apic_sw_enabled(apic)) - val |= APIC_LVT_MASKED; - val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; kvm_lapic_set_reg(apic, reg, val); break; case APIC_LVTT: - if (!kvm_apic_sw_enabled(apic)) - val |= APIC_LVT_MASKED; val &= (apic_lvt_mask[0] | 
apic->lapic_timer.timer_mode_mask); kvm_lapic_set_reg(apic, APIC_LVTT, val); apic_update_lvtt(apic); @@ -2260,7 +2245,7 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) u32 reg = kvm_lapic_get_reg(apic, lvt_type); int vector, mode, trig_mode; - if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { + if (apic_enabled(apic) && !(reg & APIC_LVT_MASKED)) { vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; @@ -2363,7 +2348,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (!kvm_apic_hw_enabled(vcpu->arch.apic)) + if (!apic_enabled(vcpu->arch.apic)) r = 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) -- 1.8.3.1
[PATCH v1 5/6] KVM: VMX: Intel PT configuration context switch using XSAVES/XRSTORS
This patch add the support of using XSAVES/XRSTORS to do the Intel processor trace context switch. Because of native driver didn't set the XSS[bit8] to enabled the PT state in xsave area, so this patch only set this bit before XSAVE/XRSTORS intstuction executtion and restore the original value after. The flag "initialized" need to be cleared when PT is change from enabled to disabled. Guest may modify PT MSRs when PT is disabled and they are only saved in variables. We need to reload these value to HW manual when PT is enabled. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx/vmx.c | 80 -- 1 file changed, 65 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4691665..d323e6b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1002,33 +1002,83 @@ static inline void pt_save_msr(struct pt_state *ctx, u32 addr_range) static void pt_guest_enter(struct vcpu_vmx *vmx) { + struct pt_desc *desc; + int err; + if (pt_mode == PT_MODE_SYSTEM) return; - /* -* GUEST_IA32_RTIT_CTL is already set in the VMCS. -* Save host state before VM entry. -*/ - rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc->host_ctx->rtit_ctl); - if (vmx->pt_desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) { - wrmsrl(MSR_IA32_RTIT_CTL, 0); - pt_save_msr(vmx->pt_desc->host_ctx, vmx->pt_desc->addr_range); - pt_load_msr(vmx->pt_desc->guest_ctx, vmx->pt_desc->addr_range); + desc = vmx->pt_desc; + + rdmsrl(MSR_IA32_RTIT_CTL, desc->host_ctx->rtit_ctl); + + if (desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) { + if (likely(desc->pt_xsave)) { + wrmsrl(MSR_IA32_XSS, host_xss | XFEATURE_MASK_PT); + /* +* XSAVES instruction will clears the TeaceEn after +* saving the value of RTIT_CTL and before saving any +* other PT state. +*/ + XSTATE_XSAVE(>host_xs->state.xsave, + XFEATURE_MASK_PT, 0, err); + /* +* Still need to load the guest PT state manual if +* PT stste not populated in xsave area. 
+*/ + if (desc->guest_xs->initialized) + XSTATE_XRESTORE(>guest_xs->state.xsave, + XFEATURE_MASK_PT, 0); + else + pt_load_msr(desc->guest_ctx, desc->addr_range); + + wrmsrl(MSR_IA32_XSS, host_xss); + } else { + if (desc->host_ctx->rtit_ctl & RTIT_CTL_TRACEEN) + wrmsrl(MSR_IA32_RTIT_CTL, 0); + + pt_save_msr(desc->host_ctx, desc->addr_range); + pt_load_msr(desc->guest_ctx, desc->addr_range); + } } } static void pt_guest_exit(struct vcpu_vmx *vmx) { + struct pt_desc *desc; + int err; + if (pt_mode == PT_MODE_SYSTEM) return; - if (vmx->pt_desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) { - pt_save_msr(vmx->pt_desc->guest_ctx, vmx->pt_desc->addr_range); - pt_load_msr(vmx->pt_desc->host_ctx, vmx->pt_desc->addr_range); - } + desc = vmx->pt_desc; + + if (desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) { + if (likely(desc->pt_xsave)) { + wrmsrl(MSR_IA32_XSS, host_xss | XFEATURE_MASK_PT); + /* +* Save guest state. TraceEn is 0 before and after +* XSAVES instruction because RTIT_CTL will be cleared +* on VM-exit (VM Exit control bit25). +*/ + XSTATE_XSAVE(>guest_xs->state.xsave, + XFEATURE_MASK_PT, 0, err); + desc->guest_xs->initialized = 1; + /* +* Resume host PT state and PT may enabled after this +* instruction if host PT is enabled before VM-entry. +*/ + XSTATE_XRESTORE(>host_xs->state.xsave, + XFEATURE_MASK_PT, 0); + wrmsrl(MSR_IA32_XSS, host_xss); + } else { + pt_save_msr(desc->guest_ctx, desc->addr_range); + pt_load_msr(desc->host_ctx, desc->addr_ra
[PATCH v1 2/6] KVM: VMX: Reuse the pt_state structure for PT context
Remove the previous pt_ctx structure and use pt_state to save the PT configuration because they are saved the same things. Add *_ctx postfix to different with the upcoming host and guest fpu pointer for PT state. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c| 96 +-- arch/x86/kvm/vmx/vmx.h| 16 +--- 3 files changed, 46 insertions(+), 68 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f4b1ae4..e8d5c61 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4201,7 +4201,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) vmx->nested.vmxon = true; if (pt_mode == PT_MODE_HOST_GUEST) { - vmx->pt_desc.guest.ctl = 0; + vmx->pt_desc.guest_ctx.rtit_ctl = 0; pt_update_intercept_for_msr(vmx); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0db7ded..4234e40e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -976,32 +976,28 @@ static unsigned long segment_base(u16 selector) } #endif -static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) +static inline void pt_load_msr(struct pt_state *ctx, u32 addr_range) { u32 i; - wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); - for (i = 0; i < addr_range; i++) { - wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); - } + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->rtit_output_base); + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->rtit_output_mask); + wrmsrl(MSR_IA32_RTIT_STATUS, ctx->rtit_status); + wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->rtit_cr3_match); + for (i = 0; i < addr_range * 2; i++) + wrmsrl(MSR_IA32_RTIT_ADDR0_A + i, ctx->rtit_addrx_ab[i]); } -static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) +static inline void pt_save_msr(struct pt_state *ctx, u32 addr_range) { 
u32 i; - rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); - for (i = 0; i < addr_range; i++) { - rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); - } + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->rtit_output_base); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->rtit_output_mask); + rdmsrl(MSR_IA32_RTIT_STATUS, ctx->rtit_status); + rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->rtit_cr3_match); + for (i = 0; i < addr_range; i++) + rdmsrl(MSR_IA32_RTIT_ADDR0_A + i, ctx->rtit_addrx_ab[i]); } static void pt_guest_enter(struct vcpu_vmx *vmx) @@ -1013,11 +1009,11 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) * GUEST_IA32_RTIT_CTL is already set in the VMCS. * Save host state before VM entry. */ - rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); - if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host_ctx.rtit_ctl); + if (vmx->pt_desc.guest_ctx.rtit_ctl & RTIT_CTL_TRACEEN) { wrmsrl(MSR_IA32_RTIT_CTL, 0); - pt_save_msr(>pt_desc.host, vmx->pt_desc.addr_range); - pt_load_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + pt_save_msr(>pt_desc.host_ctx, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.guest_ctx, vmx->pt_desc.addr_range); } } @@ -1026,13 +1022,13 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) if (pt_mode == PT_MODE_SYSTEM) return; - if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { - pt_save_msr(>pt_desc.guest, vmx->pt_desc.addr_range); - pt_load_msr(>pt_desc.host, vmx->pt_desc.addr_range); + if (vmx->pt_desc.guest_ctx.rtit_ctl & RTIT_CTL_TRACEEN) { + pt_save_msr(>pt_desc.guest_ctx, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.host_ctx, vmx->pt_desc.addr_range); } /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). 
*/ - wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host_ctx.rtit_ctl); } void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) @@ -1402,8 +1398,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 da
[PATCH v1 6/6] KVM: VMX: Get PT state from xsave area to variables
This patch copies the Intel PT state from the xsave area to variables when PT changes from enabled to disabled. The PT state is saved/restored to/from the xsave area by the XSAVES/XRSTORS instructions while Intel PT is enabled, so when the KVM guest reads these MSRs after PT has been disabled, the real values are in the xsave area, not in the variables. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx/vmx.c | 13 + 1 file changed, 13 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d323e6b..d3e2569 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1000,6 +1000,16 @@ static inline void pt_save_msr(struct pt_state *ctx, u32 addr_range) rdmsrl(MSR_IA32_RTIT_ADDR0_A + i, ctx->rtit_addrx_ab[i]); } +static void pt_state_get(struct pt_state *ctx, struct fpu *fpu, u32 addr_range) +{ + char *buff = fpu->state.xsave.extended_state_area; + + /* skip riti_ctl register */ + memcpy(>rtit_output_base, buff + sizeof(u64), + sizeof(struct pt_state) - sizeof(u64) + + sizeof(u64) * addr_range * 2); +} + static void pt_guest_enter(struct vcpu_vmx *vmx) { struct pt_desc *desc; @@ -1040,6 +1050,9 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) pt_save_msr(desc->host_ctx, desc->addr_range); pt_load_msr(desc->guest_ctx, desc->addr_range); } + } else if (desc->pt_xsave && desc->guest_xs->initialized) { + pt_state_get(desc->guest_ctx, desc->guest_xs, desc->addr_range); + desc->guest_xs->initialized = 0; } } -- 1.8.3.1
[PATCH v1 3/6] KVM: VMX: Dynamically allocate Intel PT configuration state
This patch change the Intel PT configuration state to structure pointer so that we only need to allocate the state buffer when Intel PT working in HOST_GUEST mode. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c| 202 +++--- arch/x86/kvm/vmx/vmx.h| 6 +- 3 files changed, 121 insertions(+), 89 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e8d5c61..349be88 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4201,7 +4201,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) vmx->nested.vmxon = true; if (pt_mode == PT_MODE_HOST_GUEST) { - vmx->pt_desc.guest_ctx.rtit_ctl = 0; + vmx->pt_desc->guest_ctx->rtit_ctl = 0; pt_update_intercept_for_msr(vmx); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4234e40e..4595230 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1009,11 +1009,11 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) * GUEST_IA32_RTIT_CTL is already set in the VMCS. * Save host state before VM entry. 
*/ - rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host_ctx.rtit_ctl); - if (vmx->pt_desc.guest_ctx.rtit_ctl & RTIT_CTL_TRACEEN) { + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc->host_ctx->rtit_ctl); + if (vmx->pt_desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) { wrmsrl(MSR_IA32_RTIT_CTL, 0); - pt_save_msr(>pt_desc.host_ctx, vmx->pt_desc.addr_range); - pt_load_msr(>pt_desc.guest_ctx, vmx->pt_desc.addr_range); + pt_save_msr(vmx->pt_desc->host_ctx, vmx->pt_desc->addr_range); + pt_load_msr(vmx->pt_desc->guest_ctx, vmx->pt_desc->addr_range); } } @@ -1022,13 +1022,35 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) if (pt_mode == PT_MODE_SYSTEM) return; - if (vmx->pt_desc.guest_ctx.rtit_ctl & RTIT_CTL_TRACEEN) { - pt_save_msr(>pt_desc.guest_ctx, vmx->pt_desc.addr_range); - pt_load_msr(>pt_desc.host_ctx, vmx->pt_desc.addr_range); + if (vmx->pt_desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) { + pt_save_msr(vmx->pt_desc->guest_ctx, vmx->pt_desc->addr_range); + pt_load_msr(vmx->pt_desc->host_ctx, vmx->pt_desc->addr_range); } /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). 
*/ - wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host_ctx.rtit_ctl); + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc->host_ctx->rtit_ctl); +} + +static int pt_init(struct vcpu_vmx *vmx) +{ + u32 pt_state_sz = sizeof(struct pt_state) + sizeof(u64) * + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2; + + vmx->pt_desc = kzalloc(sizeof(struct pt_desc) + pt_state_sz * 2, + GFP_KERNEL_ACCOUNT); + if (!vmx->pt_desc) + return -ENOMEM; + + vmx->pt_desc->host_ctx = (struct pt_state *)(vmx->pt_desc + 1); + vmx->pt_desc->guest_ctx = (void *)vmx->pt_desc->host_ctx + pt_state_sz; + + return 0; +} + +static void pt_uninit(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_HOST_GUEST) + kfree(vmx->pt_desc); } void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) @@ -1391,15 +1413,16 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) * Any MSR write that attempts to change bits marked reserved will * case a #GP fault. */ - if (data & vmx->pt_desc.ctl_bitmask) + if (data & vmx->pt_desc->ctl_bitmask) return 1; /* * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will * result in a #GP unless the same write also clears TraceEn. */ - if ((vmx->pt_desc.guest_ctx.rtit_ctl & RTIT_CTL_TRACEEN) && - ((vmx->pt_desc.guest_ctx.rtit_ctl ^ data) & ~RTIT_CTL_TRACEEN)) + if ((vmx->pt_desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) && + ((vmx->pt_desc->guest_ctx->rtit_ctl ^ data) & + ~RTIT_CTL_TRACEEN)) return 1; /* @@ -1409,7 +1432,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) */ if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && !(data & RTIT_CTL_FABRIC_EN) && - !intel_pt_validate_cap(vmx->pt_desc.caps, + !intel_pt_validate_cap(vmx->pt_desc->caps, PT_CAP_single_range_output)) r
[PATCH v1 4/6] KVM: VMX: Allocate XSAVE area for Intel PT configuration
Allocate XSAVE area for host and guest Intel PT configuration when Intel PT working in HOST_GUEST mode. Intel PT configuration state can be saved using XSAVES and restored by XRSTORS instruction. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx/vmx.c | 25 - arch/x86/kvm/vmx/vmx.h | 3 +++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4595230..4691665 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1033,6 +1033,7 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) static int pt_init(struct vcpu_vmx *vmx) { + unsigned int eax, ebx, ecx, edx; u32 pt_state_sz = sizeof(struct pt_state) + sizeof(u64) * intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2; @@ -1044,13 +1045,35 @@ static int pt_init(struct vcpu_vmx *vmx) vmx->pt_desc->host_ctx = (struct pt_state *)(vmx->pt_desc + 1); vmx->pt_desc->guest_ctx = (void *)vmx->pt_desc->host_ctx + pt_state_sz; + cpuid_count(XSTATE_CPUID, 1, , , , ); + if (ecx & XFEATURE_MASK_PT) { + vmx->pt_desc->host_xs = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + vmx->pt_desc->guest_xs = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vmx->pt_desc->host_xs || !vmx->pt_desc->guest_xs) { + if (vmx->pt_desc->host_xs) + kmem_cache_free(x86_fpu_cache, + vmx->pt_desc->host_xs); + if (vmx->pt_desc->guest_xs) + kmem_cache_free(x86_fpu_cache, + vmx->pt_desc->guest_xs); + } else + vmx->pt_desc->pt_xsave = true; + } + return 0; } static void pt_uninit(struct vcpu_vmx *vmx) { - if (pt_mode == PT_MODE_HOST_GUEST) + if (pt_mode == PT_MODE_HOST_GUEST) { kfree(vmx->pt_desc); + if (vmx->pt_desc->pt_xsave) { + kmem_cache_free(x86_fpu_cache, vmx->pt_desc->host_xs); + kmem_cache_free(x86_fpu_cache, vmx->pt_desc->guest_xs); + } + } } void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 283f69d..e103991 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -69,8 
+69,11 @@ struct pt_desc { u64 ctl_bitmask; u32 addr_range; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; + bool pt_xsave; struct pt_state *host_ctx; struct pt_state *guest_ctx; + struct fpu *host_xs; + struct fpu *guest_xs; }; /* -- 1.8.3.1
[PATCH v1 0/6] KVM: VMX: Intel PT configuration switch using XSAVES/XRSTORS on VM-Entry/Exit
This patch set reduces the overhead of switching the Intel PT configuration context on VM-Entry/Exit by using the XSAVES/XRSTORS instructions. I measured the cycle counts of the context switch for the manual and the XSAVES/XRSTORS approaches with rdtsc, and the data is as below: Manual save(rdmsr): ~334 cycles Manual restore(wrmsr): ~1668 cycles XSAVES instruction: ~124 cycles XRSTORS instruction: ~378 cycles Manual: Switch the configuration with rdmsr and wrmsr instructions; there are 8 registers that need to be saved or restored. They are IA32_RTIT_OUTPUT_BASE, *_OUTPUT_MASK_PTRS, *_STATUS, *_CR3_MATCH, *_ADDR0_A, *_ADDR0_B, *_ADDR1_A, *_ADDR1_B. XSAVES/XRSTORS: Switch the configuration context with the XSAVES/XRSTORS instructions. This patch set allocates separate "struct fpu" structures to save the host and guest PT state. Only a small portion of each structure is used because we only save/restore the PT state (not AVX, AVX-512, MPX, PKRU and so on). This patch set also does some code cleanup, e.g. patch 2 reuses the fpu pt_state to save the PT configuration context and patch 3 dynamically allocates the Intel PT configuration state. Luwei Kang (6): x86/fpu: Introduce new fpu state for Intel processor trace KVM: VMX: Reuse the pt_state structure for PT context KVM: VMX: Dynamically allocate Intel PT configuration state KVM: VMX: Allocate XSAVE area for Intel PT configuration KVM: VMX: Intel PT configuration context switch using XSAVES/XRSTORS KVM: VMX: Get PT state from xsave area to variables arch/x86/include/asm/fpu/types.h | 13 ++ arch/x86/kvm/vmx/nested.c| 2 +- arch/x86/kvm/vmx/vmx.c | 338 ++- arch/x86/kvm/vmx/vmx.h | 21 +-- 4 files changed, 243 insertions(+), 131 deletions(-) -- 1.8.3.1
[PATCH v1 1/6] x86/fpu: Introduce new fpu state for Intel processor trace
Introduce a new fpu state structure, pt_state, to save the Intel Processor Trace configuration. The upcoming patches that use XSAVES/XRSTORS to switch the Intel PT configuration on VM-Entry/Exit will use this structure. Signed-off-by: Luwei Kang --- arch/x86/include/asm/fpu/types.h | 13 + 1 file changed, 13 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 2e32e17..8cbb42e 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -221,6 +221,19 @@ struct avx_512_hi16_state { } __packed; /* + * State component 8 is used for some 64-bit registers + * of Intel processor trace. + */ +struct pt_state { + u64 rtit_ctl; + u64 rtit_output_base; + u64 rtit_output_mask; + u64 rtit_status; + u64 rtit_cr3_match; + u64 rtit_addrx_ab[0]; +} __packed; + +/* * State component 9: 32-bit PKRU register. The state is * 8 bytes long but only 4 bytes is used currently. */ -- 1.8.3.1
[PATCH v2 2/2] KVM: x86: Add support for clearing Trace_ToPA_PMI status
Add support of clear Intel PT ToPA PMI status for KVM guest. Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/msr-index.h | 4 arch/x86/kvm/vmx/pmu_intel.c | 8 +++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4660ce9..de95704 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -468,6 +468,7 @@ struct kvm_pmu { u64 global_ovf_ctrl; u64 counter_bitmask[2]; u64 global_ctrl_mask; + u64 global_ovf_ctrl_mask; u64 reserved_bits; u8 version; struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ae01fb0..c0ea4aa 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -778,6 +778,10 @@ /* PERF_GLOBAL_OVF_CTL bits */ #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT 55 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT) +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT 62 +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT) +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT63 +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD(1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT) /* Geode defined MSRs */ #define MSR_GEODE_BUSCONT_CONF00x1900 diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5ab4a36..6dee7cf 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -227,7 +227,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62 { + if (!(data & pmu->global_ovf_ctrl_mask)) { if (!msr_info->host_initiated) pmu->global_status &= ~data; pmu->global_ovf_ctrl = data; @@ -297,6 +297,12 @@ static void intel_pmu_refresh(struct 
kvm_vcpu *vcpu) pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; + pmu->global_ovf_ctrl_mask = ~(pmu->global_ctrl | + MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | + MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); + if (kvm_x86_ops->pt_supported()) + pmu->global_ovf_ctrl_mask &= + ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry(vcpu, 7, 0); if (entry && -- 1.8.3.1
[PATCH v2 0/2] Inject a PMI for KVM Guest when ToPA buffer is filled
Each Intel Processor Trace Table of Physical Addresses (ToPA) entry has an INT bit. If this bit is set, the processor will signal a performance-monitoring interrupt (PMI) when the corresponding trace output region is filled. This patch set injects a PMI for Intel Processor Trace into the KVM guest when the ToPA buffer is filled. Changes from v1: - Exporting a global function pointer may not be a good choice. Add a new member to kvm_guest_cbs to send the Intel PT PMI to the KVM guest. Luwei Kang (2): KVM: x86: Inject PMI for KVM guest KVM: x86: Add support of clear Trace_ToPA_PMI status arch/x86/events/intel/core.c | 6 +- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/msr-index.h | 8 arch/x86/kvm/vmx/pmu_intel.c | 8 +++- arch/x86/kvm/x86.c | 10 ++ include/linux/perf_event.h | 1 + 6 files changed, 32 insertions(+), 2 deletions(-) -- 1.8.3.1
[PATCH v2 1/2] KVM: x86: Inject PMI for KVM guest
Inject a PMI for KVM guest when Intel PT working in Host-Guest mode and Guest ToPA entry memory buffer was completely filled. Signed-off-by: Luwei Kang --- arch/x86/events/intel/core.c | 6 +- arch/x86/include/asm/msr-index.h | 4 arch/x86/kvm/x86.c | 10 ++ include/linux/perf_event.h | 1 + 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 730978d..37cecff 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2273,7 +2273,11 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(55, (unsigned long *))) { handled++; - intel_pt_interrupt(); + if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() && + perf_guest_cbs->handle_intel_pt_intr)) + perf_guest_cbs->handle_intel_pt_intr(); + else + intel_pt_interrupt(); } /* diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8e40c24..ae01fb0 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -775,6 +775,10 @@ #define MSR_CORE_PERF_GLOBAL_CTRL 0x038f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x0390 +/* PERF_GLOBAL_OVF_CTL bits */ +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT 55 +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT) + /* Geode defined MSRs */ #define MSR_GEODE_BUSCONT_CONF00x1900 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 941f932..d1f4e0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6795,10 +6795,20 @@ static unsigned long kvm_get_guest_ip(void) return ip; } +static void kvm_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)>arch.pmu.global_status); +} + static struct perf_guest_info_callbacks kvm_guest_cbs = { .is_in_guest= kvm_is_in_guest, 
.is_user_mode = kvm_is_user_mode, .get_guest_ip = kvm_get_guest_ip, + .handle_intel_pt_intr = kvm_handle_intel_pt_intr, }; static void kvm_set_mmio_spte_mask(void) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e1a0517..2b26a34 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -30,6 +30,7 @@ struct perf_guest_info_callbacks { int (*is_in_guest)(void); int (*is_user_mode)(void); unsigned long (*get_guest_ip)(void); + void(*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT -- 1.8.3.1
[PATCH V4] KVM: x86: Sync the pending Posted-Interrupts
Some Posted-Interrupts from passthrough devices may be lost or overwritten when the vCPU is in runnable state. The SN (Suppress Notification) of PID (Posted Interrupt Descriptor) will be set when the vCPU is preempted (vCPU in KVM_MP_STATE_RUNNABLE state but not running on physical CPU). If a posted interrupt coming at this time, the irq remmaping facility will set the bit of PIR (Posted Interrupt Requests) without ON (Outstanding Notification). So this interrupt can't be sync to APIC virtualization register and will not be handled by Guest because ON is zero. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx/vmx.c | 26 +++--- arch/x86/kvm/vmx/vmx.h | 6 ++ arch/x86/kvm/x86.c | 2 +- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f6915f1..fe59199 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1192,21 +1192,6 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; - /* -* First handle the simple case where no cmpxchg is necessary; just -* allow posting non-urgent interrupts. -* -* If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change -* PI.NDST: pi_post_block will do it for us and the wakeup_handler -* expects the VCPU to be on the blocked_vcpu_list that matches -* PI.NDST. -*/ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || - vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - return; - } - /* The full case. */ do { old.control = new.control = pi_desc->control; @@ -1221,6 +1206,17 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.sn = 0; } while (cmpxchg64(_desc->control, old.control, new.control) != old.control); + + /* +* Clear SN before reading the bitmap. The VT-d firmware +* writes the bitmap and reads SN atomically (5.2.3 in the +* spec), so it doesn't really have a memory barrier that +* pairs with this, but we cannot do that and we need one. 
+*/ + smp_mb__after_atomic(); + + if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) + pi_set_on(pi_desc); } /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 9932895..a4527e1 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -349,6 +349,12 @@ static inline void pi_set_sn(struct pi_desc *pi_desc) (unsigned long *)_desc->control); } +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, + (unsigned long *)_desc->control); +} + static inline void pi_clear_on(struct pi_desc *pi_desc) { clear_bit(POSTED_INTR_ON, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d32b8f..ebd6737 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7795,7 +7795,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * 1) We should set ->mode before checking ->requests. Please see * the comment in kvm_vcpu_exiting_guest_mode(). * -* 2) For APICv, we should set ->mode before checking PIR.ON. This +* 2) For APICv, we should set ->mode before checking PID.ON. This * pairs with the memory barrier implicit in pi_test_and_set_on * (see vmx_deliver_posted_interrupt). * -- 1.8.3.1
[PATCH V3] KVM: x86: Sync the pending Posted-Interrupts
Some Posted-Interrupts from passthrough devices may be lost or overwritten when the vCPU is in runnable state. The SN (Suppress Notification) of PID (Posted Interrupt Descriptor) will be set when the vCPU is preempted (vCPU in KVM_MP_STATE_RUNNABLE state but not running on physical CPU). If a posted interrupt coming at this time, the irq remmaping facility will set the bit of PIR (Posted Interrupt Requests) without ON (Outstanding Notification). So this interrupt can't be sync to APIC virtualization register and will not be handled by Guest because ON is zero. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx/vmx.c | 5 + arch/x86/kvm/x86.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4341175..8ed9634 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1221,6 +1221,11 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.sn = 0; } while (cmpxchg64(_desc->control, old.control, new.control) != old.control); + + smp_mb__after_atomic(); + + if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) + pi_test_and_set_on(pi_desc); } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d27206..5bcf2c4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7794,7 +7794,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * 1) We should set ->mode before checking ->requests. Please see * the comment in kvm_vcpu_exiting_guest_mode(). * -* 2) For APICv, we should set ->mode before checking PIR.ON. This +* 2) For APICv, we should set ->mode before checking PID.PIR. This * pairs with the memory barrier implicit in pi_test_and_set_on * (see vmx_deliver_posted_interrupt). * -- 1.8.3.1
[PATCH v2] KVM: x86: Sync the pending Posted-Interrupts
Some Posted-Interrupts from passthrough devices may be lost or overwritten when the vCPU is in runnable state. The SN (Suppress Notification) of PID (Posted Interrupt Descriptor) will be set when the vCPU is preempted (vCPU in KVM_MP_STATE_RUNNABLE state but not running on physical CPU). If a posted interrupt coming at this time, the irq remmaping facility will set the bit of PIR (Posted Interrupt Requests) without ON (Outstanding Notification). So this interrupt can't be sync to APIC virtualization register and will not be handled by Guest because ON is zero. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f6915f1..820a03b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6048,7 +6048,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) bool max_irr_updated; WARN_ON(!vcpu->arch.apicv_active); - if (pi_test_on(>pi_desc)) { + if (!bitmap_empty((unsigned long *)vmx->pi_desc.pir, NR_VECTORS)) { pi_clear_on(>pi_desc); /* * IOMMU can write to PIR.ON, so the barrier matters even on UP. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 02c8e09..c31b608 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7793,7 +7793,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * 1) We should set ->mode before checking ->requests. Please see * the comment in kvm_vcpu_exiting_guest_mode(). * -* 2) For APICv, we should set ->mode before checking PIR.ON. This +* 2) For APICv, we should set ->mode before checking PID.PIR. This * pairs with the memory barrier implicit in pi_test_and_set_on * (see vmx_deliver_posted_interrupt). * -- 1.8.3.1
[PATCH 3/3] KVM: x86: Add support for clearing Trace_ToPA_PMI status
Add support of clear Intel PT ToPA PMI status for KVM guest. Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/msr-index.h | 4 arch/x86/kvm/vmx/pmu_intel.c | 8 +++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4660ce9..de95704 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -468,6 +468,7 @@ struct kvm_pmu { u64 global_ovf_ctrl; u64 counter_bitmask[2]; u64 global_ctrl_mask; + u64 global_ovf_ctrl_mask; u64 reserved_bits; u8 version; struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ae01fb0..c0ea4aa 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -778,6 +778,10 @@ /* PERF_GLOBAL_OVF_CTL bits */ #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT 55 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT) +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT 62 +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT) +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT63 +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD(1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT) /* Geode defined MSRs */ #define MSR_GEODE_BUSCONT_CONF00x1900 diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5ab4a36..6dee7cf 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -227,7 +227,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62 { + if (!(data & pmu->global_ovf_ctrl_mask)) { if (!msr_info->host_initiated) pmu->global_status &= ~data; pmu->global_ovf_ctrl = data; @@ -297,6 +297,12 @@ static void intel_pmu_refresh(struct 
kvm_vcpu *vcpu) pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; + pmu->global_ovf_ctrl_mask = ~(pmu->global_ctrl | + MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | + MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); + if (kvm_x86_ops->pt_supported()) + pmu->global_ovf_ctrl_mask &= + ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry(vcpu, 7, 0); if (entry && -- 1.8.3.1
[PATCH 2/3] perf/x86/intel/pt: Inject PMI for KVM guest
Inject a PMI into the KVM guest when Intel PT is working in Host-Guest mode and the guest's ToPA entry memory buffer has been completely filled. The definitions of ‘kvm_make_request’ and ‘KVM_REQ_PMI’ depend on the "linux/kvm_host.h" header. Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 12 +++- arch/x86/include/asm/intel_pt.h | 1 + arch/x86/include/asm/msr-index.h | 4 arch/x86/kvm/x86.h | 6 ++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 9494ca6..09375bd 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -33,7 +34,8 @@ #include "../perf_event.h" #include "pt.h" -static DEFINE_PER_CPU(struct pt, pt_ctx); +DEFINE_PER_CPU(struct pt, pt_ctx); +EXPORT_PER_CPU_SYMBOL_GPL(pt_ctx); static struct pt_pmu pt_pmu; @@ -1260,6 +1262,14 @@ void intel_pt_interrupt(void) struct pt_buffer *buf; struct perf_event *event = pt->handle.event; + if (pt->vcpu) { + /* Inject PMI to Guest */ + kvm_make_request(KVM_REQ_PMI, pt->vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)>vcpu->arch.pmu.global_status); + return; + } + /* * There may be a dangling PT bit in the interrupt status register * after PT has been disabled by pt_event_stop(). 
Make sure we don't diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index ee960fb..32da2e9 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -62,6 +62,7 @@ struct pt { struct pt_filters filters; int handle_nmi; int vmx_on; + struct kvm_vcpu *vcpu; }; #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8e40c24..ae01fb0 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -775,6 +775,10 @@ #define MSR_CORE_PERF_GLOBAL_CTRL 0x038f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x0390 +/* PERF_GLOBAL_OVF_CTL bits */ +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT 55 +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT) + /* Geode defined MSRs */ #define MSR_GEODE_BUSCONT_CONF00x1900 diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 224cd0a..a9ee498 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -4,6 +4,7 @@ #include #include +#include #include "kvm_cache_regs.h" #define KVM_DEFAULT_PLE_GAP128 @@ -331,15 +332,20 @@ static inline bool kvm_pause_in_guest(struct kvm *kvm) } DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu); +DECLARE_PER_CPU(struct pt, pt_ctx); static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) { __this_cpu_write(current_vcpu, vcpu); + if (kvm_x86_ops->pt_supported()) + this_cpu_ptr(_ctx)->vcpu = vcpu; } static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) { __this_cpu_write(current_vcpu, NULL); + if (kvm_x86_ops->pt_supported()) + this_cpu_ptr(_ctx)->vcpu = NULL; } #endif -- 1.8.3.1
[PATCH 1/3] perf/x86/intel/pt: Move pt structure to global header
The Intel PT structure (struct pt) is in a private header. Move it (and its sub-structures) to a global header so that it is accessible from KVM code. The perf_output_handle structure used by struct pt is defined in "linux/perf_event.h". Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.h | 38 -- arch/x86/include/asm/intel_pt.h | 40 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 269e15a..964948f 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -93,42 +93,4 @@ struct pt_buffer { struct topa_entry *topa_index[0]; }; -#define PT_FILTERS_NUM 4 - -/** - * struct pt_filter - IP range filter configuration - * @msr_a: range start, goes to RTIT_ADDRn_A - * @msr_b: range end, goes to RTIT_ADDRn_B - * @config:4-bit field in RTIT_CTL - */ -struct pt_filter { - unsigned long msr_a; - unsigned long msr_b; - unsigned long config; -}; - -/** - * struct pt_filters - IP range filtering context - * @filter:filters defined for this context - * @nr_filters:number of defined filters in the @filter array - */ -struct pt_filters { - struct pt_filterfilter[PT_FILTERS_NUM]; - unsigned intnr_filters; -}; - -/** - * struct pt - per-cpu pt context - * @handle:perf output handle - * @filters: last configured filters - * @handle_nmi:do handle PT PMI on this cpu, there's an active event - * @vmx_on:1 if VMX is ON on this cpu - */ -struct pt { - struct perf_output_handle handle; - struct pt_filters filters; - int handle_nmi; - int vmx_on; -}; - #endif /* __INTEL_PT_H__ */ diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 634f99b..ee960fb 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_INTEL_PT_H #define _ASM_X86_INTEL_PT_H +#include + #define PT_CPUID_LEAVES2 #define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ @@ -24,6 +26,44 @@ enum pt_capabilities { PT_CAP_psb_periods, }; 
+#define PT_FILTERS_NUM 4 + +/** + * struct pt_filter - IP range filter configuration + * @msr_a: range start, goes to RTIT_ADDRn_A + * @msr_b: range end, goes to RTIT_ADDRn_B + * @config:4-bit field in RTIT_CTL + */ +struct pt_filter { + unsigned long msr_a; + unsigned long msr_b; + unsigned long config; +}; + +/** + * struct pt_filters - IP range filtering context + * @filter:filters defined for this context + * @nr_filters:number of defined filters in the @filter array + */ +struct pt_filters { + struct pt_filterfilter[PT_FILTERS_NUM]; + unsigned intnr_filters; +}; + +/** + * struct pt - per-cpu pt context + * @handle:perf output handle + * @filters: last configured filters + * @handle_nmi:do handle PT PMI on this cpu, there's an active event + * @vmx_on:1 if VMX is ON on this cpu + */ +struct pt { + struct perf_output_handle handle; + struct pt_filters filters; + int handle_nmi; + int vmx_on; +}; + #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) void cpu_emergency_stop_pt(void); extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap); -- 1.8.3.1
[PATCH 0/3] Inject a PMI for KVM Guest when ToPA buffer is filled
Each Intel Processor Trace Table of Physical Addresses (ToPA) entry has an INT bit. If this bit is set, the processor signals a performance-monitoring interrupt (PMI) when the corresponding trace output region is filled. This patch set injects a PMI into the KVM guest for Intel Processor Trace when the ToPA buffer is filled. Luwei Kang (3): perf/x86/intel/pt: Move pt structure to global header perf/x86/intel/pt: Inject PMI for KVM guest KVM: x86: Add support of clear Trace_ToPA_PMI status arch/x86/events/intel/pt.c | 12 +++- arch/x86/events/intel/pt.h | 38 - arch/x86/include/asm/intel_pt.h | 41 arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/msr-index.h | 8 arch/x86/kvm/vmx/pmu_intel.c | 8 +++- arch/x86/kvm/x86.h | 6 ++ 7 files changed, 74 insertions(+), 40 deletions(-) -- 1.8.3.1
[PATCH] KVM: x86: Sync the pending Posted-Interrupts
Some posted interrupts from passthrough devices may be lost or overwritten when the vCPU is in the runnable state. The SN (Suppress Notification) bit of the PID (Posted Interrupt Descriptor) is set when the vCPU is preempted (vCPU in KVM_MP_STATE_RUNNABLE state but not running on a physical CPU). If a posted interrupt arrives at this time, the IRQ remapping facility sets the corresponding bit in PIR (Posted Interrupt Requests) but does not set ON (Outstanding Notification). As a result, the interrupt is never synced to the APIC virtualization registers and is not handled by the guest, because ON is zero. Fix this by checking whether PIR is non-empty instead of testing ON. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f6915f1..820a03b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6048,7 +6048,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) bool max_irr_updated; WARN_ON(!vcpu->arch.apicv_active); - if (pi_test_on(>pi_desc)) { + if (!bitmap_empty((unsigned long *)vmx->pi_desc.pir, NR_VECTORS)) { pi_clear_on(>pi_desc); /* * IOMMU can write to PIR.ON, so the barrier matters even on UP. -- 1.8.3.1
[PATCH v13 09/12] KVM: x86: Introduce a function to initialize the PT configuration
Initialize the Intel PT configuration when CPUID is updated. This includes the CPUID information, the RTIT_CTL bit mask and the number of address ranges. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 73 ++ 1 file changed, 73 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d8480a6..2697618 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11921,6 +11921,75 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) } } +static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *best = NULL; + int i; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + best = kvm_find_cpuid_entry(vcpu, 0x14, i); + if (!best) + return; + vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; + vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; + vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; + vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; + } + + /* Get the number of configurable Address Ranges for filtering */ + vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_num_address_ranges); + + /* Initialize and clear the no dependency bits */ + vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise +* will inject an #GP +*/ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and +* PSBFreq can be set +*/ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | + RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and +* MTCFreq can be set +*/ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | + 
RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); + + /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | + RTIT_CTL_PTW_EN); + + /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; + + /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; + + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; + + /* unmask address range configure area */ + for (i = 0; i < vmx->pt_desc.addr_range; i++) + vmx->pt_desc.ctl_bitmask &= ~(0xf << (32 + i * 4)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -11941,6 +12010,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) nested_vmx_cr_fixed1_bits_update(vcpu); nested_vmx_entry_exit_ctls_update(vcpu); } + + if (boot_cpu_has(X86_FEATURE_INTEL_PT) && + guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) + update_intel_pt_cfg(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -- 1.8.3.1
[PATCH v13 11/12] KVM: x86: Set intercept for Intel PT MSRs read/write
From: Chao Peng To save performance overhead, disable intercept Intel PT MSRs read/write when Intel PT is enabled in guest. MSR_IA32_RTIT_CTL is an exception that will always be intercepted. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 23 +++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a568d49..ed247dd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1333,6 +1333,7 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -4558,6 +4559,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_rtit_ctl_check(vcpu, data)) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); + pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); vmx->pt_desc.guest.ctl = data; break; case MSR_IA32_RTIT_STATUS: @@ -6414,6 +6416,27 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) vmx->msr_bitmap_mode = mode; } +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag) +{ + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + u32 i; + + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, + MSR_TYPE_RW, flag); + for (i = 0; i < vmx->pt_desc.addr_range; i++) { + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, 
flag); + } +} + static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) { return enable_apicv; -- 1.8.3.1
[PATCH v13 10/12] KVM: x86: Implement Intel PT MSRs read/write emulation
From: Chao Peng This patch implement Intel Processor Trace MSRs read/write emulation. Intel PT MSRs read/write need to be emulated when Intel PT MSRs is intercepted in guest and during live migration. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/intel_pt.h | 8 ++ arch/x86/kvm/vmx.c | 176 arch/x86/kvm/x86.c | 33 +++- 3 files changed, 216 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index eabbdbc..a1c2080 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -10,6 +10,14 @@ #define RTIT_ADDR_RANGE4 +#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ + RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ + RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ + RTIT_STATUS_BYTECNT)) + +#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ + (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2697618..a568d49 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3350,6 +3350,79 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); } +static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long value; + + /* +* Any MSR write that attempts to change bits marked reserved will +* case a #GP fault. +*/ + if (data & vmx->pt_desc.ctl_bitmask) + return 1; + + /* +* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will +* result in a #GP unless the same write also clears TraceEn. 
+*/ + if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && + ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) + return 1; + + /* +* WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit +* and FabricEn would cause #GP, if +* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 +*/ + if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && + !(data & RTIT_CTL_FABRIC_EN) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) + return 1; + + /* +* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that +* utilize encodings marked reserved will casue a #GP fault. +*/ + value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && + !test_bit((data & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET, )) + return 1; + value = intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_cycle_thresholds); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET, )) + return 1; + value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET, )) + return 1; + + /* +* If ADDRx_CFG is reserved or the encodings is >2 will +* cause a #GP fault. 
+*/ + value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) + return 1; + + return 0; +} + + static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; @@ -4186,6 +4259,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to
[PATCH v13 07/12] KVM: x86: Add Intel Processor Trace cpuid emulation
From: Chao Peng Expose Intel Processor Trace to guest only when the PT works in Host-Guest mode. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c| 22 -- arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c | 6 ++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 55e51ff..9ab7ac0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1105,6 +1105,7 @@ struct kvm_x86_ops { bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); + bool (*pt_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7bcfa61..05b8fb4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -337,6 +337,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; + unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? 
F(INTEL_PT) : 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -395,7 +396,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL); + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = @@ -426,7 +427,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xd); + entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd)); break; case 1: entry->edx &= kvm_cpuid_1_edx_x86_features; @@ -603,6 +604,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + /* Intel PT */ + case 0x14: { + int t, times = entry->eax; + + if (!f_intel_pt) + break; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (t = 1; t <= times; ++t) { + if (*nent >= maxnent) + goto out; + do_cpuid_1_ent([t], function, t); + entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f416f5c7..6e8a61b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5904,6 +5904,11 @@ static bool svm_umip_emulated(void) return false; } +static bool svm_pt_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -7139,6 +7144,7 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu, .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, + .pt_supported = svm_pt_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git 
a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c4c4b76..692154c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11013,6 +11013,11 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static bool vmx_pt_supported(void) +{ + return (pt_mode == PT_MODE_HOST_GUEST); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -15127,6 +15132,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, + .pt_supported = vmx_pt_supported, .check_nested_events = vmx_check_nested_events, .request_immediate_exit = vmx_request_immediate_exit, -- 1.8.3.1
[PATCH v13 09/12] KVM: x86: Introduce a function to initialize the PT configuration
Initialize the Intel PT configuration when CPUID is updated. This includes the CPUID information, the RTIT_CTL bit mask and the number of address ranges. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 73 ++ 1 file changed, 73 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d8480a6..2697618 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11921,6 +11921,75 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) } } +static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *best = NULL; + int i; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + best = kvm_find_cpuid_entry(vcpu, 0x14, i); + if (!best) + return; + vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; + vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; + vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; + vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; + } + + /* Get the number of configurable Address Ranges for filtering */ + vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_num_address_ranges); + + /* Initialize and clear the no dependency bits */ + vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise +* will inject an #GP +*/ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and +* PSBFreq can be set +*/ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | + RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and +* MTCFreq can be set +*/ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | + 
RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); + + /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | + RTIT_CTL_PTW_EN); + + /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; + + /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; + + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; + + /* unmask address range configure area */ + for (i = 0; i < vmx->pt_desc.addr_range; i++) + vmx->pt_desc.ctl_bitmask &= ~(0xf << (32 + i * 4)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -11941,6 +12010,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) nested_vmx_cr_fixed1_bits_update(vcpu); nested_vmx_entry_exit_ctls_update(vcpu); } + + if (boot_cpu_has(X86_FEATURE_INTEL_PT) && + guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) + update_intel_pt_cfg(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -- 1.8.3.1
[PATCH v13 11/12] KVM: x86: Set intercept for Intel PT MSRs read/write
From: Chao Peng To save performance overhead, disable intercept Intel PT MSRs read/write when Intel PT is enabled in guest. MSR_IA32_RTIT_CTL is an exception that will always be intercepted. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 23 +++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a568d49..ed247dd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1333,6 +1333,7 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -4558,6 +4559,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_rtit_ctl_check(vcpu, data)) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); + pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); vmx->pt_desc.guest.ctl = data; break; case MSR_IA32_RTIT_STATUS: @@ -6414,6 +6416,27 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) vmx->msr_bitmap_mode = mode; } +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag) +{ + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + u32 i; + + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, + MSR_TYPE_RW, flag); + for (i = 0; i < vmx->pt_desc.addr_range; i++) { + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, 
flag); + } +} + static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) { return enable_apicv; -- 1.8.3.1
[PATCH v13 10/12] KVM: x86: Implement Intel PT MSRs read/write emulation
From: Chao Peng This patch implement Intel Processor Trace MSRs read/write emulation. Intel PT MSRs read/write need to be emulated when Intel PT MSRs is intercepted in guest and during live migration. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/intel_pt.h | 8 ++ arch/x86/kvm/vmx.c | 176 arch/x86/kvm/x86.c | 33 +++- 3 files changed, 216 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index eabbdbc..a1c2080 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -10,6 +10,14 @@ #define RTIT_ADDR_RANGE4 +#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ + RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ + RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ + RTIT_STATUS_BYTECNT)) + +#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ + (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2697618..a568d49 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3350,6 +3350,79 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); } +static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long value; + + /* +* Any MSR write that attempts to change bits marked reserved will +* case a #GP fault. +*/ + if (data & vmx->pt_desc.ctl_bitmask) + return 1; + + /* +* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will +* result in a #GP unless the same write also clears TraceEn. 
+*/ + if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && + ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) + return 1; + + /* +* WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit +* and FabricEn would cause #GP, if +* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 +*/ + if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && + !(data & RTIT_CTL_FABRIC_EN) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) + return 1; + + /* +* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that +* utilize encodings marked reserved will casue a #GP fault. +*/ + value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && + !test_bit((data & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET, )) + return 1; + value = intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_cycle_thresholds); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET, )) + return 1; + value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET, )) + return 1; + + /* +* If ADDRx_CFG is reserved or the encodings is >2 will +* cause a #GP fault. 
+*/ + value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) + return 1; + + return 0; +} + + static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; @@ -4186,6 +4259,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to
[PATCH v13 07/12] KVM: x86: Add Intel Processor Trace cpuid emulation
From: Chao Peng Expose Intel Processor Trace to guest only when the PT works in Host-Guest mode. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c| 22 -- arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c | 6 ++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 55e51ff..9ab7ac0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1105,6 +1105,7 @@ struct kvm_x86_ops { bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); + bool (*pt_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7bcfa61..05b8fb4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -337,6 +337,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; + unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? 
F(INTEL_PT) : 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -395,7 +396,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL); + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = @@ -426,7 +427,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xd); + entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd)); break; case 1: entry->edx &= kvm_cpuid_1_edx_x86_features; @@ -603,6 +604,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + /* Intel PT */ + case 0x14: { + int t, times = entry->eax; + + if (!f_intel_pt) + break; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (t = 1; t <= times; ++t) { + if (*nent >= maxnent) + goto out; + do_cpuid_1_ent([t], function, t); + entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f416f5c7..6e8a61b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5904,6 +5904,11 @@ static bool svm_umip_emulated(void) return false; } +static bool svm_pt_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -7139,6 +7144,7 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu, .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, + .pt_supported = svm_pt_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git 
a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c4c4b76..692154c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11013,6 +11013,11 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static bool vmx_pt_supported(void) +{ + return (pt_mode == PT_MODE_HOST_GUEST); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -15127,6 +15132,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, + .pt_supported = vmx_pt_supported, .check_nested_events = vmx_check_nested_events, .request_immediate_exit = vmx_request_immediate_exit, -- 1.8.3.1
[PATCH v13 06/12] KVM: x86: Add Intel PT virtualization work mode
From: Chao Peng Intel Processor Trace virtualization can work in one of 2 possible modes: a. System-Wide mode (default): When the host configures Intel PT to collect trace packets of the entire system, it can leave the relevant VMX controls clear to allow VMX-specific packets to provide information across VMX transitions. The KVM guest will not be aware of this feature in this mode and both host and KVM guest trace will be output to the host buffer. b. Host-Guest mode: Host can configure trace-packet generation while in VMX non-root operation for guests and root operation for native executing normally. Intel PT will be exposed to the KVM guest in this mode, and the trace is output to the respective buffers of host and guest. In this mode, the status of PT will be saved and disabled before VM-entry and restored after VM-exit if tracing a virtual machine. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/intel_pt.h | 3 ++ arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/vmx.h | 8 + arch/x86/kvm/vmx.c | 68 +--- 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 634f99b..4727584 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -5,6 +5,9 @@ #define PT_CPUID_LEAVES2 #define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ +#define PT_MODE_SYSTEM 0 +#define PT_MODE_HOST_GUEST 1 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 107818e3..f51579d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -805,6 +805,7 @@ #define VMX_BASIC_INOUT0x0040LLU /* MSR_IA32_VMX_MISC bits */ +#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ diff --git 
a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index ade0f15..b99710c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -77,7 +77,9 @@ #define SECONDARY_EXEC_ENCLS_EXITING 0x8000 #define SECONDARY_EXEC_RDSEED_EXITING 0x0001 #define SECONDARY_EXEC_ENABLE_PML 0x0002 +#define SECONDARY_EXEC_PT_CONCEAL_VMX 0x0008 #define SECONDARY_EXEC_XSAVES 0x0010 +#define SECONDARY_EXEC_PT_USE_GPA 0x0100 #define SECONDARY_EXEC_TSC_SCALING 0x0200 #define PIN_BASED_EXT_INTR_MASK 0x0001 @@ -98,6 +100,8 @@ #define VM_EXIT_LOAD_IA32_EFER 0x0020 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x0040 #define VM_EXIT_CLEAR_BNDCFGS 0x0080 +#define VM_EXIT_PT_CONCEAL_PIP 0x0100 +#define VM_EXIT_CLEAR_IA32_RTIT_CTL0x0200 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -109,6 +113,8 @@ #define VM_ENTRY_LOAD_IA32_PAT 0x4000 #define VM_ENTRY_LOAD_IA32_EFER 0x8000 #define VM_ENTRY_LOAD_BNDCFGS 0x0001 +#define VM_ENTRY_PT_CONCEAL_PIP0x0002 +#define VM_ENTRY_LOAD_IA32_RTIT_CTL0x0004 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x11ff @@ -240,6 +246,8 @@ enum vmcs_field { GUEST_PDPTR3_HIGH = 0x2811, GUEST_BNDCFGS = 0x2812, GUEST_BNDCFGS_HIGH = 0x2813, + GUEST_IA32_RTIT_CTL = 0x2814, + GUEST_IA32_RTIT_CTL_HIGH= 0x2815, HOST_IA32_PAT = 0x2c00, HOST_IA32_PAT_HIGH = 0x2c01, HOST_IA32_EFER = 0x2c02, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 641a65b..c4c4b76 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -55,6 +55,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -190,6 +191,10 @@ static unsigned int ple_window_max= KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); +/* Default is SYSTEM mode. 
*/ +static int __read_mostly pt_mode = PT_MODE_SYSTEM; +module_param(pt_mode, int, S_IRUGO); + extern const ulong vmx_return; extern const ulong vmx_early_consistency_check_return; @@ -1955,6 +1960,20 @@ static bool vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } +static inline bool cpu_has_vmx_intel_pt(void) +{ + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + return !!(vmx_msr & MSR_IA32_VMX_MISC
[PATCH v13 12/12] KVM: x86: Disable Intel PT when VMXON in L1 guest
Currently, Intel Processor Trace does not support tracing in L1 guest VMX operation (IA32_VMX_MISC[bit 14] is 0). As mentioned in the SDM, on these types of processors, execution of the VMXON instruction clears IA32_RTIT_CTL.TraceEn and any attempt to write IA32_RTIT_CTL causes a general-protection exception (#GP). Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ed247dd..5001049 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4556,7 +4556,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || - vmx_rtit_ctl_check(vcpu, data)) + vmx_rtit_ctl_check(vcpu, data) || + vmx->nested.vmxon) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); @@ -8760,6 +8761,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (ret) return ret; + if (pt_mode == PT_MODE_HOST_GUEST) { + vmx->pt_desc.guest.ctl = 0; + pt_set_intercept_for_msr(vmx, 1); + } + return nested_vmx_succeed(vcpu); } -- 1.8.3.1
[PATCH v13 08/12] KVM: x86: Add Intel PT context switch for each vcpu
From: Chao Peng Load/store Intel Processor Trace registers on context switch. MSR IA32_RTIT_CTL is loaded/stored automatically from the VMCS. In Host-Guest mode, we need to load/restore the PT MSRs only when PT is enabled in the guest. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/intel_pt.h | 2 + arch/x86/kvm/vmx.c | 94 + 2 files changed, 96 insertions(+) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 4727584..eabbdbc 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -8,6 +8,8 @@ #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 +#define RTIT_ADDR_RANGE4 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 692154c..d8480a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -978,6 +978,24 @@ struct vmx_msrs { struct vmx_msr_entryval[NR_AUTOLOAD_MSRS]; }; +struct pt_ctx { + u64 ctl; + u64 status; + u64 output_base; + u64 output_mask; + u64 cr3_match; + u64 addr_a[RTIT_ADDR_RANGE]; + u64 addr_b[RTIT_ADDR_RANGE]; +}; + +struct pt_desc { + u64 ctl_bitmask; + u32 addr_range; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; + struct pt_ctx host; + struct pt_ctx guest; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -1071,6 +1089,8 @@ struct vcpu_vmx { u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; u64 ept_pointer; + + struct pt_desc pt_desc; }; enum segment_cache_field { @@ -2899,6 +2919,69 @@ static unsigned long segment_base(u16 selector) } #endif +static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + 
wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static void pt_guest_enter(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + /* Save host state before VM entry */ + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + + /* +* Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled +* on VM entry when it has been disabled in guest before). +*/ + vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl); + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + wrmsrl(MSR_IA32_RTIT_CTL, 0); + pt_save_msr(>pt_desc.host, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + } +} + +static void pt_guest_exit(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + pt_save_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.host, vmx->pt_desc.addr_range); + } + + /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); +} + static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6749,6 +6832,13 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + + if (pt_mode == PT_MODE_HOST_GUEST) { + memset(>pt_desc, 0, sizeof(vmx->pt_desc)); + /* Bit[6~0] are forced to 1, writes are ignored. 
*/ + vmx->pt_desc.guest.output_mask = 0x7F; + vmcs_write64(GUEST_IA32_RTIT_CTL, 0); + } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -11260,6 +11350,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.pkru != vmx->host_pkru)
[PATCH v13 12/12] KVM: x86: Disable Intel PT when VMXON in L1 guest
Currently, Intel Processor Trace does not support tracing in L1 guest VMX operation (IA32_VMX_MISC[bit 14] is 0). As mentioned in the SDM, on these types of processors, execution of the VMXON instruction clears IA32_RTIT_CTL.TraceEn and any attempt to write IA32_RTIT_CTL causes a general-protection exception (#GP). Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ed247dd..5001049 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4556,7 +4556,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || - vmx_rtit_ctl_check(vcpu, data)) + vmx_rtit_ctl_check(vcpu, data) || + vmx->nested.vmxon) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); @@ -8760,6 +8761,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (ret) return ret; + if (pt_mode == PT_MODE_HOST_GUEST) { + vmx->pt_desc.guest.ctl = 0; + pt_set_intercept_for_msr(vmx, 1); + } + return nested_vmx_succeed(vcpu); } -- 1.8.3.1
[PATCH v13 08/12] KVM: x86: Add Intel PT context switch for each vcpu
From: Chao Peng Load/store Intel Processor Trace registers on context switch. MSR IA32_RTIT_CTL is loaded/stored automatically from the VMCS. In Host-Guest mode, we need to load/restore the PT MSRs only when PT is enabled in the guest. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/intel_pt.h | 2 + arch/x86/kvm/vmx.c | 94 + 2 files changed, 96 insertions(+) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 4727584..eabbdbc 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -8,6 +8,8 @@ #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 +#define RTIT_ADDR_RANGE4 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 692154c..d8480a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -978,6 +978,24 @@ struct vmx_msrs { struct vmx_msr_entryval[NR_AUTOLOAD_MSRS]; }; +struct pt_ctx { + u64 ctl; + u64 status; + u64 output_base; + u64 output_mask; + u64 cr3_match; + u64 addr_a[RTIT_ADDR_RANGE]; + u64 addr_b[RTIT_ADDR_RANGE]; +}; + +struct pt_desc { + u64 ctl_bitmask; + u32 addr_range; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; + struct pt_ctx host; + struct pt_ctx guest; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -1071,6 +1089,8 @@ struct vcpu_vmx { u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; u64 ept_pointer; + + struct pt_desc pt_desc; }; enum segment_cache_field { @@ -2899,6 +2919,69 @@ static unsigned long segment_base(u16 selector) } #endif +static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + 
wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static void pt_guest_enter(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + /* Save host state before VM entry */ + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + + /* +* Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled +* on VM entry when it has been disabled in guest before). +*/ + vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl); + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + wrmsrl(MSR_IA32_RTIT_CTL, 0); + pt_save_msr(>pt_desc.host, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + } +} + +static void pt_guest_exit(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + pt_save_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.host, vmx->pt_desc.addr_range); + } + + /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); +} + static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6749,6 +6832,13 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + + if (pt_mode == PT_MODE_HOST_GUEST) { + memset(>pt_desc, 0, sizeof(vmx->pt_desc)); + /* Bit[6~0] are forced to 1, writes are ignored. 
*/ + vmx->pt_desc.guest.output_mask = 0x7F; + vmcs_write64(GUEST_IA32_RTIT_CTL, 0); + } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -11260,6 +11350,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.pkru != vmx->host_pkru)
[PATCH v13 06/12] KVM: x86: Add Intel PT virtualization work mode
From: Chao Peng Intel Processor Trace virtualization can work in one of 2 possible modes: a. System-Wide mode (default): When the host configures Intel PT to collect trace packets of the entire system, it can leave the relevant VMX controls clear to allow VMX-specific packets to provide information across VMX transitions. The KVM guest will not be aware of this feature in this mode and both host and KVM guest trace will be output to the host buffer. b. Host-Guest mode: Host can configure trace-packet generation while in VMX non-root operation for guests and root operation for native executing normally. Intel PT will be exposed to the KVM guest in this mode, and the trace is output to the respective buffers of host and guest. In this mode, the status of PT will be saved and disabled before VM-entry and restored after VM-exit if tracing a virtual machine. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/intel_pt.h | 3 ++ arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/vmx.h | 8 + arch/x86/kvm/vmx.c | 68 +--- 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 634f99b..4727584 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -5,6 +5,9 @@ #define PT_CPUID_LEAVES2 #define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ +#define PT_MODE_SYSTEM 0 +#define PT_MODE_HOST_GUEST 1 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 107818e3..f51579d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -805,6 +805,7 @@ #define VMX_BASIC_INOUT0x0040LLU /* MSR_IA32_VMX_MISC bits */ +#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ diff --git 
a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index ade0f15..b99710c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -77,7 +77,9 @@ #define SECONDARY_EXEC_ENCLS_EXITING 0x8000 #define SECONDARY_EXEC_RDSEED_EXITING 0x0001 #define SECONDARY_EXEC_ENABLE_PML 0x0002 +#define SECONDARY_EXEC_PT_CONCEAL_VMX 0x0008 #define SECONDARY_EXEC_XSAVES 0x0010 +#define SECONDARY_EXEC_PT_USE_GPA 0x0100 #define SECONDARY_EXEC_TSC_SCALING 0x0200 #define PIN_BASED_EXT_INTR_MASK 0x0001 @@ -98,6 +100,8 @@ #define VM_EXIT_LOAD_IA32_EFER 0x0020 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x0040 #define VM_EXIT_CLEAR_BNDCFGS 0x0080 +#define VM_EXIT_PT_CONCEAL_PIP 0x0100 +#define VM_EXIT_CLEAR_IA32_RTIT_CTL0x0200 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -109,6 +113,8 @@ #define VM_ENTRY_LOAD_IA32_PAT 0x4000 #define VM_ENTRY_LOAD_IA32_EFER 0x8000 #define VM_ENTRY_LOAD_BNDCFGS 0x0001 +#define VM_ENTRY_PT_CONCEAL_PIP0x0002 +#define VM_ENTRY_LOAD_IA32_RTIT_CTL0x0004 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x11ff @@ -240,6 +246,8 @@ enum vmcs_field { GUEST_PDPTR3_HIGH = 0x2811, GUEST_BNDCFGS = 0x2812, GUEST_BNDCFGS_HIGH = 0x2813, + GUEST_IA32_RTIT_CTL = 0x2814, + GUEST_IA32_RTIT_CTL_HIGH= 0x2815, HOST_IA32_PAT = 0x2c00, HOST_IA32_PAT_HIGH = 0x2c01, HOST_IA32_EFER = 0x2c02, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 641a65b..c4c4b76 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -55,6 +55,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -190,6 +191,10 @@ static unsigned int ple_window_max= KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); +/* Default is SYSTEM mode. 
*/ +static int __read_mostly pt_mode = PT_MODE_SYSTEM; +module_param(pt_mode, int, S_IRUGO); + extern const ulong vmx_return; extern const ulong vmx_early_consistency_check_return; @@ -1955,6 +1960,20 @@ static bool vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } +static inline bool cpu_has_vmx_intel_pt(void) +{ + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + return !!(vmx_msr & MSR_IA32_VMX_MISC
[PATCH v13 04/12] perf/x86/intel/pt: Add new bit definitions for PT MSRs
Add bit definitions for Intel PT MSRs to support trace output directed to the memory subsystem and to hold a count of packet bytes that have been sent out. These are required by the upcoming PT support in KVM guests for MSR read/write emulation. Signed-off-by: Luwei Kang --- arch/x86/include/asm/msr-index.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d3a9eb9..107818e3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -126,6 +126,7 @@ #define RTIT_CTL_USR BIT(3) #define RTIT_CTL_PWR_EVT_ENBIT(4) #define RTIT_CTL_FUP_ON_PTWBIT(5) +#define RTIT_CTL_FABRIC_EN BIT(6) #define RTIT_CTL_CR3EN BIT(7) #define RTIT_CTL_TOPA BIT(8) #define RTIT_CTL_MTC_ENBIT(9) @@ -154,6 +155,8 @@ #define RTIT_STATUS_BUFFOVFBIT(3) #define RTIT_STATUS_ERROR BIT(4) #define RTIT_STATUS_STOPPEDBIT(5) +#define RTIT_STATUS_BYTECNT_OFFSET 32 +#define RTIT_STATUS_BYTECNT(0x1ull << RTIT_STATUS_BYTECNT_OFFSET) #define MSR_IA32_RTIT_ADDR0_A 0x0580 #define MSR_IA32_RTIT_ADDR0_B 0x0581 #define MSR_IA32_RTIT_ADDR1_A 0x0582 -- 1.8.3.1
[PATCH v13 03/12] perf/x86/intel/pt: Introduce intel_pt_validate_cap()
intel_pt_validate_hw_cap() validates whether a given PT capability is supported by the hardware. It checks the PT capability array which reflects the capabilities of the hardware on which the code is executed. For setting up PT for KVM guests this is not correct as the capability array for the guest can be different from the host array. Provide a new function to check against a given capability array. Acked-by: Song Liu Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 12 +--- arch/x86/include/asm/intel_pt.h | 2 ++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 309bb1d..53e481a 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -75,14 +75,20 @@ PT_CAP(psb_periods, 1, CPUID_EBX, 0x), }; -u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) +u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { - struct pt_cap_desc *cd = _caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; + struct pt_cap_desc *cd = _caps[capability]; + u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; unsigned int shift = __ffs(cd->mask); return (c & cd->mask) >> shift; } +EXPORT_SYMBOL_GPL(intel_pt_validate_cap); + +u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) +{ + return intel_pt_validate_cap(pt_pmu.caps, cap); +} EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap); static ssize_t pt_cap_show(struct device *cdev, diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index fa4b4fd..00f4afb 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -26,9 +26,11 @@ enum pt_capabilities { #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) void cpu_emergency_stop_pt(void); extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap); +extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap); #else static inline void cpu_emergency_stop_pt(void) {} static inline u32 
intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; } +static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; } #endif #endif /* _ASM_X86_INTEL_PT_H */ -- 1.8.3.1
[PATCH v13 01/12] perf/x86/intel/pt: Move Intel PT MSRs bit defines to global header
From: Chao Peng The Intel Processor Trace (PT) MSR bit defines are in a private header. The upcoming support for PT virtualization requires these defines to be accessible from KVM code. Move them to the global MSR header file. Reviewed-by: Thomas Gleixner Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.h | 37 - arch/x86/include/asm/msr-index.h | 33 + 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 0eb41d0..0050ca1 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -20,43 +20,6 @@ #define __INTEL_PT_H__ /* - * PT MSR bit definitions - */ -#define RTIT_CTL_TRACEEN BIT(0) -#define RTIT_CTL_CYCLEACC BIT(1) -#define RTIT_CTL_OSBIT(2) -#define RTIT_CTL_USR BIT(3) -#define RTIT_CTL_PWR_EVT_ENBIT(4) -#define RTIT_CTL_FUP_ON_PTWBIT(5) -#define RTIT_CTL_CR3EN BIT(7) -#define RTIT_CTL_TOPA BIT(8) -#define RTIT_CTL_MTC_ENBIT(9) -#define RTIT_CTL_TSC_ENBIT(10) -#define RTIT_CTL_DISRETC BIT(11) -#define RTIT_CTL_PTW_ENBIT(12) -#define RTIT_CTL_BRANCH_EN BIT(13) -#define RTIT_CTL_MTC_RANGE_OFFSET 14 -#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) -#define RTIT_CTL_CYC_THRESH_OFFSET 19 -#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) -#define RTIT_CTL_PSB_FREQ_OFFSET 24 -#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) -#define RTIT_CTL_ADDR0_OFFSET 32 -#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) -#define RTIT_CTL_ADDR1_OFFSET 36 -#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) -#define RTIT_CTL_ADDR2_OFFSET 40 -#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) -#define RTIT_CTL_ADDR3_OFFSET 44 -#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) -#define RTIT_STATUS_FILTEREN BIT(0) -#define RTIT_STATUS_CONTEXTEN BIT(1) -#define RTIT_STATUS_TRIGGEREN BIT(2) -#define RTIT_STATUS_BUFFOVFBIT(3) -#define RTIT_STATUS_ERROR BIT(4) -#define RTIT_STATUS_STOPPEDBIT(5) 
- -/* * Single-entry ToPA: when this close to region boundary, switch * buffers to avoid losing data. */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 4731f0c..d3a9eb9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -120,7 +120,40 @@ #define MSR_PEBS_LD_LAT_THRESHOLD 0x03f6 #define MSR_IA32_RTIT_CTL 0x0570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OSBIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_PWR_EVT_ENBIT(4) +#define RTIT_CTL_FUP_ON_PTWBIT(5) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_ENBIT(9) +#define RTIT_CTL_TSC_ENBIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_PTW_ENBIT(12) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define RTIT_CTL_ADDR0_OFFSET 32 +#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) +#define RTIT_CTL_ADDR1_OFFSET 36 +#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) +#define RTIT_CTL_ADDR2_OFFSET 40 +#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) +#define RTIT_CTL_ADDR3_OFFSET 44 +#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) #define MSR_IA32_RTIT_STATUS 0x0571 +#define RTIT_STATUS_FILTEREN BIT(0) +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_BUFFOVFBIT(3) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPEDBIT(5) #define MSR_IA32_RTIT_ADDR0_A 0x0580 #define MSR_IA32_RTIT_ADDR0_B 0x0581 #define MSR_IA32_RTIT_ADDR1_A 0x0582 -- 1.8.3.1
[PATCH v13 05/12] perf/x86/intel/pt: add new capability for Intel PT
This adds support for "output to Trace Transport subsystem" capability of Intel PT. It means that PT can output its trace to an MMIO address range rather than system memory buffer. Acked-by: Song Liu Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 1 + arch/x86/include/asm/intel_pt.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 53e481a..9597ea6 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -68,6 +68,7 @@ PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), + PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), PT_CAP(payloads_lip,0, CPUID_ECX, BIT(31)), PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), PT_CAP(mtc_periods, 1, CPUID_EAX, 0x), diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 00f4afb..634f99b 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -16,6 +16,7 @@ enum pt_capabilities { PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, + PT_CAP_output_subsys, PT_CAP_payloads_lip, PT_CAP_num_address_ranges, PT_CAP_mtc_periods, -- 1.8.3.1
[PATCH v13 05/12] perf/x86/intel/pt: add new capability for Intel PT
This adds support for the "output to Trace Transport subsystem" capability of Intel PT. It means that PT can output its trace to an MMIO address range rather than to a system memory buffer. Acked-by: Song Liu Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 1 + arch/x86/include/asm/intel_pt.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 53e481a..9597ea6 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -68,6 +68,7 @@ PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), + PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), PT_CAP(payloads_lip,0, CPUID_ECX, BIT(31)), PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), PT_CAP(mtc_periods, 1, CPUID_EAX, 0x), diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 00f4afb..634f99b 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -16,6 +16,7 @@ enum pt_capabilities { PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, + PT_CAP_output_subsys, PT_CAP_payloads_lip, PT_CAP_num_address_ranges, PT_CAP_mtc_periods, -- 1.8.3.1
[PATCH v13 04/12] perf/x86/intel/pt: Add new bit definitions for PT MSRs
Add bit definitions for Intel PT MSRs to support trace output directed to the memory subsystem and to hold a count of the packet bytes that have been sent out. These are required by the upcoming PT support in KVM guests for MSRs read/write emulation. Signed-off-by: Luwei Kang --- arch/x86/include/asm/msr-index.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d3a9eb9..107818e3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -126,6 +126,7 @@ #define RTIT_CTL_USR BIT(3) #define RTIT_CTL_PWR_EVT_ENBIT(4) #define RTIT_CTL_FUP_ON_PTWBIT(5) +#define RTIT_CTL_FABRIC_EN BIT(6) #define RTIT_CTL_CR3EN BIT(7) #define RTIT_CTL_TOPA BIT(8) #define RTIT_CTL_MTC_ENBIT(9) @@ -154,6 +155,8 @@ #define RTIT_STATUS_BUFFOVFBIT(3) #define RTIT_STATUS_ERROR BIT(4) #define RTIT_STATUS_STOPPEDBIT(5) +#define RTIT_STATUS_BYTECNT_OFFSET 32 +#define RTIT_STATUS_BYTECNT(0x1ull << RTIT_STATUS_BYTECNT_OFFSET) #define MSR_IA32_RTIT_ADDR0_A 0x0580 #define MSR_IA32_RTIT_ADDR0_B 0x0581 #define MSR_IA32_RTIT_ADDR1_A 0x0582 -- 1.8.3.1
[PATCH v13 03/12] perf/x86/intel/pt: Introduce intel_pt_validate_cap()
intel_pt_validate_hw_cap() validates whether a given PT capability is supported by the hardware. It checks the PT capability array which reflects the capabilities of the hardware on which the code is executed. For setting up PT for KVM guests this is not correct as the capability array for the guest can be different from the host array. Provide a new function to check against a given capability array. Acked-by: Song Liu Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 12 +--- arch/x86/include/asm/intel_pt.h | 2 ++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 309bb1d..53e481a 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -75,14 +75,20 @@ PT_CAP(psb_periods, 1, CPUID_EBX, 0x), }; -u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) +u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { - struct pt_cap_desc *cd = _caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; + struct pt_cap_desc *cd = _caps[capability]; + u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; unsigned int shift = __ffs(cd->mask); return (c & cd->mask) >> shift; } +EXPORT_SYMBOL_GPL(intel_pt_validate_cap); + +u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) +{ + return intel_pt_validate_cap(pt_pmu.caps, cap); +} EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap); static ssize_t pt_cap_show(struct device *cdev, diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index fa4b4fd..00f4afb 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -26,9 +26,11 @@ enum pt_capabilities { #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) void cpu_emergency_stop_pt(void); extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap); +extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap); #else static inline void cpu_emergency_stop_pt(void) {} static inline u32 
intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; } +static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; } #endif #endif /* _ASM_X86_INTEL_PT_H */ -- 1.8.3.1
[PATCH v13 01/12] perf/x86/intel/pt: Move Intel PT MSRs bit defines to global header
From: Chao Peng The Intel Processor Trace (PT) MSR bit defines are in a private header. The upcoming support for PT virtualization requires these defines to be accessible from KVM code. Move them to the global MSR header file. Reviewed-by: Thomas Gleixner Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.h | 37 - arch/x86/include/asm/msr-index.h | 33 + 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 0eb41d0..0050ca1 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -20,43 +20,6 @@ #define __INTEL_PT_H__ /* - * PT MSR bit definitions - */ -#define RTIT_CTL_TRACEEN BIT(0) -#define RTIT_CTL_CYCLEACC BIT(1) -#define RTIT_CTL_OSBIT(2) -#define RTIT_CTL_USR BIT(3) -#define RTIT_CTL_PWR_EVT_ENBIT(4) -#define RTIT_CTL_FUP_ON_PTWBIT(5) -#define RTIT_CTL_CR3EN BIT(7) -#define RTIT_CTL_TOPA BIT(8) -#define RTIT_CTL_MTC_ENBIT(9) -#define RTIT_CTL_TSC_ENBIT(10) -#define RTIT_CTL_DISRETC BIT(11) -#define RTIT_CTL_PTW_ENBIT(12) -#define RTIT_CTL_BRANCH_EN BIT(13) -#define RTIT_CTL_MTC_RANGE_OFFSET 14 -#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) -#define RTIT_CTL_CYC_THRESH_OFFSET 19 -#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) -#define RTIT_CTL_PSB_FREQ_OFFSET 24 -#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) -#define RTIT_CTL_ADDR0_OFFSET 32 -#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) -#define RTIT_CTL_ADDR1_OFFSET 36 -#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) -#define RTIT_CTL_ADDR2_OFFSET 40 -#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) -#define RTIT_CTL_ADDR3_OFFSET 44 -#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) -#define RTIT_STATUS_FILTEREN BIT(0) -#define RTIT_STATUS_CONTEXTEN BIT(1) -#define RTIT_STATUS_TRIGGEREN BIT(2) -#define RTIT_STATUS_BUFFOVFBIT(3) -#define RTIT_STATUS_ERROR BIT(4) -#define RTIT_STATUS_STOPPEDBIT(5) 
- -/* * Single-entry ToPA: when this close to region boundary, switch * buffers to avoid losing data. */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 4731f0c..d3a9eb9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -120,7 +120,40 @@ #define MSR_PEBS_LD_LAT_THRESHOLD 0x03f6 #define MSR_IA32_RTIT_CTL 0x0570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OSBIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_PWR_EVT_ENBIT(4) +#define RTIT_CTL_FUP_ON_PTWBIT(5) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_ENBIT(9) +#define RTIT_CTL_TSC_ENBIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_PTW_ENBIT(12) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define RTIT_CTL_ADDR0_OFFSET 32 +#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) +#define RTIT_CTL_ADDR1_OFFSET 36 +#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) +#define RTIT_CTL_ADDR2_OFFSET 40 +#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) +#define RTIT_CTL_ADDR3_OFFSET 44 +#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) #define MSR_IA32_RTIT_STATUS 0x0571 +#define RTIT_STATUS_FILTEREN BIT(0) +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_BUFFOVFBIT(3) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPEDBIT(5) #define MSR_IA32_RTIT_ADDR0_A 0x0580 #define MSR_IA32_RTIT_ADDR0_B 0x0581 #define MSR_IA32_RTIT_ADDR1_A 0x0582 -- 1.8.3.1
[PATCH v13 02/12] perf/x86/intel/pt: Export pt_cap_get()
From: Chao Peng pt_cap_get() is required by the upcoming PT support in KVM guests. Export it and move the capabilities enum to a global header. Since the "pt_*" prefix is already used by global functions for ptrace and other things, it makes sense to use "intel_pt_*" as the prefix. Acked-by: Song Liu Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 49 ++--- arch/x86/events/intel/pt.h | 21 -- arch/x86/include/asm/intel_pt.h | 23 +++ 3 files changed, 49 insertions(+), 44 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 8d016ce..309bb1d 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -75,7 +75,7 @@ PT_CAP(psb_periods, 1, CPUID_EBX, 0x), }; -static u32 pt_cap_get(enum pt_capabilities cap) +u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { struct pt_cap_desc *cd = _caps[cap]; u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; @@ -83,6 +83,7 @@ static u32 pt_cap_get(enum pt_capabilities cap) return (c & cd->mask) >> shift; } +EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap); static ssize_t pt_cap_show(struct device *cdev, struct device_attribute *attr, @@ -92,7 +93,7 @@ static ssize_t pt_cap_show(struct device *cdev, container_of(attr, struct dev_ext_attribute, attr); enum pt_capabilities cap = (long)ea->var; - return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); + return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap)); } static struct attribute_group pt_cap_group = { @@ -310,16 +311,16 @@ static bool pt_event_valid(struct perf_event *event) return false; if (config & RTIT_CTL_CYC_PSB) { - if (!pt_cap_get(PT_CAP_psb_cyc)) + if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc)) return false; - allowed = pt_cap_get(PT_CAP_psb_periods); + allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods); requested = (config & RTIT_CTL_PSB_FREQ) >> RTIT_CTL_PSB_FREQ_OFFSET; if (requested && (!(allowed & BIT(requested return false; - allowed = 
pt_cap_get(PT_CAP_cycle_thresholds); + allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds); requested = (config & RTIT_CTL_CYC_THRESH) >> RTIT_CTL_CYC_THRESH_OFFSET; if (requested && (!(allowed & BIT(requested @@ -334,10 +335,10 @@ static bool pt_event_valid(struct perf_event *event) * Spec says that setting mtc period bits while mtc bit in * CPUID is 0 will #GP, so better safe than sorry. */ - if (!pt_cap_get(PT_CAP_mtc)) + if (!intel_pt_validate_hw_cap(PT_CAP_mtc)) return false; - allowed = pt_cap_get(PT_CAP_mtc_periods); + allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods); if (!allowed) return false; @@ -349,11 +350,11 @@ static bool pt_event_valid(struct perf_event *event) } if (config & RTIT_CTL_PWR_EVT_EN && - !pt_cap_get(PT_CAP_power_event_trace)) + !intel_pt_validate_hw_cap(PT_CAP_power_event_trace)) return false; if (config & RTIT_CTL_PTW) { - if (!pt_cap_get(PT_CAP_ptwrite)) + if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite)) return false; /* FUPonPTW without PTW doesn't make sense */ @@ -598,7 +599,7 @@ static struct topa *topa_alloc(int cpu, gfp_t gfp) * In case of singe-entry ToPA, always put the self-referencing END * link as the 2nd entry in the table */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { + if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; TOPA_ENTRY(topa, 1)->end = 1; } @@ -638,7 +639,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) topa->offset = last->offset + last->size; buf->last = topa; - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) return; BUG_ON(last->last != TENTS_PER_PAGE - 1); @@ -654,7 +655,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) static bool topa_table_full(struct topa *topa) { /* single-entry ToPA is a special case */ -
[PATCH v13 02/12] perf/x86/intel/pt: Export pt_cap_get()
From: Chao Peng pt_cap_get() is required by the upcoming PT support in KVM guests. Export it and move the capabilities enum to a global header. Since the "pt_*" prefix is already used by global functions for ptrace and other things, it makes sense to use "intel_pt_*" as the prefix. Acked-by: Song Liu Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 49 ++--- arch/x86/events/intel/pt.h | 21 -- arch/x86/include/asm/intel_pt.h | 23 +++ 3 files changed, 49 insertions(+), 44 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 8d016ce..309bb1d 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -75,7 +75,7 @@ PT_CAP(psb_periods, 1, CPUID_EBX, 0x), }; -static u32 pt_cap_get(enum pt_capabilities cap) +u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { struct pt_cap_desc *cd = _caps[cap]; u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; @@ -83,6 +83,7 @@ static u32 pt_cap_get(enum pt_capabilities cap) return (c & cd->mask) >> shift; } +EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap); static ssize_t pt_cap_show(struct device *cdev, struct device_attribute *attr, @@ -92,7 +93,7 @@ static ssize_t pt_cap_show(struct device *cdev, container_of(attr, struct dev_ext_attribute, attr); enum pt_capabilities cap = (long)ea->var; - return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); + return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap)); } static struct attribute_group pt_cap_group = { @@ -310,16 +311,16 @@ static bool pt_event_valid(struct perf_event *event) return false; if (config & RTIT_CTL_CYC_PSB) { - if (!pt_cap_get(PT_CAP_psb_cyc)) + if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc)) return false; - allowed = pt_cap_get(PT_CAP_psb_periods); + allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods); requested = (config & RTIT_CTL_PSB_FREQ) >> RTIT_CTL_PSB_FREQ_OFFSET; if (requested && (!(allowed & BIT(requested return false; - allowed = 
pt_cap_get(PT_CAP_cycle_thresholds); + allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds); requested = (config & RTIT_CTL_CYC_THRESH) >> RTIT_CTL_CYC_THRESH_OFFSET; if (requested && (!(allowed & BIT(requested @@ -334,10 +335,10 @@ static bool pt_event_valid(struct perf_event *event) * Spec says that setting mtc period bits while mtc bit in * CPUID is 0 will #GP, so better safe than sorry. */ - if (!pt_cap_get(PT_CAP_mtc)) + if (!intel_pt_validate_hw_cap(PT_CAP_mtc)) return false; - allowed = pt_cap_get(PT_CAP_mtc_periods); + allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods); if (!allowed) return false; @@ -349,11 +350,11 @@ static bool pt_event_valid(struct perf_event *event) } if (config & RTIT_CTL_PWR_EVT_EN && - !pt_cap_get(PT_CAP_power_event_trace)) + !intel_pt_validate_hw_cap(PT_CAP_power_event_trace)) return false; if (config & RTIT_CTL_PTW) { - if (!pt_cap_get(PT_CAP_ptwrite)) + if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite)) return false; /* FUPonPTW without PTW doesn't make sense */ @@ -598,7 +599,7 @@ static struct topa *topa_alloc(int cpu, gfp_t gfp) * In case of singe-entry ToPA, always put the self-referencing END * link as the 2nd entry in the table */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { + if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; TOPA_ENTRY(topa, 1)->end = 1; } @@ -638,7 +639,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) topa->offset = last->offset + last->size; buf->last = topa; - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) return; BUG_ON(last->last != TENTS_PER_PAGE - 1); @@ -654,7 +655,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) static bool topa_table_full(struct topa *topa) { /* single-entry ToPA is a special case */ -
[PATCH v13 00/12] Intel Processor Trace virtualization enabling
>From V12 - Refine the title and description of patch 1~3. -- Thomas Gleixner - Rename the function of validate the capabilities of Intel PT. -- Thomas Gleixner - Add more description of Intel PT work mode. -- Alexander Shishkin >From V11: - In patch 3, arguments caps vs. cap is not good. Spell second one out. -- Thomas Gleixner >From V10: (This version don't have code change) - move the patch 5 in version 9 to patch 3 (reorder patch 5) -- Alexander Shishkin - refind the patch description of patch 5 (add new capability for Intel PT) -- Alexander Shishkin - CC all the maintainers, reviewers and submitters in each patch of this patch set -- Alexander Shishkin >From V9: - remove redundant initialize for "ctl_bitmask" in patch 9; - do some changes for patch's description. >From V8: - move macro definition MSR_IA32_RTIT_ADDR_RANGE from msr-index.h to intel_pt.h; - initialize the RTIT_CTL bitmask to ~0ULL. >From V7: - remove host only mode since it can be emulated by perf code; - merge patch 8 and 9 to make code and data in the same patch; - rename __pt_cap_get() to pt_cap_decode(); - other minor change. >From V6: - split pathes 1~2 to four separate patches (these patches do 2 things) and add more descriptions. >From V5: - rename the function from pt_cap_get_ex() to __pt_cap_get(); - replace the most of function from vmx_pt_supported() to "pt_mode == PT_MODE_HOST_GUEST"(or !=). >From V4: - add data check when setting the value of MSR_IA32_RTIT_CTL; - Invoke new interface to set the intercept of MSRs read/write after "MSR bitmap per-vcpu" patches. >From V3: - change default mode to SYSTEM mode; - add a new patch to move PT out of scattered features; - add a new fucntion kvm_get_pt_addr_cnt() to get the number of address ranges; - add a new function vmx_set_rtit_ctl() to set the value of guest RTIT_CTL, GUEST_IA32_RTIT_CTL and MSRs intercept. 
>From v2: - replace *_PT_SUPPRESS_PIP to *_PT_CONCEAL_PIP; - clean SECONDARY_EXEC_PT_USE_GPA, VM_EXIT_CLEAR_IA32_RTIT_CTL and VM_ENTRY_LOAD_IA32_RTIT_CTL in SYSTEM mode. These bits must be all set or all clean; - move processor tracing out of scattered features; - add a new function to enable/disable intercept MSRs read/write; - add all Intel PT MSRs read/write and disable intercept when PT is enabled in guest; - disable Intel PT and enable intercept MSRs when L1 guest VMXON; - performance optimization. In Host only mode. we just need to save host RTIT_CTL before vm-entry and restore host RTIT_CTL after vm-exit; In HOST_GUEST mode. we need to save and restore all MSRs only when PT has enabled in guest. - use XSAVES/XRESTORES implement context switch. Haven't implementation in this version and still in debuging. will make a separate patch work on this. >From v1: - remove guest-only mode because guest-only mode can be covered by host-guest mode; - always set "use GPA for processor tracing" in secondary execution control if it can be; - trap RTIT_CTL read/write. Forbid write this msr when VMXON in L1 hypervisor. 
Chao Peng (7): perf/x86/intel/pt: Move Intel PT MSRs bit defines to global header perf/x86/intel/pt: Export pt_cap_get() KVM: x86: Add Intel PT virtualization work mode KVM: x86: Add Intel Processor Trace cpuid emulation KVM: x86: Add Intel PT context switch for each vcpu KVM: x86: Implement Intel PT MSRs read/write emulation KVM: x86: Set intercept for Intel PT MSRs read/write Luwei Kang (5): perf/x86/intel/pt: Introduce intel_pt_validate_cap() perf/x86/intel/pt: Add new bit definitions for PT MSRs perf/x86/intel/pt: add new capability for Intel PT KVM: x86: Introduce a function to initialize the PT configuration KVM: x86: Disable Intel PT when VMXON in L1 guest arch/x86/events/intel/pt.c | 60 +++--- arch/x86/events/intel/pt.h | 58 - arch/x86/include/asm/intel_pt.h | 39 arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/msr-index.h | 37 arch/x86/include/asm/vmx.h | 8 + arch/x86/kvm/cpuid.c | 22 +- arch/x86/kvm/svm.c | 6 + arch/x86/kvm/vmx.c | 446 ++- arch/x86/kvm/x86.c | 33 ++- 10 files changed, 620 insertions(+), 90 deletions(-) -- 1.8.3.1
[PATCH v13 00/12] Intel Processor Trace virtualization enabling
>From V12 - Refine the title and description of patch 1~3. -- Thomas Gleixner - Rename the function of validate the capabilities of Intel PT. -- Thomas Gleixner - Add more description of Intel PT work mode. -- Alexander Shishkin >From V11: - In patch 3, arguments caps vs. cap is not good. Spell second one out. -- Thomas Gleixner >From V10: (This version don't have code change) - move the patch 5 in version 9 to patch 3 (reorder patch 5) -- Alexander Shishkin - refind the patch description of patch 5 (add new capability for Intel PT) -- Alexander Shishkin - CC all the maintainers, reviewers and submitters in each patch of this patch set -- Alexander Shishkin >From V9: - remove redundant initialize for "ctl_bitmask" in patch 9; - do some changes for patch's description. >From V8: - move macro definition MSR_IA32_RTIT_ADDR_RANGE from msr-index.h to intel_pt.h; - initialize the RTIT_CTL bitmask to ~0ULL. >From V7: - remove host only mode since it can be emulated by perf code; - merge patch 8 and 9 to make code and data in the same patch; - rename __pt_cap_get() to pt_cap_decode(); - other minor change. >From V6: - split pathes 1~2 to four separate patches (these patches do 2 things) and add more descriptions. >From V5: - rename the function from pt_cap_get_ex() to __pt_cap_get(); - replace the most of function from vmx_pt_supported() to "pt_mode == PT_MODE_HOST_GUEST"(or !=). >From V4: - add data check when setting the value of MSR_IA32_RTIT_CTL; - Invoke new interface to set the intercept of MSRs read/write after "MSR bitmap per-vcpu" patches. >From V3: - change default mode to SYSTEM mode; - add a new patch to move PT out of scattered features; - add a new fucntion kvm_get_pt_addr_cnt() to get the number of address ranges; - add a new function vmx_set_rtit_ctl() to set the value of guest RTIT_CTL, GUEST_IA32_RTIT_CTL and MSRs intercept. 
>From v2: - replace *_PT_SUPPRESS_PIP to *_PT_CONCEAL_PIP; - clean SECONDARY_EXEC_PT_USE_GPA, VM_EXIT_CLEAR_IA32_RTIT_CTL and VM_ENTRY_LOAD_IA32_RTIT_CTL in SYSTEM mode. These bits must be all set or all clean; - move processor tracing out of scattered features; - add a new function to enable/disable intercept MSRs read/write; - add all Intel PT MSRs read/write and disable intercept when PT is enabled in guest; - disable Intel PT and enable intercept MSRs when L1 guest VMXON; - performance optimization. In Host only mode. we just need to save host RTIT_CTL before vm-entry and restore host RTIT_CTL after vm-exit; In HOST_GUEST mode. we need to save and restore all MSRs only when PT has enabled in guest. - use XSAVES/XRESTORES implement context switch. Haven't implementation in this version and still in debuging. will make a separate patch work on this. >From v1: - remove guest-only mode because guest-only mode can be covered by host-guest mode; - always set "use GPA for processor tracing" in secondary execution control if it can be; - trap RTIT_CTL read/write. Forbid write this msr when VMXON in L1 hypervisor. 
Chao Peng (7): perf/x86/intel/pt: Move Intel PT MSRs bit defines to global header perf/x86/intel/pt: Export pt_cap_get() KVM: x86: Add Intel PT virtualization work mode KVM: x86: Add Intel Processor Trace cpuid emulation KVM: x86: Add Intel PT context switch for each vcpu KVM: x86: Implement Intel PT MSRs read/write emulation KVM: x86: Set intercept for Intel PT MSRs read/write Luwei Kang (5): perf/x86/intel/pt: Introduce intel_pt_validate_cap() perf/x86/intel/pt: Add new bit definitions for PT MSRs perf/x86/intel/pt: add new capability for Intel PT KVM: x86: Introduce a function to initialize the PT configuration KVM: x86: Disable Intel PT when VMXON in L1 guest arch/x86/events/intel/pt.c | 60 +++--- arch/x86/events/intel/pt.h | 58 - arch/x86/include/asm/intel_pt.h | 39 arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/msr-index.h | 37 arch/x86/include/asm/vmx.h | 8 + arch/x86/kvm/cpuid.c | 22 +- arch/x86/kvm/svm.c | 6 + arch/x86/kvm/vmx.c | 446 ++- arch/x86/kvm/x86.c | 33 ++- 10 files changed, 620 insertions(+), 90 deletions(-) -- 1.8.3.1
[PATCH v9 02/12] perf/x86/intel/pt: Change pt_cap_get() to a public function
From: Chao Peng <chao.p.p...@linux.intel.com> Change pt_cap_get() to a public function so that KVM can call it to check whether a specific feature is supported by the hardware. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/events/intel/pt.c | 3 ++- arch/x86/events/intel/pt.h | 21 - arch/x86/include/asm/intel_pt.h | 23 +++ 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 3b99394..c80e2f5 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -75,7 +75,7 @@ PT_CAP(psb_periods, 1, CPUID_EBX, 0x), }; -static u32 pt_cap_get(enum pt_capabilities cap) +u32 pt_cap_get(enum pt_capabilities cap) { struct pt_cap_desc *cd = _caps[cap]; u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; @@ -83,6 +83,7 @@ static u32 pt_cap_get(enum pt_capabilities cap) return (c & cd->mask) >> shift; } +EXPORT_SYMBOL_GPL(pt_cap_get); static ssize_t pt_cap_show(struct device *cdev, struct device_attribute *attr, diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 0050ca1..269e15a 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -45,30 +45,9 @@ struct topa_entry { u64 rsvd4 : 16; }; -#define PT_CPUID_LEAVES2 -#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ - /* TSC to Core Crystal Clock Ratio */ #define CPUID_TSC_LEAF 0x15 -enum pt_capabilities { - PT_CAP_max_subleaf = 0, - PT_CAP_cr3_filtering, - PT_CAP_psb_cyc, - PT_CAP_ip_filtering, - PT_CAP_mtc, - PT_CAP_ptwrite, - PT_CAP_power_event_trace, - PT_CAP_topa_output, - PT_CAP_topa_multiple_entries, - PT_CAP_single_range_output, - PT_CAP_payloads_lip, - PT_CAP_num_address_ranges, - PT_CAP_mtc_periods, - PT_CAP_cycle_thresholds, - PT_CAP_psb_periods, -}; - struct pt_pmu { struct pmu pmu; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; diff --git a/arch/x86/include/asm/intel_pt.h 
b/arch/x86/include/asm/intel_pt.h index b523f51..4270421 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -2,10 +2,33 @@ #ifndef _ASM_X86_INTEL_PT_H #define _ASM_X86_INTEL_PT_H +#define PT_CPUID_LEAVES2 +#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ + +enum pt_capabilities { + PT_CAP_max_subleaf = 0, + PT_CAP_cr3_filtering, + PT_CAP_psb_cyc, + PT_CAP_ip_filtering, + PT_CAP_mtc, + PT_CAP_ptwrite, + PT_CAP_power_event_trace, + PT_CAP_topa_output, + PT_CAP_topa_multiple_entries, + PT_CAP_single_range_output, + PT_CAP_payloads_lip, + PT_CAP_num_address_ranges, + PT_CAP_mtc_periods, + PT_CAP_cycle_thresholds, + PT_CAP_psb_periods, +}; + #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) void cpu_emergency_stop_pt(void); +extern u32 pt_cap_get(enum pt_capabilities cap); #else static inline void cpu_emergency_stop_pt(void) {} +static inline u32 pt_cap_get(enum pt_capabilities cap) { return 0; } #endif #endif /* _ASM_X86_INTEL_PT_H */ -- 1.8.3.1
[PATCH v9 02/12] perf/x86/intel/pt: Change pt_cap_get() to a public function
From: Chao Peng Change pt_cap_get() to a public function so that KVM can call it to check whether a specific feature is supported by the hardware. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 3 ++- arch/x86/events/intel/pt.h | 21 - arch/x86/include/asm/intel_pt.h | 23 +++ 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 3b99394..c80e2f5 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -75,7 +75,7 @@ PT_CAP(psb_periods, 1, CPUID_EBX, 0x), }; -static u32 pt_cap_get(enum pt_capabilities cap) +u32 pt_cap_get(enum pt_capabilities cap) { struct pt_cap_desc *cd = _caps[cap]; u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; @@ -83,6 +83,7 @@ static u32 pt_cap_get(enum pt_capabilities cap) return (c & cd->mask) >> shift; } +EXPORT_SYMBOL_GPL(pt_cap_get); static ssize_t pt_cap_show(struct device *cdev, struct device_attribute *attr, diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 0050ca1..269e15a 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -45,30 +45,9 @@ struct topa_entry { u64 rsvd4 : 16; }; -#define PT_CPUID_LEAVES2 -#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ - /* TSC to Core Crystal Clock Ratio */ #define CPUID_TSC_LEAF 0x15 -enum pt_capabilities { - PT_CAP_max_subleaf = 0, - PT_CAP_cr3_filtering, - PT_CAP_psb_cyc, - PT_CAP_ip_filtering, - PT_CAP_mtc, - PT_CAP_ptwrite, - PT_CAP_power_event_trace, - PT_CAP_topa_output, - PT_CAP_topa_multiple_entries, - PT_CAP_single_range_output, - PT_CAP_payloads_lip, - PT_CAP_num_address_ranges, - PT_CAP_mtc_periods, - PT_CAP_cycle_thresholds, - PT_CAP_psb_periods, -}; - struct pt_pmu { struct pmu pmu; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index b523f51..4270421 100644 --- a/arch/x86/include/asm/intel_pt.h +++ 
b/arch/x86/include/asm/intel_pt.h @@ -2,10 +2,33 @@ #ifndef _ASM_X86_INTEL_PT_H #define _ASM_X86_INTEL_PT_H +#define PT_CPUID_LEAVES2 +#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ + +enum pt_capabilities { + PT_CAP_max_subleaf = 0, + PT_CAP_cr3_filtering, + PT_CAP_psb_cyc, + PT_CAP_ip_filtering, + PT_CAP_mtc, + PT_CAP_ptwrite, + PT_CAP_power_event_trace, + PT_CAP_topa_output, + PT_CAP_topa_multiple_entries, + PT_CAP_single_range_output, + PT_CAP_payloads_lip, + PT_CAP_num_address_ranges, + PT_CAP_mtc_periods, + PT_CAP_cycle_thresholds, + PT_CAP_psb_periods, +}; + #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) void cpu_emergency_stop_pt(void); +extern u32 pt_cap_get(enum pt_capabilities cap); #else static inline void cpu_emergency_stop_pt(void) {} +static inline u32 pt_cap_get(enum pt_capabilities cap) { return 0; } #endif #endif /* _ASM_X86_INTEL_PT_H */ -- 1.8.3.1
[PATCH v9 04/12] perf/x86/intel/pt: add new capability for Intel PT
CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 1 indicates support for output to the Trace Transport subsystem. MSR IA32_RTIT_CTL.FabricEn[bit 6] is reserved if CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0. This capability is used to emulate IA32_RTIT_CTL MSR reads/writes in KVM: a guest write to IA32_RTIT_CTL traps to root mode, and a #GP is injected into the guest if it sets IA32_RTIT_CTL.FabricEn while CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0. Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/events/intel/pt.c | 1 + arch/x86/include/asm/intel_pt.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index c80e2f5..f65f97a 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -68,6 +68,7 @@ PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), + PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), PT_CAP(payloads_lip,0, CPUID_ECX, BIT(31)), PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), PT_CAP(mtc_periods, 1, CPUID_EAX, 0x), diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 4270421..2de4db0 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -16,6 +16,7 @@ enum pt_capabilities { PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, + PT_CAP_output_subsys, PT_CAP_payloads_lip, PT_CAP_num_address_ranges, PT_CAP_mtc_periods, -- 1.8.3.1
[PATCH v9 04/12] perf/x86/intel/pt: add new capability for Intel PT
CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 1 indicates support for output to the Trace Transport subsystem. MSR IA32_RTIT_CTL.FabricEn[bit 6] is reserved if CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0. This capability is used to emulate IA32_RTIT_CTL MSR reads/writes in KVM: a guest write to IA32_RTIT_CTL traps to root mode, and a #GP is injected into the guest if it sets IA32_RTIT_CTL.FabricEn while CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0. Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 1 + arch/x86/include/asm/intel_pt.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index c80e2f5..f65f97a 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -68,6 +68,7 @@ PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), + PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), PT_CAP(payloads_lip,0, CPUID_ECX, BIT(31)), PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), PT_CAP(mtc_periods, 1, CPUID_EAX, 0x), diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 4270421..2de4db0 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -16,6 +16,7 @@ enum pt_capabilities { PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, + PT_CAP_output_subsys, PT_CAP_payloads_lip, PT_CAP_num_address_ranges, PT_CAP_mtc_periods, -- 1.8.3.1
[PATCH v9 10/12] KVM: x86: Implement Intel Processor Trace MSRs read/write emulation
From: Chao Peng <chao.p.p...@linux.intel.com> This patch implement Intel Processor Trace MSRs read/write emulation. Intel PT MSRs read/write need to be emulated when Intel PT MSRs is intercepted in guest and during live migration. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/include/asm/intel_pt.h | 8 ++ arch/x86/kvm/vmx.c | 172 arch/x86/kvm/x86.c | 33 +++- 3 files changed, 212 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 70f4139..3a25dc1 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -10,6 +10,14 @@ #define RTIT_ADDR_RANGE4 +#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ + RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ + RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ + RTIT_STATUS_BYTECNT)) + +#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ + (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 952ddf4..770cb7c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2809,6 +2809,77 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); } +static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long value; + + /* +* Any MSR write that attempts to change bits marked reserved will +* case a #GP fault. +*/ + if (data & vmx->pt_desc.ctl_bitmask) + return 1; + + /* +* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will +* result in a #GP unless the same write also clears TraceEn. 
+*/ + if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && + ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) + return 1; + + /* +* WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit +* and FabricEn would cause #GP, if +* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 +*/ + if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && + !(data & RTIT_CTL_FABRIC_EN) && + !pt_cap_decode(vmx->pt_desc.caps, PT_CAP_single_range_output)) + return 1; + + /* +* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that +* utilize encodings marked reserved will casue a #GP fault. +*/ + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc_periods); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc) && + !test_bit((data & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET, )) + return 1; + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cycle_thresholds); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET, )) + return 1; + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_periods); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET, )) + return 1; + + /* +* If ADDRx_CFG is reserved or the encodings is >2 will +* cause a #GP fault. 
+*/ + value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) + return 1; + + return 0; +} + + static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; @@ -3625,6 +3696,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; + u
[PATCH v9 10/12] KVM: x86: Implement Intel Processor Trace MSRs read/write emulation
From: Chao Peng This patch implement Intel Processor Trace MSRs read/write emulation. Intel PT MSRs read/write need to be emulated when Intel PT MSRs is intercepted in guest and during live migration. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/intel_pt.h | 8 ++ arch/x86/kvm/vmx.c | 172 arch/x86/kvm/x86.c | 33 +++- 3 files changed, 212 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 70f4139..3a25dc1 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -10,6 +10,14 @@ #define RTIT_ADDR_RANGE4 +#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ + RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ + RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ + RTIT_STATUS_BYTECNT)) + +#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ + (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 952ddf4..770cb7c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2809,6 +2809,77 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); } +static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long value; + + /* +* Any MSR write that attempts to change bits marked reserved will +* case a #GP fault. +*/ + if (data & vmx->pt_desc.ctl_bitmask) + return 1; + + /* +* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will +* result in a #GP unless the same write also clears TraceEn. 
+*/ + if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && + ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) + return 1; + + /* +* WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit +* and FabricEn would cause #GP, if +* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 +*/ + if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && + !(data & RTIT_CTL_FABRIC_EN) && + !pt_cap_decode(vmx->pt_desc.caps, PT_CAP_single_range_output)) + return 1; + + /* +* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that +* utilize encodings marked reserved will casue a #GP fault. +*/ + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc_periods); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc) && + !test_bit((data & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET, )) + return 1; + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cycle_thresholds); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET, )) + return 1; + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_periods); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET, )) + return 1; + + /* +* If ADDRx_CFG is reserved or the encodings is >2 will +* cause a #GP fault. 
+*/ + value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) + return 1; + + return 0; +} + + static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; @@ -3625,6 +3696,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; + u32 index; switch (msr_info->index) { #ifdef CONFIG_X86_64 @@ -369
[PATCH v9 06/12] KVM: x86: Add Intel Processor Trace virtualization mode
From: Chao Peng <chao.p.p...@linux.intel.com> Intel PT virtualization can be work in one of 2 possible modes: a. system-wide: trace both host and guest and output to host buffer; b. host-guest: trace host/guest simultaneous and output to their respective buffer. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/include/asm/intel_pt.h | 3 ++ arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/vmx.h | 8 + arch/x86/kvm/vmx.c | 68 +--- 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 9c71453..5748205 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -5,6 +5,9 @@ #define PT_CPUID_LEAVES2 #define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ +#define PT_MODE_SYSTEM 0 +#define PT_MODE_HOST_GUEST 1 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6ae2462..6b14325 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -789,6 +789,7 @@ #define VMX_BASIC_INOUT0x0040LLU /* MSR_IA32_VMX_MISC bits */ +#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 5db8b0b..5936d72 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -76,7 +76,9 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x4000 #define SECONDARY_EXEC_RDSEED_EXITING 0x0001 #define SECONDARY_EXEC_ENABLE_PML 0x0002 +#define SECONDARY_EXEC_PT_CONCEAL_VMX 0x0008 #define SECONDARY_EXEC_XSAVES 0x0010 +#define SECONDARY_EXEC_PT_USE_GPA 0x0100 #define SECONDARY_EXEC_TSC_SCALING 0x0200 #define PIN_BASED_EXT_INTR_MASK 0x0001 @@ -97,6 +99,8 @@ #define 
VM_EXIT_LOAD_IA32_EFER 0x0020 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x0040 #define VM_EXIT_CLEAR_BNDCFGS 0x0080 +#define VM_EXIT_PT_CONCEAL_PIP 0x0100 +#define VM_EXIT_CLEAR_IA32_RTIT_CTL0x0200 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -108,6 +112,8 @@ #define VM_ENTRY_LOAD_IA32_PAT 0x4000 #define VM_ENTRY_LOAD_IA32_EFER 0x8000 #define VM_ENTRY_LOAD_BNDCFGS 0x0001 +#define VM_ENTRY_PT_CONCEAL_PIP0x0002 +#define VM_ENTRY_LOAD_IA32_RTIT_CTL0x0004 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x11ff @@ -234,6 +240,8 @@ enum vmcs_field { GUEST_PDPTR3_HIGH = 0x2811, GUEST_BNDCFGS = 0x2812, GUEST_BNDCFGS_HIGH = 0x2813, + GUEST_IA32_RTIT_CTL = 0x2814, + GUEST_IA32_RTIT_CTL_HIGH= 0x2815, HOST_IA32_PAT = 0x2c00, HOST_IA32_PAT_HIGH = 0x2c01, HOST_IA32_EFER = 0x2c02, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ea09813..bb96396 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -186,6 +187,10 @@ static unsigned int ple_window_max= KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); +/* Default is SYSTEM mode. */ +static int __read_mostly pt_mode = PT_MODE_SYSTEM; +module_param(pt_mode, int, S_IRUGO); + extern const ulong vmx_return; struct kvm_vmx { @@ -1512,6 +1517,20 @@ static bool vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } +static inline bool cpu_has_vmx_intel_pt(void) +{ + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + return !!(vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT); +} + +static inline bool cpu_has_vmx_pt_use_gpa(void) +{ + return !!(vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PT_USE_GPA); +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -4026,6 +4045,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_PT_USE_G
[PATCH v9 11/12] KVM: x86: Set intercept for Intel PT MSRs read/write
From: Chao Peng <chao.p.p...@linux.intel.com> Disable interception of the Intel PT MSRs only when Intel PT is enabled in the guest. MSR_IA32_RTIT_CTL, however, is always intercepted. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/kvm/vmx.c | 23 +++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 770cb7c..a09157c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -948,6 +948,7 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3999,6 +4000,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_rtit_ctl_check(vcpu, data)) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); + pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); vmx->pt_desc.guest.ctl = data; break; case MSR_IA32_RTIT_STATUS: @@ -5820,6 +5822,27 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) vmx->msr_bitmap_mode = mode; } +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag) +{ + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + u32 i; + + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, + MSR_TYPE_RW, flag); + for (i = 0; i < vmx->pt_desc.addr_range; i++) { + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_B + 
i * 2, MSR_TYPE_RW, flag); + } +} + static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) { return enable_apicv; -- 1.8.3.1
[PATCH v9 06/12] KVM: x86: Add Intel Processor Trace virtualization mode
From: Chao Peng Intel PT virtualization can be work in one of 2 possible modes: a. system-wide: trace both host and guest and output to host buffer; b. host-guest: trace host/guest simultaneous and output to their respective buffer. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/intel_pt.h | 3 ++ arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/vmx.h | 8 + arch/x86/kvm/vmx.c | 68 +--- 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 9c71453..5748205 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -5,6 +5,9 @@ #define PT_CPUID_LEAVES2 #define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ +#define PT_MODE_SYSTEM 0 +#define PT_MODE_HOST_GUEST 1 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6ae2462..6b14325 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -789,6 +789,7 @@ #define VMX_BASIC_INOUT0x0040LLU /* MSR_IA32_VMX_MISC bits */ +#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 5db8b0b..5936d72 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -76,7 +76,9 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x4000 #define SECONDARY_EXEC_RDSEED_EXITING 0x0001 #define SECONDARY_EXEC_ENABLE_PML 0x0002 +#define SECONDARY_EXEC_PT_CONCEAL_VMX 0x0008 #define SECONDARY_EXEC_XSAVES 0x0010 +#define SECONDARY_EXEC_PT_USE_GPA 0x0100 #define SECONDARY_EXEC_TSC_SCALING 0x0200 #define PIN_BASED_EXT_INTR_MASK 0x0001 @@ -97,6 +99,8 @@ #define VM_EXIT_LOAD_IA32_EFER 0x0020 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x0040 #define 
VM_EXIT_CLEAR_BNDCFGS 0x0080 +#define VM_EXIT_PT_CONCEAL_PIP 0x0100 +#define VM_EXIT_CLEAR_IA32_RTIT_CTL0x0200 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -108,6 +112,8 @@ #define VM_ENTRY_LOAD_IA32_PAT 0x4000 #define VM_ENTRY_LOAD_IA32_EFER 0x8000 #define VM_ENTRY_LOAD_BNDCFGS 0x0001 +#define VM_ENTRY_PT_CONCEAL_PIP0x0002 +#define VM_ENTRY_LOAD_IA32_RTIT_CTL0x0004 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x11ff @@ -234,6 +240,8 @@ enum vmcs_field { GUEST_PDPTR3_HIGH = 0x2811, GUEST_BNDCFGS = 0x2812, GUEST_BNDCFGS_HIGH = 0x2813, + GUEST_IA32_RTIT_CTL = 0x2814, + GUEST_IA32_RTIT_CTL_HIGH= 0x2815, HOST_IA32_PAT = 0x2c00, HOST_IA32_PAT_HIGH = 0x2c01, HOST_IA32_EFER = 0x2c02, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ea09813..bb96396 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -186,6 +187,10 @@ static unsigned int ple_window_max= KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); +/* Default is SYSTEM mode. */ +static int __read_mostly pt_mode = PT_MODE_SYSTEM; +module_param(pt_mode, int, S_IRUGO); + extern const ulong vmx_return; struct kvm_vmx { @@ -1512,6 +1517,20 @@ static bool vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } +static inline bool cpu_has_vmx_intel_pt(void) +{ + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + return !!(vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT); +} + +static inline bool cpu_has_vmx_pt_use_gpa(void) +{ + return !!(vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PT_USE_GPA); +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -4026,6 +4045,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_PT_USE_GPA | + SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC;
[PATCH v9 11/12] KVM: x86: Set intercept for Intel PT MSRs read/write
From: Chao Peng Disable interception of the Intel PT MSRs only when Intel PT is enabled in the guest. MSR_IA32_RTIT_CTL, however, is always intercepted. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 23 +++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 770cb7c..a09157c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -948,6 +948,7 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3999,6 +4000,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_rtit_ctl_check(vcpu, data)) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); + pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); vmx->pt_desc.guest.ctl = data; break; case MSR_IA32_RTIT_STATUS: @@ -5820,6 +5822,27 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) vmx->msr_bitmap_mode = mode; } +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag) +{ + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + u32 i; + + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, + MSR_TYPE_RW, flag); + for (i = 0; i < vmx->pt_desc.addr_range; i++) { + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); + } +} + static bool vmx_get_enable_apicv(struct 
kvm_vcpu *vcpu) { return enable_apicv; -- 1.8.3.1
[PATCH v9 12/12] KVM: x86: Disable Intel Processor Trace when VMXON in L1 guest
Currently, Intel Processor Trace does not support tracing in L1 guest VMX operation (IA32_VMX_MISC[bit 14] is 0). As mentioned in the SDM, on this type of processor, execution of the VMXON instruction clears IA32_RTIT_CTL.TraceEn, and any attempt to write IA32_RTIT_CTL causes a general-protection exception (#GP). Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/kvm/vmx.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a09157c..093c1f7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3997,7 +3997,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || - vmx_rtit_ctl_check(vcpu, data)) + vmx_rtit_ctl_check(vcpu, data) || + vmx->nested.vmxon) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); @@ -8090,6 +8091,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (ret) return ret; + if (pt_mode == PT_MODE_HOST_GUEST) { + vmx->pt_desc.guest.ctl = 0; + pt_set_intercept_for_msr(vmx, 1); + } + nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); } -- 1.8.3.1
[PATCH v9 12/12] KVM: x86: Disable Intel Processor Trace when VMXON in L1 guest
Currently, Intel Processor Trace does not support tracing in L1 guest VMX operation (IA32_VMX_MISC[bit 14] is 0). As mentioned in the SDM, on this type of processor, execution of the VMXON instruction clears IA32_RTIT_CTL.TraceEn, and any attempt to write IA32_RTIT_CTL causes a general-protection exception (#GP). Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a09157c..093c1f7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3997,7 +3997,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || - vmx_rtit_ctl_check(vcpu, data)) + vmx_rtit_ctl_check(vcpu, data) || + vmx->nested.vmxon) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); @@ -8090,6 +8091,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (ret) return ret; + if (pt_mode == PT_MODE_HOST_GUEST) { + vmx->pt_desc.guest.ctl = 0; + pt_set_intercept_for_msr(vmx, 1); + } + nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); } -- 1.8.3.1
[PATCH v9 08/12] KVM: x86: Add Intel Processor Trace context switch for each vcpu
From: Chao Peng <chao.p.p...@linux.intel.com> Load/Store Intel processor trace register in context switch. MSR IA32_RTIT_CTL is loaded/stored automatically from VMCS. In HOST_GUEST mode, we need load/resore PT MSRs only when PT is enabled in guest. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/include/asm/intel_pt.h | 2 + arch/x86/kvm/vmx.c | 94 + 2 files changed, 96 insertions(+) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 5748205..70f4139 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -8,6 +8,8 @@ #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 +#define RTIT_ADDR_RANGE4 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 24aded4..11fb90a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -597,6 +597,24 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) (unsigned long *)_desc->control); } +struct pt_ctx { + u64 ctl; + u64 status; + u64 output_base; + u64 output_mask; + u64 cr3_match; + u64 addr_a[RTIT_ADDR_RANGE]; + u64 addr_b[RTIT_ADDR_RANGE]; +}; + +struct pt_desc { + u64 ctl_bitmask; + u32 addr_range; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; + struct pt_ctx host; + struct pt_ctx guest; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -693,6 +711,8 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + + struct pt_desc pt_desc; }; enum segment_cache_field { @@ -2391,6 +2411,69 @@ static unsigned long segment_base(u16 selector) } #endif +static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < 
addr_range; i++) { + wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static void pt_guest_enter(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + /* Save host state before VM entry */ + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + + /* +* Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled +* on VM entry when it has been disabled in guest before). +*/ + vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl); + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + wrmsrl(MSR_IA32_RTIT_CTL, 0); + pt_save_msr(>pt_desc.host, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + } +} + +static void pt_guest_exit(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + pt_save_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.host, vmx->pt_desc.addr_range); + } + + /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); +} + static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6135,6 +6218,13 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } + + if (pt_mode == PT_MODE_HOST_GUEST) { + memset(>pt_desc, 0, sizeof(vmx->pt_desc)); + /* Bit[6~0] are forced to 1, writes are ignored. 
*/ + vmx->pt_desc.guest.output_mask = 0x7F; + vmcs_write64(GUEST_IA32_RTIT_CTL, 0); + } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@
[PATCH v9 09/12] KVM: x86: Introduce a function to initialize the PT configuration
Initialize the Intel PT configuration whenever the guest CPUID is updated. The configuration includes the CPUID information, the RTIT_CTL bit mask and the number of address ranges. Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/kvm/vmx.c | 70 ++ 1 file changed, 70 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 11fb90a..952ddf4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10411,6 +10411,72 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } +static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *best = NULL; + int i; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + best = kvm_find_cpuid_entry(vcpu, 0x14, i); + if (!best) + return; + vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; + vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; + vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; + vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; + } + + /* Get the number of configurable Address Ranges for filtering */ + vmx->pt_desc.addr_range = pt_cap_decode(vmx->pt_desc.caps, + PT_CAP_num_address_ranges); + + /* Initialize and clear the no dependency bits */ + vmx->pt_desc.ctl_bitmask = ~0ULL; + vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); + + /* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and +* PSBFreq can be set +*/ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | + RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and +* MTCFreq can be set +*/ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc)) + vmx->pt_desc.ctl_bitmask &= 
~(RTIT_CTL_MTC_EN | + RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); + + /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_ptwrite)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | + RTIT_CTL_PTW_EN); + + /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_power_event_trace)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; + + /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_topa_output)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_output_subsys)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; + + /* unmask address range configure area */ + for (i = 0; i < vmx->pt_desc.addr_range; i++) + vmx->pt_desc.ctl_bitmask &= ~(0xf << (32 + i * 4)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -10429,6 +10495,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) nested_vmx_cr_fixed1_bits_update(vcpu); + + if (boot_cpu_has(X86_FEATURE_INTEL_PT) && + guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) + update_intel_pt_cfg(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -- 1.8.3.1
[PATCH v9 08/12] KVM: x86: Add Intel Processor Trace context switch for each vcpu
From: Chao Peng Load/Store Intel processor trace register in context switch. MSR IA32_RTIT_CTL is loaded/stored automatically from VMCS. In HOST_GUEST mode, we need load/resore PT MSRs only when PT is enabled in guest. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/intel_pt.h | 2 + arch/x86/kvm/vmx.c | 94 + 2 files changed, 96 insertions(+) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 5748205..70f4139 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -8,6 +8,8 @@ #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 +#define RTIT_ADDR_RANGE4 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 24aded4..11fb90a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -597,6 +597,24 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) (unsigned long *)_desc->control); } +struct pt_ctx { + u64 ctl; + u64 status; + u64 output_base; + u64 output_mask; + u64 cr3_match; + u64 addr_a[RTIT_ADDR_RANGE]; + u64 addr_b[RTIT_ADDR_RANGE]; +}; + +struct pt_desc { + u64 ctl_bitmask; + u32 addr_range; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; + struct pt_ctx host; + struct pt_ctx guest; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -693,6 +711,8 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + + struct pt_desc pt_desc; }; enum segment_cache_field { @@ -2391,6 +2411,69 @@ static unsigned long segment_base(u16 selector) } #endif +static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + 
wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static void pt_guest_enter(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + /* Save host state before VM entry */ + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + + /* +* Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled +* on VM entry when it has been disabled in guest before). +*/ + vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl); + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + wrmsrl(MSR_IA32_RTIT_CTL, 0); + pt_save_msr(>pt_desc.host, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + } +} + +static void pt_guest_exit(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + pt_save_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.host, vmx->pt_desc.addr_range); + } + + /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); +} + static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6135,6 +6218,13 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } + + if (pt_mode == PT_MODE_HOST_GUEST) { + memset(>pt_desc, 0, sizeof(vmx->pt_desc)); + /* Bit[6~0] are forced to 1, writes are ignored. 
*/ + vmx->pt_desc.guest.output_mask = 0x7F; + vmcs_write64(GUEST_IA32_RTIT_CTL, 0); + } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9800,6 +9890,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu-&
[PATCH v9 09/12] KVM: x86: Introduce a function to initialize the PT configuration
Initialize the Intel PT configuration when cpuid update. Include cpuid inforamtion, rtit_ctl bit mask and the number of address ranges. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 70 ++ 1 file changed, 70 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 11fb90a..952ddf4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10411,6 +10411,72 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } +static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *best = NULL; + int i; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + best = kvm_find_cpuid_entry(vcpu, 0x14, i); + if (!best) + return; + vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; + vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; + vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; + vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; + } + + /* Get the number of configurable Address Ranges for filtering */ + vmx->pt_desc.addr_range = pt_cap_decode(vmx->pt_desc.caps, + PT_CAP_num_address_ranges); + + /* Initialize and clear the no dependency bits */ + vmx->pt_desc.ctl_bitmask = ~0ULL; + vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); + + /* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and +* PSBFreq can be set +*/ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | + RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and +* MTCFreq can be set +*/ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | + 
RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); + + /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_ptwrite)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | + RTIT_CTL_PTW_EN); + + /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_power_event_trace)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; + + /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_topa_output)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_output_subsys)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; + + /* unmask address range configure area */ + for (i = 0; i < vmx->pt_desc.addr_range; i++) + vmx->pt_desc.ctl_bitmask &= ~(0xf << (32 + i * 4)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -10429,6 +10495,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) nested_vmx_cr_fixed1_bits_update(vcpu); + + if (boot_cpu_has(X86_FEATURE_INTEL_PT) && + guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) + update_intel_pt_cfg(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -- 1.8.3.1
[PATCH v9 07/12] KVM: x86: Add Intel Processor Trace cpuid emulation
From: Chao Peng <chao.p.p...@linux.intel.com> Expose Intel Processor Trace to guest only when PT work in HOST_GUEST mode. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c| 22 -- arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c | 6 ++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b27de80..8f3c7ea 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1026,6 +1026,7 @@ struct kvm_x86_ops { bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); + bool (*pt_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 82055b9..e04bf67 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -336,6 +336,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; + unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? 
F(INTEL_PT) : 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -393,7 +394,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL); + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = @@ -423,7 +424,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xd); + entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd)); break; case 1: entry->edx &= kvm_cpuid_1_edx_x86_features; @@ -595,6 +596,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + /* Intel PT */ + case 0x14: { + int t, times = entry->eax; + + if (!f_intel_pt) + break; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (t = 1; t <= times; ++t) { + if (*nent >= maxnent) + goto out; + do_cpuid_1_ent([t], function, t); + entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 220e5a89..6df8075 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5876,6 +5876,11 @@ static bool svm_umip_emulated(void) return false; } +static bool svm_pt_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -7101,6 +7106,7 @@ static int svm_unregister_enc_region(struct kvm *kvm, .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, + .pt_supported = svm_pt_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git 
a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bb96396..24aded4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9588,6 +9588,11 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static bool vmx_pt_supported(void) +{ + return (pt_mode == PT_MODE_HOST_GUEST); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -12809,6 +12814,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, + .pt_supported = vmx_pt_supported, .check_nested_events = vmx_check_nested_events, -- 1.8.3.1
[PATCH v9 07/12] KVM: x86: Add Intel Processor Trace cpuid emulation
From: Chao Peng Expose Intel Processor Trace to guest only when PT work in HOST_GUEST mode. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c| 22 -- arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c | 6 ++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b27de80..8f3c7ea 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1026,6 +1026,7 @@ struct kvm_x86_ops { bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); + bool (*pt_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 82055b9..e04bf67 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -336,6 +336,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; + unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -393,7 +394,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL); + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = @@ -423,7 +424,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xd); + entry->eax = min(entry->eax, (u32)(f_intel_pt ? 
0x14 : 0xd)); break; case 1: entry->edx &= kvm_cpuid_1_edx_x86_features; @@ -595,6 +596,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + /* Intel PT */ + case 0x14: { + int t, times = entry->eax; + + if (!f_intel_pt) + break; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (t = 1; t <= times; ++t) { + if (*nent >= maxnent) + goto out; + do_cpuid_1_ent([t], function, t); + entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 220e5a89..6df8075 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5876,6 +5876,11 @@ static bool svm_umip_emulated(void) return false; } +static bool svm_pt_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -7101,6 +7106,7 @@ static int svm_unregister_enc_region(struct kvm *kvm, .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, + .pt_supported = svm_pt_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bb96396..24aded4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9588,6 +9588,11 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static bool vmx_pt_supported(void) +{ + return (pt_mode == PT_MODE_HOST_GUEST); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -12809,6 +12814,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, + .pt_supported = vmx_pt_supported, .check_nested_events = vmx_check_nested_events, -- 1.8.3.1
[PATCH v9 05/12] perf/x86/intel/pt: Introduce a new function to get capability of Intel PT
The new function pt_cap_decode() will be invoked in KVM to check whether a specific capability is available in the KVM guest. The existing function pt_cap_get() can only check the hardware capabilities, which may differ from those of the KVM guest because some features may not be exposed to the guest. Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/events/intel/pt.c | 10 -- arch/x86/include/asm/intel_pt.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index f65f97a..18a2e80 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -76,14 +76,20 @@ PT_CAP(psb_periods, 1, CPUID_EBX, 0x), }; -u32 pt_cap_get(enum pt_capabilities cap) +u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap) { struct pt_cap_desc *cd = _caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; + u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; unsigned int shift = __ffs(cd->mask); return (c & cd->mask) >> shift; } +EXPORT_SYMBOL_GPL(pt_cap_decode); + +u32 pt_cap_get(enum pt_capabilities cap) +{ + return pt_cap_decode(pt_pmu.caps, cap); +} EXPORT_SYMBOL_GPL(pt_cap_get); static ssize_t pt_cap_show(struct device *cdev, diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 2de4db0..9c71453 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -27,9 +27,11 @@ enum pt_capabilities { #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) void cpu_emergency_stop_pt(void); extern u32 pt_cap_get(enum pt_capabilities cap); +extern u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap); #else static inline void cpu_emergency_stop_pt(void) {} static inline u32 pt_cap_get(enum pt_capabilities cap) { return 0; } +static u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap) { return 0; } #endif #endif /* _ASM_X86_INTEL_PT_H */ -- 1.8.3.1
[PATCH v9 05/12] perf/x86/intel/pt: Introduce a new function to get capability of Intel PT
The new function pt_cap_decode() will be invoked in KVM to check whether a specific capability is available in the KVM guest. The existing function pt_cap_get() can only check the hardware capabilities, which may differ from those of the KVM guest because some features may not be exposed to the guest. Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 10 -- arch/x86/include/asm/intel_pt.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index f65f97a..18a2e80 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -76,14 +76,20 @@ PT_CAP(psb_periods, 1, CPUID_EBX, 0x), }; -u32 pt_cap_get(enum pt_capabilities cap) +u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap) { struct pt_cap_desc *cd = _caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; + u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; unsigned int shift = __ffs(cd->mask); return (c & cd->mask) >> shift; } +EXPORT_SYMBOL_GPL(pt_cap_decode); + +u32 pt_cap_get(enum pt_capabilities cap) +{ + return pt_cap_decode(pt_pmu.caps, cap); +} EXPORT_SYMBOL_GPL(pt_cap_get); static ssize_t pt_cap_show(struct device *cdev, diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 2de4db0..9c71453 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -27,9 +27,11 @@ enum pt_capabilities { #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) void cpu_emergency_stop_pt(void); extern u32 pt_cap_get(enum pt_capabilities cap); +extern u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap); #else static inline void cpu_emergency_stop_pt(void) {} static inline u32 pt_cap_get(enum pt_capabilities cap) { return 0; } +static u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap) { return 0; } #endif #endif /* _ASM_X86_INTEL_PT_H */ -- 1.8.3.1
[PATCH v9 03/12] perf/x86/intel/pt: Add new bit definitions for Intel PT MSRs
These bit definitions are used to emulate MSR reads/writes in KVM. For example, IA32_RTIT_CTL.FabricEn[bit 6] is available only when CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 1. If the KVM guest tries to set this bit while CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0, a #GP is injected into the KVM guest. Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/include/asm/msr-index.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index afe4e13..6ae2462 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -112,6 +112,7 @@ #define RTIT_CTL_USR BIT(3) #define RTIT_CTL_PWR_EVT_ENBIT(4) #define RTIT_CTL_FUP_ON_PTWBIT(5) +#define RTIT_CTL_FABRIC_EN BIT(6) #define RTIT_CTL_CR3EN BIT(7) #define RTIT_CTL_TOPA BIT(8) #define RTIT_CTL_MTC_ENBIT(9) @@ -140,6 +141,8 @@ #define RTIT_STATUS_BUFFOVFBIT(3) #define RTIT_STATUS_ERROR BIT(4) #define RTIT_STATUS_STOPPEDBIT(5) +#define RTIT_STATUS_BYTECNT_OFFSET 32 +#define RTIT_STATUS_BYTECNT(0x1ull << RTIT_STATUS_BYTECNT_OFFSET) #define MSR_IA32_RTIT_ADDR0_A 0x0580 #define MSR_IA32_RTIT_ADDR0_B 0x0581 #define MSR_IA32_RTIT_ADDR1_A 0x0582 -- 1.8.3.1
[PATCH v9 03/12] perf/x86/intel/pt: Add new bit definitions for Intel PT MSRs
These bit definitions are used to emulate MSR reads/writes in KVM. For example, IA32_RTIT_CTL.FabricEn[bit 6] is available only when CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 1. If the KVM guest tries to set this bit while CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0, a #GP is injected into the KVM guest. Signed-off-by: Luwei Kang --- arch/x86/include/asm/msr-index.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index afe4e13..6ae2462 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -112,6 +112,7 @@ #define RTIT_CTL_USR BIT(3) #define RTIT_CTL_PWR_EVT_ENBIT(4) #define RTIT_CTL_FUP_ON_PTWBIT(5) +#define RTIT_CTL_FABRIC_EN BIT(6) #define RTIT_CTL_CR3EN BIT(7) #define RTIT_CTL_TOPA BIT(8) #define RTIT_CTL_MTC_ENBIT(9) @@ -140,6 +141,8 @@ #define RTIT_STATUS_BUFFOVFBIT(3) #define RTIT_STATUS_ERROR BIT(4) #define RTIT_STATUS_STOPPEDBIT(5) +#define RTIT_STATUS_BYTECNT_OFFSET 32 +#define RTIT_STATUS_BYTECNT(0x1ull << RTIT_STATUS_BYTECNT_OFFSET) #define MSR_IA32_RTIT_ADDR0_A 0x0580 #define MSR_IA32_RTIT_ADDR0_B 0x0581 #define MSR_IA32_RTIT_ADDR1_A 0x0582 -- 1.8.3.1
[PATCH v9 01/12] perf/x86/intel/pt: Move Intel-PT MSRs bit definitions to a public header
From: Chao Peng <chao.p.p...@linux.intel.com> Intel Processor Trace virtualization enabling in KVM guest need to access these MSRs bit definitions, so move them to public header file msr-index.h. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/events/intel/pt.h | 37 - arch/x86/include/asm/msr-index.h | 33 + 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 0eb41d0..0050ca1 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -20,43 +20,6 @@ #define __INTEL_PT_H__ /* - * PT MSR bit definitions - */ -#define RTIT_CTL_TRACEEN BIT(0) -#define RTIT_CTL_CYCLEACC BIT(1) -#define RTIT_CTL_OSBIT(2) -#define RTIT_CTL_USR BIT(3) -#define RTIT_CTL_PWR_EVT_ENBIT(4) -#define RTIT_CTL_FUP_ON_PTWBIT(5) -#define RTIT_CTL_CR3EN BIT(7) -#define RTIT_CTL_TOPA BIT(8) -#define RTIT_CTL_MTC_ENBIT(9) -#define RTIT_CTL_TSC_ENBIT(10) -#define RTIT_CTL_DISRETC BIT(11) -#define RTIT_CTL_PTW_ENBIT(12) -#define RTIT_CTL_BRANCH_EN BIT(13) -#define RTIT_CTL_MTC_RANGE_OFFSET 14 -#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) -#define RTIT_CTL_CYC_THRESH_OFFSET 19 -#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) -#define RTIT_CTL_PSB_FREQ_OFFSET 24 -#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) -#define RTIT_CTL_ADDR0_OFFSET 32 -#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) -#define RTIT_CTL_ADDR1_OFFSET 36 -#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) -#define RTIT_CTL_ADDR2_OFFSET 40 -#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) -#define RTIT_CTL_ADDR3_OFFSET 44 -#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) -#define RTIT_STATUS_FILTEREN BIT(0) -#define RTIT_STATUS_CONTEXTEN BIT(1) -#define RTIT_STATUS_TRIGGEREN BIT(2) -#define RTIT_STATUS_BUFFOVFBIT(3) -#define RTIT_STATUS_ERROR BIT(4) -#define RTIT_STATUS_STOPPEDBIT(5) - -/* 
* Single-entry ToPA: when this close to region boundary, switch * buffers to avoid losing data. */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 53d5b1b..afe4e13 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -106,7 +106,40 @@ #define MSR_PEBS_LD_LAT_THRESHOLD 0x03f6 #define MSR_IA32_RTIT_CTL 0x0570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OSBIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_PWR_EVT_ENBIT(4) +#define RTIT_CTL_FUP_ON_PTWBIT(5) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_ENBIT(9) +#define RTIT_CTL_TSC_ENBIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_PTW_ENBIT(12) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define RTIT_CTL_ADDR0_OFFSET 32 +#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) +#define RTIT_CTL_ADDR1_OFFSET 36 +#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) +#define RTIT_CTL_ADDR2_OFFSET 40 +#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) +#define RTIT_CTL_ADDR3_OFFSET 44 +#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) #define MSR_IA32_RTIT_STATUS 0x0571 +#define RTIT_STATUS_FILTEREN BIT(0) +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_BUFFOVFBIT(3) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPEDBIT(5) #define MSR_IA32_RTIT_ADDR0_A 0x0580 #define MSR_IA32_RTIT_ADDR0_B 0x0581 #define MSR_IA32_RTIT_ADDR1_A 0x0582 -- 1.8.3.1
[PATCH v9 01/12] perf/x86/intel/pt: Move Intel-PT MSRs bit definitions to a public header
From: Chao Peng Intel Processor Trace virtualization enabling in KVM guest need to access these MSRs bit definitions, so move them to public header file msr-index.h. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.h | 37 - arch/x86/include/asm/msr-index.h | 33 + 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 0eb41d0..0050ca1 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -20,43 +20,6 @@ #define __INTEL_PT_H__ /* - * PT MSR bit definitions - */ -#define RTIT_CTL_TRACEEN BIT(0) -#define RTIT_CTL_CYCLEACC BIT(1) -#define RTIT_CTL_OSBIT(2) -#define RTIT_CTL_USR BIT(3) -#define RTIT_CTL_PWR_EVT_ENBIT(4) -#define RTIT_CTL_FUP_ON_PTWBIT(5) -#define RTIT_CTL_CR3EN BIT(7) -#define RTIT_CTL_TOPA BIT(8) -#define RTIT_CTL_MTC_ENBIT(9) -#define RTIT_CTL_TSC_ENBIT(10) -#define RTIT_CTL_DISRETC BIT(11) -#define RTIT_CTL_PTW_ENBIT(12) -#define RTIT_CTL_BRANCH_EN BIT(13) -#define RTIT_CTL_MTC_RANGE_OFFSET 14 -#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) -#define RTIT_CTL_CYC_THRESH_OFFSET 19 -#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) -#define RTIT_CTL_PSB_FREQ_OFFSET 24 -#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) -#define RTIT_CTL_ADDR0_OFFSET 32 -#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) -#define RTIT_CTL_ADDR1_OFFSET 36 -#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) -#define RTIT_CTL_ADDR2_OFFSET 40 -#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) -#define RTIT_CTL_ADDR3_OFFSET 44 -#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) -#define RTIT_STATUS_FILTEREN BIT(0) -#define RTIT_STATUS_CONTEXTEN BIT(1) -#define RTIT_STATUS_TRIGGEREN BIT(2) -#define RTIT_STATUS_BUFFOVFBIT(3) -#define RTIT_STATUS_ERROR BIT(4) -#define RTIT_STATUS_STOPPEDBIT(5) - -/* * Single-entry ToPA: when this close to region boundary, switch * buffers to avoid 
losing data. */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 53d5b1b..afe4e13 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -106,7 +106,40 @@ #define MSR_PEBS_LD_LAT_THRESHOLD 0x03f6 #define MSR_IA32_RTIT_CTL 0x0570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OSBIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_PWR_EVT_ENBIT(4) +#define RTIT_CTL_FUP_ON_PTWBIT(5) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_ENBIT(9) +#define RTIT_CTL_TSC_ENBIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_PTW_ENBIT(12) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define RTIT_CTL_ADDR0_OFFSET 32 +#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) +#define RTIT_CTL_ADDR1_OFFSET 36 +#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) +#define RTIT_CTL_ADDR2_OFFSET 40 +#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) +#define RTIT_CTL_ADDR3_OFFSET 44 +#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) #define MSR_IA32_RTIT_STATUS 0x0571 +#define RTIT_STATUS_FILTEREN BIT(0) +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_BUFFOVFBIT(3) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPEDBIT(5) #define MSR_IA32_RTIT_ADDR0_A 0x0580 #define MSR_IA32_RTIT_ADDR0_B 0x0581 #define MSR_IA32_RTIT_ADDR1_A 0x0582 -- 1.8.3.1
[PATCH v9 00/12] Intel Processor Trace virtualization enabling
Hi All, Here is a patch series that adds Intel Processor Trace enabling for KVM guests. The software developer's manual is available at: https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf (see Chapter 4, INTEL PROCESSOR TRACE: VMX IMPROVEMENTS). Introduction: Intel Processor Trace (Intel PT) is an extension of Intel Architecture that captures information about software execution using dedicated hardware facilities that cause only minimal performance perturbation to the software being traced. Details on the Intel PT infrastructure and trace capabilities can be found in the Intel 64 and IA-32 Architectures Software Developer’s Manual, Volume 3C. The suite of architecture changes serves to simplify the process of virtualizing Intel PT for use by guest software. There are two primary elements to the new architecture support for the VMX improvements made for Intel PT. 1. Addition of a new guest IA32_RTIT_CTL value field to the VMCS. — This serves to speed and simplify the process of disabling trace on VM exit, and restoring it on VM entry. 2. Enabling use of EPT to redirect PT output. — This enables the VMM to elect to virtualize the PT output buffer using EPT. In this mode, the CPU will treat PT output addresses as Guest Physical Addresses (GPAs) and translate them using EPT. This means that Intel PT output reads (of the ToPA table) and writes (of trace output) can cause EPT violations, and other output events. Processor Trace virtualization can work in one of two possible modes, selected by the new option "pt_mode". The default is system mode. a. system-wide: trace both host and guest and output to the host buffer; b. host-guest: trace host and guest simultaneously and output to their respective buffers. From V8: - move macro definition MSR_IA32_RTIT_ADDR_RANGE from msr-index.h to intel_pt.h; - initialize the RTIT_CTL bitmask to ~0ULL. 
From V7: - remove host-only mode since it can be emulated by perf code; - merge patches 8 and 9 to keep code and data in the same patch; - rename __pt_cap_get() to pt_cap_decode(); - other minor changes. From V6: - split patches 1~2 into four separate patches (these patches did 2 things) and add more descriptions. From V5: - rename the function from pt_cap_get_ex() to __pt_cap_get(); - replace most uses of vmx_pt_supported() with "pt_mode == PT_MODE_HOST_GUEST" (or !=). From V4: - add a data check when setting the value of MSR_IA32_RTIT_CTL; - invoke the new interface to set the intercept of MSR reads/writes after the "MSR bitmap per-vcpu" patches. From V3: - change default mode to SYSTEM mode; - add a new patch to move PT out of scattered features; - add a new function kvm_get_pt_addr_cnt() to get the number of address ranges; - add a new function vmx_set_rtit_ctl() to set the value of guest RTIT_CTL, GUEST_IA32_RTIT_CTL and the MSR intercepts. From v2: - replace *_PT_SUPPRESS_PIP with *_PT_CONCEAL_PIP; - clear SECONDARY_EXEC_PT_USE_GPA, VM_EXIT_CLEAR_IA32_RTIT_CTL and VM_ENTRY_LOAD_IA32_RTIT_CTL in SYSTEM mode. These bits must be all set or all clear; - move processor tracing out of scattered features; - add a new function to enable/disable interception of MSR reads/writes; - add read/write emulation for all Intel PT MSRs and disable interception when PT is enabled in the guest; - disable Intel PT and enable MSR interception when the L1 guest executes VMXON; - performance optimization. In host-only mode, we just need to save host RTIT_CTL before VM entry and restore host RTIT_CTL after VM exit; in HOST_GUEST mode, we need to save and restore all MSRs only when PT has been enabled in the guest. - use XSAVES/XRSTORS to implement the context switch. This is not implemented in this version and is still being debugged; a separate patch will address it. 
From v1: - remove guest-only mode because it can be covered by host-guest mode; - always set "use GPA for processor tracing" in the secondary execution controls when it is supported; - trap RTIT_CTL reads/writes. Forbid writes to this MSR after VMXON in the L1 hypervisor. Chao Peng (7): perf/x86/intel/pt: Move Intel-PT MSRs bit definitions to a public header perf/x86/intel/pt: Change pt_cap_get() to a public function KVM: x86: Add Intel Processor Trace virtualization mode KVM: x86: Add Intel Processor Trace cpuid emulation KVM: x86: Add Intel Processor Trace context switch for each vcpu KVM: x86: Implement Intel Processor Trace MSRs read/write emulation KVM: x86: Set intercept for Intel PT MSRs read/write Luwei Kang (5): perf/x86/intel/pt: Add new bit definitions for Intel PT MSRs perf/x86/intel/pt: add new capability for Intel PT perf/x86/intel/pt: Introduce a new function to get capability of Intel PT KVM: x86: Introduce a function to initialize the PT configuration KVM: x86: Disable Intel Processor Trace when VMXON in L1 guest arch/x86/events/intel/pt.c | 12 +- arc
[PATCH v9 00/12] Intel Processor Trace virtualization enabling
Hi All, Here is a patch series which adds Intel Processor Trace enabling in KVM guests. You can get its software developer manual from: https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf See Chapter 4, INTEL PROCESSOR TRACE: VMX IMPROVEMENTS. Introduction: Intel Processor Trace (Intel PT) is an extension of Intel Architecture that captures information about software execution using dedicated hardware facilities that cause only minimal performance perturbation to the software being traced. Details on the Intel PT infrastructure and trace capabilities can be found in the Intel 64 and IA-32 Architectures Software Developer’s Manual, Volume 3C. The suite of architecture changes serves to simplify the process of virtualizing Intel PT for use by guest software. There are two primary elements to the new VMX support improvements made for Intel PT. 1. Addition of a new guest IA32_RTIT_CTL value field to the VMCS. — This serves to speed and simplify the process of disabling trace on VM exit, and restoring it on VM entry. 2. Enabling use of EPT to redirect PT output. — This enables the VMM to elect to virtualize the PT output buffer using EPT. In this mode, the CPU will treat PT output addresses as Guest Physical Addresses (GPAs) and translate them using EPT. This means that Intel PT output reads (of the ToPA table) and writes (of trace output) can cause EPT violations, and other output events. Processor Trace virtualization can work in one of 2 possible modes, selected by setting the new option "pt_mode". The default value is system mode. a. system-wide: trace both host and guest and output to the host buffer; b. host-guest: trace host and guest simultaneously and output to their respective buffers. From V8: - move macro definition MSR_IA32_RTIT_ADDR_RANGE from msr-index.h to intel_pt.h; - initialize the RTIT_CTL bitmask to ~0ULL. 
From V7: - remove host only mode since it can be emulated by perf code; - merge patch 8 and 9 to make code and data in the same patch; - rename __pt_cap_get() to pt_cap_decode(); - other minor changes. From V6: - split patches 1~2 into four separate patches (these patches do 2 things) and add more descriptions. From V5: - rename the function from pt_cap_get_ex() to __pt_cap_get(); - replace most uses of vmx_pt_supported() with "pt_mode == PT_MODE_HOST_GUEST" (or !=). From V4: - add data check when setting the value of MSR_IA32_RTIT_CTL; - invoke the new interface to set the intercept of MSR reads/writes after the "MSR bitmap per-vcpu" patches. From V3: - change default mode to SYSTEM mode; - add a new patch to move PT out of scattered features; - add a new function kvm_get_pt_addr_cnt() to get the number of address ranges; - add a new function vmx_set_rtit_ctl() to set the value of guest RTIT_CTL, GUEST_IA32_RTIT_CTL and MSRs intercept. From v2: - replace *_PT_SUPPRESS_PIP with *_PT_CONCEAL_PIP; - clear SECONDARY_EXEC_PT_USE_GPA, VM_EXIT_CLEAR_IA32_RTIT_CTL and VM_ENTRY_LOAD_IA32_RTIT_CTL in SYSTEM mode. These bits must be all set or all clear; - move processor tracing out of scattered features; - add a new function to enable/disable interception of MSR reads/writes; - add all Intel PT MSRs read/write and disable intercept when PT is enabled in guest; - disable Intel PT and enable intercept MSRs when L1 guest VMXON; - performance optimization. In Host only mode, we just need to save host RTIT_CTL before vm-entry and restore host RTIT_CTL after vm-exit; in HOST_GUEST mode, we need to save and restore all MSRs only when PT is enabled in guest. - use XSAVES/XRESTORES to implement context switch. This is not implemented in this version and is still being debugged; a separate patch will cover it. 
>From v1: - remove guest-only mode because guest-only mode can be covered by host-guest mode; - always set "use GPA for processor tracing" in secondary execution control if it can be; - trap RTIT_CTL read/write. Forbid write this msr when VMXON in L1 hypervisor. Chao Peng (7): perf/x86/intel/pt: Move Intel-PT MSRs bit definitions to a public header perf/x86/intel/pt: Change pt_cap_get() to a public function KVM: x86: Add Intel Processor Trace virtualization mode KVM: x86: Add Intel Processor Trace cpuid emulation KVM: x86: Add Intel Processor Trace context switch for each vcpu KVM: x86: Implement Intel Processor Trace MSRs read/write emulation KVM: x86: Set intercept for Intel PT MSRs read/write Luwei Kang (5): perf/x86/intel/pt: Add new bit definitions for Intel PT MSRs perf/x86/intel/pt: add new capability for Intel PT perf/x86/intel/pt: Introduce a new function to get capability of Intel PT KVM: x86: Introduce a function to initialize the PT configuration KVM: x86: Disable Intel Processor Trace when VMXON in L1 guest arch/x86/events/intel/pt.c | 12 +- arc
[PATCH v8 04/12] perf/x86/intel/pt: add new capability for Intel PT
CPUID.(EAX=14H,ECX=0):ECX[bit 3] = 1 indicates support of output to the Trace Transport subsystem. MSR IA32_RTIT_CTL.FabricEn[bit 6] is reserved if CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0. This is used to emulate IA32_RTIT_CTL MSR read/write in KVM. A KVM guest write to IA32_RTIT_CTL will trap to root mode, and a #GP will be injected into the guest if it sets IA32_RTIT_CTL.FabricEn while CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0. Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/events/intel/pt.c | 1 + arch/x86/include/asm/intel_pt.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index c80e2f5..f65f97a 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -68,6 +68,7 @@ PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), + PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), PT_CAP(payloads_lip,0, CPUID_ECX, BIT(31)), PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), PT_CAP(mtc_periods, 1, CPUID_EAX, 0x), diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 4270421..2de4db0 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -16,6 +16,7 @@ enum pt_capabilities { PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, + PT_CAP_output_subsys, PT_CAP_payloads_lip, PT_CAP_num_address_ranges, PT_CAP_mtc_periods, -- 1.8.3.1
[PATCH v8 04/12] perf/x86/intel/pt: add new capability for Intel PT
CPUID.(EAX=14H,ECX=0):ECX[bit 3] = 1 indicates support of output to the Trace Transport subsystem. MSR IA32_RTIT_CTL.FabricEn[bit 6] is reserved if CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0. This is used to emulate IA32_RTIT_CTL MSR read/write in KVM. A KVM guest write to IA32_RTIT_CTL will trap to root mode, and a #GP will be injected into the guest if it sets IA32_RTIT_CTL.FabricEn while CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0. Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 1 + arch/x86/include/asm/intel_pt.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index c80e2f5..f65f97a 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -68,6 +68,7 @@ PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), + PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), PT_CAP(payloads_lip,0, CPUID_ECX, BIT(31)), PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), PT_CAP(mtc_periods, 1, CPUID_EAX, 0x), diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 4270421..2de4db0 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -16,6 +16,7 @@ enum pt_capabilities { PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, + PT_CAP_output_subsys, PT_CAP_payloads_lip, PT_CAP_num_address_ranges, PT_CAP_mtc_periods, -- 1.8.3.1
[PATCH v8 12/12] KVM: x86: Disable Intel Processor Trace when VMXON in L1 guest
Currently, Intel Processor Trace does not support tracing in L1 guest VMX operation (IA32_VMX_MISC[bit 14] is 0). As mentioned in the SDM, on this type of processor, execution of the VMXON instruction clears IA32_RTIT_CTL.TraceEn and any attempt to write IA32_RTIT_CTL causes a general-protection exception (#GP). Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/kvm/vmx.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 170cd48..7ace11a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3996,7 +3996,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || - vmx_rtit_ctl_check(vcpu, data)) + vmx_rtit_ctl_check(vcpu, data) || + vmx->nested.vmxon) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); @@ -8089,6 +8090,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (ret) return ret; + if (pt_mode == PT_MODE_HOST_GUEST) { + vmx->pt_desc.guest.ctl = 0; + pt_set_intercept_for_msr(vmx, 1); + } + nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); } -- 1.8.3.1
[PATCH v8 12/12] KVM: x86: Disable Intel Processor Trace when VMXON in L1 guest
Currently, Intel Processor Trace does not support tracing in L1 guest VMX operation (IA32_VMX_MISC[bit 14] is 0). As mentioned in the SDM, on this type of processor, execution of the VMXON instruction clears IA32_RTIT_CTL.TraceEn and any attempt to write IA32_RTIT_CTL causes a general-protection exception (#GP). Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 170cd48..7ace11a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3996,7 +3996,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || - vmx_rtit_ctl_check(vcpu, data)) + vmx_rtit_ctl_check(vcpu, data) || + vmx->nested.vmxon) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); @@ -8089,6 +8090,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (ret) return ret; + if (pt_mode == PT_MODE_HOST_GUEST) { + vmx->pt_desc.guest.ctl = 0; + pt_set_intercept_for_msr(vmx, 1); + } + nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); } -- 1.8.3.1
[PATCH v8 10/12] KVM: x86: Implement Intel Processor Trace MSRs read/write emulation
From: Chao Peng <chao.p.p...@linux.intel.com> This patch implement Intel Processor Trace MSRs read/write emulation. Intel PT MSRs read/write need to be emulated when Intel PT MSRs is intercepted in guest and during live migration. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/include/asm/intel_pt.h | 8 ++ arch/x86/kvm/vmx.c | 172 arch/x86/kvm/x86.c | 33 +++- 3 files changed, 212 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 5748205..3da4cdb 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -8,6 +8,14 @@ #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 +#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ + RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ + RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ + RTIT_STATUS_BYTECNT)) + +#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ + (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5c321cd..d04b235 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2808,6 +2808,77 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); } +static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long value; + + /* +* Any MSR write that attempts to change bits marked reserved will +* case a #GP fault. +*/ + if (data & vmx->pt_desc.ctl_bitmask) + return 1; + + /* +* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will +* result in a #GP unless the same write also clears TraceEn. 
+*/ + if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && + ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) + return 1; + + /* +* WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit +* and FabricEn would cause #GP, if +* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 +*/ + if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && + !(data & RTIT_CTL_FABRIC_EN) && + !pt_cap_decode(vmx->pt_desc.caps, PT_CAP_single_range_output)) + return 1; + + /* +* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that +* utilize encodings marked reserved will casue a #GP fault. +*/ + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc_periods); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc) && + !test_bit((data & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET, )) + return 1; + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cycle_thresholds); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET, )) + return 1; + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_periods); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET, )) + return 1; + + /* +* If ADDRx_CFG is reserved or the encodings is >2 will +* cause a #GP fault. 
+*/ + value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) + return 1; + + return 0; +} + + static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; @@ -3624,6 +3695,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_ms
[PATCH v8 10/12] KVM: x86: Implement Intel Processor Trace MSRs read/write emulation
From: Chao Peng This patch implement Intel Processor Trace MSRs read/write emulation. Intel PT MSRs read/write need to be emulated when Intel PT MSRs is intercepted in guest and during live migration. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/intel_pt.h | 8 ++ arch/x86/kvm/vmx.c | 172 arch/x86/kvm/x86.c | 33 +++- 3 files changed, 212 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 5748205..3da4cdb 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -8,6 +8,14 @@ #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 +#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ + RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ + RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ + RTIT_STATUS_BYTECNT)) + +#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ + (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5c321cd..d04b235 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2808,6 +2808,77 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); } +static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long value; + + /* +* Any MSR write that attempts to change bits marked reserved will +* case a #GP fault. +*/ + if (data & vmx->pt_desc.ctl_bitmask) + return 1; + + /* +* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will +* result in a #GP unless the same write also clears TraceEn. 
+*/ + if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && + ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) + return 1; + + /* +* WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit +* and FabricEn would cause #GP, if +* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 +*/ + if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && + !(data & RTIT_CTL_FABRIC_EN) && + !pt_cap_decode(vmx->pt_desc.caps, PT_CAP_single_range_output)) + return 1; + + /* +* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that +* utilize encodings marked reserved will casue a #GP fault. +*/ + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc_periods); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc) && + !test_bit((data & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET, )) + return 1; + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cycle_thresholds); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET, )) + return 1; + value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_periods); + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET, )) + return 1; + + /* +* If ADDRx_CFG is reserved or the encodings is >2 will +* cause a #GP fault. 
+*/ + value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; + if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) + return 1; + + return 0; +} + + static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; @@ -3624,6 +3695,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; + u32 index; switch (msr_info->index) {
[PATCH v8 11/12] KVM: x86: Set intercept for Intel PT MSRs read/write
From: Chao Peng <chao.p.p...@linux.intel.com> Disable intercept Intel PT MSRs only when Intel PT is enabled in guest. But MSR_IA32_RTIT_CTL will alway be intercept. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/kvm/vmx.c | 23 +++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d04b235..170cd48 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -947,6 +947,7 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3998,6 +3999,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_rtit_ctl_check(vcpu, data)) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); + pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); vmx->pt_desc.guest.ctl = data; break; case MSR_IA32_RTIT_STATUS: @@ -5819,6 +5821,27 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) vmx->msr_bitmap_mode = mode; } +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag) +{ + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + u32 i; + + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, + MSR_TYPE_RW, flag); + for (i = 0; i < vmx->pt_desc.addr_range; i++) { + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_B + 
i * 2, MSR_TYPE_RW, flag); + } +} + static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) { return enable_apicv; -- 1.8.3.1
[PATCH v8 11/12] KVM: x86: Set intercept for Intel PT MSRs read/write
From: Chao Peng Disable intercept Intel PT MSRs only when Intel PT is enabled in guest. But MSR_IA32_RTIT_CTL will alway be intercept. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 23 +++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d04b235..170cd48 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -947,6 +947,7 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3998,6 +3999,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_rtit_ctl_check(vcpu, data)) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); + pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN)); vmx->pt_desc.guest.ctl = data; break; case MSR_IA32_RTIT_STATUS: @@ -5819,6 +5821,27 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) vmx->msr_bitmap_mode = mode; } +static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag) +{ + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + u32 i; + + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, + MSR_TYPE_RW, flag); + for (i = 0; i < vmx->pt_desc.addr_range; i++) { + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(msr_bitmap, + MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); + } +} + static bool vmx_get_enable_apicv(struct 
kvm_vcpu *vcpu) { return enable_apicv; -- 1.8.3.1
[PATCH v8 07/12] KVM: x86: Add Intel Processor Trace cpuid emulation
From: Chao Peng <chao.p.p...@linux.intel.com> Expose Intel Processor Trace to guest only when PT work in HOST_GUEST mode. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c| 22 -- arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c | 6 ++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8cb8461..e9acd8b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1025,6 +1025,7 @@ struct kvm_x86_ops { bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); + bool (*pt_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 82055b9..e04bf67 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -336,6 +336,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; + unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? 
F(INTEL_PT) : 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -393,7 +394,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL); + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = @@ -423,7 +424,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xd); + entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd)); break; case 1: entry->edx &= kvm_cpuid_1_edx_x86_features; @@ -595,6 +596,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + /* Intel PT */ + case 0x14: { + int t, times = entry->eax; + + if (!f_intel_pt) + break; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (t = 1; t <= times; ++t) { + if (*nent >= maxnent) + goto out; + do_cpuid_1_ent([t], function, t); + entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1fc05e4..21b2441 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5876,6 +5876,11 @@ static bool svm_umip_emulated(void) return false; } +static bool svm_pt_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -7101,6 +7106,7 @@ static int svm_unregister_enc_region(struct kvm *kvm, .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, + .pt_supported = svm_pt_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git 
a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ede5abf..f9b701a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9590,6 +9590,11 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static bool vmx_pt_supported(void) +{ + return (pt_mode == PT_MODE_HOST_GUEST); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -12817,6 +12822,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, + .pt_supported = vmx_pt_supported, .check_nested_events = vmx_check_nested_events, -- 1.8.3.1
[PATCH v8 08/12] KVM: x86: Add Intel Processor Trace context switch for each vcpu
From: Chao Peng <chao.p.p...@linux.intel.com> Load/Store Intel processor trace register in context switch. MSR IA32_RTIT_CTL is loaded/stored automatically from VMCS. In HOST_GUEST mode, we need load/resore PT MSRs only when PT is enabled in guest. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/kvm/vmx.c | 94 ++ 1 file changed, 94 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f9b701a..eb5f50a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -596,6 +596,24 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) (unsigned long *)_desc->control); } +struct pt_ctx { + u64 ctl; + u64 status; + u64 output_base; + u64 output_mask; + u64 cr3_match; + u64 addr_a[MSR_IA32_RTIT_ADDR_RANGE]; + u64 addr_b[MSR_IA32_RTIT_ADDR_RANGE]; +}; + +struct pt_desc { + u64 ctl_bitmask; + u32 addr_range; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; + struct pt_ctx host; + struct pt_ctx guest; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -692,6 +710,8 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + + struct pt_desc pt_desc; }; enum segment_cache_field { @@ -2390,6 +2410,69 @@ static unsigned long segment_base(u16 selector) } #endif +static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + 
rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static void pt_guest_enter(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + /* Save host state before VM entry */ + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + + /* +* Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled +* on VM entry when it has been disabled in guest before). +*/ + vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl); + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + wrmsrl(MSR_IA32_RTIT_CTL, 0); + pt_save_msr(>pt_desc.host, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + } +} + +static void pt_guest_exit(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + pt_save_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.host, vmx->pt_desc.addr_range); + } + + /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); +} + static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6134,6 +6217,13 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } + + if (pt_mode == PT_MODE_HOST_GUEST) { + memset(>pt_desc, 0, sizeof(vmx->pt_desc)); + /* Bit[6~0] are forced to 1, writes are ignored. 
*/ + vmx->pt_desc.guest.output_mask = 0x7F; + vmcs_write64(GUEST_IA32_RTIT_CTL, 0); + } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9802,6 +9892,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vcpu->arch.pkru); + pt_guest_enter(vmx); + atomic_switch_perf_msrs(vmx); vmx_arm_hv_timer(vcpu); @@ -9996,6 +10088,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_
[PATCH v8 09/12] KVM: x86: Introduce a function to initialize the PT configuration
Initialize the Intel PT configuration when the CPUID is updated. This includes the CPUID information, the rtit_ctl bit mask and the number of address ranges. Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/kvm/vmx.c | 69 ++ 1 file changed, 69 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index eb5f50a..5c321cd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10413,6 +10413,71 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } +static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *best = NULL; + int i; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + best = kvm_find_cpuid_entry(vcpu, 0x14, i); + if (!best) + return; + vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; + vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; + vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; + vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; + } + + /* Get the number of configurable Address Ranges for filtering */ + vmx->pt_desc.addr_range = pt_cap_decode(vmx->pt_desc.caps, + PT_CAP_num_address_ranges); + + /* Clear the no dependency bits */ + vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); + + /* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and +* PSBFreq can be set +*/ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | + RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and +* MTCFreq can be set +*/ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | + RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); 
+ + /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_ptwrite)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | + RTIT_CTL_PTW_EN); + + /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_power_event_trace)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; + + /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_topa_output)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_output_subsys)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; + + /* unmask address range configure area */ + for (i = 0; i < vmx->pt_desc.addr_range; i++) + vmx->pt_desc.ctl_bitmask &= ~(0xf << (32 + i * 4)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -10431,6 +10496,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) nested_vmx_cr_fixed1_bits_update(vcpu); + + if (boot_cpu_has(X86_FEATURE_INTEL_PT) && + guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) + update_intel_pt_cfg(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -- 1.8.3.1
[PATCH v8 07/12] KVM: x86: Add Intel Processor Trace cpuid emulation
From: Chao Peng Expose Intel Processor Trace to the guest only when PT works in HOST_GUEST mode. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c| 22 -- arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c | 6 ++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8cb8461..e9acd8b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1025,6 +1025,7 @@ struct kvm_x86_ops { bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); + bool (*pt_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 82055b9..e04bf67 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -336,6 +336,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; + unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -393,7 +394,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL); + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = @@ -423,7 +424,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xd); + entry->eax = min(entry->eax, (u32)(f_intel_pt ? 
0x14 : 0xd)); break; case 1: entry->edx &= kvm_cpuid_1_edx_x86_features; @@ -595,6 +596,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + /* Intel PT */ + case 0x14: { + int t, times = entry->eax; + + if (!f_intel_pt) + break; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (t = 1; t <= times; ++t) { + if (*nent >= maxnent) + goto out; + do_cpuid_1_ent([t], function, t); + entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1fc05e4..21b2441 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5876,6 +5876,11 @@ static bool svm_umip_emulated(void) return false; } +static bool svm_pt_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -7101,6 +7106,7 @@ static int svm_unregister_enc_region(struct kvm *kvm, .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, + .pt_supported = svm_pt_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ede5abf..f9b701a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9590,6 +9590,11 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static bool vmx_pt_supported(void) +{ + return (pt_mode == PT_MODE_HOST_GUEST); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -12817,6 +12822,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, + .pt_supported = vmx_pt_supported, .check_nested_events = vmx_check_nested_events, -- 1.8.3.1
[PATCH v8 08/12] KVM: x86: Add Intel Processor Trace context switch for each vcpu
From: Chao Peng Load/store the Intel Processor Trace registers on context switch. MSR IA32_RTIT_CTL is loaded/stored automatically from the VMCS. In HOST_GUEST mode, we need to load/restore the PT MSRs only when PT is enabled in the guest. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 94 ++ 1 file changed, 94 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f9b701a..eb5f50a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -596,6 +596,24 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) (unsigned long *)_desc->control); } +struct pt_ctx { + u64 ctl; + u64 status; + u64 output_base; + u64 output_mask; + u64 cr3_match; + u64 addr_a[MSR_IA32_RTIT_ADDR_RANGE]; + u64 addr_b[MSR_IA32_RTIT_ADDR_RANGE]; +}; + +struct pt_desc { + u64 ctl_bitmask; + u32 addr_range; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; + struct pt_ctx host; + struct pt_ctx guest; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -692,6 +710,8 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + + struct pt_desc pt_desc; }; enum segment_cache_field { @@ -2390,6 +2410,69 @@ static unsigned long segment_base(u16 selector) } #endif +static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; 
i++) { + rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static void pt_guest_enter(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + /* Save host state before VM entry */ + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + + /* +* Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled +* on VM entry when it has been disabled in guest before). +*/ + vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl); + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + wrmsrl(MSR_IA32_RTIT_CTL, 0); + pt_save_msr(>pt_desc.host, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + } +} + +static void pt_guest_exit(struct vcpu_vmx *vmx) +{ + if (pt_mode == PT_MODE_SYSTEM) + return; + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + pt_save_msr(>pt_desc.guest, vmx->pt_desc.addr_range); + pt_load_msr(>pt_desc.host, vmx->pt_desc.addr_range); + } + + /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); +} + static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6134,6 +6217,13 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } + + if (pt_mode == PT_MODE_HOST_GUEST) { + memset(>pt_desc, 0, sizeof(vmx->pt_desc)); + /* Bit[6~0] are forced to 1, writes are ignored. 
*/ + vmx->pt_desc.guest.output_mask = 0x7F; + vmcs_write64(GUEST_IA32_RTIT_CTL, 0); + } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9802,6 +9892,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vcpu->arch.pkru); + pt_guest_enter(vmx); + atomic_switch_perf_msrs(vmx); vmx_arm_hv_timer(vcpu); @@ -9996,6 +10088,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; + pt_guest_exit(vmx); + /* * ea
[PATCH v8 09/12] KVM: x86: Introduce a function to initialize the PT configuration
Initialize the Intel PT configuration when the CPUID is updated. This includes the CPUID information, the rtit_ctl bit mask and the number of address ranges. Signed-off-by: Luwei Kang --- arch/x86/kvm/vmx.c | 69 ++ 1 file changed, 69 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index eb5f50a..5c321cd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10413,6 +10413,71 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } +static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *best = NULL; + int i; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + best = kvm_find_cpuid_entry(vcpu, 0x14, i); + if (!best) + return; + vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; + vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; + vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; + vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; + } + + /* Get the number of configurable Address Ranges for filtering */ + vmx->pt_desc.addr_range = pt_cap_decode(vmx->pt_desc.caps, + PT_CAP_num_address_ranges); + + /* Clear the no dependency bits */ + vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); + + /* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and +* PSBFreq can be set +*/ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | + RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); + + /* +* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and +* MTCFreq can be set +*/ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | + RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); + + /* If 
CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_ptwrite)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | + RTIT_CTL_PTW_EN); + + /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_power_event_trace)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; + + /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_topa_output)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */ + if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_output_subsys)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; + + /* unmask address range configure area */ + for (i = 0; i < vmx->pt_desc.addr_range; i++) + vmx->pt_desc.ctl_bitmask &= ~(0xf << (32 + i * 4)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -10431,6 +10496,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) nested_vmx_cr_fixed1_bits_update(vcpu); + + if (boot_cpu_has(X86_FEATURE_INTEL_PT) && + guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) + update_intel_pt_cfg(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -- 1.8.3.1
[PATCH v8 01/12] perf/x86/intel/pt: Move Intel-PT MSRs bit definitions to a public header
From: Chao Peng <chao.p.p...@linux.intel.com> Intel Processor Trace virtualization enabling in KVM guest need to access these MSRs bit definitions, so move them to public header file msr-index.h. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/events/intel/pt.h | 37 - arch/x86/include/asm/msr-index.h | 34 ++ 2 files changed, 34 insertions(+), 37 deletions(-) diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 0eb41d0..0050ca1 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -20,43 +20,6 @@ #define __INTEL_PT_H__ /* - * PT MSR bit definitions - */ -#define RTIT_CTL_TRACEEN BIT(0) -#define RTIT_CTL_CYCLEACC BIT(1) -#define RTIT_CTL_OSBIT(2) -#define RTIT_CTL_USR BIT(3) -#define RTIT_CTL_PWR_EVT_ENBIT(4) -#define RTIT_CTL_FUP_ON_PTWBIT(5) -#define RTIT_CTL_CR3EN BIT(7) -#define RTIT_CTL_TOPA BIT(8) -#define RTIT_CTL_MTC_ENBIT(9) -#define RTIT_CTL_TSC_ENBIT(10) -#define RTIT_CTL_DISRETC BIT(11) -#define RTIT_CTL_PTW_ENBIT(12) -#define RTIT_CTL_BRANCH_EN BIT(13) -#define RTIT_CTL_MTC_RANGE_OFFSET 14 -#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) -#define RTIT_CTL_CYC_THRESH_OFFSET 19 -#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) -#define RTIT_CTL_PSB_FREQ_OFFSET 24 -#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) -#define RTIT_CTL_ADDR0_OFFSET 32 -#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) -#define RTIT_CTL_ADDR1_OFFSET 36 -#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) -#define RTIT_CTL_ADDR2_OFFSET 40 -#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) -#define RTIT_CTL_ADDR3_OFFSET 44 -#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) -#define RTIT_STATUS_FILTEREN BIT(0) -#define RTIT_STATUS_CONTEXTEN BIT(1) -#define RTIT_STATUS_TRIGGEREN BIT(2) -#define RTIT_STATUS_BUFFOVFBIT(3) -#define RTIT_STATUS_ERROR BIT(4) -#define RTIT_STATUS_STOPPEDBIT(5) - -/* 
* Single-entry ToPA: when this close to region boundary, switch * buffers to avoid losing data. */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 53d5b1b..5e8d156 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -106,7 +106,40 @@ #define MSR_PEBS_LD_LAT_THRESHOLD 0x03f6 #define MSR_IA32_RTIT_CTL 0x0570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OSBIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_PWR_EVT_ENBIT(4) +#define RTIT_CTL_FUP_ON_PTWBIT(5) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_ENBIT(9) +#define RTIT_CTL_TSC_ENBIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_PTW_ENBIT(12) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define RTIT_CTL_ADDR0_OFFSET 32 +#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) +#define RTIT_CTL_ADDR1_OFFSET 36 +#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) +#define RTIT_CTL_ADDR2_OFFSET 40 +#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) +#define RTIT_CTL_ADDR3_OFFSET 44 +#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) #define MSR_IA32_RTIT_STATUS 0x0571 +#define RTIT_STATUS_FILTEREN BIT(0) +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_BUFFOVFBIT(3) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPEDBIT(5) #define MSR_IA32_RTIT_ADDR0_A 0x0580 #define MSR_IA32_RTIT_ADDR0_B 0x0581 #define MSR_IA32_RTIT_ADDR1
[PATCH v8 05/12] perf/x86/intel/pt: Introduce a new function to get capability of Intel PT
The new function pt_cap_decode() will be invoked in KVM to check whether a specific capability is available in a KVM guest. The existing function pt_cap_get() can only check the hardware capabilities, which may differ from those of a KVM guest because some features may not be exposed to the guest. Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/events/intel/pt.c | 10 -- arch/x86/include/asm/intel_pt.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index f65f97a..18a2e80 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -76,14 +76,20 @@ PT_CAP(psb_periods, 1, CPUID_EBX, 0x), }; -u32 pt_cap_get(enum pt_capabilities cap) +u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap) { struct pt_cap_desc *cd = _caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; + u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; unsigned int shift = __ffs(cd->mask); return (c & cd->mask) >> shift; } +EXPORT_SYMBOL_GPL(pt_cap_decode); + +u32 pt_cap_get(enum pt_capabilities cap) +{ + return pt_cap_decode(pt_pmu.caps, cap); +} EXPORT_SYMBOL_GPL(pt_cap_get); static ssize_t pt_cap_show(struct device *cdev, diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 2de4db0..9c71453 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -27,9 +27,11 @@ enum pt_capabilities { #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) void cpu_emergency_stop_pt(void); extern u32 pt_cap_get(enum pt_capabilities cap); +extern u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap); #else static inline void cpu_emergency_stop_pt(void) {} static inline u32 pt_cap_get(enum pt_capabilities cap) { return 0; } +static u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap) { return 0; } #endif #endif /* _ASM_X86_INTEL_PT_H */ -- 1.8.3.1
[PATCH v8 01/12] perf/x86/intel/pt: Move Intel-PT MSRs bit definitions to a public header
From: Chao Peng Intel Processor Trace virtualization enabling in KVM guest need to access these MSRs bit definitions, so move them to public header file msr-index.h. Signed-off-by: Chao Peng Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.h | 37 - arch/x86/include/asm/msr-index.h | 34 ++ 2 files changed, 34 insertions(+), 37 deletions(-) diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 0eb41d0..0050ca1 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -20,43 +20,6 @@ #define __INTEL_PT_H__ /* - * PT MSR bit definitions - */ -#define RTIT_CTL_TRACEEN BIT(0) -#define RTIT_CTL_CYCLEACC BIT(1) -#define RTIT_CTL_OSBIT(2) -#define RTIT_CTL_USR BIT(3) -#define RTIT_CTL_PWR_EVT_ENBIT(4) -#define RTIT_CTL_FUP_ON_PTWBIT(5) -#define RTIT_CTL_CR3EN BIT(7) -#define RTIT_CTL_TOPA BIT(8) -#define RTIT_CTL_MTC_ENBIT(9) -#define RTIT_CTL_TSC_ENBIT(10) -#define RTIT_CTL_DISRETC BIT(11) -#define RTIT_CTL_PTW_ENBIT(12) -#define RTIT_CTL_BRANCH_EN BIT(13) -#define RTIT_CTL_MTC_RANGE_OFFSET 14 -#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) -#define RTIT_CTL_CYC_THRESH_OFFSET 19 -#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) -#define RTIT_CTL_PSB_FREQ_OFFSET 24 -#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) -#define RTIT_CTL_ADDR0_OFFSET 32 -#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) -#define RTIT_CTL_ADDR1_OFFSET 36 -#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) -#define RTIT_CTL_ADDR2_OFFSET 40 -#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) -#define RTIT_CTL_ADDR3_OFFSET 44 -#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) -#define RTIT_STATUS_FILTEREN BIT(0) -#define RTIT_STATUS_CONTEXTEN BIT(1) -#define RTIT_STATUS_TRIGGEREN BIT(2) -#define RTIT_STATUS_BUFFOVFBIT(3) -#define RTIT_STATUS_ERROR BIT(4) -#define RTIT_STATUS_STOPPEDBIT(5) - -/* * Single-entry ToPA: when this close to region boundary, switch * buffers to avoid 
losing data. */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 53d5b1b..5e8d156 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -106,7 +106,40 @@ #define MSR_PEBS_LD_LAT_THRESHOLD 0x03f6 #define MSR_IA32_RTIT_CTL 0x0570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OSBIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_PWR_EVT_ENBIT(4) +#define RTIT_CTL_FUP_ON_PTWBIT(5) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_ENBIT(9) +#define RTIT_CTL_TSC_ENBIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_PTW_ENBIT(12) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define RTIT_CTL_ADDR0_OFFSET 32 +#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) +#define RTIT_CTL_ADDR1_OFFSET 36 +#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) +#define RTIT_CTL_ADDR2_OFFSET 40 +#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) +#define RTIT_CTL_ADDR3_OFFSET 44 +#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) #define MSR_IA32_RTIT_STATUS 0x0571 +#define RTIT_STATUS_FILTEREN BIT(0) +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_BUFFOVFBIT(3) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPEDBIT(5) #define MSR_IA32_RTIT_ADDR0_A 0x0580 #define MSR_IA32_RTIT_ADDR0_B 0x0581 #define MSR_IA32_RTIT_ADDR1_A 0x0582 @@ -115,6 +148,7 @@ #define MSR_IA32_RTIT_ADDR2_B 0x05
[PATCH v8 05/12] perf/x86/intel/pt: Introduce a new function to get capability of Intel PT
The new function pt_cap_decode() will be invoked in KVM to check whether a specific capability is available in a KVM guest. The existing function pt_cap_get() can only check the hardware capabilities, which may differ from those of a KVM guest because some features may not be exposed to the guest. Signed-off-by: Luwei Kang --- arch/x86/events/intel/pt.c | 10 -- arch/x86/include/asm/intel_pt.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index f65f97a..18a2e80 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -76,14 +76,20 @@ PT_CAP(psb_periods, 1, CPUID_EBX, 0x), }; -u32 pt_cap_get(enum pt_capabilities cap) +u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap) { struct pt_cap_desc *cd = _caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; + u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; unsigned int shift = __ffs(cd->mask); return (c & cd->mask) >> shift; } +EXPORT_SYMBOL_GPL(pt_cap_decode); + +u32 pt_cap_get(enum pt_capabilities cap) +{ + return pt_cap_decode(pt_pmu.caps, cap); +} EXPORT_SYMBOL_GPL(pt_cap_get); static ssize_t pt_cap_show(struct device *cdev, diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 2de4db0..9c71453 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -27,9 +27,11 @@ enum pt_capabilities { #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) void cpu_emergency_stop_pt(void); extern u32 pt_cap_get(enum pt_capabilities cap); +extern u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap); #else static inline void cpu_emergency_stop_pt(void) {} static inline u32 pt_cap_get(enum pt_capabilities cap) { return 0; } +static u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap) { return 0; } #endif #endif /* _ASM_X86_INTEL_PT_H */ -- 1.8.3.1
[PATCH v8 06/12] KVM: x86: Add Intel Processor Trace virtualization mode
From: Chao Peng <chao.p.p...@linux.intel.com> Intel PT virtualization can work in one of two possible modes: a. system-wide: trace both host and guest and output to the host buffer; b. host-guest: trace host and guest simultaneously and output to their respective buffers. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> Signed-off-by: Luwei Kang <luwei.k...@intel.com> --- arch/x86/include/asm/intel_pt.h | 3 ++ arch/x86/include/asm/msr-index.h | 1 + arch/x86/include/asm/vmx.h | 8 + arch/x86/kvm/vmx.c | 68 +--- 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 9c71453..5748205 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -5,6 +5,9 @@ #define PT_CPUID_LEAVES2 #define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ +#define PT_MODE_SYSTEM 0 +#define PT_MODE_HOST_GUEST 1 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f163f04..96a1fc8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -790,6 +790,7 @@ #define VMX_BASIC_INOUT0x0040LLU /* MSR_IA32_VMX_MISC bits */ +#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 5db8b0b..5936d72 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -76,7 +76,9 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x4000 #define SECONDARY_EXEC_RDSEED_EXITING 0x0001 #define SECONDARY_EXEC_ENABLE_PML 0x0002 +#define SECONDARY_EXEC_PT_CONCEAL_VMX 0x0008 #define SECONDARY_EXEC_XSAVES 0x0010 +#define SECONDARY_EXEC_PT_USE_GPA 0x0100 #define SECONDARY_EXEC_TSC_SCALING 0x0200 #define PIN_BASED_EXT_INTR_MASK 0x0001 @@ -97,6 +99,8 @@ #define 
VM_EXIT_LOAD_IA32_EFER 0x0020 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x0040 #define VM_EXIT_CLEAR_BNDCFGS 0x0080 +#define VM_EXIT_PT_CONCEAL_PIP 0x0100 +#define VM_EXIT_CLEAR_IA32_RTIT_CTL0x0200 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -108,6 +112,8 @@ #define VM_ENTRY_LOAD_IA32_PAT 0x4000 #define VM_ENTRY_LOAD_IA32_EFER 0x8000 #define VM_ENTRY_LOAD_BNDCFGS 0x0001 +#define VM_ENTRY_PT_CONCEAL_PIP0x0002 +#define VM_ENTRY_LOAD_IA32_RTIT_CTL0x0004 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x11ff @@ -234,6 +240,8 @@ enum vmcs_field { GUEST_PDPTR3_HIGH = 0x2811, GUEST_BNDCFGS = 0x2812, GUEST_BNDCFGS_HIGH = 0x2813, + GUEST_IA32_RTIT_CTL = 0x2814, + GUEST_IA32_RTIT_CTL_HIGH= 0x2815, HOST_IA32_PAT = 0x2c00, HOST_IA32_PAT_HIGH = 0x2c01, HOST_IA32_EFER = 0x2c02, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 467cab4..ede5abf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -186,6 +187,10 @@ static unsigned int ple_window_max= KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); +/* Default is SYSTEM mode. */ +static int __read_mostly pt_mode = PT_MODE_SYSTEM; +module_param(pt_mode, int, S_IRUGO); + extern const ulong vmx_return; struct kvm_vmx { @@ -1511,6 +1516,20 @@ static bool vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } +static inline bool cpu_has_vmx_intel_pt(void) +{ + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + return !!(vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT); +} + +static inline bool cpu_has_vmx_pt_use_gpa(void) +{ + return !!(vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PT_USE_GPA); +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -4025,6 +4044,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_PT_USE_G