[RFC v1 3/9] KVM: x86: Implement MSR_IA32_PEBS_ENABLE read/write emulation

2019-08-28 Thread Luwei Kang
This patch implements read/write emulation of the MSR_IA32_PEBS_ENABLE
register for the KVM guest. The MSR_IA32_PEBS_ENABLE register can be
accessed only when PEBS is supported in KVM.

The VMM needs to reprogram the counters when the value of this MSR
changes, because some of the underlying perf events will have to be
created or destroyed as a result.
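
For illustration only, a minimal guest-side sketch of a write that this
emulation is meant to accept (the counter index and the wrmsrl() helper
are assumptions, not part of this patch):

	/* Guest sketch: enable PEBS on GP counter 0, output to Intel PT. */
	u64 val = (1ULL << 0)			/* PEBS enable for PMC0 */
		| MSR_IA32_PEBS_OUTPUT_PT;	/* output destination: PT */
	wrmsrl(MSR_IA32_PEBS_ENABLE, val);

The set_msr handler below rejects the write unless all reserved (masked)
bits are clear and the output destination field selects Intel PT.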

Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/kvm_host.h  |  2 ++
 arch/x86/include/asm/msr-index.h |  3 +++
 arch/x86/kvm/vmx/pmu_intel.c | 42 +---
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3463326..df966c9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -471,6 +471,8 @@ struct kvm_pmu {
u64 global_ctrl_mask;
u64 global_ovf_ctrl_mask;
u64 reserved_bits;
+   u64 pebs_enable;
+   u64 pebs_enable_mask;
u8 version;
bool pebs_pt;   /* PEBS output to Intel PT */
struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3dd166a..a9e8720 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -131,6 +131,9 @@
 #define LBR_INFO_ABORT BIT_ULL(61)
#define LBR_INFO_CYCLES		0xffff
 
+#define MSR_IA32_PEBS_PMI_AFTER_REC	(1UL << 60)
+#define MSR_IA32_PEBS_OUTPUT_PT		(1UL << 61)
+#define MSR_IA32_PEBS_OUTPUT_MASK	(3UL << 61)
 #define MSR_IA32_PEBS_ENABLE   0x03f1
 #define MSR_PEBS_DATA_CFG  0x03f2
 #define MSR_IA32_DS_AREA   0x0600
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index e1c987f..fc79cc6 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -66,6 +66,20 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
reprogram_counter(pmu, bit);
 }
 
+static void pebs_enable_changed(struct kvm_pmu *pmu, u64 data)
+{
+   int bit;
+   u64 mask = ((1ull << pmu->nr_arch_gp_counters) - 1) |
+   (((1ull << pmu->nr_arch_fixed_counters) - 1) <<
+   INTEL_PMC_IDX_FIXED);
+   u64 diff = (pmu->pebs_enable ^ data) & mask;
+
+   pmu->pebs_enable = data;
+
+   for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+   reprogram_counter(pmu, bit);
+}
+
 static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
  u8 event_select,
  u8 unit_mask)
@@ -155,6 +169,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
ret = pmu->version > 1;
break;
+   case MSR_IA32_PEBS_ENABLE:
+   ret = pmu->pebs_pt;
+   break;
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
@@ -183,6 +200,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
*data = pmu->global_ovf_ctrl;
return 0;
+   case MSR_IA32_PEBS_ENABLE:
+   *data = pmu->pebs_enable;
+   return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
u64 val = pmc_read_counter(pmc);
@@ -240,6 +260,16 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
break;
+   case MSR_IA32_PEBS_ENABLE:
+   if (pmu->pebs_enable == data)
+   return 0;
+   if (!(data & pmu->pebs_enable_mask) &&
+	    (data & MSR_IA32_PEBS_OUTPUT_MASK) ==
+   MSR_IA32_PEBS_OUTPUT_PT) {
+   pebs_enable_changed(pmu, data);
+   return 0;
+   }
+   break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
if (msr_info->host_initiated)
@@ -270,6 +300,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
struct kvm_cpuid_entry2 *entry;
union cpuid10_eax eax;
union cpuid10_edx edx;
+   u64 cnts_mask;
 
pmu->nr_arch_gp_counters = 0;
pmu->nr_arch_fixed_counters = 0;
@@ -304,9 +335,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
((u64)1 << edx.split.bit_width_fixed) - 1;
}
 
-   pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
+   cnts_mask = ((1ull << pmu->nr_arch_gp_counters) - 1) |
   

[RFC v1 4/9] KVM: x86: Implement counter reload MSRs read/write emulation

2019-08-28 Thread Luwei Kang
This patch implements read/write emulation of the counter reload
registers MSR_RELOAD_PMCx/MSR_RELOAD_FIXED_CTRx. These registers can
be accessed only when PEBS is supported in KVM.

The VMM needs to reprogram the counters so that the host PMU framework
loads the new value into the real hardware after the configuration has
changed.
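
As an aside, the reload value follows the usual PEBS auto-reload
convention: to get a record every N events, the guest writes -N
truncated to the counter width. A guest-side sketch (the 48-bit counter
width and the wrmsrl() helper are assumptions for illustration):

	/* Guest sketch: one PEBS record every 100000 events on PMC0. */
	u64 period = 100000;
	wrmsrl(MSR_IA32_RELOAD_PMC0, (-period) & ((1ULL << 48) - 1));

After each PEBS record is written out, the hardware reloads the counter
from this MSR.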

Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/kvm_host.h  |  1 +
 arch/x86/include/asm/msr-index.h |  3 +++
 arch/x86/kvm/vmx/pmu_intel.c | 22 +-
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index df966c9..9b930b5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -454,6 +454,7 @@ struct kvm_pmc {
enum pmc_type type;
u8 idx;
u64 counter;
+   u64 reload_cnt;
u64 eventsel;
struct perf_event *perf_event;
struct kvm_vcpu *vcpu;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index a9e8720..6321acb 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -141,6 +141,9 @@
 #define MSR_IA32_PERF_CAPABILITIES 0x0345
 #define MSR_PEBS_LD_LAT_THRESHOLD  0x03f6
 
+#define MSR_IA32_RELOAD_PMC0   0x14c1
+#define MSR_IA32_RELOAD_FIXED_CTR0 0x1309
+
 #define MSR_IA32_RTIT_CTL  0x0570
 #define RTIT_CTL_TRACEEN   BIT(0)
 #define RTIT_CTL_CYCLEACC  BIT(1)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index fc79cc6..ebd3efc 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -175,7 +175,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
-   get_fixed_pmc(pmu, msr, MSR_CORE_PERF_FIXED_CTR0);
+   get_fixed_pmc(pmu, msr, MSR_CORE_PERF_FIXED_CTR0) ||
+   get_gp_pmc(pmu, msr, MSR_IA32_RELOAD_PMC0) ||
+   get_fixed_pmc(pmu, msr, MSR_IA32_RELOAD_FIXED_CTR0);
break;
}
 
@@ -216,6 +218,11 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
*data = pmc->eventsel;
return 0;
+   } else if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_RELOAD_PMC0)) ||
+  (pmc = get_fixed_pmc(pmu, msr,
+   MSR_IA32_RELOAD_FIXED_CTR0))) {
+   *data = pmc->reload_cnt;
+   return 0;
}
}
 
@@ -288,6 +295,19 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
reprogram_gp_counter(pmc, data);
return 0;
}
+   } else if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_RELOAD_PMC0)) ||
+  (pmc = get_fixed_pmc(pmu, msr,
+   MSR_IA32_RELOAD_FIXED_CTR0))) {
+   if (data == pmc->reload_cnt)
+   return 0;
+   if (!(data & ~pmc_bitmask(pmc))) {
+   int pmc_idx = pmc_is_fixed(pmc) ?
+   pmc->idx + INTEL_PMC_IDX_FIXED :
+   pmc->idx;
+   pmc->reload_cnt = data;
+   reprogram_counter(pmu, pmc_idx);
+   return 0;
+   }
}
}
 
-- 
1.8.3.1



[RFC v1 9/9] KVM: x86: Expose PEBS feature to guest

2019-08-28 Thread Luwei Kang
Expose the PEBS feature to the guest via IA32_MISC_ENABLE[bit 12].
IA32_MISC_ENABLE[bit 12] is the Processor Event Based Sampling (PEBS)
Unavailable (RO) flag:
1 = PEBS is not supported; 0 = PEBS is supported.
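
A guest can therefore probe for PEBS with a read-and-test sketch like
the following (illustrative only; rdmsrl() and the message are
assumptions):

	u64 misc;

	rdmsrl(MSR_IA32_MISC_ENABLE, misc);
	if (!(misc & MSR_IA32_MISC_ENABLE_PEBS))
		pr_info("PEBS is supported\n");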

Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm.c  |  6 ++
 arch/x86/kvm/vmx/vmx.c  |  1 +
 arch/x86/kvm/x86.c  | 22 +-
 4 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 94af338..f6a5630 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1130,6 +1130,7 @@ struct kvm_x86_ops {
bool (*xsaves_supported)(void);
bool (*umip_emulated)(void);
bool (*pt_supported)(void);
+   bool (*pebs_supported)(void);
bool (*pdcm_supported)(void);
 
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8ae6716..2b271fc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -6005,6 +6005,11 @@ static bool svm_pt_supported(void)
return false;
 }
 
+static bool svm_pebs_supported(void)
+{
+   return false;
+}
+
 static bool svm_pdcm_supported(void)
 {
return false;
@@ -7298,6 +7303,7 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
.pt_supported = svm_pt_supported,
+   .pebs_supported = svm_pebs_supported,
.pdcm_supported = svm_pdcm_supported,
 
.set_supported_cpuid = svm_set_supported_cpuid,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 71e3d42..d85f19b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7782,6 +7782,7 @@ static __exit void hardware_unsetup(void)
.xsaves_supported = vmx_xsaves_supported,
.umip_emulated = vmx_umip_emulated,
.pt_supported = vmx_pt_supported,
+   .pebs_supported = vmx_pebs_supported,
.pdcm_supported = vmx_pdcm_supported,
 
.request_immediate_exit = vmx_request_immediate_exit,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 290c3c3..8ad501d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2483,6 +2483,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
bool pr = false;
+   bool update_cpuid = false;
u32 msr = msr_info->index;
u64 data = msr_info->data;
 
@@ -2563,11 +2564,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
	    ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
return 1;
-   vcpu->arch.ia32_misc_enable_msr = data;
-   kvm_update_cpuid(vcpu);
-   } else {
-   vcpu->arch.ia32_misc_enable_msr = data;
+   update_cpuid = true;
}
+
+   if (kvm_x86_ops->pebs_supported())
+   data &= ~MSR_IA32_MISC_ENABLE_PEBS;
+   else
+   data |= MSR_IA32_MISC_ENABLE_PEBS;
+
+   vcpu->arch.ia32_misc_enable_msr = data;
+   if (update_cpuid)
+   kvm_update_cpuid(vcpu);
break;
case MSR_IA32_SMBASE:
if (!msr_info->host_initiated)
@@ -2875,7 +2882,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
break;
case MSR_IA32_MISC_ENABLE:
-   msr_info->data = vcpu->arch.ia32_misc_enable_msr;
+   if (kvm_x86_ops->pebs_supported())
+   msr_info->data = (vcpu->arch.ia32_misc_enable_msr &
+   ~MSR_IA32_MISC_ENABLE_PEBS);
+   else
+   msr_info->data = (vcpu->arch.ia32_misc_enable_msr |
+   MSR_IA32_MISC_ENABLE_PEBS);
break;
case MSR_IA32_SMBASE:
if (!msr_info->host_initiated)
-- 
1.8.3.1



[RFC v1 5/9] KVM: x86: Allocate performance counter for PEBS event

2019-08-28 Thread Luwei Kang
This patch adds a new parameter "pebs" to make the host PMU framework
allocate a performance counter for the guest PEBS event.
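
For a PEBS event, the sample period is derived from the reload value
rather than from the live counter value. A worked example (the numbers
are illustrative): with a 48-bit counter and reload_cnt = 0xfffffffe7960
(i.e. -100000 truncated to 48 bits), the event attribute becomes

	attr.sample_period = (-pmc->reload_cnt) & pmc_bitmask(pmc);
	/* = 100000: one PEBS record every 100000 events */

so host perf arms the hardware with the same period the guest asked for.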

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/pmu.c   | 23 +++
 arch/x86/kvm/pmu.h   |  5 +++--
 arch/x86/kvm/pmu_amd.c   |  2 +-
 arch/x86/kvm/vmx/pmu_intel.c |  7 +--
 4 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 46875bb..6bdc282 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -99,7 +99,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
 static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
  unsigned config, bool exclude_user,
  bool exclude_kernel, bool intr,
- bool in_tx, bool in_tx_cp)
+ bool in_tx, bool in_tx_cp, bool pebs)
 {
struct perf_event *event;
struct perf_event_attr attr = {
@@ -111,9 +111,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
.exclude_user = exclude_user,
.exclude_kernel = exclude_kernel,
.config = config,
+   .precise_ip = pebs ? 1 : 0,
+   .aux_output = pebs ? 1 : 0,
};
 
-   attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
+   attr.sample_period = pebs ? (-pmc->reload_cnt) & pmc_bitmask(pmc) :
+   (-pmc->counter) & pmc_bitmask(pmc);
 
if (in_tx)
attr.config |= HSW_IN_TX;
@@ -140,7 +143,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
 }
 
-void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel, bool pebs)
 {
unsigned config, type = PERF_TYPE_RAW;
u8 event_select, unit_mask;
@@ -198,11 +201,12 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
  !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
  eventsel & ARCH_PERFMON_EVENTSEL_INT,
  (eventsel & HSW_IN_TX),
- (eventsel & HSW_IN_TX_CHECKPOINTED));
+ (eventsel & HSW_IN_TX_CHECKPOINTED),
+ pebs);
 }
 EXPORT_SYMBOL_GPL(reprogram_gp_counter);
 
-void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx, bool pebs)
 {
unsigned en_field = ctrl & 0x3;
bool pmi = ctrl & 0x8;
@@ -228,7 +232,8 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
  kvm_x86_ops->pmu_ops->find_fixed_event(idx),
  !(en_field & 0x2), /* exclude user */
  !(en_field & 0x1), /* exclude kernel */
- pmi, false, false);
+ pmi, false, false,
+ pebs);
 }
 EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
 
@@ -240,12 +245,14 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
return;
 
if (pmc_is_gp(pmc))
-   reprogram_gp_counter(pmc, pmc->eventsel);
+   reprogram_gp_counter(pmc, pmc->eventsel,
+   (pmu->pebs_enable & (1ul << pmc_idx)));
else {
int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
 
-   reprogram_fixed_counter(pmc, ctrl, idx);
+   reprogram_fixed_counter(pmc, ctrl, idx,
+   (pmu->pebs_enable & (1ul << pmc_idx)));
}
 }
 EXPORT_SYMBOL_GPL(reprogram_counter);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index c62a1ff..0c59a15 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -102,8 +102,9 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr,
return NULL;
 }
 
-void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
-void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel, bool pebs);
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx,
+   bool pebs);
 void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
 
 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index c838838..7b3e307 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -248,7 +248,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)

[RFC v1 8/9] KVM: X86: MSR_IA32_PERF_CAPABILITIES MSR emulation

2019-08-28 Thread Luwei Kang
Expose to the KVM guest some capability bits related to enabling PEBS,
especially the PEBS via PT feature.
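
For reference, the IA32_PERF_CAPABILITIES fields passed through to the
guest by this patch are summarized below (layout per the defines added
in this patch):

	/* IA32_PERF_CAPABILITIES bits exposed to the guest:
	 *   bit  6      PEBS_TRAP
	 *   bit  7      PEBS_ARCH_REG
	 *   bits 11:8   PEBS_REC_FMT
	 *   bit  16     PEBS_OUTPUT_PT_AVAIL
	 * All other bits read as zero in the guest.
	 */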

Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/kvm_host.h  |  1 +
 arch/x86/include/asm/msr-index.h |  3 +++
 arch/x86/kvm/vmx/vmx.c   | 14 ++
 3 files changed, 18 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2d9b0f9..94af338 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -576,6 +576,7 @@ struct kvm_vcpu_arch {
u64 ia32_xss;
u64 microcode_version;
u64 arch_capabilities;
+   u64 ia32_perf_capabilities;
 
/*
 * Paging state of the vcpu
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6321acb..4932dec 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -137,6 +137,9 @@
 #define MSR_IA32_PEBS_ENABLE   0x03f1
 #define MSR_PEBS_DATA_CFG  0x03f2
 #define MSR_IA32_DS_AREA   0x0600
+#define MSR_IA32_PERF_CAP_PEBS_TRAP	(1UL << 6)
+#define MSR_IA32_PERF_CAP_PEBS_ARCH_REG	(1UL << 7)
+#define MSR_IA32_PERF_CAP_PEBS_REC_FMT	(0xfUL << 8)
 #define MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT   (1UL << 16)
 #define MSR_IA32_PERF_CAPABILITIES 0x0345
 #define MSR_PEBS_LD_LAT_THRESHOLD  0x03f6
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index dbff8f0..71e3d42 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1737,6 +1737,16 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
msr_info->data = vcpu->arch.ia32_xss;
break;
+   case MSR_IA32_PERF_CAPABILITIES:
+   if (!vmx_pdcm_supported() || !vmx_pebs_supported())
+   return 1;
+   rdmsrl(MSR_IA32_PERF_CAPABILITIES, msr_info->data);
+   msr_info->data = msr_info->data &
+   (MSR_IA32_PERF_CAP_PEBS_TRAP |
+MSR_IA32_PERF_CAP_PEBS_ARCH_REG |
+MSR_IA32_PERF_CAP_PEBS_REC_FMT |
+MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT);
+   break;
case MSR_IA32_RTIT_CTL:
if (pt_mode != PT_MODE_HOST_GUEST)
return 1;
@@ -1981,6 +1991,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
+   case MSR_IA32_PERF_CAPABILITIES:
+   if (!vmx_pdcm_supported() || !vmx_pebs_supported())
+   return 1;
+   break;
case MSR_IA32_RTIT_CTL:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
vmx_rtit_ctl_check(vcpu, data) ||
-- 
1.8.3.1



[RFC v1 6/9] KVM: x86: Add shadow value of PEBS status

2019-08-28 Thread Luwei Kang
The performance counter seen from the guest's perspective may differ
from the counter allocated on the real hardware (e.g. the guest driver
gets counter 0 for PEBS, but the host PMU driver may allocate a
different counter for this event).

Introduce a new field that maps the PEBS enable status from the guest
counters to the real hardware counters. Update this PEBS shadow value
before VM-entry when PT is enabled in the guest.
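
A worked example of the remapping (the indices are hypothetical): the
guest sets bit 0 of IA32_PEBS_ENABLE, but host perf backs that guest
counter with hardware counter 2, i.e.
gp_counters[0].perf_event->hw.idx == 2. The value actually loaded into
the hardware MSR on VM-entry is then:

	pebs_enable_shadow = MSR_IA32_PEBS_OUTPUT_PT | (1ULL << 2);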

Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/pmu.c  | 34 ++
 arch/x86/kvm/pmu.h  |  1 +
 arch/x86/kvm/vmx/vmx.c  |  8 +++-
 4 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9b930b5..07d3b21 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -473,6 +473,7 @@ struct kvm_pmu {
u64 global_ovf_ctrl_mask;
u64 reserved_bits;
u64 pebs_enable;
+   u64 pebs_enable_shadow;
u64 pebs_enable_mask;
u8 version;
bool pebs_pt;   /* PEBS output to Intel PT */
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 6bdc282..89d3e4c 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -257,6 +257,40 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
 }
 EXPORT_SYMBOL_GPL(reprogram_counter);
 
+void kvm_pmu_pebs_shadow(struct kvm_vcpu *vcpu)
+{
+   struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+   struct perf_event *event;
+   int i;
+
+   if (!pmu->pebs_pt)
+   return;
+
+   pmu->pebs_enable_shadow = MSR_IA32_PEBS_OUTPUT_PT;
+
+   for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+   if (!test_bit(i, (unsigned long *)&pmu->pebs_enable))
+   continue;
+
+   event = pmu->gp_counters[i].perf_event;
+   if (event && (event->hw.idx != -1))
+   set_bit(event->hw.idx,
+   (unsigned long *)&pmu->pebs_enable_shadow);
+   }
+
+   for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+   if (!test_bit(i + INTEL_PMC_IDX_FIXED,
+   (unsigned long *)&pmu->pebs_enable))
+   continue;
+
+   event = pmu->fixed_counters[i].perf_event;
+   if (event && (event->hw.idx != -1))
+   set_bit(event->hw.idx,
+   (unsigned long *)&pmu->pebs_enable_shadow);
+   }
+}
+EXPORT_SYMBOL_GPL(kvm_pmu_pebs_shadow);
+
 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
 {
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 0c59a15..81c35c9 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -119,6 +119,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx,
 void kvm_pmu_init(struct kvm_vcpu *vcpu);
 void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
+void kvm_pmu_pebs_shadow(struct kvm_vcpu *vcpu);
 
 bool is_vmware_backdoor_pmc(u32 pmc_idx);
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c030c96..4090c08 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1019,6 +1019,7 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
wrmsrl(MSR_IA32_RTIT_CTL, 0);
pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+   kvm_pmu_pebs_shadow(&vmx->vcpu);
}
 }
 
@@ -6365,12 +6366,17 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
if (!msrs)
return;
 
-   for (i = 0; i < nr_msrs; i++)
+   for (i = 0; i < nr_msrs; i++) {
+   if (msrs[i].msr == MSR_IA32_PEBS_ENABLE)
+   msrs[i].guest =
+   vcpu_to_pmu(&vmx->vcpu)->pebs_enable_shadow;
+
if (msrs[i].host == msrs[i].guest)
clear_atomic_switch_msr(vmx, msrs[i].msr);
else
add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
msrs[i].host, false);
+   }
 }
 
 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
-- 
1.8.3.1



[RFC v1 2/9] KVM: x86: PEBS via Intel PT HW feature detection

2019-08-28 Thread Luwei Kang
PEBS can be enabled in a KVM guest by directing the PEBS records into
the Intel Processor Trace output buffer. This patch adds a new flag to
detect whether PEBS can be supported in a KVM guest. It not only
requires hardware support for PEBS output to Intel PT
(IA32_PERF_CAPABILITIES.PEBS_OUTPUT_PT_AVAIL[16]=1) but also depends on:
1. the PEBS feature being supported by hardware (IA32_MISC_ENABLE[bit 12]=0);
2. Intel PT working in HOST_GUEST mode.

Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/kvm_host.h  |  1 +
 arch/x86/include/asm/msr-index.h |  3 +++
 arch/x86/kvm/vmx/capabilities.h  | 11 +++
 arch/x86/kvm/vmx/pmu_intel.c |  7 ++-
 4 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 74e88e5..3463326 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -472,6 +472,7 @@ struct kvm_pmu {
u64 global_ovf_ctrl_mask;
u64 reserved_bits;
u8 version;
+   bool pebs_pt;   /* PEBS output to Intel PT */
struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
struct irq_work irq_work;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 271d837..3dd166a 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -134,6 +134,7 @@
 #define MSR_IA32_PEBS_ENABLE   0x03f1
 #define MSR_PEBS_DATA_CFG  0x03f2
 #define MSR_IA32_DS_AREA   0x0600
+#define MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT   (1UL << 16)
 #define MSR_IA32_PERF_CAPABILITIES 0x0345
 #define MSR_PEBS_LD_LAT_THRESHOLD  0x03f6
 
@@ -660,6 +661,8 @@
 #define MSR_IA32_MISC_ENABLE_FERR			(1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT)
 #define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT	10
 #define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX		(1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT)
+#define MSR_IA32_MISC_ENABLE_PEBS_BIT			12
+#define MSR_IA32_MISC_ENABLE_PEBS			(1ULL << MSR_IA32_MISC_ENABLE_PEBS_BIT)
 #define MSR_IA32_MISC_ENABLE_TM2_BIT			13
 #define MSR_IA32_MISC_ENABLE_TM2			(1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT)
 #define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT	19
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index d6664ee..4bcb6b4 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -342,4 +342,15 @@ static inline bool cpu_has_vmx_intel_pt(void)
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
 }
 
+static inline bool cpu_has_vmx_pebs_output_pt(void)
+{
+   u64 misc, perf_cap;
+
+   rdmsrl(MSR_IA32_MISC_ENABLE, misc);
+   rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
+
+   return (!(misc & MSR_IA32_MISC_ENABLE_PEBS) &&
+   (perf_cap & MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT));
+}
+
 #endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 01441be..e1c987f 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -12,6 +12,7 @@
 #include <linux/kvm_host.h>
 #include <linux/perf_event.h>
 #include <asm/perf_event.h>
+#include "capabilities.h"
 #include "x86.h"
 #include "cpuid.h"
 #include "lapic.h"
@@ -309,10 +310,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
-   if (kvm_x86_ops->pt_supported())
+   if (kvm_x86_ops->pt_supported()) {
pmu->global_ovf_ctrl_mask &=
~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
 
+   if (cpu_has_vmx_pebs_output_pt())
+   pmu->pebs_pt = true;
+   }
+
entry = kvm_find_cpuid_entry(vcpu, 7, 0);
if (entry &&
(boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
-- 
1.8.3.1



[RFC v1 7/9] KVM: X86: Expose PDCM cpuid to guest

2019-08-28 Thread Luwei Kang
PDCM (Perfmon and Debug Capability) indicates that the processor
supports the performance and debug feature indication MSR
IA32_PERF_CAPABILITIES.

Enabling PEBS in a KVM guest depends on PEBS via PT, and PEBS via PT
is detected via IA32_PERF_CAPABILITIES[bit 16].
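
Putting the two together, a guest would probe in this order (a sketch;
the X86_FEATURE_PDCM test stands in for CPUID.1:ECX[15]):

	if (boot_cpu_has(X86_FEATURE_PDCM)) {
		u64 cap;

		rdmsrl(MSR_IA32_PERF_CAPABILITIES, cap);
		if (cap & MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT)
			/* PEBS via Intel PT is available. */;
	}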

Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/cpuid.c|  3 ++-
 arch/x86/kvm/svm.c  |  6 ++
 arch/x86/kvm/vmx/capabilities.h | 10 ++
 arch/x86/kvm/vmx/vmx.c  |  1 +
 5 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 07d3b21..2d9b0f9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1129,6 +1129,7 @@ struct kvm_x86_ops {
bool (*xsaves_supported)(void);
bool (*umip_emulated)(void);
bool (*pt_supported)(void);
+   bool (*pdcm_supported)(void);
 
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 22c2720..d12e7af 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -430,6 +430,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
+   unsigned f_pdcm = kvm_x86_ops->pdcm_supported() ? F(PDCM) : 0;
 
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
@@ -458,7 +459,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
-   F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+   F(FMA) | F(CX16) | 0 /* xTPR Update */ | f_pdcm |
F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e036807..8ae6716 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -6005,6 +6005,11 @@ static bool svm_pt_supported(void)
return false;
 }
 
+static bool svm_pdcm_supported(void)
+{
+   return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
return true;
@@ -7293,6 +7298,7 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
.pt_supported = svm_pt_supported,
+   .pdcm_supported = svm_pdcm_supported,
 
.set_supported_cpuid = svm_set_supported_cpuid,
 
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 4bcb6b4..82ca51d 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -353,4 +353,14 @@ static inline bool cpu_has_vmx_pebs_output_pt(void)
(perf_cap & MSR_IA32_PERF_CAP_PEBS_OUTPUT_PT));
 }
 
+static inline bool vmx_pebs_supported(void)
+{
+   return (cpu_has_vmx_pebs_output_pt() && pt_mode == PT_MODE_HOST_GUEST);
+}
+
+static inline bool vmx_pdcm_supported(void)
+{
+   return boot_cpu_has(X86_FEATURE_PDCM) && vmx_pebs_supported();
+}
+
 #endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4090c08..dbff8f0 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7768,6 +7768,7 @@ static __exit void hardware_unsetup(void)
.xsaves_supported = vmx_xsaves_supported,
.umip_emulated = vmx_umip_emulated,
.pt_supported = vmx_pt_supported,
+   .pdcm_supported = vmx_pdcm_supported,
 
.request_immediate_exit = vmx_request_immediate_exit,
 
-- 
1.8.3.1



[RFC v1 1/9] KVM: x86: Add base address parameter for get_fixed_pmc function

2019-08-28 Thread Luwei Kang
PEBS output to Intel PT introduces some new MSRs (MSR_RELOAD_FIXED_CTRx)
for the fixed-function counters, which are used to auto-load the preset
value after a PEBS record is written out.

Introduce a base MSR address parameter so that this function can also
look up the performance monitor counter structure by the
MSR_RELOAD_FIXED_CTRx registers.
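
With the new parameter, a later patch in this series can reuse the same
helper for the reload MSRs, e.g.:

	/* Classic fixed-counter lookup: */
	pmc = get_fixed_pmc(pmu, msr, MSR_CORE_PERF_FIXED_CTR0);
	/* Reload-MSR lookup (MSR_IA32_RELOAD_FIXED_CTR0 is added in 4/9): */
	pmc = get_fixed_pmc(pmu, msr, MSR_IA32_RELOAD_FIXED_CTR0);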

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/pmu.h   |  5 ++---
 arch/x86/kvm/vmx/pmu_intel.c | 14 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 58265f7..c62a1ff 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -93,10 +93,9 @@ static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
 }
 
 /* returns fixed PMC with the specified MSR */
-static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr,
+   int base)
 {
-   int base = MSR_CORE_PERF_FIXED_CTR0;
-
if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
return &pmu->fixed_counters[msr - base];
 
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 4dea0e0..01441be 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -41,7 +41,8 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i);
struct kvm_pmc *pmc;
 
-   pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+   pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i,
+   MSR_CORE_PERF_FIXED_CTR0);
 
if (old_ctrl == new_ctrl)
continue;
@@ -106,7 +107,8 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
else {
u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
 
-   return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
+   return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0,
+   MSR_CORE_PERF_FIXED_CTR0);
}
 }
 
@@ -155,7 +157,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
-   get_fixed_pmc(pmu, msr);
+   get_fixed_pmc(pmu, msr, MSR_CORE_PERF_FIXED_CTR0);
break;
}
 
@@ -185,7 +187,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
u64 val = pmc_read_counter(pmc);
*data = val & pmu->counter_bitmask[KVM_PMC_GP];
return 0;
-   } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+   } else if ((pmc = get_fixed_pmc(pmu, msr,
+   MSR_CORE_PERF_FIXED_CTR0))) {
u64 val = pmc_read_counter(pmc);
*data = val & pmu->counter_bitmask[KVM_PMC_FIXED];
return 0;
@@ -243,7 +246,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
pmc->counter = (s32)data;
return 0;
-   } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+   } else if ((pmc = get_fixed_pmc(pmu, msr,
+   MSR_CORE_PERF_FIXED_CTR0))) {
pmc->counter = data;
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
-- 
1.8.3.1



[RFC v1 0/9] PEBS enabling in KVM guest

2019-08-28 Thread Luwei Kang
New Intel hardware introduces some Precise Event-Based Sampling (PEBS)
extensions that output the PEBS record to the Intel PT stream instead
of the DS area. The PEBS record is packaged in a specific format when
output to Intel PT.

This patch set enables PEBS functionality in the KVM guest by having
PEBS output to Intel PT. The native driver is at [1] (still under
review).

[1] https://www.spinics.net/lists/kernel/msg3215354.html

Luwei Kang (9):
  KVM: x86: Add base address parameter for get_fixed_pmc function
  KVM: x86: PEBS via Intel PT HW feature detection
  KVM: x86: Implement MSR_IA32_PEBS_ENABLE read/write emulation
  KVM: x86: Implement counter reload MSRs read/write emulation
  KVM: x86: Allocate performance counter for PEBS event
  KVM: x86: Add shadow value of PEBS status
  KVM: X86: Expose PDCM cpuid to guest
  KVM: X86: MSR_IA32_PERF_CAPABILITIES MSR emulation
  KVM: x86: Expose PEBS feature to guest

 arch/x86/include/asm/kvm_host.h  |  8 
 arch/x86/include/asm/msr-index.h | 12 ++
 arch/x86/kvm/cpuid.c |  3 +-
 arch/x86/kvm/pmu.c   | 57 ++
 arch/x86/kvm/pmu.h   | 11 ++---
 arch/x86/kvm/pmu_amd.c   |  2 +-
 arch/x86/kvm/svm.c   | 12 ++
 arch/x86/kvm/vmx/capabilities.h  | 21 ++
 arch/x86/kvm/vmx/pmu_intel.c | 88 +++-
 arch/x86/kvm/vmx/vmx.c   | 24 ++-
 arch/x86/kvm/x86.c   | 22 +++---
 11 files changed, 229 insertions(+), 31 deletions(-)

-- 
1.8.3.1



[PATCH] KVM: LAPIC: Do not mask the local interrupts when LAPIC is sw disabled

2019-05-21 Thread Luwei Kang
The current code masks all the local interrupts in the local
vector table when the LAPIC is disabled by the SVR (Spurious-Interrupt
Vector Register) "APIC Software Enable/Disable" flag (bit 8).
This may block a local interrupt from being delivered to the target
vCPU even if the LAPIC is enabled by setting SVR (bit8 == 1) later.

For example, resetting a vCPU masks all the local interrupts and
sets the SVR to the default value FFH (the LAPIC is disabled because
SVR[bit8] == 0). The guest may try to enable some local interrupts
(e.g. LVTPC) by clearing bit 16 of an LVT entry before enabling the
LAPIC. But bit 16 can't be cleared while the LAPIC is "software
disabled", so this local interrupt stays disabled even after the
LAPIC is "software enabled".

This patch stops masking the local interrupts when the LAPIC is
"software disabled" and adds a LAPIC "software enabled" check
before delivering a local interrupt.
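
A sketch of the guest sequence that previously lost the unmask (the
apic_write() helper and PMI_VECTOR are illustrative):

	/* LAPIC still software-disabled: SVR[8] == 0 */
	apic_write(APIC_LVTPC, PMI_VECTOR);	/* bit 16 (mask) clear */
	/* Old behavior: the write above was forced to APIC_LVT_MASKED. */
	apic_write(APIC_SPIV, APIC_SPIV_APIC_ENABLED | 0xff);
	/* LVTPC stayed masked even though the guest never masked it. */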

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/lapic.c | 19 ++-
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fcf42a3..a199f47 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1892,15 +1892,6 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
mask |= APIC_SPIV_DIRECTED_EOI;
apic_set_spiv(apic, val & mask);
if (!(val & APIC_SPIV_APIC_ENABLED)) {
-   int i;
-   u32 lvt_val;
-
-   for (i = 0; i < KVM_APIC_LVT_NUM; i++) {
-   lvt_val = kvm_lapic_get_reg(apic,
-  APIC_LVTT + 0x10 * i);
-   kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i,
-lvt_val | APIC_LVT_MASKED);
-   }
apic_update_lvtt(apic);
atomic_set(&apic->lapic_timer.pending, 0);
 
@@ -1926,18 +1917,12 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVTPC:
case APIC_LVT1:
case APIC_LVTERR:
-   /* TODO: Check vector */
-   if (!kvm_apic_sw_enabled(apic))
-   val |= APIC_LVT_MASKED;
-
val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
kvm_lapic_set_reg(apic, reg, val);
 
break;
 
case APIC_LVTT:
-   if (!kvm_apic_sw_enabled(apic))
-   val |= APIC_LVT_MASKED;
val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
kvm_lapic_set_reg(apic, APIC_LVTT, val);
apic_update_lvtt(apic);
@@ -2260,7 +2245,7 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
u32 reg = kvm_lapic_get_reg(apic, lvt_type);
int vector, mode, trig_mode;
 
-   if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+   if (apic_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
vector = reg & APIC_VECTOR_MASK;
mode = reg & APIC_MODE_MASK;
trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
@@ -2363,7 +2348,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
 
-   if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+   if (!apic_enabled(vcpu->arch.apic))
r = 1;
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
-- 
1.8.3.1



[PATCH v1 5/6] KVM: VMX: Intel PT configuration context switch using XSAVES/XRSTORS

2019-05-16 Thread Luwei Kang
This patch adds support for using XSAVES/XRSTORS to do the
Intel Processor Trace context switch.

Because the native driver does not set XSS[bit 8] to enable
the PT state in the XSAVE area, this patch only sets this bit
before XSAVES/XRSTORS instruction execution and restores the
original value afterwards.

The "initialized" flag needs to be cleared when PT changes
from enabled to disabled. The guest may modify the PT MSRs while
PT is disabled, and then they are only saved in variables.
We need to reload these values into the hardware manually when PT
is enabled again.

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx/vmx.c | 80 --
 1 file changed, 65 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4691665..d323e6b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1002,33 +1002,83 @@ static inline void pt_save_msr(struct pt_state *ctx, u32 addr_range)
 
 static void pt_guest_enter(struct vcpu_vmx *vmx)
 {
+   struct pt_desc *desc;
+   int err;
+
if (pt_mode == PT_MODE_SYSTEM)
return;
 
-   /*
-* GUEST_IA32_RTIT_CTL is already set in the VMCS.
-* Save host state before VM entry.
-*/
-   rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc->host_ctx->rtit_ctl);
-   if (vmx->pt_desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) {
-   wrmsrl(MSR_IA32_RTIT_CTL, 0);
-   pt_save_msr(vmx->pt_desc->host_ctx, vmx->pt_desc->addr_range);
-   pt_load_msr(vmx->pt_desc->guest_ctx, vmx->pt_desc->addr_range);
+   desc = vmx->pt_desc;
+
+   rdmsrl(MSR_IA32_RTIT_CTL, desc->host_ctx->rtit_ctl);
+
+   if (desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) {
+   if (likely(desc->pt_xsave)) {
+   wrmsrl(MSR_IA32_XSS, host_xss | XFEATURE_MASK_PT);
+   /*
+* XSAVES instruction clears the TraceEn after
+* saving the value of RTIT_CTL and before saving any
+* other PT state.
+*/
+   XSTATE_XSAVE(&desc->host_xs->state.xsave,
+   XFEATURE_MASK_PT, 0, err);
+   /*
+* Still need to load the guest PT state manually if
+* the PT state is not populated in the xsave area.
+*/
+   if (desc->guest_xs->initialized)
+   XSTATE_XRESTORE(&desc->guest_xs->state.xsave,
+   XFEATURE_MASK_PT, 0);
+   else
+   pt_load_msr(desc->guest_ctx, desc->addr_range);
+
+   wrmsrl(MSR_IA32_XSS, host_xss);
+   } else {
+   if (desc->host_ctx->rtit_ctl & RTIT_CTL_TRACEEN)
+   wrmsrl(MSR_IA32_RTIT_CTL, 0);
+
+   pt_save_msr(desc->host_ctx, desc->addr_range);
+   pt_load_msr(desc->guest_ctx, desc->addr_range);
+   }
}
 }
 
 static void pt_guest_exit(struct vcpu_vmx *vmx)
 {
+   struct pt_desc *desc;
+   int err;
+
if (pt_mode == PT_MODE_SYSTEM)
return;
 
-   if (vmx->pt_desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) {
-   pt_save_msr(vmx->pt_desc->guest_ctx, vmx->pt_desc->addr_range);
-   pt_load_msr(vmx->pt_desc->host_ctx, vmx->pt_desc->addr_range);
-   }
+   desc = vmx->pt_desc;
+
+   if (desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) {
+   if (likely(desc->pt_xsave)) {
+   wrmsrl(MSR_IA32_XSS, host_xss | XFEATURE_MASK_PT);
+   /*
+* Save guest state. TraceEn is 0 before and after
+* XSAVES instruction because RTIT_CTL will be cleared
+* on VM-exit (VM Exit control bit25).
+*/
+   XSTATE_XSAVE(&desc->guest_xs->state.xsave,
+   XFEATURE_MASK_PT, 0, err);
+   desc->guest_xs->initialized = 1;
+   /*
+* Restore host PT state; PT may be enabled after this
+* instruction if host PT was enabled before VM-entry.
+*/
+   XSTATE_XRESTORE(&desc->host_xs->state.xsave,
+   XFEATURE_MASK_PT, 0);
+   wrmsrl(MSR_IA32_XSS, host_xss);
+   } else {
+   pt_save_msr(desc->guest_ctx, desc->addr_range);
+   pt_load_msr(desc->host_ctx, desc->addr_ra

[PATCH v1 2/6] KVM: VMX: Reuse the pt_state structure for PT context

2019-05-16 Thread Luwei Kang
Remove the previous pt_ctx structure and use pt_state
to save the PT configuration, because they save
the same things.
Add a *_ctx postfix to differentiate from the upcoming
host and guest fpu pointers for the PT state.

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx/nested.c |  2 +-
 arch/x86/kvm/vmx/vmx.c| 96 +--
 arch/x86/kvm/vmx/vmx.h| 16 +---
 3 files changed, 46 insertions(+), 68 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f4b1ae4..e8d5c61 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4201,7 +4201,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
vmx->nested.vmxon = true;
 
if (pt_mode == PT_MODE_HOST_GUEST) {
-   vmx->pt_desc.guest.ctl = 0;
+   vmx->pt_desc.guest_ctx.rtit_ctl = 0;
pt_update_intercept_for_msr(vmx);
}
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0db7ded..4234e40e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -976,32 +976,28 @@ static unsigned long segment_base(u16 selector)
 }
 #endif
 
-static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
+static inline void pt_load_msr(struct pt_state *ctx, u32 addr_range)
 {
u32 i;
 
-   wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
-   wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
-   wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
-   wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
-   for (i = 0; i < addr_range; i++) {
-   wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
-   wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
-   }
+   wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->rtit_output_base);
+   wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->rtit_output_mask);
+   wrmsrl(MSR_IA32_RTIT_STATUS, ctx->rtit_status);
+   wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->rtit_cr3_match);
+   for (i = 0; i < addr_range * 2; i++)
+   wrmsrl(MSR_IA32_RTIT_ADDR0_A + i, ctx->rtit_addrx_ab[i]);
 }
 
-static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
+static inline void pt_save_msr(struct pt_state *ctx, u32 addr_range)
 {
u32 i;
 
-   rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
-   rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
-   rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
-   rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
-   for (i = 0; i < addr_range; i++) {
-   rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
-   rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
-   }
+   rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->rtit_output_base);
+   rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->rtit_output_mask);
+   rdmsrl(MSR_IA32_RTIT_STATUS, ctx->rtit_status);
+   rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->rtit_cr3_match);
+   for (i = 0; i < addr_range; i++)
+   rdmsrl(MSR_IA32_RTIT_ADDR0_A + i, ctx->rtit_addrx_ab[i]);
 }
 
 static void pt_guest_enter(struct vcpu_vmx *vmx)
@@ -1013,11 +1009,11 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
 * Save host state before VM entry.
 */
-   rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
-   if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+   rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host_ctx.rtit_ctl);
+   if (vmx->pt_desc.guest_ctx.rtit_ctl & RTIT_CTL_TRACEEN) {
wrmsrl(MSR_IA32_RTIT_CTL, 0);
-   pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
-   pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+   pt_save_msr(&vmx->pt_desc.host_ctx, vmx->pt_desc.addr_range);
+   pt_load_msr(&vmx->pt_desc.guest_ctx, vmx->pt_desc.addr_range);
}
 }
 
@@ -1026,13 +1022,13 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
if (pt_mode == PT_MODE_SYSTEM)
return;
 
-   if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
-   pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
-   pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
+   if (vmx->pt_desc.guest_ctx.rtit_ctl & RTIT_CTL_TRACEEN) {
+   pt_save_msr(&vmx->pt_desc.guest_ctx, vmx->pt_desc.addr_range);
+   pt_load_msr(&vmx->pt_desc.host_ctx, vmx->pt_desc.addr_range);
}
 
/* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
-   wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+   wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host_ctx.rtit_ctl);
 }
 
 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
@@ -1402,8 +1398,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)

[PATCH v1 6/6] KVM: VMX: Get PT state from xsave area to variables

2019-05-16 Thread Luwei Kang
This patch copies the Intel PT state from the xsave area to
variables when PT changes from enabled to disabled,
because the PT state is saved/restored to/from the xsave area
by the XSAVES/XRSTORS instructions while Intel PT is enabled.
The KVM guest may read these MSRs when PT is disabled,
but the real values are in the xsave area, not in the variables.

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx/vmx.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d323e6b..d3e2569 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1000,6 +1000,16 @@ static inline void pt_save_msr(struct pt_state *ctx, u32 addr_range)
rdmsrl(MSR_IA32_RTIT_ADDR0_A + i, ctx->rtit_addrx_ab[i]);
 }
 
+static void pt_state_get(struct pt_state *ctx, struct fpu *fpu, u32 addr_range)
+{
+   char *buff = fpu->state.xsave.extended_state_area;
+
+   /* skip the rtit_ctl register */
+   memcpy(&ctx->rtit_output_base, buff + sizeof(u64),
+   sizeof(struct pt_state) - sizeof(u64) +
+   sizeof(u64) * addr_range * 2);
+}
+
 static void pt_guest_enter(struct vcpu_vmx *vmx)
 {
struct pt_desc *desc;
@@ -1040,6 +1050,9 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
pt_save_msr(desc->host_ctx, desc->addr_range);
pt_load_msr(desc->guest_ctx, desc->addr_range);
}
+   } else if (desc->pt_xsave && desc->guest_xs->initialized) {
+   pt_state_get(desc->guest_ctx, desc->guest_xs, desc->addr_range);
+   desc->guest_xs->initialized = 0;
}
 }
 
-- 
1.8.3.1



[PATCH v1 3/6] KVM: VMX: Dynamically allocate Intel PT configuration state

2019-05-16 Thread Luwei Kang
This patch changes the Intel PT configuration state
to a structure pointer, so that we only need to allocate
the state buffer when Intel PT is working in HOST_GUEST
mode.

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx/nested.c |   2 +-
 arch/x86/kvm/vmx/vmx.c| 202 +++---
 arch/x86/kvm/vmx/vmx.h|   6 +-
 3 files changed, 121 insertions(+), 89 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index e8d5c61..349be88 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4201,7 +4201,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
vmx->nested.vmxon = true;
 
if (pt_mode == PT_MODE_HOST_GUEST) {
-   vmx->pt_desc.guest_ctx.rtit_ctl = 0;
+   vmx->pt_desc->guest_ctx->rtit_ctl = 0;
pt_update_intercept_for_msr(vmx);
}
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4234e40e..4595230 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1009,11 +1009,11 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
 * Save host state before VM entry.
 */
-   rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host_ctx.rtit_ctl);
-   if (vmx->pt_desc.guest_ctx.rtit_ctl & RTIT_CTL_TRACEEN) {
+   rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc->host_ctx->rtit_ctl);
+   if (vmx->pt_desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) {
wrmsrl(MSR_IA32_RTIT_CTL, 0);
-   pt_save_msr(&vmx->pt_desc.host_ctx, vmx->pt_desc.addr_range);
-   pt_load_msr(&vmx->pt_desc.guest_ctx, vmx->pt_desc.addr_range);
+   pt_save_msr(vmx->pt_desc->host_ctx, vmx->pt_desc->addr_range);
+   pt_load_msr(vmx->pt_desc->guest_ctx, vmx->pt_desc->addr_range);
}
 }
 
@@ -1022,13 +1022,35 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
if (pt_mode == PT_MODE_SYSTEM)
return;
 
-   if (vmx->pt_desc.guest_ctx.rtit_ctl & RTIT_CTL_TRACEEN) {
-   pt_save_msr(&vmx->pt_desc.guest_ctx, vmx->pt_desc.addr_range);
-   pt_load_msr(&vmx->pt_desc.host_ctx, vmx->pt_desc.addr_range);
+   if (vmx->pt_desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) {
+   pt_save_msr(vmx->pt_desc->guest_ctx, vmx->pt_desc->addr_range);
+   pt_load_msr(vmx->pt_desc->host_ctx, vmx->pt_desc->addr_range);
}
 
/* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
-   wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host_ctx.rtit_ctl);
+   wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc->host_ctx->rtit_ctl);
+}
+
+static int pt_init(struct vcpu_vmx *vmx)
+{
+   u32 pt_state_sz = sizeof(struct pt_state) + sizeof(u64) *
+   intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2;
+
+   vmx->pt_desc = kzalloc(sizeof(struct pt_desc) + pt_state_sz * 2,
+   GFP_KERNEL_ACCOUNT);
+   if (!vmx->pt_desc)
+   return -ENOMEM;
+
+   vmx->pt_desc->host_ctx = (struct pt_state *)(vmx->pt_desc + 1);
+   vmx->pt_desc->guest_ctx = (void *)vmx->pt_desc->host_ctx + pt_state_sz;
+
+   return 0;
+}
+
+static void pt_uninit(struct vcpu_vmx *vmx)
+{
+   if (pt_mode == PT_MODE_HOST_GUEST)
+   kfree(vmx->pt_desc);
 }
 
 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
@@ -1391,15 +1413,16 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 * Any MSR write that attempts to change bits marked reserved will
 * case a #GP fault.
 */
-   if (data & vmx->pt_desc.ctl_bitmask)
+   if (data & vmx->pt_desc->ctl_bitmask)
return 1;
 
/*
 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
 * result in a #GP unless the same write also clears TraceEn.
 */
-   if ((vmx->pt_desc.guest_ctx.rtit_ctl & RTIT_CTL_TRACEEN) &&
-   ((vmx->pt_desc.guest_ctx.rtit_ctl ^ data) & ~RTIT_CTL_TRACEEN))
+   if ((vmx->pt_desc->guest_ctx->rtit_ctl & RTIT_CTL_TRACEEN) &&
+   ((vmx->pt_desc->guest_ctx->rtit_ctl ^ data) &
+   ~RTIT_CTL_TRACEEN))
return 1;
 
/*
@@ -1409,7 +1432,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 */
if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
!(data & RTIT_CTL_FABRIC_EN) &&
-   !intel_pt_validate_cap(vmx->pt_desc.caps,
+   !intel_pt_validate_cap(vmx->pt_desc->caps,
PT_CAP_single_range_output))
r

[PATCH v1 4/6] KVM: VMX: Allocate XSAVE area for Intel PT configuration

2019-05-16 Thread Luwei Kang
Allocate XSAVE areas for the host and guest Intel PT
configuration when Intel PT is working in HOST_GUEST
mode. The Intel PT configuration state can be saved
using XSAVES and restored by the XRSTORS instruction.

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx/vmx.c | 25 -
 arch/x86/kvm/vmx/vmx.h |  3 +++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4595230..4691665 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1033,6 +1033,7 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
 
 static int pt_init(struct vcpu_vmx *vmx)
 {
+   unsigned int eax, ebx, ecx, edx;
u32 pt_state_sz = sizeof(struct pt_state) + sizeof(u64) *
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2;
 
@@ -1044,13 +1045,35 @@ static int pt_init(struct vcpu_vmx *vmx)
vmx->pt_desc->host_ctx = (struct pt_state *)(vmx->pt_desc + 1);
vmx->pt_desc->guest_ctx = (void *)vmx->pt_desc->host_ctx + pt_state_sz;
 
+   cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+   if (ecx & XFEATURE_MASK_PT) {
+   vmx->pt_desc->host_xs = kmem_cache_zalloc(x86_fpu_cache,
+   GFP_KERNEL_ACCOUNT);
+   vmx->pt_desc->guest_xs = kmem_cache_zalloc(x86_fpu_cache,
+   GFP_KERNEL_ACCOUNT);
+   if (!vmx->pt_desc->host_xs || !vmx->pt_desc->guest_xs) {
+   if (vmx->pt_desc->host_xs)
+   kmem_cache_free(x86_fpu_cache,
+   vmx->pt_desc->host_xs);
+   if (vmx->pt_desc->guest_xs)
+   kmem_cache_free(x86_fpu_cache,
+   vmx->pt_desc->guest_xs);
+   } else
+   vmx->pt_desc->pt_xsave = true;
+   }
+
return 0;
 }
 
 static void pt_uninit(struct vcpu_vmx *vmx)
 {
-   if (pt_mode == PT_MODE_HOST_GUEST)
+   if (pt_mode == PT_MODE_HOST_GUEST) {
+   if (vmx->pt_desc->pt_xsave) {
+   kmem_cache_free(x86_fpu_cache, vmx->pt_desc->host_xs);
+   kmem_cache_free(x86_fpu_cache, vmx->pt_desc->guest_xs);
+   }
    kfree(vmx->pt_desc);
+   }
 }
 
 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 283f69d..e103991 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -69,8 +69,11 @@ struct pt_desc {
u64 ctl_bitmask;
u32 addr_range;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+   bool pt_xsave;
struct pt_state *host_ctx;
struct pt_state *guest_ctx;
+   struct fpu *host_xs;
+   struct fpu *guest_xs;
 };
 
 /*
-- 
1.8.3.1



[PATCH v1 0/6] KVM: VMX: Intel PT configuration switch using XSAVES/XRSTORS on VM-Entry/Exit

2019-05-16 Thread Luwei Kang
This patch set mainly reduces the overhead of switching the
Intel PT configuration context on VM-Entry/Exit by using the
XSAVES/XRSTORS instructions.

I measured the cycle counts of the context switch for the manual and
XSAVES/XRSTORS approaches with rdtsc, and the data is below:

Manual save(rdmsr): ~334  cycles
Manual restore(wrmsr):  ~1668 cycles

XSAVES insturction: ~124  cycles
XRSTORS instruction:~378  cycles

Manual: Switch the configuration with the rdmsr and wrmsr
instructions; there are 8 registers that need to be saved
or restored. They are IA32_RTIT_OUTPUT_BASE,
*_OUTPUT_MASK_PTRS, *_STATUS, *_CR3_MATCH,
*_ADDR0_A, *_ADDR0_B, *_ADDR1_A, *_ADDR1_B.
XSAVES/XRSTORS: Switch the configuration context with the XSAVES/XRSTORS
instructions. This patch set allocates a separate
"struct fpu" structure to save the host and guest PT state.
Only a small portion of this structure is used, because
we only save/restore the PT state (not AVX, AVX-512, MPX,
PKRU and so on).

This patch set also does some code cleanup, e.g. patch 2 reuses
the fpu pt_state to save the PT configuration context and
patch 3 dynamically allocates the Intel PT configuration state.

Luwei Kang (6):
  x86/fpu: Introduce new fpu state for Intel processor trace
  KVM: VMX: Reuse the pt_state structure for PT context
  KVM: VMX: Dynamically allocate Intel PT configuration state
  KVM: VMX: Allocate XSAVE area for Intel PT configuration
  KVM: VMX: Intel PT configuration context switch using XSAVES/XRSTORS
  KVM: VMX: Get PT state from xsave area to variables

 arch/x86/include/asm/fpu/types.h |  13 ++
 arch/x86/kvm/vmx/nested.c|   2 +-
 arch/x86/kvm/vmx/vmx.c   | 338 ++-
 arch/x86/kvm/vmx/vmx.h   |  21 +--
 4 files changed, 243 insertions(+), 131 deletions(-)

-- 
1.8.3.1



[PATCH v1 1/6] x86/fpu: Introduce new fpu state for Intel processor trace

2019-05-16 Thread Luwei Kang
Introduce a new fpu state structure, pt_state, to save the Intel
Processor Trace configuration. The upcoming use of
XSAVES/XRSTORS to switch the Intel PT configuration
on VM-Entry/Exit will use this structure.
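
The structure is variable-sized: rtit_addrx_ab[] holds the A/B pair
for each supported address range, so a later patch in this series
sizes an instance as (a sketch of the arithmetic used in patch 3/6):

	/* n = intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) */
	size = sizeof(struct pt_state) + 2 * n * sizeof(u64);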

Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/fpu/types.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 2e32e17..8cbb42e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -221,6 +221,19 @@ struct avx_512_hi16_state {
 } __packed;
 
 /*
+ * State component 8 is used for some 64-bit registers
+ * of Intel processor trace.
+ */
+struct pt_state {
+   u64 rtit_ctl;
+   u64 rtit_output_base;
+   u64 rtit_output_mask;
+   u64 rtit_status;
+   u64 rtit_cr3_match;
+   u64 rtit_addrx_ab[0];
+} __packed;
+
+/*
  * State component 9: 32-bit PKRU register.  The state is
  * 8 bytes long but only 4 bytes is used currently.
  */
-- 
1.8.3.1



[PATCH v2 2/2] KVM: x86: Add support of clear Trace_ToPA_PMI status

2019-02-18 Thread Luwei Kang
Add support for clearing the Intel PT ToPA PMI status for the
KVM guest.

Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/kvm_host.h  | 1 +
 arch/x86/include/asm/msr-index.h | 4 
 arch/x86/kvm/vmx/pmu_intel.c | 8 +++-
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4660ce9..de95704 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -468,6 +468,7 @@ struct kvm_pmu {
u64 global_ovf_ctrl;
u64 counter_bitmask[2];
u64 global_ctrl_mask;
+   u64 global_ovf_ctrl_mask;
u64 reserved_bits;
u8 version;
struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ae01fb0..c0ea4aa 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -778,6 +778,10 @@
 /* PERF_GLOBAL_OVF_CTL bits */
 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT   55
 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI	(1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT)
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT	62
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF	(1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT)
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT	63
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD	(1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT)
 
 /* Geode defined MSRs */
 #define MSR_GEODE_BUSCONT_CONF00x1900
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 5ab4a36..6dee7cf 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -227,7 +227,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
break;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-   if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
+   if (!(data & pmu->global_ovf_ctrl_mask)) {
if (!msr_info->host_initiated)
pmu->global_status &= ~data;
pmu->global_ovf_ctrl = data;
@@ -297,6 +297,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
	(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
pmu->global_ctrl_mask = ~pmu->global_ctrl;
+   pmu->global_ovf_ctrl_mask = ~(pmu->global_ctrl |
+   MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
+   MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
+   if (kvm_x86_ops->pt_supported())
+   pmu->global_ovf_ctrl_mask &=
+   ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
 
entry = kvm_find_cpuid_entry(vcpu, 7, 0);
if (entry &&
-- 
1.8.3.1



[PATCH v2 0/2] Inject a PMI for KVM Guest when ToPA buffer is filled

2019-02-18 Thread Luwei Kang
Each Intel Processor Trace table of physical addresses (ToPA) entry
has an INT bit. If this bit is set, the processor signals a
performance-monitoring interrupt (PMI) when the corresponding trace
output region is filled. This patch set injects a PMI for Intel
Processor Trace when the ToPA buffer is filled.
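
For orientation, a sketch of a single ToPA table entry with the
fields relevant here; this mirrors the layout used by the host PT
driver, simplified for illustration:

/* One 64-bit ToPA table slot. When the output region described by
 * @base/@size fills and @intr is set, the CPU raises a PMI with
 * GLOBAL_STATUS bit 55 (Trace_ToPA_PMI) set. */
struct topa_entry {
	u64 end   : 1;	/* last entry: points back to the table base */
	u64 rsvd0 : 1;
	u64 intr  : 1;	/* the INT bit discussed above */
	u64 rsvd1 : 1;
	u64 stop  : 1;	/* stop tracing when this region fills */
	u64 rsvd2 : 1;
	u64 size  : 4;	/* region size: 4K << size */
	u64 rsvd3 : 2;
	u64 base  : 36;	/* pfn of the output region */
	u64 rsvd4 : 16;
};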

Changes from v1:
 - Exporting a global function pointer may not be a good choice.
   Add a new member to kvm_guest_cbs to send an Intel PT PMI for the
   KVM guest.

Luwei Kang (2):
  KVM: x86: Inject PMI for KVM guest
  KVM: x86: Add support for clearing Trace_ToPA_PMI status

 arch/x86/events/intel/core.c |  6 +-
 arch/x86/include/asm/kvm_host.h  |  1 +
 arch/x86/include/asm/msr-index.h |  8 
 arch/x86/kvm/vmx/pmu_intel.c |  8 +++-
 arch/x86/kvm/x86.c   | 10 ++
 include/linux/perf_event.h   |  1 +
 6 files changed, 32 insertions(+), 2 deletions(-)

-- 
1.8.3.1



[PATCH v2 1/2] KVM: x86: Inject PMI for KVM guest

2019-02-18 Thread Luwei Kang
Inject a PMI for the KVM guest when Intel PT is working
in Host-Guest mode and the guest's ToPA entry memory buffer
is completely filled.

Signed-off-by: Luwei Kang 
---
 arch/x86/events/intel/core.c |  6 +-
 arch/x86/include/asm/msr-index.h |  4 
 arch/x86/kvm/x86.c   | 10 ++
 include/linux/perf_event.h   |  1 +
 4 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 730978d..37cecff 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2273,7 +2273,11 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 */
	if (__test_and_clear_bit(55, (unsigned long *)&status)) {
handled++;
-   intel_pt_interrupt();
+   if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() &&
+   perf_guest_cbs->handle_intel_pt_intr))
+   perf_guest_cbs->handle_intel_pt_intr();
+   else
+   intel_pt_interrupt();
}
 
/*
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8e40c24..ae01fb0 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -775,6 +775,10 @@
 #define MSR_CORE_PERF_GLOBAL_CTRL  0x038f
 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL  0x0390
 
+/* PERF_GLOBAL_OVF_CTL bits */
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT   55
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI	(1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT)
+
 /* Geode defined MSRs */
 #define MSR_GEODE_BUSCONT_CONF0	0x1900
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 941f932..d1f4e0a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6795,10 +6795,20 @@ static unsigned long kvm_get_guest_ip(void)
return ip;
 }
 
+static void kvm_handle_intel_pt_intr(void)
+{
+   struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
+
+   kvm_make_request(KVM_REQ_PMI, vcpu);
+   __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+		(unsigned long *)&vcpu->arch.pmu.global_status);
+}
+
 static struct perf_guest_info_callbacks kvm_guest_cbs = {
.is_in_guest= kvm_is_in_guest,
.is_user_mode   = kvm_is_user_mode,
.get_guest_ip   = kvm_get_guest_ip,
+   .handle_intel_pt_intr   = kvm_handle_intel_pt_intr,
 };
 
 static void kvm_set_mmio_spte_mask(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e1a0517..2b26a34 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -30,6 +30,7 @@ struct perf_guest_info_callbacks {
int (*is_in_guest)(void);
int (*is_user_mode)(void);
unsigned long   (*get_guest_ip)(void);
+   void(*handle_intel_pt_intr)(void);
 };
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-- 
1.8.3.1



[PATCH V4] KVM: x86: Sync the pending Posted-Interrupts

2019-02-13 Thread Luwei Kang
Some posted interrupts from passthrough devices may be lost or
overwritten when the vCPU is in the runnable state.

The SN (Suppress Notification) bit of the PID (Posted Interrupt
Descriptor) is set when the vCPU is preempted (the vCPU is in
KVM_MP_STATE_RUNNABLE state but not running on a physical CPU).
If a posted interrupt arrives at this time, the IRQ remapping
facility sets the corresponding bit in the PIR (Posted Interrupt
Requests) without setting ON (Outstanding Notification).
So this interrupt cannot be synced to the APIC virtualization
registers and will not be handled by the guest, because ON is zero.
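
A sketch of the descriptor fields involved, simplified from the VMX
definition, to make the lost-interrupt window concrete:

struct pi_desc {
	u32 pir[8];		/* one bit per vector, set by the IOMMU */
	union {
		struct {
			u16	on : 1,		/* outstanding notification */
				sn : 1,		/* suppress notification */
				rsvd_1 : 14;
			u8	nv;		/* notification vector */
			u8	rsvd_2;
			u32	ndst;		/* notification destination */
		};
		u64 control;
	};
	u32 rsvd[6];
} __aligned(64);

/* The window: SN=1 while the vCPU is preempted, the IOMMU sets a PIR
 * bit but leaves ON=0, so nothing ever syncs PIR into the vAPIC.
 * The fix below re-checks PIR after clearing SN and sets ON by hand. */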

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx/vmx.c | 26 +++---
 arch/x86/kvm/vmx/vmx.h |  6 ++
 arch/x86/kvm/x86.c |  2 +-
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f6915f1..fe59199 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1192,21 +1192,6 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
 
-   /*
-* First handle the simple case where no cmpxchg is necessary; just
-* allow posting non-urgent interrupts.
-*
-* If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
-* PI.NDST: pi_post_block will do it for us and the wakeup_handler
-* expects the VCPU to be on the blocked_vcpu_list that matches
-* PI.NDST.
-*/
-   if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
-   vcpu->cpu == cpu) {
-   pi_clear_sn(pi_desc);
-   return;
-   }
-
/* The full case.  */
do {
old.control = new.control = pi_desc->control;
@@ -1221,6 +1206,17 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
new.sn = 0;
	} while (cmpxchg64(&pi_desc->control, old.control,
   new.control) != old.control);
+
+   /*
+* Clear SN before reading the bitmap.  The VT-d firmware
+* writes the bitmap and reads SN atomically (5.2.3 in the
+* spec), so it doesn't really have a memory barrier that
+* pairs with this, but we cannot do that and we need one.
+*/
+   smp_mb__after_atomic();
+
+   if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
+   pi_set_on(pi_desc);
 }
 
 /*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9932895..a4527e1 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -349,6 +349,12 @@ static inline void pi_set_sn(struct pi_desc *pi_desc)
		(unsigned long *)&pi_desc->control);
 }
 
+static inline void pi_set_on(struct pi_desc *pi_desc)
+{
+   set_bit(POSTED_INTR_ON,
+   (unsigned long *)&pi_desc->control);
+}
+
 static inline void pi_clear_on(struct pi_desc *pi_desc)
 {
	clear_bit(POSTED_INTR_ON,
		(unsigned long *)&pi_desc->control);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d32b8f..ebd6737 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7795,7 +7795,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 * 1) We should set ->mode before checking ->requests.  Please see
 * the comment in kvm_vcpu_exiting_guest_mode().
 *
-* 2) For APICv, we should set ->mode before checking PIR.ON.  This
+* 2) For APICv, we should set ->mode before checking PID.ON. This
 * pairs with the memory barrier implicit in pi_test_and_set_on
 * (see vmx_deliver_posted_interrupt).
 *
-- 
1.8.3.1



[PATCH V3] KVM: x86: Sync the pending Posted-Interrupts

2019-01-31 Thread Luwei Kang
Some posted interrupts from passthrough devices may be lost or
overwritten when the vCPU is in the runnable state.

The SN (Suppress Notification) bit of the PID (Posted Interrupt
Descriptor) is set when the vCPU is preempted (the vCPU is in
KVM_MP_STATE_RUNNABLE state but not running on a physical CPU).
If a posted interrupt arrives at this time, the IRQ remapping
facility sets the corresponding bit in the PIR (Posted Interrupt
Requests) without setting ON (Outstanding Notification).
So this interrupt cannot be synced to the APIC virtualization
registers and will not be handled by the guest, because ON is zero.

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx/vmx.c | 5 +
 arch/x86/kvm/x86.c | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4341175..8ed9634 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1221,6 +1221,11 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
new.sn = 0;
	} while (cmpxchg64(&pi_desc->control, old.control,
   new.control) != old.control);
+
+   smp_mb__after_atomic();
+
+   if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
+   pi_test_and_set_on(pi_desc);
 }
 
 /*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d27206..5bcf2c4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7794,7 +7794,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 * 1) We should set ->mode before checking ->requests.  Please see
 * the comment in kvm_vcpu_exiting_guest_mode().
 *
-* 2) For APICv, we should set ->mode before checking PIR.ON.  This
+* 2) For APICv, we should set ->mode before checking PID.PIR. This
 * pairs with the memory barrier implicit in pi_test_and_set_on
 * (see vmx_deliver_posted_interrupt).
 *
-- 
1.8.3.1



[PATCH v2] KVM: x86: Sync the pending Posted-Interrupts

2019-01-29 Thread Luwei Kang
Some posted interrupts from passthrough devices may be lost or
overwritten when the vCPU is in the runnable state.

The SN (Suppress Notification) bit of the PID (Posted Interrupt
Descriptor) is set when the vCPU is preempted (the vCPU is in
KVM_MP_STATE_RUNNABLE state but not running on a physical CPU).
If a posted interrupt arrives at this time, the IRQ remapping
facility sets the corresponding bit in the PIR (Posted Interrupt
Requests) without setting ON (Outstanding Notification).
So this interrupt cannot be synced to the APIC virtualization
registers and will not be handled by the guest, because ON is zero.

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx/vmx.c | 2 +-
 arch/x86/kvm/x86.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f6915f1..820a03b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6048,7 +6048,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
bool max_irr_updated;
 
WARN_ON(!vcpu->arch.apicv_active);
-   if (pi_test_on(&vmx->pi_desc)) {
+   if (!bitmap_empty((unsigned long *)vmx->pi_desc.pir, NR_VECTORS)) {
	pi_clear_on(&vmx->pi_desc);
/*
 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02c8e09..c31b608 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7793,7 +7793,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 * 1) We should set ->mode before checking ->requests.  Please see
 * the comment in kvm_vcpu_exiting_guest_mode().
 *
-* 2) For APICv, we should set ->mode before checking PIR.ON.  This
+* 2) For APICv, we should set ->mode before checking PID.PIR. This
 * pairs with the memory barrier implicit in pi_test_and_set_on
 * (see vmx_deliver_posted_interrupt).
 *
-- 
1.8.3.1



[PATCH 3/3] KVM: x86: Add support for clearing Trace_ToPA_PMI status

2019-01-28 Thread Luwei Kang
Add support for clearing the Intel PT ToPA PMI status for
a KVM guest.

Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/kvm_host.h  | 1 +
 arch/x86/include/asm/msr-index.h | 4 
 arch/x86/kvm/vmx/pmu_intel.c | 8 +++-
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4660ce9..de95704 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -468,6 +468,7 @@ struct kvm_pmu {
u64 global_ovf_ctrl;
u64 counter_bitmask[2];
u64 global_ctrl_mask;
+   u64 global_ovf_ctrl_mask;
u64 reserved_bits;
u8 version;
struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ae01fb0..c0ea4aa 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -778,6 +778,10 @@
 /* PERF_GLOBAL_OVF_CTL bits */
 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT   55
 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI	(1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT)
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT	62
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF	(1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT)
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT	63
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD	(1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT)
 
 /* Geode defined MSRs */
 #define MSR_GEODE_BUSCONT_CONF0	0x1900
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 5ab4a36..6dee7cf 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -227,7 +227,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
break;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-   if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
+   if (!(data & pmu->global_ovf_ctrl_mask)) {
if (!msr_info->host_initiated)
pmu->global_status &= ~data;
pmu->global_ovf_ctrl = data;
@@ -297,6 +297,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
	(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
pmu->global_ctrl_mask = ~pmu->global_ctrl;
+   pmu->global_ovf_ctrl_mask = ~(pmu->global_ctrl |
+   MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
+   MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
+   if (kvm_x86_ops->pt_supported())
+   pmu->global_ovf_ctrl_mask &=
+   ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
 
entry = kvm_find_cpuid_entry(vcpu, 7, 0);
if (entry &&
-- 
1.8.3.1



[PATCH 2/3] perf/x86/intel/pt: Inject PMI for KVM guest

2019-01-28 Thread Luwei Kang
Inject a PMI for the KVM guest when Intel PT is working
in Host-Guest mode and the guest's ToPA entry memory buffer
is completely filled.

The definitions of 'kvm_make_request' and 'KVM_REQ_PMI' depend on
the "linux/kvm_host.h" header.

Signed-off-by: Luwei Kang 
---
 arch/x86/events/intel/pt.c   | 12 +++-
 arch/x86/include/asm/intel_pt.h  |  1 +
 arch/x86/include/asm/msr-index.h |  4 
 arch/x86/kvm/x86.h   |  6 ++
 4 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 9494ca6..09375bd 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include <linux/kvm_host.h>
 
 #include 
 #include 
@@ -33,7 +34,8 @@
 #include "../perf_event.h"
 #include "pt.h"
 
-static DEFINE_PER_CPU(struct pt, pt_ctx);
+DEFINE_PER_CPU(struct pt, pt_ctx);
+EXPORT_PER_CPU_SYMBOL_GPL(pt_ctx);
 
 static struct pt_pmu pt_pmu;
 
@@ -1260,6 +1262,14 @@ void intel_pt_interrupt(void)
struct pt_buffer *buf;
struct perf_event *event = pt->handle.event;
 
+   if (pt->vcpu) {
+   /* Inject PMI to Guest */
+   kvm_make_request(KVM_REQ_PMI, pt->vcpu);
+   __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+   (unsigned long *)&pt->vcpu->arch.pmu.global_status);
+   return;
+   }
+
/*
 * There may be a dangling PT bit in the interrupt status register
 * after PT has been disabled by pt_event_stop(). Make sure we don't
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index ee960fb..32da2e9 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -62,6 +62,7 @@ struct pt {
struct pt_filters   filters;
int handle_nmi;
int vmx_on;
+   struct kvm_vcpu *vcpu;
 };
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8e40c24..ae01fb0 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -775,6 +775,10 @@
 #define MSR_CORE_PERF_GLOBAL_CTRL  0x038f
 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL  0x0390
 
+/* PERF_GLOBAL_OVF_CTL bits */
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT   55
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI	(1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT)
+
 /* Geode defined MSRs */
 #define MSR_GEODE_BUSCONT_CONF0	0x1900
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 224cd0a..a9ee498 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -4,6 +4,7 @@
 
 #include 
 #include 
+#include <asm/intel_pt.h>
 #include "kvm_cache_regs.h"
 
 #define KVM_DEFAULT_PLE_GAP	128
@@ -331,15 +332,20 @@ static inline bool kvm_pause_in_guest(struct kvm *kvm)
 }
 
 DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+DECLARE_PER_CPU(struct pt, pt_ctx);
 
 static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
 {
__this_cpu_write(current_vcpu, vcpu);
+   if (kvm_x86_ops->pt_supported())
+   this_cpu_ptr(&pt_ctx)->vcpu = vcpu;
 }
 
 static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
 {
__this_cpu_write(current_vcpu, NULL);
+   if (kvm_x86_ops->pt_supported())
+   this_cpu_ptr(&pt_ctx)->vcpu = NULL;
 }
 
 #endif
-- 
1.8.3.1



[PATCH 1/3] perf/x86/intel/pt: Move pt structure to global header

2019-01-28 Thread Luwei Kang
The Intel PT structure (struct pt) is in a private header.
Move it (and its sub-structures) to a global header so that
it is accessible from KVM code.

The definition of the perf_output_handle structure is included
in "linux/perf_event.h".

Signed-off-by: Luwei Kang 
---
 arch/x86/events/intel/pt.h  | 38 --
 arch/x86/include/asm/intel_pt.h | 40 
 2 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 269e15a..964948f 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -93,42 +93,4 @@ struct pt_buffer {
struct topa_entry   *topa_index[0];
 };
 
-#define PT_FILTERS_NUM 4
-
-/**
- * struct pt_filter - IP range filter configuration
- * @msr_a: range start, goes to RTIT_ADDRn_A
- * @msr_b: range end, goes to RTIT_ADDRn_B
- * @config:	4-bit field in RTIT_CTL
- */
-struct pt_filter {
-   unsigned long   msr_a;
-   unsigned long   msr_b;
-   unsigned long   config;
-};
-
-/**
- * struct pt_filters - IP range filtering context
- * @filter:	filters defined for this context
- * @nr_filters:	number of defined filters in the @filter array
- */
-struct pt_filters {
-   struct pt_filterfilter[PT_FILTERS_NUM];
-   unsigned intnr_filters;
-};
-
-/**
- * struct pt - per-cpu pt context
- * @handle:	perf output handle
- * @filters:	last configured filters
- * @handle_nmi:	do handle PT PMI on this cpu, there's an active event
- * @vmx_on:	1 if VMX is ON on this cpu
- */
-struct pt {
-   struct perf_output_handle handle;
-   struct pt_filters   filters;
-   int handle_nmi;
-   int vmx_on;
-};
-
 #endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 634f99b..ee960fb 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -2,6 +2,8 @@
 #ifndef _ASM_X86_INTEL_PT_H
 #define _ASM_X86_INTEL_PT_H
 
+#include <linux/perf_event.h>
+
 #define PT_CPUID_LEAVES		2
 #define PT_CPUID_REGS_NUM	4 /* number of registers (eax, ebx, ecx, edx) */
 
@@ -24,6 +26,44 @@ enum pt_capabilities {
PT_CAP_psb_periods,
 };
 
+#define PT_FILTERS_NUM 4
+
+/**
+ * struct pt_filter - IP range filter configuration
+ * @msr_a: range start, goes to RTIT_ADDRn_A
+ * @msr_b: range end, goes to RTIT_ADDRn_B
+ * @config:	4-bit field in RTIT_CTL
+ */
+struct pt_filter {
+   unsigned long   msr_a;
+   unsigned long   msr_b;
+   unsigned long   config;
+};
+
+/**
+ * struct pt_filters - IP range filtering context
+ * @filter:	filters defined for this context
+ * @nr_filters:	number of defined filters in the @filter array
+ */
+struct pt_filters {
+   struct pt_filterfilter[PT_FILTERS_NUM];
+   unsigned intnr_filters;
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle:	perf output handle
+ * @filters:	last configured filters
+ * @handle_nmi:	do handle PT PMI on this cpu, there's an active event
+ * @vmx_on:	1 if VMX is ON on this cpu
+ */
+struct pt {
+   struct perf_output_handle handle;
+   struct pt_filters   filters;
+   int handle_nmi;
+   int vmx_on;
+};
+
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
 void cpu_emergency_stop_pt(void);
 extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap);
-- 
1.8.3.1



[PATCH 0/3] Inject a PMI for KVM Guest when ToPA buffer is filled

2019-01-28 Thread Luwei Kang
Each Intel Processor Trace table of physical addresses (ToPA) entry
has an INT bit. If this bit is set, the processor signals a
performance-monitoring interrupt (PMI) when the corresponding trace
output region is filled. This patch set injects a PMI for Intel
Processor Trace when the ToPA buffer is filled.

Luwei Kang (3):
  perf/x86/intel/pt: Move pt structure to global header
  perf/x86/intel/pt: Inject PMI for KVM guest
  KVM: x86: Add support for clearing Trace_ToPA_PMI status

 arch/x86/events/intel/pt.c   | 12 +++-
 arch/x86/events/intel/pt.h   | 38 -
 arch/x86/include/asm/intel_pt.h  | 41 
 arch/x86/include/asm/kvm_host.h  |  1 +
 arch/x86/include/asm/msr-index.h |  8 
 arch/x86/kvm/vmx/pmu_intel.c |  8 +++-
 arch/x86/kvm/x86.h   |  6 ++
 7 files changed, 74 insertions(+), 40 deletions(-)

-- 
1.8.3.1



[PATCH] KVM: x86: Sync the pending Posted-Interrupts

2019-01-17 Thread Luwei Kang
Some posted interrupts from passthrough devices may be lost or
overwritten when the vCPU is in the runnable state.

The SN (Suppress Notification) bit of the PID (Posted Interrupt
Descriptor) is set when the vCPU is preempted (the vCPU is in
KVM_MP_STATE_RUNNABLE state but not running on a physical CPU).
If a posted interrupt arrives at this time, the IRQ remapping
facility sets the corresponding bit in the PIR (Posted Interrupt
Requests) without setting ON (Outstanding Notification).
So this interrupt cannot be synced to the APIC virtualization
registers and will not be handled by the guest, because ON is zero.

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f6915f1..820a03b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6048,7 +6048,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
bool max_irr_updated;
 
WARN_ON(!vcpu->arch.apicv_active);
-   if (pi_test_on(&vmx->pi_desc)) {
+   if (!bitmap_empty((unsigned long *)vmx->pi_desc.pir, NR_VECTORS)) {
	pi_clear_on(&vmx->pi_desc);
/*
 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
-- 
1.8.3.1



[PATCH v13 09/12] KVM: x86: Introduce a function to initialize the PT configuration

2018-10-24 Thread Luwei Kang
Initialize the Intel PT configuration on CPUID update.
This includes the CPUID information, the rtit_ctl bit mask and the
number of address ranges.
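
As a worked example of the final loop in the diff below: with two
reported address ranges, iterations i=0 and i=1 clear ctl_bitmask
bits 35:32 and 39:36, i.e. the ADDR0_CFG and ADDR1_CFG fields of
RTIT_CTL. Note that 0xf is a 32-bit constant, so shifting it by 32
or more is formally undefined in C; a sketch of the safer spelling:

	/* unmask ADDRn_CFG (4 bits per range, starting at bit 32) */
	for (i = 0; i < vmx->pt_desc.addr_range; i++)
		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));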

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx.c | 73 ++
 1 file changed, 73 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d8480a6..2697618 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11921,6 +11921,75 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
}
 }
 
+static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct kvm_cpuid_entry2 *best = NULL;
+   int i;
+
+   for (i = 0; i < PT_CPUID_LEAVES; i++) {
+   best = kvm_find_cpuid_entry(vcpu, 0x14, i);
+   if (!best)
+   return;
+   vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
+   vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
+   vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
+   vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
+   }
+
+   /* Get the number of configurable Address Ranges for filtering */
+   vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
+   PT_CAP_num_address_ranges);
+
+   /* Initialize and clear the no dependency bits */
+   vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
+   RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
+
+   /*
+* If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
+* the write will inject a #GP
+*/
+   if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
+
+   /*
+* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
+* PSBFreq can be set
+*/
+   if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
+   RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
+
+   /*
+* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
+* MTCFreq can be set
+*/
+   if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
+   RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
+
+   /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
+   if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
+   RTIT_CTL_PTW_EN);
+
+   /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
+   if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
+
+   /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
+   if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
+
+   /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
+   if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
+
+   /* unmask address range configure area */
+   for (i = 0; i < vmx->pt_desc.addr_range; i++)
+   vmx->pt_desc.ctl_bitmask &= ~(0xf << (32 + i * 4));
+}
+
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -11941,6 +12010,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
nested_vmx_cr_fixed1_bits_update(vcpu);
nested_vmx_entry_exit_ctls_update(vcpu);
}
+
+   if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
+   guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
+   update_intel_pt_cfg(vcpu);
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
-- 
1.8.3.1



[PATCH v13 11/12] KVM: x86: Set intercept for Intel PT MSRs read/write

2018-10-24 Thread Luwei Kang
From: Chao Peng 

To reduce performance overhead, disable interception of Intel PT MSR
reads/writes when Intel PT is enabled in the guest.
MSR_IA32_RTIT_CTL is an exception and is always intercepted.

Signed-off-by: Chao Peng 
Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a568d49..ed247dd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1333,6 +1333,7 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
 static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
							  u32 msr, int type);
+static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -4558,6 +4559,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx_rtit_ctl_check(vcpu, data))
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
+   pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN));
vmx->pt_desc.guest.ctl = data;
break;
case MSR_IA32_RTIT_STATUS:
@@ -6414,6 +6416,27 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
vmx->msr_bitmap_mode = mode;
 }
 
+static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag)
+{
+   unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+   u32 i;
+
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
+   MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
+   MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
+   MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
+   MSR_TYPE_RW, flag);
+   for (i = 0; i < vmx->pt_desc.addr_range; i++) {
+   vmx_set_intercept_for_msr(msr_bitmap,
+   MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap,
+   MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
+   }
+}
+
 static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
 {
return enable_apicv;
-- 
1.8.3.1



[PATCH v13 10/12] KVM: x86: Implement Intel PT MSRs read/write emulation

2018-10-24 Thread Luwei Kang
From: Chao Peng 

This patch implements Intel Processor Trace MSR read/write
emulation.
Intel PT MSR reads/writes need to be emulated when the Intel PT
MSRs are intercepted in the guest and during live migration.
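
As a worked example of the MSR_IA32_RTIT_OUTPUT_BASE_MASK check in
the diff below, with an assumed guest MAXPHYADDR of 39 (the concrete
addresses are illustrative only):

	/* bits 63:39 (beyond the guest physical width) and bits 6:0
	 * (128-byte alignment) must all be clear */
	u64 mask = ~((1ULL << 39) - 1) | 0x7f;

	/* 0x10000000: 128-byte aligned, below 2^39 -> accepted   */
	/* 0x10000040: bit 6 set -> rejected, guest gets a #GP    */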

Signed-off-by: Chao Peng 
Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/intel_pt.h |   8 ++
 arch/x86/kvm/vmx.c  | 176 
 arch/x86/kvm/x86.c  |  33 +++-
 3 files changed, 216 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index eabbdbc..a1c2080 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -10,6 +10,14 @@
 
 #define RTIT_ADDR_RANGE		4
 
+#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
+   RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
+   RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
+   RTIT_STATUS_BYTECNT))
+
+#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
+   (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
+
 enum pt_capabilities {
PT_CAP_max_subleaf = 0,
PT_CAP_cr3_filtering,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2697618..a568d49 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3350,6 +3350,79 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
 }
 
+static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   unsigned long value;
+
+   /*
+* Any MSR write that attempts to change bits marked reserved will
+* cause a #GP fault.
+*/
+   if (data & vmx->pt_desc.ctl_bitmask)
+   return 1;
+
+   /*
+* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
+* result in a #GP unless the same write also clears TraceEn.
+*/
+   if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
+   ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
+   return 1;
+
+   /*
+* WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
+* and FabricEn would cause #GP, if
+* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
+*/
+   if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
+   !(data & RTIT_CTL_FABRIC_EN) &&
+   !intel_pt_validate_cap(vmx->pt_desc.caps,
+   PT_CAP_single_range_output))
+   return 1;
+
+   /*
+* MTCFreq, CycThresh and PSBFreq encodings check: any MSR write that
+* utilizes encodings marked reserved will cause a #GP fault.
+*/
+   value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
+   if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
+   !test_bit((data & RTIT_CTL_MTC_RANGE) >>
	RTIT_CTL_MTC_RANGE_OFFSET, &value))
+   return 1;
+   value = intel_pt_validate_cap(vmx->pt_desc.caps,
+   PT_CAP_cycle_thresholds);
+   if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
+   !test_bit((data & RTIT_CTL_CYC_THRESH) >>
	RTIT_CTL_CYC_THRESH_OFFSET, &value))
+   return 1;
+   value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
+   if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
+   !test_bit((data & RTIT_CTL_PSB_FREQ) >>
	RTIT_CTL_PSB_FREQ_OFFSET, &value))
+   return 1;
+
+   /*
+* If ADDRx_CFG is reserved or the encoding is > 2, the write
+* will cause a #GP fault.
+*/
+   value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
+   return 1;
+   value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
+   return 1;
+   value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
+   return 1;
+   value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
+   return 1;
+
+   return 0;
+}
+
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
unsigned long rip;
@@ -4186,6 +4259,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
struct vcpu_vmx *vmx = to

[PATCH v13 07/12] KVM: x86: Add Intel Processor Trace cpuid emulation

2018-10-24 Thread Luwei Kang
From: Chao Peng 

Expose Intel Processor Trace to the guest only when
PT works in Host-Guest mode.

Signed-off-by: Chao Peng 
Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/cpuid.c| 22 --
 arch/x86/kvm/svm.c  |  6 ++
 arch/x86/kvm/vmx.c  |  6 ++
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 55e51ff..9ab7ac0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1105,6 +1105,7 @@ struct kvm_x86_ops {
bool (*mpx_supported)(void);
bool (*xsaves_supported)(void);
bool (*umip_emulated)(void);
+   bool (*pt_supported)(void);
 
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7bcfa61..05b8fb4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -337,6 +337,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
+   unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
 
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
@@ -395,7 +396,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
	F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
-   F(SHA_NI) | F(AVX512BW) | F(AVX512VL);
+   F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
 
/* cpuid 0xD.1.eax */
const u32 kvm_cpuid_D_1_eax_x86_features =
@@ -426,7 +427,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
switch (function) {
case 0:
-   entry->eax = min(entry->eax, (u32)0xd);
+   entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd));
break;
case 1:
entry->edx &= kvm_cpuid_1_edx_x86_features;
@@ -603,6 +604,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
+   /* Intel PT */
+   case 0x14: {
+   int t, times = entry->eax;
+
+   if (!f_intel_pt)
+   break;
+
+   entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+   for (t = 1; t <= times; ++t) {
+   if (*nent >= maxnent)
+   goto out;
+   do_cpuid_1_ent(&entry[t], function, t);
+   entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+   ++*nent;
+   }
+   break;
+   }
case KVM_CPUID_SIGNATURE: {
static const char signature[12] = "KVMKVMKVM\0\0";
const u32 *sigptr = (const u32 *)signature;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f416f5c7..6e8a61b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5904,6 +5904,11 @@ static bool svm_umip_emulated(void)
return false;
 }
 
+static bool svm_pt_supported(void)
+{
+   return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
return true;
@@ -7139,6 +7144,7 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
.mpx_supported = svm_mpx_supported,
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
+   .pt_supported = svm_pt_supported,
 
.set_supported_cpuid = svm_set_supported_cpuid,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c4c4b76..692154c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11013,6 +11013,11 @@ static bool vmx_xsaves_supported(void)
SECONDARY_EXEC_XSAVES;
 }
 
+static bool vmx_pt_supported(void)
+{
+   return (pt_mode == PT_MODE_HOST_GUEST);
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
u32 exit_intr_info;
@@ -15127,6 +15132,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
.mpx_supported = vmx_mpx_supported,
.xsaves_supported = vmx_xsaves_supported,
.umip_emulated = vmx_umip_emulated,
+   .pt_supported = vmx_pt_supported,
 
.check_nested_events = vmx_check_nested_events,
.request_immediate_exit = vmx_request_immediate_exit,
-- 
1.8.3.1



[PATCH v13 06/12] KVM: x86: Add Intel PT virtualization work mode

2018-10-24 Thread Luwei Kang
From: Chao Peng 

Intel Processor Trace virtualization can work in one
of two possible modes:

a. System-Wide mode (default):
   When the host configures Intel PT to collect trace packets
   of the entire system, it can leave the relevant VMX controls
   clear to allow VMX-specific packets to provide information
   across VMX transitions.
   The KVM guest is not aware of this feature in this mode, and
   both host and KVM guest traces are output to the host buffer.

b. Host-Guest mode:
   The host can configure trace-packet generation in VMX
   non-root operation for guests, while root operation for
   native execution behaves normally.
   Intel PT is exposed to the KVM guest in this mode, and the
   trace output goes to the respective buffers of host and guest.
   In this mode, the PT state is saved and tracing is disabled
   before VM-entry, and restored after VM-exit, when tracing a
   virtual machine.
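
Mode selection is a module parameter (see module_param(pt_mode, ...)
in the diff below), so the host administrator opts in to Host-Guest
mode at load time. A hypothetical invocation, assuming the parameter
lands as shown:

	# default (pt_mode=0) keeps system-wide tracing on the host;
	# pt_mode=1 exposes PT to guests (Host-Guest mode)
	modprobe kvm-intel pt_mode=1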

Signed-off-by: Chao Peng 
Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/intel_pt.h  |  3 ++
 arch/x86/include/asm/msr-index.h |  1 +
 arch/x86/include/asm/vmx.h   |  8 +
 arch/x86/kvm/vmx.c   | 68 +---
 4 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 634f99b..4727584 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -5,6 +5,9 @@
 #define PT_CPUID_LEAVES		2
 #define PT_CPUID_REGS_NUM	4 /* number of registers (eax, ebx, ecx, edx) */
 
+#define PT_MODE_SYSTEM 0
+#define PT_MODE_HOST_GUEST 1
+
 enum pt_capabilities {
PT_CAP_max_subleaf = 0,
PT_CAP_cr3_filtering,
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 107818e3..f51579d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -805,6 +805,7 @@
 #define VMX_BASIC_INOUT		0x0040000000000000LLU
 
 /* MSR_IA32_VMX_MISC bits */
+#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
 #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
 #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
 /* AMD-V MSRs */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index ade0f15..b99710c 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -77,7 +77,9 @@
 #define SECONDARY_EXEC_ENCLS_EXITING		0x00008000
 #define SECONDARY_EXEC_RDSEED_EXITING		0x00010000
 #define SECONDARY_EXEC_ENABLE_PML		0x00020000
+#define SECONDARY_EXEC_PT_CONCEAL_VMX		0x00080000
 #define SECONDARY_EXEC_XSAVES			0x00100000
+#define SECONDARY_EXEC_PT_USE_GPA		0x01000000
 #define SECONDARY_EXEC_TSC_SCALING		0x02000000
 
 #define PIN_BASED_EXT_INTR_MASK			0x00000001
@@ -98,6 +100,8 @@
 #define VM_EXIT_LOAD_IA32_EFER			0x00200000
 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER	0x00400000
 #define VM_EXIT_CLEAR_BNDCFGS			0x00800000
+#define VM_EXIT_PT_CONCEAL_PIP			0x01000000
+#define VM_EXIT_CLEAR_IA32_RTIT_CTL		0x02000000
 
 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR	0x00036dff
 
@@ -109,6 +113,8 @@
 #define VM_ENTRY_LOAD_IA32_PAT			0x00004000
 #define VM_ENTRY_LOAD_IA32_EFER			0x00008000
 #define VM_ENTRY_LOAD_BNDCFGS			0x00010000
+#define VM_ENTRY_PT_CONCEAL_PIP			0x00020000
+#define VM_ENTRY_LOAD_IA32_RTIT_CTL		0x00040000
 
 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR	0x000011ff
 
@@ -240,6 +246,8 @@ enum vmcs_field {
GUEST_PDPTR3_HIGH   = 0x2811,
GUEST_BNDCFGS   = 0x2812,
GUEST_BNDCFGS_HIGH  = 0x2813,
+   GUEST_IA32_RTIT_CTL = 0x2814,
+   GUEST_IA32_RTIT_CTL_HIGH= 0x2815,
HOST_IA32_PAT   = 0x2c00,
HOST_IA32_PAT_HIGH  = 0x2c01,
HOST_IA32_EFER  = 0x2c02,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 641a65b..c4c4b76 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include <asm/intel_pt.h>
 
 #include "trace.h"
 #include "pmu.h"
@@ -190,6 +191,10 @@
 static unsigned int ple_window_max= KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 module_param(ple_window_max, uint, 0444);
 
+/* Default is SYSTEM mode. */
+static int __read_mostly pt_mode = PT_MODE_SYSTEM;
+module_param(pt_mode, int, S_IRUGO);
+
 extern const ulong vmx_return;
 extern const ulong vmx_early_consistency_check_return;
 
@@ -1955,6 +1960,20 @@ static bool vmx_umip_emulated(void)
SECONDARY_EXEC_DESC;
 }
 
+static inline bool cpu_has_vmx_intel_pt(void)
+{
+   u64 vmx_msr;
+
+   rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+   return !!(vmx_msr & MSR_IA32_VMX_MISC

[PATCH v13 12/12] KVM: x86: Disable Intel PT when VMXON in L1 guest

2018-10-24 Thread Luwei Kang
Currently, Intel Processor Trace does not support tracing in L1
guest VMX operation (IA32_VMX_MISC[bit 14] is 0). As mentioned in
the SDM, on this type of processor, execution of the VMXON
instruction clears IA32_RTIT_CTL.TraceEn, and any subsequent attempt
to write IA32_RTIT_CTL causes a general-protection exception (#GP).
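
Concretely, the guest-visible behavior being emulated, as a sketch
in guest-side pseudocode (the operand names are illustrative):

	vmxon(vmxon_region_pa);	/* architecturally clears RTIT_CTL.TraceEn */
	wrmsr(MSR_IA32_RTIT_CTL, RTIT_CTL_TRACEEN);	/* now raises #GP(0) */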

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed247dd..5001049 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4556,7 +4556,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_RTIT_CTL:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
-   vmx_rtit_ctl_check(vcpu, data))
+   vmx_rtit_ctl_check(vcpu, data) ||
+   vmx->nested.vmxon)
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN));
@@ -8760,6 +8761,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
if (ret)
return ret;
 
+   if (pt_mode == PT_MODE_HOST_GUEST) {
+   vmx->pt_desc.guest.ctl = 0;
+   pt_set_intercept_for_msr(vmx, 1);
+   }
+
return nested_vmx_succeed(vcpu);
 }
 
-- 
1.8.3.1



[PATCH v13 08/12] KVM: x86: Add Intel PT context switch for each vcpu

2018-10-24 Thread Luwei Kang
From: Chao Peng 

Load/store the Intel Processor Trace registers on context switch.
The IA32_RTIT_CTL MSR is loaded/stored automatically from the VMCS.
In Host-Guest mode, we need to load/restore the other PT MSRs only
when PT is enabled in the guest.

Signed-off-by: Chao Peng 
Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/intel_pt.h |  2 +
 arch/x86/kvm/vmx.c  | 94 +
 2 files changed, 96 insertions(+)

diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 4727584..eabbdbc 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -8,6 +8,8 @@
 #define PT_MODE_SYSTEM 0
 #define PT_MODE_HOST_GUEST 1
 
+#define RTIT_ADDR_RANGE 4
+
 enum pt_capabilities {
PT_CAP_max_subleaf = 0,
PT_CAP_cr3_filtering,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 692154c..d8480a6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -978,6 +978,24 @@ struct vmx_msrs {
 struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
 };
 
+struct pt_ctx {
+   u64 ctl;
+   u64 status;
+   u64 output_base;
+   u64 output_mask;
+   u64 cr3_match;
+   u64 addr_a[RTIT_ADDR_RANGE];
+   u64 addr_b[RTIT_ADDR_RANGE];
+};
+
+struct pt_desc {
+   u64 ctl_bitmask;
+   u32 addr_range;
+   u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+   struct pt_ctx host;
+   struct pt_ctx guest;
+};
+
 struct vcpu_vmx {
struct kvm_vcpu   vcpu;
unsigned long host_rsp;
@@ -1071,6 +1089,8 @@ struct vcpu_vmx {
u64 msr_ia32_feature_control;
u64 msr_ia32_feature_control_valid_bits;
u64 ept_pointer;
+
+   struct pt_desc pt_desc;
 };
 
 enum segment_cache_field {
@@ -2899,6 +2919,69 @@ static unsigned long segment_base(u16 selector)
 }
 #endif
 
+static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+   u32 i;
+
+   wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
+   wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+   wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+   wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+   for (i = 0; i < addr_range; i++) {
+   wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+   wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+   }
+}
+
+static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+   u32 i;
+
+   rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
+   rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+   rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+   rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+   for (i = 0; i < addr_range; i++) {
+   rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+   rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+   }
+}
+
+static void pt_guest_enter(struct vcpu_vmx *vmx)
+{
+   if (pt_mode == PT_MODE_SYSTEM)
+   return;
+
+   /* Save host state before VM entry */
+   rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+
+   /*
+* Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled
+* on VM entry when it has been disabled in guest before).
+*/
+   vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl);
+
+   if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+   wrmsrl(MSR_IA32_RTIT_CTL, 0);
   pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
   pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+   }
+}
+
+static void pt_guest_exit(struct vcpu_vmx *vmx)
+{
+   if (pt_mode == PT_MODE_SYSTEM)
+   return;
+
+   if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
   pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
   pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
+   }
+
+   /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
+   wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+}
+
 static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6749,6 +6832,13 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
if (cpu_has_vmx_encls_vmexit())
vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+
+   if (pt_mode == PT_MODE_HOST_GUEST) {
+   memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
+   /* Bits 6:0 are forced to 1; writes to them are ignored. */
+   vmx->pt_desc.guest.output_mask = 0x7F;
+   vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
+   }
 }
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -11260,6 +11350,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.pkru != vmx->host_pkru)
   


[PATCH v13 04/12] perf/x86/intel/pt: Add new bit definitions for PT MSRs

2018-10-24 Thread Luwei Kang
Add bit definitions for the Intel PT MSRs that cover trace output
directed to the memory subsystem, and for the count of packet
bytes that have been sent out.

These are required by the upcoming PT support in KVM guests
for MSR read/write emulation.
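
A short editorial sketch (not part of the patch) of how the new
definitions decode; "ctl" and "status" are raw IA32_RTIT_CTL and
IA32_RTIT_STATUS values:

	u64 ctl, status, bytes;

	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
	rdmsrl(MSR_IA32_RTIT_STATUS, status);
	if (ctl & RTIT_CTL_FABRIC_EN)	/* trace goes to the transport subsystem */
		bytes = (status & RTIT_STATUS_BYTECNT) >>
			RTIT_STATUS_BYTECNT_OFFSET;	/* packet bytes sent out */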

Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/msr-index.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d3a9eb9..107818e3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -126,6 +126,7 @@
 #define RTIT_CTL_USR   BIT(3)
 #define RTIT_CTL_PWR_EVT_ENBIT(4)
 #define RTIT_CTL_FUP_ON_PTWBIT(5)
+#define RTIT_CTL_FABRIC_EN BIT(6)
 #define RTIT_CTL_CR3EN BIT(7)
 #define RTIT_CTL_TOPA  BIT(8)
 #define RTIT_CTL_MTC_ENBIT(9)
@@ -154,6 +155,8 @@
 #define RTIT_STATUS_BUFFOVFBIT(3)
 #define RTIT_STATUS_ERROR  BIT(4)
 #define RTIT_STATUS_STOPPEDBIT(5)
+#define RTIT_STATUS_BYTECNT_OFFSET 32
+#define RTIT_STATUS_BYTECNT (0x1ffffull << RTIT_STATUS_BYTECNT_OFFSET)
 #define MSR_IA32_RTIT_ADDR0_A  0x0580
 #define MSR_IA32_RTIT_ADDR0_B  0x0581
 #define MSR_IA32_RTIT_ADDR1_A  0x0582
-- 
1.8.3.1



[PATCH v13 03/12] perf/x86/intel/pt: Introduce intel_pt_validate_cap()

2018-10-24 Thread Luwei Kang
intel_pt_validate_hw_cap() validates whether a given PT capability is
supported by the hardware. It checks the PT capability array which
reflects the capabilities of the hardware on which the code is executed.

For setting up PT for KVM guests this is not correct as the capability
array for the guest can be different from the host array.

Provide a new function to check against a given capability array.
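
An editorial sketch (not part of the patch) of the intended split;
vmx->pt_desc.caps is the guest capability array that a later patch in
this series introduces:

	/* against the host's capabilities */
	u32 host_ranges  = intel_pt_validate_hw_cap(PT_CAP_num_address_ranges);
	/* against the guest's capabilities */
	u32 guest_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
						 PT_CAP_num_address_ranges);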

Acked-by: Song Liu 
Signed-off-by: Luwei Kang 
---
 arch/x86/events/intel/pt.c  | 12 +---
 arch/x86/include/asm/intel_pt.h |  2 ++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 309bb1d..53e481a 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -75,14 +75,20 @@
PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
 };
 
-u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
+u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
 {
-   struct pt_cap_desc *cd = &pt_caps[cap];
-   u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
+   struct pt_cap_desc *cd = &pt_caps[capability];
+   u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
unsigned int shift = __ffs(cd->mask);
 
return (c & cd->mask) >> shift;
 }
+EXPORT_SYMBOL_GPL(intel_pt_validate_cap);
+
+u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
+{
+   return intel_pt_validate_cap(pt_pmu.caps, cap);
+}
 EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);
 
 static ssize_t pt_cap_show(struct device *cdev,
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index fa4b4fd..00f4afb 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -26,9 +26,11 @@ enum pt_capabilities {
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
 void cpu_emergency_stop_pt(void);
 extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap);
+extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap);
 #else
 static inline void cpu_emergency_stop_pt(void) {}
 static inline u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; }
+static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; }
 #endif
 
 #endif /* _ASM_X86_INTEL_PT_H */
-- 
1.8.3.1



[PATCH v13 01/12] perf/x86/intel/pt: Move Intel PT MSRs bit defines to global header

2018-10-24 Thread Luwei Kang
From: Chao Peng 

The Intel Processor Trace (PT) MSR bit defines are in a private
header. The upcoming support for PT virtualization requires these defines
to be accessible from KVM code.

Move them to the global MSR header file.

Reviewed-by: Thomas Gleixner 
Signed-off-by: Chao Peng 
Signed-off-by: Luwei Kang 
---
 arch/x86/events/intel/pt.h   | 37 -
 arch/x86/include/asm/msr-index.h | 33 +
 2 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 0eb41d0..0050ca1 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -20,43 +20,6 @@
 #define __INTEL_PT_H__
 
 /*
- * PT MSR bit definitions
- */
-#define RTIT_CTL_TRACEEN   BIT(0)
-#define RTIT_CTL_CYCLEACC  BIT(1)
-#define RTIT_CTL_OSBIT(2)
-#define RTIT_CTL_USR   BIT(3)
-#define RTIT_CTL_PWR_EVT_ENBIT(4)
-#define RTIT_CTL_FUP_ON_PTWBIT(5)
-#define RTIT_CTL_CR3EN BIT(7)
-#define RTIT_CTL_TOPA  BIT(8)
-#define RTIT_CTL_MTC_ENBIT(9)
-#define RTIT_CTL_TSC_ENBIT(10)
-#define RTIT_CTL_DISRETC   BIT(11)
-#define RTIT_CTL_PTW_ENBIT(12)
-#define RTIT_CTL_BRANCH_EN BIT(13)
-#define RTIT_CTL_MTC_RANGE_OFFSET  14
-#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
-#define RTIT_CTL_CYC_THRESH_OFFSET 19
-#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
-#define RTIT_CTL_PSB_FREQ_OFFSET   24
-#define RTIT_CTL_PSB_FREQ  (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
-#define RTIT_CTL_ADDR0_OFFSET  32
-#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET)
-#define RTIT_CTL_ADDR1_OFFSET  36
-#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET)
-#define RTIT_CTL_ADDR2_OFFSET  40
-#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET)
-#define RTIT_CTL_ADDR3_OFFSET  44
-#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET)
-#define RTIT_STATUS_FILTEREN   BIT(0)
-#define RTIT_STATUS_CONTEXTEN  BIT(1)
-#define RTIT_STATUS_TRIGGEREN  BIT(2)
-#define RTIT_STATUS_BUFFOVFBIT(3)
-#define RTIT_STATUS_ERROR  BIT(4)
-#define RTIT_STATUS_STOPPEDBIT(5)
-
-/*
  * Single-entry ToPA: when this close to region boundary, switch
  * buffers to avoid losing data.
  */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4731f0c..d3a9eb9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -120,7 +120,40 @@
 #define MSR_PEBS_LD_LAT_THRESHOLD  0x03f6
 
 #define MSR_IA32_RTIT_CTL  0x0570
+#define RTIT_CTL_TRACEEN   BIT(0)
+#define RTIT_CTL_CYCLEACC  BIT(1)
+#define RTIT_CTL_OSBIT(2)
+#define RTIT_CTL_USR   BIT(3)
+#define RTIT_CTL_PWR_EVT_ENBIT(4)
+#define RTIT_CTL_FUP_ON_PTWBIT(5)
+#define RTIT_CTL_CR3EN BIT(7)
+#define RTIT_CTL_TOPA  BIT(8)
+#define RTIT_CTL_MTC_ENBIT(9)
+#define RTIT_CTL_TSC_ENBIT(10)
+#define RTIT_CTL_DISRETC   BIT(11)
+#define RTIT_CTL_PTW_ENBIT(12)
+#define RTIT_CTL_BRANCH_EN BIT(13)
+#define RTIT_CTL_MTC_RANGE_OFFSET  14
+#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
+#define RTIT_CTL_CYC_THRESH_OFFSET 19
+#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
+#define RTIT_CTL_PSB_FREQ_OFFSET   24
+#define RTIT_CTL_PSB_FREQ  (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
+#define RTIT_CTL_ADDR0_OFFSET  32
+#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET)
+#define RTIT_CTL_ADDR1_OFFSET  36
+#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET)
+#define RTIT_CTL_ADDR2_OFFSET  40
+#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET)
+#define RTIT_CTL_ADDR3_OFFSET  44
+#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET)
 #define MSR_IA32_RTIT_STATUS   0x0571
+#define RTIT_STATUS_FILTEREN   BIT(0)
+#define RTIT_STATUS_CONTEXTEN  BIT(1)
+#define RTIT_STATUS_TRIGGEREN  BIT(2)
+#define RTIT_STATUS_BUFFOVFBIT(3)
+#define RTIT_STATUS_ERROR  BIT(4)
+#define RTIT_STATUS_STOPPEDBIT(5)
 #define MSR_IA32_RTIT_ADDR0_A  0x0580
 #define MSR_IA32_RTIT_ADDR0_B  0x0581
 #define MSR_IA32_RTIT_ADDR1_A  0x0582
-- 
1.8.3.1



[PATCH v13 05/12] perf/x86/intel/pt: add new capability for Intel PT

2018-10-24 Thread Luwei Kang
This adds support for the "output to Trace Transport subsystem"
capability of Intel PT. It means that PT can output its trace to
an MMIO address range rather than a system memory buffer.
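
A one-line editorial illustration (not part of the patch) of querying
the new capability with the helper from patch 3:

	if (intel_pt_validate_hw_cap(PT_CAP_output_subsys))
		pr_info("PT can output to the Trace Transport subsystem\n");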

Acked-by: Song Liu 
Signed-off-by: Luwei Kang 
---
 arch/x86/events/intel/pt.c  | 1 +
 arch/x86/include/asm/intel_pt.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 53e481a..9597ea6 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -68,6 +68,7 @@
PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)),
PT_CAP(topa_multiple_entries,   0, CPUID_ECX, BIT(1)),
PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
+   PT_CAP(output_subsys,   0, CPUID_ECX, BIT(3)),
PT_CAP(payloads_lip,0, CPUID_ECX, BIT(31)),
PT_CAP(num_address_ranges,  1, CPUID_EAX, 0x3),
PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000),
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 00f4afb..634f99b 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -16,6 +16,7 @@ enum pt_capabilities {
PT_CAP_topa_output,
PT_CAP_topa_multiple_entries,
PT_CAP_single_range_output,
+   PT_CAP_output_subsys,
PT_CAP_payloads_lip,
PT_CAP_num_address_ranges,
PT_CAP_mtc_periods,
-- 
1.8.3.1



[PATCH v13 02/12] perf/x86/intel/pt: Export pt_cap_get()

2018-10-24 Thread Luwei Kang
From: Chao Peng 

pt_cap_get() is required by the upcoming PT support in KVM guests.

Export it and move the capabilities enum to a global header.

For a global function, the "pt_*" prefix is already taken by ptrace
and other things, so it makes sense to use "intel_pt_*" as a prefix.

Acked-by: Song Liu 
Signed-off-by: Chao Peng 
Signed-off-by: Luwei Kang 
---
 arch/x86/events/intel/pt.c  | 49 ++---
 arch/x86/events/intel/pt.h  | 21 --
 arch/x86/include/asm/intel_pt.h | 23 +++
 3 files changed, 49 insertions(+), 44 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 8d016ce..309bb1d 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -75,7 +75,7 @@
PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
 };
 
-static u32 pt_cap_get(enum pt_capabilities cap)
+u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
 {
struct pt_cap_desc *cd = &pt_caps[cap];
u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
@@ -83,6 +83,7 @@ static u32 pt_cap_get(enum pt_capabilities cap)
 
return (c & cd->mask) >> shift;
 }
+EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);
 
 static ssize_t pt_cap_show(struct device *cdev,
   struct device_attribute *attr,
@@ -92,7 +93,7 @@ static ssize_t pt_cap_show(struct device *cdev,
container_of(attr, struct dev_ext_attribute, attr);
enum pt_capabilities cap = (long)ea->var;
 
-   return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+   return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap));
 }
 
 static struct attribute_group pt_cap_group = {
@@ -310,16 +311,16 @@ static bool pt_event_valid(struct perf_event *event)
return false;
 
if (config & RTIT_CTL_CYC_PSB) {
-   if (!pt_cap_get(PT_CAP_psb_cyc))
+   if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc))
return false;
 
-   allowed = pt_cap_get(PT_CAP_psb_periods);
+   allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods);
requested = (config & RTIT_CTL_PSB_FREQ) >>
RTIT_CTL_PSB_FREQ_OFFSET;
if (requested && (!(allowed & BIT(requested
return false;
 
-   allowed = pt_cap_get(PT_CAP_cycle_thresholds);
+   allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds);
requested = (config & RTIT_CTL_CYC_THRESH) >>
RTIT_CTL_CYC_THRESH_OFFSET;
if (requested && (!(allowed & BIT(requested
@@ -334,10 +335,10 @@ static bool pt_event_valid(struct perf_event *event)
 * Spec says that setting mtc period bits while mtc bit in
 * CPUID is 0 will #GP, so better safe than sorry.
 */
-   if (!pt_cap_get(PT_CAP_mtc))
+   if (!intel_pt_validate_hw_cap(PT_CAP_mtc))
return false;
 
-   allowed = pt_cap_get(PT_CAP_mtc_periods);
+   allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods);
if (!allowed)
return false;
 
@@ -349,11 +350,11 @@ static bool pt_event_valid(struct perf_event *event)
}
 
if (config & RTIT_CTL_PWR_EVT_EN &&
-   !pt_cap_get(PT_CAP_power_event_trace))
+   !intel_pt_validate_hw_cap(PT_CAP_power_event_trace))
return false;
 
if (config & RTIT_CTL_PTW) {
-   if (!pt_cap_get(PT_CAP_ptwrite))
+   if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite))
return false;
 
/* FUPonPTW without PTW doesn't make sense */
@@ -598,7 +599,7 @@ static struct topa *topa_alloc(int cpu, gfp_t gfp)
 * In case of singe-entry ToPA, always put the self-referencing END
 * link as the 2nd entry in the table
 */
-   if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+   if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
TOPA_ENTRY(topa, 1)->end = 1;
}
@@ -638,7 +639,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
topa->offset = last->offset + last->size;
buf->last = topa;
 
-   if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+   if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
return;
 
BUG_ON(last->last != TENTS_PER_PAGE - 1);
@@ -654,7 +655,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
 static bool topa_table_full(struct topa *topa)
 {
/* single-entry ToPA is a special case */
-  


[PATCH v13 00/12] Intel Processor Trace virtualization enabling

2018-10-24 Thread Luwei Kang
From V12:
 - Refine the title and description of patches 1~3. -- Thomas Gleixner
 - Rename the function that validates the capabilities of Intel PT. -- Thomas Gleixner
 - Add more description of the Intel PT work modes. -- Alexander Shishkin

From V11:
 - In patch 3, the argument names caps vs. cap were confusing. Spell the second one out. -- Thomas Gleixner

From V10 (this version doesn't have code changes):
 - move patch 5 of version 9 to patch 3 (reorder patch 5) -- Alexander Shishkin
 - refine the patch description of patch 5 (add new capability for Intel PT) -- Alexander Shishkin
 - CC all the maintainers, reviewers and submitters in each patch of this patch set -- Alexander Shishkin

From V9:
 - remove the redundant initialization of "ctl_bitmask" in patch 9;
 - make some changes to the patch descriptions.

From V8:
 - move the macro definition MSR_IA32_RTIT_ADDR_RANGE from msr-index.h to intel_pt.h;
 - initialize the RTIT_CTL bitmask to ~0ULL.

From V7:
 - remove host-only mode since it can be emulated by perf code;
 - merge patches 8 and 9 so code and data are in the same patch;
 - rename __pt_cap_get() to pt_cap_decode();
 - other minor changes.

From V6:
 - split patches 1~2 into four separate patches (those patches did two things) and add more descriptions.

From V5:
 - rename the function from pt_cap_get_ex() to __pt_cap_get();
 - replace most uses of vmx_pt_supported() with "pt_mode == PT_MODE_HOST_GUEST" (or !=).

From V4:
 - add a data check when setting the value of MSR_IA32_RTIT_CTL;
 - invoke the new interface to set MSR read/write interception after the "MSR bitmap per-vcpu" patches.

From V3:
 - change the default mode to SYSTEM mode;
 - add a new patch to move PT out of scattered features;
 - add a new function kvm_get_pt_addr_cnt() to get the number of address ranges;
 - add a new function vmx_set_rtit_ctl() to set the value of guest RTIT_CTL, GUEST_IA32_RTIT_CTL and the MSR intercepts.

From v2:
 - replace *_PT_SUPPRESS_PIP with *_PT_CONCEAL_PIP;
 - clear SECONDARY_EXEC_PT_USE_GPA, VM_EXIT_CLEAR_IA32_RTIT_CTL and VM_ENTRY_LOAD_IA32_RTIT_CTL in SYSTEM mode. These bits must be all set or all clear;
 - move processor tracing out of scattered features;
 - add a new function to enable/disable MSR read/write interception;
 - add read/write emulation of all Intel PT MSRs and disable interception when PT is enabled in the guest;
 - disable Intel PT and re-enable MSR interception on L1 guest VMXON;
 - performance optimization.
   In host-only mode we just need to save host RTIT_CTL before VM-entry and restore it after VM-exit;
   in HOST_GUEST mode we need to save and restore all MSRs, and only when PT has been enabled in the guest.
 - use XSAVES/XRSTORS to implement the context switch.
   Not implemented in this version and still being debugged; this will be a separate patch.

From v1:
 - remove guest-only mode because guest-only mode can be covered by host-guest mode;
 - always set "use GPA for processor tracing" in secondary execution control if possible;
 - trap RTIT_CTL read/write. Forbid writing this MSR when VMXON has been executed in the L1 hypervisor.

Chao Peng (7):
  perf/x86/intel/pt: Move Intel PT MSRs bit defines to global header
  perf/x86/intel/pt: Export pt_cap_get()
  KVM: x86: Add Intel PT virtualization work mode
  KVM: x86: Add Intel Processor Trace cpuid emulation
  KVM: x86: Add Intel PT context switch for each vcpu
  KVM: x86: Implement Intel PT MSRs read/write emulation
  KVM: x86: Set intercept for Intel PT MSRs read/write

Luwei Kang (5):
  perf/x86/intel/pt: Introduce intel_pt_validate_cap()
  perf/x86/intel/pt: Add new bit definitions for PT MSRs
  perf/x86/intel/pt: add new capability for Intel PT
  KVM: x86: Introduce a function to initialize the PT configuration
  KVM: x86: Disable Intel PT when VMXON in L1 guest

 arch/x86/events/intel/pt.c   |  60 +++---
 arch/x86/events/intel/pt.h   |  58 -
 arch/x86/include/asm/intel_pt.h  |  39 
 arch/x86/include/asm/kvm_host.h  |   1 +
 arch/x86/include/asm/msr-index.h |  37 
 arch/x86/include/asm/vmx.h   |   8 +
 arch/x86/kvm/cpuid.c |  22 +-
 arch/x86/kvm/svm.c   |   6 +
 arch/x86/kvm/vmx.c   | 446 ++-
 arch/x86/kvm/x86.c   |  33 ++-
 10 files changed, 620 insertions(+), 90 deletions(-)

-- 
1.8.3.1



[PATCH v9 02/12] perf/x86/intel/pt: Change pt_cap_get() to a public function

2018-05-21 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

Change pt_cap_get() to a public function so that KVM
can use it to check whether a specific feature is
supported by the hardware.
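
An editorial sketch (not part of the patch) of a KVM-side caller that
this change enables; the gating condition is illustrative only:

	if (!pt_cap_get(PT_CAP_topa_output))
		return false;	/* e.g. don't expose PT to the guest */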

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/events/intel/pt.c  |  3 ++-
 arch/x86/events/intel/pt.h  | 21 -
 arch/x86/include/asm/intel_pt.h | 23 +++
 3 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 3b99394..c80e2f5 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -75,7 +75,7 @@
PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
 };
 
-static u32 pt_cap_get(enum pt_capabilities cap)
+u32 pt_cap_get(enum pt_capabilities cap)
 {
struct pt_cap_desc *cd = &pt_caps[cap];
u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
@@ -83,6 +83,7 @@ static u32 pt_cap_get(enum pt_capabilities cap)
 
return (c & cd->mask) >> shift;
 }
+EXPORT_SYMBOL_GPL(pt_cap_get);
 
 static ssize_t pt_cap_show(struct device *cdev,
   struct device_attribute *attr,
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 0050ca1..269e15a 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -45,30 +45,9 @@ struct topa_entry {
u64 rsvd4   : 16;
 };
 
-#define PT_CPUID_LEAVES 2
-#define PT_CPUID_REGS_NUM  4 /* number of registers (eax, ebx, ecx, edx) */
-
 /* TSC to Core Crystal Clock Ratio */
 #define CPUID_TSC_LEAF 0x15
 
-enum pt_capabilities {
-   PT_CAP_max_subleaf = 0,
-   PT_CAP_cr3_filtering,
-   PT_CAP_psb_cyc,
-   PT_CAP_ip_filtering,
-   PT_CAP_mtc,
-   PT_CAP_ptwrite,
-   PT_CAP_power_event_trace,
-   PT_CAP_topa_output,
-   PT_CAP_topa_multiple_entries,
-   PT_CAP_single_range_output,
-   PT_CAP_payloads_lip,
-   PT_CAP_num_address_ranges,
-   PT_CAP_mtc_periods,
-   PT_CAP_cycle_thresholds,
-   PT_CAP_psb_periods,
-};
-
 struct pt_pmu {
struct pmu  pmu;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index b523f51..4270421 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -2,10 +2,33 @@
 #ifndef _ASM_X86_INTEL_PT_H
 #define _ASM_X86_INTEL_PT_H
 
+#define PT_CPUID_LEAVES 2
+#define PT_CPUID_REGS_NUM  4 /* number of registers (eax, ebx, ecx, edx) */
+
+enum pt_capabilities {
+   PT_CAP_max_subleaf = 0,
+   PT_CAP_cr3_filtering,
+   PT_CAP_psb_cyc,
+   PT_CAP_ip_filtering,
+   PT_CAP_mtc,
+   PT_CAP_ptwrite,
+   PT_CAP_power_event_trace,
+   PT_CAP_topa_output,
+   PT_CAP_topa_multiple_entries,
+   PT_CAP_single_range_output,
+   PT_CAP_payloads_lip,
+   PT_CAP_num_address_ranges,
+   PT_CAP_mtc_periods,
+   PT_CAP_cycle_thresholds,
+   PT_CAP_psb_periods,
+};
+
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
 void cpu_emergency_stop_pt(void);
+extern u32 pt_cap_get(enum pt_capabilities cap);
 #else
 static inline void cpu_emergency_stop_pt(void) {}
+static inline u32 pt_cap_get(enum pt_capabilities cap) { return 0; }
 #endif
 
 #endif /* _ASM_X86_INTEL_PT_H */
-- 
1.8.3.1



[PATCH v9 04/12] perf/x86/intel/pt: add new capability for Intel PT

2018-05-21 Thread Luwei Kang
CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 1 indicates support for
output to the Trace Transport subsystem.
MSR IA32_RTIT_CTL.FabricEn[bit 6] is reserved if
CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0.
This is used to emulate IA32_RTIT_CTL MSR reads/writes
in KVM. A KVM guest write to IA32_RTIT_CTL will trap to
root mode, and a #GP is injected into the guest if it sets
IA32_RTIT_CTL.FabricEn while
CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0.
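
An editorial sketch (not part of the patch) of the check this enables in
the WRMSR emulation path; RTIT_CTL_FABRIC_EN names bit 6 as in the later
v13 series, and the surrounding function is hypothetical:

	if ((data & RTIT_CTL_FABRIC_EN) &&
	    !pt_cap_get(PT_CAP_output_subsys))
		return 1;	/* the caller injects #GP into the guest */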

Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/events/intel/pt.c  | 1 +
 arch/x86/include/asm/intel_pt.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index c80e2f5..f65f97a 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -68,6 +68,7 @@
PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)),
PT_CAP(topa_multiple_entries,   0, CPUID_ECX, BIT(1)),
PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
+   PT_CAP(output_subsys,   0, CPUID_ECX, BIT(3)),
PT_CAP(payloads_lip,0, CPUID_ECX, BIT(31)),
PT_CAP(num_address_ranges,  1, CPUID_EAX, 0x3),
PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000),
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 4270421..2de4db0 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -16,6 +16,7 @@ enum pt_capabilities {
PT_CAP_topa_output,
PT_CAP_topa_multiple_entries,
PT_CAP_single_range_output,
+   PT_CAP_output_subsys,
PT_CAP_payloads_lip,
PT_CAP_num_address_ranges,
PT_CAP_mtc_periods,
-- 
1.8.3.1



[PATCH v9 10/12] KVM: x86: Implement Intel Processor Trace MSRs read/write emulation

2018-05-21 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

This patch implements Intel Processor Trace MSR read/write
emulation.
Intel PT MSR reads/writes need to be emulated when the Intel PT
MSRs are intercepted in the guest and during live migration.

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/include/asm/intel_pt.h |   8 ++
 arch/x86/kvm/vmx.c  | 172 
 arch/x86/kvm/x86.c  |  33 +++-
 3 files changed, 212 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 70f4139..3a25dc1 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -10,6 +10,14 @@
 
 #define RTIT_ADDR_RANGE 4
 
+#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
+   RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
+   RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
+   RTIT_STATUS_BYTECNT))
+
+#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
+   (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
+
 enum pt_capabilities {
PT_CAP_max_subleaf = 0,
PT_CAP_cr3_filtering,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 952ddf4..770cb7c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2809,6 +2809,77 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
 }
 
+static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   unsigned long value;
+
+   /*
+* Any MSR write that attempts to change bits marked reserved will
+* cause a #GP fault.
+*/
+   if (data & vmx->pt_desc.ctl_bitmask)
+   return 1;
+
+   /*
+* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
+* result in a #GP unless the same write also clears TraceEn.
+*/
+   if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
+   ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
+   return 1;
+
+   /*
+* WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
+* and FabricEn would cause #GP, if
+* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
+*/
+   if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
+   !(data & RTIT_CTL_FABRIC_EN) &&
+   !pt_cap_decode(vmx->pt_desc.caps, PT_CAP_single_range_output))
+   return 1;
+
+   /*
+* Check the MTCFreq, CycThresh and PSBFreq encodings: any MSR write
+* that uses an encoding marked reserved will cause a #GP fault.
+*/
+   value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc_periods);
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc) &&
+   !test_bit((data & RTIT_CTL_MTC_RANGE) >>
+   RTIT_CTL_MTC_RANGE_OFFSET, &value))
+   return 1;
+   value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cycle_thresholds);
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
+   !test_bit((data & RTIT_CTL_CYC_THRESH) >>
+   RTIT_CTL_CYC_THRESH_OFFSET, &value))
+   return 1;
+   value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_periods);
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
+   !test_bit((data & RTIT_CTL_PSB_FREQ) >>
+   RTIT_CTL_PSB_FREQ_OFFSET, &value))
+   return 1;
+
+   /*
+* A write that sets a reserved ADDRx_CFG field or an encoding
+* greater than 2 will cause a #GP fault.
+*/
+   value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
+   return 1;
+   value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
+   return 1;
+   value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
+   return 1;
+   value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
+   return 1;
+
+   return 0;
+}
+
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
unsigned long rip;
@@ -3625,6 +3696,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
+   u32 index;
 
	switch (msr_info->index) {
 #ifdef CONFIG_X86_64
@@ -369
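
The ctl_bitmask consulted at the top of vmx_rtit_ctl_check() is built
by update_intel_pt_cfg() in patch 09/12. A worked example with
hypothetical capabilities (no PT features beyond the baseline):

	/* Only TraceEn/OS/USR/TSC_EN/DisRETC are writable, so every
	 * other bit is reserved in ctl_bitmask. */
	u64 ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | RTIT_CTL_USR |
			    RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
	u64 data = RTIT_CTL_TRACEEN | RTIT_CTL_FABRIC_EN;

	if (data & ctl_bitmask)	/* FabricEn is reserved here */
		return 1;	/* -> #GP */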


[PATCH v9 06/12] KVM: x86: Add Intel Processor Trace virtualization mode

2018-05-21 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

Intel PT virtualization can work in one of two possible modes:
a. system-wide: trace both host and guest and output to the host buffer;
b. host-guest: trace host and guest simultaneously, each outputting to
   its respective buffer.

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/include/asm/intel_pt.h  |  3 ++
 arch/x86/include/asm/msr-index.h |  1 +
 arch/x86/include/asm/vmx.h   |  8 +
 arch/x86/kvm/vmx.c   | 68 +---
 4 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 9c71453..5748205 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -5,6 +5,9 @@
 #define PT_CPUID_LEAVES 2
 #define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */
 
+#define PT_MODE_SYSTEM 0
+#define PT_MODE_HOST_GUEST 1
+
 enum pt_capabilities {
PT_CAP_max_subleaf = 0,
PT_CAP_cr3_filtering,
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6ae2462..6b14325 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -789,6 +789,7 @@
 #define VMX_BASIC_INOUT 0x0040000000000000LLU
 
 /* MSR_IA32_VMX_MISC bits */
+#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
 #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
 #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
 /* AMD-V MSRs */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 5db8b0b..5936d72 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -76,7 +76,9 @@
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
 #define SECONDARY_EXEC_RDSEED_EXITING           0x00010000
 #define SECONDARY_EXEC_ENABLE_PML               0x00020000
+#define SECONDARY_EXEC_PT_CONCEAL_VMX           0x00080000
 #define SECONDARY_EXEC_XSAVES                   0x00100000
+#define SECONDARY_EXEC_PT_USE_GPA               0x01000000
 #define SECONDARY_EXEC_TSC_SCALING              0x02000000
 
 #define PIN_BASED_EXT_INTR_MASK 0x0001
@@ -97,6 +99,8 @@
 #define VM_EXIT_LOAD_IA32_EFER                  0x00200000
 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER       0x00400000
 #define VM_EXIT_CLEAR_BNDCFGS                   0x00800000
+#define VM_EXIT_PT_CONCEAL_PIP                  0x01000000
+#define VM_EXIT_CLEAR_IA32_RTIT_CTL             0x02000000
 
 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR  0x00036dff
 
@@ -108,6 +112,8 @@
 #define VM_ENTRY_LOAD_IA32_PAT                  0x00004000
 #define VM_ENTRY_LOAD_IA32_EFER                 0x00008000
 #define VM_ENTRY_LOAD_BNDCFGS                   0x00010000
+#define VM_ENTRY_PT_CONCEAL_PIP                 0x00020000
+#define VM_ENTRY_LOAD_IA32_RTIT_CTL             0x00040000
 
 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x11ff
 
@@ -234,6 +240,8 @@ enum vmcs_field {
GUEST_PDPTR3_HIGH   = 0x2811,
GUEST_BNDCFGS   = 0x2812,
GUEST_BNDCFGS_HIGH  = 0x2813,
+   GUEST_IA32_RTIT_CTL = 0x2814,
+   GUEST_IA32_RTIT_CTL_HIGH= 0x2815,
HOST_IA32_PAT   = 0x2c00,
HOST_IA32_PAT_HIGH  = 0x2c01,
HOST_IA32_EFER  = 0x2c02,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ea09813..bb96396 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -53,6 +53,7 @@
 #include 
 #include 
 #include 
+#include <asm/intel_pt.h>
 
 #include "trace.h"
 #include "pmu.h"
@@ -186,6 +187,10 @@
 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 module_param(ple_window_max, uint, 0444);
 
+/* Default is SYSTEM mode. */
+static int __read_mostly pt_mode = PT_MODE_SYSTEM;
+module_param(pt_mode, int, S_IRUGO);
+
 extern const ulong vmx_return;
 
 struct kvm_vmx {
@@ -1512,6 +1517,20 @@ static bool vmx_umip_emulated(void)
SECONDARY_EXEC_DESC;
 }
 
+static inline bool cpu_has_vmx_intel_pt(void)
+{
+   u64 vmx_msr;
+
+   rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+   return !!(vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT);
+}
+
+static inline bool cpu_has_vmx_pt_use_gpa(void)
+{
+   return !!(vmcs_config.cpu_based_2nd_exec_ctrl &
+   SECONDARY_EXEC_PT_USE_GPA);
+}
+
 static inline bool report_flexpriority(void)
 {
return flexpriority_enabled;
@@ -4026,6 +4045,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_TSC_SCALING |
+   SECONDARY_EXEC_PT_USE_GPA |
+   SECONDARY_EXEC_PT_CONCEAL_VMX |
SECONDARY_EXEC_ENABLE_VMFUNC;
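
Elsewhere the full patch validates the chosen mode at load time; a
sketch of that gate, assuming host-guest mode requires both the
IA32_VMX_MISC capability and the PT_USE_GPA control:

	/* Fall back to system-wide tracing when the CPU cannot
	 * virtualize PT for a guest. */
	if (pt_mode == PT_MODE_HOST_GUEST &&
	    !(cpu_has_vmx_intel_pt() && cpu_has_vmx_pt_use_gpa()))
		pt_mode = PT_MODE_SYSTEM;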

[PATCH v9 11/12] KVM: x86: Set intercept for Intel PT MSRs read/write

2018-05-21 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

Disable interception of the Intel PT MSRs only when Intel PT
is enabled in the guest. MSR_IA32_RTIT_CTL, however, will
always be intercepted.

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/kvm/vmx.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 770cb7c..a09157c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -948,6 +948,7 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
 static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
  u32 msr, int type);
+static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -3999,6 +4000,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx_rtit_ctl_check(vcpu, data))
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
+   pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN));
vmx->pt_desc.guest.ctl = data;
break;
case MSR_IA32_RTIT_STATUS:
@@ -5820,6 +5822,27 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
vmx->msr_bitmap_mode = mode;
 }
 
+static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag)
+{
+   unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+   u32 i;
+
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
+   MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
+   MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
+   MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
+   MSR_TYPE_RW, flag);
+   for (i = 0; i < vmx->pt_desc.addr_range; i++) {
+   vmx_set_intercept_for_msr(msr_bitmap,
+   MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap,
+   MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
+   }
+}
+
 static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
 {
return enable_apicv;
-- 
1.8.3.1
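
Taken together with patch 10/12, the call site in vmx_set_msr() reads:
once the guest sets TraceEn, the remaining PT MSRs are passed through
to hardware; clearing TraceEn re-arms the intercepts. RTIT_CTL itself
always traps so every write can be validated and the bitmap updated:

	/* flag == true installs the intercept; false grants the guest
	 * direct access to the other PT MSRs. */
	pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN));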




[PATCH v9 12/12] KVM: x86: Disable Intel Processor Trace when VMXON in L1 guest

2018-05-21 Thread Luwei Kang
Currently, Intel Processor Trace does not support tracing in L1
guest VMX operation (IA32_VMX_MISC[bit 14] is 0). As mentioned in
the SDM, on this type of processor, execution of the VMXON
instruction clears IA32_RTIT_CTL.TraceEn, and any attempt to write
IA32_RTIT_CTL causes a general-protection exception (#GP).

Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/kvm/vmx.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a09157c..093c1f7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3997,7 +3997,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_RTIT_CTL:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
-   vmx_rtit_ctl_check(vcpu, data))
+   vmx_rtit_ctl_check(vcpu, data) ||
+   vmx->nested.vmxon)
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN));
@@ -8090,6 +8091,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
if (ret)
return ret;
 
+   if (pt_mode == PT_MODE_HOST_GUEST) {
+   vmx->pt_desc.guest.ctl = 0;
+   pt_set_intercept_for_msr(vmx, 1);
+   }
+
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
 }
-- 
1.8.3.1
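
A compact restatement of the resulting WRMSR policy (a sketch that
mirrors the SDM rule for processors with IA32_VMX_MISC[bit 14] = 0):

	/* Writes to IA32_RTIT_CTL while the L1 guest is in VMX
	 * operation fault; VMXON itself zeroes the guest's RTIT_CTL,
	 * clearing TraceEn. */
	if (pt_mode != PT_MODE_HOST_GUEST ||
	    vmx->nested.vmxon ||
	    vmx_rtit_ctl_check(vcpu, data))
		return 1;	/* injected as #GP */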




[PATCH v9 08/12] KVM: x86: Add Intel Processor Trace context switch for each vcpu

2018-05-21 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

Load/store the Intel Processor Trace registers on context switch.
MSR IA32_RTIT_CTL is loaded/stored automatically via the VMCS.
In HOST_GUEST mode, we need to load/restore the remaining PT MSRs
only when PT is enabled in the guest.

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/include/asm/intel_pt.h |  2 +
 arch/x86/kvm/vmx.c  | 94 +
 2 files changed, 96 insertions(+)

diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 5748205..70f4139 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -8,6 +8,8 @@
 #define PT_MODE_SYSTEM 0
 #define PT_MODE_HOST_GUEST 1
 
+#define RTIT_ADDR_RANGE 4
+
 enum pt_capabilities {
PT_CAP_max_subleaf = 0,
PT_CAP_cr3_filtering,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 24aded4..11fb90a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -597,6 +597,24 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
	(unsigned long *)&pi_desc->control);
 }
 
+struct pt_ctx {
+   u64 ctl;
+   u64 status;
+   u64 output_base;
+   u64 output_mask;
+   u64 cr3_match;
+   u64 addr_a[RTIT_ADDR_RANGE];
+   u64 addr_b[RTIT_ADDR_RANGE];
+};
+
+struct pt_desc {
+   u64 ctl_bitmask;
+   u32 addr_range;
+   u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+   struct pt_ctx host;
+   struct pt_ctx guest;
+};
+
 struct vcpu_vmx {
struct kvm_vcpu   vcpu;
unsigned long host_rsp;
@@ -693,6 +711,8 @@ struct vcpu_vmx {
 */
u64 msr_ia32_feature_control;
u64 msr_ia32_feature_control_valid_bits;
+
+   struct pt_desc pt_desc;
 };
 
 enum segment_cache_field {
@@ -2391,6 +2411,69 @@ static unsigned long segment_base(u16 selector)
 }
 #endif
 
+static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+   u32 i;
+
+   wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
+   wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+   wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+   wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+   for (i = 0; i < addr_range; i++) {
+   wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+   wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+   }
+}
+
+static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+   u32 i;
+
+   rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
+   rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+   rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+   rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+   for (i = 0; i < addr_range; i++) {
+   rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+   rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+   }
+}
+
+static void pt_guest_enter(struct vcpu_vmx *vmx)
+{
+   if (pt_mode == PT_MODE_SYSTEM)
+   return;
+
+   /* Save host state before VM entry */
+   rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+
+   /*
+* Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled
+* on VM entry when it has been disabled in guest before).
+*/
+   vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl);
+
+   if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+   wrmsrl(MSR_IA32_RTIT_CTL, 0);
+   pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
+   pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+   }
+}
+
+static void pt_guest_exit(struct vcpu_vmx *vmx)
+{
+   if (pt_mode == PT_MODE_SYSTEM)
+   return;
+
+   if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+   pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+   pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
+   }
+
+   /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
+   wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+}
+
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6135,6 +6218,13 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
+
+   if (pt_mode == PT_MODE_HOST_GUEST) {
+   memset(>pt_desc, 0, sizeof(vmx->pt_desc));
+   /* Bits 6:0 are forced to 1; writes to them are ignored. */
+   vmx->pt_desc.guest.output_mask = 0x7F;
+   vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
+   }
 }
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -9800,6 +9890,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
	vcpu-&
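
The ordering above matters: host tracing is stopped before the host
context is saved and resumes only after it is restored. An
illustration of the round trip when both host and guest trace:

	/*
	 * pt_guest_enter(): host.ctl = RDMSR(IA32_RTIT_CTL)  - snapshot host
	 *                   WRMSR(IA32_RTIT_CTL, 0)          - stop host trace
	 *                   pt_save_msr(&host), pt_load_msr(&guest)
	 * VM entry:         loads guest RTIT_CTL from the VMCS
	 * VM exit:          clears IA32_RTIT_CTL (VM_EXIT_CLEAR_IA32_RTIT_CTL)
	 * pt_guest_exit():  pt_save_msr(&guest), pt_load_msr(&host)
	 *                   WRMSR(IA32_RTIT_CTL, host.ctl)   - resume host trace
	 */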

[PATCH v9 09/12] KVM: x86: Introduce a function to initialize the PT configuration

2018-05-21 Thread Luwei Kang
Initialize the Intel PT configuration on CPUID update. This
includes the CPUID information, the rtit_ctl bit mask and the
number of address ranges.

Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/kvm/vmx.c | 70 ++
 1 file changed, 70 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 11fb90a..952ddf4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10411,6 +10411,72 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
 #undef cr4_fixed1_update
 }
 
+static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct kvm_cpuid_entry2 *best = NULL;
+   int i;
+
+   for (i = 0; i < PT_CPUID_LEAVES; i++) {
+   best = kvm_find_cpuid_entry(vcpu, 0x14, i);
+   if (!best)
+   return;
+   vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
+   vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
+   vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
+   vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
+   }
+
+   /* Get the number of configurable Address Ranges for filtering */
+   vmx->pt_desc.addr_range = pt_cap_decode(vmx->pt_desc.caps,
+   PT_CAP_num_address_ranges);
+
+   /* Initialize and clear the no dependency bits */
+   vmx->pt_desc.ctl_bitmask = ~0ULL;
+   vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
+   RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
+
+   /* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
+
+   /*
+* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
+* PSBFreq can be set
+*/
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
+   RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
+
+   /*
+* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
+* MTCFreq can be set
+*/
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
+   RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
+
+   /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_ptwrite))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
+   RTIT_CTL_PTW_EN);
+
+   /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_power_event_trace))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
+
+   /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_topa_output))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
+   /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_output_subsys))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
+
+   /* Unmask the address range configuration fields. Note 0xfULL:
+    * shifting a plain int by 32 or more bits is undefined in C. */
+   for (i = 0; i < vmx->pt_desc.addr_range; i++)
+   vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
+}
+
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -10429,6 +10495,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 
if (nested_vmx_allowed(vcpu))
nested_vmx_cr_fixed1_bits_update(vcpu);
+
+   if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
+   guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
+   update_intel_pt_cfg(vcpu);
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
-- 
1.8.3.1
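
A worked example of the final unmask loop, assuming a hypothetical
guest CPUID that reports two configurable address ranges:

	/* addr_range = 2: ADDR0_CFG (bits 35:32) and ADDR1_CFG
	 * (bits 39:36) become writable; ADDR2/ADDR3 writes still #GP. */
	u64 mask = ~0ULL;

	mask &= ~(0xfULL << (32 + 0 * 4));	/* i = 0 */
	mask &= ~(0xfULL << (32 + 1 * 4));	/* i = 1 */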




[PATCH v9 07/12] KVM: x86: Add Intel Processor Trace cpuid emulation

2018-05-21 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

Expose Intel Processor Trace to the guest only when PT works in
HOST_GUEST mode.

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/cpuid.c| 22 --
 arch/x86/kvm/svm.c  |  6 ++
 arch/x86/kvm/vmx.c  |  6 ++
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b27de80..8f3c7ea 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1026,6 +1026,7 @@ struct kvm_x86_ops {
bool (*mpx_supported)(void);
bool (*xsaves_supported)(void);
bool (*umip_emulated)(void);
+   bool (*pt_supported)(void);
 
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 82055b9..e04bf67 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -336,6 +336,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
+   unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
 
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
@@ -393,7 +394,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
	F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
-   F(SHA_NI) | F(AVX512BW) | F(AVX512VL);
+   F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
 
/* cpuid 0xD.1.eax */
const u32 kvm_cpuid_D_1_eax_x86_features =
@@ -423,7 +424,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
switch (function) {
case 0:
-   entry->eax = min(entry->eax, (u32)0xd);
+   entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd));
break;
case 1:
entry->edx &= kvm_cpuid_1_edx_x86_features;
@@ -595,6 +596,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
+   /* Intel PT */
+   case 0x14: {
+   int t, times = entry->eax;
+
+   if (!f_intel_pt)
+   break;
+
+   entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+   for (t = 1; t <= times; ++t) {
+   if (*nent >= maxnent)
+   goto out;
+   do_cpuid_1_ent(&entry[t], function, t);
+   entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+   ++*nent;
+   }
+   break;
+   }
case KVM_CPUID_SIGNATURE: {
static const char signature[12] = "KVMKVMKVM\0\0";
const u32 *sigptr = (const u32 *)signature;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 220e5a89..6df8075 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5876,6 +5876,11 @@ static bool svm_umip_emulated(void)
return false;
 }
 
+static bool svm_pt_supported(void)
+{
+   return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
return true;
@@ -7101,6 +7106,7 @@ static int svm_unregister_enc_region(struct kvm *kvm,
.mpx_supported = svm_mpx_supported,
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
+   .pt_supported = svm_pt_supported,
 
.set_supported_cpuid = svm_set_supported_cpuid,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bb96396..24aded4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9588,6 +9588,11 @@ static bool vmx_xsaves_supported(void)
SECONDARY_EXEC_XSAVES;
 }
 
+static bool vmx_pt_supported(void)
+{
+   return (pt_mode == PT_MODE_HOST_GUEST);
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
u32 exit_intr_info;
@@ -12809,6 +12814,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
.mpx_supported = vmx_mpx_supported,
.xsaves_supported = vmx_xsaves_supported,
.umip_emulated = vmx_umip_emulated,
+   .pt_supported = vmx_pt_supported,
 
.check_nested_events = vmx_check_nested_events,
 
-- 
1.8.3.1
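
From inside the guest, the new leaf is visible like any other. A
sketch of a hypothetical guest-side check (cpuid_count() is the
kernel's CPUID helper; error handling omitted):

	unsigned int eax, ebx, ecx, edx;

	/* Subleaf 1 of leaf 14H: EAX[2:0] is the number of
	 * configurable address ranges. */
	cpuid_count(0x14, 1, &eax, &ebx, &ecx, &edx);
	pr_info("PT address ranges: %u\n", eax & 0x7);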




[PATCH v9 05/12] perf/x86/intel/pt: Introduce a new function to get capability of Intel PT

2018-05-21 Thread Luwei Kang
The new function pt_cap_decode() will be invoked by KVM to check
whether a specific capability is available to the KVM guest.
The existing function pt_cap_get() can only check the hardware
capabilities, which may differ from the KVM guest's view because
some features may not be exposed to the guest.

Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/events/intel/pt.c  | 10 --
 arch/x86/include/asm/intel_pt.h |  2 ++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index f65f97a..18a2e80 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -76,14 +76,20 @@
	PT_CAP(psb_periods,		1, CPUID_EBX, 0xffff0000),
 };
 
-u32 pt_cap_get(enum pt_capabilities cap)
+u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap)
 {
struct pt_cap_desc *cd = _caps[cap];
-   u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
+   u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
unsigned int shift = __ffs(cd->mask);
 
return (c & cd->mask) >> shift;
 }
+EXPORT_SYMBOL_GPL(pt_cap_decode);
+
+u32 pt_cap_get(enum pt_capabilities cap)
+{
+   return pt_cap_decode(pt_pmu.caps, cap);
+}
 EXPORT_SYMBOL_GPL(pt_cap_get);
 
 static ssize_t pt_cap_show(struct device *cdev,
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 2de4db0..9c71453 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -27,9 +27,11 @@ enum pt_capabilities {
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
 void cpu_emergency_stop_pt(void);
 extern u32 pt_cap_get(enum pt_capabilities cap);
+extern u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap);
 #else
 static inline void cpu_emergency_stop_pt(void) {}
 static inline u32 pt_cap_get(enum pt_capabilities cap) { return 0; }
+static inline u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap) { return 0; }
 #endif
 
 #endif /* _ASM_X86_INTEL_PT_H */
-- 
1.8.3.1
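
A usage sketch of the split: the same decoder now serves both the
host's cached CPUID and a guest's (possibly reduced) copy:

	u32 host_ranges  = pt_cap_get(PT_CAP_num_address_ranges);
	u32 guest_ranges = pt_cap_decode(vmx->pt_desc.caps,
					 PT_CAP_num_address_ranges);

	/* guest_ranges <= host_ranges: userspace may hide ranges
	 * from the guest via CPUID. */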




[PATCH v9 03/12] perf/x86/intel/pt: Add new bit definitions for Intel PT MSRs

2018-05-21 Thread Luwei Kang
These bit definitions are used to emulate MSR reads/writes
for KVM. For example, IA32_RTIT_CTL.FabricEn[bit 6] is available
only when CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 1. If a KVM guest
tries to set this bit while CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0,
a #GP is injected into the guest.

Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/include/asm/msr-index.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index afe4e13..6ae2462 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -112,6 +112,7 @@
 #define RTIT_CTL_USR   BIT(3)
 #define RTIT_CTL_PWR_EVT_EN BIT(4)
 #define RTIT_CTL_FUP_ON_PTW BIT(5)
+#define RTIT_CTL_FABRIC_EN BIT(6)
 #define RTIT_CTL_CR3EN BIT(7)
 #define RTIT_CTL_TOPA  BIT(8)
 #define RTIT_CTL_MTC_EN BIT(9)
@@ -140,6 +141,8 @@
 #define RTIT_STATUS_BUFFOVF BIT(3)
 #define RTIT_STATUS_ERROR  BIT(4)
 #define RTIT_STATUS_STOPPED BIT(5)
+#define RTIT_STATUS_BYTECNT_OFFSET 32
+#define RTIT_STATUS_BYTECNT (0x1ffffull << RTIT_STATUS_BYTECNT_OFFSET)
 #define MSR_IA32_RTIT_ADDR0_A  0x0580
 #define MSR_IA32_RTIT_ADDR0_B  0x0581
 #define MSR_IA32_RTIT_ADDR1_A  0x0582
-- 
1.8.3.1
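
For example, the new status fields allow the packet byte count to be
extracted without open-coded shifts (a minimal sketch):

	u64 status, bytecnt;

	rdmsrl(MSR_IA32_RTIT_STATUS, status);
	bytecnt = (status & RTIT_STATUS_BYTECNT) >>
		  RTIT_STATUS_BYTECNT_OFFSET;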




[PATCH v9 01/12] perf/x86/intel/pt: Move Intel-PT MSRs bit definitions to a public header

2018-05-21 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

Intel Processor Trace virtualization enabling in the KVM guest
needs access to these MSR bit definitions, so move them to the
public header file msr-index.h.

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/events/intel/pt.h   | 37 -
 arch/x86/include/asm/msr-index.h | 33 +
 2 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 0eb41d0..0050ca1 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -20,43 +20,6 @@
 #define __INTEL_PT_H__
 
 /*
- * PT MSR bit definitions
- */
-#define RTIT_CTL_TRACEEN       BIT(0)
-#define RTIT_CTL_CYCLEACC      BIT(1)
-#define RTIT_CTL_OS            BIT(2)
-#define RTIT_CTL_USR           BIT(3)
-#define RTIT_CTL_PWR_EVT_EN    BIT(4)
-#define RTIT_CTL_FUP_ON_PTW    BIT(5)
-#define RTIT_CTL_CR3EN         BIT(7)
-#define RTIT_CTL_TOPA          BIT(8)
-#define RTIT_CTL_MTC_EN        BIT(9)
-#define RTIT_CTL_TSC_EN        BIT(10)
-#define RTIT_CTL_DISRETC       BIT(11)
-#define RTIT_CTL_PTW_EN        BIT(12)
-#define RTIT_CTL_BRANCH_EN     BIT(13)
-#define RTIT_CTL_MTC_RANGE_OFFSET      14
-#define RTIT_CTL_MTC_RANGE     (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
-#define RTIT_CTL_CYC_THRESH_OFFSET     19
-#define RTIT_CTL_CYC_THRESH    (0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
-#define RTIT_CTL_PSB_FREQ_OFFSET       24
-#define RTIT_CTL_PSB_FREQ      (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
-#define RTIT_CTL_ADDR0_OFFSET  32
-#define RTIT_CTL_ADDR0         (0x0full << RTIT_CTL_ADDR0_OFFSET)
-#define RTIT_CTL_ADDR1_OFFSET  36
-#define RTIT_CTL_ADDR1         (0x0full << RTIT_CTL_ADDR1_OFFSET)
-#define RTIT_CTL_ADDR2_OFFSET  40
-#define RTIT_CTL_ADDR2         (0x0full << RTIT_CTL_ADDR2_OFFSET)
-#define RTIT_CTL_ADDR3_OFFSET  44
-#define RTIT_CTL_ADDR3         (0x0full << RTIT_CTL_ADDR3_OFFSET)
-#define RTIT_STATUS_FILTEREN   BIT(0)
-#define RTIT_STATUS_CONTEXTEN  BIT(1)
-#define RTIT_STATUS_TRIGGEREN  BIT(2)
-#define RTIT_STATUS_BUFFOVF    BIT(3)
-#define RTIT_STATUS_ERROR      BIT(4)
-#define RTIT_STATUS_STOPPED    BIT(5)
-
-/*
  * Single-entry ToPA: when this close to region boundary, switch
  * buffers to avoid losing data.
  */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 53d5b1b..afe4e13 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -106,7 +106,40 @@
 #define MSR_PEBS_LD_LAT_THRESHOLD  0x03f6
 
 #define MSR_IA32_RTIT_CTL  0x0570
+#define RTIT_CTL_TRACEEN       BIT(0)
+#define RTIT_CTL_CYCLEACC      BIT(1)
+#define RTIT_CTL_OS            BIT(2)
+#define RTIT_CTL_USR           BIT(3)
+#define RTIT_CTL_PWR_EVT_EN    BIT(4)
+#define RTIT_CTL_FUP_ON_PTW    BIT(5)
+#define RTIT_CTL_CR3EN         BIT(7)
+#define RTIT_CTL_TOPA          BIT(8)
+#define RTIT_CTL_MTC_EN        BIT(9)
+#define RTIT_CTL_TSC_EN        BIT(10)
+#define RTIT_CTL_DISRETC       BIT(11)
+#define RTIT_CTL_PTW_EN        BIT(12)
+#define RTIT_CTL_BRANCH_EN     BIT(13)
+#define RTIT_CTL_MTC_RANGE_OFFSET      14
+#define RTIT_CTL_MTC_RANGE     (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
+#define RTIT_CTL_CYC_THRESH_OFFSET     19
+#define RTIT_CTL_CYC_THRESH    (0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
+#define RTIT_CTL_PSB_FREQ_OFFSET       24
+#define RTIT_CTL_PSB_FREQ      (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
+#define RTIT_CTL_ADDR0_OFFSET  32
+#define RTIT_CTL_ADDR0         (0x0full << RTIT_CTL_ADDR0_OFFSET)
+#define RTIT_CTL_ADDR1_OFFSET  36
+#define RTIT_CTL_ADDR1         (0x0full << RTIT_CTL_ADDR1_OFFSET)
+#define RTIT_CTL_ADDR2_OFFSET  40
+#define RTIT_CTL_ADDR2         (0x0full << RTIT_CTL_ADDR2_OFFSET)
+#define RTIT_CTL_ADDR3_OFFSET  44
+#define RTIT_CTL_ADDR3         (0x0full << RTIT_CTL_ADDR3_OFFSET)
 #define MSR_IA32_RTIT_STATUS   0x0571
+#define RTIT_STATUS_FILTEREN   BIT(0)
+#define RTIT_STATUS_CONTEXTEN  BIT(1)
+#define RTIT_STATUS_TRIGGEREN  BIT(2)
+#define RTIT_STATUS_BUFFOVF    BIT(3)
+#define RTIT_STATUS_ERROR      BIT(4)
+#define RTIT_STATUS_STOPPED    BIT(5)
 #define MSR_IA32_RTIT_ADDR0_A  0x0580
 #define MSR_IA32_RTIT_ADDR0_B  0x0581
 #define MSR_IA32_RTIT_ADDR1_A  0x0582
-- 
1.8.3.1



[PATCH v9 00/12] Intel Processor Trace virtualization enabling

2018-05-21 Thread Luwei Kang
Hi All,

Here is a patch series that adds Processor Trace enabling for KVM guests. The 
software developer manual is available at:
https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
See Chapter 4, INTEL PROCESSOR TRACE: VMX IMPROVEMENTS.

Introduction:
Intel Processor Trace (Intel PT) is an extension of Intel Architecture that 
captures information about software execution using dedicated hardware 
facilities that cause only minimal performance perturbation to the software 
being traced. Details on the Intel PT infrastructure and trace capabilities can 
be found in the Intel 64 and IA-32 Architectures Software Developer’s Manual, 
Volume 3C.

This suite of architecture changes serves to simplify the process of 
virtualizing Intel PT for use by guest software. There are two primary 
elements to the new VMX support improvements made for Intel PT.
1. Addition of a new guest IA32_RTIT_CTL value field to the VMCS.
  — This serves to speed and simplify the process of disabling trace on VM 
exit, and restoring it on VM entry.
2. Enabling use of EPT to redirect PT output.
  — This enables the VMM to elect to virtualize the PT output buffer using EPT. 
In this mode, the CPU will treat PT output addresses as Guest Physical 
Addresses (GPAs) and translate them using EPT. This means that Intel PT output 
reads (of the ToPA table) and writes (of trace output) can cause EPT 
violations, and other output events.

Processor Trace virtualization can work in one of two possible modes, selected 
by the new "pt_mode" option (a minimal sketch of the wiring is shown below). 
The default value is system mode.
 a. system-wide: trace both host/guest and output to the host buffer;
 b. host-guest: trace host/guest simultaneously and output to their respective 
buffers.
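
A minimal sketch of the mode wiring, reusing the PT_MODE_* constants
introduced later in this series (treat the exact placement as illustrative):

    /* Sketch: module option selecting the PT virtualization mode. */
    #define PT_MODE_SYSTEM     0
    #define PT_MODE_HOST_GUEST 1

    static int __read_mostly pt_mode = PT_MODE_SYSTEM;
    module_param(pt_mode, int, S_IRUGO);

Loading kvm-intel with pt_mode=1 would then select host-guest mode.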

From V8:
 - move macro definition MSR_IA32_RTIT_ADDR_RANGE from msr-index.h to 
intel_pt.h;
 - initialize the RTIT_CTL bitmask to ~0ULL.

From V7:
 - remove host-only mode since it can be emulated by perf code;
 - merge patches 8 and 9 to keep code and data in the same patch;
 - rename __pt_cap_get() to pt_cap_decode();
 - other minor changes.

From V6:
 - split patches 1~2 into four separate patches (each of those patches did 
two things) and add more descriptions.

From V5:
 - rename the function pt_cap_get_ex() to __pt_cap_get();
 - replace most uses of vmx_pt_supported() with "pt_mode == 
PT_MODE_HOST_GUEST" (or !=).

From V4:
 - add a data check when setting the value of MSR_IA32_RTIT_CTL;
 - invoke the new interface to set MSR read/write intercepts after the "MSR 
bitmap per-vcpu" patches.

From V3:
 - change the default mode to SYSTEM mode;
 - add a new patch to move PT out of scattered features;
 - add a new function kvm_get_pt_addr_cnt() to get the number of address 
ranges;
 - add a new function vmx_set_rtit_ctl() to set the value of guest RTIT_CTL, 
GUEST_IA32_RTIT_CTL and the MSR intercepts.

From v2:
 - replace *_PT_SUPPRESS_PIP with *_PT_CONCEAL_PIP;
 - clear SECONDARY_EXEC_PT_USE_GPA, VM_EXIT_CLEAR_IA32_RTIT_CTL and 
VM_ENTRY_LOAD_IA32_RTIT_CTL in SYSTEM mode. These bits must be all set or all 
clear;
 - move processor tracing out of scattered features;
 - add a new function to enable/disable MSR read/write intercepts;
 - add read/write handling for all Intel PT MSRs and disable interception 
when PT is enabled in the guest;
 - disable Intel PT and re-enable MSR intercepts when the L1 guest executes 
VMXON;
 - performance optimization.
   In host-only mode we just need to save host RTIT_CTL before VM entry and 
restore it after VM exit;
   In HOST_GUEST mode we need to save and restore all MSRs only when PT is 
enabled in the guest.
 - use XSAVES/XRSTORS to implement the context switch.
   This is not implemented in this version and is still being debugged; it 
will be posted as a separate patch.

From v1:
 - remove guest-only mode because it can be covered by host-guest mode;
 - always set "use GPA for processor tracing" in the secondary execution 
controls when possible;
 - trap RTIT_CTL reads/writes. Forbid writes to this MSR while the L1 
hypervisor is in VMX operation (after VMXON).

Chao Peng (7):
  perf/x86/intel/pt: Move Intel-PT MSRs bit definitions to a public
header
  perf/x86/intel/pt: Change pt_cap_get() to a public function
  KVM: x86: Add Intel Processor Trace virtualization mode
  KVM: x86: Add Intel Processor Trace cpuid emulation
  KVM: x86: Add Intel Processor Trace context switch for each vcpu
  KVM: x86: Implement Intel Processor Trace MSRs read/write emulation
  KVM: x86: Set intercept for Intel PT MSRs read/write

Luwei Kang (5):
  perf/x86/intel/pt: Add new bit definitions for Intel PT MSRs
  perf/x86/intel/pt: add new capability for Intel PT
  perf/x86/intel/pt: Introduce a new function to get capability of Intel
PT
  KVM: x86: Introduce a function to initialize the PT configuration
  KVM: x86: Disable Intel Processor Trace when VMXON in L1 guest

 arch/x86/events/intel/pt.c   |  12 +-
 arc

[PATCH v8 04/12] perf/x86/intel/pt: add new capability for Intel PT

2018-05-14 Thread Luwei Kang
CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 1 indicates support for
output to the Trace Transport subsystem.
MSR IA32_RTIT_CTL.FabricEn[bit 6] is reserved if
CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0.
This is used to emulate IA32_RTIT_CTL MSR reads/writes
in KVM. A KVM guest write to IA32_RTIT_CTL traps to
root mode, and a #GP is injected into the guest if it sets
IA32_RTIT_CTL.FabricEn while
CPUID.(EAX=14H, ECX=0):ECX[bit 3] = 0.
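
For illustration, host-side code can query the new capability through the
pt_cap_get() helper made public earlier in this series (sketch):

    /* Sketch: FabricEn may only be exposed/accepted when the CPU
     * reports output to the Trace Transport subsystem. */
    bool fabric_en_ok = pt_cap_get(PT_CAP_output_subsys);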

Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/events/intel/pt.c  | 1 +
 arch/x86/include/asm/intel_pt.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index c80e2f5..f65f97a 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -68,6 +68,7 @@
PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)),
PT_CAP(topa_multiple_entries,   0, CPUID_ECX, BIT(1)),
PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
+   PT_CAP(output_subsys,   0, CPUID_ECX, BIT(3)),
PT_CAP(payloads_lip,            0, CPUID_ECX, BIT(31)),
PT_CAP(num_address_ranges,      1, CPUID_EAX, 0x3),
PT_CAP(mtc_periods,             1, CPUID_EAX, 0xffff0000),
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 4270421..2de4db0 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -16,6 +16,7 @@ enum pt_capabilities {
PT_CAP_topa_output,
PT_CAP_topa_multiple_entries,
PT_CAP_single_range_output,
+   PT_CAP_output_subsys,
PT_CAP_payloads_lip,
PT_CAP_num_address_ranges,
PT_CAP_mtc_periods,
-- 
1.8.3.1



[PATCH v8 12/12] KVM: x86: Disable Intel Processor Trace when VMXON in L1 guest

2018-05-14 Thread Luwei Kang
Currently, Intel Processor Trace does not support tracing in L1 guest
VMX operation (IA32_VMX_MISC[bit 14] is 0). As mentioned in the SDM,
on this type of processor, execution of the VMXON instruction
clears IA32_RTIT_CTL.TraceEn, and any subsequent attempt to write
IA32_RTIT_CTL causes a general-protection exception (#GP).
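
The guest-visible contract being emulated is, in sketch form (kernel
rdmsrl/wrmsrl helpers used purely for illustration):

    u64 ctl;

    /* ... guest executes VMXON ... */
    rdmsrl(MSR_IA32_RTIT_CTL, ctl);            /* TraceEn now reads as 0 */
    wrmsrl(MSR_IA32_RTIT_CTL, ctl | RTIT_CTL_TRACEEN);     /* -> #GP */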

Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/kvm/vmx.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 170cd48..7ace11a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3996,7 +3996,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
break;
case MSR_IA32_RTIT_CTL:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
-   vmx_rtit_ctl_check(vcpu, data))
+   vmx_rtit_ctl_check(vcpu, data) ||
+   vmx->nested.vmxon)
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN));
@@ -8089,6 +8090,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
if (ret)
return ret;
 
+   if (pt_mode == PT_MODE_HOST_GUEST) {
+   vmx->pt_desc.guest.ctl = 0;
+   pt_set_intercept_for_msr(vmx, 1);
+   }
+
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
 }
-- 
1.8.3.1



[PATCH v8 12/12] KVM: x86: Disable Intel Processor Trace when VMXON in L1 guest

2018-05-14 Thread Luwei Kang
Currently, Intel Processor Trace do not support tracing in L1 guest
VMX operation(IA32_VMX_MISC[bit 14] is 0). As mentioned in SDM,
on these type of processors, execution of the VMXON instruction will
clears IA32_RTIT_CTL.TraceEn and any attempt to write IA32_RTIT_CTL
causes a general-protection exception (#GP).

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 170cd48..7ace11a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3996,7 +3996,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
break;
case MSR_IA32_RTIT_CTL:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
-   vmx_rtit_ctl_check(vcpu, data))
+   vmx_rtit_ctl_check(vcpu, data) ||
+   vmx->nested.vmxon)
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN));
@@ -8089,6 +8090,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
if (ret)
return ret;
 
+   if (pt_mode == PT_MODE_HOST_GUEST) {
+   vmx->pt_desc.guest.ctl = 0;
+   pt_set_intercept_for_msr(vmx, 1);
+   }
+
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
 }
-- 
1.8.3.1



[PATCH v8 10/12] KVM: x86: Implement Intel Processor Trace MSRs read/write emulation

2018-05-14 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

This patch implements Intel Processor Trace MSR read/write
emulation.
Intel PT MSR reads/writes need to be emulated when the Intel PT
MSRs are intercepted for the guest and during live migration.
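
For the live-migration case, userspace reaches this emulation through the
regular MSR ioctls. A minimal sketch (error handling omitted; vcpu_fd is
an assumed open KVM vCPU file descriptor):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    struct {
        struct kvm_msrs hdr;
        struct kvm_msr_entry entry;
    } m = {
        .hdr.nmsrs   = 1,
        .entry.index = 0x570,   /* MSR_IA32_RTIT_CTL */
    };

    ioctl(vcpu_fd, KVM_GET_MSRS, &m);
    /* m.entry.data now holds the guest's IA32_RTIT_CTL value */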

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/include/asm/intel_pt.h |   8 ++
 arch/x86/kvm/vmx.c  | 172 
 arch/x86/kvm/x86.c  |  33 +++-
 3 files changed, 212 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 5748205..3da4cdb 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -8,6 +8,14 @@
 #define PT_MODE_SYSTEM 0
 #define PT_MODE_HOST_GUEST 1
 
+#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
+   RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
+   RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
+   RTIT_STATUS_BYTECNT))
+
+#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
+   (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
+
 enum pt_capabilities {
PT_CAP_max_subleaf = 0,
PT_CAP_cr3_filtering,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5c321cd..d04b235 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2808,6 +2808,77 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu 
*vcpu, int mask)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
 }
 
+static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   unsigned long value;
+
+   /*
+* Any MSR write that attempts to change bits marked reserved will
+* cause a #GP fault.
+*/
+   if (data & vmx->pt_desc.ctl_bitmask)
+   return 1;
+
+   /*
+* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
+* result in a #GP unless the same write also clears TraceEn.
+*/
+   if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
+   ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
+   return 1;
+
+   /*
+* WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
+* and FabricEn would cause #GP, if
+* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
+*/
+   if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
+   !(data & RTIT_CTL_FABRIC_EN) &&
+   !pt_cap_decode(vmx->pt_desc.caps, PT_CAP_single_range_output))
+   return 1;
+
+   /*
+* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
+* utilize encodings marked reserved will cause a #GP fault.
+*/
+   value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc_periods);
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc) &&
+   !test_bit((data & RTIT_CTL_MTC_RANGE) >>
+   RTIT_CTL_MTC_RANGE_OFFSET, &value))
+   return 1;
+   value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cycle_thresholds);
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
+   !test_bit((data & RTIT_CTL_CYC_THRESH) >>
+   RTIT_CTL_CYC_THRESH_OFFSET, &value))
+   return 1;
+   value = pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_periods);
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
+   !test_bit((data & RTIT_CTL_PSB_FREQ) >>
+   RTIT_CTL_PSB_FREQ_OFFSET, &value))
+   return 1;
+
+   /*
+* If ADDRx_CFG is reserved or the encoding is greater than 2,
+* the write will cause a #GP fault.
+*/
+   value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
+   return 1;
+   value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
+   return 1;
+   value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
+   return 1;
+   value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
+   if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
+   return 1;
+
+   return 0;
+}
+
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
unsigned long rip;
@@ -3624,6 +3695,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
+   u32 index;
 
switch (msr_info->index) {

[PATCH v8 11/12] KVM: x86: Set intercept for Intel PT MSRs read/write

2018-05-14 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

Disable interception of Intel PT MSRs only when Intel PT is
enabled in the guest. MSR_IA32_RTIT_CTL will always be
intercepted.

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/kvm/vmx.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d04b235..170cd48 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -947,6 +947,7 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 
*vmcs12,
 static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
 static void __always_inline vmx_disable_intercept_for_msr(unsigned long 
*msr_bitmap,
  u32 msr, int type);
+static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -3998,6 +3999,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
vmx_rtit_ctl_check(vcpu, data))
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
+   pt_set_intercept_for_msr(vmx, !(data & RTIT_CTL_TRACEEN));
vmx->pt_desc.guest.ctl = data;
break;
case MSR_IA32_RTIT_STATUS:
@@ -5819,6 +5821,27 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
vmx->msr_bitmap_mode = mode;
 }
 
+static void pt_set_intercept_for_msr(struct vcpu_vmx *vmx, bool flag)
+{
+   unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+   u32 i;
+
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
+   MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
+   MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
+   MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
+   MSR_TYPE_RW, flag);
+   for (i = 0; i < vmx->pt_desc.addr_range; i++) {
+   vmx_set_intercept_for_msr(msr_bitmap,
+   MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
+   vmx_set_intercept_for_msr(msr_bitmap,
+   MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
+   }
+}
+
 static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
 {
return enable_apicv;
-- 
1.8.3.1



[PATCH v8 07/12] KVM: x86: Add Intel Processor Trace cpuid emulation

2018-05-14 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

Expose Intel Processor Trace to the guest only when PT works in
HOST_GUEST mode.
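
From inside the guest, the newly exposed feature is discoverable in the
usual way; a kernel-style sketch using cpuid_count() from the arch/x86
headers:

    unsigned int eax, ebx, ecx, edx;

    cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
    if (ebx & (1 << 25)) {          /* CPUID.7.0:EBX[25] = Intel PT */
        /* Leaf 0x14 is now valid: enumerate PT capabilities. */
        cpuid_count(0x14, 0, &eax, &ebx, &ecx, &edx);
    }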

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/cpuid.c| 22 --
 arch/x86/kvm/svm.c  |  6 ++
 arch/x86/kvm/vmx.c  |  6 ++
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8cb8461..e9acd8b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1025,6 +1025,7 @@ struct kvm_x86_ops {
bool (*mpx_supported)(void);
bool (*xsaves_supported)(void);
bool (*umip_emulated)(void);
+   bool (*pt_supported)(void);
 
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 82055b9..e04bf67 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -336,6 +336,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
+   unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
 
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
@@ -393,7 +394,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | 
F(AVX512DQ) |
-   F(SHA_NI) | F(AVX512BW) | F(AVX512VL);
+   F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
 
/* cpuid 0xD.1.eax */
const u32 kvm_cpuid_D_1_eax_x86_features =
@@ -423,7 +424,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
 
switch (function) {
case 0:
-   entry->eax = min(entry->eax, (u32)0xd);
+   entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd));
break;
case 1:
entry->edx &= kvm_cpuid_1_edx_x86_features;
@@ -595,6 +596,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
}
break;
}
+   /* Intel PT */
+   case 0x14: {
+   int t, times = entry->eax;
+
+   if (!f_intel_pt)
+   break;
+
+   entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+   for (t = 1; t <= times; ++t) {
+   if (*nent >= maxnent)
+   goto out;
do_cpuid_1_ent(&entry[t], function, t);
+   entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+   ++*nent;
+   }
+   break;
+   }
case KVM_CPUID_SIGNATURE: {
static const char signature[12] = "KVMKVMKVM\0\0";
const u32 *sigptr = (const u32 *)signature;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1fc05e4..21b2441 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5876,6 +5876,11 @@ static bool svm_umip_emulated(void)
return false;
 }
 
+static bool svm_pt_supported(void)
+{
+   return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
return true;
@@ -7101,6 +7106,7 @@ static int svm_unregister_enc_region(struct kvm *kvm,
.mpx_supported = svm_mpx_supported,
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
+   .pt_supported = svm_pt_supported,
 
.set_supported_cpuid = svm_set_supported_cpuid,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ede5abf..f9b701a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9590,6 +9590,11 @@ static bool vmx_xsaves_supported(void)
SECONDARY_EXEC_XSAVES;
 }
 
+static bool vmx_pt_supported(void)
+{
+   return (pt_mode == PT_MODE_HOST_GUEST);
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
u32 exit_intr_info;
@@ -12817,6 +12822,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
.mpx_supported = vmx_mpx_supported,
.xsaves_supported = vmx_xsaves_supported,
.umip_emulated = vmx_umip_emulated,
+   .pt_supported = vmx_pt_supported,
 
.check_nested_events = vmx_check_nested_events,
 
-- 
1.8.3.1



[PATCH v8 08/12] KVM: x86: Add Intel Processor Trace context switch for each vcpu

2018-05-14 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

Load/store the Intel Processor Trace registers on context switch.
MSR IA32_RTIT_CTL is loaded/stored automatically by the VMCS.
In HOST_GUEST mode, we need to load/restore the PT MSRs only when PT
is enabled in the guest.
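
"Automatically by the VMCS" refers to the VM-entry/VM-exit controls named
in the cover letter; in sketch form, the series enables roughly:

    /* Sketch: let the VMCS itself handle IA32_RTIT_CTL. */
    vmentry_ctrl |= VM_ENTRY_LOAD_IA32_RTIT_CTL;  /* load guest value on entry */
    vmexit_ctrl  |= VM_EXIT_CLEAR_IA32_RTIT_CTL;  /* clear TraceEn on exit */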

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/kvm/vmx.c | 94 ++
 1 file changed, 94 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f9b701a..eb5f50a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -596,6 +596,24 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
 }
 
+struct pt_ctx {
+   u64 ctl;
+   u64 status;
+   u64 output_base;
+   u64 output_mask;
+   u64 cr3_match;
+   u64 addr_a[MSR_IA32_RTIT_ADDR_RANGE];
+   u64 addr_b[MSR_IA32_RTIT_ADDR_RANGE];
+};
+
+struct pt_desc {
+   u64 ctl_bitmask;
+   u32 addr_range;
+   u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+   struct pt_ctx host;
+   struct pt_ctx guest;
+};
+
 struct vcpu_vmx {
struct kvm_vcpu   vcpu;
unsigned long host_rsp;
@@ -692,6 +710,8 @@ struct vcpu_vmx {
 */
u64 msr_ia32_feature_control;
u64 msr_ia32_feature_control_valid_bits;
+
+   struct pt_desc pt_desc;
 };
 
 enum segment_cache_field {
@@ -2390,6 +2410,69 @@ static unsigned long segment_base(u16 selector)
 }
 #endif
 
+static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+   u32 i;
+
+   wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
+   wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+   wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+   wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+   for (i = 0; i < addr_range; i++) {
+   wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+   wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+   }
+}
+
+static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+   u32 i;
+
+   rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
+   rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+   rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+   rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+   for (i = 0; i < addr_range; i++) {
+   rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+   rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+   }
+}
+
+static void pt_guest_enter(struct vcpu_vmx *vmx)
+{
+   if (pt_mode == PT_MODE_SYSTEM)
+   return;
+
+   /* Save host state before VM entry */
+   rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+
+   /*
+* Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled
+* on VM entry when it has been disabled in guest before).
+*/
+   vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl);
+
+   if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+   wrmsrl(MSR_IA32_RTIT_CTL, 0);
+   pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
+   pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+   }
+}
+
+static void pt_guest_exit(struct vcpu_vmx *vmx)
+{
+   if (pt_mode == PT_MODE_SYSTEM)
+   return;
+
+   if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+   pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+   pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
+   }
+
+   /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
+   wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+}
+
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6134,6 +6217,13 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
+
+   if (pt_mode == PT_MODE_HOST_GUEST) {
+   memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
+   /* Bit[6~0] are forced to 1, writes are ignored. */
+   vmx->pt_desc.guest.output_mask = 0x7F;
+   vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
+   }
 }
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -9802,6 +9892,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.pkru != vmx->host_pkru)
__write_pkru(vcpu->arch.pkru);
 
+   pt_guest_enter(vmx);
+
atomic_switch_perf_msrs(vmx);
 
vmx_arm_hv_timer(vcpu);
@@ -9996,6 +10088,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  | (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
 
+   pt_guest_exit(vmx);
+
/*
 * ea

[PATCH v8 09/12] KVM: x86: Introduce a function to initialize the PT configuration

2018-05-14 Thread Luwei Kang
Initialize the Intel PT configuration on CPUID update.
This includes the CPUID information, the rtit_ctl bit mask, and the
number of address ranges.
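
The resulting ctl_bitmask is consumed by the WRMSR emulation in the next
patch; conceptually (sketch, matching the check used there):

    /* A guest WRMSR value is rejected if it touches any bit still set
     * in the mask, i.e. any feature its CPUID does not report. */
    if (data & vmx->pt_desc.ctl_bitmask)
        return 1;   /* caller injects #GP */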

Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/kvm/vmx.c | 69 ++
 1 file changed, 69 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index eb5f50a..5c321cd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10413,6 +10413,71 @@ static void nested_vmx_cr_fixed1_bits_update(struct 
kvm_vcpu *vcpu)
 #undef cr4_fixed1_update
 }
 
+static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct kvm_cpuid_entry2 *best = NULL;
+   int i;
+
+   for (i = 0; i < PT_CPUID_LEAVES; i++) {
+   best = kvm_find_cpuid_entry(vcpu, 0x14, i);
+   if (!best)
+   return;
+   vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
+   vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
+   vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
+   vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
+   }
+
+   /* Get the number of configurable Address Ranges for filtering */
+   vmx->pt_desc.addr_range = pt_cap_decode(vmx->pt_desc.caps,
+   PT_CAP_num_address_ranges);
+
+   /* Clear the no dependency bits */
+   vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
+   RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
+
+   /* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
+
+   /*
+* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
+* PSBFreq can be set
+*/
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
+   RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
+
+   /*
+* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
+* MTCFreq can be set
+*/
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
+   RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
+
+   /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_ptwrite))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
+   RTIT_CTL_PTW_EN);
+
+   /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_power_event_trace))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
+
+   /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_topa_output))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
+   /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_output_subsys))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
+
+   /* unmask address range configure area */
+   for (i = 0; i < vmx->pt_desc.addr_range; i++)
+   vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
+}
+
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -10431,6 +10496,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 
if (nested_vmx_allowed(vcpu))
nested_vmx_cr_fixed1_bits_update(vcpu);
+
+   if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
+   guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
+   update_intel_pt_cfg(vcpu);
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
-- 
1.8.3.1



[PATCH v8 07/12] KVM: x86: Add Intel Processor Trace cpuid emulation

2018-05-14 Thread Luwei Kang
From: Chao Peng 

Expose Intel Processor Trace to guest only when PT work in
HOST_GUEST mode.

Signed-off-by: Chao Peng 
Signed-off-by: Luwei Kang 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/cpuid.c| 22 --
 arch/x86/kvm/svm.c  |  6 ++
 arch/x86/kvm/vmx.c  |  6 ++
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8cb8461..e9acd8b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1025,6 +1025,7 @@ struct kvm_x86_ops {
bool (*mpx_supported)(void);
bool (*xsaves_supported)(void);
bool (*umip_emulated)(void);
+   bool (*pt_supported)(void);
 
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 82055b9..e04bf67 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -336,6 +336,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
+   unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
 
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
@@ -393,7 +394,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | 
F(AVX512DQ) |
-   F(SHA_NI) | F(AVX512BW) | F(AVX512VL);
+   F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
 
/* cpuid 0xD.1.eax */
const u32 kvm_cpuid_D_1_eax_x86_features =
@@ -423,7 +424,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
 
switch (function) {
case 0:
-   entry->eax = min(entry->eax, (u32)0xd);
+   entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd));
break;
case 1:
entry->edx &= kvm_cpuid_1_edx_x86_features;
@@ -595,6 +596,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
}
break;
}
+   /* Intel PT */
+   case 0x14: {
+   int t, times = entry->eax;
+
+   if (!f_intel_pt)
+   break;
+
+   entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+   for (t = 1; t <= times; ++t) {
+   if (*nent >= maxnent)
+   goto out;
+   do_cpuid_1_ent([t], function, t);
+   entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+   ++*nent;
+   }
+   break;
+   }
case KVM_CPUID_SIGNATURE: {
static const char signature[12] = "KVMKVMKVM\0\0";
const u32 *sigptr = (const u32 *)signature;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1fc05e4..21b2441 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5876,6 +5876,11 @@ static bool svm_umip_emulated(void)
return false;
 }
 
+static bool svm_pt_supported(void)
+{
+   return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
return true;
@@ -7101,6 +7106,7 @@ static int svm_unregister_enc_region(struct kvm *kvm,
.mpx_supported = svm_mpx_supported,
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
+   .pt_supported = svm_pt_supported,
 
.set_supported_cpuid = svm_set_supported_cpuid,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ede5abf..f9b701a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9590,6 +9590,11 @@ static bool vmx_xsaves_supported(void)
SECONDARY_EXEC_XSAVES;
 }
 
+static bool vmx_pt_supported(void)
+{
+   return (pt_mode == PT_MODE_HOST_GUEST);
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
u32 exit_intr_info;
@@ -12817,6 +12822,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
.mpx_supported = vmx_mpx_supported,
.xsaves_supported = vmx_xsaves_supported,
.umip_emulated = vmx_umip_emulated,
+   .pt_supported = vmx_pt_supported,
 
.check_nested_events = vmx_check_nested_events,
 
-- 
1.8.3.1



[PATCH v8 08/12] KVM: x86: Add Intel Processor Trace context switch for each vcpu

2018-05-14 Thread Luwei Kang
From: Chao Peng 

Load/Store Intel processor trace register in context switch.
MSR IA32_RTIT_CTL is loaded/stored automatically from VMCS.
In HOST_GUEST mode, we need load/resore PT MSRs only when PT
is enabled in guest.

Signed-off-by: Chao Peng 
Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx.c | 94 ++
 1 file changed, 94 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f9b701a..eb5f50a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -596,6 +596,24 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
(unsigned long *)_desc->control);
 }
 
+struct pt_ctx {
+   u64 ctl;
+   u64 status;
+   u64 output_base;
+   u64 output_mask;
+   u64 cr3_match;
+   u64 addr_a[MSR_IA32_RTIT_ADDR_RANGE];
+   u64 addr_b[MSR_IA32_RTIT_ADDR_RANGE];
+};
+
+struct pt_desc {
+   u64 ctl_bitmask;
+   u32 addr_range;
+   u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+   struct pt_ctx host;
+   struct pt_ctx guest;
+};
+
 struct vcpu_vmx {
struct kvm_vcpu   vcpu;
unsigned long host_rsp;
@@ -692,6 +710,8 @@ struct vcpu_vmx {
 */
u64 msr_ia32_feature_control;
u64 msr_ia32_feature_control_valid_bits;
+
+   struct pt_desc pt_desc;
 };
 
 enum segment_cache_field {
@@ -2390,6 +2410,69 @@ static unsigned long segment_base(u16 selector)
 }
 #endif
 
+static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+   u32 i;
+
+   wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
+   wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+   wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+   wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+   for (i = 0; i < addr_range; i++) {
+   wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+   wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+   }
+}
+
+static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+   u32 i;
+
+   rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
+   rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+   rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+   rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+   for (i = 0; i < addr_range; i++) {
+   rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+   rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+   }
+}
+
+static void pt_guest_enter(struct vcpu_vmx *vmx)
+{
+   if (pt_mode == PT_MODE_SYSTEM)
+   return;
+
+   /* Save host state before VM entry */
+   rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+
+   /*
+* Set guest state of MSR_IA32_RTIT_CTL MSR (PT will be disabled
+* on VM entry when it has been disabled in guest before).
+*/
+   vmcs_write64(GUEST_IA32_RTIT_CTL, vmx->pt_desc.guest.ctl);
+
+   if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+   wrmsrl(MSR_IA32_RTIT_CTL, 0);
+   pt_save_msr(>pt_desc.host, vmx->pt_desc.addr_range);
+   pt_load_msr(>pt_desc.guest, vmx->pt_desc.addr_range);
+   }
+}
+
+static void pt_guest_exit(struct vcpu_vmx *vmx)
+{
+   if (pt_mode == PT_MODE_SYSTEM)
+   return;
+
+   if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+   pt_save_msr(>pt_desc.guest, vmx->pt_desc.addr_range);
+   pt_load_msr(>pt_desc.host, vmx->pt_desc.addr_range);
+   }
+
+   /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
+   wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+}
+
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6134,6 +6217,13 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
+
+   if (pt_mode == PT_MODE_HOST_GUEST) {
+   memset(>pt_desc, 0, sizeof(vmx->pt_desc));
+   /* Bit[6~0] are forced to 1, writes are ignored. */
+   vmx->pt_desc.guest.output_mask = 0x7F;
+   vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
+   }
 }
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -9802,6 +9892,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.pkru != vmx->host_pkru)
__write_pkru(vcpu->arch.pkru);
 
+   pt_guest_enter(vmx);
+
atomic_switch_perf_msrs(vmx);
 
vmx_arm_hv_timer(vcpu);
@@ -9996,6 +10088,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  | (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
 
+   pt_guest_exit(vmx);
+
/*
 * ea

[PATCH v8 09/12] KVM: x86: Introduce a function to initialize the PT configuration

2018-05-14 Thread Luwei Kang
Initialize the Intel PT configuration when cpuid update.
Include cpuid inforamtion, rtit_ctl bit mask and the number of
address ranges.

Signed-off-by: Luwei Kang 
---
 arch/x86/kvm/vmx.c | 69 ++
 1 file changed, 69 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index eb5f50a..5c321cd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10413,6 +10413,71 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
 #undef cr4_fixed1_update
 }
 
+static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct kvm_cpuid_entry2 *best = NULL;
+   int i;
+
+   for (i = 0; i < PT_CPUID_LEAVES; i++) {
+   best = kvm_find_cpuid_entry(vcpu, 0x14, i);
+   if (!best)
+   return;
+   vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
+   vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
+   vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
+   vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
+   }
+
+   /* Get the number of configurable Address Ranges for filtering */
+   vmx->pt_desc.addr_range = pt_cap_decode(vmx->pt_desc.caps,
+   PT_CAP_num_address_ranges);
+
+   /* Clear the no dependency bits */
+   vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
+   RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
+
+   /* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
+
+   /*
+* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
+* PSBFreq can be set
+*/
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_psb_cyc))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
+   RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
+
+   /*
+* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn, BranchEn and
+* MTCFreq can be set
+*/
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_mtc))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
+   RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
+
+   /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_ptwrite))
+   vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
+   RTIT_CTL_PTW_EN);
+
+   /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_power_event_trace))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
+
+   /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_topa_output))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
+   /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
+   if (pt_cap_decode(vmx->pt_desc.caps, PT_CAP_output_subsys))
+   vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
+
+   /* unmask address range configure area */
+   for (i = 0; i < vmx->pt_desc.addr_range; i++)
+   vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
+}
+
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -10431,6 +10496,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 
if (nested_vmx_allowed(vcpu))
nested_vmx_cr_fixed1_bits_update(vcpu);
+
+   if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
+   guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
+   update_intel_pt_cfg(vcpu);
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
-- 
1.8.3.1
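
The intent of ctl_bitmask above: a set bit marks a bit of
IA32_RTIT_CTL that the guest may not set. A hedged sketch of how it
would be consumed (the actual WRMSR emulation lands in a later patch
of this series; pt_ctl_value_ok() is a made-up helper name):

	/* Reject a guest RTIT_CTL value that touches reserved bits. */
	static bool pt_ctl_value_ok(struct vcpu_vmx *vmx, u64 data)
	{
		/* overlap with the reserved-bit mask would mean #GP */
		return !(data & vmx->pt_desc.ctl_bitmask);
	}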



[PATCH v8 01/12] perf/x86/intel/pt: Move Intel-PT MSRs bit definitions to a public header

2018-05-14 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

Intel Processor Trace virtualization enabling in a KVM guest
needs access to these MSR bit definitions, so move them to
the public header file msr-index.h.

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/events/intel/pt.h   | 37 -
 arch/x86/include/asm/msr-index.h | 34 ++
 2 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 0eb41d0..0050ca1 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -20,43 +20,6 @@
 #define __INTEL_PT_H__
 
 /*
- * PT MSR bit definitions
- */
-#define RTIT_CTL_TRACEEN   BIT(0)
-#define RTIT_CTL_CYCLEACC  BIT(1)
-#define RTIT_CTL_OS        BIT(2)
-#define RTIT_CTL_USR   BIT(3)
-#define RTIT_CTL_PWR_EVT_EN    BIT(4)
-#define RTIT_CTL_FUP_ON_PTW    BIT(5)
-#define RTIT_CTL_CR3EN BIT(7)
-#define RTIT_CTL_TOPA  BIT(8)
-#define RTIT_CTL_MTC_EN    BIT(9)
-#define RTIT_CTL_TSC_EN    BIT(10)
-#define RTIT_CTL_DISRETC   BIT(11)
-#define RTIT_CTL_PTW_EN    BIT(12)
-#define RTIT_CTL_BRANCH_EN BIT(13)
-#define RTIT_CTL_MTC_RANGE_OFFSET  14
-#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
-#define RTIT_CTL_CYC_THRESH_OFFSET 19
-#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
-#define RTIT_CTL_PSB_FREQ_OFFSET   24
-#define RTIT_CTL_PSB_FREQ  (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
-#define RTIT_CTL_ADDR0_OFFSET  32
-#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET)
-#define RTIT_CTL_ADDR1_OFFSET  36
-#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET)
-#define RTIT_CTL_ADDR2_OFFSET  40
-#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET)
-#define RTIT_CTL_ADDR3_OFFSET  44
-#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET)
-#define RTIT_STATUS_FILTEREN   BIT(0)
-#define RTIT_STATUS_CONTEXTEN  BIT(1)
-#define RTIT_STATUS_TRIGGEREN  BIT(2)
-#define RTIT_STATUS_BUFFOVF    BIT(3)
-#define RTIT_STATUS_ERROR  BIT(4)
-#define RTIT_STATUS_STOPPED    BIT(5)
-
-/*
  * Single-entry ToPA: when this close to region boundary, switch
  * buffers to avoid losing data.
  */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 53d5b1b..5e8d156 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -106,7 +106,40 @@
 #define MSR_PEBS_LD_LAT_THRESHOLD  0x03f6
 
 #define MSR_IA32_RTIT_CTL  0x0570
+#define RTIT_CTL_TRACEEN   BIT(0)
+#define RTIT_CTL_CYCLEACC  BIT(1)
+#define RTIT_CTL_OS        BIT(2)
+#define RTIT_CTL_USR   BIT(3)
+#define RTIT_CTL_PWR_EVT_EN    BIT(4)
+#define RTIT_CTL_FUP_ON_PTW    BIT(5)
+#define RTIT_CTL_CR3EN BIT(7)
+#define RTIT_CTL_TOPA  BIT(8)
+#define RTIT_CTL_MTC_EN    BIT(9)
+#define RTIT_CTL_TSC_EN    BIT(10)
+#define RTIT_CTL_DISRETC   BIT(11)
+#define RTIT_CTL_PTW_EN    BIT(12)
+#define RTIT_CTL_BRANCH_EN BIT(13)
+#define RTIT_CTL_MTC_RANGE_OFFSET  14
+#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
+#define RTIT_CTL_CYC_THRESH_OFFSET 19
+#define RTIT_CTL_CYC_THRESH(0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
+#define RTIT_CTL_PSB_FREQ_OFFSET   24
+#define RTIT_CTL_PSB_FREQ  (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
+#define RTIT_CTL_ADDR0_OFFSET  32
+#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET)
+#define RTIT_CTL_ADDR1_OFFSET  36
+#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET)
+#define RTIT_CTL_ADDR2_OFFSET  40
+#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET)
+#define RTIT_CTL_ADDR3_OFFSET  44
+#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET)
 #define MSR_IA32_RTIT_STATUS   0x0571
+#define RTIT_STATUS_FILTEREN   BIT(0)
+#define RTIT_STATUS_CONTEXTEN  BIT(1)
+#define RTIT_STATUS_TRIGGEREN  BIT(2)
+#define RTIT_STATUS_BUFFOVF    BIT(3)
+#define RTIT_STATUS_ERROR  BIT(4)
+#define RTIT_STATUS_STOPPED    BIT(5)
 #define MSR_IA32_RTIT_ADDR0_A  0x0580
 #define MSR_IA32_RTIT_ADDR0_B  0x0581
#define MSR_IA32_RTIT_ADDR1_A  0x0582
@@ -115,6 +148,7 @@
#define MSR_IA32_RTIT_ADDR2_B  0x05

[PATCH v8 05/12] perf/x86/intel/pt: Introduce a new function to get capability of Intel PT

2018-05-14 Thread Luwei Kang
The new function pt_cap_decode() will be invoked in KVM to check
if a specific capability is available to a KVM guest.
The existing function pt_cap_get() can only check the hardware
capabilities, which may differ from what the guest sees because
some features may not be exposed to the guest.

Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/events/intel/pt.c  | 10 --
 arch/x86/include/asm/intel_pt.h |  2 ++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index f65f97a..18a2e80 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -76,14 +76,20 @@
PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
 };
 
-u32 pt_cap_get(enum pt_capabilities cap)
+u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap)
 {
struct pt_cap_desc *cd = &pt_caps[cap];
-   u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
+   u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
unsigned int shift = __ffs(cd->mask);
 
return (c & cd->mask) >> shift;
 }
+EXPORT_SYMBOL_GPL(pt_cap_decode);
+
+u32 pt_cap_get(enum pt_capabilities cap)
+{
+   return pt_cap_decode(pt_pmu.caps, cap);
+}
 EXPORT_SYMBOL_GPL(pt_cap_get);
 
 static ssize_t pt_cap_show(struct device *cdev,
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 2de4db0..9c71453 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -27,9 +27,11 @@ enum pt_capabilities {
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
 void cpu_emergency_stop_pt(void);
 extern u32 pt_cap_get(enum pt_capabilities cap);
+extern u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap);
 #else
 static inline void cpu_emergency_stop_pt(void) {}
 static inline u32 pt_cap_get(enum pt_capabilities cap) { return 0; }
static inline u32 pt_cap_decode(u32 *caps, enum pt_capabilities cap) { return 0; }
 #endif
 
 #endif /* _ASM_X86_INTEL_PT_H */
-- 
1.8.3.1
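
A usage sketch contrasting the two entry points; the vmx->pt_desc.caps
snapshot is the one KVM builds from guest CPUID elsewhere in this
series:

	/* Host view: decoded from the pt_pmu CPUID snapshot. */
	u32 host_ranges = pt_cap_get(PT_CAP_num_address_ranges);

	/* Guest view: decoded from the CPUID values KVM exposed. */
	u32 guest_ranges = pt_cap_decode(vmx->pt_desc.caps,
					 PT_CAP_num_address_ranges);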



[PATCH v8 06/12] KVM: x86: Add Intel Processor Trace virtualization mode

2018-05-14 Thread Luwei Kang
From: Chao Peng <chao.p.p...@linux.intel.com>

Intel PT virtualization can work in one of two possible modes:
a. system-wide: trace both host and guest and output to the host buffer;
b. host-guest: trace host and guest simultaneously and output to their
   respective buffers.
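
For illustration, host-guest mode would be selected via the new
module parameter when loading kvm-intel (system-wide remains the
default), e.g. "modprobe kvm-intel pt_mode=1".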

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Signed-off-by: Luwei Kang <luwei.k...@intel.com>
---
 arch/x86/include/asm/intel_pt.h  |  3 ++
 arch/x86/include/asm/msr-index.h |  1 +
 arch/x86/include/asm/vmx.h   |  8 +
 arch/x86/kvm/vmx.c   | 68 +---
 4 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 9c71453..5748205 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -5,6 +5,9 @@
 #define PT_CPUID_LEAVES2
#define PT_CPUID_REGS_NUM  4 /* number of registers (eax, ebx, ecx, edx) */
 
+#define PT_MODE_SYSTEM 0
+#define PT_MODE_HOST_GUEST 1
+
 enum pt_capabilities {
PT_CAP_max_subleaf = 0,
PT_CAP_cr3_filtering,
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index f163f04..96a1fc8 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -790,6 +790,7 @@
#define VMX_BASIC_INOUT 0x0040000000000000LLU
 
 /* MSR_IA32_VMX_MISC bits */
+#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
 #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
 #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
 /* AMD-V MSRs */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 5db8b0b..5936d72 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -76,7 +76,9 @@
 #define SECONDARY_EXEC_SHADOW_VMCS  0x00004000
 #define SECONDARY_EXEC_RDSEED_EXITING  0x00010000
 #define SECONDARY_EXEC_ENABLE_PML   0x00020000
+#define SECONDARY_EXEC_PT_CONCEAL_VMX  0x00080000
 #define SECONDARY_EXEC_XSAVES  0x00100000
+#define SECONDARY_EXEC_PT_USE_GPA  0x01000000
 #define SECONDARY_EXEC_TSC_SCALING  0x02000000
 
 #define PIN_BASED_EXT_INTR_MASK 0x0001
@@ -97,6 +99,8 @@
 #define VM_EXIT_LOAD_IA32_EFER  0x00200000
 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER   0x00400000
 #define VM_EXIT_CLEAR_BNDCFGS   0x00800000
+#define VM_EXIT_PT_CONCEAL_PIP 0x01000000
+#define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000
 
 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR  0x00036dff
 
@@ -108,6 +112,8 @@
 #define VM_ENTRY_LOAD_IA32_PAT 0x00004000
 #define VM_ENTRY_LOAD_IA32_EFER 0x00008000
 #define VM_ENTRY_LOAD_BNDCFGS   0x00010000
+#define VM_ENTRY_PT_CONCEAL_PIP 0x00020000
+#define VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000
 
 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x11ff
 
@@ -234,6 +240,8 @@ enum vmcs_field {
GUEST_PDPTR3_HIGH   = 0x2811,
GUEST_BNDCFGS   = 0x2812,
GUEST_BNDCFGS_HIGH  = 0x2813,
+   GUEST_IA32_RTIT_CTL = 0x2814,
+   GUEST_IA32_RTIT_CTL_HIGH= 0x2815,
HOST_IA32_PAT   = 0x2c00,
HOST_IA32_PAT_HIGH  = 0x2c01,
HOST_IA32_EFER  = 0x2c02,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 467cab4..ede5abf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -53,6 +53,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "trace.h"
 #include "pmu.h"
@@ -186,6 +187,10 @@
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 module_param(ple_window_max, uint, 0444);
 
+/* Default is SYSTEM mode. */
+static int __read_mostly pt_mode = PT_MODE_SYSTEM;
+module_param(pt_mode, int, S_IRUGO);
+
 extern const ulong vmx_return;
 
 struct kvm_vmx {
@@ -1511,6 +1516,20 @@ static bool vmx_umip_emulated(void)
SECONDARY_EXEC_DESC;
 }
 
+static inline bool cpu_has_vmx_intel_pt(void)
+{
+   u64 vmx_msr;
+
+   rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+   return !!(vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT);
+}
+
+static inline bool cpu_has_vmx_pt_use_gpa(void)
+{
+   return !!(vmcs_config.cpu_based_2nd_exec_ctrl &
+   SECONDARY_EXEC_PT_USE_GPA);
+}
+
 static inline bool report_flexpriority(void)
 {
return flexpriority_enabled;
@@ -4025,6 +4044,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_PT_USE_GPA
