Add MCE support to KVM. The related MSRs are emulated. A new vcpu
ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation such as
the mcg_cap. MCE is injected via vcpu ioctl command
KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
not implemented.

Signed-off-by: Huang Ying <[email protected]>

---
 arch/x86/include/asm/kvm_host.h |    5 
 arch/x86/include/asm/mce.h      |    1 
 arch/x86/kvm/x86.c              |  202 +++++++++++++++++++++++++++++++++++-----
 include/linux/kvm.h             |   15 ++
 4 files changed, 199 insertions(+), 24 deletions(-)

--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,6 +42,7 @@
 #include <asm/msr.h>
 #include <asm/desc.h>
 #include <asm/mtrr.h>
+#include <asm/mce.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS                                              \
@@ -734,23 +735,43 @@ static int set_msr_mtrr(struct kvm_vcpu 
        return 0;
 }
 
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+       unsigned bank_num = mcg_cap & 0xff;
+
        switch (msr) {
-       case MSR_EFER:
-               set_efer(vcpu, data);
-               break;
-       case MSR_IA32_MC0_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
-                      __func__, data);
-               break;
        case MSR_IA32_MCG_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
-                       __func__, data);
+               vcpu->arch.mcg_status = data;
                break;
        case MSR_IA32_MCG_CTL:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
-                       __func__, data);
+               if (!(mcg_cap & MCG_CTL_P))
+                       return 1;
+               if (data != 0 && data != ~(u64)0)
+                       return -1;
+               vcpu->arch.mcg_ctl = data;
+               break;
+       default:
+               if (msr >= MSR_IA32_MC0_CTL &&
+                   msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       /* only 0 or all 1s can be written to IA32_MCi_CTL */
+                       if ((offset & 0x3) == 0 &&
+                           data != 0 && data != ~(u64)0)
+                               return -1;
+                       vcpu->arch.mce_banks[offset] = data;
+                       break;
+               }
+               return 1;
+       }
+       return 0;
+}
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+       switch (msr) {
+       case MSR_EFER:
+               set_efer(vcpu, data);
                break;
        case MSR_IA32_DEBUGCTLMSR:
                if (!data) {
@@ -807,6 +828,8 @@ int kvm_set_msr_common(struct kvm_vcpu *
                break;
        }
        default:
+               if (!set_msr_mce(vcpu, msr, data))
+                   break;
                pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
                return 1;
        }
@@ -861,26 +884,49 @@ static int get_msr_mtrr(struct kvm_vcpu 
        return 0;
 }
 
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
        u64 data;
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+       unsigned bank_num = mcg_cap & 0xff;
 
        switch (msr) {
-       case 0xc0010010: /* SYSCFG */
-       case 0xc0010015: /* HWCR */
-       case MSR_IA32_PLATFORM_ID:
        case MSR_IA32_P5_MC_ADDR:
        case MSR_IA32_P5_MC_TYPE:
-       case MSR_IA32_MC0_CTL:
-       case MSR_IA32_MCG_STATUS:
+               data = 0;
+               break;
        case MSR_IA32_MCG_CAP:
+               data = vcpu->arch.mcg_cap;
+               break;
        case MSR_IA32_MCG_CTL:
-       case MSR_IA32_MC0_MISC:
-       case MSR_IA32_MC0_MISC+4:
-       case MSR_IA32_MC0_MISC+8:
-       case MSR_IA32_MC0_MISC+12:
-       case MSR_IA32_MC0_MISC+16:
-       case MSR_IA32_MC0_MISC+20:
+               if (!(mcg_cap & MCG_CTL_P))
+                       return 1;
+               data = vcpu->arch.mcg_ctl;
+               break;
+       case MSR_IA32_MCG_STATUS:
+               data = vcpu->arch.mcg_status;
+               break;
+       default:
+               if (msr >= MSR_IA32_MC0_CTL &&
+                   msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       data = vcpu->arch.mce_banks[offset];
+                       break;
+               }
+               return 1;
+       }
+       *pdata = data;
+       return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+       u64 data;
+
+       switch (msr) {
+       case 0xc0010010: /* SYSCFG */
+       case 0xc0010015: /* HWCR */
+       case MSR_IA32_PLATFORM_ID:
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_EBL_CR_POWERON:
        case MSR_IA32_DEBUGCTLMSR:
@@ -921,6 +967,8 @@ int kvm_get_msr_common(struct kvm_vcpu *
                data = vcpu->arch.time;
                break;
        default:
+               if (!get_msr_mce(vcpu, msr, &data))
+                       break;
                pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
                return 1;
        }
@@ -1443,6 +1491,87 @@ static int vcpu_ioctl_tpr_access_reporti
        return 0;
 }
 
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+                                       u64 mcg_cap)
+{
+       int r;
+       unsigned bank_num = mcg_cap & 0xff, bank;
+       u64 *banks;
+
+       r = -EINVAL;
+       if (!bank_num)
+               goto out;
+       r = -ENOMEM;
+       banks = kzalloc(bank_num * sizeof(u64) * 4, GFP_KERNEL);
+       if (!banks)
+               goto out;
+       r = 0;
+       vcpu->arch.mce_banks = banks;
+       vcpu->arch.mcg_cap = mcg_cap;
+       /* Init IA32_MCG_CTL to all 1s */
+       if (mcg_cap & MCG_CTL_P)
+               vcpu->arch.mcg_ctl = ~(u64)0;
+       /* Init IA32_MCi_CTL to all 1s */
+       for (bank = 0; bank < bank_num; bank++)
+               vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+out:
+       return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+                                     struct kvm_x86_mce *mce)
+{
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+       unsigned bank_num = mcg_cap & 0xff;
+       u64 *banks = vcpu->arch.mce_banks;
+
+       if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+               return -EINVAL;
+       /*
+        * if IA32_MCG_CTL is not all 1s, the uncorrected error
+        * reporting is disabled
+        */
+       if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+           vcpu->arch.mcg_ctl != ~(u64)0)
+               return 0;
+       banks += 4 * mce->bank;
+       /*
+        * if IA32_MCi_CTL is not all 1s, the uncorrected error
+        * reporting is disabled for the bank
+        */
+       if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+               return 0;
+       if (mce->status & MCI_STATUS_UC) {
+               u64 status = mce->status;
+               if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+                   !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+                       printk(KERN_DEBUG "kvm: set_mce: "
+                              "injects mce exception while "
+                              "previous one is in progress!\n");
+                       set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+                       return 0;
+               }
+               if (banks[1] & MCI_STATUS_VAL)
+                       status |= MCI_STATUS_OVER;
+               banks[1] = mce->status;
+               banks[2] = mce->addr;
+               banks[3] = mce->misc;
+               vcpu->arch.mcg_status = mce->mcg_status;
+               kvm_queue_exception(vcpu, MC_VECTOR);
+       } else if (!(banks[1] & MCI_STATUS_VAL) ||
+                  (!(banks[1] & MCI_STATUS_UC) &&
+                   !((mcg_cap & MCG_TES_P) && ((banks[1]>>53) & 0x3) < 2))) {
+               u64 status = mce->status;
+               if (banks[1] & MCI_STATUS_VAL)
+                       status |= MCI_STATUS_OVER;
+               banks[1] = mce->status;
+               banks[2] = mce->addr;
+               banks[3] = mce->misc;
+       } else
+               banks[1] |= MCI_STATUS_OVER;
+       return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
 {
@@ -1576,6 +1705,31 @@ long kvm_arch_vcpu_ioctl(struct file *fi
                kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
                break;
        }
+       case KVM_X86_SETUP_MCE: {
+               u64 mcg_cap;
+
+               r = -EFAULT;
+               if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+                       goto out;
+               /*
+                * extended machine-check state registers and CMCI are
+                * not supported.
+                */
+               mcg_cap &= ~(MCG_EXT_P|MCG_CMCI_P);
+               if (copy_to_user(argp, &mcg_cap, sizeof mcg_cap))
+                       goto out;
+               r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+               break;
+       }
+       case KVM_X86_SET_MCE: {
+               struct kvm_x86_mce mce;
+
+               r = -EFAULT;
+               if (copy_from_user(&mce, argp, sizeof mce))
+                       goto out;
+               r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+               break;
+       }
        default:
                r = -EINVAL;
        }
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -371,6 +371,11 @@ struct kvm_vcpu_arch {
        unsigned long dr6;
        unsigned long dr7;
        unsigned long eff_db[KVM_NR_DB_REGS];
+
+       u64 mcg_cap;
+       u64 mcg_status;
+       u64 mcg_ctl;
+       u64 *mce_banks;
 };
 
 struct kvm_mem_alias {
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -292,6 +292,18 @@ struct kvm_guest_debug {
        struct kvm_guest_debug_arch arch;
 };
 
+/* x86 MCE */
+struct kvm_x86_mce {
+       __u64 status;
+       __u64 addr;
+       __u64 misc;
+       __u64 mcg_status;
+       __u8 bank;
+       __u8 pad1;
+       __u16 pad2;
+       __u32 pad3;
+};
+
 #define KVM_TRC_SHIFT           16
 /*
  * kvm trace categories
@@ -528,6 +540,9 @@ struct kvm_irq_routing {
 #define KVM_NMI                   _IO(KVMIO,  0x9a)
 /* Available with KVM_CAP_SET_GUEST_DEBUG */
 #define KVM_SET_GUEST_DEBUG       _IOW(KVMIO,  0x9b, struct kvm_guest_debug)
+/* MCE for x86 */
+#define KVM_X86_SETUP_MCE           _IOWR(KVMIO,  0x9a, __u64)
+#define KVM_X86_SET_MCE           _IOW(KVMIO,  0x9b, struct kvm_x86_mce)
 
 /*
  * Deprecated interfaces
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -13,6 +13,7 @@
 #define MCG_CTL_P       (1UL<<8)   /* MCG_CAP register available */
 #define MCG_EXT_P       (1ULL<<9)   /* Extended registers available */
 #define MCG_CMCI_P      (1ULL<<10)  /* CMCI supported */
+#define MCG_TES_P       (1ULL<<11)  /* Threshold-based error status */
 
 #define MCG_STATUS_RIPV  (1UL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV  (1UL<<1)   /* ip points to correct instruction */

Attachment: signature.asc
Description: This is a digitally signed message part

Reply via email to