On Wed, Sep 09, 2009 at 10:28:02AM +0800, Huang Ying wrote:
> UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> where some hardware error such as some memory error can be reported
> without PCC (processor context corrupted). To recover from such MCE,
> the corresponding memory will be unmapped, and all processes accessing
> the memory will be killed via SIGBUS.
>
> For KVM, if QEMU/KVM is killed, all guest processes will be killed
> too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> injection. Then guest OS can isolate corresponding memory and kill
> necessary guest processes only. SIGBUS sent to main thread (not VCPU
> threads) will be broadcast to all VCPU threads as UCR MCE.
>
> v2:
>
> - Use qemu_ram_addr_from_host instead of a self-made one to convert from
> host address to guest RAM address. Thanks Anthony Liguori.
>
> Signed-off-by: Huang Ying <[email protected]>
>
> ---
> cpu-common.h | 1
> exec.c | 20 +++++--
> qemu-kvm.c | 154
> ++++++++++++++++++++++++++++++++++++++++++++++++++----
> target-i386/cpu.h | 20 ++++++-
> 4 files changed, 178 insertions(+), 17 deletions(-)
>
> --- a/qemu-kvm.c
> +++ b/qemu-kvm.c
> @@ -27,10 +27,23 @@
> #include <sys/mman.h>
> #include <sys/ioctl.h>
> #include <signal.h>
> +#include <sys/signalfd.h>
> +#include <sys/prctl.h>
>
> #define false 0
> #define true 1
>
> +#ifndef PR_MCE_KILL
> +#define PR_MCE_KILL 33
> +#endif
> +
> +#ifndef BUS_MCEERR_AR
> +#define BUS_MCEERR_AR 4
> +#endif
> +#ifndef BUS_MCEERR_AO
> +#define BUS_MCEERR_AO 5
> +#endif
> +
> #define EXPECTED_KVM_API_VERSION 12
>
> #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
> @@ -1507,6 +1520,37 @@ static void sig_ipi_handler(int n)
> {
> }
>
> +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void
> *ctx)
> +{
> + if (siginfo->ssi_code == BUS_MCEERR_AO) {
> + uint64_t status;
> + unsigned long paddr;
> + CPUState *cenv;
> +
> + /* Hope we are lucky for AO MCE */
> + if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) {
> + fprintf(stderr, "Hardware memory error for memory used by "
> + "QEMU itself instead of guest system!: %llx\n",
> + (unsigned long long)siginfo->ssi_addr);
> + return;
Shouldn't qemu-kvm die here, since the error hit memory used by QEMU itself?
> + }
> + status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> + | 0xc0;
> + kvm_inject_x86_mce(first_cpu, 9, status,
> + MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
> + (MCM_ADDR_PHYS << 6) | 0xc);
> + for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
> + kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
> + MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0);
> + return;
Should we abort if kvm_inject_x86_mce fails?
> + } else if (siginfo->ssi_code == BUS_MCEERR_AR)
> + fprintf(stderr, "Hardware memory error!\n");
> + else
> + fprintf(stderr, "Internal error in QEMU!\n");
Can you re-raise SIGBUS here, so that we still get a core dump on a
non-MCE SIGBUS as usual?
> + exit(1);
> +}
> +
> static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
> {
> struct qemu_work_item wi;
> @@ -1649,29 +1693,102 @@ static void flush_queued_work(CPUState *
> pthread_cond_broadcast(&qemu_work_cond);
> }
>
> +static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
> +{
> +#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
> + struct kvm_x86_mce mce = {
> + .bank = 9,
> + };
> + unsigned long paddr;
> + int r;
> +
> + if (env->mcg_cap && siginfo->si_addr
> + && (siginfo->si_code == BUS_MCEERR_AR
> + || siginfo->si_code == BUS_MCEERR_AO)) {
> + if (siginfo->si_code == BUS_MCEERR_AR) {
> + /* Fake an Intel architectural Data Load SRAR UCR */
> + mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> + | MCI_STATUS_AR | 0x134;
> + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> + mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
> + } else {
> + /* Fake an Intel architectural Memory scrubbing UCR */
> + mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> + | 0xc0;
> + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> + mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
> + }
> + if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
> + fprintf(stderr, "Hardware memory error for memory used by "
> + "QEMU itself instaed of guest system!\n");
> + /* Hope we are lucky for AO MCE */
> + if (siginfo->si_code == BUS_MCEERR_AO)
> + return;
Shouldn't qemu-kvm die here too, rather than return in the AO case?
> + else
> + exit(1);
> + }
> + mce.addr = paddr;
> + r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
> + if (r < 0) {
> + fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
> + exit(1);
> + }
> + } else
> +#endif
> + {
> + if (siginfo->si_code == BUS_MCEERR_AO)
> + return;
> + if (siginfo->si_code == BUS_MCEERR_AR)
> + fprintf(stderr, "Hardware memory error!\n");
> + else
> + fprintf(stderr, "Internal error in QEMU!\n");
> + exit(1);
> + }
> +}
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html