On Mon, 05 May 2025 17:14:07 +0100, Jiaqi Yan <jiaqi...@google.com> wrote: > > When APEI fails to handle a stage2 abort that is synchrnous external > abort (SEA),
nit: "a stage-2 synchronous external abort". > today KVM directly injects an async SError to the VCPU > then resumes it, which usually results in unpleasant guest kernel panic. Which is a perfectly legal thing to do, and not a violation of the architecture. > One major situation of guest SEA is when vCPU consumes recoverable > uncorrected memory error (UER). Although SError and guest kernel panic > effectively stops the propagation of corrupted memory, there is still > room to recover from memory UER in a more graceful manner. "there is room to recover from an UER..." > > Alternatively KVM can redirect the synchronous SEA event to VMM to > - Reduce blast radius if possible. VMM can inject a SEA to VCPU via > KVM's existing KVM_SET_VCPU_EVENTS API. If the memory poison > consumption or fault is not from guest kernel, blast radius can be > limited to the triggering thread in guest userspace, so VM can > keep running. > - VMM can protect from future memory poison consumption by unmapping > the page from stage-2 with KVM userfault [1]. VMM can also > track SEA events that VM customer cares about, restart VM when > certain number of distinct poison events happened, provide > observability to customers [2]. > > Introduce following userspace-visible features to make VMM handle SEA: > - KVM_CAP_ARM_SEA_TO_USER. As the alternative fallback behavior > when host APEI fails to claim a SEA, userspace can opt in this new > capability to let KVM exit to userspace during synchronous abort. > - KVM_EXIT_ARM_SEA. A new exit reason is introduced for this, and > KVM fills kvm_run.arm_sea with as much as possible information about > the SEA, including > - ESR_EL2. > - If faulting guest virtual and physical addresses are available. > - Faulting guest virtual address if available. > - Faulting guest physical address if available. > > [1] > https://lpc.events/event/18/contributions/1757/attachments/1442/3073/LPC_%20KVM%20Userfault.pdf > [2] https://cloud.google.com/solutions/sap/docs/manage-host-errors I really don't think we need these link in a commit message (they are likely to vanish, specially the second one). Either the information is pertinent and it needs to be added to the commit message, or removed altogether. I don't think the SAP thing makes much sense as is. > > Signed-off-by: Jiaqi Yan <jiaqi...@google.com> > --- > arch/arm64/include/asm/kvm_emulate.h | 12 +++++++ > arch/arm64/include/asm/kvm_host.h | 8 +++++ > arch/arm64/include/asm/kvm_ras.h | 21 ++++------- > arch/arm64/kvm/Makefile | 3 +- > arch/arm64/kvm/arm.c | 5 +++ > arch/arm64/kvm/kvm_ras.c | 54 ++++++++++++++++++++++++++++ > arch/arm64/kvm/mmu.c | 12 ++----- > include/uapi/linux/kvm.h | 11 ++++++ > 8 files changed, 101 insertions(+), 25 deletions(-) > create mode 100644 arch/arm64/kvm/kvm_ras.c > > diff --git a/arch/arm64/include/asm/kvm_emulate.h > b/arch/arm64/include/asm/kvm_emulate.h > index bd020fc28aa9c..a9de30478a088 100644 > --- a/arch/arm64/include/asm/kvm_emulate.h > +++ b/arch/arm64/include/asm/kvm_emulate.h > @@ -429,6 +429,18 @@ static __always_inline bool kvm_vcpu_abt_issea(const > struct kvm_vcpu *vcpu) > } > } > > +/* Return true if FAR holds valid faulting guest virtual address. */ > +static inline bool kvm_vcpu_sea_far_valid(const struct kvm_vcpu *vcpu) > +{ > + return !(kvm_vcpu_get_esr(vcpu) & ESR_ELx_FnV); > +} > + > +/* Return true if HPFAR_EL2 holds valid faulting guest physical address. */ > +static inline bool kvm_vcpu_sea_ipa_valid(const struct kvm_vcpu *vcpu) > +{ > + return vcpu->arch.fault.hpfar_el2 & HPFAR_EL2_NS; > +} > + > static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) > { > u64 esr = kvm_vcpu_get_esr(vcpu); > diff --git a/arch/arm64/include/asm/kvm_host.h > b/arch/arm64/include/asm/kvm_host.h > index 73b7762b0e7d1..e0129f9799f80 100644 > --- a/arch/arm64/include/asm/kvm_host.h > +++ b/arch/arm64/include/asm/kvm_host.h > @@ -342,6 +342,14 @@ struct kvm_arch { > #define KVM_ARCH_FLAG_GUEST_HAS_SVE 9 > /* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */ > #define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS 10 > + /* > + * When APEI failed to claim stage-2 synchronous external abort > + * (SEA) return to userspace with fault information. Userspace > + * can opt in this feature if KVM_CAP_ARM_SEA_TO_USER is > + * supported. Userspace is encouraged to handle this VM exit > + * by injecting a SEA to VCPU before resume the VCPU. > + */ > +#define KVM_ARCH_FLAG_RETURN_SEA_TO_USER 11 > unsigned long flags; > > /* VM-wide vCPU feature set */ > diff --git a/arch/arm64/include/asm/kvm_ras.h > b/arch/arm64/include/asm/kvm_ras.h > index 9398ade632aaf..a2fd91af8f97e 100644 > --- a/arch/arm64/include/asm/kvm_ras.h > +++ b/arch/arm64/include/asm/kvm_ras.h > @@ -4,22 +4,15 @@ > #ifndef __ARM64_KVM_RAS_H__ > #define __ARM64_KVM_RAS_H__ > > -#include <linux/acpi.h> > -#include <linux/errno.h> > -#include <linux/types.h> > - > -#include <asm/acpi.h> > +#include <linux/kvm_host.h> > > /* > - * Was this synchronous external abort a RAS notification? > - * Returns '0' for errors handled by some RAS subsystem, or -ENOENT. > + * Handle stage2 synchronous external abort (SEA) in the following order: > + * 1. Delegate to APEI/GHES and if they can claim SEA, resume guest. > + * 2. If userspace opt-ed in KVM_CAP_ARM_SEA_TO_USER, exit to userspace > + * with details about the SEA. > + * 3. Otherwise, inject async SError into the VCPU and resume guest. > */ > -static inline int kvm_handle_guest_sea(void) > -{ > - /* apei_claim_sea(NULL) expects to mask interrupts itself */ > - lockdep_assert_irqs_enabled(); > - > - return apei_claim_sea(NULL); > -} > +int kvm_handle_guest_sea(struct kvm_vcpu *vcpu); > > #endif /* __ARM64_KVM_RAS_H__ */ > diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile > index 209bc76263f10..785d568411e88 100644 > --- a/arch/arm64/kvm/Makefile > +++ b/arch/arm64/kvm/Makefile > @@ -23,7 +23,8 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ > vgic/vgic-v3.o vgic/vgic-v4.o \ > vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \ > vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ > - vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o > + vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o \ > + kvm_ras.o > > kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o > kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c > index 19ca57def6292..47544945fba45 100644 > --- a/arch/arm64/kvm/arm.c > +++ b/arch/arm64/kvm/arm.c > @@ -133,6 +133,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, > } > mutex_unlock(&kvm->lock); > break; > + case KVM_CAP_ARM_SEA_TO_USER: > + r = 0; > + set_bit(KVM_ARCH_FLAG_RETURN_SEA_TO_USER, &kvm->arch.flags); > + break; > default: > break; > } > @@ -322,6 +326,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long > ext) > case KVM_CAP_IRQFD_RESAMPLE: > case KVM_CAP_COUNTER_OFFSET: > case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: > + case KVM_CAP_ARM_SEA_TO_USER: > r = 1; > break; > case KVM_CAP_SET_GUEST_DEBUG2: > diff --git a/arch/arm64/kvm/kvm_ras.c b/arch/arm64/kvm/kvm_ras.c > new file mode 100644 > index 0000000000000..83f2731c95d77 > --- /dev/null > +++ b/arch/arm64/kvm/kvm_ras.c I don't think we need a new file for so little code. Please leave it with the MMU code for now. > @@ -0,0 +1,54 @@ > +// SPDX-License-Identifier: GPL-2.0-only > + > +#include <linux/acpi.h> > +#include <linux/types.h> > +#include <asm/acpi.h> > +#include <asm/kvm_emulate.h> > +#include <asm/kvm_ras.h> > +#include <asm/system_misc.h> > + > +/* > + * Was this synchronous external abort a RAS notification? > + * Returns 0 for errors handled by some RAS subsystem, or -ENOENT. > + */ > +static int kvm_delegate_guest_sea(void) > +{ > + /* apei_claim_sea(NULL) expects to mask interrupts itself. */ > + lockdep_assert_irqs_enabled(); > + return apei_claim_sea(NULL); > +} > + > +int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) > +{ > + struct kvm_run *run = vcpu->run; > + bool exit = test_bit(KVM_ARCH_FLAG_RETURN_SEA_TO_USER, > + &vcpu->kvm->arch.flags); > + > + /* For RAS the host kernel may handle this abort. */ > + if (kvm_delegate_guest_sea() == 0) > + return 1; > + > + if (!exit) { Move the condition here, since nothing else evaluates "exit". > + /* Fallback behavior prior to KVM_EXIT_ARM_SEA. */ > + kvm_inject_vabt(vcpu); > + return 1; > + } > + > + run->exit_reason = KVM_EXIT_ARM_SEA; > + run->arm_sea.esr = kvm_vcpu_get_esr(vcpu); Do we *always* want to report this without any sanitisation? ESR_EL2 contains a lot of things that we may want to hide. For example, what do we do on a SEA generated by a S1PTW? I don't think it makes any sense to report it as such, if only because userspace knows nothing of stage-2. > + run->arm_sea.flags = 0ULL; > + run->arm_sea.gva = 0ULL; > + run->arm_sea.gpa = 0ULL; > + > + if (kvm_vcpu_sea_far_valid(vcpu)) { > + run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GVA_VALID; > + run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu); > + } Under which circumstances is the guest VA of interest to userspace? I can imagine *something*, but it'd be good to document it. > + > + if (kvm_vcpu_sea_ipa_valid(vcpu)) { > + run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID; > + run->arm_sea.gpa = kvm_vcpu_get_fault_ipa(vcpu); > + } > + > + return 0; > +} > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c > index 754f2fe0cc673..a605ee56fa150 100644 > --- a/arch/arm64/kvm/mmu.c > +++ b/arch/arm64/kvm/mmu.c > @@ -1795,16 +1795,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) > int ret, idx; > > /* Synchronous External Abort? */ > - if (kvm_vcpu_abt_issea(vcpu)) { > - /* > - * For RAS the host kernel may handle this abort. > - * There is no need to pass the error into the guest. > - */ > - if (kvm_handle_guest_sea()) > - kvm_inject_vabt(vcpu); > - > - return 1; > - } > + if (kvm_vcpu_abt_issea(vcpu)) > + return kvm_handle_guest_sea(vcpu); > > esr = kvm_vcpu_get_esr(vcpu); > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index b6ae8ad8934b5..79dc4676ff74b 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -178,6 +178,7 @@ struct kvm_xen_exit { > #define KVM_EXIT_NOTIFY 37 > #define KVM_EXIT_LOONGARCH_IOCSR 38 > #define KVM_EXIT_MEMORY_FAULT 39 > +#define KVM_EXIT_ARM_SEA 40 > > /* For KVM_EXIT_INTERNAL_ERROR */ > /* Emulate instruction failed. */ > @@ -446,6 +447,15 @@ struct kvm_run { > __u64 gpa; > __u64 size; > } memory_fault; > + /* KVM_EXIT_ARM_SEA */ > + struct { > + __u64 esr; > +#define KVM_EXIT_ARM_SEA_FLAG_GVA_VALID (1ULL << 0) > +#define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID (1ULL << 1) > + __u64 flags; > + __u64 gva; > + __u64 gpa; > + } arm_sea; > /* Fix the size of the union. */ > char padding[256]; > }; > @@ -930,6 +940,7 @@ struct kvm_enable_cap { > #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 > #define KVM_CAP_X86_GUEST_MODE 238 > #define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239 > +#define KVM_CAP_ARM_SEA_TO_USER 240 > > struct kvm_irq_routing_irqchip { > __u32 irqchip; Thanks, M. -- Without deviation from the norm, progress is not possible.