From: David Woodhouse <[email protected]>

ICEBP (INT1, opcode 0xF1) generates a #DB that is architecturally a
trap, but on SVM it was not always intercepted. Unconditionally
intercept ICEBP on SVM to match VMX behaviour and ensure correct
event delivery semantics.

Add two selftests exercising ICEBP:

 - int1_ept_test: verifies that ICEBP works correctly when the
   exception stack page is not present (EPT/NPT fault during #DB
   delivery). The IST stack is evicted via MADV_DONTNEED before
   executing INT1.

 - int1_task_gate_test: verifies ICEBP delivery through a 32-bit
   task gate, exercising the legacy task-switch path for #DB.

Tested on Intel Sapphire Rapids and AMD Genoa. Without the SVM fix,
int1_task_gate_test fails on AMD with EIP pointing at ICEBP instead
of after it. With the fix, both tests pass on both platforms.

Signed-off-by: David Woodhouse <[email protected]>
---
 arch/x86/kvm/svm/svm.c                        |  21 ++
 tools/testing/selftests/kvm/Makefile.kvm      |   2 +
 .../testing/selftests/kvm/x86/int1_ept_test.c | 116 +++++++
 .../selftests/kvm/x86/int1_task_gate_test.c   | 298 ++++++++++++++++++
 4 files changed, 437 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86/int1_ept_test.c
 create mode 100644 tools/testing/selftests/kvm/x86/int1_task_gate_test.c

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e7fdd7a9c280..a820d47a4bca 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1159,6 +1159,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool 
init_event)
        svm_set_intercept(svm, INTERCEPT_SKINIT);
        svm_set_intercept(svm, INTERCEPT_WBINVD);
        svm_set_intercept(svm, INTERCEPT_XSETBV);
+       svm_set_intercept(svm, INTERCEPT_ICEBP);
        svm_set_intercept(svm, INTERCEPT_RDPRU);
        svm_set_intercept(svm, INTERCEPT_RSM);
 
@@ -2045,6 +2046,25 @@ static int bp_interception(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int icebp_interception(struct kvm_vcpu *vcpu)
+{
+       /*
+        * ICEBP (INT1, opcode 0xF1) generates #DB as a trap. Intercept it
+        * unconditionally so that RIP is advanced past the instruction
+        * *before* #DB is injected. This is necessary because SVM reports
+        * the wrong RIP for ICEBP-induced #DB when delivered via a task
+        * gate: RIP points at the ICEBP instruction instead of after it.
+        *
+        * By intercepting ICEBP, NRIP (next_rip) gives us the correct
+        * post-instruction RIP. Advancing RIP first and then injecting #DB
+        * ensures consistent behaviour regardless of how the guest's IDT
+        * delivers the exception.
+        */
+       svm_skip_emulated_instruction(vcpu);
+       kvm_queue_exception(vcpu, DB_VECTOR);
+       return 1;
+}
+
 static int ud_interception(struct kvm_vcpu *vcpu)
 {
        return handle_ud(vcpu);
@@ -3333,6 +3353,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu 
*vcpu) = {
        [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
        [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
        [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
+       [SVM_EXIT_ICEBP]                        = icebp_interception,
        [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
        [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
        [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
diff --git a/tools/testing/selftests/kvm/Makefile.kvm 
b/tools/testing/selftests/kvm/Makefile.kvm
index 9118a5a51b89..1b89bf0e5cf1 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -86,6 +86,8 @@ TEST_GEN_PROGS_x86 += x86/hyperv_features
 TEST_GEN_PROGS_x86 += x86/hyperv_ipi
 TEST_GEN_PROGS_x86 += x86/hyperv_svm_test
 TEST_GEN_PROGS_x86 += x86/hyperv_tlb_flush
+TEST_GEN_PROGS_x86 += x86/int1_ept_test
+TEST_GEN_PROGS_x86 += x86/int1_task_gate_test
 TEST_GEN_PROGS_x86 += x86/kvm_clock_test
 TEST_GEN_PROGS_x86 += x86/kvm_pv_test
 TEST_GEN_PROGS_x86 += x86/kvm_buslock_test
diff --git a/tools/testing/selftests/kvm/x86/int1_ept_test.c 
b/tools/testing/selftests/kvm/x86/int1_ept_test.c
new file mode 100644
index 000000000000..1ced77ffeeb9
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/int1_ept_test.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test that KVM correctly handles an EPT/NPT fault during delivery of
+ * a #DB exception caused by INT1 (ICEBP, opcode 0xF1).
+ *
+ * The guest executes INT1, which generates #DB. The IDT entry for #DB
+ * uses IST1, pointing to a stack page whose backing has been evicted
+ * via MADV_DONTNEED. The CPU takes an EPT/NPT violation when pushing
+ * the exception frame. KVM must handle the fault and reinject #DB.
+ */
+#include <sys/mman.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define IST_STACK_GPA  0xd0000000ull
+#define IST_STACK_SIZE PAGE_SIZE
+/* IST1 offset in the 64-bit TSS */
+#define TSS64_IST1_OFFSET      0x24
+
+static volatile bool db_handler_called;
+static volatile u64 db_handler_rip;
+
+static void guest_db_handler(struct ex_regs *regs)
+{
+       db_handler_called = true;
+       db_handler_rip = regs->rip;
+}
+
+static void guest_code(void)
+{
+       unsigned long expected_rip;
+
+       db_handler_called = false;
+
+       /*
+        * Execute INT1 (ICEBP). This generates #DB as a trap, so RIP
+        * in the exception frame points after the 0xF1 byte.
+        */
+       asm volatile("lea 1f(%%rip), %0\n\t"
+                    ".byte 0xf1\n\t"
+                    "1:"
+                    : "=r"(expected_rip) :: "memory");
+
+       GUEST_ASSERT(db_handler_called);
+       GUEST_ASSERT_EQ(db_handler_rip, expected_rip);
+       GUEST_SYNC(0);
+}
+
+int main(int argc, char *argv[])
+{
+       struct kvm_vcpu *vcpu;
+       struct kvm_vm *vm;
+       struct ucall uc;
+       void *ist_hva;
+       unsigned int nr_pages;
+       struct idt_entry *idt;
+
+       vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+       /* Install #DB handler using the normal exception handler table */
+       vm_install_exception_handler(vm, DB_VECTOR, guest_db_handler);
+
+       /*
+        * Allocate a separate stack page for IST1. Map it into the guest
+        * so the IDT entry can reference it.
+        */
+       nr_pages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, IST_STACK_SIZE);
+       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+                                   IST_STACK_GPA, 1, nr_pages, 0);
+       virt_map(vm, IST_STACK_GPA, IST_STACK_GPA, nr_pages);
+       ist_hva = addr_gpa2hva(vm, IST_STACK_GPA);
+
+       /*
+        * Write IST1 in the TSS to point to the top of the IST stack.
+        * TSS64 layout: offset 0x24 = IST1 (8 bytes).
+        */
+       {
+               void *tss_hva = addr_gva2hva(vm, vm->arch.tss);
+               u64 ist1_val = IST_STACK_GPA + IST_STACK_SIZE;
+
+               memcpy(tss_hva + TSS64_IST1_OFFSET, &ist1_val, 
sizeof(ist1_val));
+       }
+
+       /*
+        * Set the #DB IDT entry to use IST1. The handler address stays
+        * the same (the framework's idt_handlers stub for vector 1).
+        */
+       idt = (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt);
+       idt[DB_VECTOR].ist = 1;
+
+       /*
+        * Evict the IST stack page from the EPT/NPT so that when the CPU
+        * tries to push the #DB exception frame, it takes an EPT violation.
+        */
+       madvise(ist_hva, IST_STACK_SIZE, MADV_DONTNEED);
+
+       /* Run the guest — it should execute INT1, fault, get reinjected, 
succeed */
+       vcpu_run(vcpu);
+       TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+       switch (get_ucall(vcpu, &uc)) {
+       case UCALL_ABORT:
+               REPORT_GUEST_ASSERT(uc);
+               break;
+       case UCALL_SYNC:
+               pr_info("INT1 with EPT fault on IST stack: PASSED\n");
+               break;
+       default:
+               TEST_FAIL("Unexpected ucall");
+       }
+
+       kvm_vm_free(vm);
+       return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86/int1_task_gate_test.c 
b/tools/testing/selftests/kvm/x86/int1_task_gate_test.c
new file mode 100644
index 000000000000..4485445ee39d
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/int1_task_gate_test.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test ICEBP (INT1) with a 32-bit task gate for #DB delivery.
+ *
+ * On SVM, when ICEBP causes #DB and #DB is delivered via a task gate,
+ * the saved RIP points at the ICEBP instruction rather than after it.
+ * This is a hardware quirk that the hypervisor must work around by
+ * intercepting ICEBP and advancing RIP before the task switch.
+ *
+ * This test sets up a 32-bit protected mode guest with:
+ *   - A #DB IDT entry configured as a task gate (type 5)
+ *   - A TSS for the #DB handler task
+ *   - Guest code that executes ICEBP (0xF1)
+ *
+ * The #DB handler task records the EIP from the back-link TSS and
+ * writes it to a known memory location. The test verifies that EIP
+ * points after the ICEBP instruction, not at it.
+ */
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <linux/kvm.h>
+
+#define KVM_DEV        "/dev/kvm"
+#define PAGE_SIZE 4096
+
+/* Guest physical memory layout (all below 1MB for simplicity) */
+#define GDT_BASE       0x1000
+#define IDT_BASE       0x2000
+#define MAIN_TSS_BASE  0x3000
+#define DB_TSS_BASE    0x4000
+#define CODE_BASE      0x5000
+#define RESULT_BASE    0x6000
+#define MAIN_STACK     0x8000  /* top of main task stack */
+#define DB_STACK       0x9000  /* top of #DB task stack */
+
+/* GDT selectors */
+#define SEL_CODE32     0x08
+#define SEL_DATA32     0x10
+#define SEL_MAIN_TSS   0x18
+#define SEL_DB_TSS     0x20
+
+/* 32-bit GDT entry */
+struct gdt_entry {
+       u16 limit_lo;
+       u16 base_lo;
+       u8  base_mid;
+       u8  access;
+       u8  limit_hi_flags;
+       u8  base_hi;
+} __attribute__((packed));
+
+/* 32-bit IDT task gate entry */
+struct idt_task_gate {
+       u16 reserved0;
+       u16 tss_selector;
+       u8  reserved1;
+       u8  type_attr;  /* P=1, DPL=0, type=0x5 (task gate) */
+       u16 reserved2;
+} __attribute__((packed));
+
+/* 32-bit TSS */
+struct tss32 {
+       u16 back_link, __blh;
+       u32 esp0;
+       u16 ss0, __ss0h;
+       u32 esp1;
+       u16 ss1, __ss1h;
+       u32 esp2;
+       u16 ss2, __ss2h;
+       u32 cr3;
+       u32 eip;
+       u32 eflags;
+       u32 eax, ecx, edx, ebx;
+       u32 esp, ebp, esi, edi;
+       u16 es, __esh;
+       u16 cs, __csh;
+       u16 ss, __ssh;
+       u16 ds, __dsh;
+       u16 fs, __fsh;
+       u16 gs, __gsh;
+       u16 ldt, __ldth;
+       u16 trace;
+       u16 iomap_base;
+} __attribute__((packed));
+
+static void set_gdt_entry(struct gdt_entry *e, u32 base, u32 limit,
+                         u8 access, u8 flags)
+{
+       e->limit_lo = limit & 0xffff;
+       e->base_lo = base & 0xffff;
+       e->base_mid = (base >> 16) & 0xff;
+       e->access = access;
+       e->limit_hi_flags = ((limit >> 16) & 0xf) | (flags << 4);
+       e->base_hi = (base >> 24) & 0xff;
+}
+
+/*
+ * Guest code (32-bit protected mode, assembled as bytes).
+ *
+ * The main task executes ICEBP. The #DB task gate switches to the
+ * DB handler TSS. The DB handler reads the back-link TSS to get the
+ * saved EIP, writes it to RESULT_BASE, then halts.
+ */
+
+/* Main task code: just execute ICEBP then halt */
+static const u8 main_code[] = {
+       0xf1,                   /* ICEBP (INT1) */
+       0xf4,                   /* HLT - should not reach here if task gate 
works */
+};
+
+/*
+ * #DB handler task code:
+ *   - Read back_link from our TSS (at DB_TSS_BASE offset 0) to get main TSS 
selector
+ *   - The main TSS (at MAIN_TSS_BASE) has saved EIP at offset 0x20
+ *   - Read that EIP and store at RESULT_BASE
+ *   - Store 0xCAFE at RESULT_BASE+4 as "handler ran" marker
+ *   - HLT
+ */
+static const u8 db_handler_code[] = {
+       /* mov eax, [MAIN_TSS_BASE + 0x20]  -- saved EIP in main TSS */
+       0xa1, (MAIN_TSS_BASE + 0x20) & 0xff, ((MAIN_TSS_BASE + 0x20) >> 8) & 
0xff, 0x00, 0x00,
+       /* mov [RESULT_BASE], eax */
+       0xa3, RESULT_BASE & 0xff, (RESULT_BASE >> 8) & 0xff, 0x00, 0x00,
+       /* mov dword [RESULT_BASE+4], 0xCAFE */
+       0xc7, 0x05, (RESULT_BASE + 4) & 0xff, ((RESULT_BASE + 4) >> 8) & 0xff, 
0x00, 0x00,
+       0xfe, 0xca, 0x00, 0x00,
+       /* hlt */
+       0xf4,
+};
+
+#define DB_HANDLER_OFFSET 0x100        /* offset within CODE_BASE page */
+
+static int kvm_ioctl(int fd, unsigned long cmd, void *arg)
+{
+       int ret = ioctl(fd, cmd, arg);
+       if (ret < 0) {
+               perror("KVM ioctl");
+               exit(1);
+       }
+       return ret;
+}
+
+int main(void)
+{
+       int kvm_fd, vm_fd, vcpu_fd;
+       struct kvm_userspace_memory_region region;
+       struct kvm_sregs sregs;
+       struct kvm_regs regs;
+       struct kvm_run *run;
+       void *mem;
+       size_t mmap_size;
+       struct gdt_entry *gdt;
+       struct idt_task_gate *idt;
+       struct tss32 *main_tss, *db_tss;
+       u32 *result;
+       u32 expected_eip;
+
+       kvm_fd = open(KVM_DEV, O_RDWR);
+       if (kvm_fd < 0) { perror("open /dev/kvm"); return 1; }
+
+       vm_fd = kvm_ioctl(kvm_fd, KVM_CREATE_VM, 0);
+       vcpu_fd = kvm_ioctl(vm_fd, KVM_CREATE_VCPU, 0);
+
+       /* Allocate guest memory — single 1MB region */
+       mem = mmap(NULL, 0x100000, PROT_READ | PROT_WRITE,
+                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       if (mem == MAP_FAILED) { perror("mmap"); return 1; }
+       memset(mem, 0, 0x100000);
+
+       region = (struct kvm_userspace_memory_region){
+               .slot = 0,
+               .guest_phys_addr = 0,
+               .memory_size = 0x100000,
+               .userspace_addr = (u64)mem,
+       };
+       kvm_ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
+
+       /* Set up GDT */
+       gdt = (struct gdt_entry *)(mem + GDT_BASE);
+       /* Null descriptor */
+       memset(&gdt[0], 0, sizeof(gdt[0]));
+       /* Code32: base=0, limit=0xfffff, 32-bit, present, code r/x */
+       set_gdt_entry(&gdt[1], 0, 0xfffff, 0x9a, 0xc);
+       /* Data32: base=0, limit=0xfffff, 32-bit, present, data r/w */
+       set_gdt_entry(&gdt[2], 0, 0xfffff, 0x92, 0xc);
+       /* Main TSS: base=MAIN_TSS_BASE, limit=sizeof(tss32)-1, present, TSS32 
available */
+       set_gdt_entry(&gdt[3], MAIN_TSS_BASE, sizeof(struct tss32) - 1, 0x89, 
0x0);
+       /* DB TSS: base=DB_TSS_BASE, limit=sizeof(tss32)-1, present, TSS32 
available */
+       set_gdt_entry(&gdt[4], DB_TSS_BASE, sizeof(struct tss32) - 1, 0x89, 
0x0);
+
+       /* Set up IDT — task gate for #DB (vector 1) */
+       idt = (struct idt_task_gate *)(mem + IDT_BASE);
+       idt[1].tss_selector = SEL_DB_TSS;
+       idt[1].type_attr = 0x85;  /* P=1, DPL=0, type=5 (32-bit task gate) */
+
+       /* Set up main TSS */
+       main_tss = (struct tss32 *)(mem + MAIN_TSS_BASE);
+       main_tss->cr3 = 0;  /* no paging */
+       main_tss->eip = CODE_BASE;
+       main_tss->eflags = 0x02;
+       main_tss->esp = MAIN_STACK;
+       main_tss->cs = SEL_CODE32;
+       main_tss->ds = SEL_DATA32;
+       main_tss->es = SEL_DATA32;
+       main_tss->ss = SEL_DATA32;
+       main_tss->fs = SEL_DATA32;
+       main_tss->gs = SEL_DATA32;
+
+       /* Set up #DB handler TSS */
+       db_tss = (struct tss32 *)(mem + DB_TSS_BASE);
+       db_tss->cr3 = 0;
+       db_tss->eip = CODE_BASE + DB_HANDLER_OFFSET;
+       db_tss->eflags = 0x02;
+       db_tss->esp = DB_STACK;
+       db_tss->cs = SEL_CODE32;
+       db_tss->ds = SEL_DATA32;
+       db_tss->es = SEL_DATA32;
+       db_tss->ss = SEL_DATA32;
+       db_tss->fs = SEL_DATA32;
+       db_tss->gs = SEL_DATA32;
+
+       /* Copy guest code */
+       memcpy(mem + CODE_BASE, main_code, sizeof(main_code));
+       memcpy(mem + CODE_BASE + DB_HANDLER_OFFSET, db_handler_code, 
sizeof(db_handler_code));
+
+       /* Expected EIP: after the 0xF1 byte */
+       expected_eip = CODE_BASE + 1;
+
+       /* Set up vCPU registers for 32-bit protected mode (no paging) */
+       kvm_ioctl(vcpu_fd, KVM_GET_SREGS, &sregs);
+       sregs.cr0 = 0x11;  /* PE + ET, no PG */
+       sregs.cr3 = 0;
+       sregs.cr4 = 0;
+
+       sregs.gdt.base = GDT_BASE;
+       sregs.gdt.limit = 5 * 8 - 1;
+       sregs.idt.base = IDT_BASE;
+       sregs.idt.limit = 256 * 8 - 1;
+
+       sregs.cs.base = 0; sregs.cs.limit = 0xfffff; sregs.cs.selector = 
SEL_CODE32;
+       sregs.cs.type = 0xb; sregs.cs.present = 1; sregs.cs.db = 1; sregs.cs.s 
= 1; sregs.cs.g = 1;
+       sregs.ds.base = 0; sregs.ds.limit = 0xfffff; sregs.ds.selector = 
SEL_DATA32;
+       sregs.ds.type = 3; sregs.ds.present = 1; sregs.ds.db = 1; sregs.ds.s = 
1; sregs.ds.g = 1;
+       sregs.es = sregs.ss = sregs.fs = sregs.gs = sregs.ds;
+
+       sregs.tr.base = MAIN_TSS_BASE; sregs.tr.limit = sizeof(struct tss32) - 
1;
+       sregs.tr.selector = SEL_MAIN_TSS; sregs.tr.type = 0xb; /* 32-bit TSS 
busy */
+       sregs.tr.present = 1;
+
+       kvm_ioctl(vcpu_fd, KVM_SET_SREGS, &sregs);
+
+       memset(&regs, 0, sizeof(regs));
+       regs.rip = CODE_BASE;
+       regs.rsp = MAIN_STACK;
+       regs.rflags = 0x02;
+       kvm_ioctl(vcpu_fd, KVM_SET_REGS, &regs);
+
+       /* Run the guest */
+       mmap_size = kvm_ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+       run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 
vcpu_fd, 0);
+
+       for (int i = 0; i < 100; i++) {
+               kvm_ioctl(vcpu_fd, KVM_RUN, 0);
+
+               if (run->exit_reason == KVM_EXIT_HLT)
+                       break;
+
+               if (run->exit_reason != KVM_EXIT_INTERNAL_ERROR &&
+                   run->exit_reason != KVM_EXIT_DEBUG) {
+                       fprintf(stderr, "Unexpected exit reason: %d\n", 
run->exit_reason);
+                       return 1;
+               }
+       }
+
+       /* Check results */
+       result = (u32 *)(mem + RESULT_BASE);
+       if (result[1] != 0xCAFE) {
+               fprintf(stderr, "FAIL: #DB handler did not run 
(marker=0x%x)\n", result[1]);
+               return 1;
+       }
+
+       printf("Saved EIP: 0x%x, expected: 0x%x\n", result[0], expected_eip);
+       if (result[0] == expected_eip) {
+               printf("PASS: EIP points after ICEBP\n");
+               return 0;
+       } else if (result[0] == expected_eip - 1) {
+               printf("FAIL: EIP points AT ICEBP (known SVM bug with task 
gates)\n");
+               return 1;
+       } else {
+               printf("FAIL: EIP is unexpected\n");
+               return 1;
+       }
+}
-- 
2.43.0


Attachment: smime.p7s
Description: S/MIME cryptographic signature

Reply via email to