The MSHV driver passes pages to the Microsoft Hypervisor (MSHV) for its
exclusive use. A kernel that is subsequently kexec'd into must not use
these pages, so we need to register them with KHO (Kexec HandOver).

- Adapt hv_call_deposit_pages() and hv_call_withdraw_memory() to
  register/unregister pages with the tracker
- Use KHO to preserve MSHV-owned pages across kexec

Signed-off-by: Jork Loeser <[email protected]>
---
 drivers/hv/Kconfig              |   3 +
 drivers/hv/Makefile             |   2 +-
 drivers/hv/hv_common.c          |   3 +
 drivers/hv/hv_proc.c            |  32 ++-
 drivers/hv/mshv_page_preserve.c | 374 ++++++++++++++++++++++++++++++++
 drivers/hv/mshv_page_preserve.h |  15 ++
 drivers/hv/mshv_root.h          |   1 +
 drivers/hv/mshv_root_hv_call.c  |  12 +-
 8 files changed, 434 insertions(+), 8 deletions(-)
 create mode 100644 drivers/hv/mshv_page_preserve.c
 create mode 100644 drivers/hv/mshv_page_preserve.h

diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 2d0b3fcb0ff8..0c4ffc1c701b 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -74,6 +74,9 @@ config MSHV_ROOT
        # e.g. When withdrawing memory, the hypervisor gives back 4k pages in
        # no particular order, making it impossible to reassemble larger pages
        depends on PAGE_SIZE_4KB
+       # Pages deposited to the hypervisor must be tracked and preserved
+       # across kexec to avoid memory corruption.
+       depends on KEXEC_HANDOVER
        select EVENTFD
        select VIRT_XFER_TO_GUEST_WORK
        select HMM_MIRROR
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index 888a748cc7cb..49526ae704f9 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -21,7 +21,7 @@ mshv_vtl-y := mshv_vtl_main.o
 
 # Code that must be built-in
 obj-$(CONFIG_HYPERV) += hv_common.o
-obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o
+obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_page_preserve.o
 ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),)
        obj-y += mshv_common.o
 endif
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 6b67ac616789..8a593117e9b8 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -30,6 +30,7 @@
 #include <linux/set_memory.h>
 #include <hyperv/hvhdk.h>
 #include <asm/mshyperv.h>
+#include "mshv_root.h"
 
 u64 hv_current_partition_id = HV_PARTITION_ID_SELF;
 EXPORT_SYMBOL_GPL(hv_current_partition_id);
@@ -382,6 +383,8 @@ int __init hv_common_init(void)
        if (hv_parent_partition()) {
                hv_synic_eventring_tail = alloc_percpu(u8 *);
                BUG_ON(!hv_synic_eventring_tail);
+
+               mshv_preserve_init();
        }
 
        hv_vp_index = kmalloc_array(nr_cpu_ids, sizeof(*hv_vp_index),
diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
index 57b2c64197cb..0392ea1f3cc5 100644
--- a/drivers/hv/hv_proc.c
+++ b/drivers/hv/hv_proc.c
@@ -8,6 +8,7 @@
 #include <linux/minmax.h>
 #include <linux/export.h>
 #include <asm/mshyperv.h>
+#include "mshv_root.h"
 
 /*
  * See struct hv_deposit_memory. The first u64 is partition ID, the rest
@@ -22,6 +23,7 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
        int *counts;
        int num_allocations;
        int i, j, page_count;
+       int reg_i = 0, reg_j = 0;
        int order;
        u64 status;
        int ret;
@@ -72,6 +74,18 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
        }
        num_allocations = i;
 
+       /* Register the pages for preservation across kexec */
+       for (i = 0; i < num_allocations; ++i) {
+               for (j = 0; j < counts[i]; ++j) {
+                       ret = mshv_register_preserve_page(pages[i] + j);
+                       if (ret) {
+                               reg_i = i;
+                               reg_j = j;
+                               goto err_unregister;
+                       }
+               }
+       }
+
        local_irq_save(flags);
 
        input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -90,19 +104,27 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
        if (!hv_result_success(status)) {
                hv_status_err(status, "\n");
                ret = hv_result_to_errno(status);
-               goto err_free_allocations;
+               reg_i = num_allocations;
+               goto err_unregister;
        }
 
        ret = 0;
        goto free_buf;
 
-err_free_allocations:
+err_unregister:
        for (i = 0; i < num_allocations; ++i) {
-               base_pfn = page_to_pfn(pages[i]);
-               for (j = 0; j < counts[i]; ++j)
-                       __free_page(pfn_to_page(base_pfn + j));
+               for (j = 0; j < counts[i]; ++j) {
+                       if (i == reg_i && j == reg_j)
+                               goto err_free_allocations;
+                       mshv_unregister_preserve_page(pages[i] + j);
+               }
        }
 
+err_free_allocations:
+       for (i = 0; i < num_allocations; ++i)
+               for (j = 0; j < counts[i]; ++j)
+                       __free_page(pages[i] + j);
+
 free_buf:
        free_page((unsigned long)pages);
        kfree(counts);
diff --git a/drivers/hv/mshv_page_preserve.c b/drivers/hv/mshv_page_preserve.c
new file mode 100644
index 000000000000..a79725a74663
--- /dev/null
+++ b/drivers/hv/mshv_page_preserve.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Preserve pages owned by Microsoft Hypervisor
+ *
+ * When handing pages to MSHV and kexec'ing, the next kernel needs to know which
+ * pages not to touch. This file handles that preservation.
+ *
+ * Copyright (C) 2026 Microsoft Corporation, Jork Loeser <[email protected]>
+ */
+
+#define pr_fmt(fmt) "mshv: " fmt
+
+#include <asm/mshyperv.h>
+#include <linux/kexec.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho_radix_tree.h>
+#include <linux/libfdt.h>
+#include <linux/reboot.h>
+#include "mshv_page_preserve.h"
+
+#define FDT_SUBTREE_MSHV "mshv_prsv_pt"        /* name of our KHO subtree */
+#define MSHV_KHO_COMPAT_STR "mshv_kho-v1"      /* FDT "compatible" value */
+
+static void *fdt_page;         /* page the FDT is serialized into */
+static struct kho_radix_tree preserved_pages_tree;     /* keyed by PFN */
+
+/**
+ * mshv_register_preserve_page() - Track a page for preservation by KHO
+ * @pg: pointer to the page to preserve
+ *
+ * Adds the PFN of @pg as a key to the tree of pages to preserve across kexec.
+ *
+ * Return: the kho_radix_add_key() result; callers treat 0 as success.
+ */
+int mshv_register_preserve_page(struct page *pg)
+{
+       return kho_radix_add_key(&preserved_pages_tree, page_to_pfn(pg));
+}
+EXPORT_SYMBOL_GPL(mshv_register_preserve_page);
+
+/**
+ * mshv_unregister_preserve_page() - Stop tracking a page for KHO preservation
+ * @pg: pointer to the page to unpreserve
+ *
+ * Removes the PFN of @pg from the tree of pages to preserve across kexec.
+ *
+ * Return: the kho_radix_del_key() result; callers treat 0 as success.
+ */
+int mshv_unregister_preserve_page(struct page *pg)
+{
+       return kho_radix_del_key(&preserved_pages_tree, page_to_pfn(pg));
+}
+EXPORT_SYMBOL_GPL(mshv_unregister_preserve_page);
+
+/* Tree-walk callback: preserve the single page whose PFN is @key with KHO */
+static int preserve_key_cb(unsigned long key, void *data)
+{
+       return kho_preserve_pages(pfn_to_page(key), 1);
+}
+
+/* Tree-walk callback: preserve the tree metadata page at @phys with KHO */
+static int preserve_table_cb(phys_addr_t phys, void *data)
+{
+       return kho_preserve_pages(phys_to_page(phys), 1);
+}
+
+static int create_fdt(void)
+{
+       int err;
+       void *fdt;
+       phys_addr_t root_table;
+
+       if (!fdt_page)                  /* no page to serialize into */
+               return -EINVAL;
+
+       fdt = fdt_page;
+
+       err = fdt_create(fdt, PAGE_SIZE);       /* FDT must fit in one page */
+       if (err)
+               return err;             /* fdt_*() results are returned as-is */
+       err = fdt_finish_reservemap(fdt);
+       if (err)
+               return err;
+       err = fdt_begin_node(fdt, "");          /* root node */
+       if (err)
+               return err;
+       err = fdt_property(fdt, "compatible", MSHV_KHO_COMPAT_STR,
+                          strlen(MSHV_KHO_COMPAT_STR) + 1);
+       if (err)
+               return err;
+       root_table = virt_to_phys(preserved_pages_tree.root);
+       err = fdt_property(fdt, "root_table", &root_table, sizeof(root_table));
+       if (err)
+               return err;
+       err = fdt_end_node(fdt);
+       if (err)
+               return err;
+       err = fdt_finish(fdt);
+       if (err)
+               return err;
+
+       return 0;
+}
+
+/**
+ * preserve_tree() - Preserve pages owned by Microsoft Hypervisor
+ *
+ * Called from the reboot notifier when a kexec is in progress. Freezes the
+ * tree since later changes cannot be communicated, then preserves the tracked
+ * pages, the tree's own pages and the FDT with KHO and registers the FDT.
+ *
+ * Return: 0 on success, non-zero error code on failure.
+ */
+static int preserve_tree(void)
+{
+       const struct kho_radix_walk_cb preserve_cb = {
+               .key = preserve_key_cb,
+               .table = preserve_table_cb,
+       };
+       int err;
+
+       err = kho_radix_tree_freeze(&preserved_pages_tree);
+       if (err) {
+               pr_warn("%s() - kho_radix_tree_freeze() failed: %d\n",
+                       __func__, err);
+               return err;
+       }
+
+       /* Populate the pre-allocated FDT page with current tree state */
+       err = create_fdt();
+       if (err) {
+               pr_warn("%s() - create_fdt() failed: %d\n", __func__, err);
+               return err;
+       }
+
+       /* Preserve both the tracked pages and the tree's metadata pages */
+       err = kho_radix_walk_tree(&preserved_pages_tree, &preserve_cb, NULL);
+       if (err) {
+               /* We could not preserve all pages and cannot kexec. */
+               pr_warn("%s() - kho_radix_walk_tree() failed: %d\n", __func__,
+                       err);
+               return err;
+       }
+
+       err = kho_preserve_pages(virt_to_page(fdt_page), 1);
+       if (err) {
+               pr_warn("%s() - kho_preserve_pages(fdt) failed: %d\n", __func__,
+                       err);
+               return err;
+       }
+
+       err = kho_add_subtree(FDT_SUBTREE_MSHV, fdt_page, PAGE_SIZE);
+       if (err) {
+               /* KHO will abort and undo all preservations. We cannot kexec. */
+               pr_warn("%s() - kho_add_subtree() failed: %d\n", __func__, err);
+               return err;
+       }
+
+       pr_debug("%s() - success\n", __func__);
+       return 0;
+}
+
+/*
+ * Reboot-callback triggering page preservation prior to kexec. Other reboots
+ * need no KHO preservation. Panics if preservation fails; returns NOTIFY_OK.
+ */
+static int reboot_cb(struct notifier_block *nb, unsigned long action,
+                    void *data)
+{
+       /* @action codes such as SYS_RESTART, SYS_HALT do not identify a kexec */
+       if (kexec_in_progress) {
+               int err;
+
+               /* Preserve the tracked pages and register the FDT with KHO */
+               pr_debug("%s() - KHO-preserving page tree\n", __func__);
+               err = preserve_tree();
+               if (err)
+                       panic("preserve_tree() failed - must not kexec: %d\n",
+                             err);
+       }
+       return NOTIFY_OK;
+}
+
+/**
+ * restore_tree() - Restore the page-tree state from KHO.
+ *
+ * Return: 0 on success, the kho_retrieve_subtree() error if the subtree is
+ *         unavailable, -EINVAL if the FDT is malformed or incompatible, or
+ *         if restoring the FDT page or initializing the tree fails.
+ */
+static int __init restore_tree(void)
+{
+       void *fdt;
+       phys_addr_t fdt_pa;
+       int len;
+       int node;
+       const phys_addr_t *root_table_fdt_ptr;
+       int err;
+
+       err = kho_retrieve_subtree(FDT_SUBTREE_MSHV, &fdt_pa, NULL);
+       if (err)
+               return err;
+
+       fdt = phys_to_virt(fdt_pa);
+       node = fdt_path_offset(fdt, "/");
+       if (node < 0) {
+               pr_err("Could not find root node in KHO-preserved FDT.\n");
+               return -EINVAL;
+       }
+
+       if (fdt_node_check_compatible(fdt, node, MSHV_KHO_COMPAT_STR)) {
+               /*
+                * This is unfortunate. We kexec'd into a kernel that isn't
+                * compatible with prior preservations. Pages this kernel
+                * considers available might actually be held by MSHV. The only
+                * recourse is to reboot.
+                */
+               const char *s = fdt_getprop(fdt, node, "compatible", &len);
+
+               if (s && len >= 0)
+                       pr_err("Incompatible kernel: Current is %s, preserved is %.*s\n",
+                              MSHV_KHO_COMPAT_STR, len, s);
+               else
+                       pr_err("Incompatible kernel: preserved misses 'compatible' mark.\n");
+               return -EINVAL;
+       }
+
+       root_table_fdt_ptr = fdt_getprop(fdt, node, "root_table", &len);
+       if (!root_table_fdt_ptr || len != sizeof(*root_table_fdt_ptr)) {
+               pr_err("Could not obtain root_table property from KHO-preserved FDT.\n");
+               return -EINVAL;
+       }
+
+       /* Restore struct page so it could be freed if needed */
+       if (!kho_restore_pages(fdt_pa, 1))
+               return -EINVAL;
+
+       fdt_page = phys_to_virt(fdt_pa);
+
+       err = kho_radix_init_tree(&preserved_pages_tree,
+                                 phys_to_virt(*root_table_fdt_ptr));
+       if (err)
+               return -EINVAL;
+
+       pr_debug("Restored tracking from KHO.\n");
+       return 0;
+}
+
+/*
+ * Tree-walk callback restoring a single tracked page via KHO during boot.
+ *
+ * Pages must be restored one at a time because they were deposited to
+ * the hypervisor individually and will be withdrawn individually later.
+ * Restoring them as a higher-order group would create compound pages
+ * that cannot be freed with __free_page().
+ */
+static int __init restore_key_cb(unsigned long key, void *data)
+{
+       if (!kho_restore_pages(PFN_PHYS(key), 1))
+               return -EINVAL;
+       return 0;
+}
+
+static int __init restore_table_cb(phys_addr_t phys, void *data)
+{
+       if (!kho_restore_pages(phys, 1))        /* one tree metadata page */
+               return -EINVAL;
+       return 0;
+}
+
+/**
+ * restore_page_structs() - Restore page-structs so they can be __free_page()'d
+ *
+ * This is necessary because KHO-preserved pages are in a "weird" state
+ * post-kexec. While doing so here in bulk adds to boot time, there is no vetted
+ * alternative that would allow doing this later, when we cannot say which pages
+ * had been freshly added, and which came into the tree through KHO.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+static int __init restore_page_structs(void)
+{
+       const struct kho_radix_walk_cb cb = {
+               .key = restore_key_cb,
+               .table = restore_table_cb,
+       };
+
+       return kho_radix_walk_tree(&preserved_pages_tree, &cb, NULL);
+}
+
+/**
+ * alloc_tree() - Allocate a fresh page tree and FDT page.
+ *
+ * Called when there is nothing to restore from KHO. Allocates the zeroed FDT
+ * page used to serialize state before kexec and initializes an empty tree.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+static int __init alloc_tree(void)
+{
+       int err;
+
+       fdt_page = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!fdt_page)
+               return -ENOMEM;
+
+       err = kho_radix_init_tree(&preserved_pages_tree, NULL);
+       if (err) {
+               free_page((unsigned long)fdt_page);
+               fdt_page = NULL;
+               return err;
+       }
+
+       return 0;
+}
+
+static struct notifier_block reboot_notifier = {
+       .notifier_call = reboot_cb,     /* preserves the tree when kexec'ing */
+       .priority = 0,
+};
+
+/**
+ * mshv_preserve_init() - Initialize the page preservation
+ *
+ * Upon return (unless KHO is disabled or allocating a fresh tree failed):
+ * - the tracker will be ready for use (restored post-kexec, or empty
+ *   post-reboot),
+ * - restored pages will be in a state that can be __free_page()'d,
+ * - the reboot notifier triggering preservation will be registered.
+ *
+ * Return: always 0; unrecoverable failures panic instead.
+ */
+int __init mshv_preserve_init(void)
+{
+       int err;
+
+       if (!kho_is_enabled()) {
+               pr_err("KHO is disabled; page deposits will fail.\n");
+               return 0;
+       }
+
+       err = restore_tree();
+       if (!err) {
+               /* Restore struct pages so they can be __free_page()'d */
+               if (restore_page_structs())
+                       /*
+                        * Unrestored struct pages would BUG when freed
+                        * at withdraw time.
+                        */
+                       panic("Failed to restore MSHV page structs\n");
+       } else if (err == -ENOENT) {
+               pr_debug("Nothing to restore from KHO.\n");
+               if (alloc_tree()) {
+                       pr_err("Could not allocate page tree; page deposits will fail.\n");
+                       return 0;
+               }
+       } else {
+               /*
+                * Pages from the prior kernel are held by MSHV but we
+                * lost track of them -- memory corruption is inevitable.
+                */
+               panic("Could not restore page tree from KHO: %d\n", err);
+       }
+
+       err = register_reboot_notifier(&reboot_notifier);
+       if (err)
+               /*
+                * Deposits would succeed but pages would not be preserved
+                * across kexec, causing memory corruption post-kexec.
+                */
+               panic("Could not register reboot notification: %d\n", err);
+
+       return 0;
+}
diff --git a/drivers/hv/mshv_page_preserve.h b/drivers/hv/mshv_page_preserve.h
new file mode 100644
index 000000000000..0609002e5f1d
--- /dev/null
+++ b/drivers/hv/mshv_page_preserve.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2026 Microsoft Corporation, Jork Loeser <[email protected]>
+ */
+
+#ifndef _MSHV_PAGE_PRESERVE_H
+#define _MSHV_PAGE_PRESERVE_H
+
+struct page;
+
+int mshv_preserve_init(void);
+int mshv_register_preserve_page(struct page *pg);
+int mshv_unregister_preserve_page(struct page *pg);
+
+#endif /* _MSHV_PAGE_PRESERVE_H */
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 1f086dcb7aa1..362768786c17 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -18,6 +18,7 @@
 #include <linux/mmu_notifier.h>
 #include <uapi/linux/mshv.h>
 #include "mshv_trace.h"
+#include "mshv_page_preserve.h"
 
 /*
  * Hypervisor must be between these version numbers (inclusive)
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index cb55d4d4be2e..f5ff03318787 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -69,8 +69,16 @@ int hv_call_withdraw_memory(u64 count, int node, u64 partition_id)
 
                completed = hv_repcomp(status);
 
-               for (i = 0; i < completed; i++)
-                       __free_page(pfn_to_page(output_page->gpa_page_list[i]));
+               for (i = 0; i < completed; i++) {
+                       struct page *pg = pfn_to_page(output_page->gpa_page_list[i]);
+                       int res = mshv_unregister_preserve_page(pg);
+
+                       WARN_ONCE(res, "Failed to unregister PFN %#llx\n",
+                                 output_page->gpa_page_list[i]);
+
+                       /* Free regardless -- HV has already released the page */
+                       __free_page(pg);
+               }
 
                if (!hv_result_success(status)) {
                        if (hv_result(status) == HV_STATUS_NO_RESOURCES)
-- 
2.43.0


Reply via email to