Map any populated pages into the hypervisor upfront when creating a movable region, rather than waiting for faults. Previously, movable regions were created with all pages marked as HV_MAP_GPA_NO_ACCESS, regardless of whether the userspace mapping already contained populated pages.

This guarantees that when the caller passes a populated mapping, the present pages are mapped into the hypervisor immediately during region creation instead of being faulted in later. For writable regions, pages that are present but not writable in the host page tables (e.g. shared zero pages) are left as no-access mappings to preserve copy-on-write semantics; they are faulted in on demand. The region is processed in bounded chunks to avoid soft lockups and livelock from concurrent invalidations.

Signed-off-by: Stanislav Kinsburskii <[email protected]>
---
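Notes below the fold, not intended for the commit message:

Chunk-size arithmetic: with 4 KiB pages, MSHV_MAX_PFN_BATCH = SZ_2M / PAGE_SIZE = 512 PFNs, so each mshv_region_collect_and_map() call covers at most 2 MiB of the region before the loop drops its locks and calls cond_resched().

The standalone userspace sketch below illustrates the three page states the new path distinguishes (absent, present-and-writable, present-but-read-only shared zero page). It is illustrative only: it uses plain mmap(2) and mincore(2), does not touch the mshv ioctl interface, and everything beyond those two syscalls is made up for the example.

#define _GNU_SOURCE	/* MAP_POPULATE, mincore() */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

/* Count resident pages; mincore() shows residency, not writability. */
static void report(const char *tag, void *addr, size_t len)
{
	size_t pages = len / sysconf(_SC_PAGESIZE), resident = 0;
	unsigned char *vec = malloc(pages);

	if (!vec || mincore(addr, len, vec)) {
		perror("mincore");
		free(vec);
		return;
	}
	for (size_t i = 0; i < pages; i++)
		resident += vec[i] & 1;
	printf("%-10s %zu/%zu pages resident\n", tag, resident, pages);
	free(vec);
}

int main(void)
{
	size_t len = 2UL << 20;	/* one MSHV_MAX_PFN_BATCH worth: 2 MiB */
	long pgsz = sysconf(_SC_PAGESIZE);

	/*
	 * No PTEs yet: region creation would map all of this as
	 * HV_MAP_GPA_NO_ACCESS and rely on demand faults (the old
	 * behavior for the whole region).
	 */
	void *lazy = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/*
	 * MAP_POPULATE pre-faults the pages writable: the "present and
	 * writable" case that this patch now maps upfront.
	 */
	void *eager = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);

	/*
	 * Read faults on anonymous memory install the shared zero page:
	 * present but read-only, so a writable region leaves these
	 * no-access rather than bypassing COW.
	 */
	void *cow = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (lazy == MAP_FAILED || eager == MAP_FAILED || cow == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	for (size_t off = 0; off < len; off += pgsz)
		(void)*(volatile char *)((char *)cow + off);

	report("lazy:", lazy, len);
	report("populated:", eager, len);
	report("zero-page:", cow, len);
	return 0;
}

On a 4 KiB-page host this should print 0/512 resident for the lazy mapping and 512/512 for the other two; only the MAP_POPULATE mapping's PTEs are also writable, which is the distinction the collect path keys on via HMM_PFN_WRITE.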
 drivers/hv/mshv_regions.c   | 126 +++++++++++++++++++++++++++++++++----------
 drivers/hv/mshv_root.h      |   1 +
 drivers/hv/mshv_root_main.c |  10 +---------
 3 files changed, 98 insertions(+), 39 deletions(-)

diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index d9e33e9ef8550..85f8b7bddf939 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -17,8 +17,12 @@
 
 #include "mshv_root.h"
 
+/* Process memory regions in chunks to avoid soft lockups and livelock */
+#define MSHV_MAX_PFN_BATCH	(SZ_2M / PAGE_SIZE)
+
 #define MSHV_MAP_FAULT_IN_PAGES \
 	(PTRS_PER_PMD * max_t(unsigned long, 1, PAGE_SIZE / HV_HYP_PAGE_SIZE))
+
 #define MSHV_INVALID_PFN ULONG_MAX
 
 static inline bool mshv_pfn_valid(unsigned long pfn)
@@ -450,13 +454,16 @@ int mshv_region_get(struct mshv_mem_region *region)
 /**
  * mshv_region_hmm_fault_and_lock - Fault in pages across VMAs and lock
  *				    the memory region
- * @region: Pointer to the memory region structure
- * @start : Starting virtual address of the range to fault (inclusive)
- * @end   : Ending virtual address of the range to fault (exclusive)
- * @pfns  : Output array for page frame numbers with HMM flags
+ * @region  : Pointer to the memory region structure
+ * @start   : Starting virtual address of the range to fault (inclusive)
+ * @end     : Ending virtual address of the range to fault (exclusive)
+ * @pfns    : Output array for page frame numbers with HMM flags
+ * @do_fault: If true, fault in missing pages; if false, snapshot only
+ *	      pages already present in page tables
  *
- * Iterates through VMAs covering [start, end), faulting in pages via
- * hmm_range_fault() for each VMA segment. Write faults are requested
+ * Iterates through VMAs covering [start, end), collecting page frame
+ * numbers via hmm_range_fault() for each VMA segment. When @do_fault
+ * is true, missing pages are faulted in and write faults are requested
  * only when both the VMA and the hypervisor mapping permit writes, to
  * avoid breaking copy-on-write semantics on read-only mappings.
  *
@@ -469,7 +476,8 @@ int mshv_region_get(struct mshv_mem_region *region)
 static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
 					   unsigned long start,
 					   unsigned long end,
-					   unsigned long *pfns)
+					   unsigned long *pfns,
+					   bool do_fault)
 {
 	struct hmm_range range = {
 		.notifier = &region->mreg_mni,
@@ -491,18 +499,22 @@ static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
 		range.hmm_pfns = pfns;
 		range.start = start;
 		range.end = min(vma->vm_end, end);
-		range.default_flags = HMM_PFN_REQ_FAULT;
-		/*
-		 * Only request writable pages from HMM when both the
-		 * VMA and the hypervisor mapping allow writes. Without
-		 * this, hmm_range_fault() would trigger COW on read-only
-		 * mappings (e.g. shared zero pages, file-backed pages),
-		 * breaking copy-on-write semantics and potentially
-		 * granting the guest write access to shared host pages.
-		 */
-		if ((vma->vm_flags & VM_WRITE) &&
-		    (region->hv_map_flags & HV_MAP_GPA_WRITABLE))
-			range.default_flags |= HMM_PFN_REQ_WRITE;
+		range.default_flags = 0;
+		if (do_fault) {
+			range.default_flags = HMM_PFN_REQ_FAULT;
+			/*
+			 * Only request writable pages from HMM when both
+			 * the VMA and the hypervisor mapping allow writes.
+			 * Without this, hmm_range_fault() would trigger
+			 * COW on read-only mappings (e.g. shared zero
+			 * pages, file-backed pages), breaking
+			 * copy-on-write semantics and potentially granting
+			 * the guest write access to shared host pages.
+			 */
+			if ((vma->vm_flags & VM_WRITE) &&
+			    (region->hv_map_flags & HV_MAP_GPA_WRITABLE))
+				range.default_flags |= HMM_PFN_REQ_WRITE;
+		}
 
 		ret = hmm_range_fault(&range);
 		if (ret)
@@ -527,19 +539,33 @@ static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
 }
 
 /**
- * mshv_region_range_fault - Handle memory range faults for a given region.
- * @region: Pointer to the memory region structure.
- * @pfn_offset: Offset of the page within the region.
- * @pfn_count: Number of pages to handle.
+ * mshv_region_collect_and_map - Collect PFNs for a user range and map them
+ * @region    : memory region being processed
+ * @pfn_offset: PFNs offset within the region
+ * @pfn_count : number of PFNs to process
+ * @do_fault  : if true, fault in missing pages;
+ *		if false, collect only present pages
  *
- * This function resolves memory faults for a specified range of pages
- * within a memory region. It uses HMM (Heterogeneous Memory Management)
- * to fault in the required pages and updates the region's page array.
+ * Collects PFNs for the specified portion of @region from the
+ * corresponding userspace VMAs and maps them into the hypervisor. The
+ * behavior depends on @do_fault:
  *
- * Return: 0 on success, negative error code on failure.
+ * - true:  Fault in missing pages from userspace, ensuring all pages in the
+ *	    range are present. Used for on-demand page population.
+ * - false: Collect PFNs only for pages already present in userspace,
+ *	    leaving missing pages as invalid PFN markers.
+ *	    Used for initial region setup.
+ *
+ * Collected PFNs are stored in region->mreg_pfns[] with HMM bookkeeping
+ * flags cleared, then the range is mapped into the hypervisor. Present
+ * PFNs get mapped with region access permissions; missing PFNs (invalid
+ * entries) get mapped with no-access permissions.
+ *
+ * Return: 0 on success, negative errno on failure.
  */
-static int mshv_region_range_fault(struct mshv_mem_region *region,
-				   u64 pfn_offset, u64 pfn_count)
+static int mshv_region_collect_and_map(struct mshv_mem_region *region,
+				       u64 pfn_offset, u64 pfn_count,
+				       bool do_fault)
 {
 	unsigned long start, end;
 	unsigned long *pfns;
@@ -555,7 +581,7 @@ static int mshv_region_range_fault(struct mshv_mem_region *region,
 
 	do {
 		ret = mshv_region_hmm_fault_and_lock(region, start, end,
-						     pfns);
+						     pfns, do_fault);
 	} while (ret == -EBUSY);
 
 	if (ret)
@@ -564,6 +590,11 @@ static int mshv_region_range_fault(struct mshv_mem_region *region,
 	for (i = 0; i < pfn_count; i++) {
 		if (!(pfns[i] & HMM_PFN_VALID))
 			continue;
+		/* Skip read-only pages to avoid bypassing COW */
+		if (!do_fault &&
+		    (region->hv_map_flags & HV_MAP_GPA_WRITABLE) &&
+		    !(pfns[i] & HMM_PFN_WRITE))
+			continue;
 		/* Drop HMM_PFN_* flags to ensure PFNs are valid. */
 		region->mreg_pfns[pfn_offset + i] = pfns[i] & ~HMM_PFN_FLAGS;
 	}
@@ -577,6 +608,13 @@ static int mshv_region_range_fault(struct mshv_mem_region *region,
 	return ret;
 }
 
+static int mshv_region_range_fault(struct mshv_mem_region *region,
+				   u64 pfn_offset, u64 pfn_count)
+{
+	return mshv_region_collect_and_map(region, pfn_offset, pfn_count,
+					   true);
+}
+
 bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn)
 {
 	u64 pfn_offset, pfn_count;
@@ -764,3 +802,31 @@ int mshv_map_pinned_region(struct mshv_mem_region *region)
 err_out:
 	return ret;
 }
+
+/*
+ * mshv_map_movable_region - Map a movable memory region to the hypervisor
+ * @region: The memory region to map
+ *
+ * Maps the entire movable region by processing it in bounded chunks to avoid
+ * soft lockups from holding mmap_read_lock() too long and to prevent livelock
+ * if concurrent memory invalidations force restarts.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int mshv_map_movable_region(struct mshv_mem_region *region)
+{
+	u64 pfn, count;
+	int ret;
+
+	for (pfn = 0; pfn < region->nr_pfns; pfn += MSHV_MAX_PFN_BATCH) {
+		count = min_t(u64, MSHV_MAX_PFN_BATCH, region->nr_pfns - pfn);
+
+		ret = mshv_region_collect_and_map(region, pfn, count, false);
+		if (ret)
+			return ret;
+
+		cond_resched();
+	}
+
+	return 0;
+}
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 2a4eff27917f2..0f4fc57a14cd0 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -378,5 +378,6 @@ bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
 void mshv_region_movable_fini(struct mshv_mem_region *region);
 bool mshv_region_movable_init(struct mshv_mem_region *region);
 int mshv_map_pinned_region(struct mshv_mem_region *region);
+int mshv_map_movable_region(struct mshv_mem_region *region);
 
 #endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 4af2b98738ee2..e38438c539c5d 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1391,15 +1391,7 @@ mshv_map_user_memory(struct mshv_partition *partition,
 		ret = mshv_map_pinned_region(region);
 		break;
 	case MSHV_REGION_TYPE_MEM_MOVABLE:
-		/*
-		 * For movable memory regions, remap with no access to let
-		 * the hypervisor track dirty pages, enabling pre-copy live
-		 * migration.
-		 */
-		ret = hv_call_map_ram_pfns(partition->pt_id,
-					   region->start_gfn,
-					   region->nr_pfns,
-					   HV_MAP_GPA_NO_ACCESS, NULL);
+		ret = mshv_map_movable_region(region);
 		break;
 	case MSHV_REGION_TYPE_MMIO:
 		ret = hv_call_map_mmio_pfns(partition->pt_id,

