On Sat, Aug 30, 2025 at 1:03 AM Michael Kelley <mhkli...@outlook.com> wrote: > > From: Vitaly Kuznetsov <vkuzn...@redhat.com> Sent: Thursday, August 28, 2025 > 2:16 AM > > > > Azure CVM instance types featuring a paravisor hang upon kdump. The > > investigation shows that makedumpfile causes a hang when it steps on a page > > which was previously share with the host > > (HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY). The new kernel has no > > knowledge of these 'special' regions (which are Vmbus connection pages, > > GPADL buffers, ...). There are several ways to approach the issue: > > - Convey the knowledge about these regions to the new kernel somehow. > > - Unshare these regions before accessing in the new kernel (it is unclear > > if there's a way to query the status for a given GPA range). > > - Unshare these regions before jumping to the new kernel (which this patch > > implements). > > > > To make the procedure as robust as possible, store PFN ranges of shared > > regions in a linked list instead of storing GVAs and re-using > > hv_vtom_set_host_visibility(). This also allows to avoid memory allocation > > on the kdump/kexec path. > > > > Signed-off-by: Vitaly Kuznetsov <vkuzn...@redhat.com> > > Looks good! > > Adding Confidential Computing mailing list (linux-c...@lists.linux.dev) > for visibility. > > Reviewed-by: Michael Kelley <mhkli...@outlook.com> > > > --- > > Changes since v3 [Michael Kelley]: > > - Employ x86_platform.guest.enc_kexec_{begin,finish} hooks. > > - Don't use spinlock in what's now hv_vtom_kexec_finish(). > > - Handle possible hypercall failures in hv_mark_gpa_visibility() > > symmetrically; change hv_list_enc_remove() to return -ENOMEM as well. > > - Rebase to the latest hyperv/next. > > --- > > arch/x86/hyperv/ivm.c | 211 +++++++++++++++++++++++++++++++++++++++++- > > 1 file changed, 210 insertions(+), 1 deletion(-) > > > > diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c > > index ade6c665c97e..a4615b889f3e 100644 > > --- a/arch/x86/hyperv/ivm.c > > +++ b/arch/x86/hyperv/ivm.c > > @@ -462,6 +462,195 @@ void hv_ivm_msr_read(u64 msr, u64 *value) > > hv_ghcb_msr_read(msr, value); > > } > > > > +/* > > + * Keep track of the PFN regions which were shared with the host. The > > access > > + * must be revoked upon kexec/kdump (see hv_ivm_clear_host_access()). > > + */ > > +struct hv_enc_pfn_region { > > + struct list_head list; > > + u64 pfn; > > + int count; > > +}; > > + > > +static LIST_HEAD(hv_list_enc); > > +static DEFINE_RAW_SPINLOCK(hv_list_enc_lock); > > + > > +static int hv_list_enc_add(const u64 *pfn_list, int count) > > +{ > > + struct hv_enc_pfn_region *ent; > > + unsigned long flags; > > + u64 pfn; > > + int i; > > + > > + for (i = 0; i < count; i++) { > > + pfn = pfn_list[i]; > > + > > + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); > > + /* Check if the PFN already exists in some region first */ > > + list_for_each_entry(ent, &hv_list_enc, list) { > > + if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 > > >= pfn)) > > + /* Nothing to do - pfn is already in the list > > */ > > + goto unlock_done; > > + } > > + > > + /* > > + * Check if the PFN is adjacent to an existing region. Growing > > + * a region can make it adjacent to another one but merging is > > + * not (yet) implemented for simplicity. A PFN cannot be added > > + * to two regions to keep the logic in hv_list_enc_remove() > > + * correct. > > + */ > > + list_for_each_entry(ent, &hv_list_enc, list) { > > + if (ent->pfn + ent->count == pfn) { > > + /* Grow existing region up */ > > + ent->count++; > > + goto unlock_done; > > + } else if (pfn + 1 == ent->pfn) { > > + /* Grow existing region down */ > > + ent->pfn--; > > + ent->count++; > > + goto unlock_done; > > + } > > + } > > + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); > > + > > + /* No adjacent region found -- create a new one */ > > + ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL); > > + if (!ent) > > + return -ENOMEM; > > + > > + ent->pfn = pfn; > > + ent->count = 1; > > + > > + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); > > + list_add(&ent->list, &hv_list_enc); > > + > > +unlock_done: > > + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); > > + } > > + > > + return 0; > > +} > > + > > +static int hv_list_enc_remove(const u64 *pfn_list, int count) > > +{ > > + struct hv_enc_pfn_region *ent, *t; > > + struct hv_enc_pfn_region new_region; > > + unsigned long flags; > > + u64 pfn; > > + int i; > > + > > + for (i = 0; i < count; i++) { > > + pfn = pfn_list[i]; > > + > > + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); > > + list_for_each_entry_safe(ent, t, &hv_list_enc, list) { > > + if (pfn == ent->pfn + ent->count - 1) { > > + /* Removing tail pfn */ > > + ent->count--; > > + if (!ent->count) { > > + list_del(&ent->list); > > + kfree(ent); > > + } > > + goto unlock_done; > > + } else if (pfn == ent->pfn) { > > + /* Removing head pfn */ > > + ent->count--; > > + ent->pfn++; > > + if (!ent->count) { > > + list_del(&ent->list); > > + kfree(ent); > > + } > > + goto unlock_done; > > + } else if (pfn > ent->pfn && pfn < ent->pfn + > > ent->count - 1) { > > + /* > > + * Removing a pfn in the middle. Cut off the > > tail > > + * of the existing region and create a > > template for > > + * the new one. > > + */ > > + new_region.pfn = pfn + 1; > > + new_region.count = ent->count - (pfn - > > ent->pfn + 1); > > + ent->count = pfn - ent->pfn; > > + goto unlock_split; > > + } > > + > > + } > > +unlock_done: > > + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); > > + continue; > > + > > +unlock_split: > > + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); > > + > > + ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL); > > + if (!ent) > > + return -ENOMEM; > > + > > + ent->pfn = new_region.pfn; > > + ent->count = new_region.count; > > + > > + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); > > + list_add(&ent->list, &hv_list_enc); > > + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); > > + } > > + > > + return 0; > > +} > > + > > +/* Stop new private<->shared conversions */ > > +static void hv_vtom_kexec_begin(void) > > +{ > > + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) > > + return; > > + > > + /* > > + * Crash kernel reaches here with interrupts disabled: can't wait for > > + * conversions to finish. > > + * > > + * If race happened, just report and proceed. > > + */ > > + if (!set_memory_enc_stop_conversion()) > > + pr_warn("Failed to stop shared<->private conversions\n"); > > +} > > + > > +static void hv_vtom_kexec_finish(void) > > +{ > > + struct hv_gpa_range_for_visibility *input; > > + struct hv_enc_pfn_region *ent; > > + unsigned long flags; > > + u64 hv_status; > > + int cur, i; > > + > > + local_irq_save(flags); > > + input = *this_cpu_ptr(hyperv_pcpu_input_arg); > > + > > + if (unlikely(!input)) > > + goto out; > > + > > + list_for_each_entry(ent, &hv_list_enc, list) { > > + for (i = 0, cur = 0; i < ent->count; i++) { > > + input->gpa_page_list[cur] = ent->pfn + i; > > + cur++; > > + > > + if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == > > ent->count - 1) { > > + input->partition_id = HV_PARTITION_ID_SELF; > > + input->host_visibility = > > VMBUS_PAGE_NOT_VISIBLE; > > + input->reserved0 = 0; > > + input->reserved1 = 0; > > + hv_status = hv_do_rep_hypercall( > > + > > HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, > > + cur, 0, input, NULL); > > + WARN_ON_ONCE(!hv_result_success(hv_status)); > > + cur = 0; > > + } > > + } > > + > > + } > > + > > +out: > > + local_irq_restore(flags); > > +} > > + > > /* > > * hv_mark_gpa_visibility - Set pages visible to host via hvcall. > > * > > @@ -475,6 +664,7 @@ static int hv_mark_gpa_visibility(u16 count, const u64 > > pfn[], > > struct hv_gpa_range_for_visibility *input; > > u64 hv_status; > > unsigned long flags; > > + int ret; > > > > /* no-op if partition isolation is not enabled */ > > if (!hv_is_isolation_supported()) > > @@ -486,6 +676,13 @@ static int hv_mark_gpa_visibility(u16 count, const u64 > > pfn[], > > return -EINVAL; > > } > > > > + if (visibility == VMBUS_PAGE_NOT_VISIBLE) > > + ret = hv_list_enc_remove(pfn, count); > > + else > > + ret = hv_list_enc_add(pfn, count); > > + if (ret) > > + return ret; > > + > > local_irq_save(flags); > > input = *this_cpu_ptr(hyperv_pcpu_input_arg); > > > > @@ -506,8 +703,18 @@ static int hv_mark_gpa_visibility(u16 count, const u64 > > pfn[], > > > > if (hv_result_success(hv_status)) > > return 0; > > + > > + if (visibility == VMBUS_PAGE_NOT_VISIBLE) > > + ret = hv_list_enc_add(pfn, count); > > else > > - return -EFAULT; > > + ret = hv_list_enc_remove(pfn, count); > > + /* > > + * There's no good way to recover from -ENOMEM here, the accounting is > > + * wrong either way. > > + */ > > + WARN_ON_ONCE(ret); > > + > > + return -EFAULT; > > } > > > > /* > > @@ -669,6 +876,8 @@ void __init hv_vtom_init(void) > > x86_platform.guest.enc_tlb_flush_required = > > hv_vtom_tlb_flush_required; > > x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present; > > x86_platform.guest.enc_status_change_finish = > > hv_vtom_set_host_visibility; > > + x86_platform.guest.enc_kexec_begin = hv_vtom_kexec_begin; > > + x86_platform.guest.enc_kexec_finish = hv_vtom_kexec_finish; > > > > /* Set WB as the default cache mode. */ > > guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); > > -- > > 2.50.1 > >
Reviewed-by: Tianyu Lan <ti...@microsoft.com> -- Thanks Tianyu Lan