Azure CVM instance types featuring a paravisor hang upon kdump. The investigation shows that makedumpfile causes a hang when it steps on a page which was previously share with the host (HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY). The new kernel has no knowledge of these 'special' regions (which are Vmbus connection pages, GPADL buffers, ...). There are several ways to approach the issue: - Convey the knowledge about these regions to the new kernel somehow. - Unshare these regions before accessing in the new kernel (it is unclear if there's a way to query the status for a given GPA range). - Unshare these regions before jumping to the new kernel (which this patch implements).
To make the procedure as robust as possible, store PFN ranges of shared regions in a linked list instead of storing GVAs and re-using hv_vtom_set_host_visibility(). This also allows to avoid memory allocation on the kdump/kexec path. The patch skips implementing weird corner case in hv_list_enc_remove() when a PFN in the middle of a region is unshared. First, it is unlikely that such requests happen. Second, it is not a big problem if hv_list_enc_remove() doesn't actually remove some regions as this will only result in an extra hypercall doing nothing upon kexec/kdump; there's no need to be perfect. Signed-off-by: Vitaly Kuznetsov <vkuzn...@redhat.com> --- Changes since v1: - fix build on ARM [kernel test robot] --- arch/arm64/include/asm/mshyperv.h | 3 + arch/x86/hyperv/ivm.c | 153 ++++++++++++++++++++++++++++++ arch/x86/include/asm/mshyperv.h | 2 + drivers/hv/vmbus_drv.c | 2 + 4 files changed, 160 insertions(+) diff --git a/arch/arm64/include/asm/mshyperv.h b/arch/arm64/include/asm/mshyperv.h index b721d3134ab6..af11abf403b4 100644 --- a/arch/arm64/include/asm/mshyperv.h +++ b/arch/arm64/include/asm/mshyperv.h @@ -53,6 +53,9 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg) return hv_get_msr(reg); } +/* Isolated VMs are unsupported on ARM, no cleanup needed */ +static inline void hv_ivm_clear_host_access(void) {} + /* SMCCC hypercall parameters */ #define HV_SMCCC_FUNC_NUMBER 1 #define HV_FUNC_ID ARM_SMCCC_CALL_VAL( \ diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index ade6c665c97e..a6e614672836 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -462,6 +462,150 @@ void hv_ivm_msr_read(u64 msr, u64 *value) hv_ghcb_msr_read(msr, value); } +/* + * Keep track of the PFN regions which were shared with the host. The access + * must be revoked upon kexec/kdump (see hv_ivm_clear_host_access()). + */ +struct hv_enc_pfn_region { + struct list_head list; + u64 pfn; + int count; +}; + +static LIST_HEAD(hv_list_enc); +static DEFINE_RAW_SPINLOCK(hv_list_enc_lock); + +static int hv_list_enc_add(const u64 *pfn_list, int count) +{ + struct hv_enc_pfn_region *ent; + unsigned long flags; + bool found = false; + u64 pfn; + int i; + + for (i = 0; i < count; i++) { + pfn = pfn_list[i]; + + found = false; + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + list_for_each_entry(ent, &hv_list_enc, list) { + if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn)) { + /* Nothin to do - pfn is already in the list */ + found = true; + } else if (ent->pfn + ent->count == pfn) { + /* Grow existing region up */ + found = true; + ent->count++; + } else if (pfn + 1 == ent->pfn) { + /* Grow existing region down */ + found = true; + ent->pfn--; + ent->count++; + } + }; + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + + if (found) + continue; + + /* No adajacent region found -- creating a new one */ + ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL); + if (!ent) + return -ENOMEM; + + ent->pfn = pfn; + ent->count = 1; + + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + list_add(&ent->list, &hv_list_enc); + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + } + + return 0; +} + +static void hv_list_enc_remove(const u64 *pfn_list, int count) +{ + struct hv_enc_pfn_region *ent, *t; + unsigned long flags; + u64 pfn; + int i; + + for (i = 0; i < count; i++) { + pfn = pfn_list[i]; + + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + list_for_each_entry_safe(ent, t, &hv_list_enc, list) { + if (pfn == ent->pfn + count - 1) { + /* Removing tail pfn */ + ent->count--; + if (!ent->count) { + list_del(&ent->list); + kfree(ent); + } + } else if (pfn == ent->pfn) { + /* Removing head pfn */ + ent->count--; + ent->pfn++; + if (!ent->count) { + list_del(&ent->list); + kfree(ent); + } + } + + /* + * Removing PFNs in the middle of a region is not implemented; the + * list is currently only used for cleanup upon kexec and there's + * no harm done if we issue an extra unneeded hypercall making some + * region encrypted when it already is. + */ + }; + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + } +} + +void hv_ivm_clear_host_access(void) +{ + struct hv_gpa_range_for_visibility *input; + struct hv_enc_pfn_region *ent; + unsigned long flags; + u64 hv_status; + int cur, i; + + if (!hv_is_isolation_supported()) + return; + + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + if (!input) + goto unlock; + + list_for_each_entry(ent, &hv_list_enc, list) { + for (i = 0, cur = 0; i < ent->count; i++) { + input->gpa_page_list[cur] = ent->pfn + i; + cur++; + + if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == ent->count - 1) { + input->partition_id = HV_PARTITION_ID_SELF; + input->host_visibility = VMBUS_PAGE_NOT_VISIBLE; + input->reserved0 = 0; + input->reserved1 = 0; + hv_status = hv_do_rep_hypercall( + HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, + cur, 0, input, NULL); + WARN_ON_ONCE(!hv_result_success(hv_status)); + cur = 0; + } + } + + }; + +unlock: + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); +} +EXPORT_SYMBOL_GPL(hv_ivm_clear_host_access); + /* * hv_mark_gpa_visibility - Set pages visible to host via hvcall. * @@ -475,6 +619,7 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], struct hv_gpa_range_for_visibility *input; u64 hv_status; unsigned long flags; + int ret; /* no-op if partition isolation is not enabled */ if (!hv_is_isolation_supported()) @@ -486,6 +631,14 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], return -EINVAL; } + if (visibility == VMBUS_PAGE_NOT_VISIBLE) { + hv_list_enc_remove(pfn, count); + } else { + ret = hv_list_enc_add(pfn, count); + if (ret) + return ret; + } + local_irq_save(flags); input = *this_cpu_ptr(hyperv_pcpu_input_arg); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index abc4659f5809..6a988001e46f 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -263,10 +263,12 @@ static inline int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, void hv_vtom_init(void); void hv_ivm_msr_write(u64 msr, u64 value); void hv_ivm_msr_read(u64 msr, u64 *value); +void hv_ivm_clear_host_access(void); #else static inline void hv_vtom_init(void) {} static inline void hv_ivm_msr_write(u64 msr, u64 value) {} static inline void hv_ivm_msr_read(u64 msr, u64 *value) {} +static inline void hv_ivm_clear_host_access(void) {} #endif static inline bool hv_is_synic_msr(unsigned int reg) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 2ed5a1e89d69..2e891e108218 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2784,6 +2784,7 @@ static void hv_kexec_handler(void) /* Make sure conn_state is set as hv_synic_cleanup checks for it */ mb(); cpuhp_remove_state(hyperv_cpuhp_online); + hv_ivm_clear_host_access(); }; static void hv_crash_handler(struct pt_regs *regs) @@ -2799,6 +2800,7 @@ static void hv_crash_handler(struct pt_regs *regs) cpu = smp_processor_id(); hv_stimer_cleanup(cpu); hv_synic_disable_regs(cpu); + hv_ivm_clear_host_access(); }; static int hv_synic_suspend(void) -- 2.50.0