From: Roman Kisel <rom...@linux.microsoft.com> Sent: Sunday, May 11, 2025 4:08 PM >
For the Subject: line use prefix "Drivers: hv:" as that's where the changes predominate. > Confidential VMBus employs the paravisor SynIC pages to implement > the control plane of the protocol, and the data plane may use > encrypted pages. > > Implement scanning the additional pages in the control plane, > and update the logic not to decrypt ring buffer and GPADLs (GPA > descr. lists) unconditionally. This patch is really big. The handling of the GPADL and ring buffer decryption, and the associated booleans that are returned in the VMBus offer, could be in a separate preparatory patch that comes before the synic changes. As long as the booleans are always false, such a preparatory patch would leave everything still functional for normal VMs and CoCo VMs that don't have Confidential VMBus. > > Signed-off-by: Roman Kisel <rom...@linux.microsoft.com> > --- > arch/x86/kernel/cpu/mshyperv.c | 23 +- > drivers/hv/channel.c | 36 +-- > drivers/hv/channel_mgmt.c | 29 +- > drivers/hv/connection.c | 10 +- > drivers/hv/hv.c | 485 ++++++++++++++++++++++++--------- > drivers/hv/hyperv_vmbus.h | 9 +- > drivers/hv/ring_buffer.c | 5 +- > drivers/hv/vmbus_drv.c | 140 +++++----- > 8 files changed, 518 insertions(+), 219 deletions(-) > > diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c > index 4f6e3d02f730..4163bc24269e 100644 > --- a/arch/x86/kernel/cpu/mshyperv.c > +++ b/arch/x86/kernel/cpu/mshyperv.c > @@ -28,6 +28,7 @@ > #include <asm/apic.h> > #include <asm/timer.h> > #include <asm/reboot.h> > +#include <asm/msr.h> > #include <asm/nmi.h> > #include <clocksource/hyperv_timer.h> > #include <asm/numa.h> > @@ -77,14 +78,28 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); > > void hv_set_non_nested_msr(unsigned int reg, u64 value) > { > + if (reg == HV_X64_MSR_EOM && vmbus_is_confidential()) { > + /* Reach out to the paravisor. 
*/ > + native_wrmsrl(reg, value); > + return; > + } > + > if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) { > + /* The hypervisor will get the intercept. */ > hv_ivm_msr_write(reg, value); > > - /* Write proxy bit via wrmsl instruction */ > - if (hv_is_sint_msr(reg)) > - wrmsrl(reg, value | 1 << 20); > + if (hv_is_sint_msr(reg)) { > + /* > + * Write proxy bit in the case of non-confidential > VMBus control plane. See some later comments, but I'd suggest dropping the "control plane" concept and just say "non-confidential VMBus". > + * Using wrmsl instruction so the following goes to the > paravisor. > + */ > + u32 proxy = 1 & !vmbus_is_confidential(); > + > + value |= (proxy << 20); > + native_wrmsrl(reg, value); > + } > } else { > - wrmsrl(reg, value); > + native_wrmsrl(reg, value); > } > } > EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); > diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c > index fb8cd8469328..ef540b72f6ea 100644 > --- a/drivers/hv/channel.c > +++ b/drivers/hv/channel.c > @@ -443,20 +443,23 @@ static int __vmbus_establish_gpadl(struct vmbus_channel > *channel, > return ret; > } > > - /* > - * Set the "decrypted" flag to true for the set_memory_decrypted() > - * success case. In the failure case, the encryption state of the > - * memory is unknown. Leave "decrypted" as true to ensure the > - * memory will be leaked instead of going back on the free list. > - */ > - gpadl->decrypted = true; > - ret = set_memory_decrypted((unsigned long)kbuffer, > - PFN_UP(size)); > - if (ret) { > - dev_warn(&channel->device_obj->device, > - "Failed to set host visibility for new GPADL %d.\n", > - ret); > - return ret; > + if ((!channel->confidential_external_memory && type == HV_GPADL_BUFFER) > || > + (!channel->confidential_ring_buffer && type == HV_GPADL_RING)) { > + /* > + * Set the "decrypted" flag to true for the > set_memory_decrypted() > + * success case. In the failure case, the encryption state of > the > + * memory is unknown. 
Leave "decrypted" as true to ensure the > + * memory will be leaked instead of going back on the free list. > + */ > + gpadl->decrypted = true; > + ret = set_memory_decrypted((unsigned long)kbuffer, > + PFN_UP(size)); > + if (ret) { > + dev_warn(&channel->device_obj->device, > + "Failed to set host visibility for new GPADL > %d.\n", > + ret); > + return ret; > + } Some problems here. First, gpadl->decrypted is left uninitialized if set_memory_decrypted() is skipped because we have confidential external memory or ring buffer. Second, at the end of this function, if there's an error (i.e., ret != 0), set_memory_encrypted() is called even if the memory was never decrypted. In a CoCo VM, we must not call set_memory_encrypted() on memory that is already encrypted. Third, vmbus_teardown_gpadl() has the same problem -- it assumes that __vmbus_establish_gpadl() always decrypts the memory, so it always calls set_memory_encrypted(). > } > > init_completion(&msginfo->waitevent); > @@ -676,12 +679,13 @@ static int __vmbus_open(struct vmbus_channel > *newchannel, > goto error_clean_ring; > > err = hv_ringbuffer_init(&newchannel->outbound, > - page, send_pages, 0); > + page, send_pages, 0, > newchannel->confidential_ring_buffer); > if (err) > goto error_free_gpadl; > > err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages], > - recv_pages, newchannel->max_pkt_size); > + recv_pages, newchannel->max_pkt_size, > + newchannel->confidential_ring_buffer); > if (err) > goto error_free_gpadl; > > diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c > index 6e084c207414..39c8b80d967f 100644 > --- a/drivers/hv/channel_mgmt.c > +++ b/drivers/hv/channel_mgmt.c > @@ -843,14 +843,14 @@ static void vmbus_wait_for_unload(void) > = per_cpu_ptr(hv_context.cpu_context, cpu); > > /* > - * In a CoCo VM the synic_message_page is not allocated > + * In a CoCo VM the hv_synic_message_page is not > allocated > * in hv_synic_alloc(). 
Instead it is set/cleared in > * hv_synic_enable_regs() and hv_synic_disable_regs() > * such that it is set only when the CPU is online. If > * not all present CPUs are online, the message page > * might be NULL, so skip such CPUs. > */ > - page_addr = hv_cpu->synic_message_page; > + page_addr = hv_cpu->hv_synic_message_page; > if (!page_addr) > continue; > > @@ -891,7 +891,7 @@ static void vmbus_wait_for_unload(void) > struct hv_per_cpu_context *hv_cpu > = per_cpu_ptr(hv_context.cpu_context, cpu); > > - page_addr = hv_cpu->synic_message_page; > + page_addr = hv_cpu->hv_synic_message_page; > if (!page_addr) > continue; > > @@ -1021,6 +1021,7 @@ static void vmbus_onoffer(struct > vmbus_channel_message_header *hdr) > struct vmbus_channel_offer_channel *offer; > struct vmbus_channel *oldchannel, *newchannel; > size_t offer_sz; > + bool confidential_ring_buffer, confidential_external_memory; > > offer = (struct vmbus_channel_offer_channel *)hdr; > > @@ -1033,6 +1034,18 @@ static void vmbus_onoffer(struct > vmbus_channel_message_header *hdr) > return; > } > > + confidential_ring_buffer = is_confidential_ring_buffer(offer); > + if (confidential_ring_buffer) { > + if (vmbus_proto_version < VERSION_WIN_COPPER || > !vmbus_is_confidential()) > + return; Like the earlier code in this function that tests vmbus_is_valid_offer(), you must decrement vmbus_connection.offer_in_progress before the failure case returns. Otherwise, a rescind operation could hang forever waiting for all offers to complete. > + } > + > + confidential_external_memory = is_confidential_external_memory(offer); > + if (is_confidential_external_memory(offer)) { > + if (vmbus_proto_version < VERSION_WIN_COPPER || > !vmbus_is_confidential()) > + return; Same here. 
> + } > + > oldchannel = find_primary_channel_by_offer(offer); > > if (oldchannel != NULL) { > @@ -1069,6 +1082,14 @@ static void vmbus_onoffer(struct > vmbus_channel_message_header *hdr) > > atomic_dec(&vmbus_connection.offer_in_progress); > > + if ((oldchannel->confidential_ring_buffer && > !confidential_ring_buffer) || > + (oldchannel->confidential_external_memory && > + !confidential_external_memory)) { > + pr_err_ratelimited("Offer %d changes the confidential > state\n", > + offer->child_relid); Not sure why this needs to be ratelimited. Typically there are only a couple dozen offers at most. Also, I don't think hibernation in a CoCo VM is supported in the first place, so maybe we should never be here if vmbus_is_confidential(). > + return; Must release the channel mutex before returning in this error case. > + } > + > WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID); > /* Fix up the relid. */ > oldchannel->offermsg.child_relid = offer->child_relid; > @@ -1111,6 +1132,8 @@ static void vmbus_onoffer(struct > vmbus_channel_message_header *hdr) > pr_err("Unable to allocate channel object\n"); > return; > } > + newchannel->confidential_ring_buffer = confidential_ring_buffer; > + newchannel->confidential_external_memory = confidential_external_memory; > > vmbus_setup_channel_state(newchannel, offer); > > diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c > index 8351360bba16..268b7d58b45b 100644 > --- a/drivers/hv/connection.c > +++ b/drivers/hv/connection.c > @@ -51,7 +51,8 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version); > * Linux guests and are not listed. > */ > static __u32 vmbus_versions[] = { > - VERSION_WIN10_V5_3, > + VERSION_WIN_COPPER, > + VERSION_WIN_IRON, > VERSION_WIN10_V5_2, > VERSION_WIN10_V5_1, > VERSION_WIN10_V5, > @@ -65,7 +66,7 @@ static __u32 vmbus_versions[] = { > * Maximal VMBus protocol version guests can negotiate. Useful to cap the > * VMBus version for testing and debugging purpose. 
> */ > -static uint max_version = VERSION_WIN10_V5_3; > +static uint max_version = VERSION_WIN_COPPER; > > module_param(max_version, uint, S_IRUGO); > MODULE_PARM_DESC(max_version, > @@ -105,6 +106,11 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo > *msginfo, u32 version) > vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID; > } > > + if (vmbus_is_confidential() && version >= VERSION_WIN_COPPER) > + msg->feature_flags = VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS; > + else > + msg->feature_flags = 0; > + msg has already been zero'ed, so the above "else" clause isn't needed. > /* > * shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always > * bitwise OR it > diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c > index 308c8f279df8..94be5b3f9e70 100644 > --- a/drivers/hv/hv.c > +++ b/drivers/hv/hv.c > @@ -74,7 +74,7 @@ int hv_post_message(union hv_connection_id connection_id, > aligned_msg->payload_size = payload_size; > memcpy((void *)aligned_msg->payload, payload, payload_size); > > - if (ms_hyperv.paravisor_present) { > + if (ms_hyperv.paravisor_present && !vmbus_is_confidential()) { > if (hv_isolation_type_tdx()) > status = hv_tdx_hypercall(HVCALL_POST_MESSAGE, > virt_to_phys(aligned_msg), 0); > @@ -94,11 +94,135 @@ int hv_post_message(union hv_connection_id connection_id, > return hv_result(status); > } > > +enum hv_page_encryption_action { > + HV_PAGE_ENC_DEAFULT, > + HV_PAGE_ENC_ENCRYPT, > + HV_PAGE_ENC_DECRYPT > +}; > + > +static int hv_alloc_page(unsigned int cpu, void **page, enum > hv_page_encryption_action enc_action, > + const char *note) > +{ > + int ret = 0; > + > + pr_debug("allocating %s\n", note); > + > + /* > + * After the page changes its encryption status, its contents will > + * appear scrambled. Thus `get_zeroed_page` would zero the page out > + * in vain, we do that ourselves exactly one time. FWIW, the statement about "scrambled" is true for SEV-SNP, but it's not true for TDX. TDX leaves the page all zero'ed. 
And it seems like I remember something about perhaps a future version of SEV-SNP would similarly do the zero'ing, but I don't know any details. But in any case, doing the zero'ing explicitly is the correct approach, at least for the moment. It's just the comment that isn't precisely correct. > + * > + * The function might be called from contexts where sleeping is very > + * bad (like hotplug callbacks) or not possible (interrupt handling), > + * Thus requesting `GFP_ATOMIC`. It looks to me like the existing code's use of GFP_ATOMIC is bogus. hv_synic_alloc() is only called from vmbus_bus_init(), and there are no restrictions there on sleeping. set_memory_decrypted() could sleep because it gets a mutex lock in the mm code. I'd delete this comment and use GFP_KERNEL. There's already an allocation a few lines up in hv_synic_alloc() that uses GFP_KERNEL. > + * > + * The page order is 0 as we need 1 page and log_2 (1) = 0. > + */ > + *page = (void *)__get_free_pages(GFP_ATOMIC, 0); Just use __get_free_page() [singular] and you can avoid the discussion of where the "0" second argument comes from. > + if (!*page) > + return -ENOMEM; > + > + pr_debug("allocated %s\n", note); > + > + switch (enc_action) { > + case HV_PAGE_ENC_ENCRYPT: > + ret = set_memory_encrypted((unsigned long)*page, 1); > + break; > + case HV_PAGE_ENC_DECRYPT: > + ret = set_memory_decrypted((unsigned long)*page, 1); > + break; > + case HV_PAGE_ENC_DEAFULT: > + break; > + default: > + pr_warn("unknown page encryption action %d for %s\n", > enc_action, note); > + break; > + } This seems a bit over-engineered to me. There are no cases where this function is called with HV_PAGE_ENC_ENCRYPT, and I can't see that ever being useful in the future. Conceptually, it's wrong to be encrypting a newly allocated page. That leaves HV_PAGE_ENC_DECRYPT and HV_PAGE_ENC_DEFAULT, which can more straightforwardly be handled as a boolean parameter specifying whether to decrypt the page. 
The 13 lines of switch statement then become just: if (decrypt) ret = set_memory_decrypted((unsigned long)*page, 1); And enum hv_page_encryption_action is no longer needed. > + > + if (ret) > + goto failed; > + > + memset(*page, 0, PAGE_SIZE); > + return 0; > + > +failed: > + > + pr_err("page encryption action %d failed for %s, error %d when > allocating the page\n", > + enc_action, note, ret); > + free_page((unsigned long)*page); > + *page = NULL; > + return ret; > +} > + > +static int hv_free_page(void **page, enum hv_page_encryption_action > enc_action, > + const char *note) > +{ > + int ret = 0; > + > + pr_debug("freeing %s\n", note); > + > + if (!page) > + return 0; This test seems unnecessary for a static function that can only be called within this module, where it's easy to ensure that a valid pointer is always passed. > + if (!*page) > + return 0; > + > + switch (enc_action) { > + case HV_PAGE_ENC_ENCRYPT: > + ret = set_memory_encrypted((unsigned long)*page, 1); > + break; > + case HV_PAGE_ENC_DECRYPT: > + ret = set_memory_decrypted((unsigned long)*page, 1); > + break; > + case HV_PAGE_ENC_DEAFULT: > + break; > + default: > + pr_warn("unknown page encryption action %d for %s page\n", > + enc_action, note); > + break; > + } Same here about using a boolean parameter to specify whether to encrypt. There will never be a case where you want to decrypt before free'ing. > + > + /* > + * In the case of the action failure, the page is leaked. > + * Something is wrong, prefer to lose the page and stay afloat. 
> + */ > + if (ret) { > + pr_err("page encryption action %d failed for %s, error %d when > freeing\n", > + enc_action, note, ret); > + } else { > + pr_debug("freed %s\n", note); > + free_page((unsigned long)*page); > + } > + > + *page = NULL; > + > + return ret; > +} > + > +static bool hv_should_allocate_post_msg_page(void) > +{ > + return ms_hyperv.paravisor_present && hv_isolation_type_tdx(); > +} > + > +static bool hv_should_allocate_synic_pages(void) > +{ > + return !ms_hyperv.paravisor_present && !hv_root_partition(); > +} For the above two, rather than creating these helper functions, what about creating a boolean in struct hv_context for each of these? Then hv_synic_alloc() and hv_synic_free() can directly reference the boolean instead of having to declare local variables that are initialized from the above functions. The new booleans in hv_context would be set in ms_hyperv_init_platform(), and they don't change during the life of the VM. To me, this approach would eliminate some code overhead that doesn't really add any value. > + > +static bool hv_should_allocate_pv_synic_pages(void) > +{ > + return vmbus_is_confidential(); > +} This one *might* have some value if the criteria for creating the paravisor synic pages could change in the future. But I'd recommend taking the simpler way for now, and just directly call vmbus_is_confidential() in hv_synic_alloc() and hv_synic_free(). Don't even bother with creating a local variable to contain the result since it is only used once in each function. > + > int hv_synic_alloc(void) > { > int cpu, ret = -ENOMEM; > struct hv_per_cpu_context *hv_cpu; > > + const bool allocate_post_msg_page = hv_should_allocate_post_msg_page(); > + const bool allocate_synic_pages = hv_should_allocate_synic_pages(); > + const bool allocate_pv_synic_pages = > hv_should_allocate_pv_synic_pages(); > + const enum hv_page_encryption_action enc_action = > + (!vmbus_is_confidential()) ? 
HV_PAGE_ENC_DECRYPT : > HV_PAGE_ENC_DEAFULT; > + > /* > * First, zero all per-cpu memory areas so hv_synic_free() can > * detect what memory has been allocated and cleanup properly > @@ -122,74 +246,38 @@ int hv_synic_alloc(void) > tasklet_init(&hv_cpu->msg_dpc, > vmbus_on_msg_dpc, (unsigned long)hv_cpu); > > - if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { > - hv_cpu->post_msg_page = (void > *)get_zeroed_page(GFP_ATOMIC); > - if (!hv_cpu->post_msg_page) { > - pr_err("Unable to allocate post msg page\n"); > + if (allocate_post_msg_page) { > + ret = hv_alloc_page(cpu, &hv_cpu->post_msg_page, > + enc_action, "post msg page"); > + if (ret) > goto err; > - } > - > - ret = set_memory_decrypted((unsigned > long)hv_cpu->post_msg_page, 1); > - if (ret) { > - pr_err("Failed to decrypt post msg page: %d\n", > ret); > - /* Just leak the page, as it's unsafe to free > the page. */ > - hv_cpu->post_msg_page = NULL; > - goto err; > - } > - > - memset(hv_cpu->post_msg_page, 0, PAGE_SIZE); > } > > /* > - * Synic message and event pages are allocated by paravisor. > - * Skip these pages allocation here. > + * If these SynIC pages are not allocated, SIEF and SIM pages > + * are configured using what the root partition or the paravisor > + * provides upon reading the SIEFP and SIMP registers. 
> */ > - if (!ms_hyperv.paravisor_present && !hv_root_partition()) { > - hv_cpu->synic_message_page = > - (void *)get_zeroed_page(GFP_ATOMIC); > - if (!hv_cpu->synic_message_page) { > - pr_err("Unable to allocate SYNIC message > page\n"); > + if (allocate_synic_pages) { > + ret = hv_alloc_page(cpu, &hv_cpu->hv_synic_message_page, > + enc_action, "SynIC msg page"); > + if (ret) > goto err; > - } > - > - hv_cpu->synic_event_page = > - (void *)get_zeroed_page(GFP_ATOMIC); > - if (!hv_cpu->synic_event_page) { > - pr_err("Unable to allocate SYNIC event page\n"); > - > - free_page((unsigned > long)hv_cpu->synic_message_page); > - hv_cpu->synic_message_page = NULL; > + ret = hv_alloc_page(cpu, &hv_cpu->hv_synic_event_page, > + enc_action, "SynIC event page"); > + if (ret) > goto err; > - } > } > > - if (!ms_hyperv.paravisor_present && > - (hv_isolation_type_snp() || hv_isolation_type_tdx())) { > - ret = set_memory_decrypted((unsigned long) > - hv_cpu->synic_message_page, 1); > - if (ret) { > - pr_err("Failed to decrypt SYNIC msg page: > %d\n", ret); > - hv_cpu->synic_message_page = NULL; > - > - /* > - * Free the event page here so that > hv_synic_free() > - * won't later try to re-encrypt it. 
> - */ > - free_page((unsigned > long)hv_cpu->synic_event_page); > - hv_cpu->synic_event_page = NULL; > + if (allocate_pv_synic_pages) { > + ret = hv_alloc_page(cpu, &hv_cpu->pv_synic_message_page, > + HV_PAGE_ENC_DEAFULT, "pv SynIC msg page"); > + if (ret) > goto err; > - } > - > - ret = set_memory_decrypted((unsigned long) > - hv_cpu->synic_event_page, 1); > - if (ret) { > - pr_err("Failed to decrypt SYNIC event page: > %d\n", ret); > - hv_cpu->synic_event_page = NULL; > + ret = hv_alloc_page(cpu, &hv_cpu->pv_synic_event_page, > + HV_PAGE_ENC_DEAFULT, "pv SynIC event page"); > + if (ret) > goto err; > - } > - > - memset(hv_cpu->synic_message_page, 0, PAGE_SIZE); > - memset(hv_cpu->synic_event_page, 0, PAGE_SIZE); > } > } > > @@ -205,55 +293,38 @@ int hv_synic_alloc(void) > > void hv_synic_free(void) > { > - int cpu, ret; > + int cpu; > + > + const bool free_post_msg_page = hv_should_allocate_post_msg_page(); > + const bool free_synic_pages = hv_should_allocate_synic_pages(); > + const bool free_pv_synic_pages = hv_should_allocate_pv_synic_pages(); > > for_each_present_cpu(cpu) { > struct hv_per_cpu_context *hv_cpu = > per_cpu_ptr(hv_context.cpu_context, cpu); > > - /* It's better to leak the page if the encryption fails. */ > - if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { > - if (hv_cpu->post_msg_page) { > - ret = set_memory_encrypted((unsigned long) > - hv_cpu->post_msg_page, 1); > - if (ret) { > - pr_err("Failed to encrypt post msg > page: %d\n", ret); > - hv_cpu->post_msg_page = NULL; > - } > - } > + if (free_post_msg_page) > + hv_free_page(&hv_cpu->post_msg_page, > + HV_PAGE_ENC_ENCRYPT, "post msg page"); > + if (free_synic_pages) { > + hv_free_page(&hv_cpu->hv_synic_event_page, > + HV_PAGE_ENC_ENCRYPT, "SynIC event page"); > + hv_free_page(&hv_cpu->hv_synic_message_page, > + HV_PAGE_ENC_ENCRYPT, "SynIC msg page"); Always re-encrypting the page is wrong. If VMBus is confidential, the pages will have remained encrypted in hv_synic_alloc(). 
Trying to encrypt them again will produce errors. > } > - > - if (!ms_hyperv.paravisor_present && > - (hv_isolation_type_snp() || hv_isolation_type_tdx())) { > - if (hv_cpu->synic_message_page) { > - ret = set_memory_encrypted((unsigned long) > - hv_cpu->synic_message_page, 1); > - if (ret) { > - pr_err("Failed to encrypt SYNIC msg > page: %d\n", ret); > - hv_cpu->synic_message_page = NULL; > - } > - } > - > - if (hv_cpu->synic_event_page) { > - ret = set_memory_encrypted((unsigned long) > - hv_cpu->synic_event_page, 1); > - if (ret) { > - pr_err("Failed to encrypt SYNIC event > page: %d\n", ret); > - hv_cpu->synic_event_page = NULL; > - } > - } > + if (free_pv_synic_pages) { > + hv_free_page(&hv_cpu->pv_synic_event_page, > + HV_PAGE_ENC_DEAFULT, "pv SynIC event page"); > + hv_free_page(&hv_cpu->pv_synic_message_page, > + HV_PAGE_ENC_DEAFULT, "pv SynIC msg page"); > } > - > - free_page((unsigned long)hv_cpu->post_msg_page); > - free_page((unsigned long)hv_cpu->synic_event_page); > - free_page((unsigned long)hv_cpu->synic_message_page); > } > > kfree(hv_context.hv_numa_map); > } > > /* > - * hv_synic_init - Initialize the Synthetic Interrupt Controller. > + * hv_synic_enable_regs - Initialize the Synthetic Interrupt Controller. > * > * If it is already initialized by another entity (ie x2v shim), we need to > * retrieve the initialized message and event pages. Otherwise, we create > and This comment about the x2v shim is ancient and long since incorrect. It's in the existing code, but should be removed. > @@ -266,7 +337,6 @@ void hv_synic_enable_regs(unsigned int cpu) > union hv_synic_simp simp; > union hv_synic_siefp siefp; > union hv_synic_sint shared_sint; > - union hv_synic_scontrol sctrl; > > /* Setup the Synic's message page */ > simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); > @@ -276,18 +346,18 @@ void hv_synic_enable_regs(unsigned int cpu) > /* Mask out vTOM bit. 
ioremap_cache() maps decrypted */ > u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & > ~ms_hyperv.shared_gpa_boundary; > - hv_cpu->synic_message_page = > - (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); > - if (!hv_cpu->synic_message_page) > + hv_cpu->hv_synic_message_page Renaming "synic_message_page" to "hv_synic_message_page" (along with the renaming of synic_event_page) could be a separate preparatory patch. Together I see about 40 references that need to be changed. Doing them in a separate patch is less clutter in the patch with the main synic changes. I also have a quibble with naming it hv_synic_message_page. The "hv" prefix means "Hyper-V", and I think what you mean here is the hypervisor as opposed to the paravisor. What about naming it with prefix "hyp", and using "pvr" for the paravisor version as I suggested in a comment on an earlier patch in this series? > + = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); Why move the equals sign to the next line? > + if (!hv_cpu->hv_synic_message_page) > pr_err("Fail to map synic message page.\n"); > } else { > - simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page) > + simp.base_simp_gpa = virt_to_phys(hv_cpu->hv_synic_message_page) > >> HV_HYP_PAGE_SHIFT; > } > > hv_set_msr(HV_MSR_SIMP, simp.as_uint64); > > - /* Setup the Synic's event page */ > + /* Setup the Synic's event page with the hypervisor. */ > siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); > siefp.siefp_enabled = 1; > > @@ -295,12 +365,12 @@ void hv_synic_enable_regs(unsigned int cpu) > /* Mask out vTOM bit. ioremap_cache() maps decrypted */ > u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & > ~ms_hyperv.shared_gpa_boundary; > - hv_cpu->synic_event_page = > - (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); > - if (!hv_cpu->synic_event_page) > + hv_cpu->hv_synic_event_page > + = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); Why move the equals sign to the next line?
> + if (!hv_cpu->hv_synic_event_page) > pr_err("Fail to map synic event page.\n"); > } else { > - siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page) > + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->hv_synic_event_page) > >> HV_HYP_PAGE_SHIFT; > } > > @@ -313,8 +383,24 @@ void hv_synic_enable_regs(unsigned int cpu) > > shared_sint.vector = vmbus_interrupt; > shared_sint.masked = false; > - shared_sint.auto_eoi = hv_recommend_using_aeoi(); > - hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); > + > + /* > + * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64), > + * it doesn't provide a recommendation flag and AEOI must be disabled. > + */ > +#ifdef HV_DEPRECATING_AEOI_RECOMMENDED > + shared_sint.auto_eoi = > + !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED); > +#else > + shared_sint.auto_eoi = 0; > +#endif Why not use the helper function hv_recommend_using_aeoi()? I think it was added relatively recently, so maybe your code started out before it existed. > + hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, > + shared_sint.as_uint64); Why is the above statement now on two lines? In fact, it looks like this entire little section of changes is spurious. > +} > + > +static void hv_synic_enable_interrupts(void) > +{ > + union hv_synic_scontrol sctrl; > > /* Enable the global synic bit */ > sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); > @@ -323,13 +409,78 @@ void hv_synic_enable_regs(unsigned int cpu) > hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); > } > > +/* > + * The paravisor might not support proxying SynIC, and this > + * function may fail. > + */ > +static int hv_pv_synic_enable_regs(unsigned int cpu) > +{ > + union hv_synic_simp simp; > + union hv_synic_siefp siefp; > + This blank line seems spurious. Don't usually see blank lines in local variable declaration lists. 
> + int err; > + struct hv_per_cpu_context *hv_cpu > + = per_cpu_ptr(hv_context.cpu_context, cpu); > + > + /* Setup the Synic's message page with the paravisor. */ > + simp.as_uint64 = hv_pv_get_synic_register(HV_MSR_SIMP, &err); > + if (err) > + return err; > + simp.simp_enabled = 1; > + simp.base_simp_gpa = virt_to_phys(hv_cpu->pv_synic_message_page) > + >> HV_HYP_PAGE_SHIFT; > + err = hv_pv_set_synic_register(HV_MSR_SIMP, simp.as_uint64); > + if (err) > + return err; > + > + /* Setup the Synic's event page with the paravisor. */ > + siefp.as_uint64 = hv_pv_get_synic_register(HV_MSR_SIEFP, &err); > + if (err) If setting up the simp succeeds, but then accessing the siefp fails, does the simp need to be cleared? I don't know the implications of abandoning a partial setup. > + return err; > + siefp.siefp_enabled = 1; > + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->pv_synic_event_page) > + >> HV_HYP_PAGE_SHIFT; > + return hv_pv_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); Same question here about a failure, and abandoning a partial setup. > +} > + > +static int hv_pv_synic_enable_interrupts(void) > +{ > + union hv_synic_scontrol sctrl; > + int err; > + > + /* Enable the global synic bit */ > + sctrl.as_uint64 = hv_pv_get_synic_register(HV_MSR_SCONTROL, &err); > + if (err) > + return err; > + sctrl.enable = 1; > + > + return hv_pv_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); > +} > + > int hv_synic_init(unsigned int cpu) > { > + int err = 0; > + > + /* > + * The paravisor may not support the confidential VMBus, > + * check on that first. > + */ > + if (vmbus_is_confidential()) > + err = hv_pv_synic_enable_regs(cpu); > + if (err) > + return err; I would expect to see the test for "err" to be under the test for vmbus_is_confidential(). 
> + > hv_synic_enable_regs(cpu); > + if (!vmbus_is_confidential()) > + hv_synic_enable_interrupts(); > + else > + err = hv_pv_synic_enable_interrupts(); Flip the order of the "if" and "else" clauses so the negation on vmbus_is_confidential() can be removed? It's one less piece of logic to cognitively process when reading the code .... I'm still trying to figure out how things work with confidential VMBus since there are two synics to deal with. When there are two, it appears from this code that the guest takes interrupts only from the paravisor synic? > + if (err) > + return err; Again, group the test for "err" with the call to hv_pv_synic_enable_interrupts(). > > hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT); > > - return 0; > + return err; In existing code, hv_synic_init() doesn't fail. Given the new failure modes, should an error message be output so that a failure is noted? And does anything need to be undone if enable_regs() succeeds but enable_interrupts() fails? > } > > void hv_synic_disable_regs(unsigned int cpu) > @@ -339,7 +490,6 @@ void hv_synic_disable_regs(unsigned int cpu) > union hv_synic_sint shared_sint; > union hv_synic_simp simp; > union hv_synic_siefp siefp; > - union hv_synic_scontrol sctrl; > > shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT); > > @@ -358,8 +508,8 @@ void hv_synic_disable_regs(unsigned int cpu) > */ > simp.simp_enabled = 0; > if (ms_hyperv.paravisor_present || hv_root_partition()) { > - iounmap(hv_cpu->synic_message_page); > - hv_cpu->synic_message_page = NULL; > + memunmap(hv_cpu->hv_synic_message_page); > + hv_cpu->hv_synic_message_page = NULL; > } else { > simp.base_simp_gpa = 0; > } > @@ -370,43 +520,97 @@ void hv_synic_disable_regs(unsigned int cpu) > siefp.siefp_enabled = 0; > > if (ms_hyperv.paravisor_present || hv_root_partition()) { > - iounmap(hv_cpu->synic_event_page); > - hv_cpu->synic_event_page = NULL; > + memunmap(hv_cpu->hv_synic_event_page); > + hv_cpu->hv_synic_event_page = NULL; > } 
else { > siefp.base_siefp_gpa = 0; > } > > hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); > +} > + > +static void hv_synic_disable_interrupts(void) > +{ > + union hv_synic_scontrol sctrl; > > /* Disable the global synic bit */ > sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); > sctrl.enable = 0; > hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); > +} > > +static void hv_vmbus_disable_percpu_interrupts(void) > +{ > if (vmbus_irq != -1) > disable_percpu_irq(vmbus_irq); > } Is this separate function needed? It's two lines of code that are only called in one place. > > +static void hv_pv_synic_disable_regs(unsigned int cpu) > +{ > + /* > + * The get/set register errors are deliberatley ignored in > + * the cleanup path as they are non-consequential here. > + */ I don't understand this comment. Errors are checked for, and the function exits, just like in hv_pv_synic_enable_regs(). Of course, the caller never finds out about the errors. > + int err; > + union hv_synic_simp simp; > + union hv_synic_siefp siefp; > + > + struct hv_per_cpu_context *hv_cpu > + = per_cpu_ptr(hv_context.cpu_context, cpu); For the hypervisor synic, hv_synic_disable_regs() masks the VMBUS_MESSAGE_SINT before changing the simp and siefp. Doesn't the same need to be done for the paravisor synic? > + > + /* Disable SynIC's message page in the paravisor. */ > + simp.as_uint64 = hv_pv_get_synic_register(HV_MSR_SIMP, &err); > + if (err) > + return; > + simp.simp_enabled = 0; > + > + memunmap(hv_cpu->pv_synic_message_page); > + hv_cpu->pv_synic_message_page = NULL; This code seems bogus. The pv_synic_mesage_page was allocated, not memmap()'ed. And setting it to NULL here prevents deallocation in hv_synic_free(). > + > + err = hv_pv_set_synic_register(HV_MSR_SIMP, simp.as_uint64); > + if (err) > + return; > + > + /* Disable SynIC's event page in the paravisor. 
*/ > + siefp.as_uint64 = hv_pv_get_synic_register(HV_MSR_SIEFP, &err); > + if (err) > + return; > + siefp.siefp_enabled = 0; > + > + memunmap(hv_cpu->pv_synic_event_page); > + hv_cpu->pv_synic_event_page = NULL; Same bogus code for the pv_synic_event_page? > + > + hv_pv_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); > +} > + > +static void hv_pv_synic_disable_interrupts(void) > +{ > + union hv_synic_scontrol sctrl; > + int err; > + > + /* Disable the global synic bit */ > + sctrl.as_uint64 = hv_pv_get_synic_register(HV_MSR_SCONTROL, &err); > + if (err) > + return; > + sctrl.enable = 0; > + hv_pv_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); > +} > + > #define HV_MAX_TRIES 3 > -/* > - * Scan the event flags page of 'this' CPU looking for any bit that is set. > If we find one > - * bit set, then wait for a few milliseconds. Repeat these steps for a > maximum of 3 times. > - * Return 'true', if there is still any set bit after this operation; > 'false', otherwise. > - * > - * If a bit is set, that means there is a pending channel interrupt. The > expectation is > - * that the normal interrupt handling mechanism will find and process the > channel interrupt > - * "very soon", and in the process clear the bit. > - */ > -static bool hv_synic_event_pending(void) > + > +static bool hv_synic_event_pending_for(union hv_synic_event_flags *event, > int sint) The usual naming pattern for an internal implementation version of a function is to prepend a double-underscore; i.e., __hv_synic_event_pending(). 
> { > - struct hv_per_cpu_context *hv_cpu = > this_cpu_ptr(hv_context.cpu_context); > - union hv_synic_event_flags *event = > - (union hv_synic_event_flags *)hv_cpu->synic_event_page + > VMBUS_MESSAGE_SINT; > - unsigned long *recv_int_page = event->flags; /* assumes VMBus version > >= VERSION_WIN8 */ > + unsigned long *recv_int_page; > bool pending; > u32 relid; > - int tries = 0; > + int tries; > + > + if (!event) > + return false; > > + tries = 0; Any reason this can't be done when "tries" is declared, like the existing code? Seems like an unnecessary change. > + event += sint; > + recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 > */ > retry: > pending = false; > for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) { > @@ -460,6 +664,26 @@ static int hv_pick_new_cpu(struct vmbus_channel *channel) > /* > * hv_synic_cleanup - Cleanup routine for hv_synic_init(). > */ Any reason to place this function *after* hv_pick_new_cpu()? Seems like an unnecessary change in the order of the two functions. And anyway, this one would be better directly following __hv_synic_event_pending(). > +/* > + * Scan the event flags page of 'this' CPU looking for any bit that is set. > If we find one > + * bit set, then wait for a few milliseconds. Repeat these steps for a > maximum of 3 times. > + * Return 'true', if there is still any set bit after this operation; > 'false', otherwise. > + * > + * If a bit is set, that means there is a pending channel interrupt. The > expectation is > + * that the normal interrupt handling mechanism will find and process the > channel interrupt > + * "very soon", and in the process clear the bit. 
> + */ > +static bool hv_synic_event_pending(void) > +{ > + struct hv_per_cpu_context *hv_cpu = > this_cpu_ptr(hv_context.cpu_context); > + union hv_synic_event_flags *hv_synic_event_page = > hv_cpu->hv_synic_event_page; > + union hv_synic_event_flags *pv_synic_event_page = > hv_cpu->pv_synic_event_page; > + > + return > + hv_synic_event_pending_for(hv_synic_event_page, > VMBUS_MESSAGE_SINT) || > + hv_synic_event_pending_for(pv_synic_event_page, > VMBUS_MESSAGE_SINT); > +} > + > int hv_synic_cleanup(unsigned int cpu) > { > struct vmbus_channel *channel, *sc; > @@ -516,6 +740,13 @@ int hv_synic_cleanup(unsigned int cpu) > hv_stimer_legacy_cleanup(cpu); > > hv_synic_disable_regs(cpu); > + if (vmbus_is_confidential()) > + hv_pv_synic_disable_regs(cpu); > + if (!vmbus_is_confidential()) > + hv_synic_disable_interrupts(); > + else > + hv_pv_synic_disable_interrupts(); How about this so there's only one test of vmbus_is_confidential(): if (vmbus_is_confidential()) { hv_pv_synic_disable_regs(cpu); hv_pv_synic_disable_interrupts(); } else { hv_synic_disable_interrupts(); } > + hv_vmbus_disable_percpu_interrupts(); > > return ret; > } > diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h > index 29780f3a7478..9337e0afa3ce 100644 > --- a/drivers/hv/hyperv_vmbus.h > +++ b/drivers/hv/hyperv_vmbus.h > @@ -120,8 +120,10 @@ enum { > * Per cpu state for channel handling > */ > struct hv_per_cpu_context { > - void *synic_message_page; > - void *synic_event_page; > + void *hv_synic_message_page; > + void *hv_synic_event_page; See comment above about doing this renaming in a separate patch. Also, I don't think you tried compiling with CONFIG_MSHV_ROOT, as the old names are referenced in the mshv root code and they haven't been fixed up in this patch. 
> + void *pv_synic_message_page; > + void *pv_synic_event_page; > > /* > * The page is only used in hv_post_message() for a TDX VM (with the > @@ -182,7 +184,8 @@ extern int hv_synic_cleanup(unsigned int cpu); > void hv_ringbuffer_pre_init(struct vmbus_channel *channel); > > int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, > - struct page *pages, u32 pagecnt, u32 max_pkt_size); > + struct page *pages, u32 pagecnt, u32 max_pkt_size, > + bool confidential); > > void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); > > diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c > index 3c9b02471760..05c2cd42fc75 100644 > --- a/drivers/hv/ring_buffer.c > +++ b/drivers/hv/ring_buffer.c > @@ -183,7 +183,8 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel) > > /* Initialize the ring buffer. */ > int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, > - struct page *pages, u32 page_cnt, u32 max_pkt_size) > + struct page *pages, u32 page_cnt, u32 max_pkt_size, > + bool confidential) > { > struct page **pages_wraparound; > int i; > @@ -207,7 +208,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info > *ring_info, > > ring_info->ring_buffer = (struct hv_ring_buffer *) > vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, > - pgprot_decrypted(PAGE_KERNEL)); > + confidential ? PAGE_KERNEL : > pgprot_decrypted(PAGE_KERNEL)); > > kfree(pages_wraparound); > if (!ring_info->ring_buffer) > diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c > index e431978fa408..375b4e45c762 100644 > --- a/drivers/hv/vmbus_drv.c > +++ b/drivers/hv/vmbus_drv.c > @@ -1034,12 +1034,9 @@ static void vmbus_onmessage_work(struct work_struct > *work) > kfree(ctx); > } > > -void vmbus_on_msg_dpc(unsigned long data) > +static void vmbus_on_msg_dpc_for(void *message_page_addr) Per earlier comment, name this __vmbus_on_msg_dpc(). 
> { > - struct hv_per_cpu_context *hv_cpu = (void *)data; > - void *page_addr = hv_cpu->synic_message_page; > - struct hv_message msg_copy, *msg = (struct hv_message *)page_addr + > - VMBUS_MESSAGE_SINT; > + struct hv_message msg_copy, *msg; > struct vmbus_channel_message_header *hdr; > enum vmbus_channel_message_type msgtype; > const struct vmbus_channel_message_table_entry *entry; > @@ -1047,6 +1044,10 @@ void vmbus_on_msg_dpc(unsigned long data) > __u8 payload_size; > u32 message_type; > > + if (!message_page_addr) > + return; > + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; > + > /* > * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as > * it is being used in 'struct vmbus_channel_message_header' definition > @@ -1172,6 +1173,14 @@ void vmbus_on_msg_dpc(unsigned long data) > vmbus_signal_eom(msg, message_type); > } > > +void vmbus_on_msg_dpc(unsigned long data) > +{ > + struct hv_per_cpu_context *hv_cpu = (void *)data; > + > + vmbus_on_msg_dpc_for(hv_cpu->hv_synic_message_page); > + vmbus_on_msg_dpc_for(hv_cpu->pv_synic_message_page); > +} > + > #ifdef CONFIG_PM_SLEEP > /* > * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for > @@ -1210,21 +1219,19 @@ static void vmbus_force_channel_rescinded(struct > vmbus_channel *channel) > #endif /* CONFIG_PM_SLEEP */ > > /* > - * Schedule all channels with events pending > + * Schedule all channels with events pending. > + * The event page can be directly checked to get the id of > + * the channel that has the interrupt pending. > */ > -static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) > +static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu, void > *event_page_addr) > { > unsigned long *recv_int_page; > u32 maxbits, relid; > + union hv_synic_event_flags *event; > > - /* > - * The event page can be directly checked to get the id of > - * the channel that has the interrupt pending. 
> - */ > - void *page_addr = hv_cpu->synic_event_page; > - union hv_synic_event_flags *event > - = (union hv_synic_event_flags *)page_addr + > - VMBUS_MESSAGE_SINT; > + if (!event_page_addr) > + return; > + event = (union hv_synic_event_flags *)event_page_addr + > VMBUS_MESSAGE_SINT; > > maxbits = HV_EVENT_FLAGS_COUNT; > recv_int_page = event->flags; > @@ -1295,26 +1302,35 @@ static void vmbus_chan_sched(struct > hv_per_cpu_context *hv_cpu) > } > } > > -static void vmbus_isr(void) > +static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void > *message_page_addr) > { > - struct hv_per_cpu_context *hv_cpu > - = this_cpu_ptr(hv_context.cpu_context); > - void *page_addr; > struct hv_message *msg; > > - vmbus_chan_sched(hv_cpu); > - > - page_addr = hv_cpu->synic_message_page; > - msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; > + if (!message_page_addr) > + return; > + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; > > /* Check if there are actual msgs to be processed */ > if (msg->header.message_type != HVMSG_NONE) { > if (msg->header.message_type == HVMSG_TIMER_EXPIRED) { > hv_stimer0_isr(); > vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED); > - } else > + } else { > tasklet_schedule(&hv_cpu->msg_dpc); > + } > } > +} > + > +static void vmbus_isr(void) > +{ > + struct hv_per_cpu_context *hv_cpu > + = this_cpu_ptr(hv_context.cpu_context); > + > + vmbus_chan_sched(hv_cpu, hv_cpu->hv_synic_event_page); > + vmbus_chan_sched(hv_cpu, hv_cpu->pv_synic_event_page); vmbus_chan_sched() scans the full hv_synic_event_flags bit array looking for bits that are set. That's 2048 bits, or 256 bytes, to scan. If Confidential VMBus is active, that scan must be done twice, touching different 256 byte memory ranges that will be ping'ing around in different CPU's caches. That could be a noticeable perf hit to interrupt handling. One possible optimization would be to keep track of the largest relID that's in use, and only scan up to that relID. 
In most VMs, this would significantly cut down the range of the scan, and would be beneficial even in VMs where Confidential VMBus isn't active. At this point I'm just noting the issue -- doing the optimization could be a separate follow-on patch. > + > + vmbus_message_sched(hv_cpu, hv_cpu->hv_synic_message_page); > + vmbus_message_sched(hv_cpu, hv_cpu->pv_synic_message_page); > > add_interrupt_randomness(vmbus_interrupt); > } > @@ -1325,11 +1341,35 @@ static irqreturn_t vmbus_percpu_isr(int irq, void > *dev_id) > return IRQ_HANDLED; > } > > -static void vmbus_percpu_work(struct work_struct *work) > +static int vmbus_setup_control_plane(void) The concept of "control plane" doesn't appear anywhere in the existing VMBus code. Perhaps rename to use existing concepts: vmbus_alloc_synic_and_connect() > { > - unsigned int cpu = smp_processor_id(); > + int ret; > + int hyperv_cpuhp_online; > + > + ret = hv_synic_alloc(); > + if (ret < 0) > + goto err_alloc; > > - hv_synic_init(cpu); > + /* > + * Initialize the per-cpu interrupt state and stimer state. > + * Then connect to the host. 
> + */ > + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", > + hv_synic_init, hv_synic_cleanup); > + if (ret < 0) > + goto err_alloc; > + hyperv_cpuhp_online = ret; > + ret = vmbus_connect(); > + if (ret) > + goto err_connect; > + return 0; > + > +err_connect: > + cpuhp_remove_state(hyperv_cpuhp_online); > + return -ENODEV; > +err_alloc: > + hv_synic_free(); > + return -ENOMEM; > } > > /* > @@ -1342,8 +1382,7 @@ static void vmbus_percpu_work(struct work_struct *work) > */ > static int vmbus_bus_init(void) > { > - int ret, cpu; > - struct work_struct __percpu *works; > + int ret; > > ret = hv_init(); > if (ret != 0) { > @@ -1378,41 +1417,21 @@ static int vmbus_bus_init(void) > } > } > > - ret = hv_synic_alloc(); > - if (ret) > - goto err_alloc; > - > - works = alloc_percpu(struct work_struct); > - if (!works) { > - ret = -ENOMEM; > - goto err_alloc; > - } > - > /* > - * Initialize the per-cpu interrupt state and stimer state. > - * Then connect to the host. > + * Attempt to establish the confidential control plane first if this VM > is > + .* a hardware confidential VM, and the paravisor is present. Spurious "." before the "*" > */ > - cpus_read_lock(); > - for_each_online_cpu(cpu) { > - struct work_struct *work = per_cpu_ptr(works, cpu); > + ret = -ENODEV; > + if (ms_hyperv.paravisor_present && (hv_isolation_type_tdx() || > hv_isolation_type_snp())) { > + is_confidential = true; > + ret = vmbus_setup_control_plane(); > + is_confidential = ret == 0; Or perhaps better, is_confidential = !ret; > > - INIT_WORK(work, vmbus_percpu_work); > - schedule_work_on(cpu, work); This recently added code to do hv_synic_init() in parallel on all CPUs has been lost. See commit 87c9741a38c4. > + pr_info("VMBus control plane is confidential: %d\n", > is_confidential); Just say "VMBus is confidential" and omit the concept of control plane. 
> } > > - for_each_online_cpu(cpu) > - flush_work(per_cpu_ptr(works, cpu)); > - > - /* Register the callbacks for possible CPU online/offline'ing */ > - ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, > "hyperv/vmbus:online", > - hv_synic_init, > hv_synic_cleanup); > - cpus_read_unlock(); > - free_percpu(works); > - if (ret < 0) > - goto err_alloc; > - hyperv_cpuhp_online = ret; > - > - ret = vmbus_connect(); > + if (!is_confidential) > + ret = vmbus_setup_control_plane(); > if (ret) > goto err_connect; > > @@ -1428,9 +1447,6 @@ static int vmbus_bus_init(void) > return 0; > > err_connect: > - cpuhp_remove_state(hyperv_cpuhp_online); > -err_alloc: > - hv_synic_free(); > if (vmbus_irq == -1) { > hv_remove_vmbus_handler(); > } else { > -- > 2.43.0 >