On 6/11/2025 12:27 AM, Naman Jain wrote:
> Provide an interface for a Virtual Machine Monitor (VMM), such as OpenVMM
> running as the OpenHCL paravisor, to control VTL0 (Virtual Trust Level 0).
> Expose devices and support IOCTLs for features such as VTL creation,
> VTL0 memory management, context switching, making hypercalls,
> mapping VTL0 address space into VTL2 userspace, and receiving new VMBus
> messages and channel events in VTL2.
> 
> Co-developed-by: Roman Kisel <rom...@linux.microsoft.com>
> Signed-off-by: Roman Kisel <rom...@linux.microsoft.com>
> Co-developed-by: Saurabh Sengar <ssen...@linux.microsoft.com>
> Signed-off-by: Saurabh Sengar <ssen...@linux.microsoft.com>
> Reviewed-by: Roman Kisel <rom...@linux.microsoft.com>
> Reviewed-by: Alok Tiwari <alok.a.tiw...@oracle.com>
> Message-ID: <20250512140432.2387503-3-namj...@linux.microsoft.com>
> Reviewed-by: Saurabh Sengar <ssen...@linux.microsoft.com>
> Signed-off-by: Naman Jain <namj...@linux.microsoft.com>
> ---
>  drivers/hv/Kconfig          |   23 +
>  drivers/hv/Makefile         |    7 +-
>  drivers/hv/mshv_vtl.h       |   52 +
>  drivers/hv/mshv_vtl_main.c  | 1783 +++++++++++++++++++++++++++++++++++
>  include/hyperv/hvgdk_mini.h |   81 ++
>  include/hyperv/hvhdk.h      |    1 +
>  include/uapi/linux/mshv.h   |   82 ++
>  7 files changed, 2028 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/hv/mshv_vtl.h
>  create mode 100644 drivers/hv/mshv_vtl_main.c
> 
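
To make the interface concrete for reviewers: the expected VTL2 userspace flow
(my own untested sketch, using only the uapi names this patch adds in
include/uapi/linux/mshv.h) is roughly:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/mshv.h>

	int mshv = open("/dev/mshv", O_RDWR | O_CLOEXEC);

	/* Probe an optional capability before relying on it. */
	unsigned int cap = MSHV_CAP_REGISTER_PAGE;
	int has_reg_page = ioctl(mshv, MSHV_CHECK_EXTENSION, &cap);

	/* Returns a new fd representing the lower VTL. */
	int vtl_fd = ioctl(mshv, MSHV_CREATE_VTL, 0);

The vtl_fd is then used for MSHV_ADD_VTL0_MEMORY, MSHV_GET/SET_VP_REGISTERS,
MSHV_SET_POLL_FILE, MSHV_RETURN_TO_LOWER_VTL and for mmap()ing the per-CPU
run pages, as the code below shows.
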
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 1cd188b73b74..1403b4abbece 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -73,4 +73,27 @@ config MSHV_ROOT
>  
>         If unsure, say N.
>  
> +config MSHV_VTL
> +     tristate "Microsoft Hyper-V VTL driver"
> +     depends on HYPERV && X86_64
> +     # Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL.
> +     # VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs,
> +     # especially those with large memory requirements.
> +     depends on TRANSPARENT_HUGEPAGE
> +     # MTRRs are controlled by VTL0, and are not specific to individual VTLs.
> +     # Therefore, do not attempt to access or modify MTRRs here.
> +     depends on !MTRR
> +     select CPUMASK_OFFSTACK
> +     select HYPERV_VTL_MODE
> +     default n
> +     help
> +       Select this option to enable Hyper-V VTL driver support.
> +       This driver provides interfaces for a Virtual Machine Monitor (VMM) running in
> +       VTL2 userspace to create VTLs and partitions, set up and manage VTL0 memory, and
> +       make direct hypercalls. It also allows mapping VTL0's address space into a
> +       usermode process in VTL2 and supports receiving new VMBus messages and channel
> +       events in VTL2.
> +
> +       If unsure, say N.
> +
>  endmenu
> diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
> index 976189c725dc..c53a0df746b7 100644
> --- a/drivers/hv/Makefile
> +++ b/drivers/hv/Makefile
> @@ -3,6 +3,7 @@ obj-$(CONFIG_HYPERV)          += hv_vmbus.o
>  obj-$(CONFIG_HYPERV_UTILS)   += hv_utils.o
>  obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
>  obj-$(CONFIG_MSHV_ROOT)              += mshv_root.o
> +obj-$(CONFIG_MSHV_VTL)          += mshv_vtl.o
>  
>  CFLAGS_hv_trace.o = -I$(src)
>  CFLAGS_hv_balloon.o = -I$(src)
> @@ -14,7 +15,11 @@ hv_vmbus-$(CONFIG_HYPERV_TESTING)  += hv_debugfs.o
>  hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
>  mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
>              mshv_root_hv_call.o mshv_portid_table.o
> +mshv_vtl-y := mshv_vtl_main.o
>  
>  # Code that must be built-in
>  obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o
> -obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o
> +obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o
> +ifneq ($(CONFIG_MSHV_ROOT) $(CONFIG_MSHV_VTL),)
> +    obj-y += mshv_common.o
> +endif
> diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h
> new file mode 100644
> index 000000000000..f765fda3601b
> --- /dev/null
> +++ b/drivers/hv/mshv_vtl.h
> @@ -0,0 +1,52 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _MSHV_VTL_H
> +#define _MSHV_VTL_H
> +
> +#include <linux/mshv.h>
> +#include <linux/types.h>
> +#include <asm/fpu/types.h>
> +
> +struct mshv_vtl_cpu_context {
> +     union {
> +             struct {
> +                     u64 rax;
> +                     u64 rcx;
> +                     u64 rdx;
> +                     u64 rbx;
> +                     u64 cr2;
> +                     u64 rbp;
> +                     u64 rsi;
> +                     u64 rdi;
> +                     u64 r8;
> +                     u64 r9;
> +                     u64 r10;
> +                     u64 r11;
> +                     u64 r12;
> +                     u64 r13;
> +                     u64 r14;
> +                     u64 r15;
> +             };
> +             u64 gp_regs[16];
> +     };
> +
> +     struct fxregs_state fx_state;
> +};
> +
> +struct mshv_vtl_run {
> +     u32 cancel;
> +     u32 vtl_ret_action_size;
> +     u32 pad[2];
> +     char exit_message[MSHV_MAX_RUN_MSG_SIZE];
> +     union {
> +             struct mshv_vtl_cpu_context cpu_context;
> +
> +             /*
> +              * Reserving room for the cpu context to grow and to maintain compatibility
> +              * with user mode.
> +              */
> +             char reserved[1024];
> +     };
> +     char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE];
> +};
> +
> +#endif /* _MSHV_VTL_H */
> diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c
> new file mode 100644
> index 000000000000..b1717b118772
> --- /dev/null
> +++ b/drivers/hv/mshv_vtl_main.c
> @@ -0,0 +1,1783 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2023, Microsoft Corporation.
> + *
> + * Author:
> + *   Roman Kisel <rom...@linux.microsoft.com>
> + *   Saurabh Sengar <ssen...@linux.microsoft.com>
> + *   Naman Jain <namj...@linux.microsoft.com>
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/miscdevice.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/pfn_t.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/count_zeros.h>
> +#include <linux/eventfd.h>
> +#include <linux/poll.h>
> +#include <linux/file.h>
> +#include <linux/vmalloc.h>
> +#include <asm/debugreg.h>
> +#include <asm/mshyperv.h>
> +#include <trace/events/ipi.h>
> +#include <uapi/asm/mtrr.h>
> +#include <uapi/linux/mshv.h>
> +#include <hyperv/hvhdk.h>
> +
> +#include "../../kernel/fpu/legacy.h"
> +#include "mshv.h"
> +#include "mshv_vtl.h"
> +#include "hyperv_vmbus.h"
> +
> +MODULE_AUTHOR("Microsoft");
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver");
> +
> +#define MSHV_ENTRY_REASON_LOWER_VTL_CALL     0x1
> +#define MSHV_ENTRY_REASON_INTERRUPT          0x2
> +#define MSHV_ENTRY_REASON_INTERCEPT          0x3
> +
> +#define MAX_GUEST_MEM_SIZE   BIT_ULL(40)
> +#define MSHV_PG_OFF_CPU_MASK 0xFFFF
> +#define MSHV_REAL_OFF_SHIFT  16
> +#define MSHV_RUN_PAGE_OFFSET 0
> +#define MSHV_REG_PAGE_OFFSET 1
> +#define VTL2_VMBUS_SINT_INDEX        7
> +
> +static struct device *mem_dev;
> +
> +static struct tasklet_struct msg_dpc;
> +static wait_queue_head_t fd_wait_queue;
> +static bool has_message;
> +static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT];
> +static DEFINE_MUTEX(flag_lock);
> +static bool __read_mostly mshv_has_reg_page;
> +
> +struct mshv_vtl_hvcall_fd {
> +     u64 allow_bitmap[2 * PAGE_SIZE];
> +     bool allow_map_initialized;
> +     /*
> +      * Used to protect hvcall setup in IOCTLs
> +      */
> +     struct mutex init_mutex;
> +     struct miscdevice *dev;
> +};
> +
> +struct mshv_vtl_poll_file {
> +     struct file *file;
> +     wait_queue_entry_t wait;
> +     wait_queue_head_t *wqh;
> +     poll_table pt;
> +     int cpu;
> +};
> +
> +struct mshv_vtl {
> +     struct device *module_dev;
> +     u64 id;
> +};
> +
> +union mshv_synic_overlay_page_msr {
> +     u64 as_uint64;
> +     struct {
> +             u64 enabled: 1;
> +             u64 reserved: 11;
> +             u64 pfn: 52;
> +     };
> +};
> +
> +union hv_register_vsm_capabilities {
> +     u64 as_uint64;
> +     struct {
> +             u64 dr6_shared: 1;
> +             u64 mbec_vtl_mask: 16;
> +             u64 deny_lower_vtl_startup: 1;
> +             u64 supervisor_shadow_stack: 1;
> +             u64 hardware_hvpt_available: 1;
> +             u64 software_hvpt_available: 1;
> +             u64 hardware_hvpt_range_bits: 6;
> +             u64 intercept_page_available: 1;
> +             u64 return_action_available: 1;
> +             u64 reserved: 35;
> +     } __packed;
> +};
> +
> +union hv_register_vsm_page_offsets {
> +     struct {
> +             u64 vtl_call_offset : 12;
> +             u64 vtl_return_offset : 12;
> +             u64 reserved_mbz : 40;
> +     };
> +     u64 as_uint64;
> +} __packed;
> +
> +struct mshv_vtl_per_cpu {
> +     struct mshv_vtl_run *run;
> +     struct page *reg_page;
> +};
> +
> +static struct mutex mshv_vtl_poll_file_lock;
> +static union hv_register_vsm_page_offsets mshv_vsm_page_offsets;
> +static union hv_register_vsm_capabilities mshv_vsm_capabilities;
> +
> +static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file);
> +static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions);
> +static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu);
> +
> +static const struct file_operations mshv_vtl_fops;
> +
> +static long
> +mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev)
> +{
> +     struct mshv_vtl *vtl;
> +     struct file *file;
> +     int fd;
> +
> +     vtl = kzalloc(sizeof(*vtl), GFP_KERNEL);
> +     if (!vtl)
> +             return -ENOMEM;
> +
> +     fd = get_unused_fd_flags(O_CLOEXEC);
> +     if (fd < 0) {
> +             kfree(vtl);
> +             return fd;
> +     }
> +     file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops,
> +                               vtl, O_RDWR);
> +     if (IS_ERR(file)) {
> +             put_unused_fd(fd);
> +             kfree(vtl);
> +             return PTR_ERR(file);
> +     }
> +     vtl->module_dev = module_dev;
> +     fd_install(fd, file);
> +
> +     return fd;
> +}
> +
> +static long
> +mshv_ioctl_check_extension(void __user *user_arg)
> +{
> +     u32 arg;
> +
> +     if (copy_from_user(&arg, user_arg, sizeof(arg)))
> +             return -EFAULT;
> +
> +     switch (arg) {
> +     case MSHV_CAP_CORE_API_STABLE:
> +             return 0;
> +     case MSHV_CAP_REGISTER_PAGE:
> +             return mshv_has_reg_page;
> +     case MSHV_CAP_VTL_RETURN_ACTION:
> +             return mshv_vsm_capabilities.return_action_available;
> +     case MSHV_CAP_DR6_SHARED:
> +             return mshv_vsm_capabilities.dr6_shared;
> +     }
> +
> +     return -EOPNOTSUPP;
> +}
> +
> +static long
> +mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
> +{
> +     struct miscdevice *misc = filp->private_data;
> +
> +     switch (ioctl) {
> +     case MSHV_CHECK_EXTENSION:
> +             return mshv_ioctl_check_extension((void __user *)arg);
> +     case MSHV_CREATE_VTL:
> +             return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device);
> +     }
> +
> +     return -ENOTTY;
> +}
> +
> +static const struct file_operations mshv_dev_fops = {
> +     .owner          = THIS_MODULE,
> +     .unlocked_ioctl = mshv_dev_ioctl,
> +     .llseek         = noop_llseek,
> +};
> +
> +static struct miscdevice mshv_dev = {
> +     .minor = MISC_DYNAMIC_MINOR,
> +     .name = "mshv",
> +     .fops = &mshv_dev_fops,
> +     .mode = 0600,
> +};
> +
> +static struct mshv_vtl_run *mshv_vtl_this_run(void)
> +{
> +     return *this_cpu_ptr(&mshv_vtl_per_cpu.run);
> +}
> +
> +static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu)
> +{
> +     return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu);
> +}
> +
> +static struct page *mshv_vtl_cpu_reg_page(int cpu)
> +{
> +     return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu);
> +}
> +
> +static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu)
> +{
> +     struct hv_register_assoc reg_assoc = {};
> +     union mshv_synic_overlay_page_msr overlay = {};
> +     struct page *reg_page;
> +     union hv_input_vtl vtl = { .as_uint8 = 0 };
> +
> +     reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL);
> +     if (!reg_page) {
> +             WARN(1, "failed to allocate register page\n");
> +             return;
> +     }
> +
> +     overlay.enabled = 1;
> +     overlay.pfn = page_to_phys(reg_page) >> HV_HYP_PAGE_SHIFT;
> +     reg_assoc.name = HV_X64_REGISTER_REG_PAGE;
> +     reg_assoc.value.reg64 = overlay.as_uint64;
> +
> +     if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> +                                  1, vtl, &reg_assoc)) {
> +             WARN(1, "failed to setup register page\n");
> +             __free_page(reg_page);
> +             return;
> +     }
> +
> +     per_cpu->reg_page = reg_page;
> +     mshv_has_reg_page = true;
> +}
> +
> +static void mshv_vtl_synic_enable_regs(unsigned int cpu)
> +{
> +     union hv_synic_sint sint;
> +
> +     sint.as_uint64 = 0;
> +     sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> +     sint.masked = false;
> +     sint.auto_eoi = hv_recommend_using_aeoi();
> +
> +     /* Enable intercepts */
> +     if (!mshv_vsm_capabilities.intercept_page_available)
> +             hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
> +                        sint.as_uint64);
> +
> +     /* The VTL2 host VSP SINT is (un)masked when user mode requests it */
> +}
> +
> +static int mshv_vtl_get_vsm_regs(void)
> +{
> +     struct hv_register_assoc registers[2];
> +     union hv_input_vtl input_vtl;
> +     int ret, count = 2;
> +
> +     input_vtl.as_uint8 = 0;
> +     registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS;
> +     registers[1].name = HV_REGISTER_VSM_CAPABILITIES;
> +
> +     ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> +                                    count, input_vtl, registers);
> +     if (ret)
> +             return ret;
> +
> +     mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64;
> +     mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64;
> +
> +     return ret;
> +}
> +
> +static int mshv_vtl_configure_vsm_partition(struct device *dev)
> +{
> +     union hv_register_vsm_partition_config config;
> +     struct hv_register_assoc reg_assoc;
> +     union hv_input_vtl input_vtl;
> +
> +     config.as_uint64 = 0;
> +     config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK;
> +     config.enable_vtl_protection = 1;
> +     config.zero_memory_on_reset = 1;
> +     config.intercept_vp_startup = 1;
> +     config.intercept_cpuid_unimplemented = 1;
> +
> +     if (mshv_vsm_capabilities.intercept_page_available) {
> +             dev_dbg(dev, "using intercept page\n");
> +             config.intercept_page = 1;
> +     }
> +
> +     reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG;
> +     reg_assoc.value.reg64 = config.as_uint64;
> +     input_vtl.as_uint8 = 0;
> +
> +     return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> +                                    1, input_vtl, &reg_assoc);
> +}
> +
> +static void mshv_vtl_vmbus_isr(void)
> +{
> +     struct hv_per_cpu_context *per_cpu;
> +     struct hv_message *msg;
> +     u32 message_type;
> +     union hv_synic_event_flags *event_flags;
> +     unsigned long word;
> +     int i, j;
> +     struct eventfd_ctx *eventfd;
> +
> +     per_cpu = this_cpu_ptr(hv_context.cpu_context);
> +     if (smp_processor_id() == 0) {
> +             msg = (struct hv_message *)per_cpu->synic_message_page + VTL2_VMBUS_SINT_INDEX;
> +             message_type = READ_ONCE(msg->header.message_type);
> +             if (message_type != HVMSG_NONE)
> +                     tasklet_schedule(&msg_dpc);
> +     }
> +
> +     event_flags = (union hv_synic_event_flags *)per_cpu->synic_event_page +
> +                     VTL2_VMBUS_SINT_INDEX;
> +     for (i = 0; i < HV_EVENT_FLAGS_LONG_COUNT; i++) {
> +             if (READ_ONCE(event_flags->flags[i])) {
> +                     word = xchg(&event_flags->flags[i], 0);
> +                     for_each_set_bit(j, &word, BITS_PER_LONG) {
> +                             rcu_read_lock();
> +                             eventfd = READ_ONCE(flag_eventfds[i * BITS_PER_LONG + j]);
> +                             if (eventfd)
> +                                     eventfd_signal(eventfd);
> +                             rcu_read_unlock();
> +                     }
> +             }
> +     }
> +
> +     vmbus_isr();
> +}
> +
> +static int mshv_vtl_alloc_context(unsigned int cpu)
> +{
> +     struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu);
> +     struct page *run_page;
> +
> +     run_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +     if (!run_page)
> +             return -ENOMEM;
> +
> +     per_cpu->run = page_address(run_page);
> +     if (mshv_vsm_capabilities.intercept_page_available)
> +             mshv_vtl_configure_reg_page(per_cpu);
> +
> +     mshv_vtl_synic_enable_regs(cpu);
> +
> +     return 0;
> +}
> +
> +static int mshv_vtl_cpuhp_online;
> +
> +static int hv_vtl_setup_synic(void)
> +{
> +     int ret;
> +
> +     /* Use our isr to first filter out packets destined for userspace */
> +     hv_setup_vmbus_handler(mshv_vtl_vmbus_isr);
> +
> +     ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online",
> +                             mshv_vtl_alloc_context, NULL);
> +     if (ret < 0) {
> +             hv_remove_vmbus_handler();
> +             return ret;
> +     }
> +
> +     mshv_vtl_cpuhp_online = ret;
> +
> +     return 0;
> +}
> +
> +static void hv_vtl_remove_synic(void)
> +{
> +     hv_remove_vmbus_handler();
> +     cpuhp_remove_state(mshv_vtl_cpuhp_online);
> +}
> +
> +static int vtl_get_vp_registers(u16 count,
> +                             struct hv_register_assoc *registers)
> +{
> +     union hv_input_vtl input_vtl;
> +
> +     input_vtl.as_uint8 = 0;
> +     input_vtl.use_target_vtl = 1;
> +
> +     return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> +                                     count, input_vtl, registers);
> +}
> +
> +static int vtl_set_vp_registers(u16 count,
> +                             struct hv_register_assoc *registers)
> +{
> +     union hv_input_vtl input_vtl;
> +
> +     input_vtl.as_uint8 = 0;
> +     input_vtl.use_target_vtl = 1;
> +
> +     return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> +                                     count, input_vtl, registers);
> +}
> +
> +static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg)
> +{
> +     struct mshv_vtl_ram_disposition vtl0_mem;
> +     struct dev_pagemap *pgmap;
> +     void *addr;
> +
> +     if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem)))
> +             return -EFAULT;
> +
> +     if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) {
> +             dev_err(vtl->module_dev, "range start pfn (%llx) >= end pfn (%llx)\n",
> +                     vtl0_mem.start_pfn, vtl0_mem.last_pfn);
> +             return -EFAULT;
> +     }
> +
> +     pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL);
> +     if (!pgmap)
> +             return -ENOMEM;
> +
> +     pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn);
> +     pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1;
> +     pgmap->nr_range = 1;
> +     pgmap->type = MEMORY_DEVICE_GENERIC;
> +
> +     /*
> +      * Determine the highest page order that can be used for the given memory range.
> +      * This works best when the range is aligned; i.e. both the start and the length.
> +      */
> +     pgmap->vmemmap_shift = count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn);
> +     dev_dbg(vtl->module_dev,
> +             "Add VTL0 memory: start_pfn: 0x%llx, last_pfn: 0x%llx, page order: %lu\n",
> +             vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift);
> +
> +     addr = devm_memremap_pages(mem_dev, pgmap);
> +     if (IS_ERR(addr)) {
> +             dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr));
> +             kfree(pgmap);
> +             return -EFAULT;
> +     }
> +
> +     /*
> +      * Don't free pgmap, since it has to stick around until the memory
> +      * is unmapped, which will never happen as there is no scenario
> +      * where VTL0 can be released/shutdown without bringing down VTL2.
> +      */
> +     return 0;
> +}
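
A quick worked example of the page-order computation above (my own numbers,
not from the patch): with start_pfn = 0x800 and last_pfn = 0x1800,
start_pfn | last_pfn = 0x1800, which has 11 trailing zero bits, so
vmemmap_shift becomes 11 and the range is backed in 2^11-page (8 MiB with 4K
pages) chunks. An unaligned start such as 0x801 would drop the shift to 0.
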
> +
> +static void mshv_vtl_cancel(int cpu)
> +{
> +     int here = get_cpu();
> +
> +     if (here != cpu) {
> +             if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1))
> +                     smp_send_reschedule(cpu);
> +     } else {
> +             WRITE_ONCE(mshv_vtl_this_run()->cancel, 1);
> +     }
> +     put_cpu();
> +}
> +
> +static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
> +{
> +     struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait);
> +
> +     mshv_vtl_cancel(poll_file->cpu);
> +
> +     return 0;
> +}
> +
> +static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
> +{
> +     struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt);
> +
> +     WARN_ON(poll_file->wqh);
> +     poll_file->wqh = wqh;
> +     add_wait_queue(wqh, &poll_file->wait);
> +}
> +
> +static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input)
> +{
> +     struct file *file, *old_file;
> +     struct mshv_vtl_poll_file *poll_file;
> +     struct mshv_vtl_set_poll_file input;
> +
> +     if (copy_from_user(&input, user_input, sizeof(input)))
> +             return -EFAULT;
> +
> +     if (!cpu_online(input.cpu))
> +             return -EINVAL;
> +
> +     file = fget(input.fd);
> +     if (!file)
> +             return -EBADFD;
> +
> +     poll_file = per_cpu_ptr(&mshv_vtl_poll_file, READ_ONCE(input.cpu));
> +     if (!poll_file)
> +             return -EINVAL;
> +
> +     mutex_lock(&mshv_vtl_poll_file_lock);
> +
> +     if (poll_file->wqh)
> +             remove_wait_queue(poll_file->wqh, &poll_file->wait);
> +     poll_file->wqh = NULL;
> +
> +     old_file = poll_file->file;
> +     poll_file->file = file;
> +     poll_file->cpu = input.cpu;
> +
> +     if (file) {
> +             init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake);
> +             init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc);
> +             vfs_poll(file, &poll_file->pt);
> +     }
> +
> +     mutex_unlock(&mshv_vtl_poll_file_lock);
> +
> +     if (old_file)
> +             fput(old_file);
> +
> +     return 0;
> +}
> +
> +static int mshv_vtl_set_reg(struct hv_register_assoc *regs)
> +{
> +     u64 reg64;
> +     enum hv_register_name gpr_name;
> +
> +     gpr_name = regs->name;
> +     reg64 = regs->value.reg64;
> +
> +     switch (gpr_name) {
> +     case HV_X64_REGISTER_DR0:
> +             native_set_debugreg(0, reg64);
> +             break;
> +     case HV_X64_REGISTER_DR1:
> +             native_set_debugreg(1, reg64);
> +             break;
> +     case HV_X64_REGISTER_DR2:
> +             native_set_debugreg(2, reg64);
> +             break;
> +     case HV_X64_REGISTER_DR3:
> +             native_set_debugreg(3, reg64);
> +             break;
> +     case HV_X64_REGISTER_DR6:
> +             if (!mshv_vsm_capabilities.dr6_shared)
> +                     goto hypercall;
> +             native_set_debugreg(6, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_CAP:
> +             wrmsrl(MSR_MTRRcap, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_DEF_TYPE:
> +             wrmsrl(MSR_MTRRdefType, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0:
> +             wrmsrl(MTRRphysBase_MSR(0), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1:
> +             wrmsrl(MTRRphysBase_MSR(1), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2:
> +             wrmsrl(MTRRphysBase_MSR(2), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3:
> +             wrmsrl(MTRRphysBase_MSR(3), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4:
> +             wrmsrl(MTRRphysBase_MSR(4), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5:
> +             wrmsrl(MTRRphysBase_MSR(5), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6:
> +             wrmsrl(MTRRphysBase_MSR(6), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7:
> +             wrmsrl(MTRRphysBase_MSR(7), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8:
> +             wrmsrl(MTRRphysBase_MSR(8), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9:
> +             wrmsrl(MTRRphysBase_MSR(9), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA:
> +             wrmsrl(MTRRphysBase_MSR(0xa), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB:
> +             wrmsrl(MTRRphysBase_MSR(0xb), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC:
> +             wrmsrl(MTRRphysBase_MSR(0xc), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASED:
> +             wrmsrl(MTRRphysBase_MSR(0xd), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE:
> +             wrmsrl(MTRRphysBase_MSR(0xe), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF:
> +             wrmsrl(MTRRphysBase_MSR(0xf), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0:
> +             wrmsrl(MTRRphysMask_MSR(0), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1:
> +             wrmsrl(MTRRphysMask_MSR(1), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2:
> +             wrmsrl(MTRRphysMask_MSR(2), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3:
> +             wrmsrl(MTRRphysMask_MSR(3), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4:
> +             wrmsrl(MTRRphysMask_MSR(4), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5:
> +             wrmsrl(MTRRphysMask_MSR(5), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6:
> +             wrmsrl(MTRRphysMask_MSR(6), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7:
> +             wrmsrl(MTRRphysMask_MSR(7), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8:
> +             wrmsrl(MTRRphysMask_MSR(8), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9:
> +             wrmsrl(MTRRphysMask_MSR(9), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA:
> +             wrmsrl(MTRRphysMask_MSR(0xa), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB:
> +             wrmsrl(MTRRphysMask_MSR(0xb), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC:
> +             wrmsrl(MTRRphysMask_MSR(0xc), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD:
> +             wrmsrl(MTRRphysMask_MSR(0xd), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE:
> +             wrmsrl(MTRRphysMask_MSR(0xe), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF:
> +             wrmsrl(MTRRphysMask_MSR(0xf), reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX64K00000:
> +             wrmsrl(MSR_MTRRfix64K_00000, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX16K80000:
> +             wrmsrl(MSR_MTRRfix16K_80000, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX16KA0000:
> +             wrmsrl(MSR_MTRRfix16K_A0000, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KC0000:
> +             wrmsrl(MSR_MTRRfix4K_C0000, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KC8000:
> +             wrmsrl(MSR_MTRRfix4K_C8000, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KD0000:
> +             wrmsrl(MSR_MTRRfix4K_D0000, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KD8000:
> +             wrmsrl(MSR_MTRRfix4K_D8000, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KE0000:
> +             wrmsrl(MSR_MTRRfix4K_E0000, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KE8000:
> +             wrmsrl(MSR_MTRRfix4K_E8000, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KF0000:
> +             wrmsrl(MSR_MTRRfix4K_F0000, reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KF8000:
> +             wrmsrl(MSR_MTRRfix4K_F8000, reg64);
> +             break;
> +
> +     default:
> +             goto hypercall;
> +     }
> +
> +     return 0;
> +
> +hypercall:
> +     return 1;
> +}
> +
> +static int mshv_vtl_get_reg(struct hv_register_assoc *regs)
> +{
> +     u64 *reg64;
> +     enum hv_register_name gpr_name;
> +
> +     gpr_name = regs->name;
> +     reg64 = (u64 *)&regs->value.reg64;
> +
> +     switch (gpr_name) {
> +     case HV_X64_REGISTER_DR0:
> +             *reg64 = native_get_debugreg(0);
> +             break;
> +     case HV_X64_REGISTER_DR1:
> +             *reg64 = native_get_debugreg(1);
> +             break;
> +     case HV_X64_REGISTER_DR2:
> +             *reg64 = native_get_debugreg(2);
> +             break;
> +     case HV_X64_REGISTER_DR3:
> +             *reg64 = native_get_debugreg(3);
> +             break;
> +     case HV_X64_REGISTER_DR6:
> +             if (!mshv_vsm_capabilities.dr6_shared)
> +                     goto hypercall;
> +             *reg64 = native_get_debugreg(6);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_CAP:
> +             rdmsrl(MSR_MTRRcap, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_DEF_TYPE:
> +             rdmsrl(MSR_MTRRdefType, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0:
> +             rdmsrl(MTRRphysBase_MSR(0), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1:
> +             rdmsrl(MTRRphysBase_MSR(1), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2:
> +             rdmsrl(MTRRphysBase_MSR(2), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3:
> +             rdmsrl(MTRRphysBase_MSR(3), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4:
> +             rdmsrl(MTRRphysBase_MSR(4), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5:
> +             rdmsrl(MTRRphysBase_MSR(5), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6:
> +             rdmsrl(MTRRphysBase_MSR(6), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7:
> +             rdmsrl(MTRRphysBase_MSR(7), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8:
> +             rdmsrl(MTRRphysBase_MSR(8), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9:
> +             rdmsrl(MTRRphysBase_MSR(9), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA:
> +             rdmsrl(MTRRphysBase_MSR(0xa), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB:
> +             rdmsrl(MTRRphysBase_MSR(0xb), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC:
> +             rdmsrl(MTRRphysBase_MSR(0xc), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASED:
> +             rdmsrl(MTRRphysBase_MSR(0xd), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE:
> +             rdmsrl(MTRRphysBase_MSR(0xe), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF:
> +             rdmsrl(MTRRphysBase_MSR(0xf), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0:
> +             rdmsrl(MTRRphysMask_MSR(0), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1:
> +             rdmsrl(MTRRphysMask_MSR(1), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2:
> +             rdmsrl(MTRRphysMask_MSR(2), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3:
> +             rdmsrl(MTRRphysMask_MSR(3), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4:
> +             rdmsrl(MTRRphysMask_MSR(4), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5:
> +             rdmsrl(MTRRphysMask_MSR(5), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6:
> +             rdmsrl(MTRRphysMask_MSR(6), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7:
> +             rdmsrl(MTRRphysMask_MSR(7), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8:
> +             rdmsrl(MTRRphysMask_MSR(8), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9:
> +             rdmsrl(MTRRphysMask_MSR(9), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA:
> +             rdmsrl(MTRRphysMask_MSR(0xa), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB:
> +             rdmsrl(MTRRphysMask_MSR(0xb), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC:
> +             rdmsrl(MTRRphysMask_MSR(0xc), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD:
> +             rdmsrl(MTRRphysMask_MSR(0xd), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE:
> +             rdmsrl(MTRRphysMask_MSR(0xe), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF:
> +             rdmsrl(MTRRphysMask_MSR(0xf), *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX64K00000:
> +             rdmsrl(MSR_MTRRfix64K_00000, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX16K80000:
> +             rdmsrl(MSR_MTRRfix16K_80000, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX16KA0000:
> +             rdmsrl(MSR_MTRRfix16K_A0000, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KC0000:
> +             rdmsrl(MSR_MTRRfix4K_C0000, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KC8000:
> +             rdmsrl(MSR_MTRRfix4K_C8000, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KD0000:
> +             rdmsrl(MSR_MTRRfix4K_D0000, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KD8000:
> +             rdmsrl(MSR_MTRRfix4K_D8000, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KE0000:
> +             rdmsrl(MSR_MTRRfix4K_E0000, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KE8000:
> +             rdmsrl(MSR_MTRRfix4K_E8000, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KF0000:
> +             rdmsrl(MSR_MTRRfix4K_F0000, *reg64);
> +             break;
> +     case HV_X64_REGISTER_MSR_MTRR_FIX4KF8000:
> +             rdmsrl(MSR_MTRRfix4K_F8000, *reg64);
> +             break;
> +
> +     default:
> +             goto hypercall;
> +     }
> +
> +     return 0;
> +
> +hypercall:
> +     return 1;
> +}
> +
> +static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0)
> +{
> +     struct hv_vp_assist_page *hvp;
> +     u64 hypercall_addr;
> +
> +     register u64 r8 asm("r8");
> +     register u64 r9 asm("r9");
> +     register u64 r10 asm("r10");
> +     register u64 r11 asm("r11");
> +     register u64 r12 asm("r12");
> +     register u64 r13 asm("r13");
> +     register u64 r14 asm("r14");
> +     register u64 r15 asm("r15");
> +
> +     hvp = hv_vp_assist_page[smp_processor_id()];
> +
> +     /*
> +      * Process the signal events set directly in the run page, if any.
> +      */
> +     if (mshv_vsm_capabilities.return_action_available) {
> +             u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size);
> +
> +             WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0);
> +
> +             /*
> +              * Hypervisor will take care of clearing out the actions
> +              * set in the assist page.
> +              */
> +             memcpy(hvp->vtl_ret_actions,
> +                    mshv_vtl_this_run()->vtl_ret_actions,
> +                    min_t(u32, offset, sizeof(hvp->vtl_ret_actions)));
> +     }
> +
> +     hvp->vtl_ret_x64rax = vtl0->rax;
> +     hvp->vtl_ret_x64rcx = vtl0->rcx;
> +
> +     hypercall_addr = (u64)((u8 *)hv_hypercall_pg + mshv_vsm_page_offsets.vtl_return_offset);
> +
> +     kernel_fpu_begin_mask(0);
> +     fxrstor(&vtl0->fx_state);
> +     native_write_cr2(vtl0->cr2);
> +     r8 = vtl0->r8;
> +     r9 = vtl0->r9;
> +     r10 = vtl0->r10;
> +     r11 = vtl0->r11;
> +     r12 = vtl0->r12;
> +     r13 = vtl0->r13;
> +     r14 = vtl0->r14;
> +     r15 = vtl0->r15;
> +
> +     asm __volatile__ (
> +     /* Save rbp pointer to the lower VTL, keep the stack 16-byte aligned */
> +             "pushq  %%rbp\n"
> +             "pushq  %%rcx\n"
> +     /* Restore the lower VTL's rbp */
> +             "movq   (%%rcx), %%rbp\n"
> +     /* Load return kind into rcx (HV_VTL_RETURN_INPUT_NORMAL_RETURN == 0) */
> +             "xorl   %%ecx, %%ecx\n"
> +     /* Transition to the lower VTL */
> +             CALL_NOSPEC
> +     /* Save VTL0's rax and rcx temporarily on 16-byte aligned stack */
> +             "pushq  %%rax\n"
> +             "pushq  %%rcx\n"
> +     /* Restore pointer to lower VTL rbp */
> +             "movq   16(%%rsp), %%rax\n"
> +     /* Save the lower VTL's rbp */
> +             "movq   %%rbp, (%%rax)\n"
> +     /* Restore saved registers */
> +             "movq   8(%%rsp), %%rax\n"
> +             "movq   24(%%rsp), %%rbp\n"
> +             "addq   $32, %%rsp\n"
> +
> +             : "=a"(vtl0->rax), "=c"(vtl0->rcx),
> +               "+d"(vtl0->rdx), "+b"(vtl0->rbx), "+S"(vtl0->rsi), "+D"(vtl0->rdi),
> +               "+r"(r8), "+r"(r9), "+r"(r10), "+r"(r11),
> +               "+r"(r12), "+r"(r13), "+r"(r14), "+r"(r15)
> +             : THUNK_TARGET(hypercall_addr), "c"(&vtl0->rbp)
> +             : "cc", "memory");
> +
> +     vtl0->r8 = r8;
> +     vtl0->r9 = r9;
> +     vtl0->r10 = r10;
> +     vtl0->r11 = r11;
> +     vtl0->r12 = r12;
> +     vtl0->r13 = r13;
> +     vtl0->r14 = r14;
> +     vtl0->r15 = r15;
> +     vtl0->cr2 = native_read_cr2();
> +
> +     fxsave(&vtl0->fx_state);
> +     kernel_fpu_end();
> +}
> +
> +/*
> + * Returning to a lower VTL treats the base pointer register
> + * as a general purpose one. Without adding this, objtool produces
> + * a warning.
> + */
> +STACK_FRAME_NON_STANDARD(mshv_vtl_return);
> +
> +static bool mshv_vtl_process_intercept(void)
> +{
> +     struct hv_per_cpu_context *mshv_cpu;
> +     void *synic_message_page;
> +     struct hv_message *msg;
> +     u32 message_type;
> +
> +     mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
> +     synic_message_page = mshv_cpu->synic_message_page;
> +     if (unlikely(!synic_message_page))
> +             return true;
> +
> +     msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX;
> +     message_type = READ_ONCE(msg->header.message_type);
> +     if (message_type == HVMSG_NONE)
> +             return true;
> +
> +     memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg));
> +     vmbus_signal_eom(msg, message_type);
> +
> +     return false;
> +}
> +
> +static int mshv_vtl_ioctl_return_to_lower_vtl(void)
> +{
> +     preempt_disable();
> +     for (;;) {
> +             const unsigned long VTL0_WORK = _TIF_SIGPENDING | _TIF_NEED_RESCHED |
> +                                             _TIF_NOTIFY_RESUME | _TIF_NOTIFY_SIGNAL;
> +             unsigned long ti_work;
> +             u32 cancel;
> +             unsigned long irq_flags;
> +             struct hv_vp_assist_page *hvp;
> +             int ret;
> +
> +             local_irq_save(irq_flags);
> +             ti_work = READ_ONCE(current_thread_info()->flags);
> +             cancel = READ_ONCE(mshv_vtl_this_run()->cancel);
> +             if (unlikely((ti_work & VTL0_WORK) || cancel)) {
> +                     local_irq_restore(irq_flags);
> +                     preempt_enable();
> +                     if (cancel)
> +                             ti_work |= _TIF_SIGPENDING;
> +                     ret = mshv_do_pre_guest_mode_work(ti_work);
> +                     if (ret)
> +                             return ret;
> +                     preempt_disable();
> +                     continue;
> +             }
> +
> +             mshv_vtl_return(&mshv_vtl_this_run()->cpu_context);
> +             local_irq_restore(irq_flags);
> +
> +             hvp = hv_vp_assist_page[smp_processor_id()];
> +             this_cpu_inc(num_vtl0_transitions);
> +             switch (hvp->vtl_entry_reason) {
> +             case MSHV_ENTRY_REASON_INTERRUPT:
> +                     if (!mshv_vsm_capabilities.intercept_page_available &&
> +                         likely(!mshv_vtl_process_intercept()))
> +                             goto done;
> +                     break;
> +
> +             case MSHV_ENTRY_REASON_INTERCEPT:
> +                     WARN_ON(!mshv_vsm_capabilities.intercept_page_available);
> +                     memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message,
> +                            sizeof(hvp->intercept_message));
> +                     goto done;
> +
> +             default:
> +                     panic("unknown entry reason: %d", hvp->vtl_entry_reason);
> +             }
> +     }
> +
> +done:
> +     preempt_enable();
> +
> +     return 0;
> +}
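
So the per-CPU run loop in the VTL2 VMM ends up looking roughly like the
sketch below (hand-written for illustration, not taken from OpenVMM;
handle_exit() is a placeholder for the VMM's exit handling, and run points
at the mmap()ed struct mshv_vtl_run for the current CPU):

	static void run_vp(int vtl_fd, struct mshv_vtl_run *run)
	{
		for (;;) {
			/* Example: state to load into VTL0 on entry. */
			run->cpu_context.rax = 0;

			if (ioctl(vtl_fd, MSHV_RETURN_TO_LOWER_VTL, 0) < 0)
				continue;	/* interrupted or cancelled; retry */

			/* exit_message now holds the intercept (struct hv_message)
			 * and cpu_context the saved VTL0 register state. */
			handle_exit(run);
		}
	}
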
> +
> +static long
> +mshv_vtl_ioctl_get_regs(void __user *user_args)
> +{
> +     struct mshv_vp_registers args;
> +     struct hv_register_assoc *registers;
> +     long ret;
> +
> +     if (copy_from_user(&args, user_args, sizeof(args)))
> +             return -EFAULT;
> +
> +     if (args.count == 0 || args.count > MSHV_VP_MAX_REGISTERS)
> +             return -EINVAL;
> +
> +     registers = kmalloc_array(args.count,
> +                               sizeof(*registers),
> +                               GFP_KERNEL);
> +     if (!registers)
> +             return -ENOMEM;
> +
> +     if (copy_from_user(registers, (void __user *)args.regs_ptr,
> +                        sizeof(*registers) * args.count)) {
> +             ret = -EFAULT;
> +             goto free_return;
> +     }
> +
> +     ret = mshv_vtl_get_reg(registers);
> +     if (!ret)
> +             goto copy_args; /* No hypercall needed */
> +     ret = vtl_get_vp_registers(args.count, registers);
> +     if (ret)
> +             goto free_return;
> +
> +copy_args:
> +     if (copy_to_user((void __user *)args.regs_ptr, registers,
> +                      sizeof(*registers) * args.count))
> +             ret = -EFAULT;
> +free_return:
> +     kfree(registers);
> +
> +     return ret;
> +}
> +
> +static long
> +mshv_vtl_ioctl_set_regs(void __user *user_args)
> +{
> +     struct mshv_vp_registers args;
> +     struct hv_register_assoc *registers;
> +     long ret;
> +
> +     if (copy_from_user(&args, user_args, sizeof(args)))
> +             return -EFAULT;
> +
> +     if (args.count == 0 || args.count > MSHV_VP_MAX_REGISTERS)
> +             return -EINVAL;
> +
> +     registers = kmalloc_array(args.count,
> +                               sizeof(*registers),
> +                               GFP_KERNEL);
> +     if (!registers)
> +             return -ENOMEM;
> +
> +     if (copy_from_user(registers, (void __user *)args.regs_ptr,
> +                        sizeof(*registers) * args.count)) {
> +             ret = -EFAULT;
> +             goto free_return;
> +     }
> +
> +     ret = mshv_vtl_set_reg(registers);
> +     if (!ret)
> +             goto free_return; /* No hypercall needed */
> +     ret = vtl_set_vp_registers(args.count, registers);
> +
> +free_return:
> +     kfree(registers);
> +
> +     return ret;
> +}
> +
> +static long
> +mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
> +{
> +     long ret;
> +     struct mshv_vtl *vtl = filp->private_data;
> +
> +     switch (ioctl) {
> +     case MSHV_SET_POLL_FILE:
> +             ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file __user *)arg);
> +             break;
> +     case MSHV_GET_VP_REGISTERS:
> +             ret = mshv_vtl_ioctl_get_regs((void __user *)arg);
> +             break;
> +     case MSHV_SET_VP_REGISTERS:
> +             ret = mshv_vtl_ioctl_set_regs((void __user *)arg);
> +             break;
> +     case MSHV_RETURN_TO_LOWER_VTL:
> +             ret = mshv_vtl_ioctl_return_to_lower_vtl();
> +             break;
> +     case MSHV_ADD_VTL0_MEMORY:
> +             ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg);
> +             break;
> +     default:
> +             dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl);
> +             ret = -ENOTTY;
> +     }
> +
> +     return ret;
> +}
> +
> +static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf)
> +{
> +     struct page *page;
> +     int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK;
> +     int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT;
> +
> +     if (!cpu_online(cpu))
> +             return VM_FAULT_SIGBUS;
> +
> +     if (real_off == MSHV_RUN_PAGE_OFFSET) {
> +             page = virt_to_page(mshv_vtl_cpu_run(cpu));
> +     } else if (real_off == MSHV_REG_PAGE_OFFSET) {
> +             if (!mshv_has_reg_page)
> +                     return VM_FAULT_SIGBUS;
> +             page = mshv_vtl_cpu_reg_page(cpu);
> +     } else {
> +             return VM_FAULT_NOPAGE;
> +     }
> +
> +     get_page(page);
> +     vmf->page = page;
> +
> +     return 0;
> +}
> +
> +static const struct vm_operations_struct mshv_vtl_vm_ops = {
> +     .fault = mshv_vtl_fault,
> +};
> +
> +static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +     vma->vm_ops = &mshv_vtl_vm_ops;
> +
> +     return 0;
> +}
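
Given the offset encoding in mshv_vtl_fault() (CPU index in the low 16 bits
of the page offset, the page kind above MSHV_REAL_OFF_SHIFT), userspace would
map the per-CPU pages along these lines (rough sketch; the 0/1/16 constants
mirror MSHV_RUN_PAGE_OFFSET, MSHV_REG_PAGE_OFFSET and MSHV_REAL_OFF_SHIFT,
which are driver-internal and would have to be duplicated or exported):

	/* mmap offset in bytes for (page_kind, cpu), assuming 4K pages */
	static off_t vtl_page_off(int page_kind, int cpu)
	{
		return (((off_t)page_kind << 16) | cpu) << 12;
	}

	struct mshv_vtl_run *run = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
					MAP_SHARED, vtl_fd, vtl_page_off(0, cpu));
	void *reg_page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			      MAP_SHARED, vtl_fd, vtl_page_off(1, cpu));
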
> +
> +static int mshv_vtl_release(struct inode *inode, struct file *filp)
> +{
> +     struct mshv_vtl *vtl = filp->private_data;
> +
> +     kfree(vtl);
> +
> +     return 0;
> +}
> +
> +static const struct file_operations mshv_vtl_fops = {
> +     .owner = THIS_MODULE,
> +     .unlocked_ioctl = mshv_vtl_ioctl,
> +     .release = mshv_vtl_release,
> +     .mmap = mshv_vtl_mmap,
> +};
> +
> +static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask)
> +{
> +     union hv_synic_sint sint;
> +
> +     sint.as_uint64 = 0;
> +     sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> +     sint.masked = (*mask != 0);
> +     sint.auto_eoi = hv_recommend_using_aeoi();
> +
> +     hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX,
> +                sint.as_uint64);
> +
> +     if (!sint.masked)
> +             pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
> +     else
> +             pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
> +}
> +
> +static void mshv_vtl_read_remote(void *buffer)
> +{
> +     struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
> +     struct hv_message *msg = (struct hv_message *)mshv_cpu->synic_message_page +
> +                                     VTL2_VMBUS_SINT_INDEX;
> +     u32 message_type = READ_ONCE(msg->header.message_type);
> +
> +     WRITE_ONCE(has_message, false);
> +     if (message_type == HVMSG_NONE)
> +             return;
> +
> +     memcpy(buffer, msg, sizeof(*msg));
> +     vmbus_signal_eom(msg, message_type);
> +}
> +
> +static bool vtl_synic_mask_vmbus_sint_masked = true;
> +
> +static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset)
> +{
> +     struct hv_message msg = {};
> +     int ret;
> +
> +     if (size < sizeof(msg))
> +             return -EINVAL;
> +
> +     for (;;) {
> +             smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true);
> +             if (msg.header.message_type != HVMSG_NONE)
> +                     break;
> +
> +             if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
> +                     return 0; /* EOF */
> +
> +             if (filp->f_flags & O_NONBLOCK)
> +                     return -EAGAIN;
> +
> +             ret = wait_event_interruptible(fd_wait_queue,
> +                                            READ_ONCE(has_message) ||
> +                                            READ_ONCE(vtl_synic_mask_vmbus_sint_masked));
> +             if (ret)
> +                     return ret;
> +     }
> +
> +     if (copy_to_user(arg, &msg, sizeof(msg)))
> +             return -EFAULT;
> +
> +     return sizeof(msg);
> +}
> +
> +static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait)
> +{
> +     __poll_t mask = 0;
> +
> +     poll_wait(filp, &fd_wait_queue, wait);
> +     if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
> +             mask |= EPOLLIN | EPOLLRDNORM;
> +
> +     return mask;
> +}
> +
> +static void mshv_vtl_sint_on_msg_dpc(unsigned long data)
> +{
> +     WRITE_ONCE(has_message, true);
> +     wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
> +}
> +
> +static int mshv_vtl_sint_ioctl_post_message(struct mshv_vtl_sint_post_msg __user *arg)
> +{
> +     struct mshv_vtl_sint_post_msg message;
> +     u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT];
> +
> +     if (copy_from_user(&message, arg, sizeof(message)))
> +             return -EFAULT;
> +     if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
> +             return -EINVAL;
> +     if (copy_from_user(payload, (void __user *)message.payload_ptr,
> +                        message.payload_size))
> +             return -EFAULT;
> +
> +     return hv_post_message((union hv_connection_id)message.connection_id,
> +                            message.message_type, (void *)payload,
> +                            message.payload_size);
> +}
> +
> +static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg)
> +{
> +     u64 input;
> +     struct mshv_vtl_signal_event signal_event;
> +
> +     if (copy_from_user(&signal_event, arg, sizeof(signal_event)))
> +             return -EFAULT;
> +
> +     input = signal_event.connection_id | ((u64)signal_event.flag << 32);
> +
> +     return hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, input) & HV_HYPERCALL_RESULT_MASK;
> +}
> +
> +static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg)
> +{
> +     struct mshv_vtl_set_eventfd set_eventfd;
> +     struct eventfd_ctx *eventfd, *old_eventfd;
> +
> +     if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd)))
> +             return -EFAULT;
> +     if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT)
> +             return -EINVAL;
> +
> +     eventfd = NULL;
> +     if (set_eventfd.fd >= 0) {
> +             eventfd = eventfd_ctx_fdget(set_eventfd.fd);
> +             if (IS_ERR(eventfd))
> +                     return PTR_ERR(eventfd);
> +     }
> +
> +     mutex_lock(&flag_lock);
> +     old_eventfd = flag_eventfds[set_eventfd.flag];
> +     WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd);
> +     mutex_unlock(&flag_lock);
> +
> +     if (old_eventfd) {
> +             synchronize_rcu();
> +             eventfd_ctx_put(old_eventfd);
> +     }
> +
> +     return 0;
> +}
> +
> +static int mshv_vtl_sint_ioctl_pause_message_stream(struct mshv_sint_mask __user *arg)
> +{
> +     static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex);
> +     struct mshv_sint_mask mask;
> +
> +     if (copy_from_user(&mask, arg, sizeof(mask)))
> +             return -EFAULT;
> +     mutex_lock(&vtl2_vmbus_sint_mask_mutex);
> +     on_each_cpu((smp_call_func_t)mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1);
> +     WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0);
> +     mutex_unlock(&vtl2_vmbus_sint_mask_mutex);
> +     if (mask.mask)
> +             wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
> +
> +     return 0;
> +}
> +
> +static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
> +{
> +     switch (cmd) {
> +     case MSHV_SINT_POST_MESSAGE:
> +             return mshv_vtl_sint_ioctl_post_message((struct mshv_vtl_sint_post_msg __user *)arg);
> +     case MSHV_SINT_SIGNAL_EVENT:
> +             return mshv_vtl_sint_ioctl_signal_event((struct mshv_vtl_signal_event __user *)arg);
> +     case MSHV_SINT_SET_EVENTFD:
> +             return mshv_vtl_sint_ioctl_set_eventfd((struct mshv_vtl_set_eventfd __user *)arg);
> +     case MSHV_SINT_PAUSE_MESSAGE_STREAM:
> +             return mshv_vtl_sint_ioctl_pause_message_stream((struct mshv_sint_mask __user *)arg);
> +     default:
> +             return -ENOIOCTLCMD;
> +     }
> +}
> +
> +static const struct file_operations mshv_vtl_sint_ops = {
> +     .owner = THIS_MODULE,
> +     .read = mshv_vtl_sint_read,
> +     .poll = mshv_vtl_sint_poll,
> +     .unlocked_ioctl = mshv_vtl_sint_ioctl,
> +};
> +
> +static struct miscdevice mshv_vtl_sint_dev = {
> +     .name = "mshv_sint",
> +     .fops = &mshv_vtl_sint_ops,
> +     .mode = 0600,
> +     .minor = MISC_DYNAMIC_MINOR,
> +};
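
For completeness, the message path on /dev/mshv_sint from userspace is then
(untested sketch; struct hv_message comes from the Hyper-V headers):

	int sint_fd = open("/dev/mshv_sint", O_RDWR | O_CLOEXEC);
	struct hv_message msg;

	/* Blocks until a VTL2 VMBus message is retrieved from the connect CPU;
	 * returns 0 (EOF) while the message stream is paused via
	 * MSHV_SINT_PAUSE_MESSAGE_STREAM. */
	ssize_t n = read(sint_fd, &msg, sizeof(msg));

poll() works on the same fd, and MSHV_SINT_SET_EVENTFD lets channel event
flags be delivered to an eventfd instead of being read from the event page.
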
> +
> +static int mshv_vtl_hvcall_open(struct inode *node, struct file *f)
> +{
> +     struct miscdevice *dev = f->private_data;
> +     struct mshv_vtl_hvcall_fd *fd;
> +
> +     if (!capable(CAP_SYS_ADMIN))
> +             return -EPERM;
> +
> +     fd = vzalloc(sizeof(*fd));
> +     if (!fd)
> +             return -ENOMEM;
> +     fd->dev = dev;
> +     f->private_data = fd;
> +     mutex_init(&fd->init_mutex);
> +
> +     return 0;
> +}
> +
> +static int mshv_vtl_hvcall_release(struct inode *node, struct file *f)
> +{
> +     struct mshv_vtl_hvcall_fd *fd;
> +
> +     fd = f->private_data;
> +     if (fd) {
> +             vfree(fd);
> +             f->private_data = NULL;
> +     }
> +
> +     return 0;
> +}
> +
> +static int mshv_vtl_hvcall_setup(struct mshv_vtl_hvcall_fd *fd,
> +                              struct mshv_vtl_hvcall_setup __user *hvcall_setup_user)
> +{
> +     int ret = 0;
> +     struct mshv_vtl_hvcall_setup hvcall_setup;
> +
> +     mutex_lock(&fd->init_mutex);
> +
> +     if (fd->allow_map_initialized) {
> +             dev_err(fd->dev->this_device,
> +                     "Hypercall allow map has already been set, pid %d\n",
> +                     current->pid);
> +             ret = -EINVAL;
> +             goto exit;
> +     }
> +
> +     if (copy_from_user(&hvcall_setup, hvcall_setup_user,
> +                        sizeof(struct mshv_vtl_hvcall_setup))) {
> +             ret = -EFAULT;
> +             goto exit;
> +     }
> +     if (hvcall_setup.bitmap_size > ARRAY_SIZE(fd->allow_bitmap)) {
> +             ret = -EINVAL;
> +             goto exit;
> +     }
> +     if (copy_from_user(&fd->allow_bitmap,
> +                        (void __user *)hvcall_setup.allow_bitmap_ptr,
> +                        hvcall_setup.bitmap_size)) {
> +             ret = -EFAULT;
> +             goto exit;
> +     }
> +
> +     dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n",
> +              current->pid);
> +     fd->allow_map_initialized = true;
> +exit:
> +     mutex_unlock(&fd->init_mutex);
> +
> +     return ret;
> +}
> +
> +static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code)
> +{
> +     u8 bits_per_item = 8 * sizeof(fd->allow_bitmap[0]);
> +     u16 item_index = call_code / bits_per_item;
> +     u64 mask = 1ULL << (call_code % bits_per_item);
> +
> +     return fd->allow_bitmap[item_index] & mask;
> +}
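
In other words, userspace permits a hypercall by setting bit (call_code % 64)
in the u64 at index (call_code / 64) of the bitmap it hands to
MSHV_HVCALL_SETUP on the fd obtained from /dev/mshv_hvcall. A minimal,
illustrative-only setup might be (0x005c is just an example call code):

	uint64_t allow[1024] = { 0 };	/* one bit per possible 16-bit call code */
	uint16_t code = 0x005c;		/* example call code to permit */

	allow[code / 64] |= 1ULL << (code % 64);

	struct mshv_vtl_hvcall_setup setup = {
		.bitmap_size = sizeof(allow),
		.allow_bitmap_ptr = (uintptr_t)allow,
	};
	ioctl(hvcall_fd, MSHV_HVCALL_SETUP, &setup);

after which MSHV_HVCALL requests whose (control & 0xFFFF) matches an allowed
code are passed through to the hypervisor.
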
> +
> +static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd,
> +                             struct mshv_vtl_hvcall __user *hvcall_user)
> +{
> +     struct mshv_vtl_hvcall hvcall;
> +     void *in, *out;
> +     int ret;
> +
> +     if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall)))
> +             return -EFAULT;
> +     if (hvcall.input_size > HV_HYP_PAGE_SIZE)
> +             return -EINVAL;
> +     if (hvcall.output_size > HV_HYP_PAGE_SIZE)
> +             return -EINVAL;
> +
> +     /*
> +      * By default, all hypercalls are not allowed.
> +      * The user mode code has to set up the allow bitmap once.
> +      */
> +
> +     if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) {
> +             dev_err(fd->dev->this_device,
> +                     "Hypercall with control data %#llx isn't allowed\n",
> +                     hvcall.control);
> +             return -EPERM;
> +     }
> +
> +     /*
> +      * This may create a problem for the Confidential VM (CVM) use case, where we need to use
> +      * the Hyper-V driver's per-cpu input and output pages (hyperv_pcpu_input_arg and
> +      * hyperv_pcpu_output_arg) for making a hypervisor call.
> +      *
> +      * TODO: Take care of this when CVM support is added.
> +      */
> +     in = (void *)__get_free_page(GFP_KERNEL);
> +     out = (void *)__get_free_page(GFP_KERNEL);
> +     if (!in || !out) {
> +             ret = -ENOMEM;
> +             goto free_pages;
> +     }
> +
> +     if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) {
> +             ret = -EFAULT;
> +             goto free_pages;
> +     }
> +
> +     hvcall.status = hv_do_hypercall(hvcall.control, in, out);
> +
> +     if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) {
> +             ret = -EFAULT;
> +             goto free_pages;
> +     }
> +     ret = put_user(hvcall.status, &hvcall_user->status);
> +free_pages:
> +     free_page((unsigned long)in);
> +     free_page((unsigned long)out);
> +
> +     return ret;
> +}
> +
> +static long mshv_vtl_hvcall_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
> +{
> +     struct mshv_vtl_hvcall_fd *fd = f->private_data;
> +
> +     switch (cmd) {
> +     case MSHV_HVCALL_SETUP:
> +             return mshv_vtl_hvcall_setup(fd, (struct mshv_vtl_hvcall_setup __user *)arg);
> +     case MSHV_HVCALL:
> +             return mshv_vtl_hvcall_call(fd, (struct mshv_vtl_hvcall __user *)arg);
> +     default:
> +             break;
> +     }
> +
> +     return -ENOIOCTLCMD;
> +}
> +
> +static const struct file_operations mshv_vtl_hvcall_file_ops = {
> +     .owner = THIS_MODULE,
> +     .open = mshv_vtl_hvcall_open,
> +     .release = mshv_vtl_hvcall_release,
> +     .unlocked_ioctl = mshv_vtl_hvcall_ioctl,
> +};
> +
> +static struct miscdevice mshv_vtl_hvcall = {
> +     .name = "mshv_hvcall",
> +     .nodename = "mshv_hvcall",
> +     .fops = &mshv_vtl_hvcall_file_ops,
> +     .mode = 0600,
> +     .minor = MISC_DYNAMIC_MINOR,
> +};
> +
> +static int mshv_vtl_low_open(struct inode *inodep, struct file *filp)
> +{
> +     pid_t pid = task_pid_vnr(current);
> +     uid_t uid = current_uid().val;
> +     int ret = 0;
> +
> +     pr_debug("%s: Opening VTL low, task group %d, uid %d\n", __func__, pid, uid);
> +
> +     if (capable(CAP_SYS_ADMIN)) {
> +             filp->private_data = inodep;
> +     } else {
> +             pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d\n",
> +                    __func__, pid, uid);
> +             ret = -EPERM;
> +     }
> +
> +     return ret;
> +}
> +
> +static bool can_fault(struct vm_fault *vmf, unsigned long size, pfn_t *pfn)
> +{
> +     unsigned long mask = size - 1;
> +     unsigned long start = vmf->address & ~mask;
> +     unsigned long end = start + size;
> +     bool is_valid;
> +
> +     is_valid = (vmf->address & mask) == ((vmf->pgoff << PAGE_SHIFT) & mask) &&
> +             start >= vmf->vma->vm_start &&
> +             end <= vmf->vma->vm_end;
> +
> +     if (is_valid)
> +             *pfn = __pfn_to_pfn_t(vmf->pgoff & ~(mask >> PAGE_SHIFT), PFN_DEV | PFN_MAP);
> +
> +     return is_valid;
> +}
> +
> +static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order)
> +{
> +     pfn_t pfn;
> +     vm_fault_t ret = VM_FAULT_FALLBACK;
> +
> +     switch (order) {
> +     case 0:
> +             pfn = __pfn_to_pfn_t(vmf->pgoff, PFN_DEV | PFN_MAP);
> +             return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
> +
> +     case PMD_ORDER:
> +             if (can_fault(vmf, PMD_SIZE, &pfn))
> +                     ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
> +             return ret;
> +
> +     case PUD_ORDER:
> +             if (can_fault(vmf, PUD_SIZE, &pfn))
> +                     ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
> +             return ret;
> +
> +     default:
> +             return VM_FAULT_SIGBUS;
> +     }
> +}
> +
> +static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf)
> +{
> +     return mshv_vtl_low_huge_fault(vmf, 0);
> +}
> +
> +static const struct vm_operations_struct mshv_vtl_low_vm_ops = {
> +     .fault = mshv_vtl_low_fault,
> +     .huge_fault = mshv_vtl_low_huge_fault,
> +};
> +
> +static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +     vma->vm_ops = &mshv_vtl_low_vm_ops;
> +     vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP);
> +
> +     return 0;
> +}
> +
> +static const struct file_operations mshv_vtl_low_file_ops = {
> +     .owner          = THIS_MODULE,
> +     .open           = mshv_vtl_low_open,
> +     .mmap           = mshv_vtl_low_mmap,
> +};
> +
> +static struct miscdevice mshv_vtl_low = {
> +     .name = "mshv_vtl_low",
> +     .nodename = "mshv_vtl_low",
> +     .fops = &mshv_vtl_low_file_ops,
> +     .mode = 0600,
> +     .minor = MISC_DYNAMIC_MINOR,
> +};
> +
> +static int __init mshv_vtl_init(void)
> +{
> +     int ret;
> +     struct device *dev;
> +
> +     /*
> +      * This creates /dev/mshv, which provides functionality to create VTLs and partitions.
> +      */
> +     ret = misc_register(&mshv_dev);
> +     if (ret) {
> +             pr_err("mshv device register failed: %d\n", ret);
> +             return ret;
> +     }
> +     dev = mshv_dev.this_device;
> +
> +     tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0);
> +     init_waitqueue_head(&fd_wait_queue);
> +
> +     if (mshv_vtl_get_vsm_regs()) {
> +             dev_emerg(dev, "Unable to get VSM capabilities!\n");
> +             ret = -ENODEV;
> +             goto free_dev;
> +     }
> +     if (mshv_vtl_configure_vsm_partition(dev)) {
> +             dev_emerg(dev, "VSM configuration failed!\n");
> +             ret = -ENODEV;
> +             goto free_dev;
> +     }
> +
> +     ret = hv_vtl_setup_synic();
> +     if (ret)
> +             goto free_dev;
> +
> +     /*
> +      * mshv_sint device adds VMBus relay ioctl support.
> +      * This provides a channel for VTL0 to communicate with VTL2.
> +      */
> +     ret = misc_register(&mshv_vtl_sint_dev);
> +     if (ret)
> +             goto free_synic;
> +
> +     /*
> +      * mshv_hvcall device adds an interface that lets userspace make direct hypercalls.
> +      */
> +     ret = misc_register(&mshv_vtl_hvcall);
> +     if (ret)
> +             goto free_sint;
> +
> +     /*
> +      * mshv_vtl_low device is used to map VTL0 address space into a user-mode process in VTL2.
> +      * It implements mmap(), through which a VTL2 user-mode process maps VTL0 memory.
> +      */
> +     ret = misc_register(&mshv_vtl_low);
> +     if (ret)
> +             goto free_hvcall;
> +
> +     /*
> +      * "mshv vtl mem dev" device is later used to set up VTL0 memory.
> +      */
> +     mem_dev = kzalloc(sizeof(*mem_dev), GFP_KERNEL);
> +     if (!mem_dev) {
> +             ret = -ENOMEM;
> +             goto free_low;
> +     }
> +
> +     mutex_init(&mshv_vtl_poll_file_lock);
> +
> +     device_initialize(mem_dev);
> +     dev_set_name(mem_dev, "mshv vtl mem dev");
> +     ret = device_add(mem_dev);
> +     if (ret) {
> +             dev_err(dev, "mshv vtl mem dev add: %d\n", ret);
> +             goto free_mem;
> +     }
> +
> +     return 0;
> +
> +free_mem:
> +     kfree(mem_dev);
> +free_low:
> +     misc_deregister(&mshv_vtl_low);
> +free_hvcall:
> +     misc_deregister(&mshv_vtl_hvcall);
> +free_sint:
> +     misc_deregister(&mshv_vtl_sint_dev);
> +free_synic:
> +     hv_vtl_remove_synic();
> +free_dev:
> +     misc_deregister(&mshv_dev);
> +
> +     return ret;
> +}
> +
> +static void __exit mshv_vtl_exit(void)
> +{
> +     device_del(mem_dev);
> +     kfree(mem_dev);
> +     misc_deregister(&mshv_vtl_low);
> +     misc_deregister(&mshv_vtl_hvcall);
> +     misc_deregister(&mshv_vtl_sint_dev);
> +     hv_vtl_remove_synic();
> +     misc_deregister(&mshv_dev);
> +}
> +
> +module_init(mshv_vtl_init);
> +module_exit(mshv_vtl_exit);
> diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> index 1be7f6a02304..cc9260c37c49 100644
> --- a/include/hyperv/hvgdk_mini.h
> +++ b/include/hyperv/hvgdk_mini.h
> @@ -882,6 +882,23 @@ struct hv_get_vp_from_apic_id_in {
>       u32 apic_ids[];
>  } __packed;
>  
> +union hv_register_vsm_partition_config {
> +     u64 as_uint64;
> +     struct {
> +             u64 enable_vtl_protection : 1;
> +             u64 default_vtl_protection_mask : 4;
> +             u64 zero_memory_on_reset : 1;
> +             u64 deny_lower_vtl_startup : 1;
> +             u64 intercept_acceptance : 1;
> +             u64 intercept_enable_vtl_protection : 1;
> +             u64 intercept_vp_startup : 1;
> +             u64 intercept_cpuid_unimplemented : 1;
> +             u64 intercept_unrecoverable_exception : 1;
> +             u64 intercept_page : 1;
> +             u64 mbz : 51;
> +     };
> +};
> +
>  struct hv_nested_enlightenments_control {
>       struct {
>               u32 directhypercall : 1;
> @@ -1004,6 +1021,70 @@ enum hv_register_name {
>  
>       /* VSM */
>       HV_REGISTER_VSM_VP_STATUS                               = 0x000D0003,
> +
> +     /* Synthetic VSM registers */
> +     HV_REGISTER_VSM_CODE_PAGE_OFFSETS       = 0x000D0002,
> +     HV_REGISTER_VSM_CAPABILITIES            = 0x000D0006,
> +     HV_REGISTER_VSM_PARTITION_CONFIG        = 0x000D0007,
> +
> +#if defined(CONFIG_X86)
> +     /* X64 Debug Registers */
> +     HV_X64_REGISTER_DR0     = 0x00050000,
> +     HV_X64_REGISTER_DR1     = 0x00050001,
> +     HV_X64_REGISTER_DR2     = 0x00050002,
> +     HV_X64_REGISTER_DR3     = 0x00050003,
> +     HV_X64_REGISTER_DR6     = 0x00050004,
> +     HV_X64_REGISTER_DR7     = 0x00050005,
> +
> +     /* X64 Cache control MSRs */
> +     HV_X64_REGISTER_MSR_MTRR_CAP            = 0x0008000D,
> +     HV_X64_REGISTER_MSR_MTRR_DEF_TYPE       = 0x0008000E,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0     = 0x00080010,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1     = 0x00080011,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2     = 0x00080012,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3     = 0x00080013,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4     = 0x00080014,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5     = 0x00080015,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6     = 0x00080016,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7     = 0x00080017,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8     = 0x00080018,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9     = 0x00080019,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA     = 0x0008001A,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB     = 0x0008001B,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC     = 0x0008001C,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASED     = 0x0008001D,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE     = 0x0008001E,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF     = 0x0008001F,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0     = 0x00080040,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1     = 0x00080041,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2     = 0x00080042,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3     = 0x00080043,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4     = 0x00080044,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5     = 0x00080045,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6     = 0x00080046,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7     = 0x00080047,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8     = 0x00080048,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9     = 0x00080049,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA     = 0x0008004A,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB     = 0x0008004B,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC     = 0x0008004C,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD     = 0x0008004D,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE     = 0x0008004E,
> +     HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF     = 0x0008004F,
> +     HV_X64_REGISTER_MSR_MTRR_FIX64K00000    = 0x00080070,
> +     HV_X64_REGISTER_MSR_MTRR_FIX16K80000    = 0x00080071,
> +     HV_X64_REGISTER_MSR_MTRR_FIX16KA0000    = 0x00080072,
> +     HV_X64_REGISTER_MSR_MTRR_FIX4KC0000     = 0x00080073,
> +     HV_X64_REGISTER_MSR_MTRR_FIX4KC8000     = 0x00080074,
> +     HV_X64_REGISTER_MSR_MTRR_FIX4KD0000     = 0x00080075,
> +     HV_X64_REGISTER_MSR_MTRR_FIX4KD8000     = 0x00080076,
> +     HV_X64_REGISTER_MSR_MTRR_FIX4KE0000     = 0x00080077,
> +     HV_X64_REGISTER_MSR_MTRR_FIX4KE8000     = 0x00080078,
> +     HV_X64_REGISTER_MSR_MTRR_FIX4KF0000     = 0x00080079,
> +     HV_X64_REGISTER_MSR_MTRR_FIX4KF8000     = 0x0008007A,
> +
> +     HV_X64_REGISTER_REG_PAGE        = 0x0009001C,
> +#endif
>  };
>  
>  /*
> diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
> index b4067ada02cf..c6a62ec9f6da 100644
> --- a/include/hyperv/hvhdk.h
> +++ b/include/hyperv/hvhdk.h
> @@ -479,6 +479,7 @@ struct hv_connection_info {
>  #define HV_EVENT_FLAGS_COUNT         (256 * 8)
>  #define HV_EVENT_FLAGS_BYTE_COUNT    (256)
>  #define HV_EVENT_FLAGS32_COUNT               (256 / sizeof(u32))
> +#define HV_EVENT_FLAGS_LONG_COUNT    (HV_EVENT_FLAGS_BYTE_COUNT / sizeof(u64))
>  
>  /* linux side we create long version of flags to use long bit ops on flags */
>  #define HV_EVENT_FLAGS_UL_COUNT              (256 / sizeof(ulong))
> diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
> index 876bfe4e4227..a218536eaec1 100644
> --- a/include/uapi/linux/mshv.h
> +++ b/include/uapi/linux/mshv.h
> @@ -288,4 +288,86 @@ struct mshv_get_set_vp_state {
>   * #define MSHV_ROOT_HVCALL                  _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall)
>   */
>  
> +/* Structure definitions, macros and IOCTLs for mshv_vtl */
> +
> +#define MSHV_CAP_CORE_API_STABLE        0x0
> +#define MSHV_CAP_REGISTER_PAGE          0x1
> +#define MSHV_CAP_VTL_RETURN_ACTION      0x2
> +#define MSHV_CAP_DR6_SHARED             0x3
> +#define MSHV_MAX_RUN_MSG_SIZE                256
> +
> +#define MSHV_VP_MAX_REGISTERS   128
> +
> +struct mshv_vp_registers {
> +     __u32 count;    /* at most MSHV_VP_MAX_REGISTERS */
> +     __u32 reserved; /* Reserved for alignment or future use */
> +     __u64 regs_ptr; /* pointer to struct hv_register_assoc */
> +};
> +
> +struct mshv_vtl_set_eventfd {
> +     __s32 fd;
> +     __u32 flag;
> +};
> +
> +struct mshv_vtl_signal_event {
> +     __u32 connection_id;
> +     __u32 flag;
> +};
> +
> +struct mshv_vtl_sint_post_msg {
> +     __u64 message_type;
> +     __u32 connection_id;
> +     __u32 payload_size; /* Must not exceed HV_MESSAGE_PAYLOAD_BYTE_COUNT */
> +     __u64 payload_ptr; /* pointer to message payload (bytes) */
> +};
> +
> +struct mshv_vtl_ram_disposition {
> +     __u64 start_pfn;
> +     __u64 last_pfn;
> +};
> +
> +struct mshv_vtl_set_poll_file {
> +     __u32 cpu;
> +     __u32 fd;
> +};
> +
> +struct mshv_vtl_hvcall_setup {
> +     __u64 bitmap_size;
> +     __u64 allow_bitmap_ptr; /* pointer to __u64 */
> +};
> +
> +struct mshv_vtl_hvcall {
> +     __u64 control;      /* Hypercall control code */
> +     __u64 input_size;   /* Size of the input data */
> +     __u64 input_ptr;    /* Pointer to the input struct */
> +     __u64 status;       /* Status of the hypercall (output) */
> +     __u64 output_size;  /* Size of the output data */
> +     __u64 output_ptr;   /* Pointer to the output struct */
> +};
> +
> +struct mshv_sint_mask {
> +     __u8 mask;
> +     __u8 reserved[7];
> +};
> +
> +/* /dev/mshv device IOCTL */
> +#define MSHV_CHECK_EXTENSION    _IOW(MSHV_IOCTL, 0x00, __u32)
> +
> +/* vtl device */
> +#define MSHV_CREATE_VTL                      _IOR(MSHV_IOCTL, 0x1D, char)
> +#define MSHV_ADD_VTL0_MEMORY         _IOW(MSHV_IOCTL, 0x21, struct mshv_vtl_ram_disposition)
> +#define MSHV_SET_POLL_FILE           _IOW(MSHV_IOCTL, 0x25, struct mshv_vtl_set_poll_file)
> +#define MSHV_RETURN_TO_LOWER_VTL     _IO(MSHV_IOCTL, 0x27)
> +#define MSHV_GET_VP_REGISTERS                _IOWR(MSHV_IOCTL, 0x05, struct mshv_vp_registers)
> +#define MSHV_SET_VP_REGISTERS                _IOW(MSHV_IOCTL, 0x06, struct mshv_vp_registers)
> +
> +/* VMBus device IOCTLs */
> +#define MSHV_SINT_SIGNAL_EVENT    _IOW(MSHV_IOCTL, 0x22, struct mshv_vtl_signal_event)
> +#define MSHV_SINT_POST_MESSAGE    _IOW(MSHV_IOCTL, 0x23, struct mshv_vtl_sint_post_msg)
> +#define MSHV_SINT_SET_EVENTFD     _IOW(MSHV_IOCTL, 0x24, struct mshv_vtl_set_eventfd)
> +#define MSHV_SINT_PAUSE_MESSAGE_STREAM     _IOW(MSHV_IOCTL, 0x25, struct mshv_sint_mask)
> +
> +/* hv_hvcall device */
> +#define MSHV_HVCALL_SETUP        _IOW(MSHV_IOCTL, 0x1E, struct mshv_vtl_hvcall_setup)
> +#define MSHV_HVCALL              _IOWR(MSHV_IOCTL, 0x1F, struct mshv_vtl_hvcall)
>  #endif

Reviewed-by: Nuno Das Neves <nunodasne...@linux.microsoft.com>
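
For reference, a minimal userspace sketch of the mshv_hvcall flow implied by the
uapi above: populate the allow bitmap once with MSHV_HVCALL_SETUP, then issue the
call with MSHV_HVCALL. Hypothetical code, not part of the patch; it assumes the
header is installed as <linux/mshv.h>, and the call code 0x0046
(HVCALL_GET_PARTITION_ID, 8-byte output) is only an illustrative choice.

/*
 * Sketch: allow exactly one hypercall and issue it through /dev/mshv_hvcall.
 * Error handling is trimmed; control-word encoding beyond the low 16-bit
 * call code is elided.
 */
#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/mshv.h>

int get_partition_id(uint64_t *id)
{
	uint64_t allow[2] = { 0 };	/* covers call codes 0..127 */
	const uint16_t code = 0x0046;	/* HVCALL_GET_PARTITION_ID (assumed value) */
	struct mshv_vtl_hvcall_setup setup = {
		.bitmap_size = sizeof(allow),
		.allow_bitmap_ptr = (uint64_t)(uintptr_t)allow,
	};
	struct mshv_vtl_hvcall call = { 0 };
	int fd, ret = -1;

	fd = open("/dev/mshv_hvcall", O_RDWR);
	if (fd < 0)
		return -1;

	/* Same indexing as mshv_vtl_hvcall_is_allowed(): word = code / 64, bit = code % 64. */
	allow[code / 64] |= 1ULL << (code % 64);
	if (ioctl(fd, MSHV_HVCALL_SETUP, &setup))
		goto out;

	call.control = code;		/* call code sits in the low 16 bits of the control word */
	call.output_size = sizeof(*id);
	call.output_ptr = (uint64_t)(uintptr_t)id;
	if (ioctl(fd, MSHV_HVCALL, &call) == 0 && call.status == 0)
		ret = 0;		/* *id now holds the hypervisor-reported partition id */
out:
	close(fd);
	return ret;
}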

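A similar sketch for /dev/mshv_vtl_low: since the fault handlers insert the
page-frame number taken directly from vmf->pgoff, the file offset is assumed to
encode the VTL0 guest physical address. A 2 MiB-aligned offset and length give
can_fault() a chance to satisfy PMD-order faults; an unaligned mapping still
works through the order-0 path.

/*
 * Sketch: map `len` bytes of VTL0 memory starting at guest physical address
 * `gpa` into this VTL2 process. Hypothetical usage, not part of the patch.
 */
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

void *map_vtl0(uint64_t gpa, size_t len)
{
	int fd = open("/dev/mshv_vtl_low", O_RDWR);
	void *va;

	if (fd < 0)
		return NULL;
	/* A real VMM would keep fd open; closing right after mmap() keeps the sketch short. */
	va = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, (off_t)gpa);
	close(fd);
	return va == MAP_FAILED ? NULL : va;
}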
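And one for the VMBus relay device: wiring an eventfd to a channel event flag
with MSHV_SINT_SET_EVENTFD on /dev/mshv_sint. That this ioctl means "signal the
eventfd when the flag fires" is my reading of the structure and ioctl names; the
flag number itself comes from the channel offer and is a placeholder here.

#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/mshv.h>

/* Sketch: returns an eventfd that (assumed) is signalled when `flag` fires. */
int wire_channel_event(int sint_fd /* open fd on /dev/mshv_sint */, uint32_t flag)
{
	struct mshv_vtl_set_eventfd args;
	int efd = eventfd(0, EFD_NONBLOCK);

	if (efd < 0)
		return -1;

	args.fd = efd;
	args.flag = flag;
	if (ioctl(sint_fd, MSHV_SINT_SET_EVENTFD, &args) < 0) {
		close(efd);
		return -1;
	}

	return efd;	/* poll()/read() this fd to observe channel events */
}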