On Tue, Mar 23, 2021 at 06:47:16PM +0000, Sunil Muthuswamy wrote:
> Linux has support for free page reporting now (36e66c554b5c) for
> virtualized environment. On Hyper-V when virtually backed VMs are
> configured, Hyper-V will advertise cold memory discard capability,
> when supported. This patch adds the support to hook into the free
> page reporting infrastructure and leverage the Hyper-V cold memory
> discard hint hypercall to report/free these pages back to the host.
> 
> Signed-off-by: Sunil Muthuswamy <sunil...@microsoft.com>
> Tested-by: Matheus Castello <math...@castello.eng.br>

Tested-by: Nathan Chancellor <nat...@kernel.org>

> ---
> In V2:
> - Addressed feedback comments
> - Added page reporting config option tied to hyper-v balloon config
> 
> In V3:
> - Addressed feedback from Vitaly
> 
> In V4:
> - Queried and cached the Hyper-V extended capability for the lifetime
>   of the VM
> - Addressed feedback from Michael Kelley.
> 
> In v5:
> - Added a comment clarifying handling of failed query extended
>   capability hypercall to address Michael's feedback.
> ---
>  arch/x86/hyperv/hv_init.c         | 51 +++++++++++++++++-
>  arch/x86/kernel/cpu/mshyperv.c    |  9 ++--
>  drivers/hv/Kconfig                |  1 +
>  drivers/hv/hv_balloon.c           | 89 +++++++++++++++++++++++++++++++
>  include/asm-generic/hyperv-tlfs.h | 35 +++++++++++-
>  include/asm-generic/mshyperv.h    |  3 +-
>  6 files changed, 180 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 9d100257b3af..7c9da3f65afa 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -498,6 +498,8 @@ void __init hyperv_init(void)
>               x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
>  #endif
>  
> +     /* Query the VMs extended capability once, so that it can be cached. */
> +     hv_query_ext_cap(0);
>       return;
>  
>  remove_cpuhp_state:
> @@ -601,7 +603,7 @@ EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
>  
>  enum hv_isolation_type hv_get_isolation_type(void)
>  {
> -     if (!(ms_hyperv.features_b & HV_ISOLATION))
> +     if (!(ms_hyperv.priv_high & HV_ISOLATION))
>               return HV_ISOLATION_TYPE_NONE;
>       return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
>  }
> @@ -612,3 +614,50 @@ bool hv_is_isolation_supported(void)
>       return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
>  }
>  EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
> +
> +/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
> +bool hv_query_ext_cap(u64 cap_query)
> +{
> +     /*
> +      * The address of the 'hv_extended_cap' variable will be used as an
> +      * output parameter to the hypercall below and so it should be
> +      * compatible with 'virt_to_phys'. Which means, it's address should be
> +      * directly mapped. Use 'static' to keep it compatible; stack variables
> +      * can be virtually mapped, making them imcompatible with
> +      * 'virt_to_phys'.
> +      * Hypercall input/output addresses should also be 8-byte aligned.
> +      */
> +     static u64 hv_extended_cap __aligned(8);
> +     static bool hv_extended_cap_queried;
> +     u64 status;
> +
> +     /*
> +      * Querying extended capabilities is an extended hypercall. Check if the
> +      * partition supports extended hypercall, first.
> +      */
> +     if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
> +             return false;
> +
> +     /* Extended capabilities do not change at runtime. */
> +     if (hv_extended_cap_queried)
> +             return hv_extended_cap & cap_query;
> +
> +     status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL,
> +                              &hv_extended_cap);
> +
> +     /*
> +      * The query extended capabilities hypercall should not fail under
> +      * any normal circumstances. Avoid repeatedly making the hypercall, on
> +      * error.
> +      */
> +     hv_extended_cap_queried = true;
> +     status &= HV_HYPERCALL_RESULT_MASK;
> +     if (status != HV_STATUS_SUCCESS) {
> +             pr_err("Hyper-V: Extended query capabilities hypercall failed 
> 0x%llx\n",
> +                    status);
> +             return false;
> +     }
> +
> +     return hv_extended_cap & cap_query;
> +}
> +EXPORT_SYMBOL_GPL(hv_query_ext_cap);
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index cebed535ec56..3546d3e21787 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -265,12 +265,13 @@ static void __init ms_hyperv_init_platform(void)
>        * Extract the features and hints
>        */
>       ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
> -     ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES);
> +     ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
>       ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
>       ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
>  
> -     pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
> -             ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
> +     pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 
> 0x%x\n",
> +             ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
> +             ms_hyperv.misc_features);
>  
>       ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
>       ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
> @@ -316,7 +317,7 @@ static void __init ms_hyperv_init_platform(void)
>               x86_platform.calibrate_cpu = hv_get_tsc_khz;
>       }
>  
> -     if (ms_hyperv.features_b & HV_ISOLATION) {
> +     if (ms_hyperv.priv_high & HV_ISOLATION) {
>               ms_hyperv.isolation_config_a = 
> cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
>               ms_hyperv.isolation_config_b = 
> cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
>  
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 79e5356a737a..66c794d92391 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -23,6 +23,7 @@ config HYPERV_UTILS
>  config HYPERV_BALLOON
>       tristate "Microsoft Hyper-V Balloon driver"
>       depends on HYPERV
> +     select PAGE_REPORTING
>       help
>         Select this option to enable Hyper-V Balloon driver.
>  
> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index 2f776d78e3c1..58af84e30144 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -21,6 +21,7 @@
>  #include <linux/memory.h>
>  #include <linux/notifier.h>
>  #include <linux/percpu_counter.h>
> +#include <linux/page_reporting.h>
>  
>  #include <linux/hyperv.h>
>  #include <asm/hyperv-tlfs.h>
> @@ -563,6 +564,8 @@ struct hv_dynmem_device {
>        * The negotiated version agreed by host.
>        */
>       __u32 version;
> +
> +     struct page_reporting_dev_info pr_dev_info;
>  };
>  
>  static struct hv_dynmem_device dm_device;
> @@ -1568,6 +1571,89 @@ static void balloon_onchannelcallback(void *context)
>  
>  }
>  
> +/* Hyper-V only supports reporting 2MB pages or higher */
> +#define HV_MIN_PAGE_REPORTING_ORDER  9
> +#define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << 
> HV_MIN_PAGE_REPORTING_ORDER)
> +static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
> +                 struct scatterlist *sgl, unsigned int nents)
> +{
> +     unsigned long flags;
> +     struct hv_memory_hint *hint;
> +     int i;
> +     u64 status;
> +     struct scatterlist *sg;
> +
> +     WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
> +     WARN_ON_ONCE(sgl->length < HV_MIN_PAGE_REPORTING_LEN);
> +     local_irq_save(flags);
> +     hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
> +     if (!hint) {
> +             local_irq_restore(flags);
> +             return -ENOSPC;
> +     }
> +
> +     hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
> +     hint->reserved = 0;
> +     for_each_sg(sgl, sg, nents, i) {
> +             union hv_gpa_page_range *range;
> +
> +             range = &hint->ranges[i];
> +             range->address_space = 0;
> +             /* page reporting only reports 2MB pages or higher */
> +             range->page.largepage = 1;
> +             range->page.additional_pages =
> +                     (sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1;
> +             range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB;
> +             range->base_large_pfn =
> +                     page_to_hvpfn(sg_page(sg)) >> 
> HV_MIN_PAGE_REPORTING_ORDER;
> +     }
> +
> +     status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
> +                                  hint, NULL);
> +     local_irq_restore(flags);
> +     if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
> +             pr_err("Cold memory discard hypercall failed with status 
> %llx\n",
> +                     status);
> +             return -EINVAL;
> +     }
> +
> +     return 0;
> +}
> +
> +static void enable_page_reporting(void)
> +{
> +     int ret;
> +
> +     /* Essentially, validating 'PAGE_REPORTING_MIN_ORDER' is big enough. */
> +     if (pageblock_order < HV_MIN_PAGE_REPORTING_ORDER) {
> +             pr_debug("Cold memory discard is only supported on 2MB pages 
> and above\n");
> +             return;
> +     }
> +
> +     if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
> +             pr_debug("Cold memory discard hint not supported by Hyper-V\n");
> +             return;
> +     }
> +
> +     BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > 
> HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
> +     dm_device.pr_dev_info.report = hv_free_page_report;
> +     ret = page_reporting_register(&dm_device.pr_dev_info);
> +     if (ret < 0) {
> +             dm_device.pr_dev_info.report = NULL;
> +             pr_err("Failed to enable cold memory discard: %d\n", ret);
> +     } else {
> +             pr_info("Cold memory discard hint enabled\n");
> +     }
> +}
> +
> +static void disable_page_reporting(void)
> +{
> +     if (dm_device.pr_dev_info.report) {
> +             page_reporting_unregister(&dm_device.pr_dev_info);
> +             dm_device.pr_dev_info.report = NULL;
> +     }
> +}
> +
>  static int balloon_connect_vsp(struct hv_device *dev)
>  {
>       struct dm_version_request version_req;
> @@ -1713,6 +1799,7 @@ static int balloon_probe(struct hv_device *dev,
>       if (ret != 0)
>               return ret;
>  
> +     enable_page_reporting();
>       dm_device.state = DM_INITIALIZED;
>  
>       dm_device.thread =
> @@ -1727,6 +1814,7 @@ static int balloon_probe(struct hv_device *dev,
>  probe_error:
>       dm_device.state = DM_INIT_ERROR;
>       dm_device.thread  = NULL;
> +     disable_page_reporting();
>       vmbus_close(dev->channel);
>  #ifdef CONFIG_MEMORY_HOTPLUG
>       unregister_memory_notifier(&hv_memory_nb);
> @@ -1749,6 +1837,7 @@ static int balloon_remove(struct hv_device *dev)
>       cancel_work_sync(&dm->ha_wrk.wrk);
>  
>       kthread_stop(dm->thread);
> +     disable_page_reporting();
>       vmbus_close(dev->channel);
>  #ifdef CONFIG_MEMORY_HOTPLUG
>       unregister_memory_notifier(&hv_memory_nb);
> diff --git a/include/asm-generic/hyperv-tlfs.h 
> b/include/asm-generic/hyperv-tlfs.h
> index 9cf10837d005..515c3fb06ab3 100644
> --- a/include/asm-generic/hyperv-tlfs.h
> +++ b/include/asm-generic/hyperv-tlfs.h
> @@ -89,9 +89,9 @@
>  #define HV_ACCESS_STATS                              BIT(8)
>  #define HV_DEBUGGING                         BIT(11)
>  #define HV_CPU_MANAGEMENT                    BIT(12)
> +#define HV_ENABLE_EXTENDED_HYPERCALLS                BIT(20)
>  #define HV_ISOLATION                         BIT(22)
>  
> -
>  /*
>   * TSC page layout.
>   */
> @@ -159,11 +159,18 @@ struct ms_hyperv_tsc_page {
>  #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
>  #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
>  
> +/* Extended hypercalls */
> +#define HV_EXT_CALL_QUERY_CAPABILITIES               0x8001
> +#define HV_EXT_CALL_MEMORY_HEAT_HINT         0x8003
> +
>  #define HV_FLUSH_ALL_PROCESSORS                      BIT(0)
>  #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES  BIT(1)
>  #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY    BIT(2)
>  #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT   BIT(3)
>  
> +/* Extended capability bits */
> +#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8)
> +
>  enum HV_GENERIC_SET_FORMAT {
>       HV_GENERIC_SET_SPARSE_4K,
>       HV_GENERIC_SET_ALL,
> @@ -408,8 +415,10 @@ struct hv_guest_mapping_flush {
>   *  by the bitwidth of "additional_pages" in union hv_gpa_page_range.
>   */
>  #define HV_MAX_FLUSH_PAGES (2048)
> +#define HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB              0
> +#define HV_GPA_PAGE_RANGE_PAGE_SIZE_1GB              1
>  
> -/* HvFlushGuestPhysicalAddressList hypercall */
> +/* HvFlushGuestPhysicalAddressList, HvExtCallMemoryHeatHint hypercall */
>  union hv_gpa_page_range {
>       u64 address_space;
>       struct {
> @@ -417,6 +426,12 @@ union hv_gpa_page_range {
>               u64 largepage:1;
>               u64 basepfn:52;
>       } page;
> +     struct {
> +             u64 reserved:12;
> +             u64 page_size:1;
> +             u64 reserved1:8;
> +             u64 base_large_pfn:43;
> +     };
>  };
>  
>  /*
> @@ -774,4 +789,20 @@ struct hv_input_unmap_device_interrupt {
>  #define HV_SOURCE_SHADOW_NONE               0x0
>  #define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE   0x1
>  
> +/*
> + * The whole argument should fit in a page to be able to pass to the 
> hypervisor
> + * in one hypercall.
> + */
> +#define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES  \
> +     ((HV_HYP_PAGE_SIZE - sizeof(struct hv_memory_hint)) / \
> +             sizeof(union hv_gpa_page_range))
> +
> +/* HvExtCallMemoryHeatHint hypercall */
> +#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD    2
> +struct hv_memory_hint {
> +     u64 type:2;
> +     u64 reserved:62;
> +     union hv_gpa_page_range ranges[];
> +} __packed;
> +
>  #endif
> diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
> index 2d1b6cd9f000..63c0e579bf6d 100644
> --- a/include/asm-generic/mshyperv.h
> +++ b/include/asm-generic/mshyperv.h
> @@ -27,7 +27,7 @@
>  
>  struct ms_hyperv_info {
>       u32 features;
> -     u32 features_b;
> +     u32 priv_high;
>       u32 misc_features;
>       u32 hints;
>       u32 nested_features;
> @@ -179,6 +179,7 @@ bool hv_is_hibernation_supported(void);
>  enum hv_isolation_type hv_get_isolation_type(void);
>  bool hv_is_isolation_supported(void);
>  void hyperv_cleanup(void);
> +bool hv_query_ext_cap(u64 cap_query);
>  #else /* CONFIG_HYPERV */
>  static inline bool hv_is_hyperv_initialized(void) { return false; }
>  static inline bool hv_is_hibernation_supported(void) { return false; }
> -- 
> 2.17.1

Reply via email to