Re: [Xen-devel] [PATCH v2 3/3] x86/hyperv: L0 assisted TLB flush

2020-02-17 Thread Wei Liu
On Fri, Feb 14, 2020 at 04:42:47PM +0000, Michael Kelley wrote:
> From: Wei Liu  On Behalf Of Wei Liu Sent: Friday, 
> February 14, 2020 4:35 AM
> > 
> > Implement L0 assisted TLB flush for Xen on Hyper-V. It takes advantage
> > of several hypercalls:
> > 
> >  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST
> >  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX
> >  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE
> >  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX
> > 
> > Pick the most efficient hypercalls available.
> > 
> > Signed-off-by: Wei Liu 
> > ---
> > v2:
> > 1. Address Roger and Jan's comments re types etc.
> > 2. Fix pointer arithmetic.
> > 3. Misc improvement to code.
> > ---
> >  xen/arch/x86/guest/hyperv/Makefile  |   1 +
> >  xen/arch/x86/guest/hyperv/private.h |   9 ++
> >  xen/arch/x86/guest/hyperv/tlb.c | 172 +++-
> >  xen/arch/x86/guest/hyperv/util.c|  74 
> >  4 files changed, 255 insertions(+), 1 deletion(-)
> >  create mode 100644 xen/arch/x86/guest/hyperv/util.c
> > 
> > diff --git a/xen/arch/x86/guest/hyperv/Makefile 
> > b/xen/arch/x86/guest/hyperv/Makefile
> > index 18902c33e9..0e39410968 100644
> > --- a/xen/arch/x86/guest/hyperv/Makefile
> > +++ b/xen/arch/x86/guest/hyperv/Makefile
> > @@ -1,2 +1,3 @@
> >  obj-y += hyperv.o
> >  obj-y += tlb.o
> > +obj-y += util.o
> > diff --git a/xen/arch/x86/guest/hyperv/private.h 
> > b/xen/arch/x86/guest/hyperv/private.h
> > index 509bedaafa..79a77930a0 100644
> > --- a/xen/arch/x86/guest/hyperv/private.h
> > +++ b/xen/arch/x86/guest/hyperv/private.h
> > @@ -24,12 +24,21 @@
> > 
> >  #include 
> >  #include 
> > +#include 
> > 
> >  DECLARE_PER_CPU(void *, hv_input_page);
> >  DECLARE_PER_CPU(void *, hv_vp_assist);
> >  DECLARE_PER_CPU(unsigned int, hv_vp_index);
> > 
> > +static inline unsigned int hv_vp_index(unsigned int cpu)
> > +{
> > +return per_cpu(hv_vp_index, cpu);
> > +}
> > +
> >  int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
> >   unsigned int flags);
> > 
> > +/* Returns number of banks, -ev if error */
> > +int cpumask_to_vpset(struct hv_vpset *vpset, const cpumask_t *mask);
> > +
> >  #endif /* __XEN_HYPERV_PRIVIATE_H__  */
> > diff --git a/xen/arch/x86/guest/hyperv/tlb.c 
> > b/xen/arch/x86/guest/hyperv/tlb.c
> > index 48f527229e..f68e14f151 100644
> > --- a/xen/arch/x86/guest/hyperv/tlb.c
> > +++ b/xen/arch/x86/guest/hyperv/tlb.c
> > @@ -19,15 +19,185 @@
> >   * Copyright (c) 2020 Microsoft.
> >   */
> > 
> > +#include 
> >  #include 
> >  #include 
> > 
> > +#include 
> > +#include 
> > +#include 
> > +
> >  #include "private.h"
> > 
> > +/*
> > + * It is possible to encode up to 4096 pages using the lower 12 bits
> > + * in an element of gva_list
> > + */
> > +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
> > +
> > +static unsigned int fill_gva_list(uint64_t *gva_list, const void *va,
> > +  unsigned int order)
> > +{
> > +unsigned long start = (unsigned long)va;
> > +unsigned long end = start + (PAGE_SIZE << order) - 1;
> > +unsigned int n = 0;
> > +
> > +do {
> > +unsigned long remain = end - start;
> 
> The calculated value here isn't actually the remaining bytes in the
> range to flush -- it's one less than the remaining bytes in the range
> to flush because of the -1 in the calculation of 'end'.   That difference
> will mess up the comparison below against HV_TLB_FLUSH_UNIT
> in the case that there are exactly 4096 page remaining to be
> flushed.  It should take the "=" case, but won't.  Also, the
> '-1' in 'remain - 1' in the else clause becomes unneeded, and
> the 'start = end' assignment then propagates the error.
> 
> In the parallel code in Linux, if you follow the call sequence to get to
fill_gva_list(), the 'end' argument is really the address of the first byte
> of the first page that isn't in the flush range (i.e., one beyond the true
> 'end') and so is a bit misnamed.
> 
> I think the calculation of 'end' should drop the -1, and perhaps 'end'
> should be renamed.

Thanks for the detailed review. Let me fix this.

Wei.

> 
> Michael
> 

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v2 3/3] x86/hyperv: L0 assisted TLB flush

2020-02-14 Thread Michael Kelley
From: Wei Liu  On Behalf Of Wei Liu Sent: Friday, 
February 14, 2020 4:35 AM
> 
> Implement L0 assisted TLB flush for Xen on Hyper-V. It takes advantage
> of several hypercalls:
> 
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX
> 
> Pick the most efficient hypercalls available.
> 
> Signed-off-by: Wei Liu 
> ---
> v2:
> 1. Address Roger and Jan's comments re types etc.
> 2. Fix pointer arithmetic.
> 3. Misc improvement to code.
> ---
>  xen/arch/x86/guest/hyperv/Makefile  |   1 +
>  xen/arch/x86/guest/hyperv/private.h |   9 ++
>  xen/arch/x86/guest/hyperv/tlb.c | 172 +++-
>  xen/arch/x86/guest/hyperv/util.c|  74 
>  4 files changed, 255 insertions(+), 1 deletion(-)
>  create mode 100644 xen/arch/x86/guest/hyperv/util.c
> 
> diff --git a/xen/arch/x86/guest/hyperv/Makefile 
> b/xen/arch/x86/guest/hyperv/Makefile
> index 18902c33e9..0e39410968 100644
> --- a/xen/arch/x86/guest/hyperv/Makefile
> +++ b/xen/arch/x86/guest/hyperv/Makefile
> @@ -1,2 +1,3 @@
>  obj-y += hyperv.o
>  obj-y += tlb.o
> +obj-y += util.o
> diff --git a/xen/arch/x86/guest/hyperv/private.h 
> b/xen/arch/x86/guest/hyperv/private.h
> index 509bedaafa..79a77930a0 100644
> --- a/xen/arch/x86/guest/hyperv/private.h
> +++ b/xen/arch/x86/guest/hyperv/private.h
> @@ -24,12 +24,21 @@
> 
>  #include 
>  #include 
> +#include 
> 
>  DECLARE_PER_CPU(void *, hv_input_page);
>  DECLARE_PER_CPU(void *, hv_vp_assist);
>  DECLARE_PER_CPU(unsigned int, hv_vp_index);
> 
> +static inline unsigned int hv_vp_index(unsigned int cpu)
> +{
> +return per_cpu(hv_vp_index, cpu);
> +}
> +
>  int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
>   unsigned int flags);
> 
> +/* Returns number of banks, -ev if error */
> +int cpumask_to_vpset(struct hv_vpset *vpset, const cpumask_t *mask);
> +
>  #endif /* __XEN_HYPERV_PRIVIATE_H__  */
> diff --git a/xen/arch/x86/guest/hyperv/tlb.c b/xen/arch/x86/guest/hyperv/tlb.c
> index 48f527229e..f68e14f151 100644
> --- a/xen/arch/x86/guest/hyperv/tlb.c
> +++ b/xen/arch/x86/guest/hyperv/tlb.c
> @@ -19,15 +19,185 @@
>   * Copyright (c) 2020 Microsoft.
>   */
> 
> +#include 
>  #include 
>  #include 
> 
> +#include 
> +#include 
> +#include 
> +
>  #include "private.h"
> 
> +/*
> + * It is possible to encode up to 4096 pages using the lower 12 bits
> + * in an element of gva_list
> + */
> +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
> +
> +static unsigned int fill_gva_list(uint64_t *gva_list, const void *va,
> +  unsigned int order)
> +{
> +unsigned long start = (unsigned long)va;
> +unsigned long end = start + (PAGE_SIZE << order) - 1;
> +unsigned int n = 0;
> +
> +do {
> +unsigned long remain = end - start;

The calculated value here isn't actually the remaining bytes in the
range to flush -- it's one less than the remaining bytes in the range
to flush because of the -1 in the calculation of 'end'.   That difference
will mess up the comparison below against HV_TLB_FLUSH_UNIT
in the case that there are exactly 4096 page remaining to be
flushed.  It should take the "=" case, but won't.  Also, the
'-1' in 'remain - 1' in the else clause becomes unneeded, and
the 'start = end' assignment then propagates the error.

In the parallel code in Linux, if you follow the call sequence to get to
fill_gva_list(), the 'end' argument is really the address of the first byte
of the first page that isn't in the flush range (i.e., one beyond the true
'end') and so is a bit misnamed.

I think the calculation of 'end' should drop the -1, and perhaps 'end'
should be renamed.

Michael

> +
> +gva_list[n] = start & PAGE_MASK;
> +
> +/*
> + * Use lower 12 bits to encode the number of additional pages
> + * to flush
> + */
> +if ( remain >= HV_TLB_FLUSH_UNIT )
> +{
> +gva_list[n] |= ~PAGE_MASK;
> +start += HV_TLB_FLUSH_UNIT;
> +}
> +else if ( remain )
> +{
> +gva_list[n] |= (remain - 1) >> PAGE_SHIFT;
> +start = end;
> +}
> +
> +n++;
> +} while ( start < end );
> +
> +return n;
> +}
> +


___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v2 3/3] x86/hyperv: L0 assisted TLB flush

2020-02-14 Thread Wei Liu
On Fri, Feb 14, 2020 at 03:42:17PM +0100, Roger Pau Monné wrote:
[...]
> >  #endif /* __XEN_HYPERV_PRIVIATE_H__  */
> > diff --git a/xen/arch/x86/guest/hyperv/tlb.c 
> > b/xen/arch/x86/guest/hyperv/tlb.c
> > index 48f527229e..f68e14f151 100644
> > --- a/xen/arch/x86/guest/hyperv/tlb.c
> > +++ b/xen/arch/x86/guest/hyperv/tlb.c
> > @@ -19,15 +19,185 @@
> >   * Copyright (c) 2020 Microsoft.
> >   */
> >  
> > +#include 
> >  #include 
> >  #include 
> >  
> > +#include 
> > +#include 
> > +#include 
> > +
> >  #include "private.h"
> >  
> > +/*
> > + * It is possible to encode up to 4096 pages using the lower 12 bits
> > + * in an element of gva_list
> > + */
> > +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
> > +
> > +static unsigned int fill_gva_list(uint64_t *gva_list, const void *va,
> > +  unsigned int order)
> > +{
> > +unsigned long start = (unsigned long)va;
> > +unsigned long end = start + (PAGE_SIZE << order) - 1;
> > +unsigned int n = 0;
> > +
> > +do {
> > +unsigned long remain = end - start;
> > +
> > +gva_list[n] = start & PAGE_MASK;
> > +
> > +/*
> > + * Use lower 12 bits to encode the number of additional pages
> > + * to flush
> > + */
> > +if ( remain >= HV_TLB_FLUSH_UNIT )
> > +{
> > +gva_list[n] |= ~PAGE_MASK;
> > +start += HV_TLB_FLUSH_UNIT;
> > +}
> > +else if ( remain )
> 
> remain is always going to be > 0, since the loop condition is end >
> start, and hence this can be a plain else.

Ack.

> 
> > +{
> > +gva_list[n] |= (remain - 1) >> PAGE_SHIFT;
> > +start = end;
> > +}
> > +
> > +n++;
> > +} while ( start < end );
> > +
> > +return n;
> > +}
> > +
> > +static uint64_t flush_tlb_ex(const cpumask_t *mask, const void *va,
> > + unsigned int flags)
> > +{
> > +struct hv_tlb_flush_ex *flush = this_cpu(hv_input_page);
> > +int nr_banks;
> > +unsigned int max_gvas, order = flags & FLUSH_ORDER_MASK;
> > +uint64_t ret;
> > +
> > +if ( !flush || local_irq_is_enabled() )
> > +{
> > +ASSERT_UNREACHABLE();
> > +return ~0ULL;
> > +}
> > +
> > +if ( !(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED) )
> > +return ~0ULL;
> > +
> > +flush->address_space = 0;
> > +flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
> > +if ( !(flags & FLUSH_TLB_GLOBAL) )
> > +flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> > +
> > +nr_banks = cpumask_to_vpset(&flush->hv_vp_set, mask);
> > +if ( nr_banks < 0 )
> > +return ~0ULL;
> 
> It would be nice to propagate the error code from cpumask_to_vpset,
> but since the function can also return HyperV error codes this doesn't
> make much sense.
> 
> > +
> > +max_gvas =
> > +(PAGE_SIZE - sizeof(*flush) - nr_banks *
> > + sizeof(flush->hv_vp_set.bank_contents[0])) /
> > +sizeof(uint64_t);   /* gva is represented as uint64_t */
> > +
> > +/*
> > + * Flush the entire address space if va is NULL or if there is not
> > + * enough space for gva_list.
> > + */
> > +if ( !va || (PAGE_SIZE << order) / HV_TLB_FLUSH_UNIT > max_gvas )
> > +ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, 0,
> > +  nr_banks, virt_to_maddr(flush), 0);
> 
> You could just return hv_do_rep_hypercall(...); here, which will avoid
> the else branch below and the indentation.

Ack.

> 
> > +else
> > +{
> > +uint64_t *gva_list =
> > +(uint64_t *)flush + sizeof(*flush) / sizeof(uint64_t) + 
> > nr_banks;
> > +unsigned int gvas = fill_gva_list(gva_list, va, order);
> > +
> > +BUILD_BUG_ON(sizeof(*flush) % sizeof(uint64_t));
> > +
> > +ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
> > +  gvas, nr_banks, virt_to_maddr(flush), 0);
> > +}
> > +
> > +return ret;
> > +}
> > +
> >  int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
> >   unsigned int flags)
> >  {
> > -return -EOPNOTSUPP;
> > +unsigned long irq_flags;
> > +struct hv_tlb_flush *flush = this_cpu(hv_input_page);
> > +unsigned int max_gvas, order = flags & FLUSH_ORDER_MASK;
> > +uint64_t ret;
> > +
> > +ASSERT(flush);
> > +ASSERT(!cpumask_empty(mask));
> 
> I would also turn this into an if ( ... ) { ASSERT; return -EFOO; }

Ack.

> 
> > +
> > +local_irq_save(irq_flags);
> > +
> > +flush->address_space = 0;
> > +flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
> > +flush->processor_mask = 0;
> > +if ( !(flags & FLUSH_TLB_GLOBAL) )
> > +flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> > +
> > +if ( cpumask_equal(mask, &cpu_online_map) )
> > +flush->flags |= HV_FLUSH_ALL_PROCESSORS;
> > +else
> > +{
> > + 

Re: [Xen-devel] [PATCH v2 3/3] x86/hyperv: L0 assisted TLB flush

2020-02-14 Thread Roger Pau Monné
On Fri, Feb 14, 2020 at 12:34:30PM +0000, Wei Liu wrote:
> Implement L0 assisted TLB flush for Xen on Hyper-V. It takes advantage
> of several hypercalls:
> 
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE
>  * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX
> 
> Pick the most efficient hypercalls available.
> 
> Signed-off-by: Wei Liu 

Thanks! LGTM, I've just got a couple of comments below.

> ---
> v2:
> 1. Address Roger and Jan's comments re types etc.
> 2. Fix pointer arithmetic.
> 3. Misc improvement to code.
> ---
>  xen/arch/x86/guest/hyperv/Makefile  |   1 +
>  xen/arch/x86/guest/hyperv/private.h |   9 ++
>  xen/arch/x86/guest/hyperv/tlb.c | 172 +++-
>  xen/arch/x86/guest/hyperv/util.c|  74 
>  4 files changed, 255 insertions(+), 1 deletion(-)
>  create mode 100644 xen/arch/x86/guest/hyperv/util.c
> 
> diff --git a/xen/arch/x86/guest/hyperv/Makefile 
> b/xen/arch/x86/guest/hyperv/Makefile
> index 18902c33e9..0e39410968 100644
> --- a/xen/arch/x86/guest/hyperv/Makefile
> +++ b/xen/arch/x86/guest/hyperv/Makefile
> @@ -1,2 +1,3 @@
>  obj-y += hyperv.o
>  obj-y += tlb.o
> +obj-y += util.o
> diff --git a/xen/arch/x86/guest/hyperv/private.h 
> b/xen/arch/x86/guest/hyperv/private.h
> index 509bedaafa..79a77930a0 100644
> --- a/xen/arch/x86/guest/hyperv/private.h
> +++ b/xen/arch/x86/guest/hyperv/private.h
> @@ -24,12 +24,21 @@
>  
>  #include 
>  #include 
> +#include 
>  
>  DECLARE_PER_CPU(void *, hv_input_page);
>  DECLARE_PER_CPU(void *, hv_vp_assist);
>  DECLARE_PER_CPU(unsigned int, hv_vp_index);
>  
> +static inline unsigned int hv_vp_index(unsigned int cpu)
> +{
> +return per_cpu(hv_vp_index, cpu);
> +}
> +
>  int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
>   unsigned int flags);
>  
> +/* Returns number of banks, -ev if error */
> +int cpumask_to_vpset(struct hv_vpset *vpset, const cpumask_t *mask);
> +
>  #endif /* __XEN_HYPERV_PRIVIATE_H__  */
> diff --git a/xen/arch/x86/guest/hyperv/tlb.c b/xen/arch/x86/guest/hyperv/tlb.c
> index 48f527229e..f68e14f151 100644
> --- a/xen/arch/x86/guest/hyperv/tlb.c
> +++ b/xen/arch/x86/guest/hyperv/tlb.c
> @@ -19,15 +19,185 @@
>   * Copyright (c) 2020 Microsoft.
>   */
>  
> +#include 
>  #include 
>  #include 
>  
> +#include 
> +#include 
> +#include 
> +
>  #include "private.h"
>  
> +/*
> + * It is possible to encode up to 4096 pages using the lower 12 bits
> + * in an element of gva_list
> + */
> +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
> +
> +static unsigned int fill_gva_list(uint64_t *gva_list, const void *va,
> +  unsigned int order)
> +{
> +unsigned long start = (unsigned long)va;
> +unsigned long end = start + (PAGE_SIZE << order) - 1;
> +unsigned int n = 0;
> +
> +do {
> +unsigned long remain = end - start;
> +
> +gva_list[n] = start & PAGE_MASK;
> +
> +/*
> + * Use lower 12 bits to encode the number of additional pages
> + * to flush
> + */
> +if ( remain >= HV_TLB_FLUSH_UNIT )
> +{
> +gva_list[n] |= ~PAGE_MASK;
> +start += HV_TLB_FLUSH_UNIT;
> +}
> +else if ( remain )

remain is always going to be > 0, since the loop condition is end >
start, and hence this can be a plain else.

> +{
> +gva_list[n] |= (remain - 1) >> PAGE_SHIFT;
> +start = end;
> +}
> +
> +n++;
> +} while ( start < end );
> +
> +return n;
> +}
> +
> +static uint64_t flush_tlb_ex(const cpumask_t *mask, const void *va,
> + unsigned int flags)
> +{
> +struct hv_tlb_flush_ex *flush = this_cpu(hv_input_page);
> +int nr_banks;
> +unsigned int max_gvas, order = flags & FLUSH_ORDER_MASK;
> +uint64_t ret;
> +
> +if ( !flush || local_irq_is_enabled() )
> +{
> +ASSERT_UNREACHABLE();
> +return ~0ULL;
> +}
> +
> +if ( !(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED) )
> +return ~0ULL;
> +
> +flush->address_space = 0;
> +flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
> +if ( !(flags & FLUSH_TLB_GLOBAL) )
> +flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> +
> +nr_banks = cpumask_to_vpset(&flush->hv_vp_set, mask);
> +if ( nr_banks < 0 )
> +return ~0ULL;

It would be nice to propagate the error code from cpumask_to_vpset,
but since the function can also return HyperV error codes this doesn't
make much sense.

> +
> +max_gvas =
> +(PAGE_SIZE - sizeof(*flush) - nr_banks *
> + sizeof(flush->hv_vp_set.bank_contents[0])) /
> +sizeof(uint64_t);   /* gva is represented as uint64_t */
> +
> +/*
> + * Flush the entire address space if va is NULL or if there is not
> + * enough space for gva_list.
> + */
> +if ( !va || (PAGE_SIZE << order) / HV_TLB_FLUSH_UNIT > max_gvas )

[Xen-devel] [PATCH v2 3/3] x86/hyperv: L0 assisted TLB flush

2020-02-14 Thread Wei Liu
Implement L0 assisted TLB flush for Xen on Hyper-V. It takes advantage
of several hypercalls:

 * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST
 * HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX
 * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE
 * HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX

Pick the most efficient hypercalls available.

Signed-off-by: Wei Liu 
---
v2:
1. Address Roger and Jan's comments re types etc.
2. Fix pointer arithmetic.
3. Misc improvement to code.
---
 xen/arch/x86/guest/hyperv/Makefile  |   1 +
 xen/arch/x86/guest/hyperv/private.h |   9 ++
 xen/arch/x86/guest/hyperv/tlb.c | 172 +++-
 xen/arch/x86/guest/hyperv/util.c|  74 
 4 files changed, 255 insertions(+), 1 deletion(-)
 create mode 100644 xen/arch/x86/guest/hyperv/util.c

diff --git a/xen/arch/x86/guest/hyperv/Makefile 
b/xen/arch/x86/guest/hyperv/Makefile
index 18902c33e9..0e39410968 100644
--- a/xen/arch/x86/guest/hyperv/Makefile
+++ b/xen/arch/x86/guest/hyperv/Makefile
@@ -1,2 +1,3 @@
 obj-y += hyperv.o
 obj-y += tlb.o
+obj-y += util.o
diff --git a/xen/arch/x86/guest/hyperv/private.h 
b/xen/arch/x86/guest/hyperv/private.h
index 509bedaafa..79a77930a0 100644
--- a/xen/arch/x86/guest/hyperv/private.h
+++ b/xen/arch/x86/guest/hyperv/private.h
@@ -24,12 +24,21 @@
 
 #include 
 #include 
+#include 
 
 DECLARE_PER_CPU(void *, hv_input_page);
 DECLARE_PER_CPU(void *, hv_vp_assist);
 DECLARE_PER_CPU(unsigned int, hv_vp_index);
 
+static inline unsigned int hv_vp_index(unsigned int cpu)
+{
+return per_cpu(hv_vp_index, cpu);
+}
+
 int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
  unsigned int flags);
 
+/* Returns number of banks, -ev if error */
+int cpumask_to_vpset(struct hv_vpset *vpset, const cpumask_t *mask);
+
 #endif /* __XEN_HYPERV_PRIVIATE_H__  */
diff --git a/xen/arch/x86/guest/hyperv/tlb.c b/xen/arch/x86/guest/hyperv/tlb.c
index 48f527229e..f68e14f151 100644
--- a/xen/arch/x86/guest/hyperv/tlb.c
+++ b/xen/arch/x86/guest/hyperv/tlb.c
@@ -19,15 +19,185 @@
  * Copyright (c) 2020 Microsoft.
  */
 
+#include 
 #include 
 #include 
 
+#include 
+#include 
+#include 
+
 #include "private.h"
 
+/*
+ * It is possible to encode up to 4096 pages using the lower 12 bits
+ * in an element of gva_list
+ */
+#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
+
+static unsigned int fill_gva_list(uint64_t *gva_list, const void *va,
+  unsigned int order)
+{
+unsigned long start = (unsigned long)va;
+unsigned long end = start + (PAGE_SIZE << order) - 1;
+unsigned int n = 0;
+
+do {
+unsigned long remain = end - start;
+
+gva_list[n] = start & PAGE_MASK;
+
+/*
+ * Use lower 12 bits to encode the number of additional pages
+ * to flush
+ */
+if ( remain >= HV_TLB_FLUSH_UNIT )
+{
+gva_list[n] |= ~PAGE_MASK;
+start += HV_TLB_FLUSH_UNIT;
+}
+else if ( remain )
+{
+gva_list[n] |= (remain - 1) >> PAGE_SHIFT;
+start = end;
+}
+
+n++;
+} while ( start < end );
+
+return n;
+}
+
+static uint64_t flush_tlb_ex(const cpumask_t *mask, const void *va,
+ unsigned int flags)
+{
+struct hv_tlb_flush_ex *flush = this_cpu(hv_input_page);
+int nr_banks;
+unsigned int max_gvas, order = flags & FLUSH_ORDER_MASK;
+uint64_t ret;
+
+if ( !flush || local_irq_is_enabled() )
+{
+ASSERT_UNREACHABLE();
+return ~0ULL;
+}
+
+if ( !(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED) )
+return ~0ULL;
+
+flush->address_space = 0;
+flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+if ( !(flags & FLUSH_TLB_GLOBAL) )
+flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
+
+nr_banks = cpumask_to_vpset(&flush->hv_vp_set, mask);
+if ( nr_banks < 0 )
+return ~0ULL;
+
+max_gvas =
+(PAGE_SIZE - sizeof(*flush) - nr_banks *
+ sizeof(flush->hv_vp_set.bank_contents[0])) /
+sizeof(uint64_t);   /* gva is represented as uint64_t */
+
+/*
+ * Flush the entire address space if va is NULL or if there is not
+ * enough space for gva_list.
+ */
+if ( !va || (PAGE_SIZE << order) / HV_TLB_FLUSH_UNIT > max_gvas )
+ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, 0,
+  nr_banks, virt_to_maddr(flush), 0);
+else
+{
+uint64_t *gva_list =
+(uint64_t *)flush + sizeof(*flush) / sizeof(uint64_t) + nr_banks;
+unsigned int gvas = fill_gva_list(gva_list, va, order);
+
+BUILD_BUG_ON(sizeof(*flush) % sizeof(uint64_t));
+
+ret = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
+  gvas, nr_banks, virt_to_maddr(flush), 0);
+}
+
+return ret;
+}
+
 int hyperv_flush_tlb(const cpumask_t *mask, const void *va,
  unsigned int flags)
 {
-